├── tests
├── __init__.py
├── core
│ ├── __init__.py
│ ├── explorer
│ │ ├── __init__.py
│ │ └── test_feature.py
│ ├── representation
│ │ ├── test_manifold.py
│ │ ├── test_trajectory.py
│ │ ├── conftest.py
│ │ └── test_reduction.py
│ ├── test_local_config.py
│ └── test_neural.py
├── recipes
│ ├── __init__.py
│ ├── test_subroutine.py
│ ├── local_helper.py
│ └── test_experimental.py
├── module_config
│ ├── hover_alt_config_3.ini
│ ├── hover_alt_config_2.ini
│ └── hover_alt_config_1.ini
├── local_config.py
└── utils
│ ├── conftest.py
│ ├── test_metrics.py
│ ├── test_common_nn.py
│ ├── test_typecheck.py
│ ├── test_snorkel_helper.py
│ ├── test_datasets.py
│ ├── test_bokeh_helper.py
│ ├── test_torch_helper.py
│ └── test_misc.py
├── fixture_module
├── __init__.py
├── .gitignore
├── text_vector_net
│ └── __init__.py
├── audio_vector_net
│ └── __init__.py
└── image_vector_net
│ └── __init__.py
├── hover
├── utils
│ ├── meta
│ │ ├── __init__.py
│ │ └── traceback.py
│ ├── __init__.py
│ ├── metrics.py
│ ├── typecheck.py
│ ├── bokeh_helper
│ │ └── local_config.py
│ ├── common_nn.py
│ ├── snorkel_helper.py
│ ├── misc.py
│ ├── datasets.py
│ └── torch_helper.py
├── recipes
│ ├── __init__.py
│ ├── local_config.py
│ └── stable.py
├── core
│ ├── representation
│ │ ├── __init__.py
│ │ ├── local_config.py
│ │ ├── trajectory.py
│ │ ├── reduction.py
│ │ └── manifold.py
│ ├── explorer
│ │ ├── __init__.py
│ │ └── local_config.py
│ ├── __init__.py
│ └── local_config.py
├── module_config.py
└── config_constants.py
├── docs
├── pages
│ ├── guides
│ │ ├── datatype-multimodal.md
│ │ ├── g2-hover-config.md
│ │ ├── g0-datatype-image.md
│ │ └── g1-datatype-audio.md
│ ├── reference
│ │ ├── core-neural.md
│ │ ├── core-dataset.md
│ │ ├── core-explorer-base.md
│ │ ├── core-explorer-feature.md
│ │ ├── core-representation.md
│ │ ├── core-explorer-functionality.md
│ │ ├── core-explorer-specialization.md
│ │ ├── utils-bokeh_helper.md
│ │ ├── utils-snorkel_helper.md
│ │ └── recipes.md
│ └── tutorial
│ │ ├── t6-softlabel-joint-filter.md
│ │ ├── t5-finder-filter.md
│ │ ├── t2-bokeh-app.md
│ │ ├── t7-snorkel-improvise-rules.md
│ │ ├── t3-dataset-population-selection.md
│ │ └── t1-active-learning.md
├── index.zh.md
├── snippets
│ ├── py
│ │ ├── g2-4-config-hint.txt
│ │ ├── g0-4a-reduction-print.txt
│ │ ├── t4-5-dataset-view.txt
│ │ ├── g2-1-configure-palette.txt
│ │ ├── t6-1-softlabel-filter.txt
│ │ ├── g2-2-configure-abstain-color.txt
│ │ ├── t3-0-dataset-population-table.txt
│ │ ├── g0-4-reduction.txt
│ │ ├── t0-2z-reduction-3d.txt
│ │ ├── t4-1-annotator-subset-toggle.txt
│ │ ├── t7-2-snorkel-filter-button.txt
│ │ ├── t4-2-annotator-selection-option.txt
│ │ ├── t7-0a-lf-list-edit.txt
│ │ ├── g2-3-configure-reduction-method.txt
│ │ ├── t0-0a-dataset-text-print.txt
│ │ ├── t4-4-annotator-search-box.txt
│ │ ├── t0-3-simple-annotator.txt
│ │ ├── t1-1-active-learning.txt
│ │ ├── t5-1-finder-figure.txt
│ │ ├── t0-2a-reduction-print.txt
│ │ ├── t3-2-dataset-selection-table.txt
│ │ ├── t7-3-snorkel-crosscheck.txt
│ │ ├── t0-1a-vectorizer-print.txt
│ │ ├── tz-bokeh-notebook-remote.txt
│ │ ├── t3-3-dataset-evict-patch.txt
│ │ ├── tz-bokeh-notebook-common.txt
│ │ ├── t4-3-annotator-choose-axes.txt
│ │ ├── t6-0-softlabel-figure.txt
│ │ ├── t3-1-dataset-commit-dedup.txt
│ │ ├── t7-1-snorkel-apply-button.txt
│ │ ├── t1-0a-vecnet-callback-print.txt
│ │ ├── g0-1-url-to-content.txt
│ │ ├── g0-2-url-to-image.txt
│ │ ├── t5-0-finder-filter.txt
│ │ ├── tz-bokeh-show-notebook.txt
│ │ ├── g1-1-url-to-audio.txt
│ │ ├── t0-2-reduction.txt
│ │ ├── t4-0-annotator-basics.txt
│ │ ├── t1-0-vecnet-callback.txt
│ │ ├── t0-1-vectorizer.txt
│ │ ├── g1-2-audio-vectorizer.txt
│ │ ├── g2-0-color-palette.txt
│ │ ├── tz-bokeh-show-server.txt
│ │ ├── g1-0-dataset-audio.txt
│ │ ├── g0-0-dataset-image.txt
│ │ ├── t0-0-dataset-text.txt
│ │ ├── g0-3-image-vectorizer.txt
│ │ ├── tz-dataset-text-full.txt
│ │ └── t7-0-lf-list.txt
│ ├── markdown
│ │ ├── local-dep-audio.md
│ │ ├── local-dep-snorkel.md
│ │ ├── local-dep-image.md
│ │ ├── local-dependency.md
│ │ ├── readme
│ │ │ ├── 0-opener.zh.md
│ │ │ ├── 5-announcements.zh.md
│ │ │ ├── 3-install.md
│ │ │ ├── 0-opener.en.md
│ │ │ ├── 5-announcements.en.md
│ │ │ ├── 0a-language-badges.md
│ │ │ ├── 4-resources.zh.md
│ │ │ ├── 0c-intro.zh.md
│ │ │ ├── 4-resources.en.md
│ │ │ ├── 1-live-demos.zh.md
│ │ │ ├── 0c-intro.en.md
│ │ │ ├── 1-live-demos.en.md
│ │ │ ├── 6-remarks.zh.md
│ │ │ ├── 0b-status-badges.md
│ │ │ ├── 6-remarks.en.md
│ │ │ ├── 2-features.zh.md
│ │ │ └── 2-features.en.md
│ │ ├── local-dep-text.md
│ │ ├── tutorial-required.md
│ │ ├── dataset-prep.md
│ │ ├── wrappy-cache.md
│ │ ├── local-dep-jupyter-bokeh.md
│ │ ├── component-tutorial.md
│ │ ├── binder-kernel.md
│ │ └── jupyterlab-js-issue.md
│ └── html
│ │ ├── stylesheet.html
│ │ └── thebe.html
├── index.md
├── images
│ ├── favicon.png
│ ├── hover-logo-dark.png
│ ├── hover-logo-light.png
│ └── hover-logo-title.png
├── pipelines
│ ├── requirements-doc-scripts.txt
│ ├── README.md.template
│ ├── local_helper.py
│ ├── generate_readme.py
│ ├── local_config.py
│ └── check_scripts.py
└── styles
│ └── monokai.css
├── notebooks
├── .gitignore
├── archive-prototype
│ ├── Programmatic-Event.ipynb
│ ├── Dynamic-Widget.ipynb
│ ├── Editing-Datatable.ipynb
│ ├── Programmatic-Polyselect.ipynb
│ └── Slider-Filter.ipynb
└── Image-Experiment.ipynb
├── .gitignore
├── .github
├── dependabot.yml
└── workflows
│ ├── cross-os-conda-build.yml
│ ├── handle-inactive.yml
│ ├── cross-os-install-source.yml
│ ├── cross-os-source-test.yml
│ ├── assemble-readme.yml
│ ├── doc-script-test.yml
│ ├── doc-auto-notebook.yml
│ └── quick-source-test.yml
├── pytest.ini
├── requirements-dev.txt
├── .pre-commit-config.yaml
├── LICENSE
├── conda-recipe
├── stable.yaml
└── meta.yaml
├── setup.py
├── tox.ini
└── mkdocs.yml
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/core/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fixture_module/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/hover/utils/meta/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/recipes/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/core/explorer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/pages/guides/datatype-multimodal.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | *.pt
2 | *.pt.*
3 | *.csv
4 |
--------------------------------------------------------------------------------
/fixture_module/.gitignore:
--------------------------------------------------------------------------------
1 | *.pkl
2 | *.pt
3 | *.pt.*
4 |
--------------------------------------------------------------------------------
/docs/index.zh.md:
--------------------------------------------------------------------------------
1 | # Hover - 0.8.1 文档
2 |
3 | {!README.md!}
4 |
--------------------------------------------------------------------------------
/docs/snippets/py/g2-4-config-hint.txt:
--------------------------------------------------------------------------------
1 | hover.config.hint()
2 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Hover - 0.8.1 Documentation
2 |
3 | {!README.md!}
4 |
--------------------------------------------------------------------------------
/tests/module_config/hover_alt_config_3.ini:
--------------------------------------------------------------------------------
1 | [io]
2 | data_save_dir = .
3 |
--------------------------------------------------------------------------------
/docs/snippets/py/g0-4a-reduction-print.txt:
--------------------------------------------------------------------------------
1 | dataset.dfs["raw"]().head(5)
2 |
--------------------------------------------------------------------------------
/docs/snippets/py/t4-5-dataset-view.txt:
--------------------------------------------------------------------------------
1 | show(dataset.view(), notebook_url=notebook_url)
2 |
--------------------------------------------------------------------------------
/docs/images/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phurwicz/hover/HEAD/docs/images/favicon.png
--------------------------------------------------------------------------------
/docs/snippets/py/g2-1-configure-palette.txt:
--------------------------------------------------------------------------------
1 | hover.config["visual"]["abstain_hexcolor"] = "#bababa"
2 |
--------------------------------------------------------------------------------
/docs/snippets/py/t6-1-softlabel-filter.txt:
--------------------------------------------------------------------------------
1 | show(softlabel.score_filter, notebook_url=notebook_url)
2 |
--------------------------------------------------------------------------------
/docs/snippets/py/g2-2-configure-abstain-color.txt:
--------------------------------------------------------------------------------
1 | hover.config["visual"]["abstain_hexcolor"] = "#bababa"
2 |
--------------------------------------------------------------------------------
/docs/snippets/py/t3-0-dataset-population-table.txt:
--------------------------------------------------------------------------------
1 | show(dataset.pop_table, notebook_url=notebook_url)
2 |
--------------------------------------------------------------------------------
/docs/images/hover-logo-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phurwicz/hover/HEAD/docs/images/hover-logo-dark.png
--------------------------------------------------------------------------------
/docs/snippets/py/g0-4-reduction.txt:
--------------------------------------------------------------------------------
1 | reducer = dataset.compute_nd_embedding(vectorizer, "umap", dimension=2)
2 |
--------------------------------------------------------------------------------
/docs/images/hover-logo-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phurwicz/hover/HEAD/docs/images/hover-logo-light.png
--------------------------------------------------------------------------------
/docs/images/hover-logo-title.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phurwicz/hover/HEAD/docs/images/hover-logo-title.png
--------------------------------------------------------------------------------
/docs/pages/reference/core-neural.md:
--------------------------------------------------------------------------------
1 | - ::: hover.core.neural
2 | rendering:
3 | show_root_heading: false
4 |
--------------------------------------------------------------------------------
/docs/snippets/py/t0-2z-reduction-3d.txt:
--------------------------------------------------------------------------------
1 | reducer = dataset.compute_nd_embedding(vectorizer, "umap", dimension=3)
2 |
--------------------------------------------------------------------------------
/docs/snippets/py/t4-1-annotator-subset-toggle.txt:
--------------------------------------------------------------------------------
1 | show(annotator.data_key_button_group, notebook_url=notebook_url)
2 |
--------------------------------------------------------------------------------
/docs/snippets/py/t7-2-snorkel-filter-button.txt:
--------------------------------------------------------------------------------
1 | show(snorkel_plot.lf_filter_trigger, notebook_url=notebook_url)
2 |
--------------------------------------------------------------------------------
/docs/pages/reference/core-dataset.md:
--------------------------------------------------------------------------------
1 | - ::: hover.core.dataset
2 | rendering:
3 | show_root_heading: false
4 |
--------------------------------------------------------------------------------
/docs/snippets/py/t4-2-annotator-selection-option.txt:
--------------------------------------------------------------------------------
1 | show(annotator.selection_option_box, notebook_url=notebook_url)
2 |
--------------------------------------------------------------------------------
/docs/snippets/py/t7-0a-lf-list-edit.txt:
--------------------------------------------------------------------------------
1 | # we will come back to this block later on
2 | # LABELING_FUNCTIONS.pop(-1)
3 |
--------------------------------------------------------------------------------
/docs/snippets/py/g2-3-configure-reduction-method.txt:
--------------------------------------------------------------------------------
1 | hover.config["data.embedding"]["default_reduction_method"] = "ivis"
2 |
--------------------------------------------------------------------------------
/docs/pages/reference/core-explorer-base.md:
--------------------------------------------------------------------------------
1 | - ::: hover.core.explorer.base
2 | rendering:
3 | show_root_heading: false
4 |
--------------------------------------------------------------------------------
/docs/snippets/py/t0-0a-dataset-text-print.txt:
--------------------------------------------------------------------------------
1 | # each subset can be accessed as its own DataFrame
2 | dataset.dfs["raw"]().head(5)
3 |
--------------------------------------------------------------------------------
/docs/snippets/py/t4-4-annotator-search-box.txt:
--------------------------------------------------------------------------------
1 | show(row(annotator.search_pos, annotator.search_neg), notebook_url=notebook_url)
2 |
--------------------------------------------------------------------------------
/docs/pages/reference/core-explorer-feature.md:
--------------------------------------------------------------------------------
1 | - ::: hover.core.explorer.feature
2 | rendering:
3 | show_root_heading: false
4 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/local-dep-audio.md:
--------------------------------------------------------------------------------
1 | To run the audio embedding code on this page, you need `pip install librosa wrappy`.
2 |
--------------------------------------------------------------------------------
/docs/pages/reference/core-representation.md:
--------------------------------------------------------------------------------
1 | - ::: hover.core.representation.reduction
2 | rendering:
3 | show_root_heading: true
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *ipynb_checkpoints*
2 | *pycache*
3 | .*.pkl
4 | .tox
5 | .coverage
6 | cobertura.xml
7 | .DS_Store
8 | annoy.index
9 | site/
10 |
--------------------------------------------------------------------------------
/docs/pages/reference/core-explorer-functionality.md:
--------------------------------------------------------------------------------
1 | - ::: hover.core.explorer.functionality
2 | rendering:
3 | show_root_heading: false
4 |
--------------------------------------------------------------------------------
/docs/pages/reference/core-explorer-specialization.md:
--------------------------------------------------------------------------------
1 | - ::: hover.core.explorer.specialization
2 | rendering:
3 | show_root_heading: false
4 |
--------------------------------------------------------------------------------
/docs/snippets/py/t0-3-simple-annotator.txt:
--------------------------------------------------------------------------------
1 | from hover.recipes.stable import simple_annotator
2 |
3 | interactive_plot = simple_annotator(dataset)
4 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/local-dep-snorkel.md:
--------------------------------------------------------------------------------
1 | To use `snorkel` labeling functions, you need:
2 | ```shell
3 | pip install snorkel
4 | ```
5 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/local-dep-image.md:
--------------------------------------------------------------------------------
1 | To run the image embedding code on this page, you need `pip install efficientnet_pytorch torchvision wrappy`.
2 |
--------------------------------------------------------------------------------
/docs/snippets/py/t1-1-active-learning.txt:
--------------------------------------------------------------------------------
1 | from hover.recipes.experimental import active_learning
2 |
3 | interactive_plot = active_learning(dataset, vecnet)
4 |
--------------------------------------------------------------------------------
/docs/snippets/py/t5-1-finder-figure.txt:
--------------------------------------------------------------------------------
1 | show(column(
2 | row(finder.search_pos, finder.search_neg),
3 | finder.figure,
4 | ), notebook_url=notebook_url)
5 |
--------------------------------------------------------------------------------
/docs/snippets/py/t0-2a-reduction-print.txt:
--------------------------------------------------------------------------------
1 | # what we did adds 'embed_2d_0' and 'embed_2d_1' columns to the DataFrames in dataset.dfs
2 | dataset.dfs["raw"]().head(5)
3 |
--------------------------------------------------------------------------------
/docs/snippets/py/t3-2-dataset-selection-table.txt:
--------------------------------------------------------------------------------
1 | dataset._callback_update_selection(dataset.dfs["raw"][:10])
2 |
3 | show(dataset.sel_table, notebook_url=notebook_url)
4 |
--------------------------------------------------------------------------------
/docs/snippets/py/t7-3-snorkel-crosscheck.txt:
--------------------------------------------------------------------------------
1 | from hover.recipes.experimental import snorkel_crosscheck
2 |
3 | interactive_plot = snorkel_crosscheck(dataset, LABELING_FUNCTIONS)
4 |
--------------------------------------------------------------------------------
/tests/module_config/hover_alt_config_2.ini:
--------------------------------------------------------------------------------
1 | [data.values]
2 | abstain_decoded = LABEL.ABSTAIN
3 | abstain_encoded = -2
4 |
5 | [data.columns]
6 | dataset_subset_field = subset
7 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/local-dependency.md:
--------------------------------------------------------------------------------
1 | ??? info "Dependencies for {== local environments ==}"
2 | When you run the code locally, you may need to install additional packages.
3 |
--------------------------------------------------------------------------------
/docs/snippets/py/t0-1a-vectorizer-print.txt:
--------------------------------------------------------------------------------
1 | text = dataset.dfs["raw"]().loc[0, "text"]
2 | vec = vectorizer(text)
3 | print(f"Text: {text}")
4 | print(f"Vector shape: {vec.shape}")
5 |
--------------------------------------------------------------------------------
/hover/recipes/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ???+ note "High-level functions to produce an interactive annotation interface."
3 | """
4 | from .stable import simple_annotator, linked_annotator
5 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "pip"
4 | directory: "/"
5 | schedule:
6 | interval: "daily"
7 | open-pull-requests-limit: 10
8 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/0-opener.zh.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | > 通过向量降维, 极速探索和批量标注数据, 并用作模型训练或其它用途.
4 |
--------------------------------------------------------------------------------
/docs/pages/reference/utils-bokeh_helper.md:
--------------------------------------------------------------------------------
1 | - ::: hover.utils.bokeh_helper
2 | rendering:
3 | show_root_heading: false
4 | show_root_toc_entry: false
5 | heading_level: 3
6 |
--------------------------------------------------------------------------------
/docs/pages/reference/utils-snorkel_helper.md:
--------------------------------------------------------------------------------
1 | - ::: hover.utils.snorkel_helper
2 | rendering:
3 | show_root_heading: false
4 | show_root_toc_entry: false
5 | heading_level: 3
6 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/local-dep-text.md:
--------------------------------------------------------------------------------
1 | To run the text embedding code on this page, you need:
2 | ```shell
3 | pip install spacy
4 | python -m spacy download en_core_web_md
5 | ```
6 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/5-announcements.zh.md:
--------------------------------------------------------------------------------
1 | ## :flags: 新动态
2 |
3 | - **Jan 21, 2023** 新版本 0.8.0 已就绪, 可查看 [changelog](https://github.com/phurwicz/hover/blob/main/CHANGELOG.md) 获悉详情 :partying_face:.
4 |
--------------------------------------------------------------------------------
/docs/snippets/py/tz-bokeh-notebook-remote.txt:
--------------------------------------------------------------------------------
1 | # special configuration for this remotely hosted tutorial
2 | from local_lib.binder_helper import remote_jupyter_proxy_url
3 | notebook_url = remote_jupyter_proxy_url
4 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/3-install.md:
--------------------------------------------------------------------------------
1 | ## :package: Install
2 |
3 | > Python: 3.8+
4 | >
5 | > OS: Linux & Mac & Windows
6 |
7 | PyPI: `pip install hover`
8 |
9 | Conda: `conda install -c conda-forge hover`
10 |
--------------------------------------------------------------------------------
/docs/snippets/py/t3-3-dataset-evict-patch.txt:
--------------------------------------------------------------------------------
1 | show(column(
2 | row(
3 | dataset.selection_evictor,
4 | dataset.selection_patcher,
5 | ),
6 | dataset.sel_table,
7 | ), notebook_url=notebook_url)
8 |
--------------------------------------------------------------------------------
/docs/snippets/py/tz-bokeh-notebook-common.txt:
--------------------------------------------------------------------------------
1 | from bokeh.io import show, output_notebook
2 |
3 | output_notebook()
4 |
 5 | # normally you would skip notebook_url or use the Jupyter address
6 | notebook_url = 'localhost:8888'
7 |
--------------------------------------------------------------------------------
/docs/snippets/html/stylesheet.html:
--------------------------------------------------------------------------------
1 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/docs/snippets/py/t4-3-annotator-choose-axes.txt:
--------------------------------------------------------------------------------
1 | annotator = standard_annotator(dataset)
2 |
3 | show(column(
4 | row(annotator.dropdown_x_axis, annotator.dropdown_y_axis),
5 | annotator.figure,
6 | ), notebook_url=notebook_url)
7 |
--------------------------------------------------------------------------------
/docs/snippets/py/t6-0-softlabel-figure.txt:
--------------------------------------------------------------------------------
1 | from hover.recipes.subroutine import standard_softlabel
2 | from bokeh.layouts import row, column
3 |
4 | softlabel = standard_softlabel(dataset)
5 | show(softlabel.figure, notebook_url=notebook_url)
6 |
--------------------------------------------------------------------------------
/hover/core/representation/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ???+ note "Leveraging dimensionality reduction to make 2-D representations of data points."
3 |
4 | This is intended to be useful for making interactive general-purpose data explorers.
5 | """
6 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/0-opener.en.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | > Explore and label on a map of your data.
4 | >
5 | > Get enough to feed your model in no time.
6 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/5-announcements.en.md:
--------------------------------------------------------------------------------
1 | ## :flags: Announcements
2 |
3 | - **Jan 21, 2023** version 0.8.0 is now available. Check out the [changelog](https://github.com/phurwicz/hover/blob/main/CHANGELOG.md) for details :partying_face:.
4 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/tutorial-required.md:
--------------------------------------------------------------------------------
1 | ???+ warning "This page assumes that you know the basics"
2 | i.e. simple usage of `dataset` and `annotator`. Please visit the [quickstart tutorial](/hover/pages/tutorial/t0-quickstart) if you haven't done so.
3 |
--------------------------------------------------------------------------------
/docs/snippets/py/t3-1-dataset-commit-dedup.txt:
--------------------------------------------------------------------------------
1 | from bokeh.layouts import row, column
2 |
3 | show(column(
4 | row(
5 | dataset.data_committer,
6 | dataset.dedup_trigger,
7 | ),
8 | dataset.pop_table,
9 | ), notebook_url=notebook_url)
10 |
--------------------------------------------------------------------------------
/docs/snippets/py/t7-1-snorkel-apply-button.txt:
--------------------------------------------------------------------------------
1 | from hover.recipes.subroutine import standard_snorkel
2 |
3 | snorkel_plot = standard_snorkel(dataset)
4 | snorkel_plot.subscribed_lf_list = LABELING_FUNCTIONS
5 | show(snorkel_plot.lf_apply_trigger, notebook_url=notebook_url)
6 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/0a-language-badges.md:
--------------------------------------------------------------------------------
1 | [](https://github.com/phurwicz/hover/blob/main/README.md)
2 | [](https://github.com/phurwicz/hover/blob/main/README.zh.md)
3 |
--------------------------------------------------------------------------------
/docs/snippets/py/t1-0a-vecnet-callback-print.txt:
--------------------------------------------------------------------------------
1 | # predict_proba accepts individual strings or list
2 | # text -> vector -> class probabilities
 3 | # if there are no classes right now, you will see an empty list
4 | print(vecnet.predict_proba(text))
5 | print(vecnet.predict_proba([text]))
6 |
--------------------------------------------------------------------------------
/hover/recipes/local_config.py:
--------------------------------------------------------------------------------
1 | import hover
2 | from hover.config_constants import (
3 | ConfigSection as Section,
4 | ConfigKey as Key,
5 | )
6 |
7 | DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][
8 | Key.DEFAULT_REDUCTION_METHOD
9 | ]
10 |
--------------------------------------------------------------------------------
/hover/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility functions or classes, for example:
3 |
4 | (1) connectors to another library like Torch/Snorkel/etc.
5 | (2) optional data structures that work smoothly with the core module.
6 | (3) supportive subroutines such as logging templates.
7 | """
8 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 | core: central functionalities involved in most, if not all, use cases.
4 | lite: fast to run and no platform-specific extra dependency.
5 | benchmark: about performance rather than correctness.
6 | builtin: 'vanilla' extension beyond the core.
7 |
--------------------------------------------------------------------------------
/tests/local_config.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | PSEUDO_LABELS = ["A", "B"]
4 |
5 | VECTORIZER_BREAKER = "VECTORIZER_FALLS_APART"
6 |
7 |
8 | def RANDOM_LABEL(row):
9 | return random.choice(PSEUDO_LABELS)
10 |
11 |
12 | def RANDOM_SCORE(row):
13 | return random.uniform(0.2, 1.0)
14 |
--------------------------------------------------------------------------------
/docs/snippets/py/g0-1-url-to-content.txt:
--------------------------------------------------------------------------------
1 | import requests
2 | from functools import lru_cache
3 |
4 | @lru_cache(maxsize=10000)
5 | def url_to_content(url):
6 | """
7 | Turn a URL to response content.
8 | """
9 | response = requests.get(url)
10 | return response.content
11 |
--------------------------------------------------------------------------------
/docs/snippets/py/g0-2-url-to-image.txt:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | from io import BytesIO
3 |
4 | @lru_cache(maxsize=10000)
5 | def url_to_image(url):
6 | """
7 | Turn a URL to a PIL Image.
8 | """
9 | img = Image.open(BytesIO(url_to_content(url))).convert("RGB")
10 | return img
11 |
--------------------------------------------------------------------------------
/docs/snippets/py/t5-0-finder-filter.txt:
--------------------------------------------------------------------------------
1 | from hover.recipes.subroutine import standard_finder
2 | from bokeh.layouts import row, column
3 |
4 | finder = standard_finder(dataset)
5 | show(row(
6 | column(finder.search_pos, finder.search_neg),
7 | finder.search_filter_box,
8 | ), notebook_url=notebook_url)
9 |
--------------------------------------------------------------------------------
/docs/snippets/py/tz-bokeh-show-notebook.txt:
--------------------------------------------------------------------------------
1 | # ---------- NOTEBOOK MODE: for your actual Jupyter environment ---------
2 | # this code will render the entire plot in Jupyter
3 | # from bokeh.io import show, output_notebook
4 | # output_notebook()
5 | # show(interactive_plot, notebook_url='https://localhost:8888')
6 |
--------------------------------------------------------------------------------
/docs/snippets/py/g1-1-url-to-audio.txt:
--------------------------------------------------------------------------------
1 | import librosa
2 | from io import BytesIO
3 |
4 | @lru_cache(maxsize=10000)
5 | def url_to_audio(url):
6 | """
7 | Turn a URL to audio data.
8 | """
9 | data, sampling_rate = librosa.load(BytesIO(url_to_content(url)))
10 | return data, sampling_rate
11 |
--------------------------------------------------------------------------------
/docs/snippets/py/t0-2-reduction.txt:
--------------------------------------------------------------------------------
1 | # any kwargs will be passed onto the corresponding reduction
2 | # for umap: https://umap-learn.readthedocs.io/en/latest/parameters.html
3 | # for ivis: https://bering-ivis.readthedocs.io/en/latest/api.html
4 | reducer = dataset.compute_nd_embedding(vectorizer, "umap", dimension=2)
5 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # automation
2 | tox
3 | tox-gh-actions
4 | pre-commit
5 | # documentation
6 | mkdocs
7 | mkdocs-material
8 | mkdocs-macros-plugin
9 | mkdocs-static-i18n
10 | mkdocstrings
11 | mkdocstrings-python
12 | markdown-include
13 | mike
14 | # interactive development
15 | jupyter
16 | jupyterlab
17 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/4-resources.zh.md:
--------------------------------------------------------------------------------
1 | ## :book: 资料
2 |
3 | - [教程](https://phurwicz.github.io/hover/pages/tutorial/t0-quickstart/)
4 | - [Binder仓库](https://github.com/phurwicz/hover-binder)
5 | - [版本说明](https://github.com/phurwicz/hover/blob/main/CHANGELOG.md)
6 | - [文档](https://phurwicz.github.io/hover/)
7 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/0c-intro.zh.md:
--------------------------------------------------------------------------------
1 | `hover` 是一个批量标注数据的工具, 只需数据能被向量表示.
2 |
3 | - 标注过程很简单, 如同给散点图上色.
4 | - 通过移动鼠标和框选, 来观察数据(在降维后的)点簇.
5 | - 使用小工具(如搜索/过滤/规则/主动学习)来提升精度.
6 | - 输入合适的标签, 并点击"Apply"按钮, 即可标注!
7 |
8 | 
9 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/dataset-prep.md:
--------------------------------------------------------------------------------
1 | As always, start with a ready-for-plot dataset:
2 |
3 |
4 | {!docs/snippets/py/tz-dataset-text-full.txt!}
5 |
6 |
7 |
8 | {!docs/snippets/py/t0-1-vectorizer.txt!}
9 |
10 | {!docs/snippets/py/t0-2-reduction.txt!}
11 |
12 |
--------------------------------------------------------------------------------
/docs/snippets/py/t4-0-annotator-basics.txt:
--------------------------------------------------------------------------------
1 | from hover.recipes.subroutine import standard_annotator
2 | from bokeh.layouts import row, column
3 |
4 | annotator = standard_annotator(dataset)
5 | show(column(
6 | row(annotator.annotator_input, annotator.annotator_apply),
7 | annotator.figure,
8 | ), notebook_url=notebook_url)
9 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/4-resources.en.md:
--------------------------------------------------------------------------------
1 | ## :book: Resources
2 |
3 | - [Tutorials](https://phurwicz.github.io/hover/pages/tutorial/t0-quickstart/)
4 | - [Binder repo](https://github.com/phurwicz/hover-binder)
5 | - [Changelog](https://github.com/phurwicz/hover/blob/main/CHANGELOG.md)
6 | - [Documentation](https://phurwicz.github.io/hover/)
7 |
--------------------------------------------------------------------------------
/docs/pages/reference/recipes.md:
--------------------------------------------------------------------------------
1 | # `hover.recipes`
2 |
3 | - ::: hover.recipes.stable
4 | rendering:
5 | show_root_heading: true
6 |
7 | ---
8 |
9 | - ::: hover.recipes.experimental
10 | rendering:
11 | show_root_heading: true
12 |
13 | ---
14 |
15 | - ::: hover.recipes.subroutine
16 | rendering:
17 | show_root_heading: true
18 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/1-live-demos.zh.md:
--------------------------------------------------------------------------------
1 | ## :rocket: 在线演示
2 |
3 | ### [**Notebook教程**](https://phurwicz.github.io/hover/pages/tutorial/t0-quickstart/)
4 |
5 | - 查看含代码的教程, 可在浏览器中编辑和运行, 无需安装依赖.
6 |
7 | ### [**示例标注界面**](https://mybinder.org/v2/gh/phurwicz/hover-binder/master?urlpath=/proxy/5006/app-simple-annotator)
8 |
9 | - 跳过所有代码, 进入托管在Binder上的标注界面.
10 |
--------------------------------------------------------------------------------
/tests/utils/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from bokeh.plotting import figure
3 |
4 |
@pytest.fixture
def dummy_working_recipe():
    """Provide a stand-in recipe that always succeeds with an empty bokeh figure."""

    def _succeeding_recipe(*unused_args, **unused_kwargs):
        # accept any call signature, like a real recipe would
        return figure()

    return _succeeding_recipe
11 |
12 |
@pytest.fixture
def dummy_broken_recipe():
    """Provide a stand-in recipe that always fails with an AssertionError."""

    def _failing_recipe(*unused_args, **unused_kwargs):
        # deliberately broken, for exercising error-handling paths
        assert False

    return _failing_recipe
19 |
--------------------------------------------------------------------------------
/docs/snippets/py/t1-0-vecnet-callback.txt:
--------------------------------------------------------------------------------
1 | from hover.core.neural import VectorNet
2 | from hover.utils.common_nn import LogisticRegression
3 |
4 | # Create a model with vectorizer-NN architecture.
5 | # model.pt will point to a PyTorch state dict (to be created)
6 | # the label classes in the dataset can change, and vecnet can adjust to that
7 | vecnet = VectorNet(vectorizer, LogisticRegression, "model.pt", dataset.classes)
8 |
--------------------------------------------------------------------------------
/docs/snippets/py/t0-1-vectorizer.txt:
--------------------------------------------------------------------------------
1 | import spacy
2 | import re
3 | from functools import lru_cache
4 |
5 | # use your preferred embedding for the task
6 | nlp = spacy.load("en_core_web_md")
7 |
8 | # raw data (str in this case) -> np.array
9 | @lru_cache(maxsize=int(1e+4))
10 | def vectorizer(text):
11 | clean_text = re.sub(r"[\s]+", r" ", str(text))
12 | return nlp(clean_text, disable=nlp.pipe_names).vector
13 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/wrappy-cache.md:
--------------------------------------------------------------------------------
1 | ???+ info "Caching and reading from disk"
2 | This guide uses [`@wrappy.memoize`](https://erniethornhill.github.io/wrappy/) in place of `@functools.lru_cache` for caching.
3 |
4 | - The benefit is that `wrappy.memoize` can persist the cache to disk, speeding up code across sessions.
5 |
6 | Cached values for this guide have been pre-computed, making it much faster to run the guide.
7 |
--------------------------------------------------------------------------------
/hover/core/representation/local_config.py:
--------------------------------------------------------------------------------
1 | import hover
2 | from hover.config_constants import (
3 | ConfigSection as Section,
4 | ConfigKey as Key,
5 | )
6 |
# translation table for reduction kwargs: the same hover-facing kwarg
# ("dimension") maps to a different parameter name in each backend library.
KWARG_TRANSLATOR = {
    "dimension": {
        "umap": "n_components",
        "ivis": "embedding_dims",
    },
}

# default dimensionality-reduction backend, read once from hover's global config
DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][
    Key.DEFAULT_REDUCTION_METHOD
]
17 |
--------------------------------------------------------------------------------
/docs/snippets/py/g1-2-audio-vectorizer.txt:
--------------------------------------------------------------------------------
1 | import wrappy
2 |
3 | @wrappy.memoize(cache_limit=10000, persist_path='custom_cache/audio_url_to_vector.pkl')
4 | def vectorizer(url):
5 | """
6 | Averaged MFCC over time.
7 | Resembles word-embedding-average-as-doc-embedding for texts.
8 | """
9 | y, sr = url_to_audio(url)
10 | mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=32)
11 | return mfcc.mean(axis=1)
12 |
--------------------------------------------------------------------------------
/docs/snippets/py/g2-0-color-palette.txt:
--------------------------------------------------------------------------------
1 | import hover
2 | from hover.utils.bokeh_helper import auto_label_color
3 | from rich.console import Console
4 |
5 | console = Console()
6 | labels = ["A", "B", "C", "D", "E", "F"]
7 | color_dict = auto_label_color(labels)
8 | abstain = hover.config['data.values']['abstain_decoded']
9 |
10 | for _label in [abstain, *labels]:
11 | console.print(f"\u2b24{_label}", style=color_dict[_label])
12 |
--------------------------------------------------------------------------------
/tests/core/representation/test_manifold.py:
--------------------------------------------------------------------------------
1 | from hover.core.representation.manifold import LayerwiseManifold
2 | import numpy as np
3 |
4 |
def test_LayerwiseManifold(distance_preserving_array_sequence):
    """Procrustes disparities should vanish when layers preserve distances."""
    manifold = LayerwiseManifold(distance_preserving_array_sequence)
    manifold.unfold(method="umap", random_state=0, transform_seed=0)
    _, disparities = manifold.procrustes()
    # every pairwise disparity must be numerically zero
    for _disparity in disparities:
        assert _disparity < 1e-16
10 |
--------------------------------------------------------------------------------
/docs/pipelines/requirements-doc-scripts.txt:
--------------------------------------------------------------------------------
1 | # auto-parse scripts in markdown files
2 | markdown
3 | markdown-include>=0.7.0
4 | # Jupyter environment
5 | jupyter
6 | jupyterlab
7 | # dependencies for specific code
8 | ## distant supervision
9 | snorkel>=0.9.8
10 | ## text vectorizer
11 | spacy
12 | ## image vectorizer
13 | efficientnet_pytorch
14 | torchvision
15 | ## audio handling
16 | librosa
17 | ## disk-persistent caching
18 | wrappy>=0.2.6
19 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/local-dep-jupyter-bokeh.md:
--------------------------------------------------------------------------------
1 | To render `bokeh` plots in Jupyter, you need:
2 | ```shell
3 | pip install jupyter_bokeh
4 | ```
5 |
6 | If you are using JupyterLab older than 3.0, use this instead ([reference](https://pypi.org/project/jupyter-bokeh/)):
7 | ```shell
8 | jupyter labextension install @jupyter-widgets/jupyterlab-manager
9 | jupyter labextension install @bokeh/jupyter_bokeh
10 | ```
11 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/0c-intro.en.md:
--------------------------------------------------------------------------------
1 | `hover` is a tool for mass-labeling data points that can be represented by vectors.
2 |
3 | - Labeling is as easy as coloring a scatter plot.
4 | - Hover your mouse and lasso-select to inspect any cluster.
5 | - Use a variety of widgets to narrow down further.
6 | - Enter a suitable label and hit "Apply"!
7 |
8 | 
9 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/component-tutorial.md:
--------------------------------------------------------------------------------
1 | ???+ warning "This page addresses **single components** of `hover`"
2 | For illustration, we are using code snippets to pick out specific widgets so that the documentation can explain what they do.
3 |
4 | - Please be aware that you won't need to get the widgets by code in an actual use case.
5 | - Typical usage deals with [recipes](../../tutorial/t1-active-learning) where the individual parts have been tied together.
6 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/1-live-demos.en.md:
--------------------------------------------------------------------------------
1 | ## :rocket: Live Demos
2 |
3 | ### [**With code**](https://phurwicz.github.io/hover/pages/tutorial/t0-quickstart/)
4 |
5 | - edit & run code in your browser to get a labeling interface, with guides along the way.
6 |
7 | ### [**Without code**](https://mybinder.org/v2/gh/phurwicz/hover-binder/master?urlpath=/proxy/5006/app-simple-annotator)
8 |
9 | - go directly to an example labeling interface hosted on Binder.
10 |
--------------------------------------------------------------------------------
/hover/utils/metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def classification_accuracy(true, pred):
    """
    Accuracy measure on two arrays. Intended for classification problems.
    :param true: true labels.
    :type true: Numpy array
    :param pred: predicted labels.
    :type pred: Numpy array
    :raises ValueError: if the arrays differ in length or are empty.
    """
    # raise instead of assert: assert statements are stripped under `python -O`
    if true.shape[0] != pred.shape[0]:
        raise ValueError(
            f"Length mismatch: {true.shape[0]} true vs {pred.shape[0]} predicted labels"
        )
    if true.shape[0] == 0:
        # the original would hit a ZeroDivisionError here; fail loudly instead
        raise ValueError("Cannot compute accuracy on empty arrays")
    # elementwise equality; the mean of the boolean array is the accuracy
    return float(np.equal(true, pred).mean())
15 |
--------------------------------------------------------------------------------
/tests/utils/test_metrics.py:
--------------------------------------------------------------------------------
1 | from hover.utils.metrics import classification_accuracy
2 | import numpy as np
3 | import pytest
4 |
5 |
@pytest.mark.lite
def test_classification_accuracy():
    """Accuracy should be correct and symmetric in its two arguments."""
    labels = np.array([1, 2, 3, 4, 5, 6, 7, 7])
    guesses = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    # exactly one of eight entries differs
    expected = 7 / 8
    assert np.allclose(classification_accuracy(labels, guesses), expected)
    assert np.allclose(classification_accuracy(guesses, labels), expected)
14 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/binder-kernel.md:
--------------------------------------------------------------------------------
1 | ???+ info "Running Python right here"
2 | Think of this page as *almost* a Jupyter notebook. You can edit code and press `Shift+Enter` to execute.
3 |
4 | Behind the scene is a [Binder](https://mybinder.org/)-hosted Python environment. Below is the status of the kernel:
5 |
6 |
7 | To download a notebook file instead, visit [here](https://github.com/phurwicz/hover/tree/main/docs/pipelines/generated).
8 |
--------------------------------------------------------------------------------
/hover/core/explorer/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ???+ note "Interactive graphical interfaces based on Bokeh."
3 | """
4 | from .specialization import (
5 | BokehTextFinder,
6 | BokehTextAnnotator,
7 | BokehTextSoftLabel,
8 | BokehTextMargin,
9 | BokehTextSnorkel,
10 | BokehAudioFinder,
11 | BokehAudioAnnotator,
12 | BokehAudioSoftLabel,
13 | BokehAudioMargin,
14 | BokehAudioSnorkel,
15 | BokehImageFinder,
16 | BokehImageAnnotator,
17 | BokehImageSoftLabel,
18 | BokehImageMargin,
19 | BokehImageSnorkel,
20 | )
21 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.1.0
4 | hooks:
5 | - id: check-yaml
6 | exclude: |
7 | (?x)^(
8 | conda-recipe/meta.yaml|
9 | conda-recipe/stable.yaml
10 | )$
11 | - id: end-of-file-fixer
12 | - id: trailing-whitespace
13 | - repo: https://github.com/psf/black
14 | rev: 22.1.0
15 | hooks:
16 | - id: black
17 | - repo: https://gitlab.com/pycqa/flake8
18 | rev: 3.9.2
19 | hooks:
20 | - id: flake8
21 |
--------------------------------------------------------------------------------
/docs/snippets/py/tz-bokeh-show-server.txt:
--------------------------------------------------------------------------------
1 | # ---------- SERVER MODE: for the documentation page ----------
2 | # because this tutorial is remotely hosted, we need explicit serving to expose the plot to you
3 | from local_lib.binder_helper import binder_proxy_app_url
4 | from bokeh.server.server import Server
5 | server = Server({'/my-app': interactive_plot}, port=5007, allow_websocket_origin=['*'], use_xheaders=True)
6 | server.start()
7 | # visit this URL printed in cell output to see the interactive plot; locally you would just do "https://localhost:5007/my-app"
8 | binder_proxy_app_url('my-app', port=5007)
9 |
--------------------------------------------------------------------------------
/docs/snippets/html/thebe.html:
--------------------------------------------------------------------------------
1 |
2 |
22 |
--------------------------------------------------------------------------------
/docs/snippets/py/g1-0-dataset-audio.txt:
--------------------------------------------------------------------------------
1 | from hover.core.dataset import SupervisableAudioDataset
2 | import pandas as pd
3 |
4 | # this is a table of audio-MNIST (pronounced digit 0-9) urls, 100 audios per digit
5 | example_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.7.0/audio_mnist.csv"
6 | df = pd.read_csv(example_csv_path).sample(frac=1).reset_index(drop=True)
7 | df["SUBSET"] = "raw"
8 | df.loc[500:800, 'SUBSET'] = 'train'
9 | df.loc[800:900, 'SUBSET'] = 'dev'
10 | df.loc[900:, 'SUBSET'] = 'test'
11 |
12 | dataset = SupervisableAudioDataset.from_pandas(df, feature_key="audio", label_key="label")
13 |
--------------------------------------------------------------------------------
/docs/snippets/py/g0-0-dataset-image.txt:
--------------------------------------------------------------------------------
1 | from hover.core.dataset import SupervisableImageDataset
2 | import pandas as pd
3 |
4 | # this is a 1000-image-url set of ImageNet data
5 | # with custom labels: animal, object, food
6 | example_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.7.0/imagenet_custom.csv"
7 | df = pd.read_csv(example_csv_path).sample(frac=1).reset_index(drop=True)
8 | df["SUBSET"] = "raw"
9 | df.loc[500:800, 'SUBSET'] = 'train'
10 | df.loc[800:900, 'SUBSET'] = 'dev'
11 | df.loc[900:, 'SUBSET'] = 'test'
12 |
13 | dataset = SupervisableImageDataset.from_pandas(df, feature_key="image", label_key="label")
14 |
--------------------------------------------------------------------------------
/docs/pipelines/README.md.template:
--------------------------------------------------------------------------------
1 | {!docs/snippets/markdown/readme/0-opener..md!}
2 |
3 | {!docs/snippets/markdown/readme/0a-language-badges.md!}
4 |
5 | {!docs/snippets/markdown/readme/0b-status-badges.md!}
6 |
7 | {!docs/snippets/markdown/readme/0c-intro..md!}
8 |
9 | {!docs/snippets/markdown/readme/1-live-demos..md!}
10 |
11 | {!docs/snippets/markdown/readme/2-features..md!}
12 |
13 | {!docs/snippets/markdown/readme/3-install.md!}
14 |
15 | {!docs/snippets/markdown/readme/4-resources..md!}
16 |
17 | {!docs/snippets/markdown/readme/5-announcements..md!}
18 |
19 | {!docs/snippets/markdown/readme/6-remarks..md!}
20 |
--------------------------------------------------------------------------------
/hover/utils/typecheck.py:
--------------------------------------------------------------------------------
class TypedValueDict(dict):
    """
    A dict that only allows values of a certain type.
    """

    def __init__(self, type_, *args, **kwargs):
        """
        :param type_: the type (or tuple of types) every value must satisfy.
        """
        self._type = type_
        super().__init__(*args, **kwargs)
        # validate values supplied at construction time, which the
        # previous implementation silently accepted unchecked
        for _value in self.values():
            self.typecheck(_value)

    def __setitem__(self, key, value):
        self.typecheck(value)
        super().__setitem__(key, value)

    def setdefault(self, key, default=None):
        # dict.setdefault would bypass __setitem__; enforce the check here
        if key not in self:
            self.typecheck(default)
        return super().setdefault(key, default)

    def typecheck(self, value):
        """Raise TypeError unless value is an instance of the required type."""
        if not isinstance(value, self._type):
            raise TypeError(f"Value must be of type {self._type}, got {type(value)}")

    def update(self, other=(), **kwargs):
        """
        Typechecked update supporting the full dict.update contract:
        a mapping, an iterable of key/value pairs, and/or keyword arguments.
        """
        incoming = dict(other)
        incoming.update(kwargs)
        # check everything first so a failure leaves self unmodified
        for _value in incoming.values():
            self.typecheck(_value)
        super().update(incoming)
22 |
--------------------------------------------------------------------------------
/docs/snippets/py/t0-0-dataset-text.txt:
--------------------------------------------------------------------------------
1 | from hover.core.dataset import SupervisableTextDataset
2 | import pandas as pd
3 |
4 | example_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.5.0/20_newsgroups_raw.csv"
5 | # for fast, low-memory demonstration purpose, sample the data
6 | df_raw = pd.read_csv(example_csv_path).sample(1000)
7 | df_raw["text"] = df_raw["text"].astype(str)
8 |
9 | # data is divided into 4 subsets: "raw" / "train" / "dev" / "test"
10 | # this example assumes no labeled data available, i.e. only "raw"
11 | df_raw["SUBSET"] = "raw"
12 |
13 | # this class stores the dataset throughout the labeling process
14 | dataset = SupervisableTextDataset.from_pandas(df_raw, feature_key="text", label_key="label")
15 |
--------------------------------------------------------------------------------
/tests/utils/test_common_nn.py:
--------------------------------------------------------------------------------
1 | from hover.utils.common_nn import MLP, LogisticRegression
2 | import numpy as np
3 | import torch
4 | import pytest
5 |
6 |
def architecture_subroutine(architecture, dim_inp=300, dim_out=2, num_vecs=10):
    """
    Test a specific architecture: the forward pass and the last per-layer
    evaluation must both produce (num_vecs, dim_out) outputs.
    """
    model = architecture(dim_inp, dim_out)
    batch = torch.Tensor(np.random.rand(num_vecs, dim_inp))
    expected_shape = (num_vecs, dim_out)
    assert model(batch).shape == expected_shape
    assert model.eval_per_layer(batch)[-1].shape == expected_shape
17 |
18 |
@pytest.mark.lite
def test_MLP():
    # smoke-test the multi-layer perceptron architecture with default dims
    architecture_subroutine(MLP)
22 |
23 |
@pytest.mark.lite
def test_LR():
    # smoke-test the logistic regression architecture with default dims
    architecture_subroutine(LogisticRegression)
27 |
--------------------------------------------------------------------------------
/hover/core/explorer/local_config.py:
--------------------------------------------------------------------------------
1 | import hover
2 | from hover.config_constants import (
3 | ConfigSection as Section,
4 | ConfigKey as Key,
5 | )
6 |
# dataframe column names used for per-point plot styling,
# read once from hover's global config
SOURCE_COLOR_FIELD = hover.config[Section.DATA_COLUMNS][Key.SOURCE_COLOR_FIELD]
SOURCE_ALPHA_FIELD = hover.config[Section.DATA_COLUMNS][Key.SOURCE_ALPHA_FIELD]
SEARCH_SCORE_FIELD = hover.config[Section.DATA_COLUMNS][Key.SEARCH_SCORE_FIELD]

# CSS style string applied to image tooltips
TOOLTIP_IMG_STYLE = hover.config[Section.VISUAL][Key.TOOLTIP_IMG_STYLE]

SEARCH_MATCH_HEXCOLOR = hover.config[Section.VISUAL][Key.SEARCH_MATCH_HEXCOLOR]
DATAPOINT_BASE_SIZE = hover.config[Section.VISUAL][Key.DATAPOINT_BASE_SIZE]
# (glyph attribute name, enlarged size, shrunken size, neutral size)
# NOTE(review): ordering presumed from the +3/-2/base values — confirm against
# the explorer code that consumes this tuple
SEARCH_DATAPOINT_SIZE_PARAMS = (
    "size",
    DATAPOINT_BASE_SIZE + 3,
    DATAPOINT_BASE_SIZE - 2,
    DATAPOINT_BASE_SIZE,
)
21 |
--------------------------------------------------------------------------------
/tests/module_config/hover_alt_config_1.ini:
--------------------------------------------------------------------------------
1 | [visual]
2 | abstain_hexcolor = #b0b0b0
3 | bokeh_palette = ["#b0ffff", "#ffb0ff", "#ffffb0", "#b0b0ff", "#b0ffb0", "#ffb0b0", "#a0eeee", "#eea0ee", "#eeeea0", "#a0a0ee", "#a0eea0", "#eea0a0", "#90dddd", "#dd90dd", "#dddd90", "#9090dd", "#90dd90", "#dd9090", "#80cccc", "#cc80cc", "#cccc80", "#8080cc", "#80cc80", "#cc8080"]
4 |
5 | [backend]
6 | dataframe_library = polars
7 |
8 | [data.embedding]
9 | default_reduction_method = ivis
10 |
11 | [data.values]
12 | abstain_decoded = label.abstain
13 | abstain_encoded = -2
14 |
15 | [data.columns]
16 | encoded_label_key = LABEL_ENCODED
17 | dataset_subset_field = __SUBSET__
18 | embedding_field_prefix = EMBED_
19 | source_color_field = __SOURCE_COLOR__
20 | source_alpha_field = __SOURCE_ALPHA__
21 | search_score_field = __SOURCE_SEARCH_SCORE__
22 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/jupyterlab-js-issue.md:
--------------------------------------------------------------------------------
1 | ??? info "Showcase widgets here are not interactive"
2 | {== Plotted widgets **on this page** are not interactive, but only for illustration. ==}
3 |
4 | Widgets {== will be interactive when you actually use them ==} (in your local environment or server apps like in the quickstart).
5 |
6 | - be sure to use a whole `recipe` rather than individual widgets.
7 | - if you really want to plot interactive widgets on their own, try `from hover.utils.bokeh_helper import show_as_interactive as show` instead of `from bokeh.io import show`.
8 | - this works in your own environment but still not on the documentation page.
9 | - [`show_as_interactive`](/hover/pages/reference/utils-bokeh_helper/#hover.utils.bokeh_helper.show_as_interactive) is a simple tweak of `bokeh.io.show` by turning standalone LayoutDOM to an application.
10 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/6-remarks.zh.md:
--------------------------------------------------------------------------------
1 | ## :bell: 其它说明
2 |
3 | ### 鸣谢和推荐
4 |
5 | - 我们推荐 [`Bokeh`](https://bokeh.org) 可视化框架, `hover`正是基于它的图表同步和回调函数来实现非常重要的功能.
6 | - 感谢 [Philip Vollet](https://de.linkedin.com/in/philipvollet) 在`hover`的迭代早期 无偿地帮助在开源社区内推广.
7 |
8 | ### 提供贡献
9 |
10 | - 我们欢迎任何反馈, **特别是使用中的痛点!**
11 | - `./requirements-dev.txt` 列出了开发者所需的依赖.
12 | - 我们建议在提交PR前启用[.pre-commit-config.yaml](https://github.com/phurwicz/hover/blob/main/.pre-commit-config.yaml)中列出的pre-commit hook.
13 |
14 | ### 引用
15 |
16 | 如果`hover`对您的工作有帮助, 请[告诉我们](https://github.com/phurwicz/hover/discussions)或引用 :hugs:
17 |
18 | ```tex
19 | @misc{hover,
20 | title={{hover}: label data at scale},
21 | url={https://github.com/phurwicz/hover},
22 | note={Open software from https://github.com/phurwicz/hover},
23 | author={
24 | Pavel Hurwicz and
25 | Haochuan Wei},
26 | year={2021},
27 | }
28 | ```
29 |
--------------------------------------------------------------------------------
/tests/utils/test_typecheck.py:
--------------------------------------------------------------------------------
1 | from hover.utils.typecheck import TypedValueDict
2 | from collections import defaultdict
3 |
4 |
class TestTypedValueDict:
    """Behavioral checks for TypedValueDict."""

    def test_basic(self):
        # valid values of the declared type are accepted
        holder = TypedValueDict(int)
        holder["key1"] = 1
        assert holder["key1"] == 1

        holder.update({"key2": 2, "key3": 3})
        assert holder["key2"] == 2
        assert holder["key3"] == 3

        # a value of the wrong type must be rejected
        caught = False
        try:
            holder["key4"] = "4"
        except TypeError:
            caught = True
        assert caught, "Should have raised TypeError"

    def test_subclass(self):
        # instances of subclasses of the declared type are accepted
        holder = TypedValueDict(dict)
        holder["key1"] = {"foo": "bar"}
        assert holder["key1"] == {"foo": "bar"}

        default_dict = defaultdict(str)
        holder.update({"key2": default_dict})
        assert holder["key2"] is default_dict
--------------------------------------------------------------------------------
/.github/workflows/cross-os-conda-build.yml:
--------------------------------------------------------------------------------
1 | # This workflow will build the latest source files in Anaconda.
2 |
3 | name: Cross-OS Conda Build
4 |
5 | on:
6 | schedule:
7 | - cron: "0 0 * * 2,5"
8 | workflow_dispatch:
9 |
10 | jobs:
11 | conda-build:
12 |
13 | runs-on: ${{ matrix.os }}
14 | strategy:
15 | fail-fast: false
16 | matrix:
17 | python-version: ['3.8', '3.10']
18 | os: [ubuntu-latest, macos-latest, windows-latest]
19 |
20 | steps:
21 | - uses: actions/checkout@v3
22 | - uses: conda-incubator/setup-miniconda@v2
23 | with:
24 | python-version: ${{ matrix.python-version }}
25 |
26 | - name: Conda build
27 | run: |
28 | conda update conda
29 | conda install conda-build
30 | conda build --channel conda-forge --channel pytorch --override-channels --output-folder ./conda-out/ ./conda-recipe/
31 |
--------------------------------------------------------------------------------
/hover/module_config.py:
--------------------------------------------------------------------------------
1 | import hover
2 | from .config_constants import (
3 | ConfigSection as Section,
4 | ConfigKey as Key,
5 | )
6 | from .utils.dataframe import (
7 | PandasDataframe,
8 | PolarsDataframe,
9 | )
10 |
# dataframe implementation
# hover can back its datasets with either pandas or polars;
# the choice is made once at import time from the global config
DataFrame = (
    PandasDataframe
    if hover.config[Section.BACKEND][Key.DATAFRAME_LIBRARY].lower() == "pandas"
    else PolarsDataframe
)

# constants for the abstain mechanism
# decoded (string) and encoded (numeric) forms of the "no label" placeholder,
# plus the hex color used to render abstaining points
ABSTAIN_DECODED = hover.config[Section.DATA_VALUES][Key.ABSTAIN_DECODED]
ABSTAIN_ENCODED = hover.config[Section.DATA_VALUES][Key.ABSTAIN_ENCODED]
ABSTAIN_HEXCOLOR = hover.config[Section.VISUAL][Key.ABSTAIN_HEXCOLOR]

# constants for label encoding mechanism
# dataframe column that stores integer-encoded labels
ENCODED_LABEL_KEY = hover.config[Section.DATA_COLUMNS][Key.ENCODED_LABEL_KEY]

# constants for saving work
# directory where dataset exports are written
DATA_SAVE_DIR = hover.config[Section.IO][Key.DATA_SAVE_DIR]
28 |
--------------------------------------------------------------------------------
/.github/workflows/handle-inactive.yml:
--------------------------------------------------------------------------------
1 | name: Handle inactive issues / PRs
2 | on:
3 | schedule:
4 | - cron: "0 12 * * *"
5 | workflow_dispatch:
6 |
7 | jobs:
8 | check-issues-and-prs:
9 | runs-on: ubuntu-latest
10 | permissions:
11 | issues: write
12 | pull-requests: write
13 | steps:
14 | - uses: actions/stale@v3
15 | with:
16 | days-before-issue-stale: 30
17 | days-before-issue-close: 14
18 | any-of-issue-labels: "solved pending confirmation,invalid,wontfix"
19 | stale-issue-label: "stale"
20 | stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
21 | close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
22 | days-before-pr-stale: -1
23 | days-before-pr-close: -1
24 | repo-token: ${{ secrets.GITHUB_TOKEN }}
25 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/0b-status-badges.md:
--------------------------------------------------------------------------------
1 | [](https://pypi.org/project/hover/)
2 | [](https://github.com/conda-forge/hover-feedstock)
3 | 
4 | 
5 | 
6 | 
7 | 
8 |
--------------------------------------------------------------------------------
/docs/snippets/py/g0-3-image-vectorizer.txt:
--------------------------------------------------------------------------------
1 | import torch
2 | import wrappy
3 | from efficientnet_pytorch import EfficientNet
4 | from torchvision import transforms
5 |
6 | # EfficientNet is a series of pre-trained models
7 | # https://github.com/lukemelas/EfficientNet-PyTorch
8 | effnet = EfficientNet.from_pretrained("efficientnet-b0")
9 | effnet.eval()
10 |
11 | # standard transformations for ImageNet-trained models
12 | tfms = transforms.Compose(
13 | [
14 | transforms.Resize(224),
15 | transforms.ToTensor(),
16 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
17 | ]
18 | )
19 |
20 | @wrappy.memoize(cache_limit=10000, persist_path='custom_cache/image_url_to_vector.pkl')
21 | def vectorizer(url):
22 | """
23 | Using logits on ImageNet-1000 classes.
24 | """
25 | img = tfms(url_to_image(url)).unsqueeze(0)
26 |
27 | with torch.no_grad():
28 | outputs = effnet(img)
29 |
30 | return outputs.detach().numpy().flatten()
31 |
--------------------------------------------------------------------------------
/docs/pipelines/local_helper.py:
--------------------------------------------------------------------------------
1 | """
2 | Local library for shared functions.
3 | """
4 | from rich.console import Console
5 |
6 |
def batch_routine(func, name_to_file_path_dict):
    """
    Run a function on a collection of files, reporting every failure
    before raising a single RuntimeError at the end.
    """
    console = Console()
    failed_names = []

    for name, path in name_to_file_path_dict.items():
        console.print(f"==== Running {func.__name__} on {name} ====")
        script, process = func(name, path)

        if process.returncode != 0:
            # record and report, but keep going so all failures surface
            failed_names.append(name)
            console.print(
                f"!!!! Error from {func.__name__} on {name} !!!!", style="red bold"
            )
            console.print(f"{script}\n\n", style="blue")
            console.print(f"{process.stderr}\n\n", style="red")

    if failed_names:
        raise RuntimeError("Script test failed.")
30 |
--------------------------------------------------------------------------------
/docs/pipelines/generate_readme.py:
--------------------------------------------------------------------------------
1 | import os
2 | from markdown_include.include import MarkdownInclude, IncludePreprocessor
3 |
README_TEMPLATE_PATH = os.path.join(os.path.dirname(__file__), "README.md.template")
# NOTE(review): an empty placeholder makes `str.replace` insert the language
# code between every character of the template -- this looks like a tag-like
# token lost during extraction; confirm against README.md.template.
LANGUAGE_PLACEHOLDER = ""
# languages to render; the default language gets the bare "README.md" filename
LANGS = ["en", "zh"]
DEFAULT_LANG = "en"
8 |
9 |
def main():
    """Render README.md.template into one README file per configured language."""
    with open(README_TEMPLATE_PATH, "r") as f:
        template = f.read()
    include = MarkdownInclude()
    preprocessor = IncludePreprocessor(template, include.getConfigs())

    base_dir = os.path.dirname(__file__)
    for lang in LANGS:
        # the default language owns the canonical README.md name
        filename = "README.md" if lang == DEFAULT_LANG else f"README.{lang}.md"
        readme_path = os.path.join(base_dir, filename)
        localized_lines = template.replace(LANGUAGE_PLACEHOLDER, lang).split("\n")
        transformed = "\n".join(preprocessor.run(localized_lines))
        with open(readme_path, "w") as f:
            f.write(transformed)
        print(f"Generated {readme_path} for language {lang}.")


if __name__ == "__main__":
    main()
29 |
--------------------------------------------------------------------------------
/docs/snippets/py/tz-dataset-text-full.txt:
--------------------------------------------------------------------------------
from hover.core.dataset import SupervisableTextDataset
import pandas as pd

raw_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.5.0/20_newsgroups_raw.csv"
train_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.5.0/20_newsgroups_train.csv"

# for fast, low-memory demonstration purpose, sample the data
df_raw = pd.read_csv(raw_csv_path).sample(400)
df_raw["SUBSET"] = "raw"
df_train = pd.read_csv(train_csv_path).sample(400)
df_train["SUBSET"] = "train"
# dev and test are sampled from the train CSV for demonstration purposes
df_dev = pd.read_csv(train_csv_path).sample(100)
df_dev["SUBSET"] = "dev"
df_test = pd.read_csv(train_csv_path).sample(100)
df_test["SUBSET"] = "test"

# build overall dataframe and ensure feature type
df = pd.concat([df_raw, df_train, df_dev, df_test])
df["text"] = df["text"].astype(str)

# this class stores the dataset throughout the labeling process
dataset = SupervisableTextDataset.from_pandas(df, feature_key="text", label_key="label")
23 |
--------------------------------------------------------------------------------
/tests/utils/test_snorkel_helper.py:
--------------------------------------------------------------------------------
1 | from hover.utils.snorkel_helper import labeling_function
2 | import pytest
3 |
4 |
@pytest.mark.lite
def test_labeling_function(example_raw_df):
    """Both pre-encoded and unencoded labeling functions should behave sanely."""

    def original(row):
        return "long" if len(row["text"]) > 5 else "short"

    targets = ["long", "short"]
    row_dict = example_raw_df.get_row_as_dict(0)

    # an LF whose label encodings are fixed up front
    encoder = {label: code for code, label in enumerate(targets)}
    lf_with_encoder = labeling_function(
        targets=targets,
        label_encoder=encoder,
        name="pre-encoded",
    )(original)

    assert isinstance(lf_with_encoder(row_dict), str)
    assert isinstance(lf_with_encoder.snorkel(row_dict), int)

    # an LF whose label encodings are left undetermined
    lf_without_encoder = labeling_function(
        targets=targets,
        label_encoder=None,
        name="unencoded",
    )(original)

    assert isinstance(lf_without_encoder(row_dict), str)
    assert lf_without_encoder.snorkel is None
33 |
--------------------------------------------------------------------------------
/.github/workflows/cross-os-install-source.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Cross-OS Install (Source)
5 |
6 | on:
7 | schedule:
8 | - cron: "0 0 * * 1,4"
9 | workflow_dispatch:
10 |
11 | jobs:
12 | install-source:
13 |
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | python-version: ['3.8', '3.10']
19 | os: [ubuntu-latest, macos-latest, windows-latest]
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 |
28 | - name: Install tox
29 | run: |
30 | pip install --upgrade pip
31 | pip install --upgrade tox tox-gh-actions
32 |
33 | - name: Install hover
34 | run: |
35 | tox -e install
36 |
--------------------------------------------------------------------------------
/tests/utils/test_datasets.py:
--------------------------------------------------------------------------------
1 | from hover.utils.datasets import newsgroups_dictl, newsgroups_reduced_dictl
2 | import pytest
3 |
4 |
@pytest.mark.lite
def test_20_newsgroups():
    """
    Sanity-check both 20-newsgroups loaders and their label encoder/decoder.
    """
    for dictl_method, num_classes in [
        (newsgroups_dictl, 20),
        (newsgroups_reduced_dictl, 7),
    ]:
        my_20ng, label_encoder, label_decoder = dictl_method()

        assert isinstance(my_20ng, dict)
        # BUGFIX: the loop variable was previously unused and every assertion
        # hard-coded "train", so the "test" subset was never actually verified
        for _key in ["train", "test"]:
            assert isinstance(my_20ng[_key], list)
            assert isinstance(my_20ng[_key][0], dict)
            assert isinstance(my_20ng[_key][0]["label"], str)
            assert isinstance(my_20ng[_key][0]["text"], str)

        # num_classes + 1 presumably accounts for an extra ABSTAIN-style label
        # -- confirm against hover.utils.datasets
        assert isinstance(label_encoder, dict)
        assert isinstance(label_decoder, dict)
        assert len(label_encoder) == num_classes + 1
        assert len(label_decoder) == num_classes + 1
        # encoder and decoder must be exact inverses of each other
        assert set(label_encoder.keys()) == set(label_decoder.values())
        assert set(label_decoder.keys()) == set(label_encoder.values())
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 phurwicz
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/recipes/test_subroutine.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from hover.core.explorer.functionality import (
3 | BokehDataAnnotator,
4 | BokehDataFinder,
5 | BokehSoftLabelExplorer,
6 | BokehSnorkelExplorer,
7 | )
8 | from hover.recipes.subroutine import (
9 | standard_annotator,
10 | standard_finder,
11 | standard_snorkel,
12 | standard_softlabel,
13 | )
14 |
15 |
@pytest.mark.lite
def test_autobuild_explorer(
    example_text_dataset,
    example_image_dataset,
    example_audio_dataset,
):
    """Standard explorer builders should work on text, image, and audio datasets."""
    for source_dataset in (
        example_text_dataset,
        example_image_dataset,
        example_audio_dataset,
    ):
        working_copy = source_dataset.copy()

        # each builder must produce an instance of its corresponding explorer class
        for builder, explorer_class in [
            (standard_annotator, BokehDataAnnotator),
            (standard_finder, BokehDataFinder),
            (standard_softlabel, BokehSoftLabelExplorer),
            (standard_snorkel, BokehSnorkelExplorer),
        ]:
            assert isinstance(builder(working_copy), explorer_class)
40 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/6-remarks.en.md:
--------------------------------------------------------------------------------
1 | ## :bell: Remarks
2 |
3 | ### Shoutouts
4 |
5 | - Thanks to [`Bokeh`](https://bokeh.org) because `hover` would not exist without linked plots and callbacks, or be nearly as good without embeddable server apps.
6 | - Thanks to [Philip Vollet](https://de.linkedin.com/in/philipvollet) for sharing `hover` with the community even when it was really green.
7 |
8 | ### Contributing
9 |
10 | - All feedback is welcome, **especially anything you find lacking and want fixed!**
11 | - `./requirements-dev.txt` lists required packages for development.
12 | - Pull requests are advised to use a superset of the pre-commit hooks listed in [.pre-commit-config.yaml](https://github.com/phurwicz/hover/blob/main/.pre-commit-config.yaml).
13 |
14 | ### Citation
15 |
16 | If you have found `hover` useful to your work, please [let us know](https://github.com/phurwicz/hover/discussions) :hugs:
17 |
18 | ```tex
19 | @misc{hover,
20 | title={{hover}: label data at scale},
21 | url={https://github.com/phurwicz/hover},
22 | note={Open software from https://github.com/phurwicz/hover},
23 | author={
24 | Pavel Hurwicz and
25 | Haochuan Wei},
26 | year={2021},
27 | }
28 | ```
29 |
--------------------------------------------------------------------------------
/.github/workflows/cross-os-source-test.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies and run tests on the source code.
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Cross-OS Source Test
5 |
6 | on:
7 | schedule:
8 | - cron: "0 0 * * 1,4"
9 | workflow_dispatch:
10 |
11 | jobs:
12 | test-api:
13 |
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | python-version: ['3.8', '3.9', '3.10']
19 | os: [ubuntu-latest, macos-latest, windows-latest]
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 |
28 | - name: Get dependencies
29 | run: |
30 | pip install --upgrade pip
31 | pip install --upgrade tox tox-gh-actions
32 |
33 | - name: Test - default config
34 | run: |
35 | tox -e test_api
36 |
37 | - name: Test - alt config 1
38 | run: |
39 | tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini
40 |
--------------------------------------------------------------------------------
/conda-recipe/stable.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "hover" %}
2 | {% set version = "0.7.0" %}
3 |
4 |
5 | package:
6 | name: {{ name }}
7 | version: {{ version }}
8 |
9 | source:
10 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz
11 | sha256: 1aae054f90ec869e898affa2f06bed08c1b00531d21f02f1faeafefc19ff6d98
12 |
13 | build:
14 | number: 0
15 | noarch: python
16 | script: python -m pip install . -vv
17 |
18 | requirements:
19 | host:
20 | - python >=3.7
21 | - pip
22 | run:
23 | - python >=3.7
24 | - bokeh >=2.3.3
25 | - scikit-learn >=0.20.0
26 | - pytorch >=1.10.0
27 | - pandas >=1.3.0
28 | - numpy >=1.14
29 | - scipy >=1.3.2
30 | - tqdm >=4.0
31 | - rich >=11.0.0
32 | - deprecated >=1.1.0
33 | - umap-learn >=0.3.10
34 |
35 | test:
36 | imports:
37 | - hover
38 | commands:
39 | - pip check
40 | requires:
41 | - pip
42 |
43 | about:
44 | home: https://phurwicz.github.io/hover
45 | license: MIT
46 | license_file: LICENSE
47 | summary: Label data at scale. Fun and precision included.
48 | dev_url: https://github.com/phurwicz/hover
49 |
50 | extra:
51 | recipe-maintainers:
52 | - phurwicz
53 | - haochuanwei
54 |
--------------------------------------------------------------------------------
/tests/utils/test_bokeh_helper.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from urllib.parse import urlparse
3 | from hover.utils.bokeh_helper import (
4 | servable,
5 | binder_proxy_app_url,
6 | remote_jupyter_proxy_url,
7 | )
8 | from tests.recipes.local_helper import execute_handle_function
9 |
10 |
@pytest.mark.lite
def test_binder_proxy_app_url():
    """
    The function being tested is only intended for Binder.
    """
    app_url = binder_proxy_app_url("simple-annotator", port=5007)
    # parsing raises if the URL is fundamentally malformed
    urlparse(app_url)
18 |
19 |
@pytest.mark.lite
def test_remote_jupyter_proxy_url():
    """
    Not a full test, rather just validating urls.
    """
    for port in (8888, None):
        # parsing raises if the URL is fundamentally malformed
        urlparse(remote_jupyter_proxy_url(port))
28 |
29 |
@pytest.mark.lite
def test_servable_wrapper(dummy_working_recipe, dummy_broken_recipe):
    """
    servable() should wrap both working and broken recipes into handles.
    """
    # idiomatic replacement for the try / pytest.fail / except pattern
    with pytest.raises(AssertionError):
        dummy_broken_recipe()

    for recipe in [dummy_working_recipe, dummy_broken_recipe]:
        handle = servable()(recipe)
        execute_handle_function(handle)
41 |
--------------------------------------------------------------------------------
/tests/core/representation/test_trajectory.py:
--------------------------------------------------------------------------------
1 | from hover.core.representation.trajectory import spline, manifold_spline
2 | import numpy as np
3 | import pytest
4 |
5 |
@pytest.mark.lite
def test_spline(one_to_two_and_square):
    """A near-exact spline fit should reproduce the input curve."""
    x_coords, y_coords = one_to_two_and_square

    fitted = spline([x_coords, y_coords], points_per_step=1, splprep_kwargs={"k": 2})
    for fitted_axis, original_axis in zip(fitted, (x_coords, y_coords)):
        # the fitted trajectory should track the input within a small tolerance
        assert (np.absolute(fitted_axis - original_axis) < 1e-2).all()
13 |
14 |
@pytest.mark.lite
def test_manifold_spline(one_to_two_and_square, num_points=100):
    """manifold_spline should keep (or expand) the step axis of its input."""
    # dim-by-step curve, replicated into point-by-dim-by-step
    base_curve = np.array(one_to_two_and_square)
    stacked = np.array([base_curve] * num_points)

    # rearrange to step-by-point-by-dim in a single transpose
    # (equivalent to swapaxes(1, 2) followed by swapaxes(0, 1))
    arr = np.transpose(stacked, (2, 0, 1))
    L, M, N = arr.shape

    # displace each point slightly so the points are distinguishable
    arr = arr + np.linspace(0.0, 0.1, num_points)[np.newaxis, :, np.newaxis]

    traj = manifold_spline(arr, points_per_step=1, splprep_kwargs={"k": 2})
    assert traj.shape == (L, M, N)

    # k interior points per step stretch the step axis to k*L - (k-1)
    traj = manifold_spline(arr, points_per_step=3, splprep_kwargs={"k": 2})
    assert traj.shape == (3 * L - 2, M, N)
36 |
--------------------------------------------------------------------------------
/docs/pipelines/local_config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from markdown_include.include import MarkdownInclude
3 |
4 |
# directory of this pipeline script; relative paths below resolve against it
DIR_PATH = os.path.dirname(__file__)
# mapping from script name to the documentation page containing its code blocks
NAME_TO_SCRIPT_REL = {
    "t0-quickstart": "../pages/tutorial/t0-quickstart.md",
    "t1-using-recipes": "../pages/tutorial/t1-active-learning.md",
    # tutorial-t2 has no script currently
    "t3-dataset-mechanisms": "../pages/tutorial/t3-dataset-population-selection.md",
    "t4-annotator-plot-tools": "../pages/tutorial/t4-annotator-dataset-interaction.md",
    "t5-finder-selection-filter": "../pages/tutorial/t5-finder-filter.md",
    "t6-soft-label-joint-filters": "../pages/tutorial/t6-softlabel-joint-filter.md",
    "t7-custom-labeling-functions": "../pages/tutorial/t7-snorkel-improvise-rules.md",
    "g0-image-data": "../pages/guides/g0-datatype-image.md",
    "g1-audio-data": "../pages/guides/g1-datatype-audio.md",
}
# same mapping with paths made absolute
NAME_TO_SCRIPT_ABS = {
    _k: os.path.join(DIR_PATH, _v) for _k, _v in NAME_TO_SCRIPT_REL.items()
}


# markdown-include resolves {!...!} includes relative to the repo root
MARKDOWN_INCLUDE = MarkdownInclude(
    configs={
        "base_path": os.path.join(DIR_PATH, "../../"),
        "encoding": "utf-8",
    }
)

# NOTE(review): both patterns look truncated -- the empty lookbehind `(?<=)` and
# the bare trailing ` )` / ` ` suggest HTML tag markers (e.g. thebe code-cell
# tags) were lost in extraction; confirm against the rendered docs.
THEBE_PATTERN_CODE_ONLY = r"(?<=)[\s\S]*?(?= )"
THEBE_PATTERN_WITH_TAGS = r"[\s\S]*? "
32 |
--------------------------------------------------------------------------------
/tests/core/representation/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import random
4 |
5 |
@pytest.fixture(scope="module")
def one_to_two_and_square():
    """Eleven evenly spaced x values on [1, 2] paired with their squares."""
    xs = np.linspace(1.0, 2.0, 11)
    return [xs, xs * xs]
11 |
12 |
@pytest.fixture(scope="module")
def example_array(n_vecs=1000, dim=30):
    """A random n_vecs-by-dim matrix with entries uniform in [0, 1)."""
    return np.random.random_sample((n_vecs, dim))
16 |
17 |
@pytest.fixture(scope="module")
def distance_preserving_array_sequence(example_array):
    """A chain of arrays related by maps that preserve relative distances."""
    base = example_array
    # translation
    shifted = base + 1.0
    # dilation
    scaled = 3.0 * shifted
    # rotation of axes (cyclic shift of the columns)
    rotated = np.concatenate((scaled[:, 1:], scaled[:, :1]), axis=1)
    # reflection of random axes
    signs = np.array([random.choice([-1, 1]) for _ in range(rotated.shape[1])])
    mirrored = signs[np.newaxis, :] * rotated

    return [base, shifted, scaled, rotated, mirrored]
31 |
32 |
@pytest.fixture(scope="module")
def diagonal_multiplication_array_sequence(example_array):
    """An array paired with itself scaled per-axis by a random diagonal matrix."""
    scaling = np.diag(np.random.rand(example_array.shape[-1]))
    return [example_array, example_array @ scaling]
40 |
41 |
@pytest.fixture(scope="module")
def random_multiplication_array_sequence(example_array):
    """An array paired with its image under a random dimension-reducing map."""
    in_dim = example_array.shape[-1]
    # target dimension is randomly chosen between half and full input width
    out_dim = np.random.randint(in_dim // 2, in_dim)
    projection = np.random.rand(in_dim, out_dim)
    return [example_array, example_array @ projection]
50 |
--------------------------------------------------------------------------------
/.github/workflows/assemble-readme.yml:
--------------------------------------------------------------------------------
1 | # This workflow will generate README files based on the doc snippets.
2 |
3 | name: Assemble Multilingual README
4 |
5 | on:
6 | push:
7 | branches:
8 | - main
9 | paths:
10 |       - 'docs/snippets/markdown/readme/**'
11 | - 'docs/pipelines/README.md.template'
12 | - 'docs/pipelines/generate_readme.py'
13 | workflow_dispatch:
14 |
15 | jobs:
16 | assemble-readme:
17 | runs-on: ubuntu-latest
18 | steps:
19 | - uses: actions/checkout@v3
20 | with:
21 | fetch-depth: 0
22 |
23 | - name: Prepare Git
24 | run: |
25 | git config user.name ${{ secrets.ACTIONS_GIT_USERNAME }}
26 | git config user.email ${{ secrets.ACTIONS_GIT_EMAIL }}
27 |
28 | - name: Run script and get output files
29 | run: |
30 | pip install -r requirements-dev.txt
31 | python docs/pipelines/generate_readme.py
32 | mv docs/pipelines/README*.md ./
33 | git add ./README*.md
34 | git commit -m "Assemble README files from snippets"
35 |
36 | - name: Create Pull Request
37 | uses: peter-evans/create-pull-request@v5
38 | with:
39 | commit-message: Assemble README files from snippets
40 | title: Automatic README update
41 | body: Assemble README files from snippets
42 | branch: assemble-readme
43 |
--------------------------------------------------------------------------------
/conda-recipe/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "hover" %}
2 | {% set version = "0.8.1" %}
3 |
4 |
5 | package:
6 | name: {{ name }}
7 | version: {{ version }}
8 |
9 | source:
10 | git_url: https://github.com/phurwicz/hover.git
11 |
12 | build:
13 | number: 0
14 | noarch: python
15 | script: python -m pip install . -vv
16 |
17 | requirements:
18 | host:
19 | - python >=3.7
20 | - pip
21 | run:
22 | - python >=3.7
23 | - bokeh >=3.0.3
24 | - scikit-learn >=0.20.0
25 | - pytorch >=1.10.0
26 | - pandas >=1.3.0
27 | - numpy >=1.14
28 | - scipy >=1.3.2
29 | - tqdm >=4.0
30 | - rich >=11.0.0
31 | - deprecated >=1.1.0
32 | - umap-learn >=0.3.10
33 | - flexmod >=0.1.0
34 |
35 | test:
36 | imports:
37 | - hover
38 | commands:
39 | - python -m spacy download en_core_web_md
40 | - pytest -m lite
41 | requires:
42 | - pip
43 | - pytest
44 | - spacy
45 | - faker
46 | - snorkel>=0.9.8
47 | - openpyxl
48 | - wrappy
49 | - shaffle
50 | source_files:
51 | - fixture_module
52 | - tests
53 | - pytest.ini
54 |
55 | about:
56 | home: https://phurwicz.github.io/hover
57 | license: MIT
58 | license_file: LICENSE
59 | summary: Label data at scale. Fun and precision included.
60 | dev_url: https://github.com/phurwicz/hover
61 |
62 | extra:
63 | recipe-maintainers:
64 | - phurwicz
65 | - haochuanwei
66 |
--------------------------------------------------------------------------------
/docs/pipelines/check_scripts.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests the scripts in the docs.
3 | Intended for a Binder environment, and should be used in conjunction with a local library in phurwicz/hover-binder.
4 | """
import os
import re
import subprocess
import uuid

import markdown

from local_config import MARKDOWN_INCLUDE, NAME_TO_SCRIPT_ABS, THEBE_PATTERN_CODE_ONLY
from local_helper import batch_routine
11 |
12 |
def main():
    """
    Test all code blocks in the scripts listed in this file.
    Collect all exceptions along the way.

    Raises RuntimeError (via batch_routine) if any script fails.
    """
    batch_routine(parse_script_and_run, NAME_TO_SCRIPT_ABS)
19 |
20 |
def parse_script_and_run(script_name, source_abs_path):
    """
    Retrieve and run code blocks from documentation file.
    Note that the doc file can be using markdown-include.

    :param script_name: identifier used to name the temporary script file.
    :param source_abs_path: absolute path to the documentation page.
    :returns: tuple of (extracted script text, completed subprocess).
    """
    # uuid keeps parallel/repeated runs from clobbering each other
    script_tmp_path = f"{script_name}-{uuid.uuid1()}.py"

    with open(source_abs_path, "r") as f_source:
        source = f_source.read()
        html = markdown.markdown(source, extensions=[MARKDOWN_INCLUDE])
        script = "\n".join(re.findall(THEBE_PATTERN_CODE_ONLY, html))

    try:
        with open(script_tmp_path, "w") as f_script:
            f_script.write(script)

        process = subprocess.run(
            ["python", script_tmp_path], capture_output=True, timeout=1200
        )
    finally:
        # BUGFIX: previously the temporary script was never removed, so every
        # run left a uuid-named .py file behind; the script text is still
        # returned for error reporting
        if os.path.exists(script_tmp_path):
            os.remove(script_tmp_path)

    return script, process
40 |
41 |
42 | if __name__ == "__main__":
43 | main()
44 |
--------------------------------------------------------------------------------
/docs/snippets/py/t7-0-lf-list.txt:
--------------------------------------------------------------------------------
from hover.utils.snorkel_helper import labeling_function
from hover.module_config import ABSTAIN_DECODED as ABSTAIN
import re


# Each labeling function below votes for one 20-newsgroups class when its
# keyword pattern matches the row's text (the inline (?i) flag makes every
# pattern case-insensitive), and abstains otherwise.
@labeling_function(targets=["rec.autos"])
def auto_keywords(row):
    flag = re.search(
        r"(?i)(diesel|gasoline|automobile|vehicle|drive|driving)", row.text
    )
    return "rec.autos" if flag else ABSTAIN


@labeling_function(targets=["rec.sport.baseball"])
def baseball_keywords(row):
    # escaped spaces keep "bat"/"base" from matching inside longer words
    flag = re.search(r"(?i)(baseball|stadium|\ bat\ |\ base\ )", row.text)
    return "rec.sport.baseball" if flag else ABSTAIN


@labeling_function(targets=["sci.crypt"])
def crypt_keywords(row):
    flag = re.search(r"(?i)(crypt|math|encode|decode|key)", row.text)
    return "sci.crypt" if flag else ABSTAIN


@labeling_function(targets=["talk.politics.guns"])
def guns_keywords(row):
    flag = re.search(r"(?i)(gun|rifle|ammunition|violence|shoot)", row.text)
    return "talk.politics.guns" if flag else ABSTAIN


@labeling_function(targets=["misc.forsale"])
def forsale_keywords(row):
    flag = re.search(r"(?i)(sale|deal|price|discount)", row.text)
    return "misc.forsale" if flag else ABSTAIN


# collected for convenient bulk application
LABELING_FUNCTIONS = [
    auto_keywords,
    baseball_keywords,
    crypt_keywords,
    guns_keywords,
    forsale_keywords,
]
45 |
--------------------------------------------------------------------------------
/docs/styles/monokai.css:
--------------------------------------------------------------------------------
1 | .cm-s-monokai {
2 | padding: 2em 2em 2em;
3 | height: auto;
4 | font-size: 0.8em;
5 | line-height: 1.5em;
6 | font-family: inconsolata, monospace;
7 | letter-spacing: 0.3px;
8 | word-spacing: 1px;
9 | background: #272822;
10 | color: #F8F8F2;
11 | }
12 | .cm-s-monokai .CodeMirror-lines {
13 | padding: 8px 0;
14 | }
15 | .cm-s-monokai .CodeMirror-gutters {
16 | box-shadow: 1px 0 2px 0 rgba(0, 0, 0, 0.5);
17 | -webkit-box-shadow: 1px 0 2px 0 rgba(0, 0, 0, 0.5);
18 | background-color: #272822;
19 | padding-right: 10px;
20 | z-index: 3;
21 | border: none;
22 | }
23 | .cm-s-monokai div.CodeMirror-cursor {
24 | border-left: 3px solid #F8F8F2;
25 | }
26 | .cm-s-monokai .CodeMirror-activeline-background {
27 | background: #49483E;
28 | }
29 | .cm-s-monokai .CodeMirror-selected {
30 | background: #49483E;
31 | }
32 | .cm-s-monokai .cm-comment {
33 | color: #75715E;
34 | }
35 | .cm-s-monokai .cm-string {
36 | color: #E6DB74;
37 | }
38 | .cm-s-monokai .cm-number {
39 | color: #66D9EF;
40 | }
41 | .cm-s-monokai .cm-atom {
42 | color: #66D9EF;
43 | }
44 | .cm-s-monokai .cm-keyword {
45 | color: #F92672;
46 | }
47 | .cm-s-monokai .cm-variable {
48 | color: #A6E22E;
49 | }
50 | .cm-s-monokai .cm-def {
51 | color: #FD971F;
52 | }
53 | .cm-s-monokai .cm-variable-2 {
54 | color: #F92672;
55 | }
56 | .cm-s-monokai .cm-property {
57 | color: #66D9EF;
58 | }
59 | .cm-s-monokai .cm-operator {
60 | color: #F92672;
61 | }
62 | .cm-s-monokai .CodeMirror-linenumber {
63 | color: #75715E;
64 | }
65 |
--------------------------------------------------------------------------------
/fixture_module/text_vector_net/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Example importable module holding customized ingredients of a workflow with hover.
3 | Specifically for text data.
4 | """
5 |
6 | import os
7 | import re
8 | import numpy as np
9 | import wrappy
10 |
11 | CACHE_PATH = os.path.join(os.path.dirname(__file__), "vecs.pkl")
12 |
13 |
def get_vectorizer():
    """
    Build a memoized text vectorizer backed by a spaCy model.

    :returns: a text vectorizer mapping a string to a 1-D numpy vector.
    """
    import spacy

    # SpaCy 'vector' models are perfect for this
    # nlp = spacy.load('en_vectors_web_lg')

    # 'core' models are slower due to linguistic features
    nlp = spacy.load("en_core_web_md")

    # could use a transformer if speed is ok
    # nlp = spacy.load('en_trf_bertbaseuncased_lg')

    # memoization can be useful if the function takes a while to run, e.g. transformer models
    @wrappy.memoize(
        cache_limit=50000,
        return_copy=False,
        persist_path=CACHE_PATH,
        persist_batch_size=1000,
    )
    def vectorizer(text):
        """
        A more serious example of a text vectorizer.
        """
        # collapse tabs/newlines into spaces before vectorizing
        clean_text = re.sub(r"[\t\n]", r" ", text)
        # disabling all pipes keeps only tokenization + vectors (fast path)
        return nlp(clean_text, disable=nlp.pipe_names).vector

    return vectorizer
44 |
45 |
def get_architecture():
    """Return the neural network class (hover's MLP) used for this fixture."""
    from hover.utils.common_nn import MLP

    return MLP
50 |
51 |
def get_state_dict_path():
    """Return the path of the saved model weights next to this module."""
    return os.path.join(os.path.dirname(__file__), "model.pt")
55 |
--------------------------------------------------------------------------------
/tests/core/representation/test_reduction.py:
--------------------------------------------------------------------------------
1 | from hover.core.representation.reduction import DimensionalityReducer
2 | import numpy as np
3 | import pytest
4 |
5 |
@pytest.mark.lite
def test_create_reducer(n_points=1000):
    """The factory should build a UMAP reducer and forward keyword arguments."""
    # if marked as lite, only test the default reducer library
    from umap import UMAP

    reducer = DimensionalityReducer.create_reducer(
        "umap",
        dimension=4,
        n_neighbors=10,
    )
    assert isinstance(reducer, UMAP)
    # dimension is expected to override n_components (default 2),
    # while other kwargs are expected to simply get forwarded
    assert reducer.n_components == 4
    assert reducer.n_neighbors == 10
21 |
22 |
def test_dimensionality_reduction(n_points=1000):
    """
    End-to-end fit/transform with umap and ivis, plus error handling.
    """
    arr = np.random.rand(n_points, 20)
    reducer = DimensionalityReducer(arr)

    reducer.fit_transform(
        "umap", n_neighbors=3, min_dist=0.01, dimension=3, metric="euclidean"
    )
    embedding = reducer.transform(arr, "umap")
    assert embedding.shape == (n_points, 3)
    # empty input should yield an empty embedding
    embedding = reducer.transform(np.array([]))
    assert embedding.shape == (0,)

    reducer.fit_transform(
        "ivis", dimension=4, k=3, distance="pn", batch_size=16, epochs=20
    )
    embedding = reducer.transform(arr, "ivis")
    assert embedding.shape == (n_points, 4)

    # idiomatic replacement for the try / pytest.fail / except pattern
    with pytest.raises(ValueError):
        reducer.fit_transform("invalid_method")
47 |
--------------------------------------------------------------------------------
/hover/core/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Core module: contains classes which are centerpieces of the vast majority of use cases.
3 |
4 | - dataset: defines the primary data structure to work with.
5 | - explorer: defines high-level building blocks of the interactive UI.
6 | - neural: defines sub-applications that involve neural networks.
7 | """
8 | from rich.console import Console
9 | from hover.utils.meta.traceback import RichTracebackMeta
10 |
11 |
class Loggable(metaclass=RichTracebackMeta):
    """
    Base class that provides consistently templated logging.

    Inspired by `wasabi`'s `good`/`info`/`warn`/`fail` methods.

    [`Rich` style guide](https://rich.readthedocs.io/en/latest/style.html)
    """

    # single console shared by every subclass
    CONSOLE = Console()

    def _print(self, *args, **kwargs):
        """Print through the shared class-level console."""
        self.__class__.CONSOLE.print(*args, **kwargs)

    def _templated(self, icon, message, style):
        """Shared template: '<icon> ClassName: message' in the given style."""
        self.__class__.CONSOLE.print(
            f"{icon} {self.__class__.__name__}: {message}", style=style
        )

    def _good(self, message):
        self._templated(":green_circle:", message, "green")

    def _info(self, message):
        self._templated(":blue_circle:", message, "blue")

    def _warn(self, message):
        self._templated(":yellow_circle:", message, "yellow")

    def _fail(self, message):
        self._templated(":red_circle:", message, "red")
47 |
--------------------------------------------------------------------------------
/tests/core/test_local_config.py:
--------------------------------------------------------------------------------
1 | from bokeh.models import (
2 | TableColumn,
3 | )
4 | from hover.core.local_config import (
5 | embedding_field,
6 | is_embedding_field,
7 | blank_callback_on_change,
8 | dataset_default_sel_table_columns,
9 | dataset_default_sel_table_kwargs,
10 | )
11 | import pytest
12 |
13 |
@pytest.mark.lite
def test_embedding_field():
    """Field names produced by embedding_field must be recognized back."""
    for total_dims in range(2, 10):
        for axis in range(total_dims):
            assert is_embedding_field(embedding_field(total_dims, axis))
19 |
20 |
@pytest.mark.lite
def test_blank_callback_on_change():
    """The no-op callback must accept the (attr, old, new) bokeh signature."""
    attr, old, new = "value", 0, 1
    blank_callback_on_change(attr, old, new)
24 |
25 |
@pytest.mark.lite
def test_dataset_default_sel_table_columns():
    """Default table columns exist for known feature types; others raise."""
    for feature in ["text", "image", "audio"]:
        columns = dataset_default_sel_table_columns(feature)
        assert isinstance(columns, list)
        assert isinstance(columns[0], TableColumn)

    # idiomatic replacement for the try / pytest.fail / except pattern
    with pytest.raises(ValueError):
        dataset_default_sel_table_columns("invalid_feature")
38 |
39 |
@pytest.mark.lite
def test_dataset_default_sel_table_kwargs():
    """Valid feature types yield a non-empty kwargs dict; invalid ones raise ValueError."""
    for feature in ["text", "image", "audio"]:
        kwargs = dataset_default_sel_table_kwargs(feature)
        assert isinstance(kwargs, dict)
        assert kwargs

    # idiomatic pytest exception assertion (replaces try/fail/except)
    with pytest.raises(ValueError):
        dataset_default_sel_table_kwargs("invalid_feature")
52 |
--------------------------------------------------------------------------------
/.github/workflows/doc-script-test.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies and run tests on the code snippets included in the documentation.
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Documentation Script Test
5 |
6 | on:
7 | schedule:
8 | - cron: "0 0 * * 1"
9 | push:
10 | branches: [ main ]
11 | paths:
12 | - 'hover/**.py'
13 | - 'docs/pages/**.md'
14 | - 'docs/pipelines/*.py'
15 | - 'docs/snippets/markdown/dataset-prep.md'
16 | - 'docs/snippets/py/*.*'
17 | pull_request:
18 | branches: [ main ]
19 | paths:
20 | - 'hover/**.py'
21 | - 'docs/pages/**.md'
22 | - 'docs/pipelines/*.py'
23 | - 'docs/snippets/markdown/dataset-prep.md'
24 | - 'docs/snippets/py/*.*'
25 | workflow_dispatch:
26 |
27 | jobs:
28 | doc-script:
29 |
30 | runs-on: ${{ matrix.os }}
31 | strategy:
32 | fail-fast: false
33 | matrix:
34 | python-version: ['3.9']
35 | os: [ubuntu-latest]
36 |
37 | steps:
38 | - name: Clone hover
39 | uses: actions/checkout@v3
40 |
41 | - name: Set up Python ${{ matrix.python-version }}
42 | uses: actions/setup-python@v3
43 | with:
44 | python-version: ${{ matrix.python-version }}
45 |
46 | - name: Install Non-Python Dependencies
47 | run: |
48 | sudo apt-get update
49 | sudo apt-get install libsndfile1
50 |
51 | - name: Test with Tox
52 | run: |
53 | pip install --upgrade pip
54 | pip install --upgrade tox tox-gh-actions
55 | tox -e test_doc_scripts
56 |
--------------------------------------------------------------------------------
/tests/utils/test_torch_helper.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset, DataLoader
2 | from hover.utils.torch_helper import (
3 | VectorDataset,
4 | MultiVectorDataset,
5 | one_hot,
6 | label_smoothing,
7 | )
8 | import numpy as np
9 | import pytest
10 |
11 |
@pytest.mark.lite
def test_vector_dataset(num_entries=100, dim_inp=128, dim_out=3):
    """Smoke-test VectorDataset / MultiVectorDataset construction, loading, indexing."""
    features = np.random.rand(num_entries, dim_inp)
    targets = np.random.rand(num_entries, dim_out)

    candidates = [
        VectorDataset(features, targets),
        MultiVectorDataset([features, features], targets),
    ]
    for candidate in candidates:
        loader = candidate.loader(batch_size=min(num_entries, 16))
        assert isinstance(candidate, Dataset)
        assert isinstance(loader, DataLoader)
        assert len(candidate) == num_entries
        # indexing yields an (input, output, index) triple
        inp, out, idx = candidate[0]
26 |
27 |
@pytest.mark.lite
def test_one_hot():
    """one_hot should expand N categorical labels into an (N, num_classes) array."""
    encoded = one_hot([0, 1, 2, 1], 3)
    assert encoded.shape == (4, 3)
33 |
34 |
@pytest.mark.lite
def test_label_smoothing(num_entries=100, num_classes=3, coeff=0.1):
    """Check label smoothing against its closed-form per-entry values."""
    assert num_classes >= 2
    assert coeff >= 0.0

    prob_labels = one_hot([0] * num_entries, num_classes)

    # a zero coefficient must be a no-op
    assert np.allclose(label_smoothing(prob_labels, coefficient=0.0), prob_labels)

    smoothed = label_smoothing(prob_labels, coefficient=coeff)
    # the hot entry loses coeff * (1 - 1/k) of its mass
    np.testing.assert_almost_equal(
        smoothed[0][0], 1.0 - coeff * (1.0 - 1.0 / num_classes)
    )
    # each cold entry gains coeff / k
    np.testing.assert_almost_equal(smoothed[0][1], coeff / num_classes)
49 |
--------------------------------------------------------------------------------
/hover/utils/bokeh_helper/local_config.py:
--------------------------------------------------------------------------------
1 | import hover
2 | from hover.config_constants import (
3 | ConfigSection as Section,
4 | ConfigKey as Key,
5 | )
6 |
7 |
# Palette values are read from hover's module-level config
# (per the project docs, config values lock once they are read).
BOKEH_PALETTE = hover.config[Section.VISUAL][Key.BOKEH_PALETTE]
BOKEH_PALETTE_USAGE = hover.config[Section.VISUAL][Key.BOKEH_PALETTE_USAGE]
10 |
11 | TOOLTIP_TEXT_TEMPLATE = """
12 |
13 |
14 | {key}: @{field}
15 |
16 |
17 | """
18 |
19 | TOOLTIP_IMAGE_TEMPLATE = """
20 |
21 |
25 |
26 | """
27 |
28 | TOOLTIP_AUDIO_TEMPLATE = """
29 |
37 | """
38 |
39 | TOOLTIP_CUSTOM_TEMPLATE = """
40 |
41 |
42 | {key}: @{field}
43 |
44 |
45 | """
46 |
47 | TOOLTIP_LABEL_TEMPLATE = """
48 |
49 |
50 | {key}: @{field}
51 |
52 |
53 | """
54 |
55 | TOOLTIP_COORDS_DIV = """
56 |
57 |
58 | Coordinates: ($x, $y)
59 |
60 |
61 | """
62 |
63 | TOOLTIP_INDEX_DIV = """
64 |
65 |
66 | Index: [$index]
67 |
68 |
69 | """
70 |
--------------------------------------------------------------------------------
/.github/workflows/doc-auto-notebook.yml:
--------------------------------------------------------------------------------
1 | # This workflow will generate Jupyter notebooks based on the code scripts in the documentation.
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Automatic Notebook Generation
5 |
6 | on:
7 | schedule:
8 | - cron: "0 0 * * 2"
9 | workflow_dispatch:
10 |
11 | jobs:
12 | auto-notebook:
13 |
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | matrix:
17 | python-version: ['3.9']
18 | os: [ubuntu-latest]
19 |
20 | steps:
21 | - name: Clone hover all branches
22 | uses: actions/checkout@v3
23 | with:
24 | fetch-depth: 0
25 |
26 | - name: Set up Python ${{ matrix.python-version }}
27 | uses: actions/setup-python@v3
28 | with:
29 | python-version: ${{ matrix.python-version }}
30 |
31 | - name: Install Non-Python Dependencies
32 | run: |
33 | sudo apt-get update
34 | sudo apt-get install libsndfile1
35 |
36 | - name: Prepare Git
37 | run: |
38 | git config user.name ${{ secrets.ACTIONS_GIT_USERNAME }}
39 | git config user.email ${{ secrets.ACTIONS_GIT_EMAIL }}
40 | git checkout pipeline/notebook-generation
41 | git merge origin/main --no-edit
42 | git push
43 |
44 | - name: Test with Tox
45 | run: |
46 | pip install --upgrade pip
47 | pip install --upgrade tox tox-gh-actions
48 | tox -e test_notebook_generation
49 |
50 | - name: Update Generated Notebooks
51 | run: |
52 | git add docs/pipelines/generated/*.ipynb
53 | git commit -m "Automatic update of notebooks generated from documentation scripts"
54 | git push
55 |
--------------------------------------------------------------------------------
/hover/utils/meta/traceback.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta
2 | from functools import wraps
3 | from types import FunctionType
4 | from rich.console import Console
5 |
6 |
class RichTracebackMeta(type):
    """
    ???+ note "Metaclass to mass-add traceback override to class methods."

    [`Rich` traceback guide](https://rich.readthedocs.io/en/stable/traceback.html)
    """

    def __new__(meta, class_name, bases, class_dict):
        # reuse the class's own CONSOLE attribute when present
        console = class_dict.get("CONSOLE", Console())

        def add_rich_traceback(method):
            # decorate one method so failures print a rich traceback before re-raising
            @wraps(method)
            def traced(*args, **kwargs):
                try:
                    return method(*args, **kwargs)
                except Exception as exc:
                    qualified = f"{method.__module__}.{method.__qualname__}"
                    console.print(
                        f":red_circle: {qualified} failed: {exc}",
                        style="red bold",
                    )
                    console.print_exception(show_locals=False)
                    raise exc

            return traced

        # wrap every plain function attribute; keep everything else untouched
        patched_dict = {
            attr_name: (
                add_rich_traceback(attr_value)
                if isinstance(attr_value, FunctionType)
                else attr_value
            )
            for attr_name, attr_value in class_dict.items()
        }
        return type.__new__(meta, class_name, bases, patched_dict)
41 |
42 |
class RichTracebackABCMeta(RichTracebackMeta, ABCMeta):
    """
    ???+ note "Metaclass for rich-traceback abstract base classes."

    Inherits from both RichTracebackMeta and ABCMeta so that abstract
    classes can use rich tracebacks without a metaclass conflict.
    """

    pass
51 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | import os
3 |
4 |
def get_description():
    """Return the contents of README.md, or an empty string when it is absent."""
    desc = ""
    if os.path.isfile("README.md"):
        with open("README.md", "r") as fh:
            desc = fh.read()
    return desc
12 |
13 |
# Distribution metadata and dependency declarations for the hover package.
setuptools.setup(
    name="hover",
    version="0.9.0",
    description="Label data at scale. Fun and precision included.",
    long_description=get_description(),
    long_description_content_type="text/markdown",
    author="Pavel",
    author_email="pavelhurwicz@gmail.com",
    url="https://github.com/phurwicz/hover",
    # ship only hover* packages (tests/fixtures stay out of the wheel)
    packages=setuptools.find_packages(include=["hover*"]),
    install_requires=[
        # python-version-specific example: "numpy>=1.14,<=1.21.5;python_version<'3.8.0'",
        # interactive/static visualization
        "bokeh>=3.0.3",
        # preprocessors
        "scikit-learn>=0.20.0",
        # neural stuff
        "torch>=1.10.0",
        # data handling
        "pandas>=1.3.0",
        "polars>=0.17.0",
        "pyarrow>=11.0.0",
        "numpy>=1.22",
        # computations
        "scipy>=1.3.2",
        # utilities
        "tqdm>=4.0",
        "rich>=11.0.0",
        "deprecated>=1.1.0",
        # dimensionality reduction: UMAP is included
        "umap-learn>=0.3.10",
        # module config customization
        "flexmod>=0.1.2",
        # optional: more dimensionality reduction methods
        # "ivis[cpu]>=1.7",
        # optional: distant supervision
        # "snorkel>=0.9.8",
    ],
    python_requires=">=3.8",
    classifiers=[
        "Programming Language :: Python :: 3",
        "Development Status :: 4 - Beta",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
60 |
--------------------------------------------------------------------------------
/fixture_module/audio_vector_net/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Example importable module holding customized ingredients of a workflow with hover.
3 | Specifically for audio data in URLs.
4 | """
5 |
6 | import os
7 | import re
8 | import numpy as np
9 | import wrappy
10 | import requests
11 | import librosa
12 | from io import BytesIO
13 |
14 |
15 | DIR_PATH = os.path.dirname(__file__)
16 | RAW_CACHE_PATH = os.path.join(DIR_PATH, "raws.pkl")
17 | AUD_CACHE_PATH = os.path.join(DIR_PATH, "auds.pkl")
18 | VEC_CACHE_PATH = os.path.join(DIR_PATH, "vecs.pkl")
19 |
20 |
@wrappy.memoize(
    cache_limit=50000,
    return_copy=False,
    persist_path=RAW_CACHE_PATH,
    persist_batch_size=100,
)
def url_to_content(url):
    """
    Fetch a URL and return the raw response body.
    """
    return requests.get(url).content
33 |
34 |
@wrappy.memoize(
    cache_limit=50000,
    return_copy=False,
    persist_path=AUD_CACHE_PATH,
    persist_batch_size=100,
)
def url_to_audio(url):
    """
    Fetch a URL and decode it as audio: returns (waveform, sampling_rate).
    """
    return librosa.load(BytesIO(url_to_content(url)))
47 |
48 |
def get_vectorizer():
    """Build and return a memoized URL-to-vector function for audio."""

    @wrappy.memoize(
        cache_limit=50000,
        return_copy=False,
        persist_path=VEC_CACHE_PATH,
        persist_batch_size=100,
    )
    def vectorizer(url):
        """
        Averaged MFCC over time.
        Resembles word-embedding-average-as-doc-embedding for texts.
        """
        waveform, rate = url_to_audio(url)
        features = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=32)
        return features.mean(axis=1)

    return vectorizer
66 |
67 |
def get_architecture():
    """Return the neural net class used by this fixture."""
    from hover.utils.common_nn import LogisticRegression

    return LogisticRegression
72 |
73 |
def get_state_dict_path():
    """Return the path (next to this module) used to persist model weights."""
    return os.path.join(DIR_PATH, "model.pt")
76 |
--------------------------------------------------------------------------------
/hover/utils/common_nn.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 |
class BaseSequential(nn.Module):
    """
    Sequential neural net with no specified architecture; subclasses
    assign an `nn.Sequential` to `self.model`.
    """

    def __init__(self):
        """Delegate to the nn.Module constructor."""
        super().__init__()

    def init_weights(self):
        """Kaiming-initialize every linear layer's weight; zero its bias."""
        for layer in self.model:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.kaiming_normal_(layer.weight, a=0.01)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, input_tensor):
        """Run the full sequential model on the input."""
        return self.model(input_tensor)

    def eval_per_layer(self, input_tensor):
        """
        Return the input, all intermediates, and the output as a list.
        Puts the model in eval mode.
        """
        self.model.eval()
        outputs, current = [input_tensor], input_tensor
        for layer in self.model.children():
            current = layer(current)
            outputs.append(current)
        return outputs
37 |
38 |
class MLP(BaseSequential):
    """Dropout-regularized MLP whose hidden sizes scale from `n_hid`."""

    def __init__(self, embed_dim, num_classes, dropout=0.25, n_hid=128):
        """
        Set up a proportionally fixed architecture.
        """
        super().__init__()
        layers = [
            nn.Dropout(dropout),
            nn.Linear(embed_dim, n_hid),
            nn.ReLU(),
            nn.BatchNorm1d(n_hid),
            nn.Dropout(dropout),
            nn.Linear(n_hid, n_hid // 4),
            nn.ReLU(),
            nn.BatchNorm1d(n_hid // 4),
            nn.Dropout(dropout),
            nn.Linear(n_hid // 4, num_classes),
        ]
        self.model = nn.Sequential(*layers)
        self.init_weights()
58 |
59 |
class LogisticRegression(BaseSequential):
    """Single linear layer: the minimal classifier architecture."""

    def __init__(self, embed_dim, num_classes):
        """
        Set up a minimal architecture.
        """
        super().__init__()
        self.model = nn.Sequential(nn.Linear(embed_dim, num_classes))
        self.init_weights()
68 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/2-features.zh.md:
--------------------------------------------------------------------------------
1 | ## :sparkles: 具体功能
2 |
3 | :telescope: 将向量降维得到二维数据散点图, 并配有
4 |
5 |
6 | 提示框 来显示具体数据内容
7 |
8 |
9 |
10 |
11 | 表格来 批量检视 选中的数据
12 |
13 |
14 |
15 |
16 | 切换按钮来 区分数据子集
17 |
18 |
19 |
20 |
21 | 文本/正则匹配 来定向搜寻数据
22 |
23 |
24 |
25 | :microscope: 与标注界面同步的辅助模式
26 |
27 |
28 | `Finder`: 以匹配条件来 过滤 选中的数据
29 |
30 |
31 |
32 |
33 | `SoftLabel`: 主动学习 用模型打分过滤选中的数据
34 |
35 |
36 |
37 |
38 | `Snorkel`: 自定义函数 来过滤数据或直接打标
39 |
40 |
41 |
42 | :toolbox: 更多的补充工具
43 |
44 |
45 | 降维时保留 更多维度 (3D? 4D?) 并动态选择观察的平面
46 |
47 |
48 |
49 |
50 | 跨界面/跨维度地进行 持续选取/反选 以达到更高精度
51 |
52 |
53 |
54 |
55 | 剔除 选中数据中的异类 以及 修订 发现的误标
56 |
57 |
58 |
--------------------------------------------------------------------------------
/tests/recipes/local_helper.py:
--------------------------------------------------------------------------------
1 | import time
2 | import operator
3 | from bokeh.document import Document
4 | from bokeh.events import ButtonClick, MenuItemClick
5 | from hover import module_config
6 |
7 |
def action_view_selection(dataset):
    """Click the selection viewer and return a copy of the resulting table data."""
    dataset.selection_viewer._trigger_event(ButtonClick(dataset.selection_viewer))
    # sel_table.source.data is a {"field": []}-like dict
    return dataset.sel_table.source.data.copy()
14 |
15 |
def action_evict_selection(dataset):
    """Snapshot table data before and after clicking the selection evictor."""
    before = dataset.sel_table.source.data.copy()
    dataset.selection_evictor._trigger_event(ButtonClick(dataset.selection_evictor))
    after = dataset.sel_table.source.data.copy()
    return before, after
22 |
23 |
def action_patch_selection(dataset):
    """Click the dataset's selection patcher button."""
    dataset.selection_patcher._trigger_event(ButtonClick(dataset.selection_patcher))
27 |
28 |
def action_apply_labels(annotator):
    """Click the annotator's apply button; return the rows that received labels."""
    annotator.annotator_apply._trigger_event(ButtonClick(annotator.annotator_apply))
    return annotator.dfs["raw"].filter_rows_by_operator(
        "label", operator.ne, module_config.ABSTAIN_DECODED
    )()
36 |
37 |
def action_commit_selection(dataset, subset="train"):
    """Commit the current selection to the given subset via the committer menu."""
    dataset.data_committer._trigger_event(
        MenuItemClick(dataset.data_committer, item=subset)
    )
41 |
42 |
def action_deduplicate(dataset):
    """Click the dataset's deduplication trigger."""
    dataset.dedup_trigger._trigger_event(ButtonClick(dataset.dedup_trigger))
46 |
47 |
def action_push_data(dataset):
    """Click the dataset's update pusher button."""
    dataset.update_pusher._trigger_event(ButtonClick(dataset.update_pusher))
51 |
52 |
def execute_handle_function(handle):
    """Run a bokeh handle on a fresh document, then fire its session callbacks."""
    doc = Document()
    handle(doc)
    # a few seconds to activate timed callbacks
    time.sleep(10)
    for wrapped in doc.session_callbacks:
        wrapped.callback()
60 |
--------------------------------------------------------------------------------
/.github/workflows/quick-source-test.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies and run tests on the source code.
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Quick Source Test
5 |
6 | on:
7 | push:
8 | branches: [ main, nightly ]
9 | paths:
10 | - 'hover/**.py'
11 | - 'tests/**.py'
12 | - 'pytest.ini'
13 | - 'fixture_module/**.py'
14 | pull_request:
15 | branches: [ main, nightly ]
16 | paths:
17 | - 'hover/**.py'
18 | - 'tests/**.py'
19 | - 'pytest.ini'
20 | - 'fixture_module/**.py'
21 | workflow_dispatch:
22 |
23 | jobs:
24 | test-api:
25 |
26 | runs-on: ${{ matrix.os }}
27 | strategy:
28 | fail-fast: false
29 | matrix:
30 | # test oldest and newest supported Python version
31 | python-version: ['3.8', '3.10']
32 | os: [ubuntu-latest]
33 |
34 | steps:
35 | - uses: actions/checkout@v3
36 | - name: Set up Python ${{ matrix.python-version }}
37 | uses: actions/setup-python@v3
38 | with:
39 | python-version: ${{ matrix.python-version }}
40 |
41 | - name: Find cached tox env
42 | id: find-venv
43 | uses: actions/cache@v3
44 | with:
45 | path: .tox
46 | key: ${{ runner.os }}-${{ runner.python-version }}-tox-env-${{ hashFiles('**/setup.py') }}
47 | restore-keys: |
48 | ${{ runner.os }}-${{ runner.python-version }}-tox-env-
49 |
50 | - name: Get dependencies
51 | run: |
52 | pip install --upgrade pip
53 | pip install --upgrade tox tox-gh-actions
54 |
55 | - name: Test - default config
56 | run: |
57 | tox -e test_api
58 |
59 | - name: Test - alt config 1
60 | run: |
61 | tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini
62 |
63 | - name: Codacy Coverage Reporter
64 | uses: codacy/codacy-coverage-reporter-action@master
65 | if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'push' }}
66 | with:
67 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
68 | coverage-reports: cobertura.xml
69 |
--------------------------------------------------------------------------------
/docs/pages/tutorial/t6-softlabel-joint-filter.md:
--------------------------------------------------------------------------------
1 | > `hover` filters can stack together.
2 | >
3 | > :speedboat: This makes selections incredibly powerful.
4 |
5 | {!docs/snippets/html/thebe.html!}
6 | {!docs/snippets/markdown/binder-kernel.md!}
7 | {!docs/snippets/markdown/component-tutorial.md!}
8 | {!docs/snippets/markdown/local-dependency.md!}
9 | {!docs/snippets/markdown/local-dep-text.md!}
10 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!}
11 |
12 | ## **Preparation**
13 |
14 | {!docs/snippets/markdown/dataset-prep.md!}
15 |
16 | ## **Soft-Label Explorer**
17 |
18 | Active learning works by predicting labels and scores (i.e. soft labels) and utilizing that prediction. An intuitive way to plot soft labels is to color-code labels and use opacity ("alpha" by `bokeh` terminology) to represent scores.
19 |
20 | `SoftLabelExplorer` delivers this functionality:
21 |
22 |
23 | {!docs/snippets/py/tz-bokeh-notebook-common.txt!}
24 |
25 | {!docs/snippets/py/tz-bokeh-notebook-remote.txt!}
26 |
27 | {!docs/snippets/py/t6-0-softlabel-figure.txt!}
28 |
29 |
30 | ## **Filter Selection by Score Range**
31 |
32 | Similarly to `finder`, a `softlabel` plot has its own selection filter. The difference lies in the filter condition:
33 |
34 | {!docs/snippets/markdown/jupyterlab-js-issue.md!}
35 |
36 |
37 | {!docs/snippets/py/t6-1-softlabel-filter.txt!}
38 |
39 |
40 | ## **Linked Selections & Joint Filters**
41 |
42 | When we plot multiple `explorer`s for the same `dataset`, it makes sense to synchronize selections between those plots. `hover` recipes take care of this synchronization.
43 |
44 | - :tada: This also works with cumulative selections. Consequently, the cumulative toggle is synchronized too.
45 |
Since each filter narrows down the selections we make, joint filtering is just set intersection, extended
47 |
48 | - from two sets (original selection + filter)
49 | - to N sets (original selection + filter A + filter B + ...)
50 |
The [`active_learning` recipe](../t1-active-learning/) is built from `softlabel + annotator + finder`, plus a few widgets for iterating the model in the loop.
52 |
53 | In the next tutorial(s), we will see more recipes taking advantage of linked selections and joint filters. Powerful indeed!
54 |
55 | {!docs/snippets/html/stylesheet.html!}
56 |
--------------------------------------------------------------------------------
/docs/pages/guides/g2-hover-config.md:
--------------------------------------------------------------------------------
1 | > `hover` can be customized through its module config.
2 | >
3 | > :bulb: Let's explore a few use cases.
4 |
5 | {!docs/snippets/markdown/tutorial-required.md!}
6 | {!docs/snippets/html/thebe.html!}
7 | {!docs/snippets/markdown/binder-kernel.md!}
8 |
9 | ## **Color Palette for Labeled Data Points**
10 |
11 | You may want to customize the color palette for better contrast or accessibility, which can depend on specific scenarios.
12 |
13 | The snippet below shows an example of default colors assigned to 6 classes. `hover` by default samples [`Turbo256`](https://docs.bokeh.org/en/latest/docs/reference/palettes.html#large-palettes) to accommodate a large number of classes while keeping good contrast.
14 |
15 |
16 | {!docs/snippets/py/g2-0-color-palette.txt!}
17 |
18 |
19 | You can change the palette using any `bokeh` palette, or any iterable of hex colors like `"#000000"`.
20 |
21 | {!docs/snippets/py/g2-1-configure-palette.txt!}
22 |
23 |
24 | ???+ note "Config changes should happen early"
25 | `hover.config` assignments need to happen before plotting your data.
26 |
27 | - This is because `hover` locks config values for consistency as soon as each config value is read by other code.
28 | - Ideally you should change config immediately after `import hover`.
29 |
30 | ## **Color of Unlabeled Data Points**
31 |
32 | For unlabeled data points, `hover` uses a light gray color `"#dcdcdc"`. This is not configured in the color palette above, but here:
33 |
34 |
35 | {!docs/snippets/py/g2-2-configure-abstain-color.txt!}
36 |
37 |
38 | ## **Dimensionality Reduction Method**
39 |
40 | `hover` uses dimensionality reduction in a lot of places. It can be cumbersome to find these places and use your preferred method. In such cases a module-level override can be handy:
41 |
42 |
43 | {!docs/snippets/py/g2-3-configure-reduction-method.txt!}
44 |
45 |
46 | ## **Browse more configs**
47 |
48 | There are more configurations that are more niche which we will skip here. You can find a full list of configurations, default values, and hints here:
49 |
50 |
51 | {!docs/snippets/py/g2-4-config-hint.txt!}
52 |
53 |
54 | Happy customizing!
55 |
56 | {!docs/snippets/html/stylesheet.html!}
57 |
--------------------------------------------------------------------------------
/hover/utils/snorkel_helper.py:
--------------------------------------------------------------------------------
1 | import uuid
2 |
3 |
def labeling_function(targets, label_encoder=None, **kwargs):
    """
    ???+ note "Hover's flavor of the Snorkel labeling_function decorator."
        Because hover encodes labels dynamically, the decorated function
        must return the original string label, not its encoding integer.

        - assigns a UUID for easy identification
        - keeps track of LF targets

        | Param | Type | Description |
        | :-------------- | :----- | :----------------------------------- |
        | `targets` | `list` of `str` | labels that the labeling function is intended to create |
        | `label_encoder` | `dict` | {decoded_label -> encoded_label} mapping, if you also want an original snorkel-style labeling function linked as a `.snorkel` attribute |
        | `**kwargs` | | forwarded to `snorkel`'s `labeling_function()` |
    """
    # Lazy import so the package does not require snorkel.
    # Feb 3, 2022: snorkel's dependency handling is too strict
    # for other dependencies like NumPy, SciPy, SpaCy, etc.
    # Cite Snorkel and lazy import or copy functions instead.
    # DO NOT explicitly depend on Snorkel without confirming
    # that all builds/tests pass by Anaconda standards, else
    # we risk having to drop conda support.
    from snorkel.labeling import (
        labeling_function as snorkel_lf,
        LabelingFunction as SnorkelLF,
    )

    def decorator(func):
        # Snorkel LF settings: default the name to the function's own
        # name; anything in kwargs overrides it.
        lf = SnorkelLF(f=func, **{"name": func.__name__, **kwargs})

        # hover-specific bookkeeping
        lf.uuid = uuid.uuid1()
        lf.targets = targets[:]

        if label_encoder:
            # also link a classic snorkel-style LF that returns encoded labels
            lf.label_encoder = label_encoder

            def snorkel_style_func(x):
                return lf.label_encoder[func(x)]

            lf.snorkel = snorkel_lf(**kwargs)(snorkel_style_func)
        else:
            lf.label_encoder = None
            lf.snorkel = None

        return lf

    return decorator
59 |
--------------------------------------------------------------------------------
/hover/config_constants.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
class ConfigSection:
    """String constants naming the sections of hover's module config."""

    IO = "io"
    BACKEND = "backend"
    VISUAL = "visual"
    DATA_EMBEDDING = "data.embedding"
    DATA_COLUMNS = "data.columns"
    DATA_VALUES = "data.values"
10 | DATA_VALUES = "data.values"
11 |
12 |
class ConfigKey:
    """String constants naming individual keys within hover's config sections."""

    DATA_SAVE_DIR = "data_save_dir"
    DATAFRAME_LIBRARY = "dataframe_library"
    ABSTAIN_HEXCOLOR = "abstain_hexcolor"
    BOKEH_PALETTE = "bokeh_palette"
    BOKEH_PALETTE_USAGE = "bokeh_palette_usage"
    TABLE_IMG_STYLE = "table_img_style"
    TOOLTIP_IMG_STYLE = "tooltip_img_style"
    SEARCH_MATCH_HEXCOLOR = "search_match_hexcolor"
    DATAPOINT_BASE_SIZE = "datapoint_base_size"
    DEFAULT_REDUCTION_METHOD = "default_reduction_method"
    ENCODED_LABEL_KEY = "encoded_label_key"
    DATASET_SUBSET_FIELD = "dataset_subset_field"
    EMBEDDING_FIELD_PREFIX = "embedding_field_prefix"
    SOURCE_COLOR_FIELD = "source_color_field"
    SOURCE_ALPHA_FIELD = "source_alpha_field"
    SEARCH_SCORE_FIELD = "search_score_field"
    ABSTAIN_DECODED = "abstain_decoded"
    ABSTAIN_ENCODED = "abstain_encoded"
32 |
33 |
class Validator:
    """Predicate helpers used to validate hover config values."""

    @staticmethod
    def is_hex_color(x):
        """True for strings like '#a1B2c3': a hash plus six hex digits."""
        return bool(re.match(r"^\#[0-9a-fA-F]{6}$", x))

    @staticmethod
    def is_iterable(x):
        """True when x supports iteration."""
        return hasattr(x, "__iter__")

    @staticmethod
    def is_iterable_of_hex_color(x):
        """True when x is an iterable whose every item is a hex color."""
        if not Validator.is_iterable(x):
            return False
        return all(Validator.is_hex_color(item) for item in x)

    @staticmethod
    def is_supported_dataframe_library(x):
        """hover supports pandas and polars backends."""
        return x in ("pandas", "polars")

    @staticmethod
    def is_supported_dimensionality_reduction(x):
        """hover supports umap and ivis reducers."""
        return x in ("umap", "ivis")

    @staticmethod
    def is_supported_traversal_mode(x):
        """hover supports iterate and linspace traversal."""
        return x in ("iterate", "linspace")

    @staticmethod
    def is_str(x):
        """True when x is a string."""
        return isinstance(x, str)

    @staticmethod
    def is_int_and_compare(op, value):
        """Build a predicate: x must be an int and satisfy op(x, value)."""

        def compare(x):
            return isinstance(x, int) and op(x, value)

        return compare
74 |
75 |
class Preprocessor:
    """String normalizers applied to raw config values before validation."""

    @staticmethod
    def remove_quote_at_ends(x):
        """Strip one leading and/or trailing single or double quote."""
        return re.sub(r"(^[\'\"]|[\'\"]$)", "", x)

    @staticmethod
    def lower(x):
        """Lowercase the string."""
        return x.lower()
84 |
--------------------------------------------------------------------------------
/docs/pages/guides/g0-datatype-image.md:
--------------------------------------------------------------------------------
1 | > `hover` supports bulk-labeling images through their URLs (which can be local).
2 | >
3 | > :bulb: Let's do a quickstart for images and note what's different from texts.
4 |
5 | {!docs/snippets/markdown/tutorial-required.md!}
6 | {!docs/snippets/html/thebe.html!}
7 | {!docs/snippets/markdown/binder-kernel.md!}
8 |
9 | ## **Dataset for Images**
10 |
11 | `hover` handles images through their URL addresses. URLs are strings which can be easily stored, hashed, and looked up against. They are also convenient for rendering tooltips in the annotation interface.
12 |
13 | Similarly to `SupervisableTextDataset`, we can build one for images:
14 |
15 |
16 | {!docs/snippets/py/g0-0-dataset-image.txt!}
17 |
18 | {!docs/snippets/py/t0-0a-dataset-text-print.txt!}
19 |
20 |
21 | ## **Vectorizer for Images**
22 |
23 | We can follow a `URL -> content -> image object -> vector` path.
24 |
25 |
26 | {!docs/snippets/py/g0-1-url-to-content.txt!}
27 |
28 |
29 |
30 | {!docs/snippets/py/g0-2-url-to-image.txt!}
31 |
32 |
33 | {!docs/snippets/markdown/wrappy-cache.md!}
34 |
35 |
36 | {!docs/snippets/py/g0-3-image-vectorizer.txt!}
37 |
38 |
39 | ## **Embedding and Plot**
40 |
41 | This is exactly the same as in the quickstart, just switching to image data:
42 |
43 |
44 | {!docs/snippets/py/t0-2-reduction.txt!}
45 |
46 |
47 |
48 | {!docs/snippets/py/t0-3-simple-annotator.txt!}
49 |
50 | {!docs/snippets/py/tz-bokeh-show-server.txt!}
51 |
52 | {!docs/snippets/py/tz-bokeh-show-notebook.txt!}
53 |
54 |
55 | ???+ note "What's special for images?"
56 | **Tooltips**
57 |
58 | For text, the tooltip shows the original value.
59 |
60 | For images, the tooltip embeds the image based on URL.
61 |
62 | - images in the local file system shall be served through [`python -m http.server`](https://docs.python.org/3/library/http.server.html).
    - they can then be accessed through `http://localhost:<port>/relative/path/to/file`.
64 |
65 | **Search**
66 |
67 | For text, the search widget is based on regular expressions.
68 |
69 | For images, the search widget is based on vector cosine similarity.
70 |
71 | - the `dataset` has remembered the `vectorizer` under the hood and passed it to the `annotator`.
72 | - {== please [**let us know**](https://github.com/phurwicz/hover/issues/new) if you think there's a better way to search images in this case. ==}
73 |
74 |
75 | {!docs/snippets/html/stylesheet.html!}
76 |
--------------------------------------------------------------------------------
/notebooks/archive-prototype/Programmatic-Event.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%load_ext autoreload\n",
10 | "\n",
11 | "%autoreload 2"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import sys\n",
21 | "sys.path.append('../')"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "### Programmatically Trigger Events"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "from bokeh.io import output_notebook, show\n",
38 | "from bokeh.plotting import figure\n",
39 | "from bokeh.models import ColumnDataSource\n",
40 | "\n",
41 | "output_notebook()"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {
48 | "tags": []
49 | },
50 | "outputs": [],
51 | "source": [
52 | "from bokeh.models import Button\n",
53 | "from bokeh.events import ButtonClick, MenuItemClick\n",
54 | "from datetime import datetime\n",
55 | "import time\n",
56 | "\n",
57 | "button = Button(label=\"Click me\")\n",
58 | "button_click = ButtonClick(button)\n",
59 | "\n",
60 | "def callback(event):\n",
61 | " print(f\"Clicked at {datetime.now()}\", end=\"\\r\")\n",
62 | "\n",
63 | "button.on_click(callback)\n",
64 | "\n",
65 | "for i in range(10):\n",
66 | " button._trigger_event(button_click)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": []
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": []
82 | }
83 | ],
84 | "metadata": {
85 | "kernelspec": {
86 | "display_name": "Python 3 (ipykernel)",
87 | "language": "python",
88 | "name": "python3"
89 | },
90 | "language_info": {
91 | "codemirror_mode": {
92 | "name": "ipython",
93 | "version": 3
94 | },
95 | "file_extension": ".py",
96 | "mimetype": "text/x-python",
97 | "name": "python",
98 | "nbconvert_exporter": "python",
99 | "pygments_lexer": "ipython3",
100 | "version": "3.9.7"
101 | }
102 | },
103 | "nbformat": 4,
104 | "nbformat_minor": 4
105 | }
106 |
--------------------------------------------------------------------------------
/docs/snippets/markdown/readme/2-features.en.md:
--------------------------------------------------------------------------------
1 | ## :sparkles: Features
2 |
3 | > **It's fast because it labels data in bulk.**
4 |
5 | :telescope: A semantic scatter plot of your data for labeling, equipped with
6 |
7 |
8 | Tooltip for each point on mouse hover
9 |
10 |
11 |
12 |
13 | Table view for inspecting selected points
14 |
15 |
16 |
17 |
18 | Toggle buttons that clearly distinguish data subsets
19 |
20 |
21 |
22 |
23 | Search widgets for ad-hoc data highlight
24 |
25 |
26 |
27 | > **It's accurate because multiple components work together.**
28 |
29 | :microscope: Supplementary views to use in conjunction with the annotator, including
30 |
31 |
32 | `Finder`: filter data by search criteria
33 |
34 |
35 |
36 |
37 | `SoftLabel`: active learning by in-the-loop model prediction score
38 |
39 |
40 |
41 |
42 | `Snorkel`: custom functions for labeling and filtering
43 |
44 |
45 |
46 | > **It's flexible (and fun!) because the process never gets old.**
47 |
48 | :toolbox: Additional tools and options that allow you to
49 |
50 |
51 | Go to higher dimensions (3D? 4D?) and choose your xy-axes
52 |
53 |
54 |
55 |
56 | Consecutively select across areas, dimensions, and views
57 |
58 |
59 |
60 |
61 | Kick outliers and fix mistakes
62 |
63 |
64 |
--------------------------------------------------------------------------------
/notebooks/archive-prototype/Dynamic-Widget.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%load_ext autoreload\n",
10 | "\n",
11 | "%autoreload 2"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import sys\n",
21 | "sys.path.append('../../')"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "### Dynamically Change Widget Behavior"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "from bokeh.models import Selection, RangeSlider, Button, Dropdown\n",
38 | "from bokeh.layouts import row, column\n",
39 | "from bokeh.io import output_notebook, show\n",
40 | "from hover.utils.bokeh_helper import servable\n",
41 | "\n",
42 | "output_notebook()\n",
43 | "\n",
44 | "slider = RangeSlider(start=-1.0, end=1.0, value=(-0.5, 0.5), step=0.01)\n",
45 | "slider.on_change(\"value\", lambda attr, old, new: print(f\"Range changed to {slider.value}\"))\n",
46 | "\n",
47 | "@servable()\n",
48 | "def burner():\n",
49 | " arr = ['1', '2', '3']\n",
50 | " dropdown = Dropdown(\n",
51 | " label=\"Select Element\",\n",
52 | " button_type=\"primary\",\n",
53 | " menu=arr,\n",
54 | " )\n",
55 | " \n",
56 | " button = Button(label=\"Click Me\", height=100)\n",
57 | " def button_callcack(event):\n",
58 | " dropdown.menu.append(str(int(dropdown.menu[-1]) + 1))\n",
59 | " print(f\"Button Clicked! Got menu: {dropdown.menu}\")\n",
60 | " button.on_click(button_callcack)\n",
61 | "\n",
62 | " return column(dropdown, button)\n",
63 | "\n",
64 | "handle = burner()\n",
65 | "show(handle)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": []
74 | }
75 | ],
76 | "metadata": {
77 | "kernelspec": {
78 | "display_name": "Python 3 (ipykernel)",
79 | "language": "python",
80 | "name": "python3"
81 | },
82 | "language_info": {
83 | "codemirror_mode": {
84 | "name": "ipython",
85 | "version": 3
86 | },
87 | "file_extension": ".py",
88 | "mimetype": "text/x-python",
89 | "name": "python",
90 | "nbconvert_exporter": "python",
91 | "pygments_lexer": "ipython3",
92 | "version": "3.9.7"
93 | }
94 | },
95 | "nbformat": 4,
96 | "nbformat_minor": 4
97 | }
98 |
--------------------------------------------------------------------------------
/fixture_module/image_vector_net/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Example importable module holding customized ingredients of a workflow with hover.
3 | Specifically for 3-channel image data in URLs.
4 | """
5 |
6 | import os
7 | import re
8 | import numpy as np
9 | import wrappy
10 | import requests
11 | from PIL import Image
12 | from io import BytesIO
13 |
14 |
15 | DIR_PATH = os.path.dirname(__file__)
16 | RAW_CACHE_PATH = os.path.join(DIR_PATH, "raws.pkl")
17 | IMG_CACHE_PATH = os.path.join(DIR_PATH, "imgs.pkl")
18 | VEC_CACHE_PATH = os.path.join(DIR_PATH, "vecs.pkl")
19 |
20 |
@wrappy.memoize(
    cache_limit=50000,
    return_copy=False,
    persist_path=RAW_CACHE_PATH,
    persist_batch_size=100,
)
def url_to_content(url):
    """
    Fetch the raw response body behind a URL.

    Memoized (with on-disk persistence) so each URL is downloaded at most once.
    """
    return requests.get(url).content
33 |
34 |
@wrappy.memoize(
    cache_limit=50000,
    return_copy=False,
    persist_path=IMG_CACHE_PATH,
    persist_batch_size=100,
)
def url_to_image(url):
    """
    Fetch the image behind a URL as a 3-channel (RGB) PIL Image.
    """
    raw_bytes = url_to_content(url)
    return Image.open(BytesIO(raw_bytes)).convert("RGB")
47 |
48 |
def get_vectorizer():
    """
    Build and return a memoized URL-to-vector function for images.

    Imports torch / efficientnet_pytorch / torchvision lazily so that the
    module can be imported without these heavy dependencies installed.
    """
    import torch
    from efficientnet_pytorch import EfficientNet
    from torchvision import transforms

    # EfficientNet is a series of pre-trained models
    # https://github.com/lukemelas/EfficientNet-PyTorch
    model = EfficientNet.from_pretrained("efficientnet-b0")
    # inference mode only; no training happens here
    model.eval()

    # standard transformations for ImageNet-trained models
    # (mean/std below are the conventional ImageNet normalization constants)
    tfms = transforms.Compose(
        [
            transforms.Resize(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )

    # memoization can be useful if the function takes a while to run, which is common for images
    @wrappy.memoize(
        cache_limit=50000,
        return_copy=False,
        persist_path=VEC_CACHE_PATH,
        persist_batch_size=100,
    )
    def vectorizer(url):
        """
        Turn an image URL into a 1-D numpy vector.

        Using logits on ImageNet-1000 classes.
        """
        # (1, C, H, W) batch of a single image
        img = tfms(url_to_image(url)).unsqueeze(0)

        with torch.no_grad():
            outputs = model(img)

        # flatten to a 1-D feature vector (length 1000: one logit per class)
        return outputs.detach().numpy().flatten()

    return vectorizer
87 |
88 |
def get_architecture():
    """
    Return the neural net architecture class used by this example module.
    """
    # lazy import keeps the module importable without hover installed
    from hover.utils import common_nn

    return common_nn.LogisticRegression
93 |
94 |
def get_state_dict_path():
    """
    Locate the model weights file, kept alongside this module.
    """
    weights_filename = "model.pt"
    return os.path.join(DIR_PATH, weights_filename)
97 |
--------------------------------------------------------------------------------
/docs/pages/guides/g1-datatype-audio.md:
--------------------------------------------------------------------------------
1 | > `hover` supports bulk-labeling audios through their URLs (which can be local).
2 | >
3 | > :bulb: Let's do a quickstart for audios and note what's different from texts.
4 |
5 | {!docs/snippets/markdown/tutorial-required.md!}
6 | {!docs/snippets/html/thebe.html!}
7 | {!docs/snippets/markdown/binder-kernel.md!}
8 |
9 | ## **Dataset for audios**
10 |
11 | `hover` handles audios through their URL addresses. URLs are strings which can be easily stored, hashed, and looked up against. They are also convenient for rendering tooltips in the annotation interface.
12 |
13 | Similarly to `SupervisableTextDataset`, we can build one for audios:
14 |
15 |
16 | {!docs/snippets/py/g1-0-dataset-audio.txt!}
17 |
18 | {!docs/snippets/py/t0-0a-dataset-text-print.txt!}
19 |
20 |
21 | ## **Vectorizer for audios**
22 |
23 | We can follow a `URL -> content -> audio array -> vector` path.
24 |
25 |
26 | {!docs/snippets/py/g0-1-url-to-content.txt!}
27 |
28 |
29 |
30 | {!docs/snippets/py/g1-1-url-to-audio.txt!}
31 |
32 |
33 | {!docs/snippets/markdown/wrappy-cache.md!}
34 |
35 |
36 | {!docs/snippets/py/g1-2-audio-vectorizer.txt!}
37 |
38 |
39 | ## **Embedding and Plot**
40 |
41 | This is exactly the same as in the quickstart, just switching to audio data:
42 |
43 |
44 | {!docs/snippets/py/t0-2-reduction.txt!}
45 |
46 |
47 |
48 | {!docs/snippets/py/t0-3-simple-annotator.txt!}
49 |
50 | {!docs/snippets/py/tz-bokeh-show-server.txt!}
51 |
52 | {!docs/snippets/py/tz-bokeh-show-notebook.txt!}
53 |
54 |
55 | ???+ note "What's special for audios?"
56 | **Tooltips**
57 |
58 | For text, the tooltip shows the original value.
59 |
60 | For audios, the tooltip embeds the audio based on URL.
61 |
62 | - audios in the local file system shall be served through [`python -m http.server`](https://docs.python.org/3/library/http.server.html).
63 | - they can then be accessed through `http://localhost:<port>/relative/path/to/file` (note: plain HTTP, with the port number that `http.server` reports).
64 |
65 | **Search**
66 |
67 | For text, the search widget is based on regular expressions.
68 |
69 | For audios, the search widget is based on vector cosine similarity.
70 |
71 | - the `dataset` has remembered the `vectorizer` under the hood and passed it to the `annotator`.
72 | - {== please [**let us know**](https://github.com/phurwicz/hover/issues/new) if you think there's a better way to search audios in this case. ==}
73 | - dynamic time warping, due to its running time (> 10ms per pair for small 100x10 MFCC arrays), is too slow for search.
74 | - we are experimenting with subsampled signals and pre-selected data points (by vector similarity, for example).
75 |
76 |
77 | {!docs/snippets/html/stylesheet.html!}
78 |
--------------------------------------------------------------------------------
/docs/pages/tutorial/t5-finder-filter.md:
--------------------------------------------------------------------------------
1 | > `Finder` is an `explorer` focused on **search**.
2 | >
3 | > :speedboat: It can help you select points using a **filter** based on search results.
4 |
5 | {!docs/snippets/html/thebe.html!}
6 | {!docs/snippets/markdown/binder-kernel.md!}
7 | {!docs/snippets/markdown/component-tutorial.md!}
8 | {!docs/snippets/markdown/local-dependency.md!}
9 | {!docs/snippets/markdown/local-dep-text.md!}
10 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!}
11 |
12 | ## **More Angles -> Better Results**
13 |
14 | `Explorer`s other than `annotator` are specialized in finding additional insight to help us understand the data. Having them juxtaposed with `annotator`, we can label more accurately, more confidently, and even faster.
15 |
16 | ## **Preparation**
17 |
18 | {!docs/snippets/markdown/dataset-prep.md!}
19 |
20 | ## **Filter Toggles**
21 |
22 | When we use lasso or polygon select, we are describing a shape. Sometimes that shape is not accurate enough -- we need extra conditions to narrow down the data.
23 |
24 | Just like `annotator`, `finder` has search widgets. But unlike `annotator`, `finder` has a **filter toggle** which can directly **intersect** *what we selected* with *what meets the search criteria*.
25 |
26 | {!docs/snippets/markdown/jupyterlab-js-issue.md!}
27 |
28 |
29 | {!docs/snippets/py/tz-bokeh-notebook-common.txt!}
30 |
31 | {!docs/snippets/py/tz-bokeh-notebook-remote.txt!}
32 |
33 | {!docs/snippets/py/t5-0-finder-filter.txt!}
34 |
35 |
36 | Next to the search widgets is a checkbox. The filter will stay active as long as the checkbox is.
37 |
38 | ???+ info "How the filter interacts with selection options"
39 | Selection options apply before filters.
40 |
41 | `hover` memorizes your pre-filter selections, so you can keep selecting without having to tweak the filter toggle.
42 |
43 | - Example:
44 | - suppose you have previously selected a set of points called `A`.
45 | - then you toggled a filter `f`, giving you `A∩F` where `F` is the set satisfying `f`.
46 | - now, with selection option "union", you select a set of points called `B`.
47 | - your current selection will be `(A ∪ B) ∩ F`, i.e. `(A ∩ F) ∪ (B ∩ F)`.
48 | - similarly, you would get `(A ∩ B) ∩ F` for "intersection" and `(A ∖ B) ∩ F` for "difference".
49 | - if you untoggle the filter now, your selection would be `A ∪ B`.
50 |
51 | - In the later tutorials, we shall see multiple filters in action together.
52 | - spoiler: `F = F1 ∩ F2 ∩ ...` and that's it!
53 |
54 | ## **Stronger Highlight for Search**
55 |
56 | `finder` also colors data points based on search criteria, making them easier to find.
57 |
58 | {!docs/snippets/markdown/jupyterlab-js-issue.md!}
59 |
60 |
61 | {!docs/snippets/py/t5-1-finder-figure.txt!}
62 |
63 |
64 | {!docs/snippets/html/stylesheet.html!}
65 |
--------------------------------------------------------------------------------
/hover/utils/misc.py:
--------------------------------------------------------------------------------
1 | """Mini-functions that do not belong elsewhere."""
2 | from datetime import datetime
3 | from abc import ABC, abstractmethod
4 |
5 |
def current_time(template="%Y%m%d %H:%M:%S"):
    """Render the present moment as a string in the given strftime template."""
    now = datetime.now()
    return now.strftime(template)
8 |
9 |
class BaseUnionFind(ABC):
    """
    ???+ note "Data attached to union-find."
        Abstract union-find (disjoint-set) node carrying a data payload.

        Subclasses define how `data` is stored/retrieved (a `data` property)
        and how `union` merges two trees; this base provides parent/count
        bookkeeping and a path-compressing `find`.
    """

    def __init__(self, data):
        # payload; exposed through a `data` property defined by subclasses
        self._data = data
        # a None parent marks a root node
        self._parent = None
        # number of nodes in this tree; only meaningful on root nodes
        self._count = 1

    def __repr__(self):
        # delegates to the payload; `data` is supplied by subclasses
        return self.data.__repr__()

    @property
    def count(self):
        # non-root nodes defer to their root's count
        if self.parent is None:
            return self._count
        return self.find().count

    @count.setter
    def count(self, count):
        self._count = count

    @property
    def parent(self):
        return self._parent

    @parent.setter
    def parent(self, other):
        # parents must be union-find nodes themselves
        assert isinstance(other, BaseUnionFind)
        self._parent = other

    def find(self):
        # locate the root, compressing the path along the way so
        # future lookups are near-constant time
        if self.parent:
            self.parent = self.parent.find()
            return self.parent
        return self

    @abstractmethod
    def union(self, other):
        """Merge the tree containing `other` with the tree containing `self`."""
        pass
51 |
52 |
class NodeUnionFind(BaseUnionFind):
    """
    ???+ note "Each node keeps its own data."
    """

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        self._data = data

    def union(self, other):
        # union by size: attach the smaller tree under the larger root
        this_root, that_root = self.find(), other.find()
        # already in the same set -> nothing to do
        if this_root is that_root:
            return

        smaller, larger = (
            (this_root, that_root)
            if this_root.count < that_root.count
            else (that_root, this_root)
        )
        larger.count += smaller.count
        smaller.parent = larger
79 |
80 |
class RootUnionFind(BaseUnionFind):
    """
    ???+ note "Union always uses left as root. Each node looks up its root for data."
    """

    @property
    def data(self):
        # only the root holds the authoritative payload
        root = self.find()
        if self is root:
            return self._data
        return root.data

    @data.setter
    def data(self, data):
        # always write through to the root, which owns the payload
        self.find()._data = data

    def union(self, other):
        root = self.find()
        other_root = other.find()

        # guard against self-union: without this, root.parent = root would
        # create a cycle (infinite recursion in find()) and the root's own
        # data would be cleared
        if root is other_root:
            return

        # the absorbed root gives up its payload
        other_root.data = None
        root.count += other_root.count
        other_root.parent = root
108 |
--------------------------------------------------------------------------------
/notebooks/archive-prototype/Editing-Datatable.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "2bf553ea-eb52-49f8-ae88-da179ca9e793",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from bokeh.models import (\n",
11 | " Button,\n",
12 | " ColumnDataSource,\n",
13 | " DataTable,\n",
14 | " TableColumn\n",
15 | ")\n",
16 | "from bokeh.layouts import column\n",
17 | "from hover.utils.bokeh_helper import servable\n",
18 | "\n",
19 | "@servable()\n",
20 | "def burner():\n",
21 | " df = pd.DataFrame({\n",
22 | " 'f0': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n",
23 | " 'f1': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n",
24 | " 'f2': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n",
25 | " 'f3': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n",
26 | " })\n",
27 | " forward_button = Button(label=\"df-to-table\")\n",
28 | " backward_button = Button(label=\"table-to-df\")\n",
29 | " \n",
30 | " sel_source = ColumnDataSource(dict())\n",
31 | " show_columns = ['f0', 'f2']\n",
32 | " sel_columns = [TableColumn(field=_col, title=_col) for _col in show_columns]\n",
33 | " sel_table = DataTable(source=sel_source, columns=sel_columns, selectable=\"checkbox\", editable=True)\n",
34 | " \n",
35 | " def df_to_table(event):\n",
36 | " sel_source.data = df.to_dict(orient=\"list\")\n",
37 | " \n",
38 | " def table_to_df(event):\n",
39 | " indices = sel_source.selected.indices\n",
40 | " for _col in show_columns:\n",
41 | " _values = sel_source.data[_col]\n",
42 | " _patches = [_values[i] for i in indices]\n",
43 | " df.loc[indices, _col] = _patches\n",
44 | " print(_col, indices, len(_patches))\n",
45 | " \n",
46 | " forward_button.on_click(df_to_table)\n",
47 | " backward_button.on_click(table_to_df)\n",
48 | " return column(forward_button, backward_button, sel_table)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "id": "1446d4c0-8da7-4101-9816-9b30297036aa",
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "handle = burner()\n",
59 | "show(handle)"
60 | ]
61 | }
62 | ],
63 | "metadata": {
64 | "kernelspec": {
65 | "display_name": "Python 3 (ipykernel)",
66 | "language": "python",
67 | "name": "python3"
68 | },
69 | "language_info": {
70 | "codemirror_mode": {
71 | "name": "ipython",
72 | "version": 3
73 | },
74 | "file_extension": ".py",
75 | "mimetype": "text/x-python",
76 | "name": "python",
77 | "nbconvert_exporter": "python",
78 | "pygments_lexer": "ipython3",
79 | "version": "3.9.7"
80 | }
81 | },
82 | "nbformat": 4,
83 | "nbformat_minor": 5
84 | }
85 |
--------------------------------------------------------------------------------
/hover/core/representation/trajectory.py:
--------------------------------------------------------------------------------
1 | """
2 | Trajectory interpolation for sequences of vectors.
3 | """
4 | from scipy import interpolate
5 | import numpy as np
6 |
7 |
def spline(arr_per_dim, points_per_step=1, splprep_kwargs=None):
    """
    Fit a spline to a curve and evaluate it at a specified density of points.

    - param arr_per_dim(numpy.ndarray): dim-by-points array representing the part of the curve in each dimension.

    - param points_per_step(int): number of points interpolated in between each given point on the curve.

    - param splprep_kwargs(dict): keyword arguments to the splprep() function for fitting the spline in SciPy.
      A user-supplied spline order `k` is honored, capped at (number of points - 1);
      the default order remains 3. The dict is never mutated.
    """

    # cast to array if appropriate
    if isinstance(arr_per_dim, list):
        arr_per_dim = np.array(arr_per_dim)

    assert points_per_step >= 1, "Need at least one point per step"
    # copy so that the caller's dict is never mutated
    splprep_kwargs = dict(splprep_kwargs or {})

    # check the number of given points in the curve
    num_given_points = arr_per_dim[0].shape[0]
    assert num_given_points > 1, "Need at least two points to fit a line"

    # check if two consecutive vectors are almost identical, and apply a noise in that case
    # (splprep fails on coincident points)
    # note that we do not modify arr_per_dim in place
    # and that the noise only goes up in a greedy random-walk manner
    noise_arr = np.zeros((len(arr_per_dim), num_given_points))
    for i in range(1, num_given_points):
        prev_vec, vec = arr_per_dim[:, i - 1] + noise_arr[:, i - 1], arr_per_dim[:, i]
        while np.allclose(vec + noise_arr[:, i], prev_vec):
            noise_arr[:, i] += np.random.normal(loc=0.0, scale=1e-6, size=vec.shape)

    # cap the spline order at (num points - 1); honor a user-specified k
    # instead of silently overwriting it (default order stays 3)
    splprep_kwargs["k"] = min(splprep_kwargs.get("k", 3), num_given_points - 1)
    tck, u = interpolate.splprep(arr_per_dim + noise_arr, **splprep_kwargs)

    # evaluate points_per_step points per segment, plus the final endpoint
    points_to_eval = []
    for i in range(0, u.shape[0] - 1):
        _pts = np.linspace(u[i], u[i + 1], points_per_step, endpoint=False)
        points_to_eval.append(_pts)
    points_to_eval.append([u[-1]])
    points_to_eval = np.concatenate(points_to_eval)

    traj_per_dim = interpolate.splev(points_to_eval, tck)
    return traj_per_dim
53 |
54 |
def manifold_spline(seq_arr, **kwargs):
    """
    Fit a spline to every sequence of points in a manifold.
    - param seq_arr: L-sequence of M-by-N arrays each containing vectors matched by index.
    :type seq_arr: numpy.ndarray
    """
    # L (number of steps) is not needed explicitly
    _num_steps, num_points, num_dims = seq_arr.shape

    # per point: its L-step track is an (L, N) slice; transpose to the
    # (N, L) dim-by-points layout that spline() expects
    # stacking gives an M-by-N-by-T array where T = f(L, kwargs)
    traj_arr = np.array(
        [spline(seq_arr[:, point_idx, :].T, **kwargs) for point_idx in range(num_points)]
    )

    # rearrange (M, N, T) -> (T, M, N)
    return np.transpose(traj_arr, (2, 0, 1))
76 |
--------------------------------------------------------------------------------
/hover/utils/datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Submodule that loads and preprocesses public datasets into formats that work smoothly.
3 | """
4 |
5 | from sklearn.datasets import fetch_20newsgroups
6 | from hover import module_config
7 | import re
8 |
9 |
def clean_string(text, sub_from=r"[^a-zA-Z0-9\ ]", sub_to=r" ", ):
    """Replace characters matching `sub_from` by `sub_to`, then collapse runs of spaces."""
    replaced = re.sub(sub_from, sub_to, text)
    return re.sub(r" +", r" ", replaced)
14 |
15 |
def newsgroups_dictl(
    data_home="~/scikit_learn_data",
    to_remove=("headers", "footers", "quotes"),
    text_key="text",
    label_key="label",
    label_mapping=None,
):
    """
    Load the 20 Newsgroups dataset into a list of dicts, deterministically.
    """
    label_mapping = label_mapping or dict()
    dataset = dict()
    label_set = set()
    for _subset in ["train", "test"]:
        # load the subset and transform it into a list of dicts
        _bunch = fetch_20newsgroups(
            data_home=data_home, subset=_subset, random_state=42, remove=to_remove
        )
        _entries = []
        for _raw_text, _target in zip(_bunch.data, _bunch.target):
            _cleaned = clean_string(_raw_text)
            _label = _bunch.target_names[_target]
            _label = label_mapping.get(_label, _label)

            # skip near-empty documents (fewer than 6 alphanumeric characters)
            if len(re.sub(r"[^a-zA-Z0-9]", r"", _cleaned)) <= 5:
                continue
            label_set.add(_label)
            _entries.append({text_key: _cleaned, label_key: _label})

        # add to dataset
        dataset[_subset] = _entries

    # deterministic label <-> index mappings, plus the ABSTAIN sentinel
    label_decoder = dict(enumerate(sorted(label_set)))
    label_decoder[module_config.ABSTAIN_ENCODED] = module_config.ABSTAIN_DECODED
    label_encoder = {value: idx for idx, value in label_decoder.items()}
    return dataset, label_encoder, label_decoder
55 |
56 |
def newsgroups_reduced_dictl(**kwargs):
    """
    Load the 20 Newsgroups dataset but reduce categories using a custom mapping.
    """
    # coarse category -> original newsgroups it absorbs
    coarse_categories = {
        "religion": [
            "alt.atheism",
            "soc.religion.christian",
            "talk.religion.misc",
        ],
        "computer": [
            "comp.graphics",
            "comp.os.ms-windows.misc",
            "comp.sys.ibm.pc.hardware",
            "comp.sys.mac.hardware",
            "comp.windows.x",
            "sci.crypt",
            "sci.electronics",
        ],
        "forsale": ["misc.forsale"],
        "recreation": [
            "rec.autos",
            "rec.motorcycles",
            "rec.sport.baseball",
            "rec.sport.hockey",
        ],
        "med": ["sci.med"],
        "space": ["sci.space"],
        "politics": [
            "talk.politics.guns",
            "talk.politics.mideast",
            "talk.politics.misc",
        ],
    }
    # invert to the newsgroup -> coarse category mapping that newsgroups_dictl expects
    kwargs["label_mapping"] = {
        _newsgroup: _coarse
        for _coarse, _newsgroups in coarse_categories.items()
        for _newsgroup in _newsgroups
    }
    return newsgroups_dictl(**kwargs)
85 |
--------------------------------------------------------------------------------
/docs/pages/tutorial/t2-bokeh-app.md:
--------------------------------------------------------------------------------
1 | > `hover` creates a [`bokeh` server app](https://docs.bokeh.org/en/latest/docs/user_guide/server.html) to deliver its annotation interface.
2 | >
3 | > :rocket: This app can be served flexibly based on your needs.
4 |
5 | {!docs/snippets/html/stylesheet.html!}
6 |
7 | ## **Prerequisites**
8 |
9 | Suppose that we've already used a `recipe` to create a `handle` function like in the [quickstart](../t0-quickstart/#apply-labels).
10 |
11 | ??? info "Recap from the tutorials before"
12 | - the `handle` is a function which renders plot elements on a [`bokeh` document](https://docs.bokeh.org/en/latest/docs/reference/document.html).
13 |
14 | ## **Option 1: Jupyter**
15 |
16 | We are probably familiar with this now:
17 |
18 | ```Python
19 | from bokeh.io import show, output_notebook
20 | output_notebook()
21 | show(handle) # notebook_url='http://localhost:8888'
22 | ```
23 |
24 | ???+ tip "Pros & Cons"
25 | This inline Jupyter mode can integrate particularly well with your notebook workflow. For example, when you are (tentatively) done with annotation, the `SupervisableDataset` can be accessed directly in the notebook, rather than exported to a file and loaded back.
26 |
27 | The inline mode is highly recommended for local usage.
28 |
29 | - On the contrary, with a remote Jupyter server, it may have trouble displaying the plots.
30 |
31 | - this can be due to failure of loading JavaScript libraries or accessing implicit bokeh server ports.
32 |
33 | ## **Option 2: Command Line**
34 |
35 | [`bokeh serve`](https://docs.bokeh.org/en/latest/docs/user_guide/server.html) starts an explicit `tornado` server from the command line:
36 |
37 | ```bash
38 | bokeh serve my-app.py
39 | ```
40 |
41 | ```Python
42 | # my-app.py
43 |
44 | # handle = ...
45 |
46 | from bokeh.io import curdoc
47 | doc = curdoc()
48 | handle(doc)
49 | ```
50 |
51 | ???+ tip "Pros & Cons"
52 | This is the "classic" approach to run a `bokeh` server. Remote access is simple through parameters [**specified here**](https://docs.bokeh.org/en/latest/docs/reference/command/subcommands/serve.html). The bokeh plot tools are mobile-friendly too -- this means you can host a server, e.g. an http-enabled cloud virtual machine, and annotate from a tablet.
53 |
54 | The command line mode is less interactive, since Python objects in the script cannot be accessed on the fly.
55 |
56 | ## **Option 3: Anywhere in Python**
57 |
58 | It is possible to [embed the app](https://docs.bokeh.org/en/latest/docs/user_guide/server.html#embedding-bokeh-server-as-a-library) in regular Python:
59 |
60 | ```Python
61 | from bokeh.server.server import Server
62 | server = Server({'/my-app': handle})
63 | server.start()
64 | ```
65 |
66 | ???+ tip "Pros & Cons"
67 | This embedded mode is a go-to for serving within a greater application.
68 |
69 | Also note that each command line argument for `bokeh serve` has a corresponding keyword argument to `Server()`.
70 |
71 | For instance, `bokeh serve --allow-websocket-origin=*` in the command line mirrors `Server(*args, allow_websocket_origin='*')` in Python.
72 |
73 | The embedded mode gives you the most control of your server.
74 |
--------------------------------------------------------------------------------
/hover/recipes/stable.py:
--------------------------------------------------------------------------------
1 | """
2 | ???+ note "High-level functions to produce an interactive annotation interface."
3 | Stable recipes whose function signatures should almost never change in the future.
4 | """
5 | from hover.utils.bokeh_helper import servable
6 | from .subroutine import recipe_layout, standard_annotator, standard_finder
7 |
8 |
9 | @servable(title="Simple Annotator")
10 | def simple_annotator(dataset, **kwargs):
11 | """
12 | ???+ note "Display the dataset with on a 2D map for annotation."
13 |
14 | | Param | Type | Description |
15 | | :-------- | :------- | :----------------------------------- |
16 | | `dataset` | `SupervisableDataset` | the dataset to link to |
17 | | `**kwargs` | | kwargs to forward to each Bokeh figure |
18 |
19 | Expected visual layout:
20 |
21 | | SupervisableDataset | BokehDataAnnotator |
22 | | :------------------ | :----------------- |
23 | | manage data subsets | make annotations |
24 | """
25 | dataset.setup_bokeh_elements(reset=True)
26 | layout, _ = _simple_annotator(dataset, **kwargs)
27 | return layout
28 |
29 |
def _simple_annotator(dataset, layout_style="horizontal", **kwargs):
    """
    ???+ note "Cousin of simple_annotator which exposes objects in the layout."
    """
    annotator = standard_annotator(dataset, **kwargs)
    sidebar = dataset.view()

    objects = {"dataset": dataset, "annotator": annotator, "sidebar": sidebar}
    layout = recipe_layout(sidebar, annotator.view(), style=layout_style)
    return layout, objects
41 |
42 |
43 | @servable(title="Linked Annotator")
44 | def linked_annotator(dataset, **kwargs):
45 | """
46 | ???+ note "Display the dataset on a 2D map in two views, one for search and one for annotation."
47 |
48 | | Param | Type | Description |
49 | | :-------- | :------- | :----------------------------------- |
50 | | `dataset` | `SupervisableDataset` | the dataset to link to |
51 | | `**kwargs` | | kwargs to forward to each Bokeh figure |
52 |
53 | Expected visual layout:
54 |
55 | | SupervisableDataset | BokehDataFinder | BokehDataAnnotator |
56 | | :------------------ | :------------------ | :----------------- |
57 | | manage data subsets | search -> highlight | make annotations |
58 | """
59 | dataset.setup_bokeh_elements(reset=True)
60 | layout, _ = _linked_annotator(dataset, **kwargs)
61 | return layout
62 |
63 |
def _linked_annotator(dataset, layout_style="horizontal", **kwargs):
    """
    ???+ note "Cousin of linked_annotator which exposes objects in the layout."
    """
    finder = standard_finder(dataset, **kwargs)
    annotator = standard_annotator(dataset, **kwargs)

    # keep selections in sync between the two plots, subset by subset
    subset_pairs = {_subset: _subset for _subset in ("raw", "train", "dev", "test")}
    annotator.link_selection(finder, subset_pairs)

    sidebar = dataset.view()
    layout = recipe_layout(sidebar, finder.view(), annotator.view(), style=layout_style)

    # expose the building blocks so callers can customize further
    handles = {
        "dataset": dataset,
        "annotator": annotator,
        "finder": finder,
        "sidebar": sidebar,
    }
    return layout, handles
87 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py38, py39, py310
3 |
4 | [gh-actions]
5 | python =
6 | 3.8: py38
7 | 3.9: py39
8 | 3.10: py310
9 |
10 | [testenv:test_api]
11 | deps =
12 | # standard testing
13 | pytest
14 | coverage
15 | # text generation
16 | faker
17 | # pseudo-random str-to-float
18 | shaffle
19 | # common NLP and vectorizers
20 | spacy
21 | # dimensionality reduction
22 | ivis[cpu]>=1.7
23 | tensorflow>=2.9
24 | # distant supervision
25 | snorkel>=0.9.8
26 | # utility wrappers
27 | wrappy>=0.2.6
28 | # exporting excel files
29 | openpyxl
30 |
31 | commands =
32 | # get extra dependencies that deps do not cover
33 | python -m spacy download en_core_web_md
34 | # run tests and get coverage report
35 | coverage run --source=./hover -m pytest -m "not benchmark" {posargs}
36 | coverage xml -o cobertura.xml
37 |
38 | install_command =
39 | python -m pip install --upgrade {opts} {packages}
40 |
41 | [testenv:test_api_lite]
42 | # low-dependency fast test suite for compatibility
43 | deps =
44 | pytest
45 | faker
46 | shaffle
47 | spacy
48 | snorkel>=0.9.8
49 | wrappy>=0.2.6
50 | openpyxl
51 |
52 | commands =
53 | python -m spacy download en_core_web_md
54 | # run tests with "lite" mark
55 | pytest -m lite {posargs}
56 |
57 | install_command =
58 | python -m pip install --upgrade {opts} {packages}
59 |
60 | [testenv:test_api_benchmark]
61 | deps =
62 | pytest
63 | faker
64 | shaffle
65 | wrappy>=0.2.6
66 |
# tox does not run commands through a shell, so `export` is not available here;
# set the thread limits for numpy and polars via setenv instead
setenv =
    OMP_NUM_THREADS = 1
    POLARS_MAX_THREADS = 1

commands =
    # run tests with "benchmark" mark
    pytest -m benchmark {posargs}
72 |
73 | install_command =
74 | python -m pip install --upgrade {opts} {packages}
75 |
76 | [testenv:test_doc_scripts]
77 | allowlist_externals =
78 | git
79 | cp
80 | rm
81 | deps = -rdocs/pipelines/requirements-doc-scripts.txt
82 |
83 | commands =
84 | python -m spacy download en_core_web_md
85 | git clone https://github.com/phurwicz/hover-binder
86 | cp -r hover-binder/local_lib ./local_lib
87 | cp -r hover-binder/custom_cache ./custom_cache
88 | rm -rf hover-binder
89 | python docs/pipelines/check_scripts.py
90 | rm -rf local_lib
91 | rm -rf custom_cache
92 |
93 | [testenv:test_notebook_generation]
94 | allowlist_externals =
95 | mkdir
96 | deps = -rdocs/pipelines/requirements-doc-scripts.txt
97 |
98 | commands =
99 | python -m spacy download en_core_web_md
100 | mkdir custom_cache
101 | python docs/pipelines/generate_notebooks.py
102 |
103 | [testenv:install]
104 | commands =
105 | python setup.py install {posargs}
106 |
107 | [testenv:publish]
108 | allowlist_externals =
109 | rm
110 | deps =
111 | twine
112 |
113 | commands =
114 | python setup.py sdist bdist_wheel
115 | twine check dist/*
116 | twine upload dist/*
117 | rm -rf build dist hover.egg-info
118 |
119 | [flake8]
120 | ignore =
121 | # black breaks these
122 | E203,
123 | E501,
124 | W503,
125 | per-file-ignores =
126 | # "imported but unused": intended in __init__ files
127 | __init__.py: F401
128 | conftest.py: E402
129 | exclude = .git,__pycache__,docs,build,dist
130 | max-complexity = 10
131 |
--------------------------------------------------------------------------------
/tests/utils/test_misc.py:
--------------------------------------------------------------------------------
1 | from hover.utils.misc import current_time, NodeUnionFind, RootUnionFind
2 | import pytest
3 |
4 |
@pytest.mark.lite
def test_current_time():
    """current_time should yield a string-formatted timestamp."""
    stamp = current_time()
    assert isinstance(stamp, str)
9 |
10 |
def node_data_from_uf_array(arr):
    """Subroutine for testing utility: collect the data held by each node."""
    data_values = []
    for node in arr:
        data_values.append(node.data)
    return data_values
14 |
15 |
def find_data_from_uf_array(arr):
    """Subroutine for testing utility: collect the root's data for each node."""
    roots = (node.find() for node in arr)
    return [root.data for root in roots]
19 |
20 |
def counts_from_uf_array(arr):
    """Subroutine for testing utility: collect the component count of each node."""
    counts = []
    for node in arr:
        counts.append(node.count)
    return counts
24 |
25 |
def check_unionfind(arr, nodes, finds, counts):
    """Assert that node data, root data, and counts all match expectations."""
    expectations = [
        (node_data_from_uf_array, nodes),
        (find_data_from_uf_array, finds),
        (counts_from_uf_array, counts),
    ]
    for getter, expected in expectations:
        assert getter(arr) == expected
30 |
31 |
@pytest.mark.lite
def test_nodeunionfind():
    """Exercise NodeUnionFind: unions, root lookups, counts, and data assignment."""
    arr = [NodeUnionFind(i) for i in range(8)]
    # repr of a node shows its data (here the int 0)
    assert repr(arr[0]) == "0"

    # each case: (left index, right index,
    #             expected per-node data, expected per-node root data,
    #             expected per-node component counts) after the union
    for _l, _r, _nodes, _finds, _counts in [
        (
            0,
            1,
            [0, 1, 2, 3, 4, 5, 6, 7],
            [0, 0, 2, 3, 4, 5, 6, 7],
            [2, 2, 1, 1, 1, 1, 1, 1],
        ),
        (
            1,
            2,
            [0, 1, 2, 3, 4, 5, 6, 7],
            [0, 0, 0, 3, 4, 5, 6, 7],
            [3, 3, 3, 1, 1, 1, 1, 1],
        ),
        (
            0,
            2,
            [0, 1, 2, 3, 4, 5, 6, 7],
            [0, 0, 0, 3, 4, 5, 6, 7],
            [3, 3, 3, 1, 1, 1, 1, 1],
        ),
        (
            3,
            4,
            [0, 1, 2, 3, 4, 5, 6, 7],
            [0, 0, 0, 3, 3, 5, 6, 7],
            [3, 3, 3, 2, 2, 1, 1, 1],
        ),
        (
            4,
            2,
            [0, 1, 2, 3, 4, 5, 6, 7],
            [0, 0, 0, 0, 0, 5, 6, 7],
            [5, 5, 5, 5, 5, 1, 1, 1],
        ),
    ]:
        arr[_l].union(arr[_r])
        check_unionfind(arr, _nodes, _finds, _counts)

    # test data assignment
    # node 0 is the root of its component here, so the new data is what
    # find().data reports for every node in that component
    arr[0].data = 8
    check_unionfind(
        arr,
        [8, 1, 2, 3, 4, 5, 6, 7],
        [8, 8, 8, 8, 8, 5, 6, 7],
        [5, 5, 5, 5, 5, 1, 1, 1],
    )
85 |
86 |
@pytest.mark.lite
def test_rootunionfind():
    """Exercise RootUnionFind, where each node's data mirrors its root's data."""
    arr = [RootUnionFind(i) for i in range(8)]

    # each case: (left index, right index,
    #             expected per-node data, expected per-node root data,
    #             expected per-node component counts) after the union;
    # note the data column equals the root-data column in every case
    for _l, _r, _nodes, _finds, _counts in [
        (
            0,
            1,
            [0, 0, 2, 3, 4, 5, 6, 7],
            [0, 0, 2, 3, 4, 5, 6, 7],
            [2, 2, 1, 1, 1, 1, 1, 1],
        ),
        (
            1,
            2,
            [0, 0, 0, 3, 4, 5, 6, 7],
            [0, 0, 0, 3, 4, 5, 6, 7],
            [3, 3, 3, 1, 1, 1, 1, 1],
        ),
        (
            3,
            4,
            [0, 0, 0, 3, 3, 5, 6, 7],
            [0, 0, 0, 3, 3, 5, 6, 7],
            [3, 3, 3, 2, 2, 1, 1, 1],
        ),
        (
            4,
            2,
            [3, 3, 3, 3, 3, 5, 6, 7],
            [3, 3, 3, 3, 3, 5, 6, 7],
            [5, 5, 5, 5, 5, 1, 1, 1],
        ),
    ]:
        arr[_l].union(arr[_r])
        check_unionfind(arr, _nodes, _finds, _counts)
123 |
--------------------------------------------------------------------------------
/docs/pages/tutorial/t7-snorkel-improvise-rules.md:
--------------------------------------------------------------------------------
> Suppose we have some custom functions for labeling or filtering data — a situation that resembles [`snorkel`](https://github.com/snorkel-team/snorkel)'s typical scenario.
2 | >
3 | > :speedboat: Let's see how these functions can be combined with `hover`.
4 |
5 | {!docs/snippets/html/thebe.html!}
6 | {!docs/snippets/markdown/binder-kernel.md!}
7 | {!docs/snippets/markdown/component-tutorial.md!}
8 | {!docs/snippets/markdown/local-dependency.md!}
9 | {!docs/snippets/markdown/local-dep-text.md!}
10 | {!docs/snippets/markdown/local-dep-snorkel.md!}
11 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!}
12 |
13 | ## **Preparation**
14 |
15 | {!docs/snippets/markdown/dataset-prep.md!}
16 |
17 | ## **Labeling Functions**
18 |
19 | Labeling functions are functions that **take a `pd.DataFrame` row and return a label or abstain**.
20 |
21 | Inside the function one can do many things, but let's start with simple keywords wrapped in regex:
22 |
23 | ??? info "About the decorator @labeling_function"
24 | ::: hover.utils.snorkel_helper.labeling_function
25 |
26 |
27 | {!docs/snippets/py/t7-0-lf-list.txt!}
28 |
29 |
30 |
31 | {!docs/snippets/py/t7-0a-lf-list-edit.txt!}
32 |
33 |
34 | ### **Using a Function to Apply Labels**
35 |
36 | Hover's `SnorkelExplorer` (short as `snorkel`) can take the labeling functions above and apply them on areas of data that you choose. The widget below is responsible for labeling:
37 |
38 | {!docs/snippets/markdown/jupyterlab-js-issue.md!}
39 |
40 |
41 | {!docs/snippets/py/tz-bokeh-notebook-common.txt!}
42 |
43 | {!docs/snippets/py/tz-bokeh-notebook-remote.txt!}
44 |
45 | {!docs/snippets/py/t7-1-snorkel-apply-button.txt!}
46 |
47 |
48 | ### **Using a Function to Apply Filters**
49 |
50 | Any function that labels is also a function that filters. The filter condition is `"keep if did not abstain"`. The widget below handles filtering:
51 |
52 | {!docs/snippets/markdown/jupyterlab-js-issue.md!}
53 |
54 |
55 | {!docs/snippets/py/t7-2-snorkel-filter-button.txt!}
56 |
57 |
58 | Unlike the toggled filters for `finder` and `softlabel`, filtering with functions is on a per-click basis. In other words, this particular filtration doesn't persist when you select another area.
59 |
60 | ## **Dynamic List of Functions**
61 |
62 | Python lists are mutable, and we are going to take advantage of that for improvising and editing labeling functions on the fly.
63 |
64 | Run the block below and open the resulting URL to launch a recipe.
65 |
66 | - labeling functions are evaluated against the `dev` set.
67 | - hence you are advised to send the labels produced by these functions to the `train` set, not the `dev` set.
68 | - come back and edit the list of labeling functions **in-place** in one of the code cells above.
69 | - then go to the launched app and refresh the functions!
70 |
71 |
72 | {!docs/snippets/py/t7-3-snorkel-crosscheck.txt!}
73 |
74 | {!docs/snippets/py/tz-bokeh-show-server.txt!}
75 |
76 | {!docs/snippets/py/tz-bokeh-show-notebook.txt!}
77 |
78 |
79 | What's really cool is that in your local environment, this update-and-refresh operation can be done all in a notebook. So now you can
80 |
81 | - interactively evaluate and revise labeling functions
82 | - visually assign specific data regions to apply those functions
83 |
84 | which makes labeling functions significantly more accurate and applicable.
85 |
86 | {!docs/snippets/html/stylesheet.html!}
87 |
--------------------------------------------------------------------------------
/hover/utils/torch_helper.py:
--------------------------------------------------------------------------------
1 | """
2 | Submodule that handles interaction with PyTorch.
3 | """
4 | import numpy as np
5 | import torch
6 | import torch.nn.functional as F
7 | from torch.utils.data import Dataset, DataLoader
8 |
9 |
class VectorDataset(Dataset):
    """
    PyTorch Dataset pairing input vectors with probabilistic classification targets.

    Each sample is a `(input, target, index)` triple so callers can trace a
    sample back to its position in the dataset.
    """

    DEFAULT_LOADER_KWARGS = dict(batch_size=64, shuffle=True, drop_last=False)

    def __init__(self, input_vectors, output_vectors):
        """Convert both vector collections into float tensors."""
        # every input vector must have a corresponding target
        assert len(input_vectors) == len(output_vectors)
        self.input_tensor = torch.FloatTensor(np.asarray(input_vectors))
        self.output_tensor = torch.FloatTensor(np.asarray(output_vectors))

    def __getitem__(self, index):
        """Fetch the (input, target, index) triple at a position."""
        return self.input_tensor[index], self.output_tensor[index], index

    def __len__(self):
        """Number of samples in the dataset."""
        return self.input_tensor.shape[0]

    def loader(self, **kwargs):
        """Build a DataLoader over self; kwargs override the class defaults."""
        loader_kwargs = {**self.__class__.DEFAULT_LOADER_KWARGS, **kwargs}
        return DataLoader(dataset=self, **loader_kwargs)
35 |
36 |
class MultiVectorDataset(Dataset):
    """
    PyTorch Dataset where each sample carries multiple input vectors plus a
    probabilistic classification target.

    Each sample is a `([inputs...], target, index)` triple.
    """

    DEFAULT_LOADER_KWARGS = dict(batch_size=64, shuffle=True, drop_last=False)

    def __init__(self, input_vector_lists, output_vectors):
        """Convert each input list and the targets into float tensors."""
        # every input list must align one-to-one with the targets
        num_samples = len(output_vectors)
        for _list in input_vector_lists:
            assert len(_list) == num_samples
        self.input_tensors = [
            torch.FloatTensor(np.asarray(_list)) for _list in input_vector_lists
        ]
        self.output_tensor = torch.FloatTensor(np.asarray(output_vectors))

    def __getitem__(self, index):
        """Fetch the ([inputs...], target, index) triple at a position."""
        per_field_inputs = [_tensor[index] for _tensor in self.input_tensors]
        return per_field_inputs, self.output_tensor[index], index

    def __len__(self):
        """Number of samples, measured by the target tensor."""
        return self.output_tensor.shape[0]

    def loader(self, **kwargs):
        """Build a DataLoader over self; kwargs override the class defaults."""
        loader_kwargs = {**self.__class__.DEFAULT_LOADER_KWARGS, **kwargs}
        return DataLoader(dataset=self, **loader_kwargs)
66 |
67 |
def one_hot(encoded_labels, num_classes):
    """
    One-hot encoding into a float form.

    :param encoded_labels: integer-encoded labels.
    :type encoded_labels: list of int
    :param num_classes: the number of classes to encode.
    :type num_classes: int
    """
    label_tensor = torch.LongTensor(encoded_labels)
    # one_hot yields integer values; cast to float for downstream loss functions
    return F.one_hot(label_tensor, num_classes=num_classes).float()
78 |
79 |
def label_smoothing(probabilistic_labels, coefficient=0.1):
    """
    Smooth probabilistic labels, auto-detecting the number of classes.

    :param probabilistic_labels: N by num_classes tensor
    :type probabilistic_labels: torch.Tensor or numpy.ndarray
    :param coefficient: the smoothing coefficient for soft labels.
    :type coefficient: float
    """
    assert (
        len(probabilistic_labels.shape) == 2
    ), f"Expected 2 dimensions, got shape {probabilistic_labels.shape}"
    assert coefficient >= 0.0, f"Expected non-negative smoothing, got {coefficient}"
    num_classes = probabilistic_labels.shape[-1]
    # shrink each probability toward the uniform distribution
    shrink_factor = 1.0 - coefficient
    uniform_share = coefficient / num_classes
    return shrink_factor * probabilistic_labels + uniform_share
95 |
--------------------------------------------------------------------------------
/hover/core/local_config.py:
--------------------------------------------------------------------------------
1 | import re
2 | import hover
3 | from hover.config_constants import (
4 | ConfigSection as Section,
5 | ConfigKey as Key,
6 | )
7 | from bokeh.models import (
8 | Div,
9 | TableColumn,
10 | CellEditor,
11 | HTMLTemplateFormatter,
12 | )
13 |
14 |
# default dimensionality reduction method, read from the hover config registry
DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][
    Key.DEFAULT_REDUCTION_METHOD
]
# name of the column recording which subset (e.g. raw/train/dev/test) a row is in
DATASET_SUBSET_FIELD = hover.config[Section.DATA_COLUMNS][Key.DATASET_SUBSET_FIELD]

# template (Underscore-style `<%= %>` syntax) rendering a solid glyph in a cell
# NOTE(review): this template looks truncated -- it holds only the glyph text
# with no surrounding markup; verify against the upstream source
COLOR_GLYPH_TEMPLATE = """

<%= "███" %>

"""

# prefix (value comes from config) for auto-generated embedding columns
EMBEDDING_FIELD_PREFIX = hover.config[Section.DATA_COLUMNS][Key.EMBEDDING_FIELD_PREFIX]
# suffix pattern "<total_dim>d_<specific_dim>", e.g. "2d_0"
EMBEDDING_FIELD_REGEX = r"\d+d_\d+$"
28 |
29 |
def embedding_field(total_dim, specific_dim):
    """Compose the column name for one coordinate of a `total_dim`-d embedding."""
    suffix = f"{total_dim}d_{specific_dim}"
    return f"{EMBEDDING_FIELD_PREFIX}{suffix}"
32 |
33 |
def is_embedding_field(column_name):
    """Check whether a column name matches the auto-generated embedding pattern."""
    has_prefix = column_name.startswith(EMBEDDING_FIELD_PREFIX)
    return has_prefix and bool(re.search(EMBEDDING_FIELD_REGEX, column_name))
38 |
39 |
def blank_callback_on_change(attr, old, new):
    """No-op placeholder matching the Bokeh `on_change` callback signature."""
    # intentionally ignores all arguments and returns None
    pass
42 |
43 |
def dataset_help_widget():
    """Return a Div holding help text for the dataset widgets."""
    # NOTE(review): this text looks truncated -- likely an anchor/link tag was
    # lost in transit; confirm against the upstream source
    text = 'Dataset Widgets Help '
    return Div(text=text)
47 |
48 |
def dataset_default_sel_table_columns(feature_key):
    """
    ???+ note "Default `SupervisableDataset` selection table columns based on feature type."

        Always allow multi-selection and editing. Based on feature type:
        - increases row height for viewing images.
    """
    # disable editing the feature through a blank editor
    feature_col_kwargs = dict(editor=CellEditor())
    # NOTE(review): the HTML template strings below appear truncated (opening
    # tags seem to be missing); verify them against the upstream source before
    # relying on the rendered output
    if feature_key == "text":
        feature_col_kwargs["formatter"] = HTMLTemplateFormatter(
            template="""<%= value %> """
        )
    elif feature_key == "image":
        # per-table image CSS comes from the hover config
        style = hover.config[Section.VISUAL][Key.TABLE_IMG_STYLE]
        # width is easily adjustable on the UI, no need to make configurable here
        feature_col_kwargs["width"] = 200
        feature_col_kwargs["formatter"] = HTMLTemplateFormatter(
            template=f""" style="{style}">""",
        )
    elif feature_key == "audio":
        feature_col_kwargs["width"] = 50
        feature_col_kwargs["formatter"] = HTMLTemplateFormatter(
            template="""> """,
        )
    else:
        raise ValueError(f"Unsupported feature type {feature_key}")

    # feature column first, then the (editable) label column
    columns = [
        TableColumn(field=feature_key, title=feature_key, **feature_col_kwargs),
        TableColumn(field="label", title="label"),
    ]
    return columns
82 |
83 |
def dataset_default_sel_table_kwargs(feature_key):
    """
    ???+ note "Default `SupervisableDataset` selection table kwargs based on feature type."

        Always allow multi-selection and editing. Based on feature type:
        - increases row height for viewing images.
    """
    if feature_key not in ("text", "image", "audio"):
        raise ValueError(f"Unsupported feature type {feature_key}")

    kwargs = dict(selectable="checkbox", editable=True)
    if feature_key == "image":
        # taller rows so image cells stay visible
        kwargs["row_height"] = 200
    return kwargs
102 |
--------------------------------------------------------------------------------
/hover/core/representation/reduction.py:
--------------------------------------------------------------------------------
1 | """
2 | ???+ note "Linker data structures which tie (potentially multiple) dimensionality reducers to arrays."
3 |
4 | The point is to make it clear which reduction is in reference to which array.
5 |
6 | Icing on the cake: unify the syntax across different kinds of reducers.
7 | """
8 | import numpy as np
9 | from hover.core import Loggable
10 | from .local_config import KWARG_TRANSLATOR, DEFAULT_REDUCTION_METHOD
11 |
12 |
class DimensionalityReducer(Loggable):
    def __init__(self, array):
        """
        ???+ note "Link self to the shared input array for reduction methods."
            | Param | Type | Description |
            | :------ | :----------- | :---------------------------- |
            | `array` | `np.ndarray` | the input array to fit on |
        """
        self.reference_array = array

    @staticmethod
    def create_reducer(method=DEFAULT_REDUCTION_METHOD, *args, **kwargs):
        """
        ???+ note "Handle kwarg translation and dynamic imports."

            | Param | Type | Description |
            | :--------- | :----- | :----------------------- |
            | `method` | `str` | `"umap"` or `"ivis"` |
            | `*args` | | forwarded to the reducer |
            | `**kwargs` | | translated and forwarded |
        """
        # import lazily: the reducer libraries are heavy optional dependencies
        if method == "umap":
            import umap

            reducer_cls = umap.UMAP
        elif method == "ivis":
            import ivis

            reducer_cls = ivis.Ivis
        else:
            raise ValueError("Expected 'umap' or 'ivis' as reduction method")

        # rename kwargs that go by different names across reducer libraries
        translated_kwargs = kwargs.copy()
        for _key, _value in kwargs.items():
            _trans_dict = KWARG_TRANSLATOR.get(_key, {})
            if method in _trans_dict:
                _trans_key = _trans_dict[method]
                translated_kwargs.pop(_key)
                translated_kwargs[_trans_key] = _value

        reducer = reducer_cls(*args, **translated_kwargs)
        return reducer

    def fit_transform(self, method=DEFAULT_REDUCTION_METHOD, *args, **kwargs):
        """
        ???+ note "Fit and transform an array and store the reducer."
            | Param | Type | Description |
            | :--------- | :----- | :----------------------- |
            | `method` | `str` | `"umap"` or `"ivis"` |
            | `*args` | | forwarded to the reducer |
            | `**kwargs` | | forwarded to the reducer |
        """
        # bugfix: pass method positionally. The previous call
        # `create_reducer(method=method, *args, **kwargs)` raised
        # "got multiple values for argument 'method'" whenever *args was
        # non-empty, because the first positional arg also bound to `method`.
        reducer = DimensionalityReducer.create_reducer(method, *args, **kwargs)
        embedding = reducer.fit_transform(self.reference_array)
        # keep the fitted reducer around so transform() can reuse it
        setattr(self, method, reducer)
        return embedding

    def transform(self, array, method=DEFAULT_REDUCTION_METHOD):
        """
        ???+ note "Transform an array with an already-fitted reducer."
            | Param | Type | Description |
            | :--------- | :----------- | :----------------------- |
            | `array` | `np.ndarray` | the array to transform |
            | `method` | `str` | `"umap"` or `"ivis"` |
        """
        assert isinstance(array, np.ndarray), f"Expected np.ndarray, got {type(array)}"
        # edge case: array is too small
        if array.shape[0] < 1:
            return np.array([])

        # raises AttributeError if fit_transform was never run for this method
        reducer = getattr(self, method)
        return reducer.transform(array)
85 |
--------------------------------------------------------------------------------
/hover/core/representation/manifold.py:
--------------------------------------------------------------------------------
1 | """
2 | Manifold similarity measures for any collection of sequences of vectors.
3 | Can be useful for improved interpretability of neural nets.
4 | """
5 | from tqdm import tqdm
6 | from scipy.spatial import procrustes
7 | from hover.core import Loggable
8 | from .reduction import DimensionalityReducer
9 | from .local_config import DEFAULT_REDUCTION_METHOD
10 |
11 |
class LayerwiseManifold(Loggable):
    """
    Takes a sequence of arrays (each row of the array is a vector) and does the following:
    (1) unfold vectors into lower dimensions, typically 2D or 3D;
    (2) for every array:
        run Procrustes analysis for fitting to the previous array. The first array is fitted to itself.
    """

    # per-method reducer defaults; fixing seeds keeps the unfolding
    # deterministic so randomness does not inflate disparities
    DEFAULT_UNFOLD_KWARGS = {
        "umap": {
            "random_state": 0,
            "transform_seed": 0,
        }
    }

    def __init__(self, seq_arr):
        """
        :param seq_arr: sequence of arrays to fit the manifold with.
        :type seq_arr: list of numpy.ndarrays.
        """
        # shallow-copy the list so the caller's list object is not mutated
        self.arrays = seq_arr[:]
        self.validate()
        self.standardize()

    def validate(self):
        """
        Sanity check of array dimensions.
        """
        assert (
            len(self.arrays) > 1
        ), "Need at least two arrays to compute layerwise manifold."
        # every array must have the same number of rows (vectors)
        self.n_vecs = self.arrays[0].shape[0]
        for _arr in self.arrays:
            assert _arr.shape[0] == self.n_vecs

    def standardize(self):
        """
        Standardize each array to the Procrustes form where
        - tr(A^T A) = 1
        - A.mean(axis=0) = 0
        """

        def transform(arr):
            # fitting an array to itself yields its standardized form
            matrix, _, _ = procrustes(arr, arr)
            return matrix

        self.arrays = [transform(_arr) for _arr in self.arrays]

    def unfold(self, method=None, **kwargs):
        """
        Compute lower-dimensional manifolds, one per array, stored in self.manifolds.
        :param method: the dimensionality reduction method to use.
        :type method: str
        """
        if method is None:
            method = DEFAULT_REDUCTION_METHOD

        # default kwargs should fix random state and seed
        # so that randomness does not introduce disparity
        use_kwargs = self.__class__.DEFAULT_UNFOLD_KWARGS.get(method, {}).copy()
        use_kwargs.update(kwargs)
        self.manifolds = []
        self._info(f"Running {method}...")
        for _arr in tqdm(self.arrays, total=len(self.arrays)):
            _reducer = DimensionalityReducer(_arr)
            _manifold = _reducer.fit_transform(method, **use_kwargs)
            self.manifolds.append(_manifold)
        self._good("unfolded arrays into manifolds")

    def procrustes(self, arrays=None):
        """
        Run Procrustes analysis, optionally on a specified list of arrays.

        Returns a (fitted arrays, disparities) pair; defaults to operating
        on the manifolds computed by unfold().
        """
        if arrays is None:
            arrays = self.manifolds
        disparities = []
        fit_arrays = []

        # fit each array to its fitted predecessor
        for i, _arr in enumerate(arrays):
            if i == 0:
                # fit the first array to itself
                _, _matrix, _disparity = procrustes(_arr, _arr)
            else:
                _, _matrix, _disparity = procrustes(fit_arrays[i - 1], _arr)
            disparities.append(_disparity)
            fit_arrays.append(_matrix)

        self._good("carried out Procrustes analysis")
        return fit_arrays, disparities
102 |
--------------------------------------------------------------------------------
/tests/core/explorer/test_feature.py:
--------------------------------------------------------------------------------
1 | """
2 | Corresponds to the `hover.core.explorer.feature` module.
3 | For mechanisms that are invariant across `hover.core.explorer.functionality`.
4 | """
5 |
6 | import pytest
7 | import math
8 | from hover.recipes.subroutine import get_explorer_class
9 | from tests.local_config import VECTORIZER_BREAKER
10 | from .local_helper import (
11 | FUNCTIONALITY_TO_SPECIAL_ARGS,
12 | subroutine_search_source_response,
13 | )
14 |
15 | MAIN_FUNCTIONALITIES = list(FUNCTIONALITY_TO_SPECIAL_ARGS.keys())
16 |
17 |
def subroutine_searchable_explorer(dataset, functionality, feature):
    """Build an explorer for the functionality/feature pair and turn on search."""
    explorer_cls = get_explorer_class(functionality, feature)
    extra_args = FUNCTIONALITY_TO_SPECIAL_ARGS[functionality]
    explorer = explorer_cls.from_dataset(
        dataset,
        explorer_cls.DEFAULT_SUBSET_MAPPING.copy(),
        *extra_args,
    )
    explorer.activate_search()
    return explorer
25 |
26 |
@pytest.mark.core
class TestBokehForText:
    @staticmethod
    @pytest.mark.lite
    def test_search(example_text_dataset):
        """Positive and negative regex searches should update the sources."""
        for _functionality in MAIN_FUNCTIONALITIES:
            _explorer = subroutine_searchable_explorer(
                example_text_dataset,
                _functionality,
                "text",
            )

            def positive_search():
                _explorer.search_pos.value = r"a"

            def negative_search():
                _explorer.search_neg.value = r"a"

            subroutine_search_source_response(
                _explorer,
                [(positive_search, True), (negative_search, True)],
            )
52 |
53 |
@pytest.mark.core
class TestBokehForImage:
    @staticmethod
    @pytest.mark.lite
    def test_search(example_image_dataset):
        """Similarity searches should respond; a breaker input should not."""
        for _functionality in MAIN_FUNCTIONALITIES:
            _explorer = subroutine_searchable_explorer(
                example_image_dataset,
                _functionality,
                "image",
            )
            raw_images = _explorer.dfs["raw"]["image"]

            def search_first_image():
                _explorer.search_sim.value = raw_images[0]

            def search_second_image():
                _explorer.search_sim.value = raw_images[1]

            def search_breaker():
                _explorer.search_sim.value = VECTORIZER_BREAKER

            subroutine_search_source_response(
                _explorer,
                [
                    (search_first_image, True),
                    (search_second_image, True),
                    (search_breaker, False),
                ],
            )
83 |
84 |
@pytest.mark.core
class TestBokehForAudio:
    @staticmethod
    @pytest.mark.lite
    def test_search(example_audio_dataset):
        """Similarity search and threshold changes should update the sources."""
        for _functionality in MAIN_FUNCTIONALITIES:
            _explorer = subroutine_searchable_explorer(
                example_audio_dataset,
                _functionality,
                "audio",
            )

            def search_first_audio():
                _explorer.search_sim.value = _explorer.dfs["raw"]["audio"][0]

            def shift_threshold():
                # shift by 0.5 and wrap the value back into [0, 1)
                moved = _explorer.search_threshold.value + 0.5
                _explorer.search_threshold.value = moved - math.floor(moved)

            subroutine_search_source_response(
                _explorer,
                [
                    (search_first_audio, True),
                    (shift_threshold, True),
                ],
            )
111 |
--------------------------------------------------------------------------------
/docs/pages/tutorial/t3-dataset-population-selection.md:
--------------------------------------------------------------------------------
1 | > `SupervisableDataset` holds your data throughout the labeling process.
2 | >
3 | > :speedboat: Let's take a look at its core mechanisms.
4 |
5 | {!docs/snippets/html/thebe.html!}
6 | {!docs/snippets/markdown/binder-kernel.md!}
7 | {!docs/snippets/markdown/component-tutorial.md!}
8 | {!docs/snippets/markdown/local-dependency.md!}
9 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!}
10 |
11 | ## **Data Subsets**
12 |
13 | We place unlabeled data and labeled data in different subsets: "raw", "train", "dev", and "test". Unlabeled data start from the "raw" subset, and can be transferred to other subsets after it gets labeled.
14 |
15 | `SupervisableDataset` uses a "population table", `dataset.pop_table`, to show the size of each subset:
16 |
17 |
18 | {!docs/snippets/py/tz-dataset-text-full.txt!}
19 |
20 |
21 |
22 | {!docs/snippets/py/tz-bokeh-notebook-common.txt!}
23 |
24 | {!docs/snippets/py/tz-bokeh-notebook-remote.txt!}
25 |
26 | {!docs/snippets/py/t3-0-dataset-population-table.txt!}
27 |
28 |
29 | ### **Transfer Data Between Subsets**
30 |
31 | `COMMIT` and `DEDUP` are the mechanisms that `hover` uses to transfer data between subsets.
32 |
33 | - `COMMIT` copies selected points (to be discussed later) to a destination subset
34 | - labeled-raw-only: `COMMIT` automatically detects which points are in the raw set with a valid label. Other points will not get copied.
35 | - keep-last: you can commit the same point to the same subset multiple times and the last copy will be kept. This can be useful for revising labels before `DEDUP`.
36 | - `DEDUP` removes duplicates (identified by feature value) across subsets
37 | - priority rule: test > dev > train > raw, i.e. test set data always gets kept during deduplication
38 |
39 | ???+ info "FAQ"
40 | ??? help "Why does COMMIT only work on the raw subset?"
41 | Most selections will happen through plots, where different subsets are on top of each other. This means selections can contain both unlabeled and labeled points.
42 |
43 | Way too often we find ourselves trying to view both the labeled and the unlabeled, but only moving the unlabeled "raw" points. So it's handy that COMMIT picks those points only.
44 |
45 | These mechanisms correspond to buttons in `hover`'s annotation interface, which you have encountered in the quickstart:
46 |
47 | {!docs/snippets/markdown/jupyterlab-js-issue.md!}
48 |
49 |
50 | {!docs/snippets/py/t3-1-dataset-commit-dedup.txt!}
51 |
52 |
53 | Of course, so far we have nothing to move, because there's no data selected. We shall now discuss selections.
54 |
55 | ## **Selection**
56 |
57 | `hover` labels data points in bulk, which requires selecting groups of homogeneous data, i.e. semantically similar or going to have the same label. Being able to skim through what you selected gives you confidence about homogeneity.
58 |
59 | Normally, selection happens through a plot (`explorer`), as we have seen in the quickstart. For the purpose here, we will "cheat" and assign the selection programmatically:
60 |
61 |
62 | {!docs/snippets/py/t3-2-dataset-selection-table.txt!}
63 |
64 |
65 | ### **Edit Data Within a Selection**
66 |
67 | Often the points selected are not perfectly homogeneous, i.e. some outliers belong to a different label from the selected group overall. It would be helpful to `EVICT` them, and `SupervisableDataset` has a button for it.
68 |
69 | Sometimes you may also wish to edit data values on the fly. In hover this is called `PATCH`, and there also is a button for it.
70 |
71 | - by default, labels can be edited but feature values cannot.
72 |
Let's plot the aforementioned buttons along with the selection table. Toggle any number of rows in the table, then click the button to `EVICT` or `PATCH` those rows:
74 |
75 | {!docs/snippets/markdown/jupyterlab-js-issue.md!}
76 |
77 |
78 | {!docs/snippets/py/t3-3-dataset-evict-patch.txt!}
79 |
80 |
81 |
82 | {!docs/snippets/html/stylesheet.html!}
83 |
--------------------------------------------------------------------------------
/notebooks/Image-Experiment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "4e78f829-57e4-4bb8-b696-9173057943fe",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import sys\n",
11 | "sys.path.append('../')"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "id": "dba2348a-bb21-4896-a7be-e2cf4d4daff5",
18 | "metadata": {},
19 | "outputs": [],
20 | "source": []
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "id": "ced47d3d-e030-4136-87b8-377b302a84d6",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import pandas as pd\n",
30 | "\n",
31 | "df = pd.read_csv('imagenet_custom.csv').sample(frac=1.0).reset_index(drop=True)\n",
32 | "df.head()"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "id": "1e1b71ea-b811-4810-9915-9f542829fe13",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "df['SUBSET'] = 'raw'\n",
43 | "df['image1'] = df['image']\n",
44 | "df.loc[500:800, 'SUBSET'] = 'train'\n",
45 | "df.loc[800:900, 'SUBSET'] = 'dev'\n",
46 | "df.loc[900:, 'SUBSET'] = 'test'"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "id": "b2f81c8a-66ba-400a-9d2d-54abb1a852ab",
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "import pandas as pd\n",
57 | "from hover.core.dataset import SupervisableImageDataset\n",
58 | "\n",
59 | "# skip this block if EXPORT_PATH does not have a corresponding file\n",
60 | "dataset = SupervisableImageDataset.from_pandas(df)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "id": "8ee2b295-343c-4e32-a651-c2a0fe2c9a45",
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "from fixture_module import image_vector_net\n",
71 | "\n",
72 | "vectorizer = image_vector_net.get_vectorizer()"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "id": "727fab83-57b7-4267-a1ca-2f76fa863ad7",
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "dataset.compute_nd_embedding(vectorizer, \"umap\", dimension=2)\n",
83 | "dataset.dfs[\"raw\"].head(5)"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "id": "e2877436-e5c3-483c-85ae-84b66da59977",
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "from hover.recipes import simple_annotator\n",
94 | "from hover.utils.bokeh_helper import bokeh_hover_tooltip\n",
95 | "from bokeh.io import show, output_notebook\n",
96 | "\n",
97 | "handle = simple_annotator(\n",
98 | " dataset.copy(), width=800, height=600,\n",
99 | " #tooltips=bokeh_hover_tooltip(label={\"label\": \"Label\"}, image={\"image\": 60, \"image1\": 80}),\n",
100 | ")\n",
101 | "\n",
102 | "output_notebook()\n",
103 | "show(handle)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "id": "6264287b-0e98-4785-9ee1-d5e22cd54256",
110 | "metadata": {},
111 | "outputs": [],
112 | "source": []
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "id": "2c2535ed-de5a-49e5-ab52-6b9ac6852ab8",
118 | "metadata": {},
119 | "outputs": [],
120 | "source": []
121 | }
122 | ],
123 | "metadata": {
124 | "kernelspec": {
125 | "display_name": "Python 3 (ipykernel)",
126 | "language": "python",
127 | "name": "python3"
128 | },
129 | "language_info": {
130 | "codemirror_mode": {
131 | "name": "ipython",
132 | "version": 3
133 | },
134 | "file_extension": ".py",
135 | "mimetype": "text/x-python",
136 | "name": "python",
137 | "nbconvert_exporter": "python",
138 | "pygments_lexer": "ipython3",
139 | "version": "3.9.7"
140 | }
141 | },
142 | "nbformat": 4,
143 | "nbformat_minor": 5
144 | }
145 |
--------------------------------------------------------------------------------
/tests/core/test_neural.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from copy import deepcopy
4 | from hover.core.neural import VectorNet
5 | from hover.module_config import DataFrame
6 |
7 |
@pytest.fixture
def example_vecnet_args(example_text_dataset):
    """Provide (module name, target class list) arguments for building a VectorNet."""
    return (
        "fixture_module.text_vector_net",
        list(example_text_dataset.classes),
    )
13 |
14 |
@pytest.fixture
def blank_vecnet():
    """A verbose VectorNet built from the text fixture module with no target classes."""
    return VectorNet.from_module("fixture_module.text_vector_net", [], verbose=10)
19 |
20 |
@pytest.fixture
def example_vecnet(example_vecnet_args):
    """A verbose VectorNet built from the standard example arguments."""
    return VectorNet.from_module(*example_vecnet_args, verbose=10)
25 |
26 |
def subroutine_predict_proba(net, dataset):
    """
    Shared checks on predict_proba output shapes.

    Single input -> one row of per-class probabilities;
    batched input -> one row per item, one column per class.
    """
    expected_classes = len(dataset.classes)

    single_proba = net.predict_proba("hello")
    assert single_proba.shape[0] == expected_classes

    batch = ["hello", "bye", "ciao"]
    batch_proba = net.predict_proba(batch)
    assert batch_proba.shape[0] == len(batch)
    assert batch_proba.shape[1] == expected_classes
34 |
35 |
@pytest.mark.core
class TestVectorNet(object):
    """
    For the VectorNet base class.

    Covers persistence, dynamic class-setup adjustment, optimizer parameter
    adjustment, probability prediction, manifold trajectories, and the
    train/evaluate loop.
    """

    @staticmethod
    @pytest.mark.lite
    def test_save_and_load(example_vecnet, example_vecnet_args):
        """Save to an explicit path, then reload from module and save to default path."""
        default_path = example_vecnet.nn_update_path
        example_vecnet.save(f"{default_path}.test")
        loaded_vecnet = VectorNet.from_module(*example_vecnet_args)
        loaded_vecnet.save()

    @staticmethod
    @pytest.mark.lite
    def test_auto_adjust_setup(blank_vecnet, example_text_dataset):
        """The NN should be rebuilt iff the (ordered) class list changes."""
        vecnet = deepcopy(blank_vecnet)
        targets = example_text_dataset.classes
        # recover the original class order from the label encoder
        old_classes = sorted(
            vecnet.label_encoder.keys(),
            key=lambda k: vecnet.label_encoder[k],
        )
        old_nn = vecnet.nn
        # normal change of classes should create a new NN
        vecnet.auto_adjust_setup(targets)
        first_nn = vecnet.nn
        assert first_nn is not old_nn
        # identical classes should trigger autoskip
        vecnet.auto_adjust_setup(targets)
        second_nn = vecnet.nn
        assert second_nn is first_nn
        # change of class order should create a new NN
        vecnet.auto_adjust_setup(targets[1:] + targets[:1])
        third_nn = vecnet.nn
        assert third_nn is not second_nn
        # restore the original setup to avoid leaking state
        vecnet.auto_adjust_setup(old_classes)

    @staticmethod
    @pytest.mark.lite
    def test_adjust_optimizer_params(example_vecnet):
        """Smoke-test optimizer parameter adjustment."""
        # fixed typo in test name: "optimier" -> "optimizer"
        example_vecnet.adjust_optimizer_params()

    @staticmethod
    @pytest.mark.lite
    def test_predict_proba(example_vecnet, example_text_dataset):
        """Delegate shape checks to the shared subroutine."""
        subroutine_predict_proba(example_vecnet, example_text_dataset)

    @staticmethod
    def test_manifold_trajectory(example_vecnet, example_raw_df):
        """Manifold trajectory returns arrays plus per-step float disparities."""
        # NOTE(review): `_method` is never passed to manifold_trajectory, so both
        # iterations run the default method. Presumably this should be forwarded
        # (e.g. method=_method) — confirm manifold_trajectory's signature.
        for _method in ["umap", "ivis"]:
            traj_arr, seq_arr, disparities = example_vecnet.manifold_trajectory(
                DataFrame.series_tolist(example_raw_df["text"])
            )
            assert isinstance(traj_arr, np.ndarray)
            assert isinstance(seq_arr, np.ndarray)
            assert isinstance(disparities, list)
            assert isinstance(disparities[0], float)

    @staticmethod
    def test_train_and_evaluate(example_vecnet, example_text_dataset):
        """Train on the dev loader and evaluate on the test loader."""
        vecnet = deepcopy(example_vecnet)
        dataset = example_text_dataset
        dev_loader = dataset.loader("dev", example_vecnet.vectorizer)
        test_loader = dataset.loader("test", example_vecnet.vectorizer)

        # dev loader doubles as the validation loader here
        train_info = vecnet.train(dev_loader, dev_loader, epochs=5)
        accuracy, conf_mat = vecnet.evaluate(test_loader)

        assert isinstance(train_info, list)
        assert isinstance(train_info[0], dict)
        assert isinstance(accuracy, float)
        assert isinstance(conf_mat, np.ndarray)
109 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Hover
2 | site_description: "Hover and label data rapidly."
3 | site_url: "https://phurwicz.github.io/hover"
4 | repo_url: "https://github.com/phurwicz/hover.git"
5 | repo_name: "phurwicz/hover"
6 |
7 | theme:
8 | name: material
9 | icon:
10 | logo: material/alpha-h-box
11 | favicon: images/favicon.png
12 | font:
13 | text: Roboto
14 | code: Roboto Mono
15 | features:
16 | - navigation.expand
17 | - navigation.tabs
18 | - search.suggest
19 | - toc.integrate
20 | palette:
21 | # Palette toggle for light mode
22 | - scheme: default
23 | toggle:
24 | icon: material/weather-night
25 | name: Switch to dark mode
26 |
27 | # Palette toggle for dark mode
28 | - scheme: slate
29 | toggle:
30 | icon: material/weather-sunny
31 | name: Switch to light mode
32 |
33 | nav:
34 | - Home: 'index.md'
35 | - 'Basics':
36 | - 'Quickstart': 'pages/tutorial/t0-quickstart.md'
37 | - 'Using Recipes': 'pages/tutorial/t1-active-learning.md'
38 | - 'Handling Images': 'pages/guides/g0-datatype-image.md'
39 | - 'Handling Audio': 'pages/guides/g1-datatype-audio.md'
40 | - 'Mechanisms':
41 | - 'Managing Data': 'pages/tutorial/t3-dataset-population-selection.md'
42 | - 'Applying Labels': 'pages/tutorial/t4-annotator-dataset-interaction.md'
43 | - 'Options':
44 | - 'Host Options': 'pages/tutorial/t2-bokeh-app.md'
45 | - 'Custom Config': 'pages/guides/g2-hover-config.md'
46 | - 'Powerful Tricks':
47 | - 'Finder & Selection Filter': 'pages/tutorial/t5-finder-filter.md'
48 | - 'Soft Label & Joint Filters': 'pages/tutorial/t6-softlabel-joint-filter.md'
49 | - 'Custom Labeling Functions': 'pages/tutorial/t7-snorkel-improvise-rules.md'
50 | # - 'Data Type: Multimodal': 'pages/topics/datatype-multimodal.md'
51 | #- 'Why Hover': 'pages/topics/what-hover-is.md'
52 | #- 'Customized Usage':
53 | # - 'API Levels': 'pages/topics/api-levels.md' # discuss the interaction between recipe / dataset / explorer
54 | # - 'Custom Recipe': 'pages/topics/custom-recipe.md' # discuss the caveats when making a recipe
55 | # - 'Subclassing Dataset': 'pages/topics/custom-dataset.md' # discuss the caveats when subclassing a SupervisableDataset
56 | # - 'Subclassing Explorer': 'pages/topics/custom-explorer.md' # discuss the caveats when subclassing a BokehBaseExplorer
57 | - 'API Reference':
58 | - 'hover.recipes': 'pages/reference/recipes.md'
59 | - 'hover.core':
60 | - '.dataset': 'pages/reference/core-dataset.md'
61 | - '.explorer':
62 | - '.base': 'pages/reference/core-explorer-base.md'
63 | - '.feature': 'pages/reference/core-explorer-feature.md'
64 | - '.functionality': 'pages/reference/core-explorer-functionality.md'
65 | - '.specialization': 'pages/reference/core-explorer-specialization.md'
66 | - '.neural': 'pages/reference/core-neural.md'
67 | - '.representation': 'pages/reference/core-representation.md'
68 | - 'hover.utils':
69 | - '.bokeh_helper': 'pages/reference/utils-bokeh_helper.md'
70 | - '.snorkel_helper': 'pages/reference/utils-snorkel_helper.md'
71 |
72 | markdown_extensions:
73 | - admonition
74 | - def_list
75 | - markdown_include.include
76 | - pymdownx.critic
77 | - pymdownx.details
78 | - pymdownx.emoji
79 | - pymdownx.superfences
80 | - pymdownx.tabbed:
81 | alternate_style: true
82 |
83 | plugins:
84 | - macros
85 | - search:
86 | - mkdocstrings:
87 | default_handler: python
88 | handlers:
89 | python:
90 | rendering:
91 | show_root_heading: true
92 | show_source: true
93 | watch:
94 | - hover
95 | - i18n:
96 | default_language: en
97 | languages:
98 | en: English
99 | # fr: français
100 | zh: 简体中文
101 | nav_translations:
102 | zh:
103 | Home: 主页
104 | Basics: 基础使用
105 | Mechanisms: 理解机制
106 | Options: 自定配置
107 | Powerful Tricks: 高级技巧
108 | API Reference: API 指南
109 |
110 | extra:
111 | version:
112 | provider: mike
113 | analytics:
114 | provider: google
115 | property: G-M3WR5YEJ33
116 |
--------------------------------------------------------------------------------
/tests/recipes/test_experimental.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from hover.recipes.experimental import (
4 | _active_learning,
5 | _snorkel_crosscheck,
6 | active_learning,
7 | snorkel_crosscheck,
8 | )
9 | from hover.module_config import DataFrame as DF
10 | from bokeh.events import ButtonClick, SelectionGeometry
11 | from .local_helper import execute_handle_function
12 |
13 |
def test_active_learning(example_text_dataset, dummy_vecnet_callback):
    """
    End-to-end emulation of the active learning recipe: train, re-train,
    bulk-select via a SelectionGeometry event, then exercise the score and
    regex filters and their interaction.

    NOTE: the steps below are order-dependent — each selection snapshot is
    compared against the previous one.
    """

    def read_scores(dataset, subset):
        # snapshot of per-point prediction scores; copied so later
        # training passes don't mutate earlier snapshots
        return DF.series_values(dataset.dfs[subset]["pred_score"]).copy()

    dataset = example_text_dataset.copy()
    vecnet = dummy_vecnet_callback(dataset)
    layout, objects = _active_learning(dataset, vecnet)
    assert layout.visible

    initial_scores = read_scores(dataset, "raw")

    finder, annotator = objects["finder"], objects["annotator"]
    softlabel = objects["softlabel"]
    coords_slider = softlabel._dynamic_widgets["patch_slider"]
    model_trainer = objects["model_trainer"]
    train_event = ButtonClick(model_trainer)

    # train for default number of epochs
    model_trainer._trigger_event(train_event)
    first_scores = read_scores(dataset, "raw")
    # training should have changed the prediction scores
    assert not np.allclose(first_scores, initial_scores)

    # emulating user interaction: slide coords to view manifold trajectory
    # (capped at 4 steps to keep the test fast)
    for _value in range(1, min(coords_slider.end + 1, 4)):
        coords_slider.value = _value

    # train for 1 more epoch
    model_trainer._trigger_event(train_event)
    second_scores = read_scores(dataset, "raw")
    assert not np.allclose(second_scores, first_scores)
    # take 25 and 75 percentiles of scores for later use
    range_low, range_high = np.percentile(second_scores, [25, 75]).tolist()

    # emulate user interface: select everything through a SelectionGeometry event
    total_raw = softlabel.dfs["raw"].shape[0]
    initial_select = list(range(total_raw))
    # check linked selection
    assert annotator.sources["raw"].selected.indices == []
    softlabel.sources["raw"].selected.indices = initial_select
    # polygon large enough to cover the whole plot area
    box_select = SelectionGeometry(
        softlabel.figure,
        geometry={
            "type": "poly",
            "sx": [-1e4, -1e4, 1e4, 1e4],
            "sy": [-1e4, 1e4, 1e4, -1e4],
            "x": [None, None, None, None],
            "y": [None, None, None, None],
        },
    )
    softlabel.figure._trigger_event(box_select)
    # the selection should have propagated from softlabel to annotator
    assert annotator.sources["raw"].selected.indices == initial_select

    # check score filtering
    # nothing happens when filter is inactive
    softlabel.score_range.value = (range_low, range_high)
    assert softlabel.sources["raw"].selected.indices == initial_select
    # activate score filter
    softlabel.score_filter_box.active = [0]
    first_select = softlabel.sources["raw"].selected.indices[:]
    # filtering should shrink the selection to a proper subset
    assert first_select != initial_select
    assert set(first_select).issubset(set(initial_select))
    assert first_select == annotator.sources["raw"].selected.indices

    # check regex co-filtering
    finder.search_filter_box.active = [0]
    finder.search_pos.value = r"(?i)s[aeiou]\ "
    second_select = softlabel.sources["raw"].selected.indices[:]
    # both filters active: selection narrows further
    assert second_select != first_select
    assert set(second_select).issubset(set(first_select))

    # check filter interaction: untoggle score filter
    softlabel.score_filter_box.active = []
    third_select = softlabel.sources["raw"].selected.indices[:]
    # with only the regex filter left, the selection should widen again
    assert third_select != second_select
    assert set(second_select).issubset(set(third_select))

    # deactivate regex filter too
    finder.search_filter_box.active = []
    unfilter_select = softlabel.sources["raw"].selected.indices[:]
    # no filters active: back to the full manual selection
    assert unfilter_select == initial_select
94 |
95 |
def test_snorkel_crosscheck(example_audio_dataset, dummy_labeling_function_list):
    """Smoke-test the private snorkel crosscheck recipe on an audio dataset."""
    working_copy = example_audio_dataset.copy()
    layout, objects = _snorkel_crosscheck(working_copy, dummy_labeling_function_list)
    assert layout.visible

    # TODO: add emulations of user activity
    assert objects
103 |
104 |
@pytest.mark.lite
def test_servable_experimental(
    example_text_dataset,
    dummy_vecnet_callback,
    dummy_labeling_function_list,
):
    """Check that the public experimental recipes yield servable handles."""
    # each recipe operates on its own copy of the dataset
    active_dataset = example_text_dataset.copy()
    active_handle = active_learning(
        active_dataset, dummy_vecnet_callback(active_dataset)
    )

    snorkel_handle = snorkel_crosscheck(
        example_text_dataset.copy(), dummy_labeling_function_list
    )

    for recipe_handle in (active_handle, snorkel_handle):
        execute_handle_function(recipe_handle)
121 |
--------------------------------------------------------------------------------
/docs/pages/tutorial/t1-active-learning.md:
--------------------------------------------------------------------------------
1 | > The most common usage of `hover` is through built-in `recipe`s like in the quickstart.
2 | >
3 | > :ferris_wheel: Let's explore another `recipe` -- an active learning example.
4 |
5 | {!docs/snippets/html/thebe.html!}
6 | {!docs/snippets/markdown/binder-kernel.md!}
7 | {!docs/snippets/markdown/local-dependency.md!}
8 | {!docs/snippets/markdown/local-dep-text.md!}
9 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!}
10 |
11 | ## **Fundamentals**
12 |
13 | Hover `recipe`s are functions that take a `SupervisableDataset` and return an annotation interface.
14 |
15 | The `SupervisableDataset` is assumed to have some data and embeddings.
16 |
17 | ## **Recap: Data & Embeddings**
18 |
19 | Let's prepare a dataset with embeddings. This is almost the same as in the [quickstart](../t0-quickstart/):
20 |
21 |
22 | {!docs/snippets/py/tz-dataset-text-full.txt!}
23 |
24 |
25 |
26 | {!docs/snippets/py/t0-1-vectorizer.txt!}
27 |
28 | {!docs/snippets/py/t0-1a-vectorizer-print.txt!}
29 |
30 |
31 |
32 | {!docs/snippets/py/t0-2-reduction.txt!}
33 |
34 |
35 | ## **Recipe-Specific Ingredient**
36 |
37 | Each recipe has different functionalities and potentially different signature.
38 |
39 | To utilize active learning, we need to specify how to get a model in the loop.
40 |
41 | `hover` considers the `vectorizer` as a "frozen" embedding and follows up with a neural network, which infers its own dimensionality from the vectorizer and the output classes.
42 |
43 | - This architecture named [`VectorNet`](../../reference/core-neural/#hover.core.neural.VectorNet) is the (default) basis of active learning in `hover`.
44 |
45 | ??? info "Custom models"
46 | It is possible to use a model other than `VectorNet` or its subclass.
47 |
48 | You will need to implement the following methods with the same signatures as `VectorNet`:
49 |
50 | - [`train`](../../reference/core-neural/#hover.core.neural.VectorNet.train)
51 | - [`save`](../../reference/core-neural/#hover.core.neural.VectorNet.save)
52 | - [`predict_proba`](../../reference/core-neural/#hover.core.neural.VectorNet.predict_proba)
53 | - [`prepare_loader`](../../reference/core-neural/#hover.core.neural.VectorNet.prepare_loader)
54 | - [`manifold_trajectory`](../../reference/core-neural/#hover.core.neural.VectorNet.manifold_trajectory)
55 |
56 |
57 | {!docs/snippets/py/t1-0-vecnet-callback.txt!}
58 |
59 | {!docs/snippets/py/t1-0a-vecnet-callback-print.txt!}
60 |
61 |
62 | Note how the callback dynamically takes `dataset.classes`, which means the model architecture will adapt when we add classes during annotation.
63 |
64 |
65 | ## :sparkles: **Apply Labels**
66 |
67 | Now we invoke the `active_learning` recipe.
68 |
69 | ??? tip "Tips: how recipes work programmatically"
70 | In general, a `recipe` is a function taking a `SupervisableDataset` and other arguments based on its functionality.
71 |
72 | Here are a few common recipes:
73 |
74 | === "active_learning"
75 |
76 | ::: hover.recipes.experimental.active_learning
77 | rendering:
78 | show_root_heading: false
79 | show_root_toc_entry: false
80 |
81 | === "simple_annotator"
82 |
83 | ::: hover.recipes.stable.simple_annotator
84 | rendering:
85 | show_root_heading: false
86 | show_root_toc_entry: false
87 |
88 | === "linked_annotator"
89 |
90 | ::: hover.recipes.stable.linked_annotator
91 | rendering:
92 | show_root_heading: false
93 | show_root_toc_entry: false
94 |
95 | The recipe returns a `handle` function which `bokeh` can use to visualize an annotation interface in multiple settings.
96 |
97 |
98 | {!docs/snippets/py/t1-1-active-learning.txt!}
99 |
100 | {!docs/snippets/py/tz-bokeh-show-server.txt!}
101 |
102 | {!docs/snippets/py/tz-bokeh-show-notebook.txt!}
103 |
104 |
105 | ???+ tip "Tips: annotation interface with multiple plots"
106 | ??? example "Video guide: leveraging linked selection"
107 | VIDEO
108 |
109 | ???+ example "Video guide: active learning"
110 | VIDEO
111 |
112 | ??? info "Text guide: active learning"
113 | Inspecting model predictions allows us to
114 |
115 | - get an idea of how the current set of annotations will likely teach the model.
116 | - locate the most valuable samples for further annotation.
117 |
118 | {!docs/snippets/html/stylesheet.html!}
119 |
--------------------------------------------------------------------------------
/notebooks/archive-prototype/Programmatic-Polyselect.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%load_ext autoreload\n",
10 | "\n",
11 | "%autoreload 2"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import sys\n",
21 | "sys.path.append('../../')"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 |     "### Use an Event Trigger to Make Selections: **Seems not working**"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "import pandas as pd\n",
38 | "import numpy as np\n",
39 | "import random"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "from bokeh.io import output_notebook, show\n",
49 | "from bokeh.plotting import figure\n",
50 | "from bokeh.models import ColumnDataSource\n",
51 | "\n",
52 | "output_notebook()"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from bokeh.models import RangeSlider\n",
62 | "from bokeh.layouts import column\n",
63 | "from bokeh.events import SelectionGeometry\n",
64 | "from hover.utils.bokeh_helper import servable\n",
65 | "\n",
66 | "def almost_global_select(figure):\n",
67 | " select_event = SelectionGeometry(\n",
68 | " figure,\n",
69 | " geometry={\n",
70 | " \"type\": \"poly\",\n",
71 | " \"x\": [-1e4, -1e4, 1e4, 1e4],\n",
72 | " \"y\": [-1e4, 1e4, 1e4, -1e4],\n",
73 | " \"sx\": [None, None, None, None],\n",
74 | " \"sy\": [None, None, None, None],\n",
75 | " },\n",
76 | " )\n",
77 | " return select_event\n",
78 | "\n",
79 | "@servable()\n",
80 | "def burner():\n",
81 | " \"\"\"\n",
82 |     "    Trying to simulate polygon-based selections.\n",
83 | " \"\"\"\n",
84 | " df = pd.DataFrame({\n",
85 | " 'x': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n",
86 | " 'y': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n",
87 | " 'flag': [random.choice([True, False]) for i in range(100)],\n",
88 | " })\n",
89 | " \n",
90 | " source = ColumnDataSource(df)\n",
91 | " plot = figure(tools=['poly_select', 'lasso_select'])\n",
92 | " plot.circle(source=source)\n",
93 | " x_slider = RangeSlider(start=-1.0, end=1.0, value=(-0.5, 0.5), step=0.01)\n",
94 | " y_slider = RangeSlider(start=-1.0, end=1.0, value=(-0.5, 0.5), step=0.01)\n",
95 | " \n",
96 | " def slider_callback(attr, old, new):\n",
97 | " x_l, x_r = x_slider.value\n",
98 | " y_d, y_u = y_slider.value\n",
99 | " select_event = SelectionGeometry(\n",
100 | " plot,\n",
101 | " geometry={\n",
102 | " \"type\": \"poly\",\n",
103 | " \"x\": [x_l, x_l, x_r, x_r],\n",
104 | " \"y\": [y_d, y_u, y_u, y_d],\n",
105 | " #\"sx\": [None, None, None, None],\n",
106 | " #\"sy\": [None, None, None, None],\n",
107 | " },\n",
108 | " )\n",
109 | " plot._trigger_event(select_event)\n",
110 | " # use a patch to verify the polygon\n",
111 | " plot.patch([x_l, x_l, x_r, x_r], [y_d, y_u, y_u, y_d], alpha=0.2, line_width=1)\n",
112 | " # check the number of selected points\n",
113 | " print(len(source.selected.indices), end=\"\\r\")\n",
114 | " \n",
115 | " x_slider.on_change('value', slider_callback)\n",
116 | " y_slider.on_change('value', slider_callback)\n",
117 | " \n",
118 | " return column(x_slider, y_slider, plot)"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "handle = burner()\n",
128 | "show(handle)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": []
137 | }
138 | ],
139 | "metadata": {
140 | "kernelspec": {
141 | "display_name": "Python 3 (ipykernel)",
142 | "language": "python",
143 | "name": "python3"
144 | },
145 | "language_info": {
146 | "codemirror_mode": {
147 | "name": "ipython",
148 | "version": 3
149 | },
150 | "file_extension": ".py",
151 | "mimetype": "text/x-python",
152 | "name": "python",
153 | "nbconvert_exporter": "python",
154 | "pygments_lexer": "ipython3",
155 | "version": "3.9.7"
156 | }
157 | },
158 | "nbformat": 4,
159 | "nbformat_minor": 4
160 | }
161 |
--------------------------------------------------------------------------------
/notebooks/archive-prototype/Slider-Filter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%load_ext autoreload\n",
10 | "\n",
11 | "%autoreload 2"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import sys\n",
21 | "sys.path.append('../../')"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "### Use a Slider for Filtering Data Points"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "import pandas as pd\n",
38 | "import numpy as np\n",
39 | "import random"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "df = pd.DataFrame({\n",
49 | " 'x': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n",
50 | " 'y': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n",
51 | " 'flag': [random.choice([True, False]) for i in range(100)],\n",
52 | "})\n",
53 | "df.head()"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "from bokeh.io import output_notebook, show\n",
63 | "from bokeh.plotting import figure\n",
64 | "from bokeh.models import ColumnDataSource\n",
65 | "\n",
66 | "output_notebook()"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "from bokeh.models import RangeSlider\n",
76 | "from bokeh.layouts import column\n",
77 | "from bokeh.events import SelectionGeometry\n",
78 | "from hover.utils.bokeh_helper import servable\n",
79 | "\n",
80 | "@servable()\n",
81 | "def burner():\n",
82 | " \"\"\"\n",
83 | " Trying to intersect the last manually specified selection with a slider coords/attribute range.\n",
84 | " \"\"\"\n",
85 | " slider = RangeSlider(start=-1.0, end=1.0, value=(-0.5, 0.5), step=0.01)\n",
86 | " source = ColumnDataSource(df)\n",
87 | " plot = figure(tools=['poly_select', 'lasso_select', 'pan', 'wheel_zoom'])\n",
88 | " plot.circle(source=source)\n",
89 | " \n",
90 | " last_manual_selection = set()\n",
91 | " \n",
92 | " def subroutine(lower, upper):\n",
93 | " filter_l = set(np.where(df['y'] > lower)[0])\n",
94 | " filter_u = set(np.where(df['y'] < upper)[0])\n",
95 | " filtered = filter_l.intersection(filter_u)\n",
96 | " return filtered\n",
97 | " \n",
98 | " def selection_callback(event):\n",
99 | " \"\"\"\n",
100 | " CAUTION: this has to overwrite the last manual selection.\n",
101 | " Hence only manual selections should trigger this callback.\n",
102 | " \"\"\"\n",
103 | " last_manual_selection.clear()\n",
104 | " last_manual_selection.update(source.selected.indices.copy())\n",
105 | " filtered = subroutine(*slider.value)\n",
106 | " print('A')\n",
107 | " source.selected.indices = list(filtered.intersection(last_manual_selection))\n",
108 | " \n",
109 | " def foo(event):\n",
110 | " print('B')\n",
111 | " \n",
112 | " def slider_callback(attr, old, new):\n",
113 | " to_select = subroutine(*new)\n",
114 | " if last_manual_selection:\n",
115 | " to_select = to_select.intersection(last_manual_selection)\n",
116 | " source.selected.indices = list(to_select)\n",
117 | " \n",
118 | " plot.on_event(SelectionGeometry, selection_callback)\n",
119 | " plot.on_event(SelectionGeometry, foo)\n",
120 | " slider.on_change('value', slider_callback)\n",
121 | " \n",
122 | " return column(slider, plot)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "handle = burner()\n",
132 | "show(handle)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": []
141 | }
142 | ],
143 | "metadata": {
144 | "kernelspec": {
145 | "display_name": "Python 3 (ipykernel)",
146 | "language": "python",
147 | "name": "python3"
148 | },
149 | "language_info": {
150 | "codemirror_mode": {
151 | "name": "ipython",
152 | "version": 3
153 | },
154 | "file_extension": ".py",
155 | "mimetype": "text/x-python",
156 | "name": "python",
157 | "nbconvert_exporter": "python",
158 | "pygments_lexer": "ipython3",
159 | "version": "3.9.7"
160 | }
161 | },
162 | "nbformat": 4,
163 | "nbformat_minor": 4
164 | }
165 |
--------------------------------------------------------------------------------