├── tests ├── __init__.py ├── core │ ├── __init__.py │ ├── explorer │ │ ├── __init__.py │ │ └── test_feature.py │ ├── representation │ │ ├── test_manifold.py │ │ ├── test_trajectory.py │ │ ├── conftest.py │ │ └── test_reduction.py │ ├── test_local_config.py │ └── test_neural.py ├── recipes │ ├── __init__.py │ ├── test_subroutine.py │ ├── local_helper.py │ └── test_experimental.py ├── module_config │ ├── hover_alt_config_3.ini │ ├── hover_alt_config_2.ini │ └── hover_alt_config_1.ini ├── local_config.py └── utils │ ├── conftest.py │ ├── test_metrics.py │ ├── test_common_nn.py │ ├── test_typecheck.py │ ├── test_snorkel_helper.py │ ├── test_datasets.py │ ├── test_bokeh_helper.py │ ├── test_torch_helper.py │ └── test_misc.py ├── fixture_module ├── __init__.py ├── .gitignore ├── text_vector_net │ └── __init__.py ├── audio_vector_net │ └── __init__.py └── image_vector_net │ └── __init__.py ├── hover ├── utils │ ├── meta │ │ ├── __init__.py │ │ └── traceback.py │ ├── __init__.py │ ├── metrics.py │ ├── typecheck.py │ ├── bokeh_helper │ │ └── local_config.py │ ├── common_nn.py │ ├── snorkel_helper.py │ ├── misc.py │ ├── datasets.py │ └── torch_helper.py ├── recipes │ ├── __init__.py │ ├── local_config.py │ └── stable.py ├── core │ ├── representation │ │ ├── __init__.py │ │ ├── local_config.py │ │ ├── trajectory.py │ │ ├── reduction.py │ │ └── manifold.py │ ├── explorer │ │ ├── __init__.py │ │ └── local_config.py │ ├── __init__.py │ └── local_config.py ├── module_config.py └── config_constants.py ├── docs ├── pages │ ├── guides │ │ ├── datatype-multimodal.md │ │ ├── g2-hover-config.md │ │ ├── g0-datatype-image.md │ │ └── g1-datatype-audio.md │ ├── reference │ │ ├── core-neural.md │ │ ├── core-dataset.md │ │ ├── core-explorer-base.md │ │ ├── core-explorer-feature.md │ │ ├── core-representation.md │ │ ├── core-explorer-functionality.md │ │ ├── core-explorer-specialization.md │ │ ├── utils-bokeh_helper.md │ │ ├── utils-snorkel_helper.md │ │ └── recipes.md │ └── tutorial │ │ ├── 
t6-softlabel-joint-filter.md │ │ ├── t5-finder-filter.md │ │ ├── t2-bokeh-app.md │ │ ├── t7-snorkel-improvise-rules.md │ │ ├── t3-dataset-population-selection.md │ │ └── t1-active-learning.md ├── index.zh.md ├── snippets │ ├── py │ │ ├── g2-4-config-hint.txt │ │ ├── g0-4a-reduction-print.txt │ │ ├── t4-5-dataset-view.txt │ │ ├── g2-1-configure-palette.txt │ │ ├── t6-1-softlabel-filter.txt │ │ ├── g2-2-configure-abstain-color.txt │ │ ├── t3-0-dataset-population-table.txt │ │ ├── g0-4-reduction.txt │ │ ├── t0-2z-reduction-3d.txt │ │ ├── t4-1-annotator-subset-toggle.txt │ │ ├── t7-2-snorkel-filter-button.txt │ │ ├── t4-2-annotator-selection-option.txt │ │ ├── t7-0a-lf-list-edit.txt │ │ ├── g2-3-configure-reduction-method.txt │ │ ├── t0-0a-dataset-text-print.txt │ │ ├── t4-4-annotator-search-box.txt │ │ ├── t0-3-simple-annotator.txt │ │ ├── t1-1-active-learning.txt │ │ ├── t5-1-finder-figure.txt │ │ ├── t0-2a-reduction-print.txt │ │ ├── t3-2-dataset-selection-table.txt │ │ ├── t7-3-snorkel-crosscheck.txt │ │ ├── t0-1a-vectorizer-print.txt │ │ ├── tz-bokeh-notebook-remote.txt │ │ ├── t3-3-dataset-evict-patch.txt │ │ ├── tz-bokeh-notebook-common.txt │ │ ├── t4-3-annotator-choose-axes.txt │ │ ├── t6-0-softlabel-figure.txt │ │ ├── t3-1-dataset-commit-dedup.txt │ │ ├── t7-1-snorkel-apply-button.txt │ │ ├── t1-0a-vecnet-callback-print.txt │ │ ├── g0-1-url-to-content.txt │ │ ├── g0-2-url-to-image.txt │ │ ├── t5-0-finder-filter.txt │ │ ├── tz-bokeh-show-notebook.txt │ │ ├── g1-1-url-to-audio.txt │ │ ├── t0-2-reduction.txt │ │ ├── t4-0-annotator-basics.txt │ │ ├── t1-0-vecnet-callback.txt │ │ ├── t0-1-vectorizer.txt │ │ ├── g1-2-audio-vectorizer.txt │ │ ├── g2-0-color-palette.txt │ │ ├── tz-bokeh-show-server.txt │ │ ├── g1-0-dataset-audio.txt │ │ ├── g0-0-dataset-image.txt │ │ ├── t0-0-dataset-text.txt │ │ ├── g0-3-image-vectorizer.txt │ │ ├── tz-dataset-text-full.txt │ │ └── t7-0-lf-list.txt │ ├── markdown │ │ ├── local-dep-audio.md │ │ ├── local-dep-snorkel.md │ │ ├── 
local-dep-image.md │ │ ├── local-dependency.md │ │ ├── readme │ │ │ ├── 0-opener.zh.md │ │ │ ├── 5-announcements.zh.md │ │ │ ├── 3-install.md │ │ │ ├── 0-opener.en.md │ │ │ ├── 5-announcements.en.md │ │ │ ├── 0a-language-badges.md │ │ │ ├── 4-resources.zh.md │ │ │ ├── 0c-intro.zh.md │ │ │ ├── 4-resources.en.md │ │ │ ├── 1-live-demos.zh.md │ │ │ ├── 0c-intro.en.md │ │ │ ├── 1-live-demos.en.md │ │ │ ├── 6-remarks.zh.md │ │ │ ├── 0b-status-badges.md │ │ │ ├── 6-remarks.en.md │ │ │ ├── 2-features.zh.md │ │ │ └── 2-features.en.md │ │ ├── local-dep-text.md │ │ ├── tutorial-required.md │ │ ├── dataset-prep.md │ │ ├── wrappy-cache.md │ │ ├── local-dep-jupyter-bokeh.md │ │ ├── component-tutorial.md │ │ ├── binder-kernel.md │ │ └── jupyterlab-js-issue.md │ └── html │ │ ├── stylesheet.html │ │ └── thebe.html ├── index.md ├── images │ ├── favicon.png │ ├── hover-logo-dark.png │ ├── hover-logo-light.png │ └── hover-logo-title.png ├── pipelines │ ├── requirements-doc-scripts.txt │ ├── README.md.template │ ├── local_helper.py │ ├── generate_readme.py │ ├── local_config.py │ └── check_scripts.py └── styles │ └── monokai.css ├── notebooks ├── .gitignore ├── archive-prototype │ ├── Programmatic-Event.ipynb │ ├── Dynamic-Widget.ipynb │ ├── Editing-Datatable.ipynb │ ├── Programmatic-Polyselect.ipynb │ └── Slider-Filter.ipynb └── Image-Experiment.ipynb ├── .gitignore ├── .github ├── dependabot.yml └── workflows │ ├── cross-os-conda-build.yml │ ├── handle-inactive.yml │ ├── cross-os-install-source.yml │ ├── cross-os-source-test.yml │ ├── assemble-readme.yml │ ├── doc-script-test.yml │ ├── doc-auto-notebook.yml │ └── quick-source-test.yml ├── pytest.ini ├── requirements-dev.txt ├── .pre-commit-config.yaml ├── LICENSE ├── conda-recipe ├── stable.yaml └── meta.yaml ├── setup.py ├── tox.ini └── mkdocs.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fixture_module/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hover/utils/meta/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/recipes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/core/explorer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/pages/guides/datatype-multimodal.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | *.pt.* 3 | *.csv 4 | -------------------------------------------------------------------------------- /fixture_module/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | *.pt 3 | *.pt.* 4 | -------------------------------------------------------------------------------- /docs/index.zh.md: -------------------------------------------------------------------------------- 1 | # Hover - 0.8.1 文档 2 | 3 | {!README.md!} 4 | -------------------------------------------------------------------------------- /docs/snippets/py/g2-4-config-hint.txt: -------------------------------------------------------------------------------- 1 | 
hover.config.hint() 2 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Hover - 0.8.1 Documentation 2 | 3 | {!README.md!} 4 | -------------------------------------------------------------------------------- /tests/module_config/hover_alt_config_3.ini: -------------------------------------------------------------------------------- 1 | [io] 2 | data_save_dir = . 3 | -------------------------------------------------------------------------------- /docs/snippets/py/g0-4a-reduction-print.txt: -------------------------------------------------------------------------------- 1 | dataset.dfs["raw"]().head(5) 2 | -------------------------------------------------------------------------------- /docs/snippets/py/t4-5-dataset-view.txt: -------------------------------------------------------------------------------- 1 | show(dataset.view(), notebook_url=notebook_url) 2 | -------------------------------------------------------------------------------- /docs/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phurwicz/hover/HEAD/docs/images/favicon.png -------------------------------------------------------------------------------- /docs/snippets/py/g2-1-configure-palette.txt: -------------------------------------------------------------------------------- 1 | hover.config["visual"]["abstain_hexcolor"] = "#bababa" 2 | -------------------------------------------------------------------------------- /docs/snippets/py/t6-1-softlabel-filter.txt: -------------------------------------------------------------------------------- 1 | show(softlabel.score_filter, notebook_url=notebook_url) 2 | -------------------------------------------------------------------------------- /docs/snippets/py/g2-2-configure-abstain-color.txt: 
-------------------------------------------------------------------------------- 1 | hover.config["visual"]["abstain_hexcolor"] = "#bababa" 2 | -------------------------------------------------------------------------------- /docs/snippets/py/t3-0-dataset-population-table.txt: -------------------------------------------------------------------------------- 1 | show(dataset.pop_table, notebook_url=notebook_url) 2 | -------------------------------------------------------------------------------- /docs/images/hover-logo-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phurwicz/hover/HEAD/docs/images/hover-logo-dark.png -------------------------------------------------------------------------------- /docs/snippets/py/g0-4-reduction.txt: -------------------------------------------------------------------------------- 1 | reducer = dataset.compute_nd_embedding(vectorizer, "umap", dimension=2) 2 | -------------------------------------------------------------------------------- /docs/images/hover-logo-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phurwicz/hover/HEAD/docs/images/hover-logo-light.png -------------------------------------------------------------------------------- /docs/images/hover-logo-title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phurwicz/hover/HEAD/docs/images/hover-logo-title.png -------------------------------------------------------------------------------- /docs/pages/reference/core-neural.md: -------------------------------------------------------------------------------- 1 | - ::: hover.core.neural 2 | rendering: 3 | show_root_heading: false 4 | -------------------------------------------------------------------------------- /docs/snippets/py/t0-2z-reduction-3d.txt: 
-------------------------------------------------------------------------------- 1 | reducer = dataset.compute_nd_embedding(vectorizer, "umap", dimension=3) 2 | -------------------------------------------------------------------------------- /docs/snippets/py/t4-1-annotator-subset-toggle.txt: -------------------------------------------------------------------------------- 1 | show(annotator.data_key_button_group, notebook_url=notebook_url) 2 | -------------------------------------------------------------------------------- /docs/snippets/py/t7-2-snorkel-filter-button.txt: -------------------------------------------------------------------------------- 1 | show(snorkel_plot.lf_filter_trigger, notebook_url=notebook_url) 2 | -------------------------------------------------------------------------------- /docs/pages/reference/core-dataset.md: -------------------------------------------------------------------------------- 1 | - ::: hover.core.dataset 2 | rendering: 3 | show_root_heading: false 4 | -------------------------------------------------------------------------------- /docs/snippets/py/t4-2-annotator-selection-option.txt: -------------------------------------------------------------------------------- 1 | show(annotator.selection_option_box, notebook_url=notebook_url) 2 | -------------------------------------------------------------------------------- /docs/snippets/py/t7-0a-lf-list-edit.txt: -------------------------------------------------------------------------------- 1 | # we will come back to this block later on 2 | # LABELING_FUNCTIONS.pop(-1) 3 | -------------------------------------------------------------------------------- /docs/snippets/py/g2-3-configure-reduction-method.txt: -------------------------------------------------------------------------------- 1 | hover.config["data.embedding"]["default_reduction_method"] = "ivis" 2 | -------------------------------------------------------------------------------- 
/docs/pages/reference/core-explorer-base.md: -------------------------------------------------------------------------------- 1 | - ::: hover.core.explorer.base 2 | rendering: 3 | show_root_heading: false 4 | -------------------------------------------------------------------------------- /docs/snippets/py/t0-0a-dataset-text-print.txt: -------------------------------------------------------------------------------- 1 | # each subset can be accessed as its own DataFrame 2 | dataset.dfs["raw"]().head(5) 3 | -------------------------------------------------------------------------------- /docs/snippets/py/t4-4-annotator-search-box.txt: -------------------------------------------------------------------------------- 1 | show(row(annotator.search_pos, annotator.search_neg), notebook_url=notebook_url) 2 | -------------------------------------------------------------------------------- /docs/pages/reference/core-explorer-feature.md: -------------------------------------------------------------------------------- 1 | - ::: hover.core.explorer.feature 2 | rendering: 3 | show_root_heading: false 4 | -------------------------------------------------------------------------------- /docs/snippets/markdown/local-dep-audio.md: -------------------------------------------------------------------------------- 1 | To run the audio embedding code on this page, you need `pip install librosa wrappy`. 
2 | -------------------------------------------------------------------------------- /docs/pages/reference/core-representation.md: -------------------------------------------------------------------------------- 1 | - ::: hover.core.representation.reduction 2 | rendering: 3 | show_root_heading: true 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *ipynb_checkpoints* 2 | *pycache* 3 | .*.pkl 4 | .tox 5 | .coverage 6 | cobertura.xml 7 | .DS_Store 8 | annoy.index 9 | site/ 10 | -------------------------------------------------------------------------------- /docs/pages/reference/core-explorer-functionality.md: -------------------------------------------------------------------------------- 1 | - ::: hover.core.explorer.functionality 2 | rendering: 3 | show_root_heading: false 4 | -------------------------------------------------------------------------------- /docs/pages/reference/core-explorer-specialization.md: -------------------------------------------------------------------------------- 1 | - ::: hover.core.explorer.specialization 2 | rendering: 3 | show_root_heading: false 4 | -------------------------------------------------------------------------------- /docs/snippets/py/t0-3-simple-annotator.txt: -------------------------------------------------------------------------------- 1 | from hover.recipes.stable import simple_annotator 2 | 3 | interactive_plot = simple_annotator(dataset) 4 | -------------------------------------------------------------------------------- /docs/snippets/markdown/local-dep-snorkel.md: -------------------------------------------------------------------------------- 1 | To use `snorkel` labeling functions, you need: 2 | ```shell 3 | pip install snorkel 4 | ``` 5 | -------------------------------------------------------------------------------- /docs/snippets/markdown/local-dep-image.md: 
-------------------------------------------------------------------------------- 1 | To run the image embedding code on this page, you need `pip install efficientnet_pytorch torchvision wrappy`. 2 | -------------------------------------------------------------------------------- /docs/snippets/py/t1-1-active-learning.txt: -------------------------------------------------------------------------------- 1 | from hover.recipes.experimental import active_learning 2 | 3 | interactive_plot = active_learning(dataset, vecnet) 4 | -------------------------------------------------------------------------------- /docs/snippets/py/t5-1-finder-figure.txt: -------------------------------------------------------------------------------- 1 | show(column( 2 | row(finder.search_pos, finder.search_neg), 3 | finder.figure, 4 | ), notebook_url=notebook_url) 5 | -------------------------------------------------------------------------------- /docs/snippets/py/t0-2a-reduction-print.txt: -------------------------------------------------------------------------------- 1 | # what we did adds 'embed_2d_0' and 'embed_2d_1' columns to the DataFrames in dataset.dfs 2 | dataset.dfs["raw"]().head(5) 3 | -------------------------------------------------------------------------------- /docs/snippets/py/t3-2-dataset-selection-table.txt: -------------------------------------------------------------------------------- 1 | dataset._callback_update_selection(dataset.dfs["raw"][:10]) 2 | 3 | show(dataset.sel_table, notebook_url=notebook_url) 4 | -------------------------------------------------------------------------------- /docs/snippets/py/t7-3-snorkel-crosscheck.txt: -------------------------------------------------------------------------------- 1 | from hover.recipes.experimental import snorkel_crosscheck 2 | 3 | interactive_plot = snorkel_crosscheck(dataset, LABELING_FUNCTIONS) 4 | -------------------------------------------------------------------------------- 
/tests/module_config/hover_alt_config_2.ini: -------------------------------------------------------------------------------- 1 | [data.values] 2 | abstain_decoded = LABEL.ABSTAIN 3 | abstain_encoded = -2 4 | 5 | [data.columns] 6 | dataset_subset_field = subset 7 | -------------------------------------------------------------------------------- /docs/snippets/markdown/local-dependency.md: -------------------------------------------------------------------------------- 1 | ??? info "Dependencies for {== local environments ==}" 2 | When you run the code locally, you may need to install additional packages. 3 | -------------------------------------------------------------------------------- /docs/snippets/py/t0-1a-vectorizer-print.txt: -------------------------------------------------------------------------------- 1 | text = dataset.dfs["raw"]().loc[0, "text"] 2 | vec = vectorizer(text) 3 | print(f"Text: {text}") 4 | print(f"Vector shape: {vec.shape}") 5 | -------------------------------------------------------------------------------- /hover/recipes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ???+ note "High-level functions to produce an interactive annotation interface." 
3 | """ 4 | from .stable import simple_annotator, linked_annotator 5 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | open-pull-requests-limit: 10 8 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/0-opener.zh.md: -------------------------------------------------------------------------------- 1 | ![Hover](https://raw.githubusercontent.com/phurwicz/hover/main/docs/images/hover-logo-title.png) 2 | 3 | > 通过向量降维, 极速探索和批量标注数据, 并用作模型训练或其它用途. 4 | -------------------------------------------------------------------------------- /docs/pages/reference/utils-bokeh_helper.md: -------------------------------------------------------------------------------- 1 | - ::: hover.utils.bokeh_helper 2 | rendering: 3 | show_root_heading: false 4 | show_root_toc_entry: false 5 | heading_level: 3 6 | -------------------------------------------------------------------------------- /docs/pages/reference/utils-snorkel_helper.md: -------------------------------------------------------------------------------- 1 | - ::: hover.utils.snorkel_helper 2 | rendering: 3 | show_root_heading: false 4 | show_root_toc_entry: false 5 | heading_level: 3 6 | -------------------------------------------------------------------------------- /docs/snippets/markdown/local-dep-text.md: -------------------------------------------------------------------------------- 1 | To run the text embedding code on this page, you need: 2 | ```shell 3 | pip install spacy 4 | python -m spacy download en_core_web_md 5 | ``` 6 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/5-announcements.zh.md: 
-------------------------------------------------------------------------------- 1 | ## :flags: 新动态 2 | 3 | - **Jan 21, 2023** 新版本 0.8.0 已就绪, 可查看 [changelog](https://github.com/phurwicz/hover/blob/main/CHANGELOG.md) 获悉详情 :partying_face:. 4 | -------------------------------------------------------------------------------- /docs/snippets/py/tz-bokeh-notebook-remote.txt: -------------------------------------------------------------------------------- 1 | # special configuration for this remotely hosted tutorial 2 | from local_lib.binder_helper import remote_jupyter_proxy_url 3 | notebook_url = remote_jupyter_proxy_url 4 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/3-install.md: -------------------------------------------------------------------------------- 1 | ## :package: Install 2 | 3 | > Python: 3.8+ 4 | > 5 | > OS: Linux & Mac & Windows 6 | 7 | PyPI: `pip install hover` 8 | 9 | Conda: `conda install -c conda-forge hover` 10 | -------------------------------------------------------------------------------- /docs/snippets/py/t3-3-dataset-evict-patch.txt: -------------------------------------------------------------------------------- 1 | show(column( 2 | row( 3 | dataset.selection_evictor, 4 | dataset.selection_patcher, 5 | ), 6 | dataset.sel_table, 7 | ), notebook_url=notebook_url) 8 | -------------------------------------------------------------------------------- /docs/snippets/py/tz-bokeh-notebook-common.txt: -------------------------------------------------------------------------------- 1 | from bokeh.io import show, output_notebook 2 | 3 | output_notebook() 4 | 5 | # normally your would skip notebook_url or use Jupyter address 6 | notebook_url = 'localhost:8888' 7 | -------------------------------------------------------------------------------- /docs/snippets/html/stylesheet.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 
-------------------------------------------------------------------------------- /docs/snippets/py/t4-3-annotator-choose-axes.txt: -------------------------------------------------------------------------------- 1 | annotator = standard_annotator(dataset) 2 | 3 | show(column( 4 | row(annotator.dropdown_x_axis, annotator.dropdown_y_axis), 5 | annotator.figure, 6 | ), notebook_url=notebook_url) 7 | -------------------------------------------------------------------------------- /docs/snippets/py/t6-0-softlabel-figure.txt: -------------------------------------------------------------------------------- 1 | from hover.recipes.subroutine import standard_softlabel 2 | from bokeh.layouts import row, column 3 | 4 | softlabel = standard_softlabel(dataset) 5 | show(softlabel.figure, notebook_url=notebook_url) 6 | -------------------------------------------------------------------------------- /hover/core/representation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ???+ note "Leveraging dimensionality reduction to make 2-D representations of data points." 3 | 4 | This is intended to be useful for making interactive general-purpose data explorers. 5 | """ 6 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/0-opener.en.md: -------------------------------------------------------------------------------- 1 | ![Hover](https://raw.githubusercontent.com/phurwicz/hover/main/docs/images/hover-logo-title.png) 2 | 3 | > Explore and label on a map of your data. 4 | > 5 | > Get enough to feed your model in no time. 6 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/5-announcements.en.md: -------------------------------------------------------------------------------- 1 | ## :flags: Announcements 2 | 3 | - **Jan 21, 2023** version 0.8.0 is now available. 
Check out the [changelog](https://github.com/phurwicz/hover/blob/main/CHANGELOG.md) for details :partying_face:. 4 | -------------------------------------------------------------------------------- /docs/snippets/markdown/tutorial-required.md: -------------------------------------------------------------------------------- 1 | ???+ warning "This page assumes that you have know the basics" 2 | i.e. simple usage of `dataset` and `annotator`. Please visit the [quickstart tutorial](/hover/pages/tutorial/t0-quickstart) if you haven't done so. 3 | -------------------------------------------------------------------------------- /docs/snippets/py/t3-1-dataset-commit-dedup.txt: -------------------------------------------------------------------------------- 1 | from bokeh.layouts import row, column 2 | 3 | show(column( 4 | row( 5 | dataset.data_committer, 6 | dataset.dedup_trigger, 7 | ), 8 | dataset.pop_table, 9 | ), notebook_url=notebook_url) 10 | -------------------------------------------------------------------------------- /docs/snippets/py/t7-1-snorkel-apply-button.txt: -------------------------------------------------------------------------------- 1 | from hover.recipes.subroutine import standard_snorkel 2 | 3 | snorkel_plot = standard_snorkel(dataset) 4 | snorkel_plot.subscribed_lf_list = LABELING_FUNCTIONS 5 | show(snorkel_plot.lf_apply_trigger, notebook_url=notebook_url) 6 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/0a-language-badges.md: -------------------------------------------------------------------------------- 1 | [![en](https://img.shields.io/badge/lang-en-green.svg)](https://github.com/phurwicz/hover/blob/main/README.md) 2 | [![zh](https://img.shields.io/badge/语言-中文-green.svg)](https://github.com/phurwicz/hover/blob/main/README.zh.md) 3 | -------------------------------------------------------------------------------- /docs/snippets/py/t1-0a-vecnet-callback-print.txt: 
-------------------------------------------------------------------------------- 1 | # predict_proba accepts individual strings or list 2 | # text -> vector -> class probabilities 3 | # if no classes right now, will see an empty list 4 | print(vecnet.predict_proba(text)) 5 | print(vecnet.predict_proba([text])) 6 | -------------------------------------------------------------------------------- /hover/recipes/local_config.py: -------------------------------------------------------------------------------- 1 | import hover 2 | from hover.config_constants import ( 3 | ConfigSection as Section, 4 | ConfigKey as Key, 5 | ) 6 | 7 | DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][ 8 | Key.DEFAULT_REDUCTION_METHOD 9 | ] 10 | -------------------------------------------------------------------------------- /hover/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions or classes, for example: 3 | 4 | (1) connectors to another library like Torch/Snorkel/etc. 5 | (2) optional data structures that work smoothly with the core module. 6 | (3) supportive subroutines such as logging templates. 7 | """ 8 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | core: central functionalities involved in most, if not all, use cases. 4 | lite: fast to run and no platform-specific extra dependency. 5 | benchmark: about performance rather than correctness. 6 | builtin: 'vanilla' extension beyond the core. 
7 | -------------------------------------------------------------------------------- /tests/local_config.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | PSEUDO_LABELS = ["A", "B"] 4 | 5 | VECTORIZER_BREAKER = "VECTORIZER_FALLS_APART" 6 | 7 | 8 | def RANDOM_LABEL(row): 9 | return random.choice(PSEUDO_LABELS) 10 | 11 | 12 | def RANDOM_SCORE(row): 13 | return random.uniform(0.2, 1.0) 14 | -------------------------------------------------------------------------------- /docs/snippets/py/g0-1-url-to-content.txt: -------------------------------------------------------------------------------- 1 | import requests 2 | from functools import lru_cache 3 | 4 | @lru_cache(maxsize=10000) 5 | def url_to_content(url): 6 | """ 7 | Turn a URL to response content. 8 | """ 9 | response = requests.get(url) 10 | return response.content 11 | -------------------------------------------------------------------------------- /docs/snippets/py/g0-2-url-to-image.txt: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | 4 | @lru_cache(maxsize=10000) 5 | def url_to_image(url): 6 | """ 7 | Turn a URL to a PIL Image. 
8 | """ 9 | img = Image.open(BytesIO(url_to_content(url))).convert("RGB") 10 | return img 11 | -------------------------------------------------------------------------------- /docs/snippets/py/t5-0-finder-filter.txt: -------------------------------------------------------------------------------- 1 | from hover.recipes.subroutine import standard_finder 2 | from bokeh.layouts import row, column 3 | 4 | finder = standard_finder(dataset) 5 | show(row( 6 | column(finder.search_pos, finder.search_neg), 7 | finder.search_filter_box, 8 | ), notebook_url=notebook_url) 9 | -------------------------------------------------------------------------------- /docs/snippets/py/tz-bokeh-show-notebook.txt: -------------------------------------------------------------------------------- 1 | # ---------- NOTEBOOK MODE: for your actual Jupyter environment --------- 2 | # this code will render the entire plot in Jupyter 3 | # from bokeh.io import show, output_notebook 4 | # output_notebook() 5 | # show(interactive_plot, notebook_url='https://localhost:8888') 6 | -------------------------------------------------------------------------------- /docs/snippets/py/g1-1-url-to-audio.txt: -------------------------------------------------------------------------------- 1 | import librosa 2 | from io import BytesIO 3 | 4 | @lru_cache(maxsize=10000) 5 | def url_to_audio(url): 6 | """ 7 | Turn a URL to audio data. 
8 | """ 9 | data, sampling_rate = librosa.load(BytesIO(url_to_content(url))) 10 | return data, sampling_rate 11 | -------------------------------------------------------------------------------- /docs/snippets/py/t0-2-reduction.txt: -------------------------------------------------------------------------------- 1 | # any kwargs will be passed onto the corresponding reduction 2 | # for umap: https://umap-learn.readthedocs.io/en/latest/parameters.html 3 | # for ivis: https://bering-ivis.readthedocs.io/en/latest/api.html 4 | reducer = dataset.compute_nd_embedding(vectorizer, "umap", dimension=2) 5 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # automation 2 | tox 3 | tox-gh-actions 4 | pre-commit 5 | # documentation 6 | mkdocs 7 | mkdocs-material 8 | mkdocs-macros-plugin 9 | mkdocs-static-i18n 10 | mkdocstrings 11 | mkdocstrings-python 12 | markdown-include 13 | mike 14 | # interactive development 15 | jupyter 16 | jupyterlab 17 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/4-resources.zh.md: -------------------------------------------------------------------------------- 1 | ## :book: 资料 2 | 3 | - [教程](https://phurwicz.github.io/hover/pages/tutorial/t0-quickstart/) 4 | - [Binder仓库](https://github.com/phurwicz/hover-binder) 5 | - [版本说明](https://github.com/phurwicz/hover/blob/main/CHANGELOG.md) 6 | - [文档](https://phurwicz.github.io/hover/) 7 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/0c-intro.zh.md: -------------------------------------------------------------------------------- 1 | `hover` 是一个批量标注数据的工具, 只需数据能被向量表示. 2 | 3 | - 标注过程很简单, 如同给散点图上色. 4 | - 通过移动鼠标和框选, 来观察数据(在降维后的)点簇. 5 | - 使用小工具(如搜索/过滤/规则/主动学习)来提升精度. 6 | - 输入合适的标签, 并点击"Apply"按钮, 即可标注! 
7 | 8 | ![GIF Demo](https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.5.0/trailer-short.gif) 9 | -------------------------------------------------------------------------------- /docs/snippets/markdown/dataset-prep.md: -------------------------------------------------------------------------------- 1 | As always, start with a ready-for-plot dataset: 2 | 3 |
 4 | {!docs/snippets/py/tz-dataset-text-full.txt!}
 5 | 

6 | 7 |
 8 | {!docs/snippets/py/t0-1-vectorizer.txt!}
 9 | 
10 | {!docs/snippets/py/t0-2-reduction.txt!}
11 | 

12 | -------------------------------------------------------------------------------- /docs/snippets/py/t4-0-annotator-basics.txt: -------------------------------------------------------------------------------- 1 | from hover.recipes.subroutine import standard_annotator 2 | from bokeh.layouts import row, column 3 | 4 | annotator = standard_annotator(dataset) 5 | show(column( 6 | row(annotator.annotator_input, annotator.annotator_apply), 7 | annotator.figure, 8 | ), notebook_url=notebook_url) 9 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/4-resources.en.md: -------------------------------------------------------------------------------- 1 | ## :book: Resources 2 | 3 | - [Tutorials](https://phurwicz.github.io/hover/pages/tutorial/t0-quickstart/) 4 | - [Binder repo](https://github.com/phurwicz/hover-binder) 5 | - [Changelog](https://github.com/phurwicz/hover/blob/main/CHANGELOG.md) 6 | - [Documentation](https://phurwicz.github.io/hover/) 7 | -------------------------------------------------------------------------------- /docs/pages/reference/recipes.md: -------------------------------------------------------------------------------- 1 | # `hover.recipes` 2 | 3 | - ::: hover.recipes.stable 4 | rendering: 5 | show_root_heading: true 6 | 7 | --- 8 | 9 | - ::: hover.recipes.experimental 10 | rendering: 11 | show_root_heading: true 12 | 13 | --- 14 | 15 | - ::: hover.recipes.subroutine 16 | rendering: 17 | show_root_heading: true 18 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/1-live-demos.zh.md: -------------------------------------------------------------------------------- 1 | ## :rocket: 在线演示 2 | 3 | ### [**Notebook教程**](https://phurwicz.github.io/hover/pages/tutorial/t0-quickstart/) 4 | 5 | - 查看含代码的教程, 可在浏览器中编辑和运行, 无需安装依赖. 
6 | 7 | ### [**示例标注界面**](https://mybinder.org/v2/gh/phurwicz/hover-binder/master?urlpath=/proxy/5006/app-simple-annotator) 8 | 9 | - 跳过所有代码, 进入托管在Binder上的标注界面. 10 | -------------------------------------------------------------------------------- /tests/utils/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from bokeh.plotting import figure 3 | 4 | 5 | @pytest.fixture 6 | def dummy_working_recipe(): 7 | def recipe(*args, **kwargs): 8 | return figure() 9 | 10 | return recipe 11 | 12 | 13 | @pytest.fixture 14 | def dummy_broken_recipe(): 15 | def recipe(*args, **kwargs): 16 | assert False 17 | 18 | return recipe 19 | -------------------------------------------------------------------------------- /docs/snippets/py/t1-0-vecnet-callback.txt: -------------------------------------------------------------------------------- 1 | from hover.core.neural import VectorNet 2 | from hover.utils.common_nn import LogisticRegression 3 | 4 | # Create a model with vectorizer-NN architecture. 
5 | # model.pt will point to a PyTorch state dict (to be created) 6 | # the label classes in the dataset can change, and vecnet can adjust to that 7 | vecnet = VectorNet(vectorizer, LogisticRegression, "model.pt", dataset.classes) 8 | -------------------------------------------------------------------------------- /docs/snippets/py/t0-1-vectorizer.txt: -------------------------------------------------------------------------------- 1 | import spacy 2 | import re 3 | from functools import lru_cache 4 | 5 | # use your preferred embedding for the task 6 | nlp = spacy.load("en_core_web_md") 7 | 8 | # raw data (str in this case) -> np.array 9 | @lru_cache(maxsize=int(1e+4)) 10 | def vectorizer(text): 11 | clean_text = re.sub(r"[\s]+", r" ", str(text)) 12 | return nlp(clean_text, disable=nlp.pipe_names).vector 13 | -------------------------------------------------------------------------------- /docs/snippets/markdown/wrappy-cache.md: -------------------------------------------------------------------------------- 1 | ???+ info "Caching and reading from disk" 2 | This guide uses [`@wrappy.memoize`](https://erniethornhill.github.io/wrappy/) in place of `@functools.lru_cache` for caching. 3 | 4 | - The benefit is that `wrappy.memoize` can persist the cache to disk, speeding up code across sessions. 5 | 6 | Cached values for this guide have been pre-computed, making it much faster to run the guide. 
7 | -------------------------------------------------------------------------------- /hover/core/representation/local_config.py: -------------------------------------------------------------------------------- 1 | import hover 2 | from hover.config_constants import ( 3 | ConfigSection as Section, 4 | ConfigKey as Key, 5 | ) 6 | 7 | KWARG_TRANSLATOR = { 8 | "dimension": { 9 | "umap": "n_components", 10 | "ivis": "embedding_dims", 11 | }, 12 | } 13 | 14 | DEFAULT_REDUCTION_METHOD = hover.config[Section.DATA_EMBEDDING][ 15 | Key.DEFAULT_REDUCTION_METHOD 16 | ] 17 | -------------------------------------------------------------------------------- /docs/snippets/py/g1-2-audio-vectorizer.txt: -------------------------------------------------------------------------------- 1 | import wrappy 2 | 3 | @wrappy.memoize(cache_limit=10000, persist_path='custom_cache/audio_url_to_vector.pkl') 4 | def vectorizer(url): 5 | """ 6 | Averaged MFCC over time. 7 | Resembles word-embedding-average-as-doc-embedding for texts. 
8 | """ 9 | y, sr = url_to_audio(url) 10 | mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=32) 11 | return mfcc.mean(axis=1) 12 | -------------------------------------------------------------------------------- /docs/snippets/py/g2-0-color-palette.txt: -------------------------------------------------------------------------------- 1 | import hover 2 | from hover.utils.bokeh_helper import auto_label_color 3 | from rich.console import Console 4 | 5 | console = Console() 6 | labels = ["A", "B", "C", "D", "E", "F"] 7 | color_dict = auto_label_color(labels) 8 | abstain = hover.config['data.values']['abstain_decoded'] 9 | 10 | for _label in [abstain, *labels]: 11 | console.print(f"\u2b24{_label}", style=color_dict[_label]) 12 | -------------------------------------------------------------------------------- /tests/core/representation/test_manifold.py: -------------------------------------------------------------------------------- 1 | from hover.core.representation.manifold import LayerwiseManifold 2 | import numpy as np 3 | 4 | 5 | def test_LayerwiseManifold(distance_preserving_array_sequence): 6 | LM = LayerwiseManifold(distance_preserving_array_sequence) 7 | LM.unfold(method="umap", random_state=0, transform_seed=0) 8 | _, disparities = LM.procrustes() 9 | assert (np.array(disparities) < 1e-16).all() 10 | -------------------------------------------------------------------------------- /docs/pipelines/requirements-doc-scripts.txt: -------------------------------------------------------------------------------- 1 | # auto-parse scripts in markdown files 2 | markdown 3 | markdown-include>=0.7.0 4 | # Jupyter environment 5 | jupyter 6 | jupyterlab 7 | # dependencies for specific code 8 | ## distant supervision 9 | snorkel>=0.9.8 10 | ## text vectorizer 11 | spacy 12 | ## image vectorizer 13 | efficientnet_pytorch 14 | torchvision 15 | ## audio handling 16 | librosa 17 | ## disk-persistent caching 18 | wrappy>=0.2.6 19 | 
-------------------------------------------------------------------------------- /docs/snippets/markdown/local-dep-jupyter-bokeh.md: -------------------------------------------------------------------------------- 1 | To render `bokeh` plots in Jupyter, you need: 2 | ```shell 3 | pip install jupyter_bokeh 4 | ``` 5 | 6 | If you are using JupyterLab older than 3.0, use this instead ([reference](https://pypi.org/project/jupyter-bokeh/)): 7 | ```shell 8 | jupyter labextension install @jupyter-widgets/jupyterlab-manager 9 | jupyter labextension install @bokeh/jupyter_bokeh 10 | ``` 11 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/0c-intro.en.md: -------------------------------------------------------------------------------- 1 | `hover` is a tool for mass-labeling data points that can be represented by vectors. 2 | 3 | - Labeling is as easy as coloring a scatter plot. 4 | - Hover your mouse and lasso-select to inspect any cluster. 5 | - Use a variety of widgets to narrow down further. 6 | - Enter a suitable label and hit "Apply"! 7 | 8 | ![GIF Demo](https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.5.0/trailer-short.gif) 9 | -------------------------------------------------------------------------------- /docs/snippets/markdown/component-tutorial.md: -------------------------------------------------------------------------------- 1 | ???+ warning "This page addresses **single components** of `hover`" 2 | For illustration, we are using code snippets to pick out specific widgets so that the documentation can explain what they do. 3 | 4 | - Please be aware that you won't need to get the widgets by code in an actual use case. 5 | - Typical usage deals with [recipes](../../tutorial/t1-active-learning) where the individual parts have been tied together. 
6 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/1-live-demos.en.md: -------------------------------------------------------------------------------- 1 | ## :rocket: Live Demos 2 | 3 | ### [**With code**](https://phurwicz.github.io/hover/pages/tutorial/t0-quickstart/) 4 | 5 | - edit & run code in your browser to get a labeling interface, with guides along the way. 6 | 7 | ### [**Without code**](https://mybinder.org/v2/gh/phurwicz/hover-binder/master?urlpath=/proxy/5006/app-simple-annotator) 8 | 9 | - go directly to an example labeling interface hosted on Binder. 10 | -------------------------------------------------------------------------------- /hover/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def classification_accuracy(true, pred): 5 | """ 6 | Accuracy measure on two arrays. Intended for classification problems. 7 | :param true: true labels. 8 | :type true: Numpy array 9 | :param pred: predicted labels. 
10 | :type pred: Numpy array 11 | """ 12 | assert true.shape[0] == pred.shape[0] 13 | correct = np.equal(true, pred).sum() 14 | return float(correct) / float(true.shape[0]) 15 | -------------------------------------------------------------------------------- /tests/utils/test_metrics.py: -------------------------------------------------------------------------------- 1 | from hover.utils.metrics import classification_accuracy 2 | import numpy as np 3 | import pytest 4 | 5 | 6 | @pytest.mark.lite 7 | def test_classification_accuracy(): 8 | true = np.array([1, 2, 3, 4, 5, 6, 7, 7]) 9 | pred = np.array([1, 2, 3, 4, 5, 6, 7, 8]) 10 | accl = classification_accuracy(true, pred) 11 | accr = classification_accuracy(pred, true) 12 | assert np.allclose(accl, 7 / 8) 13 | assert np.allclose(accr, 7 / 8) 14 | -------------------------------------------------------------------------------- /docs/snippets/markdown/binder-kernel.md: -------------------------------------------------------------------------------- 1 | ???+ info "Running Python right here" 2 | Think of this page as *almost* a Jupyter notebook. You can edit code and press `Shift+Enter` to execute. 3 | 4 | Behind the scene is a [Binder](https://mybinder.org/)-hosted Python environment. Below is the status of the kernel: 5 |
6 | 7 | To download a notebook file instead, visit [here](https://github.com/phurwicz/hover/tree/main/docs/pipelines/generated). 8 | -------------------------------------------------------------------------------- /hover/core/explorer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ???+ note "Interactive graphical interfaces based on Bokeh." 3 | """ 4 | from .specialization import ( 5 | BokehTextFinder, 6 | BokehTextAnnotator, 7 | BokehTextSoftLabel, 8 | BokehTextMargin, 9 | BokehTextSnorkel, 10 | BokehAudioFinder, 11 | BokehAudioAnnotator, 12 | BokehAudioSoftLabel, 13 | BokehAudioMargin, 14 | BokehAudioSnorkel, 15 | BokehImageFinder, 16 | BokehImageAnnotator, 17 | BokehImageSoftLabel, 18 | BokehImageMargin, 19 | BokehImageSnorkel, 20 | ) 21 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 4 | hooks: 5 | - id: check-yaml 6 | exclude: | 7 | (?x)^( 8 | conda-recipe/meta.yaml| 9 | conda-recipe/stable.yaml 10 | )$ 11 | - id: end-of-file-fixer 12 | - id: trailing-whitespace 13 | - repo: https://github.com/psf/black 14 | rev: 22.1.0 15 | hooks: 16 | - id: black 17 | - repo: https://gitlab.com/pycqa/flake8 18 | rev: 3.9.2 19 | hooks: 20 | - id: flake8 21 | -------------------------------------------------------------------------------- /docs/snippets/py/tz-bokeh-show-server.txt: -------------------------------------------------------------------------------- 1 | # ---------- SERVER MODE: for the documentation page ---------- 2 | # because this tutorial is remotely hosted, we need explicit serving to expose the plot to you 3 | from local_lib.binder_helper import binder_proxy_app_url 4 | from bokeh.server.server import Server 5 | server = Server({'/my-app': interactive_plot}, port=5007, 
allow_websocket_origin=['*'], use_xheaders=True) 6 | server.start() 7 | # visit this URL printed in cell output to see the interactive plot; locally you would just do "https://localhost:5007/my-app" 8 | binder_proxy_app_url('my-app', port=5007) 9 | -------------------------------------------------------------------------------- /docs/snippets/html/thebe.html: -------------------------------------------------------------------------------- 1 | 2 | 22 | -------------------------------------------------------------------------------- /docs/snippets/py/g1-0-dataset-audio.txt: -------------------------------------------------------------------------------- 1 | from hover.core.dataset import SupervisableAudioDataset 2 | import pandas as pd 3 | 4 | # this is a table of audio-MNIST (pronounced digit 0-9) urls, 100 audios per digit 5 | example_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.7.0/audio_mnist.csv" 6 | df = pd.read_csv(example_csv_path).sample(frac=1).reset_index(drop=True) 7 | df["SUBSET"] = "raw" 8 | df.loc[500:800, 'SUBSET'] = 'train' 9 | df.loc[800:900, 'SUBSET'] = 'dev' 10 | df.loc[900:, 'SUBSET'] = 'test' 11 | 12 | dataset = SupervisableAudioDataset.from_pandas(df, feature_key="audio", label_key="label") 13 | -------------------------------------------------------------------------------- /docs/snippets/py/g0-0-dataset-image.txt: -------------------------------------------------------------------------------- 1 | from hover.core.dataset import SupervisableImageDataset 2 | import pandas as pd 3 | 4 | # this is a 1000-image-url set of ImageNet data 5 | # with custom labels: animal, object, food 6 | example_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.7.0/imagenet_custom.csv" 7 | df = pd.read_csv(example_csv_path).sample(frac=1).reset_index(drop=True) 8 | df["SUBSET"] = "raw" 9 | df.loc[500:800, 'SUBSET'] = 'train' 10 | df.loc[800:900, 'SUBSET'] = 'dev' 11 | df.loc[900:, 'SUBSET'] = 'test' 12 | 13 | 
dataset = SupervisableImageDataset.from_pandas(df, feature_key="image", label_key="label") 14 | -------------------------------------------------------------------------------- /docs/pipelines/README.md.template: -------------------------------------------------------------------------------- 1 | {!docs/snippets/markdown/readme/0-opener..md!} 2 | 3 | {!docs/snippets/markdown/readme/0a-language-badges.md!} 4 | 5 | {!docs/snippets/markdown/readme/0b-status-badges.md!} 6 | 7 | {!docs/snippets/markdown/readme/0c-intro..md!} 8 | 9 | {!docs/snippets/markdown/readme/1-live-demos..md!} 10 | 11 | {!docs/snippets/markdown/readme/2-features..md!} 12 | 13 | {!docs/snippets/markdown/readme/3-install.md!} 14 | 15 | {!docs/snippets/markdown/readme/4-resources..md!} 16 | 17 | {!docs/snippets/markdown/readme/5-announcements..md!} 18 | 19 | {!docs/snippets/markdown/readme/6-remarks..md!} 20 | -------------------------------------------------------------------------------- /hover/utils/typecheck.py: -------------------------------------------------------------------------------- 1 | class TypedValueDict(dict): 2 | """ 3 | A dict that only allows values of a certain type. 
4 | """ 5 | 6 | def __init__(self, type_, *args, **kwargs): 7 | self._type = type_ 8 | super().__init__(*args, **kwargs) 9 | 10 | def __setitem__(self, key, value): 11 | self.typecheck(value) 12 | super().__setitem__(key, value) 13 | 14 | def typecheck(self, value): 15 | if not isinstance(value, self._type): 16 | raise TypeError(f"Value must be of type {self._type}, got {type(value)}") 17 | 18 | def update(self, other): 19 | for _value in other.values(): 20 | self.typecheck(_value) 21 | super().update(other) 22 | -------------------------------------------------------------------------------- /docs/snippets/py/t0-0-dataset-text.txt: -------------------------------------------------------------------------------- 1 | from hover.core.dataset import SupervisableTextDataset 2 | import pandas as pd 3 | 4 | example_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.5.0/20_newsgroups_raw.csv" 5 | # for fast, low-memory demonstration purpose, sample the data 6 | df_raw = pd.read_csv(example_csv_path).sample(1000) 7 | df_raw["text"] = df_raw["text"].astype(str) 8 | 9 | # data is divided into 4 subsets: "raw" / "train" / "dev" / "test" 10 | # this example assumes no labeled data available, i.e. only "raw" 11 | df_raw["SUBSET"] = "raw" 12 | 13 | # this class stores the dataset through the labeling process 14 | dataset = SupervisableTextDataset.from_pandas(df_raw, feature_key="text", label_key="label") 15 | -------------------------------------------------------------------------------- /tests/utils/test_common_nn.py: -------------------------------------------------------------------------------- 1 | from hover.utils.common_nn import MLP, LogisticRegression 2 | import numpy as np 3 | import torch 4 | import pytest 5 | 6 | 7 | def architecture_subroutine(architecture, dim_inp=300, dim_out=2, num_vecs=10): 8 | """ 9 | Test a specific architecture. 
10 | """ 11 | nn = architecture(dim_inp, dim_out) 12 | inp = torch.Tensor(np.random.rand(num_vecs, dim_inp)) 13 | out = nn(inp) 14 | assert out.shape == (num_vecs, dim_out) 15 | out = nn.eval_per_layer(inp)[-1] 16 | assert out.shape == (num_vecs, dim_out) 17 | 18 | 19 | @pytest.mark.lite 20 | def test_MLP(): 21 | architecture_subroutine(MLP) 22 | 23 | 24 | @pytest.mark.lite 25 | def test_LR(): 26 | architecture_subroutine(LogisticRegression) 27 | -------------------------------------------------------------------------------- /hover/core/explorer/local_config.py: -------------------------------------------------------------------------------- 1 | import hover 2 | from hover.config_constants import ( 3 | ConfigSection as Section, 4 | ConfigKey as Key, 5 | ) 6 | 7 | SOURCE_COLOR_FIELD = hover.config[Section.DATA_COLUMNS][Key.SOURCE_COLOR_FIELD] 8 | SOURCE_ALPHA_FIELD = hover.config[Section.DATA_COLUMNS][Key.SOURCE_ALPHA_FIELD] 9 | SEARCH_SCORE_FIELD = hover.config[Section.DATA_COLUMNS][Key.SEARCH_SCORE_FIELD] 10 | 11 | TOOLTIP_IMG_STYLE = hover.config[Section.VISUAL][Key.TOOLTIP_IMG_STYLE] 12 | 13 | SEARCH_MATCH_HEXCOLOR = hover.config[Section.VISUAL][Key.SEARCH_MATCH_HEXCOLOR] 14 | DATAPOINT_BASE_SIZE = hover.config[Section.VISUAL][Key.DATAPOINT_BASE_SIZE] 15 | SEARCH_DATAPOINT_SIZE_PARAMS = ( 16 | "size", 17 | DATAPOINT_BASE_SIZE + 3, 18 | DATAPOINT_BASE_SIZE - 2, 19 | DATAPOINT_BASE_SIZE, 20 | ) 21 | -------------------------------------------------------------------------------- /tests/module_config/hover_alt_config_1.ini: -------------------------------------------------------------------------------- 1 | [visual] 2 | abstain_hexcolor = #b0b0b0 3 | bokeh_palette = ["#b0ffff", "#ffb0ff", "#ffffb0", "#b0b0ff", "#b0ffb0", "#ffb0b0", "#a0eeee", "#eea0ee", "#eeeea0", "#a0a0ee", "#a0eea0", "#eea0a0", "#90dddd", "#dd90dd", "#dddd90", "#9090dd", "#90dd90", "#dd9090", "#80cccc", "#cc80cc", "#cccc80", "#8080cc", "#80cc80", "#cc8080"] 4 | 5 | [backend] 6 | 
dataframe_library = polars 7 | 8 | [data.embedding] 9 | default_reduction_method = ivis 10 | 11 | [data.values] 12 | abstain_decoded = label.abstain 13 | abstain_encoded = -2 14 | 15 | [data.columns] 16 | encoded_label_key = LABEL_ENCODED 17 | dataset_subset_field = __SUBSET__ 18 | embedding_field_prefix = EMBED_ 19 | source_color_field = __SOURCE_COLOR__ 20 | source_alpha_field = __SOURCE_ALPHA__ 21 | search_score_field = __SOURCE_SEARCH_SCORE__ 22 | -------------------------------------------------------------------------------- /docs/snippets/markdown/jupyterlab-js-issue.md: -------------------------------------------------------------------------------- 1 | ??? info "Showcase widgets here are not interactive" 2 | {== Plotted widgets **on this page** are not interactive, but only for illustration. ==} 3 | 4 | Widgets {== will be interactive when you actually use them ==} (in your local environment or server apps like in the quickstart). 5 | 6 | - be sure to use a whole `recipe` rather than individual widgets. 7 | - if you really want to plot interactive widgets on their own, try `from hover.utils.bokeh_helper import show_as_interactive as show` instead of `from bokeh.io import show`. 8 | - this works in your own environment but still not on the documentation page. 9 | - [`show_as_interactive`](/hover/pages/reference/utils-bokeh_helper/#hover.utils.bokeh_helper.show_as_interactive) is a simple tweak of `bokeh.io.show` by turning standalone LayoutDOM to an application. 10 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/6-remarks.zh.md: -------------------------------------------------------------------------------- 1 | ## :bell: 其它说明 2 | 3 | ### 鸣谢和推荐 4 | 5 | - 我们推荐 [`Bokeh`](https://bokeh.org) 可视化框架, `hover`正是基于它的图表同步和回调函数来实现非常重要的功能. 6 | - 感谢 [Philip Vollet](https://de.linkedin.com/in/philipvollet) 在`hover`的迭代早期 无偿地帮助在开源社区内推广. 
7 | 8 | ### 提供贡献 9 | 10 | - 我们欢迎任何反馈, **特别是使用中的痛点!** 11 | - `./requirements-dev.txt` 列出了开发者所需的依赖. 12 | - 我们建议在提交PR前启用[.pre-commit-config.yaml](https://github.com/phurwicz/hover/blob/main/.pre-commit-config.yaml)中列出的pre-commit hook. 13 | 14 | ### 引用 15 | 16 | 如果`hover`对您的工作有帮助, 请[告诉我们](https://github.com/phurwicz/hover/discussions)或引用 :hugs: 17 | 18 | ```tex 19 | @misc{hover, 20 | title={{hover}: label data at scale}, 21 | url={https://github.com/phurwicz/hover}, 22 | note={Open software from https://github.com/phurwicz/hover}, 23 | author={ 24 | Pavel Hurwicz and 25 | Haochuan Wei}, 26 | year={2021}, 27 | } 28 | ``` 29 | -------------------------------------------------------------------------------- /tests/utils/test_typecheck.py: -------------------------------------------------------------------------------- 1 | from hover.utils.typecheck import TypedValueDict 2 | from collections import defaultdict 3 | 4 | 5 | class TestTypedValueDict: 6 | def test_basic(self): 7 | tdict = TypedValueDict(int) 8 | tdict["key1"] = 1 9 | assert tdict["key1"] == 1 10 | 11 | tdict.update({"key2": 2, "key3": 3}) 12 | assert tdict["key2"] == 2 13 | assert tdict["key3"] == 3 14 | 15 | try: 16 | tdict["key4"] = "4" 17 | raise AssertionError("Should have raised TypeError") 18 | except TypeError: 19 | pass 20 | 21 | def test_subclass(self): 22 | tdict = TypedValueDict(dict) 23 | tdict["key1"] = {"foo": "bar"} 24 | assert tdict["key1"] == {"foo": "bar"} 25 | 26 | ddict = defaultdict(str) 27 | tdict.update({"key2": ddict}) 28 | assert tdict["key2"] is ddict 29 | -------------------------------------------------------------------------------- /.github/workflows/cross-os-conda-build.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build the latest source files in Anaconda. 
2 | 3 | name: Cross-OS Conda Build 4 | 5 | on: 6 | schedule: 7 | - cron: "0 0 * * 2,5" 8 | workflow_dispatch: 9 | 10 | jobs: 11 | conda-build: 12 | 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | python-version: ['3.8', '3.10'] 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - uses: conda-incubator/setup-miniconda@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Conda build 27 | run: | 28 | conda update conda 29 | conda install conda-build 30 | conda build --channel conda-forge --channel pytorch --override-channels --output-folder ./conda-out/ ./conda-recipe/ 31 | -------------------------------------------------------------------------------- /hover/module_config.py: -------------------------------------------------------------------------------- 1 | import hover 2 | from .config_constants import ( 3 | ConfigSection as Section, 4 | ConfigKey as Key, 5 | ) 6 | from .utils.dataframe import ( 7 | PandasDataframe, 8 | PolarsDataframe, 9 | ) 10 | 11 | # dataframe implementation 12 | DataFrame = ( 13 | PandasDataframe 14 | if hover.config[Section.BACKEND][Key.DATAFRAME_LIBRARY].lower() == "pandas" 15 | else PolarsDataframe 16 | ) 17 | 18 | # constants for the abstain mechanism 19 | ABSTAIN_DECODED = hover.config[Section.DATA_VALUES][Key.ABSTAIN_DECODED] 20 | ABSTAIN_ENCODED = hover.config[Section.DATA_VALUES][Key.ABSTAIN_ENCODED] 21 | ABSTAIN_HEXCOLOR = hover.config[Section.VISUAL][Key.ABSTAIN_HEXCOLOR] 22 | 23 | # constants for label encoding mechanism 24 | ENCODED_LABEL_KEY = hover.config[Section.DATA_COLUMNS][Key.ENCODED_LABEL_KEY] 25 | 26 | # constants for saving work 27 | DATA_SAVE_DIR = hover.config[Section.IO][Key.DATA_SAVE_DIR] 28 | -------------------------------------------------------------------------------- /.github/workflows/handle-inactive.yml: -------------------------------------------------------------------------------- 
1 | name: Handle inactive issues / PRs 2 | on: 3 | schedule: 4 | - cron: "0 12 * * *" 5 | workflow_dispatch: 6 | 7 | jobs: 8 | check-issues-and-prs: 9 | runs-on: ubuntu-latest 10 | permissions: 11 | issues: write 12 | pull-requests: write 13 | steps: 14 | - uses: actions/stale@v3 15 | with: 16 | days-before-issue-stale: 30 17 | days-before-issue-close: 14 18 | any-of-issue-labels: "solved pending confirmation,invalid,wontfix" 19 | stale-issue-label: "stale" 20 | stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." 21 | close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 22 | days-before-pr-stale: -1 23 | days-before-pr-close: -1 24 | repo-token: ${{ secrets.GITHUB_TOKEN }} 25 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/0b-status-badges.md: -------------------------------------------------------------------------------- 1 | [![PyPI Version](https://img.shields.io/pypi/v/hover?logo=pypi&logoColor=white)](https://pypi.org/project/hover/) 2 | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/hover)](https://github.com/conda-forge/hover-feedstock) 3 | ![Downloads](https://static.pepy.tech/personalized-badge/hover?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=pypi%20downloads) 4 | ![Main Build Status](https://img.shields.io/github/actions/workflow/status/phurwicz/hover/cross-os-source-test.yml?branch=main&label=main&logo=github) 5 | ![Nightly Build Status](https://img.shields.io/github/actions/workflow/status/phurwicz/hover/quick-source-test.yml?branch=nightly&label=nightly&logo=github) 6 | ![Codacy Grade](https://img.shields.io/codacy/grade/689827d9077b43ac8721c7658d122d1a?logo=codacy&logoColor=white) 7 | ![Codacy Coverage](https://img.shields.io/codacy/coverage/689827d9077b43ac8721c7658d122d1a/main?logo=codacy&logoColor=white) 8 | 
-------------------------------------------------------------------------------- /docs/snippets/py/g0-3-image-vectorizer.txt: -------------------------------------------------------------------------------- 1 | import torch 2 | import wrappy 3 | from efficientnet_pytorch import EfficientNet 4 | from torchvision import transforms 5 | 6 | # EfficientNet is a series of pre-trained models 7 | # https://github.com/lukemelas/EfficientNet-PyTorch 8 | effnet = EfficientNet.from_pretrained("efficientnet-b0") 9 | effnet.eval() 10 | 11 | # standard transformations for ImageNet-trained models 12 | tfms = transforms.Compose( 13 | [ 14 | transforms.Resize(224), 15 | transforms.ToTensor(), 16 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 17 | ] 18 | ) 19 | 20 | @wrappy.memoize(cache_limit=10000, persist_path='custom_cache/image_url_to_vector.pkl') 21 | def vectorizer(url): 22 | """ 23 | Using logits on ImageNet-1000 classes. 24 | """ 25 | img = tfms(url_to_image(url)).unsqueeze(0) 26 | 27 | with torch.no_grad(): 28 | outputs = effnet(img) 29 | 30 | return outputs.detach().numpy().flatten() 31 | -------------------------------------------------------------------------------- /docs/pipelines/local_helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Local library for shared functions. 3 | """ 4 | from rich.console import Console 5 | 6 | 7 | def batch_routine(func, name_to_file_path_dict): 8 | """ 9 | Run a function on a collections of files. 10 | Collect all exceptions along the way. 11 | """ 12 | all_success = True 13 | console = Console() 14 | 15 | for _name, _path in name_to_file_path_dict.items(): 16 | console.print(f"==== Running {func.__name__} on {_name} ====") 17 | _script, _process = func(_name, _path) 18 | _success = _process.returncode == 0 19 | all_success = all_success and _success 20 | 21 | if not _success: 22 | console.print( 23 | f"!!!! 
Error from {func.__name__} on {_name} !!!!", style="red bold" 24 | ) 25 | console.print(f"{_script}\n\n", style="blue") 26 | console.print(f"{_process.stderr}\n\n", style="red") 27 | 28 | if not all_success: 29 | raise RuntimeError("Script test failed.") 30 | -------------------------------------------------------------------------------- /docs/pipelines/generate_readme.py: -------------------------------------------------------------------------------- 1 | import os 2 | from markdown_include.include import MarkdownInclude, IncludePreprocessor 3 | 4 | README_TEMPLATE_PATH = os.path.join(os.path.dirname(__file__), "README.md.template") 5 | LANGUAGE_PLACEHOLDER = "" 6 | LANGS = ["en", "zh"] 7 | DEFAULT_LANG = "en" 8 | 9 | 10 | def main(): 11 | with open(README_TEMPLATE_PATH, "r") as f: 12 | template = f.read() 13 | include = MarkdownInclude() 14 | preprocessor = IncludePreprocessor(template, include.getConfigs()) 15 | 16 | for lang in LANGS: 17 | filename = "README.md" if lang == DEFAULT_LANG else f"README.{lang}.md" 18 | readme_path = os.path.join(os.path.dirname(__file__), filename) 19 | transformed = "\n".join( 20 | preprocessor.run(template.replace(LANGUAGE_PLACEHOLDER, lang).split("\n")) 21 | ) 22 | with open(readme_path, "w") as f: 23 | f.write(transformed) 24 | print(f"Generated {readme_path} for language {lang}.") 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /docs/snippets/py/tz-dataset-text-full.txt: -------------------------------------------------------------------------------- 1 | from hover.core.dataset import SupervisableTextDataset 2 | import pandas as pd 3 | 4 | raw_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.5.0/20_newsgroups_raw.csv" 5 | train_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.5.0/20_newsgroups_train.csv" 6 | 7 | # for fast, low-memory demonstration purpose, sample the data 8 | 
df_raw = pd.read_csv(raw_csv_path).sample(400) 9 | df_raw["SUBSET"] = "raw" 10 | df_train = pd.read_csv(train_csv_path).sample(400) 11 | df_train["SUBSET"] = "train" 12 | df_dev = pd.read_csv(train_csv_path).sample(100) 13 | df_dev["SUBSET"] = "dev" 14 | df_test = pd.read_csv(train_csv_path).sample(100) 15 | df_test["SUBSET"] = "test" 16 | 17 | # build overall dataframe and ensure feature type 18 | df = pd.concat([df_raw, df_train, df_dev, df_test]) 19 | df["text"] = df["text"].astype(str) 20 | 21 | # this class stores the dataset throughout the labeling process 22 | dataset = SupervisableTextDataset.from_pandas(df, feature_key="text", label_key="label") 23 | -------------------------------------------------------------------------------- /tests/utils/test_snorkel_helper.py: -------------------------------------------------------------------------------- 1 | from hover.utils.snorkel_helper import labeling_function 2 | import pytest 3 | 4 | 5 | @pytest.mark.lite 6 | def test_labeling_function(example_raw_df): 7 | def original(row): 8 | return "long" if len(row["text"]) > 5 else "short" 9 | 10 | targets = ["long", "short"] 11 | one_row = example_raw_df.get_row_as_dict(0) 12 | 13 | # create LF with pre-determined label encodings 14 | label_encoder = {t: i for i, t in enumerate(targets)} 15 | preencoded = labeling_function( 16 | targets=targets, 17 | label_encoder=label_encoder, 18 | name="pre-encoded", 19 | )(original) 20 | 21 | assert isinstance(preencoded(one_row), str) 22 | assert isinstance(preencoded.snorkel(one_row), int) 23 | 24 | # create LF with undetermined label encodings 25 | unencoded = labeling_function( 26 | targets=targets, 27 | label_encoder=None, 28 | name="unencoded", 29 | )(original) 30 | 31 | assert isinstance(unencoded(one_row), str) 32 | assert unencoded.snorkel is None 33 | -------------------------------------------------------------------------------- /.github/workflows/cross-os-install-source.yml: 
-------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Cross-OS Install (Source) 5 | 6 | on: 7 | schedule: 8 | - cron: "0 0 * * 1,4" 9 | workflow_dispatch: 10 | 11 | jobs: 12 | install-source: 13 | 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ['3.8', '3.10'] 19 | os: [ubuntu-latest, macos-latest, windows-latest] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Install tox 29 | run: | 30 | pip install --upgrade pip 31 | pip install --upgrade tox tox-gh-actions 32 | 33 | - name: Install hover 34 | run: | 35 | tox -e install 36 | -------------------------------------------------------------------------------- /tests/utils/test_datasets.py: -------------------------------------------------------------------------------- 1 | from hover.utils.datasets import newsgroups_dictl, newsgroups_reduced_dictl 2 | import pytest 3 | 4 | 5 | @pytest.mark.lite 6 | def test_20_newsgroups(): 7 | for dictl_method, num_classes in [ 8 | (newsgroups_dictl, 20), 9 | (newsgroups_reduced_dictl, 7), 10 | ]: 11 | my_20ng, label_encoder, label_decoder = dictl_method() 12 | 13 | assert isinstance(my_20ng, dict) 14 | for _key in ["train", "test"]: 15 | assert isinstance(my_20ng["train"], list) 16 | assert isinstance(my_20ng["train"][0], dict) 17 | assert isinstance(my_20ng["train"][0]["label"], str) 18 | assert isinstance(my_20ng["train"][0]["text"], str) 19 | 20 | assert isinstance(label_encoder, dict) 21 | assert isinstance(label_decoder, dict) 22 | assert len(label_encoder) == num_classes + 1 23 | 
assert len(label_decoder) == num_classes + 1 24 | assert set(label_encoder.keys()) == set(label_decoder.values()) 25 | assert set(label_decoder.keys()) == set(label_encoder.values()) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 phurwicz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/recipes/test_subroutine.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hover.core.explorer.functionality import ( 3 | BokehDataAnnotator, 4 | BokehDataFinder, 5 | BokehSoftLabelExplorer, 6 | BokehSnorkelExplorer, 7 | ) 8 | from hover.recipes.subroutine import ( 9 | standard_annotator, 10 | standard_finder, 11 | standard_snorkel, 12 | standard_softlabel, 13 | ) 14 | 15 | 16 | @pytest.mark.lite 17 | def test_autobuild_explorer( 18 | example_text_dataset, 19 | example_image_dataset, 20 | example_audio_dataset, 21 | ): 22 | for dataset in [ 23 | example_text_dataset, 24 | example_image_dataset, 25 | example_audio_dataset, 26 | ]: 27 | dataset = dataset.copy() 28 | 29 | annotator = standard_annotator(dataset) 30 | assert isinstance(annotator, BokehDataAnnotator) 31 | 32 | finder = standard_finder(dataset) 33 | assert isinstance(finder, BokehDataFinder) 34 | 35 | softlabel = standard_softlabel(dataset) 36 | assert isinstance(softlabel, BokehSoftLabelExplorer) 37 | 38 | snorkel = standard_snorkel(dataset) 39 | assert isinstance(snorkel, BokehSnorkelExplorer) 40 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/6-remarks.en.md: -------------------------------------------------------------------------------- 1 | ## :bell: Remarks 2 | 3 | ### Shoutouts 4 | 5 | - Thanks to [`Bokeh`](https://bokeh.org) because `hover` would not exist without linked plots and callbacks, or be nearly as good without embeddable server apps. 6 | - Thanks to [Philip Vollet](https://de.linkedin.com/in/philipvollet) for sharing `hover` with the community even when it was really green. 7 | 8 | ### Contributing 9 | 10 | - All feedbacks are welcome, **especially what you find lacking and want it fixed!** 11 | - `./requirements-dev.txt` lists required packages for development. 
12 | - Pull requests are advised to use a superset of the pre-commit hooks listed in [.pre-commit-config.yaml](https://github.com/phurwicz/hover/blob/main/.pre-commit-config.yaml). 13 | 14 | ### Citation 15 | 16 | If you have found `hover` useful to your work, please [let us know](https://github.com/phurwicz/hover/discussions) :hugs: 17 | 18 | ```tex 19 | @misc{hover, 20 | title={{hover}: label data at scale}, 21 | url={https://github.com/phurwicz/hover}, 22 | note={Open software from https://github.com/phurwicz/hover}, 23 | author={ 24 | Pavel Hurwicz and 25 | Haochuan Wei}, 26 | year={2021}, 27 | } 28 | ``` 29 | -------------------------------------------------------------------------------- /.github/workflows/cross-os-source-test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies and run tests on the source code. 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Cross-OS Source Test 5 | 6 | on: 7 | schedule: 8 | - cron: "0 0 * * 1,4" 9 | workflow_dispatch: 10 | 11 | jobs: 12 | test-api: 13 | 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ['3.8', '3.9', '3.10'] 19 | os: [ubuntu-latest, macos-latest, windows-latest] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Get dependencies 29 | run: | 30 | pip install --upgrade pip 31 | pip install --upgrade tox tox-gh-actions 32 | 33 | - name: Test - default config 34 | run: | 35 | tox -e test_api 36 | 37 | - name: Test - alt config 1 38 | run: | 39 | tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini 40 | -------------------------------------------------------------------------------- 
/conda-recipe/stable.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "hover" %} 2 | {% set version = "0.7.0" %} 3 | 4 | 5 | package: 6 | name: {{ name }} 7 | version: {{ version }} 8 | 9 | source: 10 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz 11 | sha256: 1aae054f90ec869e898affa2f06bed08c1b00531d21f02f1faeafefc19ff6d98 12 | 13 | build: 14 | number: 0 15 | noarch: python 16 | script: python -m pip install . -vv 17 | 18 | requirements: 19 | host: 20 | - python >=3.7 21 | - pip 22 | run: 23 | - python >=3.7 24 | - bokeh >=2.3.3 25 | - scikit-learn >=0.20.0 26 | - pytorch >=1.10.0 27 | - pandas >=1.3.0 28 | - numpy >=1.14 29 | - scipy >=1.3.2 30 | - tqdm >=4.0 31 | - rich >=11.0.0 32 | - deprecated >=1.1.0 33 | - umap-learn >=0.3.10 34 | 35 | test: 36 | imports: 37 | - hover 38 | commands: 39 | - pip check 40 | requires: 41 | - pip 42 | 43 | about: 44 | home: https://phurwicz.github.io/hover 45 | license: MIT 46 | license_file: LICENSE 47 | summary: Label data at scale. Fun and precision included. 48 | dev_url: https://github.com/phurwicz/hover 49 | 50 | extra: 51 | recipe-maintainers: 52 | - phurwicz 53 | - haochuanwei 54 | -------------------------------------------------------------------------------- /tests/utils/test_bokeh_helper.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from urllib.parse import urlparse 3 | from hover.utils.bokeh_helper import ( 4 | servable, 5 | binder_proxy_app_url, 6 | remote_jupyter_proxy_url, 7 | ) 8 | from tests.recipes.local_helper import execute_handle_function 9 | 10 | 11 | @pytest.mark.lite 12 | def test_binder_proxy_app_url(): 13 | """ 14 | The function being tested is only intended for Binder. 
15 | """ 16 | url = binder_proxy_app_url("simple-annotator", port=5007) 17 | _ = urlparse(url) 18 | 19 | 20 | @pytest.mark.lite 21 | def test_remote_jupyter_proxy_url(): 22 | """ 23 | Not a full test, rather just validating urls. 24 | """ 25 | for port in [8888, None]: 26 | url = remote_jupyter_proxy_url(port) 27 | _ = urlparse(url) 28 | 29 | 30 | @pytest.mark.lite 31 | def test_servable_wrapper(dummy_working_recipe, dummy_broken_recipe): 32 | try: 33 | dummy_broken_recipe() 34 | pytest.fail("The dummy broken recipe above should have raised an exception.") 35 | except AssertionError: 36 | pass 37 | 38 | for recipe in [dummy_working_recipe, dummy_broken_recipe]: 39 | handle = servable()(recipe) 40 | execute_handle_function(handle) 41 | -------------------------------------------------------------------------------- /tests/core/representation/test_trajectory.py: -------------------------------------------------------------------------------- 1 | from hover.core.representation.trajectory import spline, manifold_spline 2 | import numpy as np 3 | import pytest 4 | 5 | 6 | @pytest.mark.lite 7 | def test_spline(one_to_two_and_square): 8 | x, y = one_to_two_and_square 9 | 10 | traj_x, traj_y = spline([x, y], points_per_step=1, splprep_kwargs={"k": 2}) 11 | assert (np.absolute(traj_x - x) < 1e-2).all() 12 | assert (np.absolute(traj_y - y) < 1e-2).all() 13 | 14 | 15 | @pytest.mark.lite 16 | def test_manifold_spline(one_to_two_and_square, num_points=100): 17 | # shape: dim-by-step 18 | arr = np.array(one_to_two_and_square) 19 | 20 | # shape: point-by-dim-by-step 21 | arr = np.array([arr] * num_points) 22 | 23 | # shape: step-by-point-by-dim 24 | arr = np.swapaxes(arr, 1, 2) 25 | arr = np.swapaxes(arr, 0, 1) 26 | L, M, N = arr.shape 27 | 28 | # add a displacement that varies by point 29 | arr += np.linspace(0.0, 0.1, num_points)[np.newaxis, :, np.newaxis] 30 | 31 | traj = manifold_spline(arr, points_per_step=1, splprep_kwargs={"k": 2}) 32 | assert traj.shape == (L, M, N) 33 | 
34 | traj = manifold_spline(arr, points_per_step=3, splprep_kwargs={"k": 2}) 35 | assert traj.shape == (3 * L - 2, M, N) 36 | -------------------------------------------------------------------------------- /docs/pipelines/local_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from markdown_include.include import MarkdownInclude 3 | 4 | 5 | DIR_PATH = os.path.dirname(__file__) 6 | NAME_TO_SCRIPT_REL = { 7 | "t0-quickstart": "../pages/tutorial/t0-quickstart.md", 8 | "t1-using-recipes": "../pages/tutorial/t1-active-learning.md", 9 | # tutorial-t2 has no script currently 10 | "t3-dataset-mechanisms": "../pages/tutorial/t3-dataset-population-selection.md", 11 | "t4-annotator-plot-tools": "../pages/tutorial/t4-annotator-dataset-interaction.md", 12 | "t5-finder-selection-filter": "../pages/tutorial/t5-finder-filter.md", 13 | "t6-soft-label-joint-filters": "../pages/tutorial/t6-softlabel-joint-filter.md", 14 | "t7-custom-labeling-functions": "../pages/tutorial/t7-snorkel-improvise-rules.md", 15 | "g0-image-data": "../pages/guides/g0-datatype-image.md", 16 | "g1-audio-data": "../pages/guides/g1-datatype-audio.md", 17 | } 18 | NAME_TO_SCRIPT_ABS = { 19 | _k: os.path.join(DIR_PATH, _v) for _k, _v in NAME_TO_SCRIPT_REL.items() 20 | } 21 | 22 | 23 | MARKDOWN_INCLUDE = MarkdownInclude( 24 | configs={ 25 | "base_path": os.path.join(DIR_PATH, "../../"), 26 | "encoding": "utf-8", 27 | } 28 | ) 29 | 30 | THEBE_PATTERN_CODE_ONLY = r"(?<=
)[\s\S]*?(?=
)" 31 | THEBE_PATTERN_WITH_TAGS = r"
[\s\S]*?
" 32 | -------------------------------------------------------------------------------- /tests/core/representation/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import random 4 | 5 | 6 | @pytest.fixture(scope="module") 7 | def one_to_two_and_square(): 8 | x = np.linspace(1.0, 2.0, 11) 9 | y = x * x 10 | return [x, y] 11 | 12 | 13 | @pytest.fixture(scope="module") 14 | def example_array(n_vecs=1000, dim=30): 15 | return np.random.rand(n_vecs, dim) 16 | 17 | 18 | @pytest.fixture(scope="module") 19 | def distance_preserving_array_sequence(example_array): 20 | A = example_array 21 | # translation 22 | B = A + 1.0 23 | # dilation 24 | C = 3.0 * B 25 | # rotation of axes 26 | D = np.concatenate((C[:, 1:], C[:, :1]), axis=1) 27 | # reflection of random axes 28 | E = np.array([random.choice([-1, 1]) for i in range(D.shape[1])])[np.newaxis, :] * D 29 | 30 | return [A, B, C, D, E] 31 | 32 | 33 | @pytest.fixture(scope="module") 34 | def diagonal_multiplication_array_sequence(example_array): 35 | A = example_array 36 | M = np.diag(np.random.rand(A.shape[-1])) 37 | B = A @ M 38 | 39 | return [A, B] 40 | 41 | 42 | @pytest.fixture(scope="module") 43 | def random_multiplication_array_sequence(example_array): 44 | A = example_array 45 | ref_dim = A.shape[-1] 46 | M = np.random.rand(ref_dim, np.random.randint(ref_dim // 2, ref_dim)) 47 | B = A @ M 48 | 49 | return [A, B] 50 | -------------------------------------------------------------------------------- /.github/workflows/assemble-readme.yml: -------------------------------------------------------------------------------- 1 | # This workflow will generate README files based on the doc snippets. 
2 | 3 | name: Assemble Multilingual README 4 | 5 | on: 6 | push: 7 | branches: 8 | - main 9 | paths: 10 | - 'docs/snippets/markdown/readme/' 11 | - 'docs/pipelines/README.md.template' 12 | - 'docs/pipelines/generate_readme.py' 13 | workflow_dispatch: 14 | 15 | jobs: 16 | assemble-readme: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v3 20 | with: 21 | fetch-depth: 0 22 | 23 | - name: Prepare Git 24 | run: | 25 | git config user.name ${{ secrets.ACTIONS_GIT_USERNAME }} 26 | git config user.email ${{ secrets.ACTIONS_GIT_EMAIL }} 27 | 28 | - name: Run script and get output files 29 | run: | 30 | pip install -r requirements-dev.txt 31 | python docs/pipelines/generate_readme.py 32 | mv docs/pipelines/README*.md ./ 33 | git add ./README*.md 34 | git commit -m "Assemble README files from snippets" 35 | 36 | - name: Create Pull Request 37 | uses: peter-evans/create-pull-request@v5 38 | with: 39 | commit-message: Assemble README files from snippets 40 | title: Automatic README update 41 | body: Assemble README files from snippets 42 | branch: assemble-readme 43 | -------------------------------------------------------------------------------- /conda-recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "hover" %} 2 | {% set version = "0.8.1" %} 3 | 4 | 5 | package: 6 | name: {{ name }} 7 | version: {{ version }} 8 | 9 | source: 10 | git_url: https://github.com/phurwicz/hover.git 11 | 12 | build: 13 | number: 0 14 | noarch: python 15 | script: python -m pip install . 
-vv 16 | 17 | requirements: 18 | host: 19 | - python >=3.7 20 | - pip 21 | run: 22 | - python >=3.7 23 | - bokeh >=3.0.3 24 | - scikit-learn >=0.20.0 25 | - pytorch >=1.10.0 26 | - pandas >=1.3.0 27 | - numpy >=1.14 28 | - scipy >=1.3.2 29 | - tqdm >=4.0 30 | - rich >=11.0.0 31 | - deprecated >=1.1.0 32 | - umap-learn >=0.3.10 33 | - flexmod >=0.1.0 34 | 35 | test: 36 | imports: 37 | - hover 38 | commands: 39 | - python -m spacy download en_core_web_md 40 | - pytest -m lite 41 | requires: 42 | - pip 43 | - pytest 44 | - spacy 45 | - faker 46 | - snorkel>=0.9.8 47 | - openpyxl 48 | - wrappy 49 | - shaffle 50 | source_files: 51 | - fixture_module 52 | - tests 53 | - pytest.ini 54 | 55 | about: 56 | home: https://phurwicz.github.io/hover 57 | license: MIT 58 | license_file: LICENSE 59 | summary: Label data at scale. Fun and precision included. 60 | dev_url: https://github.com/phurwicz/hover 61 | 62 | extra: 63 | recipe-maintainers: 64 | - phurwicz 65 | - haochuanwei 66 | -------------------------------------------------------------------------------- /docs/pipelines/check_scripts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the scripts in the docs. 3 | Intended for a Binder environment, and should be used in conjunction with a local libary in phurwicz/hover-binder. 4 | """ 5 | import re 6 | import uuid 7 | import markdown 8 | import subprocess 9 | from local_helper import batch_routine 10 | from local_config import NAME_TO_SCRIPT_ABS, MARKDOWN_INCLUDE, THEBE_PATTERN_CODE_ONLY 11 | 12 | 13 | def main(): 14 | """ 15 | Test all code blocks in the scripts listed in this file. 16 | Collect all exceptions along the way. 17 | """ 18 | batch_routine(parse_script_and_run, NAME_TO_SCRIPT_ABS) 19 | 20 | 21 | def parse_script_and_run(script_name, source_abs_path): 22 | """ 23 | Retrieve and run code blocks from documentation file. 24 | Note that the doc file can be using markdown-include. 
25 | """ 26 | script_tmp_path = f"{script_name}-{uuid.uuid1()}.py" 27 | 28 | with open(source_abs_path, "r") as f_source: 29 | source = f_source.read() 30 | html = markdown.markdown(source, extensions=[MARKDOWN_INCLUDE]) 31 | script = "\n".join(re.findall(THEBE_PATTERN_CODE_ONLY, html)) 32 | 33 | with open(script_tmp_path, "w") as f_script: 34 | f_script.write(script) 35 | 36 | process = subprocess.run( 37 | ["python", script_tmp_path], capture_output=True, timeout=1200 38 | ) 39 | return script, process 40 | 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /docs/snippets/py/t7-0-lf-list.txt: -------------------------------------------------------------------------------- 1 | from hover.utils.snorkel_helper import labeling_function 2 | from hover.module_config import ABSTAIN_DECODED as ABSTAIN 3 | import re 4 | 5 | 6 | @labeling_function(targets=["rec.autos"]) 7 | def auto_keywords(row): 8 | flag = re.search( 9 | r"(?i)(diesel|gasoline|automobile|vehicle|drive|driving)", row.text 10 | ) 11 | return "rec.autos" if flag else ABSTAIN 12 | 13 | 14 | @labeling_function(targets=["rec.sport.baseball"]) 15 | def baseball_keywords(row): 16 | flag = re.search(r"(?i)(baseball|stadium|\ bat\ |\ base\ )", row.text) 17 | return "rec.sport.baseball" if flag else ABSTAIN 18 | 19 | 20 | @labeling_function(targets=["sci.crypt"]) 21 | def crypt_keywords(row): 22 | flag = re.search(r"(?i)(crypt|math|encode|decode|key)", row.text) 23 | return "sci.crypt" if flag else ABSTAIN 24 | 25 | 26 | @labeling_function(targets=["talk.politics.guns"]) 27 | def guns_keywords(row): 28 | flag = re.search(r"(?i)(gun|rifle|ammunition|violence|shoot)", row.text) 29 | return "talk.politics.guns" if flag else ABSTAIN 30 | 31 | 32 | @labeling_function(targets=["misc.forsale"]) 33 | def forsale_keywords(row): 34 | flag = re.search(r"(?i)(sale|deal|price|discount)", row.text) 35 | return "misc.forsale" if flag else ABSTAIN 
36 | 37 | 38 | LABELING_FUNCTIONS = [ 39 | auto_keywords, 40 | baseball_keywords, 41 | crypt_keywords, 42 | guns_keywords, 43 | forsale_keywords, 44 | ] 45 | -------------------------------------------------------------------------------- /docs/styles/monokai.css: -------------------------------------------------------------------------------- 1 | .cm-s-monokai { 2 | padding: 2em 2em 2em; 3 | height: auto; 4 | font-size: 0.8em; 5 | line-height: 1.5em; 6 | font-family: inconsolata, monospace; 7 | letter-spacing: 0.3px; 8 | word-spacing: 1px; 9 | background: #272822; 10 | color: #F8F8F2; 11 | } 12 | .cm-s-monokai .CodeMirror-lines { 13 | padding: 8px 0; 14 | } 15 | .cm-s-monokai .CodeMirror-gutters { 16 | box-shadow: 1px 0 2px 0 rgba(0, 0, 0, 0.5); 17 | -webkit-box-shadow: 1px 0 2px 0 rgba(0, 0, 0, 0.5); 18 | background-color: #272822; 19 | padding-right: 10px; 20 | z-index: 3; 21 | border: none; 22 | } 23 | .cm-s-monokai div.CodeMirror-cursor { 24 | border-left: 3px solid #F8F8F2; 25 | } 26 | .cm-s-monokai .CodeMirror-activeline-background { 27 | background: #49483E; 28 | } 29 | .cm-s-monokai .CodeMirror-selected { 30 | background: #49483E; 31 | } 32 | .cm-s-monokai .cm-comment { 33 | color: #75715E; 34 | } 35 | .cm-s-monokai .cm-string { 36 | color: #E6DB74; 37 | } 38 | .cm-s-monokai .cm-number { 39 | color: #66D9EF; 40 | } 41 | .cm-s-monokai .cm-atom { 42 | color: #66D9EF; 43 | } 44 | .cm-s-monokai .cm-keyword { 45 | color: #F92672; 46 | } 47 | .cm-s-monokai .cm-variable { 48 | color: #A6E22E; 49 | } 50 | .cm-s-monokai .cm-def { 51 | color: #FD971F; 52 | } 53 | .cm-s-monokai .cm-variable-2 { 54 | color: #F92672; 55 | } 56 | .cm-s-monokai .cm-property { 57 | color: #66D9EF; 58 | } 59 | .cm-s-monokai .cm-operator { 60 | color: #F92672; 61 | } 62 | .cm-s-monokai .CodeMirror-linenumber { 63 | color: #75715E; 64 | } 65 | -------------------------------------------------------------------------------- /fixture_module/text_vector_net/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Example importable module holding customized ingredients of a workflow with hover. 3 | Specifically for text data. 4 | """ 5 | 6 | import os 7 | import re 8 | import numpy as np 9 | import wrappy 10 | 11 | CACHE_PATH = os.path.join(os.path.dirname(__file__), "vecs.pkl") 12 | 13 | 14 | def get_vectorizer(): 15 | """ 16 | :returns: a text vectorizer. 17 | """ 18 | import spacy 19 | 20 | # SpaCy 'vector' models are perfect for this 21 | # nlp = spacy.load('en_vectors_web_lg') 22 | 23 | # 'core' models are slower due to linguistic features 24 | nlp = spacy.load("en_core_web_md") 25 | 26 | # could use a transformer if speed is ok 27 | # nlp = spacy.load('en_trf_bertbaseuncased_lg') 28 | 29 | # memoization can be useful if the function takes a while to run, e.g. transformer models 30 | @wrappy.memoize( 31 | cache_limit=50000, 32 | return_copy=False, 33 | persist_path=CACHE_PATH, 34 | persist_batch_size=1000, 35 | ) 36 | def vectorizer(text): 37 | """ 38 | A more serious example of a text vectorizer. 
39 | """ 40 | clean_text = re.sub(r"[\t\n]", r" ", text) 41 | return nlp(clean_text, disable=nlp.pipe_names).vector 42 | 43 | return vectorizer 44 | 45 | 46 | def get_architecture(): 47 | from hover.utils.common_nn import MLP 48 | 49 | return MLP 50 | 51 | 52 | def get_state_dict_path(): 53 | dir_path = os.path.dirname(__file__) 54 | return os.path.join(dir_path, "model.pt") 55 | -------------------------------------------------------------------------------- /tests/core/representation/test_reduction.py: -------------------------------------------------------------------------------- 1 | from hover.core.representation.reduction import DimensionalityReducer 2 | import numpy as np 3 | import pytest 4 | 5 | 6 | @pytest.mark.lite 7 | def test_create_reducer(n_points=1000): 8 | # if marked as lite, only test the default reducer library 9 | from umap import UMAP 10 | 11 | reducer = DimensionalityReducer.create_reducer( 12 | "umap", 13 | dimension=4, 14 | n_neighbors=10, 15 | ) 16 | assert isinstance(reducer, UMAP) 17 | # dimension is expected to override n_components (default 2) 18 | assert reducer.n_components == 4 19 | # other kwargs are expected to simply get forwarded 20 | assert reducer.n_neighbors == 10 21 | 22 | 23 | def test_dimensionality_reduction(n_points=1000): 24 | 25 | arr = np.random.rand(n_points, 20) 26 | reducer = DimensionalityReducer(arr) 27 | 28 | reducer.fit_transform( 29 | "umap", n_neighbors=3, min_dist=0.01, dimension=3, metric="euclidean" 30 | ) 31 | embedding = reducer.transform(arr, "umap") 32 | assert embedding.shape == (n_points, 3) 33 | embedding = reducer.transform(np.array([])) 34 | assert embedding.shape == (0,) 35 | 36 | reducer.fit_transform( 37 | "ivis", dimension=4, k=3, distance="pn", batch_size=16, epochs=20 38 | ) 39 | embedding = reducer.transform(arr, "ivis") 40 | assert embedding.shape == (n_points, 4) 41 | 42 | try: 43 | reducer.fit_transform("invalid_method") 44 | pytest.fail("Expected exception from invalid reduction 
method.") 45 | except ValueError: 46 | pass 47 | -------------------------------------------------------------------------------- /hover/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core module: contains classes which are centerpieces of the vast majority of use cases. 3 | 4 | - dataset: defines the primary data structure to work with. 5 | - explorer: defines high-level building blocks of the interactive UI. 6 | - neural: defines sub-applications that involve neural networks. 7 | """ 8 | from rich.console import Console 9 | from hover.utils.meta.traceback import RichTracebackMeta 10 | 11 | 12 | class Loggable(metaclass=RichTracebackMeta): 13 | """ 14 | Base class that provides consistently templated logging. 15 | 16 | Inspired by `wasabi`'s `good`/`info`/`warn`/`fail` methods. 17 | 18 | [`Rich` style guide](https://rich.readthedocs.io/en/latest/style.html) 19 | """ 20 | 21 | CONSOLE = Console() 22 | 23 | def _print(self, *args, **kwargs): 24 | self.__class__.CONSOLE.print(*args, **kwargs) 25 | 26 | def _good(self, message): 27 | self.__class__.CONSOLE.print( 28 | f":green_circle: {self.__class__.__name__}: {message}", 29 | style="green", 30 | ) 31 | 32 | def _info(self, message): 33 | self.__class__.CONSOLE.print( 34 | f":blue_circle: {self.__class__.__name__}: {message}", style="blue" 35 | ) 36 | 37 | def _warn(self, message): 38 | self.__class__.CONSOLE.print( 39 | f":yellow_circle: {self.__class__.__name__}: {message}", 40 | style="yellow", 41 | ) 42 | 43 | def _fail(self, message): 44 | self.__class__.CONSOLE.print( 45 | f":red_circle: {self.__class__.__name__}: {message}", style="red" 46 | ) 47 | -------------------------------------------------------------------------------- /tests/core/test_local_config.py: -------------------------------------------------------------------------------- 1 | from bokeh.models import ( 2 | TableColumn, 3 | ) 4 | from hover.core.local_config import ( 5 | 
embedding_field, 6 | is_embedding_field, 7 | blank_callback_on_change, 8 | dataset_default_sel_table_columns, 9 | dataset_default_sel_table_kwargs, 10 | ) 11 | import pytest 12 | 13 | 14 | @pytest.mark.lite 15 | def test_embedding_field(): 16 | for i in range(2, 10): 17 | for j in range(i): 18 | assert is_embedding_field(embedding_field(i, j)) 19 | 20 | 21 | @pytest.mark.lite 22 | def test_blank_callback_on_change(): 23 | blank_callback_on_change("value", 0, 1) 24 | 25 | 26 | @pytest.mark.lite 27 | def test_dataset_default_sel_table_columns(): 28 | for feature in ["text", "image", "audio"]: 29 | columns = dataset_default_sel_table_columns(feature) 30 | assert isinstance(columns, list) 31 | assert isinstance(columns[0], TableColumn) 32 | 33 | try: 34 | dataset_default_sel_table_columns("invalid_feature") 35 | pytest.fail("Expected an exception from creating columns on invalid feature.") 36 | except ValueError: 37 | pass 38 | 39 | 40 | @pytest.mark.lite 41 | def test_dataset_default_sel_table_kwargs(): 42 | for feature in ["text", "image", "audio"]: 43 | kwargs = dataset_default_sel_table_kwargs(feature) 44 | assert isinstance(kwargs, dict) 45 | assert kwargs 46 | 47 | try: 48 | dataset_default_sel_table_kwargs("invalid_feature") 49 | pytest.fail("Expected an exception from creating kwargs on invalid feature.") 50 | except ValueError: 51 | pass 52 | -------------------------------------------------------------------------------- /.github/workflows/doc-script-test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies and run tests on the code snippets included in the documentation. 
from torch.utils.data import Dataset, DataLoader
from hover.utils.torch_helper import (
    VectorDataset,
    MultiVectorDataset,
    one_hot,
    label_smoothing,
)
import numpy as np
import pytest


@pytest.mark.lite
def test_vector_dataset(num_entries=100, dim_inp=128, dim_out=3):
    vec_inp = np.random.rand(num_entries, dim_inp)
    vec_out = np.random.rand(num_entries, dim_out)

    for dataset in [
        VectorDataset(vec_inp, vec_out),
        MultiVectorDataset([vec_inp] * 2, vec_out),
    ]:
        loader = dataset.loader(batch_size=min(num_entries, 16))
        assert isinstance(dataset, Dataset)
        assert isinstance(loader, DataLoader)
        assert len(dataset) == num_entries
        # fix: the fetched item used to be unpacked without any assertion;
        # pin down the (input, output, index) structure explicitly
        item = dataset[0]
        assert len(item) == 3
        _inp, out, idx = item
        assert int(idx) == 0
        assert len(out) == dim_out


@pytest.mark.lite
def test_one_hot():
    categorical_labels = [0, 1, 2, 1]
    one_hot_labels = one_hot(categorical_labels, 3)
    assert one_hot_labels.shape == (4, 3)
    # fix: also verify that each row is a valid one-hot distribution,
    # not just that the shape came out right
    assert np.allclose(np.asarray(one_hot_labels).sum(axis=1), 1.0)


@pytest.mark.lite
def test_label_smoothing(num_entries=100, num_classes=3, coeff=0.1):
    assert num_classes >= 2
    assert coeff >= 0.0

    categorical_labels = [0] * num_entries
    prob_labels = one_hot(categorical_labels, num_classes)

    # zero coefficient must be a no-op
    assert np.allclose(label_smoothing(prob_labels, coefficient=0.0), prob_labels)
    smoothed = label_smoothing(prob_labels, coefficient=coeff)
    np.testing.assert_almost_equal(
        smoothed[0][0], 1.0 - coeff * (1.0 - 1.0 / num_classes)
    )
    np.testing.assert_almost_equal(smoothed[0][1], coeff / num_classes)
13 | 14 | {key}: @{field} 15 | 16 |
17 | """ 18 | 19 | TOOLTIP_IMAGE_TEMPLATE = """ 20 |
21 | @{field} 25 |
26 | """ 27 | 28 | TOOLTIP_AUDIO_TEMPLATE = """ 29 |
30 | 36 |
37 | """ 38 | 39 | TOOLTIP_CUSTOM_TEMPLATE = """ 40 |
41 | 42 | {key}: @{field} 43 | 44 |
45 | """ 46 | 47 | TOOLTIP_LABEL_TEMPLATE = """ 48 |
49 | 50 | {key}: @{field} 51 | 52 |
53 | """ 54 | 55 | TOOLTIP_COORDS_DIV = """ 56 |
57 | 58 | Coordinates: ($x, $y) 59 | 60 |
61 | """ 62 | 63 | TOOLTIP_INDEX_DIV = """ 64 |
65 | 66 | Index: [$index] 67 | 68 |
69 | """ 70 | -------------------------------------------------------------------------------- /.github/workflows/doc-auto-notebook.yml: -------------------------------------------------------------------------------- 1 | # This workflow will generate Jupyter notebooks based on the code scripts in the documentation. 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Automatic Notebook Generation 5 | 6 | on: 7 | schedule: 8 | - cron: "0 0 * * 2" 9 | workflow_dispatch: 10 | 11 | jobs: 12 | auto-notebook: 13 | 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | python-version: ['3.9'] 18 | os: [ubuntu-latest] 19 | 20 | steps: 21 | - name: Clone hover all branches 22 | uses: actions/checkout@v3 23 | with: 24 | fetch-depth: 0 25 | 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v3 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | 31 | - name: Install Non-Python Dependencies 32 | run: | 33 | sudo apt-get update 34 | sudo apt-get install libsndfile1 35 | 36 | - name: Prepare Git 37 | run: | 38 | git config user.name ${{ secrets.ACTIONS_GIT_USERNAME }} 39 | git config user.email ${{ secrets.ACTIONS_GIT_EMAIL }} 40 | git checkout pipeline/notebook-generation 41 | git merge origin/main --no-edit 42 | git push 43 | 44 | - name: Test with Tox 45 | run: | 46 | pip install --upgrade pip 47 | pip install --upgrade tox tox-gh-actions 48 | tox -e test_notebook_generation 49 | 50 | - name: Update Generated Notebooks 51 | run: | 52 | git add docs/pipelines/generated/*.ipynb 53 | git commit -m "Automatic update of notebooks generated from documentation scripts" 54 | git push 55 | -------------------------------------------------------------------------------- /hover/utils/meta/traceback.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta 2 | from functools import wraps 
from types import FunctionType
from rich.console import Console


class RichTracebackMeta(type):
    """
    ???+ note "Metaclass to mass-add traceback override to class methods."

        [`Rich` traceback guide](https://rich.readthedocs.io/en/stable/traceback.html)
    """

    def __new__(meta, class_name, bases, class_dict):
        # prefer the class's own CONSOLE attribute; fall back to a fresh one
        console = class_dict.get("CONSOLE", Console())

        def add_traceback(method):
            @wraps(method)
            def traceback_method(*args, **kwargs):
                try:
                    return method(*args, **kwargs)
                except Exception as exc:
                    qualified = f"{method.__module__}.{method.__qualname__}"
                    console.print(
                        f":red_circle: {qualified} failed: {exc}",
                        style="red bold",
                    )
                    console.print_exception(show_locals=False)
                    raise exc

            return traceback_method

        # wrap plain functions; leave every other attribute untouched
        decorated = {
            name: add_traceback(value) if isinstance(value, FunctionType) else value
            for name, value in class_dict.items()
        }
        return type.__new__(meta, class_name, bases, decorated)


class RichTracebackABCMeta(RichTracebackMeta, ABCMeta):
    """
    ???+ note "Metaclass for rich-traceback abstract base classes."

        To resolve the metaclass conflict between RichTracebackMeta and ABCMeta.
    """

    pass
Fun and precision included.", 18 | long_description=get_description(), 19 | long_description_content_type="text/markdown", 20 | author="Pavel", 21 | author_email="pavelhurwicz@gmail.com", 22 | url="https://github.com/phurwicz/hover", 23 | packages=setuptools.find_packages(include=["hover*"]), 24 | install_requires=[ 25 | # python-version-specific example: "numpy>=1.14,<=1.21.5;python_version<'3.8.0'", 26 | # interactive/static visualization 27 | "bokeh>=3.0.3", 28 | # preprocessors 29 | "scikit-learn>=0.20.0", 30 | # neural stuff 31 | "torch>=1.10.0", 32 | # data handling 33 | "pandas>=1.3.0", 34 | "polars>=0.17.0", 35 | "pyarrow>=11.0.0", 36 | "numpy>=1.22", 37 | # computations 38 | "scipy>=1.3.2", 39 | # utilities 40 | "tqdm>=4.0", 41 | "rich>=11.0.0", 42 | "deprecated>=1.1.0", 43 | # dimensionality reduction: UMAP is included 44 | "umap-learn>=0.3.10", 45 | # module config customization 46 | "flexmod>=0.1.2", 47 | # optional: more dimensionality reduction methods 48 | # "ivis[cpu]>=1.7", 49 | # optional: distant supervision 50 | # "snorkel>=0.9.8", 51 | ], 52 | python_requires=">=3.8", 53 | classifiers=[ 54 | "Programming Language :: Python :: 3", 55 | "Development Status :: 4 - Beta", 56 | "License :: OSI Approved :: MIT License", 57 | "Operating System :: OS Independent", 58 | ], 59 | ) 60 | -------------------------------------------------------------------------------- /fixture_module/audio_vector_net/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example importable module holding customized ingredients of a workflow with hover. 3 | Specifically for audio data in URLs. 
4 | """ 5 | 6 | import os 7 | import re 8 | import numpy as np 9 | import wrappy 10 | import requests 11 | import librosa 12 | from io import BytesIO 13 | 14 | 15 | DIR_PATH = os.path.dirname(__file__) 16 | RAW_CACHE_PATH = os.path.join(DIR_PATH, "raws.pkl") 17 | AUD_CACHE_PATH = os.path.join(DIR_PATH, "auds.pkl") 18 | VEC_CACHE_PATH = os.path.join(DIR_PATH, "vecs.pkl") 19 | 20 | 21 | @wrappy.memoize( 22 | cache_limit=50000, 23 | return_copy=False, 24 | persist_path=RAW_CACHE_PATH, 25 | persist_batch_size=100, 26 | ) 27 | def url_to_content(url): 28 | """ 29 | Turn a URL to response content. 30 | """ 31 | response = requests.get(url) 32 | return response.content 33 | 34 | 35 | @wrappy.memoize( 36 | cache_limit=50000, 37 | return_copy=False, 38 | persist_path=AUD_CACHE_PATH, 39 | persist_batch_size=100, 40 | ) 41 | def url_to_audio(url): 42 | """ 43 | Turn a URL to audio data. 44 | """ 45 | data, sampling_rate = librosa.load(BytesIO(url_to_content(url))) 46 | return data, sampling_rate 47 | 48 | 49 | def get_vectorizer(): 50 | @wrappy.memoize( 51 | cache_limit=50000, 52 | return_copy=False, 53 | persist_path=VEC_CACHE_PATH, 54 | persist_batch_size=100, 55 | ) 56 | def vectorizer(url): 57 | """ 58 | Averaged MFCC over time. 59 | Resembles word-embedding-average-as-doc-embedding for texts. 
import torch.nn as nn


class BaseSequential(nn.Module):
    """
    Sequential neural net with no specified architecture.

    Subclasses are expected to assign an `nn.Sequential` to `self.model`.
    """

    def __init__(self):
        """Inherit the parent constructor."""
        super().__init__()

    def init_weights(self):
        """Kaiming-initialize weights and zero the biases of linear layers."""
        for layer in self.model:
            if isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, a=0.01)
                nn.init.constant_(layer.bias, 0.0)

    def forward(self, input_tensor):
        """Run the input through the whole sequential model."""
        return self.model(input_tensor)

    def eval_per_layer(self, input_tensor):
        """
        Return the input, all intermediates, and the output.
        """
        self.model.eval()
        tensors, current = [input_tensor], input_tensor
        for layer in self.model.children():
            current = layer(current)
            tensors.append(current)
        return tensors


class MLP(BaseSequential):
    def __init__(self, embed_dim, num_classes, dropout=0.25, n_hid=128):
        """
        Set up a proportionally fixed architecture.
        """
        super().__init__()
        hidden, quarter = n_hid, n_hid // 4
        self.model = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(embed_dim, hidden),
            nn.ReLU(),
            nn.BatchNorm1d(hidden),
            nn.Dropout(dropout),
            nn.Linear(hidden, quarter),
            nn.ReLU(),
            nn.BatchNorm1d(quarter),
            nn.Dropout(dropout),
            nn.Linear(quarter, num_classes),
        )
        self.init_weights()


class LogisticRegression(BaseSequential):
    def __init__(self, embed_dim, num_classes):
        """
        Set up a minimal architecture.
        """
        super().__init__()
        self.model = nn.Sequential(nn.Linear(embed_dim, num_classes))
        self.init_weights()
6 | 提示框 来显示具体数据内容 7 | 8 |
9 | 10 |
11 | 表格来 批量检视 选中的数据 12 | 13 |
14 | 15 |
16 | 切换按钮来 区分数据子集 17 | 18 |
19 | 20 |
21 | 文本/正则匹配 来定向搜寻数据 22 | 23 |
24 | 25 | :microscope: 与标注界面同步的辅助模式 26 | 27 |
28 | `Finder`: 以匹配条件来 过滤 选中的数据 29 | 30 |
31 | 32 |
33 | `SoftLabel`: 主动学习 用模型打分过滤选中的数据 34 | 35 |
36 | 37 |
38 | `Snorkel`: 自定义函数 来过滤数据或直接打标 39 | 40 |
41 | 42 | :toolbox: 更多的补充工具 43 | 44 |
45 | 降维时保留 更多维度 (3D? 4D?) 并动态选择观察的平面 46 | 47 |
48 | 49 |
50 | 跨界面/跨维度地进行 持续选取/反选 以达到更高精度 51 | 52 |
53 | 54 |
55 | 剔除选中数据中的异类 以及 修订发现的误标 56 | 57 |
import time
import operator
from bokeh.document import Document
from bokeh.events import ButtonClick, MenuItemClick
from hover import module_config


def action_view_selection(dataset):
    """Click the selection viewer and return a copy of the resulting table data."""
    dataset.selection_viewer._trigger_event(ButtonClick(dataset.selection_viewer))
    # dataset.sel_table.source.data is a {"field": []}-like dict
    return dataset.sel_table.source.data.copy()


def action_evict_selection(dataset):
    """Click the selection evictor; return table data from before and after."""
    data_before = dataset.sel_table.source.data.copy()
    dataset.selection_evictor._trigger_event(ButtonClick(dataset.selection_evictor))
    data_after = dataset.sel_table.source.data.copy()
    return data_before, data_after


def action_patch_selection(dataset):
    """Click the selection patcher."""
    dataset.selection_patcher._trigger_event(ButtonClick(dataset.selection_patcher))


def action_apply_labels(annotator):
    """Click the annotator's apply button and return the labeled rows."""
    annotator.annotator_apply._trigger_event(ButtonClick(annotator.annotator_apply))
    return annotator.dfs["raw"].filter_rows_by_operator(
        "label", operator.ne, module_config.ABSTAIN_DECODED
    )()


def action_commit_selection(dataset, subset="train"):
    """Commit the current selection to the given subset via the dropdown menu."""
    dataset.data_committer._trigger_event(
        MenuItemClick(dataset.data_committer, item=subset)
    )


def action_deduplicate(dataset):
    """Click the deduplication trigger."""
    dataset.dedup_trigger._trigger_event(ButtonClick(dataset.dedup_trigger))


def action_push_data(dataset):
    """Click the update pusher."""
    dataset.update_pusher._trigger_event(ButtonClick(dataset.update_pusher))


def execute_handle_function(handle):
    """Run a bokeh handle function on a fresh document, then fire its callbacks."""
    doc = Document()
    handle(doc)
    # a few seconds to activate timed callbacks
    time.sleep(10)
    for wrapped_callback in doc.session_callbacks:
        wrapped_callback.callback()
| run: | 61 | tox -e test_api -- --hover-ini tests/module_config/hover_alt_config_1.ini 62 | 63 | - name: Codacy Coverage Reporter 64 | uses: codacy/codacy-coverage-reporter-action@master 65 | if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'push' }} 66 | with: 67 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} 68 | coverage-reports: cobertura.xml 69 | -------------------------------------------------------------------------------- /docs/pages/tutorial/t6-softlabel-joint-filter.md: -------------------------------------------------------------------------------- 1 | > `hover` filters can stack together. 2 | > 3 | > :speedboat: This makes selections incredibly powerful. 4 | 5 | {!docs/snippets/html/thebe.html!} 6 | {!docs/snippets/markdown/binder-kernel.md!} 7 | {!docs/snippets/markdown/component-tutorial.md!} 8 | {!docs/snippets/markdown/local-dependency.md!} 9 | {!docs/snippets/markdown/local-dep-text.md!} 10 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!} 11 | 12 | ## **Preparation** 13 | 14 | {!docs/snippets/markdown/dataset-prep.md!} 15 | 16 | ## **Soft-Label Explorer** 17 | 18 | Active learning works by predicting labels and scores (i.e. soft labels) and utilizing that prediction. An intuitive way to plot soft labels is to color-code labels and use opacity ("alpha" by `bokeh` terminology) to represent scores. 19 | 20 | `SoftLabelExplorer` delivers this functionality: 21 | 22 |
23 | {!docs/snippets/py/tz-bokeh-notebook-common.txt!}
24 | 
25 | {!docs/snippets/py/tz-bokeh-notebook-remote.txt!}
26 | 
27 | {!docs/snippets/py/t6-0-softlabel-figure.txt!}
28 | 

29 | 30 | ## **Filter Selection by Score Range** 31 | 32 | Similarly to `finder`, a `softlabel` plot has its own selection filter. The difference lies in the filter condition: 33 | 34 | {!docs/snippets/markdown/jupyterlab-js-issue.md!} 35 | 36 |
37 | {!docs/snippets/py/t6-1-softlabel-filter.txt!}
38 | 

39 | 40 | ## **Linked Selections & Joint Filters** 41 | 42 | When we plot multiple `explorer`s for the same `dataset`, it makes sense to synchronize selections between those plots. `hover` recipes take care of this synchronization. 43 | 44 | - :tada: This also works with cumulative selections. Consequently, the cumulative toggle is synchronized too. 45 | 46 | Since each filter is narrowing down the selections we make, joint filtering is just set intersection, extended 47 | 48 | - from two sets (original selection + filter) 49 | - to N sets (original selection + filter A + filter B + ...) 50 | 51 | The [`active_learning` recipe](../t1-active-learning/) is built of `softlabel + annotator + finder`, plus a few widgets for iterating the model-in-loop. 52 | 53 | In the next tutorial(s), we will see more recipes taking advantage of linked selections and joint filters. Powerful indeed! 54 | 55 | {!docs/snippets/html/stylesheet.html!} 56 | -------------------------------------------------------------------------------- /docs/pages/guides/g2-hover-config.md: -------------------------------------------------------------------------------- 1 | > `hover` can be customized through its module config. 2 | > 3 | > :bulb: Let's explore a few use cases. 4 | 5 | {!docs/snippets/markdown/tutorial-required.md!} 6 | {!docs/snippets/html/thebe.html!} 7 | {!docs/snippets/markdown/binder-kernel.md!} 8 | 9 | ## **Color Palette for Labeled Data Points** 10 | 11 | You may want to customize the color palette for better contrast or accessibility, which can depend on specific scenarios. 12 | 13 | The snippet below shows an example of default colors assigned to 6 classes. `hover` by default samples [`Turbo256`](https://docs.bokeh.org/en/latest/docs/reference/palettes.html#large-palettes) to accommodate a large number of classes while keeping good contrast. 14 | 15 | <pre data-executable>
16 | {!docs/snippets/py/g2-0-color-palette.txt!}
17 | 
18 | 19 | You can change the palette using any `bokeh` palette, or any iterable of hex colors like `"#000000"`. 20 |
21 | {!docs/snippets/py/g2-1-configure-palette.txt!}
22 | 
23 | 24 | ???+ note "Config changes should happen early" 25 | `hover.config` assignments need to happen before plotting your data. 26 | 27 | - This is because `hover` locks config values for consistency as soon as each config value is read by other code. 28 | - Ideally you should change config immediately after `import hover`. 29 | 30 | ## **Color of Unlabeled Data Points** 31 | 32 | For unlabeled data points, `hover` uses a light gray color `"#dcdcdc"`. This is not configured in the color palette above, but here: 33 | 34 |
35 | {!docs/snippets/py/g2-2-configure-abstain-color.txt!}
36 | 
37 | 38 | ## **Dimensionality Reduction Method** 39 | 40 | `hover` uses dimensionality reduction in a lot of places. It can be cumbersome to find these places and use your preferred method. In such cases a module-level override can be handy: 41 | 42 |
43 | {!docs/snippets/py/g2-3-configure-reduction-method.txt!}
44 | 
45 | 46 | ## **Browse more configs** 47 | 48 | There are more configurations that are more niche which we will skip here. You can find a full list of configurations, default values, and hints here: 49 | 50 |
51 | {!docs/snippets/py/g2-4-config-hint.txt!}
52 | 
53 | 54 | Happy customizing! 55 | 56 | {!docs/snippets/html/stylesheet.html!} 57 | -------------------------------------------------------------------------------- /hover/utils/snorkel_helper.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | 4 | def labeling_function(targets, label_encoder=None, **kwargs): 5 | """ 6 | ???+ note "Hover's flavor of the Snorkel labeling_function decorator." 7 | However, due to the dynamic label encoding nature of hover, 8 | the decorated function should return the original string label, not its encoding integer. 9 | 10 | - assigns a UUID for easy identification 11 | - keeps track of LF targets 12 | 13 | | Param | Type | Description | 14 | | :-------------- | :----- | :----------------------------------- | 15 | | `targets` | `list` of `str` | labels that the labeling function is intended to create | 16 | | `label_encoder` | `dict` | {decoded_label -> encoded_label} mapping, if you also want an original snorkel-style labeling function linked as a `.snorkel` attribute | 17 | | `**kwargs` | | forwarded to `snorkel`'s `labeling_function()` | 18 | """ 19 | # lazy import so that the package does not require snorkel 20 | # Feb 3, 2022: snorkel's dependency handling is too strict 21 | # for other dependencies like NumPy, SciPy, SpaCy, etc. 22 | # Let's cite Snorkel and lazy import or copy functions. 23 | # DO NOT explicitly depend on Snorkel without confirming 24 | # that all builds/tests pass by Anaconda standards, else 25 | # we risk having to drop conda support. 
26 | from snorkel.labeling import ( 27 | labeling_function as snorkel_lf, 28 | LabelingFunction as SnorkelLF, 29 | ) 30 | 31 | def wrapper(func): 32 | # set up kwargs for Snorkel's LF 33 | # a default name that can be overridden 34 | snorkel_kwargs = {"name": func.__name__} 35 | snorkel_kwargs.update(kwargs) 36 | 37 | # return value of hover's decorator 38 | lf = SnorkelLF(f=func, **snorkel_kwargs) 39 | 40 | # additional attributes 41 | lf.uuid = uuid.uuid1() 42 | lf.targets = targets[:] 43 | 44 | # link a snorkel-style labeling function if applicable 45 | if label_encoder: 46 | lf.label_encoder = label_encoder 47 | 48 | def snorkel_style_func(x): 49 | return lf.label_encoder[func(x)] 50 | 51 | lf.snorkel = snorkel_lf(**kwargs)(snorkel_style_func) 52 | else: 53 | lf.label_encoder = None 54 | lf.snorkel = None 55 | 56 | return lf 57 | 58 | return wrapper 59 | -------------------------------------------------------------------------------- /hover/config_constants.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class ConfigSection: 5 | IO = "io" 6 | BACKEND = "backend" 7 | VISUAL = "visual" 8 | DATA_EMBEDDING = "data.embedding" 9 | DATA_COLUMNS = "data.columns" 10 | DATA_VALUES = "data.values" 11 | 12 | 13 | class ConfigKey: 14 | DATA_SAVE_DIR = "data_save_dir" 15 | DATAFRAME_LIBRARY = "dataframe_library" 16 | ABSTAIN_HEXCOLOR = "abstain_hexcolor" 17 | BOKEH_PALETTE = "bokeh_palette" 18 | BOKEH_PALETTE_USAGE = "bokeh_palette_usage" 19 | TABLE_IMG_STYLE = "table_img_style" 20 | TOOLTIP_IMG_STYLE = "tooltip_img_style" 21 | SEARCH_MATCH_HEXCOLOR = "search_match_hexcolor" 22 | DATAPOINT_BASE_SIZE = "datapoint_base_size" 23 | DEFAULT_REDUCTION_METHOD = "default_reduction_method" 24 | ENCODED_LABEL_KEY = "encoded_label_key" 25 | DATASET_SUBSET_FIELD = "dataset_subset_field" 26 | EMBEDDING_FIELD_PREFIX = "embedding_field_prefix" 27 | SOURCE_COLOR_FIELD = "source_color_field" 28 | SOURCE_ALPHA_FIELD = 
class Validator:
    """
    Static predicates for validating hover config values.

    Each method takes a candidate config value and returns a boolean.
    """

    # precompiled: validators may run on every config update
    _HEX_COLOR = re.compile(r"\#[0-9a-fA-F]{6}")

    @staticmethod
    def is_hex_color(x):
        """Check for a '#RRGGBB'-style hex color string."""
        # fix: fullmatch (unlike re.match with '$') rejects a trailing
        # newline; the isinstance guard returns False on non-strings
        # instead of raising TypeError from the regex engine
        return isinstance(x, str) and bool(Validator._HEX_COLOR.fullmatch(x))

    @staticmethod
    def is_iterable(x):
        """Check for an `__iter__` attribute."""
        return hasattr(x, "__iter__")

    @staticmethod
    def is_iterable_of_hex_color(x):
        """Check for an iterable whose every element is a hex color."""
        if not Validator.is_iterable(x):
            return False
        # all() short-circuits exactly like the previous manual loop
        return all(Validator.is_hex_color(i) for i in x)

    @staticmethod
    def is_supported_dataframe_library(x):
        """Whitelist of dataframe backends."""
        return x in ["pandas", "polars"]

    @staticmethod
    def is_supported_dimensionality_reduction(x):
        """Whitelist of dimensionality reduction backends."""
        return x in ["umap", "ivis"]

    @staticmethod
    def is_supported_traversal_mode(x):
        """Whitelist of embedding traversal modes."""
        return x in ["iterate", "linspace"]

    @staticmethod
    def is_str(x):
        """Check for a string."""
        return isinstance(x, str)

    @staticmethod
    def is_int_and_compare(op, value):
        """
        Build a validator that checks for an int satisfying op(x, value).

        | Param   | Type       | Description                          |
        | :------ | :--------- | :----------------------------------- |
        | `op`    | `callable` | binary comparison, e.g. `operator.gt` |
        | `value` | `int`      | right-hand operand of `op`            |
        """

        def func(x):
            return isinstance(x, int) and op(x, value)

        return func


class Preprocessor:
    """
    Static transforms applied to raw config values before validation.
    """

    @staticmethod
    def remove_quote_at_ends(x):
        """Strip at most one quote character from each end of the string."""
        return re.sub(r"(^[\'\"]|[\'\"]$)", "", x)

    @staticmethod
    def lower(x):
        """Lowercase the string."""
        return x.lower()
URLs are strings which can be easily stored, hashed, and looked up against. They are also convenient for rendering tooltips in the annotation interface. 12 | 13 | Similarly to `SupervisableTextDataset`, we can build one for images: 14 | 15 |
16 | {!docs/snippets/py/g0-0-dataset-image.txt!}
17 | 
18 | {!docs/snippets/py/t0-0a-dataset-text-print.txt!}
19 | 
20 | 21 | ## **Vectorizer for Images** 22 | 23 | We can follow a `URL -> content -> image object -> vector` path. 24 | 25 |
26 | {!docs/snippets/py/g0-1-url-to-content.txt!}
27 | 
28 | 29 |
30 | {!docs/snippets/py/g0-2-url-to-image.txt!}
31 | 
32 | 33 | {!docs/snippets/markdown/wrappy-cache.md!} 34 | 35 |
36 | {!docs/snippets/py/g0-3-image-vectorizer.txt!}
37 | 
38 | 39 | ## **Embedding and Plot** 40 | 41 | This is exactly the same as in the quickstart, just switching to image data: 42 | 43 |
44 | {!docs/snippets/py/t0-2-reduction.txt!}
45 | 
46 | 47 |
48 | {!docs/snippets/py/t0-3-simple-annotator.txt!}
49 | 
50 | {!docs/snippets/py/tz-bokeh-show-server.txt!}
51 | 
52 | {!docs/snippets/py/tz-bokeh-show-notebook.txt!}
53 | 
54 | 55 | ???+ note "What's special for images?" 56 | **Tooltips** 57 | 58 | For text, the tooltip shows the original value. 59 | 60 | For images, the tooltip embeds the image based on URL. 61 | 62 | - images in the local file system shall be served through [`python -m http.server`](https://docs.python.org/3/library/http.server.html). 63 | - they can then be accessed through `https://localhost:/relative/path/to/file`. 64 | 65 | **Search** 66 | 67 | For text, the search widget is based on regular expressions. 68 | 69 | For images, the search widget is based on vector cosine similarity. 70 | 71 | - the `dataset` has remembered the `vectorizer` under the hood and passed it to the `annotator`. 72 | - {== please [**let us know**](https://github.com/phurwicz/hover/issues/new) if you think there's a better way to search images in this case. ==} 73 | 74 | 75 | {!docs/snippets/html/stylesheet.html!} 76 | -------------------------------------------------------------------------------- /notebooks/archive-prototype/Programmatic-Event.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import sys\n", 21 | "sys.path.append('../')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Programmatically Trigger Events" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from bokeh.io import output_notebook, show\n", 38 | "from bokeh.plotting import figure\n", 39 | "from bokeh.models import ColumnDataSource\n", 40 | "\n", 41 | "output_notebook()" 42 | ] 43 | }, 44 | { 45 | 
"cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "tags": [] 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "from bokeh.models import Button\n", 53 | "from bokeh.events import ButtonClick, MenuItemClick\n", 54 | "from datetime import datetime\n", 55 | "import time\n", 56 | "\n", 57 | "button = Button(label=\"Click me\")\n", 58 | "button_click = ButtonClick(button)\n", 59 | "\n", 60 | "def callback(event):\n", 61 | " print(f\"Clicked at {datetime.now()}\", end=\"\\r\")\n", 62 | "\n", 63 | "button.on_click(callback)\n", 64 | "\n", 65 | "for i in range(10):\n", 66 | " button._trigger_event(button_click)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3 (ipykernel)", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.9.7" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 4 105 | } 106 | -------------------------------------------------------------------------------- /docs/snippets/markdown/readme/2-features.en.md: -------------------------------------------------------------------------------- 1 | ## :sparkles: Features 2 | 3 | > **It's fast because it labels data in bulk.** 4 | 5 | :telescope: A semantic scatter plot of your data for labeling, equipped with 6 | 7 |
8 | Tooltip for each point on mouse hover 9 | 10 |
11 | 12 |
13 | Table view for inspecting selected points 14 | 15 |
16 | 17 |
18 | Toggle buttons that clearly distinguish data subsets 19 | 20 |
21 | 22 |
23 | Search widgets for ad-hoc data highlight 24 | 25 |
26 | 27 | > **It's accurate because multiple components work together.** 28 | 29 | :microscope: Supplementary views to use in conjunction with the annotator, including 30 | 31 |
32 | `Finder`: filter data by search criteria 33 | 34 |
35 | 36 |
37 | `SoftLabel`: active learning by in-the-loop model prediction score 38 | 39 |
40 | 41 |
42 | `Snorkel`: custom functions for labeling and filtering 43 | 44 |
45 | 46 | > **It's flexible (and fun!) because the process never gets old.** 47 | 48 | :toolbox: Additional tools and options that allow you to 49 | 50 |
51 | Go to higher dimensions (3D? 4D?) and choose your xy-axes 52 | 53 |
54 | 55 |
56 | Consecutively select across areas, dimensions, and views 57 | 58 |
59 | 60 |
61 | Kick outliers and fix mistakes 62 | 63 |
64 | -------------------------------------------------------------------------------- /notebooks/archive-prototype/Dynamic-Widget.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import sys\n", 21 | "sys.path.append('../../')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Dynamically Change Widget Behavior" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from bokeh.models import Selection, RangeSlider, Button, Dropdown\n", 38 | "from bokeh.layouts import row, column\n", 39 | "from bokeh.io import output_notebook, show\n", 40 | "from hover.utils.bokeh_helper import servable\n", 41 | "\n", 42 | "output_notebook()\n", 43 | "\n", 44 | "slider = RangeSlider(start=-1.0, end=1.0, value=(-0.5, 0.5), step=0.01)\n", 45 | "slider.on_change(\"value\", lambda attr, old, new: print(f\"Range changed to {slider.value}\"))\n", 46 | "\n", 47 | "@servable()\n", 48 | "def burner():\n", 49 | " arr = ['1', '2', '3']\n", 50 | " dropdown = Dropdown(\n", 51 | " label=\"Select Element\",\n", 52 | " button_type=\"primary\",\n", 53 | " menu=arr,\n", 54 | " )\n", 55 | " \n", 56 | " button = Button(label=\"Click Me\", height=100)\n", 57 | " def button_callcack(event):\n", 58 | " dropdown.menu.append(str(int(dropdown.menu[-1]) + 1))\n", 59 | " print(f\"Button Clicked! 
Got menu: {dropdown.menu}\")\n", 60 | " button.on_click(button_callcack)\n", 61 | "\n", 62 | " return column(dropdown, button)\n", 63 | "\n", 64 | "handle = burner()\n", 65 | "show(handle)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3 (ipykernel)", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.9.7" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 4 97 | } 98 | -------------------------------------------------------------------------------- /fixture_module/image_vector_net/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example importable module holding customized ingredients of a workflow with hover. 3 | Specifically for 3-channel image data in URLs. 4 | """ 5 | 6 | import os 7 | import re 8 | import numpy as np 9 | import wrappy 10 | import requests 11 | from PIL import Image 12 | from io import BytesIO 13 | 14 | 15 | DIR_PATH = os.path.dirname(__file__) 16 | RAW_CACHE_PATH = os.path.join(DIR_PATH, "raws.pkl") 17 | IMG_CACHE_PATH = os.path.join(DIR_PATH, "imgs.pkl") 18 | VEC_CACHE_PATH = os.path.join(DIR_PATH, "vecs.pkl") 19 | 20 | 21 | @wrappy.memoize( 22 | cache_limit=50000, 23 | return_copy=False, 24 | persist_path=RAW_CACHE_PATH, 25 | persist_batch_size=100, 26 | ) 27 | def url_to_content(url): 28 | """ 29 | Turn a URL to response content. 
30 | """ 31 | response = requests.get(url) 32 | return response.content 33 | 34 | 35 | @wrappy.memoize( 36 | cache_limit=50000, 37 | return_copy=False, 38 | persist_path=IMG_CACHE_PATH, 39 | persist_batch_size=100, 40 | ) 41 | def url_to_image(url): 42 | """ 43 | Turn a URL to a PIL Image. 44 | """ 45 | img = Image.open(BytesIO(url_to_content(url))).convert("RGB") 46 | return img 47 | 48 | 49 | def get_vectorizer(): 50 | import torch 51 | from efficientnet_pytorch import EfficientNet 52 | from torchvision import transforms 53 | 54 | # EfficientNet is a series of pre-trained models 55 | # https://github.com/lukemelas/EfficientNet-PyTorch 56 | model = EfficientNet.from_pretrained("efficientnet-b0") 57 | model.eval() 58 | 59 | # standard transformations for ImageNet-trained models 60 | tfms = transforms.Compose( 61 | [ 62 | transforms.Resize(224), 63 | transforms.ToTensor(), 64 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 65 | ] 66 | ) 67 | 68 | # memoization can be useful if the function takes a while to run, which is common for images 69 | @wrappy.memoize( 70 | cache_limit=50000, 71 | return_copy=False, 72 | persist_path=VEC_CACHE_PATH, 73 | persist_batch_size=100, 74 | ) 75 | def vectorizer(url): 76 | """ 77 | Using logits on ImageNet-1000 classes. 
78 | """ 79 | img = tfms(url_to_image(url)).unsqueeze(0) 80 | 81 | with torch.no_grad(): 82 | outputs = model(img) 83 | 84 | return outputs.detach().numpy().flatten() 85 | 86 | return vectorizer 87 | 88 | 89 | def get_architecture(): 90 | from hover.utils.common_nn import LogisticRegression 91 | 92 | return LogisticRegression 93 | 94 | 95 | def get_state_dict_path(): 96 | return os.path.join(DIR_PATH, "model.pt") 97 | -------------------------------------------------------------------------------- /docs/pages/guides/g1-datatype-audio.md: -------------------------------------------------------------------------------- 1 | > `hover` supports bulk-labeling audios through their URLs (which can be local). 2 | > 3 | > :bulb: Let's do a quickstart for audios and note what's different from texts. 4 | 5 | {!docs/snippets/markdown/tutorial-required.md!} 6 | {!docs/snippets/html/thebe.html!} 7 | {!docs/snippets/markdown/binder-kernel.md!} 8 | 9 | ## **Dataset for audios** 10 | 11 | `hover` handles audios through their URL addresses. URLs are strings which can be easily stored, hashed, and looked up against. They are also convenient for rendering tooltips in the annotation interface. 12 | 13 | Similarly to `SupervisableTextDataset`, we can build one for audios: 14 | 15 |
16 | {!docs/snippets/py/g1-0-dataset-audio.txt!}
17 | 
18 | {!docs/snippets/py/t0-0a-dataset-text-print.txt!}
19 | 
20 | 21 | ## **Vectorizer for audios** 22 | 23 | We can follow a `URL -> content -> audio array -> vector` path. 24 | 25 |
26 | {!docs/snippets/py/g0-1-url-to-content.txt!}
27 | 
28 | 29 |
30 | {!docs/snippets/py/g1-1-url-to-audio.txt!}
31 | 
32 | 33 | {!docs/snippets/markdown/wrappy-cache.md!} 34 | 35 |
36 | {!docs/snippets/py/g1-2-audio-vectorizer.txt!}
37 | 
38 | 39 | ## **Embedding and Plot** 40 | 41 | This is exactly the same as in the quickstart, just switching to audio data: 42 | 43 |
44 | {!docs/snippets/py/t0-2-reduction.txt!}
45 | 
46 | 47 |
48 | {!docs/snippets/py/t0-3-simple-annotator.txt!}
49 | 
50 | {!docs/snippets/py/tz-bokeh-show-server.txt!}
51 | 
52 | {!docs/snippets/py/tz-bokeh-show-notebook.txt!}
53 | 
 54 | 55 | ???+ note "What's special for audios?" 56 | **Tooltips** 57 | 58 | For text, the tooltip shows the original value. 59 | 60 | For audios, the tooltip embeds the audio based on URL. 61 | 62 | - audios in the local file system shall be served through [`python -m http.server`](https://docs.python.org/3/library/http.server.html). 63 | - they can then be accessed through `http://localhost:<port>/relative/path/to/file`. 64 | 65 | **Search** 66 | 67 | For text, the search widget is based on regular expressions. 68 | 69 | For audios, the search widget is based on vector cosine similarity. 70 | 71 | - the `dataset` has remembered the `vectorizer` under the hood and passed it to the `annotator`. 72 | - {== please [**let us know**](https://github.com/phurwicz/hover/issues/new) if you think there's a better way to search audios in this case. ==} 73 | - dynamic time warping, due to its running time (> 10ms per pair for small 100x10 MFCC arrays), is too slow for search. 74 | - we are experimenting with subsampled signals and pre-selected data points (by vector similarity, for example). 75 | 76 | 77 | {!docs/snippets/html/stylesheet.html!} 78 | -------------------------------------------------------------------------------- /docs/pages/tutorial/t5-finder-filter.md: -------------------------------------------------------------------------------- 1 | > `Finder` is an `explorer` focused on **search**. 2 | > 3 | > :speedboat: It can help you select points using a **filter** based on search results. 4 | 5 | {!docs/snippets/html/thebe.html!} 6 | {!docs/snippets/markdown/binder-kernel.md!} 7 | {!docs/snippets/markdown/component-tutorial.md!} 8 | {!docs/snippets/markdown/local-dependency.md!} 9 | {!docs/snippets/markdown/local-dep-text.md!} 10 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!} 11 | 12 | ## **More Angles -> Better Results** 13 | 14 | `Explorer`s other than `annotator` are specialized in finding additional insight to help us understand the data. 
Having them juxtaposed with `annotator`, we can label more accurately, more confidently, and even faster. 15 | 16 | ## **Preparation** 17 | 18 | {!docs/snippets/markdown/dataset-prep.md!} 19 | 20 | ## **Filter Toggles** 21 | 22 | When we use lasso or polygon select, we are describing a shape. Sometimes that shape is not accurate enough -- we need extra conditions to narrow down the data. 23 | 24 | Just like `annotator`, `finder` has search widgets. But unlike `annotator`, `finder` has a **filter toggle** which can directly **intersect** *what we selected* with *what meets the search criteria*. 25 | 26 | {!docs/snippets/markdown/jupyterlab-js-issue.md!} 27 | 28 |
29 | {!docs/snippets/py/tz-bokeh-notebook-common.txt!}
30 | 
31 | {!docs/snippets/py/tz-bokeh-notebook-remote.txt!}
32 | 
33 | {!docs/snippets/py/t5-0-finder-filter.txt!}
34 | 

 35 | 36 | Next to the search widgets is a checkbox. The filter will stay active as long as the checkbox is. 37 | 38 | ???+ info "How the filter interacts with selection options" 39 | Selection options apply before filters. 40 | 41 | `hover` memorizes your pre-filter selections, so you can keep selecting without having to tweak the filter toggle. 42 | 43 | - Example: 44 | - suppose you have previously selected a set of points called `A`. 45 | - then you toggled a filter `f`, giving you `A∩F` where `F` is the set satisfying `f`. 46 | - now, with selection option "union", you select a set of points called `B`. 47 | - your current selection will be `(A ∪ B) ∩ F`, i.e. `(A ∩ F) ∪ (B ∩ F)`. 48 | - similarly, you would get `(A ∩ B) ∩ F` for "intersection" and `(A ∖ B) ∩ F` for "difference". 49 | - if you untoggle the filter now, your selection would be `A ∪ B`. 50 | 51 | - In the later tutorials, we shall see multiple filters in action together. 52 | - spoiler: `F = F1 ∩ F2 ∩ ...` and that's it! 53 | 54 | ## **Stronger Highlight for Search** 55 | 56 | `finder` also colors data points based on search criteria, making them easier to find. 57 | 58 | {!docs/snippets/markdown/jupyterlab-js-issue.md!} 59 | 60 | 
61 | {!docs/snippets/py/t5-1-finder-figure.txt!}
62 | 

63 | 64 | {!docs/snippets/html/stylesheet.html!} 65 | -------------------------------------------------------------------------------- /hover/utils/misc.py: -------------------------------------------------------------------------------- 1 | """Mini-functions that do not belong elsewhere.""" 2 | from datetime import datetime 3 | from abc import ABC, abstractmethod 4 | 5 | 6 | def current_time(template="%Y%m%d %H:%M:%S"): 7 | return datetime.now().strftime(template) 8 | 9 | 10 | class BaseUnionFind(ABC): 11 | """ 12 | ???+ note "Data attached to union-find." 13 | """ 14 | 15 | def __init__(self, data): 16 | self._data = data 17 | self._parent = None 18 | self._count = 1 19 | 20 | def __repr__(self): 21 | return self.data.__repr__() 22 | 23 | @property 24 | def count(self): 25 | if self.parent is None: 26 | return self._count 27 | return self.find().count 28 | 29 | @count.setter 30 | def count(self, count): 31 | self._count = count 32 | 33 | @property 34 | def parent(self): 35 | return self._parent 36 | 37 | @parent.setter 38 | def parent(self, other): 39 | assert isinstance(other, BaseUnionFind) 40 | self._parent = other 41 | 42 | def find(self): 43 | if self.parent: 44 | self.parent = self.parent.find() 45 | return self.parent 46 | return self 47 | 48 | @abstractmethod 49 | def union(self, other): 50 | pass 51 | 52 | 53 | class NodeUnionFind(BaseUnionFind): 54 | """ 55 | ???+ note "Each node keeps its own data." 
56 | """ 57 | 58 | @property 59 | def data(self): 60 | return self._data 61 | 62 | @data.setter 63 | def data(self, data): 64 | self._data = data 65 | 66 | def union(self, other): 67 | root = self.find() 68 | other_root = other.find() 69 | if root is other_root: 70 | return 71 | 72 | # merge the smaller trees into the larger 73 | if root.count < other_root.count: 74 | other_root.count += root.count 75 | root.parent = other_root 76 | else: 77 | root.count += other_root.count 78 | other_root.parent = root 79 | 80 | 81 | class RootUnionFind(BaseUnionFind): 82 | """ 83 | ???+ note "Union always uses left as root. Each node looks up its root for data." 84 | """ 85 | 86 | @property 87 | def data(self): 88 | root = self.find() 89 | if self is root: 90 | return self._data 91 | return root.data 92 | 93 | @data.setter 94 | def data(self, data): 95 | root = self.find() 96 | if self is root: 97 | self._data = data 98 | root._data = data 99 | 100 | def union(self, other): 101 | root = self.find() 102 | other_root = other.find() 103 | 104 | # clear the data on the other root 105 | other_root.data = None 106 | root.count += other_root.count 107 | other_root.parent = root 108 | -------------------------------------------------------------------------------- /notebooks/archive-prototype/Editing-Datatable.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "2bf553ea-eb52-49f8-ae88-da179ca9e793", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from bokeh.models import (\n", 11 | " Button,\n", 12 | " ColumnDataSource,\n", 13 | " DataTable,\n", 14 | " TableColumn\n", 15 | ")\n", 16 | "from bokeh.layouts import column\n", 17 | "from hover.utils.bokeh_helper import servable\n", 18 | "\n", 19 | "@servable()\n", 20 | "def burner():\n", 21 | " df = pd.DataFrame({\n", 22 | " 'f0': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n", 23 | " 'f1': 
np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n", 24 | " 'f2': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n", 25 | " 'f3': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n", 26 | " })\n", 27 | " forward_button = Button(label=\"df-to-table\")\n", 28 | " backward_button = Button(label=\"table-to-df\")\n", 29 | " \n", 30 | " sel_source = ColumnDataSource(dict())\n", 31 | " show_columns = ['f0', 'f2']\n", 32 | " sel_columns = [TableColumn(field=_col, title=_col) for _col in show_columns]\n", 33 | " sel_table = DataTable(source=sel_source, columns=sel_columns, selectable=\"checkbox\", editable=True)\n", 34 | " \n", 35 | " def df_to_table(event):\n", 36 | " sel_source.data = df.to_dict(orient=\"list\")\n", 37 | " \n", 38 | " def table_to_df(event):\n", 39 | " indices = sel_source.selected.indices\n", 40 | " for _col in show_columns:\n", 41 | " _values = sel_source.data[_col]\n", 42 | " _patches = [_values[i] for i in indices]\n", 43 | " df.loc[indices, _col] = _patches\n", 44 | " print(_col, indices, len(_patches))\n", 45 | " \n", 46 | " forward_button.on_click(df_to_table)\n", 47 | " backward_button.on_click(table_to_df)\n", 48 | " return column(forward_button, backward_button, sel_table)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "1446d4c0-8da7-4101-9816-9b30297036aa", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "handle = burner()\n", 59 | "show(handle)" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3 (ipykernel)", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.9.7" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 5 84 | } 85 | 
-------------------------------------------------------------------------------- /hover/core/representation/trajectory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Trajectory interpolation for sequences of vectors. 3 | """ 4 | from scipy import interpolate 5 | import numpy as np 6 | 7 | 8 | def spline(arr_per_dim, points_per_step=1, splprep_kwargs=None): 9 | """ 10 | Fit a spline and evaluate it at a specified density of points. 11 | 12 | - param arr_per_dim(numpy.ndarray): dim-by-points array representing the part of the curve in each dimension. 13 | 14 | - param points_per_step(int): number of points interpolated in between each given point on the curve. 15 | 16 | - param splprep_kwargs(dict): keyword arguments to the splprep() function for fitting the spline in SciPy. 17 | """ 18 | 19 | # cast to array if appropriate 20 | if isinstance(arr_per_dim, list): 21 | arr_per_dim = np.array(arr_per_dim) 22 | 23 | assert points_per_step >= 1, "Need at least one point per step" 24 | splprep_kwargs = splprep_kwargs or dict() 25 | 26 | # check the number of given points in the curve 27 | num_given_points = arr_per_dim[0].shape[0] 28 | assert num_given_points > 1, "Need at least two points to fit a line" 29 | 30 | # check if two vectors are almost identical, and apply a noise in that case 31 | # note that we did not modify arr_per_dim in place 32 | # and that the noise only goes up in a greedy random-walk manner 33 | noise_arr = np.zeros((len(arr_per_dim), num_given_points)) 34 | for i in range(1, num_given_points): 35 | prev_vec, vec = arr_per_dim[:, i - 1] + noise_arr[:, i - 1], arr_per_dim[:, i] 36 | while np.allclose(vec + noise_arr[:, i], prev_vec): 37 | noise_arr[:, i] += np.random.normal(loc=0.0, scale=1e-6, size=vec.shape) 38 | 39 | # reduce spline order if necessary, then fit the spline parameters 40 | splprep_kwargs["k"] = min(3, num_given_points - 1) 41 | tck, u = interpolate.splprep(arr_per_dim + noise_arr, 
**splprep_kwargs) 42 | 43 | # determine points at which the spline should be evaluated 44 | points_to_eval = [] 45 | for i in range(0, u.shape[0] - 1): 46 | _pts = np.linspace(u[i], u[i + 1], points_per_step, endpoint=False) 47 | points_to_eval.append(_pts) 48 | points_to_eval.append([u[-1]]) 49 | points_to_eval = np.concatenate(points_to_eval) 50 | 51 | traj_per_dim = interpolate.splev(points_to_eval, tck) 52 | return traj_per_dim 53 | 54 | 55 | def manifold_spline(seq_arr, **kwargs): 56 | """ 57 | Fit a spline to every sequence of points in a manifold. 58 | - param seq_arr: L-sequence of M-by-N arrays each containing vectors matched by index. 59 | :type seq_arr: numpy.ndarray 60 | """ 61 | # L is unused 62 | _L, M, N = seq_arr.shape 63 | 64 | # this gives M-by-N-by-f(L, args) 65 | traj_arr = np.array( 66 | [ 67 | spline(np.array([seq_arr[:, _m, _n] for _n in range(N)]), **kwargs) 68 | for _m in range(M) 69 | ] 70 | ) 71 | 72 | # return f(L, args)-by-M-by-N 73 | traj_arr = np.swapaxes(traj_arr, 1, 2) 74 | traj_arr = np.swapaxes(traj_arr, 0, 1) 75 | return traj_arr 76 | -------------------------------------------------------------------------------- /hover/utils/datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Submodule that loads and preprocesses public datasets into formats that work smoothly. 3 | """ 4 | 5 | from sklearn.datasets import fetch_20newsgroups 6 | from hover import module_config 7 | import re 8 | 9 | 10 | def clean_string(text, sub_from=r"[^a-zA-Z0-9\ ]", sub_to=r" "): 11 | cleaned = re.sub(sub_from, sub_to, text) 12 | cleaned = re.sub(r" +", r" ", cleaned) 13 | return cleaned 14 | 15 | 16 | def newsgroups_dictl( 17 | data_home="~/scikit_learn_data", 18 | to_remove=("headers", "footers", "quotes"), 19 | text_key="text", 20 | label_key="label", 21 | label_mapping=None, 22 | ): 23 | """ 24 | Load the 20 Newsgroups dataset into a list of dicts, deterministically. 
25 | """ 26 | label_mapping = label_mapping or dict() 27 | dataset = dict() 28 | label_set = set() 29 | for _key in ["train", "test"]: 30 | _dictl = [] 31 | 32 | # load subset and transform into a list of dicts 33 | _bunch = fetch_20newsgroups( 34 | data_home=data_home, subset=_key, random_state=42, remove=to_remove 35 | ) 36 | for i, text in enumerate(_bunch.data): 37 | _text = clean_string(text) 38 | _label = _bunch.target_names[_bunch.target[i]] 39 | _label = label_mapping.get(_label, _label) 40 | 41 | _text_actual_characters = re.sub(r"[^a-zA-Z0-9]", r"", _text) 42 | if len(_text_actual_characters) > 5: 43 | label_set.add(_label) 44 | _entry = {text_key: _text, label_key: _label} 45 | _dictl.append(_entry) 46 | 47 | # add to dataset 48 | dataset[_key] = _dictl 49 | 50 | label_list = sorted(list(label_set)) 51 | label_decoder = {idx: value for idx, value in enumerate(label_list)} 52 | label_decoder[module_config.ABSTAIN_ENCODED] = module_config.ABSTAIN_DECODED 53 | label_encoder = {value: idx for idx, value in label_decoder.items()} 54 | return dataset, label_encoder, label_decoder 55 | 56 | 57 | def newsgroups_reduced_dictl(**kwargs): 58 | """ 59 | Load the 20 Newsgroups dataset but reduce categories using a custom mapping. 
60 | """ 61 | label_mapping = { 62 | "alt.atheism": "religion", 63 | "comp.graphics": "computer", 64 | "comp.os.ms-windows.misc": "computer", 65 | "comp.sys.ibm.pc.hardware": "computer", 66 | "comp.sys.mac.hardware": "computer", 67 | "comp.windows.x": "computer", 68 | "misc.forsale": "forsale", 69 | "rec.autos": "recreation", 70 | "rec.motorcycles": "recreation", 71 | "rec.sport.baseball": "recreation", 72 | "rec.sport.hockey": "recreation", 73 | "sci.crypt": "computer", 74 | "sci.electronics": "computer", 75 | "sci.med": "med", 76 | "sci.space": "space", 77 | "soc.religion.christian": "religion", 78 | "talk.politics.guns": "politics", 79 | "talk.politics.mideast": "politics", 80 | "talk.politics.misc": "politics", 81 | "talk.religion.misc": "religion", 82 | } 83 | kwargs["label_mapping"] = label_mapping 84 | return newsgroups_dictl(**kwargs) 85 | -------------------------------------------------------------------------------- /docs/pages/tutorial/t2-bokeh-app.md: -------------------------------------------------------------------------------- 1 | > `hover` creates a [`bokeh` server app](https://docs.bokeh.org/en/latest/docs/user_guide/server.html) to deliver its annotation interface. 2 | > 3 | > :rocket: This app can be served flexibly based on your needs. 4 | 5 | {!docs/snippets/html/stylesheet.html!} 6 | 7 | ## **Prerequisites** 8 | 9 | Suppose that we've already used a `recipe` to create a `handle` function like in the [quickstart](../t0-quickstart/#apply-labels). 10 | 11 | ??? info "Recap from the tutorials before" 12 | - the `handle` is a function which renders plot elements on a [`bokeh` document](https://docs.bokeh.org/en/latest/docs/reference/document.html). 
 13 | 14 | ## **Option 1: Jupyter** 15 | 16 | We are probably familiar with this now: 17 | 18 | ```Python 19 | from bokeh.io import show, output_notebook 20 | output_notebook() 21 | show(handle) # notebook_url='http://localhost:8888' 22 | ``` 23 | 24 | ???+ tip "Pros & Cons" 25 | This inline Jupyter mode can integrate particularly well with your notebook workflow. For example, when you are (tentatively) done with annotation, the `SupervisableDataset` can be accessed directly in the notebook, rather than exported to a file and loaded back. 26 | 27 | The inline mode is highly recommended for local usage. 28 | 29 | - On the contrary, with a remote Jupyter server, it may have trouble displaying the plots. 30 | 31 | - this can be due to failure to load JavaScript libraries or to access implicit bokeh server ports. 32 | 33 | ## **Option 2: Command Line** 34 | 35 | [`bokeh serve`](https://docs.bokeh.org/en/latest/docs/user_guide/server.html) starts an explicit `tornado` server from the command line: 36 | 37 | ```bash 38 | bokeh serve my-app.py 39 | ``` 40 | 41 | ```Python 42 | # my-app.py 43 | 44 | # handle = ... 45 | 46 | from bokeh.io import curdoc 47 | doc = curdoc() 48 | handle(doc) 49 | ``` 50 | 51 | ???+ tip "Pros & Cons" 52 | This is the "classic" approach to run a `bokeh` server. Remote access is simple through parameters [**specified here**](https://docs.bokeh.org/en/latest/docs/reference/command/subcommands/serve.html). The bokeh plot tools are mobile-friendly too -- this means you can host a server, e.g. an http-enabled cloud virtual machine, and annotate from a tablet. 53 | 54 | The command line mode is less interactive, since Python objects in the script cannot be accessed on the fly. 
55 | 56 | ## **Option 3: Anywhere in Python** 57 | 58 | It is possible to [embed the app](https://docs.bokeh.org/en/latest/docs/user_guide/server.html#embedding-bokeh-server-as-a-library) in regular Python: 59 | 60 | ```Python 61 | from bokeh.server.server import Server 62 | server = Server({'/my-app': handle}) 63 | server.start() 64 | ``` 65 | 66 | ???+ tip "Pros & Cons" 67 | This embedded mode is a go-to for serving within a greater application. 68 | 69 | Also note that each command line argument for `bokeh serve` has a corresponding keyword argument to `Server()`. 70 | 71 | For instance, `bokeh serve --allow-websocket-origin=*` in the command line mirrors `Server(*args, allow_websocket_origin='*')` in Python. 72 | 73 | The embedded mode gives you the most control of your server. 74 | -------------------------------------------------------------------------------- /hover/recipes/stable.py: -------------------------------------------------------------------------------- 1 | """ 2 | ???+ note "High-level functions to produce an interactive annotation interface." 3 | Stable recipes whose function signatures should almost never change in the future. 4 | """ 5 | from hover.utils.bokeh_helper import servable 6 | from .subroutine import recipe_layout, standard_annotator, standard_finder 7 | 8 | 9 | @servable(title="Simple Annotator") 10 | def simple_annotator(dataset, **kwargs): 11 | """ 12 | ???+ note "Display the dataset with on a 2D map for annotation." 
13 | 14 | | Param | Type | Description | 15 | | :-------- | :------- | :----------------------------------- | 16 | | `dataset` | `SupervisableDataset` | the dataset to link to | 17 | | `**kwargs` | | kwargs to forward to each Bokeh figure | 18 | 19 | Expected visual layout: 20 | 21 | | SupervisableDataset | BokehDataAnnotator | 22 | | :------------------ | :----------------- | 23 | | manage data subsets | make annotations | 24 | """ 25 | dataset.setup_bokeh_elements(reset=True) 26 | layout, _ = _simple_annotator(dataset, **kwargs) 27 | return layout 28 | 29 | 30 | def _simple_annotator(dataset, layout_style="horizontal", **kwargs): 31 | """ 32 | ???+ note "Cousin of simple_annotator which exposes objects in the layout." 33 | """ 34 | annotator = standard_annotator(dataset, **kwargs) 35 | 36 | sidebar = dataset.view() 37 | layout = recipe_layout(sidebar, annotator.view(), style=layout_style) 38 | 39 | objects = {"dataset": dataset, "annotator": annotator, "sidebar": sidebar} 40 | return layout, objects 41 | 42 | 43 | @servable(title="Linked Annotator") 44 | def linked_annotator(dataset, **kwargs): 45 | """ 46 | ???+ note "Display the dataset on a 2D map in two views, one for search and one for annotation." 
47 | 48 | | Param | Type | Description | 49 | | :-------- | :------- | :----------------------------------- | 50 | | `dataset` | `SupervisableDataset` | the dataset to link to | 51 | | `**kwargs` | | kwargs to forward to each Bokeh figure | 52 | 53 | Expected visual layout: 54 | 55 | | SupervisableDataset | BokehDataFinder | BokehDataAnnotator | 56 | | :------------------ | :------------------ | :----------------- | 57 | | manage data subsets | search -> highlight | make annotations | 58 | """ 59 | dataset.setup_bokeh_elements(reset=True) 60 | layout, _ = _linked_annotator(dataset, **kwargs) 61 | return layout 62 | 63 | 64 | def _linked_annotator(dataset, layout_style="horizontal", **kwargs): 65 | """ 66 | ???+ note "Cousin of linked_annotator which exposes objects in the layout." 67 | """ 68 | finder = standard_finder(dataset, **kwargs) 69 | annotator = standard_annotator(dataset, **kwargs) 70 | 71 | # link selections 72 | annotator.link_selection( 73 | finder, 74 | {_key: _key for _key in ["raw", "train", "dev", "test"]}, 75 | ) 76 | 77 | sidebar = dataset.view() 78 | layout = recipe_layout(sidebar, finder.view(), annotator.view(), style=layout_style) 79 | 80 | objects = { 81 | "dataset": dataset, 82 | "annotator": annotator, 83 | "finder": finder, 84 | "sidebar": sidebar, 85 | } 86 | return layout, objects 87 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py38, py39, py310 3 | 4 | [gh-actions] 5 | python = 6 | 3.8: py38 7 | 3.9: py39 8 | 3.10: py310 9 | 10 | [testenv:test_api] 11 | deps = 12 | # standard testing 13 | pytest 14 | coverage 15 | # text generation 16 | faker 17 | # pseudo-random str-to-float 18 | shaffle 19 | # common NLP and vectorizers 20 | spacy 21 | # dimensionality reduction 22 | ivis[cpu]>=1.7 23 | tensorflow>=2.9 24 | # distant supervision 25 | snorkel>=0.9.8 26 | # utility wrappers 27 | 
wrappy>=0.2.6 28 | # exporting excel files 29 | openpyxl 30 | 31 | commands = 32 | # get extra dependencies that deps do not cover 33 | python -m spacy download en_core_web_md 34 | # run tests and get coverage report 35 | coverage run --source=./hover -m pytest -m "not benchmark" {posargs} 36 | coverage xml -o cobertura.xml 37 | 38 | install_command = 39 | python -m pip install --upgrade {opts} {packages} 40 | 41 | [testenv:test_api_lite] 42 | # low-dependency fast test suite for compatibility 43 | deps = 44 | pytest 45 | faker 46 | shaffle 47 | spacy 48 | snorkel>=0.9.8 49 | wrappy>=0.2.6 50 | openpyxl 51 | 52 | commands = 53 | python -m spacy download en_core_web_md 54 | # run tests with "lite" mark 55 | pytest -m lite {posargs} 56 | 57 | install_command = 58 | python -m pip install --upgrade {opts} {packages} 59 | 60 | [testenv:test_api_benchmark] 61 | deps = 62 | pytest 63 | faker 64 | shaffle 65 | wrappy>=0.2.6 66 | 67 | commands = 68 | # specify threads limit for numpy and polars 69 | export OMP_NUM_THREADS=1 POLARS_MAX_THREADS=1 70 | # run tests with "benchmark" mark 71 | pytest -m benchmark {posargs} 72 | 73 | install_command = 74 | python -m pip install --upgrade {opts} {packages} 75 | 76 | [testenv:test_doc_scripts] 77 | allowlist_externals = 78 | git 79 | cp 80 | rm 81 | deps = -rdocs/pipelines/requirements-doc-scripts.txt 82 | 83 | commands = 84 | python -m spacy download en_core_web_md 85 | git clone https://github.com/phurwicz/hover-binder 86 | cp -r hover-binder/local_lib ./local_lib 87 | cp -r hover-binder/custom_cache ./custom_cache 88 | rm -rf hover-binder 89 | python docs/pipelines/check_scripts.py 90 | rm -rf local_lib 91 | rm -rf custom_cache 92 | 93 | [testenv:test_notebook_generation] 94 | allowlist_externals = 95 | mkdir 96 | deps = -rdocs/pipelines/requirements-doc-scripts.txt 97 | 98 | commands = 99 | python -m spacy download en_core_web_md 100 | mkdir custom_cache 101 | python docs/pipelines/generate_notebooks.py 102 | 103 | 
def node_data_from_uf_array(arr):
    """Subroutine for testing utility: collect each node's own data."""
    return [item.data for item in arr]


def find_data_from_uf_array(arr):
    """Subroutine for testing utility: collect the data of each node's root."""
    return [item.find().data for item in arr]


def counts_from_uf_array(arr):
    """Subroutine for testing utility: collect each node's component count."""
    return [item.count for item in arr]


def check_unionfind(arr, nodes, finds, counts):
    """Assert that a union-find array matches the expected node data, root data, and counts."""
    observed = (
        node_data_from_uf_array(arr),
        find_data_from_uf_array(arr),
        counts_from_uf_array(arr),
    )
    assert observed == (nodes, finds, counts)
@pytest.mark.lite
def test_rootunionfind():
    """After each union, every member of a component reports the root's data."""
    arr = [RootUnionFind(i) for i in range(8)]

    # (left, right, expected node data, expected root data, expected counts)
    scenarios = [
        (
            0,
            1,
            [0, 0, 2, 3, 4, 5, 6, 7],
            [0, 0, 2, 3, 4, 5, 6, 7],
            [2, 2, 1, 1, 1, 1, 1, 1],
        ),
        (
            1,
            2,
            [0, 0, 0, 3, 4, 5, 6, 7],
            [0, 0, 0, 3, 4, 5, 6, 7],
            [3, 3, 3, 1, 1, 1, 1, 1],
        ),
        (
            3,
            4,
            [0, 0, 0, 3, 3, 5, 6, 7],
            [0, 0, 0, 3, 3, 5, 6, 7],
            [3, 3, 3, 2, 2, 1, 1, 1],
        ),
        (
            4,
            2,
            [3, 3, 3, 3, 3, 5, 6, 7],
            [3, 3, 3, 3, 3, 5, 6, 7],
            [5, 5, 5, 5, 5, 1, 1, 1],
        ),
    ]
    for left, right, nodes, finds, counts in scenarios:
        arr[left].union(arr[right])
        check_unionfind(arr, nodes, finds, counts)
4 | 5 | {!docs/snippets/html/thebe.html!} 6 | {!docs/snippets/markdown/binder-kernel.md!} 7 | {!docs/snippets/markdown/component-tutorial.md!} 8 | {!docs/snippets/markdown/local-dependency.md!} 9 | {!docs/snippets/markdown/local-dep-text.md!} 10 | {!docs/snippets/markdown/local-dep-snorkel.md!} 11 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!} 12 | 13 | ## **Preparation** 14 | 15 | {!docs/snippets/markdown/dataset-prep.md!} 16 | 17 | ## **Labeling Functions** 18 | 19 | Labeling functions are functions that **take a `pd.DataFrame` row and return a label or abstain**. 20 | 21 | Inside the function one can do many things, but let's start with simple keywords wrapped in regex: 22 | 23 | ??? info "About the decorator @labeling_function" 24 | ::: hover.utils.snorkel_helper.labeling_function 25 | 26 |
27 | {!docs/snippets/py/t7-0-lf-list.txt!}
28 | 

29 | 30 |
31 | {!docs/snippets/py/t7-0a-lf-list-edit.txt!}
32 | 

33 | 34 | ### **Using a Function to Apply Labels** 35 | 36 | Hover's `SnorkelExplorer` (short as `snorkel`) can take the labeling functions above and apply them on areas of data that you choose. The widget below is responsible for labeling: 37 | 38 | {!docs/snippets/markdown/jupyterlab-js-issue.md!} 39 | 40 |
41 | {!docs/snippets/py/tz-bokeh-notebook-common.txt!}
42 | 
43 | {!docs/snippets/py/tz-bokeh-notebook-remote.txt!}
44 | 
45 | {!docs/snippets/py/t7-1-snorkel-apply-button.txt!}
46 | 

47 | 48 | ### **Using a Function to Apply Filters** 49 | 50 | Any function that labels is also a function that filters. The filter condition is `"keep if did not abstain"`. The widget below handles filtering: 51 | 52 | {!docs/snippets/markdown/jupyterlab-js-issue.md!} 53 | 54 |
55 | {!docs/snippets/py/t7-2-snorkel-filter-button.txt!}
56 | 

57 | 58 | Unlike the toggled filters for `finder` and `softlabel`, filtering with functions is on a per-click basis. In other words, this particular filtration doesn't persist when you select another area. 59 | 60 | ## **Dynamic List of Functions** 61 | 62 | Python lists are mutable, and we are going to take advantage of that for improvising and editing labeling functions on the fly. 63 | 64 | Run the block below and open the resulting URL to launch a recipe. 65 | 66 | - labeling functions are evaluated against the `dev` set. 67 | - hence you are advised to send the labels produced by these functions to the `train` set, not the `dev` set. 68 | - come back and edit the list of labeling functions **in-place** in one of the code cells above. 69 | - then go to the launched app and refresh the functions! 70 | 71 |
72 | {!docs/snippets/py/t7-3-snorkel-crosscheck.txt!}
73 | 
74 | {!docs/snippets/py/tz-bokeh-show-server.txt!}
75 | 
76 | {!docs/snippets/py/tz-bokeh-show-notebook.txt!}
77 | 
class VectorDataset(Dataset):
    """
    PyTorch Dataset of vectors and probabilistic classification targets.
    """

    DEFAULT_LOADER_KWARGS = dict(batch_size=64, shuffle=True, drop_last=False)

    def __init__(self, input_vectors, output_vectors):
        """Store paired input/output vectors as float tensors."""
        # inputs and targets must line up row for row
        assert len(input_vectors) == len(output_vectors)
        self.input_tensor = torch.FloatTensor(np.asarray(input_vectors))
        self.output_tensor = torch.FloatTensor(np.asarray(output_vectors))

    def __getitem__(self, index):
        """Return (input, target, index) so consumers can trace rows back."""
        return self.input_tensor[index], self.output_tensor[index], index

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.input_tensor)

    def loader(self, **kwargs):
        """Build a DataLoader over this dataset; kwargs override the class defaults."""
        options = {**self.__class__.DEFAULT_LOADER_KWARGS, **kwargs}
        return DataLoader(dataset=self, **options)
def label_smoothing(probabilistic_labels, coefficient=0.1):
    """
    Smooth probabilistic labels, auto-detecting the number of classes.

    :param probabilistic_labels: N by num_classes tensor
    :type probabilistic_labels: torch.Tensor or numpy.ndarray
    :param coefficient: the smoothing coefficient for soft labels.
    :type coefficient: float
    """
    assert (
        len(probabilistic_labels.shape) == 2
    ), f"Expected 2 dimensions, got shape {probabilistic_labels.shape}"
    # a coefficient outside [0, 1] would produce out-of-range pseudo-probabilities;
    # the original check only rejected negative values
    assert (
        0.0 <= coefficient <= 1.0
    ), f"Expected smoothing coefficient in [0, 1], got {coefficient}"
    num_classes = probabilistic_labels.shape[-1]
    # blend each row with the uniform distribution over classes
    return (1.0 - coefficient) * probabilistic_labels + coefficient / num_classes

22 | <%= "███" %> 23 |

24 | """ 25 | 26 | EMBEDDING_FIELD_PREFIX = hover.config[Section.DATA_COLUMNS][Key.EMBEDDING_FIELD_PREFIX] 27 | EMBEDDING_FIELD_REGEX = r"\d+d_\d+$" 28 | 29 | 30 | def embedding_field(total_dim, specific_dim): 31 | return f"{EMBEDDING_FIELD_PREFIX}{total_dim}d_{specific_dim}" 32 | 33 | 34 | def is_embedding_field(column_name): 35 | if not column_name.startswith(EMBEDDING_FIELD_PREFIX): 36 | return False 37 | return bool(re.search(EMBEDDING_FIELD_REGEX, column_name)) 38 | 39 | 40 | def blank_callback_on_change(attr, old, new): 41 | return None 42 | 43 | 44 | def dataset_help_widget(): 45 | text = 'Dataset Widgets Help' 46 | return Div(text=text) 47 | 48 | 49 | def dataset_default_sel_table_columns(feature_key): 50 | """ 51 | ???+ note "Default `SupervisableDataset` selection table columns based on feature type." 52 | 53 | Always allow multi-selection and editing. Based on feature type: 54 | - increases row height for viewing images. 55 | """ 56 | # disable editing the feature through a blank editor 57 | feature_col_kwargs = dict(editor=CellEditor()) 58 | if feature_key == "text": 59 | feature_col_kwargs["formatter"] = HTMLTemplateFormatter( 60 | template="""<%= value %>""" 61 | ) 62 | elif feature_key == "image": 63 | style = hover.config[Section.VISUAL][Key.TABLE_IMG_STYLE] 64 | # width is easily adjustable on the UI, no need to make configurable here 65 | feature_col_kwargs["width"] = 200 66 | feature_col_kwargs["formatter"] = HTMLTemplateFormatter( 67 | template=f""" style="{style}">""", 68 | ) 69 | elif feature_key == "audio": 70 | feature_col_kwargs["width"] = 50 71 | feature_col_kwargs["formatter"] = HTMLTemplateFormatter( 72 | template="""""", 73 | ) 74 | else: 75 | raise ValueError(f"Unsupported feature type {feature_key}") 76 | 77 | columns = [ 78 | TableColumn(field=feature_key, title=feature_key, **feature_col_kwargs), 79 | TableColumn(field="label", title="label"), 80 | ] 81 | return columns 82 | 83 | 84 | def 
class DimensionalityReducer(Loggable):
    def __init__(self, array):
        """
        ???+ note "Link self to the shared input array for reduction methods."
        | Param   | Type         | Description               |
        | :------ | :----------- | :------------------------ |
        | `array` | `np.ndarray` | the input array to fit on |
        """
        self.reference_array = array

    @staticmethod
    def create_reducer(method=DEFAULT_REDUCTION_METHOD, *args, **kwargs):
        """
        ???+ note "Handle kwarg translation and dynamic imports."

        | Param      | Type   | Description              |
        | :--------- | :----- | :----------------------- |
        | `method`   | `str`  | `"umap"` or `"ivis"`     |
        | `*args`    |        | forwarded to the reducer |
        | `**kwargs` |        | translated and forwarded |
        """
        # import lazily: umap / ivis are heavy optional dependencies
        if method == "umap":
            import umap

            reducer_cls = umap.UMAP
        elif method == "ivis":
            import ivis

            reducer_cls = ivis.Ivis
        else:
            raise ValueError("Expected 'umap' or 'ivis' as reduction method")

        # rename kwargs that go by a different name in the chosen backend
        translated_kwargs = kwargs.copy()
        for _key, _value in kwargs.items():
            _trans_dict = KWARG_TRANSLATOR.get(_key, {})
            if method in _trans_dict:
                _trans_key = _trans_dict[method]
                translated_kwargs.pop(_key)
                translated_kwargs[_trans_key] = _value

        reducer = reducer_cls(*args, **translated_kwargs)
        return reducer

    def fit_transform(self, method=DEFAULT_REDUCTION_METHOD, *args, **kwargs):
        """
        ???+ note "Fit and transform an array and store the reducer."
        | Param      | Type   | Description              |
        | :--------- | :----- | :----------------------- |
        | `method`   | `str`  | `"umap"` or `"ivis"`     |
        | `*args`    |        | forwarded to the reducer |
        | `**kwargs` |        | forwarded to the reducer |
        """
        # BUGFIX: pass `method` positionally. The previous call
        # `create_reducer(method=method, *args, **kwargs)` raised
        # TypeError ("got multiple values for argument 'method'")
        # whenever *args was non-empty, because positional args start
        # filling the `method` slot of create_reducer.
        reducer = DimensionalityReducer.create_reducer(method, *args, **kwargs)
        embedding = reducer.fit_transform(self.reference_array)
        # keep the fitted reducer around, addressable by method name
        setattr(self, method, reducer)
        return embedding

    def transform(self, array, method=DEFAULT_REDUCTION_METHOD):
        """
        ???+ note "Transform an array with an already-fitted reducer."
        | Param      | Type         | Description              |
        | :--------- | :----------- | :----------------------- |
        | `array`    | `np.ndarray` | the array to transform   |
        | `method`   | `str`        | `"umap"` or `"ivis"`     |
        """
        assert isinstance(array, np.ndarray), f"Expected np.ndarray, got {type(array)}"
        # edge case: empty array -> nothing to transform
        if array.shape[0] < 1:
            return np.array([])

        reducer = getattr(self, method)
        return reducer.transform(array)
43 | self.n_vecs = self.arrays[0].shape[0] 44 | for _arr in self.arrays: 45 | assert _arr.shape[0] == self.n_vecs 46 | 47 | def standardize(self): 48 | """ 49 | Standardize each array to the Procrustes form where 50 | - tr(A^T A) = 1 51 | - A.mean(axis=0) = 0 52 | """ 53 | 54 | def transform(arr): 55 | matrix, _, _ = procrustes(arr, arr) 56 | return matrix 57 | 58 | self.arrays = [transform(_arr) for _arr in self.arrays] 59 | 60 | def unfold(self, method=None, **kwargs): 61 | """ 62 | Compute lower-dimensional manifolds. 63 | :param method: the dimensionality reduction method to use. 64 | :type method: str 65 | """ 66 | if method is None: 67 | method = DEFAULT_REDUCTION_METHOD 68 | 69 | # default kwargs should fix random state and seed 70 | # so that randomness does not introduce disparity 71 | use_kwargs = self.__class__.DEFAULT_UNFOLD_KWARGS.get(method, {}).copy() 72 | use_kwargs.update(kwargs) 73 | self.manifolds = [] 74 | self._info(f"Running {method}...") 75 | for _arr in tqdm(self.arrays, total=len(self.arrays)): 76 | _reducer = DimensionalityReducer(_arr) 77 | _manifold = _reducer.fit_transform(method, **use_kwargs) 78 | self.manifolds.append(_manifold) 79 | self._good("unfolded arrays into manifolds") 80 | 81 | def procrustes(self, arrays=None): 82 | """ 83 | Run Procrustes analysis, optionally on a specified list of arrays. 
84 | """ 85 | if arrays is None: 86 | arrays = self.manifolds 87 | disparities = [] 88 | fit_arrays = [] 89 | 90 | # fit each array to its fitted predecessor 91 | for i, _arr in enumerate(arrays): 92 | if i == 0: 93 | # fit the first array to itself 94 | _, _matrix, _disparity = procrustes(_arr, _arr) 95 | else: 96 | _, _matrix, _disparity = procrustes(fit_arrays[i - 1], _arr) 97 | disparities.append(_disparity) 98 | fit_arrays.append(_matrix) 99 | 100 | self._good("carried out Procrustes analysis") 101 | return fit_arrays, disparities 102 | -------------------------------------------------------------------------------- /tests/core/explorer/test_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | Corresponds to the `hover.core.explorer.feature` module. 3 | For mechanisms that are invariant across `hover.core.explorer.functionality`. 4 | """ 5 | 6 | import pytest 7 | import math 8 | from hover.recipes.subroutine import get_explorer_class 9 | from tests.local_config import VECTORIZER_BREAKER 10 | from .local_helper import ( 11 | FUNCTIONALITY_TO_SPECIAL_ARGS, 12 | subroutine_search_source_response, 13 | ) 14 | 15 | MAIN_FUNCTIONALITIES = list(FUNCTIONALITY_TO_SPECIAL_ARGS.keys()) 16 | 17 | 18 | def subroutine_searchable_explorer(dataset, functionality, feature): 19 | explorer_cls = get_explorer_class(functionality, feature) 20 | subset_mapping = explorer_cls.DEFAULT_SUBSET_MAPPING.copy() 21 | special_args = FUNCTIONALITY_TO_SPECIAL_ARGS[functionality] 22 | explorer = explorer_cls.from_dataset(dataset, subset_mapping, *special_args) 23 | explorer.activate_search() 24 | return explorer 25 | 26 | 27 | @pytest.mark.core 28 | class TestBokehForText: 29 | @staticmethod 30 | @pytest.mark.lite 31 | def test_search(example_text_dataset): 32 | for _functionality in MAIN_FUNCTIONALITIES: 33 | _explorer = subroutine_searchable_explorer( 34 | example_text_dataset, 35 | _functionality, 36 | "text", 37 | ) 38 | 39 | def 
@pytest.mark.core
class TestBokehForImage:
    @staticmethod
    @pytest.mark.lite
    def test_search(example_image_dataset):
        """Similarity search should respond to valid images and reject the breaker."""
        for _functionality in MAIN_FUNCTIONALITIES:
            _explorer = subroutine_searchable_explorer(
                example_image_dataset,
                _functionality,
                "image",
            )

            def select_image_0():
                _explorer.search_sim.value = _explorer.dfs["raw"]["image"][0]

            def select_image_1():
                _explorer.search_sim.value = _explorer.dfs["raw"]["image"][1]

            def break_vectorizer():
                _explorer.search_sim.value = VECTORIZER_BREAKER

            # valid searches should update the plot source; the breaker should not
            actions_and_expected_responses = [
                (select_image_0, True),
                (select_image_1, True),
                (break_vectorizer, False),
            ]
            subroutine_search_source_response(
                _explorer,
                actions_and_expected_responses,
            )
> `SupervisableDataset` holds your data throughout the labeling process. 2 | > 3 | > :speedboat: Let's take a look at its core mechanisms. 4 | 5 | {!docs/snippets/html/thebe.html!} 6 | {!docs/snippets/markdown/binder-kernel.md!} 7 | {!docs/snippets/markdown/component-tutorial.md!} 8 | {!docs/snippets/markdown/local-dependency.md!} 9 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!} 10 | 11 | ## **Data Subsets** 12 | 13 | We place unlabeled data and labeled data in different subsets: "raw", "train", "dev", and "test". Unlabeled data start from the "raw" subset, and can be transferred to other subsets after it gets labeled. 14 | 15 | `SupervisableDataset` uses a "population table", `dataset.pop_table`, to show the size of each subset: 16 | 17 |
18 | {!docs/snippets/py/tz-dataset-text-full.txt!}
19 | 

20 | 21 |
22 | {!docs/snippets/py/tz-bokeh-notebook-common.txt!}
23 | 
24 | {!docs/snippets/py/tz-bokeh-notebook-remote.txt!}
25 | 
26 | {!docs/snippets/py/t3-0-dataset-population-table.txt!}
27 | 

28 | 29 | ### **Transfer Data Between Subsets** 30 | 31 | `COMMIT` and `DEDUP` are the mechanisms that `hover` uses to transfer data between subsets. 32 | 33 | - `COMMIT` copies selected points (to be discussed later) to a destination subset 34 | - labeled-raw-only: `COMMIT` automatically detects which points are in the raw set with a valid label. Other points will not get copied. 35 | - keep-last: you can commit the same point to the same subset multiple times and the last copy will be kept. This can be useful for revising labels before `DEDUP`. 36 | - `DEDUP` removes duplicates (identified by feature value) across subsets 37 | - priority rule: test > dev > train > raw, i.e. test set data always gets kept during deduplication 38 | 39 | ???+ info "FAQ" 40 | ??? help "Why does COMMIT only work on the raw subset?" 41 | Most selections will happen through plots, where different subsets are on top of each other. This means selections can contain both unlabeled and labeled points. 42 | 43 | Way too often we find ourselves trying to view both the labeled and the unlabeled, but only moving the unlabeled "raw" points. So it's handy that COMMIT picks those points only. 44 | 45 | These mechanisms correspond to buttons in `hover`'s annotation interface, which you have encountered in the quickstart: 46 | 47 | {!docs/snippets/markdown/jupyterlab-js-issue.md!} 48 | 49 |
50 | {!docs/snippets/py/t3-1-dataset-commit-dedup.txt!}
51 | 

52 | 53 | Of course, so far we have nothing to move, because there's no data selected. We shall now discuss selections. 54 | 55 | ## **Selection** 56 | 57 | `hover` labels data points in bulk, which requires selecting groups of homogeneous data, i.e. semantically similar or going to have the same label. Being able to skim through what you selected gives you confidence about homogeneity. 58 | 59 | Normally, selection happens through a plot (`explorer`), as we have seen in the quickstart. For the purpose here, we will "cheat" and assign the selection programmatically: 60 | 61 |
62 | {!docs/snippets/py/t3-2-dataset-selection-table.txt!}
63 | 

64 | 65 | ### **Edit Data Within a Selection** 66 | 67 | Often the points selected are not perfectly homogeneous, i.e. some outliers belong to a different label from the selected group overall. It would be helpful to `EVICT` them, and `SupervisableDataset` has a button for it. 68 | 69 | Sometimes you may also wish to edit data values on the fly. In hover this is called `PATCH`, and there also is a button for it. 70 | 71 | - by default, labels can be edited but feature values cannot. 72 | 73 | Let's plot the forementioned buttons along with the selection table. Toggle any number of rows in the table, then click the button to `EVICT` or `PATCH` those rows: 74 | 75 | {!docs/snippets/markdown/jupyterlab-js-issue.md!} 76 | 77 |
78 | {!docs/snippets/py/t3-3-dataset-evict-patch.txt!}
79 | 

80 | 81 | 82 | {!docs/snippets/html/stylesheet.html!} 83 | -------------------------------------------------------------------------------- /notebooks/Image-Experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "4e78f829-57e4-4bb8-b696-9173057943fe", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import sys\n", 11 | "sys.path.append('../')" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "dba2348a-bb21-4896-a7be-e2cf4d4daff5", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "ced47d3d-e030-4136-87b8-377b302a84d6", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "\n", 31 | "df = pd.read_csv('imagenet_custom.csv').sample(frac=1.0).reset_index(drop=True)\n", 32 | "df.head()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "1e1b71ea-b811-4810-9915-9f542829fe13", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "df['SUBSET'] = 'raw'\n", 43 | "df['image1'] = df['image']\n", 44 | "df.loc[500:800, 'SUBSET'] = 'train'\n", 45 | "df.loc[800:900, 'SUBSET'] = 'dev'\n", 46 | "df.loc[900:, 'SUBSET'] = 'test'" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "b2f81c8a-66ba-400a-9d2d-54abb1a852ab", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "import pandas as pd\n", 57 | "from hover.core.dataset import SupervisableImageDataset\n", 58 | "\n", 59 | "# skip this block if EXPORT_PATH does not have a corresponding file\n", 60 | "dataset = SupervisableImageDataset.from_pandas(df)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "8ee2b295-343c-4e32-a651-c2a0fe2c9a45", 67 | "metadata": {}, 68 | "outputs": [], 69 | 
"source": [ 70 | "from fixture_module import image_vector_net\n", 71 | "\n", 72 | "vectorizer = image_vector_net.get_vectorizer()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "727fab83-57b7-4267-a1ca-2f76fa863ad7", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "dataset.compute_nd_embedding(vectorizer, \"umap\", dimension=2)\n", 83 | "dataset.dfs[\"raw\"].head(5)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "e2877436-e5c3-483c-85ae-84b66da59977", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "from hover.recipes import simple_annotator\n", 94 | "from hover.utils.bokeh_helper import bokeh_hover_tooltip\n", 95 | "from bokeh.io import show, output_notebook\n", 96 | "\n", 97 | "handle = simple_annotator(\n", 98 | " dataset.copy(), width=800, height=600,\n", 99 | " #tooltips=bokeh_hover_tooltip(label={\"label\": \"Label\"}, image={\"image\": 60, \"image1\": 80}),\n", 100 | ")\n", 101 | "\n", 102 | "output_notebook()\n", 103 | "show(handle)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "6264287b-0e98-4785-9ee1-d5e22cd54256", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "2c2535ed-de5a-49e5-ab52-6b9ac6852ab8", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 3 (ipykernel)", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.9.7" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 5 144 | } 145 | 
-------------------------------------------------------------------------------- /tests/core/test_neural.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from copy import deepcopy 4 | from hover.core.neural import VectorNet 5 | from hover.module_config import DataFrame 6 | 7 | 8 | @pytest.fixture 9 | def example_vecnet_args(example_text_dataset): 10 | module_name = "fixture_module.text_vector_net" 11 | target_labels = example_text_dataset.classes[:] 12 | return (module_name, target_labels) 13 | 14 | 15 | @pytest.fixture 16 | def blank_vecnet(): 17 | model = VectorNet.from_module("fixture_module.text_vector_net", [], verbose=10) 18 | return model 19 | 20 | 21 | @pytest.fixture 22 | def example_vecnet(example_vecnet_args): 23 | model = VectorNet.from_module(*example_vecnet_args, verbose=10) 24 | return model 25 | 26 | 27 | def subroutine_predict_proba(net, dataset): 28 | num_classes = len(dataset.classes) 29 | proba_single = net.predict_proba("hello") 30 | assert proba_single.shape[0] == num_classes 31 | proba_multi = net.predict_proba(["hello", "bye", "ciao"]) 32 | assert proba_multi.shape[0] == 3 33 | assert proba_multi.shape[1] == num_classes 34 | 35 | 36 | @pytest.mark.core 37 | class TestVectorNet(object): 38 | """ 39 | For the VectorNet base class. 
40 | """ 41 | 42 | @staticmethod 43 | @pytest.mark.lite 44 | def test_save_and_load(example_vecnet, example_vecnet_args): 45 | default_path = example_vecnet.nn_update_path 46 | example_vecnet.save(f"{default_path}.test") 47 | loaded_vecnet = VectorNet.from_module(*example_vecnet_args) 48 | loaded_vecnet.save() 49 | 50 | @staticmethod 51 | @pytest.mark.lite 52 | def test_auto_adjust_setup(blank_vecnet, example_text_dataset): 53 | vecnet = deepcopy(blank_vecnet) 54 | targets = example_text_dataset.classes 55 | old_classes = sorted( 56 | vecnet.label_encoder.keys(), 57 | key=lambda k: vecnet.label_encoder[k], 58 | ) 59 | old_nn = vecnet.nn 60 | # normal change of classes should create a new NN 61 | vecnet.auto_adjust_setup(targets) 62 | first_nn = vecnet.nn 63 | assert first_nn is not old_nn 64 | # identical classes should trigger autoskip 65 | vecnet.auto_adjust_setup(targets) 66 | second_nn = vecnet.nn 67 | assert second_nn is first_nn 68 | # change of class order should create a new NN 69 | vecnet.auto_adjust_setup(targets[1:] + targets[:1]) 70 | third_nn = vecnet.nn 71 | assert third_nn is not second_nn 72 | vecnet.auto_adjust_setup(old_classes) 73 | 74 | @staticmethod 75 | @pytest.mark.lite 76 | def test_adjust_optimier_params(example_vecnet): 77 | example_vecnet.adjust_optimizer_params() 78 | 79 | @staticmethod 80 | @pytest.mark.lite 81 | def test_predict_proba(example_vecnet, example_text_dataset): 82 | subroutine_predict_proba(example_vecnet, example_text_dataset) 83 | 84 | @staticmethod 85 | def test_manifold_trajectory(example_vecnet, example_raw_df): 86 | for _method in ["umap", "ivis"]: 87 | traj_arr, seq_arr, disparities = example_vecnet.manifold_trajectory( 88 | DataFrame.series_tolist(example_raw_df["text"]) 89 | ) 90 | assert isinstance(traj_arr, np.ndarray) 91 | assert isinstance(seq_arr, np.ndarray) 92 | assert isinstance(disparities, list) 93 | assert isinstance(disparities[0], float) 94 | 95 | @staticmethod 96 | def 
test_train_and_evaluate(example_vecnet, example_text_dataset): 97 | vecnet = deepcopy(example_vecnet) 98 | dataset = example_text_dataset 99 | dev_loader = dataset.loader("dev", example_vecnet.vectorizer) 100 | test_loader = dataset.loader("test", example_vecnet.vectorizer) 101 | 102 | train_info = vecnet.train(dev_loader, dev_loader, epochs=5) 103 | accuracy, conf_mat = vecnet.evaluate(test_loader) 104 | 105 | assert isinstance(train_info, list) 106 | assert isinstance(train_info[0], dict) 107 | assert isinstance(accuracy, float) 108 | assert isinstance(conf_mat, np.ndarray) 109 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Hover 2 | site_description: "Hover and label data rapidly." 3 | site_url: "https://phurwicz.github.io/hover" 4 | repo_url: "https://github.com/phurwicz/hover.git" 5 | repo_name: "phurwicz/hover" 6 | 7 | theme: 8 | name: material 9 | icon: 10 | logo: material/alpha-h-box 11 | favicon: images/favicon.png 12 | font: 13 | text: Roboto 14 | code: Roboto Mono 15 | features: 16 | - navigation.expand 17 | - navigation.tabs 18 | - search.suggest 19 | - toc.integrate 20 | palette: 21 | # Palette toggle for light mode 22 | - scheme: default 23 | toggle: 24 | icon: material/weather-night 25 | name: Switch to dark mode 26 | 27 | # Palette toggle for dark mode 28 | - scheme: slate 29 | toggle: 30 | icon: material/weather-sunny 31 | name: Switch to light mode 32 | 33 | nav: 34 | - Home: 'index.md' 35 | - 'Basics': 36 | - 'Quickstart': 'pages/tutorial/t0-quickstart.md' 37 | - 'Using Recipes': 'pages/tutorial/t1-active-learning.md' 38 | - 'Handling Images': 'pages/guides/g0-datatype-image.md' 39 | - 'Handling Audio': 'pages/guides/g1-datatype-audio.md' 40 | - 'Mechanisms': 41 | - 'Managing Data': 'pages/tutorial/t3-dataset-population-selection.md' 42 | - 'Applying Labels': 
'pages/tutorial/t4-annotator-dataset-interaction.md' 43 | - 'Options': 44 | - 'Host Options': 'pages/tutorial/t2-bokeh-app.md' 45 | - 'Custom Config': 'pages/guides/g2-hover-config.md' 46 | - 'Powerful Tricks': 47 | - 'Finder & Selection Filter': 'pages/tutorial/t5-finder-filter.md' 48 | - 'Soft Label & Joint Filters': 'pages/tutorial/t6-softlabel-joint-filter.md' 49 | - 'Custom Labeling Functions': 'pages/tutorial/t7-snorkel-improvise-rules.md' 50 | # - 'Data Type: Multimodal': 'pages/topics/datatype-multimodal.md' 51 | #- 'Why Hover': 'pages/topics/what-hover-is.md' 52 | #- 'Customized Usage': 53 | # - 'API Levels': 'pages/topics/api-levels.md' # discuss the interaction between recipe / dataset / explorer 54 | # - 'Custom Recipe': 'pages/topics/custom-recipe.md' # discuss the caveats when making a recipe 55 | # - 'Subclassing Dataset': 'pages/topics/custom-dataset.md' # discuss the caveats when subclassing a SupervisableDataset 56 | # - 'Subclassing Explorer': 'pages/topics/custom-explorer.md' # discuss the caveats when subclassing a BokehBaseExplorer 57 | - 'API Reference': 58 | - 'hover.recipes': 'pages/reference/recipes.md' 59 | - 'hover.core': 60 | - '.dataset': 'pages/reference/core-dataset.md' 61 | - '.explorer': 62 | - '.base': 'pages/reference/core-explorer-base.md' 63 | - '.feature': 'pages/reference/core-explorer-feature.md' 64 | - '.functionality': 'pages/reference/core-explorer-functionality.md' 65 | - '.specialization': 'pages/reference/core-explorer-specialization.md' 66 | - '.neural': 'pages/reference/core-neural.md' 67 | - '.representation': 'pages/reference/core-representation.md' 68 | - 'hover.utils': 69 | - '.bokeh_helper': 'pages/reference/utils-bokeh_helper.md' 70 | - '.snorkel_helper': 'pages/reference/utils-snorkel_helper.md' 71 | 72 | markdown_extensions: 73 | - admonition 74 | - def_list 75 | - markdown_include.include 76 | - pymdownx.critic 77 | - pymdownx.details 78 | - pymdownx.emoji 79 | - pymdownx.superfences 80 | - pymdownx.tabbed: 
81 | alternate_style: true 82 | 83 | plugins: 84 | - macros 85 | - search: 86 | - mkdocstrings: 87 | default_handler: python 88 | handlers: 89 | python: 90 | rendering: 91 | show_root_heading: true 92 | show_source: true 93 | watch: 94 | - hover 95 | - i18n: 96 | default_language: en 97 | languages: 98 | en: English 99 | # fr: français 100 | zh: 简体中文 101 | nav_translations: 102 | zh: 103 | Home: 主页 104 | Basics: 基础使用 105 | Mechanisms: 理解机制 106 | Options: 自定配置 107 | Powerful Tricks: 高级技巧 108 | API Reference: API 指南 109 | 110 | extra: 111 | version: 112 | provider: mike 113 | analytics: 114 | provider: google 115 | property: G-M3WR5YEJ33 116 | -------------------------------------------------------------------------------- /tests/recipes/test_experimental.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from hover.recipes.experimental import ( 4 | _active_learning, 5 | _snorkel_crosscheck, 6 | active_learning, 7 | snorkel_crosscheck, 8 | ) 9 | from hover.module_config import DataFrame as DF 10 | from bokeh.events import ButtonClick, SelectionGeometry 11 | from .local_helper import execute_handle_function 12 | 13 | 14 | def test_active_learning(example_text_dataset, dummy_vecnet_callback): 15 | def read_scores(dataset, subset): 16 | return DF.series_values(dataset.dfs[subset]["pred_score"]).copy() 17 | 18 | dataset = example_text_dataset.copy() 19 | vecnet = dummy_vecnet_callback(dataset) 20 | layout, objects = _active_learning(dataset, vecnet) 21 | assert layout.visible 22 | 23 | initial_scores = read_scores(dataset, "raw") 24 | 25 | finder, annotator = objects["finder"], objects["annotator"] 26 | softlabel = objects["softlabel"] 27 | coords_slider = softlabel._dynamic_widgets["patch_slider"] 28 | model_trainer = objects["model_trainer"] 29 | train_event = ButtonClick(model_trainer) 30 | 31 | # train for default number of epochs 32 | model_trainer._trigger_event(train_event) 33 | 
first_scores = read_scores(dataset, "raw") 34 | assert not np.allclose(first_scores, initial_scores) 35 | 36 | # emulating user interaction: slide coords to view manifold trajectory 37 | for _value in range(1, min(coords_slider.end + 1, 4)): 38 | coords_slider.value = _value 39 | 40 | # train for 1 more epoch 41 | model_trainer._trigger_event(train_event) 42 | second_scores = read_scores(dataset, "raw") 43 | assert not np.allclose(second_scores, first_scores) 44 | # take 25 and 75 percentiles of scores for later use 45 | range_low, range_high = np.percentile(second_scores, [25, 75]).tolist() 46 | 47 | # emulate user interface: select everything through a SelectionGeometry event 48 | total_raw = softlabel.dfs["raw"].shape[0] 49 | initial_select = list(range(total_raw)) 50 | # check linked selection 51 | assert annotator.sources["raw"].selected.indices == [] 52 | softlabel.sources["raw"].selected.indices = initial_select 53 | box_select = SelectionGeometry( 54 | softlabel.figure, 55 | geometry={ 56 | "type": "poly", 57 | "sx": [-1e4, -1e4, 1e4, 1e4], 58 | "sy": [-1e4, 1e4, 1e4, -1e4], 59 | "x": [None, None, None, None], 60 | "y": [None, None, None, None], 61 | }, 62 | ) 63 | softlabel.figure._trigger_event(box_select) 64 | assert annotator.sources["raw"].selected.indices == initial_select 65 | 66 | # check score filtering 67 | # nothing happens when filter is inactive 68 | softlabel.score_range.value = (range_low, range_high) 69 | assert softlabel.sources["raw"].selected.indices == initial_select 70 | # activate score filter 71 | softlabel.score_filter_box.active = [0] 72 | first_select = softlabel.sources["raw"].selected.indices[:] 73 | assert first_select != initial_select 74 | assert set(first_select).issubset(set(initial_select)) 75 | assert first_select == annotator.sources["raw"].selected.indices 76 | 77 | # check regex co-filtering 78 | finder.search_filter_box.active = [0] 79 | finder.search_pos.value = r"(?i)s[aeiou]\ " 80 | second_select = 
softlabel.sources["raw"].selected.indices[:] 81 | assert second_select != first_select 82 | assert set(second_select).issubset(set(first_select)) 83 | 84 | # check filter interaction: untoggle score filter 85 | softlabel.score_filter_box.active = [] 86 | third_select = softlabel.sources["raw"].selected.indices[:] 87 | assert third_select != second_select 88 | assert set(second_select).issubset(set(third_select)) 89 | 90 | # deactivate regex filter too 91 | finder.search_filter_box.active = [] 92 | unfilter_select = softlabel.sources["raw"].selected.indices[:] 93 | assert unfilter_select == initial_select 94 | 95 | 96 | def test_snorkel_crosscheck(example_audio_dataset, dummy_labeling_function_list): 97 | dataset = example_audio_dataset.copy() 98 | layout, objects = _snorkel_crosscheck(dataset, dummy_labeling_function_list) 99 | assert layout.visible 100 | 101 | # TODO: add emulations of user activity 102 | assert objects 103 | 104 | 105 | @pytest.mark.lite 106 | def test_servable_experimental( 107 | example_text_dataset, 108 | dummy_vecnet_callback, 109 | dummy_labeling_function_list, 110 | ): 111 | # one dataset for each recipe 112 | dataset = example_text_dataset.copy() 113 | vecnet = dummy_vecnet_callback(dataset) 114 | active = active_learning(dataset, vecnet) 115 | 116 | dataset = example_text_dataset.copy() 117 | snorkel = snorkel_crosscheck(dataset, dummy_labeling_function_list) 118 | 119 | for handle in [active, snorkel]: 120 | execute_handle_function(handle) 121 | -------------------------------------------------------------------------------- /docs/pages/tutorial/t1-active-learning.md: -------------------------------------------------------------------------------- 1 | > The most common usage of `hover` is through built-in `recipe`s like in the quickstart. 2 | > 3 | > :ferris_wheel: Let's explore another `recipe` -- an active learning example. 
4 | 5 | {!docs/snippets/html/thebe.html!} 6 | {!docs/snippets/markdown/binder-kernel.md!} 7 | {!docs/snippets/markdown/local-dependency.md!} 8 | {!docs/snippets/markdown/local-dep-text.md!} 9 | {!docs/snippets/markdown/local-dep-jupyter-bokeh.md!} 10 | 11 | ## **Fundamentals** 12 | 13 | Hover `recipe`s are functions that take a `SupervisableDataset` and return an annotation interface. 14 | 15 | The `SupervisableDataset` is assumed to have some data and embeddings. 16 | 17 | ## **Recap: Data & Embeddings** 18 | 19 | Let's preprare a dataset with embeddings. This is almost the same as in the [quickstart](../t0-quickstart/): 20 | 21 |
 22 | {!docs/snippets/py/tz-dataset-text-full.txt!}
 23 | 

24 | 25 |
 26 | {!docs/snippets/py/t0-1-vectorizer.txt!}
 27 | 
 28 | {!docs/snippets/py/t0-1a-vectorizer-print.txt!}
 29 | 

30 | 31 |
 32 | {!docs/snippets/py/t0-2-reduction.txt!}
 33 | 

34 | 35 | ## **Recipe-Specific Ingredient** 36 | 37 | Each recipe has different functionalities and potentially different signature. 38 | 39 | To utilize active learning, we need to specify how to get a model in the loop. 40 | 41 | `hover` considers the `vectorizer` as a "frozen" embedding and follows up with a neural network, which infers its own dimensionality from the vectorizer and the output classes. 42 | 43 | - This architecture named [`VectorNet`](../../reference/core-neural/#hover.core.neural.VectorNet) is the (default) basis of active learning in `hover`. 44 | 45 | ??? info "Custom models" 46 | It is possible to use a model other than `VectorNet` or its subclass. 47 | 48 | You will need to implement the following methods with the same signatures as `VectorNet`: 49 | 50 | - [`train`](../../reference/core-neural/#hover.core.neural.VectorNet.train) 51 | - [`save`](../../reference/core-neural/#hover.core.neural.VectorNet.save) 52 | - [`predict_proba`](../../reference/core-neural/#hover.core.neural.VectorNet.predict_proba) 53 | - [`prepare_loader`](../../reference/core-neural/#hover.core.neural.VectorNet.prepare_loader) 54 | - [`manifold_trajectory`](../../reference/core-neural/#hover.core.neural.VectorNet.manifold_trajectory) 55 | 56 |
 57 | {!docs/snippets/py/t1-0-vecnet-callback.txt!}
 58 | 
 59 | {!docs/snippets/py/t1-0a-vecnet-callback-print.txt!}
 60 | 
61 | 62 | Note how the callback dynamically takes `dataset.classes`, which means the model architecture will adapt when we add classes during annotation. 63 | 64 | 65 | ## :sparkles: **Apply Labels** 66 | 67 | Now we invoke the `active_learning` recipe. 68 | 69 | ??? tip "Tips: how recipes work programmatically" 70 | In general, a `recipe` is a function taking a `SupervisableDataset` and other arguments based on its functionality. 71 | 72 | Here are a few common recipes: 73 | 74 | === "active_learning" 75 | 76 | ::: hover.recipes.experimental.active_learning 77 | rendering: 78 | show_root_heading: false 79 | show_root_toc_entry: false 80 | 81 | === "simple_annotator" 82 | 83 | ::: hover.recipes.stable.simple_annotator 84 | rendering: 85 | show_root_heading: false 86 | show_root_toc_entry: false 87 | 88 | === "linked_annotator" 89 | 90 | ::: hover.recipes.stable.linked_annotator 91 | rendering: 92 | show_root_heading: false 93 | show_root_toc_entry: false 94 | 95 | The recipe returns a `handle` function which `bokeh` can use to visualize an annotation interface in multiple settings. 96 | 97 |
 98 | {!docs/snippets/py/t1-1-active-learning.txt!}
 99 | 
100 | {!docs/snippets/py/tz-bokeh-show-server.txt!}
101 | 
102 | {!docs/snippets/py/tz-bokeh-show-notebook.txt!}
103 | 
104 | 105 | ???+ tip "Tips: annotation interface with multiple plots" 106 | ??? example "Video guide: leveraging linked selection" 107 | 108 | 109 | ???+ example "Video guide: active learning" 110 | 111 | 112 | ??? info "Text guide: active learning" 113 | Inspecting model predictions allows us to 114 | 115 | - get an idea of how the current set of annotations will likely teach the model. 116 | - locate the most valuable samples for further annotation. 117 | 118 | {!docs/snippets/html/stylesheet.html!} 119 | -------------------------------------------------------------------------------- /notebooks/archive-prototype/Programmatic-Polyselect.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import sys\n", 21 | "sys.path.append('../../')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Use a Event Trigger to Make Selections: **Seems not working**" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "import random" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from bokeh.io import output_notebook, show\n", 49 | "from bokeh.plotting import figure\n", 50 | "from bokeh.models import ColumnDataSource\n", 51 | "\n", 52 | "output_notebook()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from bokeh.models import 
RangeSlider\n", 62 | "from bokeh.layouts import column\n", 63 | "from bokeh.events import SelectionGeometry\n", 64 | "from hover.utils.bokeh_helper import servable\n", 65 | "\n", 66 | "def almost_global_select(figure):\n", 67 | " select_event = SelectionGeometry(\n", 68 | " figure,\n", 69 | " geometry={\n", 70 | " \"type\": \"poly\",\n", 71 | " \"x\": [-1e4, -1e4, 1e4, 1e4],\n", 72 | " \"y\": [-1e4, 1e4, 1e4, -1e4],\n", 73 | " \"sx\": [None, None, None, None],\n", 74 | " \"sy\": [None, None, None, None],\n", 75 | " },\n", 76 | " )\n", 77 | " return select_event\n", 78 | "\n", 79 | "@servable()\n", 80 | "def burner():\n", 81 | " \"\"\"\n", 82 | " Trying to simulate ploygon-based selections.\n", 83 | " \"\"\"\n", 84 | " df = pd.DataFrame({\n", 85 | " 'x': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n", 86 | " 'y': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n", 87 | " 'flag': [random.choice([True, False]) for i in range(100)],\n", 88 | " })\n", 89 | " \n", 90 | " source = ColumnDataSource(df)\n", 91 | " plot = figure(tools=['poly_select', 'lasso_select'])\n", 92 | " plot.circle(source=source)\n", 93 | " x_slider = RangeSlider(start=-1.0, end=1.0, value=(-0.5, 0.5), step=0.01)\n", 94 | " y_slider = RangeSlider(start=-1.0, end=1.0, value=(-0.5, 0.5), step=0.01)\n", 95 | " \n", 96 | " def slider_callback(attr, old, new):\n", 97 | " x_l, x_r = x_slider.value\n", 98 | " y_d, y_u = y_slider.value\n", 99 | " select_event = SelectionGeometry(\n", 100 | " plot,\n", 101 | " geometry={\n", 102 | " \"type\": \"poly\",\n", 103 | " \"x\": [x_l, x_l, x_r, x_r],\n", 104 | " \"y\": [y_d, y_u, y_u, y_d],\n", 105 | " #\"sx\": [None, None, None, None],\n", 106 | " #\"sy\": [None, None, None, None],\n", 107 | " },\n", 108 | " )\n", 109 | " plot._trigger_event(select_event)\n", 110 | " # use a patch to verify the polygon\n", 111 | " plot.patch([x_l, x_l, x_r, x_r], [y_d, y_u, y_u, y_d], alpha=0.2, line_width=1)\n", 112 | " # check the number of selected points\n", 113 | " 
print(len(source.selected.indices), end=\"\\r\")\n", 114 | " \n", 115 | " x_slider.on_change('value', slider_callback)\n", 116 | " y_slider.on_change('value', slider_callback)\n", 117 | " \n", 118 | " return column(x_slider, y_slider, plot)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "handle = burner()\n", 128 | "show(handle)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Python 3 (ipykernel)", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.9.7" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 4 160 | } 161 | -------------------------------------------------------------------------------- /notebooks/archive-prototype/Slider-Filter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import sys\n", 21 | "sys.path.append('../../')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Use a Slider for Filtering Data Points" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | 
"import pandas as pd\n", 38 | "import numpy as np\n", 39 | "import random" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "df = pd.DataFrame({\n", 49 | " 'x': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n", 50 | " 'y': np.random.uniform(-1.0, 1.0, size=(100,)).tolist(),\n", 51 | " 'flag': [random.choice([True, False]) for i in range(100)],\n", 52 | "})\n", 53 | "df.head()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from bokeh.io import output_notebook, show\n", 63 | "from bokeh.plotting import figure\n", 64 | "from bokeh.models import ColumnDataSource\n", 65 | "\n", 66 | "output_notebook()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from bokeh.models import RangeSlider\n", 76 | "from bokeh.layouts import column\n", 77 | "from bokeh.events import SelectionGeometry\n", 78 | "from hover.utils.bokeh_helper import servable\n", 79 | "\n", 80 | "@servable()\n", 81 | "def burner():\n", 82 | " \"\"\"\n", 83 | " Trying to intersect the last manually specified selection with a slider coords/attribute range.\n", 84 | " \"\"\"\n", 85 | " slider = RangeSlider(start=-1.0, end=1.0, value=(-0.5, 0.5), step=0.01)\n", 86 | " source = ColumnDataSource(df)\n", 87 | " plot = figure(tools=['poly_select', 'lasso_select', 'pan', 'wheel_zoom'])\n", 88 | " plot.circle(source=source)\n", 89 | " \n", 90 | " last_manual_selection = set()\n", 91 | " \n", 92 | " def subroutine(lower, upper):\n", 93 | " filter_l = set(np.where(df['y'] > lower)[0])\n", 94 | " filter_u = set(np.where(df['y'] < upper)[0])\n", 95 | " filtered = filter_l.intersection(filter_u)\n", 96 | " return filtered\n", 97 | " \n", 98 | " def selection_callback(event):\n", 99 | " \"\"\"\n", 100 | " CAUTION: this has to overwrite the last 
manual selection.\n", 101 | " Hence only manual selections should trigger this callback.\n", 102 | " \"\"\"\n", 103 | " last_manual_selection.clear()\n", 104 | " last_manual_selection.update(source.selected.indices.copy())\n", 105 | " filtered = subroutine(*slider.value)\n", 106 | " print('A')\n", 107 | " source.selected.indices = list(filtered.intersection(last_manual_selection))\n", 108 | " \n", 109 | " def foo(event):\n", 110 | " print('B')\n", 111 | " \n", 112 | " def slider_callback(attr, old, new):\n", 113 | " to_select = subroutine(*new)\n", 114 | " if last_manual_selection:\n", 115 | " to_select = to_select.intersection(last_manual_selection)\n", 116 | " source.selected.indices = list(to_select)\n", 117 | " \n", 118 | " plot.on_event(SelectionGeometry, selection_callback)\n", 119 | " plot.on_event(SelectionGeometry, foo)\n", 120 | " slider.on_change('value', slider_callback)\n", 121 | " \n", 122 | " return column(slider, plot)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "handle = burner()\n", 132 | "show(handle)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3 (ipykernel)", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.9.7" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 4 164 | } 165 | --------------------------------------------------------------------------------