├── .gitignore
├── README.md
├── demo.html
├── flake.lock
├── flake.nix
├── hammock
├── __init__.py
├── anki.py
├── cache.py
├── calibre
│ ├── core.py
│ └── detect_core.py
├── cluster.py
├── color.py
├── core.py
├── embedding.py
├── gunicorn.py
├── plot.py
├── util.py
└── web.py
├── poetry.lock
├── pyproject.toml
├── pyrightconfig.json
├── screenshot.png
├── stubs
├── InstructorEmbedding
│ └── __init__.pyi
├── arxiv
│ └── __init__.pyi
├── flask_compress
│ └── __init__.pyi
├── gunicorn
│ ├── app
│ │ └── wsgiapp.pyi
│ └── config.pyi
├── gutenbergpy
│ ├── __init__.py
│ ├── gutenbergcache.pyi
│ └── textget.pyi
├── hdbscan
│ └── __init__.pyi
├── networkx
│ └── __init__.pyi
├── nltk
│ ├── __init__.pyi
│ └── tokenize.pyi
├── numba
│ └── core
│ │ └── errors.pyi
├── numpy
│ ├── __init__.pyi
│ ├── lib
│ │ └── stride_tricks.pyi
│ └── linalg.pyi
├── plotly
│ ├── __init__.pyi
│ ├── colors
│ │ ├── __init__.pyi
│ │ ├── qualitative.pyi
│ │ └── sequential.pyi
│ ├── graph_objects
│ │ ├── __init__.pyi
│ │ └── layout
│ │ │ ├── __init__.pyi
│ │ │ └── scene.pyi
│ └── subplots.pyi
├── scipy
│ ├── __init__.pyi
│ └── spatial
│ │ └── distance.pyi
├── sentence_transformers
│ ├── __init__.pyi
│ └── models.pyi
├── sklearn
│ ├── __init__.pyi
│ ├── _base.pyi
│ ├── linear_model.pyi
│ ├── metrics.pyi
│ └── preprocessing.pyi
├── sklearn_extra
│ └── cluster.pyi
├── transformers
│ └── __init__.pyi
├── umap
│ └── __init__.pyi
└── wikipedia
│ └── __init__.pyi
├── templates
├── books.html
├── index.html
├── main.html
└── plotly.js
└── tsconfig.json
/.gitignore:
--------------------------------------------------------------------------------
1 | .direnv/
2 | dist/
3 | output/
4 | result
5 | cache/
6 | *.bak
7 | texts/
8 | .envrc
9 | *__pycache__*
10 | gutenbergindex.db
11 | private-topics.txt
12 | .venv/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Visualize and compare embeddings for text sequences
2 |
3 | This is a small project for visualizing embeddings.
4 |
5 | 
6 |
7 | See the screenshot above or the interactive demo output [here](https://raw.githack.com/colehaus/hammock-public/main/demo.html).
8 |
9 | The overall flow is:
10 |
11 | 1. One or more text sequences are split into sentences or paragraphs.
12 | 2. Each resulting text fragment is embedded using the requested embedding model.
13 | 3. The embeddings are reduced to two or three dimensions (as requested) via [UMAP](https://umap-learn.readthedocs.io/en/latest/index.html).
14 | 4. The low-dimensional embeddings are optionally clustered with [hdbscan](https://hdbscan.readthedocs.io/en/latest/index.html). A single set of embeddings can be clustered at multiple granularities.
15 | 5. Clusters are optionally summarized using the requested language model.
16 |
17 | The resulting points and cluster info are plotted in an interactive 3D scatter plot.
18 | - Plotly provides a number of default handlers for interaction.
19 | - Left and right arrow keys step through text fragments in original text order (i.e. you could, in theory, read a book by stepping through its fragments and simultaneously see how the fragments relate to each other in embedding space).
20 | - Up and down arrow keys step through clustering granularities. (i.e. you can see a very high-level "table" of contents and then "zoom in" to more and more granular "tables" of contents)
21 |
22 | There's also some support and integration with a few existing text sources. A small web interface (`python -m hammock.gunicorn`) is provided for:
23 |
24 | - Visualizing arbitrary text submitted in a textarea
25 | - Fetching one or more books from Project Gutenberg by title and visualizing them
26 | - Fetching one or more articles from Wikipedia by title and visualizing them
27 |
28 | Modules can be accessed from the command line to:
29 |
30 | - Batch process epubs from [Calibre](https://calibre-ebook.com/) (`python -m hammock.calibre.core`)
31 | - Fetch cards from an [Anki](https://apps.ankiweb.net/) database and visualize them (`python -m hammock.anki -d -p `)
32 |
33 | If you have `nix`, you can simply do `nix run` from the project directory to set up the project and launch the web server on localhost. `nix develop` will dump you into a shell with all the dependencies set up.
34 |
--------------------------------------------------------------------------------
/flake.lock:
--------------------------------------------------------------------------------
1 | {
2 | "nodes": {
3 | "flake-utils": {
4 | "inputs": {
5 | "systems": "systems"
6 | },
7 | "locked": {
8 | "lastModified": 1687709756,
9 | "narHash": "sha256-Y5wKlQSkgEK2weWdOu4J3riRd+kV/VCgHsqLNTTWQ/0=",
10 | "owner": "numtide",
11 | "repo": "flake-utils",
12 | "rev": "dbabf0ca0c0c4bce6ea5eaf65af5cb694d2082c7",
13 | "type": "github"
14 | },
15 | "original": {
16 | "owner": "numtide",
17 | "repo": "flake-utils",
18 | "type": "github"
19 | }
20 | },
21 | "flake-utils_2": {
22 | "inputs": {
23 | "systems": "systems_2"
24 | },
25 | "locked": {
26 | "lastModified": 1687709756,
27 | "narHash": "sha256-Y5wKlQSkgEK2weWdOu4J3riRd+kV/VCgHsqLNTTWQ/0=",
28 | "owner": "numtide",
29 | "repo": "flake-utils",
30 | "rev": "dbabf0ca0c0c4bce6ea5eaf65af5cb694d2082c7",
31 | "type": "github"
32 | },
33 | "original": {
34 | "owner": "numtide",
35 | "repo": "flake-utils",
36 | "type": "github"
37 | }
38 | },
39 | "nixpkgs": {
40 | "locked": {
41 | "lastModified": 1688590700,
42 | "narHash": "sha256-ZF055rIUP89cVwiLpG5xkJzx00gEuuGFF60Bs/LM3wc=",
43 | "owner": "NixOS",
44 | "repo": "nixpkgs",
45 | "rev": "f292b4964cb71f9dfbbd30dc9f511d6165cd109b",
46 | "type": "github"
47 | },
48 | "original": {
49 | "owner": "NixOS",
50 | "ref": "nixos-unstable",
51 | "repo": "nixpkgs",
52 | "type": "github"
53 | }
54 | },
55 | "poetry2nix": {
56 | "inputs": {
57 | "flake-utils": "flake-utils_2",
58 | "nixpkgs": [
59 | "nixpkgs"
60 | ]
61 | },
62 | "locked": {
63 | "lastModified": 1688732421,
64 | "narHash": "sha256-fy5CYRNkwcjEBeh9oJpNtKHj1BzitMku87OG6LzdH7A=",
65 | "owner": "nix-community",
66 | "repo": "poetry2nix",
67 | "rev": "02e4a29cb4ec64f2f5e8989084b80951df2bbb64",
68 | "type": "github"
69 | },
70 | "original": {
71 | "owner": "nix-community",
72 | "repo": "poetry2nix",
73 | "type": "github"
74 | }
75 | },
76 | "root": {
77 | "inputs": {
78 | "flake-utils": "flake-utils",
79 | "nixpkgs": "nixpkgs",
80 | "poetry2nix": "poetry2nix"
81 | }
82 | },
83 | "systems": {
84 | "locked": {
85 | "lastModified": 1681028828,
86 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
87 | "owner": "nix-systems",
88 | "repo": "default",
89 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
90 | "type": "github"
91 | },
92 | "original": {
93 | "owner": "nix-systems",
94 | "repo": "default",
95 | "type": "github"
96 | }
97 | },
98 | "systems_2": {
99 | "locked": {
100 | "lastModified": 1681028828,
101 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
102 | "owner": "nix-systems",
103 | "repo": "default",
104 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
105 | "type": "github"
106 | },
107 | "original": {
108 | "owner": "nix-systems",
109 | "repo": "default",
110 | "type": "github"
111 | }
112 | }
113 | },
114 | "root": "root",
115 | "version": 7
116 | }
117 |
--------------------------------------------------------------------------------
/flake.nix:
--------------------------------------------------------------------------------
1 | {
2 | description = "Application packaged using poetry2nix";
3 |
4 | inputs.flake-utils.url = "github:numtide/flake-utils";
5 | inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
6 | inputs.poetry2nix = {
7 | url = "github:nix-community/poetry2nix";
8 | inputs.nixpkgs.follows = "nixpkgs";
9 | };
10 |
11 | outputs = { self, nixpkgs, flake-utils, poetry2nix }:
12 | flake-utils.lib.eachDefaultSystem (system:
13 | let
14 | inherit (poetry2nix.legacyPackages.${system})
15 | mkPoetryApplication mkPoetryEnv overrides;
16 | pkgs = nixpkgs.legacyPackages.${system};
17 | python = pkgs.python311;
18 | myOverrides = overrides.withDefaults (final: prev:
19 | # Missing setup tools dependency declarations
20 | (pkgs.lib.genAttrs [
21 | "httpsproxy-urllib2"
22 | "instructorembedding"
23 | "wikipedia"
24 | ] (name:
25 | prev.${name}.overridePythonAttrs (old: {
26 | buildInputs = (old.buildInputs or [ ]) ++ [ prev.setuptools ];
27 | }))) //
28 | # Miscellaneous build problems that are most easily fixed by using wheels
29 | (pkgs.lib.genAttrs [
30 | "cmake"
31 | "ruff"
32 | "safetensors"
33 | "tokenizers"
34 | "pybind11"
35 | "scipy"
36 | "urllib3"
37 | ] (name: prev.${name}.override { preferWheel = true; })));
38 | poetryAttrs = {
39 | projectDir = ./.;
40 | preferWheels = false;
41 | python = python;
42 | overrides = myOverrides;
43 | };
44 | in rec {
45 | formatter = pkgs.nixfmt;
46 | defaultApp = mkPoetryApplication poetryAttrs;
47 | devShells.default = (mkPoetryEnv poetryAttrs).env.overrideAttrs
48 | (final: prev: {
49 | nativeBuildInputs = (prev.nativeBuildInputs or [ ]) ++ [
50 | poetry2nix.packages.${system}.poetry
51 | pkgs.typescript
52 | pkgs.nodePackages.prettier
53 | ];
54 | });
55 | });
56 | }
57 |
--------------------------------------------------------------------------------
/hammock/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/hammock/anki.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from pathlib import Path
3 | import re
4 | import sqlite3
5 | from typing import Mapping, NamedTuple, NewType
6 |
7 | import html2text
8 |
9 | from hammock.plot import Source
10 |
11 | from .core import plot_single
12 | from .cluster import CCColorAndSummarize
13 | from .embedding import instructor_large
14 |
15 |
def strip_html(text: str) -> str:
    """Convert an HTML fragment to plain text with surrounding whitespace removed.

    The converter is configured with its ``ignore_links``, ``ignore_images``,
    ``ignore_tables`` and ``ignore_emphasis`` options switched on.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    converter.ignore_tables = True
    converter.ignore_emphasis = True
    # Two passes are needed: some cards about HTML turn into valid HTML after the first pass!
    first_pass = converter.handle(text)
    second_pass = converter.handle(first_pass)
    return second_pass.strip()
24 |
25 |
class StripClozeResults(NamedTuple):
    """Result of removing cloze markup from a piece of text."""

    # The input with every cloze marker replaced by the answer it wrapped.
    text: str
    # True when the output differs from the input, i.e. at least one marker was removed.
    had_cloze_deletions: bool


def strip_cloze_deletions(text: str) -> StripClozeResults:
    """Replace each ``{{cN::answer}}`` or ``{{cN::answer::hint}}`` marker with its answer.

    Returns the rewritten text together with a flag telling whether anything was replaced.
    """
    # DOTALL lets an answer or hint span several lines.
    cloze_pattern = re.compile(r"{{c\d+::(.*?)(::.*?)?}}", flags=re.DOTALL)
    without_cloze = cloze_pattern.sub(r"\1", text)
    return StripClozeResults(text=without_cloze, had_cloze_deletions=without_cloze != text)
34 |
35 |
def compress_spaces(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing runs are collapsed too, not removed.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)
38 |
39 |
# Distinct integer type for note ids, so they are not confused with other ints.
NoteID = NewType("NoteID", int)
41 |
42 |
43 | def extract_text_from_anki(private_path: Path, db_path: Path) -> Mapping[NoteID, str]:
44 | with open(private_path) as f:
45 | private_topics = [line.strip() for line in f.readlines()]
46 | with sqlite3.connect(db_path) as conn:
47 | cursor = conn.cursor()
48 | cursor.execute("SELECT id, flds FROM notes")
49 | field_separator = "\x1f"
50 | return {
51 | NoteID(note_id): stripped.text
52 | for note_id, stripped in [
53 | (row[0], strip_cloze_deletions(compress_spaces(strip_html(row[1].split(field_separator)[0]))))
54 | for row in cursor.fetchall()
55 | ]
56 | if stripped.had_cloze_deletions
57 | # and "