├── notebooks ├── __init__.py ├── memery ├── memery.ipynb ├── 07_cli.ipynb ├── 05_ranker.ipynb ├── 03_encoder.ipynb ├── 04_indexer.ipynb ├── 09_streamlit_app.ipynb ├── 02_crafter.ipynb ├── 01_loader.ipynb ├── 08_jupyter_gui.ipynb ├── 00_core.ipynb ├── _visualize.ipynb └── _working_pipeline.ipynb ├── windows-run.bat ├── docs └── testimage.png ├── graphs ├── embed_d.jpg ├── embed_n.jpg ├── mde_d.gif ├── mde_n.gif ├── normalized-d.jpg ├── normalized.jpg ├── plotted_ims.jpg └── normalized-lg.jpg ├── memery ├── __init__.py ├── ranker.py ├── indexer.py ├── encoder.py ├── cli.py ├── loader.py ├── crafter.py ├── gui.py ├── streamlit_app.py └── core.py ├── images ├── E2GoeMyWEAAkcLz.jpeg ├── memes │ ├── stonks-meme.jpg │ ├── Envato-Elements.png │ ├── Shrek_screenshot.jpg │ ├── Wholesome-Meme-1.jpg │ ├── Wholesome-Meme-3.jpg │ ├── Wholesome-Meme-4.jpg │ ├── Wholesome-Meme-5.jpg │ ├── Wholesome-Meme-6.jpg │ ├── Wholesome-Meme-7.jpg │ ├── Wholesome-Meme-8.jpg │ ├── Wholesome-Meme-9.jpg │ ├── Wholesome-Meme.jpg │ ├── corrupted-file.jpeg │ ├── Wholesome-Meme-10.jpg │ ├── Wholesome-Meme-12.jpg │ ├── Wholesome-Meme-13.jpg │ ├── Wholesome-Meme-14.jpg │ ├── Wholesome-Meme-15.jpg │ ├── Wholesome-Meme-16.jpg │ ├── Wholesome-Meme-17.jpg │ ├── Wholesome-Meme-18.jpg │ ├── Wholesome-Meme-21.jpg │ ├── Wholesome-Meme-22.jpg │ ├── Wholesome-Meme-23.jpg │ ├── Wholesome-Meme-25.jpg │ ├── Wholesome-Meme-27.jpg │ ├── Wholesome-Meme-28.jpg │ ├── Wholesome-Meme-29.jpg │ ├── Wholesome-Meme-31.jpg │ ├── Wholesome-Meme-33.jpg │ ├── Wholesome-Meme-34.jpg │ ├── Wholesome-Meme-35.jpg │ ├── Wholesome-Meme-36.jpg │ ├── Wholesome-Meme-39.jpg │ ├── Wholesome-Meme-40.jpg │ ├── Wholesome-Meme-40.png │ ├── Wholesome-Meme-41.jpg │ ├── Wholesome-Meme-42.jpg │ ├── Wholesome-Meme-44.png │ ├── Wholesome-Meme-45.jpg │ ├── Wholesome-Meme-57.jpg │ ├── Wholesome-Meme-59.jpg │ ├── Wholesome-Meme-60.jpg │ ├── Wholesome-Meme-61.png │ ├── Wholesome-Meme-63.jpg │ ├── Wholesome-Meme-64.jpg │ ├── Wholesome-Meme-65.jpg │ ├── Wholesome-Meme-67.png │ ├── Wholesome-Meme-68.jpg │ ├── Wholesome-Meme-69.jpg │ ├── Wholesome-Meme-70.jpg │ ├── Wholesome-Meme-71.jpg │ ├── Wholesome-Meme-72.jpg │ ├── Wholesome-Meme-73.png │ ├── Wholesome-Meme-74.jpg │ ├── Wholesome-Meme-76.jpg │ ├── Wholesome-Meme-77.jpg │ ├── Wholesome-Meme-78.jpg │ ├── Wholesome-Meme-80.jpg │ ├── Wholesome-Meme-81.jpg │ ├── Wholesome-Meme-82.jpg │ ├── Wholesome-Meme-84.jpg │ ├── Wholesome-Meme-85.jpg │ ├── Wholesome-Meme-86.jpg │ ├── Wholesome-Meme-88.jpg │ ├── Wholesome-Meme-89.jpg │ ├── Wholesome-Meme-97.jpg │ ├── Wholesome-Meme-98.jpg │ ├── Wholesome-Meme-99.jpg │ ├── halloween-Pumpkin-min.jpg │ ├── mexican-food-concept-EXFWKZG.jpg │ ├── embarassed-dog-on-bed-SA2BDZW.jpg │ ├── love-from-the-past-PPBEUVU-min.jpg │ ├── cute-baby-touching-his-moms-face.jpeg │ ├── cute-dog-with-cupcake-P9E2YL5-min.jpg │ ├── portrait-of-happy-birthday-boy-B8VU4LZ.jpg │ ├── braydon-anderson-wOHH-NUTvVc-unsplash-min.jpg │ ├── Father-and-son-having-fun-at-the-breakfast-table.jpg │ ├── i-love-you-note-in-the-valentine-day-settings-X87BZ44.jpg │ └── happy-young-couple-eat-breakfast-in-bed-in-morning-RH4KQ72.jpg ├── jupyter-screenshot.png └── streamlit-screenshot.png ├── install ├── requirements.txt ├── windows-install.py └── cuda_install.py ├── .streamlit └── config.toml ├── pyproject.toml ├── LICENSE ├── .github └── workflows │ └── python-app.yml ├── windows-uninstall.bat ├── CONTRIBUTING.md ├── .gitignore ├── windows-install.bat └── README.md /notebooks/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/memery: -------------------------------------------------------------------------------- 1 | ../memery/ -------------------------------------------------------------------------------- /windows-run.bat: -------------------------------------------------------------------------------- 1 | memery serve -------------------------------------------------------------------------------- /docs/testimage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/docs/testimage.png -------------------------------------------------------------------------------- /graphs/embed_d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/graphs/embed_d.jpg -------------------------------------------------------------------------------- /graphs/embed_n.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/graphs/embed_n.jpg -------------------------------------------------------------------------------- /graphs/mde_d.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/graphs/mde_d.gif -------------------------------------------------------------------------------- /graphs/mde_n.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/graphs/mde_n.gif -------------------------------------------------------------------------------- /memery/__init__.py: -------------------------------------------------------------------------------- 1 | __path__ = __import__("pkgutil").extend_path(__path__, __name__) 2 | -------------------------------------------------------------------------------- /graphs/normalized-d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/graphs/normalized-d.jpg -------------------------------------------------------------------------------- /graphs/normalized.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/graphs/normalized.jpg -------------------------------------------------------------------------------- /graphs/plotted_ims.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/graphs/plotted_ims.jpg -------------------------------------------------------------------------------- /graphs/normalized-lg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/graphs/normalized-lg.jpg -------------------------------------------------------------------------------- /images/E2GoeMyWEAAkcLz.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/E2GoeMyWEAAkcLz.jpeg -------------------------------------------------------------------------------- /images/memes/stonks-meme.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/stonks-meme.jpg -------------------------------------------------------------------------------- /images/jupyter-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/jupyter-screenshot.png -------------------------------------------------------------------------------- /install/requirements.txt: -------------------------------------------------------------------------------- 1 | packaging>=20.0 2 | poetry>=1.6.1 3 | protobuf==3.20.* 4 | setuptools>=68.2.2 5 | -------------------------------------------------------------------------------- /images/memes/Envato-Elements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Envato-Elements.png -------------------------------------------------------------------------------- /images/memes/Shrek_screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Shrek_screenshot.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-1.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-3.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-4.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-5.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-6.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-7.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-8.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-9.jpg 
-------------------------------------------------------------------------------- /images/memes/Wholesome-Meme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme.jpg -------------------------------------------------------------------------------- /images/memes/corrupted-file.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/corrupted-file.jpeg -------------------------------------------------------------------------------- /images/streamlit-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/streamlit-screenshot.png -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-10.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-12.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-13.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-14.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-14.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-15.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-15.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-16.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-16.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-17.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-17.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-18.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-18.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-21.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-21.jpg -------------------------------------------------------------------------------- 
/images/memes/Wholesome-Meme-22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-22.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-23.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-23.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-25.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-27.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-27.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-28.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-28.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-29.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-29.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-31.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-31.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-33.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-33.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-34.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-34.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-35.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-35.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-36.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-36.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-39.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-39.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-40.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-40.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-40.png -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-41.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-41.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-42.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-42.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-44.png -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-45.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-45.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-57.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-57.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-59.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-59.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-60.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-60.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-61.png -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-63.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-63.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-64.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-64.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-65.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-65.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-67.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-67.png -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-68.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-68.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-69.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-69.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-70.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-70.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-71.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-71.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-72.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-72.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-73.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-73.png -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-74.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-74.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-76.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-76.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-77.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-77.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-78.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-78.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-80.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-80.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-81.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-81.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-82.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-82.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-84.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-84.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-85.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-85.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-86.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-86.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-88.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-88.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-89.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-89.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-97.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-97.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-98.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-98.jpg -------------------------------------------------------------------------------- /images/memes/Wholesome-Meme-99.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Wholesome-Meme-99.jpg -------------------------------------------------------------------------------- /images/memes/halloween-Pumpkin-min.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/halloween-Pumpkin-min.jpg -------------------------------------------------------------------------------- /images/memes/mexican-food-concept-EXFWKZG.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/mexican-food-concept-EXFWKZG.jpg -------------------------------------------------------------------------------- /images/memes/embarassed-dog-on-bed-SA2BDZW.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/embarassed-dog-on-bed-SA2BDZW.jpg -------------------------------------------------------------------------------- /images/memes/love-from-the-past-PPBEUVU-min.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/love-from-the-past-PPBEUVU-min.jpg -------------------------------------------------------------------------------- /images/memes/cute-baby-touching-his-moms-face.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/cute-baby-touching-his-moms-face.jpeg -------------------------------------------------------------------------------- /images/memes/cute-dog-with-cupcake-P9E2YL5-min.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/cute-dog-with-cupcake-P9E2YL5-min.jpg -------------------------------------------------------------------------------- /images/memes/portrait-of-happy-birthday-boy-B8VU4LZ.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/portrait-of-happy-birthday-boy-B8VU4LZ.jpg -------------------------------------------------------------------------------- /images/memes/braydon-anderson-wOHH-NUTvVc-unsplash-min.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/braydon-anderson-wOHH-NUTvVc-unsplash-min.jpg -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | primaryColor="#55ff00" 3 | backgroundColor="#252724" 4 | secondaryBackgroundColor="#616460" 5 | textColor="#80cb59" 6 | font="monospace" 7 | -------------------------------------------------------------------------------- /images/memes/Father-and-son-having-fun-at-the-breakfast-table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/Father-and-son-having-fun-at-the-breakfast-table.jpg -------------------------------------------------------------------------------- /images/memes/i-love-you-note-in-the-valentine-day-settings-X87BZ44.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/i-love-you-note-in-the-valentine-day-settings-X87BZ44.jpg -------------------------------------------------------------------------------- /images/memes/happy-young-couple-eat-breakfast-in-bed-in-morning-RH4KQ72.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alexfazio/memery/main/images/memes/happy-young-couple-eat-breakfast-in-bed-in-morning-RH4KQ72.jpg

--------------------------------------------------------------------------------
/memery/ranker.py:
--------------------------------------------------------------------------------

1 | from annoy import AnnoyIndex
2 | 
3 | def ranker(query_vec, treemap: AnnoyIndex) -> list[int]:
4 |     nn_indexes = treemap.get_nns_by_vector(query_vec[0], treemap.get_n_items())
5 |     return(nn_indexes)
6 | 
7 | def nns_to_files(db, indexes) -> list[str]:
8 |     # return([[v['fpath'] for k,v in db.items() if v['index'] == ind][0] for ind in indexes])
9 |     return([db[ind]['fpath'] for ind in indexes])
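A minimal sketch of how these two helpers compose, assuming an index and database previously built by `memery build` under images/ (the 512 dimensions and 'angular' metric match the indexer and loader elsewhere in this repo; reusing a stored embedding as the query vector is purely illustrative):

    import torch
    from annoy import AnnoyIndex
    from memery.ranker import ranker, nns_to_files

    treemap = AnnoyIndex(512, 'angular')
    treemap.load('images/memery.ann')                         # written by indexer.save_archives
    db = torch.load('images/memery.pt', map_location='cpu')   # {index: {'hash', 'fpath', 'embed'}}

    query_vec = [db[0]['embed'].tolist()]     # stand-in for a CLIP query embedding
    neighbors = ranker(query_vec, treemap)    # every indexed item, nearest first
    print(nns_to_files(db, neighbors)[:10])   # top ten file paths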
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------

1 | [tool.poetry]
2 | name = "memery"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["deepfates ", "wkrettek "]
6 | 
7 | [tool.poetry.dependencies]
8 | python = "^3.9"
9 | torch = "^2.2.0"
10 | annoy = "^1.17.0"
11 | torchvision = "^0.17.0"
12 | tqdm = "^4.64.0"
13 | Pillow = "^9.1.0"
14 | typer = "^0.4.1"
15 | streamlit = "1.3.1"
16 | clip = {git = "https://github.com/openai/CLIP", rev = "main"}
17 | ftfy = "^6.1.1"
18 | regex = "^2022.4.24"
19 | altair = "^4.0.0"
20 | numpy = "^1.24.0"
21 | protobuf = "^3.20.0"
22 | 
23 | [tool.poetry.scripts]
24 | memery = "memery.cli:main"
25 | 
26 | [tool.poetry.dev-dependencies]
27 | ipywidgets = "^7.7.0"
28 | ipython = "^8.3.0"
29 | 
30 | [build-system]
31 | requires = ["poetry-core>=1.0.0"]
32 | build-backend = "poetry.core.masonry.api"

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

1 | MIT License
2 | 
3 | Copyright (c) 2021 max brewer
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/memery/indexer.py:
--------------------------------------------------------------------------------

1 | from annoy import AnnoyIndex
2 | import torch
3 | 
4 | def join_all(db, new_files, new_embeddings) -> dict:
5 |     start = len(db)
6 |     for i, file in enumerate(new_files):
7 |         path, hash = file
8 |         index = i + start
9 |         db[index] = {
10 |             'hash': hash,
11 |             'fpath': path,
12 |             'embed': new_embeddings[i],
13 |         }
14 |     return(db)
15 | 
16 | def build_treemap(db) -> AnnoyIndex:
17 |     treemap = AnnoyIndex(512, 'angular')
18 |     for k, v in db.items():
19 |         treemap.add_item(k, v['embed'])
20 | 
21 |     # Build the treemap with 5 trees (more trees improve recall at the cost of build time)
22 |     treemap.build(5)
23 | 
24 |     return(treemap)
25 | 
26 | 
27 | def save_archives(root, treemap, db) -> tuple[str, str]:
28 |     dbpath = root/'memery.pt'
29 |     if dbpath.exists():
30 |         # dbpath.rename(root/'memery-bak.pt')
31 |         dbpath.unlink()
32 |     torch.save(db, dbpath)
33 | 
34 |     treepath = root/'memery.ann'
35 |     if treepath.exists():
36 |         # treepath.rename(root/'memery-bak.ann')
37 |         treepath.unlink()
38 |     treemap.save(str(treepath))
39 | 
40 |     return(str(dbpath), str(treepath))
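A sketch of the indexing round trip these three functions support; here `new_files` is the list of (path, hash) tuples produced by the loader, and `new_embeddings` the matching unit-normalized CLIP vectors from the encoder:

    from pathlib import Path
    from memery.indexer import join_all, build_treemap, save_archives

    db = join_all({}, new_files, new_embeddings)   # start from {} or a previously loaded archive
    treemap = build_treemap(db)
    dbpath, treepath = save_archives(Path('images'), treemap, db)
    print(f'Saved {dbpath} and {treepath}')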
--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------

1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: Python application
5 | 
6 | on:
7 |   push:
8 |     branches: [ main ]
9 |   pull_request:
10 |     branches: [ main ]
11 | 
12 | permissions:
13 |   contents: read
14 | 
15 | jobs:
16 |   build:
17 | 
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v3
22 |     - name: Set up Python 3.10
23 |       uses: actions/setup-python@v3
24 |       with:
25 |         python-version: "3.10"
26 |     - name: Install dependencies
27 |       run: |
28 |         python -m pip install --upgrade pip
29 |         pip install flake8 pytest poetry
30 |         poetry install
31 |     - name: Lint with flake8
32 |       run: |
33 |         # stop the build if there are Python syntax errors or undefined names
34 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
35 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
36 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

--------------------------------------------------------------------------------
/windows-uninstall.bat:
--------------------------------------------------------------------------------

1 | @echo off
2 | setlocal enabledelayedexpansion
3 | 
4 | :: Display warning message
5 | echo WARNING this uninstalls *** ALL *** python libraries
6 | echo WARNING this deletes the poetry.lock file
7 | echo Are you sure you want to continue? y/N
8 | set /p user_input=
9 | 
10 | :: Check user input
11 | if /i "%user_input%"=="y" (
12 |     echo Uninstalling Python libraries...
13 |     pip freeze > installed_packages.txt
14 | 
15 |     :: Check if installed_packages.txt is empty
16 |     for %%A in (installed_packages.txt) do (
17 |         if %%~zA==0 (
18 |             echo all pip packages uninstalled
19 |             goto poetry_check
20 |         )
21 |     )
22 | 
23 |     pip uninstall -r installed_packages.txt -y
24 |     goto poetry_check
25 | ) else (
26 |     echo Exiting...
27 |     goto end
28 | )
29 | 
30 | :poetry_check
31 | :: Check if poetry is installed
32 | poetry -V >nul 2>&1
33 | if %ERRORLEVEL% equ 0 (
34 |     echo Poetry is installed, removing all environments...
35 |     for /f "tokens=1" %%i in ('poetry env list') do poetry env remove %%i
36 | ) else (
37 |     echo Poetry is not installed, skipping poetry environment removal.
38 | )
39 | 
40 | :cleanup
41 | :: Clean up
42 | if exist installed_packages.txt del installed_packages.txt
43 | 
44 | :: Delete poetry.lock if it exists
45 | if exist poetry.lock del poetry.lock
46 | 
47 | :end
48 | :: End of script
49 | endlocal
50 | 

--------------------------------------------------------------------------------
/memery/encoder.py:
--------------------------------------------------------------------------------

1 | import torch
2 | import clip
3 | from clip.model import CLIP
4 | from tqdm import tqdm
5 | from torch.utils.data import DataLoader
6 | from torch import Tensor, device
7 | from torchvision.transforms import Compose
8 | 
9 | def load_model(device: device) -> CLIP:
10 |     model, _ = clip.load("ViT-B/32", device, jit=False)
11 |     model = model.float()
12 |     return(model)
13 | 
14 | def image_encoder(img_loader: DataLoader, device: device, model: CLIP):
15 |     image_embeddings = torch.tensor(()).to(device)
16 |     with torch.no_grad():
17 |         for images, labels in tqdm(img_loader):
18 |             batch_features = model.encode_image(images.to(device))
19 |             image_embeddings = torch.cat((image_embeddings, batch_features)).to(device)
20 | 
21 |     image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)
22 |     return(image_embeddings)
23 | 
24 | def text_encoder(text: str, device: device, model: CLIP):
25 |     with torch.no_grad():
26 |         text = clip.tokenize(text).to(device)
27 |         text_features = model.encode_text(text)
28 |         text_features = text_features / text_features.norm(dim=-1, keepdim=True)
29 |     return(text_features)
30 | 
31 | def image_query_encoder(image: Tensor, device: device, model: CLIP):
32 |     with torch.no_grad():
33 |         image_embed = model.encode_image(image.unsqueeze(0).to(device))
34 |         image_embed = image_embed / image_embed.norm(dim=-1, keepdim=True)
35 |     return(image_embed)
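A sketch of encoding a text query with the functions above, falling back to CPU when no GPU is available (the ViT-B/32 weights are downloaded on the first clip.load call):

    import torch
    from memery.encoder import load_model, text_encoder

    dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = load_model(dev)
    query = text_encoder('a funny dog meme', dev, model)
    print(query.shape)   # torch.Size([1, 512]), unit-normalized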
--------------------------------------------------------------------------------
/install/windows-install.py:
--------------------------------------------------------------------------------

1 | import subprocess
2 | import platform
3 | import webbrowser
4 | from packaging import version
5 | 
6 | from cuda_install import cuda_check
7 | 
8 | def get_python_version():
9 |     return platform.python_version()
10 | 
11 | def open_python_download_page():
12 |     webbrowser.open("https://www.python.org/downloads/")
13 | 
14 | def is_poetry_installed():
15 |     try:
16 |         subprocess.run(["poetry", "--version"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
17 |         return True
18 |     except (subprocess.CalledProcessError, FileNotFoundError):
19 |         return False
20 | 
21 | def install_poetry():
22 |     try:
23 |         subprocess.run(["pip", "install", "poetry"], check=True)
24 |         print("Poetry installed successfully.")
25 |     except (subprocess.CalledProcessError, FileNotFoundError) as e:
26 |         print(f"Failed to install Poetry: {e}")
27 | 
28 | if __name__ == "__main__":
29 |     current_version = get_python_version()
30 |     print(f"Found Python version: {current_version}. Project tested with Python 3.10.6.")
31 | 
32 |     if version.parse(current_version) < version.parse("3.9.0"):
33 |         print("\033[91mProject requires Python 3.9 or greater. Please install Python 3.9 or greater.\033[0m")
34 |         open_python_download_page()
35 | 
36 |     if is_poetry_installed():
37 |         print("Poetry is already installed.")
38 |     else:
39 |         print("Poetry is not installed. Installing...")
40 |         install_poetry()
41 | 
42 |     # check if cuda is installed
43 |     # cuda_check("11.3.0")
44 | 

--------------------------------------------------------------------------------
/memery/cli.py:
--------------------------------------------------------------------------------

1 | import typer
2 | from memery.core import Memery
3 | import memery
4 | import streamlit.cli
5 | from typing import Optional
6 | # Sometimes you just want to be able to pipe information through the terminal. This is that command.
7 | 
8 | app = typer.Typer()
9 | 
10 | def main():
11 |     app()
12 | 
13 | @app.command()
14 | def recall(
15 |     root: str = typer.Argument('.', help="Image folder to search"),
16 |     text: str = typer.Option(None, "-t", "--text", help="Text query"),
17 |     image: str = typer.Option(None, "-i", "--image", help="Filepath to image query"),
18 |     number: int = typer.Option(10, "-n", "--number", help="Number of results to return")
19 | ) -> None:
20 |     """Search recursively over a folder from the command line"""
21 |     memery = Memery()
22 |     ranked = memery.query_flow(root, query=text, image_query=image)
23 |     print(ranked[:number])
24 | 
25 | @app.command()
26 | def serve(root: Optional[str] = typer.Argument(None)):
27 |     """Runs the streamlit GUI in your browser"""
28 |     app_path = memery.__file__.replace('__init__.py', 'streamlit_app.py')
29 |     if root is None:
30 |         streamlit.cli.main(['run', app_path, './images'])
31 |     else:
32 |         streamlit.cli.main(['run', app_path, f'{root}'])
33 | 
34 | @app.command()
35 | def build(
36 |     root: str = typer.Argument('.'),
37 |     workers: int = typer.Option(default=0)
38 | ):
39 |     '''
40 |     Indexes the directory and all subdirectories
41 |     '''
42 |     memery = Memery()
43 |     memery.index_flow(root, num_workers=workers)
44 |     return None
45 | 
46 | @app.command()
47 | def purge(root: str = typer.Argument('.')):
48 |     """
49 |     Cleans out all files saved by memery
50 |     """
51 |     memery = Memery()
52 |     memery.clean(root)
53 |     print("Purged files!")
54 | 
55 | if __name__ == "__main__":
56 |     main()
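The commands above can be smoke-tested in-process with Typer's test runner, without installing the console script; note that recall loads the CLIP model, so the first invocation is slow (the paths here are illustrative):

    from typer.testing import CliRunner
    from memery.cli import app

    runner = CliRunner()
    result = runner.invoke(app, ['recall', './images', '--text', 'dad joke', '--number', '5'])
    print(result.stdout)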
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------

1 | # How to contribute
2 | 
3 | ## Did you find a bug?
4 | 
5 | * Ensure the bug was not already reported by searching on GitHub under Issues.
6 | * If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring.
7 | * Be sure to add the complete error messages.
8 | 
9 | #### Did you write a patch that fixes a bug?
10 | 
11 | * Open a new GitHub pull request with the patch.
12 | * Ensure that your PR includes a test that fails without your patch and passes with it.
13 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable.
14 | 
15 | ## PR submission guidelines
16 | 
17 | * Keep each PR focused. However convenient it may be, do not combine several unrelated fixes in one PR. Create as many branches as needed to keep each PR focused.
18 | * Do not mix style changes/fixes with "functional" changes. Such PRs are very difficult to review and will most likely be rejected.
19 | * Do not add/remove vertical whitespace. Preserve the original style of the file you edit as much as you can.
20 | * Do not turn an already submitted PR into your development playground. If, after you submitted a PR, you discover that more work is needed, close the PR, do the required work, and then submit a new PR. Otherwise each of your commits requires attention from the maintainers of the project.
21 | * If, however, you submitted a PR and received a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exceptional case where you realize it will take many commits to complete the requested changes, it's probably best to close the PR, do the work, and then submit it again. Use common sense when choosing one way over the other.

--------------------------------------------------------------------------------
/notebooks/memery.ipynb:
--------------------------------------------------------------------------------

1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "from memery.core import Memery"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": null,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": [
18 |     "memery = Memery()\n",
19 |     "ranked = memery.query_flow('../images', 'dad joke')\n",
20 |     "\n",
21 |     "print(ranked[:5])"
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": null,
27 |    "metadata": {},
28 |    "outputs": [],
29 |    "source": [
30 |     "memery = Memery()\n",
31 |     "root = '../images/'\n",
32 |     "db = memery.get_db(root + 'memery.pt')\n",
33 |     "index = memery.get_index(root + 'memery.ann')\n",
34 |     "model = memery.get_model()"
35 |    ]
36 |   },
37 |   {
38 |    "cell_type": "code",
39 |    "execution_count": null,
40 |    "metadata": {},
41 |    "outputs": [],
42 |    "source": [
43 |     "memery.index_flow(root)\n"
44 |    ]
45 |   },
46 |   {
47 |    "cell_type": "code",
48 |    "execution_count": null,
49 |    "metadata": {},
50 |    "outputs": [],
51 |    "source": [
52 |     "memery.reset_state()\n",
53 |     "memery.model = None"
54 |    ]
55 |   },
56 |   {
57 |    "cell_type": "code",
58 |    "execution_count": null,
59 |    "metadata": {},
60 |    "outputs": [],
61 |    "source": [
62 |     "memery.query_flow(root, 'Wow its already working')"
63 |    ]
64 |   }
65 |  ],
66 |  "metadata": {
67 |   "interpreter": {
68 |    "hash": "deeee0b52e76b5e3a563dfd39c9570f6111f9f254cd04b55dab6af9643751b0b"
69 |   },
70 |   "kernelspec": {
71 |    "display_name": "Python 3.9.12 ('memery-OXFjyqC6-py3.9')",
72 |    "language": "python",
73 |    "name": "python3"
74 |   },
75 |   "language_info": {
76 |    "codemirror_mode": {
77 |     "name": "ipython",
78 |     "version": 3
79 |    },
80 |    "file_extension": ".py",
81 |    "mimetype": "text/x-python",
82 |    "name": "python",
83 |    "nbconvert_exporter": "python",
84 |    "pygments_lexer": "ipython3",
85 |    "version": "3.9.6"
86 |   },
87 |   "orig_nbformat": 4
88 |  },
89 |  "nbformat": 4,
90 |  "nbformat_minor": 2
91 | }
92 | 
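The notebook above drives text search through the high-level API; an image query goes through the same query_flow, passing a filepath as image_query, mirroring what the recall command does internally (the meme path here is illustrative):

    from memery.core import Memery

    memery = Memery()
    ranked = memery.query_flow('images/', image_query='images/memes/Wholesome-Meme-1.jpg')
    print(ranked[:5])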
--------------------------------------------------------------------------------
/memery/loader.py:
--------------------------------------------------------------------------------

1 | # Builtins
2 | from pathlib import Path
3 | from typing import Any
4 | 
5 | # External
6 | from PIL import Image
7 | import torch
8 | from torch import device
9 | from annoy import AnnoyIndex
10 | import logging
11 | 
12 | # We take the filename and last modified time to check for modified images
13 | def hash_path(filepath: Path) -> str:
14 |     return f'{filepath.stem}_{str(filepath.stat().st_mtime).split(".")[0]}'
15 | 
16 | def get_image_files(path: Path) -> list[tuple[Path, str]]:
17 |     img_extensions = {'.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp'}
18 |     return [(f, hash_path(f)) for f in path.rglob('*') if f.suffix in img_extensions]
19 | 
20 | def get_valid_images(path: Path) -> list[tuple[Path, str]]:
21 |     filepaths = get_image_files(path)
22 |     return [f for f in filepaths if verify_image(f[0])]
23 | 
24 | def verify_image(f: Path) -> bool:
25 |     try:
26 |         img = Image.open(f)
27 |         img.verify()
28 |         return(True)
29 |     except Exception as e:
30 |         logging.exception('Skipping bad file: %s\ndue to %s', f, e)
31 |         return(False)
32 | 
33 | def archive_loader(filepaths: list[tuple[Path, str]], db: Any) -> tuple[dict, list[tuple[str, str]]]:
34 | 
35 |     current_hashes = [hash for path, hash in filepaths]
36 |     archive_db = {i:db[item[0]] for i, item in enumerate(db.items()) if item[1]['hash'] in current_hashes}
37 |     archive_hashes = [v['hash'] for v in archive_db.values()]
38 |     new_files = [(str(path), hash) for path, hash in filepaths if hash not in archive_hashes and verify_image(path)]
39 | 
40 |     return(archive_db, new_files)
41 | 
42 | def db_loader(dbpath: str, device: device) -> Any:
43 |     '''
44 |     Loads a .pt file
45 |     '''
46 |     if Path(dbpath).exists():
47 |         db = torch.load(dbpath, device)
48 |     else:
49 |         db = {}
50 |     return(db)
51 | 
52 | def treemap_loader(treepath: str) -> AnnoyIndex:
53 |     '''
54 |     Loads a .ann file
55 |     '''
56 |     treemap = AnnoyIndex(512, 'angular')
57 |     treepath = Path(treepath)
58 |     if treepath.exists():
59 |         treemap.load(str(treepath))
60 |     else:
61 |         treemap = None
62 |     return(treemap)
63 | 
64 | if __name__ == '__main__':
65 |     print('TESTING')

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

1 | *.bak
2 | .gitattributes
3 | .last_checked
4 | .gitconfig
5 | *.bak
6 | *.log
7 | *~
8 | ~*
9 | _tmp*
10 | tmp*
11 | tags
12 | 
13 | # memery files
14 | *.ann
15 | *.pt
16 | 
17 | # Byte-compiled / optimized / DLL files
18 | __pycache__/
19 | *.py[cod]
20 | *$py.class
21 | 
22 | # C extensions
23 | *.so
24 | 
25 | # Distribution / packaging
26 | .Python
27 | env/
28 | build/
29 | develop-eggs/
30 | dist/
31 | downloads/
32 | eggs/
33 | .eggs/
34 | lib/
35 | lib64/
36 | parts/
37 | sdist/
38 | var/
39 | wheels/
40 | *.egg-info/
41 | .installed.cfg
42 | *.egg
43 | share/
44 | bin/
45 | etc/
46 | 
47 | # PyInstaller
48 | # Usually these files are written by a python script from a template
49 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
50 | *.manifest 51 | *.spec 52 | 53 | # Installer logs 54 | pip-log.txt 55 | pip-delete-this-directory.txt 56 | 57 | # Unit test / coverage reports 58 | htmlcov/ 59 | .tox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | .hypothesis/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # pyenv 93 | .python-version 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # dotenv 102 | .env 103 | 104 | # virtualenv 105 | .venv 106 | venv/ 107 | ENV/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | 122 | .vscode 123 | *.swp 124 | 125 | # osx generated files 126 | .DS_Store 127 | .DS_Store? 128 | .Trashes 129 | ehthumbs.db 130 | Thumbs.db 131 | .idea 132 | 133 | # pytest 134 | .pytest_cache 135 | 136 | # tools/trust-doc-nbs 137 | docs_src/.last_checked 138 | 139 | # symlinks to fastai 140 | docs_src/fastai 141 | tools/fastai 142 | 143 | # link checker 144 | checklink/cookies.txt 145 | 146 | # .gitconfig is now autogenerated 147 | .gitconfig 148 | poetry.lock 149 | -------------------------------------------------------------------------------- /memery/crafter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor, device 3 | from torchvision.datasets import VisionDataset 4 | from PIL import Image, ImageFile 5 | from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize 6 | from torch.utils.data import DataLoader 7 | 8 | 9 | def make_dataset(new_files: list[str]) -> tuple[list[str], list[str]]: 10 | '''Returns a list of samples of a form (path_to_sample, class) and in 11 | this case the class is just the filename''' 12 | samples = [] 13 | slugs = [] 14 | for i, f in enumerate(new_files): 15 | path, slug = f 16 | samples.append((str(path), i)) 17 | slugs.append((slug, i)) 18 | return(samples, slugs) 19 | 20 | def pil_loader(path: str) -> Image.Image: 21 | ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow truncated images 22 | try: 23 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 24 | with open(path, 'rb') as f: 25 | img = Image.open(f) 26 | return img.convert('RGB') 27 | except Exception as e: 28 | print(f"Skipping image {path}: {e}") 29 | return None 30 | 31 | class DatasetImagePaths(VisionDataset): 32 | 33 | def __init__(self, new_files, transforms = None): 34 | super(DatasetImagePaths, self).__init__(new_files, transforms=transforms) 35 | samples, slugs = make_dataset(new_files) 36 | self.samples = samples 37 | self.slugs = slugs 38 | self.loader = pil_loader 39 | self.root = 'file dataset' 40 | def __len__(self): 41 | return(len(self.samples)) 42 | 43 | def __getitem__(self, index): 44 | path, target = self.samples[index] 45 | try: 46 | sample = self.loader(path) 47 | if sample is not None: 48 | if self.transforms is not None: 49 | sample = self.transforms(sample) 50 | return sample, target 51 | except Exception as e: 52 | print(f"Skipping file {path} due to 
error: {e}") 53 | return None 54 | 55 | def clip_transform(n_px: int) -> Compose: 56 | return Compose([ 57 | Resize(n_px, interpolation=Image.BICUBIC), 58 | CenterCrop(n_px), 59 | ToTensor(), 60 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 61 | ]) 62 | 63 | def crafter(new_files: list[str], device: device, batch_size: int=128, num_workers: int=4): 64 | with torch.no_grad(): 65 | imagefiles=DatasetImagePaths(new_files, clip_transform(224)) 66 | img_loader=DataLoader(imagefiles, batch_size=batch_size, shuffle=False, num_workers=num_workers) 67 | return(img_loader) 68 | 69 | def preproc(img: Tensor) -> Compose: 70 | transformed = clip_transform(224)(img) 71 | return(transformed) -------------------------------------------------------------------------------- /install/cuda_install.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import subprocess 3 | import torch 4 | 5 | 6 | def cuda_check(required_cuda_version: str): 7 | if torch.cuda.is_available(): 8 | print(f"Detected CUDA version: {torch.version.cuda}, torch: {torch.__version__}") 9 | else: 10 | # Only call check_and_install_cuda if CUDA is not available 11 | cuda_installed = check_and_install_cuda(required_cuda_version) 12 | if cuda_installed and torch.cuda.is_available(): 13 | print(f"Detected CUDA version: {torch.version.cuda}, torch: {torch.__version__}") 14 | 15 | def check_for_nvidia_gpu(): 16 | try: 17 | result = subprocess.run(["nvidia-smi", "-L"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) 18 | return "GPU" in result.stdout 19 | except subprocess.CalledProcessError: 20 | return False 21 | 22 | def download_and_install_cuda(required_cuda_version: str): 23 | system = platform.system() 24 | if system == "Linux": 25 | subprocess.run(["wget", f"https://developer.nvidia.com/cuda-{required_cuda_version}-download-archive"], check=True) 26 | elif system == "Windows": 27 | subprocess.run(["start", f"https://developer.nvidia.com/cuda-{required_cuda_version}-download-archive"], shell=True, check=True) 28 | elif system == "Darwin": 29 | print("Sorry, CUDA is not supported on macOS.") 30 | return False 31 | else: 32 | print("Unsupported OS.") 33 | return False 34 | print("Please follow the instructions on the opened webpage to install CUDA. After CUDA has been installed you will need to run the following commands") 35 | print("pip install torch==1.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html") 36 | print("pip install -e .") 37 | return True 38 | 39 | def check_and_install_cuda(required_cuda_version: str): 40 | if not check_for_nvidia_gpu(): 41 | print("No CUDA-compatible GPU detected. You must use CPU mode.") 42 | return False 43 | 44 | try: 45 | import torch 46 | installed_cuda_version = torch.version.cuda 47 | if installed_cuda_version == required_cuda_version: 48 | return True 49 | except ImportError: 50 | print("PyTorch is not installed. Unable to check CUDA version.") 51 | return False 52 | 53 | # Print in red 54 | print(f"\033[91mDetected CUDA {installed_cuda_version}, requires {required_cuda_version} for gpu.\033[0m") 55 | 56 | # Collect user input 57 | user_input = input("\033[94mUsing cpu mode by default. Install CUDA version for gpu mode? 
[y/N]: \033[0m\n\n")
58 | 
59 |     if user_input.lower() == 'y':
60 |         return download_and_install_cuda(required_cuda_version)
61 |     else:
62 |         print("Proceeding with CPU mode.")
63 |         return False
64 | 
--------------------------------------------------------------------------------
/windows-install.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | SETLOCAL
3 | 
4 | REM Check if Python is installed and in PATH
5 | where python >nul 2>nul
6 | if %errorlevel% neq 0 (
7 |     echo Python is not installed or not in PATH.
8 |     goto InstallPython
9 | )
10 | 
11 | REM Check Python version if installed
12 | for /f "tokens=*" %%i in ('python --version 2^>^&1') do set PYTHON_VERSION=%%i
13 | 
14 | REM Check if the version string actually contains "Python"
15 | echo %PYTHON_VERSION% | find "Python" > nul
16 | if %errorlevel% neq 0 (
17 |     echo Python is not installed or the Python version could not be detected.
18 |     goto InstallPython
19 | )
20 | 
21 | set PYTHON_VERSION=%PYTHON_VERSION:~7%
22 | for /f "tokens=1,2,3 delims=." %%a in ("%PYTHON_VERSION%") do (
23 |     set Major=%%a
24 |     set Minor=%%b
25 |     set Patch=%%c
26 | )
27 | echo Detected Python version: %Major%.%Minor%.%Patch%
28 | 
29 | REM Perform numerical comparison to check if version is adequate
30 | if %Major% geq 3 (
31 |     if %Major% gtr 3 (
32 |         echo Python version is adequate.
33 |         goto End
34 |     ) else (
35 |         if %Minor% geq 9 (
36 |             echo Python version is adequate.
37 |             goto End
38 |         )
39 |     )
40 | )
41 | 
42 | echo Python version is not adequate.
43 | goto InstallPython
44 | 
45 | :InstallPython
46 | echo Installing Python 3.10.6...
47 | REM Note: Internet access is required for this step; make sure you are connected.
48 | REM Check if the Python installer already exists
49 | if not exist "%CD%\python-3.10.6-amd64.exe" (
50 |     REM Download Python 3.10.6 using curl.
51 |     curl -O https://www.python.org/ftp/python/3.10.6/python-3.10.6-amd64.exe
52 | )
53 | start /wait python-3.10.6-amd64.exe InstallAllUsers=1 PrependPath=1
54 | if %errorlevel% neq 0 (
55 |     echo Failed to install Python.
56 |     exit /b 1
57 | )
58 | echo Python has been installed; run the install script again.
59 | echo This will refresh the environment variables, so ensure all other command windows are closed.
60 | pause
61 | 
62 | 
63 | :End
64 | ENDLOCAL
65 | 
66 | echo Upgrading pip
67 | python -m pip install --upgrade pip
68 | if %errorlevel% neq 0 (
69 |     echo Failed to upgrade pip.
70 |     pause
71 | )
72 | 
73 | REM Install Python dependencies from requirements.txt
74 | pip install -r ./install/requirements.txt
75 | if %errorlevel% neq 0 (
76 |     echo Failed to install Python dependencies from requirements.txt.
77 |     pause
78 | )
79 | 
80 | REM Check if Poetry is installed and in PATH
81 | where poetry >nul 2>nul
82 | if %errorlevel% neq 0 (
83 |     echo Poetry is not installed or not in PATH.
84 |     pause
85 | )
86 | 
87 | REM Run 'poetry install' to install dependencies via Poetry
88 | poetry install
89 | if %errorlevel% neq 0 (
90 |     echo Failed to install Python dependencies via Poetry.
91 |     pause
92 | )
93 | 
94 | REM Install the local, editable build
95 | echo Installing local editable build
96 | pip install -e .
97 | 
98 | REM Run the Python script
99 | python ./install/windows-install.py
100 | 
101 | REM Check the exit code of the Python script
102 | if %errorlevel% neq 0 (
103 |     echo Failed to execute Python script.
104 |     pause
105 | )
106 | echo Successfully installed.
107 | pause -------------------------------------------------------------------------------- /memery/gui.py: -------------------------------------------------------------------------------- 1 | # import ipywidgets as widgets 2 | 3 | # from .core import query_flow 4 | # from pathlib import Path 5 | # from IPython.display import clear_output 6 | 7 | 8 | 9 | # def get_image(file_loc): 10 | # filepath = Path(file_loc) 11 | # file = open(filepath, 'rb') 12 | # image = widgets.Image(value=file.read(),width=200) 13 | 14 | # return(image) 15 | 16 | # def get_grid(filepaths, n=4): 17 | # imgs = [get_image(f) for f in filepaths[:n] if Path(f).exists()] 18 | # grid = widgets.GridBox(imgs, layout=widgets.Layout(grid_template_columns="repeat(auto-fit, 200px)")) 19 | # return(grid) 20 | 21 | # from PIL import Image 22 | # from io import BytesIO 23 | 24 | # def update_tabs(path, query, n_images, searches, tabs, logbox, im_display_zone, image_query=None): 25 | # stem = Path(path.value).stem 26 | # slug = f"{stem}:{str(query.value)}" 27 | # if slug not in searches.keys(): 28 | # with logbox: 29 | # print(slug) 30 | # if image_query: 31 | # im_queries = [name for name, data in image_query.items()] 32 | 33 | # img = [Image.open(BytesIO(file_info['content'])).convert('RGB') for name, file_info in image_query.items()] 34 | # ranked = query_flow(path.value, query.value, image_query=img[-1]) 35 | # slug = slug + f'/{im_queries}' 36 | 37 | # if len(im_queries) > 0: 38 | # with im_display_zone: 39 | # clear_output() 40 | # display(img[-1]) 41 | # else: 42 | # ranked = query_flow(path.value, query.value) 43 | # searches[f'{slug}'] = ranked 44 | 45 | # tabs.children = [get_grid(v, n=n_images.value) for v in searches.values()] 46 | # for i, k in enumerate(searches.keys()): 47 | # tabs.set_title(i, k) 48 | # tabs.selected_index = len(searches)-1 49 | 50 | 51 | # # return(True) 52 | 53 | # class appPage(): 54 | 55 | # def __init__(self): 56 | # self.inputs_layout = widgets.Layout(max_width='80%') 57 | 58 | # self.path = widgets.Text(placeholder='path/to/image/folder', value='images/', layout=self.inputs_layout) 59 | # self.query = widgets.Text(placeholder='a funny dog meme', value='a funny dog meme', layout=self.inputs_layout) 60 | 61 | # self.image_query = widgets.FileUpload() 62 | # self.im_display_zone = widgets.Output(max_height='5rem') 63 | 64 | # self.n_images = widgets.IntSlider(description='#', value=4, layout=self.inputs_layout) 65 | # self.go = widgets.Button(description="Search", layout=self.inputs_layout) 66 | # self.logbox = widgets.Output(layout=widgets.Layout(max_width='80%', height="3rem", overflow="none")) 67 | # self.all_inputs_layout = widgets.Layout(max_width='80vw', min_height='40vh', flex_flow='row wrap', align_content='flex-start') 68 | 69 | # self.inputs = widgets.Box([self.path, self.query, self.image_query, self.n_images, self.go, self.im_display_zone, self.logbox], layout=self.all_inputs_layout) 70 | # self.tabs = widgets.Tab() 71 | # self.page = widgets.AppLayout(left_sidebar=self.inputs, center=self.tabs) 72 | 73 | # self.searches = {} 74 | # self.go.on_click(self.page_update) 75 | 76 | # display(self.page) 77 | 78 | # def page_update(self, b): 79 | 80 | # update_tabs(self.path, self.query, self.n_images, self.searches, self.tabs, self.logbox, self.im_display_zone, self.image_query.value) 81 | 82 | 83 | -------------------------------------------------------------------------------- /notebooks/07_cli.ipynb: -------------------------------------------------------------------------------- 1 | 
{ 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#default_exp cli" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# CLI" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "#export\n", 36 | "import typer\n", 37 | "import memery.core\n", 38 | "import streamlit.cli" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "#export\n", 48 | "app = typer.Typer()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Sometimes you just want to be able to pipe information through the terminal. This is that command" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#export\n", 65 | "@app.command()\n", 66 | "def recall(path: str, query: str, n: int = 10):\n", 67 | " \"\"\"Search recursively over a folder from the command line\"\"\"\n", 68 | " ranked = memery.core.query_flow(path, query=query)\n", 69 | " print(ranked[:n])\n", 70 | "# return(ranked)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "recall('./images', 'a funny dog meme')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "More often, though, you probably want to sift through image visually. The `memery serve` command will open a browser app on your local device, using Streamlit library." 
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "#export\n", 96 | "@app.command()\n", 97 | "def serve():\n", 98 | " \"\"\"Runs the streamlit GUI in your browser\"\"\"\n", 99 | " path = memery.__file__.replace('__init__.py','streamlit_app.py')\n", 100 | " streamlit.cli.main(['run',path])" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# serve()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "#export \n", 119 | "def __main__():\n", 120 | " app()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 4 147 | } 148 | -------------------------------------------------------------------------------- /notebooks/05_ranker.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#default_exp ranker" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Ranker\n", 27 | "\n", 28 | "Takes a query and an index and finds the nearest neighbors or most similar scores. Ideally this is just a simple Annoy `get_nns_by_vector`, or in the simple case a similarity score across all the vectors." 
29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import torch\n", 38 | "\n", 39 | "\n", 40 | "from pathlib import Path\n", 41 | "\n", 42 | "from memery.loader import treemap_loader, db_loader\n", 43 | "from memery.encoder import text_encoder" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "treemap = treemap_loader(Path('./images/memery.ann'))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "if treemap:\n", 62 | " treemap.get_n_items()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "#export\n", 72 | "def ranker(query_vec, treemap):\n", 73 | " nn_indexes = treemap.get_nns_by_vector(query_vec[0], treemap.get_n_items())\n", 74 | " return(nn_indexes)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "#export\n", 84 | "def nns_to_files(db, indexes):\n", 85 | "# return([[v['fpath'] for k,v in db.items() if v['index'] == ind][0] for ind in indexes])\n", 86 | " return([db[ind]['fpath'] for ind in indexes])" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 96 | "db = db_loader(Path('images/memery.pt'), device)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "query = 'dog'" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "query_vec = text_encoder(query, device)\n", 115 | "indexes = ranker(query_vec, treemap)\n", 116 | "ranked_files = nns_to_files(db, indexes)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "ranked_files[:5]" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 4 152 | } 153 | -------------------------------------------------------------------------------- /notebooks/03_encoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#default_exp encoder" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Encoder\n", 27 | "\n", 28 | "\n", 29 | "This is just a wrapper around CLIP functions. 
Cool thing here is we can use the one model for both image and text!\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "#export\n", 39 | "import torch\n", 40 | "import clip\n", 41 | "from tqdm import tqdm\n", 42 | "\n", 43 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 44 | "model, _ = clip.load(\"ViT-B/32\", device, jit=False) \n", 45 | "model = model.float()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "#export\n", 55 | "def image_encoder(img_loader, device):\n", 56 | " image_embeddings = torch.tensor(()).to(device)\n", 57 | " with torch.no_grad():\n", 58 | " for images, labels in tqdm(img_loader):\n", 59 | " batch_features = model.encode_image(images.to(device))\n", 60 | " image_embeddings = torch.cat((image_embeddings, batch_features)).to(device)\n", 61 | " \n", 62 | " image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)\n", 63 | " return(image_embeddings)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "new_files = [('images/memes/Wholesome-Meme-8.jpg', 'Wholesome-Meme-8'), ('images/memes/Wholesome-Meme-1.jpg', 'Wholesome-Meme-1')]" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "from memery.crafter import crafter" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "img_loader = crafter(new_files, device)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "for images, labels in img_loader:\n", 100 | " print(images)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "image_embeddings = image_encoder(img_loader, device)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "image_embeddings.shape" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "The text encoder returns a 512d vector just like the image encoder" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "#export\n", 135 | "def text_encoder(text, device):\n", 136 | " with torch.no_grad():\n", 137 | " text = clip.tokenize(text).to(device)\n", 138 | " text_features = model.encode_text(text)\n", 139 | " text_features = text_features / text_features.norm(dim=-1, keepdim=True)\n", 140 | " return(text_features)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "text_embedding = text_encoder('a funny dog meme', device)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "text_embedding.shape" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "#export\n", 168 | "def 
image_query_encoder(image, device):\n", 169 | " with torch.no_grad():\n", 170 | " image_embed = model.encode_image(image.unsqueeze(0).to(device))\n", 171 | " image_embed = image_embed / image_embed.norm(dim=-1, keepdim=True)\n", 172 | " return(image_embed)" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 4 185 | } 186 | -------------------------------------------------------------------------------- /memery/streamlit_app.py: -------------------------------------------------------------------------------- 1 | # Builtins 2 | from pathlib import Path 3 | from PIL import Image 4 | from io import StringIO 5 | import sys 6 | import argparse 7 | from threading import current_thread 8 | from contextlib import contextmanager 9 | 10 | # Local 11 | from memery.core import Memery 12 | 13 | # Dependencies 14 | import streamlit as st 15 | from streamlit.report_thread import REPORT_CONTEXT_ATTR_NAME 16 | 17 | 18 | # Parses the args from the command line 19 | def parse_args(args: list[str]): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('root', help='starting directory to search') 22 | return parser.parse_args(args) 23 | 24 | # Initalize session state 25 | args = parse_args(sys.argv[1:]) 26 | if 'memery' not in st.session_state: 27 | st.session_state['memery'] = Memery() 28 | memery: Memery = st.session_state['memery'] 29 | 30 | # Configs 31 | st.set_page_config(page_title='Memery', layout="centered") 32 | 33 | # Draw the sidebar 34 | st.sidebar.title("Memery") 35 | 36 | settings = st.sidebar.expander(label="Settings", expanded=False) 37 | with settings: 38 | do_clear_cache = st.button(label="Clear Cache") 39 | num_workers = st.slider(label="Number of workers", max_value=8) 40 | 41 | dir_l, dir_r = st.sidebar.columns([3,1]) 42 | with dir_l: 43 | path = st.text_input(label='Directory', value=args.root) 44 | with dir_r: 45 | st.title("") 46 | do_index = st.button(label="Index", key='do_index') 47 | 48 | search_l, search_r = st.sidebar.columns([3,1]) 49 | with search_l: 50 | text_query = st.text_input(label='Text query', value='') 51 | negative_text_query = st.text_input(label='Negative Text query', value='') 52 | with search_r: 53 | st.title("") 54 | search_button = st.button(label="Search", key="search_button") 55 | 56 | 57 | image_query = st.sidebar.file_uploader(label='Image query') 58 | image_query_display = st.sidebar.container() 59 | if image_query: # Display the image query if there is one 60 | img = Image.open(image_query).convert('RGB') 61 | with image_query_display: 62 | st.image(img) 63 | logbox = st.sidebar.empty() 64 | skipped_files_box = st.sidebar.expander(label='Skipped files', expanded=False) 65 | 66 | # Draw the main page 67 | sizes = {'small': 115, 'medium':230, 'large':332, 'xlarge':600} 68 | l, m, r = st.columns([4,1,1]) 69 | with l: 70 | num_images = st.slider(label='Number of images',value=12) 71 | 72 | with m: 73 | size_choice = st.selectbox(label='Image width', options=[k for k in sizes.keys()], index=1) 74 | with r: 75 | captions_on = st.checkbox(label="Caption filenames", value=False) 76 | image_display_zone = st.container() 77 | 78 | # Index the directory 79 | def index(logbox, path, num_workers): 80 | if Path(path).exists(): 81 | with logbox: 82 | with st_stdout('info'): 83 | memery.index_flow(path, num_workers) 84 | else: 85 | with logbox: 86 | with st_stdout('warning'): 87 | print(f'{path} does 
not exist!') 88 | 89 | # Clears out the database and treemap files 90 | def clear_cache(root, logbox): 91 | memery.clean(root) 92 | with logbox: 93 | with st_stdout('info'): 94 | print("Cleaned database and index files") 95 | 96 | # Runs a search 97 | def search(root, text_query, negative_text_query, image_query, image_display_zone, skipped_files_box, num_images, captions_on, sizes, size_choice): 98 | if not Path(path).exists(): 99 | with logbox: 100 | with st_stdout('warning'): 101 | print(f'{path} does not exist!') 102 | return 103 | with logbox: 104 | with st_stdout('info'): 105 | ranked = memery.query_flow(root, text_query, negative_text_query, image_query) # Modified line 106 | ims_to_display = {} 107 | size = sizes[size_choice] 108 | for o in ranked[:num_images]: 109 | name = o.replace(path, '') 110 | try: 111 | ims_to_display[name] = Image.open(o).convert('RGB') 112 | except Exception as e: 113 | with skipped_files_box: 114 | st.warning(f'Skipping bad file: {name}\ndue to {type(e)}') 115 | pass 116 | with image_display_zone: 117 | if captions_on: 118 | st.image([o for o in ims_to_display.values()], width=size, channels='RGB', caption=[o for o in ims_to_display.keys()]) 119 | else: 120 | st.image([o for o in ims_to_display.values()], width=sizes[size_choice], channels='RGB') 121 | 122 | 123 | @contextmanager 124 | def st_redirect(src, dst): 125 | placeholder = st.empty() 126 | output_func = getattr(placeholder, dst) 127 | 128 | with StringIO() as buffer: 129 | old_write = src.write 130 | 131 | def new_write(b): 132 | if getattr(current_thread(), REPORT_CONTEXT_ATTR_NAME, None): 133 | buffer.write(b + '') 134 | output_func(buffer.getvalue() + '') 135 | else: 136 | old_write(b) 137 | 138 | try: 139 | src.write = new_write 140 | yield 141 | finally: 142 | src.write = old_write 143 | 144 | 145 | @contextmanager 146 | def st_stdout(dst): 147 | with st_redirect(sys.stdout, dst): 148 | yield 149 | 150 | 151 | @contextmanager 152 | def st_stderr(dst): 153 | with st_redirect(sys.stderr, dst): 154 | yield 155 | 156 | # Decide which actions to take 157 | if do_clear_cache: 158 | clear_cache(path, logbox) 159 | elif do_index: 160 | index(logbox, path, num_workers) 161 | elif search_button or text_query or image_query: 162 | search(path, text_query, negative_text_query, image_query, image_display_zone, skipped_files_box, num_images, captions_on, sizes, size_choice) # Modified line 163 | 164 | -------------------------------------------------------------------------------- /notebooks/04_indexer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#default_exp indexer" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Indexer\n", 27 | "\n", 28 | "Given a dataset of tensors, returns a dictionary archive and a treemap structure (and saves them to disk)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Joiner\n", 36 | "\n", 37 | "This executor `needs` both Encoder and Loader to send it the new and old vectors, respectively. 
So it needs to be preceded by the **join_all** component to make sure we're not missing new data before handing it over to the indexer -- or indexing old data that no longer exists!" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "#export\n", 47 | "def join_all(db, new_files, new_embeddings):\n", 48 | " start = len(db)\n", 49 | " for i, file in enumerate(new_files):\n", 50 | " path, slug = file\n", 51 | " index = i + start\n", 52 | " db[index] = {\n", 53 | " 'slug': slug,\n", 54 | " 'fpath': path,\n", 55 | " 'embed': new_embeddings[i],\n", 56 | " }\n", 57 | " return(db)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import torch\n", 67 | "from pathlib import Path\n", 68 | "from memery.loader import get_image_files, db_loader, archive_loader\n", 69 | "from memery.crafter import crafter\n", 70 | "from memery.encoder import image_encoder\n", 71 | "\n", 72 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "root = Path('images/')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "filepaths = get_image_files(root)\n", 91 | "archive_db = {}\n", 92 | "\n", 93 | "\n", 94 | "archive_db, new_files = archive_loader(filepaths, root, device)\n", 95 | "print(f\"Loaded {len(archive_db)} encodings\")\n", 96 | "print(f\"Encoding {len(new_files)} new images\")\n", 97 | "\n", 98 | "crafted_files = crafter(new_files, device)\n", 99 | "new_embeddings = image_encoder(crafted_files, device)\n", 100 | "\n", 101 | "db = join_all(archive_db, new_files, new_embeddings)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# db = db_loader(root/'memery.pt',device)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "[o[0] for o in db.items()][:5]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "len(db)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Building treemap takes a long time. I don't think `annoy` uses the GPU at all?" 
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "#export\n", 145 | "from annoy import AnnoyIndex" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "#export\n", 155 | "def build_treemap(db):\n", 156 | " treemap = AnnoyIndex(512, 'angular')\n", 157 | " for k, v in db.items():\n", 158 | " treemap.add_item(k, v['embed'])\n", 159 | "\n", 160 | " # Build the treemap, with 5 trees rn\n", 161 | " treemap.build(5)\n", 162 | "\n", 163 | " return(treemap)\n", 164 | " " 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "t = build_treemap(db)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "t.get_n_items(), t.get_n_trees()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "#export\n", 192 | "import torch" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "#export\n", 202 | "def save_archives(root, treemap, db):\n", 203 | " dbpath = root/'memery.pt'\n", 204 | " if dbpath.exists():\n", 205 | "# dbpath.rename(root/'memery-bak.pt')\n", 206 | " dbpath.unlink()\n", 207 | " torch.save(db, dbpath)\n", 208 | " \n", 209 | " treepath = root/'memery.ann'\n", 210 | " if treepath.exists():\n", 211 | "# treepath.rename(root/'memery-bak.ann')\n", 212 | " treepath.unlink()\n", 213 | " treemap.save(str(treepath))\n", 214 | " \n", 215 | " return(str(dbpath), str(treepath))" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "save_archives(root, t, db)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [] 233 | } 234 | ], 235 | "metadata": { 236 | "kernelspec": { 237 | "display_name": "Python 3", 238 | "language": "python", 239 | "name": "python3" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 4 244 | } 245 | -------------------------------------------------------------------------------- /memery/core.py: -------------------------------------------------------------------------------- 1 | # Builtins 2 | import time 3 | from pathlib import Path 4 | import logging 5 | 6 | # Dependencies 7 | import torch 8 | from torch import Tensor, device 9 | from torchvision.transforms import Compose 10 | from PIL import Image 11 | 12 | 13 | # Local imports 14 | from memery import loader, crafter, encoder, indexer, ranker 15 | 16 | class Memery(): 17 | def __init__(self, root: str = '.'): 18 | self.index_file = 'memery.ann' 19 | self.db_file = 'memery.pt' 20 | self.root = root 21 | self.index = None 22 | self.db = None 23 | self.model = None 24 | self.device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu") 25 | print(f"Using {self.device} for computation.") 26 | 27 | def index_flow(self, root: str, num_workers=0) -> tuple[str, str]: 28 | '''Indexes images in path, returns the location of save files''' 29 | 30 | start_time = time.time() 31 | if self.root != root: 32 | self.root = 
root 33 | self.reset_state() 34 | 35 | path = Path(root) 36 | if not path.is_dir(): 37 | logging.error("Invalid path: %s", root) 38 | return 39 | device = self.device 40 | 41 | # Check if we should re-index the files 42 | print("Checking files...") 43 | dbpath = path/self.db_file 44 | db = self.get_db(str(dbpath)) 45 | treepath = path/self.index_file 46 | treemap = self.get_index(str(treepath)) 47 | filepaths = loader.get_valid_images(path) 48 | 49 | db_set = set([o['hash'] for o in db.values()]) 50 | fp_set = set([o for _, o in filepaths]) 51 | 52 | if treemap == None or db_set != fp_set: 53 | archive_db = {} 54 | 55 | archive_db, new_files = loader.archive_loader(filepaths, db) 56 | print(f"Loaded {len(archive_db)} encodings") 57 | print(f"Encoding {len(new_files)} new images") 58 | 59 | # Crafting and encoding 60 | crafted_files = crafter.crafter(new_files, device, num_workers=num_workers) 61 | model = self.get_model() 62 | new_embeddings = encoder.image_encoder(crafted_files, device, model) 63 | 64 | # Reindexing 65 | db = indexer.join_all(archive_db, new_files, new_embeddings) 66 | print("Building treemap") 67 | treemap = indexer.build_treemap(db) 68 | 69 | print(f"Saving {len(db)} encodings") 70 | save_paths = indexer.save_archives(path, treemap, db) 71 | 72 | else: 73 | save_paths = (str(dbpath), str(treepath)) 74 | self.reset_state() 75 | print(f"Done in {time.time() - start_time} seconds") 76 | 77 | return(save_paths) 78 | 79 | def query_flow(self, root: str, query: str=None, negative_query: str=None, image_query: str=None, reindex: bool=False) -> list[str]: 80 | ''' 81 | Indexes a folder and returns file paths ranked by query. 82 | 83 | Parameters: 84 | path (str): Folder to search 85 | query (str): Positive search query text 86 | negative_query (str): Negative search query text 87 | image_query (Tensor): Search query image(s) 88 | reindex (bool): Reindex the folder if True 89 | Returns: 90 | list of file paths ranked by query 91 | ''' 92 | start_time = time.time() 93 | 94 | if self.root != root: 95 | self.root = root 96 | self.reset_state() 97 | path = Path(root) 98 | if not path.is_dir(): 99 | logging.error("Invalid path: %s", root) 100 | return 101 | device = self.device 102 | 103 | dbpath = path/self.db_file 104 | treepath = path/self.index_file 105 | treemap = self.get_index(treepath) 106 | db = self.get_db(dbpath) 107 | 108 | # Rebuild the tree if it doesn't exist 109 | if reindex==True or len(db) == 0 or treemap == None: 110 | print('Indexing') 111 | dbpath, treepath = self.index_flow(path) 112 | self.reset_state() 113 | treemap = self.get_index(treepath) 114 | db = self.get_db(dbpath) 115 | 116 | model = self.get_model() 117 | # Convert queries to vector 118 | print('Converting query') 119 | if image_query: 120 | image_query = Image.open(image_query).convert('RGB') 121 | img = crafter.preproc(image_query) 122 | if query and image_query: 123 | text_vec = encoder.text_encoder(query, device, model) 124 | image_vec = encoder.image_query_encoder(img, device, model) 125 | query_vec = text_vec + image_vec 126 | elif query: 127 | query_vec = encoder.text_encoder(query, device, model) 128 | if negative_query: 129 | negative_query_vec = encoder.text_encoder(negative_query, self.device, model) 130 | query_vec = query_vec - negative_query_vec # Subtract negative query vector from positive query vector 131 | elif image_query: 132 | query_vec = encoder.image_query_encoder(img, device, model) 133 | else: 134 | print('No query!') 135 | return "" 136 | 137 | # Rank db by query 138 | 
print(f"Searching {len(db)} images") 139 | indexes = ranker.ranker(query_vec, treemap) 140 | ranked_files = ranker.nns_to_files(db, indexes) 141 | print(f"Done in {time.time() - start_time} seconds") 142 | 143 | return(ranked_files) 144 | 145 | def clean(self, root: str) -> None: 146 | ''' 147 | Removes all files produced by Memery 148 | ''' 149 | path = Path(root) 150 | if not path.is_dir(): 151 | logging.error("Invalid path: %s", root) 152 | db_path = path/Path(self.db_file) 153 | treemap_path = path/Path(self.index_file) 154 | db_path.unlink(missing_ok=True), treemap_path.unlink(missing_ok=True) 155 | 156 | def get_model(self): 157 | ''' 158 | Gets a new clip model if not initialized 159 | ''' 160 | if self.model == None: 161 | self.model = encoder.load_model(self.device) 162 | return self.model 163 | 164 | def get_index(self, treepath: str): 165 | ''' 166 | Gets a new index if not initialized 167 | 168 | Parameters: 169 | path (str): Path to index 170 | ''' 171 | if self.index == None: 172 | self.index = loader.treemap_loader(treepath) 173 | return self.index 174 | 175 | def get_db(self, dbpath: str): 176 | ''' 177 | Gets a new db if not initialized 178 | 179 | Parameters: 180 | path (str): Path to db 181 | ''' 182 | if self.db == None: 183 | self.db = loader.db_loader(dbpath, self.device) 184 | return self.db 185 | 186 | def reset_state(self) -> None: 187 | ''' 188 | Resets the index and db 189 | ''' 190 | self.index = None 191 | self.db = None 192 | -------------------------------------------------------------------------------- /notebooks/09_streamlit_app.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#default_exp streamlit_app" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Streamlit app" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Streamlit is a more convenient way to activate a quick user-facing GUI than Voila was, especially because of Voila having conflicting dependencies with nbdev.\n", 34 | "\n", 35 | "However, Streamlit wants a `.py` file instead of a notebook for development. This is kind of annoying, because to get the hot-reload effect from Streamlit we have to develop outside the notebook, but to maintain documentation (and compile with everything else) we have to keep the main source of truth right here. Perhaps a solution will present itself later; meanwhile, I have been using a scratch file `streamlit-app.py` for development and then copied it back here." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "This is a workaround for the query_flow printing to stdout. Maybe it should be handled natively in Streamlit? 
" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "#export \n", 52 | "import streamlit as st\n", 53 | "from memery import core\n", 54 | "\n", 55 | "from pathlib import Path\n", 56 | "from PIL import Image\n", 57 | "\n", 58 | "from streamlit.report_thread import REPORT_CONTEXT_ATTR_NAME\n", 59 | "from threading import current_thread\n", 60 | "from contextlib import contextmanager\n", 61 | "from io import StringIO\n", 62 | "import sys" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "#export \n", 72 | "@contextmanager\n", 73 | "def st_redirect(src, dst):\n", 74 | " placeholder = st.empty()\n", 75 | " output_func = getattr(placeholder, dst)\n", 76 | "\n", 77 | " with StringIO() as buffer:\n", 78 | " old_write = src.write\n", 79 | "\n", 80 | " def new_write(b):\n", 81 | " if getattr(current_thread(), REPORT_CONTEXT_ATTR_NAME, None):\n", 82 | " buffer.write(b + '')\n", 83 | " output_func(buffer.getvalue() + '')\n", 84 | " else:\n", 85 | " old_write(b)\n", 86 | "\n", 87 | " try:\n", 88 | " src.write = new_write\n", 89 | " yield\n", 90 | " finally:\n", 91 | " src.write = old_write\n", 92 | "\n", 93 | "\n", 94 | "@contextmanager\n", 95 | "def st_stdout(dst):\n", 96 | " with st_redirect(sys.stdout, dst):\n", 97 | " yield\n", 98 | "\n", 99 | "\n", 100 | "@contextmanager\n", 101 | "def st_stderr(dst):\n", 102 | " with st_redirect(sys.stderr, dst):\n", 103 | " yield" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Trying to make good use of streamlit's caching service here; if the search query and folder are the same as a previous search, it will serve the cached version. Might present some breakage points though, yet to see." 
111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "#export\n", 120 | "@st.cache\n", 121 | "def send_image_query(path, text_query, image_query):\n", 122 | " ranked = core.query_flow(path, text_query, image_query=img)\n", 123 | " return(ranked)\n", 124 | "\n", 125 | "@st.cache\n", 126 | "def send_text_query(path, text_query):\n", 127 | " ranked = core.query_flow(path, text_query)\n", 128 | " return(ranked)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "This is the sidebar content" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "#export\n", 145 | "st.sidebar.title(\"Memery\")\n", 146 | "\n", 147 | "path = st.sidebar.text_input(label='Directory', value='./images')\n", 148 | "text_query = st.sidebar.text_input(label='Text query', value='')\n", 149 | "image_query = st.sidebar.file_uploader(label='Image query')\n", 150 | "im_display_zone = st.sidebar.beta_container()\n", 151 | "logbox = st.sidebar.beta_container()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "The image grid parameters" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "#export\n", 168 | "sizes = {'small': 115, 'medium':230, 'large':332, 'xlarge':600}\n", 169 | "\n", 170 | "l, m, r = st.beta_columns([4,1,1])\n", 171 | "with l:\n", 172 | " num_images = st.slider(label='Number of images',value=12)\n", 173 | "with m:\n", 174 | " size_choice = st.selectbox(label='Image width', options=[k for k in sizes.keys()], index=1)\n", 175 | "with r:\n", 176 | " captions_on = st.checkbox(label=\"Caption filenames\", value=False)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "And the main event loop, triggered every time the query parameters change.\n", 184 | "\n", 185 | "This doesn't really work in Jupyter at all. Hope it does once it's compiled." 
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "#export\n", 195 | "if text_query or image_query:\n", 196 | " with logbox:\n", 197 | " with st_stdout('info'):\n", 198 | " if image_query is not None:\n", 199 | " img = Image.open(image_query).convert('RGB')\n", 200 | " with im_display_zone:\n", 201 | " st.image(img)\n", 202 | " ranked = send_image_query(path, text_query, image_query)\n", 203 | " else:\n", 204 | " ranked = send_text_query(path, text_query)\n", 205 | " ims = [Image.open(o).convert('RGB') for o in ranked[:num_images]]\n", 206 | " names = [o.replace(path, '') for o in ranked[:num_images]]\n", 207 | "\n", 208 | " if captions_on:\n", 209 | " images = st.image(ims, width=sizes[size_choice], channels='RGB', caption=names)\n", 210 | " else:\n", 211 | " images = st.image(ims, width=sizes[size_choice], channels='RGB')" 212 | ] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 4 224 | } 225 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Memery 2 | > Use human language to search your image folders! 3 | 4 | ## What is memery? 5 | 6 | ![meme about having too many memes](images/E2GoeMyWEAAkcLz.jpeg) 7 | 8 | The problem: you have a huge folder of images. Memes, screenshots, datasets, product photos, inspo albums, anything. You know that somewhere in that folder is the exact image you want, but you can't remember the filename or what day you saved it. There's nothing you can do but scroll through the folder, skimming hundreds of thumbnails, hoping you don't accidentally miss it, and that you'll recognize it when you do see it. 9 | 10 | Humans do this amazingly well. But even with computers, local image search is still a manual effort - you're still sorting through folders of images, like an archivist of old. 11 | 12 | **Now there's Memery**. 13 | 14 | The `memery` package provides natural language search over local images. You can use it to search for things like "a line drawing of a woman facing to the left" and get _reasonably good results!_ 15 | 16 | You can do this over thousands of images (it's not optimized for performance yet, but search times scale well under O(n)). 17 | 18 | You can view the images in a browser GUI, or pipe them through command line tools. 19 | 20 | You can use `memery` or its modules in Jupyter notebooks, including GUI functions! 21 | 22 | Under the hood, `memery` makes use of **CLIP**, the [Contrastive Language-Image Pretraining transformer](https://github.com/openai/CLIP), released by OpenAI in 2021. CLIP trains a vision transformer and a language transformer to find the same latent space for images and their captions. This makes it perfect for the purpose of natural language image search. CLIP is a giant achievement, and `memery` stands on its shoulders. 
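
To make the shared-latent-space idea concrete, here is a minimal sketch that scores one of this repo's example memes against two arbitrary captions, using the CLIP library directly. This is an illustration of the underlying mechanism only, not memery's own code path (memery wraps these calls in its loader, crafter, and encoder modules):

```
import clip
import torch
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Embed one image and two candidate captions into the shared latent space
image = preprocess(Image.open("images/memes/Wholesome-Meme-68.jpg")).unsqueeze(0).to(device)
texts = clip.tokenize(["a funny dog meme", "a sad cat meme"]).to(device)

with torch.no_grad():
    image_vec = model.encode_image(image)
    text_vecs = model.encode_text(texts)

# Normalize so the dot product is cosine similarity; the caption with the
# higher score is the closer description of the image
image_vec = image_vec / image_vec.norm(dim=-1, keepdim=True)
text_vecs = text_vecs / text_vecs.norm(dim=-1, keepdim=True)
print(image_vec @ text_vecs.T)
```

Memery runs this in the other direction — one text query scored against thousands of stored image vectors — and caches the image embeddings in an Annoy index so each search only has to encode the query.
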
23 | 
24 | Outline:
25 | - Usage
26 | - Install locally
27 | - Use GUI
28 | - Use CLI
29 | - Use the library
30 | - Development
31 | - Contributing
32 | - Who works on this project
33 | 
34 | ## Quickstart (Windows)
35 | - Run the `windows-install.bat` file
36 | - Run the `windows-run.bat` file
37 | 
38 | ## Installation
39 | 
40 | With Python 3.9 or greater:
41 | 
42 | From GitHub (recommended):
43 | ```
44 | pip install git+https://github.com/deepfates/memery.git
45 | ```
46 | or
47 | ```
48 | git clone https://github.com/deepfates/memery.git
49 | cd memery
50 | poetry install
51 | ```
52 | From PyPI:
53 | ```
54 | pip install memery
55 | pip install git+https://github.com/openai/CLIP.git
56 | ```
57 | 
58 | Currently memery defaults to GPU installation. This will
59 | probably be switched in a future version.
60 | 
61 | For now, if you want to run CPU-only, run the following command after installing memery:
62 | 
63 | `pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html`
64 | 
65 | Someday memery will be packaged in an easier-to-use format, but since this is a Python project it is hard to predict when that day will be.
66 | 
67 | If you want to help develop memery, you'll need to clone the repo. See below.
68 | 
69 | ## Usage
70 | 
71 | What's your use case?
72 | 
73 | **I have images and want to search them with a GUI app**
74 | 
75 | ↳ Use the Browser GUI
76 | 
77 | **I have a program/workflow and want to use image search as one part of it**
78 | 
79 | ↳ Use as a Python module
80 | 
81 | ↳ Use from command line or shell scripts
82 | 
83 | **I want to improve on and/or contribute to memery development**
84 | 
85 | ↳ Start by cloning the repo
86 | 
87 | ### Use GUI
88 | 
89 | Currently memery has a rough browser-based GUI. To launch it, run the following in a command line:
90 | 
91 | ```memery serve```
92 | 
93 | or set up a desktop shortcut that points to the above command.
94 | 
95 | Optionally, you can pass a directory to open on startup, like so:
96 | 
97 | ```memery serve /home/user/Pictures/memes```
98 | 
99 | Relative directories will also work:
100 | 
101 | ```
102 | cd ~/Pictures
103 | memery serve memes
104 | ```
105 | 
106 | The default directory passed will be `./images`, which is memery's example meme directory.
107 | 
108 | Memery will open in a browser window. The interface is pretty straightforward, but it has some quirks.
109 | 
110 | ![screenshot of memery GUI displaying wholesome memes](images/streamlit-screenshot.png)
111 | 
112 | The sidebar on the left controls the location and query for the search. The "Directory" box requires a full directory path; unfortunately, Streamlit does not yet have a folder-picker component. The path is relative to your current working directory when you run `memery serve`.
113 | 
114 | The search will run once you enter a text or image query. If you enter both text and image queries, memery will search for the combination.
115 | 
116 | Beneath these widgets is the output area for temporary messages displayed with each search. Mostly this can be ignored.
117 | 
118 | The right-hand panel displays the images and associated options. Major errors will appear here as giant stack traces; sometimes, changing variables in the other widgets will fix these errors live. If you get a large error here it's helpful to take a screenshot and share it with us in GitHub Issues.
119 | 
120 | ### Use CLI
121 | 
122 | The memery command line matches the core functionality of the library.
123 | 
124 | Use the `recall` command to search for images, passing the path and optionally passing the -n flag to control how many images are returned (default 10). Use the -t flag to pass a text query, the -i flag to pass an image query, or both:
125 | 
126 | ```
127 | memery recall PATH/TO/IMAGE/FOLDER -t 'text_query' -i 'PATH/TO/IMAGE.jpg' -n 20
128 | ```
129 | 
130 | You can encode and index all the images with the `build` command, optionally specifying the number of workers to build the dataset with (default 0):
131 | 
132 | ```
133 | memery build PATH/TO/IMAGE/FOLDER --workers 4
134 | ```
135 | 
136 | Clear out the encodings and index using the `purge` command:
137 | 
138 | ```
139 | memery purge PATH/TO/IMAGE/FOLDER
140 | ```
141 | 
142 | ### Use as a library
143 | 
144 | The core functionality of memery is wrapped in the `Memery` class:
145 | 
146 | ```
147 | from memery.core import Memery
148 | memery = Memery()
149 | ```
150 | 
151 | The method currently called `query_flow` accepts a folder name and a query and returns a ranked list of image files. You can query with text, a filepath to an image, or both.
152 | 
153 | 
154 | ```
155 | ranked = memery.query_flow('./images', 'dad joke')
156 | 
157 | print(ranked[:5])
158 | ```
159 | ```
160 | Converting query
161 | Searching 82 images
162 | Done in 4.014755964279175 seconds
163 | ['images/memes/Wholesome-Meme-68.jpg', 'images/memes/Wholesome-Meme-74.jpg', 'images/memes/Wholesome-Meme-88.jpg', 'images/memes/Wholesome-Meme-78.jpg', 'images/memes/Wholesome-Meme-23.jpg']
164 | ```
165 | 
166 | Here's the first result from that list:
167 | 
168 | ![](images/memes/Wholesome-Meme-68.jpg)
169 | 
170 | 
171 | So that's how to use memery. Let's look at how you can help make it better.
172 | 
173 | ## Development
174 | 
175 | ### Pull the repo
176 | 
177 | Clone this repository from GitHub:
178 | 
179 | `git clone https://github.com/deepfates/memery.git`
180 | 
181 | 
182 | ### Install dependencies and memery
183 | Enter the `memery` folder and install requirements:
184 | 
185 | ```
186 | cd memery
187 | poetry install
188 | ```
189 | 
190 | And finally install your local, editable copy of memery with
191 | 
192 | `pip install -e .`
193 | 
194 | ## Contributing
195 | 
196 | Memery is open source and you can contribute. See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on how you can help.
197 | 
198 | ### Who works on this project
199 | 
200 | Memery was first written by Max Anton Brewer aka @deepfates in the summer of 2021. Some commits are listed from @robotface-io, but that was just me using the wrong account when I first started.
201 | 
202 | Many UI and back-end improvements were added by @wkrettek in 2022! 🙌🎉🌟
203 | 
204 | I wrote this to solve my own needs and learn notebook-based development. I hope it helps other people too. If you can help me make it better, please do. I welcome any contribution, guidance or criticism.
205 | 
206 | **The ideal way to get support is to open an issue on GitHub**. However, the *fastest* way to get a response from me is probably to [direct message me on Twitter](https://twitter.com/deepfates).
207 | 208 | -------------------------------------------------------------------------------- /notebooks/02_crafter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#default_exp crafter" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Crafter\n", 27 | "\n", 28 | "Takes a list of image filenames and transforms them to batches of the correct dimensions for CLIP. \n", 29 | "\n", 30 | "This executor subclasses PyTorch's VisionDataset (for its file-loading expertise) and DataLoaders. The `DatasetImagePaths` takes a list of image paths and a transfom, returns the transformed tensors when called. DataLoader does batching internally so we pass it along to the encoder in that format.\n", 31 | "\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "#export\n", 41 | "import torch\n", 42 | "from torchvision.datasets import VisionDataset\n", 43 | "from PIL import Image\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "#export\n", 53 | "def make_dataset(new_files):\n", 54 | " '''Returns a list of samples of a form (path_to_sample, class) and in \n", 55 | " this case the class is just the filename'''\n", 56 | " samples = []\n", 57 | " slugs = []\n", 58 | " for i, f in enumerate(new_files):\n", 59 | " path, slug = f\n", 60 | " samples.append((str(path), i))\n", 61 | " slugs.append((slug, i))\n", 62 | " return(samples, slugs)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "#export\n", 72 | "def pil_loader(path: str) -> Image.Image:\n", 73 | " # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)\n", 74 | " with open(path, 'rb') as f:\n", 75 | " img = Image.open(f)\n", 76 | " return img.convert('RGB')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "#export\n", 86 | "class DatasetImagePaths(VisionDataset):\n", 87 | " def __init__(self, new_files, transforms = None):\n", 88 | " super(DatasetImagePaths, self).__init__(new_files, transforms=transforms)\n", 89 | " samples, slugs = make_dataset(new_files)\n", 90 | " self.samples = samples\n", 91 | " self.slugs = slugs\n", 92 | " self.loader = pil_loader\n", 93 | " self.root = 'file dataset'\n", 94 | " def __len__(self):\n", 95 | " return(len(self.samples))\n", 96 | " \n", 97 | " def __getitem__(self, index):\n", 98 | " path, target = self.samples[index]\n", 99 | " sample = self.loader(path)\n", 100 | " if sample is not None:\n", 101 | " if self.transforms is not None:\n", 102 | " sample = self.transforms(sample)\n", 103 | " return sample, target" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "new_files = [('images/memes/Wholesome-Meme-8.jpg', 'Wholesome-Meme-8'), ('images/memes/Wholesome-Meme-1.jpg', 'Wholesome-Meme-1')]#, 
('images/corrupted-file.jpeg', 'corrupted-file.jpeg')]" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "crafted = DatasetImagePaths(new_files)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "crafted[0][0]" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "Okay, that seems to work decently. Test with transforms, which I will just find in CLIP source code and copy over, to prevent having to import CLIP in this executor." 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "#export\n", 147 | "from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "#export\n", 157 | "def clip_transform(n_px):\n", 158 | " return Compose([\n", 159 | " Resize(n_px, interpolation=Image.BICUBIC),\n", 160 | " CenterCrop(n_px),\n", 161 | " ToTensor(),\n", 162 | " Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),\n", 163 | " ])" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Put that all together, and wrap in a DataLoader for batching. In future, need to figure out how to pick batch size and number of workers programmatically bsed on device capabilities." 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "#export\n", 180 | "def crafter(new_files, device, batch_size=128, num_workers=4): \n", 181 | " with torch.no_grad():\n", 182 | " imagefiles=DatasetImagePaths(new_files, clip_transform(224))\n", 183 | " img_loader=torch.utils.data.DataLoader(imagefiles, batch_size=batch_size, shuffle=False, num_workers=num_workers)\n", 184 | " return(img_loader)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 194 | "device" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "crafted_files = crafter(new_files, device)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "crafted_files.batch_size, crafted_files.num_workers" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "file = new_files[1][0]" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "#export\n", 231 | "def preproc(img):\n", 232 | " transformed = clip_transform(224)(img)\n", 233 | " return(transformed)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "im = preproc([Image.open(file)][0])" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 
249 | "outputs": [], 250 | "source": [ 251 | "# %matplotlib inline" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "# show_image(im)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [] 269 | } 270 | ], 271 | "metadata": { 272 | "kernelspec": { 273 | "display_name": "Python 3", 274 | "language": "python", 275 | "name": "python3" 276 | } 277 | }, 278 | "nbformat": 4, 279 | "nbformat_minor": 4 280 | } 281 | -------------------------------------------------------------------------------- /notebooks/01_loader.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#default_exp loader" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Loader\n", 27 | "> Functions for finding and loading image files and saved embeddings\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## File manipulation" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#export\n", 44 | "from pathlib import Path\n", 45 | "from PIL import Image\n", 46 | "from tqdm import tqdm" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "**NB: A lot of this implementation is too specific, especially the slugified filenames being used for dictionary IDs. 
Should be replaced with a better database implementation.**" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "#export\n", 63 | "def slugify(filepath):\n", 64 | " return f'{filepath.stem}_{str(filepath.stat().st_mtime).split(\".\")[0]}'\n", 65 | "\n", 66 | "def get_image_files(path):\n", 67 | " img_extensions = {'.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp'}\n", 68 | " return [(f, slugify(f)) for f in tqdm(path.rglob('*')) if f.suffix in img_extensions]\n", 69 | "\n", 70 | "def get_valid_images(path):\n", 71 | " filepaths = get_image_files(path)\n", 72 | " return [f for f in filepaths if verify_image(f[0])]\n", 73 | "\n", 74 | "# This returns a boolean and should be called is_valid_image or something like that\n", 75 | "def verify_image(f):\n", 76 | " try:\n", 77 | " img = Image.open(f)\n", 78 | " img.verify() \n", 79 | " return(True)\n", 80 | " except Exception as e:\n", 81 | " print(f'Skipping bad file: {f}\\ndue to {type(e)}')\n", 82 | " return(False)\n", 83 | " " 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "Demonstrating the usage here, not a great test though:" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "root = Path('./images')\n", 100 | "\n", 101 | "\n", 102 | "filepaths = get_image_files(root)\n", 103 | "\n", 104 | "len(filepaths)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "filepaths[:3]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Loaders\n", 121 | "\n", 122 | "So we have a list of paths and slugified filenames from the folder. We want to see if there's an archive, so that we don't have to recalculate tensors for images we've seen before. Then we want to pass that directly to the indexer, but send the new images through the crafter and encoder first.\n", 123 | "\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "#export\n", 133 | "import torch\n", 134 | "import torchvision" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "We want to use the GPU, if possible, for all the PyTorch functions. But if we can't get access to it we need to fall back to CPU. Either way we call it `device` and pass it to each function in the executors that use torch." 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "#export\n", 151 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 152 | "device" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "The `archive_loader` is only called in `indexFlow`. 
It takes the list of image files and the folder they're in (and the torch device), opens an archive if there is one, and splits the files into already-archived entries and new files that still need encoding." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "#export\n", 169 | "def archive_loader(filepaths, root, device):\n", 170 | " dbpath = root/'memery.pt'\n", 171 | "# dbpath_backup = root/'memery.pt'\n", 172 | " db = db_loader(dbpath, device)\n", 173 | " \n", 174 | " current_slugs = [slug for path, slug in filepaths] \n", 175 | " archive_db = {i:db[item[0]] for i, item in enumerate(db.items()) if item[1]['slug'] in current_slugs} \n", 176 | " archive_slugs = [v['slug'] for v in archive_db.values()]\n", 177 | " new_files = [(str(path), slug) for path, slug in filepaths if slug not in archive_slugs and verify_image(path)]\n", 178 | " \n", 179 | " return(archive_db, new_files)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "The `db_loader` takes a location and returns either the archive dictionary or an empty dictionary. Decomposed to its own function so it can be called separately from `archive_loader` or `queryFlow`. " 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "#export\n", 196 | "def db_loader(dbpath, device):\n", 197 | "\n", 198 | " # check for savefile or backup and extract\n", 199 | " if Path(dbpath).exists():\n", 200 | " db = torch.load(dbpath, device)\n", 201 | "# elif dbpath_backup.exists():\n", 202 | "# db = torch.load(dbpath_backup)\n", 203 | " else:\n", 204 | " db = {}\n", 205 | " return(db)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "The library `annoy`, [Approximate Nearest Neighbors Oh Yeah!](https://github.com/spotify/annoy), allows us to search through vector space for approximate matches instead of exact best-similarity matches. We sacrifice a little accuracy for a lot of speed: lookup time grows far more slowly than the number of images, so searching tens of thousands of images takes only a little longer than searching tens."
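To make that tradeoff concrete, here is a minimal sketch of the Annoy calls involved, using the same 512-dimension angular configuration that `treemap_loader` uses below. The vectors and tree count here are arbitrary illustration values, not memery's.

```python
# Minimal Annoy round trip -- illustration only; the data is random.
import random
from annoy import AnnoyIndex

index = AnnoyIndex(512, 'angular')   # same dimensionality/metric as memery
for i in range(10_000):
    index.add_item(i, [random.gauss(0, 1) for _ in range(512)])
index.build(20)                      # more trees: better recall, slower build

query = [random.gauss(0, 1) for _ in range(512)]
neighbors = index.get_nns_by_vector(query, 10)  # approximate top-10, very fast
```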
213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "#export\n", 222 | "from annoy import AnnoyIndex" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "#export\n", 232 | "def treemap_loader(treepath):\n", 233 | " treemap = AnnoyIndex(512, 'angular')\n", 234 | "\n", 235 | " if treepath.exists():\n", 236 | " treemap.load(str(treepath))\n", 237 | " else:\n", 238 | " treemap = None\n", 239 | " return(treemap)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "treepath = Path('images/memery.ann')" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "treemap = AnnoyIndex(512, 'angular')" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "if treepath.exists():\n", 267 | " treemap.load(str(treepath))\n", 268 | "else:\n", 269 | " treemap = None" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Here we just test on the local image folder" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "archive_db, new_files = archive_loader(get_image_files(root), root, device)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "len(archive_db), len(new_files), treemap.get_n_items()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "dbpath = root/'memery.pt'\n", 304 | "# dbpath_backup = root/'memery.pt'\n", 305 | "db = db_loader(dbpath, device)\n", 306 | "\n", 307 | "current_slugs = [slug for path, slug in filepaths] " 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "archive_db = {i:db[item[0]] for i, item in enumerate(db.items()) if item[1]['slug'] in current_slugs} " 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "len(archive_db)" 326 | ] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "Python 3", 332 | "language": "python", 333 | "name": "python3" 334 | } 335 | }, 336 | "nbformat": 4, 337 | "nbformat_minor": 4 338 | } 339 | -------------------------------------------------------------------------------- /notebooks/08_jupyter_gui.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#default_exp gui" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# GUI" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | 
"metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "#export\n", 36 | "import ipywidgets as widgets\n", 37 | "\n", 38 | "from memery.core import query_flow\n", 39 | "from pathlib import Path\n", 40 | "from IPython.display import clear_output\n", 41 | "\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## App design\n", 49 | "\n", 50 | "So what zones do we need for a proper image search app? Two examples come to mind: https://same.energy and https://images.google.com. One is minimalist and brutalist while the other is maximalist in features and refined in design.\n", 51 | "\n", 52 | "Same.energy proves that all you need for image search is a text box, a button, and images. (At least, that's how it started off, and sometimes how it is today. They're A/B testing heavily right now, and we'll see what it evolves into.) If you click on an image result, you are now searching for that image. If you add text, it asks if you want to search for the image with text or just the image. This can lead in any hill-climbing direction the user wants, I suppose. \n", 53 | "\n", 54 | "Google Images has up to six toolbars overhanging the images, and a complicated lightbox selection window that shows the individual image with a subset of similar images below it. Nested and stacked, providing lots of specific search and filtering capabilities. Not as likely to induce a wikiwalk. They've introduced \"collections\" now, which are presumably meant to replace the \"download to random image folder\" functionality of current browsers.\n", 55 | "\n", 56 | "There's also Pinterest, of course, though their engineering is geared more toward gaming Google results than finding the right image by search. Thye have a great browse mode though, and save features. Best of all, they have a goodreads-style user tagging function that allows for a whole different way of sorting images than availableon the other sites.\n", 57 | "\n", 58 | "The functions available from these sites include:\n", 59 | "\n", 60 | "- Text query\n", 61 | "- Image query\n", 62 | "- Text and image query (totally doable with CLIP vectors)\n", 63 | "- Browse visually similar images\n", 64 | "- Save images (to cloud mostly)\n", 65 | "- Filter images by:\n", 66 | " - Size\n", 67 | " - Color\n", 68 | " - Type\n", 69 | " - Time\n", 70 | " - Usage rights\n", 71 | "- Visit homepage for image\n", 72 | "- Tagging images\n", 73 | "- Searching by tags additively\n", 74 | "- Filtering out by tags\n", 75 | "\n", 76 | "Tags and filter categories can both be simulated with CLIP vectors of text tokens like \"green\" or \"noisy\" or \"illustration\" or \"menswear\". Size of image can be inferred directly from filesize or recorded from bitmap data in the `crafter`. Images as search queries and visually similar image browser are the same function but in different user interaction modes. And image links can be to local files, rather than homepages. Saving images not as relevant in this context, though easily sending them somewhere else is. \n", 77 | "\n", 78 | "Thus there are really three projects here:\n", 79 | "- Basic app functionality with search and grid\n", 80 | "- Visually simillar image browsing and search\n", 81 | "- Tagging and filtering, auto and manual\n", 82 | "\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Basic app functionality\n", 90 | "\n", 91 | "We want a unified search bar (variable inputs and a button) and an image grid. 
And each search should remain accessible after it's run, so we can navigate between and compare. It would be nice to use browser-native navigation but for now, with the plan to run a notebook in Voila and serve locally, better to use `ipywidgets` Tabs mode. Eventually it would also be good to replace or upgrade `ipyplot` for better navigation, but first we should sketch out the new-tab functionality.\n", 92 | "\n", 93 | "We need a tabs output, an event loop, and a dictionary of searches run, with each search returning a list of filenames to be printed in a sub-output within the tab. All wrapped in a VBox with the inputs.\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "filepaths = ['images/memes/Wholesome-Meme-8.jpg', 'images/memes/Wholesome-Meme-1.jpg']" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "#export\n", 112 | "def get_image(file_loc):\n", 113 | " filepath = Path(file_loc)\n", 114 | " with open(filepath, 'rb') as file:\n", 115 | " image = widgets.Image(value=file.read(), width=200)\n", 116 | " \n", 117 | " return(image)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "display(get_image(filepaths[0]))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "imgs = [get_image(f) for f in filepaths]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "#export\n", 145 | "def get_grid(filepaths, n=4):\n", 146 | " imgs = [get_image(f) for f in filepaths[:n] if Path(f).exists()]\n", 147 | " grid = widgets.GridBox(imgs, layout=widgets.Layout(grid_template_columns=\"repeat(auto-fit, 200px)\"))\n", 148 | " return(grid)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "get_grid(filepaths)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "#export\n", 167 | "from PIL import Image\n", 168 | "from io import BytesIO" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "#export\n", 178 | "def update_tabs(path, query, n_images, searches, tabs, logbox, im_display_zone, image_query=None):\n", 179 | " stem = Path(path.value).stem\n", 180 | " slug = f\"{stem}:{str(query.value)}\"\n", 181 | " if slug not in searches.keys():\n", 182 | " with logbox:\n", 183 | " print(slug)\n", 184 | " if image_query:\n", 185 | " im_queries = [name for name, data in image_query.items()]\n", 186 | " \n", 187 | " img = [Image.open(BytesIO(file_info['content'])).convert('RGB') for name, file_info in image_query.items()]\n", 188 | " ranked = query_flow(path.value, query.value, image_query=img[-1])\n", 189 | " slug = slug + f'/{im_queries}'\n", 190 | " \n", 191 | " if len(im_queries) > 0:\n", 192 | " with im_display_zone:\n", 193 | " clear_output()\n", 194 | " display(img[-1])\n", 195 | " else:\n", 196 | " ranked = query_flow(path.value, query.value)\n", 197 | " searches[f'{slug}'] = ranked\n", 198 | " \n", 199 | " tabs.children = [get_grid(v, n=n_images.value) 
for v in searches.values()]\n", 200 | " for i, k in enumerate(searches.keys()):\n", 201 | " tabs.set_title(i, k)\n", 202 | " tabs.selected_index = len(searches)-1\n", 203 | "\n", 204 | " \n", 205 | "# return(True)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "#export\n", 215 | "class appPage():\n", 216 | " \n", 217 | " def __init__(self):\n", 218 | " self.inputs_layout = widgets.Layout(max_width='80%')\n", 219 | "\n", 220 | " self.path = widgets.Text(placeholder='path/to/image/folder', value='images/', layout=self.inputs_layout)\n", 221 | " self.query = widgets.Text(placeholder='a funny dog meme', value='a funny dog meme', layout=self.inputs_layout)\n", 222 | " \n", 223 | " self.image_query = widgets.FileUpload()\n", 224 | " self.im_display_zone = widgets.Output(max_height='5rem')\n", 225 | "\n", 226 | " self.n_images = widgets.IntSlider(description='#', value=4, layout=self.inputs_layout)\n", 227 | " self.go = widgets.Button(description=\"Search\", layout=self.inputs_layout)\n", 228 | " self.logbox = widgets.Output(layout=widgets.Layout(max_width='80%', height=\"3rem\", overflow=\"none\"))\n", 229 | " self.all_inputs_layout = widgets.Layout(max_width='80vw', min_height='40vh', flex_flow='row wrap', align_content='flex-start')\n", 230 | "\n", 231 | " self.inputs = widgets.Box([self.path, self.query, self.image_query, self.n_images, self.go, self.im_display_zone, self.logbox], layout=self.all_inputs_layout)\n", 232 | " self.tabs = widgets.Tab()\n", 233 | " self.page = widgets.AppLayout(left_sidebar=self.inputs, center=self.tabs)\n", 234 | "\n", 235 | " self.searches = {}\n", 236 | " self.go.on_click(self.page_update)\n", 237 | " \n", 238 | " display(self.page)\n", 239 | "\n", 240 | " def page_update(self, b):\n", 241 | " \n", 242 | " update_tabs(self.path, self.query, self.n_images, self.searches, self.tabs, self.logbox, self.im_display_zone, self.image_query.value)\n", 243 | "\n", 244 | " \n" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "app = appPage()" 254 | ] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3", 260 | "language": "python", 261 | "name": "python3" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 4 266 | } 267 | -------------------------------------------------------------------------------- /notebooks/00_core.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# default_exp core" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Core\n", 27 | "\n", 28 | "> Index, query and save embeddings of images by folder" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Rationale\n", 36 | "\n", 37 | "**Memery takes a folder of images, and a search query, and returns a list of ranked images.**\n", 38 | "\n", 39 | "The images and query are both projected into a high-dimensional semantic space, courtesy of OpenAI's 
[CLIP](https://github.com/openai/CLIP) model ([blog post](https://openai.com/blog/clip/)). These embeddings are indexed and treemapped using the [Annoy](https://github.com/spotify/annoy) library, which provides nearest-neighbor results for the search query. These results are then transmitted to the user interface (currently as a list of file locations).\n", 40 | "\n", 41 | "We provide various interfaces for the end user, which all call upon the functions `query_flow` and `index_flow` below.\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Modular flow system" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Memery uses the Neural Search design pattern as described by Han Xiao in e.g. [General Neural Elastic Search and Go Way Beyond](https://hanxiao.io/2019/07/29/Generic-Neural-Elastic-Search-From-bert-as-service-and-Go-Way-Beyond)&c.\n", 56 | "\n", 57 | "This is a system designed to be scalable and distributed if necessary. Even for a single-machine scenario, I like the functional style of it: grab data, transform it and pass it downstream, all the way from the folder to the output widget.\n", 58 | "\n", 59 | "There are two main types of operator in this pattern: **flows** and **executors**.\n", 60 | "\n", 61 | "**Flows** are specific patterns of data manipulation and storage. **Executors** are the operators that transform the data within the flow. \n", 62 | "\n", 63 | "There are two core flows to any search system: indexing, and querying. The plan here is to make executors that can be composed into flows and then compose the flows into a UI that supports querying and, to some extent, indexing as well.\n", 64 | "\n", 65 | "The core executors for this use case are:\n", 66 | " - Loader\n", 67 | " - Crafter\n", 68 | " - Encoder\n", 69 | " - Indexer\n", 70 | " - Ranker\n", 71 | " - Gateway\n", 72 | " \n", 73 | "\n", 74 | "**NB: The executors are currently implemented as functions. A future upgrade will change the names to verbs to match, or change their implementation to classes if they're going to act as nouns.**\n", 75 | "\n", 76 | "These executors are being implemented ad hoc in the flow functions, but should probably be given single entry points and have their specific logic happen within their own files. Deeper abstractions with less coupling."
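One way to read that last point: each executor gets a single callable entry point, and a flow is just their composition. A hypothetical sketch of the refactor (the names and signatures here are invented, not memery's current code):

```python
# Hypothetical executor protocol -- a sketch of the refactor described above.
from typing import Callable

Executor = Callable[[dict], dict]  # each executor transforms a shared state dict

def compose_flow(*executors: Executor) -> Executor:
    '''Chain executors into a flow; each receives and returns the state dict.'''
    def flow(state: dict) -> dict:
        for execute in executors:
            state = execute(state)
        return state
    return flow

# e.g. index_flow = compose_flow(load, craft, encode, index)
#      query_flow = compose_flow(load, encode_query, rank)
```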
77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Flows" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "#export\n", 93 | "import time\n", 94 | "import torch\n", 95 | "\n", 96 | "from pathlib import Path\n", 97 | "from memery.loader import get_image_files, get_valid_images, archive_loader, db_loader, treemap_loader \n", 98 | "from memery.crafter import crafter, preproc\n", 99 | "from memery.encoder import image_encoder, text_encoder, image_query_encoder\n", 100 | "from memery.indexer import join_all, build_treemap, save_archives\n", 101 | "from memery.ranker import ranker, nns_to_files" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "#### Indexing" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "#export\n", 118 | "def index_flow(path):\n", 119 | " '''Indexes images in path, returns the location of save files'''\n", 120 | " root = Path(path)\n", 121 | " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 122 | " \n", 123 | " # Loading\n", 124 | " filepaths = get_image_files(root)\n", 125 | " archive_db = {}\n", 126 | " \n", 127 | " archive_db, new_files = archive_loader(filepaths, root, device)\n", 128 | " print(f\"Loaded {len(archive_db)} encodings\")\n", 129 | " print(f\"Encoding {len(new_files)} new images\")\n", 130 | "\n", 131 | " # Crafting and encoding\n", 132 | " crafted_files = crafter(new_files, device)\n", 133 | " new_embeddings = image_encoder(crafted_files, device)\n", 134 | " \n", 135 | " # Reindexing\n", 136 | " db = join_all(archive_db, new_files, new_embeddings)\n", 137 | " print(\"Building treemap\")\n", 138 | " t = build_treemap(db)\n", 139 | " \n", 140 | " print(f\"Saving {len(db)} encodings\")\n", 141 | " save_paths = save_archives(root, t, db)\n", 142 | "\n", 143 | " return(save_paths)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "show_doc(index_flow)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "We can index the local `images` folder to test" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "\n", 169 | "# delete the current savefile for testing purposes\n", 170 | "Path('images/memery.pt').unlink()\n", 171 | "Path('images/memery.ann').unlink()\n", 172 | "\n", 173 | "# run the index flow. 
returns the path\n", 174 | "save_paths = index_flow('./images')" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "assert save_paths # Returns True if the path exists\n", 184 | "save_paths" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "#### Querying" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "#export\n", 201 | "def query_flow(path, query=None, image_query=None):\n", 202 | " '''\n", 203 | " Searches an indexed folder and returns file paths ranked by query.\n", 204 | " \n", 205 | " Parameters:\n", 206 | " path (str): Folder to search\n", 207 | " query (str): Search query text\n", 208 | " image_query (Tensor): Search query image(s)\n", 209 | "\n", 210 | " Returns:\n", 211 | " list of file paths ranked by query\n", 212 | " '''\n", 213 | " start_time = time.time()\n", 214 | " root = Path(path)\n", 215 | " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 216 | " \n", 217 | " # Check if we should re-index the files\n", 218 | " print(\"Checking files\")\n", 219 | " dbpath = root/'memery.pt'\n", 220 | " db = db_loader(dbpath, device)\n", 221 | " treepath = root/'memery.ann'\n", 222 | " treemap = treemap_loader(treepath)\n", 223 | " filepaths = get_valid_images(root)\n", 224 | "\n", 225 | " # # Rebuild the tree if it doesn't exist\n", 226 | " # if treemap is None or len(db) != len(filepaths):\n", 227 | " # print('Indexing')\n", 228 | " # dbpath, treepath = index_flow(root)\n", 229 | " # treemap = treemap_loader(Path(treepath))\n", 230 | " # db = db_loader(dbpath, device)\n", 231 | " \n", 232 | " # Convert queries to vector\n", 233 | " print('Converting query')\n", 234 | " if image_query:\n", 235 | " img = preproc(image_query)\n", 236 | " if query and image_query:\n", 237 | " text_vec = text_encoder(query, device)\n", 238 | " image_vec = image_query_encoder(img, device)\n", 239 | " query_vec = text_vec + image_vec\n", 240 | " elif query:\n", 241 | " query_vec = text_encoder(query, device)\n", 242 | " elif image_query:\n", 243 | " query_vec = image_query_encoder(img, device)\n", 244 | " else:\n", 245 | " print('No query!')\n", " return([])\n", 246 | "\n", 247 | " # Rank db by query \n", 248 | " print(f\"Searching {len(db)} images\")\n", 249 | " indexes = ranker(query_vec, treemap)\n", 250 | " ranked_files = nns_to_files(db, indexes)\n", 251 | " \n", 252 | " print(f\"Done in {time.time() - start_time} seconds\")\n", 253 | " \n", 254 | " return(ranked_files)\n", 255 | "\n", 256 | " " 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "show_doc(query_flow)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "ranked = query_flow('./images', 'dog')\n", 275 | "\n", 276 | "print(ranked[0])\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "assert ranked[0] == \"images/memes/Wholesome-Meme-8.jpg\"" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "![](images/memes/Wholesome-Meme-8.jpg)\n", 293 | "\n", 294 | "*Then what?! 
What are the limitations of this system? What are its options? What configuration can I do if I'm a power user? Why did you organize things this way instead of a different way?*\n", 295 | "\n", 296 | "*This, and probably each of the following notebooks, would benefit from a small recording session where I try to explain it to an imaginary audience. So that I can get the narrative of how it works, and then arrange the code around that.*\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [] 303 | } 304 | ], 305 | "metadata": { 306 | "kernelspec": { 307 | "display_name": "Python 3", 308 | "language": "python", 309 | "name": "python3" 310 | }, 311 | "language_info": { 312 | "name": "python", 313 | "version": "3.7.7" 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 4 318 | } 319 | -------------------------------------------------------------------------------- /notebooks/_visualize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#default_exp visualizer" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#hide\n", 19 | "from nbdev.showdoc import *\n", 20 | "%matplotlib inline" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# Visualize" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "---\n", 35 | "## Dimensionality Reduction\n", 36 | "\n", 37 | "One use-case for `memery` is to explore large image datasets, for cleaning and curation purposes. Sifting images by hand takes a long time, and it's nearly impossible to keep all the images in your mind at once.\n", 38 | "\n", 39 | "Even with semantic search capabilities, it's hard to get an overview of all the images. CLIP sees things in many more dimensions than humans do, so no matter how many searches you run you can't be sure if you're missing some outliers you don't even know to search for.\n", 40 | "\n", 41 | "The ideal overview would be a map of all the images along all the dimensions, but we don't know how to visualize or parse 512-dimensional spaces for human brains. So we have to do dimensionality reduction: find an embedding in a space with ≤ 3 dimensions that best emulates the 512-dim embeddings we have, and map that instead.\n", 42 | "\n", 43 | "A recent advance in dimensionality reduction is Minimum Distortion Embedding, an abstraction over many existing methods like PCA, t-SNE, or k-means clustering. We can use the `pymde` library to embed them and `matplotlib` to draw the images as their own markers on the graph. 
We'll also need `torch` to process the tensors, and `memery` functions to process the database" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import pymde\n", 53 | "import torch\n", 54 | "from pathlib import Path\n", 55 | "from memery.loader import db_loader\n", 56 | "\n", 57 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Let's get a database of embeddings from the local folder" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "db = db_loader('images/memery.pt', device)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "db[0].keys()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "embeds = torch.stack([v['embed'] for v in db.values()], 0)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "There are two methods to invoke with `pymde`: `preserve_neighbors` and `preserve_distances`. They create different textures in the final product. Let's see what each looks like on our sample dataset." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "mde_n = pymde.preserve_neighbors(embeds, verbose=False, device='cuda')\n", 108 | "mde_d = pymde.preserve_distances(embeds, verbose=False, device='cuda')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "embed_n = mde_n.embed(verbose=False, snapshot_every=1)\n", 118 | "embed_d = mde_d.embed(verbose=False, snapshot_every=1)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "pymde.plot(embed_n)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "pymde.plot(embed_d)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "mde_n.play(savepath='./graphs/mde_n.gif')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "mde_d.play(savepath='./graphs/mde_d.gif')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "assert embed_n.shape" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "---\n", 171 | "Now I want to plot images as markers, instead of little dots. Haven't figured out yet how to merge this with `pymde.plot` functions, so I'm doing it right in matplotlib. \n", 172 | "\n", 173 | "If we just plot the images at their coordinates, they will overlap (especially on the `preserve_neighbors` plot) so eventually maybe I can normalize the x and y axes and plot things on a grid? 
At least approximately." 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "import matplotlib.pyplot as plt\n", 183 | "from matplotlib.offsetbox import OffsetImage, AnnotationBbox\n", 184 | "from tqdm import tqdm" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "def plot_images_from_tensors(coords, image_paths, dpi=600, savefile = 'default.jpg', zoom=0.03):\n", 194 | " fig, ax = plt.subplots()\n", 195 | " fig.dpi = dpi\n", 196 | " fig.set_size_inches(8,8)\n", 197 | " \n", 198 | " ax.xaxis.set_visible(False) \n", 199 | " ax.yaxis.set_visible(False)\n", 200 | " \n", 201 | " cc = coords.cpu()\n", 202 | " x_max, y_max = cc.argmax(0)\n", 203 | " x_min, y_min = cc.argmin(0)\n", 204 | " \n", 205 | " low = min(cc[x_min][0], cc[y_min][1])\n", 206 | " high = max(cc[x_max][0], cc[y_max][1])\n", 207 | " sq_lim = max(abs(low), abs(high))\n", 208 | " \n", 209 | " plt.xlim(low, high)\n", 210 | " plt.ylim(low, high)\n", 211 | " \n", 212 | "# plt.xlim(-sq_lim, sq_lim)\n", 213 | "# plt.ylim(-sq_lim, sq_lim)\n", 214 | "\n", 215 | " for i, coord in tqdm(enumerate(coords)):\n", 216 | " try:\n", 217 | " x, y = coord\n", 218 | "\n", 219 | " path = str(image_paths[i])\n", 220 | " with open(path, 'rb') as image_file:\n", 221 | " image = plt.imread(image_file)\n", 222 | "\n", 223 | " im = OffsetImage(image, zoom=zoom, resample=False)\n", 224 | " im.image.axes = ax\n", 225 | " ab = AnnotationBbox(im, (x,y), frameon=False, pad=0.0,)\n", 226 | " ax.add_artist(ab)\n", 227 | " except SyntaxError:\n", 228 | " pass  # skip images the reader can't parse\n", 229 | " print(\"Drawing images as markers...\")\n", 230 | " plt.savefig(savefile)\n", 231 | " print(f'Saved image to {savefile}')\n" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "filenames = [v['fpath'] for v in db.values()]" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "savefile = 'graphs/embed_n.jpg'\n", 250 | "\n", 251 | "plot_images_from_tensors(embed_n, filenames, savefile=savefile)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "savefile = 'graphs/embed_d.jpg'\n", 261 | "\n", 262 | "plot_images_from_tensors(embed_d, filenames, savefile=savefile)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "I suppose it makes sense that the `preserve_neighbors` function clumps things together and the `preserve_distances` spreads them out. It's nice to see the actual distances and texture of the data, for sure. But I'd also like to be able to see them bigger, with only relative data about where they are to each other. Let's see if we can implement a normalization function and plot them again.\n", 270 | "\n", 271 | "Currently the embedding tensor is basically a list of pairs of floats. Can I convert each coordinate to its integer rank, from 0 up to the number of images? I don't know how to do this in matrix math, so I'll try it more simply first."
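Anticipating the "do it all in torch" idea mentioned just below: the same rank transform can be written as a double `argsort` over each axis. A small sketch, assuming `embed_n` is the (N, 2) embedding tensor from above; behavior should match the Python version up to tie-breaking.

```python
# Rank-normalize 2-D coordinates entirely in torch -- a sketch, not memery code.
import torch

def normalize_embeds_torch(coords: torch.Tensor) -> torch.Tensor:
    # argsort of argsort turns each column of values into integer ranks 0..N-1
    return coords.argsort(dim=0).argsort(dim=0)

# norms = normalize_embeds_torch(embed_n.cpu())
```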
272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "len(embed_n)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "embed_list = [(float(x),float(y)) for x,y in embed_n]\n", 290 | "embed_dict = {k: v for k, v in zip(filenames, embed_list)}\n", 291 | "len(embed_dict)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "def normalize_embeds(embed_dict):\n", 301 | " sort_x = {k: v[0] for k, v in sorted(embed_dict.items(), key=lambda item: item[1][0])}\n", 302 | " norm_x = {item[0]: i for i, item in enumerate(sort_x.items())}\n", 303 | " \n", 304 | " sort_y = {k: v[1] for k, v in sorted(embed_dict.items(), key=lambda item: item[1][1])}\n", 305 | " norm_y = {item[0]: i for i, item in enumerate(sort_y.items())}\n", 306 | "\n", 307 | " normalized_dict = {k: (norm_x[k], norm_y[k]) for k in embed_dict.keys()}\n", 308 | " return(normalized_dict)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "norm_dict = normalize_embeds(embed_dict)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "len(norm_dict)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "I probably could do that all in torch but right now I'm just going to pipe it back into tensors and put it through my plotting function:" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "norms = torch.stack([torch.tensor([x, y]) for x, y in norm_dict.values()])\n" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "plot_images_from_tensors(norms, filenames, savefile='graphs/normalized.jpg')" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "It worked!! The clusters still exist but their distances are relaxed so they can be displayed better on the graph. It's removing some information, for sure, but it's unclear whether that is information a human needs.\n", 359 | "\n", 360 | "I wonder if it works on the `preserve_distances` method..." 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "embed_list = [(float(x),float(y)) for x,y in embed_d]\n", 377 | "embed_dict = {k: v for k, v in zip(filenames, embed_list)}\n", 378 | "norm_dict = normalize_embeds(embed_dict)\n", 379 | "norms = torch.stack([torch.tensor([x, y]) for x, y in norm_dict.values()])" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "plot_images_from_tensors(norms, filenames, savefile='graphs/normalized-d.jpg')" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "This looks okay. 
It reduces overall distances but keeps relative distances? Still not sure what the actionable difference between these two methods is. \n", 396 | "\n", 397 | "Well, it works okay for now. The next question is, how to incorporate it into a working GUI?\n", 398 | "\n", 399 | "I wonder how matplotlib does natively, for a much larger dataset. Let's see:\n", 400 | "\n", 401 | "# Large dataset" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "def normalize_tensors(embdgs, names):\n", 411 | " embed_list = [(float(x),float(y)) for x,y in embdgs]\n", 412 | " embed_dict = {k: v for k, v in zip(names, embed_list)}\n", 413 | " norm_dict = normalize_embeds(embed_dict)\n", 414 | " norms = torch.stack([torch.tensor([x, y]) for x, y in norm_dict.values()])\n", 415 | " return(norms)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "db = db_loader('/home/mage/Pictures/memes/memery.pt', device)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "filenames = [v['fpath'] for v in db.values()]" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "clips = torch.stack([v['embed'] for v in db.values()])" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "filenames[:5]" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "mde_lg = pymde.preserve_neighbors(clips, verbose=False, device='cuda')" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "embed_lg = mde_lg.embed(verbose=False, snapshot_every=1)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "norms_lg = normalize_tensors(embed_lg,filenames)\n", 479 | "len(norms_lg)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "plot_images_from_tensors(norms_lg, filenames, savefile='graphs/normalized-lg.jpg')" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "---\n", 496 | "\n", 497 | "### Be careful here\n", 498 | "\n", 499 | "It is possible to use embeddings as target coordinates to delete sections of the data:" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "to_delete = []\n", 509 | "for coord, img in zip(embed_lg, filenames):  # embed_lg assumed here; substitute whichever embedding you're cleaning\n", 510 | " x, y = coord\n", 511 | " if x < -2 or y < -1:\n", 512 | " to_delete.append(img)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "len(to_delete)" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "for img in to_delete:\n", 531 | " imgpath = Path(img)\n", 532 | " imgpath.unlink()" 533 | ] 534 | },
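Since `unlink` is irreversible, a more cautious variant (my suggestion, not part of the original notebook) is to move the flagged files into a quarantine folder and review them before deleting anything:

```python
# Safer alternative to unlinking -- move flagged images aside for manual review.
# The quarantine folder name is an arbitrary choice.
import shutil
from pathlib import Path

quarantine = Path('./quarantine')
quarantine.mkdir(exist_ok=True)

for img in to_delete:
    imgpath = Path(img)
    if imgpath.exists():
        shutil.move(str(imgpath), str(quarantine / imgpath.name))
```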
535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "It worked! A better distribution and fewer of the wrong things" 540 | ] 541 | } 542 | ], 543 | "metadata": { 544 | "kernelspec": { 545 | "display_name": "Python 3", 546 | "language": "python", 547 | "name": "python3" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 4 552 | } 553 | -------------------------------------------------------------------------------- /notebooks/_working_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Modular flow system\n", 8 | "\n", 9 | "I have decided to adapt the design system from Jina into this repo, at least for prototyping purposes. Their distributed systems approach seems quite good but is too much complexity for me to add right away. Instead I'm going to replicate the essential design pattern, that of Flows and Executors.\n", 10 | "\n", 11 | "**Flows** are specific patterns of data manipulation and storage. **Executors** are the operators that transform the data within the flow. \n", 12 | "\n", 13 | "There are two core flows to any search system: indexing, and querying. The plan here is to make executors that can be composed into flows and then compose the flows into a UI that supports querying and, to some extent, indexing as well.\n", 14 | "\n", 15 | "The core executors for this use case are:\n", 16 | " - Loader\n", 17 | " - Crafter\n", 18 | " - Encoder\n", 19 | " - Indexer\n", 20 | " - Ranker\n", 21 | " - Gateway\n", 22 | " \n", 23 | "In this file I try to build these so that the Jupyter notebook itself can be run as a Flow for indexing and then querying. From there it should be easy to abstract the functions and classes and messaging or whatever is necessary for microservices etc." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "execution": { 31 | "iopub.execute_input": "2021-05-17T23:26:12.215480Z", 32 | "iopub.status.busy": "2021-05-17T23:26:12.215153Z", 33 | "iopub.status.idle": "2021-05-17T23:26:12.218180Z", 34 | "shell.execute_reply": "2021-05-17T23:26:12.217661Z", 35 | "shell.execute_reply.started": "2021-05-17T23:26:12.215435Z" 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# move these to main function eventually but for now we're going in notebook order\n", 41 | "args = {\n", 42 | " \"path\": \"/home/mage/Pictures/memes/\",\n", 43 | " \"query\": \"scary cat\",\n", 44 | "}" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Loader\n", 52 | "\n", 53 | "The loader takes a directory or list of image files and checks them against a database or checkpoint. If there is a saved checkpoint and the files haven't changed, it loads the checkpoint and sends the data directly to Ranker. If not, it sends them to Crafter. Ideally it could send new images to Crafter and load the dictionary of old images at the same time, without re-encoding old images.\n", 54 | "\n", 55 | "The process of indexing could actually happen in the background while querying happens on the old index! This means putting the logic in the Flow rather than the Loader, I suppose.\n", 56 | "\n", 57 | "Maybe save a dictionary `{filename_timestamp : vector}` to the database as a simple version control mechanism. Then, if any filenames exist but with a different timestamp, we load those under their own key. 
And we can throw out any filename_timestamp that doesn't exist, before indexing. " 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "execution": { 65 | "iopub.execute_input": "2021-05-17T23:26:12.219219Z", 66 | "iopub.status.busy": "2021-05-17T23:26:12.218988Z", 67 | "iopub.status.idle": "2021-05-17T23:26:12.221615Z", 68 | "shell.execute_reply": "2021-05-17T23:26:12.221131Z", 69 | "shell.execute_reply.started": "2021-05-17T23:26:12.219201Z" 70 | } 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "from pathlib import Path\n", 75 | "\n", 76 | "root = Path(args['path'])" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "execution": { 84 | "iopub.execute_input": "2021-05-17T23:26:12.222765Z", 85 | "iopub.status.busy": "2021-05-17T23:26:12.222612Z", 86 | "iopub.status.idle": "2021-05-17T23:26:12.225218Z", 87 | "shell.execute_reply": "2021-05-17T23:26:12.224478Z", 88 | "shell.execute_reply.started": "2021-05-17T23:26:12.222747Z" 89 | } 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "def slugify(filepath):\n", 94 | " return f'{filepath.stem}_{str(filepath.stat().st_mtime).split(\".\")[0]}'" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "execution": { 102 | "iopub.execute_input": "2021-05-17T23:26:12.226412Z", 103 | "iopub.status.busy": "2021-05-17T23:26:12.226182Z", 104 | "iopub.status.idle": "2021-05-17T23:26:12.229182Z", 105 | "shell.execute_reply": "2021-05-17T23:26:12.228619Z", 106 | "shell.execute_reply.started": "2021-05-17T23:26:12.226386Z" 107 | } 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "# filenames = path.iterdir()\n", 112 | "def get_image_files(path):\n", 113 | " return [(f, slugify(f)) for f in path.rglob('*') if f.suffix in ['.jpg', '.png', '.jpeg']]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "execution": { 121 | "iopub.execute_input": "2021-05-17T23:26:12.230161Z", 122 | "iopub.status.busy": "2021-05-17T23:26:12.229994Z", 123 | "iopub.status.idle": "2021-05-17T23:26:12.271774Z", 124 | "shell.execute_reply": "2021-05-17T23:26:12.271310Z", 125 | "shell.execute_reply.started": "2021-05-17T23:26:12.230139Z" 126 | } 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "filepaths = get_image_files(root)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "execution": { 138 | "iopub.execute_input": "2021-05-17T23:26:12.272506Z", 139 | "iopub.status.busy": "2021-05-17T23:26:12.272342Z", 140 | "iopub.status.idle": "2021-05-17T23:26:12.279507Z", 141 | "shell.execute_reply": "2021-05-17T23:26:12.279149Z", 142 | "shell.execute_reply.started": "2021-05-17T23:26:12.272456Z" 143 | } 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "len(filepaths)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "execution": { 155 | "iopub.execute_input": "2021-05-17T23:26:12.280416Z", 156 | "iopub.status.busy": "2021-05-17T23:26:12.280299Z", 157 | "iopub.status.idle": "2021-05-17T23:26:12.283291Z", 158 | "shell.execute_reply": "2021-05-17T23:26:12.282872Z", 159 | "shell.execute_reply.started": "2021-05-17T23:26:12.280400Z" 160 | } 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "filepaths[:5]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "So we have a list of paths and slugified 
filenames from the folder. We want to see if there's an archive, so that we don't have to recalculate tensors for images we've seen before. Then we want to pass that directly to the indexer, but send the new images through the crafter and encoder first.\n", 172 | "\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "execution": { 179 | "iopub.execute_input": "2021-05-12T23:37:14.029490Z", 180 | "iopub.status.busy": "2021-05-12T23:37:14.028825Z", 181 | "iopub.status.idle": "2021-05-12T23:37:14.080913Z", 182 | "shell.execute_reply": "2021-05-12T23:37:14.080380Z", 183 | "shell.execute_reply.started": "2021-05-12T23:37:14.029406Z" 184 | } 185 | }, 186 | "source": [ 187 | "But I need to separate out the logic for the crafter and encoder from the simple loading of archives and pictures. This component should only provide the dictionary of archived CLIP embeddings, the treemap (eventually) and the locations of the new images to review, and let the downstream components deal with them." 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "execution": { 195 | "iopub.execute_input": "2021-05-17T23:26:12.283975Z", 196 | "iopub.status.busy": "2021-05-17T23:26:12.283864Z", 197 | "iopub.status.idle": "2021-05-17T23:26:12.768725Z", 198 | "shell.execute_reply": "2021-05-17T23:26:12.768101Z", 199 | "shell.execute_reply.started": "2021-05-17T23:26:12.283960Z" 200 | } 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "import torch\n", 205 | "import torchvision\n", 206 | "\n", 207 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 208 | "device" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "execution": { 216 | "iopub.execute_input": "2021-05-17T23:26:12.770648Z", 217 | "iopub.status.busy": "2021-05-17T23:26:12.770436Z", 218 | "iopub.status.idle": "2021-05-17T23:26:12.775477Z", 219 | "shell.execute_reply": "2021-05-17T23:26:12.774783Z", 220 | "shell.execute_reply.started": "2021-05-17T23:26:12.770627Z" 221 | } 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "def files_archive_loader(filepaths, root, device):\n", 226 | " dbpath = root/'memery.pt'\n", 227 | "# dbpath_backup = root/'memery.pt'\n", 228 | " db = db_loader(dbpath)\n", 229 | " \n", 230 | " current_slugs = [slug for path, slug in filepaths] \n", 231 | " archive_db = {k:db[k] for k in db if k in current_slugs} \n", 232 | " archive_slugs = [v['slug'] for v in archive_db.values()]\n", 233 | " new_files = [(str(path), slug) for path, slug in filepaths if slug not in archive_slugs]\n", 234 | " \n", 235 | " return(archive_db, new_files)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "execution": { 243 | "iopub.execute_input": "2021-05-17T23:26:12.778681Z", 244 | "iopub.status.busy": "2021-05-17T23:26:12.778540Z", 245 | "iopub.status.idle": "2021-05-17T23:26:12.781397Z", 246 | "shell.execute_reply": "2021-05-17T23:26:12.780882Z", 247 | "shell.execute_reply.started": "2021-05-17T23:26:12.778662Z" 248 | } 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "def db_loader(dbpath):\n", 253 | " # check for savefile or backup and extract\n", 254 | " if dbpath.exists():\n", 255 | " db = torch.load(dbpath)\n", 256 | "# elif dbpath_backup.exists():\n", 257 | "# db = torch.load(dbpath_backup)\n", 258 | " else:\n", 259 | " db = {}\n", 260 | " return(db)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | 
"execution_count": null, 266 | "metadata": { 267 | "execution": { 268 | "iopub.execute_input": "2021-05-17T23:26:12.782479Z", 269 | "iopub.status.busy": "2021-05-17T23:26:12.782283Z", 270 | "iopub.status.idle": "2021-05-17T23:26:12.785981Z", 271 | "shell.execute_reply": "2021-05-17T23:26:12.785150Z", 272 | "shell.execute_reply.started": "2021-05-17T23:26:12.782459Z" 273 | } 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "def treemap_loader(treepath):\n", 278 | " treemap = AnnoyIndex(512, 'angular')\n", 279 | "\n", 280 | " if treepath.exists():\n", 281 | " treemap.load(str(treepath))\n", 282 | " else:\n", 283 | " treemap = None\n", 284 | " return(treemap)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "execution": { 292 | "iopub.execute_input": "2021-05-17T23:26:12.787044Z", 293 | "iopub.status.busy": "2021-05-17T23:26:12.786880Z", 294 | "iopub.status.idle": "2021-05-17T23:26:14.981021Z", 295 | "shell.execute_reply": "2021-05-17T23:26:14.980482Z", 296 | "shell.execute_reply.started": "2021-05-17T23:26:12.787026Z" 297 | } 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "archive_db, new_files = files_archive_loader(get_image_files(Path(args['path'])), root, device)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "execution": { 309 | "iopub.execute_input": "2021-05-17T23:26:14.982030Z", 310 | "iopub.status.busy": "2021-05-17T23:26:14.981859Z", 311 | "iopub.status.idle": "2021-05-17T23:26:14.986891Z", 312 | "shell.execute_reply": "2021-05-17T23:26:14.986449Z", 313 | "shell.execute_reply.started": "2021-05-17T23:26:14.982010Z" 314 | } 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "len(archive_db)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "execution": { 326 | "iopub.execute_input": "2021-05-17T23:26:14.987764Z", 327 | "iopub.status.busy": "2021-05-17T23:26:14.987610Z", 328 | "iopub.status.idle": "2021-05-17T23:26:14.991252Z", 329 | "shell.execute_reply": "2021-05-17T23:26:14.990625Z", 330 | "shell.execute_reply.started": "2021-05-17T23:26:14.987747Z" 331 | } 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "len(new_files)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "execution": { 343 | "iopub.execute_input": "2021-05-17T23:26:14.992366Z", 344 | "iopub.status.busy": "2021-05-17T23:26:14.992157Z", 345 | "iopub.status.idle": "2021-05-17T23:26:14.996332Z", 346 | "shell.execute_reply": "2021-05-17T23:26:14.995383Z", 347 | "shell.execute_reply.started": "2021-05-17T23:26:14.992343Z" 348 | } 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "\n", 353 | "len(new_files),len(archive_db)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## Crafter\n", 361 | "\n", 362 | "Takes a list of image filenames and transforms them to batches of the correct dimensions for CLIP. Need to figure out a way around torchvision's loader idiosyncrasies here: currently it just loads images from subfolders, needs to operate okay if pointed at a single folder of images, or recursively, or an arbitrary list of files.\n", 363 | "\n", 364 | "Then, too, it would be nice to eventually putthis work on the client computer using torchscript or something. So that it only sends 224x224x3 images over the wire. 
367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": { 371 | "execution": { 372 | "iopub.execute_input": "2021-05-17T23:26:14.997972Z", 373 | "iopub.status.busy": "2021-05-17T23:26:14.997652Z", 374 | "iopub.status.idle": "2021-05-17T23:26:15.000660Z", 375 | "shell.execute_reply": "2021-05-17T23:26:15.000138Z", 376 | "shell.execute_reply.started": "2021-05-17T23:26:14.997945Z" 377 | } 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "from torchvision.datasets import VisionDataset\n", 382 | "from PIL import Image" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "execution": { 390 | "iopub.execute_input": "2021-05-17T23:26:15.005865Z", 391 | "iopub.status.busy": "2021-05-17T23:26:15.005459Z", 392 | "iopub.status.idle": "2021-05-17T23:26:15.010256Z", 393 | "shell.execute_reply": "2021-05-17T23:26:15.009011Z", 394 | "shell.execute_reply.started": "2021-05-17T23:26:15.005837Z" 395 | } 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "def make_dataset(new_files):\n", 400 | " '''Returns a list of samples of the form (path_to_sample, index), plus a\n", 401 | " parallel list of (slug, index) pairs'''\n", 402 | " samples = []\n", 403 | " slugs = []\n", 404 | " for i, f in enumerate(new_files):\n", 405 | " path, slug = f\n", 406 | " samples.append((str(path), i))\n", 407 | " slugs.append((slug, i))\n", 408 | " return(samples, slugs)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "execution": { 416 | "iopub.execute_input": "2021-05-17T23:26:15.011746Z", 417 | "iopub.status.busy": "2021-05-17T23:26:15.011203Z", 418 | "iopub.status.idle": "2021-05-17T23:26:15.015031Z", 419 | "shell.execute_reply": "2021-05-17T23:26:15.014629Z", 420 | "shell.execute_reply.started": "2021-05-17T23:26:15.011681Z" 421 | } 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "def pil_loader(path: str) -> Image.Image:\n", 426 | " # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)\n", 427 | " with open(path, 'rb') as f:\n", 428 | " img = Image.open(f)\n", 429 | " return img.convert('RGB')" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": { 436 | "execution": { 437 | "iopub.execute_input": "2021-05-17T23:26:15.015970Z", 438 | "iopub.status.busy": "2021-05-17T23:26:15.015807Z", 439 | "iopub.status.idle": "2021-05-17T23:26:15.020168Z", 440 | "shell.execute_reply": "2021-05-17T23:26:15.019612Z", 441 | "shell.execute_reply.started": "2021-05-17T23:26:15.015951Z" 442 | } 443 | }, 444 | "outputs": [], 445 | "source": [ 446 | "class DatasetImagePaths(VisionDataset):\n", 447 | " def __init__(self, new_files, transforms = None):\n", 448 | " super(DatasetImagePaths, self).__init__(new_files, transforms=transforms)\n", 449 | " samples, slugs = make_dataset(new_files)\n", 450 | " self.samples = samples\n", 451 | " self.slugs = slugs\n", 452 | " self.loader = pil_loader\n", 453 | " self.root = 'file dataset'\n", 454 | " def __len__(self):\n", 455 | " return(len(self.samples))\n", 456 | " \n", 457 | " def __getitem__(self, index):\n", 458 | " path, target = self.samples[index]\n", 459 | " sample = self.loader(path)\n", 460 | " if self.transforms is not None:\n", 461 | " sample = self.transforms(sample)\n", 462 | " return sample, 
target" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "execution": { 470 | "iopub.execute_input": "2021-05-17T23:26:15.021137Z", 471 | "iopub.status.busy": "2021-05-17T23:26:15.020930Z", 472 | "iopub.status.idle": "2021-05-17T23:26:15.024133Z", 473 | "shell.execute_reply": "2021-05-17T23:26:15.023597Z", 474 | "shell.execute_reply.started": "2021-05-17T23:26:15.021117Z" 475 | } 476 | }, 477 | "outputs": [], 478 | "source": [ 479 | "crafted = DatasetImagePaths(new_files)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": { 486 | "execution": { 487 | "iopub.execute_input": "2021-05-17T23:27:10.327359Z", 488 | "iopub.status.busy": "2021-05-17T23:27:10.327061Z", 489 | "iopub.status.idle": "2021-05-17T23:27:10.331376Z", 490 | "shell.execute_reply": "2021-05-17T23:27:10.330348Z", 491 | "shell.execute_reply.started": "2021-05-17T23:27:10.327324Z" 492 | } 493 | }, 494 | "outputs": [], 495 | "source": [ 496 | "if len(crafted) > 0:\n", 497 | " crafted[0][0].show()" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "Okay, that seems to work decently. Test with transforms, which I will just find in CLIP source code and copy over, to prevent having to import CLIP in this executor." 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": { 511 | "execution": { 512 | "iopub.execute_input": "2021-05-17T23:27:10.532077Z", 513 | "iopub.status.busy": "2021-05-17T23:27:10.531910Z", 514 | "iopub.status.idle": "2021-05-17T23:27:10.535139Z", 515 | "shell.execute_reply": "2021-05-17T23:27:10.534199Z", 516 | "shell.execute_reply.started": "2021-05-17T23:27:10.532056Z" 517 | } 518 | }, 519 | "outputs": [], 520 | "source": [ 521 | "from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": { 528 | "execution": { 529 | "iopub.execute_input": "2021-05-17T23:27:10.672197Z", 530 | "iopub.status.busy": "2021-05-17T23:27:10.672025Z", 531 | "iopub.status.idle": "2021-05-17T23:27:10.675311Z", 532 | "shell.execute_reply": "2021-05-17T23:27:10.674703Z", 533 | "shell.execute_reply.started": "2021-05-17T23:27:10.672178Z" 534 | } 535 | }, 536 | "outputs": [], 537 | "source": [ 538 | "def clip_transform(n_px):\n", 539 | " return Compose([\n", 540 | " Resize(n_px, interpolation=Image.BICUBIC),\n", 541 | " CenterCrop(n_px),\n", 542 | " ToTensor(),\n", 543 | " Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),\n", 544 | " ])" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "execution": { 552 | "iopub.execute_input": "2021-05-17T23:27:10.783218Z", 553 | "iopub.status.busy": "2021-05-17T23:27:10.783066Z", 554 | "iopub.status.idle": "2021-05-17T23:27:10.785719Z", 555 | "shell.execute_reply": "2021-05-17T23:27:10.785213Z", 556 | "shell.execute_reply.started": "2021-05-17T23:27:10.783202Z" 557 | } 558 | }, 559 | "outputs": [], 560 | "source": [ 561 | "crafted_transformed = DatasetImagePaths(new_files, clip_transform(224))" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": { 568 | "execution": { 569 | "iopub.execute_input": "2021-05-17T23:27:10.914257Z", 570 | "iopub.status.busy": "2021-05-17T23:27:10.914086Z", 571 | "iopub.status.idle": 
"2021-05-17T23:27:10.916361Z", 572 | "shell.execute_reply": "2021-05-17T23:27:10.915817Z", 573 | "shell.execute_reply.started": "2021-05-17T23:27:10.914238Z" 574 | } 575 | }, 576 | "outputs": [], 577 | "source": [ 578 | "# crafted_transformed[0][0].shape" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": { 585 | "execution": { 586 | "iopub.execute_input": "2021-05-17T23:27:11.049878Z", 587 | "iopub.status.busy": "2021-05-17T23:27:11.049661Z", 588 | "iopub.status.idle": "2021-05-17T23:27:11.052757Z", 589 | "shell.execute_reply": "2021-05-17T23:27:11.052083Z", 590 | "shell.execute_reply.started": "2021-05-17T23:27:11.049856Z" 591 | } 592 | }, 593 | "outputs": [], 594 | "source": [ 595 | "# to_pil = torchvision.transforms.ToPILImage()\n", 596 | "# img = to_pil(crafted_transformed[0][0])\n", 597 | "# img.show()" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "Put that all together, and wrap in a DataLoader for batching. In future, need to figure out how to pick batch size and number of workers programmatically bsed on device capabilities." 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "execution": { 612 | "iopub.execute_input": "2021-05-17T23:27:11.510935Z", 613 | "iopub.status.busy": "2021-05-17T23:27:11.510756Z", 614 | "iopub.status.idle": "2021-05-17T23:27:11.514316Z", 615 | "shell.execute_reply": "2021-05-17T23:27:11.513561Z", 616 | "shell.execute_reply.started": "2021-05-17T23:27:11.510917Z" 617 | } 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "def crafter(new_files, device, batch_size=128, num_workers=4): \n", 622 | " with torch.no_grad():\n", 623 | " imagefiles=DatasetImagePaths(new_files, clip_transform(224))\n", 624 | " img_loader=torch.utils.data.DataLoader(imagefiles, batch_size=batch_size, shuffle=False, num_workers=num_workers)\n", 625 | " return(img_loader)" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": { 632 | "execution": { 633 | "iopub.execute_input": "2021-05-17T23:27:11.876682Z", 634 | "iopub.status.busy": "2021-05-17T23:27:11.876512Z", 635 | "iopub.status.idle": "2021-05-17T23:27:11.880238Z", 636 | "shell.execute_reply": "2021-05-17T23:27:11.879080Z", 637 | "shell.execute_reply.started": "2021-05-17T23:27:11.876665Z" 638 | } 639 | }, 640 | "outputs": [], 641 | "source": [ 642 | "img_loader = crafter(new_files, device)" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": { 649 | "execution": { 650 | "iopub.execute_input": "2021-05-17T23:27:12.182811Z", 651 | "iopub.status.busy": "2021-05-17T23:27:12.182645Z", 652 | "iopub.status.idle": "2021-05-17T23:27:12.186305Z", 653 | "shell.execute_reply": "2021-05-17T23:27:12.185653Z", 654 | "shell.execute_reply.started": "2021-05-17T23:27:12.182794Z" 655 | } 656 | }, 657 | "outputs": [], 658 | "source": [ 659 | "img_loader" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "## Encoder\n", 667 | "\n", 668 | "CLIP wrapper takes batched tensors or text queries and returns batched 512-dim vectors. size of batch depends on GPU, but if we're putting all that on a server anyway it's a matter of accounting. Does batching go here though? Or in the crafter?\n", 669 | "\n", 670 | "cool thing here is we can use one encoder for both image and text, just check type on the way in. 
But to start, we'll keep it simple and make two functions.\n", 671 | "\n", 672 | "We could also index previous queries as vectors in a separate map and use them for prediction/history: keep a little database of previous queries, already in vector format, along with their ranked nearest neighbors, so that the user can see history offline. (A sketch of that idea follows the encoder code below.)" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": { 679 | "execution": { 680 | "iopub.execute_input": "2021-05-17T23:27:13.353998Z", 681 | "iopub.status.busy": "2021-05-17T23:27:13.353324Z", 682 | "iopub.status.idle": "2021-05-17T23:27:14.660546Z", 683 | "shell.execute_reply": "2021-05-17T23:27:14.659900Z", 684 | "shell.execute_reply.started": "2021-05-17T23:27:13.353916Z" 685 | } 686 | }, 687 | "outputs": [], 688 | "source": [ 689 | "import clip\n", 690 | "from tqdm import tqdm\n", 691 | "model, _ = clip.load(\"ViT-B/32\", device)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "metadata": { 698 | "execution": { 699 | "iopub.execute_input": "2021-05-17T23:27:14.661653Z", 700 | "iopub.status.busy": "2021-05-17T23:27:14.661465Z", 701 | "iopub.status.idle": "2021-05-17T23:27:14.665037Z", 702 | "shell.execute_reply": "2021-05-17T23:27:14.664655Z", 703 | "shell.execute_reply.started": "2021-05-17T23:27:14.661618Z" 704 | } 705 | }, 706 | "outputs": [], 707 | "source": [ 708 | "def image_encoder(img_loader, device):\n", 709 | " image_embeddings = torch.tensor(()).to(device)\n", 710 | " with torch.no_grad():\n", 711 | " for images, labels in tqdm(img_loader):\n", 712 | " batch_features = model.encode_image(images.to(device))\n", 713 | " image_embeddings = torch.cat((image_embeddings, batch_features)).to(device)\n", 714 | " \n", 715 | " image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)\n", 716 | " return(image_embeddings)" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": { 723 | "execution": { 724 | "iopub.execute_input": "2021-05-17T23:27:15.166413Z", 725 | "iopub.status.busy": "2021-05-17T23:27:15.166108Z", 726 | "iopub.status.idle": "2021-05-17T23:27:15.300321Z", 727 | "shell.execute_reply": "2021-05-17T23:27:15.299842Z", 728 | "shell.execute_reply.started": "2021-05-17T23:27:15.166374Z" 729 | } 730 | }, 731 | "outputs": [], 732 | "source": [ 733 | "new_embeddings = image_encoder(img_loader, device)" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "metadata": { 740 | "execution": { 741 | "iopub.execute_input": "2021-05-17T23:27:16.366074Z", 742 | "iopub.status.busy": "2021-05-17T23:27:16.365827Z", 743 | "iopub.status.idle": "2021-05-17T23:27:16.370513Z", 744 | "shell.execute_reply": "2021-05-17T23:27:16.369708Z", 745 | "shell.execute_reply.started": "2021-05-17T23:27:16.366042Z" 746 | } 747 | }, 748 | "outputs": [], 749 | "source": [ 750 | "def text_encoder(text, device):\n", 751 | " with torch.no_grad():\n", 752 | " text = clip.tokenize(text).to(device)\n", 753 | " text_features = model.encode_text(text)\n", 754 | " text_features = text_features / text_features.norm(dim=-1, keepdim=True)\n", 755 | " return(text_features)" 756 | ] 757 | },
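And the query-history idea from the notes above could start as just a dict saved next to `memery.pt`; a sketch with hypothetical names, nothing below depends on it:

```python
def remember_query(history, query, query_vec, ranked_files, histpath):
    # Cache the query vector and its ranked results so past searches
    # can be replayed offline, then persist the cache like the embedding db.
    history[query] = {'embed': query_vec.cpu(), 'ranked': ranked_files}
    torch.save(history, histpath)
    return history

def recall_query(history, query):
    # Return the ranked filenames for a previously seen query, if any.
    entry = history.get(query)
    return entry['ranked'] if entry else None
```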
758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "## Indexer\n", 763 | "\n", 764 | "Annoy treemap or FAISS or other solutions. Given a dataset of tensors, returns a dictionary or database or treemap structure, something that is searchable for later. It would be nice to be able to diff this somehow, or make sure that it's up-to-date. Maybe keeping two copies is okay? One for backup and quick-searching, one for main search once it's indexed any new images. \n", 765 | "\n", 766 | "This executor `needs` both Encoder and Loader to send it the new and old vectors, respectively. So it needs to be preceded by some kind of **join_all** component that can make sure we're not missing new data before handing it over to the indexer. Hm" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": { 773 | "execution": { 774 | "iopub.execute_input": "2021-05-17T23:27:22.830650Z", 775 | "iopub.status.busy": "2021-05-17T23:27:22.829915Z", 776 | "iopub.status.idle": "2021-05-17T23:27:22.835413Z", 777 | "shell.execute_reply": "2021-05-17T23:27:22.834944Z", 778 | "shell.execute_reply.started": "2021-05-17T23:27:22.830565Z" 779 | } 780 | }, 781 | "outputs": [], 782 | "source": [ 783 | "root = Path(args['path'])" 784 | ] 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": null, 789 | "metadata": { 790 | "execution": { 791 | "iopub.execute_input": "2021-05-17T23:27:23.548128Z", 792 | "iopub.status.busy": "2021-05-17T23:27:23.547975Z", 793 | "iopub.status.idle": "2021-05-17T23:27:23.551213Z", 794 | "shell.execute_reply": "2021-05-17T23:27:23.550679Z", 795 | "shell.execute_reply.started": "2021-05-17T23:27:23.548112Z" 796 | } 797 | }, 798 | "outputs": [], 799 | "source": [ 800 | "def join_all(db, new_files, new_embeddings):\n", 801 | " start = len(db)\n", 802 | " for i, file in enumerate(new_files):\n", 803 | " path, slug = file\n", 804 | " index = start + i\n", 805 | " db[slug] = {\n", 806 | " 'slug': slug,\n", 807 | " 'fpath': path,\n", 808 | " 'embed': new_embeddings[i],\n", 809 | " 'index': index\n", 810 | " }\n", 811 | " return(db)" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": null, 817 | "metadata": { 818 | "execution": { 819 | "iopub.execute_input": "2021-05-17T23:27:26.689841Z", 820 | "iopub.status.busy": "2021-05-17T23:27:26.689681Z", 821 | "iopub.status.idle": "2021-05-17T23:27:26.692632Z", 822 | "shell.execute_reply": "2021-05-17T23:27:26.691974Z", 823 | "shell.execute_reply.started": "2021-05-17T23:27:26.689825Z" 824 | } 825 | }, 826 | "outputs": [], 827 | "source": [ 828 | "db = join_all(archive_db,\n", 829 | " new_files,\n", 830 | " new_embeddings\n", 831 | " )" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": null, 837 | "metadata": { 838 | "execution": { 839 | "iopub.execute_input": "2021-05-17T23:27:27.321954Z", 840 | "iopub.status.busy": "2021-05-17T23:27:27.321741Z", 841 | "iopub.status.idle": "2021-05-17T23:27:27.326550Z", 842 | "shell.execute_reply": "2021-05-17T23:27:27.325029Z", 843 | "shell.execute_reply.started": "2021-05-17T23:27:27.321935Z" 844 | } 845 | }, 846 | "outputs": [], 847 | "source": [ 848 | "len(db)" 849 | ] 850 | }, 851 | { 852 | "cell_type": "markdown", 853 | "metadata": {}, 854 | "source": [ 855 | "And build the treemap" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": null, 861 | "metadata": { 862 | "execution": { 863 | "iopub.execute_input": "2021-05-17T23:27:28.602453Z", 864 | "iopub.status.busy": "2021-05-17T23:27:28.601731Z", 865 | "iopub.status.idle": "2021-05-17T23:27:28.613957Z", 866 | "shell.execute_reply": "2021-05-17T23:27:28.611655Z", 867 | "shell.execute_reply.started": "2021-05-17T23:27:28.602368Z" 868 | } 869 | }, 870 | "outputs": [], 871 | "source": [ 872 | 
"from annoy import AnnoyIndex" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": null, 878 | "metadata": { 879 | "execution": { 880 | "iopub.execute_input": "2021-05-17T23:27:29.075028Z", 881 | "iopub.status.busy": "2021-05-17T23:27:29.074813Z", 882 | "iopub.status.idle": "2021-05-17T23:27:29.078199Z", 883 | "shell.execute_reply": "2021-05-17T23:27:29.077644Z", 884 | "shell.execute_reply.started": "2021-05-17T23:27:29.075010Z" 885 | } 886 | }, 887 | "outputs": [], 888 | "source": [ 889 | "def build_treemap(db):\n", 890 | " treemap = AnnoyIndex(512, 'angular')\n", 891 | " for v in db.values():\n", 892 | " treemap.add_item(v['index'], v['embed'])\n", 893 | "\n", 894 | " # Build the treemap, with 5 trees rn\n", 895 | " treemap.build(5)\n", 896 | "\n", 897 | " return(treemap)\n", 898 | " " 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": null, 904 | "metadata": { 905 | "execution": { 906 | "iopub.execute_input": "2021-05-17T23:27:29.615962Z", 907 | "iopub.status.busy": "2021-05-17T23:27:29.615800Z", 908 | "iopub.status.idle": "2021-05-17T23:27:47.259986Z", 909 | "shell.execute_reply": "2021-05-17T23:27:47.259488Z", 910 | "shell.execute_reply.started": "2021-05-17T23:27:29.615943Z" 911 | } 912 | }, 913 | "outputs": [], 914 | "source": [ 915 | "t = build_treemap(db)" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": null, 921 | "metadata": { 922 | "execution": { 923 | "iopub.execute_input": "2021-05-17T23:27:47.261342Z", 924 | "iopub.status.busy": "2021-05-17T23:27:47.261093Z", 925 | "iopub.status.idle": "2021-05-17T23:27:47.265327Z", 926 | "shell.execute_reply": "2021-05-17T23:27:47.264924Z", 927 | "shell.execute_reply.started": "2021-05-17T23:27:47.261322Z" 928 | } 929 | }, 930 | "outputs": [], 931 | "source": [ 932 | "t.get_n_items(), t.get_n_trees()" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": null, 938 | "metadata": { 939 | "execution": { 940 | "iopub.execute_input": "2021-05-17T23:27:47.266399Z", 941 | "iopub.status.busy": "2021-05-17T23:27:47.266168Z", 942 | "iopub.status.idle": "2021-05-17T23:27:47.269406Z", 943 | "shell.execute_reply": "2021-05-17T23:27:47.269053Z", 944 | "shell.execute_reply.started": "2021-05-17T23:27:47.266382Z" 945 | } 946 | }, 947 | "outputs": [], 948 | "source": [ 949 | "def save_archives(root, treemap, db):\n", 950 | " dbpath = root/'memery.pt'\n", 951 | " if dbpath.exists():\n", 952 | "# dbpath.rename(root/'memery-bak.pt')\n", 953 | " dbpath.unlink()\n", 954 | " torch.save(db, dbpath)\n", 955 | " \n", 956 | " treepath = root/'memery.ann'\n", 957 | " if treepath.exists():\n", 958 | "# treepath.rename(root/'memery-bak.ann')\n", 959 | " treepath.unlink()\n", 960 | " treemap.save(str(treepath))\n", 961 | " \n", 962 | " return(str(dbpath), str(treepath))" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": null, 968 | "metadata": { 969 | "execution": { 970 | "iopub.execute_input": "2021-05-17T23:27:47.270195Z", 971 | "iopub.status.busy": "2021-05-17T23:27:47.270078Z", 972 | "iopub.status.idle": "2021-05-17T23:27:47.361769Z", 973 | "shell.execute_reply": "2021-05-17T23:27:47.361432Z", 974 | "shell.execute_reply.started": "2021-05-17T23:27:47.270180Z" 975 | } 976 | }, 977 | "outputs": [], 978 | "source": [ 979 | "save_archives(root, t, db)" 980 | ] 981 | }, 982 | { 983 | "cell_type": "markdown", 984 | "metadata": {}, 985 | "source": [ 986 | "## Ranker\n", 987 | "\n", 988 | "Takes a query and an index and finds the nearest neighbors 
982 | { 983 | "cell_type": "markdown", 984 | "metadata": {}, 985 | "source": [ 986 | "## Ranker\n", 987 | "\n", 988 | "Takes a query and an index and finds the nearest neighbors or most similar scores. Ideally this is just a simple Annoy `get_nns_by_vector`, or, in the simplest case, a similarity score across all the vectors." 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": null, 994 | "metadata": { 995 | "execution": { 996 | "iopub.execute_input": "2021-05-17T23:27:55.387079Z", 997 | "iopub.status.busy": "2021-05-17T23:27:55.386363Z", 998 | "iopub.status.idle": "2021-05-17T23:27:55.397260Z", 999 | "shell.execute_reply": "2021-05-17T23:27:55.394454Z", 1000 | "shell.execute_reply.started": "2021-05-17T23:27:55.386997Z" 1001 | } 1002 | }, 1003 | "outputs": [], 1004 | "source": [ 1005 | "def ranker(query_vec, treemap):\n", 1006 | " nn_indexes = treemap.get_nns_by_vector(query_vec[0], treemap.get_n_items())\n", 1007 | " return(nn_indexes)" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": null, 1013 | "metadata": { 1014 | "execution": { 1015 | "iopub.execute_input": "2021-05-17T23:26:15.001671Z", 1016 | "iopub.status.busy": "2021-05-17T23:26:15.001534Z", 1017 | "iopub.status.idle": "2021-05-17T23:26:15.004383Z", 1018 | "shell.execute_reply": "2021-05-17T23:26:15.003536Z", 1019 | "shell.execute_reply.started": "2021-05-17T23:26:15.001654Z" 1020 | } 1021 | }, 1022 | "outputs": [], 1023 | "source": [ 1024 | "from IPython.display import Image as IMG" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": null, 1030 | "metadata": { 1031 | "execution": { 1032 | "iopub.execute_input": "2021-05-17T23:27:56.008469Z", 1033 | "iopub.status.busy": "2021-05-17T23:27:56.008293Z", 1034 | "iopub.status.idle": "2021-05-17T23:27:56.012267Z", 1035 | "shell.execute_reply": "2021-05-17T23:27:56.011056Z", 1036 | "shell.execute_reply.started": "2021-05-17T23:27:56.008450Z" 1037 | } 1038 | }, 1039 | "outputs": [], 1040 | "source": [ 1041 | "def printi(filenames, n=5):\n", 1042 | " for im in filenames[:n]:\n", 1043 | " display(IMG(filename=im[0], width=200))" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "code", 1048 | "execution_count": null, 1049 | "metadata": { 1050 | "execution": { 1051 | "iopub.execute_input": "2021-05-17T23:27:55.520152Z", 1052 | "iopub.status.busy": "2021-05-17T23:27:55.519884Z", 1053 | "iopub.status.idle": "2021-05-17T23:27:55.524543Z", 1054 | "shell.execute_reply": "2021-05-17T23:27:55.523632Z", 1055 | "shell.execute_reply.started": "2021-05-17T23:27:55.520126Z" 1056 | } 1057 | }, 1058 | "outputs": [], 1059 | "source": [ 1060 | "def rank_5(text):\n", 1061 | " query_vec = text_encoder(text, device)\n", 1062 | " indexes = ranker(query_vec, t)\n", 1063 | " filenames = [[v['fpath'] for k,v in db.items() if v['index'] == ind] for ind in indexes]\n", 1064 | " return(filenames)" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "code", 1069 | "execution_count": null, 1070 | "metadata": { 1071 | "execution": { 1072 | "iopub.execute_input": "2021-05-17T23:27:56.551897Z", 1073 | "iopub.status.busy": "2021-05-17T23:27:56.551621Z", 1074 | "iopub.status.idle": "2021-05-17T23:27:57.496956Z", 1075 | "shell.execute_reply": "2021-05-17T23:27:57.496325Z", 1076 | "shell.execute_reply.started": "2021-05-17T23:27:56.551836Z" 1077 | } 1078 | }, 1079 | "outputs": [], 1080 | "source": [ 1081 | "printi(rank_5(args['query']))" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "markdown", 1086 | "metadata": {}, 1087 | "source": [ 1088 | "I think we have to call that a success!" 1089 | ] 1090 | },
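For comparison, the fallback mentioned in the Ranker notes, a similarity score across all the vectors with no tree at all, is only a few lines: the stored embeddings and the query vector are already normalized, so cosine similarity reduces to a dot product. A sketch, assuming the `embed` tensors share a device and dtype:

```python
def brute_force_ranker(query_vec, db):
    # Stack every stored embedding into one (N, 512) matrix
    # and rank by dot product with the query vector.
    slugs = list(db.keys())
    embeds = torch.stack([db[k]['embed'] for k in slugs])
    scores = embeds @ query_vec[0]
    order = torch.argsort(scores, descending=True)
    return [db[slugs[i]]['fpath'] for i in order.tolist()]
```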
1091 | { 1092 | "cell_type": "markdown", 1093 | "metadata": {}, 1094 | "source": [ 1095 | "## Gateway\n", 1096 | "\n", 1097 | "Takes a query and processes it through either Indexing Flow or Querying Flow, passing along arguments. The main entrypoint for each iteration of the index/query process.\n", 1098 | "\n", 1099 | "Querying Flow can technically process either text or image search, because the CLIP encoder will put them into the same embedding space. So we might as well build in a method for either, and make it available to the user, since it's impressive and useful and relatively easy to build.\n", 1100 | "\n", 1101 | "Eventually the Gateway process probably needs to be quite complicated, for serving all the different users and for delivering REST APIs to different clients. For now we will run this locally, in a notebook. Then build out a GUI from there using `mediapy` or `widgets`. That should reveal the basic necessities of the UI, and then we can separate out the GUI client from the server." 1102 | ] 1103 | }, 1104 | { 1105 | "cell_type": "code", 1106 | "execution_count": null, 1107 | "metadata": { 1108 | "execution": { 1109 | "iopub.execute_input": "2021-05-17T23:28:07.161111Z", 1110 | "iopub.status.busy": "2021-05-17T23:28:07.160442Z", 1111 | "iopub.status.idle": "2021-05-17T23:28:07.173215Z", 1112 | "shell.execute_reply": "2021-05-17T23:28:07.171845Z", 1113 | "shell.execute_reply.started": "2021-05-17T23:28:07.161028Z" 1114 | } 1115 | }, 1116 | "outputs": [], 1117 | "source": [ 1118 | "def indexFlow(path):\n", 1119 | " root = Path(path)\n", 1120 | " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 1121 | " \n", 1122 | " filepaths = get_image_files(root)\n", 1123 | " archive_db, new_files = files_archive_loader(filepaths, root, device)\n", 1124 | " print(f\"Loaded {len(archive_db)} encodings\")\n", 1125 | " print(f\"Encoding {len(new_files)} new images\")\n", 1126 | " crafted_files = crafter(new_files, device)\n", 1127 | " new_embeddings = image_encoder(crafted_files, device)\n", 1128 | " \n", 1129 | " db = join_all(archive_db, new_files, new_embeddings)\n", 1130 | " print(\"Building treemap\")\n", 1131 | " t = build_treemap(db)\n", 1132 | " \n", 1133 | " print(f\"Saving {len(db)} images\")\n", 1134 | " save_paths = save_archives(root, t, db)\n", 1135 | " print(\"Done\")\n", 1136 | " return(save_paths)" 1137 | ] 1138 | }, 1139 | { 1140 | "cell_type": "code", 1141 | "execution_count": null, 1142 | "metadata": { 1143 | "execution": { 1144 | "iopub.execute_input": "2021-05-17T23:28:10.313351Z", 1145 | "iopub.status.busy": "2021-05-17T23:28:10.313166Z", 1146 | "iopub.status.idle": "2021-05-17T23:28:28.543108Z", 1147 | "shell.execute_reply": "2021-05-17T23:28:28.542515Z", 1148 | "shell.execute_reply.started": "2021-05-17T23:28:10.313334Z" 1149 | } 1150 | }, 1151 | "outputs": [], 1152 | "source": [ 1153 | "save_paths = indexFlow(args['path'])" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "code", 1158 | "execution_count": null, 1159 | "metadata": { 1160 | "execution": { 1161 | "iopub.execute_input": "2021-05-17T23:28:28.544063Z", 1162 | "iopub.status.busy": "2021-05-17T23:28:28.543945Z", 1163 | "iopub.status.idle": "2021-05-17T23:28:28.547992Z", 1164 | "shell.execute_reply": "2021-05-17T23:28:28.547123Z", 1165 | "shell.execute_reply.started": "2021-05-17T23:28:28.544047Z" 1166 | } 1167 | }, 1168 | "outputs": [], 1169 | "source": [ 1170 | "save_paths" 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "markdown", 1175 | "metadata": {},
| "source": [ 1177 | "To search:" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": null, 1183 | "metadata": { 1184 | "execution": { 1185 | "iopub.execute_input": "2021-05-17T23:08:36.054349Z", 1186 | "iopub.status.busy": "2021-05-17T23:08:36.054132Z", 1187 | "iopub.status.idle": "2021-05-17T23:08:36.059544Z", 1188 | "shell.execute_reply": "2021-05-17T23:08:36.058990Z", 1189 | "shell.execute_reply.started": "2021-05-17T23:08:36.054324Z" 1190 | } 1191 | }, 1192 | "outputs": [], 1193 | "source": [ 1194 | "def queryFlow(path, query): \n", 1195 | " root = Path(path)\n", 1196 | " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 1197 | " \n", 1198 | " dbpath = root/'memery.pt'\n", 1199 | " db = db_loader(dbpath)\n", 1200 | " treepath = root/'memery.ann'\n", 1201 | " treemap = treemap_loader(treepath)\n", 1202 | " \n", 1203 | " if treemap == None or db == {}:\n", 1204 | " dbpath, treepath = indexFlow(root)\n", 1205 | " treemap = treemap_loader(treepath)\n", 1206 | " db = file\n", 1207 | " \n", 1208 | " print(f\"Searching {len(db)} images\")\n", 1209 | " query_vec = text_encoder(query, device)\n", 1210 | " indexes = ranker(query_vec, treemap)\n", 1211 | " ranked_files = [[v['fpath'] for k,v in db.items() if v['index'] == ind] for ind in indexes]\n", 1212 | " return(ranked_files)\n", 1213 | "\n", 1214 | " " 1215 | ] 1216 | }, 1217 | { 1218 | "cell_type": "code", 1219 | "execution_count": null, 1220 | "metadata": { 1221 | "execution": { 1222 | "iopub.execute_input": "2021-05-17T23:12:15.974818Z", 1223 | "iopub.status.busy": "2021-05-17T23:12:15.974655Z", 1224 | "iopub.status.idle": "2021-05-17T23:12:16.791693Z", 1225 | "shell.execute_reply": "2021-05-17T23:12:16.791335Z", 1226 | "shell.execute_reply.started": "2021-05-17T23:12:15.974800Z" 1227 | } 1228 | }, 1229 | "outputs": [], 1230 | "source": [ 1231 | "ranked = queryFlow(args['path'], 'dog')" 1232 | ] 1233 | }, 1234 | { 1235 | "cell_type": "code", 1236 | "execution_count": null, 1237 | "metadata": { 1238 | "execution": { 1239 | "iopub.execute_input": "2021-05-17T23:12:16.792617Z", 1240 | "iopub.status.busy": "2021-05-17T23:12:16.792501Z", 1241 | "iopub.status.idle": "2021-05-17T23:12:16.808254Z", 1242 | "shell.execute_reply": "2021-05-17T23:12:16.807905Z", 1243 | "shell.execute_reply.started": "2021-05-17T23:12:16.792601Z" 1244 | } 1245 | }, 1246 | "outputs": [], 1247 | "source": [ 1248 | "printi(ranked)" 1249 | ] 1250 | }, 1251 | { 1252 | "cell_type": "markdown", 1253 | "metadata": {}, 1254 | "source": [ 1255 | "## Interactive process\n", 1256 | "Currently the objective is to take the following inputs:\n", 1257 | "- a location with images\n", 1258 | "- a text or image query,\n", 1259 | "\n", 1260 | "and return the following outputs:\n", 1261 | "- a list of image files within that location ranked by similarity to that query,\n", 1262 | "\n", 1263 | "with a minimum of duplicated effort, and a general ease-of-use for both the programmer and the casual API user." 
1264 | ] 1265 | }, 1266 | { 1267 | "cell_type": "code", 1268 | "execution_count": null, 1269 | "metadata": {}, 1270 | "outputs": [], 1271 | "source": [] 1272 | }, 1273 | { 1274 | "cell_type": "markdown", 1275 | "metadata": {}, 1276 | "source": [ 1277 | "## TODO:\n", 1278 | "\n", 1279 | "- Cleanup repo\n", 1280 | "- Rough interactive GUI\n", 1281 | "\n", 1282 | "- Optimize the image loader and number of trees based on memory and db size\n", 1283 | "- Type annotations\n", 1284 | "\n", 1285 | "## DONE:\n", 1286 | "- _Code for joining archived data to new data_\n", 1287 | "- _Code for saving indexes to archive_\n", 1288 | "- _Flows_\n" 1289 | ] 1290 | }, 1291 | { 1292 | "cell_type": "code", 1293 | "execution_count": null, 1294 | "metadata": {}, 1295 | "outputs": [], 1296 | "source": [] 1297 | } 1298 | ], 1299 | "metadata": { 1300 | "kernelspec": { 1301 | "display_name": "Python 3", 1302 | "language": "python", 1303 | "name": "python3" 1304 | }, 1305 | "language_info": { 1306 | "codemirror_mode": { 1307 | "name": "ipython", 1308 | "version": 3 1309 | }, 1310 | "file_extension": ".py", 1311 | "mimetype": "text/x-python", 1312 | "name": "python", 1313 | "nbconvert_exporter": "python", 1314 | "pygments_lexer": "ipython3", 1315 | "version": "3.7.7" 1316 | } 1317 | }, 1318 | "nbformat": 4, 1319 | "nbformat_minor": 4 1320 | } 1321 | --------------------------------------------------------------------------------