├── .gitignore ├── LICENSE ├── README.md ├── docs └── supported_methods.md ├── examples ├── drug_discovery.md └── multimodal_rag.md ├── pyproject.toml └── radient ├── __init__.py ├── _milvus.py ├── factory.py ├── orchestrate ├── __init__.py ├── runners.py └── workflow.py ├── tasks ├── __init__.py ├── _base.py ├── accelerate.py ├── sinks │ ├── __init__.py │ ├── _base.py │ ├── local │ │ ├── __init__.py │ │ ├── _gkmeans.py │ │ └── gann.py │ └── milvus.py ├── sources │ ├── __init__.py │ ├── _base.py │ ├── ingest.py │ ├── local.py │ └── youtube.py ├── transforms │ ├── __init__.py │ ├── _base.py │ ├── document_screenshot │ │ ├── __init__.py │ │ ├── _base.py │ │ └── pymupdf.py │ ├── speech_to_text │ │ ├── __init__.py │ │ ├── _base.py │ │ └── whisper.py │ └── video_demux │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── default.py │ │ └── ffmpeg.py └── vectorizers │ ├── __init__.py │ ├── _base.py │ ├── _imagebind.py │ ├── audio │ ├── __init__.py │ ├── _base.py │ ├── imagebind.py │ └── torchaudio.py │ ├── graph │ ├── __init__.py │ ├── _base.py │ └── fastrp.py │ ├── image │ ├── __init__.py │ ├── _base.py │ ├── imagebind.py │ └── timm.py │ ├── molecule │ ├── __init__.py │ ├── _base.py │ └── rdkit.py │ ├── multimodal.py │ └── text │ ├── __init__.py │ ├── _base.py │ ├── cohere.py │ ├── imagebind.py │ ├── sbert.py │ ├── sklearn.py │ └── voyage.py ├── utils ├── __init__.py ├── flatten_inputs.py └── lazy_import.py └── vector.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter notebooks 2 | .ipynb_checkpoints 3 | *.ipynb 4 | 5 | # Debugging 6 | .checkpoints 7 | bpe/ 8 | test/ 9 | 10 | # Packaging / distribution 11 | dist/ 12 | build/ 13 | *.egg-info/ 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 Frank Liu 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Radient 2 | 3 | Radient is a developer-friendly, lightweight library for unstructured data ETL, i.e. 
turning audio, graphs, images, molecules, text, and other data types into embeddings. Radient supports simple vectorization as well as complex vector-centric workflows.

```shell
$ pip install radient
```

If you find this project helpful or interesting, please consider giving it a star. :star:

### Getting started

Basic vectorization can be performed as follows:

```python
from radient import text_vectorizer
vz = text_vectorizer()
vz.vectorize("Hello, world!")
# Vector([-3.21440510e-02, -5.10351397e-02, 3.69579718e-02, ...])
```

The above snippet vectorizes the string `"Hello, world!"` using a default model, namely `bge-small-en-v1.5` from `sentence-transformers`. If your Python environment does not contain the `sentence-transformers` library, Radient will prompt you for it:

```python
vz = text_vectorizer()
# Vectorizer requires sentence-transformers. Install? [Y/n]
```

You can type "Y" to have Radient install it for you automatically.

Each vectorizer can take a `method` parameter along with optional keyword arguments which get passed directly to the underlying vectorization library. For example, we can pick Mixedbread AI's `mxbai-embed-large-v1` model using the `sentence-transformers` library via:

```python
vz_mbai = text_vectorizer(method="sentence-transformers", model_name_or_path="mixedbread-ai/mxbai-embed-large-v1")
vz_mbai.vectorize("Hello, world!")
# Vector([ 0.01729078, 0.04468533, 0.00055427, ...])
```

### More than just text

With Radient, you're not limited to text. Audio, graphs, images, and molecules can be vectorized as well:

```python
from pathlib import Path

import networkx as nx

from radient import (
    audio_vectorizer,
    graph_vectorizer,
    image_vectorizer,
    molecule_vectorizer,
)
avec = audio_vectorizer().vectorize(str(Path.home() / "audio.wav"))
gvec = graph_vectorizer().vectorize(nx.karate_club_graph())
ivec = image_vectorizer().vectorize(str(Path.home() / "image.jpg"))
mvec = molecule_vectorizer().vectorize("O=C=O")
```

A partial list of methods and optional kwargs supported by each modality can be found [here](https://github.com/fzliu/radient/blob/main/docs/supported_methods.md).

For production use cases with large quantities of data, performance is key. Radient also provides an `accelerate` function to optimize vectorizers on-the-fly:

```python
import numpy as np
vz = text_vectorizer()
vec0 = vz.vectorize("Hello, world!")
vz.accelerate()
vec1 = vz.vectorize("Hello, world!")
np.allclose(vec0, vec1)
# True
```

On a 2.3 GHz Quad-Core Intel Core i7, the original vectorizer returns in ~32ms, while the accelerated vectorizer returns in ~17ms.

### Building unstructured data ETL

Aside from running experiments, pure vectorization is not particularly useful. Mirroring structured data ETL pipelines, unstructured data ETL workloads often require a combination of four components: a data __source__ where unstructured data is stored, one or more __transform__ modules that perform data conversions and pre-processing, a __vectorizer__ which turns the data into semantically rich embeddings, and a __sink__ to persist the vectors once they have been computed.

Radient provides a `Workflow` object specifically for building vector-centric ETL applications.
With Workflows, you can combine any number of each of these components into a directed graph. For example, a workflow to continuously read text documents from Google Drive, vectorize them with [Voyage AI](https://www.voyageai.com/), and load them into Milvus might look like:

```python
from radient import make_operator
from radient import Workflow

extract = make_operator("source", "google-drive", task_params={"folder": "My Files"})
transform = make_operator("transform", "read-text", task_params={})
vectorize = make_operator("vectorizer", "text", task_params={"method": "voyage-ai"})
load = make_operator("sink", "milvus", task_params={"operation": "insert"})

wf = (
    Workflow()
    .add(extract, name="extract")
    .add(transform, name="transform")
    .add(vectorize, name="vectorize")
    .add(load, name="load")
)
```

You can use accelerated vectorizers and transforms in a Workflow by specifying `accelerate=True` for all supported operators.

### Supported vectorizer engines

Radient builds atop work from the broader ML community. Most vectorizers come from other libraries:

- [ImageBind](https://imagebind.metademolab.com/)
- [PyTorch Image Models](https://huggingface.co/timm)
- [RDKit](https://rdkit.org)
- [Sentence Transformers](https://sbert.net)
- [scikit-learn](https://scikit-learn.org)
- [TorchAudio](https://pytorch.org/audio)

On-the-fly model acceleration is done via [ONNX](https://onnx.ai).

A massive thank you to all the creators and maintainers of these libraries.

### Coming soon™

A couple of features slated for the near term (hopefully):
1) Sparse vector, binary vector, and multi-vector support
2) Support for all relevant embedding models on Hugging Face

LLM connectors _will not_ be a feature that Radient provides. Building context-aware systems around LLMs is a complex task, and not one that Radient intends to solve. Projects such as [Haystack](https://haystack.deepset.ai/) and [LlamaIndex](https://www.llamaindex.ai/) are two of the many great options to consider if you're looking to extract maximum RAG performance.

A full write-up on Radient will come later, along with more sample applications, so stay tuned.

--------------------------------------------------------------------------------
/docs/supported_methods.md:
--------------------------------------------------------------------------------

## Supported methods

Below is a series of tables that lists out supported method + kwarg pairs for each modality of data in Radient.

__Audio__ [^1]

| `method` | `model_name` | Description |
| --- | --- | --- |
| `torchaudio` | `WAV2VEC2_BASE` | Wav2vec 2.0 model ("base" architecture), pre-trained on 960 hours of unlabeled audio from LibriSpeech dataset [Panayotov et al., 2015] (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned. |
| `torchaudio` | `WAV2VEC2_LARGE` | Wav2vec 2.0 model ("large" architecture), pre-trained on 960 hours of unlabeled audio from LibriSpeech dataset [Panayotov et al., 2015] (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned. |
| 11 | | `torchaudio` | `WAV2VEC2_LARGE_LV60K` | Wav2vec 2.0 model ("large-lv60k" architecture), pre-trained on 60,000 hours of unlabeled audio from Libri-Light dataset [Kahn et al., 2020], not fine-tuned. | 12 | | `torchaudio` | `WAV2VEC2_XLSR53` | Wav2vec 2.0 model ("base" architecture), pre-trained on 56,000 hours of unlabeled audio from multiple datasets ( Multilingual LibriSpeech [Pratap et al., 2020], CommonVoice [Ardila et al., 2020] and BABEL [Gales et al., 2014]), not fine-tuned. | 13 | | `torchaudio` | `WAV2VEC2_XLSR_300M` | XLS-R model with 300 million parameters, pre-trained on 436,000 hours of unlabeled audio from multiple datasets ( Multilingual LibriSpeech [Pratap et al., 2020], CommonVoice [Ardila et al., 2020], VoxLingua107 [Valk and Alumäe, 2021], BABEL [Gales et al., 2014], and VoxPopuli [Wang et al., 2021]) in 128 languages, not fine-tuned. | 14 | | `torchaudio` | `WAV2VEC2_XLSR_1B` | XLS-R model with 1 billion parameters, pre-trained on 436,000 hours of unlabeled audio from multiple datasets ( Multilingual LibriSpeech [Pratap et al., 2020], CommonVoice [Ardila et al., 2020], VoxLingua107 [Valk and Alumäe, 2021], BABEL [Gales et al., 2014], and VoxPopuli [Wang et al., 2021]) in 128 languages, not fine-tuned. | 15 | | `torchaudio` | `WAV2VEC2_XLSR_2B` | XLS-R model with 2 billion parameters, pre-trained on 436,000 hours of unlabeled audio from multiple datasets ( Multilingual LibriSpeech [Pratap et al., 2020], CommonVoice [Ardila et al., 2020], VoxLingua107 [Valk and Alumäe, 2021], BABEL [Gales et al., 2014], and VoxPopuli [Wang et al., 2021]) in 128 languages, not fine-tuned. | 16 | | `torchaudio` | `HUBERT_BASE` | HuBERT model ("base" architecture), pre-trained on 960 hours of unlabeled audio from LibriSpeech dataset [Panayotov et al., 2015] (the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned. | 17 | | `torchaudio` | `HUBERT_LARGE` | HuBERT model ("large" architecture), pre-trained on 60,000 hours of unlabeled audio from Libri-Light dataset [Kahn et al., 2020], not fine-tuned. | 18 | | `torchaudio` | `HUBERT_XLARGE` | HuBERT model ("extra large" architecture), pre-trained on 60,000 hours of unlabeled audio from Libri-Light dataset [Kahn et al., 2020], not fine-tuned. | 19 | | `torchaudio` | `WAVLM_BASE` | WavLM Base model ("base" architecture), pre-trained on 960 hours of unlabeled audio from LibriSpeech dataset [Panayotov et al., 2015], not fine-tuned. | 20 | | `torchaudio` | `WAVLM_BASE_PLUS` | WavLM Base+ model ("base" architecture), pre-trained on 60,000 hours of Libri-Light dataset [Kahn et al., 2020], 10,000 hours of GigaSpeech [Chen et al., 2021], and 24,000 hours of VoxPopuli [Wang et al., 2021], not fine-tuned. | 21 | | `torchaudio` | `WAVLM_LARGE` | WavLM Large model ("large" architecture), pre-trained on 60,000 hours of Libri-Light dataset [Kahn et al., 2020], 10,000 hours of GigaSpeech [Chen et al., 2021], and 24,000 hours of VoxPopuli [Wang et al., 2021], not fine-tuned. | 22 | 23 | __Graph__ 24 | 25 | | `method` | `dimension` | Description | 26 | | --- | --- | --- | 27 | | `fastrp` | any positive integer | The FastRP (Fast Random Projection) algorithm is an efficient method for node embedding in graphs, utilizing random projections to reduce dimensionality while approximately preserving pairwise distances among nodes. 
| 28 | 29 | __Image__ 30 | 31 | | `method` | `model_name` | Description | 32 | | --- | --- | --- | 33 | | `timm` | any model in `timm.list_models(pretrained=True)` | | 34 | 35 | __Molecule__ 36 | 37 | | `method` | `fingerprint_type` | Description | 38 | | --- | --- | --- | 39 | | `rdkit` | `topological` | Topological fingerprints represent molecules by encoding the presence or absence of particular substructures and patterns of connectivity within the molecule, focusing on the molecule's structural topology without considering the three-dimensional layout. | 40 | | `rdkit` | `morgan` | Morgan fingerprints characterize the molecular structure based on the connectivity of atoms within a defined radius around each atom, capturing the local chemical environment in a more detailed way than simple topological features. | 41 | 42 | __Text__ 43 | 44 | | `method` | `model_name_or_path` | Description | 45 | | --- | --- | --- | 46 | | `sentence-transformers` | any pretrained [Sentence Transformers model](https://huggingface.co/models?library=sentence-transformers) | | 47 | 48 | --- 49 | 50 | [^1]: [Torchaudio documentation](https://pytorch.org/audio/stable/pipelines.html) 51 | -------------------------------------------------------------------------------- /examples/drug_discovery.md: -------------------------------------------------------------------------------- 1 | ## Drug discovery 2 | 3 | Outside of biotech circles, AI-powered drug discovery isn't a well-known use case for embedding vectors. In reality, billion-scale vector search is frequently in this industry, and this particular application demonstrates how vectors can be used to represent non-traditional unstructured data. 4 | 5 | By way of example, let's use the default `molecule_vectorizer` to generate embedding vectors for molecular structures. We'll first grab the dataset of FDA-approved drugs and their corresponding SMILES strings (SMILES is a way to describe molecular structure using a string of letters and symbols). We'll then vectorize all of these SMILES strings and search the results to see if we can discover alternatives to Ibuprofen (often sold as Advil or Motrin), an analgesic, anti-inflammatory drug. 6 | 7 | ```shell 8 | pip install -U radient 9 | ``` 10 | 11 | We'll start with our imports: 12 | 13 | ```python 14 | import csv 15 | 16 | import numpy as np 17 | import requests 18 | import scipy as sp 19 | 20 | from radient import molecule_vectorizer 21 | ``` 22 | 23 | From here, let's use `requests` and `csv` to download and parse the dataset, respectively: 24 | 25 | ```python 26 | r = requests.get("https://gist.githubusercontent.com/fzliu/8052bd4d609bc6260ab7e8c838d2f518/raw/f1c9efb816d6b8514c0a643323f7afa29372b1c4/fda_approved_structures.csv") 27 | csv_data = csv.reader(r.text.splitlines(), delimiter=",") 28 | mol_data = [{"name": d[0], "mol": d[1]} for d in csv_data] 29 | ``` 30 | 31 | Now we'll create our vectorizer and compute vectors for all molecules. The query vector is generated from the SMILES string for Ibuprofen: 32 | 33 | ```python 34 | vectorizer = molecule_vectorizer() 35 | vec_data = vectorizer.vectorize([row["mol"] for row in mol_data]) 36 | query = vectorizer.vectorize("CC(C)CC1=CC=C(C=C1)C(C)C(O)=O") 37 | ``` 38 | 39 | With that out of the way, let's find the "closest" drugs to Ibuprofen. 
We're using [Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index) since the default molecule vectorizer returns binary vectors:

```python
dists = sp.spatial.distance.cdist(
    query[np.newaxis,...],
    vec_data,
    metric="jaccard"
).squeeze()
top10 = [mol_data[i]["name"] for i in np.argsort(dists)[:10]]
print(top10)
```

['Dexibuprofen', 'Ibuprofen', 'Loxoprofen', 'Phenylacetic acid', 'Naproxen', 'Fenoprofen', 'Ketoprofen', 'Dexketoprofen', 'Mandelic acid', 'Oxeladin']

Ibuprofen's similarity with many of these drugs is clear: Loxoprofen, Phenylacetic acid, Naproxen, Fenoprofen, and Ketoprofen are, like Ibuprofen, all analgesic, anti-inflammatory drugs. Surprisingly, Mandelic acid and Oxeladin are relevant too; some studies show that they also possess anti-inflammatory properties.

Keep in mind that this is a highly simplified example - modern drug discovery pipelines are extremely complex, and vector search is one of the many ways that new molecules are discovered.

For convenience, here's the full script:

```python
import csv

import numpy as np
import requests
import scipy as sp

from radient import molecule_vectorizer

r = requests.get("https://gist.githubusercontent.com/fzliu/8052bd4d609bc6260ab7e8c838d2f518/raw/f1c9efb816d6b8514c0a643323f7afa29372b1c4/fda_approved_structures.csv")
csv_data = csv.reader(r.text.splitlines(), delimiter=",")
mol_data = [{"name": d[0], "mol": d[1]} for d in csv_data]

vectorizer = molecule_vectorizer()
vec_data = vectorizer.vectorize([row["mol"] for row in mol_data])
query = vectorizer.vectorize("CC(C)CC1=CC=C(C=C1)C(C)C(O)=O")

dists = sp.spatial.distance.cdist(
    query[np.newaxis,...],
    vec_data,
    metric="jaccard"
).squeeze()
top10 = [mol_data[i]["name"] for i in np.argsort(dists)[:10]]
print(top10)
```

--------------------------------------------------------------------------------
/examples/multimodal_rag.md:
--------------------------------------------------------------------------------

## Multimodal RAG (with Meta Chameleon 7B)

We've seen an influx of powerful multimodal capabilities in many LLMs, notably [GPT-4o](https://openai.com/index/hello-gpt-4o) and [Gemini](https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024). Moving forward, most of the modalities won't be "searchable" in the traditional sense - using human-labelled tags or descriptions to retrieve relevant video or audio is not a scalable solution for multimodal RAG. We need to use dense vectors as semantic representations for _all modalities of data_.

In this example, we'll vectorize audio, text, and images into the same embedding space with [ImageBind](https://imagebind.metademolab.com/), store the vectors in [Milvus Lite](https://milvus.io/docs/milvus_lite.md), retrieve all relevant data given a query, and input multimodal data as context into [Chameleon](https://ai.meta.com/blog/meta-fair-research-new-releases/)-7B (vision/language model).

If you'd like to follow along but aren't 100% familiar with RAG just yet, LlamaIndex provides an excellent yet concise [RAG overview](https://docs.llamaindex.ai/en/stable/getting_started/concepts/).

_Figure: Multimodal RAG using Radient._

We'll start by specifying our imports. We'll use `radient` to build a video vectorization workflow and `transformers` to run Chameleon:

```shell
pip install -U radient
pip install -U transformers
```

```python
import torch

from radient import make_operator
from radient import Workflow
from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
from PIL import Image
```

We're going to use the 2024 Google I/O Pre-Show as the video for this example (linked below). Prior to taking the stage, musician Marc Rebillet climbed out of a human-sized coffee mug placed to the side of the stage, and began using Google's MusicFX DJ to create AI-generated beats and tunes as a part of his performance. The video is a great example of a rich, multimodal piece of unstructured data which we can use to perform multimodal RAG:

_Video: [2024 Google I/O Pre-Show](https://www.youtube.com/watch?v=wwk1QIDswcQ)_

33 | 34 | Turning this video into vectors is a multistep process that involves: 1) splitting the video into a combination of audio and visual snippets, 2) vectorizing all snippets into the same embedding space, and 3) storing these into our vector database. Radient provides a `Workflow` object to repeatably run these steps: 35 | 36 | ```python 37 | # The `read` operator grabs a video or playlist from Youtube and stores it locally. 38 | # The `demux` operator splits the video into audio and visual snippets at 5.0 second intervals. 39 | # The `vectorize` operator embeds all audio snippets and frames into a common embedding space using ImageBind. 40 | # The `store` operator stores the vectors into Milvus. If you don't specify a URI, it will use local mode by default. 41 | read = make_operator(task_name="source", task_type="youtube", task_params={"url": "https://www.youtube.com/watch?v=wwk1QIDswcQ"}) 42 | demux = make_operator(task_name="transform", task_type="video-demux", task_params={"method": "ffmpeg", "interval": 5.0}) 43 | vectorize = make_operator(task_name="vectorizer", task_type="multimodal", task_params={"method": "imagebind"}) 44 | store = make_operator(task_name="sink", task_type="milvus", task_params={"operation": "insert"}) 45 | 46 | # All of these operators are then combined into an end-to-end workflow. 47 | insert_wf = (Workflow() 48 | .add(read, name="read") 49 | .add(demux, name="demux") 50 | .add(vectorize, name="vectorize") 51 | .add(store, name="store") 52 | ) 53 | ``` 54 | 55 | We can then run the workflow to process our video file: 56 | 57 | ```python 58 | insert_wf() 59 | ``` 60 | 61 | If all goes well, you should see an output that looks something like this: 62 | 63 | ``` 64 | [[{'insert_count': 258, 'ids': [450857205535866880, 450857205535866881, ... 65 | {'insert_count': 258, 'ids': [450857205569946116, 450857205569946117, ...]] 66 | ``` 67 | 68 | This is the result of the two `insert` operations into Milvus - one for the audio vectors, and one for the image vectors. 69 | 70 | All the data we need is now in our vector database; given some query text, we can now search nearest neighbors in multiple modalities. Searches can be done with a workflow as well: 71 | 72 | ```python 73 | vectorize = make_operator("vectorizer", "text", task_params={"method": "imagebind"}) 74 | search = make_operator("sink", "milvus", task_params={"operation": "search", "output_fields": None}) 75 | 76 | search_wf = (Workflow() 77 | .add(vectorize, name="vectorize") 78 | .add(search, name="search") 79 | ) 80 | ``` 81 | 82 | The output of this workflow are the top ten results for each query. We can test this by passing a text prompt into it: 83 | 84 | ```python 85 | prompt = "What was weird about the coffee mug?" 86 | search_wf(data=prompt) 87 | ``` 88 | 89 | The output should look something like this: 90 | 91 | ``` 92 | [[[{'id': 450857205535866888, 'distance': 0.27359023690223694, 'entity': {}}, 93 | {'id': 450857205535866886, 'distance': 0.26841503381729126, 'entity': {}}, 94 | ...]]] 95 | ``` 96 | 97 | We'll need to pass a few extra parameters - namely, a [top-k limit](https://milvus.io/docs/single-vector-search.md#Basic-search) and [output fields](https://milvus.io/docs/single-vector-search.md#Basic-search) - before we can pass the results into Chameleon's context window. 
These variables are passed directly to the `search` task, which forwards them to Milvus as keyword arguments:

```python
search_vars = {
    "limit": 1,            # top-k limit
    "output_fields": ["*"] # output fields
}
results = search_wf(
    extra_vars={"search": search_vars},
    data=prompt
)
results
```

The results are now exactly what we need:

```
[[[{'id': 450857205535866888,
    'distance': 0.27359023690223694,
    'entity': {'data': '/your/home/.radient/data/video_demux/b53ebb6f-6e8e-476c-8b10-7888932c9a81/frame_0006.png',
    'modality': 'image'}}]]]
```

Here's what the data stored in the returned entity (`frame_0006.png`) looks like:
123 | 124 |


_Figure: Most relevant context retrieved with the prompt "What was weird about the coffee mug?"_

126 | 127 | We've now completed the indexing and retrieval portion of our multimodal RAG system; the final step is to pass the results into Chameleon. We can do this by loading the tokenizer and model, then generating text based on the prompt and image: 128 | 129 | ```python 130 | processor = ChameleonProcessor.from_pretrained("nopperl/chameleon-7b-hf") 131 | model = ChameleonForConditionalGeneration.from_pretrained("nopperl/chameleon-7b-hf", torch_dtype=torch.bfloat16, device_map="cpu") 132 | 133 | image = Image.open(results[0][0][0]["entity"]["data"]) 134 | prompt = f"{prompt}" 135 | 136 | inputs = processor(prompt, image, return_tensors="pt").to(model.device, dtype=torch.bfloat16) 137 | out = model.generate(**inputs, max_new_tokens=50, do_sample=False) 138 | generated_text = processor.batch_decode(out, skip_special_tokens=False)[0] 139 | print(generated_text) 140 | ``` 141 | 142 | Which returns something like this (YMMV, depending on the temperature that you set): 143 | 144 | ``` 145 | The coffee mug was weirder because of the person in the image. 146 | ``` 147 | 148 | And that's it! We've successfully built a multimodal RAG system in just a few lines of code. Although we used only one video in this example, this framework is extensible to any number of videos. 149 | 150 | This example is available on [Google Colab](https://colab.research.google.com/drive/1Z13NffkMpGjipBSExhsxQuqo28gL9VpF). For convenience, here's the full script: 151 | 152 | ```shell 153 | pip install -U radient 154 | pip install -U transformers 155 | ``` 156 | 157 | ```python 158 | from radient import make_operator 159 | from radient import Workflow 160 | from transformers import ChameleonProcessor, ChameleonForConditionalGeneration 161 | from PIL import Image 162 | 163 | # 164 | # Add multimodal (visual + audio) data into our vector database. 165 | # 166 | 167 | read = make_operator(optype="source", method="youtube", task_params={"url": "https://www.youtube.com/watch?v=wwk1QIDswcQ"}) 168 | demux = make_operator(optype="transform", method="video-demux", task_params={"interval": 5.0}) 169 | vectorize = make_operator(optype="vectorizer", method="imagebind", modality="multimodal", task_params={}) 170 | store = make_operator(optype="sink", method="milvus", task_params={"operation": "insert"}) 171 | 172 | insert_wf = (Workflow() 173 | .add(read, name="read") 174 | .add(demux, name="demux") 175 | .add(vectorize, name="vectorize") 176 | .add(store, name="store") 177 | ) 178 | 179 | # 180 | # With data ingestion complete, we can now create a workflow for searches. 181 | # 182 | 183 | vectorize = make_operator(optype="vectorizer", method="imagebind", modality="text", task_params={}) 184 | search = make_operator(optype="sink", method="milvus", task_params={"operation": "search", "output_fields": None}) 185 | 186 | search_wf = (Workflow() 187 | .add(vectorize, name="vectorize") 188 | .add(search, name="search") 189 | ) 190 | search_vars = { 191 | "limit": 1, 192 | "output_fields": ["*"], 193 | "filter": 'modality like "image"', 194 | } 195 | 196 | # 197 | # Perform the search and send the results to Chameleon. 
#

prompt = "What was weird about the coffee mug?"

results = search_wf(
    extra_vars={"search": search_vars},
    data=prompt
)

processor = ChameleonProcessor.from_pretrained("nopperl/chameleon-7b-hf")
model = ChameleonForConditionalGeneration.from_pretrained("nopperl/chameleon-7b-hf", device_map="cpu")

image = Image.open(results[0][0][0]["entity"]["data"])
prompt = f"{prompt}<image>"  # append the image placeholder token expected by the Chameleon processor

inputs = processor(prompt, image, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=50, do_sample=False)
generated_text = processor.batch_decode(out, skip_special_tokens=False)[0]

#
# Print our result.
#
print(generated_text)
```

---

A few notes and other parting words:

1. We're using ImageBind in this example because it seems to be one of the more powerful multimodal embedding models circa July 2024. [AudioCLIP](https://arxiv.org/abs/2106.13043) and [UForm](https://github.com/unum-cloud/uform) are two other options that may be interesting to play around with, although UForm doesn't support audio just yet. Either way, we'll see an influx of multimodal embedding models to pair with LMMs - I'm experimenting with a couple of training strategies myself and hope to show something soon™.

2. Although it doesn't support audio modalities yet, Meta's Chameleon is still a solid option for multimodal RAG. The model is trained on a large and diverse multimodal (text and image) dataset, and it's been shown to perform well on many simpler tasks despite its small size. I'll update this example once there's a solid LMM that supports audio along with image and text modalities.

3. As an unstructured data ETL framework, Radient is only meant to solve the _retrieval_ portion of retrieval-augmented generation rather than be an end-to-end RAG solution. [LlamaIndex](https://www.llamaindex.ai/) and [Haystack](https://haystack.deepset.ai/) are two of the many great open source options to consider if you're looking to extract maximum RAG performance.
230 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "radient" 3 | version = "2024.11.28" 4 | authors = [ 5 | { name="Frank Liu", email="frank@frankzliu.com" }, 6 | ] 7 | requires-python = '>=3.9' 8 | description = "Turn unstructured data into vectors" 9 | readme = "README.md" 10 | dependencies=[ 11 | "numpy >= 1.19", 12 | "pip >= 20.1", 13 | ] 14 | 15 | classifiers=[ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: BSD License", 18 | ] 19 | 20 | [project.urls] 21 | Homepage = "https://github.com/fzliu/radient" 22 | Issues = "https://github.com/fzliu/radient/issues" 23 | -------------------------------------------------------------------------------- /radient/__init__.py: -------------------------------------------------------------------------------- 1 | # Vector base class 2 | from radient.vector import Vector 3 | 4 | # Vectorization only 5 | from radient.tasks.vectorizers import ( 6 | audio_vectorizer, 7 | graph_vectorizer, 8 | image_vectorizer, 9 | molecule_vectorizer, 10 | text_vectorizer, 11 | multimodal_vectorizer 12 | ) 13 | 14 | # Orchestration 15 | from radient.factory import make_operator 16 | from radient.orchestrate.runners import ( 17 | LocalRunner, 18 | LazyLocalRunner 19 | ) 20 | from radient.orchestrate.workflow import Workflow -------------------------------------------------------------------------------- /radient/_milvus.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, TYPE_CHECKING 2 | 3 | from radient.utils.lazy_import import LazyImport 4 | 5 | if TYPE_CHECKING: 6 | from pymilvus import MilvusClient 7 | import pymilvus 8 | else: 9 | MilvusClient = LazyImport("pymilvus", attribute="MilvusClient", min_version="2.4.2") 10 | pymilvus = LazyImport("pymilvus", min_version="2.4.2") # Milvus Python SDK 11 | 12 | 13 | class _MilvusInterface(object): 14 | """Interface to the Milvus vector database. 15 | 16 | This interface also works with Zilliz Cloud (https://zilliz.com/cloud). 17 | """ 18 | 19 | _clients = {} 20 | _collection_fields = {} 21 | 22 | def __new__(cls, *args, **kwargs): 23 | return cls._get_client(*args, **kwargs) 24 | 25 | @classmethod 26 | def _get_client( 27 | cls, 28 | milvus_uri: str, 29 | collection_name: str, 30 | dimension: Optional[int] = None 31 | ) -> tuple["MilvusClient", dict[str, str]]: 32 | 33 | milvus_uri = milvus_uri.replace("localhost", "127.0.0.1") 34 | 35 | # If a local Milvus installation was specified, check to see if it's up 36 | # and running first. If not, prompt the user and start an embedded 37 | # Milvus instance. 38 | if milvus_uri not in cls._clients: 39 | pymilvus.connections.connect(uri=milvus_uri) 40 | cls._clients[milvus_uri] = MilvusClient(uri=milvus_uri) 41 | client = cls._clients[milvus_uri] 42 | 43 | # Grab the collection information. If it doesn't exist yet, create it 44 | # with some default settings. With the collection information, we then 45 | # store the vector field names inside the `_collection_fields` global 46 | # object. 
47 | uri_and_coll = (milvus_uri, collection_name) 48 | if uri_and_coll not in cls._collection_fields: 49 | if not client.has_collection(collection_name=collection_name): 50 | client.create_collection( 51 | collection_name=collection_name, 52 | dimension=dimension, 53 | auto_id=True, 54 | enable_dynamic_field=True 55 | ) 56 | info = client.describe_collection(collection_name=collection_name) 57 | fields = {} 58 | # TODO(fzliu): support multiple vector fields of the same type. 59 | for f in info["fields"]: 60 | if f["type"] == pymilvus.DataType.BINARY_VECTOR: 61 | fields["binary"] = f["name"] 62 | elif f["type"] == pymilvus.DataType.FLOAT_VECTOR: 63 | fields["dense"] = f["name"] 64 | elif (pymilvus.__version__ >= "2.4.0" and 65 | f["type"] == pymilvus.DataType.SPARSE_FLOAT_VECTOR): 66 | fields["sparse"] = f["name"] 67 | cls._collection_fields[uri_and_coll] = fields 68 | info = cls._collection_fields[uri_and_coll] 69 | 70 | return (client, info) 71 | -------------------------------------------------------------------------------- /radient/factory.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Type 2 | 3 | from radient.orchestrate.runners import * 4 | from radient.tasks.sinks import * 5 | from radient.tasks.sources import * 6 | from radient.tasks.transforms import * 7 | from radient.tasks.vectorizers import * 8 | 9 | 10 | def make_operator( 11 | task_name: str, 12 | task_type: str, 13 | runner: Optional[Type] = None, 14 | task_params: Optional[dict] = None 15 | ) -> Runner: 16 | 17 | runner = runner or LocalRunner 18 | task_params = task_params or {} 19 | 20 | # Create a data sink. 21 | if task_name == "sink": 22 | if task_type == "milvus": 23 | return runner(MilvusSink, task_params=task_params) 24 | else: 25 | raise ValueError(f"unknown data store: {task_type}") 26 | 27 | # Create a data source. 28 | elif task_name == "source": 29 | if task_type == "local": 30 | return runner(LocalSource, task_params=task_params) 31 | elif task_type == "youtube": 32 | return runner(YoutubeSource, task_params=task_params) 33 | elif task_type == "ingest": 34 | return runner(IngestSource, task_params=task_params) 35 | else: 36 | raise ValueError(f"unknown data source: {task_type}") 37 | 38 | # Create a data-to-data transformation. 39 | elif task_name == "transform": 40 | if task_type == "video-demux": 41 | return runner(video_demux_transform, task_params=task_params) 42 | elif task_type == "speech-to-text": 43 | return runner(speech_to_text_transform, task_params=task_params) 44 | else: 45 | raise ValueError(f"unknown transform method: {task_type}") 46 | 47 | # Create an data-to-vector transformation. 
48 | elif task_name == "vectorizer": 49 | if task_type == "audio": 50 | return runner(audio_vectorizer, task_params=task_params) 51 | elif task_type == "graph": 52 | return runner(graph_vectorizer, task_params=task_params) 53 | elif task_type == "image": 54 | return runner(image_vectorizer, task_params=task_params) 55 | elif task_type == "molecule": 56 | return runner(molecule_vectorizer, task_params=task_params) 57 | elif task_type == "text": 58 | return runner(text_vectorizer, task_params=task_params) 59 | elif task_type == "multimodal": 60 | return runner(multimodal_vectorizer, task_params=task_params) 61 | else: 62 | raise NotImplementedError 63 | 64 | -------------------------------------------------------------------------------- /radient/orchestrate/__init__.py: -------------------------------------------------------------------------------- 1 | from radient.orchestrate.runners import LocalRunner 2 | from radient.orchestrate.runners import LazyLocalRunner 3 | from radient.orchestrate.workflow import Workflow -------------------------------------------------------------------------------- /radient/orchestrate/runners.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import random 3 | from typing import Type, Optional 4 | 5 | from radient.utils.flatten_inputs import flattened 6 | 7 | 8 | class Runner(ABC): 9 | 10 | @abstractmethod 11 | def __init__( 12 | self, 13 | task: Type, 14 | task_params: Optional[dict] = None, 15 | flatten_inputs: Optional[str] = False 16 | ): 17 | self._task = task 18 | self._task_params = task_params or {} 19 | self._flatten_inputs = flatten_inputs 20 | self._result = None 21 | 22 | def _evaluate(self): 23 | self._result = self._task( 24 | **self._task_params 25 | ) 26 | 27 | @property 28 | def result(self): 29 | return self._result 30 | 31 | def __call__(self, *args, **kwargs): 32 | if self._flatten_inputs: 33 | outputs = [] 34 | for flat_args, flat_kwargs in flattened(*args, **kwargs): 35 | outputs.append(self.result(*flat_args, **flat_kwargs)) 36 | return outputs 37 | return self.result(*args, **kwargs) 38 | 39 | 40 | class LocalRunner(Runner): 41 | """Evaluate a function or instance locally. 42 | """ 43 | 44 | def __init__(self, *args, **kwargs): 45 | super().__init__(*args, **kwargs) 46 | self._evaluate() 47 | 48 | 49 | class LazyLocalRunner(Runner): 50 | """Lazily (on-demand) evaluate a function or instance. 51 | """ 52 | 53 | def __init__(self, *args, **kwargs): 54 | super().__init__(*args, **kwargs) 55 | 56 | @property 57 | def result(self): 58 | if not self._result: 59 | self._evaluate() 60 | return self._result 61 | -------------------------------------------------------------------------------- /radient/orchestrate/workflow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, OrderedDict 2 | from collections.abc import Callable, Iterator 3 | from graphlib import TopologicalSorter 4 | from typing import Any, Optional, Sequence, Union 5 | 6 | from radient.utils.flatten_inputs import flattened 7 | 8 | 9 | class Workflow: 10 | """Workflows are used to chain together independent tasks together in a 11 | DAG. The output of each task is maintained in a table and passed to 12 | subsequent tasks that need the corresponding result. 
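
    Illustrative usage (assumes `extract`, `transform`, `vectorize`, and
    `load` have already been created with `make_operator`, as in the README):

        wf = (
            Workflow()
            .add(extract, name="extract")
            .add(transform, name="transform")
            .add(vectorize, name="vectorize")
            .add(load, name="load")
        )
        wf()  # equivalent to wf.compile() followed by wf.execute()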
13 | """ 14 | 15 | def __init__(self): 16 | self._runners = OrderedDict() 17 | self._dependencies = {} 18 | self._runner_graph = None 19 | 20 | def __call__(self, *args, **kwargs) -> Any: 21 | self.compile() 22 | return self.execute(*args, **kwargs) 23 | 24 | def add( 25 | self, 26 | runner: Callable, 27 | name: str, 28 | dependencies: Optional[Sequence[str]] = None 29 | ) -> "Workflow": 30 | 31 | # By default, new tasks have a single dependency: the preceding task. 32 | if not dependencies: 33 | names = list(self._runners.keys()) 34 | dependencies = (names[-1],) if names else () 35 | self._dependencies[name] = dependencies 36 | 37 | self._runners[name] = runner 38 | 39 | return self 40 | 41 | def compile(self): 42 | self._runner_graph = TopologicalSorter(self._dependencies) 43 | self._all_outputs = defaultdict(list) 44 | 45 | def execute( 46 | self, 47 | extra_vars: Optional[dict[str, dict[str, Any]]] = None, 48 | **kwargs 49 | ) -> Any: 50 | if self._runner_graph is None: 51 | raise ValueError("call compile() first") 52 | 53 | extra_vars = extra_vars or {} 54 | 55 | # TODO(fzliu): workflows may be persistent rather than returning a 56 | # single output or set of outputs 57 | for name in self._runner_graph.static_order(): 58 | inputs = [] 59 | if not self._dependencies[name]: 60 | # A task with no dependencies is a "seed" task. 61 | inputs.append([kwargs]) 62 | else: 63 | for d in self._dependencies[name]: 64 | inputs.append(self._all_outputs[d][-1]) 65 | 66 | # Ignore if any of the results are `None`. 67 | if [None] in inputs: 68 | self._all_outputs[name].append([None]) 69 | continue 70 | 71 | # A task can return a single item or multiple items in a list. 72 | outputs = [] 73 | for args, _ in flattened(*inputs): 74 | kwargs = {k: v for d in args for k, v in d.items()} 75 | kwargs.update(extra_vars.get(name, {})) 76 | result = self._runners[name](**kwargs) 77 | if isinstance(result, list): 78 | outputs.extend(result) 79 | else: 80 | outputs.append(result) 81 | self._all_outputs[name].append(outputs) 82 | 83 | return self._all_outputs[name] 84 | 85 | 86 | -------------------------------------------------------------------------------- /radient/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from radient.tasks._base import Task -------------------------------------------------------------------------------- /radient/tasks/_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any 3 | 4 | 5 | class Task(ABC): 6 | """Tasks are operators that can include transforms, vectorizers, and sinks. 7 | Data sources will be supported soon^{TM}. 
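
    Concrete tasks implement `__init__` and `__call__`; `__call__` is how a
    runner or workflow invokes the task.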
8 | """ 9 | 10 | @abstractmethod 11 | def __init__(self): 12 | pass 13 | 14 | @abstractmethod 15 | def __call__(self, *args, **kwargs) -> list[Any]: 16 | pass -------------------------------------------------------------------------------- /radient/tasks/accelerate.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Callable, Optional, Sequence, Union 3 | 4 | import numpy as np 5 | 6 | from radient.utils import fully_qualified_name 7 | from radient.utils.lazy_import import LazyImport 8 | from radient.tasks.vectorizers._base import Vectorizer 9 | 10 | torch = LazyImport("torch") 11 | onnx = LazyImport("onnx") 12 | ort = LazyImport("onnxruntime", package_name="onnxruntime-gpu") 13 | 14 | 15 | def export_to_onnx( 16 | vectorizer: Vectorizer, 17 | model_args: Union[tuple[Any, ...], Any], 18 | axes_names: Sequence[str] = [], 19 | input_names: Sequence[str] = [], 20 | output_names: Sequence[str] = [], 21 | model_type: Optional[str] = None 22 | ): 23 | """Attempts to export a model in ONNX format for use with `onnxruntime`. 24 | Switches export implementation based on torch, tensorflow, or scikit-learn 25 | models. 26 | """ 27 | 28 | # If a model type/library was not specified, attempt to programmatically 29 | # determine it using the object's fully qualified name. This doesn't work 30 | # for child classes (e.g. inheriting `nn.Module` or `nn.Sequential`) yet. 31 | if not model_type: 32 | model_qualified_name = fully_qualified_name(vectorizer.model) 33 | if "torch.nn" in model_type: 34 | model_type = "pytorch" 35 | elif "tensorflow" in model_type: 36 | model_type = "tensorflow" 37 | elif "sklearn.feature_extraction" in model_type: 38 | model_type = "sklearn" 39 | else: 40 | raise NotImplementedError 41 | 42 | # Model path example: 43 | # "~/.radient/accelerated_models//.onnx" 44 | onnx_model_path = Path.home() / ".radient" / "accelerated_models" 45 | onnx_model_path /= vectorizer.vtype 46 | onnx_model_path /= vectorizer.model_name + ".onnx" 47 | onnx_model_path.parent.mkdir(parents=True, exist_ok=True) 48 | onnx_model_path = str(onnx_model_path) 49 | 50 | if model_type in ("pytorch", "torch"): 51 | # Generate dynamic axes on-the-fly. 52 | dynamic_axes = {} 53 | if input_names and output_names: 54 | #symbolic_names = {0: "batch_size", 1: "max_seq_len"} 55 | symbolic_names = dict(zip(range(len(axes_names)), axes_names)) 56 | dynamic_axes.update({k: symbolic_names for k in input_names}) 57 | dynamic_axes.update({k: symbolic_names for k in output_names}) 58 | torch.onnx.export( 59 | vectorizer.model, 60 | model_args, 61 | onnx_model_path, 62 | do_constant_folding=True, 63 | input_names=input_names, 64 | output_names=output_names, 65 | dynamic_axes=dynamic_axes 66 | ) 67 | elif model_type in ("tensorflow", "tf"): 68 | raise NotImplementedError 69 | elif model_type in ("scikit-learn", "sklearn"): 70 | raise NotImplementedError 71 | else: 72 | raise NotImplementedError 73 | 74 | return onnx_model_path 75 | 76 | 77 | class ONNXForward(object): 78 | """Callable object that runs forward inference on an ONNX model. 
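
    The callable accepts a dict, list, or NumPy array of model inputs and
    returns the session outputs (optionally cast via `output_class` and keyed
    by `output_names`). Illustrative usage, assuming a `model.onnx` file and
    pre-tokenized `ids`/`mask` arrays:

        forward = ONNXForward("model.onnx", providers=["CPUExecutionProvider"])
        outputs = forward({"input_ids": ids, "attention_mask": mask})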
79 | """ 80 | 81 | def __init__( 82 | self, 83 | model_path: str, 84 | output_names: Optional[list[str]] = None, 85 | output_class: Optional[Callable] = None, 86 | providers: Optional[list[str]] = None 87 | ): 88 | super().__init__() 89 | self._session = ort.InferenceSession(model_path, providers=providers) 90 | self._output_names = output_names 91 | self._output_class = output_class 92 | 93 | def __call__( 94 | self, 95 | inputs: Union[dict, Sequence, np.ndarray] 96 | ) -> list[Union[dict, np.ndarray]]: 97 | inputs_ = {} 98 | input_names = [node.name for node in self._session.get_inputs()] 99 | if isinstance(inputs, dict): 100 | # For dictionary inputs, ONNX has a tendency to append a `.N` for 101 | # tensors that have the identical names in Pytorch model 102 | # definitions. For example: 103 | # 104 | # `attention_mask` -> `attention_mask.3` 105 | # 106 | # We automatically detect and compensate for these changes here. 107 | for name, feat in inputs.items(): 108 | is_match = lambda x: name == x.split(".")[0] 109 | nms = [nm for nm in input_names if is_match(nm)] 110 | assert len(nms) == 1, "found conflicting input names" 111 | inputs_[nms[0]] = np.array(feat) 112 | elif isinstance(inputs, list): 113 | inputs = [np.array(item) for item in inputs] 114 | inputs_ = dict(zip(input_names, inputs)) 115 | else: 116 | inputs_ = {input_names[0]: np.array(inputs)} 117 | 118 | # Optionally cast model outputs to the desired type, e.g. torch.Tensor. 119 | result = self._session.run(self._output_names, inputs_) 120 | if self._output_class: 121 | result = [self._output_class(arr) for arr in result] 122 | 123 | if self._output_names: 124 | # If output names were specified, return the result as a 125 | # dictionary rather than a list. 126 | assert len(result) == len(self._output_names), "length mismatch" 127 | result_ = dict(zip(self._output_names, result)) 128 | elif len(result) == 1: 129 | result_ = result[0] 130 | else: 131 | result_ = result 132 | return result_ 133 | -------------------------------------------------------------------------------- /radient/tasks/sinks/__init__.py: -------------------------------------------------------------------------------- 1 | from radient.tasks.sinks._base import Sink 2 | from radient.tasks.sinks.milvus import MilvusSink 3 | -------------------------------------------------------------------------------- /radient/tasks/sinks/_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Union 3 | 4 | from radient.tasks._base import Task 5 | from radient.vector import Vector 6 | 7 | 8 | class Sink(Task): 9 | """Sinks in Radient are destinations for vector data. The penultimate 10 | operation prior to sinks is usually the result of some data merging 11 | function and, in some cases, can be the direct output of a vectorizer or 12 | set of vectorizers. 
13 | """ 14 | 15 | @abstractmethod 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def __call__(self, *args, **kwargs): 20 | return self.transact(*args, **kwargs) 21 | 22 | @abstractmethod 23 | def transact( 24 | self, 25 | vectors: Union[Vector, list[Vector]], 26 | **kwargs 27 | ) -> bool: 28 | pass 29 | 30 | -------------------------------------------------------------------------------- /radient/tasks/sinks/local/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from radient.tasks.sinks._base import Sink 4 | from radient.utils.lazy_import import LazyImport 5 | from radient.vector import Vector 6 | 7 | 8 | class LocalVectorSink(Sink): 9 | 10 | def __init__( 11 | self 12 | ): 13 | super().__init__() 14 | raise NotImplementedError 15 | 16 | def transact( 17 | self, 18 | vectors: Union[Vector, list[Vector]], 19 | **kwargs 20 | ) -> dict[str, Union[int, list[int]]]: 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /radient/tasks/sinks/local/_gkmeans.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import numpy as np 4 | 5 | from radient.utils.lazy_import import LazyImport 6 | 7 | if TYPE_CHECKING: 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | else: 12 | torch = LazyImport("torch") 13 | nn = LazyImport("torch.nn") 14 | optim = LazyImport("torch.optim") 15 | 16 | 17 | WARMUP_EPOCHS = 10 18 | 19 | 20 | def torch_auto_device( 21 | device: str | torch.device | None = None 22 | ): 23 | if device is None: 24 | device = "cuda" if torch.cuda.is_available() else "cpu" 25 | torch.set_default_device(device) 26 | 27 | device_type = torch.get_default_device().type 28 | if "cuda" in device_type and torch.cuda.is_bf16_supported(): 29 | torch.set_default_dtype(torch.bfloat16) 30 | else: 31 | torch.set_default_dtype(torch.float32) 32 | 33 | 34 | def _torch_bincount( 35 | x: torch.Tensor, 36 | dim: int = -1 37 | ): 38 | dim = dim % x.dim() 39 | shape = list(x.shape) 40 | shape[dim] = x.max().item() + 1 41 | count = torch.zeros(shape, dtype=x.dtype) 42 | return count.scatter_add_(dim, x, src=torch.ones_like(x)) 43 | 44 | 45 | def _torch_masked_softmax( 46 | x: torch.Tensor, 47 | mask: torch.Tensor, 48 | dim: int = -1 49 | ): 50 | x_exp = x.exp() 51 | if mask is not None: 52 | x_exp *= mask 53 | return x_exp / x_exp.sum(dim=dim) 54 | 55 | 56 | def _torch_euclidean_distance( 57 | A: torch.Tensor, 58 | B: torch.Tensor 59 | ) -> torch.Tensor: 60 | dists = ((A**2).sum(dim=-1, keepdim=True) + 61 | (B**2).sum(dim=-1, keepdim=True) - 62 | 2.0 * torch.bmm(A, B.transpose(-2, -1))) 63 | dists.clamp_(min=0.0).sqrt_() 64 | return dists 65 | 66 | 67 | def _torch_lp_norm_distance( 68 | A: torch.Tensor, 69 | B: torch.Tensor, 70 | p: float = 2 71 | ) -> torch.Tensor: 72 | return torch.cdist(A, B, p=p) 73 | 74 | 75 | def _torch_cosine_distance( 76 | A: torch.Tensor, 77 | B: torch.Tensor 78 | ): 79 | A_norm = torch.nn.functional.normalize(A, p=2, dim=-1) 80 | B_norm = torch.nn.functional.normalize(B, p=2, dim=-1) 81 | return 1.0 - torch.bmm(A_norm, B_norm.transpose(-2, -1)) 82 | 83 | 84 | class GKMeans(nn.Module): 85 | def __init__( 86 | self, 87 | n_clusters: int = 8, 88 | max_iter: int = 600, 89 | tol: float = 1e-3, 90 | random_state: int | None = None, 91 | distance_metric: str = "lp-norm", 92 | size_decay: float = 1.0, 93 | verbose: bool = False, 94 | 
**kwargs 95 | ): 96 | super().__init__(**kwargs) 97 | self._n_clusters = n_clusters 98 | self._max_iter = max_iter 99 | self._tol = tol 100 | 101 | self._size_decay = size_decay 102 | self._verbose = verbose 103 | 104 | # Set distance metric 105 | if distance_metric == "euclidean": 106 | self.forward = _torch_euclidean_distance 107 | elif distance_metric == "cosine": 108 | self.forward = _torch_cosine_distance 109 | elif distance_metric == "lp-norm": 110 | self.forward = _torch_lp_norm_distance 111 | else: 112 | raise ValueError(f"invalid distance metric: {distance_metric}") 113 | 114 | # Set seed 115 | if random_state: 116 | np.random.seed(random_state) 117 | torch.manual_seed(random_state) 118 | 119 | @property 120 | def cluster_centers_(self) -> np.ndarray: 121 | return self._C.numpy() 122 | 123 | def _create_batched_dataset( 124 | self, 125 | X: torch.Tensor, 126 | groups: np.ndarray | None = None 127 | ) -> torch.Tensor: 128 | """Takes a flat 2d dataset specified by `X` and adds a batch dimension, 129 | where each batch corresponds to a pre-existing subgroup of indexes into 130 | `X` (specified by `groups`). 131 | """ 132 | if groups is None: 133 | return X.unsqueeze(0) 134 | else: 135 | X_out = torch.empty(groups.shape + X.shape[1:2], dtype=X.dtype) 136 | for (n, idxs) in enumerate(groups): 137 | X_out[n,:len(idxs),:] = X[idxs] 138 | return X_out 139 | 140 | def _lr_lambda(self, epoch: int): 141 | if epoch < WARMUP_EPOCHS: 142 | # Exponential warm-up 143 | return np.e ** (epoch - WARMUP_EPOCHS) 144 | else: 145 | # Cosine decay 146 | decay_epochs = self._max_iter - WARMUP_EPOCHS 147 | return 0.5 * (1 + np.cos(np.pi * (epoch - WARMUP_EPOCHS) / decay_epochs)) 148 | 149 | def forward_loss( 150 | self, 151 | X: torch.Tensor, 152 | C: torch.Tensor 153 | ): 154 | d = self.forward(X, C) ** 2 155 | c = X.shape[1] 156 | l_a = (-1.0*d).softmax(dim=2) 157 | l_s = (l_a.sum(dim=1) - c/self._n_clusters)**2 158 | l = ((l_a * d).sum(dim=1) + self._size_decay * l_s) / c 159 | return l.sum() 160 | 161 | def fit( 162 | self, 163 | X: np.ndarray, 164 | y: np.ndarray | None = None, 165 | sample_weight: np.ndarray | None = None, 166 | groups: list[list[int]] | None = None 167 | ): 168 | """Generates cluster centers using the input data. If `groups` is 169 | `None`, then this function mimics the normal usage of clustering 170 | algorithms within `scikit-learn`. 171 | 172 | If `groups` is not `None`, then this function will take existing 173 | groups of points and create a new set of cluster centers (totaling 174 | `len(groups) * n_clusters`), treating each group as an independent 175 | dataset. Groups are expected to be a list of lists, where each inner 176 | list contains the indices of the points in the group. 
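
        Illustrative call, mirroring how `_GANNTree` invokes it (with `groups`
        given as a 2-D array of row indices, one row per group):

            gkmeans = GKMeans(n_clusters=2)
            gkmeans.fit(X, groups=np.arange(X.shape[0])[np.newaxis, :])
            centers = gkmeans.cluster_centers_  # shape: (n_groups, n_clusters, dim)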
177 | """ 178 | 179 | if groups is None: 180 | groups = [list(np.arange(X.shape[0]))] 181 | 182 | # Create data and cluster center tensors 183 | X = torch.from_numpy(X).to( 184 | device=torch.get_default_device(), 185 | dtype=torch.get_default_dtype() 186 | ) 187 | C = torch.empty((len(groups), self._n_clusters, X.shape[1])) 188 | 189 | to_run = list(range(groups.shape[0])) 190 | while len(to_run) > 0: 191 | 192 | # Initialize cluster centers using k-means++ 193 | for n in to_run: 194 | X_n = X[groups[n],:] 195 | C_n = C[n,:,:] 196 | C_n[0,:] = X_n[np.random.choice(X_n.shape[0]),:] 197 | for m in range(1, self._n_clusters): 198 | d, _ = self.forward(X_n, C_n[:m,:]).min(dim=1) 199 | p = d.to(torch.float32).cpu().numpy()**2 200 | p /= p.sum() 201 | C_n[m,:] = X_n[np.random.choice(X_n.shape[0], p=p),:] 202 | 203 | # Create dataset, optimizer, and scheduler 204 | C_ = C[to_run,:,:].requires_grad_() 205 | X_ = self._create_batched_dataset(X, groups=groups[to_run]) 206 | optimizer = optim.Adam([C_], lr=1.0/X_.shape[1]) 207 | 208 | # Training loop 209 | # TODO: batching for large vector datasets 210 | for epoch in range(self._max_iter): 211 | optimizer.zero_grad() 212 | loss = self.forward_loss(X_, C_) 213 | loss.backward() 214 | optimizer.step() 215 | if self._verbose and epoch % 25 == 0: 216 | with torch.inference_mode(): 217 | #loss = self.forward_loss(X_, C_) 218 | print(f"Epoch {epoch}, loss: {loss.item():.5f}") 219 | 220 | # Post-training cleanup 221 | C_ = C_.detach() 222 | C[to_run,:,:] = C_ 223 | self.zero_grad() 224 | 225 | # Determine whether the output clusters are imbalanced 226 | a = self.forward(X_, C_).argmin(dim=2) 227 | c = _torch_bincount(a, dim=1) 228 | b = (c.max(dim=1)[0] - c.min(dim=1)[0]) / a.shape[1] 229 | to_run = [to_run[n] for n in range(b.numel()) if b[n] > 0.03] 230 | if self._verbose: 231 | print(f"Average imbalance: {b.mean():.5f}") 232 | if to_run: 233 | print( 234 | f"{len(to_run)} / {len(groups)} " 235 | "groups are imbalanced, rerunning on these groups" 236 | ) 237 | 238 | self._C = C 239 | 240 | 241 | def predict( 242 | self, 243 | X: np.ndarray, 244 | groups: np.ndarray | None = None 245 | ): 246 | X = torch.from_numpy(X).to( 247 | device=torch.get_default_device(), 248 | dtype=torch.get_default_dtype() 249 | ) 250 | (X_, _) = self._create_batched_dataset(X, groups=groups) 251 | a = self.forward(X_, self._C).argmin(dim=2) 252 | return a.numpy() 253 | 254 | def fit_predict( 255 | self, 256 | X: np.ndarray, 257 | y: np.ndarray | None = None, 258 | sample_weight: np.ndarray | None = None, 259 | groups: np.ndarray | None = None 260 | ): 261 | self.fit(X, y=y, sample_weight=sample_weight, groups=groups) 262 | return self.predict(X, groups=groups) 263 | -------------------------------------------------------------------------------- /radient/tasks/sinks/local/gann.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import time 3 | 4 | import numpy as np 5 | 6 | from radient.tasks.sinks.local._gkmeans import GKMeans, torch_auto_device 7 | 8 | 9 | MAX_LEAF_SIZE = 200 10 | 11 | 12 | class _GANNTree(): 13 | 14 | def __init__( 15 | self, 16 | dataset: np.ndarray, 17 | verbose: bool = False, 18 | **kwargs 19 | ): 20 | super().__init__() 21 | self._dataset = dataset 22 | self._verbose = verbose 23 | self._centers = [] 24 | self._leaves = np.arange(dataset.shape[0])[np.newaxis,:] 25 | 26 | def build(self, spill: float = 0.0): 27 | """Builds the tree. 
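
        Each iteration runs a balanced 2-means (via `GKMeans`) on every current
        leaf, then splits the leaf along the hyperplane separating the two
        centroids. A non-zero `spill` lets the two children overlap: each child
        keeps the `0.5 + spill` fraction of points closest to its side of the
        hyperplane. Splitting stops once the average leaf size falls below
        `MAX_LEAF_SIZE`.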
28 | """ 29 | 30 | gkmeans = GKMeans(n_clusters=2, verbose=self._verbose) 31 | 32 | while True: 33 | 34 | # Get indexes for each cluster 35 | gkmeans.fit(self._dataset, groups=self._leaves) 36 | C = gkmeans.cluster_centers_ 37 | #idxs_C = [np.where(a==n)[0] for n in range(len(C))] 38 | 39 | new_leaves = [] 40 | for (n, leaf) in enumerate(self._leaves): 41 | vectors = self._dataset[leaf,:] 42 | 43 | # Compute distances to the hyperplane which separates the two 44 | # cluster centroids 45 | w = C[n,1,:] - C[n,0,:] 46 | b = -(C[n,1,:] + C[n,0,:]).dot(w) / 2.0 47 | d = (vectors.dot(w) + b) / np.linalg.norm(w) 48 | 49 | # Compute each point's distance to the hyperplane 50 | child_size = int(vectors.shape[0] * (0.5 + spill)) 51 | idxs_by_dist = np.argsort(d) 52 | new_leaves.append(leaf[idxs_by_dist[:child_size]]) 53 | new_leaves.append(leaf[idxs_by_dist[-child_size:]]) 54 | 55 | self._centers.append(C) 56 | self._leaves = np.array(new_leaves) 57 | 58 | if self._verbose: 59 | print(f"Num leaves: {len(self._leaves)}") 60 | 61 | # Continue until the average leaf size is below the threshold 62 | mean_leaf_size = np.mean([len(leaf) for leaf in self._leaves]) 63 | if np.mean(mean_leaf_size) < MAX_LEAF_SIZE: 64 | if self._verbose: 65 | print(f"Done, avg leaf size {np.mean(mean_leaf_size)}") 66 | print() 67 | break 68 | 69 | def get_candidates(self, query: np.ndarray): 70 | """Returns nearest neighbor candidates for a query vector. 71 | """ 72 | idx = 0 73 | for center in self._centers: 74 | idx = 2 * idx + np.linalg.norm(center[idx] - query, axis=1).argmin() 75 | return self._leaves[idx] 76 | 77 | 78 | class GANN(): 79 | 80 | def __init__( 81 | self, 82 | n_trees: int = 1, 83 | spill: float = 0.0, 84 | verbose: bool = False, 85 | **kwargs 86 | ): 87 | super().__init__() 88 | self._n_trees = n_trees 89 | self._spill = spill 90 | self._verbose = verbose 91 | 92 | self._dataset = [] 93 | 94 | def _build_tree(self, n: int) -> _GANNTree: 95 | np.random.seed(None) 96 | torch_auto_device() 97 | tree = _GANNTree(dataset=self._dataset, verbose=self._verbose) 98 | tree.build(spill=self._spill) 99 | return tree 100 | 101 | @property 102 | def n_trees(self): 103 | return self._n_trees 104 | 105 | @property 106 | def sealed(self): 107 | return hasattr(self, "_trees") 108 | 109 | def insert(self, vector: np.ndarray): 110 | """Inserts a vector into the index. 
111 | """ 112 | if self.sealed: 113 | raise ValueError("Cannot insert into a sealed index.") 114 | self._dataset.append(vector) 115 | 116 | def build(self, n_proc: int = 1): 117 | if self.sealed: 118 | raise ValueError("Index is already built.") 119 | 120 | self._dataset = np.array(self._dataset, dtype=np.float32) 121 | with Pool(n_proc) as pool: 122 | self._trees = pool.map(self._build_tree, range(self._n_trees)) 123 | 124 | def search(self, query: np.ndarray, top_k: int = 10) -> list[int]: 125 | if not self.sealed: 126 | raise ValueError("Build the index before searching.") 127 | 128 | candidates = set() 129 | candidates.update(*[t.get_candidates(query) for t in self._trees]) 130 | candidates = np.array(list(candidates)) 131 | vectors = self._dataset[candidates,:] 132 | best = np.linalg.norm(vectors - query, axis=1).argsort() 133 | return candidates[best[:top_k]] 134 | -------------------------------------------------------------------------------- /radient/tasks/sinks/milvus.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from typing import Optional, Union 4 | 5 | from radient._milvus import _MilvusInterface 6 | from radient.tasks.sinks._base import Sink 7 | from radient.utils import fully_qualified_name 8 | from radient.utils.lazy_import import LazyImport 9 | from radient.vector import Vector 10 | 11 | 12 | DEFAULT_MILVUS_URI = str(Path.home() / ".radient" / "default.db") 13 | #DEFAULT_MILVUS_URI = "http://127.0.0.1:19530" 14 | DEFAULT_COLLECTION_NAME = "radient" 15 | 16 | 17 | class MilvusSink(Sink): 18 | 19 | def __init__( 20 | self, 21 | operation: str, 22 | milvus_uri: str = DEFAULT_MILVUS_URI, 23 | collection_name: str = DEFAULT_COLLECTION_NAME, 24 | vector_field: Optional[str] = None, 25 | **kwargs 26 | ): 27 | super().__init__() 28 | self._operation = operation 29 | self._milvus_uri = milvus_uri 30 | self._collection_name = collection_name 31 | self._vector_field = vector_field 32 | 33 | def transact( 34 | self, 35 | vectors: Union[Vector, list[Vector]], 36 | **kwargs 37 | ) -> dict[str, Union[int, list[int]]]: 38 | if not isinstance(vectors, list): 39 | vectors = [vectors] 40 | client, info = _MilvusInterface._get_client( 41 | milvus_uri=self._milvus_uri, 42 | collection_name=self._collection_name, 43 | dimension=vectors[0].size 44 | ) 45 | # If `field_name` is None, attempt to automatically acquire the field 46 | # name from the collection info. 
47 | vector_field = self._vector_field or info["dense"] 48 | 49 | if self._operation == "insert": 50 | return client.insert( 51 | collection_name=self._collection_name, 52 | data=[v.todict(vector_field=vector_field) for v in vectors], 53 | **kwargs 54 | ) 55 | elif self._operation == "search": 56 | return client.search( 57 | collection_name=self._collection_name, 58 | data=[v.tolist() for v in vectors], 59 | **kwargs 60 | ) 61 | else: 62 | raise TypeError("invalid Milvus operation") 63 | -------------------------------------------------------------------------------- /radient/tasks/sources/__init__.py: -------------------------------------------------------------------------------- 1 | from radient.tasks.sources._base import Source 2 | from radient.tasks.sources.ingest import IngestSource 3 | from radient.tasks.sources.local import LocalSource 4 | from radient.tasks.sources.youtube import YoutubeSource 5 | -------------------------------------------------------------------------------- /radient/tasks/sources/_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Any 3 | 4 | from radient.tasks._base import Task 5 | 6 | 7 | class Source(Task): 8 | """Sources in Radient are task objects that yield data. Depending on the 9 | downstream transform, this can be raw bytes, or it can be filenames/URIs. 10 | """ 11 | 12 | @abstractmethod 13 | def __init__(self): 14 | super().__init__() 15 | 16 | def __call__(self, *args, **kwargs): 17 | return self.read(*args, **kwargs) 18 | 19 | @abstractmethod 20 | def read(self, **kwargs) -> Any: 21 | pass 22 | -------------------------------------------------------------------------------- /radient/tasks/sources/ingest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | 4 | from radient.tasks.sources._base import Source 5 | 6 | 7 | class IngestSource(Source): 8 | 9 | def __init__(self, path: str, **kwargs): 10 | super().__init__() 11 | source = Path(path).expanduser() 12 | destination = Path("~") / ".radient" / "data" / "ingest" 13 | destination = destination.expanduser() 14 | destination.mkdir(parents=True, exist_ok=True) 15 | shutil.copy(source, destination / source.name) 16 | 17 | def read(self) -> dict[str, str]: 18 | return None 19 | -------------------------------------------------------------------------------- /radient/tasks/sources/local.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from pathlib import Path 3 | 4 | from radient.tasks.sources._base import Source 5 | 6 | 7 | def _path_walk(path): 8 | p = Path(path) 9 | if p.is_file(): 10 | yield path 11 | elif p.is_dir(): 12 | for sub in p.iterdir(): 13 | yield from _path_walk(sub) 14 | 15 | 16 | class LocalSource(Source): 17 | """Reads filenames from a local directory. This source is mostly useful for 18 | backfills from local disk and or for testing purposes. 
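Stepping back to `MilvusSink`: `transact` accepts a single `Vector` or a list of them and forwards the call to either `insert` or `search` on the Milvus client. Below is a hedged sketch pairing it with a text vectorizer; the collection name is arbitrary, and by default vectors land in a local database file under `~/.radient`:

```python
from radient import text_vectorizer
from radient.tasks.sinks.milvus import MilvusSink

vz = text_vectorizer()
vectors = vz.vectorize([
    "Radient turns unstructured data into vectors.",
    "Milvus stores and searches those vectors.",
])

writer = MilvusSink(operation="insert", collection_name="radient_demo")
writer.transact(vectors)

# The same sink class doubles as a query interface; extra keyword arguments
# are passed through to the underlying Milvus client.
searcher = MilvusSink(operation="search", collection_name="radient_demo")
hits = searcher.transact(vz.vectorize("vector search"), limit=5)
```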
19 | """ 20 | 21 | def __init__(self, path: str, **kwargs): 22 | super().__init__() 23 | self._paths_iter = _path_walk(path) 24 | 25 | def read(self): 26 | return None 27 | -------------------------------------------------------------------------------- /radient/tasks/sources/youtube.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from pathlib import Path 3 | from typing import TYPE_CHECKING 4 | import uuid 5 | 6 | from radient.tasks.sources._base import Source 7 | from radient.utils.lazy_import import LazyImport 8 | 9 | if TYPE_CHECKING: 10 | import yt_dlp 11 | else: 12 | yt_dlp = LazyImport("yt_dlp") 13 | 14 | 15 | class YoutubeSource(Source): 16 | """Downloads videos from Youtube to a local directory. The `url` argument 17 | can be a single video or a playlist. 18 | """ 19 | 20 | def __init__(self, 21 | url: str, 22 | output_directory: str = "~/.radient/data/youtube", 23 | **kwargs 24 | ): 25 | super().__init__() 26 | 27 | # Create a new output directory for downloading and storing the videos. 28 | output_directory = Path(output_directory).expanduser() 29 | output_directory = output_directory / str(uuid.uuid4()) 30 | output_directory.mkdir(parents=True, exist_ok=True) 31 | 32 | # The input URL may be a single video or a playlist of videos. Here, we 33 | # extract a list of all video URLs for use in the `read` function. 34 | with yt_dlp.YoutubeDL({"extract_flat": True, "quiet": True}) as ydl: 35 | info = ydl.extract_info(url, download=False) 36 | if "entries" in info: 37 | self._video_urls = [entry["webpage_url"] for entry in info["entries"]] 38 | else: 39 | self._video_urls = [info["webpage_url"]] 40 | self._url_idx = 0 41 | 42 | # Add a hook to dynamically determine what the output filename is for 43 | # each video. 44 | ydl_opts = { 45 | "format": "bestvideo+bestaudio/best", 46 | "merge_output_format": "mp4", 47 | "outtmpl": str(output_directory / "%(id)s.%(ext)s") 48 | } 49 | self._youtube_dl = yt_dlp.YoutubeDL(ydl_opts) 50 | 51 | def read(self): 52 | 53 | if self._url_idx == len(self._video_urls): 54 | return None 55 | url = self._video_urls[self._url_idx] 56 | 57 | meta = self._youtube_dl.extract_info(url, download=False) 58 | meta = self._youtube_dl.sanitize_info(meta) 59 | path = self._youtube_dl.prepare_filename(meta) 60 | 61 | self._youtube_dl.download(url) 62 | self._url_idx += 1 63 | 64 | return {"data": path} 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /radient/tasks/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from radient.tasks.transforms.speech_to_text import speech_to_text_transform 2 | from radient.tasks.transforms.video_demux import video_demux_transform 3 | -------------------------------------------------------------------------------- /radient/tasks/transforms/_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Any 3 | 4 | import numpy as np 5 | 6 | from radient.tasks._base import Task 7 | 8 | 9 | class Transform(Task): 10 | """Transforms are operations that perform multimodal data transformation, 11 | such as such as turning a video into independent frames. Because these are 12 | usually I/O-bound operations, batching is not innately supported. 
13 | """ 14 | 15 | @abstractmethod 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def __call__(self, *args, **kwargs): 20 | return self.transform(*args, **kwargs) 21 | 22 | @abstractmethod 23 | def transform(self, data: Any) -> Any: 24 | pass 25 | -------------------------------------------------------------------------------- /radient/tasks/transforms/document_screenshot/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "DocumentScreenshotTransform", 3 | "PyMuPDFDocumentScreenshotTransform" 4 | ] 5 | 6 | from typing import Optional 7 | 8 | from radient.tasks.transforms.document_screenshot._base import DocumentScreenshotTransform 9 | from radient.tasks.transforms.document_screenshot.pymupdf import PyMuPDFDocumentScreenshotTransform 10 | 11 | 12 | def pdf_to_screenshot_transform(method: str = "PyMuPDF", **kwargs) -> DocumentScreenshotTransform: 13 | """Creates a transform which performs document screenshotting. 14 | """ 15 | 16 | if method.lower() in ("pymupdf", None): 17 | return PyMuPDFDocumentScreenshotTransform(**kwargs) 18 | else: 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /radient/tasks/transforms/document_screenshot/_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Union 3 | 4 | from radient.tasks.transforms._base import Transform 5 | 6 | 7 | class DocumentScreenshotTransform(Transform): 8 | 9 | def __init__(self, **kwargs): 10 | super().__init__() 11 | 12 | @abstractmethod 13 | def transform(self, data: str) -> dict[str, str]: 14 | pass 15 | -------------------------------------------------------------------------------- /radient/tasks/transforms/document_screenshot/pymupdf.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from typing import TYPE_CHECKING 3 | import urllib.request 4 | 5 | from radient.tasks.transforms.document_screenshot._base import DocumentScreenshotTransform 6 | from radient.utils.lazy_import import LazyImport 7 | 8 | if TYPE_CHECKING: 9 | import pymupdf 10 | from PIL import Image 11 | else: 12 | pymupdf = LazyImport("PyMuPDF", min_version="1.24.3") 13 | Image = LazyImport("PIL", attribute="Image", package_name="Pillow") 14 | 15 | 16 | class PyMuPDFDocumentScreenshotTransform(DocumentScreenshotTransform): 17 | 18 | def __init__(self, zoom: float = 1.0): 19 | super().__init__() 20 | self._zoom = zoom 21 | 22 | def transform(self, data: str) -> dict[str, str]: 23 | 24 | # Ensure that the path is valid 25 | if not data.endswith(".pdf"): 26 | raise ValueError("Invalid path") 27 | 28 | # Ensure that the URL is valid 29 | if data.startswith("http"): 30 | with urllib.request.urlopen(data) as response: 31 | pdf_data = response.read() 32 | pdf_stream = BytesIO(pdf_data) 33 | pdf = pymupdf.open(stream=pdf_stream, filetype="pdf") 34 | else: 35 | pdf = pymupdf.open(data, filetype="pdf") 36 | 37 | # Create a transformation object 38 | mat = pymupdf.Matrix(self._zoom, self._zoom) 39 | 40 | # Output the results 41 | images = [] 42 | for n in range(pdf.page_count): 43 | pix = pdf[n].get_pixmap(matrix=mat) 44 | img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) 45 | images.append(img) 46 | 47 | return images 48 | -------------------------------------------------------------------------------- /radient/tasks/transforms/speech_to_text/__init__.py: 
-------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "SpeechToTextTransform", 3 | "WhisperSpeechToTextTransform" 4 | ] 5 | 6 | from typing import Optional 7 | 8 | from radient.tasks.transforms.speech_to_text._base import SpeechToTextTransform 9 | from radient.tasks.transforms.speech_to_text.whisper import WhisperSpeechToTextTransform 10 | 11 | 12 | def speech_to_text_transform(method: str = "whisper", **kwargs) -> SpeechToTextTransform: 13 | """Creates a Huggingface pipeline for ASR specified by `method`. 14 | """ 15 | 16 | if method in ("whisper", None): 17 | return WhisperSpeechToTextTransform(**kwargs) 18 | else: 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /radient/tasks/transforms/speech_to_text/_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | from radient.tasks.transforms._base import Transform 4 | 5 | 6 | class SpeechToTextTransform(Transform): 7 | 8 | def __init__(self, **kwargs): 9 | super().__init__() 10 | 11 | @abstractmethod 12 | def transform(self, data: str) -> dict[str, str]: 13 | pass 14 | -------------------------------------------------------------------------------- /radient/tasks/transforms/speech_to_text/whisper.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Union 2 | 3 | from radient.tasks.transforms.speech_to_text._base import SpeechToTextTransform 4 | from radient.utils.lazy_import import LazyImport 5 | 6 | if TYPE_CHECKING: 7 | from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline 8 | import torch 9 | else: 10 | AutoModelForSpeechSeq2Seq = LazyImport("transformers", attribute="AutoModelForSpeechSeq2Seq", package_name="transformers") 11 | AutoProcessor = LazyImport("transformers", attribute="AutoProcessor", package_name="transformers") 12 | pipeline = LazyImport("transformers", attribute="pipeline", package_name="transformers") 13 | torch = LazyImport("torch") 14 | 15 | 16 | class WhisperSpeechToTextTransform(SpeechToTextTransform): 17 | 18 | def __init__(self, 19 | model_id: str = "openai/whisper-large-v3", 20 | torch_dtype: torch.dtype = torch.float16, 21 | **kwargs 22 | ): 23 | 24 | # Create model and preprocessor. 25 | model = AutoModelForSpeechSeq2Seq.from_pretrained( 26 | model_id, 27 | low_cpu_mem_usage=True, 28 | use_safetensors=True, 29 | device="cpu" 30 | ) 31 | processor = AutoProcessor.from_pretrained(model_id) 32 | 33 | # Instantiate ASR pipeline. 
34 | self._pipeline = pipeline( 35 | "automatic-speech-recognition", 36 | model=model, 37 | tokenizer=processor.tokenizer, 38 | feature_extractor=processor.feature_extractor, 39 | torch_dtype=torch_dtype, 40 | device="cpu", 41 | ) 42 | 43 | def transform(self, data: str) -> dict[str, str]: 44 | result = self._pipeline(data) 45 | return {"data": result["text"], "type": "text"} 46 | -------------------------------------------------------------------------------- /radient/tasks/transforms/video_demux/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "VideoDemuxTransform" 3 | ] 4 | 5 | from typing import Optional 6 | 7 | from radient.tasks.transforms.video_demux._base import VideoDemuxTransform 8 | from radient.tasks.transforms.video_demux.default import DefaultVideoDemuxTransform 9 | from radient.tasks.transforms.video_demux.ffmpeg import FFmpegVideoDemuxTransform 10 | 11 | 12 | def video_demux_transform(method: str = "default", **kwargs) -> VideoDemuxTransform: 13 | """Creates a video demultiplexer specified by `method`. 14 | """ 15 | 16 | if method in ("default", None): 17 | return DefaultVideoDemuxTransform(**kwargs) 18 | elif method in ("ffmpeg",): 19 | return FFmpegVideoDemuxTransform(**kwargs) 20 | else: 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /radient/tasks/transforms/video_demux/_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from pathlib import Path 3 | import uuid 4 | 5 | from radient.tasks.transforms._base import Transform 6 | 7 | 8 | class VideoDemuxTransform(Transform): 9 | 10 | def __init__(self, 11 | interval: float = 2.0, 12 | output_directory: str = "~/.radient/data/video_demux", 13 | **kwargs 14 | ): 15 | super().__init__() 16 | self._interval = interval 17 | output_directory = Path(output_directory).expanduser() 18 | self._output_directory = output_directory 19 | 20 | @abstractmethod 21 | def transform(self, data: str) -> dict[str, list[str]]: 22 | """Extracts frames and audio snippets from a video file. 23 | """ 24 | pass 25 | 26 | def _make_output_dir(self): 27 | # The full output directory comes from a combination of the user's 28 | # specification plus a unique identifier for the current run. 29 | # TODO(fzliu): switch to an incremental identifier e.g. UUIDv7 30 | output_path = Path(self._output_directory) / str(uuid.uuid4()) 31 | output_path.mkdir(parents=True, exist_ok=True) 32 | return output_path 33 | -------------------------------------------------------------------------------- /radient/tasks/transforms/video_demux/default.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import numpy as np 4 | 5 | from radient.tasks.transforms.video_demux._base import VideoDemuxTransform 6 | from radient.utils.lazy_import import LazyImport 7 | 8 | if TYPE_CHECKING: 9 | import cv2 10 | import librosa 11 | import soundfile as sf 12 | else: 13 | cv2 = LazyImport("cv2", package_name="opencv-python") 14 | librosa = LazyImport("librosa") 15 | sf = LazyImport("soundfile") 16 | 17 | 18 | class DefaultVideoDemuxTransform(VideoDemuxTransform): 19 | 20 | def __init__(self, **kwargs): 21 | super().__init__(**kwargs) 22 | 23 | def transform(self, data: str): 24 | """Extracts frames and audio snippets from a video file. 
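The Whisper transform wraps a Hugging Face ASR pipeline and returns the transcription as a `{"data": ..., "type": "text"}` payload, ready to feed into a text vectorizer. A minimal sketch follows; the audio path is a placeholder, and `model_id` is swapped here for a smaller checkpoint than the default:

```python
from radient.tasks.transforms import speech_to_text_transform

asr = speech_to_text_transform(method="whisper", model_id="openai/whisper-base")
result = asr.transform("podcast_clip.wav")   # placeholder path
print(result["type"], result["data"][:80])
```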
25 | """ 26 | 27 | video_path = data 28 | output_path = self._make_output_dir() 29 | 30 | # Grab the total number of frames as well as the video's FPS to 31 | # determine the interval in frames and stopping condition. 32 | video_capture = cv2.VideoCapture(video_path) 33 | frame_count = video_capture.get(cv2.CAP_PROP_FRAME_COUNT) 34 | frame_interval = video_capture.get(cv2.CAP_PROP_FPS) * self._interval 35 | 36 | frames = {"data": [], "type": "image"} 37 | for i, n in enumerate(np.arange(0, frame_count, frame_interval)): 38 | video_capture.set(cv2.CAP_PROP_POS_FRAMES, int(n)) 39 | retval, frame = video_capture.read() 40 | if not retval: 41 | break 42 | frame_path = str(output_path / f"frame_{i:04d}.png") 43 | cv2.imwrite(frame_path, frame) 44 | frames["data"].append(frame_path) 45 | 46 | video_capture.release() 47 | 48 | # Extract audio snippet as raw data. With the raw audio, we store it 49 | # in `.wav` format with the original sample rate. 50 | audios = {"data": [], "type": "audio"} 51 | waveform, sample_rate = librosa.load(video_path, sr=None, mono=False) 52 | sample_interval = int(sample_rate * self._interval) 53 | if len(waveform.shape) == 1: 54 | y = np.expand_dims(y, axis=0) 55 | for i, n in enumerate(np.arange(0, waveform.shape[1], sample_interval)): 56 | n_end = n + sample_interval 57 | audio_path = str(output_path / f"audio_{i:04d}.wav") 58 | sf.write(audio_path, waveform[:,n:n_end].T, sample_rate) 59 | audios["data"].append(audio_path) 60 | 61 | return [frames, audios] 62 | -------------------------------------------------------------------------------- /radient/tasks/transforms/video_demux/ffmpeg.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | 4 | import numpy as np 5 | 6 | from radient.tasks.transforms.video_demux._base import VideoDemuxTransform 7 | 8 | 9 | class FFmpegVideoDemuxTransform(VideoDemuxTransform): 10 | 11 | def __init__(self, **kwargs): 12 | super().__init__(**kwargs) 13 | 14 | def transform(self, data: str): 15 | """Extracts frames and audio snippets from a video file. 16 | """ 17 | 18 | # Ensure that FFmpeg is installed and available from the command line. 19 | if not shutil.which("ffmpeg"): 20 | raise FileNotFoundError(f"ffmpeg not found, try specifying 'method': 'default' in params") 21 | if not shutil.which("ffprobe"): 22 | raise FileNotFoundError(f"ffmpeg not found, try specifying 'method': 'default' in params") 23 | 24 | frames = {"data": [], "type": "image"} 25 | audios = {"data": [], "type": "audio"} 26 | video_path = data 27 | output_path = self._make_output_dir() 28 | 29 | # Grab video information using ffprobe. 30 | frame_info = subprocess.run( 31 | ["ffprobe", "-v", "error", "-select_streams", "v:0", 32 | "-count_packets", "-show_entries", 33 | "stream=r_frame_rate,nb_read_packets", "-of", 34 | "default=noprint_wrappers=1:nokey=1", video_path], 35 | capture_output=True, text=True).stdout.split() 36 | frame_rate = eval(frame_info[0]) 37 | frame_count = eval(frame_info[1]) 38 | frame_interval = frame_rate * self._interval 39 | audio_info = subprocess.run( 40 | ["ffprobe", "-v", "error", "-select_streams", "a:0", 41 | "-show_entries", "stream=sample_rate", "-of", 42 | "default=noprint_wrappers=1:nokey=1", video_path], 43 | capture_output=True, text=True).stdout.split() 44 | 45 | for i, n in enumerate(np.arange(0, frame_count, frame_interval)): 46 | start_time = n / frame_rate 47 | 48 | # Extract frames. 
49 | frame_path = str(output_path / f"frame_{i:04d}.png") 50 | subprocess.run(["ffmpeg", "-v", "error", "-ss", str(start_time), 51 | "-i", video_path, "-vframes", "1", frame_path]) 52 | frames["data"].append(frame_path) 53 | 54 | # Extract audio. 55 | audio_path = str(output_path / f"audio_{i:04d}.wav") 56 | subprocess.run(["ffmpeg", "-v", "error", "-ss", str(start_time), 57 | "-i", video_path, "-t", str(self._interval), 58 | "-q:a", "0", "-map", "a", audio_path]) 59 | audios["data"].append(audio_path) 60 | 61 | return [frames, audios] 62 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/__init__.py: -------------------------------------------------------------------------------- 1 | from radient.tasks.vectorizers._base import Vectorizer 2 | from radient.tasks.vectorizers.audio import audio_vectorizer 3 | from radient.tasks.vectorizers.graph import graph_vectorizer 4 | from radient.tasks.vectorizers.image import image_vectorizer 5 | from radient.tasks.vectorizers.molecule import molecule_vectorizer 6 | from radient.tasks.vectorizers.text import text_vectorizer 7 | from radient.tasks.vectorizers.multimodal import multimodal_vectorizer 8 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from collections.abc import Sequence 3 | from typing import Any, Optional, Union 4 | import warnings 5 | 6 | import numpy as np 7 | 8 | from radient.tasks import Task 9 | from radient.utils import fully_qualified_name 10 | from radient.vector import Vector 11 | 12 | 13 | def normalize_vector(vector: Vector, inplace: bool = True): 14 | if not np.issubdtype(vector.dtype, np.floating): 15 | warnings.warn("non-float vectors are not normalized") 16 | else: 17 | if inplace: 18 | vector /= np.linalg.norm(vector) 19 | else: 20 | vector = vector / np.linalg.norm(vector) 21 | return vector 22 | 23 | 24 | class Vectorizer(Task): 25 | """Base class for all vectorizers. Custom vectorizers shouldn't directly 26 | inherit this class, but should inherit the appropriate subclass e.g. 27 | `ImageVectorizer` or `AudioVectorizer`. 28 | """ 29 | 30 | @abstractmethod 31 | def __init__(self): 32 | super().__init__() 33 | self._model_name = None 34 | self._model = None 35 | 36 | def __call__(self, *args, **kwargs): 37 | vectors = self.vectorize(*args, **kwargs) 38 | return {"vectors": vectors} 39 | 40 | @property 41 | def model_name(self) -> Optional[str]: 42 | return self._model_name 43 | 44 | @property 45 | def model(self) -> Optional[Any]: 46 | return self._model 47 | 48 | @property 49 | def vtype(self) -> str: 50 | return fully_qualified_name(self).split(".")[3] 51 | 52 | def _preprocess(self, item: Any, **kwargs) -> Any: 53 | return item 54 | 55 | @abstractmethod 56 | def _vectorize(self, data: Any, **kwargs) -> Vector: 57 | pass 58 | 59 | def _postprocess(self, vector: Vector, normalize: bool = True, **kwargs) -> Vector: 60 | if normalize: 61 | # Some vectorizers return a _sequence_ of vectors for a single 62 | # piece of data (most commonly data that varies with time such as 63 | # videos and audio). Automatically check for these here and 64 | # normalize them if this is the case. 
65 | if isinstance(vector, Sequence): 66 | for v in vector: 67 | normalize_vector(v) 68 | else: 69 | normalize_vector(vector) 70 | return vector 71 | 72 | def modalities(self) -> list[str]: 73 | return [self.vtype] 74 | 75 | def vectorize( 76 | self, 77 | data: Union[Any, list[Any]], 78 | modality: Optional[str] = None, 79 | normalize: bool = True, 80 | **kwargs 81 | ) -> Union[Vector, list[Vector], dict[str, Union[list[Vector], Vector]]]: 82 | """Vectorizers accept two types of inputs: 83 | 84 | 1) One instance of the object/data, 85 | 2) A list of data to be vectorized. 86 | 87 | This function handles both of these cases automatically. 88 | """ 89 | modality = modality or self.vtype 90 | if modality in self.modalities(): 91 | data_ = data if isinstance(data, list) else [data] 92 | vectors = [] 93 | for d in data_: 94 | v = self._preprocess(d, modality=modality) 95 | v = self._vectorize(v, modality=modality) 96 | v = self._postprocess(v, modality=modality, normalize=normalize) 97 | # TODO(fzliu): only store the original paths, e.g. no base64 98 | # encodings or long-form text stored as metadata 99 | v.putmeta("data", str(d)).putmeta("type", modality) 100 | vectors.append(v) 101 | return vectors[0] if not isinstance(data, list) else vectors 102 | else: 103 | warnings.warn(f"vectorizer does not support modality: {modality}") 104 | 105 | def accelerate(self): 106 | warnings.warn("vectorizer does not support acceleration") 107 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/_imagebind.py: -------------------------------------------------------------------------------- 1 | from radient.utils.lazy_import import LazyImport 2 | 3 | imagebind_model = LazyImport("imagebind.models", attribute="imagebind_model", package_name="git+https://github.com/fzliu/ImageBind@main") 4 | 5 | IMAGEBIND_MODULE_NAMES = ( 6 | "modality_preprocessors", 7 | "modality_trunks", 8 | "modality_heads", 9 | "modality_postprocessors" 10 | ) 11 | 12 | 13 | def create_imagebind_model(modality: str, model_name: str = "imagebind_huge"): 14 | """Wrapper around `imagebind_model` to load a specific modality. Modalities 15 | aside from the one specified are removed from the model. (It's might be 16 | better to get this code merged into the original ImageBind repo.) 17 | """ 18 | 19 | if modality == "image": 20 | modality = imagebind_model.ModalityType.VISION 21 | elif modality == "text": 22 | modality = imagebind_model.ModalityType.TEXT 23 | elif modality == "audio": 24 | modality = imagebind_model.ModalityType.AUDIO 25 | 26 | model = getattr(imagebind_model, model_name)(pretrained=True) 27 | 28 | # Delete unnecessary modality trunks, preprocessors, and postprocessors 29 | # from the model. 
30 | for module_name in IMAGEBIND_MODULE_NAMES: 31 | for modality_type in imagebind_model.ModalityType.__dict__.values(): 32 | if modality_type != modality: 33 | module = getattr(model, module_name) 34 | delattr(module, modality_type) 35 | 36 | return model 37 | 38 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/audio/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "TorchaudioAudioVectorizer", 3 | "audio_vectorizer" 4 | ] 5 | 6 | from typing import Optional 7 | 8 | from radient.tasks.vectorizers.audio._base import AudioVectorizer 9 | from radient.tasks.vectorizers.audio.imagebind import ImageBindAudioVectorizer 10 | from radient.tasks.vectorizers.audio.torchaudio import TorchaudioAudioVectorizer 11 | 12 | 13 | def audio_vectorizer(method: str = "torchaudio", **kwargs) -> AudioVectorizer: 14 | """Creates an image vectorizer specified by `method`. 15 | """ 16 | 17 | # Return a reasonable default vectorizer in the event that the user does 18 | # not specify one. 19 | if method in ("torchaudio",): 20 | return TorchaudioAudioVectorizer(**kwargs) 21 | elif method in ("imagebind",): 22 | return ImageBindAudioVectorizer(**kwargs) 23 | else: 24 | raise NotImplementedError 25 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/audio/_base.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "AudioVectorizer" 3 | ] 4 | 5 | from abc import abstractmethod 6 | from typing import Any, Union 7 | 8 | import numpy as np 9 | 10 | from radient.tasks.vectorizers._base import Vectorizer 11 | from radient.utils import fully_qualified_name 12 | from radient.utils.lazy_import import LazyImport 13 | from radient.vector import Vector 14 | 15 | librosa = LazyImport("librosa") 16 | 17 | 18 | class AudioVectorizer(Vectorizer): 19 | 20 | @abstractmethod 21 | def __init__(self): 22 | super().__init__() 23 | 24 | def _preprocess(self, audio: Any, **kwargs) -> np.ndarray: 25 | if isinstance(audio, tuple) and isinstance(audio[0], np.ndarray): 26 | waveform, source_rate = audio 27 | elif isinstance(audio, str): 28 | waveform, source_rate = librosa.load(audio, sr=None, mono=False) 29 | if len(waveform.shape) == 1: 30 | waveform = np.expand_dims(waveform, 0) 31 | 32 | if source_rate != self.sample_rate: 33 | waveform = librosa.resample( 34 | waveform, 35 | orig_sr=source_rate, 36 | target_sr=self.sample_rate 37 | ) 38 | 39 | return waveform 40 | 41 | @property 42 | @abstractmethod 43 | def sample_rate(self) -> int: 44 | """Returns the sample rate required by this vectorizer. 
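As `AudioVectorizer._preprocess` above shows, audio vectorizers accept either a file path or an in-memory `(waveform, sample_rate)` tuple, and resample to the model's native rate when the two differ. A hedged sketch with a synthetic waveform and a placeholder file path:

```python
import numpy as np
from radient import audio_vectorizer

vz = audio_vectorizer(method="torchaudio")

# From a file on disk (placeholder path)...
v_file = vz.vectorize("clip.wav")

# ...or from an in-memory (channels, samples) waveform recorded at 44.1 kHz,
# which gets resampled to vz.sample_rate before vectorization.
waveform = np.random.randn(1, 44_100).astype(np.float32)
v_mem = vz.vectorize((waveform, 44_100))
```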
45 | """ 46 | pass 47 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/audio/imagebind.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "ImageBindAudioVectorizer" 3 | ] 4 | 5 | from typing import Any 6 | 7 | import numpy as np 8 | 9 | from radient.tasks.accelerate import export_to_onnx, ONNXForward 10 | from radient.tasks.vectorizers._imagebind import create_imagebind_model 11 | from radient.tasks.vectorizers._imagebind import imagebind_model 12 | from radient.tasks.vectorizers.audio._base import AudioVectorizer 13 | from radient.utils.lazy_import import LazyImport 14 | from radient.vector import Vector 15 | 16 | torch = LazyImport("torch") 17 | transforms = LazyImport("torchvision", attribute="transforms") 18 | waveform2melspec = LazyImport("imagebind.data", attribute="waveform2melspec") 19 | 20 | CLIP_DURATION = 2 21 | NUM_MEL_BINS = 128 22 | TARGET_LENGTH = 204 23 | 24 | 25 | class ImageBindAudioVectorizer(AudioVectorizer): 26 | """Computes image embeddings using FAIR's ImageBind model. 27 | """ 28 | 29 | def __init__(self, model_name = "imagebind_huge", **kwargs): 30 | super().__init__() 31 | self._model_name = model_name 32 | self._model = create_imagebind_model(model_name=model_name, modality="audio") 33 | self._model.eval() 34 | self._normalize = transforms.Normalize(mean=-4.268, std=9.138) 35 | 36 | def _transform(self, waveform: np.ndarray, **kwargs): 37 | output = [] 38 | # Split the waveform into clips of duration CLIP_DURATION. Each 39 | # waveform is then converted into its Mel spectrum representation. 40 | waveform = torch.from_numpy(waveform) 41 | samples_per_clip = self.sample_rate * CLIP_DURATION 42 | for n in np.arange(0, waveform.shape[1], samples_per_clip): 43 | end = n + samples_per_clip 44 | mel_spec = waveform2melspec( 45 | waveform[:,n:end], 46 | self.sample_rate, 47 | NUM_MEL_BINS, 48 | TARGET_LENGTH 49 | ) 50 | output.append(self._normalize(mel_spec)) 51 | return torch.stack(output, dim=0) 52 | 53 | def _preprocess(self, audio: Any, **kwargs) -> np.ndarray: 54 | audio = super()._preprocess(audio) 55 | audio = self._transform(audio).unsqueeze(0) 56 | return audio 57 | 58 | def _vectorize(self, audio: np.ndarray, **kwargs) -> Vector: 59 | # TODO(fzliu): dynamic batching 60 | with torch.inference_mode(): 61 | output = self._model({imagebind_model.ModalityType.AUDIO: audio}) 62 | vector = output[imagebind_model.ModalityType.AUDIO].squeeze() 63 | if isinstance(vector, torch.Tensor): 64 | vector = vector.numpy() 65 | return vector.view(Vector) 66 | 67 | def accelerate(self): 68 | modality = imagebind_model.ModalityType.AUDIO 69 | inputs = ({modality: torch.randn((1, 400))}, {}) 70 | input_names = output_names = [modality] 71 | onnx_model_path = export_to_onnx( 72 | self, 73 | inputs, 74 | axes_names=["batch_size", "seq_len"], 75 | input_names=[modality], 76 | output_names=[modality], 77 | model_type="pytorch" 78 | ) 79 | 80 | self._model.forward = ONNXForward( 81 | onnx_model_path, 82 | output_names=output_names, 83 | ) 84 | 85 | @property 86 | def sample_rate(self): 87 | return 16_000 88 | 89 | 90 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/audio/torchaudio.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "TorchaudioAudioVectorizer" 3 | ] 4 | 5 | import numpy as np 6 | 7 | from radient.tasks.accelerate import export_to_onnx, 
ONNXForward 8 | from radient.tasks.vectorizers.audio._base import AudioVectorizer 9 | from radient.utils.lazy_import import LazyImport 10 | from radient.vector import Vector 11 | 12 | torchaudio = LazyImport("torchaudio") 13 | torch = LazyImport("torch") 14 | 15 | 16 | class TorchaudioAudioVectorizer(AudioVectorizer): 17 | """Computes audio embeddings using `torchaudio`. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | model_name: str = "HUBERT_BASE", 23 | reduce_method: str = "avg_pool", 24 | **kwargs 25 | ): 26 | super().__init__() 27 | self._model_name = model_name 28 | self._reduce_method = reduce_method 29 | bundle = getattr(torchaudio.pipelines, model_name) 30 | self._sample_rate = bundle.sample_rate 31 | self._model = bundle.get_model() 32 | 33 | def _vectorize(self, audio: np.ndarray, **kwargs) -> list[Vector]: 34 | with torch.inference_mode(): 35 | output = self._model.forward(torch.from_numpy(audio)) 36 | features = output[0] if isinstance(output, tuple) else output 37 | if isinstance(features, torch.Tensor): 38 | features = features.numpy() 39 | 40 | # Torchaudio vectorizers output a list of features, so we 41 | # optionally reduce the features to a single 1D vector using 42 | # the method specified by the function caller. 43 | if self._reduce_method == "avg_pool": 44 | output = np.mean(features, axis=(0,1)).view(Vector) 45 | else: 46 | output = [v.view(Vector) for v in np.mean(features, axis=0)] 47 | 48 | return output 49 | 50 | def accelerate(self): 51 | # Torchaudio-based vectorizers take an optional `lengths` parameter, 52 | # which we ignore here. 53 | onnx_model_path = export_to_onnx( 54 | self, 55 | torch.randn((1, 400)), 56 | axes_names=["batch_size", "seq_len"], 57 | input_names=["waveforms"], 58 | output_names=["features"], 59 | model_type="pytorch" 60 | ) 61 | self._model.forward = ONNXForward( 62 | onnx_model_path 63 | ) 64 | 65 | @property 66 | def sample_rate(self): 67 | return self._sample_rate 68 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/graph/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "FastRPGraphVectorizer", 3 | "graph_vectorizer" 4 | ] 5 | 6 | from typing import Optional 7 | 8 | from radient.tasks.vectorizers.graph._base import GraphVectorizer 9 | from radient.tasks.vectorizers.graph.fastrp import FastRPGraphVectorizer 10 | 11 | 12 | def graph_vectorizer(method: str = "fastrp", **kwargs) -> GraphVectorizer: 13 | """Creates an image vectorizer specified by `method`. 14 | """ 15 | 16 | # Return a reasonable default vectorizer in the event that the user does 17 | # not specify one. 
18 | if method in ("fastrp"): 19 | return FastRPGraphVectorizer(**kwargs) 20 | else: 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/graph/_base.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "GraphVectorizer" 3 | ] 4 | 5 | from abc import abstractmethod 6 | from typing import Any 7 | 8 | import numpy as np 9 | 10 | from radient.tasks.vectorizers._base import Vectorizer 11 | from radient.utils import fully_qualified_name 12 | from radient.utils.lazy_import import LazyImport 13 | 14 | nx = LazyImport("networkx") 15 | sp = LazyImport("scipy") 16 | 17 | 18 | class GraphVectorizer(Vectorizer): 19 | 20 | @abstractmethod 21 | def __init__(self): 22 | super().__init__() 23 | 24 | @classmethod 25 | def _preprocess(cls, graph: Any, **kwargs) -> str: 26 | # Turn input graphs into adjacency matrices. 27 | full_name = fully_qualified_name(graph) 28 | if isinstance(graph, sp.sparse.sparray): 29 | return graph 30 | elif isinstance(graph, sp.sparse.spmatrix): 31 | # TODO(fzliu): determine if support for this is necessary 32 | raise NotImplementedError 33 | elif full_name == "networkx.classes.graph.Graph": 34 | return nx.to_scipy_sparse_array(graph) 35 | else: 36 | raise TypeError 37 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/graph/fastrp.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "FastRPGraphVectorizer" 3 | ] 4 | 5 | from typing import Any, Sequence 6 | 7 | import numpy as np 8 | 9 | from radient.tasks.vectorizers.graph._base import GraphVectorizer 10 | from radient.utils.lazy_import import LazyImport 11 | from radient.vector import Vector 12 | 13 | sp = LazyImport("scipy") 14 | SparseRandomProjection = LazyImport("sklearn.random_projection", attribute="SparseRandomProjection", package_name="scikit-learn") 15 | 16 | 17 | class FastRPGraphVectorizer(GraphVectorizer): 18 | """Computes node (not graph) embeddings using the FastRP algorithm. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | dimension: int = 64, 24 | weights: Sequence = (0.1, 0.2, 1.0, 3.0), 25 | beta: float = 0.0 26 | ): 27 | super().__init__() 28 | self._projection = SparseRandomProjection(n_components=dimension) 29 | self._weights = weights 30 | self._beta = beta 31 | 32 | def _vectorize(self, graph: "sp.sparse.sparray", **kwargs) -> Vector: 33 | """Radient-specific implementation of the FastRP vectorization 34 | algorithm. 35 | """ 36 | projector = self._projection.fit(graph) 37 | R = projector.components_.T 38 | 39 | # Compute \mathbf{D} as per Chen et al. equation 0: 40 | # D = sum(Sip, axis=p) if i = j else 0 41 | L = graph.sum(axis=1) 42 | L = 0.5 * L**self._beta / graph.shape[0] 43 | L = sp.sparse.diags_array(L) 44 | N_k = graph @ L @ R 45 | 46 | # Compute a weighted combination of the powers of the projections. 
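Since `GraphVectorizer._preprocess` converts NetworkX graphs (or SciPy sparse arrays) into adjacency matrices, FastRP can be driven through the same `vectorize` call as every other modality; note that it returns one embedding per node rather than a single graph-level embedding. A sketch with illustrative hyperparameters:

```python
import networkx as nx
from radient import graph_vectorizer

g = nx.random_geometric_graph(200, radius=0.15)

vz = graph_vectorizer(method="fastrp", dimension=32, weights=(0.0, 1.0, 2.0, 4.0))
node_vectors = vz.vectorize(g)   # one Vector per node in the graph
```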
47 | result = self._weights[0] * N_k 48 | for k in range(1, len(self._weights)): 49 | N_k = graph @ N_k 50 | result += self._weights[k] * N_k 51 | 52 | result = result.todense().view(Vector) 53 | return [item for item in result] 54 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/image/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "TimmImageVectorizer", 3 | "image_vectorizer" 4 | ] 5 | 6 | from typing import Optional 7 | 8 | from radient.tasks.vectorizers.image._base import ImageVectorizer 9 | from radient.tasks.vectorizers.image.timm import TimmImageVectorizer 10 | from radient.tasks.vectorizers.image.imagebind import ImageBindImageVectorizer 11 | 12 | 13 | def image_vectorizer(method: str = "timm", **kwargs) -> ImageVectorizer: 14 | """Creates an image vectorizer specified by `method`. 15 | """ 16 | 17 | # Return a reasonable default vectorizer in the event that the user does 18 | # not specify one. 19 | if method in ("timm", "pytorch-image-models"): 20 | return TimmImageVectorizer(**kwargs) 21 | elif method in ("imagebind",): 22 | return ImageBindImageVectorizer(**kwargs) 23 | else: 24 | raise NotImplementedError 25 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/image/_base.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "ImageVectorizer" 3 | ] 4 | 5 | from abc import abstractmethod 6 | import base64 7 | import io 8 | from pathlib import Path 9 | from typing import Any, TYPE_CHECKING 10 | import urllib.request 11 | 12 | import numpy as np 13 | 14 | from radient.tasks.vectorizers._base import Vectorizer 15 | from radient.utils import fully_qualified_name 16 | from radient.utils.lazy_import import LazyImport 17 | from radient.vector import Vector 18 | 19 | if TYPE_CHECKING: 20 | from PIL import Image 21 | import validators 22 | else: 23 | Image = LazyImport("PIL.Image", package_name="pillow") 24 | validators = LazyImport("validators") 25 | 26 | 27 | class ImageVectorizer(Vectorizer): 28 | 29 | @abstractmethod 30 | def __init__(self): 31 | super().__init__() 32 | 33 | def _preprocess(self, image: Any, mode: str = "RGB", **kwargs) -> "Image.Image": 34 | """Converts the input images into a common format, i.e. a PIL Image. 35 | Video files are loaded into `torchvision.io.VideoReader` objects. 36 | """ 37 | # Acquire the full class path, i.e. qualified name plus module name. 38 | # There might be a better way to do this that takes module rebinding 39 | # into consideration, but this will do for now. 40 | full_name = fully_qualified_name(image) 41 | if full_name == "PIL.Image.Image": 42 | return image 43 | elif full_name == "numpy.ndarray": 44 | return Image.fromarray(image, mode=mode) 45 | elif full_name == "builtins.str": 46 | # For string inputs, we support three options - a base64 encoded 47 | # string containing the image data, a path to a filename which is 48 | # a valid image type, or a URL that contains the image. 
49 | imgpath = Path(image) 50 | if imgpath.suffix in Image.registered_extensions().keys(): 51 | if imgpath.exists(): 52 | return Image.open(image) 53 | elif validators.url(image): 54 | with urllib.request.urlopen(image) as resp: 55 | imgbytes = io.BytesIO(resp.read()) 56 | return Image.open(imgbytes) 57 | else: 58 | try: 59 | imgbytes = io.BytesIO(base64.b64decode(image)) 60 | return Image.open(imgbytes) 61 | except: 62 | raise TypeError 63 | else: 64 | raise TypeError 65 | 66 | @abstractmethod 67 | def _vectorize(self, data: Image.Image, **kwargs): 68 | pass -------------------------------------------------------------------------------- /radient/tasks/vectorizers/image/imagebind.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "ImageBindImageVectorizer" 3 | ] 4 | 5 | from typing import Any, TYPE_CHECKING 6 | 7 | from radient.tasks.accelerate import export_to_onnx, ONNXForward 8 | from radient.tasks.vectorizers._imagebind import create_imagebind_model 9 | from radient.tasks.vectorizers._imagebind import imagebind_model 10 | from radient.tasks.vectorizers.image._base import ImageVectorizer 11 | from radient.utils.lazy_import import LazyImport 12 | from radient.vector import Vector 13 | 14 | if TYPE_CHECKING: 15 | from PIL import Image 16 | import torch 17 | from torchvision import transforms 18 | else: 19 | Image = LazyImport("PIL", attribute="Image", package_name="Pillow") 20 | torch = LazyImport("torch") 21 | transforms = LazyImport("torchvision", attribute="transforms") 22 | 23 | 24 | class ImageBindImageVectorizer(ImageVectorizer): 25 | """Computes image embeddings using FAIR's ImageBind model. 26 | """ 27 | 28 | def __init__(self, model_name: str = "imagebind_huge", **kwargs): 29 | super().__init__() 30 | self._model_name = model_name 31 | # TODO(fzliu): remove non-image trunks from this model 32 | self._model = create_imagebind_model(model_name=model_name, modality="image") 33 | self._model.eval() 34 | self._transform = transforms.Compose([ 35 | transforms.Resize( 36 | 224, interpolation=transforms.InterpolationMode.BICUBIC 37 | ), 38 | transforms.CenterCrop(224), 39 | transforms.ToTensor(), 40 | transforms.Normalize( 41 | mean=(0.48145466, 0.4578275, 0.40821073), 42 | std=(0.26862954, 0.26130258, 0.27577711), 43 | ), 44 | ]) 45 | 46 | def _vectorize(self, image: Image.Image, **kwargs): 47 | # TODO(fzliu): dynamic batching 48 | with torch.inference_mode(): 49 | x = self._transform(image.convert("RGB")).unsqueeze(0) 50 | output = self._model({imagebind_model.ModalityType.VISION: x}) 51 | vector = output[imagebind_model.ModalityType.VISION].squeeze() 52 | if isinstance(vector, torch.Tensor): 53 | vector = vector.numpy() 54 | return vector.view(Vector) 55 | 56 | def accelerate(self): 57 | modality = imagebind_model.ModalityType.VISION 58 | inputs = ({modality: torch.randn((1, 3, 224, 224))}, {}) 59 | input_names = output_names = [modality] 60 | onnx_model_path = export_to_onnx( 61 | self, 62 | inputs, 63 | axes_names=["batch_size"], 64 | input_names=input_names, 65 | output_names=output_names, 66 | model_type="pytorch" 67 | ) 68 | # TODO(fzliu): delete all tensors from model 69 | self._model.forward = ONNXForward( 70 | onnx_model_path, 71 | output_names=output_names, 72 | ) 73 | 74 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/image/timm.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "TimmImageVectorizer" 3 | ] 
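`ImageVectorizer._preprocess` normalizes several input formats into a PIL image: an existing `PIL.Image`, a NumPy array, a local file path, an image URL, or a base64-encoded string. A short sketch of the first three; the file path is a placeholder:

```python
import numpy as np
from PIL import Image
from radient import image_vectorizer

vz = image_vectorizer(method="timm")

v_from_pil = vz.vectorize(Image.open("photo.jpg"))
v_from_arr = vz.vectorize(np.zeros((224, 224, 3), dtype=np.uint8))
v_from_path = vz.vectorize("photo.jpg")
```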
4 | 5 | from typing import Any, TYPE_CHECKING 6 | 7 | from radient.tasks.accelerate import export_to_onnx, ONNXForward 8 | from radient.tasks.vectorizers.image._base import ImageVectorizer 9 | from radient.utils.lazy_import import LazyImport 10 | from radient.vector import Vector 11 | 12 | 13 | if TYPE_CHECKING: 14 | from PIL import Image 15 | import timm 16 | import torch 17 | else: 18 | Image = LazyImport("PIL", attribute="Image", package_name="Pillow") 19 | timm = LazyImport("timm") 20 | torch = LazyImport("torch") 21 | 22 | 23 | class TimmImageVectorizer(ImageVectorizer): 24 | """Computes image embeddings using `timm`. 25 | """ 26 | 27 | def __init__(self, model_name: str = "resnet50", **kwargs): 28 | super().__init__() 29 | self._model_name = model_name 30 | self._model = timm.create_model(model_name, pretrained=True, **kwargs) 31 | self._model.reset_classifier(0) 32 | self._model.eval() 33 | data_config = timm.data.resolve_model_data_config(self._model) 34 | self._transform = timm.data.create_transform(**data_config) 35 | 36 | def _vectorize(self, image: Image, **kwargs): 37 | # TODO(fzliu): dynamic batching 38 | with torch.inference_mode(): 39 | x = self._transform(image.convert("RGB")).unsqueeze(0) 40 | vector = self._model(x).squeeze() 41 | if isinstance(vector, torch.Tensor): 42 | vector = vector.numpy() 43 | return vector.view(Vector) 44 | 45 | def accelerate(self): 46 | # `timm` models take a single 4D tensor (`B x C x H x W`) as input. 47 | onnx_model_path = export_to_onnx( 48 | self, 49 | torch.randn((1, 3, 224, 224)), 50 | axes_names=["batch_size"], 51 | input_names=["images"], 52 | output_names=["vectors"], 53 | model_type="pytorch" 54 | ) 55 | 56 | self._model.forward = ONNXForward( 57 | onnx_model_path 58 | ) 59 | 60 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/molecule/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "RDKitMoleculeVectorizer" 3 | "molecule_vectorizer" 4 | ] 5 | 6 | from typing import Optional 7 | 8 | from radient.tasks.vectorizers.molecule._base import MoleculeVectorizer 9 | from radient.tasks.vectorizers.molecule.rdkit import RDKitMoleculeVectorizer 10 | 11 | 12 | def molecule_vectorizer(method: str = "rdkit", **kwargs) -> MoleculeVectorizer: 13 | """Creates a text vectorizer specified by `method`. 14 | """ 15 | 16 | # Return a reasonable default vectorizer in the event that the user does 17 | # not specify one. 
18 | if method in ("rdkit",): 19 | return RDKitMoleculeVectorizer(**kwargs) 20 | else: 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/molecule/_base.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "MoleculeVectorizer" 3 | ] 4 | 5 | from abc import abstractmethod 6 | from typing import Any 7 | 8 | from radient.vector import Vector 9 | from radient.utils import fully_qualified_name 10 | from radient.utils.lazy_import import LazyImport 11 | from radient.tasks.vectorizers._base import Vectorizer 12 | 13 | Chem = LazyImport("rdkit.Chem") 14 | 15 | 16 | class MoleculeVectorizer(Vectorizer): 17 | 18 | @abstractmethod 19 | def __init__(self): 20 | super().__init__() 21 | 22 | def _preprocess(self, molecule: Any, **kwargs) -> str: 23 | full_name = fully_qualified_name(molecule) 24 | if full_name == "builtins.str": 25 | return molecule 26 | elif full_name == "rdkit.Chem.rdchem.Mol": 27 | # TODO: don't do this step for `RDKitMoleculeVectorizer` 28 | return Chem.MolToSmiles(molecule) 29 | else: 30 | raise TypeError 31 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/molecule/rdkit.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "RDKitMoleculeVectorizer" 3 | ] 4 | 5 | from typing import Any 6 | 7 | import numpy as np 8 | 9 | from radient.utils.lazy_import import LazyImport 10 | from radient.vector import Vector 11 | from radient.tasks.vectorizers.molecule._base import MoleculeVectorizer 12 | 13 | Chem = LazyImport("rdkit.Chem") 14 | AllChem = LazyImport("rdkit.Chem.AllChem") 15 | 16 | 17 | class RDKitMoleculeVectorizer(MoleculeVectorizer): 18 | """Generates binary fingerprints for molecules. 19 | """ 20 | 21 | def __init__(self, fingerprint_type: str = "topological", **kwargs): 22 | super().__init__() 23 | self._fingerprint_type = fingerprint_type 24 | if fingerprint_type == "topological": 25 | self._fpgen = AllChem.GetRDKitFPGenerator(**kwargs) 26 | elif fingerprint_type == "morgan": 27 | self._fpgen = AllChem.GetMorganGenerator(**kwargs) 28 | 29 | def _vectorize(self, molecule: str, **kwargs): 30 | if isinstance(molecule, str): 31 | molecule = Chem.MolFromSmiles(molecule) 32 | fp = self._fpgen.GetFingerprint(molecule) 33 | # Use dtype=bool to avoid having to bit-pack into `uint8`. 
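The RDKit vectorizer accepts a SMILES string or an `rdkit.Chem.Mol` object and emits a boolean fingerprint, so L2 normalization is typically skipped. A short sketch comparing the two fingerprint families; the extra `radius` keyword is forwarded to RDKit's Morgan generator and is shown here purely for illustration:

```python
from radient import molecule_vectorizer

caffeine = "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"   # SMILES for caffeine

topo = molecule_vectorizer(method="rdkit", fingerprint_type="topological")
morgan = molecule_vectorizer(method="rdkit", fingerprint_type="morgan", radius=2)

fp_topo = topo.vectorize(caffeine, normalize=False)
fp_morgan = morgan.vectorize(caffeine, normalize=False)
```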
34 | vector = np.array(fp.ToList(), dtype=bool) 35 | return vector.view(Vector) 36 | 37 | @property 38 | def fingerprint_type(self) -> str: 39 | return self._fingerprint_type 40 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/multimodal.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "multimodal_vectorizer" 3 | ] 4 | 5 | from typing import Any, Type, Optional 6 | 7 | from radient.tasks.vectorizers._base import Vectorizer 8 | from radient.tasks.vectorizers.audio.imagebind import ImageBindAudioVectorizer 9 | from radient.tasks.vectorizers.image.imagebind import ImageBindImageVectorizer 10 | from radient.tasks.vectorizers.text.imagebind import ImageBindTextVectorizer 11 | 12 | IMAGEBIND_VECTORIZERS = { 13 | ImageBindAudioVectorizer, 14 | ImageBindImageVectorizer, 15 | ImageBindTextVectorizer 16 | } 17 | 18 | 19 | class MultimodalVectorizer(Vectorizer): 20 | 21 | def __init__(self, vectypes: set[Type], **kwargs): 22 | super().__init__() 23 | self._vectorizers = {} 24 | for VecType in vectypes: 25 | vectorizer = VecType(**kwargs) 26 | self._vectorizers[vectorizer.vtype] = vectorizer 27 | 28 | def modalities(self) -> list[str]: 29 | return list(self._vectorizers.keys()) 30 | 31 | def _preprocess(self, data: Any, modality: str, **kwargs) -> Any: 32 | vectorizer = self._vectorizers[modality] 33 | return vectorizer._preprocess(data, **kwargs) 34 | 35 | def _vectorize(self, data: Any, modality: str, **kwargs) -> Any: 36 | vectorizer = self._vectorizers[modality] 37 | vector = vectorizer._vectorize(data, **kwargs) 38 | return vector 39 | 40 | def _postprocess(self, data: Any, modality: str, **kwargs) -> Any: 41 | vectorizer = self._vectorizers[modality] 42 | return vectorizer._postprocess(data, **kwargs) 43 | 44 | def accelerate(self, **kwargs): 45 | for vectorizer in self._vectorizers.values(): 46 | vectorizer.accelerate(**kwargs) 47 | 48 | 49 | def multimodal_vectorizer(method: Optional[str], **kwargs) -> MultimodalVectorizer: 50 | """Creates a text vectorizer specified by `method`. 51 | """ 52 | 53 | if method in ("imagebind",): 54 | return MultimodalVectorizer(IMAGEBIND_VECTORIZERS, **kwargs) 55 | else: 56 | raise NotImplementedError -------------------------------------------------------------------------------- /radient/tasks/vectorizers/text/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "TextVectorizer", 3 | "CohereTextVectorizer" 4 | "ImageBindTextVectorizer" 5 | "SBERTTextVectorizer", 6 | "SklearnTextVectorizer", 7 | "VoyageTextVectorizer" 8 | "text_vectorizer" 9 | ] 10 | 11 | from typing import Optional 12 | 13 | from radient.tasks.vectorizers.text._base import TextVectorizer 14 | from radient.tasks.vectorizers.text.cohere import CohereTextVectorizer 15 | from radient.tasks.vectorizers.text.imagebind import ImageBindTextVectorizer 16 | from radient.tasks.vectorizers.text.sbert import SBERTTextVectorizer 17 | from radient.tasks.vectorizers.text.sklearn import SklearnTextVectorizer 18 | from radient.tasks.vectorizers.text.voyage import VoyageTextVectorizer 19 | 20 | 21 | def text_vectorizer(method: str = "sbert", **kwargs) -> TextVectorizer: 22 | """Creates a text vectorizer specified by `method`. 23 | """ 24 | 25 | # Return a reasonable default vectorizer in the event that the user does 26 | # not specify one. 
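`MultimodalVectorizer` hides one ImageBind vectorizer per modality behind a single object, so the caller picks the modality at vectorization time and the resulting embeddings live in a shared space. A sketch; the file paths are placeholders:

```python
from radient.tasks.vectorizers import multimodal_vectorizer

vz = multimodal_vectorizer(method="imagebind")
print(vz.modalities())   # e.g. ['audio', 'image', 'text']

v_text = vz.vectorize("a dog catching a frisbee", modality="text")
v_image = vz.vectorize("dog.jpg", modality="image")
v_audio = vz.vectorize("bark.wav", modality="audio")
```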
27 | if method in ("sbert", "sentence-transformers"): 28 | return SBERTTextVectorizer(**kwargs) 29 | elif method in ("imagebind",): 30 | return ImageBindTextVectorizer(**kwargs) 31 | elif method in ("sklearn", "scikit-learn"): 32 | return SklearnTextVectorizer(**kwargs) 33 | elif method in ("cohere",): 34 | return CohereTextVectorizer(**kwargs) 35 | elif method in ("voyage", "voyageai"): 36 | return VoyageTextVectorizer(**kwargs) 37 | else: 38 | raise NotImplementedError 39 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/text/_base.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "TextVectorizer" 3 | ] 4 | 5 | from abc import abstractmethod 6 | from typing import Any 7 | 8 | from radient.tasks.vectorizers._base import Vectorizer 9 | from radient.utils import fully_qualified_name 10 | from radient.vector import Vector 11 | 12 | 13 | class TextVectorizer(Vectorizer): 14 | 15 | @abstractmethod 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def _preprocess(self, text: Any, **kwargs) -> str: 20 | if not isinstance(text, str): 21 | return str(text) 22 | return text 23 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/text/cohere.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "CohereTextVectorizer" 3 | ] 4 | 5 | import os 6 | from typing import Optional, TYPE_CHECKING 7 | import numpy as np 8 | from radient.utils.lazy_import import LazyImport 9 | from radient.vector import Vector 10 | from radient.tasks.vectorizers.text._base import TextVectorizer 11 | 12 | if TYPE_CHECKING: 13 | import cohere 14 | else: 15 | cohere = LazyImport("cohere") 16 | 17 | 18 | class CohereTextVectorizer(TextVectorizer): 19 | """Text vectorization with Cohere (https://www.cohere.com).
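
    The API key is read from the ``COHERE_API_KEY`` environment variable or an
    ``api_key`` keyword argument. Illustrative usage (assumes a valid key is
    set; the example text is arbitrary):

        vz = text_vectorizer(method="cohere")
        vz.vectorize("semantic search with Cohere embeddings")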
20 | """ 21 | 22 | def __init__(self, model_name: str = "embed-english-v3.0", **kwargs): 23 | super().__init__() 24 | self._model_name = model_name 25 | if "COHERE_API_KEY" in os.environ: 26 | api_key = os.environ["COHERE_API_KEY"] 27 | elif "api_key" in kwargs: 28 | api_key = kwargs["api_key"] 29 | else: 30 | raise ValueError("API key not found") 31 | self._client = cohere.Client(api_key) 32 | 33 | def _vectorize(self, text: str, **kwargs): 34 | res = self._client.embed([text], model=self._model_name) 35 | return np.array(res.embeddings[0]).view(Vector) 36 | 37 | @property 38 | def model_name(self): 39 | return self._model_name 40 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/text/imagebind.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "ImageBindTextVectorizer" 3 | ] 4 | 5 | from typing import Any, TYPE_CHECKING 6 | 7 | import numpy as np 8 | 9 | import urllib.request 10 | 11 | from radient.tasks.accelerate import export_to_onnx, ONNXForward 12 | from radient.tasks.vectorizers._imagebind import create_imagebind_model 13 | from radient.tasks.vectorizers._imagebind import imagebind_model 14 | from radient.tasks.vectorizers.text._base import TextVectorizer 15 | from radient.utils import download_cache_file 16 | from radient.utils.lazy_import import LazyImport 17 | from radient.vector import Vector 18 | 19 | if TYPE_CHECKING: 20 | from imagebind.models.multimodal_preprocessors import SimpleTokenizer 21 | import torch 22 | else: 23 | SimpleTokenizer = LazyImport("imagebind.models.multimodal_preprocessors", attribute="SimpleTokenizer", package_name="git+https://github.com/fzliu/ImageBind@main") 24 | torch = LazyImport("torch") 25 | 26 | IMAGEBIND_VOCAB_URL = "https://github.com/fzliu/ImageBind/raw/main/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz" 27 | 28 | 29 | 30 | class ImageBindTextVectorizer(TextVectorizer): 31 | """Computes text embeddings using FAIR's ImageBind model.
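
    Illustrative usage (assumes the ImageBind fork and its weights are
    available; the example text is arbitrary):

        vz = text_vectorizer(method="imagebind")
        vz.vectorize("a short description of an image")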
32 | """ 33 | 34 | def __init__(self, model_name = "imagebind_huge", **kwargs): 35 | super().__init__() 36 | self._model_name = model_name 37 | self._model = create_imagebind_model(model_name=model_name, modality="text") 38 | self._model.eval() 39 | vocab_path = download_cache_file(IMAGEBIND_VOCAB_URL) 40 | self._tokenizer = SimpleTokenizer(bpe_path=vocab_path) 41 | 42 | def _vectorize(self, text: str, **kwargs): 43 | # TODO(fzliu): dynamic batching 44 | with torch.inference_mode(): 45 | tokens = self._tokenizer(text).unsqueeze(0) 46 | output = self._model({imagebind_model.ModalityType.TEXT: tokens}) 47 | vector = output[imagebind_model.ModalityType.TEXT].squeeze() 48 | if isinstance(vector, torch.Tensor): 49 | vector = vector.numpy() 50 | return vector.view(Vector) 51 | 52 | def accelerate(self): 53 | modality = imagebind_model.ModalityType.TEXT 54 | input_names = output_names = [modality] 55 | inputs = ({modality: self._tokenizer("a")}, {}) 56 | onnx_model_path = export_to_onnx( 57 | self, 58 | inputs, 59 | axes_names=["batch_size"], 60 | input_names=input_names, 61 | output_names=output_names, 62 | model_type="pytorch" 63 | ) 64 | 65 | self._model.forward = ONNXForward( 66 | onnx_model_path, 67 | output_names=output_names, 68 | ) 69 | 70 | @property 71 | def sample_rate(self): 72 | return 16_000 73 | 74 | 75 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/text/sbert.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "SBERTTextVectorizer" 3 | ] 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | from radient.tasks.accelerate import export_to_onnx, ONNXForward 8 | from radient.tasks.vectorizers.text._base import TextVectorizer 9 | from radient.utils.lazy_import import LazyImport 10 | from radient.vector import Vector 11 | 12 | if TYPE_CHECKING: 13 | from sentence_transformers import SentenceTransformer 14 | import torch 15 | else: 16 | SentenceTransformer = LazyImport("sentence_transformers", attribute="SentenceTransformer", package_name="sentence-transformers") 17 | torch = LazyImport("torch") 18 | 19 | 20 | class SBERTTextVectorizer(TextVectorizer): 21 | """Text vectorization with `sentence-transformers`. 22 | """ 23 | 24 | def __init__(self, model_name: str = "BAAI/bge-small-en-v1.5", **kwargs): 25 | super().__init__() 26 | if "model_name_or_path" in kwargs: 27 | model_name = kwargs.pop("model_name_or_path") 28 | self._model_name = model_name 29 | self._model = SentenceTransformer(model_name_or_path=model_name, **kwargs) 30 | 31 | def _vectorize(self, text: str, **kwargs): 32 | # TODO(fzliu): token length check 33 | # TODO(fzliu): dynamic batching 34 | with torch.inference_mode(): 35 | vector = self._model.encode(text) 36 | return vector.view(Vector) 37 | 38 | @property 39 | def model_name(self): 40 | return self._model_name 41 | 42 | def accelerate(self): 43 | # Store the model in ONNX format to maximize compatibility with 44 | # different backends. Since `sentence-transformers` takes a single 45 | # dictionary input in its underlying `forward` call, the export 46 | # function will need to take a second empty dictionary as kwargs. 47 | # Output names are acquired by running the `encode` function and 48 | # specifying all outputs. 
49 | model_args = (self._model.tokenize(["a"]), {}) 50 | input_names = list(model_args[0].keys()) 51 | output_names = list(self._model.encode("a", output_value=None).keys()) 52 | onnx_model_path = export_to_onnx( 53 | self, 54 | model_args, 55 | axes_names=["batch_size", "seq_len"], 56 | input_names=input_names, 57 | output_names=output_names, 58 | model_type="pytorch" 59 | ) 60 | 61 | # Monkey-patch the underlying model's `forward` function to run the 62 | # optimized ONNX model rather than the torch version. 63 | self._model.forward = ONNXForward( 64 | onnx_model_path, 65 | output_names=output_names, 66 | output_class=torch.tensor, 67 | ) 68 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/text/sklearn.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "SklearnTextVectorizer" 3 | ] 4 | 5 | from typing import Optional 6 | import warnings 7 | 8 | from radient.tasks.vectorizers.text._base import TextVectorizer 9 | from radient.utils.lazy_import import LazyImport 10 | from radient.vector import Vector 11 | 12 | CountVectorizer = LazyImport("sklearn.feature_extraction.text", attribute="CountVectorizer", package_name="scikit-learn") 13 | TfidfVectorizer = LazyImport("sklearn.feature_extraction.text", attribute="TfidfVectorizer", package_name="scikit-learn") 14 | HashingVectorizer = LazyImport("sklearn.feature_extraction.text", attribute="HashingVectorizer", package_name="scikit-learn") 15 | 16 | 17 | class SklearnTextVectorizer(TextVectorizer): 18 | """Text vectorization with `scikit-learn`. 19 | """ 20 | 21 | def __init__(self, **kwargs): 22 | super().__init__() 23 | self._model = CountVectorizer(**kwargs) 24 | 25 | def _vectorize(self, texts: str, **kwargs): 26 | vectors = self._model.transform(texts) 27 | # TODO(fzliu): sparse vector type 28 | return vectors 29 | #return [v.view(Vector) for v in vectors] 30 | -------------------------------------------------------------------------------- /radient/tasks/vectorizers/text/voyage.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "VoyageTextVectorizer" 3 | ] 4 | 5 | import numpy as np 6 | 7 | from radient.tasks.vectorizers.text._base import TextVectorizer 8 | from radient.utils.lazy_import import LazyImport 9 | from radient.vector import Vector 10 | 11 | voyageai = LazyImport("voyageai") 12 | 13 | 14 | class VoyageTextVectorizer(TextVectorizer): 15 | """Text vectorization with Voyage AI (https://www.voyageai.com). 16 | """ 17 | 18 | def __init__(self, model_name: str = "voyage-2", **kwargs): 19 | super().__init__() 20 | self._model_name = model_name 21 | self._client = voyageai.Client() 22 | 23 | def _vectorize(self, text: str, **kwargs): 24 | res = self._client.embed(text, model=self._model_name) 25 | return np.array(res.embeddings).view(Vector) 26 | 27 | @property 28 | def model_name(self): 29 | return self._model_name 30 | -------------------------------------------------------------------------------- /radient/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from pathlib import Path 3 | 4 | from typing import Any, Optional 5 | 6 | 7 | def download_cache_file(url: str, filename: Optional[str] = None) -> Path: 8 | """Download a file from a URL and save it to a local file.
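
    Illustrative usage (the URL is a placeholder):

        path = download_cache_file("https://example.com/vocab.txt")
        # -> ~/.cache/radient/vocab.txt; the download is skipped if the file
        #    is already cached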
9 | """ 10 | if not filename: 11 | filename = url.split("/")[-1].split("?")[0] 12 | path = Path.home() / ".cache" / "radient" / filename 13 | path.parents[0].mkdir(parents=True, exist_ok=True) 14 | if not path.exists(): 15 | urllib.request.urlretrieve(url, path) 16 | return path 17 | 18 | 19 | def fully_qualified_name(instance: Any) -> str: 20 | return f"{instance.__class__.__module__}.{instance.__class__.__qualname__}" -------------------------------------------------------------------------------- /radient/utils/flatten_inputs.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from collections.abc import Iterator 3 | from itertools import cycle, islice 4 | 5 | from typing import Any, Union 6 | 7 | 8 | def _traverse(data: Union[Any, list[Any]]) -> Iterator: 9 | """Traverse a nested data structure and yield its elements. 10 | """ 11 | #if isinstance(data, dict): 12 | # for k, v in data.items(): 13 | # yield from _traverse(v) 14 | if isinstance(data, list): 15 | for d in data: 16 | yield from _traverse(d) 17 | else: 18 | yield data 19 | 20 | 21 | def _datalen(data: Union[Any, list[Any]]) -> int: 22 | """Returns the length of the input data when used with `_traverse`. 23 | """ 24 | #if isinstance(data, dict): 25 | # return sum([_datalen(v) for v in data.values()]) 26 | if isinstance(data, list): 27 | return len(data) 28 | else: 29 | return 1 if data else 0 30 | 31 | 32 | def flattened(*args, **kwargs) -> Iterator: 33 | """For use when `flatten_inputs` evaluates to True. Parses out `dict` and 34 | `list` inputs so that each individual element is passed as an argument into 35 | the downstream function. 36 | """ 37 | 38 | kwargs = OrderedDict(kwargs) 39 | 40 | # Combine `args` and `kwargs` datas into a single list, then cycle through 41 | # all of them until the maximum length is reached. 42 | datas = list(kwargs.values()) + list(args) 43 | maxlen = max([_datalen(d) for d in datas]) 44 | generator = zip(*[islice(cycle(_traverse(d)), maxlen) for d in datas]) 45 | 46 | # Recombine the flattened inputs into the original form. 47 | for inputs in generator: 48 | flat_kwargs = dict(zip(kwargs.keys(), inputs)) 49 | flat_args = inputs[len(kwargs):] 50 | yield flat_args, flat_kwargs -------------------------------------------------------------------------------- /radient/utils/lazy_import.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import importlib.util 3 | import subprocess 4 | from types import ModuleType 5 | from typing import _UnionGenericAlias, Any, TypeVar, Union 6 | import warnings 7 | 8 | T = TypeVar("T") 9 | 10 | 11 | def prompt_install( 12 | package: str, 13 | version: str | None = None 14 | ) -> bool: 15 | """Checks whether the user wants to install a module before proceeding. 16 | """ 17 | if version: 18 | package = f"{package}>={version}" 19 | # Ignore "no" responses in `prompt_install` so the user can optionally 20 | # install it themselves when prompted. 21 | if input(f"This requires {package}. Install? [Y/n]\n") == "Y": 22 | if subprocess.check_call(["pip", "install", "-q", package]) == 0: 23 | return True 24 | else: 25 | warnings.warn(f"Could not install required package {package}") 26 | return False 27 | 28 | 29 | class LazyImport(ModuleType): 30 | """Lazily import a module to avoid unnecessary dependencies. If a required 31 | dependency does not exist, it will prompt the user for it.
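
    Illustrative usage (numpy is just an example module):

        np = LazyImport("numpy")  # nothing is imported yet
        np.zeros(3)               # first attribute access triggers the import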
32 | 33 | Adapted from tensorflow/python/util/lazy_loader.py. 34 | """ 35 | 36 | def __init__( 37 | self, 38 | name: str, 39 | attribute: str | None = None, 40 | package_name: str | None = None, 41 | min_version: str | None = None 42 | ): 43 | super().__init__(name) 44 | self._attribute = attribute 45 | self._top_name = name.split(".")[0] 46 | self._package_name = package_name if package_name else self._top_name 47 | self._min_version = min_version 48 | self._module = None 49 | 50 | def __call__(self, *args, **kwargs) -> Any: 51 | return self._evaluate()(*args, **kwargs) 52 | 53 | def __getattr__(self, attribute: str) -> Any: 54 | return getattr(self._evaluate(), attribute) 55 | 56 | def __dir__(self) -> list: 57 | return dir(self._evaluate()) 58 | 59 | def _evaluate(self) -> ModuleType: 60 | if not self._module: 61 | if not importlib.util.find_spec(self._top_name): 62 | prompt_install(self._package_name, self._min_version) 63 | self._module = importlib.import_module(self.__name__) 64 | if self._min_version and self._module.__version__ < self._min_version: 65 | prompt_install(self._package_name, self._min_version) 66 | self._module = importlib.import_module(self.__name__) 67 | if self._attribute: 68 | return getattr(self._module, self._attribute) 69 | return self._module 70 | 71 | def __or__(self, other: Any) -> Any: 72 | """Support for Union types with the | operator.""" 73 | return _UnionGenericAlias(Union, (self, other)) 74 | -------------------------------------------------------------------------------- /radient/vector.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import Any, Sequence, Optional, Union 3 | 4 | import numpy as np 5 | 6 | from radient._milvus import _MilvusInterface 7 | 8 | 9 | class Vector(np.ndarray): 10 | """Wrapper around `numpy.ndarray` specifically for working with embeddings. 11 | We try to use Numpy naming conventions here where possible, such as concise 12 | function names and 13 | """ 14 | 15 | def __new__(cls, *args, **kwargs): 16 | return super().__new__(cls, *args, **kwargs) 17 | 18 | def __array_finalize__(self, obj): 19 | """Attach metadata to be associated with this vector. 20 | """ 21 | self._metadata = OrderedDict() 22 | 23 | @property 24 | def metadata(self) -> OrderedDict: 25 | return self._metadata 26 | 27 | @metadata.setter 28 | def metadata(self, data: dict): 29 | self._metadata = OrderedDict(data) 30 | 31 | def putmeta( 32 | self, 33 | key: str, 34 | value: Union[dict[str, Union[str, float, int]], str, float, int] 35 | ) -> "Vector": 36 | self._metadata[key] = value 37 | # Enable chaining function calls together. 38 | return self 39 | 40 | def popmeta( 41 | self, 42 | key: str 43 | ) -> Union[dict[str, Union[str, float, int]], str, float, int]: 44 | return self._metadata.pop(key) 45 | 46 | def todict( 47 | self, 48 | vector_field: str = "vector" 49 | ) -> dict[str, Union["Vector", str, float, int]]: 50 | return dict(self._metadata, **{vector_field: self.tolist()}) 51 | 52 | def store( 53 | self, 54 | sink_type: Union[Sequence[str], str] = "vectordb", 55 | **kwargs 56 | ): 57 | """Stores this vector in the specified sink. This function is for 58 | convenience only. 
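
        Illustrative usage (the Milvus URI and collection name are
        placeholders):

            vec = text_vectorizer().vectorize("a short sentence")
            vec.putmeta("source", "example")
            vec.store(
                sink_type="vectordb",
                milvus_uri="http://localhost:19530",
                collection_name="radient_demo",
            )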
59 | """ 60 | if isinstance(sink_type, str): 61 | sink_type = [sink_type] 62 | for sink in sink_type: 63 | if sink == "vectordb": 64 | return self._store_vectordb(**kwargs) 65 | 66 | def _store_vectordb( 67 | self, 68 | milvus_uri: str, 69 | collection_name: str, 70 | field_name: Optional[str] = None 71 | ) -> dict[str, Union[str, list]]: 72 | """Stores this vector in the specified collection. 73 | """ 74 | client, info = _MilvusInterface(milvus_uri, collection_name, dim=self.size) 75 | field = info["dense"] 76 | # We can get away with using the dict constructor because all schema 77 | # field names are strings. 78 | data = dict(self._metadata, **{field: self.tolist()}) 79 | return client.insert( 80 | collection_name=collection_name, 81 | data=data 82 | ) 83 | 84 | def _search_vectordb( 85 | self, 86 | milvus_uri: str, 87 | collection_name: str, 88 | metric_type: Optional[str] = "COSINE", 89 | topk: int = 10 90 | ) -> list[list[np.ndarray]]: 91 | """Queries the specified collection for nearest neighbors. 92 | """ 93 | client, info = _MilvusInterface(milvus_uri, collection_name) 94 | return client.search( 95 | collection_name=collection_name, 96 | data=[self.tolist()], 97 | limit=topk, 98 | search_params={"metric_type": metric_type, "params": {}} 99 | ) 100 | 101 | --------------------------------------------------------------------------------