├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── fiftyone ├── __init__.py └── docs_search │ ├── __init__.py │ ├── cli.py │ ├── common.py │ ├── create_index.py │ ├── images │ ├── cli_example.gif │ └── python_example.gif │ ├── query_index.py │ └── read_docs.py ├── pyproject.toml ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | __pycache__ 4 | 5 | *.egg-info 6 | 7 | **/*.DS_store 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/asottile/blacken-docs 3 | rev: v1.12.0 4 | hooks: 5 | - id: blacken-docs 6 | additional_dependencies: [black==21.12b0] 7 | args: ["-l 79"] 8 | exclude: ^docs/theme/|\.rst$ 9 | - repo: https://github.com/ambv/black 10 | rev: 22.3.0 11 | hooks: 12 | - id: black 13 | language_version: python3 14 | args: ["-l 79"] 15 | exclude: ^docs/theme/ 16 | - repo: local 17 | hooks: 18 | - id: pylint 19 | name: pylint 20 | language: system 21 | files: \.py$ 22 | entry: pylint 23 | args: ["--errors-only"] 24 | exclude: ^(docs/theme/|app/) 25 | - repo: local 26 | hooks: 27 | - id: ipynb-strip 28 | name: ipynb-strip 29 | language: system 30 | files: \.ipynb$ 31 | exclude: ^docs/ # *do* commit ipynb outputs in `docs/` 32 | entry: jupyter nbconvert --clear-output --ClearOutputPreprocessor.enabled=True 33 | args: ["--log-level=ERROR"] 34 | - repo: https://github.com/pre-commit/mirrors-prettier 35 | rev: v2.6.2 36 | hooks: 37 | - id: prettier 38 | exclude: ^docs/theme/ 39 | language_version: system 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | 
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Search the FiftyOne Docs with an LLM 2 | 3 | This repository contains the code to enable semantic search on the 4 | [Voxel51 documentation](https://docs.voxel51.com) from Python or the command 5 | line. The search is powered by [FiftyOne](https://github.com/voxel51/fiftyone), 6 | OpenAI's [text-embedding-ada-002 model](https://platform.openai.com/docs/guides/embeddings), and [Qdrant vector search](https://qdrant.tech/). 7 | 8 | !['fiftyone-docs-search-cli'](fiftyone/docs_search/images/cli_example.gif) 9 | 10 | ## Updates 11 | 12 | - **2021-06-14**: The `fiftyone-docs-search` package has been updated in the following ways: 13 | - [FiftyOne Documentation](https://voxel51.com/docs/fiftyone) embeddings have been updated to FiftyOne 0.21.0. 14 | - Splitting of documents is simplified and more robust. LangChain splitters are used in conjunction with our custom Markdown parsing. 
15 | - The `block_type` argument has been removed to make search results more robust. 16 | 17 | ## Installation 18 | 19 | 1. Clone the repository: 20 | 21 | ```shell 22 | git clone https://github.com/voxel51/fiftyone-docs-search 23 | cd fiftyone-docs-search 24 | ``` 25 | 26 | 2. Install the package: 27 | 28 | ```shell 29 | pip install -e . 30 | ``` 31 | 32 | 3. Register your OpenAI API key 33 | ([create one](https://platform.openai.com/account/api-keys)): 34 | 35 | ```shell 36 | export OPENAI_API_KEY=XXXXXXXX 37 | ``` 38 | 39 | 4. Launch a Qdrant server: 40 | 41 | ```shell 42 | docker pull qdrant/qdrant 43 | docker run -d -p 6333:6333 qdrant/qdrant 44 | ``` 45 | 46 | ## Usage 47 | 48 | ### Command line 49 | 50 | The `fiftyone-docs-search` package provides a command line interface for 51 | searching the Voxel51 documentation. To use it, run: 52 | 53 | ```shell 54 | fiftyone-docs-search query 55 | ``` 56 | 57 | where `` is the search query. For example: 58 | 59 | ```shell 60 | fiftyone-docs-search query "how to load a dataset" 61 | ``` 62 | 63 | The following flags can give you control over the search behavior: 64 | 65 | - `--num_results`: the number of results returned 66 | - `--open_url`: whether to open the top result in your browser 67 | - `--score`: whether to return the score of each result 68 | - `--doc_types`: the types of docs to search over (e.g., "tutorials", "api", "guides") 69 | 70 | You can also use the `--help` flag to see all available options: 71 | 72 | ```shell 73 | fiftyone-docs-search --help 74 | ``` 75 | 76 | #### Aliasing the command 77 | 78 | If you find `fiftyone-docs-search query` cumbersome, you can alias the command, by adding the following to your `~/.bashrc` or `~/.zshrc` file: 79 | 80 | ```bash 81 | alias fosearch='fiftyone-docs-search query' 82 | ``` 83 | 84 | ### Python 85 | 86 | !['fiftyone-docs-search-python'](fiftyone/docs_search/images/python_example.gif) 87 | 88 | The `fiftyone-docs-search` package also provides a Python API for 
searching the 89 | Voxel51 documentation. To use it, run: 90 | 91 | ```py 92 | from fiftyone.docs_search import FiftyOneDocsSearch 93 | 94 | fods = FiftyOneDocsSearch() 95 | results = fods("how to load a dataset") 96 | ``` 97 | 98 | You can set defaults for the search behavior by passing arguments to the 99 | constructor: 100 | 101 | ```py 102 | fods = FiftyOneDocsSearch( 103 | num_results=5, 104 | open_url=True, 105 | score=True, 106 | doc_types=["tutorials", "api", "guides"], 107 | ) 108 | ``` 109 | 110 | For any individual search, you can override these defaults by passing arguments. 111 | 112 | ## Versioning 113 | 114 | The `fiftyone-docs-search` package is versioned to match the version of the 115 | Voxel51 FiftyOne documentation that it is searching. For example, the `v0.20.1` 116 | version of the `fiftyone-docs-search` package is designed to search the 117 | `v0.20.1` version of the Voxel51 FiftyOne documentation. 118 | 119 | ## Building the index from scratch 120 | 121 | By default, if you do not have a Qdrant collection instantiated yet, when you 122 | run a search, the `fiftyone-docs-search` package will automatically download 123 | a JSON file containing a vector indexing of the latest version of the Voxel51 124 | FiftyOne documentation. 125 | 126 | If you would like, you can also build the index yourself from a local copy of 127 | the Voxel51 FiftyOne documentation. To do so, first clone the FiftyOne repo if 128 | you haven't already: 129 | 130 | ```shell 131 | git clone https://github.com/voxel51/fiftyone 132 | ``` 133 | 134 | and install FiftyOne, as described in the detailed installation instructions 135 | [here](https://github.com/voxel51/fiftyone#installation-1). 136 | 137 | Build a local version of the docs by running: 138 | 139 | ```shell 140 | bash docs/generate_docs.bash 141 | ``` 142 | 143 | Then, set a `FIFTYONE_DIR` environment variable to the path to the local 144 | FiftyOne repo. 
For example, if you cloned the repo to `~/fiftyone`, you would 145 | run: 146 | 147 | ```shell 148 | export FIFTYONE_DIR=~/fiftyone 149 | ``` 150 | 151 | Finally, run the following command to build the index: 152 | 153 | ```shell 154 | fiftyone-docs-search create 155 | ``` 156 | 157 | If you would like to save the Qdrant index to JSON, you can run: 158 | 159 | ```shell 160 | fiftyone-docs-search save -o 161 | ``` 162 | 163 | ## Contributing 164 | 165 | Contributions are welcome! 166 | 167 | ## About FiftyOne 168 | 169 | If you've made it this far, we'd greatly appreciate if you'd take a moment to 170 | check out [FiftyOne](https://github.com/voxel51/fiftyone) and give us a star! 171 | 172 | FiftyOne is an open source library for building high-quality datasets and 173 | computer vision models. It's the engine that powers this project. 174 | 175 | Thanks for visiting! 😊 176 | 177 | ## Join the Community 178 | 179 | If you want join a fast-growing community of engineers, researchers, and 180 | practitioners who love computer vision, join the 181 | [FiftyOne Slack community](https://slack.voxel51.com/)! 🚀🚀🚀 182 | -------------------------------------------------------------------------------- /fiftyone/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | 3 | # This statement allows multiple `fiftyone.XXX` packages to be installed in the 4 | # same environment and used simultaneously. 
5 | # 6 | # https://docs.python.org/3/library/pkgutil.html#pkgutil.extend_path 7 | # 8 | __path__ = extend_path(__path__, __name__) 9 | 10 | from fiftyone.__public__ import * 11 | -------------------------------------------------------------------------------- /fiftyone/docs_search/__init__.py: -------------------------------------------------------------------------------- 1 | from .query_index import FiftyOneDocsSearch 2 | -------------------------------------------------------------------------------- /fiftyone/docs_search/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Definition of the `fiftyone-docs-search` command-line interface (CLI). 3 | | Copyright 2017-2023, Voxel51, Inc. 4 | | `voxel51.com `_ 5 | | 6 | """ 7 | 8 | import argparse 9 | import argcomplete 10 | import os 11 | 12 | 13 | import fiftyone.docs_search.create_index as dsci 14 | import fiftyone.docs_search.query_index as dsqi 15 | 16 | ################################################################ 17 | 18 | 19 | class Command(object): 20 | """Interface for defining commands. 21 | Command instances must implement the `setup()` method, and they should 22 | implement the `execute()` method if they perform any functionality beyond 23 | defining subparsers. 24 | """ 25 | 26 | @staticmethod 27 | def setup(parser): 28 | """Setup the command-line arguments for the command. 29 | Args: 30 | parser: an `argparse.ArgumentParser` instance 31 | """ 32 | raise NotImplementedError("subclass must implement setup()") 33 | 34 | @staticmethod 35 | def execute(parser, args): 36 | """Executes the command on the given args. 
class CreateIndexCommand(Command):
    """Creates the vector index for the docs.

    Examples::

        fiftyone-docs-search create --name my_name

    """

    @staticmethod
    def setup(parser):
        parser.add_argument(
            "-n",
            "--name",
            metavar="COLLECTION_NAME",
            default="fiftyone_docs",
            help="the name of the Qdrant collection to create",
        )

    @staticmethod
    def execute(parser, args):
        # BUG FIX: `setup()` registers `--name` without a `dest=`, so
        # argparse stores the value as `args.name`. The original read
        # `args.collection_name`, which raises AttributeError at runtime.
        os.environ["FIFTYONE_DOCS_COLLECTION"] = str(args.name)
        dsci.generate_index_from_html_docs()
def str2bool(v):
    """Coerce a command-line flag value to a bool.

    Real bools pass through untouched; common textual spellings are
    matched case-insensitively. Anything else raises
    ``argparse.ArgumentTypeError`` so argparse reports a clean error.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    truthy = {"yes", "true", "t", "y", "1"}
    falsy = {"no", "false", "f", "n", "0"}

    if lowered in truthy:
        return True
    if lowered in falsy:
        return False

    raise argparse.ArgumentTypeError("Boolean value expected.")
-n 10 162 | 163 | """ 164 | 165 | @staticmethod 166 | def setup(parser): 167 | parser.add_argument( 168 | "query", 169 | metavar="QUERY", 170 | nargs="?", 171 | default="How do I load a dataset in FiftyOne?", 172 | help="the query string to search for", 173 | ) 174 | 175 | parser.add_argument( 176 | "-n", 177 | "--num_results", 178 | metavar="NUM_RESULTS", 179 | default=10, 180 | help="the number of results to return", 181 | ) 182 | 183 | parser.add_argument( 184 | "-o", 185 | "--open_url", 186 | metavar="OPEN_URL", 187 | default=True, 188 | type=str2bool, 189 | help="whether to open the first result in a web browser", 190 | ) 191 | 192 | parser.add_argument( 193 | "-s", 194 | "--score", 195 | metavar="SCORE", 196 | default=False, 197 | type=str2bool, 198 | help="whether to print the score of each result", 199 | ) 200 | 201 | parser.add_argument( 202 | "-d", 203 | "--doc_types", 204 | metavar="DOC_TYPES", 205 | default=None, 206 | help="the types of docs to search through", 207 | ) 208 | 209 | @staticmethod 210 | def execute(parser, args): 211 | dsqi.fiftyone_docs_search( 212 | args.query, 213 | top_k=args.num_results, 214 | open_url=args.open_url, 215 | score=args.score, 216 | doc_types=args.doc_types, 217 | ) 218 | 219 | 220 | def _has_subparsers(parser): 221 | for action in parser._actions: 222 | if isinstance(action, argparse._SubParsersAction): 223 | return True 224 | 225 | return False 226 | 227 | 228 | def _iter_subparsers(parser): 229 | for action in parser._actions: 230 | if isinstance(action, argparse._SubParsersAction): 231 | for subparser in action.choices.values(): 232 | yield subparser 233 | 234 | 235 | class _RecursiveHelpAction(argparse._HelpAction): 236 | def __call__(self, parser, *args, **kwargs): 237 | self._recurse(parser) 238 | parser.exit() 239 | 240 | @staticmethod 241 | def _recurse(parser): 242 | print("\n%s\n%s" % ("*" * 79, parser.format_help())) 243 | for subparser in _iter_subparsers(parser): 244 | 
def _register_command(parent, name, command, recursive_help=True):
    """Register ``command`` under ``name`` on the ``parent`` subparsers object.

    The subcommand's short help is the first docstring line; the full
    docstring becomes its description. Returns the created parser.
    """
    doc = command.__doc__
    parser = parent.add_parser(
        name,
        help=doc.splitlines()[0],
        description=doc.rstrip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # Bind late so `execute` receives the subparser it was registered on.
    parser.set_defaults(execute=lambda args: command.execute(parser, args))
    command.setup(parser)

    if recursive_help and _has_subparsers(parser):
        parser.add_argument(
            "--all-help",
            action=_RecursiveHelpAction,
            help="show help recursively and exit",
        )

    return parser
def get_collection_name():
    """Return the Qdrant collection name to operate on.

    Reads the ``FIFTYONE_DOCS_COLLECTION`` environment variable, falling
    back to ``DEFAULT_COLLECTION_NAME`` when it is unset or holds the
    literal string ``"None"`` (written by the CLI via ``str(args.name)``).
    """
    name = os.getenv("FIFTYONE_DOCS_COLLECTION")
    if name in (None, "None"):
        return DEFAULT_COLLECTION_NAME
    return name
def generate_id():
    """Return an id for a Qdrant point: at most 32 decimal digits.

    Derived from a ``uuid1`` rendered as its integer value.
    NOTE(review): uuid1 embeds host/time information and truncation
    weakens uniqueness; uuid4 would be preferable if nothing depends on
    the current scheme — confirm before changing.
    """
    numeric_form = uuid.uuid1().int
    return str(numeric_form)[:32]
| ids = [] 90 | vectors = [] 91 | payloads = [] 92 | 93 | for section_anchor, section_content in subsections.items(): 94 | if section_content == []: 95 | continue 96 | for subsection in section_content: 97 | id, vector, payload = create_subsection_vector( 98 | subsection, 99 | section_anchor, 100 | page_url, 101 | doc_type, 102 | ) 103 | ids.append(id) 104 | vectors.append(vector) 105 | payloads.append(payload) 106 | 107 | add_vectors_to_index(ids, vectors, payloads) 108 | 109 | 110 | ################################################################ 111 | 112 | 113 | def generate_json_from_html_doc(doc): 114 | doc_json = {} 115 | sections = get_markdown_documents(doc) 116 | 117 | if len(sections) == 0: 118 | return 119 | if len(sections) == 1 and None in list(sections.keys()): 120 | return 121 | 122 | page_url = get_page_url(doc) 123 | doc_type = get_doc_type(doc) 124 | 125 | for section_anchor, section in sections.items(): 126 | for subsection_content in section: 127 | if subsection_content == []: 128 | continue 129 | 130 | id, vector, payload = create_subsection_vector( 131 | subsection_content, 132 | section_anchor, 133 | page_url, 134 | doc_type, 135 | ) 136 | doc_json[id] = {"vector": vector, **payload} 137 | 138 | return doc_json 139 | 140 | 141 | def generate_json_from_html_docs(docs_index_file="fiftyone_docs_index.json"): 142 | docs_json = {} 143 | 144 | docs = get_docs_list() 145 | for doc in tqdm(docs): 146 | doc_json = generate_json_from_html_doc(doc) 147 | if doc_json is None: 148 | continue 149 | for id in doc_json: 150 | docs_json[id] = doc_json[id] 151 | 152 | with open(docs_index_file, "w") as f: 153 | json.dump(docs_json, f) 154 | 155 | 156 | ################################################################ 157 | 158 | 159 | def generate_index_from_html_docs(): 160 | initialize_index() 161 | 162 | docs = get_docs_list() 163 | for doc in tqdm(docs): 164 | add_doc_to_index(doc) 165 | 166 | print("Index created successfully!") 167 | 168 | 169 | 
################################################################ 170 | 171 | 172 | def save_index_to_json( 173 | docs_index_file="fiftyone_docs_index.json", batch_size=50 174 | ): 175 | collection_name = get_collection_name() 176 | collection = CLIENT.get_collection(collection_name=collection_name) 177 | num_vectors = collection.points_count 178 | docs_index = {} 179 | 180 | curr_points = CLIENT.scroll( 181 | collection_name=collection_name, 182 | limit=batch_size, 183 | with_payloads=True, 184 | with_vectors=True, 185 | ) 186 | 187 | for i in tqdm(range(0, num_vectors, batch_size)): 188 | min_ind = i * batch_size 189 | 190 | curr_points = CLIENT.scroll( 191 | collection_name=collection_name, 192 | limit=10, 193 | offset=min_ind, 194 | with_payload=True, 195 | with_vectors=True, 196 | )[0] 197 | 198 | for point in curr_points: 199 | docs_index[point.id] = {"vector": point.vector, **point.payload} 200 | 201 | with open(docs_index_file, "w") as f: 202 | json.dump(docs_index, f) 203 | 204 | print(f"Index saved successfully to {docs_index_file}!") 205 | 206 | 207 | ################################################################ 208 | 209 | 210 | def load_index_from_json(docs_index_file=None, batch_size=500): 211 | 212 | initialize_index() 213 | tmp_index_file = ( 214 | docs_index_file 215 | if docs_index_file is not None 216 | else FIFTYONE_DOCS_INDEX_FILENAME 217 | ) 218 | shutil.copyfile(FIFTYONE_DOCS_INDEX_FILEPATH, tmp_index_file) 219 | with open(tmp_index_file, "r") as f: 220 | docs_index = json.load(f) 221 | os.remove(tmp_index_file) 222 | 223 | ids = [] 224 | vectors = [] 225 | payloads = [] 226 | 227 | for id, value in docs_index.items(): 228 | ids.append(id) 229 | vectors.append(value["vector"]) 230 | 231 | payload_keys = ( 232 | "text", 233 | "url", 234 | "section_anchor", 235 | "doc_type", 236 | "block_type", 237 | ) 238 | payload = {key: value[key] for key in payload_keys} 239 | payloads.append(payload) 240 | 241 | for i in tqdm(range(0, len(ids), 
batch_size)): 242 | min_ind = i 243 | max_ind = min(i + batch_size, len(ids)) 244 | 245 | curr_ids = ids[min_ind:max_ind] 246 | curr_vectors = vectors[min_ind:max_ind] 247 | curr_payloads = payloads[min_ind:max_ind] 248 | 249 | add_vectors_to_index(curr_ids, curr_vectors, curr_payloads) 250 | 251 | print("Index created successfully!") 252 | 253 | 254 | ################################################################ 255 | 256 | 257 | def download_index(): 258 | print("Downloading index JSON from Google Drive...") 259 | storage_client = storage.Client.create_anonymous_client() 260 | bucket = storage_client.bucket("fiftyone-docs-search") 261 | blob = bucket.blob("fiftyone_docs_index.json") 262 | 263 | tmp_file = FIFTYONE_DOCS_INDEX_FILENAME 264 | blob.download_to_filename(tmp_file) 265 | 266 | if not os.path.exists(FIFTYONE_DOCS_INDEX_FOLDER): 267 | os.mkdir(FIFTYONE_DOCS_INDEX_FOLDER) 268 | 269 | os.replace(tmp_file, FIFTYONE_DOCS_INDEX_FILEPATH) 270 | -------------------------------------------------------------------------------- /fiftyone/docs_search/images/cli_example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voxel51/fiftyone-docs-search/61d658271cdc8aad243b8f4d4e51a03661eeb166/fiftyone/docs_search/images/cli_example.gif -------------------------------------------------------------------------------- /fiftyone/docs_search/images/python_example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voxel51/fiftyone-docs-search/61d658271cdc8aad243b8f4d4e51a03661eeb166/fiftyone/docs_search/images/python_example.gif -------------------------------------------------------------------------------- /fiftyone/docs_search/query_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | Index querying function declaration. 3 | | Copyright 2017-2023, Voxel51, Inc. 
4 | | `voxel51.com `_ 5 | | 6 | """ 7 | 8 | import os 9 | import qdrant_client as qc 10 | import qdrant_client.http.models as models 11 | from rich import print 12 | import webbrowser 13 | 14 | from fiftyone.docs_search.create_index import ( 15 | download_index, 16 | load_index_from_json, 17 | ) 18 | from fiftyone.docs_search.common import * 19 | 20 | ################################################################ 21 | 22 | 23 | def parse_doc_types(doc_types): 24 | if doc_types is None: 25 | doc_types = DOC_TYPES 26 | elif type(doc_types) == str: 27 | doc_types = [doc_types] 28 | return doc_types 29 | 30 | 31 | ################################################################ 32 | 33 | 34 | def collection_exists(collection_name): 35 | collections = CLIENT.get_collections().collections 36 | collection_names = [collection.name for collection in collections] 37 | return collection_name in collection_names 38 | 39 | 40 | def query_index(query, top_k=10, doc_types=None): 41 | collection_name = get_collection_name() 42 | 43 | if not collection_exists(collection_name): 44 | print(f"Collection {collection_name} does not exist. Creating...") 45 | if not os.path.exists(FIFTYONE_DOCS_INDEX_FILEPATH): 46 | print( 47 | f"Index JSON file {FIFTYONE_DOCS_INDEX_FILEPATH} does not exist." 
48 | ) 49 | download_index() 50 | load_index_from_json() 51 | 52 | vector = embed_text(query) 53 | 54 | _search_params = models.SearchParams(hnsw_ef=128, exact=False) 55 | 56 | doc_types = parse_doc_types(doc_types) 57 | 58 | _filter = models.Filter( 59 | must=[ 60 | models.Filter( 61 | should=[ 62 | models.FieldCondition( 63 | key="doc_type", 64 | match=models.MatchValue(value=dt), 65 | ) 66 | for dt in doc_types 67 | ], 68 | ) 69 | ] 70 | ) 71 | 72 | results = CLIENT.search( 73 | collection_name=collection_name, 74 | query_vector=vector, 75 | query_filter=_filter, 76 | limit=top_k, 77 | with_payload=True, 78 | search_params=_search_params, 79 | ) 80 | 81 | results = [ 82 | ( 83 | f"{res.payload['url']}#{res.payload['section_anchor']}", 84 | res.payload["text"], 85 | res.score, 86 | ) 87 | for res in results 88 | ] 89 | 90 | return results 91 | 92 | 93 | ################################################################ 94 | 95 | 96 | def add_breadcrumbs(): 97 | fo_url = "https://github.com/voxel51/fiftyone" 98 | breadcrumb_string = "" 99 | breadcrumb_string += ( 100 | f"\u2B50 Star the FiftyOne repo! 
\u2B50 {fo_url} \u2B50\n" 101 | ) 102 | return f"{breadcrumb_string : ^40}" 103 | 104 | 105 | def format_string(s): 106 | s = s.replace("\(", "(").replace("\)", ")") 107 | return s 108 | 109 | 110 | def print_results(query, results, score=True): 111 | print("\n" * 3) 112 | print("=" * 80) 113 | str = f"Query: {query}" 114 | print(f"{str: ^80}") 115 | print("=" * 80) 116 | for i in range(len(results)): 117 | result = format_string(results[i][1]) 118 | print(f"{i+1}) {results[i][0]}") 119 | print(f"--> {result}") 120 | if score: 121 | print(f"Score: {results[i][2]}") 122 | print("-" * 80) 123 | print("\n" * 2) 124 | print(add_breadcrumbs()) 125 | 126 | 127 | ################################################################ 128 | 129 | 130 | def fiftyone_docs_search( 131 | query, top_k=10, doc_types=None, score=False, open_url=True 132 | ): 133 | results = query_index( 134 | query, 135 | top_k=top_k, 136 | doc_types=doc_types, 137 | ) 138 | 139 | print_results(query, results, score=score) 140 | if open_url: 141 | top_url = results[0][0] 142 | webbrowser.open(top_url) 143 | 144 | 145 | ################################################################ 146 | 147 | 148 | class FiftyOneDocsSearch: 149 | """Class for handling FiftyOneDocsSearch queries.""" 150 | 151 | def __init__(self, top_k=None, doc_types=None, score=False, open_url=True): 152 | self.default_top_k = top_k 153 | self.default_doc_types = doc_types 154 | self.default_score = score 155 | self.default_open_url = open_url 156 | 157 | def __call__( 158 | self, query, top_k=None, doc_types=None, score=None, open_url=None 159 | ): 160 | args_dict = {} 161 | 162 | if top_k is None: 163 | top_k = self.default_top_k 164 | if top_k is not None: 165 | args_dict["top_k"] = top_k 166 | 167 | if doc_types is None: 168 | doc_types = self.default_doc_types 169 | if doc_types is not None: 170 | args_dict["doc_types"] = doc_types 171 | 172 | if score is None: 173 | score = self.default_score 174 | if score is not None: 175 | 
args_dict["score"] = score 176 | 177 | if open_url is None: 178 | open_url = self.default_open_url 179 | if open_url is not None: 180 | args_dict["open_url"] = open_url 181 | 182 | fiftyone_docs_search(query, **args_dict) 183 | -------------------------------------------------------------------------------- /fiftyone/docs_search/read_docs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Declaration of document reading functions. 3 | | Copyright 2017-2023, Voxel51, Inc. 4 | | `voxel51.com `_ 5 | | 6 | """ 7 | 8 | from glob import glob 9 | import os.path 10 | import os 11 | import re 12 | 13 | from langchain.schema import Document 14 | from langchain.text_splitter import MarkdownTextSplitter 15 | 16 | import fiftyone.core.utils as fou 17 | 18 | md = fou.lazy_import("markdownify") 19 | 20 | ################################################################ 21 | 22 | splitter = MarkdownTextSplitter(chunk_size=1000) 23 | 24 | 25 | def get_docs_list(): 26 | FO_DIR = os.getenv("FIFTYONE_DIR") 27 | FO_HTML_DOCS_DIR = os.path.join(FO_DIR, "docs/build/html") 28 | 29 | all_docs = [] 30 | for pattern in ["*/*.html", "*/*/*.html"]: 31 | all_docs += glob( 32 | os.path.join(FO_HTML_DOCS_DIR, pattern), recursive=True 33 | ) 34 | return [doc for doc in all_docs if "api/" not in doc] 35 | 36 | 37 | ################################################################ 38 | def remove_footer(page_md): 39 | return page_md.split("[Next ![]")[0] 40 | 41 | 42 | def remove_header(page_md): 43 | md_lines = page_md.split("\n") 44 | 45 | body_lines = [] 46 | in_body = False 47 | for mdl in md_lines: 48 | if len(mdl) > 0 and mdl[0] == "#": 49 | in_body = True 50 | if in_body: 51 | body_lines.append(mdl) 52 | page_md = "\n".join(body_lines) 53 | return page_md 54 | 55 | 56 | def remove_extra_newlines(page_md): 57 | lines = page_md.split("\n") 58 | lines = [line for line in lines if line.strip() != "!"] 59 | page_md = "\n".join(lines) 60 | return 
re.sub(r"\n{3,}", "\n\n", page_md) 61 | 62 | 63 | def remove_empty_code_blocks(page_md): 64 | text_and_code = page_md.split("```") 65 | text_blocks = text_and_code[::2] 66 | code_blocks = text_and_code[1::2] 67 | code_blocks = [ 68 | cb 69 | for cb in code_blocks 70 | if len(cb.strip()) > 0 and not set(cb).issubset(set("| -\n")) 71 | ] 72 | 73 | page_md = "" 74 | for tb, cb in zip(text_blocks, code_blocks): 75 | page_md += tb + "```" + cb + "```" 76 | 77 | page_md += text_and_code[-1] 78 | return re.sub(r"```py\s*```", "", page_md, flags=re.MULTILINE) 79 | 80 | 81 | def remove_jupyter_widgets(page_md): 82 | lines = page_md.split("\n") 83 | lines = [ 84 | line 85 | for line in lines 86 | if len(line) == 0 or (line[0] != "{" and "jupyter-widgets" not in line) 87 | ] 88 | return "\n".join(lines) 89 | 90 | 91 | def remove_xml(page_md): 92 | lines = page_md.split("\n") 93 | lines = [line for line in lines if not line.startswith(" 0 and not set(cb).issubset(set("| -")) 145 | ] 146 | 147 | page_md = "" 148 | for tb, cb in zip(text_blocks, code_blocks): 149 | page_md += tb + "```py" + cb + "```" 150 | 151 | page_md += text_and_code[-1] 152 | return page_md 153 | 154 | 155 | def merge_adjacent_code_blocks(page_md): 156 | pattern = r"```\n```py" 157 | page_md = re.sub(pattern, "", page_md) 158 | return re.sub(r"```py\n```py", r"```py", page_md) 159 | 160 | 161 | def remove_bad_elements(page_md): 162 | pattern = r"\(function\(\) {[\s\S]*?}\)\(\);" 163 | page_md = re.sub(pattern, "", page_md, flags=re.MULTILINE) 164 | 165 | lines = page_md.split("\n") 166 | lines = [line for line in lines if not line.startswith("@import")] 167 | 168 | bad_keywords = [ 169 | "#focontainer", 170 | "#fooverlay", 171 | "#foactivate", 172 | ] 173 | 174 | good_lines = [] 175 | flag = True 176 | for line in lines: 177 | if any([keyword in line for keyword in bad_keywords]): 178 | flag = False 179 | if flag: 180 | good_lines.append(line) 181 | if "}" in line and not flag: 182 | flag = True 183 | 184 | 
return "\n".join(good_lines) 185 | 186 | 187 | def remove_links(page_md): 188 | match = re.search("\[.*?\]\(.*?\)", page_md) 189 | if match is not None: 190 | start, end = match.span() 191 | link = page_md[start:end] 192 | link_text = link[1:].split("]")[0] 193 | if link_text != "¶": 194 | return page_md[:start] + link_text + remove_links(page_md[end:]) 195 | else: 196 | return page_md[:end] + link + remove_links(page_md[end:]) 197 | return page_md 198 | 199 | 200 | def reformat_markdown(page_md): 201 | page_md = page_md.replace("\_", "_").replace("\*", "*") 202 | page_md = remove_links(page_md) 203 | page_md = remove_images(page_md) 204 | page_md = remove_jupyter_widgets(page_md) 205 | page_md = remove_xml(page_md) 206 | page_md = remove_extra_newlines(page_md) 207 | page_md = remove_bad_elements(page_md) 208 | page_md = remove_code_cell_vestiges(page_md) 209 | return page_md 210 | 211 | 212 | def remove_unicode(page_md): 213 | for uchar in ["\u2500", "\u2514", "\u251c", "\u2502"]: 214 | page_md = page_md.replace(uchar, "") 215 | for uchar in ["\u2588", "\u2019"]: 216 | page_md = page_md.replace(uchar, "'") 217 | for uchar in ["\u201d", "\u201c"]: 218 | page_md = page_md.replace(uchar, '"') 219 | page_md = page_md.replace("\u00a9", "copyright") 220 | return page_md 221 | 222 | 223 | def parse_page_markdown(page_md): 224 | page_md = remove_header(page_md) 225 | page_md = remove_footer(page_md) 226 | page_md = remove_line_numbers(page_md) 227 | page_md = remove_table_rows(page_md) 228 | page_md = remove_empty_code_blocks(page_md) 229 | page_md = add_syntax_highlight_to_code_blocks(page_md) 230 | page_md = merge_adjacent_code_blocks(page_md) 231 | 232 | ## reformat now that the markdown is clean 233 | page_md = reformat_markdown(page_md) 234 | page_md = remove_empty_code_blocks(page_md) 235 | page_md = remove_extra_newlines(page_md) 236 | page_md = remove_unicode(page_md) 237 | return page_md 238 | 239 | 240 | def get_page_markdown(filepath): 241 | with 
open(filepath) as f: 242 | page_html = f.read() 243 | 244 | page_md = md.markdownify(page_html, heading_style="ATX") 245 | page_md = parse_page_markdown(page_md) 246 | 247 | return page_md 248 | 249 | 250 | def split_at_anchors(page_md): 251 | md_lines = page_md.split("\n") 252 | md_sections = {} 253 | curr_anchor = None 254 | curr_section = [] 255 | for line in md_lines: 256 | if "Permalink" in line: 257 | if curr_anchor is not None: 258 | md_sections[curr_anchor] = "\n".join(curr_section) 259 | curr_section = [] 260 | curr_anchor = line.split('"Permalink')[0].split("#")[-1].strip() 261 | else: 262 | curr_section.append(line) 263 | 264 | md_sections[curr_anchor] = "\n".join(curr_section) 265 | return md_sections 266 | 267 | 268 | def split_section_into_chunks(text): 269 | document = Document(page_content=text) 270 | documents = splitter.split_documents([document]) 271 | return [d.page_content for d in documents] 272 | 273 | 274 | def split_page_into_chunks(page_md): 275 | md_sections = split_at_anchors(page_md) 276 | chunks = {} 277 | for anchor, section in md_sections.items(): 278 | chunks[anchor] = split_section_into_chunks(section) 279 | 280 | return chunks 281 | 282 | 283 | def get_markdown_documents(filepath): 284 | page_md = get_page_markdown(filepath) 285 | return split_page_into_chunks(page_md) 286 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 3 | include = '\.pyi?$' 4 | exclude = ''' 5 | /( 6 | | \.git 7 | )/ 8 | ''' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argcomplete==1.11.0 2 | google-cloud-storage>=2.8.0 3 | langchain>=0.0.179 4 | markdownify>=0.11.6 5 | openai>=0.27.2,<1.0.0 6 | qdrant-client>=1.1.1 7 | packaging==20.3 8 | 
pre-commit>=2.18.1 9 | regex>=2022.8.17 10 | rich>=12.6.0 11 | setuptools>=45.2.0 12 | tqdm>=4.64.1 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Installs FiftyOne Docs Search. 4 | | Copyright 2017-2023, Voxel51, Inc. 5 | | `voxel51.com `_ 6 | | 7 | """ 8 | from setuptools import setup, find_packages 9 | 10 | INSTALL_REQUIRES = [ 11 | "argcomplete", 12 | "google-cloud-storage", 13 | "langchain", 14 | "markdownify", 15 | "openai", 16 | "packaging", 17 | "qdrant-client", 18 | "regex", 19 | "rich", 20 | "setuptools", 21 | "tqdm", 22 | ] 23 | 24 | with open("README.md", "r") as fh: 25 | description = fh.read() 26 | 27 | setup( 28 | name="fiftyone-docs-search", 29 | version="0.21.0", 30 | author="Voxel51, Inc.", 31 | author_email="info@voxel51.com", 32 | packages=find_packages(), 33 | description="Semantic search for the FiftyOne Docs from command line", 34 | long_description=description, 35 | long_description_content_type="text/markdown", 36 | url="https://github.com/voxel51/fiftyone-docs-search", 37 | license="Apache", 38 | python_requires=">=3.8", 39 | install_requires=INSTALL_REQUIRES, 40 | include_package_data=True, 41 | classifiers=[ 42 | "Development Status :: 4 - Beta", 43 | "Intended Audience :: Developers", 44 | "Intended Audience :: Science/Research", 45 | "License :: OSI Approved :: Apache Software License", 46 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 47 | "Topic :: Scientific/Engineering :: Image Processing", 48 | "Topic :: Scientific/Engineering :: Image Recognition", 49 | "Topic :: Scientific/Engineering :: Information Analysis", 50 | "Topic :: Scientific/Engineering :: Visualization", 51 | "Operating System :: MacOS :: MacOS X", 52 | "Operating System :: POSIX :: Linux", 53 | "Operating System :: Microsoft :: Windows", 54 | "Programming Language :: Python :: 3", 55 
| "Programming Language :: Python :: 3.7", 56 | "Programming Language :: Python :: 3.8", 57 | "Programming Language :: Python :: 3.9", 58 | "Programming Language :: Python :: 3.10", 59 | ], 60 | entry_points={ 61 | "console_scripts": [ 62 | "fiftyone-docs-search=fiftyone.docs_search.cli:main" 63 | ] 64 | }, 65 | ) 66 | --------------------------------------------------------------------------------