├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── fiftyone ├── __init__.py └── docs_search │ ├── __init__.py │ ├── cli.py │ ├── common.py │ ├── create_index.py │ ├── images │ ├── cli_example.gif │ └── python_example.gif │ ├── query_index.py │ └── read_docs.py ├── pyproject.toml ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | __pycache__ 4 | 5 | *.egg-info 6 | 7 | **/*.DS_store 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/asottile/blacken-docs 3 | rev: v1.12.0 4 | hooks: 5 | - id: blacken-docs 6 | additional_dependencies: [black==21.12b0] 7 | args: ["-l 79"] 8 | exclude: ^docs/theme/|\.rst$ 9 | - repo: https://github.com/ambv/black 10 | rev: 22.3.0 11 | hooks: 12 | - id: black 13 | language_version: python3 14 | args: ["-l 79"] 15 | exclude: ^docs/theme/ 16 | - repo: local 17 | hooks: 18 | - id: pylint 19 | name: pylint 20 | language: system 21 | files: \.py$ 22 | entry: pylint 23 | args: ["--errors-only"] 24 | exclude: ^(docs/theme/|app/) 25 | - repo: local 26 | hooks: 27 | - id: ipynb-strip 28 | name: ipynb-strip 29 | language: system 30 | files: \.ipynb$ 31 | exclude: ^docs/ # *do* commit ipynb outputs in `docs/` 32 | entry: jupyter nbconvert --clear-output --ClearOutputPreprocessor.enabled=True 33 | args: ["--log-level=ERROR"] 34 | - repo: https://github.com/pre-commit/mirrors-prettier 35 | rev: v2.6.2 36 | hooks: 37 | - id: prettier 38 | exclude: ^docs/theme/ 39 | language_version: system 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | 
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Search the FiftyOne Docs with an LLM 2 | 3 | This repository contains the code to enable semantic search on the 4 | [Voxel51 documentation](https://docs.voxel51.com) from Python or the command 5 | line. The search is powered by [FiftyOne](https://github.com/voxel51/fiftyone), 6 | OpenAI's [text-embedding-ada-002 model](https://platform.openai.com/docs/guides/embeddings), and [Qdrant vector search](https://qdrant.tech/). 7 | 8 | !['fiftyone-docs-search-cli'](fiftyone/docs_search/images/cli_example.gif) 9 | 10 | ## Updates 11 | 12 | - **2021-06-14**: The `fiftyone-docs-search` package has been updated in the following ways: 13 | - [FiftyOne Documentation](https://voxel51.com/docs/fiftyone) embeddings have been updated to FiftyOne 0.21.0. 14 | - Splitting of documents is simplified and more robust. LangChain splitters are used in conjunction with our custom Markdown parsing. 
15 | - The `block_type` argument has been removed to make search results more robust. 16 | 17 | ## Installation 18 | 19 | 1. Clone the repository: 20 | 21 | ```shell 22 | git clone https://github.com/voxel51/fiftyone-docs-search 23 | cd fiftyone-docs-search 24 | ``` 25 | 26 | 2. Install the package: 27 | 28 | ```shell 29 | pip install -e . 30 | ``` 31 | 32 | 3. Register your OpenAI API key 33 | ([create one](https://platform.openai.com/account/api-keys)): 34 | 35 | ```shell 36 | export OPENAI_API_KEY=XXXXXXXX 37 | ``` 38 | 39 | 4. Launch a Qdrant server: 40 | 41 | ```shell 42 | docker pull qdrant/qdrant 43 | docker run -d -p 6333:6333 qdrant/qdrant 44 | ``` 45 | 46 | ## Usage 47 | 48 | ### Command line 49 | 50 | The `fiftyone-docs-search` package provides a command line interface for 51 | searching the Voxel51 documentation. To use it, run: 52 | 53 | ```shell 54 | fiftyone-docs-search query 55 | ``` 56 | 57 | where `` is the search query. For example: 58 | 59 | ```shell 60 | fiftyone-docs-search query "how to load a dataset" 61 | ``` 62 | 63 | The following flags can give you control over the search behavior: 64 | 65 | - `--num_results`: the number of results returned 66 | - `--open_url`: whether to open the top result in your browser 67 | - `--score`: whether to return the score of each result 68 | - `--doc_types`: the types of docs to search over (e.g., "tutorials", "api", "guides") 69 | 70 | You can also use the `--help` flag to see all available options: 71 | 72 | ```shell 73 | fiftyone-docs-search --help 74 | ``` 75 | 76 | #### Aliasing the command 77 | 78 | If you find `fiftyone-docs-search query` cumbersome, you can alias the command, by adding the following to your `~/.bashrc` or `~/.zshrc` file: 79 | 80 | ```bash 81 | alias fosearch='fiftyone-docs-search query' 82 | ``` 83 | 84 | ### Python 85 | 86 | !['fiftyone-docs-search-python'](fiftyone/docs_search/images/python_example.gif) 87 | 88 | The `fiftyone-docs-search` package also provides a Python API for 
searching the 89 | Voxel51 documentation. To use it, run: 90 | 91 | ```py 92 | from fiftyone.docs_search import FiftyOneDocsSearch 93 | 94 | fods = FiftyOneDocsSearch() 95 | results = fods("how to load a dataset") 96 | ``` 97 | 98 | You can set defaults for the search behavior by passing arguments to the 99 | constructor: 100 | 101 | ```py 102 | fods = FiftyOneDocsSearch( 103 | num_results=5, 104 | open_url=True, 105 | score=True, 106 | doc_types=["tutorials", "api", "guides"], 107 | ) 108 | ``` 109 | 110 | For any individual search, you can override these defaults by passing arguments. 111 | 112 | ## Versioning 113 | 114 | The `fiftyone-docs-search` package is versioned to match the version of the 115 | Voxel51 FiftyOne documentation that it is searching. For example, the `v0.20.1` 116 | version of the `fiftyone-docs-search` package is designed to search the 117 | `v0.20.1` version of the Voxel51 FiftyOne documentation. 118 | 119 | ## Building the index from scratch 120 | 121 | By default, if you do not have a Qdrant collection instantiated yet, when you 122 | run a search, the `fiftyone-docs-search` package will automatically download 123 | a JSON file containing a vector indexing of the latest version of the Voxel51 124 | FiftyOne documentation. 125 | 126 | If you would like, you can also build the index yourself from a local copy of 127 | the Voxel51 FiftyOne documentation. To do so, first clone the FiftyOne repo if 128 | you haven't already: 129 | 130 | ```shell 131 | git clone https://github.com/voxel51/fiftyone 132 | ``` 133 | 134 | and install FiftyOne, as described in the detailed installation instructions 135 | [here](https://github.com/voxel51/fiftyone#installation-1). 136 | 137 | Build a local version of the docs by running: 138 | 139 | ```shell 140 | bash docs/generate_docs.bash 141 | ``` 142 | 143 | Then, set a `FIFTYONE_DIR` environment variable to the path to the local 144 | FiftyOne repo. 
For example, if you cloned the repo to `~/fiftyone`, you would 145 | run: 146 | 147 | ```shell 148 | export FIFTYONE_DIR=~/fiftyone 149 | ``` 150 | 151 | Finally, run the following command to build the index: 152 | 153 | ```shell 154 | fiftyone-docs-search create 155 | ``` 156 | 157 | If you would like to save the Qdrant index to JSON, you can run: 158 | 159 | ```shell 160 | fiftyone-docs-search save -o 161 | ``` 162 | 163 | ## Contributing 164 | 165 | Contributions are welcome! 166 | 167 | ## About FiftyOne 168 | 169 | If you've made it this far, we'd greatly appreciate if you'd take a moment to 170 | check out [FiftyOne](https://github.com/voxel51/fiftyone) and give us a star! 171 | 172 | FiftyOne is an open source library for building high-quality datasets and 173 | computer vision models. It's the engine that powers this project. 174 | 175 | Thanks for visiting! 😊 176 | 177 | ## Join the Community 178 | 179 | If you want join a fast-growing community of engineers, researchers, and 180 | practitioners who love computer vision, join the 181 | [FiftyOne Slack community](https://slack.voxel51.com/)! 🚀🚀🚀 182 | -------------------------------------------------------------------------------- /fiftyone/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | 3 | # This statement allows multiple `fiftyone.XXX` packages to be installed in the 4 | # same environment and used simultaneously. 
5 | # 6 | # https://docs.python.org/3/library/pkgutil.html#pkgutil.extend_path 7 | # 8 | __path__ = extend_path(__path__, __name__) 9 | 10 | from fiftyone.__public__ import * 11 | -------------------------------------------------------------------------------- /fiftyone/docs_search/__init__.py: -------------------------------------------------------------------------------- 1 | from .query_index import FiftyOneDocsSearch 2 | -------------------------------------------------------------------------------- /fiftyone/docs_search/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Definition of the `fiftyone-docs-search` command-line interface (CLI). 3 | | Copyright 2017-2023, Voxel51, Inc. 4 | | `voxel51.com `_ 5 | | 6 | """ 7 | 8 | import argparse 9 | import argcomplete 10 | import os 11 | 12 | 13 | import fiftyone.docs_search.create_index as dsci 14 | import fiftyone.docs_search.query_index as dsqi 15 | 16 | ################################################################ 17 | 18 | 19 | class Command(object): 20 | """Interface for defining commands. 21 | Command instances must implement the `setup()` method, and they should 22 | implement the `execute()` method if they perform any functionality beyond 23 | defining subparsers. 24 | """ 25 | 26 | @staticmethod 27 | def setup(parser): 28 | """Setup the command-line arguments for the command. 29 | Args: 30 | parser: an `argparse.ArgumentParser` instance 31 | """ 32 | raise NotImplementedError("subclass must implement setup()") 33 | 34 | @staticmethod 35 | def execute(parser, args): 36 | """Executes the command on the given args. 
class CreateIndexCommand(Command):
    """Creates the vector index for the docs.

    Examples::

        fiftyone-docs-search create --name my_name

    """

    @staticmethod
    def setup(parser):
        parser.add_argument(
            "-n",
            "--name",
            metavar="COLLECTION_NAME",
            default="fiftyone_docs",
            help="the name of the Qdrant collection to create",
        )

    @staticmethod
    def execute(parser, args):
        # BUG FIX: `setup()` registers `--name` without a `dest=`, so
        # argparse stores the value as `args.name`. The original read
        # `args.collection_name`, which raises AttributeError at runtime.
        os.environ["FIFTYONE_DOCS_COLLECTION"] = str(args.name)
        dsci.generate_index_from_html_docs()
def str2bool(v):
    """Coerce a command-line flag value to a bool.

    Real bools pass through untouched; common textual spellings are
    matched case-insensitively. Anything else raises
    ``argparse.ArgumentTypeError`` so argparse reports a clean error.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    truthy = {"yes", "true", "t", "y", "1"}
    falsy = {"no", "false", "f", "n", "0"}

    if lowered in truthy:
        return True
    if lowered in falsy:
        return False

    raise argparse.ArgumentTypeError("Boolean value expected.")
-n 10 162 | 163 | """ 164 | 165 | @staticmethod 166 | def setup(parser): 167 | parser.add_argument( 168 | "query", 169 | metavar="QUERY", 170 | nargs="?", 171 | default="How do I load a dataset in FiftyOne?", 172 | help="the query string to search for", 173 | ) 174 | 175 | parser.add_argument( 176 | "-n", 177 | "--num_results", 178 | metavar="NUM_RESULTS", 179 | default=10, 180 | help="the number of results to return", 181 | ) 182 | 183 | parser.add_argument( 184 | "-o", 185 | "--open_url", 186 | metavar="OPEN_URL", 187 | default=True, 188 | type=str2bool, 189 | help="whether to open the first result in a web browser", 190 | ) 191 | 192 | parser.add_argument( 193 | "-s", 194 | "--score", 195 | metavar="SCORE", 196 | default=False, 197 | type=str2bool, 198 | help="whether to print the score of each result", 199 | ) 200 | 201 | parser.add_argument( 202 | "-d", 203 | "--doc_types", 204 | metavar="DOC_TYPES", 205 | default=None, 206 | help="the types of docs to search through", 207 | ) 208 | 209 | @staticmethod 210 | def execute(parser, args): 211 | dsqi.fiftyone_docs_search( 212 | args.query, 213 | top_k=args.num_results, 214 | open_url=args.open_url, 215 | score=args.score, 216 | doc_types=args.doc_types, 217 | ) 218 | 219 | 220 | def _has_subparsers(parser): 221 | for action in parser._actions: 222 | if isinstance(action, argparse._SubParsersAction): 223 | return True 224 | 225 | return False 226 | 227 | 228 | def _iter_subparsers(parser): 229 | for action in parser._actions: 230 | if isinstance(action, argparse._SubParsersAction): 231 | for subparser in action.choices.values(): 232 | yield subparser 233 | 234 | 235 | class _RecursiveHelpAction(argparse._HelpAction): 236 | def __call__(self, parser, *args, **kwargs): 237 | self._recurse(parser) 238 | parser.exit() 239 | 240 | @staticmethod 241 | def _recurse(parser): 242 | print("\n%s\n%s" % ("*" * 79, parser.format_help())) 243 | for subparser in _iter_subparsers(parser): 244 | 
def _register_command(parent, name, command, recursive_help=True):
    """Register ``command`` under ``name`` on the ``parent`` subparsers object.

    The subcommand's short help is the first docstring line; the full
    docstring becomes its description. Returns the created parser.
    """
    doc = command.__doc__
    parser = parent.add_parser(
        name,
        help=doc.splitlines()[0],
        description=doc.rstrip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # Bind late so `execute` receives the subparser it was registered on.
    parser.set_defaults(execute=lambda args: command.execute(parser, args))
    command.setup(parser)

    if recursive_help and _has_subparsers(parser):
        parser.add_argument(
            "--all-help",
            action=_RecursiveHelpAction,
            help="show help recursively and exit",
        )

    return parser
def get_collection_name():
    """Return the Qdrant collection name to operate on.

    Reads the ``FIFTYONE_DOCS_COLLECTION`` environment variable, falling
    back to ``DEFAULT_COLLECTION_NAME`` when it is unset or holds the
    literal string ``"None"`` (written by the CLI via ``str(args.name)``).
    """
    name = os.getenv("FIFTYONE_DOCS_COLLECTION")
    if name in (None, "None"):
        return DEFAULT_COLLECTION_NAME
    return name
def generate_id():
    """Return an id for a Qdrant point: at most 32 decimal digits.

    Derived from a ``uuid1`` rendered as its integer value.
    NOTE(review): uuid1 embeds host/time information and truncation
    weakens uniqueness; uuid4 would be preferable if nothing depends on
    the current scheme — confirm before changing.
    """
    numeric_form = uuid.uuid1().int
    return str(numeric_form)[:32]
| ids = [] 90 | vectors = [] 91 | payloads = [] 92 | 93 | for section_anchor, section_content in subsections.items(): 94 | if section_content == []: 95 | continue 96 | for subsection in section_content: 97 | id, vector, payload = create_subsection_vector( 98 | subsection, 99 | section_anchor, 100 | page_url, 101 | doc_type, 102 | ) 103 | ids.append(id) 104 | vectors.append(vector) 105 | payloads.append(payload) 106 | 107 | add_vectors_to_index(ids, vectors, payloads) 108 | 109 | 110 | ################################################################ 111 | 112 | 113 | def generate_json_from_html_doc(doc): 114 | doc_json = {} 115 | sections = get_markdown_documents(doc) 116 | 117 | if len(sections) == 0: 118 | return 119 | if len(sections) == 1 and None in list(sections.keys()): 120 | return 121 | 122 | page_url = get_page_url(doc) 123 | doc_type = get_doc_type(doc) 124 | 125 | for section_anchor, section in sections.items(): 126 | for subsection_content in section: 127 | if subsection_content == []: 128 | continue 129 | 130 | id, vector, payload = create_subsection_vector( 131 | subsection_content, 132 | section_anchor, 133 | page_url, 134 | doc_type, 135 | ) 136 | doc_json[id] = {"vector": vector, **payload} 137 | 138 | return doc_json 139 | 140 | 141 | def generate_json_from_html_docs(docs_index_file="fiftyone_docs_index.json"): 142 | docs_json = {} 143 | 144 | docs = get_docs_list() 145 | for doc in tqdm(docs): 146 | doc_json = generate_json_from_html_doc(doc) 147 | if doc_json is None: 148 | continue 149 | for id in doc_json: 150 | docs_json[id] = doc_json[id] 151 | 152 | with open(docs_index_file, "w") as f: 153 | json.dump(docs_json, f) 154 | 155 | 156 | ################################################################ 157 | 158 | 159 | def generate_index_from_html_docs(): 160 | initialize_index() 161 | 162 | docs = get_docs_list() 163 | for doc in tqdm(docs): 164 | add_doc_to_index(doc) 165 | 166 | print("Index created successfully!") 167 | 168 | 169 | 
################################################################ 170 | 171 | 172 | def save_index_to_json( 173 | docs_index_file="fiftyone_docs_index.json", batch_size=50 174 | ): 175 | collection_name = get_collection_name() 176 | collection = CLIENT.get_collection(collection_name=collection_name) 177 | num_vectors = collection.points_count 178 | docs_index = {} 179 | 180 | curr_points = CLIENT.scroll( 181 | collection_name=collection_name, 182 | limit=batch_size, 183 | with_payloads=True, 184 | with_vectors=True, 185 | ) 186 | 187 | for i in tqdm(range(0, num_vectors, batch_size)): 188 | min_ind = i * batch_size 189 | 190 | curr_points = CLIENT.scroll( 191 | collection_name=collection_name, 192 | limit=10, 193 | offset=min_ind, 194 | with_payload=True, 195 | with_vectors=True, 196 | )[0] 197 | 198 | for point in curr_points: 199 | docs_index[point.id] = {"vector": point.vector, **point.payload} 200 | 201 | with open(docs_index_file, "w") as f: 202 | json.dump(docs_index, f) 203 | 204 | print(f"Index saved successfully to {docs_index_file}!") 205 | 206 | 207 | ################################################################ 208 | 209 | 210 | def load_index_from_json(docs_index_file=None, batch_size=500): 211 | 212 | initialize_index() 213 | tmp_index_file = ( 214 | docs_index_file 215 | if docs_index_file is not None 216 | else FIFTYONE_DOCS_INDEX_FILENAME 217 | ) 218 | shutil.copyfile(FIFTYONE_DOCS_INDEX_FILEPATH, tmp_index_file) 219 | with open(tmp_index_file, "r") as f: 220 | docs_index = json.load(f) 221 | os.remove(tmp_index_file) 222 | 223 | ids = [] 224 | vectors = [] 225 | payloads = [] 226 | 227 | for id, value in docs_index.items(): 228 | ids.append(id) 229 | vectors.append(value["vector"]) 230 | 231 | payload_keys = ( 232 | "text", 233 | "url", 234 | "section_anchor", 235 | "doc_type", 236 | "block_type", 237 | ) 238 | payload = {key: value[key] for key in payload_keys} 239 | payloads.append(payload) 240 | 241 | for i in tqdm(range(0, len(ids), 
batch_size)): 242 | min_ind = i 243 | max_ind = min(i + batch_size, len(ids)) 244 | 245 | curr_ids = ids[min_ind:max_ind] 246 | curr_vectors = vectors[min_ind:max_ind] 247 | curr_payloads = payloads[min_ind:max_ind] 248 | 249 | add_vectors_to_index(curr_ids, curr_vectors, curr_payloads) 250 | 251 | print("Index created successfully!") 252 | 253 | 254 | ################################################################ 255 | 256 | 257 | def download_index(): 258 | print("Downloading index JSON from Google Drive...") 259 | storage_client = storage.Client.create_anonymous_client() 260 | bucket = storage_client.bucket("fiftyone-docs-search") 261 | blob = bucket.blob("fiftyone_docs_index.json") 262 | 263 | tmp_file = FIFTYONE_DOCS_INDEX_FILENAME 264 | blob.download_to_filename(tmp_file) 265 | 266 | if not os.path.exists(FIFTYONE_DOCS_INDEX_FOLDER): 267 | os.mkdir(FIFTYONE_DOCS_INDEX_FOLDER) 268 | 269 | os.replace(tmp_file, FIFTYONE_DOCS_INDEX_FILEPATH) 270 | -------------------------------------------------------------------------------- /fiftyone/docs_search/images/cli_example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voxel51/fiftyone-docs-search/61d658271cdc8aad243b8f4d4e51a03661eeb166/fiftyone/docs_search/images/cli_example.gif -------------------------------------------------------------------------------- /fiftyone/docs_search/images/python_example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voxel51/fiftyone-docs-search/61d658271cdc8aad243b8f4d4e51a03661eeb166/fiftyone/docs_search/images/python_example.gif -------------------------------------------------------------------------------- /fiftyone/docs_search/query_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | Index querying function declaration. 3 | | Copyright 2017-2023, Voxel51, Inc. 
4 | | `voxel51.com `_ 5 | | 6 | """ 7 | 8 | import os 9 | import qdrant_client as qc 10 | import qdrant_client.http.models as models 11 | from rich import print 12 | import webbrowser 13 | 14 | from fiftyone.docs_search.create_index import ( 15 | download_index, 16 | load_index_from_json, 17 | ) 18 | from fiftyone.docs_search.common import * 19 | 20 | ################################################################ 21 | 22 | 23 | def parse_doc_types(doc_types): 24 | if doc_types is None: 25 | doc_types = DOC_TYPES 26 | elif type(doc_types) == str: 27 | doc_types = [doc_types] 28 | return doc_types 29 | 30 | 31 | ################################################################ 32 | 33 | 34 | def collection_exists(collection_name): 35 | collections = CLIENT.get_collections().collections 36 | collection_names = [collection.name for collection in collections] 37 | return collection_name in collection_names 38 | 39 | 40 | def query_index(query, top_k=10, doc_types=None): 41 | collection_name = get_collection_name() 42 | 43 | if not collection_exists(collection_name): 44 | print(f"Collection {collection_name} does not exist. Creating...") 45 | if not os.path.exists(FIFTYONE_DOCS_INDEX_FILEPATH): 46 | print( 47 | f"Index JSON file {FIFTYONE_DOCS_INDEX_FILEPATH} does not exist." 
48 | ) 49 | download_index() 50 | load_index_from_json() 51 | 52 | vector = embed_text(query) 53 | 54 | _search_params = models.SearchParams(hnsw_ef=128, exact=False) 55 | 56 | doc_types = parse_doc_types(doc_types) 57 | 58 | _filter = models.Filter( 59 | must=[ 60 | models.Filter( 61 | should=[ 62 | models.FieldCondition( 63 | key="doc_type", 64 | match=models.MatchValue(value=dt), 65 | ) 66 | for dt in doc_types 67 | ], 68 | ) 69 | ] 70 | ) 71 | 72 | results = CLIENT.search( 73 | collection_name=collection_name, 74 | query_vector=vector, 75 | query_filter=_filter, 76 | limit=top_k, 77 | with_payload=True, 78 | search_params=_search_params, 79 | ) 80 | 81 | results = [ 82 | ( 83 | f"{res.payload['url']}#{res.payload['section_anchor']}", 84 | res.payload["text"], 85 | res.score, 86 | ) 87 | for res in results 88 | ] 89 | 90 | return results 91 | 92 | 93 | ################################################################ 94 | 95 | 96 | def add_breadcrumbs(): 97 | fo_url = "https://github.com/voxel51/fiftyone" 98 | breadcrumb_string = "" 99 | breadcrumb_string += ( 100 | f"\u2B50 Star the FiftyOne repo! 
\u2B50 {fo_url} \u2B50\n" 101 | ) 102 | return f"{breadcrumb_string : ^40}" 103 | 104 | 105 | def format_string(s): 106 | s = s.replace("\(", "(").replace("\)", ")") 107 | return s 108 | 109 | 110 | def print_results(query, results, score=True): 111 | print("\n" * 3) 112 | print("=" * 80) 113 | str = f"Query: {query}" 114 | print(f"{str: ^80}") 115 | print("=" * 80) 116 | for i in range(len(results)): 117 | result = format_string(results[i][1]) 118 | print(f"{i+1}) {results[i][0]}") 119 | print(f"--> {result}") 120 | if score: 121 | print(f"Score: {results[i][2]}") 122 | print("-" * 80) 123 | print("\n" * 2) 124 | print(add_breadcrumbs()) 125 | 126 | 127 | ################################################################ 128 | 129 | 130 | def fiftyone_docs_search( 131 | query, top_k=10, doc_types=None, score=False, open_url=True 132 | ): 133 | results = query_index( 134 | query, 135 | top_k=top_k, 136 | doc_types=doc_types, 137 | ) 138 | 139 | print_results(query, results, score=score) 140 | if open_url: 141 | top_url = results[0][0] 142 | webbrowser.open(top_url) 143 | 144 | 145 | ################################################################ 146 | 147 | 148 | class FiftyOneDocsSearch: 149 | """Class for handling FiftyOneDocsSearch queries.""" 150 | 151 | def __init__(self, top_k=None, doc_types=None, score=False, open_url=True): 152 | self.default_top_k = top_k 153 | self.default_doc_types = doc_types 154 | self.default_score = score 155 | self.default_open_url = open_url 156 | 157 | def __call__( 158 | self, query, top_k=None, doc_types=None, score=None, open_url=None 159 | ): 160 | args_dict = {} 161 | 162 | if top_k is None: 163 | top_k = self.default_top_k 164 | if top_k is not None: 165 | args_dict["top_k"] = top_k 166 | 167 | if doc_types is None: 168 | doc_types = self.default_doc_types 169 | if doc_types is not None: 170 | args_dict["doc_types"] = doc_types 171 | 172 | if score is None: 173 | score = self.default_score 174 | if score is not None: 175 | 
args_dict["score"] = score 176 | 177 | if open_url is None: 178 | open_url = self.default_open_url 179 | if open_url is not None: 180 | args_dict["open_url"] = open_url 181 | 182 | fiftyone_docs_search(query, **args_dict) 183 | -------------------------------------------------------------------------------- /fiftyone/docs_search/read_docs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Declaration of document reading functions. 3 | | Copyright 2017-2023, Voxel51, Inc. 4 | | `voxel51.com `_ 5 | | 6 | """ 7 | 8 | from glob import glob 9 | import os.path 10 | import os 11 | import re 12 | 13 | from langchain.schema import Document 14 | from langchain.text_splitter import MarkdownTextSplitter 15 | 16 | import fiftyone.core.utils as fou 17 | 18 | md = fou.lazy_import("markdownify") 19 | 20 | ################################################################ 21 | 22 | splitter = MarkdownTextSplitter(chunk_size=1000) 23 | 24 | 25 | def get_docs_list(): 26 | FO_DIR = os.getenv("FIFTYONE_DIR") 27 | FO_HTML_DOCS_DIR = os.path.join(FO_DIR, "docs/build/html") 28 | 29 | all_docs = [] 30 | for pattern in ["*/*.html", "*/*/*.html"]: 31 | all_docs += glob( 32 | os.path.join(FO_HTML_DOCS_DIR, pattern), recursive=True 33 | ) 34 | return [doc for doc in all_docs if "api/" not in doc] 35 | 36 | 37 | ################################################################ 38 | def remove_footer(page_md): 39 | return page_md.split("[Next ![]")[0] 40 | 41 | 42 | def remove_header(page_md): 43 | md_lines = page_md.split("\n") 44 | 45 | body_lines = [] 46 | in_body = False 47 | for mdl in md_lines: 48 | if len(mdl) > 0 and mdl[0] == "#": 49 | in_body = True 50 | if in_body: 51 | body_lines.append(mdl) 52 | page_md = "\n".join(body_lines) 53 | return page_md 54 | 55 | 56 | def remove_extra_newlines(page_md): 57 | lines = page_md.split("\n") 58 | lines = [line for line in lines if line.strip() != "!"] 59 | page_md = "\n".join(lines) 60 | return 
re.sub(r"\n{3,}", "\n\n", page_md) 61 | 62 | 63 | def remove_empty_code_blocks(page_md): 64 | text_and_code = page_md.split("```") 65 | text_blocks = text_and_code[::2] 66 | code_blocks = text_and_code[1::2] 67 | code_blocks = [ 68 | cb 69 | for cb in code_blocks 70 | if len(cb.strip()) > 0 and not set(cb).issubset(set("| -\n")) 71 | ] 72 | 73 | page_md = "" 74 | for tb, cb in zip(text_blocks, code_blocks): 75 | page_md += tb + "```" + cb + "```" 76 | 77 | page_md += text_and_code[-1] 78 | return re.sub(r"```py\s*```", "", page_md, flags=re.MULTILINE) 79 | 80 | 81 | def remove_jupyter_widgets(page_md): 82 | lines = page_md.split("\n") 83 | lines = [ 84 | line 85 | for line in lines 86 | if len(line) == 0 or (line[0] != "{" and "jupyter-widgets" not in line) 87 | ] 88 | return "\n".join(lines) 89 | 90 | 91 | def remove_xml(page_md): 92 | lines = page_md.split("\n") 93 | lines = [line for line in lines if not line.startswith(" 0 and not set(cb).issubset(set("| -")) 145 | ] 146 | 147 | page_md = "" 148 | for tb, cb in zip(text_blocks, code_blocks): 149 | page_md += tb + "```py" + cb + "```" 150 | 151 | page_md += text_and_code[-1] 152 | return page_md 153 | 154 | 155 | def merge_adjacent_code_blocks(page_md): 156 | pattern = r"```\n```py" 157 | page_md = re.sub(pattern, "", page_md) 158 | return re.sub(r"```py\n```py", r"```py", page_md) 159 | 160 | 161 | def remove_bad_elements(page_md): 162 | pattern = r"\(function\(\) {[\s\S]*?}\)\(\);" 163 | page_md = re.sub(pattern, "", page_md, flags=re.MULTILINE) 164 | 165 | lines = page_md.split("\n") 166 | lines = [line for line in lines if not line.startswith("@import")] 167 | 168 | bad_keywords = [ 169 | "#focontainer", 170 | "#fooverlay", 171 | "#foactivate", 172 | ] 173 | 174 | good_lines = [] 175 | flag = True 176 | for line in lines: 177 | if any([keyword in line for keyword in bad_keywords]): 178 | flag = False 179 | if flag: 180 | good_lines.append(line) 181 | if "}" in line and not flag: 182 | flag = True 183 | 184 | 
return "\n".join(good_lines) 185 | 186 | 187 | def remove_links(page_md): 188 | match = re.search("\[.*?\]\(.*?\)", page_md) 189 | if match is not None: 190 | start, end = match.span() 191 | link = page_md[start:end] 192 | link_text = link[1:].split("]")[0] 193 | if link_text != "¶": 194 | return page_md[:start] + link_text + remove_links(page_md[end:]) 195 | else: 196 | return page_md[:end] + link + remove_links(page_md[end:]) 197 | return page_md 198 | 199 | 200 | def reformat_markdown(page_md): 201 | page_md = page_md.replace("\_", "_").replace("\*", "*") 202 | page_md = remove_links(page_md) 203 | page_md = remove_images(page_md) 204 | page_md = remove_jupyter_widgets(page_md) 205 | page_md = remove_xml(page_md) 206 | page_md = remove_extra_newlines(page_md) 207 | page_md = remove_bad_elements(page_md) 208 | page_md = remove_code_cell_vestiges(page_md) 209 | return page_md 210 | 211 | 212 | def remove_unicode(page_md): 213 | for uchar in ["\u2500", "\u2514", "\u251c", "\u2502"]: 214 | page_md = page_md.replace(uchar, "") 215 | for uchar in ["\u2588", "\u2019"]: 216 | page_md = page_md.replace(uchar, "'") 217 | for uchar in ["\u201d", "\u201c"]: 218 | page_md = page_md.replace(uchar, '"') 219 | page_md = page_md.replace("\u00a9", "copyright") 220 | return page_md 221 | 222 | 223 | def parse_page_markdown(page_md): 224 | page_md = remove_header(page_md) 225 | page_md = remove_footer(page_md) 226 | page_md = remove_line_numbers(page_md) 227 | page_md = remove_table_rows(page_md) 228 | page_md = remove_empty_code_blocks(page_md) 229 | page_md = add_syntax_highlight_to_code_blocks(page_md) 230 | page_md = merge_adjacent_code_blocks(page_md) 231 | 232 | ## reformat now that the markdown is clean 233 | page_md = reformat_markdown(page_md) 234 | page_md = remove_empty_code_blocks(page_md) 235 | page_md = remove_extra_newlines(page_md) 236 | page_md = remove_unicode(page_md) 237 | return page_md 238 | 239 | 240 | def get_page_markdown(filepath): 241 | with 
open(filepath) as f: 242 | page_html = f.read() 243 | 244 | page_md = md.markdownify(page_html, heading_style="ATX") 245 | page_md = parse_page_markdown(page_md) 246 | 247 | return page_md 248 | 249 | 250 | def split_at_anchors(page_md): 251 | md_lines = page_md.split("\n") 252 | md_sections = {} 253 | curr_anchor = None 254 | curr_section = [] 255 | for line in md_lines: 256 | if "Permalink" in line: 257 | if curr_anchor is not None: 258 | md_sections[curr_anchor] = "\n".join(curr_section) 259 | curr_section = [] 260 | curr_anchor = line.split('"Permalink')[0].split("#")[-1].strip() 261 | else: 262 | curr_section.append(line) 263 | 264 | md_sections[curr_anchor] = "\n".join(curr_section) 265 | return md_sections 266 | 267 | 268 | def split_section_into_chunks(text): 269 | document = Document(page_content=text) 270 | documents = splitter.split_documents([document]) 271 | return [d.page_content for d in documents] 272 | 273 | 274 | def split_page_into_chunks(page_md): 275 | md_sections = split_at_anchors(page_md) 276 | chunks = {} 277 | for anchor, section in md_sections.items(): 278 | chunks[anchor] = split_section_into_chunks(section) 279 | 280 | return chunks 281 | 282 | 283 | def get_markdown_documents(filepath): 284 | page_md = get_page_markdown(filepath) 285 | return split_page_into_chunks(page_md) 286 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 3 | include = '\.pyi?$' 4 | exclude = ''' 5 | /( 6 | | \.git 7 | )/ 8 | ''' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argcomplete==1.11.0 2 | google-cloud-storage>=2.8.0 3 | langchain>=0.0.179 4 | markdownify>=0.11.6 5 | openai>=0.27.2,<1.0.0 6 | qdrant-client>=1.1.1 7 | packaging==20.3 8 | 
pre-commit>=2.18.1 9 | regex>=2022.8.17 10 | rich>=12.6.0 11 | setuptools>=45.2.0 12 | tqdm>=4.64.1 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Installs FiftyOne Docs Search. 4 | | Copyright 2017-2023, Voxel51, Inc. 5 | | `voxel51.com `_ 6 | | 7 | """ 8 | from setuptools import setup, find_packages 9 | 10 | INSTALL_REQUIRES = [ 11 | "argcomplete", 12 | "google-cloud-storage", 13 | "langchain", 14 | "markdownify", 15 | "openai", 16 | "packaging", 17 | "qdrant-client", 18 | "regex", 19 | "rich", 20 | "setuptools", 21 | "tqdm", 22 | ] 23 | 24 | with open("README.md", "r") as fh: 25 | description = fh.read() 26 | 27 | setup( 28 | name="fiftyone-docs-search", 29 | version="0.21.0", 30 | author="Voxel51, Inc.", 31 | author_email="info@voxel51.com", 32 | packages=find_packages(), 33 | description="Semantic search for the FiftyOne Docs from command line", 34 | long_description=description, 35 | long_description_content_type="text/markdown", 36 | url="https://github.com/voxel51/fiftyone-docs-search", 37 | license="Apache", 38 | python_requires=">=3.8", 39 | install_requires=INSTALL_REQUIRES, 40 | include_package_data=True, 41 | classifiers=[ 42 | "Development Status :: 4 - Beta", 43 | "Intended Audience :: Developers", 44 | "Intended Audience :: Science/Research", 45 | "License :: OSI Approved :: Apache Software License", 46 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 47 | "Topic :: Scientific/Engineering :: Image Processing", 48 | "Topic :: Scientific/Engineering :: Image Recognition", 49 | "Topic :: Scientific/Engineering :: Information Analysis", 50 | "Topic :: Scientific/Engineering :: Visualization", 51 | "Operating System :: MacOS :: MacOS X", 52 | "Operating System :: POSIX :: Linux", 53 | "Operating System :: Microsoft :: Windows", 54 | "Programming Language :: Python :: 3", 55 
| "Programming Language :: Python :: 3.7", 56 | "Programming Language :: Python :: 3.8", 57 | "Programming Language :: Python :: 3.9", 58 | "Programming Language :: Python :: 3.10", 59 | ], 60 | entry_points={ 61 | "console_scripts": [ 62 | "fiftyone-docs-search=fiftyone.docs_search.cli:main" 63 | ] 64 | }, 65 | ) 66 | --------------------------------------------------------------------------------