├── .dockerignore ├── .gitignore ├── LICENSE ├── README.md ├── backend-image ├── .dockerignore ├── Dockerfile ├── app.py ├── config.py ├── executors.py ├── helper.py └── requirements.txt ├── backend-text ├── .dockerignore ├── Dockerfile ├── app.py ├── config.py ├── helper.py └── requirements.txt ├── docker-compose.yml ├── frontend ├── .dockerignore ├── .streamlit │ └── config.toml ├── Dockerfile ├── README.md ├── config.py ├── frontend.py ├── helper.py ├── powered_by_jina.png ├── requirements.txt └── samples │ ├── buttons.jpg │ ├── crying.jpg │ ├── koala.jpg │ ├── leo.jpg │ └── squidward.jpg ├── get_data.sh └── get_memes.py /.dockerignore: -------------------------------------------------------------------------------- 1 | */env 2 | */__pycache__ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/cache 2 | **/data*/ 3 | **/workspace*/ 4 | **/env 5 | 6 | ### Created by https://www.gitignore.io 7 | ### Python ### 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | pytestdebug.log 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | doc/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | # .env 116 | .env/ 117 | .venv/ 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | pythonenv* 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | 143 | # pytype static type analyzer 144 | .pytype/ 145 | 146 | # operating system-related files 147 | # file properties cache/storage on macOS 148 | *.DS_Store 149 | # thumbnail cache on Windows 150 | Thumbs.db 151 | 152 | # profiling data 153 | .prof 154 | 155 | tags 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # This repo is deprecated. This code will not run with recent versions of Jina! 2 | 3 | --- 4 | 5 | # Jina meme search 6 | 7 | In this repo you can find three folders to help build your own meme search engine powered by [Jina](https://github.com/jina-ai/jina/). 8 | 9 | - [backend-text](./backend-text) - Uses SpaCy to encode and search through meme captions 10 | - [backend-image](./backend-image) - Uses CLIP to encode and search through meme images 11 | - [frontend](./frontend) - A [Streamlit](https://streamlit.io) frontend for the whole thing 12 | 13 | Each of these can be run independently. You can play with a [live demo](https://examples.jina.ai/memes) or [notebook](https://github.com/jina-ai/workshops/tree/main/memes) (text-search only) to get a feel for it. 14 | 15 | ## Note 16 | 17 | I'm still revamping the README's, so documentation might not be fully up-to-date for a little while. 
18 | 19 | 20 | ## Instructions 21 | 22 | ### Set up 23 | 24 | - Create a virtual environment 25 | - `python get_memes.py 200000` (where 200000 is the number of memes you want to download) 26 | 27 | ## Text search 28 | 29 | ```shell 30 | cd backend-text 31 | pip install -r requirements.txt 32 | python app.py -t index -n 1000 # Index 1000 memes 33 | python app.py -t search # Open RESTful gateway 34 | ``` 35 | 36 | ## Image search 37 | 38 | Use the `-n` option to set the number of memes to index (the default, `NUM_DOCS`, is defined in `config.py`). 39 | 40 | ```shell 41 | cd backend-image 42 | pip install -r requirements.txt 43 | python app.py -t index -n 1000 # Index 1000 memes 44 | python app.py -t search # Open RESTful gateway 45 | ``` 46 | 47 | ## Frontend 48 | 49 | ```shell 50 | cd frontend 51 | pip install -r requirements.txt 52 | streamlit run frontend.py 53 | ``` 54 | 55 | ## Via `docker-compose` 56 | 57 | Note: This opens up the search interfaces for meme search, including the frontend. It **doesn't** index the data. Be sure to do that beforehand. 58 | 59 | 1. Follow instructions above for setup and indexing (don't query anything yet) 60 | 2. In root dir, `docker-compose up` 61 | 62 | ## Troubleshooting 63 | 64 | ### Running out of memory 65 | 66 | If you're on Linux you can create a swapfile: 67 | 68 | ```shell 69 | dd if=/dev/zero of=swapfile bs=1M count=10240 status=progress # 10240mb = 10gb 70 | chmod 600 swapfile 71 | mkswap swapfile 72 | swapon swapfile 73 | ``` 74 | -------------------------------------------------------------------------------- /backend-image/.dockerignore: -------------------------------------------------------------------------------- 1 | env 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /backend-image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jinaai/jina:3.2.10-py39-standard 2 | 3 | # setup the workspace 4 | COPY .
/workspace 5 | WORKDIR /workspace 6 | 7 | RUN apt-get update && apt-get install --no-install-recommends -y zlib1g zlib1g-dev git build-essential g++ libjpeg-dev && pip install -r requirements.txt 8 | 9 | ENTRYPOINT ["python", "app.py", "-t"] 10 | CMD ["search"] 11 | 12 | EXPOSE 65432 13 | -------------------------------------------------------------------------------- /backend-image/app.py: -------------------------------------------------------------------------------- 1 | import click 2 | from jina import Flow 3 | from config import WORKSPACE_DIR, NUM_DOCS, DATA_DIR, REQUEST_SIZE, PORT, BENCHMARK 4 | from executors import ImageNormalizer 5 | from helper import generate_docs, check_gpu 6 | from datetime import datetime 7 | 8 | encoder = "jinahub://CLIPImageEncoder/" 9 | 10 | if check_gpu(): 11 | print("Using GPU") 12 | encoder += "-gpu" 13 | uses_with = {"device": "cuda"} 14 | else: 15 | print("Using CPU") 16 | uses_with = {"device": "cpu"} 17 | 18 | flow = ( 19 | Flow(protocol="http", port=PORT) 20 | .add(name="image_normalizer", uses=ImageNormalizer) 21 | .add( 22 | name="meme_image_encoder", 23 | uses=encoder, 24 | uses_metas={"workspace": WORKSPACE_DIR}, 25 | uses_with=uses_with, 26 | gpus="all", 27 | volumes="./data:/encoder/data", 28 | install_requirements=True, 29 | ) 30 | .add( 31 | name="meme_image_indexer", 32 | uses="jinahub://PQLiteIndexer/0.2.6", 33 | uses_with={ 34 | "limit": 12, 35 | "dim": 512, # SpaCy en_core_md uses 300 dims 36 | "include_metadata": True 37 | }, 38 | # volumes=f"./{WORKSPACE_DIR}:/workspace/workspace", 39 | install_requirements=True, 40 | ) 41 | ) 42 | 43 | 44 | def index(num_docs=NUM_DOCS): 45 | docs = generate_docs(DATA_DIR, num_docs) 46 | 47 | with flow: 48 | if BENCHMARK: 49 | start_time = datetime.now() 50 | flow.index(inputs=docs, show_progress=True, request_size=REQUEST_SIZE) 51 | if BENCHMARK: 52 | end_time = datetime.now() 53 | difference = end_time - start_time 54 | print(difference.seconds) 55 | minutes = 
difference.seconds / 60 56 | print(f"Indexing took {minutes} minutes ({minutes/60} hours)") 57 | 58 | 59 | 60 | 61 | def search(): 62 | with flow: 63 | flow.block() 64 | 65 | 66 | @click.command() 67 | @click.option( 68 | "--task", 69 | "-t", 70 | type=click.Choice(["index", "search"], case_sensitive=False), 71 | ) 72 | @click.option("--num_docs", "-n", default=NUM_DOCS) 73 | def main(task: str, num_docs): 74 | if task == "index": 75 | index(num_docs=num_docs) 76 | elif task == "search": 77 | search() 78 | else: 79 | print("Please add '-t index' or '-t search' to your command") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /backend-image/config.py: -------------------------------------------------------------------------------- 1 | PORT = 65432 2 | NUM_DOCS = 10 3 | REQUEST_SIZE = 16 # Lower = lower memory usage 4 | FORMATS = ["jpg", "png", "jpeg"] 5 | DATA_DIR = "../data" 6 | WORKSPACE_DIR = "workspace" 7 | BENCHMARK = True 8 | -------------------------------------------------------------------------------- /backend-image/executors.py: -------------------------------------------------------------------------------- 1 | from jina import Executor, requests 2 | from docarray import Document, DocumentArray 3 | 4 | 5 | class ImageNormalizer(Executor): 6 | @requests(on="/index") 7 | def process_images(self, docs, **kwargs): 8 | if type(docs) == Document: 9 | docs = DocumentArray([docs]) 10 | 11 | for doc in docs: 12 | doc.load_uri_to_image_tensor() 13 | doc.set_image_tensor_shape((64, 64)) 14 | doc.set_image_tensor_normalization() 15 | -------------------------------------------------------------------------------- /backend-image/helper.py: -------------------------------------------------------------------------------- 1 | from jina import Document, DocumentArray 2 | from config import NUM_DOCS, FORMATS 3 | 4 | 5 | def generate_docs(directory, num_docs=NUM_DOCS, 
formats=FORMATS): 6 | docs = DocumentArray() 7 | for format in formats: 8 | docarray = DocumentArray.from_files(f"{directory}/**/*.{format}", size=num_docs) 9 | docs.extend(docarray) 10 | 11 | # docs = process_images(docs) 12 | 13 | return docs[:num_docs] 14 | 15 | 16 | def process_images(images): 17 | if type(images) == Document: 18 | images = DocumentArray([images]) 19 | 20 | for image in images: 21 | image.load_uri_to_image_blob() 22 | image.set_image_blob_shape((64, 64)) 23 | image.set_image_blob_normalization() 24 | 25 | return images 26 | 27 | 28 | def print_result(resp): 29 | """ 30 | Callback function to receive results. 31 | 32 | :param resp: returned response with data 33 | """ 34 | matches = [] 35 | for doc in resp.docs: 36 | for match in doc.matches: 37 | kmi = match.uri 38 | matches.append(kmi) 39 | 40 | for match in doc.matches: 41 | print(f"{match.uri}") 42 | 43 | 44 | def check_gpu(): 45 | import GPUtil 46 | 47 | gpu_list = GPUtil.getAvailable() 48 | if len(gpu_list) > 0: 49 | return True 50 | else: 51 | return False 52 | -------------------------------------------------------------------------------- /backend-image/requirements.txt: -------------------------------------------------------------------------------- 1 | jina==3.2.10 2 | Pillow-SIMD==7.0.0.post3 3 | GPUtil==1.4.0 4 | -------------------------------------------------------------------------------- /backend-text/.dockerignore: -------------------------------------------------------------------------------- 1 | env 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /backend-text/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jinaai/jina:3.2.9-py39-standard 2 | 3 | # setup the workspace 4 | COPY . 
/workspace 5 | WORKDIR /workspace 6 | 7 | RUN apt-get update && apt-get install --no-install-recommends -y git build-essential g++ 8 | 9 | ENTRYPOINT ["python", "app.py", "-t"] 10 | CMD ["search"] 11 | 12 | EXPOSE 45679 13 | -------------------------------------------------------------------------------- /backend-text/app.py: -------------------------------------------------------------------------------- 1 | import click 2 | from jina import Flow 3 | from config import PORT, DATAFILE, MAX_DOCS, MODEL, CACHE_DIR 4 | from helper import prep_docs 5 | 6 | flow = ( 7 | Flow(protocol="http", port=PORT) 8 | .add( 9 | name="meme_text_encoder", 10 | uses="jinahub://SpacyTextEncoder/v0.4", 11 | uses_with={"model_name": MODEL}, 12 | volumes=f"{CACHE_DIR}:/root/.cache", 13 | install_requirements=True, 14 | ) 15 | .add( 16 | name="meme_text_indexer", 17 | uses="jinahub://PQLiteIndexer/0.2.6", 18 | uses_with={ 19 | "limit": 12, 20 | "dim": 300, # SpaCy en_core_md uses 300 dims 21 | "include_metadata": True 22 | }, 23 | install_requirements=True, 24 | ) 25 | ) 26 | 27 | 28 | def index(num_docs: int = MAX_DOCS): 29 | """ 30 | Build index for your search 31 | :param num_docs: maximum number of Documents to index 32 | """ 33 | with flow: 34 | flow.index( 35 | inputs=prep_docs(input_file=DATAFILE, num_docs=num_docs), 36 | request_size=64, 37 | read_mode="r", 38 | show_progress=True, 39 | ) 40 | 41 | 42 | def search(): 43 | """ 44 | Query index 45 | """ 46 | with flow: 47 | flow.block() 48 | 49 | 50 | @click.command() 51 | @click.option( 52 | "--task", 53 | "-t", 54 | type=click.Choice(["index", "search"], case_sensitive=False), 55 | ) 56 | @click.option("--num_docs", "-n", default=MAX_DOCS) 57 | def main(task: str, num_docs: int): 58 | if task == "index": 59 | index(num_docs=num_docs) 60 | elif task == "search": 61 | search() 62 | else: 63 | print("Please add '-t index' or '-t search' to your command") 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | 
-------------------------------------------------------------------------------- /backend-text/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | MODEL = "en_core_web_md" # Which SpaCy model do we use? 4 | PORT = 45679 5 | CACHE_DIR = os.path.expanduser('~/.cache') 6 | DATAFILE = "../data/memes.json" 7 | MAX_DOCS = 1000 8 | RANDOM_SEED = 1337 9 | -------------------------------------------------------------------------------- /backend-text/helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import json 5 | from jina import Document, DocumentArray 6 | from config import RANDOM_SEED 7 | 8 | 9 | def prep_docs(input_file, num_docs=None, shuffle=True): 10 | docs = DocumentArray() 11 | memes = [] 12 | print(f"Processing {input_file}") 13 | with open(input_file, "r") as file: 14 | raw_json = json.loads(file.read()) 15 | 16 | for template in raw_json: 17 | for meme in template["generated_memes"]: 18 | meme["template"] = template["name"] 19 | memes.extend(template["generated_memes"]) 20 | 21 | if shuffle: 22 | import random 23 | 24 | random.seed(RANDOM_SEED) 25 | random.shuffle(memes) 26 | 27 | for meme in memes[:num_docs]: 28 | doctext = f"{meme['template']} - {meme['caption_text']}" 29 | doc = Document(text=doctext) 30 | doc.tags = meme 31 | doc.tags["uri_absolute"] = "http" + doc.tags["image_url"] 32 | docs.extend([doc]) 33 | 34 | return docs 35 | -------------------------------------------------------------------------------- /backend-text/requirements.txt: -------------------------------------------------------------------------------- 1 | jina==3.2.9 # higher version uses version of click that breaks spacy? 
2 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | frontend: 5 | build: frontend 6 | environment: 7 | - BACKEND_TEXT=backend-text 8 | - BACKEND_IMAGE=backend-image 9 | - DEBUG=True 10 | ports: 11 | - 8510:8510 12 | depends_on: 13 | - backend-text 14 | - backend-image 15 | volumes: 16 | - ./frontend:/workspace 17 | # - ./data:/data 18 | restart: always 19 | 20 | backend-text: 21 | build: backend-text 22 | ports: 23 | - 45679:45679 24 | volumes: 25 | - ./backend-text:/workspace 26 | restart: always 27 | 28 | backend-image: 29 | build: backend-image 30 | ports: 31 | - 65432:65432 32 | volumes: 33 | - ./backend-image:/workspace 34 | # - ./data:/data 35 | restart: always 36 | -------------------------------------------------------------------------------- /frontend/.dockerignore: -------------------------------------------------------------------------------- 1 | */env/* 2 | */__pycache__/* 3 | -------------------------------------------------------------------------------- /frontend/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [ server ] 2 | 3 | port = 8510 4 | 5 | [theme] 6 | primaryColor="#009191" 7 | -------------------------------------------------------------------------------- /frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jinaai/jina:3.2.10-py39-standard 2 | 3 | # setup the workspace 4 | COPY . 
/workspace 5 | WORKDIR /workspace 6 | 7 | RUN apt-get update && apt-get install --no-install-recommends -y zlib1g zlib1g-dev git build-essential g++ libjpeg-dev && pip install -r requirements.txt 8 | 9 | EXPOSE 8510 10 | 11 | ENTRYPOINT ["streamlit"] 12 | CMD ["run", "frontend.py"] 13 | -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | 1. Clone this repo 2 | 2. Enter the directory 3 | 3. Create and activate virtualenv 4 | 4. `pip install -r requirements.txt` 5 | 5. `streamlit run app.py` 6 | 6. Open the URL in your browser 7 | -------------------------------------------------------------------------------- /frontend/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Text search 4 | TEXT_PORT = 45679 5 | TEXT_SERVER = os.getenv("BACKEND_TEXT", "0.0.0.0") 6 | TEXT_SAMPLES = ["cute fuzzy animals", "so you're telling me willy wonka", "school sucks"] 7 | 8 | # Image search 9 | IMAGE_SERVER = os.getenv("BACKEND_IMAGE", "0.0.0.0") 10 | IMAGE_PORT = 65432 11 | 12 | # General 13 | TOP_K = 10 14 | DEBUG = os.getenv("DEBUG", False) 15 | DATA_DIR = "../data" 16 | 17 | -------------------------------------------------------------------------------- /frontend/frontend.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from config import IMAGE_PORT, IMAGE_SERVER, DEBUG, TEXT_PORT, TEXT_SERVER, TEXT_SAMPLES, DATA_DIR 3 | from helper import search_by_file, search_by_text, UI, convert_file_to_document, get_image_url 4 | 5 | matches = [] 6 | 7 | # Layout 8 | st.set_page_config(page_title="Jina meme search") 9 | st.markdown( 10 | body=UI.css, 11 | unsafe_allow_html=True, 12 | ) 13 | st.write( 14 | "", 15 | unsafe_allow_html=True, 16 | ) 17 | 18 | # Sidebar 19 | st.sidebar.markdown(UI.about_block, unsafe_allow_html=True) 20 
| 21 | if DEBUG: 22 | with st.sidebar.expander("Debug"): 23 | TEXT_SERVER = st.text_input(label="Text server", value=TEXT_SERVER) 24 | TEXT_PORT = st.text_input(label="Text port", value=TEXT_PORT) 25 | IMAGE_SERVER = st.text_input(label="Image server", value=IMAGE_SERVER) 26 | IMAGE_PORT = st.text_input(label="Image port", value=IMAGE_PORT) 27 | 28 | st.header("Jina Meme Search") 29 | media_type = st.radio("Search with...", ["Text", "Image"]) 30 | 31 | if media_type == "Image": 32 | upload_cell, preview_cell = st.columns([12, 1]) 33 | query = upload_cell.file_uploader("") 34 | if query: 35 | doc = convert_file_to_document(query) 36 | if st.button(label="Search"): 37 | if not query: 38 | st.markdown("Please enter a query") 39 | else: 40 | matches = search_by_file(document=doc, server=IMAGE_SERVER, port=IMAGE_PORT) 41 | 42 | elif media_type == "Text": 43 | query = st.text_input("", key="text_search_box") 44 | if st.button("Search", key="text_search"): 45 | matches = search_by_text(input=query, server=TEXT_SERVER, port=TEXT_PORT) 46 | print(matches[0].tags) 47 | st.subheader("...or search from a sample") 48 | 49 | for text in TEXT_SAMPLES: 50 | if st.button(text): 51 | matches = search_by_text(input=text, server=TEXT_SERVER, port=TEXT_PORT) 52 | 53 | 54 | # Results area 55 | cell1, cell2, cell3 = st.columns(3) 56 | cell4, cell5, cell6 = st.columns(3) 57 | cell7, cell8, cell9 = st.columns(3) 58 | all_cells = [cell1, cell2, cell3, cell4, cell5, cell6, cell7, cell8, cell9] 59 | 60 | for cell, match in zip(all_cells, matches): 61 | if media_type == "Text": 62 | cell.image(f"http:{match.tags['image_url']}") 63 | else: 64 | cell.image(get_image_url(match.uri)) 65 | -------------------------------------------------------------------------------- /frontend/helper.py: -------------------------------------------------------------------------------- 1 | from jina import Client, Document 2 | from config import TEXT_PORT, TEXT_SERVER, IMAGE_PORT, IMAGE_SERVER, TOP_K 3 | 4 | 5 | 
class UI:
    """Static text snippets that frontend.py passes to Streamlit's markdown calls."""

    # Sidebar "About" text (Markdown).
    about_block = """

    ### About

    This is a meme search engine using [Jina's neural search framework](https://github.com/jina-ai/jina/).

    - [Live demo](https://examples.jina.ai/memes)
    - [Play with it in a notebook](https://colab.research.google.com/github/jina-ai/workshops/blob/main/memes/meme_search.ipynb) (text-only)
    - [Repo](https://github.com/alexcg1/jina-meme-search)
    - [Dataset](https://www.kaggle.com/abhishtagatya/imgflipscraped-memes-caption-dataset)
    """

    # Page-level styling injected with unsafe_allow_html=True.
    # NOTE(review): the stylesheet body is empty as seen here — confirm that
    # nothing was lost before relying on it.
    css = f"""
    """


headers = {"Content-Type": "application/json"}


def _first_doc_matches(response):
    """Return the matches of the first Document in *response*, or [] if there is none."""
    # An empty (or missing) response used to surface as an IndexError on
    # `response[0]`; callers only iterate the result, so an empty list is a
    # safe stand-in.
    if response is None or len(response) == 0:
        return []
    return response[0].matches


def search_by_text(input, server=TEXT_SERVER, port=TEXT_PORT, limit=TOP_K):
    """Search the text backend with a free-text query.

    Args:
        input: Query string; wrapped in a Jina ``Document`` as its ``text``.
        server: Host of the text-search backend.
        port: Port of the text-search backend.
        limit: Sent to the backend as the ``limit`` request parameter.

    Returns:
        The matches attached to the first returned Document, or an empty list
        when the backend returned no Documents.
    """
    client = Client(host=server, protocol="http", port=port)
    response = client.search(
        Document(text=input),
        parameters={"limit": limit},
    )

    return _first_doc_matches(response)


def search_by_file(document, server=IMAGE_SERVER, port=IMAGE_PORT, limit=TOP_K):
    """Search the image backend with an uploaded image.

    The Document is converted from a raw blob to an image tensor, resized to
    64x64 and normalized before it is sent — presumably mirroring how the
    indexed Documents were prepared (confirm against the image backend).
    No copy is made: the conversion calls are applied to *document* itself.

    Args:
        document: Jina ``Document`` whose ``blob`` holds the image bytes.
        server: Host of the image-search backend.
        port: Port of the image-search backend.
        limit: Sent to the backend as the ``limit`` request parameter.

    Returns:
        The matches attached to the first returned Document, or an empty list
        when the backend returned no Documents.
    """
    client = Client(host=server, protocol="http", port=port)
    query_doc = document
    query_doc.convert_blob_to_image_tensor()
    query_doc.set_image_tensor_shape((64, 64))
    query_doc.set_image_tensor_normalization()
    response = client.search(
        query_doc,
        parameters={"limit": limit},
        return_results=True,
        show_progress=True,
    )

    return _first_doc_matches(response)


def convert_file_to_document(query):
    """Read a file-like object fully and wrap its bytes in a Jina ``Document`` blob."""
    data = query.read()

    return Document(blob=data)


def get_image_url(file_path, domain="http://i.imgflip.com/"):
    """Build an image URL by appending the last path component of *file_path* to *domain*."""
    filename = file_path.rsplit("/", 1)[-1]

    return domain + filename
/frontend/powered_by_jina.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/example-meme-search/b23fed3ec93888677a454d63a39a77ec6d1f2b6a/frontend/powered_by_jina.png -------------------------------------------------------------------------------- /frontend/requirements.txt: -------------------------------------------------------------------------------- 1 | jina==3.2.10 2 | Pillow-SIMD==7.0.0.post3 3 | streamlit==1.8.1 4 | python-magic==0.4.24 5 | -------------------------------------------------------------------------------- /frontend/samples/buttons.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/example-meme-search/b23fed3ec93888677a454d63a39a77ec6d1f2b6a/frontend/samples/buttons.jpg -------------------------------------------------------------------------------- /frontend/samples/crying.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/example-meme-search/b23fed3ec93888677a454d63a39a77ec6d1f2b6a/frontend/samples/crying.jpg -------------------------------------------------------------------------------- /frontend/samples/koala.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/example-meme-search/b23fed3ec93888677a454d63a39a77ec6d1f2b6a/frontend/samples/koala.jpg -------------------------------------------------------------------------------- /frontend/samples/leo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/example-meme-search/b23fed3ec93888677a454d63a39a77ec6d1f2b6a/frontend/samples/leo.jpg -------------------------------------------------------------------------------- /frontend/samples/squidward.jpg: 
"""Download the meme dataset JSON and a sample of the meme images.

Usage:
    python get_memes.py MAX_DOCS
"""
import os
import sys
import json
import random
import requests

JSON_URL = "https://jina-examples-datasets.s3.amazonaws.com/memes/memes.json"
OUTPUT_DIR = "./data"
# Seconds to wait on any single HTTP request; without a timeout `requests`
# can block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def get_json(url, output_dir):
    """Download *url* to ``<output_dir>/memes.json`` unless that file already exists.

    Args:
        url: Location of the dataset JSON.
        output_dir: Directory to store ``memes.json`` in; created if missing.

    Raises:
        requests.RequestException: If the download fails or the server answers
            with an error status. Previously a non-200 answer was ignored and
            the script crashed later when the missing file was opened.
    """
    json_path = os.path.join(output_dir, "memes.json")
    if os.path.isfile(json_path):
        return

    os.makedirs(output_dir, exist_ok=True)
    print(f"Downloading {url} to '{output_dir}' directory")
    r = requests.get(url, allow_redirects=True, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    with open(json_path, "wb") as file:
        file.write(r.content)


def prep_docs(input_file, max_docs, output_dir, random_seed=1337, shuffle=True):
    """Download the images of up to *max_docs* memes listed in *input_file*.

    Args:
        input_file: Path to the dataset JSON: a list of templates, each with a
            ``name`` and a ``generated_memes`` list.
        max_docs: Maximum number of memes to process.
        output_dir: Directory the image files are written to.
        random_seed: Seed for the shuffle, so the same sample is picked on
            every run.
        shuffle: Whether to shuffle the memes before taking the first
            *max_docs*.
    """
    print(f"Preparing {max_docs} Documents")

    print(f"Processing {input_file}")
    with open(input_file, "r") as file:
        raw_json = json.load(file)

    memes = []
    for template in raw_json:
        # Tag every meme with the name of the template it was generated from.
        for meme in template["generated_memes"]:
            meme["template"] = template["name"]
        memes.extend(template["generated_memes"])

    if shuffle:
        # A private generator gives the same order as seeding the global one,
        # without reseeding `random` for the rest of the process.
        random.Random(random_seed).shuffle(memes)

    os.makedirs(output_dir, exist_ok=True)
    selected = memes[:max_docs]
    total = len(selected)

    for position, meme in enumerate(selected, start=1):
        # image_url is stored without a scheme, hence the "http:" prefix.
        url = f'http:{meme["image_url"]}'
        filename = meme["image_url"].split("/")[-1]
        # Write via an explicit path instead of os.chdir(), which changed the
        # working directory of the whole process as a side effect.
        target = os.path.join(output_dir, filename)

        if os.path.isfile(target):
            print(f"{filename} already downloaded, skipping")
            continue

        print(f"Downloading {filename} - {position}/{total}")
        try:
            r = requests.get(url, allow_redirects=True, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
        except requests.RequestException:
            # Best effort: one broken image must not abort the whole run.
            # Unlike the former bare `except:`, Ctrl-C still stops the script.
            print(f"Error on {filename}, skipping.")
            continue

        with open(target, "wb") as file:
            file.write(r.content)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"Usage: python {sys.argv[0]} MAX_DOCS")
    MAX_DOCS = int(sys.argv[1])

    get_json(url=JSON_URL, output_dir=OUTPUT_DIR)
    prep_docs(
        os.path.join(OUTPUT_DIR, "memes.json"),
        max_docs=MAX_DOCS,
        output_dir=OUTPUT_DIR,
        shuffle=True,
    )