├── .dockerignore
├── .env.template
├── .github
└── workflows
│ └── run_tests.yml
├── .gitignore
├── .vscode
├── launch.json
└── settings.json
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── backend
├── .dockerignore
├── arxivsearch
│ ├── __init__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── main.py
│ │ └── routes
│ │ │ ├── __init__.py
│ │ │ └── papers.py
│ ├── config.py
│ ├── db
│ │ ├── __init__.py
│ │ ├── load.py
│ │ ├── schema
│ │ │ └── index.yaml
│ │ └── utils.py
│ ├── main.py
│ ├── schema
│ │ ├── __init__.py
│ │ └── models.py
│ ├── spa.py
│ ├── templates
│ │ └── build
│ │ │ └── build.txt
│ ├── tests
│ │ ├── __init__.py
│ │ ├── api
│ │ │ ├── __init__.py
│ │ │ └── routes
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_papers.py
│ │ ├── conftest.py
│ │ ├── db
│ │ │ └── test_load.py
│ │ └── test_vectors.json
│ └── utils
│ │ ├── __init__.py
│ │ └── embeddings.py
├── data
│ └── redis-logo.png
├── poetry.lock
├── pyproject.toml
└── scripts.py
├── docker-local-redis.yml
├── frontend
├── .gitignore
├── README.md
├── package-lock.json
├── package.json
├── public
│ ├── Redis_Mark_Red.svg
│ ├── favicon.ico
│ ├── github-mark-white.svg
│ ├── index.html
│ ├── manifest.json
│ ├── robots.txt
│ ├── site.webmanifest
│ └── x-logo.svg
├── src
│ ├── App.css
│ ├── App.test.tsx
│ ├── App.tsx
│ ├── Layout.tsx
│ ├── Routes.tsx
│ ├── api.ts
│ ├── config
│ │ └── index.tsx
│ ├── index.css
│ ├── index.tsx
│ ├── logo.svg
│ ├── react-app-env.d.ts
│ ├── reportWebVitals copy.ts
│ ├── reportWebVitals.ts
│ ├── setupTests.ts
│ ├── styles
│ │ ├── Card.css
│ │ ├── Footer.css
│ │ ├── Header.css
│ │ └── Home.css
│ └── views
│ │ ├── Card.tsx
│ │ ├── Footer.tsx
│ │ ├── Header.tsx
│ │ ├── Home.tsx
│ │ └── index.ts
└── tsconfig.json
└── k8s
├── README.md
├── backend.yaml
├── cluster.yaml
└── redis-vector-db.yaml
/.dockerignore:
--------------------------------------------------------------------------------
1 | # ignore .git folders
2 | .git
3 | .env
4 |
5 | # ignore specific.json files
6 | data/arxiv-metadata-oai-snapshot.json
7 |
8 | # ignore node_modules
9 | *node_modules
--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | DEPLOYMENT=dev
2 | REDIS_HOST=redis
3 | REDIS_PORT=6379
4 | OPENAI_API_KEY=YOUR-OPENAI-API-KEY
5 | COHERE_API_KEY=YOUR-COHERE-API-KEY
6 | TOKENIZERS_PARALLELISM=False
--------------------------------------------------------------------------------
/.github/workflows/run_tests.yml:
--------------------------------------------------------------------------------
# CI pipeline: runs the backend test suite against a Redis Stack service.
name: Test Suite

on:
  pull_request:
    branches:
      - main

  push:
    branches:
      - main

  schedule:
    - cron: '0 0 * * 0'  # Runs every Sunday

jobs:
  test:
    # Only matrix keys that are actually defined below are interpolated here.
    name: Python ${{ matrix.python-version }} [redis-stack ${{ matrix.redis-stack-version }}]
    runs-on: ubuntu-latest

    strategy:
      fail-fast: false
      matrix:
        # Quoted so the versions stay strings (3.10 would otherwise become 3.1).
        python-version: ['3.11']
        redis-stack-version: ['latest']

    services:
      redis:
        image: redis/redis-stack-server:${{ matrix.redis-stack-version }}
        ports:
          - '6379:6379'

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'

      - name: Install Poetry
        uses: snok/install-poetry@v1

      - name: Install dependencies
        working-directory: ./backend
        run: |
          poetry install --all-extras

      - name: Run tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
        working-directory: ./backend
        run: |
          poetry run test
55 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | arxiv-metadata-oai-snapshot.json
2 | arxiv-papers-1000.json
3 | arxiv.zip
4 | *.DS_STORE
5 | *.log
6 | .env
7 | .ipynb_checkpoints
8 | *.pkl
9 | .venv
10 | venv
11 | __pycache__
12 | new_backend/arxivsearch/templates/
13 | */.nvm
14 | .coverage*
15 | coverage.*
16 | htmlcov/
17 | legacy-data/
18 | .python-version
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "name": "Python Debugger: FastAPI",
9 | "type": "debugpy",
10 | "cwd": "${workspaceFolder}/backend/",
11 | "env": {
12 | "PYTHONPATH": "${cwd}"
13 | },
14 | "request": "launch",
15 | "module": "uvicorn",
16 | "args": [
17 | "arxivsearch.main:app",
18 | "--port=8888",
19 | "--reload"
20 | ],
21 | "jinja": true,
22 | }
23 | ]
24 | }
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.testing.pytestArgs": [
3 | "backend"
4 | ],
5 | "python.testing.unittestEnabled": false,
6 | "python.testing.pytestEnabled": true,
7 | "python.testing.cwd": "${workspaceFolder}/backend/",
8 | }
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# ---- Stage 1: build the React frontend into static files ----
FROM node:22.0.0 AS ReactImage

WORKDIR /app/frontend

ENV NODE_PATH=/app/frontend/node_modules
ENV PATH=$PATH:/app/frontend/node_modules/.bin

# Install dependencies first so this layer is cached across source changes
COPY ./frontend/package.json ./
RUN npm install

# COPY instead of ADD: a plain local copy needs none of ADD's extra behaviour
COPY ./frontend ./
RUN npm run build


# ---- Stage 2: Python API image that also serves the built frontend ----
FROM python:3.11 AS ApiImage

# key=value form; the whitespace-separated ENV form is legacy syntax
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

WORKDIR /app/
VOLUME [ "/data" ]

RUN apt-get update && \
    apt-get install -y curl && \
    rm -rf /var/lib/apt/lists/*

# Install Poetry and make it use the system interpreter (no virtualenv)
RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
    cd /usr/local/bin && \
    ln -s /opt/poetry/bin/poetry && \
    poetry config virtualenvs.create false

RUN mkdir -p /app/backend

# copy deps first so we don't have to reload everytime
COPY ./backend/poetry.lock ./backend/pyproject.toml ./backend/

WORKDIR /app/backend
RUN poetry install --all-extras --no-interaction

COPY ./backend/ .

# add static react files to fastapi image
COPY --from=ReactImage /app/frontend/build /app/backend/arxivsearch/templates/build

CMD ["poetry", "run", "start-app"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2022, Redis Inc
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: deploy
2 |
3 | deploy:
4 | docker compose -f docker-local-redis.yml up
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
🔎 arXiv Search API
7 |
14 |
15 |
16 |
17 |
18 |
19 | *This repository is the official codebase for the arxiv paper search app hosted at: **https://docsearch.redisvl.com***
20 |
21 |
22 | [Redis](https://redis.com) is a highly performant, production-ready vector database, which can be used for many types of applications. Here we showcase Redis vector search applied to a document retrieval use case. Read more about AI-powered search in [the technical blog post](https://datasciencedojo.com/blog/ai-powered-document-search/) published by our partners, *[Data Science Dojo](https://datasciencedojo.com)*.
23 |
24 | ### Dataset
25 |
26 | The arXiv papers dataset was sourced from the following [Kaggle link](https://www.kaggle.com/Cornell-University/arxiv). arXiv is commonly used for scientific research in a variety of fields. Exposing a semantic search layer enables natural human language to be used to discover relevant papers.
27 |
28 |
29 | ## Application
30 |
31 | This app was built as a Single Page Application (SPA) with the following components:
32 |
33 | - **[Redis Stack](https://redis.io/docs/stack/)** for vector database
34 | - **[RedisVL](https://redisvl.com)** for Python vector db client
35 | - **[FastAPI](https://fastapi.tiangolo.com/)** for Python API
36 | - **[Pydantic](https://pydantic-docs.helpmanual.io/)** for schema and validation
37 | - **[React](https://reactjs.org/)** (with Typescript)
38 | - **[Docker Compose](https://docs.docker.com/compose/)** for development
39 | - **[MaterialUI](https://material-ui.com/)** for some UI elements/components
40 | - **[React-Bootstrap](https://react-bootstrap.github.io/)** for some UI elements
41 | - **[Huggingface](https://huggingface.co/sentence-transformers)**, **[OpenAI](https://platform.openai.com)**, and **[Cohere](https://cohere.com)** for vector embedding creation
42 |
43 | Some inspiration was taken from this [tiangolo/full-stack-fastapi-template](https://github.com/tiangolo/full-stack-fastapi-template)
44 | and turned into a SPA application instead of a separate front-end server approach.
45 |
46 | ### General Project Structure
47 |
48 | ```
49 | /backend
50 | /arxivsearch
51 | /api
52 | /routes
53 | papers.py # primary paper search logic lives here
54 | /db
55 | load.py # seeds Redis DB
56 | utils.py # index and filter helpers
57 | /schema
58 | # pydantic models for serialization/validation from API
59 | /tests
60 | /utils
61 | config.py
62 | spa.py # logic for serving compiled react project
63 | main.py # entrypoint
64 | /frontend
65 | /public
66 | # index, manifest, logos, etc.
67 | /src
68 | /config
69 | /styles
70 | /views
71 | # primary components live here
72 |
73 | api.ts # logic for connecting with BE
74 | App.tsx # project entry
75 | Routes.tsx # route definitions
76 | ...
77 | /data
78 | # folder mounted as volume in Docker
79 | # load script auto populates initial data from S3
80 |
81 | ```
82 |
83 | ### Embedding Providers
84 | Embeddings represent the semantic properties of the raw text and enable vector similarity search. This application supports `HuggingFace`, `OpenAI`, and `Cohere` embeddings out of the box.
85 |
86 | | Provider | Embedding Model | Required? |
87 | | ------------- |-------------| ----- |
88 | | HuggingFace | `sentence-transformers/all-mpnet-base-v2` | Yes |
89 | | OpenAI | `text-embedding-ada-002` | Yes |
90 | | Cohere | `embed-multilingual-v3.0` | Yes |
91 |
92 | **Interested in a different embedding provider?** Feel free to open a PR and make a suggested addition.
93 |
94 | **Want to use a different model than the one listed?** Set the following environment variables in your `.env` file (see below) to change:
95 |
96 | - `SENTENCE_TRANSFORMER_MODEL`
97 | - `OPENAI_EMBEDDING_MODEL`
98 | - `COHERE_EMBEDDING_MODEL`
99 |
100 |
101 | ## 🚀 Running the App
102 | 1. Before running the app, install [Docker Desktop](https://www.docker.com/products/docker-desktop/).
103 | 2. Clone (and optionally fork) this Github repo to your machine.
104 | ```bash
105 | $ git clone https://github.com/redis-developer/redis-arxiv-search
106 | ```
107 | 3. Make a copy of the `.env.template` file:
108 | ```bash
109 | $ cd redis-arxiv-search/
110 | $ cp .env.template .env
111 | ```
112 | - Add your `OPENAI_API_KEY` to the `.env` file. **Need one?** [Get an API key](https://platform.openai.com)
113 | - Add your `COHERE_API_KEY` to the `.env` file. **Need one?** [Get an API key](https://cohere.ai)
114 |
115 | ### Run locally with Redis 8 CE
116 | ```bash
117 | make deploy
118 | ```
119 |
120 |
121 | ## Customizing (optional)
122 |
123 | ### Run local redis with Docker
124 | ```bash
125 | docker run -d --name redis -p 6379:6379 -p 8001:8001 redis:8.0-M03
126 | ```
127 |
128 | ### FastApi with poetry
129 | To run the backend locally
130 |
131 | 1. `cd backend`
132 | 2. `poetry install`
133 | 3. `poetry run start-app`
134 |
135 | *`poetry run start-app` runs the initial db load script and launches the API*
136 |
137 | ### React Dev Environment
138 | It's typically easier to build front end in an interactive environment, testing changes in realtime.
139 |
140 | 1. Deploy the app using steps above.
141 | 2. Install packages
142 | ```bash
143 | $ cd frontend/
144 | $ npm install
145 | ```
146 | 3. Use `npm` to serve the application from your machine
147 | ```bash
148 | $ npm run start
149 | ```
150 | 4. Navigate to `http://localhost:3000` in a browser.
151 |
152 | All changes to your frontend code will be reflected in your display in semi realtime.
153 |
154 |
155 | ### Troubleshooting
156 | Every once in a while you need to clear out some Docker cached artifacts. Run `docker system prune`, restart Docker Desktop, and try again.
157 |
158 | This project is maintained by Redis on a good faith basis. Please, open an issue here on GitHub and we will try to be responsive to these.
159 |
--------------------------------------------------------------------------------
/backend/.dockerignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__
3 | app.egg-info
4 | *.pyc
5 | .mypy_cache
6 | .coverage
7 | htmlcov
8 | .venv
9 |
--------------------------------------------------------------------------------
/backend/arxivsearch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/arxivsearch/__init__.py
--------------------------------------------------------------------------------
/backend/arxivsearch/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/arxivsearch/api/__init__.py
--------------------------------------------------------------------------------
/backend/arxivsearch/api/main.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter
2 |
3 | from arxivsearch.api.routes import papers
4 |
5 | api_router = APIRouter()
6 | api_router.include_router(papers.router, prefix="/papers", tags=["papers"])
7 |
--------------------------------------------------------------------------------
/backend/arxivsearch/api/routes/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/arxivsearch/api/routes/__init__.py
--------------------------------------------------------------------------------
/backend/arxivsearch/api/routes/papers.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import logging
3 |
4 | import numpy as np
5 | from fastapi import APIRouter, Depends, Query
6 | from redisvl.index import AsyncSearchIndex
7 | from redisvl.query import CountQuery, FilterQuery, VectorQuery
8 |
9 | from arxivsearch import config
10 | from arxivsearch.db import utils
11 | from arxivsearch.schema.models import (
12 | PaperSimilarityRequest,
13 | SearchResponse,
14 | UserTextSimilarityRequest,
15 | VectorSearchResponse,
16 | )
17 | from arxivsearch.utils.embeddings import Embeddings
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 |
22 | # Initialize the API router
23 | router = APIRouter()
24 |
25 | embeddings = Embeddings()
26 |
27 |
@router.get("/", response_model=SearchResponse)
async def get_papers(
    index: AsyncSearchIndex = Depends(utils.get_async_index),
    limit: int = Query(default=20, description="Maximum number of papers to return."),
    skip: int = Query(
        default=0, description="Number of papers to skip for pagination."
    ),
    years: str = Query(
        default="", description="Comma-separated string of years to filter papers."
    ),
    categories: str = Query(
        default="", description="Comma-separated string of categories to filter papers."
    ),
):
    """List papers one page at a time, optionally narrowed by year and category.

    Args:
        index (AsyncSearchIndex): Search index supplied by the
            ``utils.get_async_index`` dependency.
        limit (int, optional): Maximum number of papers to return.
            Defaults to 20.
        skip (int, optional): Number of papers to skip for pagination.
            Defaults to 0.
        years (str, optional): Comma-separated years to filter on.
            Defaults to "".
        categories (str, optional): Comma-separated categories to filter on.
            Defaults to "".

    Returns:
        SearchResponse: The requested page of papers plus the total number
        of papers matching the filter.
    """
    # Turn the comma-separated query strings into lists for the filter builder
    year_values = years.split(",")
    category_values = categories.split(",")
    selection = utils.build_filter_expression(year_values, category_values)

    # One query for the page itself, one for the overall match count
    page_query = FilterQuery(return_fields=[], filter_expression=selection)
    page_query.paging(skip, limit)
    tally_query = CountQuery(selection)

    # The two lookups are independent, so issue them concurrently
    total_count, result_papers = await asyncio.gather(
        index.query(tally_query), index.query(page_query)
    )
    return SearchResponse(total=total_count, papers=result_papers)
70 |
71 |
@router.post("/vector_search/by_paper", response_model=VectorSearchResponse)
async def find_papers_by_paper(
    similarity_request: PaperSimilarityRequest,
    index: AsyncSearchIndex = Depends(utils.get_async_index),
):
    """
    Find and return papers similar to a given paper based on vector
    similarity.

    Args:
        similarity_request (PaperSimilarityRequest): Request body holding
            paper_id, provider, number_of_results, years, and categories
            for filtering.
        index (AsyncSearchIndex): Search index supplied by the
            ``utils.get_async_index`` dependency.

    Returns:
        VectorSearchResponse: Pydantic model with paper content.

    Raises:
        HTTPException: 404 when the paper cannot be fetched or has no stored
            vector for the requested provider.
    """
    # Local import keeps this handler self-contained
    from fastapi import HTTPException

    provider_field = similarity_request.provider.value

    # Fetch the stored paper; bail out with a 404 instead of crashing with a
    # TypeError/KeyError (HTTP 500) when the id or vector field is missing
    paper = await index.fetch(similarity_request.paper_id)
    if not paper or provider_field not in paper:
        raise HTTPException(
            status_code=404,
            detail=(
                f"No '{provider_field}' vector found for paper "
                f"'{similarity_request.paper_id}'"
            ),
        )

    # Stored vectors are raw bytes; reinterpret them as a float32 array
    paper_vector = np.frombuffer(paper[provider_field], dtype=np.float32)

    # Build filter expression
    filter_expression = utils.build_filter_expression(
        similarity_request.years, similarity_request.categories
    )
    # Create queries
    paper_similarity_query = VectorQuery(
        vector=paper_vector,
        vector_field_name=provider_field,
        num_results=similarity_request.number_of_results,
        return_fields=config.RETURN_FIELDS,
        filter_expression=filter_expression,
    )
    count_query = CountQuery(filter_expression)
    # Execute both searches concurrently
    total_count, result_papers = await asyncio.gather(
        index.query(count_query), index.query(paper_similarity_query)
    )
    return VectorSearchResponse(total=total_count, papers=result_papers)
113 |
114 |
@router.post("/vector_search/by_text", response_model=VectorSearchResponse)
async def find_papers_by_text(
    similarity_request: UserTextSimilarityRequest,
    index: AsyncSearchIndex = Depends(utils.get_async_index),
):
    """
    Search for papers whose stored vectors are closest to the embedding of
    free-form user text.

    Args:
        similarity_request (UserTextSimilarityRequest): Request body holding
            user_text, provider, number_of_results, years, and categories
            for filtering.
        index (AsyncSearchIndex): Search index supplied by the
            ``utils.get_async_index`` dependency.

    Returns:
        VectorSearchResponse: Pydantic model with paper content.
    """
    provider_name = similarity_request.provider.value

    # The same year/category filter applies to both the count and the search
    selection = utils.build_filter_expression(
        similarity_request.years, similarity_request.categories
    )
    tally_query = CountQuery(selection)

    # Embed the user's text with the requested provider
    text_vector = await embeddings.get(
        provider=provider_name, text=similarity_request.user_text
    )

    nearest_query = VectorQuery(
        vector=text_vector,
        vector_field_name=provider_name,
        num_results=similarity_request.number_of_results,
        return_fields=config.RETURN_FIELDS,
        filter_expression=selection,
    )

    # Count and similarity lookups are independent, so run them together
    total_count, result_papers = await asyncio.gather(
        index.query(tally_query), index.query(nearest_query)
    )
    return VectorSearchResponse(total=total_count, papers=result_papers)
155 |
--------------------------------------------------------------------------------
/backend/arxivsearch/config.py:
--------------------------------------------------------------------------------
"""Application settings, read from environment variables with defaults."""
import os

# Project Details
PROJECT_NAME = "arxivsearch"
API_DOCS = "/api/docs"
OPENAPI_DOCS = "/api/openapi.json"
API_V1_STR = "/api/v1"

# Configuration
DEFAULT_DATASET = os.environ.get("DEFAULT_DATASET", "arxiv-papers-1000.json")
S3_DATA_URL = "https://arxiv-search.s3.us-east-2.amazonaws.com/arxiv-papers-1000.json"
DATA_LOCATION = os.environ.get("DATA_LOCATION", "../data")
DEPLOYMENT_ENV = os.environ.get("DEPLOYMENT", "dev")
# Environment values arrive as strings; normalise to int so the setting has
# the same type whether or not the variable is set
WRITE_CONCURRENCY = int(os.environ.get("WRITE_CONCURRENCY", 150))
# Fields requested from Redis for vector search results
RETURN_FIELDS = [
    "paper_id",
    "authors",
    "categories",
    "year",
    "title",
    "vector_distance",
]

# Redis
REDIS_HOST = os.environ.get("REDIS_HOST", "localhost")
# Same normalisation as WRITE_CONCURRENCY: always an int
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD")
# Only embed credentials in the URL when a password is configured
if REDIS_PASSWORD:
    REDIS_URL = f"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}"
else:
    REDIS_URL = f"redis://{REDIS_HOST}:{REDIS_PORT}"


# Model Providers
DEFAULT_PROVIDER = "huggingface"
SENTENCE_TRANSFORMER_MODEL = os.environ.get(
    "SENTENCE_TRANSFORMER_MODEL", "sentence-transformers/all-mpnet-base-v2"
)
OPENAI_EMBEDDING_MODEL = os.environ.get(
    "OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"
)
COHERE_EMBEDDING_MODEL = os.environ.get(
    "COHERE_EMBEDDING_MODEL", "embed-multilingual-v3.0"
)
45 |
--------------------------------------------------------------------------------
/backend/arxivsearch/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/arxivsearch/db/__init__.py
--------------------------------------------------------------------------------
/backend/arxivsearch/db/load.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | import logging
4 | import os
5 | from typing import Any, Dict, List
6 |
7 | import numpy as np
8 | import requests
9 | from redisvl.index import AsyncSearchIndex
10 |
11 | from arxivsearch import config
12 | from arxivsearch.db.utils import get_async_index, get_schema
13 | from arxivsearch.schema.models import Provider
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
def read_from_s3(path):
    """Download the papers dataset from ``config.S3_DATA_URL``.

    The payload is also written to ``path`` when the ``config.DATA_LOCATION``
    directory exists; otherwise the write is skipped with a warning.

    Args:
        path: File path the downloaded JSON is written to.

    Returns:
        The parsed JSON payload.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    # (connect, read) timeouts so startup cannot hang on an unreachable host
    res = requests.get(config.S3_DATA_URL, timeout=(10, 60))
    # Surface 4xx/5xx responses instead of trying to parse an error page
    res.raise_for_status()
    data = res.json()

    if os.path.isdir(config.DATA_LOCATION):
        logger.info(f"Writing s3 file to {path}")
        with open(path, "w") as f:
            json.dump(data, f)
    else:
        logger.warning(
            f"Data directory {config.DATA_LOCATION} not found. Skipping write of S3 data"
        )
    return data
31 |
32 |
def read_paper_json() -> List[Dict[str, Any]]:
    """
    Load JSON array of arXiv papers and embeddings.

    The dataset is read from ``config.DATA_LOCATION``/``config.DEFAULT_DATASET``.
    If that file is missing, unreadable, or not valid JSON, the data is
    fetched through ``read_from_s3`` instead.

    Returns:
        The parsed dataset.
    """
    logger.info("Loading papers dataset from disk")
    path = os.path.join(config.DATA_LOCATION, config.DEFAULT_DATASET)
    try:
        with open(path, "r") as f:
            data = json.load(f)
    # OSError: missing/unreadable file; ValueError: malformed JSON
    # (json.JSONDecodeError is a ValueError subclass). A bare ``except`` would
    # also swallow KeyboardInterrupt and SystemExit.
    except (OSError, ValueError):
        logger.info(f"Failed to read {path} => getting from s3")
        data = read_from_s3(path)

    return data
47 |
48 |
async def write_async(index: AsyncSearchIndex, papers: list):
    """
    Bulk-load arXiv paper records into the given Redis index.

    Args:
        index: Async search index the records are loaded into.
        papers: Paper dicts; each is expected to carry an ``id``, a
            comma-separated ``categories`` string, and one vector per
            ``Provider`` member.
    """

    async def preprocess_paper(paper: dict) -> dict:
        # Serialise every provider's vector to raw float32 bytes
        for provider_vector in Provider:
            as_float32 = np.array(paper[provider_vector], dtype=np.float32)
            paper[provider_vector] = as_float32.tobytes()
        # Store the identifier under "paper_id" rather than "id"
        paper["paper_id"] = paper.pop("id")
        # Swap comma separators for "|" in the categories string
        paper["categories"] = paper["categories"].replace(",", "|")
        return paper

    logger.info("Loading papers dataset to Redis")

    await index.load(
        data=papers,
        preprocess=preprocess_paper,
        concurrency=int(config.WRITE_CONCURRENCY),
        id_field="id",
    )

    logger.info("All papers loaded")
73 |
74 |
async def load_data():
    """Ensure the index exists and holds data, then wait for indexing to finish.

    If the index already exists and a wildcard search returns documents, the
    load is skipped. Otherwise the index is (re)created and the papers
    dataset is written to it.

    Raises:
        Exception: Any error raised while creating the index or loading the
            dataset is logged and re-raised.
    """
    # Load schema specs and get the shared index handle
    index = await get_async_index()

    # Load dataset and create index
    try:
        # Check if index exists and already contains documents
        if await index.exists() and len((await index.search("*")).docs) > 0:
            # if running local and not seeing logger logs make sure index isn't already created
            logger.info("Index and data already exists, skipping load")
        else:
            logger.info("Creating new index")
            await index.create(overwrite=True)
            papers = read_paper_json()
            await write_async(index=index, papers=papers)
    except Exception:
        logger.exception(
            "An exception occurred while trying to load the index and dataset"
        )
        raise

    # Wait for any indexing to finish
    while True:
        info = await index.info()
        percent_indexed = info["percent_indexed"]
        # Compare numerically: the value may be reported as "1", "1.0" or a
        # number, and an exact string match on "1" would then never succeed
        if float(percent_indexed) >= 1:
            logger.info("Indexing is complete!")
            break
        logger.info(f"{percent_indexed} indexed...")
        await asyncio.sleep(5)
104 |
105 |
106 | if __name__ == "__main__":
107 | asyncio.run(load_data())
108 |
--------------------------------------------------------------------------------
/backend/arxivsearch/db/schema/index.yaml:
--------------------------------------------------------------------------------
# Search index schema for the arXiv papers collection.
version: '0.1.0'

# Index identity and storage layout
index:
  name: arxiv
  prefix: paper
  storage_type: hash

fields:
  # Filterable tag fields; multiple values are separated by '|'
  - name: categories
    type: tag
    attrs:
      separator: '|'
  - name: year
    type: tag
    attrs:
      separator: '|'
  # One vector field per embedding provider.
  # NOTE(review): dims presumably must match the output size of the
  # provider's configured embedding model - confirm when changing models.
  - name: huggingface
    type: vector
    attrs:
      dims: 768
      type: float32
      algorithm: hnsw
      distance_metric: cosine
  - name: openai
    type: vector
    attrs:
      dims: 1536
      type: float32
      algorithm: hnsw
      distance_metric: cosine
  - name: cohere
    type: vector
    attrs:
      dims: 1024
      type: float32
      algorithm: hnsw
      distance_metric: cosine
38 |
--------------------------------------------------------------------------------
/backend/arxivsearch/db/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from typing import List
4 |
5 | from redisvl.index import AsyncSearchIndex
6 | from redisvl.query.filter import FilterExpression, Tag
7 | from redisvl.schema import IndexSchema
8 |
9 | from arxivsearch import config
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | # global search index
15 | _global_index = None
16 |
17 |
def get_schema() -> IndexSchema:
    """Build the index schema from ``schema/index.yaml`` beside this module."""
    module_dir = os.path.dirname(os.path.realpath(__file__))
    schema_file = os.path.join(module_dir + "/schema", "index.yaml")
    return IndexSchema.from_yaml(schema_file)
22 |
23 |
async def get_async_index():
    """Return the module-wide AsyncSearchIndex, creating it on first use."""
    global _global_index
    # Reuse the cached instance when one has already been built
    if _global_index:
        return _global_index
    _global_index = AsyncSearchIndex(get_schema(), redis_url=config.REDIS_URL)
    return _global_index
29 |
30 |
def build_filter_expression(
    years: List[str], categories: List[str]
) -> FilterExpression:
    """
    Combine year and category tag filters into a single expression.

    Falsy entries (such as empty strings) are dropped from both lists and
    the remaining values are converted to ``str`` before filtering.

    Args:
        years (list): Years (integers or strings) to match on the ``year``
            tag. An empty list means there's no filter applied based on
            years.
        categories (list): Category strings to match on the ``categories``
            tag. An empty list means there's no filter applied based on
            categories.

    Returns:
        FilterExpression: The year filter AND-ed with the category filter.
    """
    wanted_years = [str(year) for year in years if year]
    wanted_categories = [str(category) for category in categories if category]

    by_year = Tag("year") == wanted_years
    by_category = Tag("categories") == wanted_categories
    return by_year & by_category
54 |
--------------------------------------------------------------------------------
/backend/arxivsearch/main.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from contextlib import asynccontextmanager
3 | from pathlib import Path
4 |
5 | import uvicorn
6 | from fastapi import FastAPI
7 | from fastapi.staticfiles import StaticFiles
8 | from starlette.middleware.cors import CORSMiddleware
9 |
10 | from arxivsearch import config
11 | from arxivsearch.api.main import api_router
12 | from arxivsearch.db.utils import get_async_index
13 | from arxivsearch.spa import SinglePageApplication
14 |
15 | logging.basicConfig(
16 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
17 | )
18 |
19 |
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Keep the shared search index open for the lifetime of the app."""
    # Entering the index's async context on startup; it exits on shutdown
    async with await get_async_index():
        yield
25 |
26 |
# FastAPI application with docs/OpenAPI served under /api
app = FastAPI(
    title=config.PROJECT_NAME,
    docs_url=config.API_DOCS,
    openapi_url=config.OPENAPI_DOCS,
    lifespan=lifespan,
)

# allow_origins is a sequence of origins: pass a list rather than a bare
# string, which only worked because `"*" in "*"` happens to be true
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Routers
app.include_router(
    api_router,
    prefix=config.API_V1_STR,
)

# static image files (directory is resolved relative to the working directory)
app.mount("/data", StaticFiles(directory="data"), name="data")

# Mount the built React GUI last so the API and /data routes take precedence
# over the catch-all "/" path.
current_file = Path(__file__)
project_root = current_file.parent.resolve()
gui_build_dir = project_root / "templates" / "build"
app.mount(path="/", app=SinglePageApplication(directory=gui_build_dir), name="SPA")
56 |
57 |
def main():
    """Start uvicorn serving ``arxivsearch.main:app`` on 0.0.0.0:8888.

    Outside of the "prod" deployment environment the server runs a single
    worker with auto-reload. In "prod" it runs two workers without reload and
    passes the TLS key/certificate paths to uvicorn.
    """
    is_prod = config.DEPLOYMENT_ENV == "prod"

    uvicorn_options = {
        "host": "0.0.0.0",
        "port": 8888,
        "reload": not is_prod,
        "workers": 2 if is_prod else 1,
    }
    if is_prod:
        uvicorn_options["ssl_keyfile"] = "app/backend/arxivsearch/key.pem"
        uvicorn_options["ssl_certfile"] = "app/backend/arxivsearch/full.pem"

    uvicorn.run("arxivsearch.main:app", **uvicorn_options)


if __name__ == "__main__":
    main()
75 |
--------------------------------------------------------------------------------
/backend/arxivsearch/schema/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/arxivsearch/schema/__init__.py
--------------------------------------------------------------------------------
/backend/arxivsearch/schema/models.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 | from pydantic import BaseModel
4 |
5 |
class Provider(str, Enum):
    """Embedding model provider"""

    # Each member's value is the plain string used to name the provider
    # (the str mixin makes members compare equal to those strings).
    huggingface = "huggingface"
    openai = "openai"
    cohere = "cohere"
12 |
13 |
class BaseRequest(BaseModel):
    """Fields shared by the similarity-search request models."""

    # Category strings to filter on.
    categories: list[str]
    # Years (as strings) to filter on.
    years: list[str]
    # Embedding provider to use for the search.
    provider: Provider
    # Number of results requested; defaults to 15.
    number_of_results: int = 15
    # Search type name; defaults to "KNN". NOTE(review): accepted values are
    # not visible here - confirm against the route handlers.
    search_type: str = "KNN"
20 |
21 |
class PaperSimilarityRequest(BaseRequest):
    """Similarity-search request that identifies a paper by its id."""

    paper_id: str
24 |
25 |
class UserTextSimilarityRequest(BaseRequest):
    """Similarity-search request that carries free text entered by the user."""

    user_text: str
28 |
29 |
class Paper(BaseModel):
    """Metadata of a single paper; every field is a string."""

    paper_id: str
    authors: str
    categories: str
    year: str
    title: str
    # Optional; defaults to an empty string when absent.
    abstract: str = ""
37 |
38 |
class BaseSearchPaper(Paper):
    """Paper metadata plus one embedding field per provider."""

    # vector embeddings (one field per provider, declared as str)
    huggingface: str
    openai: str
    cohere: str
44 |
45 |
class VectorSearchPaper(Paper):
    """Paper returned by a vector search, with its distance and a derived score."""

    vector_distance: float
    # Derived in __init__ as ``1 - vector_distance``; any value supplied by the
    # caller is overwritten whenever a distance is present.
    similarity_score: float

    def __init__(self, *args, **kwargs):
        # Only derive the score when a distance was supplied. When it is
        # missing, fall through so that model validation reports the missing
        # required fields instead of a bare KeyError escaping from here.
        if "vector_distance" in kwargs:
            kwargs["similarity_score"] = 1 - float(kwargs["vector_distance"])
        super().__init__(*args, **kwargs)
53 |
54 |
class SearchResponse(BaseModel):
    """Response body: a total count and a list of papers with embedding fields."""

    total: int
    papers: list[BaseSearchPaper]
58 |
59 |
class VectorSearchResponse(BaseModel):
    """Response body: a total count and a list of vector-search papers."""

    total: int
    papers: list[VectorSearchPaper]
63 |
--------------------------------------------------------------------------------
/backend/arxivsearch/spa.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Tuple
3 |
4 | from fastapi.staticfiles import StaticFiles
5 |
6 |
class SinglePageApplication(StaticFiles):
    """Static-file application for a single-page app.

    Serves files from ``directory`` and falls back to the index document
    whenever the requested path cannot be found, so it can be mounted next to
    the backend API routers of the same FastAPI application.
    """

    def __init__(self, directory: os.PathLike, index="index.html") -> None:
        # Name of the document served when a requested path does not exist.
        self.index = index

        # html=True makes the base class resolve the index document even when
        # only the base path is requested.
        super().__init__(directory=directory, packages=None, html=True, check_dir=True)

    async def get_response(self, path: str, scope):
        """Return the response for ``path``; on a 404 answer, retry with "."."""
        response = await super().get_response(path, scope)
        if response.status_code != 404:
            return response
        return await super().get_response(".", scope)

    def lookup_path(self, path: str) -> Tuple[str, os.stat_result | None]:
        """Resolve ``path``; when no file is found, resolve the index instead."""
        full_path, stat_result = super().lookup_path(path)
        if stat_result is not None:
            return (full_path, stat_result)

        # the file cannot be found: hand back the index document
        return super().lookup_path(self.index)
35 |
--------------------------------------------------------------------------------
/backend/arxivsearch/templates/build/build.txt:
--------------------------------------------------------------------------------
1 | this is where SPA app goes once built
--------------------------------------------------------------------------------
/backend/arxivsearch/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/arxivsearch/tests/__init__.py
--------------------------------------------------------------------------------
/backend/arxivsearch/tests/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/arxivsearch/tests/api/__init__.py
--------------------------------------------------------------------------------
/backend/arxivsearch/tests/api/routes/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/arxivsearch/tests/api/routes/__init__.py
--------------------------------------------------------------------------------
/backend/arxivsearch/tests/api/routes/test_papers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from httpx import AsyncClient
3 |
4 | from arxivsearch.schema.models import PaperSimilarityRequest, UserTextSimilarityRequest
5 |
6 |
@pytest.fixture(scope="module")
def years(test_data):
    """Year value of the first paper in the loaded test data."""
    first_paper = test_data[0]
    return first_paper["year"]
10 |
11 |
@pytest.fixture(scope="module")
def categories(test_data):
    """Categories value of the first paper in the loaded test data."""
    first_paper = test_data[0]
    return first_paper["categories"]
15 |
16 |
@pytest.fixture(scope="module")
def bad_req_json():
    """JSON body that matches none of the request models."""
    invalid_body = {"not": "valid"}
    return invalid_body
20 |
21 |
@pytest.fixture(scope="module")
def text_req(years, categories):
    """Text-similarity request filtered on the first test paper's year and categories."""
    request_fields = {
        "categories": [categories],
        "years": [years],
        "provider": "huggingface",
        "user_text": "deep learning",
    }
    return UserTextSimilarityRequest(**request_fields)
30 |
31 |
@pytest.fixture(scope="module")
def paper_req(test_data):
    """Paper-similarity request for the first test paper, with no filters."""
    request_fields = {
        "categories": [],
        "years": [],
        "provider": "huggingface",
        "paper_id": test_data[0]["paper_id"],
    }
    return PaperSimilarityRequest(**request_fields)
40 |
41 |
@pytest.mark.asyncio(scope="session")
async def test_root_w_filters(
    async_client: AsyncClient, years: str, categories: str
) -> None:
    """Listing papers with year and category filters returns the one matching paper."""
    query = f"papers/?limit=1&years={years}&categories={categories}"
    response = await async_client.get(query)
    assert response.status_code == 200

    payload = response.json()
    assert payload["total"] == 1

    papers = payload["papers"]
    assert len(papers) == 1
    assert papers[0]["categories"] == categories
    assert papers[0]["year"] == years
58 |
59 |
@pytest.mark.asyncio(scope="session")
async def test_root_na_category(async_client: AsyncClient, years: str):
    """Listing papers with the category filter "NA" yields an empty result."""
    query = f"papers/?limit=1&years={years}&categories=NA"
    response = await async_client.get(query)
    assert response.status_code == 200

    payload = response.json()
    assert payload["total"] == 0
    assert len(payload["papers"]) == 0
69 |
70 |
@pytest.mark.asyncio(scope="session")
async def test_vector_by_text(
    async_client: AsyncClient,
    years: str,
    categories: str,
    text_req: UserTextSimilarityRequest,
):
    """Vector search by text with filters returns the one matching paper."""
    # Plain string: the path has no placeholders, so no f-string is needed.
    response = await async_client.post(
        "papers/vector_search/by_text", json=text_req.model_dump()
    )

    assert response.status_code == 200
    content = response.json()

    assert content["total"] == 1
    assert len(content["papers"]) == 1
    assert content["papers"][0]["categories"] == categories
    assert content["papers"][0]["year"] == years
89 |
90 |
@pytest.mark.asyncio(scope="session")
async def test_vector_by_text_bad_input(async_client: AsyncClient, bad_req_json: dict):
    """A body that does not match the request model is rejected with HTTP 422."""
    # Plain string: the path has no placeholders, so no f-string is needed.
    response = await async_client.post(
        "papers/vector_search/by_text", json=bad_req_json
    )

    assert response.status_code == 422
99 |
100 |
@pytest.mark.asyncio(scope="session")
async def test_vector_by_paper(
    async_client: AsyncClient,
    paper_req: PaperSimilarityRequest,
):
    """Vector search by paper id without filters returns two papers."""
    # Plain string: the path has no placeholders, so no f-string is needed.
    response = await async_client.post(
        "papers/vector_search/by_paper", json=paper_req.model_dump()
    )

    assert response.status_code == 200
    content = response.json()

    assert content["total"] == 2
    assert len(content["papers"]) == 2
115 |
116 |
@pytest.mark.asyncio(scope="session")
async def test_vector_by_paper_bad_input(async_client: AsyncClient, bad_req_json: dict):
    """A body that does not match the request model is rejected with HTTP 422."""
    # Plain string: the path has no placeholders, so no f-string is needed.
    response = await async_client.post(
        "papers/vector_search/by_paper", json=bad_req_json
    )

    assert response.status_code == 422
125 |
--------------------------------------------------------------------------------
/backend/arxivsearch/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | import httpx
5 | import numpy as np
6 | import pytest
7 | import pytest_asyncio
8 | from asgi_lifespan import LifespanManager
9 | from httpx import AsyncClient
10 | from redisvl.index import SearchIndex
11 |
12 | from arxivsearch import config
13 | from arxivsearch.db.utils import get_async_index, get_schema
14 | from arxivsearch.main import app
15 |
16 |
@pytest.fixture(scope="session")
def index():
    """Session-wide search index built from the project schema.

    The index is created before the first test that needs it and the
    connection is dropped again once the session finishes.
    """
    search_index = SearchIndex(schema=get_schema(), redis_url=config.REDIS_URL)
    search_index.create()
    yield search_index
    search_index.disconnect()
23 |
24 |
@pytest.fixture(scope="session", autouse=True)
def test_data(index):
    """Load the sample papers into the index and return them.

    The papers are read from ``test_vectors.json`` located next to this file,
    so the fixture no longer depends on the directory pytest is started from.
    Each provider's embedding list is converted to float32 bytes before the
    papers are loaded into ``index`` keyed by ``paper_id``.

    Returns:
        list: the paper dicts, with the embedding fields already as bytes.
    """
    vectors_path = os.path.join(os.path.dirname(__file__), "test_vectors.json")
    with open(vectors_path, "r") as f:
        papers = json.load(f)

    # convert every provider's embedding to bytes
    for paper in papers:
        for provider_field in ("huggingface", "openai", "cohere"):
            paper[provider_field] = np.array(
                paper[provider_field], dtype=np.float32
            ).tobytes()

    _ = index.load(data=papers, id_field="paper_id")
    return papers
41 |
42 |
@pytest_asyncio.fixture(scope="session")
async def async_client():
    """Session-wide HTTP client that talks to the app in-process.

    The app is wrapped in ``LifespanManager`` for the duration of the session
    (presumably so that its lifespan handler runs - confirm against the
    asgi-lifespan docs). Requests go through an ASGI transport with the base
    URL ``http://test/api/v1/``.
    """
    # The manager object itself is not needed, so it is not bound to a name.
    async with LifespanManager(app=app):
        async with AsyncClient(
            transport=httpx.ASGITransport(app=app), base_url="http://test/api/v1/"  # type: ignore
        ) as client:
            yield client
50 |
--------------------------------------------------------------------------------
/backend/arxivsearch/tests/db/test_load.py:
--------------------------------------------------------------------------------
1 | # import pytest
2 | from unittest.mock import mock_open, patch
3 |
4 | from arxivsearch.db.load import read_paper_json
5 |
6 |
# The paper file can be opened locally
@patch("arxivsearch.db.load.os.path.join")
@patch(
    "arxivsearch.db.load.open",
    new_callable=mock_open,
    read_data='[{"id": "1234", "title": "Test Paper"}]',
)
@patch("arxivsearch.db.load.json.load")
def test_read_paper_json_local(mock_json_load, mock_file_open, mock_path_join):
    """read_paper_json opens the joined path once and returns the parsed JSON."""
    expected_papers = [{"id": "1234", "title": "Test Paper"}]
    mock_path_join.return_value = "dummy_path"
    mock_json_load.return_value = expected_papers

    papers = read_paper_json()

    mock_file_open.assert_called_once_with("dummy_path", "r")
    mock_json_load.assert_called_once()
    assert papers == [{"id": "1234", "title": "Test Paper"}]
24 |
25 |
# The paper file has to be fetched from S3
@patch("arxivsearch.db.load.os.path.join")
@patch("arxivsearch.db.load.read_from_s3")
@patch("arxivsearch.db.load.json.load", side_effect=Exception("File not found"))
def test_read_paper_json_s3(
    mock_json_load,
    mock_read_from_s3,
    mock_path_join,
):
    """read_paper_json falls back to read_from_s3 with the joined path."""
    s3_papers = [{"id": "5678", "title": "Test Paper from S3"}]
    mock_path_join.return_value = "dummy_path"
    mock_read_from_s3.return_value = s3_papers

    papers = read_paper_json()

    # exactly one call, made with the joined path
    mock_read_from_s3.assert_called_once_with("dummy_path")

    assert papers == [{"id": "5678", "title": "Test Paper from S3"}]
44 |
--------------------------------------------------------------------------------
/backend/arxivsearch/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/arxivsearch/utils/__init__.py
--------------------------------------------------------------------------------
/backend/arxivsearch/utils/embeddings.py:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 |
4 | from redisvl.utils.vectorize import (
5 | CohereTextVectorizer,
6 | HFTextVectorizer,
7 | OpenAITextVectorizer,
8 | )
9 |
10 | from arxivsearch import config
11 | from arxivsearch.schema.models import Provider
12 |
13 |
def preprocess_text(text: str) -> str:
    """Normalize free text before it is embedded.

    Steps, in order: drop non-ASCII characters, replace punctuation with
    spaces, collapse whitespace runs, replace newlines with spaces, insert a
    space before every capital letter, collapse whitespace again, lowercase,
    and strip leading/trailing whitespace.

    Args:
        text (str): Raw input text. A falsy value (empty string or None)
            yields an empty string.

    Returns:
        str: The normalized, lowercase text.
    """
    if not text:
        return ""
    # remove non-ASCII characters
    text = text.encode("ascii", "ignore").decode()

    # replace punctuation with spaces
    text = re.sub("[%s]" % re.escape(string.punctuation), " ", text)

    # clean up the spacing (raw string: "\s" is not a valid str escape)
    text = re.sub(r"\s{2,}", " ", text)

    # remove any single newlines left over
    text = text.replace("\n", " ")

    # split on capitalized words: a space is inserted before each capital
    text = " ".join(re.split("(?=[A-Z])", text))

    # clean up the spacing again
    text = re.sub(r"\s{2,}", " ", text)

    # make all words lowercase
    text = text.lower()

    return text.strip()
39 |
40 |
class Embeddings:
    """Create text embeddings with one of the supported providers.

    One vectorizer per provider is built at construction time, each with the
    model name taken from the config module.
    """

    def __init__(self):
        self.oai_vectorizer = OpenAITextVectorizer(model=config.OPENAI_EMBEDDING_MODEL)
        self.co_vectorizer = CohereTextVectorizer(model=config.COHERE_EMBEDDING_MODEL)
        self.hf_vectorizer = HFTextVectorizer(model=config.SENTENCE_TRANSFORMER_MODEL)

    async def get(self, provider: str, text: str):
        """
        Create embeddings from input text.

        The text is passed through ``preprocess_text`` by the vectorizer
        (via its ``preprocess`` argument) before it is embedded.

        Args:
            provider (str): Specified provider to use; one of the
                ``Provider`` values.
            text (str): Text to embed.

        Returns:
            Whatever the selected vectorizer's embed call returns.

        Raises:
            ValueError: If ``provider`` is not a supported provider. This
                used to fall through and return ``None`` silently.
        """
        if provider == Provider.huggingface.value:
            # Use HuggingFace Sentence Transformer
            return self.hf_vectorizer.embed(text, preprocess=preprocess_text)
        if provider == Provider.openai.value:
            # Use OpenAI Embeddings API (the only awaited call)
            return await self.oai_vectorizer.aembed(text, preprocess=preprocess_text)
        if provider == Provider.cohere.value:
            # Use Cohere, embedding the text as a search query
            return self.co_vectorizer.embed(
                text, input_type="search_query", preprocess=preprocess_text
            )
        raise ValueError(f"Unsupported embedding provider: {provider!r}")
66 |
--------------------------------------------------------------------------------
/backend/data/redis-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/backend/data/redis-logo.png
--------------------------------------------------------------------------------
/backend/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "arxivsearch"
3 | version = "0.2.0"
4 | description = "Reference architecture for vector search application with Redis"
5 | authors = ["Robert Shelton "]
6 | readme = "README.md"
7 | package-mode = false
8 |
9 | [tool.poetry.dependencies]
10 | python = ">=3.11,<3.14"
11 | fastapi = "^0.111.0"
12 | uvicorn = "^0.30.1"
13 | ipython = "^8.26.0"
14 | numpy = "^1.26.4"
15 | redisvl = "^0.4.1"
16 | cohere = "^5.5.8"
17 | openai = "^1.35.9"
18 | sentence-transformers = "^3.0.1"
19 | asgi-lifespan = "^2.1.0"
20 |
21 | [tool.poetry.group.dev.dependencies]
22 | mypy = "1.9.0"
23 | black = ">=20.8b1"
24 | pylint = "3.1.0"
25 | isort = ">=5.6.4"
26 | pytest-cov = "5.0.0"
27 | pytest-asyncio = "^0.23.7"
28 | pytest = "^8.2.2"
29 | httpx = "0.27.2"
30 | types-redis = "*"
31 | types-pyyaml = "*"
32 | types-tabulate = "*"
33 | anyio = {extras = ["trio"], version = "^4.4.0"}
34 |
[tool.poetry.scripts]
start = "arxivsearch.main:main"
start-app = "scripts:start_app"
# scripts.py defines the loader as `load_data`; it has no `load` function
load = "scripts:load_data"
format = "scripts:format"
check-format = "scripts:check_format"
sort-imports = "scripts:sort_imports"
check-sort-imports = "scripts:check_sort_imports"
check-lint = "scripts:check_lint"
mypy = "scripts:mypy"
test = "scripts:test"
test-cov = "scripts:test_cov"
cov = "scripts:cov"
48 |
49 | [build-system]
50 | requires = ["poetry-core"]
51 | build-backend = "poetry.core.masonry.api"
52 |
53 | [tool.coverage.html]
54 | directory = "htmlcov"
55 |
56 | [tool.mypy]
57 | warn_unused_configs = true
58 | ignore_missing_imports = true
59 |
--------------------------------------------------------------------------------
/backend/scripts.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 |
3 |
def load_data():
    """Run the ``arxivsearch.db.load`` module; raises if it exits non-zero."""
    command = ["python", "-m", "arxivsearch.db.load"]
    subprocess.run(command, check=True)
6 |
7 |
def start_app():
    """Run the data loader module, then the application module.

    Each module runs in its own subprocess; a non-zero exit of either one
    raises and stops the sequence.
    """
    # load data first, then start the app
    for module in ("arxivsearch.db.load", "arxivsearch.main"):
        subprocess.run(["python", "-m", module], check=True)
13 |
14 |
def format():
    """Sort imports with isort (black profile), then run black on the package."""
    commands = (
        ["isort", "./arxivsearch", "./tests/", "--profile", "black"],
        ["black", "./arxivsearch"],
    )
    for command in commands:
        subprocess.run(command, check=True)
20 |
21 |
def check_format():
    """Run black in ``--check`` mode on the package; raises on a non-zero exit."""
    command = ["black", "--check", "./arxivsearch"]
    subprocess.run(command, check=True)
24 |
25 |
def sort_imports():
    """Sort imports in place with isort using the black profile."""
    command = ["isort", "./arxivsearch", "./tests/", "--profile", "black"]
    subprocess.run(command, check=True)
30 |
31 |
def check_sort_imports():
    """Run isort in ``--check-only`` mode (black profile) on the package."""
    command = ["isort", "./arxivsearch", "--check-only", "--profile", "black"]
    subprocess.run(command, check=True)
36 |
37 |
def check_lint():
    """Run pylint on the package with the ``.pylintrc`` configuration file."""
    command = ["pylint", "--rcfile=.pylintrc", "./arxivsearch"]
    subprocess.run(command, check=True)
40 |
41 |
def mypy():
    """Type-check the package by running mypy as a module."""
    command = ["python", "-m", "mypy", "./arxivsearch"]
    subprocess.run(command, check=True)
44 |
45 |
def test():
    """Run pytest on the ``arxivsearch`` directory with log level CRITICAL."""
    command = ["python", "-m", "pytest", "arxivsearch", "--log-level=CRITICAL"]
    subprocess.run(command, check=True)
50 |
51 |
def test_cov():
    """Run pytest verbosely with coverage of the package, writing an XML report."""
    pytest_args = [
        "-vv",
        "--cov=./arxivsearch",
        "--cov-report=xml",
        "--log-level=CRITICAL",
    ]
    subprocess.run(["python", "-m", "pytest", *pytest_args], check=True)
65 |
66 |
def cov():
    """Generate the HTML coverage report and print where to find it."""
    command = ["coverage", "html"]
    subprocess.run(command, check=True)
    print("If data was present, coverage report is in ./htmlcov/index.html")
70 |
--------------------------------------------------------------------------------
/docker-local-redis.yml:
--------------------------------------------------------------------------------
# Local stack: the backend image built from ./Dockerfile plus a Redis container.
version: '3.9'

services:
  backend:
    build:
      context: "."
      dockerfile: Dockerfile
    env_file:
      - .env
    expose:
      - "8888"
    ports:
      - "8888:8888"
    volumes:
      - ./data:/app/data
    depends_on:
      redis:
        # Start the backend only once the redis healthcheck below passes,
        # not merely once the redis container has been started.
        condition: service_healthy
  redis:
    image: redis:8.0-M03
    ports:
      - "6379:6379"
      # NOTE(review): nothing visible here listens on 8001 - confirm it is needed.
      - "8001:8001"
    healthcheck:
      test: ["CMD", "redis-cli", "-h", "localhost", "-p", "6379", "ping"]
      interval: 2s
      timeout: 1m30s
      retries: 5
      start_period: 5s
29 |
--------------------------------------------------------------------------------
/frontend/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 |
3 | # dependencies
4 | /node_modules
5 | /.pnp
6 | .pnp.js
7 |
8 | # testing
9 | /coverage
10 |
11 | # production
12 | /build
13 |
14 | # misc
15 | */.nvm
16 | .DS_Store
17 | .env.local
18 | .env.development.local
19 | .env.test.local
20 | .env.production.local
21 |
22 | npm-debug.log*
23 | yarn-debug.log*
24 | yarn-error.log*
25 |
--------------------------------------------------------------------------------
/frontend/README.md:
--------------------------------------------------------------------------------
1 | # Getting Started with Create React App
2 |
3 | This project was bootstrapped with [Create React App](https://github.com/facebook/create-react-app).
4 |
5 | ## Available Scripts
6 |
7 | In the project directory, you can run:
8 |
9 | ### `yarn start`
10 |
11 | Runs the app in the development mode.\
12 | Open [http://localhost:3000](http://localhost:3000) to view it in the browser.
13 |
14 | The page will reload if you make edits.\
15 | You will also see any lint errors in the console.
16 |
17 | ### `yarn test`
18 |
19 | Launches the test runner in the interactive watch mode.\
20 | See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information.
21 |
22 | ### `yarn build`
23 |
24 | Builds the app for production to the `build` folder.\
25 | It correctly bundles React in production mode and optimizes the build for the best performance.
26 |
27 | The build is minified and the filenames include the hashes.\
28 | Your app is ready to be deployed!
29 |
30 | See the section about [deployment](https://facebook.github.io/create-react-app/docs/deployment) for more information.
31 |
32 | ### `yarn eject`
33 |
34 | **Note: this is a one-way operation. Once you `eject`, you can’t go back!**
35 |
36 | If you aren’t satisfied with the build tool and configuration choices, you can `eject` at any time. This command will remove the single build dependency from your project.
37 |
38 | Instead, it will copy all the configuration files and the transitive dependencies (webpack, Babel, ESLint, etc) right into your project so you have full control over them. All of the commands except `eject` will still work, but they will point to the copied scripts so you can tweak them. At this point you’re on your own.
39 |
40 | You don’t have to ever use `eject`. The curated feature set is suitable for small and middle deployments, and you shouldn’t feel obligated to use this feature. However we understand that this tool wouldn’t be useful if you couldn’t customize it when you are ready for it.
41 |
42 | ## Learn More
43 |
44 | You can learn more in the [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started).
45 |
46 | To learn React, check out the [React documentation](https://reactjs.org/).
47 |
--------------------------------------------------------------------------------
/frontend/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "redis-arXiv-search",
3 | "version": "0.1.0",
4 | "private": true,
5 | "dependencies": {
6 | "@emotion/react": "^11.13.0",
7 | "@emotion/styled": "^11.13.0",
8 | "@mui/material": "^5.16.4",
9 | "@testing-library/jest-dom": "^5.17.0",
10 | "@testing-library/react": "^13.4.0",
11 | "@testing-library/user-event": "^13.5.0",
12 | "@types/jest": "^27.5.2",
13 | "@types/node": "^16.18.103",
14 | "@types/react": "^18.3.3",
15 | "@types/react-dom": "^18.3.0",
16 | "react": "^18.3.1",
17 | "react-dom": "^18.3.1",
18 | "react-router-dom": "^6.25.1",
19 | "react-scripts": "5.0.1",
20 | "typescript": "^4.9.5",
21 | "web-vitals": "^2.1.4"
22 | },
23 | "scripts": {
24 | "start": "react-scripts start",
25 | "build": "react-scripts build",
26 | "test": "react-scripts test",
27 | "eject": "react-scripts eject"
28 | },
29 | "eslintConfig": {
30 | "extends": [
31 | "react-app",
32 | "react-app/jest"
33 | ]
34 | },
35 | "browserslist": {
36 | "production": [
37 | ">0.2%",
38 | "not dead",
39 | "not op_mini all"
40 | ],
41 | "development": [
42 | "last 1 chrome version",
43 | "last 1 firefox version",
44 | "last 1 safari version"
45 | ]
46 | },
47 | "proxy": "http://localhost:8888"
48 | }
--------------------------------------------------------------------------------
/frontend/public/Redis_Mark_Red.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | ]>
13 |
16 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
37 |
38 |
--------------------------------------------------------------------------------
/frontend/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/redis-arXiv-search/281e3923032b481b9037e905417357be2f309e98/frontend/public/favicon.ico
--------------------------------------------------------------------------------
/frontend/public/github-mark-white.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/frontend/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
20 |
21 |
30 | Redis Arxiv Search
31 |
32 |
33 |
34 | You need to enable JavaScript to run this app.
35 |
36 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/frontend/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "short_name": "React App",
3 | "name": "Create React App Sample",
4 | "icons": [
5 | {
6 | "src": "favicon.ico",
7 | "sizes": "64x64 32x32 24x24 16x16",
8 | "type": "image/x-icon"
9 | },
10 | {
11 | "src": "Redis_Mark_Red.svg",
12 | "type": "image/svg+xml"
13 | }
14 | ],
15 | "start_url": ".",
16 | "display": "standalone",
17 | "theme_color": "#000000",
18 | "background_color": "#ffffff"
19 | }
--------------------------------------------------------------------------------
/frontend/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 |
--------------------------------------------------------------------------------
/frontend/public/site.webmanifest:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Redis Vector Similarity Search Demo",
3 | "short_name": "Redis VSS",
4 | "icons": [
5 | {
6 | "src": "/android-chrome-192x192.png",
7 | "sizes": "192x192",
8 | "type": "image/png"
9 | },
10 | {
11 | "src": "/android-chrome-512x512.png",
12 | "sizes": "512x512",
13 | "type": "image/png"
14 | }
15 | ],
16 | "theme_color": "#ffffff",
17 | "background_color": "#ffffff",
18 | "display": "standalone"
19 | }
20 |
--------------------------------------------------------------------------------
/frontend/public/x-logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/frontend/src/App.css:
--------------------------------------------------------------------------------
1 | html,
2 | body {
3 | font-family: 'Space Grotesk', sans-serif;
4 | }
5 |
6 | .App {
7 | text-align: center;
8 | }
9 |
10 | .App-logo {
11 | height: 40vmin;
12 | pointer-events: none;
13 | }
14 |
15 | @media (prefers-reduced-motion: no-preference) {
16 | .App-logo {
17 | animation: App-logo-spin infinite 20s linear;
18 | }
19 | }
20 |
21 | .App-header {
22 | background-color: #282c34;
23 | min-height: 100vh;
24 | display: flex;
25 | flex-direction: column;
26 | align-items: center;
27 | justify-content: center;
28 | font-size: calc(10px + 2vmin);
29 | color: white;
30 | }
31 |
32 | .App-link {
33 | color: #61dafb;
34 | }
35 |
36 | @keyframes App-logo-spin {
37 | from {
38 | transform: rotate(0deg);
39 | }
40 |
41 | to {
42 | transform: rotate(360deg);
43 | }
44 | }
--------------------------------------------------------------------------------
/frontend/src/App.test.tsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { render, screen } from '@testing-library/react';
3 | import App from './App';
4 |
5 | test('renders learn react link', () => {
6 | render( );
7 | const linkElement = screen.getByText(/learn react/i);
8 | expect(linkElement).toBeInTheDocument();
9 | });
10 |
--------------------------------------------------------------------------------
/frontend/src/App.tsx:
--------------------------------------------------------------------------------
1 | import { FC } from 'react';
2 | import { AppRoutes } from './Routes';
3 | import './App.css';
4 |
5 |
6 | const App: FC = () => {
7 | return ;
8 | }
9 |
10 |
11 | export default App;
12 |
--------------------------------------------------------------------------------
/frontend/src/Layout.tsx:
--------------------------------------------------------------------------------
1 | import { FC } from 'react';
2 | import { Header } from './views/Header';
3 | import { Home } from './views/Home';
4 | import { Footer } from './views/Footer';
5 |
6 | export const Layout: FC = () => {
7 | return (
8 | <>
9 |
10 |
11 |
12 | >
13 | );
14 | };
15 |
16 | export default Layout;
--------------------------------------------------------------------------------
/frontend/src/Routes.tsx:
--------------------------------------------------------------------------------
1 | import { FC } from 'react';
2 | import { BrowserRouter as Router, Route, Routes } from 'react-router-dom';
3 | import Layout from './Layout';
4 |
5 | export const AppRoutes: FC = () => {
6 |
7 | return (
8 |
9 |
10 | } />
11 |
12 |
13 | );
14 | };
15 |
--------------------------------------------------------------------------------
/frontend/src/api.ts:
--------------------------------------------------------------------------------
1 | import { MASTER_URL } from './config';
2 |
/**
 * Send a JSON request to the backend and return the parsed JSON response.
 *
 * @param url    Request URL.
 * @param method HTTP method, e.g. 'GET' or 'POST'.
 * @param body   Optional payload; it is JSON-serialized (an omitted body
 *               serializes to `undefined`, i.e. no request body).
 * @returns The parsed JSON body of a successful response.
 * @throws Error('Internal server error') for any 5xx status.
 * @throws The response's `detail` field (or the whole parsed body when there
 *         is no truthy `detail`) for any 4xx status.
 */
export const fetchFromBackend = async (url: string, method: string, body?: any) => {
  const request = new Request(url, {
    method,
    body: JSON.stringify(body),
    headers: {
      'Content-Type': 'application/json'
    },
  });

  const response = await fetch(request);

  // Every server error is reported the same way, not only status 500:
  // other 5xx answers used to be parsed and returned as if they had succeeded.
  if (response.status >= 500) {
    throw new Error('Internal server error');
  }
  if (response.status === 401 || response.status === 403) {
    // redirect to home page
    window.location.href = "/";
  }

  const data = await response.json();

  // `>= 400` so that a plain 400 Bad Request is raised as an error too
  // (the previous `> 400` check let it through as a successful result).
  if (response.status >= 400) {
    if (data.detail) {
      throw data.detail;
    }
    throw data;
  }

  return data;
}
33 |
/**
 * Get papers from Redis through the FastAPI backend.
 *
 * @param limit      Maximum number of papers to return.
 * @param skip       Number of papers to skip.
 * @param years      Years to filter on; omitted from the query when empty.
 * @param categories Categories to filter on; omitted from the query when empty.
 */
export const getPapers = async (limit = 15, skip = 0, years: string[] = [], categories: string[] = []) => {
  // Build the query string once and append each filter only when it is set.
  // The result is the same URL the former four-way branch produced.
  let params = `?limit=${limit}&skip=${skip}`;
  if (years.length) {
    params += `&years=${years.join()}`;
  }
  if (categories.length) {
    params += `&categories=${categories.join()}`;
  }
  return fetchFromBackend(`${MASTER_URL}${params}`, 'GET');
}
50 |
51 |
/**
 * Ask the backend for papers similar to the given paper.
 *
 * POSTs to `vector_search/by_paper` under MASTER_URL.
 *
 * @param paper_id   Id of the reference paper.
 * @param years      Years to filter on.
 * @param categories Categories to filter on.
 * @param provider   Embedding provider name.
 * @param search     Search type, sent as `search_type`.
 * @param limit      Number of results, sent as `number_of_results`.
 */
export const getSemanticallySimilarPapers = async (
  paper_id: string,
  years: string[],
  categories: string[],
  provider: string,
  search = 'KNN',
  limit = 15
) => {
  // The leftover debug console.log of the paper id has been removed.
  const body = {
    paper_id,
    provider,
    search_type: search,
    number_of_results: limit,
    years,
    categories,
  };

  const url = MASTER_URL + "vector_search/by_paper";
  return fetchFromBackend(url, 'POST', body);
};
74 |
75 |
/**
 * Ask the backend for papers similar to a free-text query.
 *
 * POSTs to `vector_search/by_text` under MASTER_URL.
 *
 * @param text       Query text, sent as `user_text`.
 * @param years      Year filter forwarded in the request body.
 * @param categories Category filter forwarded in the request body.
 * @param provider   Embedding provider name forwarded in the request body.
 * @param search     Sent as `search_type` (default 'KNN').
 * @param limit      Sent as `number_of_results` (default 15).
 * @returns Whatever `fetchFromBackend` resolves to for the POST request.
 */
export const getSemanticallySimilarPapersbyText = async (
  text: string,
  years: string[],
  categories: string[],
  provider: string,
  search = 'KNN',
  limit = 15
) => {
  // The request body is intentionally not logged: it contains the user's query text.
  const body = {
    user_text: text,
    provider,
    search_type: search,
    number_of_results: limit,
    years,
    categories
  };

  const url = MASTER_URL + "vector_search/by_text";
  return fetchFromBackend(url, 'POST', body);
};
98 |
--------------------------------------------------------------------------------
/frontend/src/config/index.tsx:
--------------------------------------------------------------------------------
1 | export const BASE_URL: string = '';
2 | export const MASTER_URL: string = '/api/v1/papers/';
3 | export const EMAIL = "applied.ai@redis.com"
4 |
--------------------------------------------------------------------------------
/frontend/src/index.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
5 | sans-serif;
6 | -webkit-font-smoothing: antialiased;
7 | -moz-osx-font-smoothing: grayscale;
8 | }
9 |
10 | code {
11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
12 | monospace;
13 | }
14 |
--------------------------------------------------------------------------------
/frontend/src/index.tsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ReactDOM from 'react-dom/client';
3 | import './index.css';
4 | import App from './App';
5 | import reportWebVitals from './reportWebVitals';
6 |
7 | const root = ReactDOM.createRoot(
8 | document.getElementById('root') as HTMLElement
9 | );
10 | root.render(
11 |
12 |
13 |
14 | );
15 |
16 | // If you want to start measuring performance in your app, pass a function
17 | // to log results (for example: reportWebVitals(console.log))
18 | // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
19 | reportWebVitals();
20 |
--------------------------------------------------------------------------------
/frontend/src/logo.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/frontend/src/react-app-env.d.ts:
--------------------------------------------------------------------------------
1 | ///
2 |
--------------------------------------------------------------------------------
/frontend/src/reportWebVitals copy.ts:
--------------------------------------------------------------------------------
1 | import { ReportHandler } from 'web-vitals';
2 |
/**
 * Report web-vitals metrics to the given handler.
 *
 * Does nothing unless `onPerfEntry` is a function. Otherwise the `web-vitals`
 * module is loaded on demand and the handler is registered with each of its
 * CLS, FID, FCP, LCP and TTFB collectors, in that order.
 */
const reportWebVitals = (onPerfEntry?: ReportHandler) => {
  // Guard clause: without a callable handler there is nothing to report to.
  if (!onPerfEntry || !(onPerfEntry instanceof Function)) {
    return;
  }

  import('web-vitals').then((vitals) => {
    vitals.getCLS(onPerfEntry);
    vitals.getFID(onPerfEntry);
    vitals.getFCP(onPerfEntry);
    vitals.getLCP(onPerfEntry);
    vitals.getTTFB(onPerfEntry);
  });
};
14 |
15 | export default reportWebVitals;
16 |
--------------------------------------------------------------------------------
/frontend/src/reportWebVitals.ts:
--------------------------------------------------------------------------------
1 | import { ReportHandler } from 'web-vitals';
2 |
/**
 * Report web-vitals metrics to the given handler.
 *
 * Does nothing unless `onPerfEntry` is a function. Otherwise the `web-vitals`
 * module is loaded on demand and the handler is registered with each of its
 * CLS, FID, FCP, LCP and TTFB collectors, in that order.
 */
const reportWebVitals = (onPerfEntry?: ReportHandler) => {
  // Guard clause: without a callable handler there is nothing to report to.
  if (!onPerfEntry || !(onPerfEntry instanceof Function)) {
    return;
  }

  import('web-vitals').then((vitals) => {
    vitals.getCLS(onPerfEntry);
    vitals.getFID(onPerfEntry);
    vitals.getFCP(onPerfEntry);
    vitals.getLCP(onPerfEntry);
    vitals.getTTFB(onPerfEntry);
  });
};
14 |
15 | export default reportWebVitals;
16 |
--------------------------------------------------------------------------------
/frontend/src/setupTests.ts:
--------------------------------------------------------------------------------
1 | // jest-dom adds custom jest matchers for asserting on DOM nodes.
2 | // allows you to do things like:
3 | // expect(element).toHaveTextContent(/react/i)
4 | // learn more: https://github.com/testing-library/jest-dom
5 | import '@testing-library/jest-dom';
6 |
--------------------------------------------------------------------------------
/frontend/src/styles/Card.css:
--------------------------------------------------------------------------------
1 | .card {
2 | display: flex;
3 | flex-direction: column;
4 | justify-content: space-around;
5 | width: 26rem;
6 | margin: 1rem 0;
7 | padding: 1rem;
8 | border: 1px solid black;
9 | border-radius: 0.5rem;
10 | }
11 |
12 | .card-top {
13 | padding-top: 1rem;
14 | }
15 |
16 | .card-btns-space {
17 | width: 0.5rem;
18 | display: inline-block;
19 | }
20 |
21 | .card-btn {
22 | background: #f0f0f0;
23 | padding: 0.5rem;
24 | border: none;
25 | font: inherit;
26 | border-radius: 0.25rem;
27 | text-decoration: none;
28 | }
--------------------------------------------------------------------------------
/frontend/src/styles/Footer.css:
--------------------------------------------------------------------------------
1 | .footer {
2 | background: #f0f0f0;
3 | padding: 2rem;
4 | text-align: center;
5 | }
--------------------------------------------------------------------------------
/frontend/src/styles/Header.css:
--------------------------------------------------------------------------------
1 | .header {
2 | background: #091a23;
3 | display: flex;
4 | justify-content: space-between;
5 | align-items: center;
6 | padding: 2.25rem;
7 | }
8 |
9 | .header-logo {
10 | height: 2rem;
11 | }
12 |
13 | .header-icon-link {
14 | height: 1.5rem;
15 | width: 1.5rem;
16 | margin-right: 1rem;
17 | }
18 |
19 | .header-cta {
20 | background: #dcff1e;
21 | padding: 10px;
22 | border-radius: 2px;
23 | color: black;
24 | text-decoration: none;
25 | }
26 |
27 | .cta-nav {
28 | display: flex;
29 | align-items: center;
30 | }
--------------------------------------------------------------------------------
/frontend/src/styles/Home.css:
--------------------------------------------------------------------------------
1 | .home-padding {
2 | padding: 0 25px;
3 | }
4 |
5 | .home-heading {
6 | text-align: center;
7 | }
8 |
9 | .home-options {
10 | padding: 2rem 5%;
11 | }
12 |
13 | .home-filters {
14 | padding: 1rem 0 3rem 0;
15 | }
16 |
17 | .home-search-results {
18 | padding: 0 0 0 5%;
19 | }
20 |
21 | .home-cards {
22 | display: flex;
23 | flex-wrap: wrap;
24 | justify-content: space-between;
25 | padding: 0 5%;
26 | }
27 |
28 | .home-loader {
29 | display: flex;
30 | padding: 2rem 5%;
31 | }
--------------------------------------------------------------------------------
/frontend/src/views/Card.tsx:
--------------------------------------------------------------------------------
1 | /* eslint-disable jsx-a11y/anchor-is-valid */
2 | import { getSemanticallySimilarPapers } from "../api"
3 | import Tooltip from '@mui/material/Tooltip';
4 | import '../styles/Card.css'
5 |
6 | interface Props {
7 | paperId: string;
8 | numPapers: number;
9 | title: string;
10 | authors: string;
11 | paperCat: string;
12 | paperYear: number;
13 | categories: string[];
14 | years: string[];
15 | provider: string;
16 | similarity_score: number;
17 | setState: (state: any) => void;
18 | setTotal: (state: any) => void;
19 | }
20 |
21 | export const Card = (props: Props) => {
22 | const querySemanticallySimilarPapers = async () => {
23 | try {
24 | const results = await getSemanticallySimilarPapers(
25 | props.paperId,
26 | props.years,
27 | props.categories,
28 | props.provider,
29 | "KNN",
30 | props.numPapers);
31 | props.setState(results.papers)
32 | props.setTotal(results.total)
33 | } catch (err) {
34 | console.log(String(err));
35 | }
36 | };
37 |
38 | return (
39 |
40 |
41 | {props.title}
42 |
43 |
44 |
Authors: {props.authors}
45 |
Categories: {props.paperCat.replaceAll("|", ", ")}
46 |
Year: {props.paperYear}
47 |
48 | {props.similarity_score ? (
Vector search similarity score: {props.similarity_score.toFixed(2)}
) : <>>}
49 |
50 |
51 |
52 |
53 | querySemanticallySimilarPapers()}
57 | >
58 | More Like This
59 |
60 |
61 |
62 |
63 | Download
67 |
68 |
69 |
70 |
71 | );
72 | };
--------------------------------------------------------------------------------
/frontend/src/views/Footer.tsx:
--------------------------------------------------------------------------------
1 | /* eslint-disable jsx-a11y/anchor-is-valid */
2 | import { EMAIL } from '../config'
3 | import '../styles/Footer.css';
4 |
5 | export const Footer = () => {
6 | return (
7 |
8 |
9 |
12 |
29 |
contact: {EMAIL}
30 |
31 |
32 | );
33 | };
--------------------------------------------------------------------------------
/frontend/src/views/Header.tsx:
--------------------------------------------------------------------------------
1 | import { BASE_URL, EMAIL } from "../config";
2 | import Tooltip from '@mui/material/Tooltip';
3 | import '../styles/Header.css';
4 |
5 | /* eslint-disable jsx-a11y/anchor-is-valid */
6 | export const Header = () => {
7 | return (
8 |
9 |
10 |
14 |
15 |
40 |
41 |
42 | );
43 | };
44 |
--------------------------------------------------------------------------------
/frontend/src/views/Home.tsx:
--------------------------------------------------------------------------------
1 | import { useState, useEffect } from 'react';
2 | import { getPapers, getSemanticallySimilarPapersbyText } from '../api';
3 | import { Card } from "./Card"
4 | import Box from '@mui/material/Box';
5 | import TextField from '@mui/material/TextField';
6 |
7 |
8 | import OutlinedInput from '@mui/material/OutlinedInput';
9 | import InputLabel from '@mui/material/InputLabel';
10 | import MenuItem from '@mui/material/MenuItem';
11 | import FormControl from '@mui/material/FormControl';
12 | import Radio from '@mui/material/Radio';
13 | import RadioGroup from '@mui/material/RadioGroup';
14 | import FormControlLabel from '@mui/material/FormControlLabel';
15 | import ListItemText from '@mui/material/ListItemText';
16 | import Select, { SelectChangeEvent } from '@mui/material/Select';
17 | import Checkbox from '@mui/material/Checkbox';
18 | import Tooltip from '@mui/material/Tooltip';
19 | import CircularProgress from '@mui/material/CircularProgress';
20 |
21 | import '../styles/Home.css';
22 |
23 | /* eslint-disable jsx-a11y/anchor-is-valid */
24 | /* eslint-disable @typescript-eslint/no-unused-vars */
25 |
26 | interface Props { }
27 |
28 | export const Home = (props: Props) => {
29 | const [error, setError] = useState('');
30 | const [skip, setSkip] = useState(0);
31 | const [limit, setLimit] = useState(15);
32 | const [papers, setPapers] = useState([]);
33 | const [years, setYears] = useState([]);
34 | const [categories, setCategories] = useState([]);
35 | const [provider, setProvider] = useState('huggingface');
36 | const [searchState, setSearchState] = useState('');
37 | const [loading, setLoadingState] = useState(false);
38 | const [total, setTotal] = useState(0);
39 |
40 | const ITEM_HEIGHT = 48;
41 | const ITEM_PADDING_TOP = 8;
42 | const MenuProps = {
43 | PaperProps: {
44 | style: {
45 | maxHeight: ITEM_HEIGHT * 4.5 + ITEM_PADDING_TOP,
46 | width: 150,
47 | },
48 | },
49 | };
50 |
51 | const yearOptions = [
52 | '2022',
53 | '2021',
54 | '2020',
55 | '2019',
56 | '2018',
57 | '2017',
58 | '2016',
59 | '2015',
60 | '2014',
61 | '2013',
62 | '2012',
63 | '2011'
64 | ];
65 |
66 | const categoryOptions = [
67 | 'cs.LG',
68 | 'math-ph',
69 | 'quant-ph',
70 | 'cond-mat.mes-hall',
71 | 'hep-ph',
72 | 'hep-th',
73 | 'gr-qc',
74 | 'cond-mat.mtrl-sci',
75 | 'cond-mat.str-el',
76 | 'cond-mat.stat-mech',
77 | 'astro-ph.CO',
78 | 'math.MP',
79 | 'astro-ph.HE',
80 | 'physics.optics',
81 | 'astro-ph.GA'
82 | ]
83 |
84 | function EmbeddingModelOptions() {
85 | const handleChange = (event: React.ChangeEvent) => {
86 | setProvider((event.target as HTMLInputElement).value);
87 | };
88 | return (
89 |
90 |
97 | } label="all-mpnet-base-v2 (huggingface)" />
98 | } label="text-embedding-ada-002 (openai)" />
99 | } label="embed-multilingual-v3.0 (cohere)" />
100 |
101 |
102 | );
103 | }
104 |
105 | function YearOptions() {
106 | const handleChange = (event: SelectChangeEvent) => {
107 | const {
108 | target: { value },
109 | } = event;
110 | setSkip(0);
111 | setYears(
112 | // On autofill we get a stringified value.
113 | typeof value === 'string' ? value.split(',') : value,
114 | )
115 | };
116 | return (
117 |
118 | Year
119 | }
126 | renderValue={(selected) => selected.join(', ')}
127 | MenuProps={MenuProps}
128 | >
129 | {yearOptions.map((year) => (
130 |
131 | -1} />
132 |
133 |
134 | ))}
135 |
136 |
137 | );
138 | }
139 |
140 | function CategoryOptions() {
141 | const handleChange = (event: SelectChangeEvent) => {
142 | const {
143 | target: { value },
144 | } = event;
145 | setCategories(
146 | // On autofill we get a stringified value.
147 | typeof value === 'string' ? value.split(',') : value,
148 | );
149 | setSkip(0);
150 | };
151 | return (
152 |
153 | Category
154 | }
161 | renderValue={(selected) => selected.join(', ')}
162 | MenuProps={MenuProps}
163 | >
164 | {categoryOptions.map((cat) => (
165 |
166 | -1} />
167 |
168 |
169 | ))}
170 |
171 |
172 | );
173 | }
174 |
175 | const handleSearchChange = async (newValue: string) => {
176 | setLoadingState(true);
177 | setSearchState(newValue);
178 | }
179 |
// Load papers into component state: a vector search when a search string is
// set, otherwise a plain filtered listing. Failures are stored via setError.
const queryPapers = async () => {
  try {
    if (searchState) {
      const result = await getSemanticallySimilarPapersbyText(searchState, years, categories, provider)
      setPapers(result.papers)
      setTotal(result.total)
    } else {
      // NOTE(review): this schedules skip + limit for the next render while the
      // request below still uses the current `skip` — confirm this paging is intended.
      setSkip(skip + limit);
      const result = await getPapers(limit, skip, years, categories);
      setPapers(result.papers)
      setTotal(result.total)
    }
  } catch (err) {
    setError(String(err));
  } finally {
    // Clear the loading flag on failure too; otherwise a rejected request
    // leaves `loading` true and the results list stays hidden.
    setLoadingState(false);
  }
};
198 |
199 | useEffect(() => {
200 | queryPapers();
201 | }, [categories])
202 |
203 | useEffect(() => {
204 | queryPapers();
205 | }, [years])
206 |
207 | return (
208 | <>
209 |
210 |
211 |
212 |
213 |
arXiv Paper Search
214 |
215 | Search for scholarly papers on arXiv using natural language queries and filters, or use the "more like this" button to find semantically similar papers.
216 |
217 |
218 |
219 |
220 |
221 |
Embedding model
222 |
223 |
224 |
225 |
Filters
226 |
227 |
228 |
229 |
230 |
Vector query
231 |
handleSearchChange(newValue.target.value)}
239 | onKeyDown={() => { queryPapers() }}
240 | />
241 |
242 |
243 |
244 |
245 | {total} searchable arXiv papers
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
Search results
254 | {loading &&
255 |
256 | }
257 |
258 | {!loading && papers && (
259 |
260 | {papers.map((paper) => (
261 |
276 | ))}
277 |
278 | )}
279 |
280 |
281 |
282 | >
283 | );
284 | };
--------------------------------------------------------------------------------
/frontend/src/views/index.ts:
--------------------------------------------------------------------------------
1 | export * from './Home';
2 | export * from './Header';
3 | export * from './Footer';
4 |
--------------------------------------------------------------------------------
/frontend/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "es5",
4 | "lib": [
5 | "dom",
6 | "dom.iterable",
7 | "esnext"
8 | ],
9 | "allowJs": true,
10 | "skipLibCheck": true,
11 | "esModuleInterop": true,
12 | "allowSyntheticDefaultImports": true,
13 | "strict": true,
14 | "forceConsistentCasingInFileNames": true,
15 | "noFallthroughCasesInSwitch": true,
16 | "module": "esnext",
17 | "moduleResolution": "node",
18 | "resolveJsonModule": true,
19 | "isolatedModules": true,
20 | "noEmit": true,
21 | "jsx": "react-jsx"
22 | },
23 | "include": [
24 | "src"
25 | ]
26 | }
27 |
--------------------------------------------------------------------------------
/k8s/README.md:
--------------------------------------------------------------------------------
1 | # Running with Kubernetes
2 |
3 | This demo runs the pre-built Docker containers in a sandbox Kubernetes ([kind](https://kind.sigs.k8s.io/)) cluster on your local machine.
4 |
5 | ## Prerequisites
6 | If you already have a K8s cluster, bypass the setup steps and utilize the K8s manifests and configs however you need.
7 |
8 | Install [kubectl](https://kubernetes.io/docs/reference/kubectl/), [kind](https://kind.sigs.k8s.io/), and [kompose](https://kompose.io/).
9 |
10 | ## Create Cluster with Kind
11 | ```bash
12 | kind create cluster --config=cluster.yaml
13 | kubectl cluster-info --context kind-arxiv-search
14 | ```
15 |
16 | Then apply the ingress:
17 | ```bash
18 | kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
19 | ```
20 |
21 | Apply the resources:
22 | ```bash
23 | kubectl apply -f redis-vector-db.yaml
24 | ```
25 | > Wait about 30 seconds here to make sure Redis is up before applying the backend.
26 |
27 | ```bash
28 | kubectl apply -f backend.yaml
29 | ```
30 |
31 | ## Validate Cluster
32 | ```bash
33 | kubectl get nodes
34 | ```
35 | ```bash
36 | kubectl get pods
37 | ```
38 | Inspect the pod logs (for example with `kubectl logs <pod-name>`) if a pod is not running.
39 | ## Expose Ports and Test App
40 | Port forward the backend service to connect to the app on `localhost:8888`:
41 | ```
42 | kubectl port-forward service/backend 8888:8888
43 | ```
44 |
45 | Then navigate to `http://localhost:8888/`
46 |
47 | ## Cleaning Up
48 |
49 | ```bash
50 | kind delete cluster --name arxiv-search
51 | ```
52 |
53 |
--------------------------------------------------------------------------------
/k8s/backend.yaml:
--------------------------------------------------------------------------------
---
# Backend Deployment: a single replica of the pre-built arXiv search image.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    io.kompose.service: backend
  name: backend
spec:
  replicas: 1
  selector:
    matchLabels:
      io.kompose.service: backend
  template:
    metadata:
      labels:
        io.kompose.service: backend
    spec:
      containers:
        - env:
            - name: DEPLOYMENT
              value: dev
            - name: REDIS_DB
              value: "0"
            # Name of the Redis Service defined in redis-vector-db.yaml.
            - name: REDIS_HOST
              value: redis-vector-db
            # NOTE(review): demo-only credential committed in plain text; it has
            # to match `requirepass` in the redis-config ConfigMap.
            - name: REDIS_PASSWORD
              value: testing123
            - name: REDIS_PORT
              value: "6379"
          image: ghcr.io/redis-developer/redis-arxiv-search:latest
          name: backend
          ports:
            - containerPort: 8888
          resources: {}
      restartPolicy: Always
---
# ClusterIP Service exposing the backend pods on port 8888.
apiVersion: v1
kind: Service
metadata:
  labels:
    io.kompose.service: backend
  name: backend
spec:
  type: ClusterIP
  ports:
    - name: "8888"
      port: 8888
      targetPort: 8888
  selector:
    io.kompose.service: backend
---
# Ingress routing every HTTP path to the backend Service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: ingress
spec:
  rules:
    - http:
        paths:
          - pathType: Prefix
            path: "/"
            backend:
              service:
                name: backend
                port:
                  number: 8888
66 |
67 |
--------------------------------------------------------------------------------
/k8s/cluster.yaml:
--------------------------------------------------------------------------------
---
# kind cluster definition: one control-plane node labelled for ingress, with
# container ports 80/443 mapped to host ports 8080/44300.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
name: arxiv-search
nodes:
  - role: control-plane
    kubeadmConfigPatches:
      # Literal block scalar: this text is passed through as a kubeadm patch.
      - |
        kind: InitConfiguration
        nodeRegistration:
          kubeletExtraArgs:
            node-labels: "ingress-ready=true"
    extraPortMappings:
      - containerPort: 80
        hostPort: 8080
        protocol: TCP
      - containerPort: 443
        hostPort: 44300
        protocol: TCP
--------------------------------------------------------------------------------
/k8s/redis-vector-db.yaml:
--------------------------------------------------------------------------------
---
# Redis Stack Deployment: a single replica started with the config file that
# is mounted from the redis-config ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    io.kompose.service: redis-vector-db
  name: redis-vector-db
spec:
  replicas: 1
  selector:
    matchLabels:
      io.kompose.service: redis-vector-db
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        io.kompose.service: redis-vector-db
    spec:
      containers:
        - name: redis-vector-db
          image: redis/redis-stack:latest
          command:
            - redis-stack-server
            - "/redis-config/redis.conf"
          ports:
            - containerPort: 6379
            - containerPort: 8001
          resources: {}
          volumeMounts:
            - mountPath: /data/
              name: redis-vector-db-pvc
            - mountPath: /redis-config
              name: config
      restartPolicy: Always
      volumes:
        - name: redis-vector-db-pvc
          persistentVolumeClaim:
            claimName: redis-vector-db-pvc
        # Projects the `redis-config` key as /redis-config/redis.conf.
        - name: config
          configMap:
            name: redis-config
            items:
              - key: redis-config
                path: redis.conf
---
# Redis configuration file content.
# NOTE(review): demo-only password committed in plain text; it has to match
# REDIS_PASSWORD in backend.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: redis-config
data:
  redis-config: |
    requirepass testing123
---
# ClusterIP Service exposing ports 6379 and 8001 of the Redis pods.
apiVersion: v1
kind: Service
metadata:
  labels:
    io.kompose.service: redis-vector-db
  name: redis-vector-db
spec:
  type: ClusterIP
  ports:
    - name: "6379"
      port: 6379
      targetPort: 6379
    - name: "8001"
      port: 8001
      targetPort: 8001
  selector:
    io.kompose.service: redis-vector-db
---
# Storage claim mounted at /data/ by the Redis Deployment.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    io.kompose.service: redis-vector-db-pvc
  name: redis-vector-db-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Mi
84 |
--------------------------------------------------------------------------------