├── .github ├── FUNDING.yml ├── dependabot_npm.yml └── dependabot_pip.yml ├── .gitignore ├── Dockerfile.core ├── Dockerfile.crawler ├── Dockerfile.frontend ├── LICENSE ├── README.md ├── assets ├── imgs │ ├── aisearch_question_suggestion.png │ ├── aisearch_result.png │ ├── chat_view.png │ ├── code_view.png │ ├── financial-table-1.png │ ├── financial-table-2.png │ └── search_view.png ├── neosearch.png └── neosearch.webp ├── changelog.md ├── deploy_searxng_with_docker.sh ├── docker-compose.yaml ├── neosearch ├── .env.template ├── .gitignore ├── .python-version ├── README.md ├── __init__.py ├── api │ ├── __init__.py │ └── routers │ │ ├── __init__.py │ │ ├── chat.py │ │ ├── health_check.py │ │ ├── query.py │ │ └── search.py ├── app │ ├── __init__.py │ ├── rag.py │ ├── server.py │ └── worker_broker.py ├── config.py ├── config.yaml ├── constants │ ├── __init__.py │ ├── bedrock.py │ ├── circuitbreaker.py │ ├── embeddings.py │ ├── logging.py │ ├── memory.py │ ├── queue.py │ ├── rag_search.py │ ├── retriever.py │ ├── searxng.py │ └── trace.py ├── datastore │ ├── __init__.py │ ├── crud │ │ ├── __init__.py │ │ ├── chat.py │ │ ├── document.py │ │ ├── message.py │ │ └── vote.py │ ├── database.py │ ├── model │ │ ├── __init__.py │ │ ├── base.py │ │ ├── chat.py │ │ ├── document.py │ │ ├── message.py │ │ └── vote.py │ └── vectorstores │ │ ├── __init__.py │ │ ├── base.py │ │ ├── milvus_vector_stores.py │ │ ├── pg_vector_stores.py │ │ ├── pgrs_vector_stores.py │ │ └── qdrant_vector_stores.py ├── engine │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── deep_research.py │ │ ├── research.py │ │ └── tools │ │ │ ├── __init__.py │ │ │ ├── research_tools.py │ │ │ └── web_search.py │ ├── constants.py │ ├── db_utils.py │ ├── index.py │ ├── loader.py │ ├── prompts │ │ ├── __init__.py │ │ ├── chat.py │ │ ├── crag_workflow.py │ │ ├── deep_research.py │ │ └── search_o1.py │ ├── query_filter.py │ ├── rag_engine │ │ ├── __init__.py │ │ ├── chat_engine.py │ │ └── query_engine.py │ ├── reranker │ │ ├── __init__.py │ │ └── cohere.py │ ├── retriever │ │ ├── __init__.py │ │ ├── base.py │ │ ├── bm25_hybrid.py │ │ ├── doc_hybrid.py │ │ ├── keyword_hybrid.py │ │ ├── paradedb.py │ │ └── searxng.py │ ├── search │ │ ├── __init__.py │ │ └── bing_search.py │ ├── tools │ │ ├── __init__.py │ │ ├── artifact.py │ │ ├── document_generator.py │ │ ├── duckduckgo.py │ │ ├── form_filling.py │ │ ├── img_gen.py │ │ ├── interpreter.py │ │ ├── openapi_action.py │ │ └── weather.py │ ├── utils │ │ ├── __init__.py │ │ ├── chat.py │ │ ├── query.py │ │ ├── rag_search.py │ │ └── searxng.py │ └── workflow │ │ ├── __init__.py │ │ ├── crag.py │ │ └── events │ │ ├── __init__.py │ │ └── crag.py ├── exceptions │ ├── __init__.py │ ├── bedrock.py │ └── engine │ │ ├── __init__.py │ │ └── retriever.py ├── export_requirements_txt.sh ├── infrastructure │ ├── __init__.py │ └── aws │ │ ├── __init__.py │ │ ├── bedrock_adapter.py │ │ └── s3_adapter.py ├── main.py ├── middlewares │ ├── __init__.py │ ├── request_id.py │ └── request_logger.py ├── models │ ├── __init__.py │ ├── chat_models.py │ ├── health_check.py │ └── query_models.py ├── mypy.ini ├── pyproject.toml ├── response │ ├── __init__.py │ └── chat.py ├── search_with_lepton.py ├── services │ ├── __init__.py │ ├── file.py │ └── next_question_suggesion.py ├── settings │ ├── __init__.py │ ├── fastembed.py │ ├── gemini.py │ ├── huggingface.py │ ├── llmhub.py │ ├── mistral.py │ ├── ollama.py │ └── openai.py ├── tests │ └── __init__.py ├── utils │ ├── __init__.py │ ├── configs.py │ ├── events.py │ ├── 
gc_tuning.py │ ├── logging.py │ ├── ratelimitter.py │ ├── ray.py │ └── singleton.py ├── uv.lock └── worker.py ├── neosearch_ai ├── README.md ├── configs │ ├── __init__.py │ ├── app.py │ ├── embedding_param_manager.py │ └── reranker_param_manager.py ├── constants │ ├── __init__.py │ └── logging.py ├── embedding.py ├── engine │ ├── __init__.py │ ├── embeddings.py │ └── flash_reranker.py ├── flashrerank.py ├── main.py ├── pyproject.toml ├── utils │ ├── __init__.py │ ├── logger.py │ └── singleton.py └── uv.lock ├── neosearch_crawler ├── .env.sample ├── .gitignore ├── __init__.py ├── constants │ ├── __init__.py │ ├── crawl_seeds.py │ ├── logger.py │ └── modes.py ├── crawlers │ ├── __init__.py │ ├── base.py │ ├── github.py │ ├── linkedin.py │ ├── medium.py │ └── s3 │ │ ├── __init__.py │ │ └── s3_crawler.py ├── datastore │ ├── __init__.py │ └── database.py ├── dispatchers │ ├── __init__.py │ ├── base.py │ └── lib.py ├── engine │ ├── __init__.py │ ├── agent │ │ ├── __init__.py │ │ ├── base.py │ │ ├── web_corpus_collector.py │ │ └── wikidump_parser.py │ ├── base_dispatcher.py │ └── runner │ │ ├── __init__.py │ │ ├── base.py │ │ └── common_crawl.py ├── exception │ ├── __init__.py │ └── dispatcher.py ├── export_requirements_from_poetry.sh ├── main.py ├── mongo_db │ ├── __init__.py │ ├── documents.py │ ├── mongo.py │ └── mongo_config.py ├── pyproject.toml ├── sample_crawler.py ├── sample_offline_rag_for_web_search_agent.py ├── utils │ ├── __init__.py │ ├── domain_name_utils.py │ ├── errors.py │ ├── logger.py │ ├── pdf_util.py │ ├── singleton.py │ └── trafilatura_util.py └── uv.lock ├── neosearch_frontend ├── .env.example ├── .eslintrc.json ├── .gitignore ├── README.md ├── app │ ├── (auth) │ │ ├── actions.ts │ │ ├── api │ │ │ └── auth │ │ │ │ ├── [...nextauth] │ │ │ │ └── route.ts │ │ │ │ └── guest │ │ │ │ └── route.ts │ │ ├── auth.config.ts │ │ ├── auth.ts │ │ ├── login │ │ │ └── page.tsx │ │ └── register │ │ │ └── page.tsx │ ├── (chat) │ │ ├── actions.ts │ │ ├── api │ │ │ ├── chat │ │ │ │ ├── route.ts │ │ │ │ └── schema.ts │ │ │ ├── document │ │ │ │ └── route.ts │ │ │ ├── files │ │ │ │ └── upload │ │ │ │ │ └── route.ts │ │ │ ├── history │ │ │ │ └── route.ts │ │ │ ├── suggestions │ │ │ │ └── route.ts │ │ │ └── vote │ │ │ │ └── route.ts │ │ ├── chat │ │ │ └── [id] │ │ │ │ └── page.tsx │ │ ├── layout.tsx │ │ ├── opengraph-image.png │ │ ├── page.tsx │ │ └── twitter-image.png │ ├── aisearch │ │ ├── page.tsx │ │ ├── search │ │ │ ├── [id] │ │ │ │ └── page.tsx │ │ │ └── page.tsx │ │ └── share │ │ │ └── [id] │ │ │ └── page.tsx │ ├── api │ │ ├── advanced-search │ │ │ └── route.ts │ │ └── search-chat │ │ │ └── route.ts │ ├── favicon.ico │ ├── globals.css │ └── layout.tsx ├── artifacts │ ├── actions.ts │ ├── code │ │ ├── client.tsx │ │ └── server.ts │ ├── image │ │ ├── client.tsx │ │ └── server.ts │ ├── sheet │ │ ├── client.tsx │ │ └── server.ts │ └── text │ │ ├── client.tsx │ │ └── server.ts ├── biome.jsonc ├── components.json ├── components │ ├── Chat.tsx │ ├── action_search_bar.tsx │ ├── app-sidebar.tsx │ ├── artifact-actions.tsx │ ├── artifact-close-button.tsx │ ├── artifact-messages.tsx │ ├── artifact.tsx │ ├── auth-form.tsx │ ├── block-actions.tsx │ ├── block-close-button.tsx │ ├── block-messages.tsx │ ├── block.tsx │ ├── chat-header.tsx │ ├── code-block.tsx │ ├── code-editor.tsx │ ├── console.tsx │ ├── create-artifact.tsx │ ├── create-block.tsx │ ├── data-stream-handler.tsx │ ├── diffview.tsx │ ├── document-preview.tsx │ ├── document-skeleton.tsx │ ├── document.tsx │ ├── editor.tsx │ ├── greeting.tsx │ 
├── icons.tsx │ ├── image-editor.tsx │ ├── markdown.tsx │ ├── message-actions.tsx │ ├── message-editor.tsx │ ├── message-reasoning.tsx │ ├── message.tsx │ ├── messages.tsx │ ├── model-selector.tsx │ ├── multimodal-input.tsx │ ├── overview.tsx │ ├── preview-attachment.tsx │ ├── sheet-editor.tsx │ ├── sidebar-history-item.tsx │ ├── sidebar-history.tsx │ ├── sidebar-toggle.tsx │ ├── sidebar-user-nav.tsx │ ├── sign-out-form.tsx │ ├── styles │ │ └── colors.ts │ ├── submit-button.tsx │ ├── suggested-actions.tsx │ ├── suggestion.tsx │ ├── text-editor.tsx │ ├── theme-provider.tsx │ ├── toast.tsx │ ├── toolbar.tsx │ ├── ui │ │ ├── accordion.tsx │ │ ├── alert-dialog.tsx │ │ ├── button.tsx │ │ ├── card.tsx │ │ ├── dropdown-menu.tsx │ │ ├── financials-table.tsx │ │ ├── input.tsx │ │ ├── label.tsx │ │ ├── news.tsx │ │ ├── select.tsx │ │ ├── separator.tsx │ │ ├── sheet.tsx │ │ ├── sidebar.tsx │ │ ├── skeleton.tsx │ │ ├── stock-chart.tsx │ │ ├── stock-screener-table.tsx │ │ ├── table.tsx │ │ ├── textarea.tsx │ │ └── tooltip.tsx │ ├── use-scroll-to-bottom.ts │ ├── version-footer.tsx │ ├── visibility-selector.tsx │ └── weather.tsx ├── docs │ ├── 01-quick-start.md │ ├── 02-update-models.md │ └── 03-blocks.md ├── drizzle.config.ts ├── hooks │ ├── use-artifact.ts │ ├── use-auto-resume.ts │ ├── use-chat-visibility.ts │ ├── use-messages.tsx │ ├── use-mobile.tsx │ └── use-scroll-to-bottom.tsx ├── lib │ ├── actions │ │ └── chat.ts │ ├── agents │ │ ├── generate-related-questions.ts │ │ ├── manual-researcher.ts │ │ └── researcher.ts │ ├── ai │ │ ├── entitlements.ts │ │ ├── models.test.ts │ │ ├── models.ts │ │ ├── prompts.ts │ │ ├── providers.ts │ │ └── tools │ │ │ ├── create-document.ts │ │ │ ├── get-weather.ts │ │ │ ├── request-suggestions.ts │ │ │ └── update-document.ts │ ├── api │ │ └── stock-filters.ts │ ├── artifacts │ │ └── server.ts │ ├── config │ │ ├── default-models.json │ │ └── models.ts │ ├── constants.ts │ ├── db │ │ ├── helpers │ │ │ └── 01-core-to-parts.ts │ │ ├── migrate.ts │ │ ├── migrations │ │ │ ├── 0000_curious_darwin.sql │ │ │ ├── 0000_keen_devos.sql │ │ │ ├── 0001_sparkling_blue_marvel.sql │ │ │ ├── 0002_wandering_riptide.sql │ │ │ ├── 0003_cloudy_glorian.sql │ │ │ ├── 0004_odd_slayback.sql │ │ │ ├── 0005_wooden_whistler.sql │ │ │ ├── 0006_marvelous_frog_thor.sql │ │ │ └── meta │ │ │ │ ├── 0000_snapshot.json │ │ │ │ └── _journal.json │ │ ├── queries.ts │ │ ├── schema.ts │ │ └── utils.ts │ ├── editor │ │ ├── config.ts │ │ ├── diff.js │ │ ├── functions.tsx │ │ ├── react-renderer.tsx │ │ └── suggestions.tsx │ ├── errors.ts │ ├── hooks │ │ └── use-copy-to-clipboard.ts │ ├── redis │ │ └── config.ts │ ├── schema │ │ ├── related.tsx │ │ ├── retrieve.tsx │ │ └── search.tsx │ ├── search_constants │ │ └── index.ts │ ├── search_utils │ │ ├── context-window.ts │ │ ├── cookies.ts │ │ ├── index.ts │ │ └── registry.ts │ ├── streaming │ │ ├── create-manual-tool-stream.ts │ │ ├── create-tool-calling-stream.ts │ │ ├── handle-stream-finish.ts │ │ ├── parse-tool-call.ts │ │ ├── tool-execution.ts │ │ └── types.ts │ ├── tools │ │ ├── finantial-news.ts │ │ ├── retrieve.ts │ │ ├── search.ts │ │ └── video-search.ts │ ├── types.ts │ ├── types │ │ ├── index.ts │ │ └── models.ts │ └── utils.ts ├── middleware.ts ├── next-env.d.ts ├── next.config.ts ├── package.json ├── playwright.config.ts ├── pnpm-lock.yaml ├── postcss.config.mjs ├── public │ ├── config │ │ └── models.json │ ├── fonts │ │ ├── geist-mono.woff2 │ │ └── geist.woff2 │ ├── images │ │ ├── demo-thumbnail.png │ │ └── placeholder-image.png │ └── providers │ │ └── 
logos │ │ ├── anthropic.svg │ │ ├── azure.svg │ │ ├── deepseek.svg │ │ ├── fireworks.svg │ │ ├── google.svg │ │ ├── groq.svg │ │ ├── ollama.svg │ │ ├── openai-compatible.svg │ │ ├── openai.svg │ │ └── xai.svg ├── search_components │ ├── answer-section.tsx │ ├── chat-messages.tsx │ ├── chat-panel.tsx │ ├── chat-share.tsx │ ├── chat.tsx │ ├── clear-history.tsx │ ├── collapsible-message.tsx │ ├── custom-link.tsx │ ├── default-skeleton.tsx │ ├── empty-screen.tsx │ ├── footer.tsx │ ├── header.tsx │ ├── history-container.tsx │ ├── history-item.tsx │ ├── history-list.tsx │ ├── history-skeleton.tsx │ ├── history.tsx │ ├── message-actions.tsx │ ├── message.tsx │ ├── mode-toggle.tsx │ ├── model-selector.tsx │ ├── reasoning-answer-section.tsx │ ├── related-questions.tsx │ ├── render-message.tsx │ ├── retrieve-section.tsx │ ├── search-mode-toggle.tsx │ ├── search-results-image.tsx │ ├── search-results.tsx │ ├── search-section.tsx │ ├── section.tsx │ ├── sidebar.tsx │ ├── theme-provider.tsx │ ├── tool-badge.tsx │ ├── tool-section.tsx │ ├── ui │ │ ├── alert-dialog.tsx │ │ ├── avatar.tsx │ │ ├── badge.tsx │ │ ├── button.tsx │ │ ├── card.tsx │ │ ├── carousel.tsx │ │ ├── checkbox.tsx │ │ ├── codeblock.tsx │ │ ├── collapsible.tsx │ │ ├── command.tsx │ │ ├── dialog.tsx │ │ ├── dropdown-menu.tsx │ │ ├── icons.tsx │ │ ├── input.tsx │ │ ├── label.tsx │ │ ├── markdown.tsx │ │ ├── popover.tsx │ │ ├── select.tsx │ │ ├── separator.tsx │ │ ├── sheet.tsx │ │ ├── skeleton.tsx │ │ ├── slider.tsx │ │ ├── sonner.tsx │ │ ├── spinner.tsx │ │ ├── status-indicator.tsx │ │ ├── switch.tsx │ │ ├── textarea.tsx │ │ ├── toggle.tsx │ │ └── tooltip.tsx │ ├── user-message.tsx │ ├── video-search-results.tsx │ └── video-search-section.tsx ├── tailwind.config.ts ├── tests │ ├── e2e │ │ ├── artifacts.test.ts │ │ ├── chat.test.ts │ │ ├── reasoning.test.ts │ │ └── session.test.ts │ ├── fixtures.ts │ ├── helpers.ts │ ├── pages │ │ ├── artifact.ts │ │ ├── auth.ts │ │ └── chat.ts │ ├── prompts │ │ ├── basic.ts │ │ ├── routes.ts │ │ └── utils.ts │ └── routes │ │ ├── chat.test.ts │ │ └── document.test.ts └── tsconfig.json ├── neosearch_llm ├── sglang │ ├── deploy_sgllm_docker.sh │ ├── export_requirements_from_poetry.sh │ ├── launch_sgllm.sh │ ├── launch_sgllm_tensor_parallel.sh │ └── pyproject.toml └── vllm │ ├── export_requirements_from_poetry.sh │ ├── inference.py │ ├── launch_vllm.sh │ ├── pyproject.toml │ ├── run_cluster.sh │ ├── run_with_ray_cluster.md │ └── uv.lock ├── public_icann_suffix.dat ├── public_suffix_list.dat ├── resources ├── docs │ ├── advanced_rag.md │ ├── crawling.md │ ├── data.md │ ├── deep_research.md │ ├── getting_started.md │ ├── gpt_deep_research_backend.md │ ├── helpful_resources.md │ ├── imgs │ │ ├── search_bench.png │ │ └── yandex_search_architecture.png │ ├── kuberay.md │ ├── migrate_from_poetry_to_uv.md │ ├── prompts │ │ ├── dense_x_prompt.md │ │ └── perplexica.md │ ├── references.md │ └── yandex_search_architecture.md ├── factors │ └── yandex_factors_gen.txt ├── postgres │ ├── .gitignore │ ├── cloudnative_pg │ │ ├── examples │ │ │ ├── auth-prod.yaml │ │ │ ├── backup-od.yaml │ │ │ ├── cluster-prod.yaml │ │ │ ├── cluster-restore.yaml │ │ │ ├── storageclass-gp3.yaml │ │ │ ├── storageclass.yaml │ │ │ └── world.sql │ │ ├── helm-files │ │ │ └── values.yaml │ │ └── monitoring │ │ │ ├── alerts.yaml │ │ │ ├── cnpg-prometheusrule.yaml │ │ │ ├── grafana-configmap.yaml │ │ │ ├── grafana-dashboard.json │ │ │ └── kube-stack-config.yaml │ ├── electric │ │ ├── README.md │ │ └── docker_compose │ │ │ ├── docker-compose.yml │ │ │ 
└── postgres.conf │ ├── hybrid_search.md │ ├── paradedb │ │ ├── README.md │ │ ├── autocomplete_tutorial.md │ │ ├── connect_psql.sh │ │ ├── extract_all_stored_procedures.sh │ │ ├── procedures_paradedb_0.13.1.sql │ │ ├── run_with_docker.sh │ │ ├── search_tutorial.md │ │ ├── values.yaml │ │ └── wikipedia_data.md │ ├── pgvectorscale │ │ └── README.md │ ├── postgis │ │ └── install_postgis_on_mac_with_postgresql16.md │ └── psql │ │ ├── efficient_search_engine.md │ │ ├── fulltext_search_english.sql │ │ ├── korean_dictionary_setup.sql │ │ ├── postgres_fulltext_search.sql │ │ ├── ts_config.sql │ │ └── vector_search.md ├── sample_codes │ ├── claude_contextual_retrieval │ │ ├── inference_adapter.py │ │ ├── lambda_function.py │ │ └── s3_adapter.py │ ├── golden_retriever.py │ ├── late-chunking │ │ ├── README.md │ │ ├── chunked_pooling │ │ │ ├── __init__.py │ │ │ ├── chunked_eval_tasks.py │ │ │ ├── chunking.py │ │ │ ├── mteb_chunked_eval.py │ │ │ └── wrappers.py │ │ ├── examples.ipynb │ │ ├── explanatory_contextual_retrieval.py │ │ ├── img │ │ │ ├── context-problem.png │ │ │ ├── method.png │ │ │ └── rag.png │ │ ├── pyproject.toml │ │ ├── run_chunked_eval.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_api.py │ │ │ ├── test_chunking_methods.py │ │ │ └── test_v3.py │ ├── pgvector_python │ │ ├── citus_with_pgvector.py │ │ ├── colbert_exact_match.py │ │ ├── hybrid_search │ │ │ ├── cross_encoder.py │ │ │ └── hybrid_search.py │ │ └── image_search.py │ ├── vectordb │ │ ├── __init__.py │ │ ├── chunking.py │ │ ├── embedding.py │ │ ├── memory.py │ │ ├── storage.py │ │ └── vector_search.py │ └── web-crawler │ │ ├── README.md │ │ ├── anacron │ │ ├── backupSearch Engine-20200705T185356Z-001.zip │ │ ├── build_db.py │ │ ├── main.py │ │ ├── readme.txt │ │ ├── run.sh │ │ └── tempclean.py ├── stopwords │ ├── Afrikaans.txt │ ├── Albanian.txt │ ├── Arabic.txt │ ├── Aragonese.txt │ ├── Armenian.txt │ ├── Aromanian.txt │ ├── Asturian.txt │ ├── Azerbaijani.txt │ ├── Basque.txt │ ├── Belarusian.txt │ ├── Belarusian_Taraskievica.txt │ ├── Bengali.txt │ ├── Bishnupriya_Manipuri.txt │ ├── Bosnian.txt │ ├── Breton.txt │ ├── Bulgarian.txt │ ├── Catalan.txt │ ├── Cebuano.txt │ ├── Chuvash.txt │ ├── Croatian.txt │ ├── Czech.txt │ ├── Danish.txt │ ├── Dutch.txt │ ├── English.txt │ ├── Esperanto.txt │ ├── Estonian.txt │ ├── Finnish.txt │ ├── French.txt │ ├── Galician.txt │ ├── Georgian.txt │ ├── German.txt │ ├── Greek.txt │ ├── Gujarati.txt │ ├── Haitian.txt │ ├── Hebrew.txt │ ├── Hindi.txt │ ├── Hungarian.txt │ ├── Icelandic.txt │ ├── Ido.txt │ ├── Igbo.txt │ ├── Indonesian.txt │ ├── Irish.txt │ ├── Italian.txt │ ├── Japanese.txt │ ├── Javanese.txt │ ├── Kannada.txt │ ├── Kazakh.txt │ ├── Korean.txt │ ├── Kurdish.txt │ ├── Kyrgyz.txt │ ├── Latin.txt │ ├── Latvian.txt │ ├── Lithuanian.txt │ ├── Lombard.txt │ ├── Low_Saxon.txt │ ├── Luxembourgish.txt │ ├── Macedonian.txt │ ├── Malay.txt │ ├── Malayalam.txt │ ├── Maltese.txt │ ├── Marathi.txt │ ├── Neapolitan.txt │ ├── Nepali.txt │ ├── Newar.txt │ ├── Norwegian_Bokmal.txt │ ├── Norwegian_Nynorsk.txt │ ├── Occitan.txt │ ├── Persian.txt │ ├── Piedmontese.txt │ ├── Polish.txt │ ├── Portuguese.txt │ ├── Quechua.txt │ ├── Romanian.txt │ ├── Russian.txt │ ├── Samogitian.txt │ ├── Serbian.txt │ ├── Serbo_Croatian.txt │ ├── Sicilian.txt │ ├── Simple_English.txt │ ├── Slovak.txt │ ├── Slovenian.txt │ ├── Spanish.txt │ ├── Sundanese.txt │ ├── Swahili.txt │ ├── Swedish.txt │ ├── Tagalog.txt │ ├── Tamil.txt │ ├── Telugu.txt │ ├── Turkish.txt │ ├── Turkmen.txt │ ├── 
Ukrainian.txt │ ├── Urdu.txt │ ├── Uzbek.txt │ ├── Vietnamese.txt │ ├── Volapuk.txt │ ├── Walloon.txt │ ├── Waray_Waray.txt │ ├── Welsh.txt │ ├── West_Frisian.txt │ ├── Western_Panjabi.txt │ └── Yoruba.txt └── system_prompts_leaks │ ├── ChatGPT-4o-image-safety-policies.md │ ├── ChatGPT-Advanced-voice-mode.md │ ├── chatgpt-4o-latest-injection │ ├── chatgpt-automation-tool.md │ ├── claude-3.7-full-system-message-with-all-tools.md │ ├── claude-3.7-sonnet-full-system-message-humanreadable.md │ ├── o3-o4-mini-api.md │ └── o4-mini-chatgpt.com.md └── searxng ├── limiter.toml ├── settings.yml └── uwsgi.ini
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | github: YeonwooSung
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12 | polar: # Replace with a single Polar username
13 | buy_me_a_coffee: blackbeenie
14 | thanks_dev: # Replace with a single thanks.dev username
15 | custom:
--------------------------------------------------------------------------------
/.github/dependabot_npm.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
5 | 
6 | version: 2
7 | updates:
8 |   - package-ecosystem: "npm" # See documentation for possible values
9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "weekly"
12 | 
--------------------------------------------------------------------------------
/.github/dependabot_pip.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 | 
6 | version: 2
7 | updates:
8 |   - package-ecosystem: "pip" # See documentation for possible values
9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "weekly"
12 | 
--------------------------------------------------------------------------------
/Dockerfile.core:
--------------------------------------------------------------------------------
1 | FROM python:3.11
2 | 
3 | ENV VERSION=0.1.0
4 | 
5 | WORKDIR /app
6 | 
7 | COPY neosearch ./neosearch
8 | 
9 | # run from the package directory; keep /app on PYTHONPATH so "neosearch" imports resolve
10 | WORKDIR /app/neosearch
11 | ENV PYTHONPATH=/app
12 | RUN pip install uv
13 | RUN sh export_requirements_txt.sh
14 | RUN pip uninstall -y uv
15 | 
16 | # Install dependencies
17 | RUN pip install --no-cache-dir -r requirements.txt
18 | 
19 | # Expose port for networking
20 | EXPOSE 8000
21 | 
22 | # Run the server
23 | CMD ["python", "main.py"]
24 | 
--------------------------------------------------------------------------------
/Dockerfile.crawler:
--------------------------------------------------------------------------------
1 | FROM python:3.11
2 | 
3 | ENV VERSION=0.1.2
4 | 
5 | WORKDIR /app
6 | 
7 | COPY neosearch_crawler ./neosearch_crawler
8 | 
9 | # run from the crawler directory; keep /app on PYTHONPATH so "neosearch_crawler" imports resolve
10 | WORKDIR /app/neosearch_crawler
11 | ENV PYTHONPATH=/app
12 | RUN pip install poetry
13 | RUN sh export_requirements_from_poetry.sh
14 | RUN pip uninstall -y poetry
15 | 
16 | # Install dependencies
17 | RUN pip install --no-cache-dir -r requirements.txt
18 | 
19 | # run the crawler (Ray must be started at runtime, not at image-build time)
20 | CMD ["sh", "-c", "ray start --head && python main.py"]
21 | 
22 | 
23 | # Expose port for networking
24 | EXPOSE 8265
25 | 
--------------------------------------------------------------------------------
/Dockerfile.frontend:
--------------------------------------------------------------------------------
1 | FROM node:20.18.0-alpine
2 | 
3 | ENV VERSION=0.1.0
4 | ARG NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
5 | ARG NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
6 | ENV NEXT_PUBLIC_WS_URL=${NEXT_PUBLIC_WS_URL}
7 | ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
8 | 
9 | WORKDIR /app
10 | 
11 | COPY neosearch_frontend ./neosearch_frontend
12 | 
13 | # move to the neosearch_frontend directory
14 | WORKDIR /app/neosearch_frontend
15 | 
16 | # Install dependencies (pnpm is not bundled with the base image)
17 | RUN npm install -g pnpm && pnpm install
18 | 
19 | # Expose port for networking
20 | EXPOSE 3000
21 | 
22 | # run the frontend
23 | CMD ["npm", "run", "dev"]
24 | 
--------------------------------------------------------------------------------
/assets/imgs/aisearch_question_suggestion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/assets/imgs/aisearch_question_suggestion.png
--------------------------------------------------------------------------------
/assets/imgs/aisearch_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/assets/imgs/aisearch_result.png
--------------------------------------------------------------------------------
/assets/imgs/chat_view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/assets/imgs/chat_view.png
--------------------------------------------------------------------------------
/assets/imgs/code_view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/assets/imgs/code_view.png
--------------------------------------------------------------------------------
/assets/imgs/financial-table-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/assets/imgs/financial-table-1.png
--------------------------------------------------------------------------------
/assets/imgs/financial-table-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/assets/imgs/financial-table-2.png
--------------------------------------------------------------------------------
/assets/imgs/search_view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/assets/imgs/search_view.png
--------------------------------------------------------------------------------
/assets/neosearch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/assets/neosearch.png
--------------------------------------------------------------------------------
/assets/neosearch.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/assets/neosearch.webp
--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | ## ~ 2025.03.13
4 | 
5 | - [x] Implement the AISearch view
6 | - [x] Add Tavily support to the AISearch view
7 | - [x] Add SearXNG support to the AISearch view
8 | 
9 | - [x] Replace poetry with uv
10 | - [x] Replace poetry with uv for `neosearch`
11 | - [x] Replace poetry with uv for `neosearch_ai`
12 | - [x] Replace poetry with uv for `neosearch_llm`
13 | - [x] Replace poetry with uv for `neosearch_crawler`
14 | 
15 | - [x] Implement the batch system for the spider
16 | - [x] Implement the spider with Trafilatura
17 | - [x] Implement continuous batching for the spider
18 | 
19 | - [x] Update the RAG retriever to use the SearXNG engine
20 | 
21 | - [x] Implement the CRAG workflow for the RAG retriever
22 | - [x] Add support for a CRAG API that runs the CRAG workflow on the user's query
23 | 
24 | - [x] Implement the reranker
25 | - [x] Add support for the Cohere reranker
26 | - [x] Add support for the [FlashRank](https://github.com/PrithivirajDamodaran/FlashRank) reranker
27 | 
--------------------------------------------------------------------------------
/deploy_searxng_with_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | PORT=8080
3 | docker pull searxng/searxng
4 | docker run --rm \
5 |   -d -p ${PORT}:8080 \
6 |   -v "${PWD}/searxng:/etc/searxng:rw" \
7 |   -e "BASE_URL=http://localhost:$PORT/" \
8 |   -e "INSTANCE_NAME=searxng-instance" \
9 |   searxng/searxng
10 | 
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # This is a Docker Compose file for setting up the neosearch-stack environment.
2 | 
3 | name: neosearch-stack
4 | services:
5 |   neosearch:
6 |     build:
7 |       context: . # The build context is the current directory
8 |       dockerfile: Dockerfile.frontend
9 |     command: npm run dev
10 |     env_file: neosearch_frontend/.env
11 |     ports:
12 |       - '3000:3000' # Maps port 3000 on the host to port 3000 in the container.
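    # start redis and searxng before the frontend (start order only; depends_on does not wait for readiness)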
13 | depends_on: 14 | - redis 15 | - searxng 16 | 17 | redis: 18 | image: redis:alpine 19 | ports: 20 | - '6379:6379' 21 | volumes: 22 | - redis_data:/data 23 | command: redis-server --appendonly yes 24 | 25 | searxng: 26 | image: searxng/searxng 27 | ports: 28 | - '${SEARXNG_PORT:-8080}:8080' 29 | volumes: 30 | - ./searxng/limiter.toml:/etc/searxng/limiter.toml 31 | - ./searxng/settings.yml:/etc/searxng/settings.yml 32 | - searxng_data:/data 33 | 34 | volumes: 35 | redis_data: 36 | searxng_data: 37 | -------------------------------------------------------------------------------- /neosearch/.env.template: -------------------------------------------------------------------------------- 1 | LLM_TEMPERATURE= 2 | LLM_MAX_TOKENS= 3 | 4 | # anthropic 5 | ANTHROPIC_MODEL=claude-3.7 6 | ANTHROPIC_API_KEY=sk-2xX3 7 | 8 | # openai 9 | OPENAI_MODEL=gpt-4 10 | OPENAI_API_KEY= 11 | 12 | # ollama 13 | OLLAMA_MODEL=llama3.1:latest 14 | OLLAMA_EMBEDDING_MODEL=bge-m3 15 | 16 | # pg_vector 17 | PG_CONNECTION_STRING= 18 | 19 | # qdrant 20 | QDRANT_URL=http://localhost:6333 21 | QDRANT_API_KEY= # Optional, if not set, it will be ignored 22 | 23 | # Web search API 24 | WEB_SEARCH_API="tavily" # tavily, searxng 25 | 26 | # Tavily 27 | TAVILY_API_KEY=tvly-... 28 | -------------------------------------------------------------------------------- /neosearch/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | storage 3 | .env 4 | 5 | # sample query response 6 | sample_query_response.txt 7 | -------------------------------------------------------------------------------- /neosearch/.python-version: -------------------------------------------------------------------------------- 1 | 3.11 -------------------------------------------------------------------------------- /neosearch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/__init__.py -------------------------------------------------------------------------------- /neosearch/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/api/__init__.py -------------------------------------------------------------------------------- /neosearch/api/routers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/api/routers/__init__.py -------------------------------------------------------------------------------- /neosearch/api/routers/health_check.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, status 2 | 3 | # custom module 4 | from neosearch.models.health_check import HealthCheck 5 | from neosearch.utils.logging import Logger 6 | 7 | 8 | logger = Logger() 9 | 10 | # Create a router for the chat endpoint 11 | health_router = r = APIRouter() 12 | 13 | 14 | @r.get( 15 | "", 16 | summary="Perform a Health Check", 17 | response_description="Return HTTP Status Code 200 (OK)", 18 | status_code=status.HTTP_200_OK, 19 | response_model=HealthCheck, 20 | ) 21 | async def health_check() -> dict: 22 | """ 23 | Health check endpoint to verify the API is running. 
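    Returns a static HealthCheck(status="OK") payload with HTTP 200; no downstream dependencies are checked.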
24 | """ 25 | return HealthCheck(status="OK") 26 | -------------------------------------------------------------------------------- /neosearch/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/app/__init__.py -------------------------------------------------------------------------------- /neosearch/app/worker_broker.py: -------------------------------------------------------------------------------- 1 | from faststream.kafka import KafkaBroker 2 | from faststream.redis import RedisBroker 3 | 4 | # custom modules 5 | from neosearch.constants.queue import USE_QUEUE, QUEUE_TYPE 6 | 7 | 8 | # global singleton 9 | _my_broker = None 10 | 11 | 12 | def get_worker_broker(): 13 | global _my_broker 14 | if _my_broker is not None: 15 | return _my_broker 16 | 17 | if not USE_QUEUE: 18 | raise Exception("Queue is not enabled") 19 | 20 | if QUEUE_TYPE == "redis": 21 | from neosearch.constants.queue import REDIS_URL, REDIS_DB 22 | 23 | broker = RedisBroker( 24 | url=REDIS_URL, db=REDIS_DB 25 | ) 26 | elif QUEUE_TYPE == "kafka": 27 | from neosearch.constants.queue import ( 28 | KAFKA_BOOTSTRAP_SERVERS, 29 | KAFKA_REQUEST_TIMEOUT_MS, 30 | KAFKA_MAX_IDLE_MS, 31 | KAFKA_COMPRESSION_TYPE, 32 | ) 33 | 34 | broker = KafkaBroker( 35 | bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, 36 | request_timeout_ms=KAFKA_REQUEST_TIMEOUT_MS, 37 | connections_max_idle_ms=KAFKA_MAX_IDLE_MS, 38 | compression_type=KAFKA_COMPRESSION_TYPE, 39 | ) 40 | 41 | else: 42 | raise Exception("Invalid queue type") 43 | 44 | _my_broker = broker 45 | return broker 46 | -------------------------------------------------------------------------------- /neosearch/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DATA_DIR = "data" 4 | STATIC_DIR = os.getenv("STATIC_DIR", "static") 5 | -------------------------------------------------------------------------------- /neosearch/config.yaml: -------------------------------------------------------------------------------- 1 | neosearch: 2 | llm: 3 | # ["anthropic", "openai", "ollama"] 4 | type: ollama 5 | -------------------------------------------------------------------------------- /neosearch/constants/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/constants/__init__.py -------------------------------------------------------------------------------- /neosearch/constants/circuitbreaker.py: -------------------------------------------------------------------------------- 1 | CB_FAIL_MAX = 5 2 | CB_RESET_TIMEOUT = 60 -------------------------------------------------------------------------------- /neosearch/constants/embeddings.py: -------------------------------------------------------------------------------- 1 | # Use Ollama embeddings for true, otherwise use fastembed 2 | # Claude, Grok does not support embeddings, so we use either Ollama or FastEmbed 3 | USE_OLLAMA_FOR_DEFAULT_EMBEDDING = True 4 | 5 | OLLAMA_EMBEDDING_MODEL_BASE = "bge-m3" 6 | -------------------------------------------------------------------------------- /neosearch/constants/logging.py: -------------------------------------------------------------------------------- 1 | LOG_DEFAULT_LOG_NAME = "neosearch" 2 | LOG_DEFAULT_LOG_LEVEL = "DEBUG" 3 | LOG_DEFAULT_CONSOLE_LOG_LEVEL 
= "WARNING" 4 | LOG_DEFAULT_MAX_BYTES = 10485760 5 | LOG_DEFAULT_BACKUP_COUNT = 10 6 | LOG_DEFAULT_LOGGING_WORKERS = 1 7 | -------------------------------------------------------------------------------- /neosearch/constants/memory.py: -------------------------------------------------------------------------------- 1 | MAX_MEMORY_TOKEN_SIZE = 8000 2 | -------------------------------------------------------------------------------- /neosearch/constants/queue.py: -------------------------------------------------------------------------------- 1 | USE_QUEUE = True 2 | QUEUE_TYPE = "redis" # "redis" or "kafka" 3 | 4 | # redis 5 | REDIS_URL = "redis://localhost:6379" 6 | REDIS_DB = "redis" 7 | 8 | # kafka 9 | KAFKA_BOOTSTRAP_SERVERS = "localhost:9092" 10 | KAFKA_REQUEST_TIMEOUT_MS = 3000 11 | KAFKA_MAX_IDLE_MS = 540000 12 | KAFKA_COMPRESSION_TYPE = "zstd" # 'gzip', 'snappy', 'lz4', 'zstd' 13 | -------------------------------------------------------------------------------- /neosearch/constants/retriever.py: -------------------------------------------------------------------------------- 1 | VECTOR_INDEX_SIM_TOP_K=5 2 | VECTOR_INDEX_EMPTY_QUERY_TOP_K=10 3 | VECTOR_INDEX_VERBOSE=False 4 | -------------------------------------------------------------------------------- /neosearch/constants/searxng.py: -------------------------------------------------------------------------------- 1 | SEARXNG_BASE_URL = "https://searx.example.com" 2 | -------------------------------------------------------------------------------- /neosearch/constants/trace.py: -------------------------------------------------------------------------------- 1 | USE_TRACELOOP = False 2 | -------------------------------------------------------------------------------- /neosearch/datastore/__init__.py: -------------------------------------------------------------------------------- 1 | from .database import get_async_session, get_session, engine, async_engine 2 | 3 | 4 | __all__ = [ 5 | "get_async_session", 6 | "get_session", 7 | "engine", 8 | "async_engine", 9 | ] -------------------------------------------------------------------------------- /neosearch/datastore/crud/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/datastore/crud/__init__.py -------------------------------------------------------------------------------- /neosearch/datastore/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/datastore/model/__init__.py -------------------------------------------------------------------------------- /neosearch/datastore/model/chat.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Text, ForeignKey, TIMESTAMP 2 | from sqlalchemy.dialects.postgresql import UUID, ENUM 3 | from sqlalchemy.sql import func 4 | 5 | # custom modules 6 | from .base import Base 7 | 8 | 9 | visibility_enum = ENUM('public', 'private', name='visibility_enum', create_type=False) 10 | 11 | class Chat(Base): 12 | __tablename__ = 'Chat' 13 | 14 | id = Column( 15 | UUID(as_uuid=True), 16 | primary_key=True, 17 | server_default=func.gen_random_uuid(), 18 | nullable=False, 19 | ) 20 | created_at = Column(TIMESTAMP, nullable=False, server_default=func.now()) 21 | title = Column(Text, nullable=False) 22 | 
    user_id = Column(UUID(as_uuid=True), ForeignKey('User.id'), nullable=False)
23 |     visibility = Column(
24 |         visibility_enum,
25 |         nullable=False,
26 |         server_default="private"
27 |     )
28 | 
--------------------------------------------------------------------------------
/neosearch/datastore/model/document.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import Column, Text, ForeignKey, TIMESTAMP, PrimaryKeyConstraint
2 | from sqlalchemy.dialects.postgresql import UUID, ENUM
3 | from sqlalchemy.sql import func
4 | from uuid_extensions import uuid7str
5 | 
6 | # custom modules
7 | from .base import Base
8 | 
9 | 
10 | # Define ENUM for the 'kind' field
11 | kind_enum = ENUM('text', 'code', name='kind_enum', create_type=False)
12 | 
13 | 
14 | class Document(Base):
15 |     __tablename__ = 'Document'
16 | 
17 |     id = Column(UUID(as_uuid=True), nullable=False, default=uuid7str)  # callable default: a fresh UUIDv7 per row
18 |     created_at = Column(TIMESTAMP, nullable=False, server_default=func.now())
19 |     title = Column(Text, nullable=False)
20 |     content = Column(Text, nullable=True)
21 |     kind = Column(kind_enum, nullable=False, server_default='text')
22 |     user_id = Column(UUID(as_uuid=True), ForeignKey('User.id'), nullable=False)
23 | 
24 |     # Composite primary key
25 |     __table_args__ = (
26 |         PrimaryKeyConstraint('id', 'created_at', name='document_pk'),
27 |     )
28 | 
--------------------------------------------------------------------------------
/neosearch/datastore/model/message.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import Column, String, JSON, ForeignKey, TIMESTAMP
2 | from sqlalchemy.dialects.postgresql import UUID
3 | from sqlalchemy.sql import func
4 | from uuid_extensions import uuid7str
5 | 
6 | # custom modules
7 | from .base import Base
8 | 
9 | 
10 | class Message(Base):
11 |     __tablename__ = 'Message'
12 | 
13 |     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid7str, nullable=False)  # callable default, evaluated per insert
14 |     chat_id = Column(UUID(as_uuid=True), ForeignKey('Chat.id'), nullable=False)
15 |     role = Column(String, nullable=False)
16 |     content = Column(JSON, nullable=False)
17 |     created_at = Column(TIMESTAMP, nullable=False, server_default=func.now())
18 | 
19 |     def __repr__(self):
20 |         return f"<Message(id={self.id}, chat_id={self.chat_id}, role={self.role})>"
21 | 
--------------------------------------------------------------------------------
/neosearch/datastore/model/vote.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import Column, Boolean, ForeignKey, PrimaryKeyConstraint
2 | from sqlalchemy.dialects.postgresql import UUID
3 | 
4 | # custom modules
5 | from .base import Base
6 | 
7 | 
8 | class Vote(Base):
9 |     __tablename__ = 'Vote'
10 | 
11 |     chat_id = Column(UUID(as_uuid=True), ForeignKey('Chat.id'), nullable=False)
12 |     message_id = Column(UUID(as_uuid=True), ForeignKey('Message.id'), nullable=False)
13 |     is_upvoted = Column(Boolean, nullable=False)
14 | 
15 |     # Composite primary key
16 |     __table_args__ = (
17 |         PrimaryKeyConstraint('chat_id', 'message_id', name='vote_pk'),
18 |     )
19 | 
--------------------------------------------------------------------------------
/neosearch/datastore/vectorstores/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/datastore/vectorstores/__init__.py
--------------------------------------------------------------------------------
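A minimal persistence sketch for the models above (assumptions: get_async_session(async_engine) returns a usable AsyncSession, mirroring the call shape in engine/retriever/paradedb.py, and the tables already exist in PostgreSQL):

import asyncio

from neosearch.datastore import async_engine, get_async_session
from neosearch.datastore.model.chat import Chat
from neosearch.datastore.model.message import Message


async def seed_example(user_id: str) -> None:
    # assumption: get_async_session(async_engine) returns an AsyncSession
    session = get_async_session(async_engine)
    async with session.begin():
        chat = Chat(title="First chat", user_id=user_id, visibility="private")
        session.add(chat)
        await session.flush()  # lets PostgreSQL assign chat.id via gen_random_uuid()
        session.add(Message(chat_id=chat.id, role="user", content={"text": "hello"}))


if __name__ == "__main__":
    asyncio.run(seed_example("00000000-0000-0000-0000-000000000000"))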
/neosearch/datastore/vectorstores/base.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | 
3 | # custom module
4 | from neosearch.utils.singleton import Singleton
5 | 
6 | 
7 | class BaseVectorStore(metaclass=Singleton):
8 |     @abstractmethod
9 |     def _build_vector_store(self):
10 |         pass
11 | 
12 |     @abstractmethod
13 |     def get_store(self):
14 |         pass
15 | 
16 |     @abstractmethod
17 |     def refresh(self):
18 |         pass
19 | 
--------------------------------------------------------------------------------
/neosearch/datastore/vectorstores/qdrant_vector_stores.py:
--------------------------------------------------------------------------------
1 | import os
2 | from qdrant_client import QdrantClient
3 | from llama_index.vector_stores.qdrant import QdrantVectorStore
4 | 
5 | # custom module
6 | from neosearch.utils.singleton import Singleton
7 | 
8 | from .base import BaseVectorStore
9 | 
10 | 
11 | class QdrantVectorStoreContainer(BaseVectorStore, metaclass=Singleton):
12 |     def __init__(self):
13 |         self._build_vector_store()
14 | 
15 |     def _build_vector_store(self):
16 |         self.vec_db_client = QdrantClient(
17 |             url=os.environ.get("QDRANT_URL"),
18 |             api_key=os.environ.get("QDRANT_API_KEY")
19 |         )
20 |         self.store = QdrantVectorStore(client=self.vec_db_client, collection_name=os.environ.get("QDRANT_COLLECTION", "neosearch"))  # a collection name is required; QDRANT_COLLECTION is an assumed env var
21 | 
22 |     def get_store(self):
23 |         return self.store
24 | 
25 |     def refresh(self):
26 |         self._build_vector_store()
27 |         return self.store
28 | 
--------------------------------------------------------------------------------
/neosearch/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/__init__.py
--------------------------------------------------------------------------------
/neosearch/engine/agents/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/agents/__init__.py
--------------------------------------------------------------------------------
/neosearch/engine/agents/tools/__init__.py:
--------------------------------------------------------------------------------
1 | from .research_tools import record_notes, review_report, write_report, save_generate_questions
2 | from .web_search import search_web
3 | 
4 | 
5 | __all__ = [
6 |     # web_search.py
7 |     "search_web",
8 |     # research_tools.py
9 |     "record_notes",
10 |     "review_report",
11 |     "write_report",
12 |     "save_generate_questions",
13 | ]
--------------------------------------------------------------------------------
/neosearch/engine/agents/tools/web_search.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tavily import AsyncTavilyClient
3 | from llama_index.core.workflow import Context
4 | 
5 | # custom modules
6 | from neosearch.engine.utils.searxng import SearxngAdaptor
7 | 
8 | 
9 | TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "tvly-...")
10 | WEB_SEARCH_API = os.getenv("WEB_SEARCH_API", "tavily")
11 | SEARXNG_BASE_URL = os.getenv("SEARXNG_BASE_URL", "http://localhost:8888")
12 | 
13 | 
14 | async def search_web(ctx: Context, query: str) -> str:
15 |     """Useful for using the web to answer questions."""
16 |     search_result = ""
17 | 
18 |     if WEB_SEARCH_API == "tavily":
19 |         client = AsyncTavilyClient(api_key=TAVILY_API_KEY)
20 |         search_result = await client.search(
21 |             query,
22 | 
search_depth="basic", # "basic", advanced 23 | topic="general", # "general", "news" 24 | max_results=20, 25 | ) 26 | elif WEB_SEARCH_API == "searxng": 27 | # searxng_search_result = await searxng_search(query) 28 | adaptor = SearxngAdaptor(SEARXNG_BASE_URL) 29 | search_result = await adaptor.asearch(query) 30 | 31 | if search_result != "": 32 | current_state = await ctx.get("state") 33 | current_state["web_search_result"] = search_result 34 | await ctx.set("state", current_state) 35 | 36 | return str(search_result) 37 | -------------------------------------------------------------------------------- /neosearch/engine/constants.py: -------------------------------------------------------------------------------- 1 | PGVECTOR_SCHEMA = "public" 2 | PGVECTOR_TABLE = "llamaindex_embedding" -------------------------------------------------------------------------------- /neosearch/engine/db_utils.py: -------------------------------------------------------------------------------- 1 | 2 | # custom module 3 | from neosearch.datastore.vectorstores.pg_vector_stores import PgVectorStoreContainer 4 | from neosearch.datastore.vectorstores.pgrs_vector_stores import PgRsVectorStoreContainer 5 | 6 | 7 | def init_pg_vector_store_from_env(): 8 | # use singleton to ensure only one instance of the vector store is created 9 | vectorstore = PgVectorStoreContainer() 10 | return vectorstore.get_store() 11 | 12 | 13 | def init_pg_vecto_rs_store_from_env(): 14 | # use singleton to ensure only one instance of the vector store is created 15 | vectorstore = PgRsVectorStoreContainer() 16 | return vectorstore.get_store() 17 | -------------------------------------------------------------------------------- /neosearch/engine/index.py: -------------------------------------------------------------------------------- 1 | from llama_index.core.indices.vector_store import VectorStoreIndex 2 | 3 | # custom module 4 | from neosearch.engine.db_utils import init_pg_vector_store_from_env 5 | from neosearch.utils.logging import Logger 6 | 7 | logger = Logger() 8 | 9 | 10 | def get_pg_index(): 11 | logger.log_info("Connecting to index from PGVector...") 12 | store = init_pg_vector_store_from_env() 13 | index = VectorStoreIndex.from_vector_store(store, use_async=True) 14 | logger.log_info("Finished connecting to index from PGVector.") 15 | return index 16 | 17 | def get_index(vector_store_type: str = "pg") -> VectorStoreIndex: 18 | if vector_store_type == "pg": 19 | return get_pg_index() 20 | else: 21 | raise ValueError(f"Invalid vector store type: {vector_store_type}") 22 | -------------------------------------------------------------------------------- /neosearch/engine/loader.py: -------------------------------------------------------------------------------- 1 | from llama_index.core.readers import SimpleDirectoryReader 2 | 3 | DATA_DIR = "data" # directory to cache the generated index 4 | 5 | 6 | def get_documents(): 7 | return SimpleDirectoryReader(DATA_DIR).load_data() 8 | -------------------------------------------------------------------------------- /neosearch/engine/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/prompts/__init__.py -------------------------------------------------------------------------------- /neosearch/engine/prompts/crag_workflow.py: -------------------------------------------------------------------------------- 1 | 
CRAG_RELEVANCY_PROMPT_TEMPLATE = """As a grader, your task is to evaluate the relevance of a document retrieved in response to a user's question. 2 | 3 | Retrieved Document: 4 | ------------------- 5 | {context_str} 6 | 7 | User Question: 8 | -------------- 9 | {query_str} 10 | 11 | Evaluation Criteria: 12 | - Consider whether the document contains keywords or topics related to the user's question. 13 | - The evaluation should not be overly stringent; the primary objective is to identify and filter out clearly irrelevant retrievals. 14 | 15 | Decision: 16 | - Assign a binary score to indicate the document's relevance. 17 | - Use 'yes' if the document is relevant to the question, or 'no' if it is not. 18 | 19 | Please provide your binary score ('yes' or 'no') below to indicate the document's relevance to the user question.""" 20 | 21 | 22 | CRAG_TRANSFORM_QUERY_TEMPLATE = """Your task is to refine a query to ensure it is highly effective for retrieving relevant search results. \n 23 | Analyze the given input to grasp the core semantic intent or meaning. \n 24 | Original Query: 25 | \n ------- \n 26 | {query_str} 27 | \n ------- \n 28 | Your goal is to rephrase or enhance this query to improve its search performance. Ensure the revised query is concise and directly aligned with the intended search objective. \n 29 | Respond with the optimized query only:""" 30 | -------------------------------------------------------------------------------- /neosearch/engine/query_filter.py: -------------------------------------------------------------------------------- 1 | from llama_index.core.vector_stores.types import MetadataFilter, MetadataFilters 2 | 3 | 4 | def generate_filters(doc_ids): 5 | """ 6 | Generate public/private document filters based on the doc_ids and the vector store. 
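    When doc_ids is non-empty, the filters match public documents OR the selected documents; otherwise only public documents are matched.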
7 | """ 8 | public_doc_filter = MetadataFilter( 9 | key="private", 10 | value="true", 11 | operator="!=", # type: ignore 12 | ) 13 | selected_doc_filter = MetadataFilter( 14 | key="doc_id", 15 | value=doc_ids, 16 | operator="in", # type: ignore 17 | ) 18 | if len(doc_ids) > 0: 19 | # If doc_ids are provided, we will select both public and selected documents 20 | filters = MetadataFilters( 21 | filters=[ 22 | public_doc_filter, 23 | selected_doc_filter, 24 | ], 25 | condition="or", # type: ignore 26 | ) 27 | else: 28 | filters = MetadataFilters( 29 | filters=[ 30 | public_doc_filter, 31 | ] 32 | ) 33 | 34 | return filters 35 | -------------------------------------------------------------------------------- /neosearch/engine/rag_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/rag_engine/__init__.py -------------------------------------------------------------------------------- /neosearch/engine/reranker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/reranker/__init__.py -------------------------------------------------------------------------------- /neosearch/engine/reranker/cohere.py: -------------------------------------------------------------------------------- 1 | import os 2 | from llama_index.postprocessor.cohere_rerank import CohereRerank 3 | 4 | 5 | api_key = os.getenv("COHERE_API_KEY", None) 6 | 7 | 8 | def get_cohere_rerank(top_n: int = 2): 9 | if api_key is None: 10 | raise ValueError("COHERE_API_KEY is not set") 11 | return CohereRerank(api_key=api_key, top_n=top_n) 12 | -------------------------------------------------------------------------------- /neosearch/engine/retriever/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/retriever/__init__.py -------------------------------------------------------------------------------- /neosearch/engine/retriever/base.py: -------------------------------------------------------------------------------- 1 | from llama_index.core.retrievers import BaseRetriever 2 | 3 | # custom modules 4 | from neosearch.engine.index import get_index 5 | 6 | 7 | def get_base_retriever() -> BaseRetriever: 8 | return get_index().as_retriever() 9 | -------------------------------------------------------------------------------- /neosearch/engine/retriever/paradedb.py: -------------------------------------------------------------------------------- 1 | from llama_index.core.retrievers import ( 2 | BaseRetriever, 3 | VectorIndexRetriever, 4 | RouterRetriever, 5 | ) 6 | from llama_index.core.tools import RetrieverTool 7 | from llama_index.core.settings import Settings 8 | 9 | # custom modules 10 | from neosearch.datastore import engine, async_engine, get_session, get_async_session 11 | 12 | 13 | class ParadeDBRetriever(BaseRetriever): 14 | def __init__(self): 15 | super().__init__() 16 | self.engine = engine 17 | self.async_engine = async_engine 18 | 19 | def _retrieve(self, query: str, **kwargs) -> list: 20 | session = get_session(self.engine) 21 | return [] 22 | 23 | 24 | async def _aretrieve(self, query, **kwargs) -> list: 25 | session = get_async_session(self.async_engine) 26 | return [] 27 
| 28 | 29 | def create_router_retriever(self): 30 | retriever_tools = [ 31 | RetrieverTool.from_defaults( 32 | retriever=self, 33 | description="Useful in most cases", 34 | ), 35 | ] 36 | 37 | # load settings 38 | llm = Settings.llm 39 | 40 | return RouterRetriever.from_defaults( 41 | retriever_tools=retriever_tools, 42 | llm=llm, 43 | select_multi=True, 44 | ) 45 | -------------------------------------------------------------------------------- /neosearch/engine/search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/search/__init__.py -------------------------------------------------------------------------------- /neosearch/engine/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/utils/__init__.py -------------------------------------------------------------------------------- /neosearch/engine/utils/chat.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException, status 2 | from llama_index.core.llms import MessageRole 3 | 4 | # custom imports 5 | from neosearch.models.chat_models import ChatData 6 | 7 | 8 | async def validate_chat_data(data: ChatData): 9 | # check preconditions and get last message 10 | if len(data.messages) == 0: 11 | raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No messages provided",) # noqa: E501 12 | lastMessage = data.messages.pop() 13 | if lastMessage.role != MessageRole.USER: 14 | raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Last message must be from user",) # noqa: E501 15 | return lastMessage 16 | -------------------------------------------------------------------------------- /neosearch/engine/utils/query.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException, status 2 | 3 | # custom imports 4 | from neosearch.models.query_models import QueryData 5 | 6 | 7 | async def validate_query_data(data: QueryData): 8 | query_data = data.query 9 | if query_data is None: 10 | raise HTTPException( 11 | status_code=status.HTTP_400_BAD_REQUEST, detail="No query provided", 12 | ) 13 | return query_data 14 | -------------------------------------------------------------------------------- /neosearch/engine/workflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/workflow/__init__.py -------------------------------------------------------------------------------- /neosearch/engine/workflow/events/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/engine/workflow/events/__init__.py -------------------------------------------------------------------------------- /neosearch/engine/workflow/events/crag.py: -------------------------------------------------------------------------------- 1 | from llama_index.core.workflow import Event 2 | from llama_index.core.schema import NodeWithScore 3 | 4 | 5 | class PrepEvent(Event): 6 | """Prep event (prepares for retrieval).""" 7 | 8 | pass 9 | 10 | 11 | class 
RetrieveEvent(Event): 12 | """Retrieve event (gets retrieved nodes).""" 13 | 14 | retrieved_nodes: list[NodeWithScore] 15 | 16 | 17 | class RelevanceEvalEvent(Event): 18 | """Relevance evaluation event (gets results of relevance evaluation).""" 19 | 20 | relevant_results: list[str] 21 | 22 | 23 | class TextExtractEvent(Event): 24 | """Text extract event. Extracts relevant text and concatenates.""" 25 | 26 | relevant_text: str 27 | 28 | 29 | class QueryEvent(Event): 30 | """Query event. Queries given relevant text and search text.""" 31 | 32 | relevant_text: str 33 | search_text: str 34 | 35 | 36 | # streaming events 37 | 38 | class CragStreamingEvents(Event): 39 | msg: str 40 | 41 | class RetrieveSuccessEvent(CragStreamingEvents): 42 | pass 43 | 44 | class RetrieveFailureEvent(CragStreamingEvents): 45 | pass 46 | 47 | class TransformQueryResultEvent(CragStreamingEvents): 48 | pass 49 | -------------------------------------------------------------------------------- /neosearch/exceptions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/exceptions/__init__.py -------------------------------------------------------------------------------- /neosearch/exceptions/bedrock.py: -------------------------------------------------------------------------------- 1 | class BedrockInvalidModelIdException(Exception): 2 | ... 3 | -------------------------------------------------------------------------------- /neosearch/exceptions/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/exceptions/engine/__init__.py -------------------------------------------------------------------------------- /neosearch/exceptions/engine/retriever.py: -------------------------------------------------------------------------------- 1 | class VectorStoreIsNullError(Exception): 2 | ... 3 | -------------------------------------------------------------------------------- /neosearch/export_requirements_txt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # check if requirements.txt exists 4 | if [ -f requirements.txt ]; then 5 | echo "requirements.txt already exists. Removing it."
6 | rm requirements.txt 7 | fi 8 | 9 | # poetry export --without-hashes --format=requirements.txt > requirements.txt 10 | uv export --no-hashes --format requirements-txt > requirements.txt -------------------------------------------------------------------------------- /neosearch/infrastructure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/infrastructure/__init__.py -------------------------------------------------------------------------------- /neosearch/infrastructure/aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/infrastructure/aws/__init__.py -------------------------------------------------------------------------------- /neosearch/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | from .request_id import RequestID 2 | from .request_logger import RequestLogger 3 | 4 | __all__ = ["RequestID", "RequestLogger"] 5 | -------------------------------------------------------------------------------- /neosearch/middlewares/request_id.py: -------------------------------------------------------------------------------- 1 | """Adds a uuid to the request header for debugging.""" 2 | 3 | from uuid import uuid4 4 | from fastapi import Request 5 | from starlette.middleware.base import BaseHTTPMiddleware 6 | from starlette.responses import JSONResponse 7 | 8 | # custom modules 9 | from neosearch.utils.logging import Logger 10 | 11 | logger = Logger() 12 | 13 | 14 | class RequestID(BaseHTTPMiddleware): 15 | """Add a uuid to the request header. 16 | 17 | Args: 18 | app (FastAPI): The FastAPI application instance to wrap. 19 | """ 20 | 21 | def __init__(self, app): 22 | super().__init__(app) 23 | 24 | async def dispatch(self, request: Request, call_next): 25 | """ 26 | Implement the dispatch method. 27 | 28 | Args: 29 | request (fastapi.Request): The incoming request instance. 30 | call_next (function): Function that calls the next middleware in the chain.
31 | """ 32 | 33 | try: 34 | request_id = uuid4() 35 | request.state.request_id = request_id 36 | response = await call_next(request) 37 | response.headers["request_id"] = str(request_id) 38 | return response 39 | except Exception as e: 40 | logger.log_warning( 41 | f"method={request.method} | {request.url} | {request.state.request_id} | {e}" 42 | ) 43 | return JSONResponse(status_code=500, content={"reason": str(e)}) 44 | -------------------------------------------------------------------------------- /neosearch/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .chat_models import ( 2 | AnnotationFileData, 3 | AgentAnnotation, 4 | ArtifactAnnotation, 5 | Annotation, 6 | ChatData, 7 | ChatConfig, 8 | Message, 9 | SourceNodes, 10 | Result, 11 | ) 12 | from .health_check import HealthCheck 13 | 14 | 15 | __all__ = [ 16 | # health check 17 | "HealthCheck", 18 | # chat models 19 | "AnnotationFileData", 20 | "AgentAnnotation", 21 | "ArtifactAnnotation", 22 | "Annotation", 23 | "ChatData", 24 | "ChatConfig", 25 | "Message", 26 | "SourceNodes", 27 | "Result", 28 | ] -------------------------------------------------------------------------------- /neosearch/models/health_check.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class HealthCheck(BaseModel): 5 | """Response model to validate and return when performing a health check.""" 6 | 7 | status: str = "OK" 8 | -------------------------------------------------------------------------------- /neosearch/models/query_models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from datetime import datetime 3 | 4 | # custom module 5 | from neosearch.constants.memory import MAX_MEMORY_TOKEN_SIZE 6 | 7 | 8 | class QueryData(BaseModel): 9 | query: str 10 | timezone: str = "UTC" 11 | 12 | 13 | class Memory(BaseModel): 14 | """ 15 | Memory model. 16 | 17 | Attributes: 18 | id (str): The memory ID. (ULID based) 19 | updated_at (str): The updated timestamp. 20 | content (str): The content of the memory. 21 | """ 22 | id: str 23 | updated_at: str = Field(default_factory=lambda: datetime.now().isoformat())  # evaluated per instance, not once at import time 24 | content: str 25 | 26 | class MemoryResponse(BaseModel): 27 | """ 28 | Memory data model. 29 | This represents the memory data model, which contains the additional memory data. 30 | 31 | Attributes: 32 | messages (list[Memory]): The list of messages. 33 | memory_max_tokens (int): The maximum memory tokens. 34 | memory_num_tokens (int): The number of memory tokens that are currently in use (cannot exceed memory_max_tokens).
35 | """ 36 | messages: list[Memory] 37 | memory_max_tokens: int = MAX_MEMORY_TOKEN_SIZE 38 | memory_num_tokens: int = 0 39 | -------------------------------------------------------------------------------- /neosearch/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | follow_imports = skip 3 | check_untyped_defs = True 4 | disallow_untyped_defs = True 5 | files = tests/challenges/**/*.py 6 | 7 | [mypy-requests.*] 8 | ignore_missing_imports = True 9 | [mypy-yaml.*] 10 | ignore_missing_imports = True -------------------------------------------------------------------------------- /neosearch/response/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/response/__init__.py -------------------------------------------------------------------------------- /neosearch/services/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/services/__init__.py -------------------------------------------------------------------------------- /neosearch/settings/fastembed.py: -------------------------------------------------------------------------------- 1 | import os 2 | from llama_index.core.settings import Settings 3 | 4 | 5 | def init_fastembed( 6 | model_name: str = "BAAI/bge-m3", 7 | max_length: int = 512, 8 | threads: int = 4, 9 | ): 10 | try: 11 | from llama_index.embeddings.fastembed import FastEmbedEmbedding 12 | except ImportError: 13 | raise ImportError( 14 | "FastEmbed support is not installed. Please install it with `poetry add llama-index-embeddings-fastembed`" 15 | ) 16 | 17 | embedding_model = os.getenv("FASTEMBED_EMBEDDING_MODEL", model_name) 18 | if embedding_model is None: 19 | raise ValueError("FASTEMBED_EMBEDDING_MODEL environment variable is not set and no default model name was given") 20 | 21 | # This will download the model automatically if it is not already downloaded 22 | Settings.embed_model = FastEmbedEmbedding( 23 | model_name=embedding_model, 24 | max_length=max_length, 25 | threads=threads, 26 | ) 27 | -------------------------------------------------------------------------------- /neosearch/settings/gemini.py: -------------------------------------------------------------------------------- 1 | from llama_index.core.settings import Settings 2 | import os 3 | 4 | 5 | def init_gemini(): 6 | try: 7 | from llama_index.embeddings.gemini import GeminiEmbedding 8 | from llama_index.llms.gemini import Gemini 9 | except ImportError: 10 | raise ImportError( 11 | "Gemini support is not installed. 
Please install it with `poetry add llama-index-llms-gemini` and `poetry add llama-index-embeddings-gemini`" 12 | ) 13 | 14 | model_name = f"models/{os.getenv('MODEL')}" 15 | embed_model_name = f"models/{os.getenv('EMBEDDING_MODEL')}" 16 | 17 | Settings.llm = Gemini(model=model_name) 18 | Settings.embed_model = GeminiEmbedding(model_name=embed_model_name) 19 | -------------------------------------------------------------------------------- /neosearch/settings/huggingface.py: -------------------------------------------------------------------------------- 1 | import os 2 | from llama_index.core.settings import Settings 3 | 4 | 5 | def init_huggingface(): 6 | try: 7 | from llama_index.llms.huggingface import HuggingFaceLLM 8 | except ImportError: 9 | raise ImportError( 10 | "Hugging Face support is not installed. Please install it with `poetry add llama-index-llms-huggingface` and `poetry add llama-index-embeddings-huggingface`" 11 | ) 12 | 13 | Settings.llm = HuggingFaceLLM( 14 | model_name=os.getenv("MODEL"), 15 | tokenizer_name=os.getenv("MODEL"), 16 | ) 17 | init_huggingface_embedding() 18 | 19 | 20 | def init_huggingface_embedding(): 21 | try: 22 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 23 | except ImportError: 24 | raise ImportError( 25 | "Hugging Face support is not installed. Please install it with `poetry add llama-index-embeddings-huggingface`" 26 | ) 27 | 28 | embedding_model = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2") 29 | backend = os.getenv("EMBEDDING_BACKEND", "onnx") # "torch", "onnx", or "openvino" 30 | trust_remote_code = ( 31 | os.getenv("EMBEDDING_TRUST_REMOTE_CODE", "false").lower() == "true" 32 | ) 33 | 34 | Settings.embed_model = HuggingFaceEmbedding( 35 | model_name=embedding_model, 36 | trust_remote_code=trust_remote_code, 37 | backend=backend, 38 | ) 39 | -------------------------------------------------------------------------------- /neosearch/settings/mistral.py: -------------------------------------------------------------------------------- 1 | import os 2 | from llama_index.core.settings import Settings 3 | 4 | 5 | def init_mistral(): 6 | from llama_index.embeddings.mistralai import MistralAIEmbedding 7 | from llama_index.llms.mistralai import MistralAI 8 | 9 | Settings.llm = MistralAI(model=os.getenv("MODEL")) 10 | Settings.embed_model = MistralAIEmbedding(model_name=os.getenv("EMBEDDING_MODEL")) 11 | -------------------------------------------------------------------------------- /neosearch/settings/ollama.py: -------------------------------------------------------------------------------- 1 | from llama_index.llms.ollama.base import DEFAULT_REQUEST_TIMEOUT, Ollama 2 | from llama_index.core.settings import Settings 3 | import os 4 | 5 | # custom modules 6 | from neosearch.constants.embeddings import OLLAMA_EMBEDDING_MODEL_BASE 7 | 8 | 9 | def init_ollama_embedding(): 10 | try: 11 | from llama_index.embeddings.ollama import OllamaEmbedding 12 | except ImportError: 13 | raise ImportError( 14 | "Ollama support is not installed. 
Please install it with `poetry add llama-index-llms-ollama` and `poetry add llama-index-embeddings-ollama`" 15 | ) 16 | base_url = os.getenv("OLLAMA_BASE_URL") or "http://127.0.0.1:11434" 17 | Settings.embed_model = OllamaEmbedding( 18 | base_url=base_url, 19 | model_name=os.getenv("OLLAMA_EMBEDDING_MODEL", OLLAMA_EMBEDDING_MODEL_BASE), 20 | ) 21 | 22 | 23 | def init_ollama(): 24 | base_url = os.getenv("OLLAMA_BASE_URL") or "http://127.0.0.1:11434" 25 | request_timeout = float( 26 | os.getenv("OLLAMA_REQUEST_TIMEOUT", DEFAULT_REQUEST_TIMEOUT) 27 | ) 28 | init_ollama_embedding() 29 | Settings.llm = Ollama( 30 | base_url=base_url, model=os.getenv("OLLAMA_MODEL"), request_timeout=request_timeout 31 | ) 32 | -------------------------------------------------------------------------------- /neosearch/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch/tests/__init__.py -------------------------------------------------------------------------------- /neosearch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .logging import Logger 2 | from .ratelimitter import limiter 3 | from .singleton import Singleton 4 | 5 | 6 | __all__ = [ 7 | "Logger", 8 | "limiter", 9 | "Singleton", 10 | ] 11 | -------------------------------------------------------------------------------- /neosearch/utils/configs.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | # custom imports 4 | from neosearch.utils.singleton import Singleton 5 | 6 | 7 | def get_config(): 8 | with open("config.yaml", "r") as f: 9 | return yaml.safe_load(f) 10 | 11 | 12 | class Config(metaclass=Singleton): 13 | def __init__(self): 14 | self.config = get_config() 15 | 16 | def get(self, key): 17 | return self.config.get(key) 18 | 19 | def get_llm_configs(self): 20 | llm_config = self.config.get("neosearch", {}).get("llm", {}) 21 | return llm_config 22 | -------------------------------------------------------------------------------- /neosearch/utils/gc_tuning.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | 4 | def get_current_gc_threshold(): 5 | return gc.get_threshold() 6 | 7 | 8 | def gc_optimization_on_startup(debug:bool=False, disable_gc:bool=False): 9 | if debug: 10 | # gc.DEBUG_STATS: print statistics 11 | # gc.DEBUG_LEAK: print objects that are likely to be leaked 12 | # gc.DEBUG_UNCOLLECTABLE: print objects that cannot be collected 13 | gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_LEAK | gc.DEBUG_UNCOLLECTABLE) 14 | 15 | if disable_gc: 16 | gc.disable() 17 | return 18 | 19 | # Libraries like numpy and torch allocate many internal objects when they are initialized at import time. 20 | # Those objects add reference-counting work and make the gc run more often, so freeze them out of collection. 21 | gc.freeze() 22 | 23 | # Running the gc too frequently can also hurt performance, so raise the collection thresholds.
24 | gc.set_threshold(80_000, 20, 20) 25 | -------------------------------------------------------------------------------- /neosearch/utils/ratelimitter.py: -------------------------------------------------------------------------------- 1 | from slowapi import Limiter 2 | from slowapi.util import get_remote_address 3 | 4 | 5 | limiter = Limiter(key_func=get_remote_address) 6 | -------------------------------------------------------------------------------- /neosearch/utils/ray.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | # custom modules 4 | from neosearch.constants.queue import USE_QUEUE 5 | 6 | 7 | # decorator for ray remote 8 | def ray_remote_if_enabled(func): 9 | if not USE_QUEUE: 10 | return ray.remote(func) 11 | return func 12 | -------------------------------------------------------------------------------- /neosearch/utils/singleton.py: -------------------------------------------------------------------------------- 1 | class Singleton(type): 2 | """The singleton metaclass.""" 3 | 4 | _instances: dict = {} 5 | 6 | def __call__(cls, *args, **kwargs): 7 | """Override to create only one instance ever. 8 | 9 | Returns: 10 | object: Instance of the class initialized. 11 | """ 12 | if cls not in cls._instances: 13 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 14 | return cls._instances[cls] 15 | -------------------------------------------------------------------------------- /neosearch/worker.py: -------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | import sys 3 | import warnings 4 | from faststream import FastStream 5 | 6 | # Load environment variables 7 | load_dotenv() 8 | 9 | # Ignore warnings 10 | warnings.filterwarnings("ignore") 11 | 12 | # Add the root directory to the path so that we can import the settings 13 | sys.path.append("..") 14 | 15 | # custom module 16 | from neosearch.constants.queue import USE_QUEUE # noqa: E402 17 | from neosearch.app.worker_broker import get_worker_broker # noqa: E402 18 | 19 | if not USE_QUEUE: 20 | raise Exception("Queue is not enabled") 21 | 22 | # init broker 23 | broker = get_worker_broker() 24 | 25 | # init faststream app 26 | app = FastStream(broker) 27 | 28 | 29 | if __name__ == "__main__": 30 | from neosearch.engine.agents.deep_research import background_research_task # noqa: E402 31 | # app.run() 32 | background_research_task("task_id", "How to build a google-level search engine?") 33 | -------------------------------------------------------------------------------- /neosearch_ai/README.md: -------------------------------------------------------------------------------- 1 | # Neosearch AI 2 | 3 | Run AI models for RAG search. 
4 | 5 | ## Embeddings 6 | 7 | ```bash 8 | # 9 | # export environment variables 10 | # 11 | 12 | # huggingface sentence transformers model 13 | export MODEL_NAME=answerdotai/ModernBERT-large 14 | # device type (cpu, gpu, etc) 15 | export DEVICE=cpu 16 | # precision (float32, float16, bfloat16, etc) 17 | export PRECISION=float32 18 | # retriever batch size 19 | export RETRIEVER_BATCH_SIZE=8 20 | # reader batch size 21 | export READER_BATCH_SIZE=8 22 | # max batch size 23 | export MAX_BATCH_SIZE=8 24 | 25 | # if you use gpu, then set the num of gpus (otherwise, torch.cuda.device_count() is used) 26 | export NUM_GPUS=1 27 | 28 | # 29 | # run ray serve 30 | # 31 | 32 | serve run embedding:embedding_deployment 33 | ``` 34 | 35 | ## Reranker 36 | 37 | ### FlashRerank 38 | 39 | ```bash 40 | # 41 | # export environment variables 42 | # 43 | 44 | # flashrank reranker model 45 | export MODEL_NAME=rank_zephyr_7b_v1_full 46 | # device type (cpu, gpu, etc) 47 | export DEVICE=cpu 48 | # precision (float32, float16, bfloat16, etc) 49 | export PRECISION=float32 50 | # retriever batch size 51 | export RETRIEVER_BATCH_SIZE=8 52 | # reader batch size 53 | export READER_BATCH_SIZE=8 54 | # max batch size 55 | export MAX_BATCH_SIZE=8 56 | 57 | # if you use gpu, then set the num of gpus (otherwise, torch.cuda.device_count() is used) 58 | export NUM_GPUS=1 59 | 60 | # 61 | # run ray serve 62 | # 63 | 64 | serve run flashrerank:rerank_deployment 65 | ``` 66 | -------------------------------------------------------------------------------- /neosearch_ai/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_ai/configs/__init__.py -------------------------------------------------------------------------------- /neosearch_ai/configs/app.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import torch 3 | import multiprocessing 4 | 5 | 6 | @dataclass 7 | class NeosAiConfig: 8 | num_of_cpus: int = multiprocessing.cpu_count() 9 | cuda_available: bool = torch.cuda.is_available() 10 | use_llm2vec: bool = False 11 | avoid_thread_contention: bool = True 12 | run_monitoring: bool = True 13 | monitoring_port: int = 8518 14 | -------------------------------------------------------------------------------- /neosearch_ai/configs/embedding_param_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | import torch 4 | 5 | 6 | @dataclass 7 | class ServerParameterManager: 8 | model_name: str = os.environ.get("MODEL_NAME", "answerdotai/ModernBERT-large") 9 | device: str = os.environ.get("DEVICE", "cpu") 10 | precision: int | str | None = os.environ.get("PRECISION", "fp32") 11 | retriever_batch_size: int = int(os.environ.get("RETRIEVER_BATCH_SIZE", 32)) 12 | reader_batch_size: int = int(os.environ.get("READER_BATCH_SIZE", 32)) 13 | max_batch_size: int = int(os.environ.get("MAX_BATCH_SIZE", 32)) 14 | 15 | 16 | class RayParameterManager: 17 | def __init__(self) -> None: 18 | self.num_gpus = int(os.environ.get("NUM_GPUS", torch.cuda.device_count())) 19 | self.min_replicas = int(os.environ.get("MIN_REPLICAS", 1)) 20 | self.max_replicas = int(os.environ.get("MAX_REPLICAS", 1)) 21 | -------------------------------------------------------------------------------- /neosearch_ai/configs/reranker_param_manager.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | import torch 4 | 5 | 6 | @dataclass 7 | class RerankServerParameterManager: 8 | model_name: str = os.environ.get("MODEL_NAME", "rank_zephyr_7b_v1_full") 9 | device: str = os.environ.get("DEVICE", "cpu") 10 | precision: int | str | None = os.environ.get("PRECISION", "fp32") 11 | retriever_batch_size: int = int(os.environ.get("RETRIEVER_BATCH_SIZE", 32)) 12 | reader_batch_size: int = int(os.environ.get("READER_BATCH_SIZE", 32)) 13 | max_batch_size: int = int(os.environ.get("MAX_BATCH_SIZE", 32)) 14 | 15 | 16 | class RerankRayParameterManager: 17 | def __init__(self) -> None: 18 | self.num_gpus = int(os.environ.get("NUM_GPUS", torch.cuda.device_count())) 19 | self.min_replicas = int(os.environ.get("MIN_REPLICAS", 1)) 20 | self.max_replicas = int(os.environ.get("MAX_REPLICAS", 1)) 21 | -------------------------------------------------------------------------------- /neosearch_ai/constants/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_ai/constants/__init__.py -------------------------------------------------------------------------------- /neosearch_ai/constants/logging.py: -------------------------------------------------------------------------------- 1 | LOG_DEFAULT_LOG_NAME = "neosearch_ai" 2 | LOG_DEFAULT_LOG_LEVEL = "DEBUG" 3 | LOG_DEFAULT_CONSOLE_LOG_LEVEL = "WARNING" 4 | LOG_DEFAULT_MAX_BYTES = 10485760 5 | LOG_DEFAULT_BACKUP_COUNT = 10 6 | LOG_DEFAULT_LOGGING_WORKERS = 1 -------------------------------------------------------------------------------- /neosearch_ai/embedding.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append(".") 4 | sys.path.append("..") 5 | 6 | # custom modules 7 | from engine.embeddings import EmbeddingDeployment 8 | 9 | # Deploy the Ray Serve application. 10 | embedding_deployment = EmbeddingDeployment.bind() 11 | -------------------------------------------------------------------------------- /neosearch_ai/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_ai/engine/__init__.py -------------------------------------------------------------------------------- /neosearch_ai/flashrerank.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("..") 4 | 5 | # custom modules 6 | from engine.flash_reranker import FlashRerankDeployment 7 | 8 | # Deploy the Ray Serve application. 
9 | rerank_deployment = FlashRerankDeployment.bind() 10 | -------------------------------------------------------------------------------- /neosearch_ai/pyproject.toml: -------------------------------------------------------------------------------- 1 | # [tool.pdm.build] 2 | # includes = [] 3 | # [build-system] 4 | # requires = ["pdm-backend"] 5 | # build-backend = "pdm.backend" 6 | 7 | 8 | [project] 9 | authors = [ 10 | {name = "YeonwooSung", email = "neos960518@gmail.com"}, 11 | ] 12 | requires-python = "<3.13,>=3.10" 13 | dependencies = [ 14 | "ray[serve]<3.0.0,>=2.34.0", 15 | "fastapi==0.115.6", 16 | "sentence-transformers<4.0.0,>=3.0.1", 17 | "torch<3.0.0,>=2.4.0", 18 | "transformers<5.0.0,>=4.48.3", 19 | "psutil<7.0.0,>=6.0.0", 20 | "vllm<1.0.0,>=0.6.2", 21 | "flashrank[listwise]<1.0.0,>=0.2.9", 22 | "uvloop<1.0.0,>=0.21.0", 23 | ] 24 | name = "neosearch-ai" 25 | version = "0.3.0" 26 | description = "AI components for neosearch" 27 | readme = "README.md" 28 | -------------------------------------------------------------------------------- /neosearch_ai/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_ai/utils/__init__.py -------------------------------------------------------------------------------- /neosearch_ai/utils/singleton.py: -------------------------------------------------------------------------------- 1 | class Singleton(type): 2 | """The singleton metaclass.""" 3 | 4 | _instances: dict = {} 5 | 6 | def __call__(cls, *args, **kwargs): 7 | """Override to create only one instance ever. 8 | 9 | Returns: 10 | object: Instance of the class initialized. 11 | """ 12 | if cls not in cls._instances: 13 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 14 | return cls._instances[cls] -------------------------------------------------------------------------------- /neosearch_crawler/.env.sample: -------------------------------------------------------------------------------- 1 | FOR_TEST=0 2 | PG_CONNECTION_STRING= 3 | -------------------------------------------------------------------------------- /neosearch_crawler/.gitignore: -------------------------------------------------------------------------------- 1 | # commoncrawl data 2 | data/ 3 | 4 | known_urls.txt 5 | web_corpus.parquet 6 | web_corpus.jsonl 7 | -------------------------------------------------------------------------------- /neosearch_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_crawler/__init__.py -------------------------------------------------------------------------------- /neosearch_crawler/constants/__init__.py: -------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | import os 3 | 4 | # custom modules 5 | from .modes import ( 6 | BASE_WEB_CRAWL_AGENT_MODE, 7 | COMMON_CRAWL_RUNNER_MODE, 8 | PARSE_WIKI_TO_PARADEDB_MODE, 9 | ) 10 | 11 | 12 | # Load the environment variables 13 | load_dotenv() 14 | 15 | FOR_TEST = os.getenv("FOR_TEST", "0") == "1" 16 | 17 | __all__ = [ 18 | # __init__.py 19 | "FOR_TEST", 20 | # modes.py 21 | "BASE_WEB_CRAWL_AGENT_MODE", 22 | "COMMON_CRAWL_RUNNER_MODE", 23 | "PARSE_WIKI_TO_PARADEDB_MODE", 24 | ] -------------------------------------------------------------------------------- 
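The `embedding.py` and `flashrerank.py` entry points above only `.bind()` deployment classes that live under `neosearch_ai/engine/` and are launched with `serve run <module>:<deployment>`; the engine classes themselves are not shown in this listing. Below is a minimal, illustrative sketch of what such an env-driven Ray Serve deployment looks like. The class name `ToyEmbeddingDeployment` and the request payload shape are assumptions for illustration, not the repository's actual implementation:

```python
import os

from ray import serve
from sentence_transformers import SentenceTransformer


@serve.deployment(
    num_replicas=int(os.environ.get("MIN_REPLICAS", 1)),
    ray_actor_options={"num_gpus": float(os.environ.get("NUM_GPUS", 0))},
)
class ToyEmbeddingDeployment:
    def __init__(self) -> None:
        # Mirrors the MODEL_NAME / DEVICE defaults read by ServerParameterManager.
        self.model = SentenceTransformer(
            os.environ.get("MODEL_NAME", "answerdotai/ModernBERT-large"),
            device=os.environ.get("DEVICE", "cpu"),
        )

    async def __call__(self, request) -> dict:
        # Ray Serve passes a Starlette Request to __call__ for plain HTTP traffic.
        payload = await request.json()  # assumed shape: {"texts": ["...", ...]}
        embeddings = self.model.encode(payload["texts"]).tolist()
        return {"embeddings": embeddings}


# `serve run <module>:<app>` expects a bound application object, which is what
# embedding.py / flashrerank.py produce with `.bind()`.
toy_embedding_deployment = ToyEmbeddingDeployment.bind()
```

With this sketch saved as `toy_embedding.py`, `serve run toy_embedding:toy_embedding_deployment` would serve it the same way the README commands above serve the real deployments.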
/neosearch_crawler/constants/crawl_seeds.py: -------------------------------------------------------------------------------- 1 | INITIAL_SEEDS = [ 2 | # Wiki english main portals 3 | 'https://en.wikipedia.org/wiki/Main_Page', 4 | 'https://en.wikipedia.org/wiki/Portal:Contents', 5 | 6 | # Korean Wikipedia main portals 7 | 'https://ko.wikipedia.org/wiki/위키백과:대문', 8 | 'https://ko.wikipedia.org/wiki/포털:목차', 9 | 10 | # Naver 지식백과 (encyclopedia) main page 11 | 'https://terms.naver.com/', 12 | 13 | # scholar portals 14 | # academic information portals 15 | 'https://arxiv.org/', 16 | 'https://www.dbpia.co.kr/', 17 | 'https://scholar.google.co.kr/' 18 | ] 19 | 20 | NEWS_SEEDS = [ 21 | 'https://news.naver.com/', 22 | 'https://news.daum.net/', 23 | 'https://news.google.com/' 24 | ] 25 | 26 | OPENSOURCE_DEV_SEEDS = [ 27 | # Open sources 28 | 'https://www.tensorflow.org/', 29 | 'https://pytorch.org/', 30 | 'https://react.dev/', 31 | ] 32 | 33 | # Full seeds (concatenated) 34 | _FULL_SEEDS = INITIAL_SEEDS + NEWS_SEEDS + OPENSOURCE_DEV_SEEDS 35 | FULL_SEEDS = list(set(_FULL_SEEDS)) 36 | -------------------------------------------------------------------------------- /neosearch_crawler/constants/logger.py: -------------------------------------------------------------------------------- 1 | LOG_DEFAULT_LOG_NAME = "neosearch-crawler" 2 | LOG_DEFAULT_LOG_LEVEL = "DEBUG" 3 | LOG_DEFAULT_CONSOLE_LOG_LEVEL = "WARNING" 4 | LOG_DEFAULT_MAX_BYTES = 10485760 5 | LOG_DEFAULT_BACKUP_COUNT = 10 6 | LOG_DEFAULT_LOGGING_WORKERS = 1 7 | -------------------------------------------------------------------------------- /neosearch_crawler/constants/modes.py: -------------------------------------------------------------------------------- 1 | BASE_WEB_CRAWL_AGENT_MODE = "web_crawl_agent" 2 | COMMON_CRAWL_RUNNER_MODE = "cc" 3 | PARSE_WIKI_TO_PARADEDB_MODE = "parse_wiki_to_paradedb" 4 | -------------------------------------------------------------------------------- /neosearch_crawler/crawlers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_crawler/crawlers/__init__.py -------------------------------------------------------------------------------- /neosearch_crawler/crawlers/s3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_crawler/crawlers/s3/__init__.py -------------------------------------------------------------------------------- /neosearch_crawler/datastore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_crawler/datastore/__init__.py -------------------------------------------------------------------------------- /neosearch_crawler/dispatchers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_crawler/dispatchers/__init__.py -------------------------------------------------------------------------------- /neosearch_crawler/dispatchers/base.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from crawlers.base import BaseCrawler 4 | 5 | 6 | class CrawlerDispatcher: 7 | def __init__(self) -> None: 8 | self._crawlers = {} 9 | 10 | def 
register(self, domain: str, crawler: type[BaseCrawler]) -> None: 11 | self._crawlers[r"https://(www\.)?{}\.com/*".format(re.escape(domain))] = crawler 12 | 13 | def get_crawler(self, url: str) -> BaseCrawler: 14 | for pattern, crawler in self._crawlers.items(): 15 | if re.match(pattern, url): 16 | return crawler() 17 | else: 18 | raise ValueError("No crawler found for the provided link") 19 | -------------------------------------------------------------------------------- /neosearch_crawler/dispatchers/lib.py: -------------------------------------------------------------------------------- 1 | from neosearch_crawler.exception.dispatcher import ImproperlyConfigured 2 | 3 | 4 | def user_to_names(user: str | None) -> tuple[str, str]: 5 | if user is None: 6 | raise ImproperlyConfigured("User name is empty") 7 | 8 | name_tokens = user.split(" ") 9 | if len(name_tokens) == 0: 10 | raise ImproperlyConfigured("User name is empty") 11 | elif len(name_tokens) == 1: 12 | first_name, last_name = name_tokens[0], name_tokens[0] 13 | else: 14 | first_name, last_name = " ".join(name_tokens[:-1]), name_tokens[-1] 15 | 16 | return first_name, last_name 17 | -------------------------------------------------------------------------------- /neosearch_crawler/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_crawler/engine/__init__.py -------------------------------------------------------------------------------- /neosearch_crawler/engine/agent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_crawler/engine/agent/__init__.py -------------------------------------------------------------------------------- /neosearch_crawler/engine/agent/base.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class BaseArgs(BaseModel): 5 | id: str 6 | 7 | 8 | class BaseAgent: 9 | def __init__(self): 10 | pass 11 | 12 | def run(self, args: BaseArgs): 13 | pass 14 | -------------------------------------------------------------------------------- /neosearch_crawler/engine/runner/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | # custom modules 4 | from .base import BaseRunner 5 | 6 | 7 | def step(index):
 8 | """Decorator to mark a method as a DAG step.""" 9 | def decorator(func): 10 | setattr(func, "_step_index", index) 11 | 12 | @wraps(func) 13 | def wrapper(*args, **kwargs): 14 | return func(*args, **kwargs) 15 | return wrapper 16 | return decorator 17 | 18 | 19 | __all__ = [ 20 | 'BaseRunner', 21 | 'step' 22 | ] -------------------------------------------------------------------------------- /neosearch_crawler/exception/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_crawler/exception/__init__.py -------------------------------------------------------------------------------- /neosearch_crawler/exception/dispatcher.py: -------------------------------------------------------------------------------- 1 | class ScrabbleException(Exception): 2 | pass 3 | 4 | 5 | class ImproperlyConfigured(ScrabbleException): 6 | pass 7 | 
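`CrawlerDispatcher` above maps a per-domain URL regex to a crawler class and instantiates the first match; the `for`/`else` means `ValueError` is raised only when no registered pattern matched. A hedged usage sketch follows; the concrete crawler class names are assumed from the `crawlers/` package layout rather than taken from the modules themselves:

```python
# Illustrative wiring only: MediumCrawler / GithubCrawler are assumed to be
# the BaseCrawler subclasses defined in crawlers/medium.py and crawlers/github.py.
from dispatchers.base import CrawlerDispatcher
from crawlers.github import GithubCrawler
from crawlers.medium import MediumCrawler

dispatcher = CrawlerDispatcher()
dispatcher.register("medium", MediumCrawler)  # -> r"https://(www\.)?medium\.com/*"
dispatcher.register("github", GithubCrawler)  # -> r"https://(www\.)?github\.com/*"

# get_crawler() returns an *instance* of the first crawler whose pattern
# matches the URL, or raises ValueError if no registered pattern fits.
crawler = dispatcher.get_crawler("https://medium.com/@someone/some-post")
print(type(crawler).__name__)  # MediumCrawler
```

Note that `register()` hardcodes a `.com` suffix, so domains on other TLDs would need their own registration scheme.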
-------------------------------------------------------------------------------- /neosearch_crawler/export_requirements_from_poetry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # check if requirements.txt exists 4 | if [ -f requirements.txt ]; then 5 | echo "requirements.txt already exists. Removing it." 6 | rm requirements.txt 7 | fi 8 | 9 | # poetry export --without-hashes --format=requirements.txt > requirements.txt 10 | uv export --no-hashes --format requirements-txt > requirements.txt -------------------------------------------------------------------------------- /neosearch_crawler/mongo_db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NEOS-AI/Neosearch/144921c094eae48e3aaac078e6f726c342720b78/neosearch_crawler/mongo_db/__init__.py -------------------------------------------------------------------------------- /neosearch_crawler/mongo_db/mongo.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | from pymongo.errors import ConnectionFailure 3 | 4 | # custom modules 5 | from .mongo_config import settings 6 | 7 | 8 | class MongoDatabaseConnector: 9 | """Singleton class to connect to MongoDB database.""" 10 | 11 | _instance: MongoClient = None 12 | 13 | def __new__(cls, *args, **kwargs): 14 | if cls._instance is None: 15 | try: 16 | cls._instance = MongoClient(settings.MONGO_DATABASE_HOST) 17 | except ConnectionFailure as e: 18 | print(f"Couldn't connect to the database: {str(e)}") 19 | raise 20 | 21 | print( 22 | f"Connection to database with uri: {settings.MONGO_DATABASE_HOST} successful" 23 | ) 24 | return cls._instance 25 | 26 | def get_database(self): 27 | return self._instance[settings.MONGO_DATABASE_NAME] 28 | 29 | def close(self): 30 | if self._instance: 31 | self._instance.close() 32 | print("Connection to database has been closed.") 33 | 34 | 35 | connection = MongoDatabaseConnector() -------------------------------------------------------------------------------- /neosearch_crawler/mongo_db/mongo_config.py: -------------------------------------------------------------------------------- 1 | from pydantic_settings import BaseSettings, SettingsConfigDict 2 | 3 | 4 | class Settings(BaseSettings): 5 | model_config = SettingsConfigDict(env_file="../.env", env_file_encoding="utf-8") 6 | 7 | MONGO_DATABASE_HOST: str = ( 8 | "mongodb://mongo1:30001,mongo2:30002,mongo3:30003/?replicaSet=my-replica-set" 9 | ) 10 | MONGO_DATABASE_NAME: str = "scrabble" 11 | 12 | # Optional LinkedIn credentials for scraping your profile 13 | LINKEDIN_USERNAME: str | None = None 14 | LINKEDIN_PASSWORD: str | None = None 15 | 16 | 17 | settings = Settings() 18 | -------------------------------------------------------------------------------- /neosearch_crawler/pyproject.toml: -------------------------------------------------------------------------------- 1 | # [tool.pdm.build] 2 | # includes = [] 3 | # [build-system] 4 | # requires = ["pdm-backend"] 5 | # build-backend = "pdm.backend" 6 | 7 | 8 | [project] 9 | authors = [ 10 | {name = "YeonwooSung", email = "neos960518@gmail.com"}, 11 | ] 12 | requires-python = "<3.13,>=3.10" 13 | dependencies = [ 14 | "ray[serve]<3.0.0,>=2.10.0", 15 | "trafilatura[all]==1.8.0", 16 | "langchain==0.3.19", 17 | "llama-index-core==0.12.35", 18 | "llama-index==0.12.35", 19 | "llama-index-llms-openai==0.3.20", 20 | "llama-index-llms-replicate==0.4.0", 21 | 
"llama-index-embeddings-huggingface==0.5.1", 22 | "aws-lambda-powertools<4.0.0,>=3.0.0", 23 | "selenium>=4.25.0,<5.0.0", 24 | "pymongo<5.0.0,>=4.9.1", 25 | "pydantic-settings<3.0.0,>=2.6.0", 26 | "scrapy<3.0.0,>=2.12.0", 27 | "mypy-boto3-s3<2.0.0,>=1.35.67", 28 | "polars>=1.18.0,<2.0.0", 29 | "scrapegraphai<2.0.0,>=1.37.1", 30 | "sqlmodel>=0.0.23", 31 | "psycopg2>=2.9.10", 32 | "asyncpg>=0.30.0", 33 | "requests>=2.32.3", 34 | "pypdf2>=3.0.1", 35 | ] 36 | name = "neosearch_crawler" 37 | version = "0.2.1" 38 | description = "" 39 | readme = "README.md" 40 | -------------------------------------------------------------------------------- /neosearch_crawler/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .trafilatura_util import extract_url_content 2 | 3 | 4 | __all__ = ["extract_url_content"] 5 | -------------------------------------------------------------------------------- /neosearch_crawler/utils/domain_name_utils.py: -------------------------------------------------------------------------------- 1 | def reverse_domain(domain: str): 2 | """ 3 | Reverse a domain name in URI format. 4 | 5 | Args: 6 | domain (str): The domain name to reverse. 7 | 8 | Returns: 9 | str: The reversed domain name. 10 | """ 11 | parts = domain.split('.') 12 | return '.'.join(reversed(parts)) 13 | 14 | 15 | if __name__ == "__main__": 16 | domain = "com.naver" 17 | reversed_domain = reverse_domain(domain) 18 | print(reversed_domain) # naver.com 19 | -------------------------------------------------------------------------------- /neosearch_crawler/utils/errors.py: -------------------------------------------------------------------------------- 1 | class ScrabbleException(Exception): 2 | pass 3 | 4 | 5 | class ImproperlyConfigured(ScrabbleException): 6 | pass 7 | -------------------------------------------------------------------------------- /neosearch_crawler/utils/pdf_util.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from io import BytesIO 3 | from PyPDF2 import PdfReader 4 | import orjson 5 | 6 | 7 | def extract_pdf_from_url(url): 8 | response = requests.get(url) 9 | response.raise_for_status() 10 | 11 | with BytesIO(response.content) as pdf_file: 12 | reader = PdfReader(pdf_file) 13 | metadata = reader.metadata 14 | 15 | # 제목(title) 추출 16 | title = metadata.title if metadata.title else "" 17 | description = None 18 | 19 | # 본문(content) 추출 20 | content = "" 21 | for page in reader.pages: 22 | content += page.extract_text() 23 | 24 | # 메타데이터(metadata)를 딕셔너리 형태로 변환 25 | metadata_dict = {key[1:]: value for key, value in metadata.items()} 26 | 27 | metadata_dict_str = orjson.dumps(metadata_dict).decode("utf-8") 28 | 29 | return { 30 | "title": title, 31 | "url":url, 32 | "content":content, 33 | "description":description, 34 | "metadata":metadata_dict_str, 35 | } 36 | -------------------------------------------------------------------------------- /neosearch_crawler/utils/singleton.py: -------------------------------------------------------------------------------- 1 | class Singleton(type): 2 | """The singleton metaclass.""" 3 | 4 | _instances: dict = {} 5 | 6 | def __call__(cls, *args, **kwargs): 7 | """Override to create only one instance ever. 8 | 9 | Returns: 10 | object: Instance of the class initialized. 
11 | """ 12 | if cls not in cls._instances: 13 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 14 | return cls._instances[cls] 15 | -------------------------------------------------------------------------------- /neosearch_frontend/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "next/core-web-vitals", 4 | "plugin:import/recommended", 5 | "plugin:import/typescript", 6 | "prettier", 7 | "plugin:tailwindcss/recommended" 8 | ], 9 | "plugins": ["tailwindcss"], 10 | "rules": { 11 | "tailwindcss/no-custom-classname": "off", 12 | "tailwindcss/classnames-order": "off" 13 | }, 14 | "settings": { 15 | "import/resolver": { 16 | "typescript": { 17 | "alwaysTryTypes": true 18 | } 19 | } 20 | }, 21 | "ignorePatterns": ["**/components/ui/**"] 22 | } 23 | -------------------------------------------------------------------------------- /neosearch_frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | node_modules 5 | .pnp 6 | .pnp.js 7 | 8 | # testing 9 | coverage 10 | 11 | # next.js 12 | .next/ 13 | out/ 14 | build 15 | 16 | # misc 17 | .DS_Store 18 | *.pem 19 | 20 | # debug 21 | npm-debug.log* 22 | yarn-debug.log* 23 | yarn-error.log* 24 | .pnpm-debug.log* 25 | 26 | # local env files 27 | .env.local 28 | .env.development.local 29 | .env.test.local 30 | .env.production.local 31 | 32 | # turbo 33 | .turbo 34 | 35 | .env 36 | .vercel 37 | .vscode 38 | .env*.local 39 | -------------------------------------------------------------------------------- /neosearch_frontend/app/(auth)/api/auth/[...nextauth]/route.ts: -------------------------------------------------------------------------------- 1 | export { GET, POST } from '@/app/(auth)/auth'; 2 | -------------------------------------------------------------------------------- /neosearch_frontend/app/(auth)/api/auth/guest/route.ts: -------------------------------------------------------------------------------- 1 | import { signIn } from '@/app/(auth)/auth'; 2 | import { isDevelopmentEnvironment } from '@/lib/constants'; 3 | import { getToken } from 'next-auth/jwt'; 4 | import { NextResponse } from 'next/server'; 5 | 6 | export async function GET(request: Request) { 7 | const { searchParams } = new URL(request.url); 8 | const redirectUrl = searchParams.get('redirectUrl') || '/'; 9 | 10 | const token = await getToken({ 11 | req: request, 12 | secret: process.env.AUTH_SECRET, 13 | secureCookie: !isDevelopmentEnvironment, 14 | }); 15 | 16 | if (token) { 17 | return NextResponse.redirect(new URL('/', request.url)); 18 | } 19 | 20 | return signIn('guest', { redirect: true, redirectTo: redirectUrl }); 21 | } 22 | -------------------------------------------------------------------------------- /neosearch_frontend/app/(auth)/auth.config.ts: -------------------------------------------------------------------------------- 1 | import type { NextAuthConfig } from 'next-auth'; 2 | 3 | export const authConfig = { 4 | pages: { 5 | signIn: '/login', 6 | newUser: '/', 7 | }, 8 | providers: [ 9 | // added later in auth.ts since it requires bcrypt which is only compatible with Node.js 10 | // while this file is also used in non-Node.js environments 11 | ], 12 | callbacks: {}, 13 | } satisfies NextAuthConfig; 14 | -------------------------------------------------------------------------------- 
/neosearch_frontend/app/(chat)/api/chat/schema.ts: -------------------------------------------------------------------------------- 1 | import { z } from 'zod'; 2 | 3 | const textPartSchema = z.object({ 4 | text: z.string().min(1).max(2000), 5 | type: z.enum(['text']), 6 | }); 7 | 8 | export const postRequestBodySchema = z.object({ 9 | id: z.string().uuid(), 10 | message: z.object({ 11 | id: z.string().uuid(), 12 | createdAt: z.coerce.date(), 13 | role: z.enum(['user']), 14 | content: z.string().min(1).max(2000), 15 | parts: z.array(textPartSchema), 16 | experimental_attachments: z 17 | .array( 18 | z.object({ 19 | url: z.string().url(), 20 | name: z.string().min(1).max(2000), 21 | contentType: z.enum(['image/png', 'image/jpg', 'image/jpeg']), 22 | }), 23 | ) 24 | .optional(), 25 | }), 26 | selectedChatModel: z.enum(['chat-model', 'chat-model-reasoning']), 27 | selectedVisibilityType: z.enum(['public', 'private']), 28 | }); 29 | 30 | export type PostRequestBody = z.infer<typeof postRequestBodySchema>; 31 | -------------------------------------------------------------------------------- /neosearch_frontend/app/(chat)/api/history/route.ts: -------------------------------------------------------------------------------- 1 | import { auth } from '@/app/(auth)/auth'; 2 | import type { NextRequest } from 'next/server'; 3 | import { getChatsByUserId } from '@/lib/db/queries'; 4 | import { ChatSDKError } from '@/lib/errors'; 5 | 6 | export async function GET(request: NextRequest) { 7 | const { searchParams } = request.nextUrl; 8 | 9 | const limit = Number.parseInt(searchParams.get('limit') || '10'); 10 | const startingAfter = searchParams.get('starting_after'); 11 | const endingBefore = searchParams.get('ending_before'); 12 | 13 | if (startingAfter && endingBefore) { 14 | return new ChatSDKError( 15 | 'bad_request:api', 16 | 'Only one of starting_after or ending_before can be provided.', 17 | ).toResponse(); 18 | } 19 | 20 | const session = await auth(); 21 | 22 | if (!session?.user) { 23 | return new ChatSDKError('unauthorized:chat').toResponse(); 24 | } 25 | 26 | const chats = await getChatsByUserId({ 27 | id: session.user.id, 28 | limit, 29 | startingAfter, 30 | endingBefore, 31 | }); 32 | 33 | return Response.json(chats); 34 | } 35 | -------------------------------------------------------------------------------- /neosearch_frontend/app/(chat)/api/suggestions/route.ts: -------------------------------------------------------------------------------- 1 | import { auth } from '@/app/(auth)/auth'; 2 | import { getSuggestionsByDocumentId } from '@/lib/db/queries'; 3 | import { ChatSDKError } from '@/lib/errors'; 4 | 5 | export async function GET(request: Request) { 6 | const { searchParams } = new URL(request.url); 7 | const documentId = searchParams.get('documentId'); 8 | 9 | if (!documentId) { 10 | return new ChatSDKError( 11 | 'bad_request:api', 12 | 'Parameter documentId is required.', 13 | ).toResponse(); 14 | } 15 | 16 | const session = await auth(); 17 | 18 | if (!session?.user) { 19 | return new ChatSDKError('unauthorized:suggestions').toResponse(); 20 | } 21 | 22 | const suggestions = await getSuggestionsByDocumentId({ 23 | documentId, 24 | }); 25 | 26 | const [suggestion] = suggestions; 27 | 28 | if (!suggestion) { 29 | return Response.json([], { status: 200 }); 30 | } 31 | 32 | if (suggestion.userId !== session.user.id) { 33 | return new ChatSDKError('forbidden:api').toResponse(); 34 | } 35 | 36 | return Response.json(suggestions, { status: 200 }); 37 | } 38 | 
-------------------------------------------------------------------------------- /neosearch_frontend/app/(chat)/layout.tsx: -------------------------------------------------------------------------------- 1 | import { cookies } from 'next/headers'; 2 | 3 | import { AppSidebar } from '@/components/app-sidebar'; 4 | import { SidebarInset, SidebarProvider } from '@/components/ui/sidebar'; 5 | 6 | import { auth } from '../(auth)/auth'; 7 | import Script from 'next/script'; 8 | 9 | export const experimental_ppr = true; 10 | 11 | export default async function Layout({ 12 | children, 13 | }: { 14 | children: React.ReactNode; 15 | }) { 16 | const [session, cookieStore] = await Promise.all([auth(), cookies()]); 17 | const isCollapsed = cookieStore.get('sidebar:state')?.value !== 'true'; 18 | 19 | return ( 20 | <> 21 |