├── .deepsource.toml ├── .dockerignore ├── .env.example ├── .github └── workflows │ ├── deploy-production.yml │ ├── deploy-staging.yml │ └── python-app.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CLAUDE.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Procfile ├── README.md ├── __init__.py ├── aws ├── aws-cli.md ├── github-actions-policy.json ├── instance-role-parameters-access.json ├── instance-role-policy.json └── service-role-policy.json ├── data └── mawsuah │ ├── README.md │ └── strip_tashkeel.py ├── docs ├── fastapi │ ├── async_await_backgroundtasks_logs_for_tracing.log │ ├── async_await_backgroundtasks_visualized.md │ ├── inheriting_middleware_class.ipynb │ └── middleware_chains_and_request_response_flow.ipynb ├── impl │ └── mongodb.md ├── spec │ └── mongodb.md └── structure_of_api_responses │ ├── a_fastapi_request_received_from_zrok.json │ ├── anthropic_api_structure_of_message_history.json │ ├── anthropic_api_structure_of_response.json │ ├── meta_whatsapp_api_structure_of_a_reply_msg_status.json │ ├── meta_whatsapp_api_structure_of_a_request_sent_using_zrok.json │ ├── meta_whatsapp_api_structure_of_a_user_incoming_msg.json │ └── openai_api_structure_of_chat_completion_chunk_object.ipynb ├── favicon.ico ├── migrate_database.py ├── pyproject.toml ├── pytest.ini ├── setup.sh ├── src └── ansari │ ├── __init__.py │ ├── agents │ ├── __init__.py │ ├── ansari.py │ ├── ansari_claude.py │ └── ansari_workflow.py │ ├── ansari_db.py │ ├── ansari_db_sql.py │ ├── ansari_logger.py │ ├── app │ ├── __init__.py │ ├── main_api.py │ ├── main_api_client.py │ ├── main_discord.py │ ├── main_file.py │ ├── main_stdio.py │ └── main_whatsapp.py │ ├── cli │ ├── __init__.py │ ├── query_api.py │ └── use_tools.py │ ├── config.py │ ├── examples │ ├── test_citations.py │ └── test_search_mawsuah.py │ ├── presenters │ ├── api_presenter.py │ ├── ayah_file_presenter.py │ ├── discord_presenter.py │ ├── file_presenter.py │ ├── gradio_presenter.py │ ├── 
stdio_presenter.py │ └── whatsapp_presenter.py │ ├── resources │ ├── prompts │ │ ├── greeting.txt │ │ ├── news.txt │ │ ├── system_msg_ayah.txt │ │ ├── system_msg_ayah_lay.txt │ │ ├── system_msg_claude.txt │ │ └── system_msg_tool.txt │ └── templates │ │ ├── ask_question.txt │ │ └── password_reset.html │ ├── tools │ ├── __init__.py │ ├── base_search.py │ ├── search_hadith.py │ ├── search_mawsuah.py │ ├── search_quran.py │ ├── search_tafsir_encyc.py │ ├── search_usul.py │ └── search_vectara.py │ └── util │ ├── __init__.py │ ├── general_helpers.py │ ├── prompt_mgr.py │ ├── robust_translation.py │ └── translation.py ├── test_ansari_claude.py ├── tests ├── __init__.py ├── ask-question-en.txt ├── batik-v1-en.csv ├── integration │ ├── README.md │ ├── __init__.py │ ├── test_ansari_generic.py │ ├── test_ansari_integration.py │ ├── test_claude_integration.py │ └── test_helpers.py └── unit │ ├── __init__.py │ ├── test_ansari_claude_document_limiting.py │ ├── test_ansari_claude_empty_text_block.py │ ├── test_ansari_claude_message_sequence.py │ ├── test_ansari_claude_tool_sequence.py │ ├── test_answer_quality.py │ ├── test_citation_formatting.py │ ├── test_convert_message_llm.py │ ├── test_logging_regression.py │ ├── test_main_api.py │ ├── test_message_id_in_thread.py │ ├── test_multilingual_citations.py │ ├── test_multilingual_data_parsing.py │ ├── test_search_mawsuah.py │ ├── test_search_tafsir_encyc.py │ └── test_translation.py ├── update_database.py └── uv.lock /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | [[analyzers]] 4 | name = "shell" 5 | 6 | [[analyzers]] 7 | name = "python" 8 | 9 | [analyzers.meta] 10 | runtime_version = "3.x.x" -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | app.egg-info 4 | *.pyc 5 | .mypy_cache 6 | .coverage 7 | 
htmlcov 8 | .venv 9 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DEPLOYMENT_TYPE="development" # Deployment type (development, staging, production) 2 | FRONTEND_URL="http://localhost:8081" 3 | 4 | KALEMAT_API_KEY="" # Token for Qur'an and Hadith search 5 | ANTHROPIC_API_KEY="" # API key for Claude AI model 6 | OPENAI_API_KEY="" # Token for GPT-4 (Optional) 7 | 8 | # Optional. If not set, the app will not use these services. 9 | SENDGRID_API_KEY="" # API key to send password reset options 10 | 11 | MAILCHIMP_API_KEY="" # API key 12 | MAILCHIMP_SERVER_PREFIX="" # Server prefix (data center) 13 | MAILCHIMP_LIST_ID="" # List ID 14 | 15 | # Database connection string 16 | MONGO_URL="mongodb://localhost:27017" 17 | MONGO_DB_NAME="ansari_db" 18 | 19 | SECRET_KEY="secret" # Secret key for signing tokens 20 | 21 | # Origins to be allowed by the backend 22 | ORIGINS="https://ansari.chat,https://www.ansari.chat,https://pre.ansari.chat" 23 | 24 | # Vectara search engine configuration 25 | VECTARA_API_KEY="" # Authentication token for Vectara API 26 | 27 | QURAN_DOT_COM_API_KEY="" # This is the API key we give to quran.com to access us, not for us to access them 28 | 29 | # Directory for storing templates 30 | template_dir="." 
# Directory path for templates 31 | 32 | # Related to WhatsApp Business and Meta (leave empty if you're not planning to use WhatsApp) 33 | # Source 1: https://www.youtube.com/watch?v=KP6_BUw3i0U 34 | # Watch Until 32:25, while quickly skimming through the non-python code parts 35 | # Source 2 (mentioned in video above): https://glitch.com/edit/#!/insidious-tartan-alvarezsaurus 36 | # (the `verification_webhook` endpoint in `main_whatsapp` is inspired by the above URL) 37 | # Source 3 (optional): https://developers.facebook.com/blog/post/2022/10/24/sending-messages-with-whatsapp-in-your-python-applications/#u_0_39_8q 38 | 39 | # Moreover, if you want to test whatsapp's webhook locally, you can use zrok on a reserved URL with a zrok "share token" 40 | # obtained by contacting its current holder: https://github.com/OdyAsh (source 1, 2 below) 41 | # Alternatively, you can change the webhook url altogether (source 3, 4 below) 42 | # Check these sources for more details: 43 | # Source 1: https://dev.to/odyash/quickly-share-your-app-with-zrok-4ihp 44 | # Source 2: https://openziti.discourse.group/t/how-do-i-use-a-reserved-share-on-different-devices/2379/2 45 | # Source 3: https://youtu.be/KP6_BUw3i0U?t=1294 46 | # (@21:33 and 25:30, however they use glitch instead of zrok, so the video here is just to give you an idea how to setup a webhook) 47 | # Source 4 (where you can change callback url, given that your facebook account gets access by the app's admins): 48 | # https://developers.facebook.com/apps/871020755148175/whatsapp-business/wa-settings/ 49 | # NOTE 1: When you see the `Callback URL`, it will be something like "https://ZROK_SHARE_TOKEN.share.zrok.io/whatsapp/v1" 50 | # (The `/whatsapp/v1` endpoint can be found in `main_whatsapp.py`'s endpoints, that's why it's in the url above) 51 | # NOTE 2: If an unexpected 3rd party discovers the ZROK_SHARE_TOKEN, 52 | # a new one will have to be generated, then added to Meta's callback URL of the *testing* app 53 | # (Noting 
that the *production* app's callback URL will be different anyway, so the 3rd party won't be able to access that app) 54 | # (but we still don't want random calls to be made to our testing app, so that's why we'll still have to change an exposed token :]) 55 | # NOTE 3: Obviously, that `871...175` in the above URL is the testing app's public id, so if this link still doesn't work even after you gain access, 56 | # then the admins most probably created a new test app instance 57 | 58 | WHATSAPP_API_VERSION="<>" 59 | 60 | # NOTE: Contact the team to see whatsapp's 2 phone nums -> one for prod. env. and the other for local/stage testing 61 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID="<>" 62 | 63 | # NOTE 1: check video in source 3 above from 30:45 to 32:15 to see where we get the access token 64 | # NOTE 2: Contact the team to see their 2 access tokens -> one for prod. env. and the other for local/stage testing 65 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER="<>" 66 | 67 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK="<>" 68 | WHATSAPP_CHAT_RETENTION_HOURS=3 69 | ZROK_SHARE_TOKEN="<>" 70 | 71 | # Related to internal code logic 72 | # Leave the values below when locally debugging the application 73 | # In production, don't add them to environment variables, or add them as "INFO"/"False" respectively 74 | LOGGING_LEVEL="DEBUG" 75 | DEV_MODE="True" 76 | 77 | # Application version control settings 78 | MAINTENANCE_MODE="False" # Whether the application is in maintenance mode 79 | 80 | # iOS app build versions 81 | IOS_MINIMUM_BUILD_VERSION="1" # Minimum build version required for iOS app 82 | IOS_LATEST_BUILD_VERSION="1" # Latest available build version for iOS app 83 | 84 | # Android app build versions 85 | ANDROID_MINIMUM_BUILD_VERSION="1" # Minimum build version required for Android app 86 | ANDROID_LATEST_BUILD_VERSION="1" # Latest available build version for Android app 87 | 88 | SENTRY_DSN="" # Sentry DSN for error tracking 89 | 90 | # To get rid of .py[cod] files (This key should NOT 
be set in production!) 91 | # This is only to de-clutter your local development environment 92 | # Details: https://docs.python-guide.org/writing/gotchas/#disabling-bytecode-pyc-files 93 | PYTHONDONTWRITEBYTECODE=1 -------------------------------------------------------------------------------- /.github/workflows/deploy-production.yml: -------------------------------------------------------------------------------- 1 | name: Production Deployment (AWS App Runner) 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-and-deploy: 11 | runs-on: ubuntu-latest 12 | environment: production-aws 13 | 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | with: 18 | persist-credentials: false 19 | 20 | - name: Configure AWS credentials 21 | id: aws-credentials 22 | uses: aws-actions/configure-aws-credentials@v4 23 | with: 24 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 25 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 26 | aws-region: ${{ secrets.AWS_REGION }} 27 | 28 | - name: Login to Amazon ECR 29 | id: login-ecr 30 | uses: aws-actions/amazon-ecr-login@v2 31 | 32 | - name: Build, tag, and push image to Amazon ECR 33 | id: build-image 34 | env: 35 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 36 | ECR_REPOSITORY: ansari-backend 37 | IMAGE_TAG: ${{ github.sha }} 38 | run: | 39 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 
40 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 41 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 42 | 43 | - name: Deploy to App Runner 44 | id: deploy-apprunner 45 | uses: awslabs/amazon-app-runner-deploy@main 46 | env: 47 | DEPLOYMENT_TYPE: production 48 | LOGGING_LEVEL: INFO 49 | FRONTEND_URL: ${{ format('{0}{1}', secrets.SSM_ROOT, 'frontend-url') }} 50 | MONGO_URL: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mongo-url') }} 51 | MONGO_DB_NAME: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mongo-db-name') }} 52 | SECRET_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'secret-key') }} 53 | ORIGINS: ${{ format('{0}{1}', secrets.SSM_ROOT, 'origins') }} 54 | 55 | OPENAI_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'openai-api-key') }} 56 | SENDGRID_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sendgrid-api-key') }} 57 | MAILCHIMP_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-api-key') }} 58 | MAILCHIMP_SERVER_PREFIX: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-server-prefix') }} 59 | MAILCHIMP_LIST_ID: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-list-id') }} 60 | ANTHROPIC_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'anthropic-api-key') }} 61 | KALEMAT_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'kalemat-api-key') }} 62 | SUNNAH_TOKEN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sunnah-token') }} 63 | VECTARA_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'vectara-api-key') }} 64 | QURAN_DOT_COM_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'quran-dot-com-api-key') }} 65 | USUL_API_TOKEN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'usul-api-token') }} 66 | SENTRY_DSN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sentry-dsn') }} 67 | 68 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-access-token-from-sys-user') }} 69 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-business-phone-number-id') }} 70 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK: ${{ 
format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-verify-token-for-webhook') }} 71 | WHATSAPP_ENABLED: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-enabled') }} 72 | WHATSAPP_API_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-api-version') }} 73 | WHATSAPP_CHAT_RETENTION_HOURS: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-chat-retention-hours') }} 74 | 75 | MAINTENANCE_MODE: ${{ format('{0}{1}', secrets.SSM_ROOT, 'maintenance-mode') }} 76 | IOS_MINIMUM_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'ios-minimum-build-version') }} 77 | IOS_LATEST_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'ios-latest-build-version') }} 78 | ANDROID_MINIMUM_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'android-minimum-build-version') }} 79 | ANDROID_LATEST_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'android-latest-build-version') }} 80 | with: 81 | service: ansari-production-backend 82 | image: ${{ steps.build-image.outputs.image }} 83 | access-role-arn: ${{ secrets.SERVICE_ROLE_ARN }} 84 | region: ${{ secrets.AWS_REGION }} 85 | cpu : 1 86 | memory : 2 87 | port: 8000 88 | wait-for-service-stability-seconds: 1200 89 | copy-env-vars: | 90 | DEPLOYMENT_TYPE 91 | LOGGING_LEVEL 92 | copy-secret-env-vars: | 93 | FRONTEND_URL 94 | MONGO_URL 95 | MONGO_DB_NAME 96 | SECRET_KEY 97 | ORIGINS 98 | 99 | OPENAI_API_KEY 100 | SENDGRID_API_KEY 101 | MAILCHIMP_API_KEY 102 | MAILCHIMP_SERVER_PREFIX 103 | MAILCHIMP_LIST_ID 104 | ANTHROPIC_API_KEY 105 | KALEMAT_API_KEY 106 | SUNNAH_TOKEN 107 | VECTARA_API_KEY 108 | QURAN_DOT_COM_API_KEY 109 | USUL_API_TOKEN 110 | SENTRY_DSN 111 | 112 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER 113 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID 114 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK 115 | WHATSAPP_API_VERSION 116 | WHATSAPP_ENABLED 117 | WHATSAPP_CHAT_RETENTION_HOURS 118 | 119 | MAINTENANCE_MODE 120 | IOS_MINIMUM_BUILD_VERSION 121 | IOS_LATEST_BUILD_VERSION 122 | ANDROID_MINIMUM_BUILD_VERSION 123 | ANDROID_LATEST_BUILD_VERSION 124 | 
instance-role-arn: ${{ secrets.INSTANCE_ROLE_ARN }} 125 | 126 | - name: App Runner URL 127 | run: echo "App runner URL ${{ steps.deploy-apprunner.outputs.service-url }}" 128 | -------------------------------------------------------------------------------- /.github/workflows/deploy-staging.yml: -------------------------------------------------------------------------------- 1 | name: Staging Deployment (AWS App Runner) 2 | 3 | on: 4 | push: 5 | branches: 6 | - develop 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-and-deploy: 11 | runs-on: ubuntu-latest 12 | environment: staging-aws 13 | 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | with: 18 | persist-credentials: false 19 | 20 | - name: Configure AWS credentials 21 | id: aws-credentials 22 | uses: aws-actions/configure-aws-credentials@v4 23 | with: 24 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 25 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 26 | aws-region: ${{ secrets.AWS_REGION }} 27 | 28 | - name: Login to Amazon ECR 29 | id: login-ecr 30 | uses: aws-actions/amazon-ecr-login@v2 31 | 32 | - name: Build, tag, and push image to Amazon ECR 33 | id: build-image 34 | env: 35 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 36 | ECR_REPOSITORY: ansari-backend 37 | IMAGE_TAG: ${{ github.sha }} 38 | run: | 39 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 
40 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 41 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 42 | 43 | - name: Deploy to App Runner 44 | id: deploy-apprunner 45 | uses: awslabs/amazon-app-runner-deploy@main 46 | env: 47 | DEPLOYMENT_TYPE: staging 48 | LOGGING_LEVEL: DEBUG 49 | FRONTEND_URL: ${{ format('{0}{1}', secrets.SSM_ROOT, 'frontend-url') }} 50 | MONGO_URL: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mongo-url') }} 51 | MONGO_DB_NAME: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mongo-db-name') }} 52 | SECRET_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'secret-key') }} 53 | ORIGINS: ${{ format('{0}{1}', secrets.SSM_ROOT, 'origins') }} 54 | 55 | OPENAI_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'openai-api-key') }} 56 | SENDGRID_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sendgrid-api-key') }} 57 | MAILCHIMP_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-api-key') }} 58 | MAILCHIMP_SERVER_PREFIX: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-server-prefix') }} 59 | MAILCHIMP_LIST_ID: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-list-id') }} 60 | ANTHROPIC_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'anthropic-api-key') }} 61 | KALEMAT_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'kalemat-api-key') }} 62 | SUNNAH_TOKEN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sunnah-token') }} 63 | VECTARA_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'vectara-api-key') }} 64 | QURAN_DOT_COM_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'quran-dot-com-api-key') }} 65 | USUL_API_TOKEN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'usul-api-token') }} 66 | SENTRY_DSN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sentry-dsn') }} 67 | 68 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-access-token-from-sys-user') }} 69 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-business-phone-number-id') }} 70 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK: ${{ 
format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-verify-token-for-webhook') }} 71 | WHATSAPP_ENABLED: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-enabled') }} 72 | WHATSAPP_API_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-api-version') }} 73 | WHATSAPP_CHAT_RETENTION_HOURS: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-chat-retention-hours') }} 74 | 75 | MAINTENANCE_MODE: ${{ format('{0}{1}', secrets.SSM_ROOT, 'maintenance-mode') }} 76 | IOS_MINIMUM_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'ios-minimum-build-version') }} 77 | IOS_LATEST_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'ios-latest-build-version') }} 78 | ANDROID_MINIMUM_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'android-minimum-build-version') }} 79 | ANDROID_LATEST_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'android-latest-build-version') }} 80 | with: 81 | service: ansari-staging-backend 82 | image: ${{ steps.build-image.outputs.image }} 83 | access-role-arn: ${{ secrets.SERVICE_ROLE_ARN }} 84 | region: ${{ secrets.AWS_REGION }} 85 | cpu : 1 86 | memory : 2 87 | port: 8000 88 | wait-for-service-stability-seconds: 1200 89 | copy-env-vars: | 90 | DEPLOYMENT_TYPE 91 | LOGGING_LEVEL 92 | copy-secret-env-vars: | 93 | FRONTEND_URL 94 | MONGO_URL 95 | MONGO_DB_NAME 96 | SECRET_KEY 97 | ORIGINS 98 | 99 | OPENAI_API_KEY 100 | SENDGRID_API_KEY 101 | MAILCHIMP_API_KEY 102 | MAILCHIMP_SERVER_PREFIX 103 | MAILCHIMP_LIST_ID 104 | ANTHROPIC_API_KEY 105 | KALEMAT_API_KEY 106 | SUNNAH_TOKEN 107 | VECTARA_API_KEY 108 | QURAN_DOT_COM_API_KEY 109 | USUL_API_TOKEN 110 | SENTRY_DSN 111 | 112 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER 113 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID 114 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK 115 | WHATSAPP_API_VERSION 116 | WHATSAPP_ENABLED 117 | WHATSAPP_CHAT_RETENTION_HOURS 118 | 119 | MAINTENANCE_MODE 120 | IOS_MINIMUM_BUILD_VERSION 121 | IOS_LATEST_BUILD_VERSION 122 | ANDROID_MINIMUM_BUILD_VERSION 123 | ANDROID_LATEST_BUILD_VERSION 124 | 
instance-role-arn: ${{ secrets.INSTANCE_ROLE_ARN }} 125 | 126 | - name: App Runner URL 127 | run: echo "App runner URL ${{ steps.deploy-apprunner.outputs.service-url }}" 128 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Ansari CICD 5 | 6 | on: 7 | # Trigger the workflow on push or pull request events to the "api-v2" and "main" branches. 8 | push: 9 | branches: [ "api-v2", "main" ] 10 | pull_request: 11 | branches: [ "api-v2", "main" ] 12 | 13 | permissions: 14 | contents: read 15 | 16 | jobs: 17 | ansari-container-job: 18 | 19 | runs-on: ubuntu-latest 20 | env: 21 | # Set up environment variables and secrets required for the workflow. 22 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 23 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 24 | KALEMAT_API_KEY: ${{ secrets.KALEMAT_API_KEY }} 25 | VECTARA_API_KEY: ${{ secrets.VECTARA_API_KEY }} 26 | MAWSUAH_VECTARA_CORPUS_KEY: ${{ secrets.MAWSUAH_VECTARA_CORPUS_KEY }} 27 | TAFSIR_VECTARA_CORPUS_KEY: ${{ secrets.TAFSIR_VECTARA_CORPUS_KEY }} 28 | QURAN_DOT_COM_API_KEY: ${{ secrets.QURAN_DOT_COM_API_KEY }} 29 | WHATSAPP_API_VERSION: ${{ secrets.WHATSAPP_API_VERSION }} 30 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID: ${{ secrets.WHATSAPP_BUSINESS_PHONE_NUMBER_ID }} 31 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER: ${{ secrets.WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER }} 32 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK: ${{ secrets.WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK }} 33 | PYTHONPATH: src 34 | 35 | # Use a Python 3.10 container 36 | container: python:3.10 37 | 38 | steps: 39 | 40 | # Check out the repository code. 
41 | - name: Check out repository code 42 | uses: actions/checkout@v4 43 | 44 | # Install the `uv` tool. 45 | - name: Install uv 46 | run: | 47 | pip install uv 48 | # TODO(abdullah): create a venv using uv 49 | 50 | # Install Python dependencies, including `ruff`, `pytest`, `pytest-asyncio`, and `pytest-cov`. 51 | - name: Install dependencies 52 | run: | 53 | uv pip install --system ruff pytest pytest-asyncio pytest-cov 54 | if [ -f requirements.txt ]; then uv pip install --system -r requirements.txt; fi 55 | 56 | 57 | # Lint the code using `ruff` and stop the build if there are lint errors. 58 | - name: Lint with ruff 59 | run: | 60 | # stop the build if there are lint errors 61 | ruff check . --config pyproject.toml --output-format=github 62 | 63 | # Run tests using `pytest` and generate a coverage report. 64 | - name: Test with pytest 65 | env: 66 | SECRET_KEY: "secret" # This is a required field. Setting it to a random value to pass the tests. 67 | run: | 68 | pytest --capture=tee-sys --cov=. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Folders 2 | .conda/ 3 | .venv/ 4 | .vscode/ 5 | abandoned/ 6 | bin/ 7 | datasources/ 8 | diskcache_dir/ 9 | docs/recordings/* 10 | etc/ 11 | example_projects/ 12 | lib/ 13 | logs/ 14 | share/ 15 | src/ansari_backend.egg-info/* 16 | tmp/ 17 | dist/ 18 | 19 | # Files 20 | .__atomic-write* 21 | .env 22 | .history 23 | *.pyc 24 | pyvenv.cfg 25 | zrok.exe 26 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.7.3 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | types_or: [ python, pyi, jupyter ] 9 | args: [ --fix ] 10 | # Run the formatter. 
11 | - id: ruff-format 12 | types_or: [ python, pyi, jupyter ] -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.13 2 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # Ansari Backend - Developer Guide 2 | 3 | ## Branch Management 4 | - Always create new branches from the `develop` branch, NOT from `main` 5 | - Use descriptive branch names that reflect the feature or fix being implemented 6 | - Keep branches focused on a single feature or fix 7 | - Delete branches after they're merged to keep the repository clean 8 | 9 | ## Repository Organization 10 | - Keep the root directory clean and organized 11 | - Place temporary files, debug scripts, and other non-production artifacts in the `tmp/` directory 12 | - The `tmp/` directory is gitignored, making it perfect for development-only files 13 | - Make sure scripts and tools intended for the repository are placed in appropriate subdirectories 14 | 15 | ## Git Commit and PR Guidelines 16 | - Do not include "Generated with Claude Code" or "Co-Authored-By: Claude" in commit messages 17 | - Do not include "Generated with Claude Code" in PR descriptions or anywhere else 18 | - Keep commit messages concise and descriptive 19 | - Use imperative mood in commit messages (e.g., "Add feature" not "Added feature") 20 | - Always run `ruff check` and `ruff format` before committing changes 21 | - Fix all linting errors - clean code is maintainable code 22 | - All PRs should target the `develop` branch, not `main` 23 | 24 | ## Branch Management Details 25 | - Consider a merged branch "done" - do not add new changes to it 26 | - If you have changes after a branch was merged: 27 | - Create a new branch from the latest develop branch 28 | - Apply your new changes there 29 | - 
Create a new PR with a descriptive name 30 | - For related but separate features, use separate branches and PRs 31 | - Delete branches after they're merged to keep the repository clean 32 | 33 | ## Build/Test/Lint Commands 34 | - Install dependencies: `pip install -r requirements.txt` 35 | - Run backend service: `uvicorn main_api:app --reload` 36 | - Run CLI version (interactive): 37 | - Claude: `python src/ansari/app/main_stdio.py -a AnsariClaude` 38 | - OpenAI: `python src/ansari/app/main_stdio.py -a Ansari` 39 | - Run CLI with direct input: 40 | - `python src/ansari/app/main_stdio.py -i "your question here"` 41 | - `python src/ansari/app/main_stdio.py --input "your question here"` 42 | - Run tests: `pytest tests/` 43 | - Run single test: `pytest tests/path/to/test.py::test_function_name` 44 | - Run tests with specific marker: `pytest -m integration` 45 | - Lint code: `ruff check src/` 46 | - Format code: `ruff format src/` 47 | - Package commands: 48 | - Build package: `python -m build` 49 | - Upload to PyPI: `twine upload dist/*` (requires PyPI credentials) 50 | 51 | ## Code Style Guidelines 52 | - **Imports**: Use absolute imports within the `ansari` package 53 | - **Formatting**: Double quotes for strings, 4-space indentation 54 | - **Line length**: 127 characters maximum 55 | - **Types**: Use Python type hints for function parameters and return types 56 | - **Naming**: Use snake_case for variables/functions, PascalCase for classes 57 | - **Error handling**: Use try/except blocks with specific error types 58 | - Prefer clean failures over unpredictable recovery attempts 59 | - Log errors clearly and completely before failing 60 | - Do not attempt to "fix" malformed data that could lead to unexpected behavior 61 | - If recovery is necessary, implement it as a well-tested, dedicated fix rather than ad-hoc patches 62 | - Avoid cascading fallbacks - throw clear errors instead 63 | - **Logging**: Use the logger from `ansari.ansari_logger.get_logger()` 64 | - 
**Documentation**: Add docstrings to functions, especially complex ones 65 | - **Testing**: Create unit tests in `tests/unit/` and integration tests in `tests/integration/` 66 | - **Citations**: 67 | - All search tools must format document data as multilingual JSON using `format_multilingual_data` 68 | - The data format must be valid JSON following the schema in `base_search.py` documentation 69 | - Store properly formatted JSON in the `data` field of document references 70 | - Citation handling should account for both full document citations (valid JSON) and partial citations (plain text) 71 | - **Test-first development**: Always write tests before shipping features 72 | - Write tests that validate both expected behavior and edge cases 73 | - When fixing bugs, first write a test that reproduces the issue 74 | - Run tests frequently during development to catch regressions 75 | - **Code complexity management**: 76 | - Break down complex methods into smaller, focused helpers with clear responsibilities 77 | - Use meaningful method names that describe what the method does, not how it does it 78 | - Add clear comments about the purpose and behavior of complex code 79 | - Extract state machine logic into clearly defined handlers for each state 80 | - Aim for methods that can be understood without scrolling 81 | - **Error handling philosophy**: Prefer clean failures over unpredictable recovery attempts 82 | - Log errors clearly and completely before failing 83 | - Do not attempt to "fix" malformed data that could lead to unexpected behavior 84 | - If recovery is necessary, implement it as a well-tested, dedicated fix rather than ad-hoc patches 85 | 86 | ## Testing Best Practices 87 | - Run tests before committing: `pytest tests/` 88 | - Run specific test categories: `pytest tests/unit/` or `pytest tests/integration/` 89 | - Add tests for new functionality in the appropriate directory 90 | - Use fixture factories to keep tests maintainable 91 | - Test both happy path and 
error conditions 92 | - Keep tests independent (no dependencies between test functions) -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.13 2 | 3 | ENV PYTHONUNBUFFERED=1 4 | 5 | WORKDIR /app/ 6 | 7 | # Install uv 8 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv 9 | COPY --from=ghcr.io/astral-sh/uv:0.5.11 /uv /uvx /bin/ 10 | 11 | # Place executables in the environment at the front of the path 12 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#using-the-environment 13 | ENV PATH="/app/.venv/bin:$PATH" 14 | 15 | # Compile bytecode 16 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#compiling-bytecode 17 | ENV UV_COMPILE_BYTECODE=1 18 | 19 | # uv Cache 20 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#caching 21 | ENV UV_LINK_MODE=copy 22 | 23 | # Install dependencies 24 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers 25 | RUN --mount=type=cache,target=/root/.cache/uv \ 26 | --mount=type=bind,source=uv.lock,target=uv.lock \ 27 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 28 | uv sync --frozen --no-install-project 29 | 30 | ENV PYTHONPATH=/app 31 | 32 | COPY ./pyproject.toml ./uv.lock /app/ 33 | 34 | COPY ./src/ansari /app/ansari 35 | 36 | # Sync the project 37 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers 38 | RUN --mount=type=cache,target=/root/.cache/uv \ 39 | uv sync 40 | 41 | CMD ["fastapi", "run", "--workers", "4", "ansari/app/main_api.py"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Ansari Project 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this 
software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include requirements.txt 4 | include pyproject.toml 5 | include MANIFEST.in 6 | 7 | recursive-include src/ansari/resources * 8 | recursive-include src/ansari/templates * -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: PYTHONPATH=$PYTHONPATH:src gunicorn -w 4 -k uvicorn.workers.UvicornWorker --pythonpath src ansari.app.main_api:app --max-requests 500 2 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/__init__.py -------------------------------------------------------------------------------- /aws/aws-cli.md: -------------------------------------------------------------------------------- 1 | # Create App Runner Service Role 2 | aws iam create-role --role-name CustomAppRunnerServiceRole ` 3 | --assume-role-policy-document file://service-role-policy.json ` 4 | --profile ansari --region us-west-2 5 | 6 | aws iam attach-role-policy ` 7 | --policy-arn arn:aws:iam::aws:policy/service-role/AWSAppRunnerServicePolicyForECRAccess ` 8 | --role-name CustomAppRunnerServiceRole ` 9 | --profile ansari --region us-west-2 10 | 11 | # Create GitHub Actions User 12 | aws iam create-policy ` 13 | --policy-name CustomGitHubActionsPolicy ` 14 | --policy-document file://github-actions-policy.json ` 15 | --profile ansari --region us-west-2 16 | 17 | aws iam create-user ` 18 | --user-name app-runner-github-actions-user ` 19 | --profile ansari --region us-west-2 20 | 21 | aws iam attach-user-policy ` 22 | 
--policy-arn arn:aws:iam::<YOUR_ACCOUNT_ID>:policy/CustomGitHubActionsPolicy
| "ecr:GetDownloadUrlForLayer", 30 | "ecr:BatchGetImage", 31 | "ecr:BatchCheckLayerAvailability", 32 | "ecr:PutImage", 33 | "ecr:InitiateLayerUpload", 34 | "ecr:UploadLayerPart", 35 | "ecr:CompleteLayerUpload", 36 | "ecr:GetAuthorizationToken" 37 | ], 38 | "Resource": "*" 39 | } 40 | ] 41 | } -------------------------------------------------------------------------------- /aws/instance-role-parameters-access.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action":[ 7 | "ssm:DescribeParameters" 8 | ], 9 | "Resource": "*" 10 | }, 11 | { 12 | "Effect": "Allow", 13 | "Action":[ 14 | "ssm:GetParameters", 15 | "ssm:GetParameter", 16 | "ssm:GetParametersByPath" 17 | ], 18 | "Resource": [ 19 | "arn:aws:ssm:::parameter/app-runtime/*" 20 | ] 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /aws/instance-role-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "tasks.apprunner.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /aws/service-role-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "build.apprunner.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /data/mawsuah/README.md: -------------------------------------------------------------------------------- 1 | # Arabic Diacritic Stripping for Word Documents 2 | 3 | ## Overview 4 | 5 | This Python script prepares 
Arabic text in Microsoft Word documents from "The Kuwaiti Encyclopaedia of Islamic Jurisprudence" for use with Vectara Arabic text embedding models. It does this by removing diacritics (tashkeel) from the text. This preprocessing step is essential for optimal semantic indexing and search functionality with Vectara. 6 | 7 | ## Why is this Important? 8 | 9 | Without removing diacritics, Vectara's Arabic text embedding models cannot accurately represent the core meaning of words. This severely hinders the effectiveness of semantic search within the text. 10 | 11 | ## Script Functionality 12 | 13 | The script leverages the following libraries to achieve its task: 14 | 15 | * **textract:** Extracts text content from Microsoft Word (.doc) files. 16 | * **pyarabic.araby:** Provides tools for stripping diacritics from Arabic text. 17 | 18 | ## How to Use 19 | 20 | **1. Install Dependencies** 21 | 22 | Ensure you have the required libraries: 23 | 24 | ```bash 25 | pip install PyArabic==0.6.15 textract==1.6.5 tqdm==4.66.1 26 | ``` 27 | 28 | **2. Obtain the Source Documents** 29 | 30 | * Download "The Kuwaiti Encyclopaedia of Islamic Jurisprudence" Word documents from [this link](https://content.awqaf.gov.kw/BasicPages/2020/9/4fcf6da511ff40cfa278d5873f5ff3ad.rar). 31 | * Unrar the archive. 32 | * Place the extracted Word documents in a dedicated directory. 33 | 34 | **3. Configure the Script** 35 | 36 | * Open the Python script. 37 | * Update the `input_dir` variable with the full path to the directory containing the Word documents. 38 | 39 | **4. Run the Script** 40 | 41 | Execute the script from your terminal: 42 | 43 | ```bash 44 | python strip_tashkeel.py 45 | ``` 46 | 47 | The script will process each Word document (.doc) in your specified directory and create a corresponding text file (.txt) with diacritics removed. The output files will be saved in a new folder called "txt" within the input directory. 
from pathlib import Path, PurePath

import textract
from pyarabic import araby
from tqdm.auto import tqdm

from ansari.ansari_logger import get_logger

logger = get_logger(__name__)


def strip_tashkeel_from_doc(input_file, output_file):
    """Strip Arabic diacritics (tashkeel) from a Word document and save as text.

    Args:
        input_file: Path to the source Microsoft Word (.doc) file.
        output_file: Path of the .txt file to write the stripped text to.
    """
    # textract returns raw bytes; the encyclopaedia documents are UTF-8.
    text = textract.process(input_file).decode("utf-8")

    stripped_text = araby.strip_diacritics(text)

    # Bug fix: the "txt" output directory was never created, so the first
    # write raised FileNotFoundError (the README promises the folder is
    # created by the script). Ensure it exists before writing.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(stripped_text)


input_dir = Path("/path/to/The Kuwaiti Encyclopaedia of Islamic Jurisprudence/word")
# Sibling directory named "txt":
# --> "/path/to/The Kuwaiti Encyclopaedia of Islamic Jurisprudence/txt"
output_dir = input_dir.with_name("txt")

# Process every Word document in the input directory. sorted() makes the
# processing order deterministic across runs.
for input_file in tqdm(sorted(input_dir.glob("*.doc"))):
    if input_file.is_file():  # glob("*.doc") already filters on suffix
        logger.info(f"Processing {input_file.name}...")
        strip_tashkeel_from_doc(
            input_file,
            output_dir / input_file.with_suffix(".txt").name,
        )
        logger.info(f"Done processing {input_file.name}")
The content of a message has changed from being a simple string to a list of polymorphic types. 17 | 18 | Thus we need to consider whether to augment our existing SQL db with supporting a set of polymorphic blocks and then 19 | having a three level hierarchy (threads --> messages --> blocks), or switch to a document database. 20 | 21 | Also, this affects the frontend-backend protocol. The frontend was initially designed for the simple representation. 22 | 23 | But now as we try to render more advanced things, it has become a requirement to migrate to this. 24 | 25 | We've also committed to Claude as the backend for Ansari. 26 | 27 | We've chosen to migrate threads (only threads) to MongoDB. This is the plan for how to do that. 28 | 29 | ## What needs to change 30 | 31 | Threads, Messages and Blocks will be stored in MongoDB. 32 | 33 | Here is the current definition of a thread and messages (from sql/00_create_schema.sql): 34 | 35 | ```sql 36 | -- Threads table - integrated for both web and WhatsApp users 37 | CREATE TABLE threads ( 38 | id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), 39 | name VARCHAR(100), 40 | user_id UUID NOT NULL, 41 | initial_source source_type NOT NULL DEFAULT 'web', 42 | created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 43 | updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 44 | FOREIGN KEY (user_id) REFERENCES users(id) 45 | ); 46 | 47 | -- Messages table - integrated for both web and WhatsApp users 48 | CREATE TABLE messages ( 49 | id SERIAL PRIMARY KEY, 50 | user_id UUID NOT NULL, 51 | thread_id UUID NOT NULL, 52 | role TEXT NOT NULL, 53 | tool_name TEXT, 54 | tool_details JSONB DEFAULT '{}'::jsonb, 55 | ref_list JSONB, 56 | content TEXT NOT NULL, 57 | timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 58 | source source_type NOT NULL DEFAULT 'web', 59 | created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 60 | updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 61 | FOREIGN KEY (user_id) REFERENCES users(id), 62 | FOREIGN KEY (thread_id) REFERENCES 
threads(id) ON DELETE CASCADE 63 | ); 64 | ``` 65 | As you can see it's pretty messy. 66 | 67 | Instead, we will be adopting Anthropic's representation of a message as outlined at: 68 | 69 | https://docs.anthropic.com/en/api/messages 70 | 71 | Not only will we be using this for our storage representation, but we will be storing it 72 | as our wire format and largely passing it unmodified through to the frontend. 73 | 74 | ## How we'll do this 75 | 76 | We will set up a MongoDB serverless instance using mongo cloud. 77 | 78 | We will add a new set of endpoints at /api/v3 for 79 | 80 | - GET /threads # Get All Threads 81 | - POST /threads # Create A Thread 82 | - POST /threads/{thread_id} # Add a message to a thread 83 | - GET /threads/{thread_id} # Get a thread 84 | - DELETE /threads/{thread_id} # Delete a thread 85 | - POST /share/{thread_id} # Snapshot a thread for sharing 86 | - GET /share/{thread_id} # See a shared thread 87 | - POST /threads/{thread_id}/name # Set the name of a thread. 88 | 89 | We need to work out how to structure the FastAPI calls to support this. 90 | 91 | ## Historical threads 92 | 93 | The above methods will still hit the existing database for existing threads. 94 | 95 | But they will return values in the simpler format we used to use. Newly created threads 96 | will be stored only in MongoDB. The above calls will have to do some fusion. 97 | 98 | ## Ansari Classes that need to change 99 | 100 | - main_api.py -- this is one of the messier files in the code base. We should take this as an opportunity to clean this up. How Ansari objects are created will also need to be modified to use the new derived classes below. 101 | - ansari_db.py -- Also messy. We may create a derived class from AnsariDB specifically for supporting the new use cases. We will create a new 102 | - ansari_claude.py -- This changes the way many things work in Ansari Claude. We will create a derived class called AnsariClaudeHybrid. 103 | - Misc tests. 
104 | 105 | ## Long term migration 106 | 107 | Eventually we will move all our efforts to the new service, and we will deprecate /api/v2. 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /docs/structure_of_api_responses/a_fastapi_request_received_from_zrok.json: -------------------------------------------------------------------------------- 1 | { 2 | "scope": { 3 | "type": "http", 4 | "asgi": { 5 | "version": "3.0", 6 | "spec_version": "2.4" 7 | }, 8 | "http_version": "1.1", 9 | "server": ["127.0.0.1", 8000], // When running locally 10 | "client": ["127.0.0.1", 11563], // The port here changes dynamically 11 | "scheme": "http", 12 | "method": "POST", 13 | "root_path": "", 14 | "path": "/api/v2/users/login", 15 | "raw_path": "/api/v2/users/login", 16 | "query_string": "", 17 | "headers": [ 18 | ["host", "localhost:8000"], 19 | ["connection", "keep-alive"], 20 | ["content-length", "83"], 21 | ["sec-ch-ua-platform", "\"Windows\""], 22 | ["user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"], 23 | ["x-mobile-ansari", "ANSARI"], 24 | ["sec-ch-ua", "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\""], 25 | ["content-type", "application/json"], 26 | ["sec-ch-ua-mobile", "?0"], 27 | ["accept", "*/*"], 28 | ["origin", "http://localhost:8081"], 29 | ["sec-fetch-site", "same-site"], 30 | ["sec-fetch-mode", "cors"], 31 | ["sec-fetch-dest", "empty"], 32 | ["referer", "http://localhost:8081/"], 33 | ["accept-encoding", "gzip, deflate, br, zstd"], 34 | ["accept-language", "en-GB,en;q=0.9,ar-EG;q=0.8,ar;q=0.7,en-US;q=0.6"] 35 | ], 36 | "state": {}, 37 | "app": "", 38 | "starlette.exception_handlers": { 39 | "": "", 40 | "": "", 41 | "": "", 42 | "": "" 43 | }, 44 | "router": "", 45 | "endpoint": "", 46 | "path_params": {}, 47 | "route": { 48 | "path": "/api/v2/users/login", 49 | "name": "login_user", 50 | "methods": 
["POST"] 51 | } 52 | }, 53 | "_receive": "", 54 | "_send": ".wrapped_app..sender>", 55 | "_stream_consumed": true, 56 | "_is_disconnected": false, 57 | "_form": null, 58 | // _body's value (and other strings) were actually binary strings (i.e., start with b'...') 59 | "_body": "{\"email\":\"guest_<>@endeavorpal.com\",\"password\":\"<>\",\"guest\":true}", 60 | // this is what actually gets returned when accessing headers property (e.g., `request.headers`) 61 | // Check Starlette's implementation (which FastAPI uses) for details: 62 | // https://github.com/encode/starlette/blob/b68a142a356ede730083347f254e1eae8b5c803e/starlette/requests.py#L12 63 | "_headers": { 64 | "host": "localhost:8000", 65 | "connection": "...", 66 | "...": "..." 67 | // I.e., the value of the `_headers` key is a dictionary of the headers already mentioned above 68 | }, 69 | "_json": { 70 | "email": "guest_<>@endeavorpal.com", 71 | "...": ["..."] 72 | // I.e., the value of the `_json` key is simply the dictionary equivalent of `_body`'s string value 73 | }, 74 | "_query_params": "", 75 | "_cookies": {} 76 | } -------------------------------------------------------------------------------- /docs/structure_of_api_responses/anthropic_api_structure_of_message_history.json: -------------------------------------------------------------------------------- 1 | [ // This is a dummy example, so don't focus on the content of the messages 2 | { 3 | "role": "user", 4 | "content": "Can you search for information about machine learning?" 5 | }, 6 | { 7 | "role": "assistant", 8 | "content": [ 9 | { 10 | "type": "text", 11 | "text": "I'll search for information about machine learning for you." 
12 | }, 13 | { 14 | "type": "tool_use", 15 | "id": "tool-123456", 16 | "name": "search_database", 17 | "input": { 18 | "query": "machine learning", 19 | "limit": 3 20 | } 21 | } 22 | ] 23 | }, 24 | { 25 | "role": "user", 26 | "content": [ 27 | { 28 | "type": "tool_result", 29 | "tool_use_id": "tool-123456", 30 | "content": "{\"results\": [{\"title\": \"Introduction to ML\", \"url\": \"https://example.com/ml1\"}, {\"title\": \"Deep Learning Fundamentals\", \"url\": \"https://example.com/ml2\"}]}" 31 | } 32 | ] 33 | }, 34 | { 35 | "role": "assistant", 36 | "content": [ 37 | { 38 | "type": "text", 39 | "text": "Based on my search, here are some resources about machine learning:\n\n1. Introduction to ML - This resource covers the basics of machine learning algorithms and techniques.\n2. Deep Learning Fundamentals - This focuses on neural networks and deep learning approaches." 40 | } 41 | ] 42 | }, 43 | { 44 | "role": "user", 45 | "content": "Can you explain neural networks in more detail?" 46 | }, 47 | { 48 | "role": "assistant", 49 | "content": [ 50 | { 51 | "type": "text", 52 | "text": "Neural networks are computational models inspired by the human brain. They consist of layers of interconnected nodes or \"neurons\" that process information.\n\nThe basic structure includes:\n\n1. Input layer: Receives initial data\n2. Hidden layers: Perform computations on inputs\n3. Output layer: Produces the final result\n\nEach connection between neurons has a weight that gets adjusted during training[1]." 
53 | } 54 | ] 55 | } 56 | ] -------------------------------------------------------------------------------- /docs/structure_of_api_responses/anthropic_api_structure_of_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "content_block_start | content_block_delta | content_block_stop | message_delta | message_stop", 3 | "content_block": { 4 | "type": "text | tool_use", 5 | "id": "tool-123456", 6 | "name": "search_database" 7 | }, 8 | "delta": { 9 | "text": "Here's information about your query...", 10 | "partial_json": "{ \"query\": \"ma", 11 | "type": "citations_delta", 12 | "citation": { 13 | "start": 23, 14 | "end": 45, 15 | "number": 1, 16 | "text": "according to source X..." 17 | }, 18 | "stop_reason": "end_turn | tool_use" 19 | } 20 | } -------------------------------------------------------------------------------- /docs/structure_of_api_responses/meta_whatsapp_api_structure_of_a_reply_msg_status.json: -------------------------------------------------------------------------------- 1 | { 2 | "object": "whatsapp_business_account", 3 | "entry": [ 4 | { 5 | "id": "<>", 6 | "changes": [ 7 | { 8 | "value": { 9 | "messaging_product": "whatsapp", 10 | "metadata": { 11 | "display_phone_number": "< 15555555555)>>", 12 | "phone_number_id": "<>" 13 | }, 14 | "statuses": [ 15 | { 16 | "id": "wamid.<>", 17 | "status": "<>", 18 | "timestamp": "<>", 19 | "recipient_id": "<>", 20 | "conversation": { 21 | "id": "a hexadecimal representation of a hash or a unique identifier (could be MD5 hash or UUID) for the conversation", 22 | "origin": { 23 | "type": "service" 24 | } 25 | }, 26 | "pricing": { 27 | "billable": "True/False (actually sent as a boolean value, so no quotes)", 28 | "pricing_model": "CBP", 29 | "category": "service" 30 | } 31 | } 32 | ] 33 | }, 34 | "field": "messages" 35 | } 36 | ] 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- 
/docs/structure_of_api_responses/meta_whatsapp_api_structure_of_a_request_sent_using_zrok.json: -------------------------------------------------------------------------------- 1 | { 2 | "scope": { 3 | "type": "http", 4 | "asgi": { 5 | "version": "3.0", 6 | "spec_version": "2.4" 7 | }, 8 | "http_version": "1.1", 9 | "server": ["127.0.0.1", 8000], // When running locally 10 | "client": ["<>", 0], 11 | "scheme": "https", 12 | "method": "POST", 13 | "root_path": "", 14 | "path": "/whatsapp/v1", 15 | "raw_path": "/whatsapp/v1", 16 | "query_string": "", 17 | "headers": [ 18 | ["host", "YOUR_ZROK_SHARE_TOKEN.share.zrok.io"], 19 | ["user-agent", "facebookexternalua"], 20 | ["content-length", "545"], 21 | ["accept", "*/*"], 22 | ["accept-encoding", "deflate, gzip"], 23 | ["content-type", "application/json"], 24 | ["x-amzn-trace-id", "Root=1-674b2035-0f0a8ab27075asce3324dcdb"], // trace value here is fake 25 | ["x-forwarded-for", "173.REST.OF.IP, <>"], 26 | ["x-forwarded-port", "443"], 27 | ["x-forwarded-proto", "https"], 28 | ["x-hub-signature", "sha1=8a3e35da6fb5dfaaf5aaa46c8d059d519e18112d"], // sha1 hash here is fake 29 | ["x-hub-signature-256", "sha256=51d62480d40ffd0f48d1cde1ea47656452fd65b5ac29077fe3c6b4e68d74c827"], // sha256 here is fake 30 | ["x-proxy", "zrok"] 31 | ], 32 | "state": {}, 33 | "app": "", 34 | "starlette.exception_handlers": { 35 | "": "", 36 | "": "", 37 | "": "", 38 | "": "" 39 | }, 40 | "router": "", 41 | "endpoint": "", 42 | "path_params": {}, 43 | "route": { 44 | "path": "/whatsapp/v1", 45 | "name": "main_webhook", 46 | "methods": ["POST"] 47 | } 48 | }, 49 | "_receive": "", 50 | "_send": ".wrapped_app..sender>", 51 | "_stream_consumed": true, 52 | "_is_disconnected": false, 53 | "_form": null, 54 | "_query_params": "", 55 | // this is what actually gets returned when accessing headers property (e.g., `request.headers`) 56 | // Check Starlette's implementation (which FastAPI uses) for details: 57 | // 
https://github.com/encode/starlette/blob/b68a142a356ede730083347f254e1eae8b5c803e/starlette/requests.py#L125 58 | "_headers": { 59 | "host": "...", 60 | "user-agent": "...", 61 | "...": "..." 62 | // I.e., the value of the `_headers` key is a dictionary of the headers already mentioned above 63 | }, 64 | "_cookies": {}, 65 | // _body's value (and other strings) were actually binary strings (i.e., start with b'...') 66 | // Also, it contains content mentioned in other `meta_whatsapp_*.json` files 67 | "_body": "{\"object\":\"whatsapp_business_account\", ...}", 68 | "_json": { 69 | "object": "whatsapp_business_account", 70 | "...": ["..."] 71 | // I.e., the value of the `_json` key is simply the dictionary equivalent of `_body`'s string value 72 | } 73 | } -------------------------------------------------------------------------------- /docs/structure_of_api_responses/openai_api_structure_of_chat_completion_chunk_object.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# This documentation is inferred from OpenAI's official documentation as of 2025-01-01\n", 10 | "\n", 11 | "\n", 12 | "def ModelResponse():\n", 13 | " return (\n", 14 | " \"This object returned from here (in case `stream=True`): \"\n", 15 | " + \"https://platform.openai.com/docs/api-reference/chat/streaming\"\n", 16 | " )\n", 17 | "\n", 18 | "\n", 19 | "def StreamingChoices():\n", 20 | " return (\n", 21 | " \"This object returned under `choices` as mentioned here: \"\n", 22 | " + \"https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices\"\n", 23 | " )\n", 24 | "\n", 25 | "\n", 26 | "def Delta():\n", 27 | " return (\n", 28 | " \"This object returned under `delta` as mentioned under `choices` here: \"\n", 29 | " + 
\"https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices\"\n", 30 | " )\n", 31 | "\n", 32 | "\n", 33 | "def ChatCompletionDeltaToolCall():\n", 34 | " return (\n", 35 | " \"This object returned under `tool_calls` as mentioned in this abstract implementation: \"\n", 36 | " + \"https://github.com/openai/openai-python/blob/main/src/openai/types/chat/chat_completion_message_tool_call.py\"\n", 37 | " )\n", 38 | "\n", 39 | "\n", 40 | "def Function():\n", 41 | " return (\n", 42 | " \"This object returned under `ChatCompletionDeltaToolCall` object as mentioned in this abstract implementation: \"\n", 43 | " + \"https://github.com/openai/openai-python/blob/main/src/openai/types/chat/chat_completion_message_tool_call.py\"\n", 44 | " )" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "ModelResponse(\n", 54 | " id=\"Unique identifier for the response.\",\n", 55 | " choices=[\n", 56 | " StreamingChoices(\n", 57 | " finish_reason=(\n", 58 | " \"The reason the model stopped generating tokens. 
This will be `stop` if model hits a natural stop point or a \"\n", 59 | " + \"provided stop sequence, `length` if the maximum number of tokens specified in the request was reached, \"\n", 60 | " + \"`content_filter` if content was omitted due to a flag from our content filters,\"\n", 61 | " + \"`tool_calls` if the model called a tool, \"\n", 62 | " + \"or function_call (deprecated) if the model called a function.\"\n", 63 | " ),\n", 64 | " index=\"The index of the choice in the list of choices.\",\n", 65 | " delta=Delta(\n", 66 | " refusal=\"Reason for refusal, if any.\",\n", 67 | " content=(\n", 68 | " \"The contents of the chunk message.\"\n", 69 | " + \"SUBTLE NOTE: Will be `None` if the value of the `tool_calls` key is not an empty list `[]`.\"\n", 70 | " ),\n", 71 | " role=\"The role of the author of this message chunk (user, role, or assistant).\",\n", 72 | " function_call=(\n", 73 | " \"###### Deprecated ###### and replaced by `tool_calls`.\"\n", 74 | " + \"The name and arguments of a function that should be called, as generated by the model.\"\n", 75 | " ),\n", 76 | " # SUBTLE NOTE: `tool_calls` value will be an empty list if the model deduced that no tool calls are needed\n", 77 | " tool_calls=[\n", 78 | " ChatCompletionDeltaToolCall(\n", 79 | " id=\"The ID of the tool call.\",\n", 80 | " type=\"The type of the tool. 
As of 2024-09-01, only `function` is supported.\",\n", 81 | " function=Function(\n", 82 | " name=\"Name of the function being called\",\n", 83 | " arguments=(\n", 84 | " \"The arguments to call the function with, as generated by the model in JSON format.\"\n", 85 | " + \"Note that the model does not always generate valid JSON, \"\n", 86 | " + \"and may hallucinate parameters not defined by your function schema.\"\n", 87 | " + \"Validate the arguments in your code before calling your function.\"\n", 88 | " ),\n", 89 | " ),\n", 90 | " index=\"Index of the tool call in the response.\",\n", 91 | " )\n", 92 | " ],\n", 93 | " ),\n", 94 | " logprobs=\"Log probabilities of the tokens, if available.\",\n", 95 | " )\n", 96 | " ],\n", 97 | " created=\"Timestamp when the response was created.\",\n", 98 | " model=\"Name of the model used to generate the response.\",\n", 99 | " object=\"Type of the object (e.g., chat.completion.chunk).\",\n", 100 | " system_fingerprint=\"Unique fingerprint of the system.\",\n", 101 | ")" 102 | ] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": ".venv", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "name": "python", 113 | "version": "3.13.2" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/favicon.ico -------------------------------------------------------------------------------- /migrate_database.py: -------------------------------------------------------------------------------- 1 | from bson import ObjectId 2 | import pymongo 3 | from bson import json_util 4 | 5 | from ansari.ansari_db_sql import AnsariSQLDB 6 | from ansari.ansari_logger import get_logger 7 | 
from ansari.config import get_settings 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | def migrate_database(): 13 | try: 14 | settings = get_settings() 15 | sql_db = AnsariSQLDB(settings) 16 | db_url = settings.MONGO_URL 17 | db_name = settings.MONGO_DB_NAME 18 | mongo_connection = pymongo.MongoClient(db_url) 19 | mongo_db = mongo_connection[db_name] 20 | 21 | users_collection = mongo_db["users"] 22 | threads_collection = mongo_db["threads"] 23 | messages_collection = mongo_db["messages"] 24 | feedback_collection = mongo_db["feedback"] 25 | 26 | # Step 1: Process feedback 27 | logger.info("Step 1: Process feedback documents") 28 | logger.info(f"Estimated document count: {feedback_collection.estimated_document_count()}") 29 | 30 | while True: 31 | feedbacks = list(feedback_collection.find({"migrated": {"$exists": False}}).limit(1000)) 32 | if len(feedbacks) == 0: 33 | break 34 | 35 | feedback_operations = [] 36 | message_operations = [] 37 | for i, feedback in enumerate(feedbacks, 1): 38 | logger.info(f"{i} Processing Feedback: {str(feedback["_id"])}") 39 | message_id = feedback.get("original_message_id") 40 | if message_id: 41 | message = messages_collection.find_one({"original_id": message_id}) 42 | if message: 43 | feedback_operations.append(pymongo.UpdateOne( 44 | {"_id": feedback["_id"]}, 45 | {"$set": {"migrated": True}} 46 | )) 47 | 48 | message_operations.append(pymongo.UpdateOne( 49 | {"original_id": message_id}, 50 | {"$set": {"feedback": { 51 | "class": feedback.get("class"), 52 | "comment": feedback.get("comment"), 53 | "created_at": feedback.get("created_at"), 54 | "updated_at": feedback.get("updated_at") 55 | }}} 56 | )) 57 | 58 | logger.info("Saving changes...") 59 | feedback_results = feedback_collection.bulk_write(feedback_operations) 60 | logger.info(f"Feedback results: {feedback_results}") 61 | 62 | message_results = messages_collection.bulk_write(message_operations) 63 | logger.info(f"Message results: {message_results}") 64 | 65 | 
logger.info("Step 1: Process feedback documents - Done\n\n") 66 | 67 | # Step 2: Process messages 68 | logger.info("Step 2: Process message documents") 69 | logger.info(f"Estimated document count: {messages_collection.estimated_document_count()}") 70 | while True: 71 | messages = list(messages_collection.find({"migrated": {"$exists": False}}).limit(1000)) 72 | if len(messages) == 0: 73 | break 74 | 75 | operations = [] 76 | for i, message in enumerate(messages, 1): 77 | logger.info(f"{i} Processing Message: {str(message["_id"])}") 78 | query = {"_id": message["_id"]} 79 | 80 | original_message = ( 81 | message.get("original_id"), 82 | message.get("role"), 83 | message.get("content"), 84 | message.get("tool_name"), 85 | message.get("tool_details"), 86 | message.get("ref_list"), 87 | ) 88 | converted_message = sql_db.convert_message_llm(original_message)[0] 89 | 90 | updated_message = { 91 | "role": converted_message["role"], 92 | "content": converted_message["content"], 93 | "id": str(ObjectId()), 94 | "source": message["source"], 95 | "created_at": message["created_at"], 96 | "original_id": message["original_id"], 97 | "original_thread_id": message["original_thread_id"], 98 | "original_message": json_util.dumps(message), 99 | "migrated": True, 100 | } 101 | 102 | operations.append(pymongo.ReplaceOne(query, updated_message)) 103 | 104 | logger.info("Saving changes...") 105 | results = messages_collection.bulk_write(operations) 106 | logger.info(f"Message results: {results}") 107 | 108 | logger.info("Step 2: Process message documents - Done\n\n") 109 | 110 | # Step 3: Embed messages in threads 111 | logger.info("Step 3: Process thread documents") 112 | logger.info(f"Estimated document count: {threads_collection.estimated_document_count()}") 113 | 114 | while True: 115 | threads = list(threads_collection.find({"migrated": {"$exists": False}}).limit(1000)) 116 | if len(threads) == 0: 117 | break 118 | 119 | operations = [] 120 | for i, thread in enumerate(threads, 1): 
121 | logger.info(f"{i} Migrating: {str(thread["_id"])}") 122 | query = {"_id": thread["_id"]} 123 | 124 | if thread.get("original_user_id") is None: 125 | logger.warning(f"Thread {str(thread['_id'])} does not have an original user ID.") 126 | continue 127 | 128 | user = users_collection.find_one({"original_id": thread["original_user_id"]}) 129 | messages = list(messages_collection.find({"original_thread_id": thread["original_id"]}) 130 | .sort("created_at", pymongo.ASCENDING)) 131 | 132 | thread_messages = [] 133 | for message in messages: 134 | if message.get("role") == "tool" or message.get("role") == "function": 135 | continue 136 | 137 | content = message.get("content") 138 | if isinstance(content, list) and any(block.get("type") == "tool_use" for block in content): 139 | continue 140 | 141 | if isinstance(content, list) and any(block.get("type") == "tool_result" for block in content): 142 | continue 143 | 144 | del message["_id"] 145 | del message["original_id"] 146 | del message["original_thread_id"] 147 | del message["migrated"] 148 | 149 | thread_messages.append(message) 150 | 151 | set_values = { 152 | "migrated": True, 153 | "user_id": user["_id"], 154 | "messages": thread_messages 155 | } 156 | 157 | operations.append(pymongo.UpdateOne(query, {"$set": set_values})) 158 | 159 | logger.info("Saving changes...") 160 | results = threads_collection.bulk_write(operations) 161 | logger.info(f"Thread results: {results}") 162 | 163 | logger.info("Step 3: Process thread documents - Done\n\n") 164 | 165 | except (Exception) as error: 166 | logger.error(f"Error: {error}") 167 | finally: 168 | if mongo_connection is not None: 169 | mongo_connection.close() 170 | 171 | 172 | migrate_database() 173 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = 
"setuptools.build_meta" 4 | 5 | [project] 6 | name = "ansari-backend" 7 | version = "0.1.0" 8 | description = "Ansari is an AI assistant to enhance understanding and practice of Islam." 9 | authors = [ 10 | { name = "Ansari Project", email = "feedback@ansari.chat" } 11 | ] 12 | readme = "README.md" 13 | requires-python = ">=3.13" 14 | license = {text = "MIT"} 15 | classifiers = [ 16 | "Development Status :: 4 - Beta", 17 | "Intended Audience :: Developers", 18 | "License :: OSI Approved :: MIT License", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.8", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Topic :: Software Development :: Libraries", 24 | ] 25 | dependencies = [ 26 | "anthropic", 27 | "bcrypt", 28 | "build", 29 | "discord.py", 30 | "diskcache", 31 | "email-validator", 32 | "fastapi[standard]", 33 | "gunicorn", 34 | "jinja2", 35 | "langdetect", 36 | "litellm", 37 | "loguru", 38 | "pymongo", 39 | "openai", 40 | "pandas", 41 | "psycopg2-binary", 42 | "pydantic_settings", 43 | "pyjwt", 44 | "pytest-asyncio", 45 | "pytest-mock", 46 | "pytest-xdist", 47 | "rich", 48 | "sendgrid", 49 | "sentry-sdk[fastapi]", 50 | "setuptools", 51 | "tenacity", 52 | "tiktoken", 53 | "typer", 54 | "uvicorn", 55 | "wheel", 56 | "zxcvbn", 57 | ] 58 | 59 | [project.urls] 60 | Homepage = "https://github.com/ansari-project/ansari-backend" 61 | Documentation = "https://github.com/ansari-project/ansari-backend" 62 | Source = "https://github.com/ansari-project/ansari-backend" 63 | Tracker = "https://github.com/ansari-project/ansari-backend/issues" 64 | 65 | [project.scripts] 66 | ansari = "ansari.app.main_stdio:main" 67 | 68 | [tool.ruff] 69 | line-length = 127 70 | indent-width = 4 71 | target-version = "py310" 72 | lint.select = ["E", "F"] 73 | lint.fixable = ["ALL"] 74 | lint.ignore = [ 75 | "D100", # ignore missing docs 76 | "E402", # false positives for local imports 77 | "TRY003", # external 
messages in exceptions are too verbose 78 | ] 79 | lint.mccabe.max-complexity = 10 80 | 81 | [tool.ruff.format] 82 | # Like Black, use double quotes for strings. 83 | quote-style = "double" 84 | # Like Black, indent with spaces, rather than tabs. 85 | indent-style = "space" 86 | # Like Black, respect magic trailing commas. 87 | skip-magic-trailing-comma = false 88 | # Like Black, automatically detect the appropriate line ending. 89 | line-ending = "auto" 90 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | pythonpath = src 3 | asyncio_mode = strict 4 | asyncio_default_fixture_loop_scope = function 5 | markers = 6 | asyncio: mark a test as asyncio 7 | integration: mark a test as an integration test 8 | filterwarnings = 9 | ignore::DeprecationWarning:pydantic.*: 10 | ignore::UserWarning:pydantic.*: 11 | ignore::Warning:pydantic.*: 12 | ignore:Valid config keys have changed in V2:UserWarning 13 | ignore:Support for class-based `config` is deprecated:UserWarning -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | export GRADIO_SERVER_NAME=0.0.0.0 2 | export GRADIO_SERVER_PORT="$PORT" 3 | 4 | -------------------------------------------------------------------------------- /src/ansari/__init__.py: -------------------------------------------------------------------------------- 1 | # This file marks the directory as a Python package. 2 | from .config import Settings, get_settings 3 | from . 
import ansari_logger 4 | 5 | __all__ = ["Settings", "get_settings", "ansari_logger"] 6 | -------------------------------------------------------------------------------- /src/ansari/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .ansari import Ansari 2 | from .ansari_workflow import AnsariWorkflow 3 | from .ansari_claude import AnsariClaude 4 | 5 | __all__ = ["Ansari", "AnsariWorkflow", "AnsariClaude"] 6 | -------------------------------------------------------------------------------- /src/ansari/ansari_logger.py: -------------------------------------------------------------------------------- 1 | # This file provides a standard Python logging instance for the caller file (e.g., main_api.py, etc.). 2 | 3 | import os 4 | import logging 5 | import sys 6 | 7 | from ansari.config import get_settings 8 | 9 | 10 | def get_logger(name: str) -> logging.Logger: 11 | """Creates and returns a logger instance for the specified module. 12 | 13 | Args: 14 | name (str): The name of the module requesting the logger (typically __name__). 15 | 16 | Returns: 17 | logging.Logger: Configured logger instance. 
18 | """ 19 | logging_level = get_settings().LOGGING_LEVEL.upper() 20 | 21 | # Create a logger 22 | logger = logging.getLogger(name) 23 | 24 | # Clear any existing handlers to avoid duplicate logs 25 | if logger.handlers: 26 | logger.handlers.clear() 27 | 28 | # Set the logging level 29 | logger.setLevel(logging_level) 30 | 31 | # Create console handler 32 | console_handler = logging.StreamHandler(sys.stdout) 33 | console_handler.setLevel(logging_level) 34 | 35 | # Create formatter 36 | formatter = logging.Formatter( 37 | "%(asctime)s | %(levelname)s | %(name)s:%(funcName)s:%(lineno)d | %(message)s", 38 | datefmt="%Y-%m-%d %H:%M:%S", 39 | ) 40 | 41 | # Add formatter to handler 42 | console_handler.setFormatter(formatter) 43 | 44 | # Add handler to logger 45 | logger.addHandler(console_handler) 46 | 47 | # Add file handler if DEV_MODE is enabled 48 | if get_settings().DEV_MODE: 49 | # Ensure logs directory exists 50 | log_dir = os.path.join(os.getcwd(), "logs") 51 | os.makedirs(log_dir, exist_ok=True) 52 | 53 | log_file = os.path.join(log_dir, f"{name}.log") 54 | # Using standard FileHandler instead of TimedRotatingFileHandler 55 | # Add encoding='utf-8' to handle Unicode characters like emojis 56 | file_handler = logging.FileHandler( 57 | filename=log_file, 58 | mode="a", # Append mode 59 | encoding="utf-8", # Use UTF-8 encoding to support Unicode characters 60 | ) 61 | file_handler.setLevel(logging_level) 62 | file_handler.setFormatter(formatter) 63 | logger.addHandler(file_handler) 64 | 65 | return logger 66 | -------------------------------------------------------------------------------- /src/ansari/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/src/ansari/app/__init__.py -------------------------------------------------------------------------------- /src/ansari/app/main_discord.py: 
-------------------------------------------------------------------------------- 1 | # This file aims to process input from Discord and generate answers back to Discord using a specified LLM model. 2 | 3 | from presenters.discord_presenter import DiscordPresenter 4 | 5 | from ansari.agents import Ansari 6 | from ansari.config import get_settings 7 | 8 | # This work involves 3 agents, with Ansari as primary. 9 | agent = Ansari(get_settings()) 10 | presenter = DiscordPresenter( 11 | agent, 12 | token=get_settings().DISCORD_TOKEN.get_secret_value(), 13 | ) 14 | 15 | # This starts the UI. 16 | presenter.present() 17 | -------------------------------------------------------------------------------- /src/ansari/app/main_file.py: -------------------------------------------------------------------------------- 1 | # This file aims to process an input file and generate answers using a specified LLM model. 2 | # Steps: 3 | # 1. Import necessary modules and configure logging. 4 | # 2. Define the main function with certain parameters. 5 | # 3. Retrieve settings from `config.py`. 6 | # 4. Update settings with the provided system message file and model if specified. 7 | # 5. Depending on the ayah_mode flag, initialize the appropriate presenter (AyahFilePresenter or FilePresenter). 8 | # 6. Call the present method of the presenter to process the input file and generate the output file. 9 | # 7. Use typer.run to execute the main function when the script is run directly. 
10 | # (Read more about typer here: https://blog.stackademic.com/typer-the-easiest-way-to-build-command-line-tools-1f3effa569d1) 11 | 12 | import logging 13 | from typing import Optional 14 | 15 | import typer 16 | 17 | from ansari.agents import Ansari 18 | from ansari.config import get_settings 19 | from ansari.presenters.ayah_file_presenter import AyahFilePresenter 20 | from ansari.presenters.file_presenter import FilePresenter 21 | 22 | logging.basicConfig( 23 | level=logging.DEBUG, 24 | ) 25 | 26 | 27 | def main( 28 | input_file: str, 29 | output_file: str, 30 | ayah_mode: bool = typer.Option( 31 | False, 32 | "--ayah-mode", 33 | "-a", 34 | help="Process input as ayah questions (CSV format: surah:ayah,question)", 35 | ), 36 | use_query_generation: bool = typer.Option( 37 | True, 38 | "--use-query-generation", 39 | "-q", 40 | help="Use query generation step in ayah mode", 41 | ), 42 | answer_column: str = typer.Option( 43 | "answer", 44 | "--answer-column", 45 | "-c", 46 | help="Name of the column to store answers in the output CSV (ayah mode only)", 47 | ), 48 | system_message: Optional[str] = typer.Option( 49 | None, 50 | "--system-message", 51 | "-s", 52 | help="The name of the system message file. If not provided, uses default.", 53 | ), 54 | model: str = typer.Option( 55 | "gpt-4", 56 | "--model", 57 | "-m", 58 | help="The LLM model to use (e.g., gpt-4, gpt-3.5-turbo)", 59 | ), 60 | ): 61 | """ 62 | Process input file and generate answers 63 | 64 | Args: 65 | input_file: Path to input file 66 | output_file: Path to output file 67 | ayah_mode: Whether to process in ayah mode 68 | use_query_generation: Whether to use query generation 69 | answer_column: Name of column to store answers 70 | system_message: The name of the system message file. If not provided, uses default. 
71 | model: The LLM model to use for generating answers 72 | """ 73 | settings = get_settings() 74 | 75 | if system_message: 76 | settings.AYAH_SYSTEM_PROMPT_FILE_NAME = system_message 77 | 78 | # Set the model in settings 79 | settings.MODEL = model 80 | 81 | if ayah_mode: 82 | presenter = AyahFilePresenter( 83 | settings=settings, use_query_generation=use_query_generation, answer_column=answer_column 84 | ) 85 | else: 86 | ansari = Ansari(settings) 87 | presenter = FilePresenter(ansari) 88 | 89 | presenter.present(input_file, output_file) 90 | 91 | 92 | if __name__ == "__main__": 93 | typer.run(main) 94 | -------------------------------------------------------------------------------- /src/ansari/app/main_stdio.py: -------------------------------------------------------------------------------- 1 | # This file aims to process input from standard input and generate answers using a specified LLM model. 2 | 3 | import logging 4 | import typer 5 | from typing import Optional 6 | 7 | 8 | from ansari.agents import Ansari 9 | from ansari.agents.ansari_claude import AnsariClaude 10 | from ansari.ansari_logger import get_logger 11 | from ansari.config import get_settings 12 | from ansari.presenters.stdio_presenter import StdioPresenter 13 | 14 | logger = get_logger(__name__) 15 | 16 | app = typer.Typer() 17 | 18 | 19 | @app.command() 20 | def main( 21 | agent: str = typer.Option("Ansari", "--agent", "-a", help="Agent to use (AnsariClaude or Ansari)"), 22 | log_level: str = typer.Option( 23 | "INFO", "--log-level", "-l", help="Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)", case_sensitive=False 24 | ), 25 | input: Optional[str] = typer.Option( 26 | None, "--input", "-i", help="Input to send to the agent. If not provided, starts interactive mode." 27 | ), 28 | ): 29 | """ 30 | Run the Ansari agent. If input is provided, process it and exit. 31 | If no input is provided, start interactive mode. 
32 | """ 33 | # Convert log level string to logging constant 34 | numeric_level = getattr(logging, log_level.upper(), None) 35 | if not isinstance(numeric_level, int): 36 | raise ValueError(f"Invalid log level: {log_level}") 37 | 38 | logging.basicConfig(level=numeric_level) 39 | settings = get_settings() 40 | 41 | if agent == "AnsariClaude": 42 | agent_instance = AnsariClaude(settings) 43 | elif agent == "Ansari": 44 | agent_instance = Ansari(settings) 45 | else: 46 | raise ValueError(f"Unknown agent type: {agent}. Must be one of: AnsariClaude, Ansari") 47 | 48 | # Print greeting 49 | print(agent_instance.greet()) 50 | 51 | if input: 52 | # Process single input and exit 53 | result = agent_instance.process_input(input) 54 | # Handle the result which could be either a generator or other iterable 55 | if result: 56 | for word in result: 57 | if word is not None: 58 | print(word, end="", flush=True) 59 | print() 60 | else: 61 | # No input provided, start interactive mode 62 | presenter = StdioPresenter(agent_instance, skip_greeting=True) 63 | presenter.present() 64 | 65 | 66 | if __name__ == "__main__": 67 | logger.debug("Starting the Ansari chatbot in terminal (stdio)...") 68 | app() 69 | -------------------------------------------------------------------------------- /src/ansari/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/src/ansari/cli/__init__.py -------------------------------------------------------------------------------- /src/ansari/cli/use_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Command line tool to print search results from various Ansari search tools. 4 | This tool takes a query and search tool name, and pretty prints the returned value. 
5 | """ 6 | 7 | import json 8 | from enum import Enum 9 | from typing import Any 10 | 11 | import typer 12 | from rich.console import Console 13 | from rich.panel import Panel 14 | 15 | from ansari.config import get_settings 16 | from ansari.tools.search_hadith import SearchHadith 17 | from ansari.tools.search_mawsuah import SearchMawsuah 18 | from ansari.tools.search_quran import SearchQuran 19 | from ansari.tools.search_tafsir_encyc import SearchTafsirEncyc 20 | 21 | # Remove usul import 22 | # Remove vectara import 23 | from ansari.ansari_logger import get_logger 24 | 25 | logger = get_logger(__name__) 26 | settings = get_settings() 27 | console = Console() 28 | app = typer.Typer(help="Ansari search tools result printer") 29 | 30 | 31 | class OutputFormat(str, Enum): 32 | """Output format options for search results.""" 33 | 34 | RAW = "raw" 35 | STRING = "string" 36 | LIST = "list" 37 | FORMATTED = "formatted" 38 | REF_LIST = "ref_list" 39 | 40 | 41 | def format_json(data: Any) -> str: 42 | """Format data as indented JSON for better readability.""" 43 | return json.dumps(data, ensure_ascii=False, indent=2) 44 | 45 | 46 | def pretty_print_results(results: Any, output_format: str) -> None: 47 | """Pretty print results based on the specified format.""" 48 | if not results: 49 | console.print("[bold red]No results found.[/bold red]") 50 | return 51 | 52 | if output_format == OutputFormat.RAW: 53 | console.print_json(json.dumps(results)) 54 | elif output_format == OutputFormat.STRING: 55 | if isinstance(results, str): 56 | console.print(results) 57 | else: 58 | console.print_json(json.dumps(results)) 59 | elif output_format == OutputFormat.LIST: 60 | if isinstance(results, list): 61 | for i, item in enumerate(results, 1): 62 | console.print(Panel(f"{item}", title=f"Result {i}", border_style="blue")) 63 | console.print() 64 | else: 65 | console.print_json(json.dumps(results)) 66 | elif output_format == OutputFormat.REF_LIST: 67 | if isinstance(results, list): 68 | # 
Pretty print the entire ref_list as formatted JSON 69 | console.print_json(format_json(results)) 70 | else: 71 | console.print_json(json.dumps(results)) 72 | else: 73 | if isinstance(results, dict) and "tool_result" in results: 74 | console.print(Panel(format_json(results["tool_result"]), title="Tool Result", border_style="green")) 75 | if "response_message" in results: 76 | console.print(Panel(results["response_message"], title="Response Message", border_style="yellow")) 77 | else: 78 | console.print(results) 79 | 80 | 81 | def create_search_tool(tool_name: str) -> Any: 82 | """Create and return the appropriate search tool instance based on the tool name.""" 83 | tools = { 84 | "hadith": lambda: SearchHadith( 85 | kalimat_api_key=settings.KALEMAT_API_KEY.get_secret_value() if hasattr(settings, "KALEMAT_API_KEY") else "" 86 | ), 87 | "mawsuah": lambda: SearchMawsuah( 88 | vectara_api_key=settings.VECTARA_API_KEY.get_secret_value(), vectara_corpus_key=settings.MAWSUAH_CORPUS_ID 89 | ), 90 | "quran": lambda: SearchQuran( 91 | kalimat_api_key=settings.KALEMAT_API_KEY.get_secret_value() if hasattr(settings, "KALEMAT_API_KEY") else "" 92 | ), 93 | "tafsir": lambda: SearchTafsirEncyc(api_token=settings.USUL_API_TOKEN.get_secret_value()), 94 | } 95 | 96 | if tool_name.lower() not in tools: 97 | available_tools = ", ".join(tools.keys()) 98 | console.print(f"[bold red]Error:[/bold red] Unknown tool '{tool_name}'") 99 | console.print(f"Available tools: {available_tools}") 100 | raise typer.Exit(code=1) 101 | 102 | return tools[tool_name.lower()]() 103 | 104 | 105 | @app.command() 106 | def main( 107 | query: str = typer.Argument(..., help="The search query to run"), 108 | tool_name: str = typer.Option(..., "--tool", "-t", help="The search tool to use"), 109 | output_format: OutputFormat = typer.Option(OutputFormat.FORMATTED, "--format", "-f", help="Output format"), 110 | ): 111 | """ 112 | Search using the specified tool and print the results. 
113 | """ 114 | try: 115 | with console.status(f"Searching for '{query}' using {tool_name}..."): 116 | # Create the appropriate search tool 117 | search_tool = create_search_tool(tool_name) 118 | 119 | # Run the search 120 | raw_results = search_tool.run(query) 121 | 122 | # Format based on the specified output format 123 | if output_format == OutputFormat.RAW: 124 | results = raw_results 125 | elif output_format == OutputFormat.STRING: 126 | if hasattr(search_tool, "run_as_string"): 127 | results = search_tool.run_as_string(query) 128 | else: 129 | # Fallback for tools without run_as_string method 130 | tool_result = search_tool.format_as_tool_result(raw_results) 131 | results = format_json(tool_result) 132 | elif output_format == OutputFormat.LIST: 133 | if hasattr(search_tool, "format_as_list"): 134 | results = search_tool.format_as_list(raw_results) 135 | else: 136 | results = ["Format not supported for this tool"] 137 | elif output_format == OutputFormat.REF_LIST: 138 | if hasattr(search_tool, "format_as_ref_list"): 139 | results = search_tool.format_as_ref_list(raw_results) 140 | else: 141 | results = ["Format not supported for this tool"] 142 | else: # formatted 143 | tool_result = search_tool.format_as_tool_result(raw_results) 144 | response_message = "" 145 | if hasattr(search_tool, "format_tool_response"): 146 | response_message = search_tool.format_tool_response(raw_results) 147 | 148 | results = {"tool_result": tool_result, "response_message": response_message} 149 | 150 | # Print the results 151 | pretty_print_results(results, output_format) 152 | 153 | except Exception as e: 154 | logger.exception(f"Error running search: {e}") 155 | console.print(f"[bold red]Error:[/bold red] {e}") 156 | raise typer.Exit(code=1) 157 | 158 | 159 | if __name__ == "__main__": 160 | app() 161 | -------------------------------------------------------------------------------- /src/ansari/examples/test_citations.py: 
-------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | import os 3 | import anthropic 4 | from ansari.tools.search_quran import SearchQuran 5 | import sys 6 | 7 | 8 | def create_quran_document(ayah: dict) -> dict: 9 | """Convert an ayah to a document format that supports citations.""" 10 | return { 11 | "type": "document", 12 | "source": { 13 | "type": "content", 14 | "content": [ 15 | {"type": "text", "text": f"Arabic Text: {ayah['text']}"}, 16 | {"type": "text", "text": f"English Text: {ayah['en_text']}"}, 17 | ], 18 | }, 19 | "title": f"Quran {ayah['id']}", 20 | "citations": {"enabled": True}, 21 | } 22 | 23 | 24 | def get_prompt(query: str) -> str: 25 | return f"""Based on these Quranic verses, please explain the Islamic teachings about {query}. 26 | 27 | Tell me how many verses are below, and how many you actually used. 28 | """ 29 | 30 | 31 | def format_response_with_citations(response) -> str: 32 | """Format the response with numbered citations and a references section.""" 33 | citations = [] 34 | formatted_text = "" 35 | 36 | # First pass: collect citations and build citation map 37 | citation_map = {} # Maps doc_title to citation number 38 | for content in response.content: 39 | if content.type == "text" and hasattr(content, "citations") and content.citations: 40 | for citation in content.citations: 41 | doc_title = citation.document_title 42 | if doc_title not in citation_map: 43 | text = ( 44 | citation.cited_text.split("English Text:", 1)[1].strip() 45 | if "English Text:" in citation.cited_text 46 | else citation.cited_text.strip() 47 | ) 48 | citations.append({"doc_title": doc_title, "text": text}) 49 | citation_map[doc_title] = len(citations) 50 | 51 | # Second pass: format text with citation numbers 52 | for content in response.content: 53 | if content.type == "text": 54 | text = content.text 55 | if hasattr(content, "citations") and content.citations: 56 | # Add citation numbers after 
the text block 57 | citation_nums = [] 58 | for citation in content.citations: 59 | ref_num = citation_map[citation.document_title] 60 | citation_nums.append(str(ref_num)) 61 | text += f" [{', '.join(citation_nums)}]" 62 | formatted_text += text 63 | 64 | # Add references section 65 | if citations: 66 | formatted_text += "\n\nReferences:\n" 67 | for i, citation in enumerate(citations, 1): 68 | formatted_text += f"[{i}] {citation['doc_title']}: {citation['text']}\n\n" 69 | 70 | return formatted_text 71 | 72 | 73 | def get_request_params(query: str) -> dict: 74 | load_dotenv() 75 | 76 | # Get API keys 77 | kalemat_api_key = os.getenv("KALEMAT_API_KEY") 78 | if not kalemat_api_key: 79 | raise ValueError("KALEMAT_API_KEY environment variable not set") 80 | 81 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 82 | if not anthropic_api_key: 83 | raise ValueError("ANTHROPIC_API_KEY environment variable not set") 84 | 85 | # Initialize clients 86 | quran_search = SearchQuran(kalemat_api_key) 87 | 88 | # Search for relevant ayahs 89 | search_results = quran_search.run(query, num_results=15) 90 | documents = [create_quran_document(ayah) for ayah in search_results] 91 | 92 | # Create message with documents and prompt 93 | return { 94 | "model": "claude-3-5-sonnet-20241022", 95 | "max_tokens": 4096, 96 | "messages": [{"role": "user", "content": documents}, {"role": "user", "content": get_prompt(query)}], 97 | } 98 | 99 | 100 | if __name__ == "__main__": 101 | if len(sys.argv) < 2: 102 | print("Usage: python3 test_citations.py ") 103 | sys.exit(1) 104 | 105 | query = " ".join(sys.argv[1:]) 106 | client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) 107 | response = client.messages.create(**get_request_params(query)) 108 | print("Response:") 109 | print(format_response_with_citations(response)) 110 | -------------------------------------------------------------------------------- /src/ansari/examples/test_search_mawsuah.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Test script for the SearchMawsuah class. 3 | This script verifies that the SearchMawsuah class correctly inherits from SearchVectara 4 | and that its translation and formatting methods work as expected. 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | # Add the src directory to the path so we can import the modules 11 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../"))) 12 | 13 | from src.ansari.tools.search_mawsuah import SearchMawsuah 14 | from src.ansari.config import get_settings 15 | 16 | 17 | def main(): 18 | """Test the SearchMawsuah class.""" 19 | # Get settings 20 | settings = get_settings() 21 | 22 | # Create a SearchMawsuah instance 23 | sm = SearchMawsuah( 24 | vectara_api_key=settings.VECTARA_API_KEY.get_secret_value(), 25 | vectara_corpus_key=settings.MAWSUAH_VECTARA_CORPUS_KEY, 26 | ) 27 | 28 | # Test basic search 29 | print("Testing basic search...") 30 | query = "prayer" 31 | results = sm.run(query, num_results=2) 32 | 33 | # Check if results are in the expected format 34 | print(f"Results type: {type(results)}") 35 | 36 | # If search_results is present, the parent class's API format is being used correctly 37 | if "search_results" in results: 38 | print(f"Found {len(results['search_results'])} search results") 39 | 40 | # Test format_as_list 41 | print("\nTesting format_as_list...") 42 | text_results = sm.format_as_list(results) 43 | print(f"format_as_list produced {len(text_results)} results") 44 | 45 | # Test format_as_ref_list 46 | print("\nTesting format_as_ref_list...") 47 | ref_list = sm.format_as_ref_list(results) 48 | print(f"format_as_ref_list produced {len(ref_list)} documents") 49 | 50 | # Check if translation worked in ref_list 51 | if ref_list and not isinstance(ref_list[0], str): 52 | text = ref_list[0]["source"]["data"] 53 | print("First document text includes translation:", "English:" in text) 54 | 55 
| # Test run_as_string 56 | print("\nTesting run_as_string...") 57 | string_results = sm.run_as_string(query, num_results=2) 58 | print(f"run_as_string output length: {len(string_results)}") 59 | print("run_as_string output includes translation:", "English Translation:" in string_results) 60 | 61 | print("\nSearchMawsuah works correctly and inherits properly from SearchVectara!") 62 | else: 63 | print("ERROR: Results don't have search_results key. API format mismatch.") 64 | print(f"Results keys: {results.keys()}") 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /src/ansari/presenters/api_presenter.py: -------------------------------------------------------------------------------- 1 | # Unlike other files, the presenter's role here is just to provide functions related to the LLM 2 | 3 | 4 | from fastapi.responses import StreamingResponse 5 | 6 | from ansari.agents import Ansari, AnsariClaude 7 | from ansari.ansari_db import MessageLogger 8 | 9 | 10 | class ApiPresenter: 11 | def __init__(self, app, agent: Ansari | AnsariClaude): 12 | self.app = app 13 | self.settings = agent.settings 14 | 15 | def complete(self, messages: dict, message_logger: MessageLogger = None): 16 | print("Complete called.") 17 | if self.settings.AGENT == "Ansari": 18 | agent = Ansari(settings=self.settings, message_logger=message_logger) 19 | elif self.settings.AGENT == "AnsariClaude": 20 | agent = AnsariClaude(settings=self.settings, message_logger=message_logger) 21 | 22 | return StreamingResponse(agent.replace_message_history(messages["messages"])) 23 | 24 | def present(self): 25 | pass 26 | -------------------------------------------------------------------------------- /src/ansari/presenters/ayah_file_presenter.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | import os 4 | from typing import Tuple 5 | 6 | from 
ansari.agents.ansari_workflow import AnsariWorkflow 7 | 8 | 9 | class AyahFilePresenter: 10 | def __init__(self, settings, use_query_generation: bool = False, answer_column: str = "answer"): 11 | self.settings = settings 12 | self.use_query_generation = use_query_generation 13 | self.answer_column = answer_column 14 | 15 | def _parse_ayah_reference(self, ayah_ref: str) -> Tuple[int, int]: 16 | """Parse a surah:ayah reference into separate numbers. 17 | 18 | Args: 19 | ayah_ref: String in format "surah:ayah" 20 | 21 | Returns: 22 | Tuple of (surah_num, ayah_num) 23 | 24 | Raises: 25 | ValueError: If format is invalid or empty 26 | """ 27 | if not ayah_ref or not ayah_ref.strip(): 28 | raise ValueError("Empty ayah reference") 29 | 30 | try: 31 | surah_str, ayah_str = ayah_ref.strip().split(":") 32 | return int(surah_str), int(ayah_str) 33 | except ValueError: 34 | raise ValueError(f"Invalid ayah reference format: {ayah_ref}. Expected format: surah:ayah (e.g. 1:1)") 35 | 36 | def present(self, input_file_path: str, output_file_path: str): 37 | try: 38 | # First pass: read header to get all field names 39 | with open(input_file_path, newline="") as input_file: 40 | # Skip empty lines and get header 41 | for line in input_file: 42 | if line.strip(): # First non-empty line is header 43 | reader = csv.reader([line]) 44 | header = next(reader) 45 | if len(header) < 2: 46 | logging.error("Input CSV must contain at least two columns") 47 | return 48 | break 49 | else: 50 | logging.error("Empty input file") 51 | return 52 | 53 | # Create fieldnames, preserving original names 54 | fieldnames = header 55 | if self.answer_column not in fieldnames: 56 | fieldnames = fieldnames + [self.answer_column] 57 | 58 | # Second pass: process all rows 59 | with open(input_file_path, newline="") as input_file: 60 | reader = csv.reader(input_file) 61 | 62 | # Open output file and write 63 | with open(output_file_path, "w", newline="") as output_file: 64 | writer = csv.writer(output_file) 65 | 
writer.writerow(fieldnames) 66 | 67 | for row in reader: 68 | # Skip empty lines 69 | if not any(row): 70 | continue 71 | 72 | try: 73 | # Get values from first and second columns using column positions 74 | ayah_ref = row[0] 75 | question = row[1] 76 | 77 | # Validate required fields 78 | if not ayah_ref or not question: 79 | raise ValueError("Missing required fields in first or second column") 80 | 81 | surah, ayah = self._parse_ayah_reference(ayah_ref) 82 | question = question.strip() 83 | 84 | print(f"Processing surah {surah}, ayah {ayah}, question: {question}") 85 | 86 | # Create a new workflow instance for each question 87 | workflow = AnsariWorkflow( 88 | self.settings, system_prompt_file=self.settings.AYAH_SYSTEM_PROMPT_FILE_NAME 89 | ) 90 | 91 | ayah_id = surah * 1000 + ayah 92 | workflow_steps = [ 93 | ( 94 | "search", 95 | { 96 | "query": question, 97 | "tool_name": "search_tafsir", 98 | "metadata_filter": f"part.from_ayah_int<={ayah_id} AND part.to_ayah_int>={ayah_id}", 99 | }, 100 | ), 101 | ] 102 | 103 | if self.use_query_generation: 104 | workflow_steps.append(("gen_query", {"input": question, "target_corpus": "tafsir"})) 105 | 106 | workflow_steps.append(("gen_answer", {"input": question, "search_results_indices": [0]})) 107 | 108 | # Execute the workflow 109 | workflow_output = workflow.execute_workflow(workflow_steps) 110 | # The answer is the last item in the workflow output 111 | answer = workflow_output[-1] 112 | 113 | # Add answer to row and write 114 | row.append(answer) 115 | writer.writerow(row) 116 | output_file.flush() 117 | 118 | except Exception as e: 119 | logging.error(f"Error processing row: {e}") 120 | row.append(f"ERROR: {str(e)}") 121 | writer.writerow(row) 122 | output_file.flush() 123 | continue 124 | 125 | print(f"Results saved to {os.path.abspath(output_file_path)}") 126 | 127 | except Exception as e: 128 | logging.error(f"Error processing file: {e}") 129 | return 130 | 
-------------------------------------------------------------------------------- /src/ansari/presenters/discord_presenter.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | 4 | import discord 5 | 6 | 7 | class MyClient(discord.Client): 8 | def __init__(self, agent, intents): 9 | super().__init__(intents=intents) 10 | self.agent = agent 11 | 12 | async def on_ready(self): 13 | print(f"We have logged in as {self.user}") 14 | 15 | async def on_message(self, message): 16 | if message.author == self.user: 17 | return 18 | agent = copy.deepcopy(self.agent) 19 | print(f"User said: {message.content} and mentioned {message.mentions}") 20 | st = time.time() 21 | if ( 22 | isinstance(message.channel, discord.channel.DMChannel) 23 | or message.content.startswith("<@&1150526640552673324>") 24 | or (message.mentions and message.mentions[0] and message.mentions[0].name == "Ansari") 25 | ): 26 | msg = await message.channel.send(f"Thinking, {message.author}...") 27 | msg_so_far = "" 28 | for token in agent.process_input(message.content): 29 | msg_so_far = msg_so_far + token 30 | print(f"Message so far: {msg_so_far}") 31 | et = time.time() - st 32 | print(f"Elapsed time: {et}") 33 | if et > 3: 34 | print("Enough time has passed. Sending message so far.") 35 | if msg_so_far: 36 | await msg.edit(content=msg_so_far) 37 | else: 38 | print(f"For some reason response was empty. {msg_so_far}, {et}") 39 | st = time.time() 40 | if msg_so_far: 41 | await msg.edit(content=msg_so_far) 42 | else: 43 | await msg.edit(content="Something went wrong. Flagging.") 44 | else: 45 | print(f"Got a message. 
class DiscordPresenter:
    """Wires an Ansari agent into a Discord client and runs it."""

    def __init__(self, agent, token):
        self.agent = agent
        self.token = token
        # The message-content intent must be enabled explicitly so the bot
        # can read the text of user messages.
        bot_intents = discord.Intents.default()
        bot_intents.message_content = True
        self.client = MyClient(agent=agent, intents=bot_intents)

    def present(self):
        """Start the Discord client; blocks until the client shuts down."""
        self.client.run(self.token)
20 | result = [tok for tok in agent.process_input(line) if tok] 21 | answer = "".join(result) 22 | (question, answer) = (line.strip(), answer) 23 | output_file.write(f"## {question}\n\n{answer}\n\n") 24 | output_file.flush() 25 | print(f"Result saved to {os.path.abspath(output_file_path)}") 26 | -------------------------------------------------------------------------------- /src/ansari/presenters/gradio_presenter.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import uuid 3 | 4 | import gradio as gr 5 | 6 | CSS = """ 7 | .contain { display: flex; flex-direction: column; } 8 | #component-0 { height: 100%; flex-grow: 1; } 9 | #chatbot { flex-grow: 1; overflow: auto;} 10 | """ 11 | 12 | 13 | class GradioPresenter: 14 | def __init__(self, agent, app_name, favicon_path): 15 | self.agent = agent 16 | self.app_name = app_name 17 | self.favicon_path = favicon_path 18 | 19 | def present(self): 20 | self.instances = {} 21 | self.histories = {} 22 | 23 | def generate_session_id(): 24 | return str(f"{uuid.uuid4()}") 25 | 26 | def append_flag(msg): 27 | msg = msg + "Please flag this. " 28 | 29 | def clear_contents(msg): 30 | return "" 31 | 32 | with gr.Blocks(title=self.app_name, css=CSS) as app: 33 | # Note: Gradio Presenter is incredibly confusing. 34 | # We can't pass agents because they are not serializable. 35 | # instead what we do is that we maintain a dictionary of 36 | # LangChainChatAgents by uuid. 
37 | my_uuid = gr.State(generate_session_id) 38 | 39 | chatbot = gr.Chatbot( 40 | [["", self.agent.greet()]], 41 | elem_id="chatbot", 42 | line_breaks=True, 43 | ) 44 | msg = gr.Textbox(show_label=False, scale=10) 45 | with gr.Row(): 46 | clr = gr.Button( 47 | value="Clear", 48 | size="sm", 49 | scale=1, 50 | variant="secondary", 51 | elem_id="clr", 52 | ) 53 | btn = gr.Button( 54 | value="Send", 55 | size="sm", 56 | scale=2, 57 | variant="primary", 58 | elem_id="btn", 59 | ) 60 | 61 | def user(user_message, history, my_uuid): 62 | if self.instances.get(my_uuid) is None: 63 | self.instances[my_uuid] = copy.deepcopy(self.agent) 64 | self.instances[my_uuid].session_tag = f"ses_{my_uuid}" 65 | self.histories[my_uuid] = [["", self.agent.greet()]] 66 | self.histories[my_uuid].append([user_message, None]) 67 | print("history is ", self.histories[my_uuid]) 68 | return "", self.histories[my_uuid], my_uuid 69 | 70 | def bot(history, my_uuid): 71 | # Check if we've seen this uuid before. If not, greet then add to instances 72 | if self.instances.get(my_uuid) is None: 73 | self.instances[my_uuid] = copy.deepcopy(self.agent) 74 | self.instances[my_uuid].session_tag = f"ses_{my_uuid}" 75 | self.histories[my_uuid] = [["", self.agent.greet()]] 76 | instance = self.instances[my_uuid] 77 | history = self.histories[my_uuid] 78 | 79 | history[-1][1] = "" 80 | print(f"history is {history}") 81 | for word in instance.process_input(history[-1][0]): 82 | if word is None: 83 | continue 84 | history[-1][1] += word 85 | yield history, my_uuid 86 | 87 | msg.submit( 88 | fn=user, 89 | inputs=[msg, chatbot, my_uuid], 90 | outputs=[msg, chatbot, my_uuid], 91 | queue=False, 92 | ).then(fn=bot, inputs=[chatbot, my_uuid], outputs=[chatbot, my_uuid]) 93 | 94 | # Clicking on the button does the same thing as submitting. 
class StdioPresenter:
    """REPL-style presenter: reads questions from stdin, streams answers to stdout."""

    def __init__(self, agent: Ansari, skip_greeting=False):
        self.agent = agent
        self.skip_greeting = skip_greeting

    def present(self):
        """Run the read/answer loop until stdin is exhausted (EOF)."""
        out = sys.stdout
        if not self.skip_greeting:
            out.write(self.agent.greet() + "\n")
        out.write("> ")
        out.flush()
        # readline() returns "" only at EOF, which ends the loop.
        for line in iter(sys.stdin.readline, ""):
            tokens = self.agent.process_input(line)
            # process_input may return a generator (possibly yielding None
            # between answers) or a falsy value; guard both cases.
            if tokens:
                for token in tokens:
                    if token is not None:
                        out.write(token)
                        out.flush()
            out.write("\n> ")
            out.flush()
It is always best to consult a real Islamic Scholar. 12 | 13 | 14 | *Important note*: If I say anything wrong, confusing, great, funny or interesting, please flag it. Anything flagged will be reviewed by humans. To flag a conversation, just say "I want to flag this conversation." 15 | 16 | 17 | I am multilingual. I can understand Arabic (including transliteration), Turkish, Urdu, Bahasa, Bosnian and many other languages. -------------------------------------------------------------------------------- /src/ansari/resources/prompts/news.txt: -------------------------------------------------------------------------------- 1 | **News 2023-07-23**: Major code rewrite. Much more flexible now. 2 | 3 | -------------------------------------------------------------------------------- /src/ansari/resources/prompts/system_msg_ayah.txt: -------------------------------------------------------------------------------- 1 | You are Ansari Qur'an, a multilingual Islamic digital assistant designed to answer 2 | Quran-related questions with accuracy and depth. 3 | 4 | Fluent in languages such as Arabic (including transliteration), 5 | Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, 6 | craft precise, evidence-based responses exclusively from the Sunni tradition. 7 | 8 | Here's how you work: You receive a an ayah and a question along with the 9 | desired response language and search results from any tafsirs available. 10 | Currently that includes Ibn Kathir. 11 | 12 | If you attribute a statement or opinion to a scholar, you will include EXACTLY 13 | the sentence in which the mufassir says so. 14 | 15 | If you say there is a hadith that says something, you will include the hadith 16 | EXACTLY as it was in the source text. 17 | 18 | Quoting from the source material is highly recommended when attributing 19 | statements or opinions to scholars or hadith, especially when the source text is 20 | weak or unverified. 
21 | 22 | Crucially, only attribute specific statements or opinions to these scholars if you 23 | have specific referenceable evidence to support that attribution. When referencing 24 | the Quran, you, Ansari, include the ayah number, Arabic text, and translation 25 | (if the user's language is different from Arabic). 26 | 27 | If you provide a translation, include the name of the translation (e.g. Saheeh 28 | International). Generally Ibn Kathir uses Saheeh International. 29 | 30 | The person reading your answer is a well informed scholar. You may use terms 31 | that an informed scholar would use. You should use more citations and references 32 | than a general member of the public would. 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/ansari/resources/prompts/system_msg_ayah_lay.txt: -------------------------------------------------------------------------------- 1 | You are Ansari Qur'an, a multilingual Islamic digital assistant designed to answer 2 | Quran-related questions with accuracy and depth. 3 | 4 | Fluent in languages such as Arabic (including transliteration), 5 | Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, 6 | craft precise, evidence-based responses exclusively from the Sunni tradition. 7 | 8 | Here's how you work: You receive a an ayah and a question along with the 9 | desired response language and search results from any tafsirs available. 10 | Currently that includes Ibn Kathir. 11 | 12 | If you attribute a statement or opinion to a scholar, you will include EXACTLY 13 | the sentence in which the mufassir says so. 14 | 15 | If you say there is a hadith that says something, you will include the hadith 16 | EXACTLY as it was in the source text. 17 | 18 | Quoting from the source material is highly recommended when attributing 19 | statements or opinions to scholars or hadith, especially when the source text is 20 | weak or unverified. 
21 | 22 | Crucially, only attribute specific statements or opinions to these scholars if you 23 | have specific referenceable evidence to support that attribution. When referencing 24 | the Quran, you, Ansari, include the ayah number, Arabic text, and translation 25 | (if the user's language is different from Arabic). 26 | 27 | If you provide a translation, include the name of the translation (e.g. Saheeh 28 | International). Generally Ibn Kathir uses Saheeh International. 29 | The person reading your answer is a general member of the public who 30 | may or may not be a Muslim. Assume the reader only has a basic knowledge of 31 | Islam. 32 | 33 | -------------------------------------------------------------------------------- /src/ansari/resources/prompts/system_msg_claude.txt: -------------------------------------------------------------------------------- 1 | You are Ansari, a multilingual Islamic bot designed to answer 2 | Islam-related questions with accuracy and depth. Fluent in languages such as 3 | Arabic (including transliteration), Bahasa, Bosnian, French, Turkish, Urdu, 4 | and more, craft precise, evidence-based responses exclusively 5 | from the Sunni tradition. Here's how you work: You receive a question along 6 | with the desired response language and search results from Hadith, Quran, and Mawsuah. 7 | 8 | Provide a concise, well-supported answer, citing classical 9 | scholars like Al Ghazali, Ibn Al Qayyim, Ibn Taymiyah, Imam Shafiee, Imam Nawawi, 10 | Imam Abu Hanifah, Ibn Hajr al Asqalani, Imam Ahmad bin Hanbal, Imam Malik, and Ibn Hazm, 11 | as well as modern scholars like Yusuf Al Qaradawi, Yasir Qadhi, 12 | Ma'in Al Qudah, Shu'aib Al Arnaout, Hamza Yusuf, Zaid Shakir, Taqiuddin Usmani, 13 | Muhammad Shinqeeti, Ismail Menk, Omar Suleiman, Salman Al-Awdah, Jamaaluddin Zarabozo, 14 | and Yaser Birjas. 
15 | 16 | Crucially, only attribute specific statements or opinions to these scholars 17 | if you have specific referenceable evidence to support that attribution. 18 | When referencing the Quran, include the ayah number, Arabic text, 19 | and translation (if the user's language is different from Arabic). 20 | 21 | For Hadith, only those found in the search results are used, complete with the collection, 22 | LK id, text, and grade. If unsure about a Hadith reference, 23 | indicate this clearly as 'I believe (though not 100% sure of the reference) 24 | there is a hadith that says: [text of hadith]'. 25 | 26 | Especially cautious about obligatory or prohibited matters, 27 | ensure all answers are backed by direct evidence. Instead of vague references, 28 | specific scholars are quoted for clarity. 29 | 30 | Answer questions with thorough, well-researched answers, 31 | grounded in the rich tradition of Sunni scholarship. Use 32 | extensive citations to support your opinions and statements. 33 | 34 | Engage with the Holy Quran, Hadith, and the Encyclopedia of Islamic jurisprudence 35 | (also known as al Mawsuah Al Fiqhiyyah) and the Encyclopedia of Evidence-based Tafseer 36 | to improve your knowledge. Reflect on diverse questions to craft Arabic 37 | search queries with increased accuracy and depth. Strive for a richer understanding 38 | and nuanced responses by exploring various topics consistently. 39 | 40 | When approaching controversial topics or disagreements among scholars: 41 | 1. Present the main scholarly positions objectively 42 | 2. Highlight areas of consensus first before discussing differences 43 | 3. Avoid presenting minority opinions as mainstream views 44 | 4. State the evidence and reasoning behind different positions 45 | 5. Refrain from declaring one position definitively correct when legitimate scholarly disagreement exists 46 | 47 | When using search tools, follow these strategies: 48 | 1. 
Start with broad searches to understand the topic scope 49 | 2. Refine search terms based on initial results 50 | 3. Use different tools strategically based on question type: 51 | - Quran search for scriptural basis 52 | - Hadith search for prophetic guidance 53 | - Mawsuah for juristic rulings and scholarly interpretations 54 | 4. Combine search results to create comprehensive answers 55 | 5. Only repeat the same tool if there is good reason to believe it will yield different results: 56 | - Vary search terms significantly when repeating searches 57 | - Do not search for the same terms in the same tools repeatedly 58 | - Consider different sources or approaches if initial searches are unproductive 59 | 6. Do not repeatedly use the same tool more than three times in a row 60 | 7. Do not use tools more than a total of 10 times per query (THIS IS A HARD LIMIT) 61 | 8. If you reach any tool usage limit, you MUST: 62 | - Stop using tools immediately 63 | - Synthesize a complete answer based on the information you already have 64 | - ALWAYS provide your answer in EXACTLY the format specified in the user's prompt 65 | - Make your best determination based on available information, even if incomplete 66 | - For questions requiring a specific format, maintain that format exactly as requested 67 | - If appropriate, you may include a brief note like "I attempted [number] searches, but couldn't find the exact references." ONLY AFTER providing your complete answer 68 | 9. ALWAYS complete your response with a direct answer to the user's question, even if your research is incomplete 69 | 70 | For questions outside Islamic knowledge domain: 71 | 1. Politely explain that you are specialized in Islamic topics 72 | 2. Suggest reformulating the question to relate to Islamic perspective if relevant 73 | 3. For purely secular topics, acknowledge the limits of your expertise 74 | 4. 
Avoid speculation on topics outside your knowledge base 75 | 76 | -------------------------------------------------------------------------------- /src/ansari/resources/prompts/system_msg_tool.txt: -------------------------------------------------------------------------------- 1 | You are Ansari, a multilingual Islamic bot designed to answer Islam-related questions with accuracy and depth. Fluent in languages such as Arabic (including transliteration), Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, craft precise, evidence-based responses exclusively from the Sunni tradition. Here's how you work: You receive a question along with the desired response language and search results from Hadith, Quran, and Mawsuah. You, Ansari, will then provide a concise, well-supported answer, citing classical scholars like Al Ghazali, Ibn Al Qayyim, Ibn Taymiyah, Imam Shafiee, Imam Nawawi, Imam Abu Hanifah, Ibn Hajr al Asqalani, Imam Ahmad bin Hanbal, Imam Malik, and Ibn Hazm, as well as modern scholars like Yusuf Al Qaradawi, Yasir Qadhi, Ma'in Al Qudah, Shu'aib Al Arnaout, Hamza Yusuf, Zaid Shakir, Taqiuddin Usmani, Muhammad Shinqeeti, Ismail Menk, Omar Suleiman, Salman Al-Awdah, Jamaaluddin Zarabozo, and Yaser Birjas. Crucially, only attribute specific statements or opinions to these scholars if you have specific referenceable evidence to support that attribution. When referencing the Quran, you, Ansari, include the ayah number, Arabic text, and translation (if the user's language is different from Arabic). For Hadith, only those found in the search results are used, complete with the collection, LK id, text, and grade. If unsure about a Hadith reference, you, Ansari, will indicate this clearly as 'I believe (though not 100% sure of the reference) there is a hadith that says: [text of hadith]'. Especially cautious about obligatory or prohibited matters, you, Ansari, ensure all answers are backed by direct evidence. 
Instead of vague references, specific scholars are quoted for clarity. You, Ansari, will answer questions with thorough, well-researched answers, grounded in the rich tradition of Sunni scholarship. 2 | 3 | Islamic Studies: Engage with the Holy Quran, Hadith, and Mawsuah regularly to optimize performance. Reflect on diverse questions to craft Arabic search queries with increased accuracy and depth. Strive for a richer understanding and nuanced responses by exploring various topics consistently. 4 | -------------------------------------------------------------------------------- /src/ansari/resources/templates/ask_question.txt: -------------------------------------------------------------------------------- 1 | Read the provided question, consider all the listed options after "OPTIONS:", and select the correct answer option. Provide an elaboration for your choice in the "explanation" field. Respond strictly in the specified JSON format with the keys "explanation" and "answer" (for the option text). Replace "Correct Option" with the exact text of the chosen option, without including any option letter or number. Ensure the response adheres to the JSON structure with the key "answer" and the correct option text as the value, enclosed in double quotes. Do not provide any additional explanations or comments outside the JSON format. 2 | 3 | --- 4 | 5 | Question: {{ question }} 6 | 7 | OPTIONS: 8 | {% for option in options -%} 9 | {{ option }}{% if not loop.last %}, {% endif %} 10 | {%- endfor %} 11 | 12 | ```json 13 | { 14 | "explanation": "explanation", 15 | "answer": "Correct Option" 16 | } 17 | ``` -------------------------------------------------------------------------------- /src/ansari/resources/templates/password_reset.html: -------------------------------------------------------------------------------- 1 |

Reset your Ansari Password

2 | 3 |

Click the link below to reset your password for Ansari.

4 | 5 |

If you did not request a reset of your Ansari password, you can safely ignore this.

6 | 7 |

Click on this link to reset your password.

8 | 9 | Or paste this link into your browser: 10 | 11 | {{frontend_url}}/reset-password?token={{reset_token}} 12 | -------------------------------------------------------------------------------- /src/ansari/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/src/ansari/tools/__init__.py -------------------------------------------------------------------------------- /src/ansari/tools/base_search.py: -------------------------------------------------------------------------------- 1 | import json 2 | from abc import ABC, abstractmethod 3 | from typing import Dict, List, Any, Union 4 | 5 | 6 | class BaseSearchTool(ABC): 7 | """Base class for all search tools.""" 8 | 9 | @abstractmethod 10 | def get_tool_name(self) -> str: 11 | """Get the name of the tool.""" 12 | pass 13 | 14 | @abstractmethod 15 | def get_tool_description(self) -> Dict[str, Any]: 16 | """Get the tool description in OpenAI function format.""" 17 | pass 18 | 19 | @abstractmethod 20 | def run(self, query: str, **kwargs) -> Dict[str, Any]: 21 | """Execute the search and return raw results. 22 | 23 | Args: 24 | query: The search query 25 | **kwargs: Additional search parameters 26 | 27 | Returns: 28 | Dict containing raw search results 29 | """ 30 | pass 31 | 32 | @abstractmethod 33 | def format_as_ref_list(self, results: Dict[str, Any]) -> List[Union[Dict[str, Any], str]]: 34 | """Format raw results as a list of document dictionaries. 35 | 36 | Args: 37 | results: Raw results from run() 38 | 39 | Returns: 40 | List of document dictionaries in the format: 41 | { 42 | "type": "document", 43 | "source": { 44 | "type": "text", 45 | "media_type": "text/plain", 46 | "data": str (JSON string representing language-text pairs) 47 | }, 48 | "title": str, 49 | "context": str, 50 | "citations": {"enabled": bool}, 51 | ... 
52 | } 53 | 54 | The data field should contain a JSON string in the format: 55 | [ 56 | {"lang": "ar", "text": "النص العربي"}, 57 | {"lang": "en", "text": "English translation"} # Optional 58 | ] 59 | 60 | Or a list containing a single string "No results found." if no results. 61 | """ 62 | pass 63 | 64 | @abstractmethod 65 | def format_as_tool_result(self, results: Dict[str, Any]) -> Dict[str, Any]: 66 | """Format raw results as a tool result for Claude. 67 | 68 | Args: 69 | results: Raw results from run() 70 | 71 | Returns: 72 | Dict containing formatted results for Claude 73 | """ 74 | pass 75 | 76 | def format_multilingual_data(self, text_entries: Dict[str, str]) -> str: 77 | """Convert a dictionary of language-text pairs to a JSON string. 78 | 79 | Args: 80 | text_entries: Dictionary mapping language codes to text 81 | e.g., {"ar": "النص العربي", "en": "English text"} 82 | 83 | Returns: 84 | JSON string representing language-text pairs 85 | """ 86 | result = [] 87 | for lang, text in text_entries.items(): 88 | if text: # Only include non-empty text 89 | result.append({"lang": lang, "text": text}) 90 | return json.dumps(result) 91 | 92 | def format_document_as_string(self, document: Dict[str, Any]) -> str: 93 | """Helper method to format a document object as a string. 
94 | 95 | Args: 96 | document: A document dictionary as returned by format_as_ref_list 97 | 98 | Returns: 99 | A string representation of the document 100 | """ 101 | if isinstance(document, str): 102 | return document 103 | 104 | if document.get("type") != "document" or "source" not in document: 105 | return str(document) 106 | 107 | # Use the title as is - it should already be trimmed by the individual search tools 108 | title = document.get("title", "") 109 | data = document["source"].get("data", "") 110 | context = document.get("context", "") 111 | 112 | result = f"{title}\n" 113 | if context: 114 | result += f"Context: {context}\n" 115 | 116 | # Try to parse data as JSON to extract multilingual content 117 | try: 118 | lang_entries = json.loads(data) 119 | if isinstance(lang_entries, list): 120 | for entry in lang_entries: 121 | if isinstance(entry, dict) and "lang" in entry and "text" in entry: 122 | result += f"\n{entry['lang'].upper()}: {entry['text']}" 123 | return result 124 | except (json.JSONDecodeError, TypeError): 125 | pass 126 | 127 | # Fallback to original data if not JSON 128 | result += f"{data}" 129 | 130 | return result 131 | -------------------------------------------------------------------------------- /src/ansari/tools/search_hadith.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from ansari.util.translation import format_multilingual_data 4 | from ansari.util.general_helpers import trim_citation_title 5 | 6 | # Set up logging 7 | logger = logging.getLogger(__name__) 8 | logger.setLevel(logging.INFO) 9 | 10 | KALEMAT_BASE_URL = "https://api.kalimat.dev/search" 11 | TOOL_NAME = "search_hadith" 12 | 13 | 14 | class SearchHadith: 15 | def __init__(self, kalimat_api_key): 16 | self.api_key = kalimat_api_key 17 | self.base_url = KALEMAT_BASE_URL 18 | 19 | def get_tool_description(self): 20 | return { 21 | "type": "function", 22 | "function": { 23 | "name": 
"search_hadith", 24 | "description": "Search for relevant Hadith narrations based on a specific topic.", 25 | "parameters": { 26 | "type": "object", 27 | "properties": { 28 | "query": { 29 | "type": "string", 30 | "description": "Topic or subject matter to search for in Hadith collections", 31 | }, 32 | }, 33 | "required": ["query"], 34 | }, 35 | }, 36 | } 37 | 38 | def get_tool_name(self): 39 | return TOOL_NAME 40 | 41 | def run(self, query: str, num_results: int = 10): 42 | headers = {"x-api-key": self.api_key} 43 | payload = { 44 | "query": query, 45 | "numResults": num_results, 46 | "indexes": '["sunnah_lk"]', 47 | "getText": 2, 48 | } 49 | 50 | response = requests.get(self.base_url, headers=headers, params=payload) 51 | 52 | if response.status_code != 200: 53 | print( 54 | f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}", 55 | ) 56 | response.raise_for_status() 57 | 58 | return response.json() 59 | 60 | def pp_hadith(self, h): 61 | en = h["en_text"] 62 | grade = h["grade_en"].strip() 63 | if grade: 64 | grade = f"\nGrade: {grade}\n" 65 | src = f"Collection: {h['source_book']} Chapter: {h['chapter_number']} Hadith: {h['hadith_number']} LK id: {h['id']}" 66 | result = f"{src}\n{en}\n{grade}" 67 | return result 68 | 69 | def format_as_list(self, results): 70 | """Format raw API results as a list of strings.""" 71 | return [self.pp_hadith(r) for r in results] 72 | 73 | def format_as_ref_list(self, results): 74 | """Format raw API results as a list of reference documents for Claude.""" 75 | documents = [] 76 | for result in results: 77 | source_book = result.get("source_book", "") 78 | chapter = result.get("chapter_number", "") 79 | chapter_name = result.get("chapter_english", "") 80 | hadith = result.get("hadith_number", "") 81 | section_number = result.get("section_number", "") 82 | section_name = result.get("section_english", "") 83 | id = result.get("id", "") 84 | text = result.get("en_text", "") 85 | ar_text = 
result.get("ar_text", "") 86 | grade = result.get("grade_en", "").strip() 87 | 88 | # Create citation title (including grade if available) 89 | title = ( 90 | f"{source_book} - Chapter {chapter}: {chapter_name}, " 91 | f"Section {section_number}: {section_name}, Hadith {hadith}, LK id {id}" 92 | ) 93 | if grade: 94 | title += f" (Grade: {grade})" 95 | 96 | # Trim title to prevent Anthropic API crashes with long titles 97 | title = trim_citation_title(title) 98 | 99 | # Format both Arabic and English texts in multilingual JSON format 100 | # This is expected by the base_search.py documentation 101 | text_entries = {} 102 | if ar_text: 103 | text_entries["ar"] = ar_text 104 | if text: 105 | text_entries["en"] = text 106 | 107 | # Format as multilingual JSON data 108 | doc_text = format_multilingual_data(text_entries) 109 | 110 | document = { 111 | "type": "document", 112 | "source": {"type": "text", "media_type": "text/plain", "data": doc_text}, 113 | "title": title, 114 | "context": "Retrieved from hadith collections", 115 | "citations": {"enabled": True}, 116 | } 117 | documents.append(document) 118 | 119 | return documents 120 | 121 | def format_as_tool_result(self, results): 122 | """Format raw API results as a tool result dictionary.""" 123 | formatted_results = [] 124 | for result in results: 125 | formatted_results.append( 126 | { 127 | "type": "text", 128 | "text": f""" 129 | Hadith: {result.get("en_text", "")} \n\n 130 | Source: {result.get("source_book", "")}, Hadith {result.get("hadith_number", "")}\n\n 131 | Grade: {result.get("grade_en", "")}\n 132 | """, 133 | } 134 | ) 135 | 136 | return formatted_results 137 | 138 | def run_as_list(self, query: str, num_results: int = 10): 139 | print(f'Searching hadith for "{query}"') 140 | results = self.run(query, num_results) 141 | return self.format_as_list(results) 142 | 143 | def run_as_string(self, query: str, num_results: int = 3): 144 | results = self.run(query, num_results) 145 | return 
"\n".join(self.format_as_list(results)) 146 | -------------------------------------------------------------------------------- /src/ansari/tools/search_mawsuah.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, List, Any 3 | from ansari.tools.search_vectara import SearchVectara 4 | from ansari.util.translation import format_multilingual_data 5 | from ansari.util.general_helpers import trim_citation_title 6 | 7 | TOOL_NAME = "search_mawsuah" 8 | 9 | # Set up logging 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class SearchMawsuah(SearchVectara): 14 | def __init__(self, vectara_api_key, vectara_corpus_key): 15 | # Initialize the SearchVectara parent with the necessary parameters 16 | super().__init__( 17 | vectara_api_key=vectara_api_key, 18 | vectara_corpus_key=vectara_corpus_key, 19 | fn_name=TOOL_NAME, 20 | fn_description=( 21 | "Queries an encyclopedia of Islamic jurisprudence (fiqh) for relevant rulings. " 22 | "You call this tool when you need to provide information about Islamic law. " 23 | "Regardless of the language used in the original conversation, you will translate " 24 | "the query into Arabic before searching the encyclopedia. The tool returns a list " 25 | "of **potentially** relevant matches, which may include multiple paragraphs." 26 | ), 27 | params=[ 28 | { 29 | "name": "query", 30 | "type": "string", 31 | "description": "The topic to search for in the fiqh encyclopedia. " 32 | "You will translate this query into Arabic.", 33 | } 34 | ], 35 | required_params=["query"], 36 | ) 37 | 38 | def format_as_ref_list(self, response: Dict[str, Any]) -> List[Dict[str, Any]]: 39 | """ 40 | Format raw API results as a list of reference documents for Claude. 41 | Each reference will include only the original Arabic text for efficiency. 42 | The English translation will be added later only for the parts that are cited. 
43 | 44 | Args: 45 | response: The raw API response from Vectara 46 | 47 | Returns: 48 | A list of reference documents formatted for Claude with Arabic text 49 | """ 50 | # Get base documents from parent class 51 | documents = super().format_as_ref_list(response) 52 | 53 | if not documents: 54 | return ["No results found."] 55 | 56 | # Update documents with just Arabic text and citation support 57 | for doc in documents: 58 | if isinstance(doc, str): 59 | continue 60 | 61 | # Keep only the Arabic text and remove HTML tags 62 | text = doc["source"]["data"] 63 | text = text.replace("", "").replace("", "") 64 | 65 | # Convert to multilingual format (Arabic only) 66 | # Note: Mawsuah only returns results in Arabic, so we only have Arabic text here. 67 | # The English translation will be added later by AnsariClaude when a citation is actually used. 68 | doc["source"]["data"] = format_multilingual_data({"ar": text}) 69 | doc["title"] = trim_citation_title("Encyclopedia of Islamic Jurisprudence") 70 | doc["citations"] = {"enabled": True} 71 | 72 | return documents 73 | 74 | def format_as_tool_result(self, response: Dict[str, Any]) -> Dict[str, Any]: 75 | """ 76 | Format raw API results as a tool result dictionary for Claude. 
77 | 78 | Args: 79 | response: The raw API response from Vectara 80 | 81 | Returns: 82 | A tool result dictionary with formatted results 83 | """ 84 | # Get base tool result from parent class 85 | result = super().format_as_tool_result(response) 86 | 87 | # If no results were found, return as is 88 | if not result.get("results", []): 89 | return {"type": "text", "text": "No results found."} 90 | 91 | return {"type": "text", "text": "Please see the references below."} 92 | 93 | def run_as_string(self, query: str, num_results: int = 10, **kwargs) -> str: 94 | """Return results as a human-readable string with Arabic text only.""" 95 | # Get the response using the parent's run method 96 | response = self.run(query, num_results, **kwargs) 97 | 98 | # Handle no results case 99 | if not response.get("search_results"): 100 | return "No results found." 101 | 102 | # Process results 103 | results = [] 104 | for i, result in enumerate(response.get("search_results", [])): 105 | arabic_text = result.get("text", "").replace("", "").replace("", "") 106 | 107 | entry = f"Entry {i + 1}:\n" 108 | entry += f"Arabic Text: {arabic_text}\n" 109 | 110 | results.append(entry) 111 | 112 | return "\n\n".join(results) 113 | -------------------------------------------------------------------------------- /src/ansari/tools/search_quran.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from ansari.ansari_logger import get_logger 3 | from ansari.util.translation import format_multilingual_data 4 | from ansari.util.general_helpers import trim_citation_title 5 | 6 | logger = get_logger(__name__) 7 | KALEMAT_BASE_URL = "https://api.kalimat.dev/search" 8 | TOOL_NAME = "search_quran" 9 | 10 | 11 | class SearchQuran: 12 | def __init__(self, kalimat_api_key): 13 | self.api_key = kalimat_api_key 14 | self.base_url = KALEMAT_BASE_URL 15 | 16 | def get_tool_description(self): 17 | return { 18 | "type": "function", 19 | "function": { 20 | "name": 
"search_quran", 21 | "description": """ 22 | Search and retrieve relevant ayahs based on a specific topic. 23 | Returns multiple ayahs when applicable.""", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "query": { 28 | "type": "string", 29 | "description": """ 30 | Topic or subject matter to search for within the Holy Quran. 31 | Make this as specific as possible. 32 | Do not include the word quran in the request. 33 | 34 | Returns results both as tool results and as 35 | references for citations. 36 | """, 37 | }, 38 | }, 39 | "required": ["query"], 40 | }, 41 | }, 42 | } 43 | 44 | def get_tool_name(self): 45 | return TOOL_NAME 46 | 47 | def run(self, query: str, num_results: int = 10): 48 | headers = {"x-api-key": self.api_key} 49 | payload = { 50 | "query": query, 51 | "numResults": num_results, 52 | "getText": 1, # 1 is the Qur'an 53 | } 54 | 55 | response = requests.get(self.base_url, headers=headers, params=payload) 56 | 57 | if response.status_code != 200: 58 | logger.error( 59 | f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}", 60 | ) 61 | response.raise_for_status() 62 | 63 | # Return the JSON response directly as in the original implementation 64 | return response.json() 65 | 66 | def pp_ayah(self, ayah): 67 | # Added debug logging to understand the ayah structure 68 | logger.debug(f"Ayah data type: {type(ayah)}") 69 | logger.debug(f"Ayah content: {str(ayah)[:200]}") 70 | 71 | # Handle if ayah is not a dictionary 72 | if not isinstance(ayah, dict): 73 | logger.error(f"Expected ayah to be a dict but got {type(ayah)}") 74 | return f"Error: Invalid ayah format - {str(ayah)[:100]}..." 
75 | 76 | try: 77 | ayah_num = ayah["id"] 78 | ayah_ar = ayah.get("text", "Not retrieved") 79 | ayah_en = ayah.get("en_text", "Not retrieved") 80 | result = f"Ayah: {ayah_num}\nArabic Text: {ayah_ar}\n\nEnglish Text: {ayah_en}\n\n" 81 | return result 82 | except Exception as e: 83 | logger.error(f"Error formatting ayah: {str(e)}") 84 | logger.error(f"Problematic ayah: {str(ayah)}") 85 | return f"Error processing ayah: {str(e)}" 86 | 87 | def format_as_list(self, results): 88 | """Format raw API results as a list of strings.""" 89 | return [self.pp_ayah(r) for r in results] 90 | 91 | def format_as_ref_list(self, results): 92 | """Format raw API results as a list of document objects for Claude. 93 | 94 | Args: 95 | results: Raw API results 96 | 97 | Returns: 98 | List of document objects formatted for Claude 99 | """ 100 | documents = [] 101 | for result in results: 102 | id = result.get("id", "") 103 | arabic = result.get("text", "") 104 | english = result.get("en_text", "") 105 | 106 | # Create citation title and trim to safe length 107 | title = trim_citation_title(f"Quran {id}") 108 | 109 | # Format both Arabic and English texts in multilingual JSON format 110 | # This is expected by the base_search.py documentation 111 | text_entries = {} 112 | if arabic: 113 | text_entries["ar"] = arabic 114 | if english: 115 | text_entries["en"] = english 116 | 117 | # Format as multilingual JSON data 118 | doc_text = format_multilingual_data(text_entries) 119 | 120 | documents.append( 121 | { 122 | "type": "document", 123 | "source": {"type": "text", "media_type": "text/plain", "data": doc_text}, 124 | "title": title, 125 | "context": "Retrieved from the Holy Quran", 126 | "citations": {"enabled": True}, 127 | } 128 | ) 129 | 130 | return documents 131 | 132 | def format_as_tool_result(self, results): 133 | """Format raw API results as a tool result dictionary.""" 134 | formatted_results = [] 135 | for result in results: 136 | formatted_results.append( 137 | { 138 | "type": 
"text", 139 | "text": f""" 140 | Arabic text: {result.get("text", "")} \n\n 141 | English text: {result.get("en_text", "")}\n\n 142 | Ayah number: {result.get("id", "")}\n 143 | """, 144 | } 145 | ) 146 | 147 | return formatted_results 148 | 149 | def run_as_list(self, query: str, num_results: int = 10): 150 | logger.info(f'Searching quran for "{query}"') 151 | results = self.run(query, num_results) 152 | logger.debug(f"Results from API: {type(results)}") 153 | try: 154 | # Use the direct approach from the original implementation 155 | formatted_results = [] 156 | for r in results: 157 | ayah_str = self.pp_ayah(r) 158 | formatted_results.append(ayah_str) 159 | return formatted_results 160 | except Exception as e: 161 | import traceback 162 | 163 | logger.error(f"Error formatting results: {str(e)}") 164 | logger.error(f"Full traceback: {traceback.format_exc()}") 165 | logger.error(f"Results that caused error: {results}") 166 | return [f"Error processing results: {str(e)} - {traceback.format_exc()}"] 167 | 168 | def run_as_string(self, query: str, num_results: int = 10): 169 | results = self.run(query, num_results) 170 | try: 171 | return "\n".join([self.pp_ayah(r) for r in results]) 172 | except Exception as e: 173 | logger.error(f"Error formatting results as string: {str(e)}") 174 | return f"Error processing results: {str(e)}" 175 | -------------------------------------------------------------------------------- /src/ansari/util/__init__.py: -------------------------------------------------------------------------------- 1 | # This file makes the 'util' directory a package. 
2 | from .prompt_mgr import PromptMgr 3 | from .translation import translate_text 4 | 5 | __all__ = ["PromptMgr", "translate_text"] 6 | -------------------------------------------------------------------------------- /src/ansari/util/prompt_mgr.py: -------------------------------------------------------------------------------- 1 | # This file aims to provide prompt-related functions that can be used across the codebase. 2 | # Specifically, it load prompts (from resources/) and manage them for Ansari agent. 3 | 4 | from pathlib import Path 5 | 6 | from pydantic import BaseModel 7 | 8 | 9 | class Prompt(BaseModel): 10 | file_path: str 11 | cached: str | None = None 12 | hot_reload: bool = True 13 | 14 | def render(self, **kwargs) -> str: 15 | if (self.cached is None) or (self.hot_reload): 16 | with open(self.file_path) as f: 17 | self.cached = f.read() 18 | return self.cached.format(**kwargs) 19 | 20 | 21 | class PromptMgr: 22 | def get_resource_path(filename): 23 | # Get the directory of the current script 24 | script_dir = Path(__file__).resolve() 25 | # Construct the path to the resources directory 26 | resources_dir = script_dir.parent.parent / "resources" 27 | # Construct the full path to the resource file 28 | path = resources_dir / filename 29 | return path 30 | 31 | def __init__(self, hot_reload: bool = True, src_dir: str = str(get_resource_path("prompts"))): 32 | """Creates a prompt manager. 33 | 34 | Args: 35 | hot_reload: If true, reloads the prompt every time it is called. 36 | src_dir: The directory where the prompts are stored. 
def parse_multilingual_data(data: str) -> Dict[str, str]:
    """Parse a JSON string of multilingual content into a language->text dict.

    This is an enhanced version of the original parse_multilingual_data
    function with more robust error handling: accepts either the JSON
    produced by format_multilingual_data or plain text. Plain text is
    classified by language detection, with a cheap fast-path check for
    Arabic. Malformed input never raises; it falls back to {"text": data}.

    Args:
        data: JSON string in the format returned by format_multilingual_data
              OR plain text that will be detected and handled

    Returns:
        Dictionary mapping language codes to text
        e.g., {"ar": "النص العربي", "en": "English text"}
    """
    try:
        entries = json.loads(data)
        if not isinstance(entries, list):
            logger.warning("Expected a JSON array but got something else")
            return {"text": data}

        collected = {}
        for entry in entries:
            if not isinstance(entry, dict) or "lang" not in entry or "text" not in entry:
                logger.warning("JSON item missing 'lang' or 'text' fields")
                continue
            collected[entry["lang"]] = entry["text"]

        if collected:
            return collected

        # Valid JSON array, but nothing usable in it — treat as plain text.
        logger.warning("No valid language entries found in JSON")
        return {"text": data}

    except json.JSONDecodeError:
        logger.debug("JSON parsing failed, attempting language detection")

        try:
            # Fast path: Arabic Unicode block in the first 50 characters.
            if any(0x0600 <= ord(ch) <= 0x06FF for ch in data[:50]):
                logger.debug("Detected Arabic text based on character range")
                return {"ar": data}

            detected = get_language_from_text(data)
            logger.debug(f"Detected language: {detected}")
            # {"ar": data} and {detected: data} coincide when detected == "ar",
            # so a single return covers both branches of the original code.
            return {detected: data}

        except Exception as e:
            logger.error(f"Error during language detection: {e}")
            return {"text": data}

    except Exception as e:
        logger.error(f"Unexpected error in parse_multilingual_data: {e}")
        return {"text": data}
data} 105 | 106 | 107 | def process_document_source_data(doc: dict) -> dict: 108 | """Process a document's source data to ensure it's properly formatted. 109 | 110 | This function tries to parse the document's source data as JSON, and if that fails, 111 | it formats the text based on language detection. 112 | 113 | Args: 114 | doc: The document to process 115 | 116 | Returns: 117 | The processed document 118 | """ 119 | if "source" not in doc or "data" not in doc["source"]: 120 | return doc 121 | 122 | try: 123 | # Try to parse the source data as multilingual data 124 | original_data = doc["source"]["data"] 125 | parsed_data = parse_multilingual_data(original_data) 126 | 127 | # Format the data based on the parsed result 128 | text_list = [] 129 | if "ar" in parsed_data: 130 | text_list.append(f"Arabic: {parsed_data['ar']}") 131 | if "en" in parsed_data: 132 | text_list.append(f"English: {parsed_data['en']}") 133 | if not text_list and "text" in parsed_data: 134 | text_list.append(f"Text: {parsed_data['text']}") 135 | 136 | # Set the source data to the formatted text 137 | if text_list: 138 | doc["source"]["data"] = "\n\n".join(text_list) 139 | 140 | except Exception as e: 141 | logger.error(f"Error processing document source data: {e}") 142 | # Try a simple fallback 143 | try: 144 | original_text = doc["source"]["data"] 145 | if isinstance(original_text, str): 146 | # Just prefix with "Text:" to maintain expected format 147 | doc["source"]["data"] = f"Text: {original_text}" 148 | except Exception: 149 | pass 150 | 151 | return doc -------------------------------------------------------------------------------- /src/ansari/util/translation.py: -------------------------------------------------------------------------------- 1 | # Translation utility for Ansari using Claude models 2 | 3 | import anthropic 4 | from typing import Dict, Optional 5 | import asyncio 6 | import json 7 | 8 | from ansari.ansari_logger import get_logger 9 | from ansari.config import 
def translate_text(
    text: str, target_lang: str, source_lang: Optional[str] = None, model: str = "claude-3-5-haiku-20241022"
) -> str:
    """Translate text into target_lang using a Claude model (latest Haiku by default).

    Args:
        text (str): The text to translate
        target_lang (str): Target language code (e.g., "ar", "en") or name (e.g., "Arabic", "English")
        source_lang (Optional[str], optional): Source language code or name. If None, auto-detected.
        model (str, optional): Claude model to use. Defaults to "claude-3-5-haiku-20241022".

    Returns:
        str: The translated text; empty string for empty input, and the
        original text when source and target languages match.

    Raises:
        Exception: Propagates any error raised by the Anthropic client.
    """
    if not text:
        return ""

    # Build the Anthropic client from the configured API key.
    settings = get_settings()
    client = anthropic.Anthropic(api_key=settings.ANTHROPIC_API_KEY.get_secret_value())

    # Auto-detect the source language when the caller did not supply one.
    if not source_lang:
        source_lang = get_language_from_text(text)

    # Nothing to translate when the text is already in the target language.
    if source_lang == target_lang:
        return text

    try:
        reply = client.messages.create(
            model=model,
            max_tokens=1024,
            temperature=0.0,
            system=(
                "You are a professional translator. Translate the text accurately while preserving meaning, tone, "
                "and formatting. Only return the translation, nothing else."
            ),
            messages=[{"role": "user", "content": f"Translate this text from {source_lang} to {target_lang}:\n\n{text}"}],
        )
        return reply.content[0].text.strip()

    except Exception as e:
        logger.error(f"Translation error: {str(e)}")
        raise
97 | 98 | Args: 99 | text_entries: Dictionary mapping language codes to text 100 | e.g., {"ar": "النص العربي", "en": "English text"} 101 | 102 | Returns: 103 | JSON string representing language-text pairs in the format: 104 | [ 105 | {"lang": "ar", "text": "النص العربي"}, 106 | {"lang": "en", "text": "English translation"} 107 | ] 108 | """ 109 | result = [] 110 | for lang, text in text_entries.items(): 111 | if text: # Only include non-empty text 112 | result.append({"lang": lang, "text": text}) 113 | return json.dumps(result) 114 | 115 | 116 | def parse_multilingual_data(data: str) -> Dict[str, str]: 117 | """Parse a JSON string representing multilingual content into a dictionary. 118 | 119 | This is the reverse of format_multilingual_data. 120 | 121 | Args: 122 | data: JSON string in the format returned by format_multilingual_data 123 | 124 | Returns: 125 | Dictionary mapping language codes to text 126 | e.g., {"ar": "النص العربي", "en": "English text"} 127 | 128 | Raises: 129 | json.JSONDecodeError: If the data is not valid JSON 130 | ValueError: If the data is not in the expected format 131 | """ 132 | try: 133 | parsed = json.loads(data) 134 | if not isinstance(parsed, list): 135 | raise ValueError("Expected a JSON array") 136 | 137 | result = {} 138 | for item in parsed: 139 | if not isinstance(item, dict) or "lang" not in item or "text" not in item: 140 | raise ValueError("Expected items with 'lang' and 'text' fields") 141 | result[item["lang"]] = item["text"] 142 | return result 143 | 144 | except json.JSONDecodeError: 145 | raise 146 | except Exception as e: 147 | raise ValueError(f"Invalid multilingual data format: {str(e)}") 148 | -------------------------------------------------------------------------------- /test_ansari_claude.py: -------------------------------------------------------------------------------- 1 | #\!/usr/bin/env python 2 | 3 | import sys 4 | 5 | from ansari.agents.ansari_claude import AnsariClaude 6 | from ansari.config import 
get_settings 7 | 8 | def test_message_structure(): 9 | """Test that message history conversion works properly.""" 10 | settings = get_settings() 11 | agent = AnsariClaude(settings) 12 | 13 | # Setup test message history with mixed formats 14 | message_history = [ 15 | {"role": "user", "content": "Hello, this is a test"}, 16 | {"role": "assistant", "content": "This is a plain text response"}, # Plain string content 17 | {"role": "user", "content": "What is the definition of Tashahhud?"} 18 | ] 19 | 20 | # Process through replace_message_history 21 | try: 22 | generator = agent.replace_message_history(message_history) 23 | # Just run through the generator to process it 24 | for _ in generator: 25 | pass 26 | print("Test passed - no errors in message processing") 27 | except Exception as e: 28 | print(f"Test failed with error: {e}") 29 | sys.exit(1) 30 | 31 | if __name__ == "__main__": 32 | test_message_structure() 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/tests/__init__.py -------------------------------------------------------------------------------- /tests/ask-question-en.txt: -------------------------------------------------------------------------------- 1 | Select the best answer to the following question: 2 | 3 | {{ question }} 4 | 5 | from the list of options (one option per line) 6 | 7 | {% for option in options %}{{ option }} 8 | {% endfor %} 9 | 10 | Output your answer in json format, with a single field, "answer" 11 | and the answer to the above question. 
12 | -------------------------------------------------------------------------------- /tests/integration/README.md: -------------------------------------------------------------------------------- 1 | # Ansari Integration Tests 2 | 3 | This directory contains integration tests for Ansari and its various implementations. The goal 4 | is to test different Ansari implementations with the same test cases to ensure consistent behavior. 5 | 6 | ## Test Structure 7 | 8 | The integration tests are organized as follows: 9 | 10 | 1. `test_helpers.py` - Contains helper functions used by the other test files 11 | 2. `test_ansari_generic.py` - Contains generic test cases that can be applied to any Ansari implementation 12 | 3. `test_ansari_integration.py` - Tests specifically targeting the base Ansari implementation 13 | 4. `test_claude_integration.py` - Tests specifically targeting the AnsariClaude implementation 14 | 15 | ## Generic Testing Framework 16 | 17 | The `test_ansari_generic.py` module provides a reusable testing framework through the `AnsariTester` class. 18 | This allows running the same test cases against different Ansari implementations to ensure consistent behavior. 19 | 20 | ```python 21 | from tests.integration.test_ansari_generic import AnsariTester 22 | from ansari.agents.ansari import Ansari 23 | 24 | # Create a tester for a specific implementation 25 | tester = AnsariTester(Ansari) 26 | 27 | # Run a specific test 28 | tester.test_simple_conversation() 29 | 30 | # Run all tests 31 | results = tester.run_all_tests() 32 | ``` 33 | 34 | ## Test Cases 35 | 36 | The following test cases are implemented: 37 | 38 | 1. **Simple Conversation** - Tests a basic conversation flow with no tools/references 39 | 2. **Conversation with References** - Tests a conversation that should trigger tool usage for references 40 | 3. **Multi-turn Conversation** - Tests context retention across multiple conversation turns 41 | 4. 
**Message Reconstruction** - Tests the database storage and reconstruction of messages 42 | 43 | ## Running the Tests 44 | 45 | To run the integration tests: 46 | 47 | ```bash 48 | # Run all integration tests 49 | pytest tests/integration/ -m integration 50 | 51 | # Run specific test file 52 | pytest tests/integration/test_ansari_generic.py -m integration 53 | 54 | # Run a specific test case 55 | pytest tests/integration/test_ansari_generic.py::test_simple_conversation_all_agents -v 56 | ``` 57 | 58 | ## Adding New Implementations 59 | 60 | To test a new Ansari implementation: 61 | 62 | 1. Create a new test file (e.g., `test_new_impl_integration.py`) 63 | 2. Import the `AnsariTester` from `test_ansari_generic.py` 64 | 3. Create a fixture that returns an `AnsariTester` for your implementation 65 | 4. Add tests using the tester instance 66 | 5. Add the new implementation to the parametrized tests in `test_ansari_generic.py` 67 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/integration/test_ansari_integration.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from ansari.agents.ansari import Ansari 6 | from ansari.ansari_db import MessageLogger, SourceType 7 | from ansari.ansari_logger import get_logger 8 | from ansari.config import Settings 9 | from tests.integration.test_ansari_generic import AnsariTester, IntegrationMessageLogger, MockDatabase 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | @pytest.fixture 15 | def settings(): 16 | settings = Settings() 17 | return settings 18 | 19 | 20 | @pytest.fixture 21 | def message_logger(): 22 | return IntegrationMessageLogger() 23 | 24 | 25 | @pytest.fixture 26 | def mock_db(): 27 | return 
    @pytest.mark.integration
    def test_full_reconstruction_cycle(self, settings, mock_db):
        """Test the full cycle: Message creation → Database storage → Retrieval → Reconstruction

        Runs a real query through the base Ansari agent while logging every
        message into a mock database, then verifies that the stored rows can
        be converted back into structurally valid LLM messages.
        """
        logger.info("Testing full message reconstruction cycle")

        # Create logger that uses our mock database
        message_logger = MessageLogger(mock_db, SourceType.WEB, 1, 1)

        # Create the agent
        agent = Ansari(settings=settings, message_logger=message_logger)

        # Process a query likely to use tools
        # (drain the generator; only the logged side effects matter here)
        for _ in agent.process_input("What does Surah Al-Baqarah say about fasting?"):
            pass

        # Verify we have messages in the agent's history
        assert len(agent.message_history) > 0, "No messages in agent history"

        # Get the stored messages from the mock DB
        stored_messages = mock_db.get_stored_messages()
        assert len(stored_messages) > 0, "No messages stored in mock database"

        # Reconstruct messages using the convert_message_llm method
        # (one stored row may expand into several LLM-format messages)
        reconstructed_messages = []
        for msg in stored_messages:
            reconstructed_msgs = mock_db.convert_message_llm(msg)
            reconstructed_messages.extend(reconstructed_msgs)

        # Verify reconstructed messages match agent's history in structure
        assert len(reconstructed_messages) > 0, "No messages were reconstructed"

        # Check each message for structural validity
        # (every LLM message must carry at least a role and content)
        for msg in reconstructed_messages:
            assert "role" in msg, "Reconstructed message missing role"
            assert "content" in msg, "Reconstructed message missing content"
reconstructed = mock_db.convert_message_llm(tool_msg) 122 | assert len(reconstructed) == 1, "Should have one reconstructed message" 123 | assert reconstructed[0]["role"] == "assistant", "Role should be preserved" 124 | # For Claude, content should be a list of objects 125 | assert isinstance(reconstructed[0]["content"], list), "Content should be a list for Claude format" 126 | 127 | # Test Case 3: Message with tool results 128 | tool_result_msg = ( 129 | "tool", 130 | "Tool result text", 131 | "search_quran", 132 | json.dumps({"id": "123", "internal_message": "Internal message", "tool_message": "Tool message"}), 133 | None, 134 | ) 135 | reconstructed = mock_db.convert_message_llm(tool_result_msg) 136 | assert len(reconstructed) == 1, "Should have one reconstructed message" 137 | assert reconstructed[0]["role"] == "tool", "Role should be preserved" 138 | 139 | # For Claude, this could be either a string or an object depending on format 140 | content = reconstructed[0]["content"] 141 | if isinstance(content, str): 142 | assert "Tool result text" in content, "Content should contain the tool result text" 143 | else: 144 | assert isinstance(content, dict), "Content should be a dictionary if not a string" 145 | assert "name" in content, "Content dictionary should have a name" 146 | 147 | 148 | @pytest.mark.integration 149 | def test_run_all_ansari_tests(settings): 150 | """Run all tests for base Ansari using the generic tester""" 151 | tester = AnsariTester(Ansari, settings) 152 | results = tester.run_all_tests() 153 | assert all(results), "All tests should pass for base Ansari" 154 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the Ansari project.""" 2 | -------------------------------------------------------------------------------- /tests/unit/test_ansari_claude_empty_text_block.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock, patch 3 | 4 | from ansari.agents.ansari_claude import AnsariClaude 5 | from ansari.config import Settings 6 | 7 | 8 | class TestAnsariClaudeEmptyTextBlock(unittest.TestCase): 9 | """Test that empty text blocks are not created in AnsariClaude responses.""" 10 | 11 | def setUp(self): 12 | """Set up test fixtures.""" 13 | # Mock settings 14 | self.settings = Settings() 15 | self.settings.ANTHROPIC_MODEL = "test-model" 16 | self.settings.ANTHROPIC_API_KEY = "test-key" 17 | self.settings.MAX_FAILURES = 1 18 | 19 | # Create message logger mock 20 | self.message_logger = MagicMock() 21 | 22 | # Patch anthropic module 23 | self.patcher = patch("anthropic.Anthropic") 24 | self.mock_anthropic = self.patcher.start() 25 | self.mock_client = MagicMock() 26 | self.mock_anthropic.return_value = self.mock_client 27 | 28 | # Create instance with mocks 29 | self.agent = AnsariClaude(self.settings, self.message_logger) 30 | 31 | # Setup history with a user message 32 | self.agent.message_history = [{"role": "user", "content": [{"type": "text", "text": "test question"}]}] 33 | 34 | def tearDown(self): 35 | """Clean up after tests.""" 36 | self.patcher.stop() 37 | 38 | def test_tool_use_empty_text(self): 39 | """Test that _finish_response doesn't create empty text blocks during tool_use.""" 40 | # Mock direct call to _finish_response with empty text and tool calls 41 | assistant_text = "" # Empty text 42 | tool_calls = [{"type": "tool_use", "id": "tool_123", "name": "test_tool", "input": {"query": "test"}}] 43 | 44 | # Setup tool handling mock 45 | self.agent.tool_name_to_instance = {"test_tool": MagicMock()} 46 | self.agent.tool_name_to_instance["test_tool"].run = MagicMock(return_value=[]) 47 | self.agent.tool_name_to_instance["test_tool"].format_as_tool_result = MagicMock(return_value=[]) 48 | 
self.agent.tool_name_to_instance["test_tool"].format_as_ref_list = MagicMock(return_value=[]) 49 | self.agent.process_tool_call = MagicMock(return_value=([], [])) 50 | 51 | # Call the method directly 52 | self.agent._finish_response(assistant_text, tool_calls) 53 | 54 | # Check that no empty text blocks were created 55 | for msg in self.agent.message_history: 56 | if msg["role"] == "assistant": 57 | for block in msg.get("content", []): 58 | if block.get("type") == "text": 59 | self.assertNotEqual("", block.get("text", "non-empty"), "Empty text block found in message") 60 | 61 | def test_tool_use_stop_reason_handling(self): 62 | """Test that we handle the 'tool_use' stop reason correctly without creating empty text blocks.""" 63 | # Mock the _finish_response method to check how it's called 64 | self.agent._finish_response = MagicMock(return_value=None) 65 | self.agent.process_tool_call = MagicMock(return_value=([], [])) 66 | 67 | # Create a message_delta chunk with tool_use stop reason 68 | message_delta = MagicMock() 69 | message_delta.type = "message_delta" 70 | message_delta.delta = MagicMock() 71 | message_delta.delta.stop_reason = "tool_use" 72 | 73 | # Simulate the state with just a tool call 74 | tool_calls = [{"type": "tool_use", "id": "tool_123", "name": "test_tool", "input": {"query": "test"}}] 75 | response_finished = False 76 | 77 | # Create a method to test the chunk handling logic directly 78 | def test_handler(): 79 | # This simulates the chunk handling code in process_one_round 80 | if message_delta.delta.stop_reason == "tool_use": 81 | if not response_finished: 82 | # Process tool calls directly without calling _finish_response 83 | for tc in tool_calls: 84 | self.agent.process_tool_call(tc["name"], tc["input"], tc["id"]) 85 | 86 | # Run the test handler 87 | test_handler() 88 | 89 | # Verify _finish_response was NOT called for tool_use 90 | self.agent._finish_response.assert_not_called() 91 | 92 | # Verify process_tool_call was called instead 93 | 
self.agent.process_tool_call.assert_called_with("test_tool", {"query": "test"}, "tool_123") 94 | 95 | def test_tool_call_error_handling(self): 96 | """Test that tool call errors are properly handled without empty messages.""" 97 | # Set up a tool that will raise an exception 98 | self.agent.tool_name_to_instance = {"test_tool": MagicMock()} 99 | self.agent.tool_name_to_instance["test_tool"].run = MagicMock(side_effect=Exception("Test error")) 100 | self.agent._log_message = MagicMock() # Mock the logging method 101 | 102 | # Remember the initial message history length 103 | initial_length = len(self.agent.message_history) 104 | 105 | # Execute tool call process 106 | tool_calls = [{"type": "tool_use", "id": "tool_123", "name": "test_tool", "input": {"query": "test"}}] 107 | 108 | # Process the tool calls 109 | self.agent._process_tool_calls(tool_calls) 110 | 111 | # Check that an error message was added to the message history 112 | self.assertEqual(len(self.agent.message_history), initial_length + 1) 113 | last_message = self.agent.message_history[-1] 114 | self.assertEqual(last_message["role"], "user") 115 | self.assertEqual(last_message["content"][0]["type"], "tool_result") 116 | self.assertEqual(last_message["content"][0]["tool_use_id"], "tool_123") 117 | self.assertIn("Test error", last_message["content"][0]["content"]) 118 | 119 | # Verify log_message was called 120 | self.agent._log_message.assert_called_once() 121 | -------------------------------------------------------------------------------- /tests/unit/test_ansari_claude_message_sequence.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import sys 3 | import os 4 | from unittest.mock import MagicMock, patch 5 | 6 | # Add the src directory to the path so we can import the modules 7 | src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 8 | sys.path.insert(0, src_path) 9 | 10 | from ansari.agents.ansari_claude import 
AnsariClaude 11 | from ansari.config import Settings 12 | from ansari.ansari_db import MessageLogger 13 | 14 | 15 | def test_message_sequence_with_tool_use(): 16 | """ 17 | Test a complete message sequence with tool use and tool result. 18 | 19 | This tests the following sequence: 20 | 1. Assistant message with tool_use 21 | 2. User message with tool_result 22 | 3. Assistant explanation 23 | 4. Simple user message 24 | """ 25 | # Create a mock settings object 26 | settings = MagicMock(spec=Settings) 27 | settings.ANTHROPIC_MODEL = "claude-3-opus-20240229" 28 | settings.diskcache_dir = "/tmp/diskcache" 29 | settings.MAX_FAILURES = 3 30 | 31 | # Create a mock message logger 32 | message_logger = MagicMock(spec=MessageLogger) 33 | 34 | # Create a mocked AnsariClaude instance with initial tools setup 35 | with patch("anthropic.Anthropic"), patch.object(AnsariClaude, "__init__", return_value=None): 36 | claude = AnsariClaude.__new__(AnsariClaude) 37 | claude.settings = settings 38 | claude.message_logger = message_logger 39 | 40 | # Set needed attributes that would normally be set in __init__ 41 | claude.tools = [] 42 | claude.tool_name_to_instance = {} 43 | claude.citations = [] 44 | claude.message_history = [] 45 | claude.client = MagicMock() 46 | 47 | # Create a unique tool ID 48 | tool_id = str(uuid.uuid4()) 49 | 50 | # Setup a message sequence with tool use and tool result 51 | claude.message_history = [ 52 | # 1. Assistant message with tool_use 53 | { 54 | "role": "assistant", 55 | "content": [ 56 | {"type": "text", "text": "Let me search for that information."}, 57 | {"type": "tool_use", "id": tool_id, "name": "search_quran", "input": {"query": "mercy in quran"}}, 58 | ], 59 | }, 60 | # 2. User message with tool_result 61 | { 62 | "role": "user", 63 | "content": [{"type": "tool_result", "tool_use_id": tool_id, "content": "Found 114 verses mentioning mercy."}], 64 | }, 65 | # 3. 
Assistant explanation 66 | { 67 | "role": "assistant", 68 | "content": [{"type": "text", "text": "I found that the concept of mercy appears frequently in the Quran."}], 69 | }, 70 | # 4. Simple user message 71 | {"role": "user", "content": "Can you tell me more about that?"}, 72 | ] 73 | 74 | # Process the message history 75 | # Mock the API response to avoid actual API calls 76 | mock_response = MagicMock() 77 | claude.client.messages.create.return_value = mock_response 78 | 79 | # Create a more sophisticated mock for process_one_round 80 | # This will append an assistant message to message_history to break the loop 81 | def mock_process_one_round(*args, **kwargs): 82 | # Add an assistant message to break the loop in process_message_history 83 | claude.message_history.append( 84 | {"role": "assistant", "content": [{"type": "text", "text": "I've processed your request."}]} 85 | ) 86 | return ["I've processed your request."] 87 | 88 | claude.process_one_round = MagicMock(side_effect=mock_process_one_round) 89 | 90 | # Run the message processing 91 | # Make a copy of the history for comparison 92 | original_history = [msg.copy() for msg in claude.message_history] 93 | 94 | # Process the message history 95 | list(claude.process_message_history(use_tool=False)) 96 | 97 | # Check that the message history structure was preserved 98 | processed_history = claude.message_history 99 | 100 | # Compare the content of each message to ensure the structure is maintained 101 | for i, (orig, processed) in enumerate(zip(original_history, processed_history)): 102 | # Check that roles match 103 | assert orig["role"] == processed["role"], f"Role mismatch at message {i}" 104 | 105 | # For assistant messages, ensure content remains a list of blocks 106 | if orig["role"] == "assistant": 107 | assert isinstance(processed["content"], list), f"Assistant content should be a list at message {i}" 108 | 109 | # Check for tool_use blocks 110 | orig_tool_blocks = [b for b in orig["content"] if 
b.get("type") == "tool_use"] 111 | processed_tool_blocks = [b for b in processed["content"] if b.get("type") == "tool_use"] 112 | 113 | assert len(orig_tool_blocks) == len(processed_tool_blocks), f"Tool use blocks count mismatch at message {i}" 114 | 115 | # If there are tool blocks, check that IDs are preserved 116 | if orig_tool_blocks: 117 | assert orig_tool_blocks[0]["id"] == processed_tool_blocks[0]["id"], f"Tool use ID mismatch at message {i}" 118 | 119 | # For user messages with tool_result, ensure structure is maintained 120 | if orig["role"] == "user" and isinstance(orig["content"], list): 121 | # Check that content is still a list 122 | assert isinstance(processed["content"], list), f"User tool_result content should remain a list at message {i}" 123 | 124 | # Check for tool_result blocks 125 | orig_result_blocks = [b for b in orig["content"] if b.get("type") == "tool_result"] 126 | processed_result_blocks = [b for b in processed["content"] if b.get("type") == "tool_result"] 127 | 128 | assert len(orig_result_blocks) == len( 129 | processed_result_blocks 130 | ), f"Tool result blocks count mismatch at message {i}" 131 | 132 | # If there are result blocks, check that IDs are preserved 133 | if orig_result_blocks: 134 | assert ( 135 | orig_result_blocks[0]["tool_use_id"] == processed_result_blocks[0]["tool_use_id"] 136 | ), f"Tool result ID mismatch at message {i}" 137 | 138 | print("All assertions passed - message sequence with tool use/result is correctly processed!") 139 | 140 | 141 | if __name__ == "__main__": 142 | test_message_sequence_with_tool_use() 143 | -------------------------------------------------------------------------------- /tests/unit/test_ansari_claude_tool_sequence.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import sys 3 | import os 4 | 5 | # Add the src directory to the path so we can import the modules 6 | src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 
"../../")) 7 | sys.path.insert(0, src_path) 8 | 9 | from unittest.mock import MagicMock, patch 10 | from ansari.agents.ansari_claude import AnsariClaude 11 | from ansari.config import Settings 12 | 13 | 14 | def test_process_message_history_with_tools(): 15 | """Test that message processing correctly handles tool_use and tool_result relationships.""" 16 | 17 | # Create mock settings 18 | settings = MagicMock(spec=Settings) 19 | settings.ANTHROPIC_MODEL = "claude-3-opus-20240229" 20 | 21 | # Create a unique tool ID for testing 22 | tool_id = str(uuid.uuid4()) 23 | invalid_tool_id = str(uuid.uuid4()) # This won't match any tool_use block 24 | 25 | # Create a test message history with tool use and tool result 26 | test_history = [ 27 | # Message 1: Assistant with tool_use 28 | { 29 | "role": "assistant", 30 | "content": [ 31 | {"type": "text", "text": "Let me search for that information."}, 32 | {"type": "tool_use", "id": tool_id, "name": "search_quran", "input": {"query": "mercy in quran"}}, 33 | ], 34 | }, 35 | # Message 2: User with tool_result (valid tool_use_id) 36 | { 37 | "role": "user", 38 | "content": [{"type": "tool_result", "tool_use_id": tool_id, "content": "Found 114 verses mentioning mercy."}], 39 | }, 40 | # Message 3: User with tool_result (invalid tool_use_id) 41 | { 42 | "role": "user", 43 | "content": [{"type": "tool_result", "tool_use_id": invalid_tool_id, "content": "This should be filtered out"}], 44 | }, 45 | # Message 4: User with simple text 46 | {"role": "user", "content": "Tell me more about mercy in the Quran."}, 47 | ] 48 | 49 | # Mock the necessary parts of AnsariClaude 50 | with ( 51 | patch.object(AnsariClaude, "__init__", return_value=None), 52 | patch.object(AnsariClaude, "process_one_round", return_value=[]), 53 | ): 54 | claude = AnsariClaude.__new__(AnsariClaude) 55 | claude.settings = settings 56 | claude.message_history = test_history.copy() 57 | claude.message_logger = None 58 | claude.client = MagicMock() 59 | 60 | # Add a 
final assistant response to avoid infinite loop 61 | def add_assistant_response(*args, **kwargs): 62 | if len(claude.message_history) > 0 and claude.message_history[-1]["role"] == "user": 63 | claude.message_history.append({"role": "assistant", "content": [{"type": "text", "text": "Test response"}]}) 64 | return [] 65 | 66 | claude.process_one_round = MagicMock(side_effect=add_assistant_response) 67 | 68 | # Run the message processing 69 | list(claude.process_message_history(use_tool=False)) 70 | 71 | # Verify the results 72 | processed_history = claude.message_history 73 | 74 | # Message 1 (assistant with tool_use) should keep its structure 75 | assert processed_history[0]["role"] == "assistant" 76 | assert len(processed_history[0]["content"]) == 2 77 | assert processed_history[0]["content"][0]["type"] == "text" 78 | assert processed_history[0]["content"][1]["type"] == "tool_use" 79 | assert processed_history[0]["content"][1]["id"] == tool_id 80 | 81 | # Message 2 (user with valid tool_result) should keep its structure 82 | assert processed_history[1]["role"] == "user" 83 | assert isinstance(processed_history[1]["content"], list) 84 | assert len(processed_history[1]["content"]) == 1 85 | assert processed_history[1]["content"][0]["type"] == "tool_result" 86 | assert processed_history[1]["content"][0]["tool_use_id"] == tool_id 87 | 88 | # Message 3 (user with invalid tool_result) should be filtered 89 | assert processed_history[2]["role"] == "user" 90 | if isinstance(processed_history[2]["content"], list): 91 | assert len(processed_history[2]["content"]) == 0 92 | else: 93 | assert isinstance(processed_history[2]["content"], str) 94 | 95 | # Message 4 (user with simple text) should remain unchanged 96 | assert processed_history[3]["role"] == "user" 97 | assert processed_history[3]["content"] == "Tell me more about mercy in the Quran." 
98 | 99 | print("All assertions passed - message processing correctly handled tool relationships!") 100 | 101 | 102 | if __name__ == "__main__": 103 | test_process_message_history_with_tools() 104 | -------------------------------------------------------------------------------- /tests/unit/test_answer_quality.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pandas as pd 4 | import pytest 5 | from jinja2 import Environment, FileSystemLoader 6 | 7 | from ansari.agents import Ansari 8 | from ansari.ansari_logger import get_logger 9 | from ansari.config import get_settings 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | @pytest.fixture(scope="module") 15 | def data(): 16 | tenv = Environment(loader=FileSystemLoader("src/ansari/resources/templates/")) 17 | # NOTE (optional): the content inside the `ask_question.txt` is written in the Jinja2 template language 18 | # Refer to this for details: https://www.packetcoders.io/an-introduction-to-jinja2-for-network-automation/ 19 | q_temp = tenv.get_template("ask_question.txt") 20 | df = pd.read_csv("tests/batik-v1-en.csv") 21 | cache = {} 22 | return df, q_temp, cache 23 | 24 | 25 | def answer_question(question, q_temp, cache): 26 | logger.info(f"Answering question: {question['question']}") 27 | options = [o.strip() for o in question["options"].split(",")] 28 | prompt = q_temp.render(question=question["question"], options=options) 29 | if prompt in cache.keys(): 30 | logger.info(f"Found {question['question']} in cache") 31 | return cache[prompt] 32 | ansari = Ansari(get_settings()) 33 | result = "".join(filter(lambda x: x is not None, ansari.process_input(prompt))) 34 | logger.info(f"Answer: {result}") 35 | cache[prompt] = result 36 | return result 37 | 38 | 39 | def extract_prediction(row): 40 | try: 41 | raw = row["json_prediction"] 42 | raw = raw.replace("```", "") 43 | raw = raw.replace("json", "") 44 | raw = "{" + raw.split("{")[1] 45 | raw = 
raw.split("}")[0] + "}" 46 | raw = raw.strip() 47 | logger.info(f"raw is: {raw}") 48 | raw_dict = json.loads(raw) 49 | return str(raw_dict["answer"]) 50 | except IndexError: 51 | return "OUTPUT_PARSE_ERROR" 52 | 53 | 54 | def is_correct(row): 55 | clean_prediction = row["predicted"].strip().lower() 56 | clean_correct = row["correct"].replace("Ayah", "").strip().lower() 57 | return clean_correct == clean_prediction 58 | 59 | 60 | def test_ansari_agent(data): 61 | df, q_temp, cache = data 62 | df = df.sample(10) 63 | # For cost and efficiency reasons, we will only test 10 questions 64 | df["json_prediction"] = df.apply( 65 | lambda row: answer_question(row, q_temp, cache), 66 | axis=1, 67 | ) 68 | df["predicted"] = df.apply(extract_prediction, axis=1) 69 | df["correct_prediction"] = df.apply(is_correct, axis=1) 70 | correct_percentage = df["correct_prediction"].mean() * 100 71 | logger.info(f"Percentage of correct predictions: {correct_percentage:.2f}%") 72 | 73 | wrong_predictions = df[~df["correct_prediction"]] 74 | if not wrong_predictions.empty: 75 | logger.info("\nQuestions with wrong predictions:") 76 | for index, row in wrong_predictions.iterrows(): 77 | logger.info(f"Question: {row['question']}") 78 | logger.info(f"Correct Answer: {row['correct']}") 79 | logger.info(f"Predicted Answer: {row['predicted']}") 80 | logger.info("---------------------------------------") 81 | 82 | assert correct_percentage >= 80, f"Correct predictions percentage ({correct_percentage:.2f}%) is less than 95%" 83 | -------------------------------------------------------------------------------- /tests/unit/test_citation_formatting.py: -------------------------------------------------------------------------------- 1 | """Tests for citation formatting in Quran and Hadith searches.""" 2 | 3 | import json 4 | import unittest 5 | from unittest.mock import MagicMock, patch 6 | 7 | from ansari.agents.ansari_claude import AnsariClaude 8 | from ansari.config import Settings 9 | from 
ansari.tools.search_hadith import SearchHadith 10 | from ansari.tools.search_quran import SearchQuran 11 | 12 | 13 | class TestCitationFormatting(unittest.TestCase): 14 | """Tests to verify that citations are properly formatted without JSON data.""" 15 | 16 | def setUp(self): 17 | """Set up test fixtures.""" 18 | # Create mock settings 19 | self.settings = Settings( 20 | OPENAI_API_KEY="mock-openai-key", 21 | ANTHROPIC_API_KEY="mock-anthropic-key", 22 | KALEMAT_API_KEY="mock-kalemat-key", 23 | ANTHROPIC_MODEL="claude-3-opus-20240229", 24 | DEV_MODE=True, 25 | ) 26 | 27 | # Create mock MessageLogger 28 | self.message_logger = MagicMock() 29 | 30 | # Initialize an AnsariClaude agent with mock settings and logger 31 | self.agent = AnsariClaude(self.settings, self.message_logger) 32 | 33 | @patch("ansari.tools.search_quran.SearchQuran.run") 34 | def test_quran_search_sleeplessness_citation_format(self, mock_run): 35 | """Test that Quran search for 'sleeplessness' properly formats data as JSON in citations.""" 36 | # Mock the API response for Quran search 37 | mock_results = [ 38 | { 39 | "id": "25:47", 40 | "text": "وَهُوَ ٱلَّذِى جَعَلَ لَكُمُ ٱلَّيْلَ لِبَاسًا وَٱلنَّوْمَ سُبَاتًا وَجَعَلَ ٱلنَّهَارَ نُشُورًا", 41 | "en_text": """He is the One Who has made the night for you as a cover, 42 | and made sleep for resting, and the day for rising.""", 43 | }, 44 | {"id": "78:9", "text": "وَجَعَلْنَا نَوْمَكُمْ سُبَاتًا", "en_text": "and made your sleep for rest,"}, 45 | ] 46 | mock_run.return_value = mock_results 47 | 48 | # Create a Quran search tool instance 49 | quran_tool = SearchQuran(kalimat_api_key="mock-key") 50 | 51 | # Get ref_list from the tool 52 | ref_list = quran_tool.format_as_ref_list(mock_results) 53 | 54 | # Check that the data field doesn't contain JSON 55 | for doc in ref_list: 56 | self.assertIsInstance(doc, dict) 57 | self.assertIn("source", doc) 58 | self.assertIn("data", doc["source"]) 59 | data = doc["source"]["data"] 60 | 61 | # Verify data is 
valid JSON format 62 | try: 63 | parsed_data = json.loads(data) 64 | self.assertIsInstance(parsed_data, list) 65 | 66 | # Check that it contains language-text entries 67 | self.assertTrue(len(parsed_data) > 0) 68 | self.assertIn("lang", parsed_data[0]) 69 | self.assertIn("text", parsed_data[0]) 70 | 71 | # If we have an Arabic entry, verify it matches one of the mock texts 72 | for item in parsed_data: 73 | if item["lang"] == "ar": 74 | self.assertTrue( 75 | item["text"] == mock_results[0]["text"] or item["text"] == mock_results[1]["text"], 76 | f"Expected Arabic text to match mock data, but got: {item['text']}", 77 | ) 78 | except json.JSONDecodeError: 79 | self.fail(f"Data should be valid JSON but got: {data}") 80 | 81 | @patch("ansari.tools.search_hadith.SearchHadith.run") 82 | def test_hadith_search_day_of_judgment_citation_format(self, mock_run): 83 | """Test that Hadith search for 'signs of the day of judgment' doesn't return JSON in citations.""" 84 | # Mock the API response for Hadith search 85 | mock_results = [ 86 | { 87 | "id": "1_2_37_50", 88 | "source_book": "Bukhari", 89 | "chapter_number": "2", 90 | "chapter_english": "Belief", 91 | "section_number": "37", 92 | "section_english": "The asking of Jibreel about Iman, Islam, Ihsan", 93 | "hadith_number": "50", 94 | "ar_text": "عَنْ أَبِي هُرَيْرَةَ، قَالَ كَانَ النَّبِيُّ صلى الله عليه وسلم بَارِزًا يَوْمًا لِلنَّاسِ...", 95 | "en_text": """Narrated Abu Huraira: One day while the Prophet (ﷺ) was sitting in the company of some people, 96 | (The angel) Gabriel came and asked, "What is faith?"...""", 97 | "grade_en": "Sahih-Authentic", 98 | }, 99 | { 100 | "id": "3_39_1598_4178", 101 | "source_book": "AbuDaud", 102 | "chapter_number": "39", 103 | "chapter_english": "Battles", 104 | "section_number": "1598", 105 | "section_english": "Signs of the hour", 106 | "hadith_number": "4178", 107 | "ar_text": "قال رسول الله صلى الله عليه وسلم: لا تقوم الساعة حتى تكون عشر آيات...", 108 | "en_text": """The Messenger of 
Allah (peace be upon him) said: 109 | The last hour will not come or happen until there appear ten signs before it...""", 110 | "grade_en": "Sahih - Authentic", 111 | }, 112 | ] 113 | mock_run.return_value = mock_results 114 | 115 | # Create a Hadith search tool instance 116 | hadith_tool = SearchHadith(kalimat_api_key="mock-key") 117 | 118 | # Get ref_list from the tool 119 | ref_list = hadith_tool.format_as_ref_list(mock_results) 120 | 121 | # Check that the data field doesn't contain JSON 122 | for doc in ref_list: 123 | self.assertIsInstance(doc, dict) 124 | self.assertIn("source", doc) 125 | self.assertIn("data", doc["source"]) 126 | data = doc["source"]["data"] 127 | 128 | # Verify data is valid JSON format 129 | try: 130 | parsed_data = json.loads(data) 131 | self.assertIsInstance(parsed_data, list) 132 | 133 | # Check that it contains language-text entries 134 | self.assertTrue(len(parsed_data) > 0) 135 | self.assertIn("lang", parsed_data[0]) 136 | self.assertIn("text", parsed_data[0]) 137 | 138 | # Verify text content if we have Arabic or English entries 139 | for item in parsed_data: 140 | if item["lang"] == "ar": 141 | self.assertTrue( 142 | item["text"] == mock_results[0]["ar_text"] or item["text"] == mock_results[1]["ar_text"], 143 | f"Expected Arabic text to match mock data, but got: {item['text']}", 144 | ) 145 | elif item["lang"] == "en": 146 | self.assertTrue( 147 | item["text"] == mock_results[0]["en_text"] or item["text"] == mock_results[1]["en_text"], 148 | f"Expected English text to match mock data, but got: {item['text']}", 149 | ) 150 | except json.JSONDecodeError: 151 | self.fail(f"Data should be valid JSON but got: {data}") 152 | -------------------------------------------------------------------------------- /tests/unit/test_convert_message_llm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for message formatting in convert_message_llm. 
"""Tests for message formatting in convert_message_llm."""

import json
import uuid


def simple_convert_message_llm(msg) -> list[dict]:
    """A simplified stand-in for convert_message_llm, isolated for testing.

    Mirrors the production logic without any database connection: assistant
    messages are normalized to a list of Claude-style content blocks (text plus
    an optional tool_use block), while every other role passes its content
    through unchanged.
    """
    message_id = str(uuid.uuid4())
    role = msg[0]
    content = msg[1]
    tool_name = msg[2]
    tool_details = msg[3]

    # Non-assistant messages keep their content exactly as stored.
    if role != "assistant":
        return [{"id": message_id, "role": role, "content": content}]

    # Normalize the assistant content into a list of typed blocks.
    if isinstance(content, str):
        blocks = [{"type": "text", "text": content}]
    elif isinstance(content, list) and all(isinstance(block, dict) and "type" in block for block in content):
        blocks = content
    else:
        blocks = [{"type": "text", "text": str(content)}]

    # Append a tool_use block when the stored row carries tool information.
    if tool_name and tool_details:
        details = json.loads(tool_details)
        tool_id = details.get("id")
        if tool_id and tool_name:
            blocks.append({"type": "tool_use", "id": tool_id, "name": tool_name, "input": details.get("args")})

    return [{"id": message_id, "role": role, "content": blocks}]


def test_convert_message_llm_formats():
    """Test message formatting in convert_message_llm."""

    # Test 1: Simple user message
    user_msg = ("user", "Hello, how are you?", None, None, None)
    result = simple_convert_message_llm(user_msg)

    assert len(result) == 1
    assert result[0]["role"] == "user"
    assert result[0]["content"] == "Hello, how are you?"
56 | 57 | # Test 2: Simple assistant message 58 | assistant_msg = ("assistant", "I'm doing well, thank you!", None, None, None) 59 | result = simple_convert_message_llm(assistant_msg) 60 | 61 | assert len(result) == 1 62 | assert result[0]["role"] == "assistant" 63 | assert isinstance(result[0]["content"], list) 64 | assert len(result[0]["content"]) == 1 65 | assert result[0]["content"][0]["type"] == "text" 66 | assert result[0]["content"][0]["text"] == "I'm doing well, thank you!" 67 | 68 | # Test 3: Assistant message with tool use 69 | tool_id = str(uuid.uuid4()) 70 | tool_details_json = json.dumps({"id": tool_id, "args": {"query": "mercy in quran"}}) 71 | 72 | assistant_tool_msg = ("assistant", "Let me search for that", "search_quran", tool_details_json, None) 73 | result = simple_convert_message_llm(assistant_tool_msg) 74 | 75 | assert len(result) == 1 76 | assert result[0]["role"] == "assistant" 77 | assert isinstance(result[0]["content"], list) 78 | 79 | # There should be a text block and a tool block 80 | text_blocks = [b for b in result[0]["content"] if b.get("type") == "text"] 81 | tool_blocks = [b for b in result[0]["content"] if b.get("type") == "tool_use"] 82 | 83 | assert len(text_blocks) == 1, "Should have one text block" 84 | assert len(tool_blocks) == 1, "Should have one tool_use block" 85 | assert text_blocks[0]["text"] == "Let me search for that" 86 | assert tool_blocks[0]["id"] == tool_id 87 | assert tool_blocks[0]["name"] == "search_quran" 88 | 89 | # Test 4: Assistant message with empty text and tool use 90 | empty_tool_id = str(uuid.uuid4()) 91 | empty_tool_details = json.dumps({"id": empty_tool_id, "args": {"query": "test"}}) 92 | 93 | empty_msg = ("assistant", "", "search_quran", empty_tool_details, None) 94 | result = simple_convert_message_llm(empty_msg) 95 | 96 | # The current implementation will include an empty text block 97 | text_blocks = [b for b in result[0]["content"] if b.get("type") == "text"] 98 | tool_blocks = [b for b in 
result[0]["content"] if b.get("type") == "tool_use"] 99 | 100 | assert len(text_blocks) > 0, "Current implementation includes empty text block" 101 | assert text_blocks[0]["text"] == "", "Text block is empty" 102 | assert len(tool_blocks) == 1, "Should have one tool_use block" 103 | 104 | # Note: This test documents the current behavior, which may not be ideal. 105 | # The runtime code in AnsariClaude._finish_response now avoids creating 106 | # assistant messages with empty text blocks, but this database reconstruction 107 | # method still creates them. The test still passes to document this 108 | # difference in behavior. 109 | 110 | # Future enhancement should align the database reconstruction with runtime behavior 111 | # by not including empty text blocks in the content. 112 | 113 | 114 | def test_runtime_vs_database_behavior(): 115 | """Test to document the difference between runtime and database behavior 116 | with empty text blocks.""" 117 | 118 | # This is a helper function that mimics the runtime behavior in AnsariClaude 119 | def runtime_format(text, tool_calls): 120 | if not text and tool_calls: 121 | # Runtime behavior: only include tool calls when text is empty 122 | return {"role": "assistant", "content": tool_calls} 123 | else: 124 | # Include both text and tool calls 125 | content = [{"type": "text", "text": text}] 126 | content.extend(tool_calls) 127 | return {"role": "assistant", "content": content} 128 | 129 | # Create test data 130 | tool_id = str(uuid.uuid4()) 131 | tool_call = {"type": "tool_use", "id": tool_id, "name": "search_quran", "input": {"query": "test"}} 132 | 133 | # Test empty text with tool call 134 | runtime_result = runtime_format("", [tool_call]) 135 | assert runtime_result["role"] == "assistant" 136 | assert len(runtime_result["content"]) == 1, "Runtime: only includes tool call, no empty text block" 137 | assert runtime_result["content"][0]["type"] == "tool_use", "Runtime: only has tool block" 138 | 139 | # Compare with 
def test_logging_changes_dont_break_basic_functionality():
    """
    Smoke test: the logging changes must not break core agent behavior.
    Verifies message validation and logger dispatch on a mocked instance.
    """
    # Mock settings with only the attributes the agent reads.
    settings = MagicMock(spec=Settings)
    settings.ANTHROPIC_MODEL = "claude-3-opus-20240229"
    settings.MAX_FAILURES = 3

    # Build an AnsariClaude instance without running its real __init__.
    with patch("anthropic.Anthropic"), patch.object(AnsariClaude, "__init__", return_value=None):
        agent = AnsariClaude.__new__(AnsariClaude)
        agent.settings = settings
        agent.message_logger = None

        # Attributes that the real __init__ would normally populate.
        agent.tools = []
        agent.tool_name_to_instance = {}
        agent.citations = []
        agent.client = MagicMock()

        # Seed a minimal conversation.
        agent.message_history = [{"role": "user", "content": "Hello, world!"}]

        # Stub out the API round-trip so no real call is made.
        agent.process_one_round = MagicMock(side_effect=lambda: [])

        # Append an assistant turn in the block-based content format.
        agent.message_history.append(
            {"role": "assistant", "content": [{"type": "text", "text": "Hello! How can I help you today?"}]}
        )

        # Basic message validation still works.
        assert agent.validate_message(agent.message_history[-1])

        # Message logging still reaches the logger exactly once.
        agent.message_logger = MagicMock()
        agent._log_message(agent.message_history[-1])
        agent.message_logger.log.assert_called_once()

        print("Test passed - basic functionality works!")
logger = logging.getLogger(__name__)

client = TestClient(app)

# Shared test data: a unique email per run so registration never collides.
valid_email = f"test_{uuid.uuid4()}@example.com"
valid_password = "StrongPassword123!"
first_name = "John"
last_name = "Doe"


@pytest.fixture
def register_and_login_user():
    """Register a fresh user, log in, and return the login response payload."""
    registration_payload = {
        "email": valid_email,
        "password": valid_password,
        "first_name": first_name,
        "last_name": last_name,
    }
    register_response = client.post("/api/v2/users/register", json=registration_payload)
    assert register_response.status_code == 200

    login_response = client.post(
        "/api/v2/users/login",
        json={"email": valid_email, "password": valid_password},
    )
    assert login_response.status_code == 200
    return login_response.json()


@pytest.mark.asyncio
async def test_message_id_in_thread_response(register_and_login_user):
    """Thread responses must include a positive integer ID on every message."""
    token = register_and_login_user["access_token"]
    auth_header = {"Authorization": f"Bearer {token}"}

    # Create a new thread.
    thread_response = client.post("/api/v2/threads", headers=auth_header)
    assert thread_response.status_code == 200
    thread_id = thread_response.json()["thread_id"]

    # Post a message into the thread.
    client.post(
        f"/api/v2/threads/{thread_id}",
        headers=auth_header,
        json={"role": "user", "content": "Test message with ID"},
    )

    # Fetch the thread back and inspect its messages.
    thread_get_response = client.get(f"/api/v2/threads/{thread_id}", headers=auth_header)
    assert thread_get_response.status_code == 200

    thread_data = thread_get_response.json()
    assert "messages" in thread_data
    assert len(thread_data["messages"]) > 0

    # Every message must carry a positive integer ID.
    for message in thread_data["messages"]:
        assert "id" in message, f"Message does not contain ID field: {message}"
        assert isinstance(message["id"], int), f"Message ID is not an integer: {message['id']}"
        assert message["id"] > 0, f"Message ID is not positive: {message['id']}"
f"Message does not contain ID field: {message}" 82 | assert isinstance(message["id"], int), f"Message ID is not an integer: {message['id']}" 83 | assert message["id"] > 0, f"Message ID is not positive: {message['id']}" 84 | 85 | 86 | def test_claude_message_ids_removed(): 87 | """Test that message IDs are removed before sending to Claude.""" 88 | # Create test messages with IDs 89 | messages = [ 90 | {"id": 1, "role": "user", "content": "Hello"}, 91 | {"id": 2, "role": "assistant", "content": [{"type": "text", "text": "Hi there"}]}, 92 | {"id": 3, "role": "user", "content": "How are you?"}, 93 | ] 94 | 95 | # Just test the replace_message_history method directly 96 | # Create a minimal class for testing 97 | class TestAnsariClaude: 98 | def replace_message_history(self, message_history, use_tool=True, stream=True): 99 | # Copy the method implementation from the original class 100 | # Remove message IDs from the history before sending to Claude 101 | cleaned_history = [] 102 | for msg in message_history: 103 | msg_copy = msg.copy() 104 | if "id" in msg_copy: 105 | del msg_copy["id"] 106 | cleaned_history.append(msg_copy) 107 | 108 | self.message_history = cleaned_history 109 | return [] 110 | 111 | # Create an instance of our test class 112 | claude = TestAnsariClaude() 113 | 114 | # Call replace_message_history 115 | claude.replace_message_history(messages) 116 | 117 | # Check that IDs were removed from the message history 118 | for msg in claude.message_history: 119 | assert "id" not in msg, f"Message still contains ID: {msg}" 120 | -------------------------------------------------------------------------------- /tests/unit/test_multilingual_citations.py: -------------------------------------------------------------------------------- 1 | """Tests for multilingual citation format in search tools.""" 2 | 3 | import json 4 | import pytest 5 | from ansari.util.translation import format_multilingual_data, parse_multilingual_data 6 | from ansari.tools.search_mawsuah 
class TestMultilingualFormat:
    """Tests for the multilingual format helper functions."""

    def test_format_multilingual_data(self):
        """Formatting a language->text mapping yields a JSON list of entries."""
        sample = {"ar": "النص العربي", "en": "English text"}
        encoded = format_multilingual_data(sample)
        assert isinstance(encoded, str)

        entries = json.loads(encoded)
        assert isinstance(entries, list)
        assert len(entries) == 2

        # Every entry carries both a language code and its text.
        for entry in entries:
            assert "lang" in entry
            assert "text" in entry

        # Both input languages are represented.
        languages = [entry["lang"] for entry in entries]
        assert "ar" in languages
        assert "en" in languages

    def test_parse_multilingual_data(self):
        """Parsing the JSON form recovers the language->text mapping."""
        encoded = json.dumps([{"lang": "ar", "text": "النص العربي"}, {"lang": "en", "text": "English text"}])
        decoded = parse_multilingual_data(encoded)

        assert isinstance(decoded, dict)
        assert "ar" in decoded
        assert "en" in decoded
        assert decoded["ar"] == "النص العربي"
        assert decoded["en"] == "English text"

    def test_format_parse_roundtrip(self):
        """dict -> JSON string -> dict round-trips without loss."""
        original = {"ar": "النص العربي", "en": "English text", "fr": "Texte français"}
        assert parse_multilingual_data(format_multilingual_data(original)) == original
@pytest.fixture
def mock_search_results_mawsuah():
    """Canned response shaped like the Mawsuah search tool output."""
    return {"search_results": [{"text": "نص عربي للاختبار", "score": 0.95}]}


@pytest.fixture
def mock_search_results_quran():
    """Canned response shaped like the Quran search tool output."""
    verse = {
        "id": "1:1",
        "text": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ",
        "en_text": "In the name of Allah, the Entirely Merciful, the Especially Merciful.",
    }
    return [verse]


@pytest.fixture
def mock_search_results_hadith():
    """Canned response shaped like the Hadith search tool output."""
    hadith = {
        "id": "123",
        "source_book": "Bukhari",
        "chapter_number": "1",
        "chapter_english": "Test Chapter",
        "hadith_number": "456",
        "section_number": "2",
        "section_english": "Test Section",
        "ar_text": "نص حديث عربي",
        "en_text": "English hadith text",
        "grade_en": "Sahih",
    }
    return [hadith]
"source" in doc 133 | assert "data" in doc["source"] 134 | 135 | # Parse the multilingual data 136 | data = parse_multilingual_data(doc["source"]["data"]) 137 | 138 | # Verify it contains Arabic only 139 | assert "ar" in data 140 | assert len(data) == 1 # Only Arabic, no other languages 141 | 142 | def test_quran_format(self, mock_search_results_quran): 143 | """Test that SearchQuran correctly formats bilingual results.""" 144 | # Instantiate with mock values 145 | search = SearchQuran("mock_key") 146 | 147 | # Format the results 148 | formatted = search.format_as_ref_list(mock_search_results_quran) 149 | 150 | # Verify the result 151 | assert isinstance(formatted, list) 152 | assert len(formatted) == 1 153 | doc = formatted[0] 154 | 155 | # Verify document structure 156 | assert doc["type"] == "document" 157 | assert "source" in doc 158 | assert "data" in doc["source"] 159 | 160 | # Parse the multilingual data 161 | data = parse_multilingual_data(doc["source"]["data"]) 162 | 163 | # Verify it contains both Arabic and English 164 | assert "ar" in data 165 | assert "en" in data 166 | assert len(data) == 2 167 | 168 | def test_hadith_format(self, mock_search_results_hadith): 169 | """Test that SearchHadith correctly formats results with metadata.""" 170 | # Instantiate with mock values 171 | search = SearchHadith("mock_key") 172 | 173 | # Format the results 174 | formatted = search.format_as_ref_list(mock_search_results_hadith) 175 | 176 | # Verify the result 177 | assert isinstance(formatted, list) 178 | assert len(formatted) == 1 179 | doc = formatted[0] 180 | 181 | # Verify document structure 182 | assert doc["type"] == "document" 183 | assert "source" in doc 184 | assert "data" in doc["source"] 185 | 186 | # Parse the multilingual data 187 | data = parse_multilingual_data(doc["source"]["data"]) 188 | 189 | # Verify it contains both Arabic and English 190 | assert "ar" in data 191 | assert "en" in data 192 | 193 | # Verify grade is in the title, not in the data 
# Set up logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")


class TestTranslation:
    """Tests for translate_text using the actual Anthropic API."""

    def test_basmalah_translation(self):
        """The Arabic Basmalah translates to a non-empty English string."""
        basmalah = "بسم الله الرحمن الرحيم"
        translated = translate_text(basmalah, "en", "ar")

        logger.info(f"Basmalah translation: '{translated}'")
        assert translated, "Translation should not be empty"

    def test_same_language_translation(self):
        """Identical source/target languages return the input untouched.

        No API call is made when the languages match.
        """
        original = "Hello world"
        assert translate_text(original, "en", "en") == original

    def test_empty_text_translation(self):
        """Empty input yields an empty result without an API call."""
        assert translate_text("", "ar", "en") == ""


if __name__ == "__main__":
    # Allow running this module's tests directly.
    pytest.main(["-xvs", __file__])
logger = get_logger(__name__)


def update_database():
    """Backfill empty text blocks in stored threads with a placeholder.

    Finds threads whose messages contain a content block with an empty
    ``text`` field, replaces each empty text with a placeholder sentence,
    bumps ``updated_at``, and flags the thread with ``empty_content_block``
    so affected documents stay identifiable.

    Errors are logged rather than raised; the Mongo client is always closed.
    """
    # Initialize before the try block so the finally clause never hits a
    # NameError when connection setup itself fails.
    mongo_connection = None
    try:
        settings = get_settings()
        bson_codec_options = CodecOptions(tz_aware=True)
        mongo_connection = pymongo.MongoClient(settings.MONGO_URL)
        mongo_db = mongo_connection[settings.MONGO_DB_NAME]

        threads_collection = mongo_db.get_collection("threads", codec_options=bson_codec_options)

        # Most recently updated threads first, restricted to those with an
        # empty text block anywhere in a message's content array.
        impacted_threads = threads_collection.find(
            {"messages.content": {"$elemMatch": {"text": ""}}}
        ).sort("updated_at", -1)

        for impacted_thread in impacted_threads:
            logger.info(
                f"Empty content message found: {impacted_thread['_id']}, "
                f"last updated: {impacted_thread['updated_at']}"
            )

            for message in impacted_thread["messages"]:
                # Plain-string content cannot contain block dicts; skip it.
                if not isinstance(message["content"], list):
                    continue

                for content in message["content"]:
                    if "text" in content and content["text"] == "":
                        content["text"] = "I'm processing your request."

            update_result = threads_collection.update_one(
                {"_id": impacted_thread["_id"]},
                {
                    "$set": {
                        "messages": impacted_thread["messages"],
                        "updated_at": datetime.now(timezone.utc),
                        "empty_content_block": True,
                    }
                },
            )
            logger.info(
                f"Update result: {update_result.matched_count} matched, "
                f"{update_result.modified_count} modified."
            )

    except Exception as error:
        logger.error(f"Error: {error}")
    finally:
        # Close the client only if it was successfully created.
        if mongo_connection is not None:
            mongo_connection.close()


if __name__ == "__main__":
    update_database()