├── .deepsource.toml ├── .dockerignore ├── .env.example ├── .github └── workflows │ ├── deploy-production.yml │ ├── deploy-staging.yml │ └── python-app.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CLAUDE.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Procfile ├── README.md ├── __init__.py ├── aws ├── aws-cli.md ├── github-actions-policy.json ├── instance-role-parameters-access.json ├── instance-role-policy.json └── service-role-policy.json ├── data └── mawsuah │ ├── README.md │ └── strip_tashkeel.py ├── docs ├── fastapi │ ├── async_await_backgroundtasks_logs_for_tracing.log │ ├── async_await_backgroundtasks_visualized.md │ ├── inheriting_middleware_class.ipynb │ └── middleware_chains_and_request_response_flow.ipynb ├── impl │ └── mongodb.md ├── spec │ └── mongodb.md └── structure_of_api_responses │ ├── a_fastapi_request_received_from_zrok.json │ ├── anthropic_api_structure_of_message_history.json │ ├── anthropic_api_structure_of_response.json │ ├── meta_whatsapp_api_structure_of_a_reply_msg_status.json │ ├── meta_whatsapp_api_structure_of_a_request_sent_using_zrok.json │ ├── meta_whatsapp_api_structure_of_a_user_incoming_msg.json │ └── openai_api_structure_of_chat_completion_chunk_object.ipynb ├── favicon.ico ├── migrate_database.py ├── pyproject.toml ├── pytest.ini ├── setup.sh ├── src └── ansari │ ├── __init__.py │ ├── agents │ ├── __init__.py │ ├── ansari.py │ ├── ansari_claude.py │ └── ansari_workflow.py │ ├── ansari_db.py │ ├── ansari_db_sql.py │ ├── ansari_logger.py │ ├── app │ ├── __init__.py │ ├── main_api.py │ ├── main_api_client.py │ ├── main_discord.py │ ├── main_file.py │ ├── main_stdio.py │ └── main_whatsapp.py │ ├── cli │ ├── __init__.py │ ├── query_api.py │ └── use_tools.py │ ├── config.py │ ├── examples │ ├── test_citations.py │ └── test_search_mawsuah.py │ ├── presenters │ ├── api_presenter.py │ ├── ayah_file_presenter.py │ ├── discord_presenter.py │ ├── file_presenter.py │ ├── gradio_presenter.py │ ├── 
stdio_presenter.py │ └── whatsapp_presenter.py │ ├── resources │ ├── prompts │ │ ├── greeting.txt │ │ ├── news.txt │ │ ├── system_msg_ayah.txt │ │ ├── system_msg_ayah_lay.txt │ │ ├── system_msg_claude.txt │ │ └── system_msg_tool.txt │ └── templates │ │ ├── ask_question.txt │ │ └── password_reset.html │ ├── tools │ ├── __init__.py │ ├── base_search.py │ ├── search_hadith.py │ ├── search_mawsuah.py │ ├── search_quran.py │ ├── search_tafsir_encyc.py │ ├── search_usul.py │ └── search_vectara.py │ └── util │ ├── __init__.py │ ├── general_helpers.py │ ├── prompt_mgr.py │ ├── robust_translation.py │ └── translation.py ├── test_ansari_claude.py ├── tests ├── __init__.py ├── ask-question-en.txt ├── batik-v1-en.csv ├── integration │ ├── README.md │ ├── __init__.py │ ├── test_ansari_generic.py │ ├── test_ansari_integration.py │ ├── test_claude_integration.py │ └── test_helpers.py └── unit │ ├── __init__.py │ ├── test_ansari_claude_document_limiting.py │ ├── test_ansari_claude_empty_text_block.py │ ├── test_ansari_claude_message_sequence.py │ ├── test_ansari_claude_tool_sequence.py │ ├── test_answer_quality.py │ ├── test_citation_formatting.py │ ├── test_convert_message_llm.py │ ├── test_logging_regression.py │ ├── test_main_api.py │ ├── test_message_id_in_thread.py │ ├── test_multilingual_citations.py │ ├── test_multilingual_data_parsing.py │ ├── test_search_mawsuah.py │ ├── test_search_tafsir_encyc.py │ └── test_translation.py ├── update_database.py └── uv.lock /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | [[analyzers]] 4 | name = "shell" 5 | 6 | [[analyzers]] 7 | name = "python" 8 | 9 | [analyzers.meta] 10 | runtime_version = "3.x.x" -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | app.egg-info 4 | *.pyc 5 | .mypy_cache 6 | .coverage 7 | 
htmlcov 8 | .venv 9 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DEPLOYMENT_TYPE="development" # Deployment type (development, staging, production) 2 | FRONTEND_URL="http://localhost:8081" 3 | 4 | KALEMAT_API_KEY="" # Token for Qur'an and Hadith search 5 | ANTHROPIC_API_KEY="" # API key for Claude AI model 6 | OPENAI_API_KEY="" # Token for GPT-4 (Optional) 7 | 8 | # Optional. If not set, the app will not use these services. 9 | SENDGRID_API_KEY="" # API key to send password reset options 10 | 11 | MAILCHIMP_API_KEY="" # API key 12 | MAILCHIMP_SERVER_PREFIX="" # Server prefix (data center) 13 | MAILCHIMP_LIST_ID="" # List ID 14 | 15 | # Database connection string 16 | MONGO_URL="mongodb://localhost:27017" 17 | MONGO_DB_NAME="ansari_db" 18 | 19 | SECRET_KEY="secret" # Secret key for signing tokens 20 | 21 | # Origins to be allowed by the backend 22 | ORIGINS="https://ansari.chat,https://www.ansari.chat,https://pre.ansari.chat" 23 | 24 | # Vectara search engine configuration 25 | VECTARA_API_KEY="" # Authentication token for Vectara API 26 | 27 | QURAN_DOT_COM_API_KEY="" # This is the API key we give to quran.com to access us, not for us to access them 28 | 29 | # Directory for storing templates 30 | template_dir="." 
# Directory path for templates 31 | 32 | # Related to WhatsApp Business and Meta (leave empty if you're not planning to use WhatsApp) 33 | # Source 1: https://www.youtube.com/watch?v=KP6_BUw3i0U 34 | # Watch Until 32:25, while quickly skimming through the non-python code parts 35 | # Source 2 (mentioned in video above): https://glitch.com/edit/#!/insidious-tartan-alvarezsaurus 36 | # (the `verification_webhook` endpoint in `main_whatsapp` is inspired by the above URL) 37 | # Source 3 (optional): https://developers.facebook.com/blog/post/2022/10/24/sending-messages-with-whatsapp-in-your-python-applications/#u_0_39_8q 38 | 39 | # Moreover, if you want to test whatsapp's webhook locally, you can use zrok on a reserved URL with a zrok "share token" 40 | # obtained by contacting its current holder: https://github.com/OdyAsh (source 1, 2 below) 41 | # Alternatively, you can change the webhook url altogether (source 3, 4 below) 42 | # Check these sources for more details: 43 | # Source 1: https://dev.to/odyash/quickly-share-your-app-with-zrok-4ihp 44 | # Source 2: https://openziti.discourse.group/t/how-do-i-use-a-reserved-share-on-different-devices/2379/2 45 | # Source 3: https://youtu.be/KP6_BUw3i0U?t=1294 46 | # (@21:33 and 25:30, however they use glitch instead of zrok, so the video here is just to give you an idea how to setup a webhook) 47 | # Source 4 (where you can change callback url, given that your facebook account gets access by the app's admins): 48 | # https://developers.facebook.com/apps/871020755148175/whatsapp-business/wa-settings/ 49 | # NOTE 1: When you see the `Callback URL`, it will be something like "https://ZROK_SHARE_TOKEN.share.zrok.io/whatsapp/v1" 50 | # (The `/whatsapp/v1` endpoint can be found in `main_whatsapp.py`'s endpoints, that's why it's in the url above) 51 | # NOTE 2: If an unexpected 3rd party discovers the ZROK_SHARE_TOKEN, 52 | # a new one will have to be generated, then added to Meta's callback URL of the *testing* app 53 | # (Noting 
that the *production* app's callback URL will be different anyway, so the 3rd party won't be able to access that app) 54 | # (but we still don't want random calls to be made to our testing app, so that's why we'll still have to change an exposed token :]) 55 | # NOTE 3: Obviously, that `871...175` in the above URL is the testing app's public id, so if this link still doesn't work even after you gain access, 56 | # then the admins most probably created a new test app instance 57 | 58 | WHATSAPP_API_VERSION="<>" 59 | 60 | # NOTE: Contact the team to see whatsapp's 2 phone nums -> one for prod. env. and the other for local/stage testing 61 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID="<>" 62 | 63 | # NOTE 1: check video in source 3 above from 30:45 to 32:15 to see where we get the access token 64 | # NOTE 2: Contact the team to see their 2 access tokens -> one for prod. env. and the other for local/stage testing 65 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER="<>" 66 | 67 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK="<>" 68 | WHATSAPP_CHAT_RETENTION_HOURS=3 69 | ZROK_SHARE_TOKEN="<>" 70 | 71 | # Related to internal code logic 72 | # Leave the values below when locally debugging the application 73 | # In production, don't add them to environment variables, or add them as "INFO"/"False" respectively 74 | LOGGING_LEVEL="DEBUG" 75 | DEV_MODE="True" 76 | 77 | # Application version control settings 78 | MAINTENANCE_MODE="False" # Whether the application is in maintenance mode 79 | 80 | # iOS app build versions 81 | IOS_MINIMUM_BUILD_VERSION="1" # Minimum build version required for iOS app 82 | IOS_LATEST_BUILD_VERSION="1" # Latest available build version for iOS app 83 | 84 | # Android app build versions 85 | ANDROID_MINIMUM_BUILD_VERSION="1" # Minimum build version required for Android app 86 | ANDROID_LATEST_BUILD_VERSION="1" # Latest available build version for Android app 87 | 88 | SENTRY_DSN="" # Sentry DSN for error tracking 89 | 90 | # To get rid of .py[cod] files (This key should NOT 
be set in production!) 91 | # This is only to de-clutter your local development environment 92 | # Details: https://docs.python-guide.org/writing/gotchas/#disabling-bytecode-pyc-files 93 | PYTHONDONTWRITEBYTECODE=1 -------------------------------------------------------------------------------- /.github/workflows/deploy-production.yml: -------------------------------------------------------------------------------- 1 | name: Production Deployment (AWS App Runner) 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-and-deploy: 11 | runs-on: ubuntu-latest 12 | environment: production-aws 13 | 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | with: 18 | persist-credentials: false 19 | 20 | - name: Configure AWS credentials 21 | id: aws-credentials 22 | uses: aws-actions/configure-aws-credentials@v4 23 | with: 24 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 25 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 26 | aws-region: ${{ secrets.AWS_REGION }} 27 | 28 | - name: Login to Amazon ECR 29 | id: login-ecr 30 | uses: aws-actions/amazon-ecr-login@v2 31 | 32 | - name: Build, tag, and push image to Amazon ECR 33 | id: build-image 34 | env: 35 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 36 | ECR_REPOSITORY: ansari-backend 37 | IMAGE_TAG: ${{ github.sha }} 38 | run: | 39 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 
40 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 41 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 42 | 43 | - name: Deploy to App Runner 44 | id: deploy-apprunner 45 | uses: awslabs/amazon-app-runner-deploy@main 46 | env: 47 | DEPLOYMENT_TYPE: production 48 | LOGGING_LEVEL: INFO 49 | FRONTEND_URL: ${{ format('{0}{1}', secrets.SSM_ROOT, 'frontend-url') }} 50 | MONGO_URL: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mongo-url') }} 51 | MONGO_DB_NAME: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mongo-db-name') }} 52 | SECRET_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'secret-key') }} 53 | ORIGINS: ${{ format('{0}{1}', secrets.SSM_ROOT, 'origins') }} 54 | 55 | OPENAI_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'openai-api-key') }} 56 | SENDGRID_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sendgrid-api-key') }} 57 | MAILCHIMP_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-api-key') }} 58 | MAILCHIMP_SERVER_PREFIX: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-server-prefix') }} 59 | MAILCHIMP_LIST_ID: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-list-id') }} 60 | ANTHROPIC_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'anthropic-api-key') }} 61 | KALEMAT_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'kalemat-api-key') }} 62 | SUNNAH_TOKEN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sunnah-token') }} 63 | VECTARA_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'vectara-api-key') }} 64 | QURAN_DOT_COM_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'quran-dot-com-api-key') }} 65 | USUL_API_TOKEN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'usul-api-token') }} 66 | SENTRY_DSN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sentry-dsn') }} 67 | 68 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-access-token-from-sys-user') }} 69 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-business-phone-number-id') }} 70 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK: ${{ 
format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-verify-token-for-webhook') }} 71 | WHATSAPP_ENABLED: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-enabled') }} 72 | WHATSAPP_API_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-api-version') }} 73 | WHATSAPP_CHAT_RETENTION_HOURS: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-chat-retention-hours') }} 74 | 75 | MAINTENANCE_MODE: ${{ format('{0}{1}', secrets.SSM_ROOT, 'maintenance-mode') }} 76 | IOS_MINIMUM_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'ios-minimum-build-version') }} 77 | IOS_LATEST_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'ios-latest-build-version') }} 78 | ANDROID_MINIMUM_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'android-minimum-build-version') }} 79 | ANDROID_LATEST_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'android-latest-build-version') }} 80 | with: 81 | service: ansari-production-backend 82 | image: ${{ steps.build-image.outputs.image }} 83 | access-role-arn: ${{ secrets.SERVICE_ROLE_ARN }} 84 | region: ${{ secrets.AWS_REGION }} 85 | cpu : 1 86 | memory : 2 87 | port: 8000 88 | wait-for-service-stability-seconds: 1200 89 | copy-env-vars: | 90 | DEPLOYMENT_TYPE 91 | LOGGING_LEVEL 92 | copy-secret-env-vars: | 93 | FRONTEND_URL 94 | MONGO_URL 95 | MONGO_DB_NAME 96 | SECRET_KEY 97 | ORIGINS 98 | 99 | OPENAI_API_KEY 100 | SENDGRID_API_KEY 101 | MAILCHIMP_API_KEY 102 | MAILCHIMP_SERVER_PREFIX 103 | MAILCHIMP_LIST_ID 104 | ANTHROPIC_API_KEY 105 | KALEMAT_API_KEY 106 | SUNNAH_TOKEN 107 | VECTARA_API_KEY 108 | QURAN_DOT_COM_API_KEY 109 | USUL_API_TOKEN 110 | SENTRY_DSN 111 | 112 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER 113 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID 114 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK 115 | WHATSAPP_API_VERSION 116 | WHATSAPP_ENABLED 117 | WHATSAPP_CHAT_RETENTION_HOURS 118 | 119 | MAINTENANCE_MODE 120 | IOS_MINIMUM_BUILD_VERSION 121 | IOS_LATEST_BUILD_VERSION 122 | ANDROID_MINIMUM_BUILD_VERSION 123 | ANDROID_LATEST_BUILD_VERSION 124 | 
instance-role-arn: ${{ secrets.INSTANCE_ROLE_ARN }} 125 | 126 | - name: App Runner URL 127 | run: echo "App runner URL ${{ steps.deploy-apprunner.outputs.service-url }}" 128 | -------------------------------------------------------------------------------- /.github/workflows/deploy-staging.yml: -------------------------------------------------------------------------------- 1 | name: Staging Deployment (AWS App Runner) 2 | 3 | on: 4 | push: 5 | branches: 6 | - develop 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-and-deploy: 11 | runs-on: ubuntu-latest 12 | environment: staging-aws 13 | 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | with: 18 | persist-credentials: false 19 | 20 | - name: Configure AWS credentials 21 | id: aws-credentials 22 | uses: aws-actions/configure-aws-credentials@v4 23 | with: 24 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 25 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 26 | aws-region: ${{ secrets.AWS_REGION }} 27 | 28 | - name: Login to Amazon ECR 29 | id: login-ecr 30 | uses: aws-actions/amazon-ecr-login@v2 31 | 32 | - name: Build, tag, and push image to Amazon ECR 33 | id: build-image 34 | env: 35 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 36 | ECR_REPOSITORY: ansari-backend 37 | IMAGE_TAG: ${{ github.sha }} 38 | run: | 39 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 
40 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 41 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 42 | 43 | - name: Deploy to App Runner 44 | id: deploy-apprunner 45 | uses: awslabs/amazon-app-runner-deploy@main 46 | env: 47 | DEPLOYMENT_TYPE: staging 48 | LOGGING_LEVEL: DEBUG 49 | FRONTEND_URL: ${{ format('{0}{1}', secrets.SSM_ROOT, 'frontend-url') }} 50 | MONGO_URL: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mongo-url') }} 51 | MONGO_DB_NAME: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mongo-db-name') }} 52 | SECRET_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'secret-key') }} 53 | ORIGINS: ${{ format('{0}{1}', secrets.SSM_ROOT, 'origins') }} 54 | 55 | OPENAI_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'openai-api-key') }} 56 | SENDGRID_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sendgrid-api-key') }} 57 | MAILCHIMP_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-api-key') }} 58 | MAILCHIMP_SERVER_PREFIX: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-server-prefix') }} 59 | MAILCHIMP_LIST_ID: ${{ format('{0}{1}', secrets.SSM_ROOT, 'mailchimp-list-id') }} 60 | ANTHROPIC_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'anthropic-api-key') }} 61 | KALEMAT_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'kalemat-api-key') }} 62 | SUNNAH_TOKEN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sunnah-token') }} 63 | VECTARA_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'vectara-api-key') }} 64 | QURAN_DOT_COM_API_KEY: ${{ format('{0}{1}', secrets.SSM_ROOT, 'quran-dot-com-api-key') }} 65 | USUL_API_TOKEN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'usul-api-token') }} 66 | SENTRY_DSN: ${{ format('{0}{1}', secrets.SSM_ROOT, 'sentry-dsn') }} 67 | 68 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-access-token-from-sys-user') }} 69 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-business-phone-number-id') }} 70 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK: ${{ 
format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-verify-token-for-webhook') }} 71 | WHATSAPP_ENABLED: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-enabled') }} 72 | WHATSAPP_API_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-api-version') }} 73 | WHATSAPP_CHAT_RETENTION_HOURS: ${{ format('{0}{1}', secrets.SSM_ROOT, 'whatsapp-chat-retention-hours') }} 74 | 75 | MAINTENANCE_MODE: ${{ format('{0}{1}', secrets.SSM_ROOT, 'maintenance-mode') }} 76 | IOS_MINIMUM_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'ios-minimum-build-version') }} 77 | IOS_LATEST_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'ios-latest-build-version') }} 78 | ANDROID_MINIMUM_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'android-minimum-build-version') }} 79 | ANDROID_LATEST_BUILD_VERSION: ${{ format('{0}{1}', secrets.SSM_ROOT, 'android-latest-build-version') }} 80 | with: 81 | service: ansari-staging-backend 82 | image: ${{ steps.build-image.outputs.image }} 83 | access-role-arn: ${{ secrets.SERVICE_ROLE_ARN }} 84 | region: ${{ secrets.AWS_REGION }} 85 | cpu : 1 86 | memory : 2 87 | port: 8000 88 | wait-for-service-stability-seconds: 1200 89 | copy-env-vars: | 90 | DEPLOYMENT_TYPE 91 | LOGGING_LEVEL 92 | copy-secret-env-vars: | 93 | FRONTEND_URL 94 | MONGO_URL 95 | MONGO_DB_NAME 96 | SECRET_KEY 97 | ORIGINS 98 | 99 | OPENAI_API_KEY 100 | SENDGRID_API_KEY 101 | MAILCHIMP_API_KEY 102 | MAILCHIMP_SERVER_PREFIX 103 | MAILCHIMP_LIST_ID 104 | ANTHROPIC_API_KEY 105 | KALEMAT_API_KEY 106 | SUNNAH_TOKEN 107 | VECTARA_API_KEY 108 | QURAN_DOT_COM_API_KEY 109 | USUL_API_TOKEN 110 | SENTRY_DSN 111 | 112 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER 113 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID 114 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK 115 | WHATSAPP_API_VERSION 116 | WHATSAPP_ENABLED 117 | WHATSAPP_CHAT_RETENTION_HOURS 118 | 119 | MAINTENANCE_MODE 120 | IOS_MINIMUM_BUILD_VERSION 121 | IOS_LATEST_BUILD_VERSION 122 | ANDROID_MINIMUM_BUILD_VERSION 123 | ANDROID_LATEST_BUILD_VERSION 124 | 
instance-role-arn: ${{ secrets.INSTANCE_ROLE_ARN }} 125 | 126 | - name: App Runner URL 127 | run: echo "App runner URL ${{ steps.deploy-apprunner.outputs.service-url }}" 128 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Ansari CICD 5 | 6 | on: 7 | # Trigger the workflow on push or pull request events to the "api-v2" and "main" branches. 8 | push: 9 | branches: [ "api-v2", "main" ] 10 | pull_request: 11 | branches: [ "api-v2", "main" ] 12 | 13 | permissions: 14 | contents: read 15 | 16 | jobs: 17 | ansari-container-job: 18 | 19 | runs-on: ubuntu-latest 20 | env: 21 | # Set up environment variables and secrets required for the workflow. 22 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 23 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 24 | KALEMAT_API_KEY: ${{ secrets.KALEMAT_API_KEY }} 25 | VECTARA_API_KEY: ${{ secrets.VECTARA_API_KEY }} 26 | MAWSUAH_VECTARA_CORPUS_KEY: ${{ secrets.MAWSUAH_VECTARA_CORPUS_KEY }} 27 | TAFSIR_VECTARA_CORPUS_KEY: ${{ secrets.TAFSIR_VECTARA_CORPUS_KEY }} 28 | QURAN_DOT_COM_API_KEY: ${{ secrets.QURAN_DOT_COM_API_KEY }} 29 | WHATSAPP_API_VERSION: ${{ secrets.WHATSAPP_API_VERSION }} 30 | WHATSAPP_BUSINESS_PHONE_NUMBER_ID: ${{ secrets.WHATSAPP_BUSINESS_PHONE_NUMBER_ID }} 31 | WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER: ${{ secrets.WHATSAPP_ACCESS_TOKEN_FROM_SYS_USER }} 32 | WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK: ${{ secrets.WHATSAPP_VERIFY_TOKEN_FOR_WEBHOOK }} 33 | PYTHONPATH: src 34 | 35 | # Use a Python 3.10 container 36 | container: python:3.10 37 | 38 | steps: 39 | 40 | # Check out the repository code. 
41 | - name: Check out repository code 42 | uses: actions/checkout@v4 43 | 44 | # Install the `uv` tool. 45 | - name: Install uv 46 | run: | 47 | pip install uv 48 | # TODO(abdullah): create a venv using uv 49 | 50 | # Install Python dependencies, including `ruff`, `pytest`, `pytest-asyncio`, and `pytest-cov`. 51 | - name: Install dependencies 52 | run: | 53 | uv pip install --system ruff pytest pytest-asyncio pytest-cov 54 | if [ -f requirements.txt ]; then uv pip install --system -r requirements.txt; fi 55 | 56 | 57 | # Lint the code using `ruff` and stop the build if there are lint errors. 58 | - name: Lint with ruff 59 | run: | 60 | # stop the build if there are lint errors 61 | ruff check . --config pyproject.toml --output-format=github 62 | 63 | # Run tests using `pytest` and generate a coverage report. 64 | - name: Test with pytest 65 | env: 66 | SECRET_KEY: "secret" # This is a required field. Setting it to a random value to pass the tests. 67 | run: | 68 | pytest --capture=tee-sys --cov=. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Folders 2 | .conda/ 3 | .venv/ 4 | .vscode/ 5 | abandoned/ 6 | bin/ 7 | datasources/ 8 | diskcache_dir/ 9 | docs/recordings/* 10 | etc/ 11 | example_projects/ 12 | lib/ 13 | logs/ 14 | share/ 15 | src/ansari_backend.egg-info/* 16 | tmp/ 17 | dist/ 18 | 19 | # Files 20 | .__atomic-write* 21 | .env 22 | .history 23 | *.pyc 24 | pyvenv.cfg 25 | zrok.exe 26 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.7.3 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | types_or: [ python, pyi, jupyter ] 9 | args: [ --fix ] 10 | # Run the formatter. 
11 | - id: ruff-format 12 | types_or: [ python, pyi, jupyter ] -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.13 2 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # Ansari Backend - Developer Guide 2 | 3 | ## Branch Management 4 | - Always create new branches from the `develop` branch, NOT from `main` 5 | - Use descriptive branch names that reflect the feature or fix being implemented 6 | - Keep branches focused on a single feature or fix 7 | - Delete branches after they're merged to keep the repository clean 8 | 9 | ## Repository Organization 10 | - Keep the root directory clean and organized 11 | - Place temporary files, debug scripts, and other non-production artifacts in the `tmp/` directory 12 | - The `tmp/` directory is gitignored, making it perfect for development-only files 13 | - Make sure scripts and tools intended for the repository are placed in appropriate subdirectories 14 | 15 | ## Git Commit and PR Guidelines 16 | - Do not include "Generated with Claude Code" or "Co-Authored-By: Claude" in commit messages 17 | - Do not include "Generated with Claude Code" in PR descriptions or anywhere else 18 | - Keep commit messages concise and descriptive 19 | - Use imperative mood in commit messages (e.g., "Add feature" not "Added feature") 20 | - Always run `ruff check` and `ruff format` before committing changes 21 | - Fix all linting errors - clean code is maintainable code 22 | - All PRs should target the `develop` branch, not `main` 23 | 24 | ## Branch Management Details 25 | - Consider a merged branch "done" - do not add new changes to it 26 | - If you have changes after a branch was merged: 27 | - Create a new branch from the latest develop branch 28 | - Apply your new changes there 29 | - 
Create a new PR with a descriptive name 30 | - For related but separate features, use separate branches and PRs 31 | - Delete branches after they're merged to keep the repository clean 32 | 33 | ## Build/Test/Lint Commands 34 | - Install dependencies: `pip install -r requirements.txt` 35 | - Run backend service: `uvicorn main_api:app --reload` 36 | - Run CLI version (interactive): 37 | - Claude: `python src/ansari/app/main_stdio.py -a AnsariClaude` 38 | - OpenAI: `python src/ansari/app/main_stdio.py -a Ansari` 39 | - Run CLI with direct input: 40 | - `python src/ansari/app/main_stdio.py -i "your question here"` 41 | - `python src/ansari/app/main_stdio.py --input "your question here"` 42 | - Run tests: `pytest tests/` 43 | - Run single test: `pytest tests/path/to/test.py::test_function_name` 44 | - Run tests with specific marker: `pytest -m integration` 45 | - Lint code: `ruff check src/` 46 | - Format code: `ruff format src/` 47 | - Package commands: 48 | - Build package: `python -m build` 49 | - Upload to PyPI: `twine upload dist/*` (requires PyPI credentials) 50 | 51 | ## Code Style Guidelines 52 | - **Imports**: Use absolute imports within the `ansari` package 53 | - **Formatting**: Double quotes for strings, 4-space indentation 54 | - **Line length**: 127 characters maximum 55 | - **Types**: Use Python type hints for function parameters and return types 56 | - **Naming**: Use snake_case for variables/functions, PascalCase for classes 57 | - **Error handling**: Use try/except blocks with specific error types 58 | - Prefer clean failures over unpredictable recovery attempts 59 | - Log errors clearly and completely before failing 60 | - Do not attempt to "fix" malformed data that could lead to unexpected behavior 61 | - If recovery is necessary, implement it as a well-tested, dedicated fix rather than ad-hoc patches 62 | - Avoid cascading fallbacks - throw clear errors instead 63 | - **Logging**: Use the logger from `ansari.ansari_logger.get_logger()` 64 | - 
**Documentation**: Add docstrings to functions, especially complex ones 65 | - **Testing**: Create unit tests in `tests/unit/` and integration tests in `tests/integration/` 66 | - **Citations**: 67 | - All search tools must format document data as multilingual JSON using `format_multilingual_data` 68 | - The data format must be valid JSON following the schema in `base_search.py` documentation 69 | - Store properly formatted JSON in the `data` field of document references 70 | - Citation handling should account for both full document citations (valid JSON) and partial citations (plain text) 71 | - **Test-first development**: Always write tests before shipping features 72 | - Write tests that validate both expected behavior and edge cases 73 | - When fixing bugs, first write a test that reproduces the issue 74 | - Run tests frequently during development to catch regressions 75 | - **Code complexity management**: 76 | - Break down complex methods into smaller, focused helpers with clear responsibilities 77 | - Use meaningful method names that describe what the method does, not how it does it 78 | - Add clear comments about the purpose and behavior of complex code 79 | - Extract state machine logic into clearly defined handlers for each state 80 | - Aim for methods that can be understood without scrolling 81 | - **Error handling philosophy**: Prefer clean failures over unpredictable recovery attempts 82 | - Log errors clearly and completely before failing 83 | - Do not attempt to "fix" malformed data that could lead to unexpected behavior 84 | - If recovery is necessary, implement it as a well-tested, dedicated fix rather than ad-hoc patches 85 | 86 | ## Testing Best Practices 87 | - Run tests before committing: `pytest tests/` 88 | - Run specific test categories: `pytest tests/unit/` or `pytest tests/integration/` 89 | - Add tests for new functionality in the appropriate directory 90 | - Use fixture factories to keep tests maintainable 91 | - Test both happy path and 
error conditions 92 | - Keep tests independent (no dependencies between test functions) -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.13 2 | 3 | ENV PYTHONUNBUFFERED=1 4 | 5 | WORKDIR /app/ 6 | 7 | # Install uv 8 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv 9 | COPY --from=ghcr.io/astral-sh/uv:0.5.11 /uv /uvx /bin/ 10 | 11 | # Place executables in the environment at the front of the path 12 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#using-the-environment 13 | ENV PATH="/app/.venv/bin:$PATH" 14 | 15 | # Compile bytecode 16 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#compiling-bytecode 17 | ENV UV_COMPILE_BYTECODE=1 18 | 19 | # uv Cache 20 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#caching 21 | ENV UV_LINK_MODE=copy 22 | 23 | # Install dependencies 24 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers 25 | RUN --mount=type=cache,target=/root/.cache/uv \ 26 | --mount=type=bind,source=uv.lock,target=uv.lock \ 27 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 28 | uv sync --frozen --no-install-project 29 | 30 | ENV PYTHONPATH=/app 31 | 32 | COPY ./pyproject.toml ./uv.lock /app/ 33 | 34 | COPY ./src/ansari /app/ansari 35 | 36 | # Sync the project 37 | # Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers 38 | RUN --mount=type=cache,target=/root/.cache/uv \ 39 | uv sync 40 | 41 | CMD ["fastapi", "run", "--workers", "4", "ansari/app/main_api.py"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Ansari Project 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this 
software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include requirements.txt 4 | include pyproject.toml 5 | include MANIFEST.in 6 | 7 | recursive-include src/ansari/resources * 8 | recursive-include src/ansari/templates * -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: PYTHONPATH=$PYTHONPATH:src gunicorn -w 4 -k uvicorn.workers.UvicornWorker --pythonpath src ansari.app.main_api:app --max-requests 500 2 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/__init__.py -------------------------------------------------------------------------------- /aws/aws-cli.md: -------------------------------------------------------------------------------- 1 | # Create App Runner Service Role 2 | aws iam create-role --role-name CustomAppRunnerServiceRole ` 3 | --assume-role-policy-document file://service-role-policy.json ` 4 | --profile ansari --region us-west-2 5 | 6 | aws iam attach-role-policy ` 7 | --policy-arn arn:aws:iam::aws:policy/service-role/AWSAppRunnerServicePolicyForECRAccess ` 8 | --role-name CustomAppRunnerServiceRole ` 9 | --profile ansari --region us-west-2 10 | 11 | # Create GitHub Actions User 12 | aws iam create-policy ` 13 | --policy-name CustomGitHubActionsPolicy ` 14 | --policy-document file://github-actions-policy.json ` 15 | --profile ansari --region us-west-2 16 | 17 | aws iam create-user ` 18 | --user-name app-runner-github-actions-user ` 19 | --profile ansari --region us-west-2 20 | 21 | aws iam attach-user-policy ` 22 | 
--policy-arn arn:aws:iam::<YOUR_ACCOUNT_ID>:policy/CustomGitHubActionsPolicy
| "ecr:GetDownloadUrlForLayer", 30 | "ecr:BatchGetImage", 31 | "ecr:BatchCheckLayerAvailability", 32 | "ecr:PutImage", 33 | "ecr:InitiateLayerUpload", 34 | "ecr:UploadLayerPart", 35 | "ecr:CompleteLayerUpload", 36 | "ecr:GetAuthorizationToken" 37 | ], 38 | "Resource": "*" 39 | } 40 | ] 41 | } -------------------------------------------------------------------------------- /aws/instance-role-parameters-access.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action":[ 7 | "ssm:DescribeParameters" 8 | ], 9 | "Resource": "*" 10 | }, 11 | { 12 | "Effect": "Allow", 13 | "Action":[ 14 | "ssm:GetParameters", 15 | "ssm:GetParameter", 16 | "ssm:GetParametersByPath" 17 | ], 18 | "Resource": [ 19 | "arn:aws:ssm:::parameter/app-runtime/*" 20 | ] 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /aws/instance-role-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "tasks.apprunner.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /aws/service-role-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "build.apprunner.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /data/mawsuah/README.md: -------------------------------------------------------------------------------- 1 | # Arabic Diacritic Stripping for Word Documents 2 | 3 | ## Overview 4 | 5 | This Python script prepares 
Arabic text in Microsoft Word documents from "The Kuwaiti Encyclopaedia of Islamic Jurisprudence" for use with Vectara Arabic text embedding models. It does this by removing diacritics (tashkeel) from the text. This preprocessing step is essential for optimal semantic indexing and search functionality with Vectara. 6 | 7 | ## Why is this Important? 8 | 9 | Without removing diacritics, Vectara's Arabic text embedding models cannot accurately represent the core meaning of words. This severely hinders the effectiveness of semantic search within the text. 10 | 11 | ## Script Functionality 12 | 13 | The script leverages the following libraries to achieve its task: 14 | 15 | * **textract:** Extracts text content from Microsoft Word (.doc) files. 16 | * **pyarabic.araby:** Provides tools for stripping diacritics from Arabic text. 17 | 18 | ## How to Use 19 | 20 | **1. Install Dependencies** 21 | 22 | Ensure you have the required libraries: 23 | 24 | ```bash 25 | pip install PyArabic==0.6.15 textract==1.6.5 tqdm==4.66.1 26 | ``` 27 | 28 | **2. Obtain the Source Documents** 29 | 30 | * Download "The Kuwaiti Encyclopaedia of Islamic Jurisprudence" Word documents from [this link](https://content.awqaf.gov.kw/BasicPages/2020/9/4fcf6da511ff40cfa278d5873f5ff3ad.rar). 31 | * Unrar the archive. 32 | * Place the extracted Word documents in a dedicated directory. 33 | 34 | **3. Configure the Script** 35 | 36 | * Open the Python script. 37 | * Update the `input_dir` variable with the full path to the directory containing the Word documents. 38 | 39 | **4. Run the Script** 40 | 41 | Execute the script from your terminal: 42 | 43 | ```bash 44 | python strip_tashkeel.py 45 | ``` 46 | 47 | The script will process each Word document (.doc) in your specified directory and create a corresponding text file (.txt) with diacritics removed. The output files will be saved in a new folder called "txt" within the input directory. 
from pathlib import Path, PurePath

import textract
from pyarabic import araby
from tqdm.auto import tqdm

from ansari.ansari_logger import get_logger

logger = get_logger(__name__)


def strip_tashkeel_from_doc(input_file, output_file):
    """Strip Arabic diacritics (tashkeel) from a Word document and save as text.

    Args:
        input_file: Path to the source Microsoft Word (.doc) file.
        output_file: Path of the .txt file to write the stripped text to.
    """
    # textract returns raw bytes; the encyclopaedia documents are UTF-8.
    text = textract.process(input_file).decode("utf-8")

    stripped_text = araby.strip_diacritics(text)

    # Bug fix: the "txt" output directory was never created, so the first
    # write raised FileNotFoundError (the README promises the folder is
    # created by the script). Ensure it exists before writing.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(stripped_text)


input_dir = Path("/path/to/The Kuwaiti Encyclopaedia of Islamic Jurisprudence/word")
# Sibling directory named "txt":
# --> "/path/to/The Kuwaiti Encyclopaedia of Islamic Jurisprudence/txt"
output_dir = input_dir.with_name("txt")

# Process every Word document in the input directory. sorted() makes the
# processing order deterministic across runs.
for input_file in tqdm(sorted(input_dir.glob("*.doc"))):
    if input_file.is_file():  # glob("*.doc") already filters on suffix
        logger.info(f"Processing {input_file.name}...")
        strip_tashkeel_from_doc(
            input_file,
            output_dir / input_file.with_suffix(".txt").name,
        )
        logger.info(f"Done processing {input_file.name}")
The content of a message has changed from being a simple string to a list of polymorphic types. 17 | 18 | Thus we need to consider whether to augment our existing SQL db with supporting a set of polymorphic blocks and then 19 | having a three level hierarchy (threads --> messages --> blocks), or switch to a document database. 20 | 21 | Also, this affects the frontend-backend protocol. The frontend was initially designed for the simple representation. 22 | 23 | But now as we try to render more advanced things, it has become a requirement to migrate to this. 24 | 25 | We've also committed to Claude as the backend for Ansari. 26 | 27 | We've chosen to migrate threads (only threads) to MongoDB. This is the plan for how to do that. 28 | 29 | ## What needs to change 30 | 31 | Threads, Messages and Blocks will be stored in MongoDB. 32 | 33 | Here is the current definition of a thread and messages (from sql/00_create_schema.sql): 34 | 35 | ```sql 36 | -- Threads table - integrated for both web and WhatsApp users 37 | CREATE TABLE threads ( 38 | id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), 39 | name VARCHAR(100), 40 | user_id UUID NOT NULL, 41 | initial_source source_type NOT NULL DEFAULT 'web', 42 | created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 43 | updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 44 | FOREIGN KEY (user_id) REFERENCES users(id) 45 | ); 46 | 47 | -- Messages table - integrated for both web and WhatsApp users 48 | CREATE TABLE messages ( 49 | id SERIAL PRIMARY KEY, 50 | user_id UUID NOT NULL, 51 | thread_id UUID NOT NULL, 52 | role TEXT NOT NULL, 53 | tool_name TEXT, 54 | tool_details JSONB DEFAULT '{}'::jsonb, 55 | ref_list JSONB, 56 | content TEXT NOT NULL, 57 | timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 58 | source source_type NOT NULL DEFAULT 'web', 59 | created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 60 | updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 61 | FOREIGN KEY (user_id) REFERENCES users(id), 62 | FOREIGN KEY (thread_id) REFERENCES 
threads(id) ON DELETE CASCADE 63 | ); 64 | ``` 65 | As you can see it's pretty messy. 66 | 67 | Instead, we will be adopting Anthropic's representation of a message as outlined at: 68 | 69 | https://docs.anthropic.com/en/api/messages 70 | 71 | Not only will we be using this for our storage representation, but we will be storing it 72 | as our wire format and largely passing it unmodified through to the frontend. 73 | 74 | ## How we'll do this 75 | 76 | We will set up a MongoDB serverless instance using mongo cloud. 77 | 78 | We will add a new set of endpoints at /api/v3 for 79 | 80 | - GET /threads # Get All Threads 81 | - POST /threads # Create A Thread 82 | - POST /threads/{thread_id} # Add a message to a thread 83 | - GET /threads/{thread_id} # Get a thread 84 | - DELETE /threads/{thread_id} # Delete a thread 85 | - POST /share/{thread_id} # Snapshot a thread for sharing 86 | - GET /share/{thread_id} # See a shared thread 87 | - POST /threads/{thread_id}/name # Set the name of a thread. 88 | 89 | We need to work out how to structure the FastAPI calls to support this. 90 | 91 | ## Historical threads 92 | 93 | The above methods will still hit the existing database for existing threads. 94 | 95 | But they will return values in the simpler format we used to use. Newly created threads 96 | will be stored only in MongoDB. The above calls will have to do some fusion. 97 | 98 | ## Ansari Classes that need to change 99 | 100 | - main_api.py -- this is one of the messier files in the code base. We should take this as an opportunity to clean this up. How Ansari objects are created will also need to be modified to use the new derived classes below. 101 | - ansari_db.py -- Also messy. We may create a derived class from AnsariDB specifically for supporting the new use cases. We will create a new 102 | - ansari_claude.py -- This changes the way many things work in Ansari Claude. We will create a derived class called AnsariClaudeHybrid. 103 | - Misc tests. 
104 | 105 | ## Long term migration 106 | 107 | Eventually we will move all our efforts to the new service, and we will deprecate /api/v2. 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /docs/structure_of_api_responses/a_fastapi_request_received_from_zrok.json: -------------------------------------------------------------------------------- 1 | { 2 | "scope": { 3 | "type": "http", 4 | "asgi": { 5 | "version": "3.0", 6 | "spec_version": "2.4" 7 | }, 8 | "http_version": "1.1", 9 | "server": ["127.0.0.1", 8000], // When running locally 10 | "client": ["127.0.0.1", 11563], // The port here changes dynamically 11 | "scheme": "http", 12 | "method": "POST", 13 | "root_path": "", 14 | "path": "/api/v2/users/login", 15 | "raw_path": "/api/v2/users/login", 16 | "query_string": "", 17 | "headers": [ 18 | ["host", "localhost:8000"], 19 | ["connection", "keep-alive"], 20 | ["content-length", "83"], 21 | ["sec-ch-ua-platform", "\"Windows\""], 22 | ["user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"], 23 | ["x-mobile-ansari", "ANSARI"], 24 | ["sec-ch-ua", "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\""], 25 | ["content-type", "application/json"], 26 | ["sec-ch-ua-mobile", "?0"], 27 | ["accept", "*/*"], 28 | ["origin", "http://localhost:8081"], 29 | ["sec-fetch-site", "same-site"], 30 | ["sec-fetch-mode", "cors"], 31 | ["sec-fetch-dest", "empty"], 32 | ["referer", "http://localhost:8081/"], 33 | ["accept-encoding", "gzip, deflate, br, zstd"], 34 | ["accept-language", "en-GB,en;q=0.9,ar-EG;q=0.8,ar;q=0.7,en-US;q=0.6"] 35 | ], 36 | "state": {}, 37 | "app": "", 38 | "starlette.exception_handlers": { 39 | "": "", 40 | "": "", 41 | "": "", 42 | "": "" 43 | }, 44 | "router": "", 45 | "endpoint": "", 46 | "path_params": {}, 47 | "route": { 48 | "path": "/api/v2/users/login", 49 | "name": "login_user", 50 | "methods": 
["POST"] 51 | } 52 | }, 53 | "_receive": "", 54 | "_send": ".wrapped_app..sender>", 55 | "_stream_consumed": true, 56 | "_is_disconnected": false, 57 | "_form": null, 58 | // _body's value (and other strings) were actually binary strings (i.e., start with b'...') 59 | "_body": "{\"email\":\"guest_<>@endeavorpal.com\",\"password\":\"<>\",\"guest\":true}", 60 | // this is what actually gets returned when accessing headers property (e.g., `request.headers`) 61 | // Check Starlette's implementation (which FastAPI uses) for details: 62 | // https://github.com/encode/starlette/blob/b68a142a356ede730083347f254e1eae8b5c803e/starlette/requests.py#L12 63 | "_headers": { 64 | "host": "localhost:8000", 65 | "connection": "...", 66 | "...": "..." 67 | // I.e., the value of the `_headers` key is a dictionary of the headers already mentioned above 68 | }, 69 | "_json": { 70 | "email": "guest_<>@endeavorpal.com", 71 | "...": ["..."] 72 | // I.e., the value of the `_json` key is simply the dictionary equivalent of `_body`'s string value 73 | }, 74 | "_query_params": "", 75 | "_cookies": {} 76 | } -------------------------------------------------------------------------------- /docs/structure_of_api_responses/anthropic_api_structure_of_message_history.json: -------------------------------------------------------------------------------- 1 | [ // This is a dummy example, so don't focus on the content of the messages 2 | { 3 | "role": "user", 4 | "content": "Can you search for information about machine learning?" 5 | }, 6 | { 7 | "role": "assistant", 8 | "content": [ 9 | { 10 | "type": "text", 11 | "text": "I'll search for information about machine learning for you." 
12 | }, 13 | { 14 | "type": "tool_use", 15 | "id": "tool-123456", 16 | "name": "search_database", 17 | "input": { 18 | "query": "machine learning", 19 | "limit": 3 20 | } 21 | } 22 | ] 23 | }, 24 | { 25 | "role": "user", 26 | "content": [ 27 | { 28 | "type": "tool_result", 29 | "tool_use_id": "tool-123456", 30 | "content": "{\"results\": [{\"title\": \"Introduction to ML\", \"url\": \"https://example.com/ml1\"}, {\"title\": \"Deep Learning Fundamentals\", \"url\": \"https://example.com/ml2\"}]}" 31 | } 32 | ] 33 | }, 34 | { 35 | "role": "assistant", 36 | "content": [ 37 | { 38 | "type": "text", 39 | "text": "Based on my search, here are some resources about machine learning:\n\n1. Introduction to ML - This resource covers the basics of machine learning algorithms and techniques.\n2. Deep Learning Fundamentals - This focuses on neural networks and deep learning approaches." 40 | } 41 | ] 42 | }, 43 | { 44 | "role": "user", 45 | "content": "Can you explain neural networks in more detail?" 46 | }, 47 | { 48 | "role": "assistant", 49 | "content": [ 50 | { 51 | "type": "text", 52 | "text": "Neural networks are computational models inspired by the human brain. They consist of layers of interconnected nodes or \"neurons\" that process information.\n\nThe basic structure includes:\n\n1. Input layer: Receives initial data\n2. Hidden layers: Perform computations on inputs\n3. Output layer: Produces the final result\n\nEach connection between neurons has a weight that gets adjusted during training[1]." 
53 | } 54 | ] 55 | } 56 | ] -------------------------------------------------------------------------------- /docs/structure_of_api_responses/anthropic_api_structure_of_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "content_block_start | content_block_delta | content_block_stop | message_delta | message_stop", 3 | "content_block": { 4 | "type": "text | tool_use", 5 | "id": "tool-123456", 6 | "name": "search_database" 7 | }, 8 | "delta": { 9 | "text": "Here's information about your query...", 10 | "partial_json": "{ \"query\": \"ma", 11 | "type": "citations_delta", 12 | "citation": { 13 | "start": 23, 14 | "end": 45, 15 | "number": 1, 16 | "text": "according to source X..." 17 | }, 18 | "stop_reason": "end_turn | tool_use" 19 | } 20 | } -------------------------------------------------------------------------------- /docs/structure_of_api_responses/meta_whatsapp_api_structure_of_a_reply_msg_status.json: -------------------------------------------------------------------------------- 1 | { 2 | "object": "whatsapp_business_account", 3 | "entry": [ 4 | { 5 | "id": "<>", 6 | "changes": [ 7 | { 8 | "value": { 9 | "messaging_product": "whatsapp", 10 | "metadata": { 11 | "display_phone_number": "< 15555555555)>>", 12 | "phone_number_id": "<>" 13 | }, 14 | "statuses": [ 15 | { 16 | "id": "wamid.<>", 17 | "status": "<>", 18 | "timestamp": "<>", 19 | "recipient_id": "<>", 20 | "conversation": { 21 | "id": "a hexadecimal representation of a hash or a unique identifier (could be MD5 hash or UUID) for the conversation", 22 | "origin": { 23 | "type": "service" 24 | } 25 | }, 26 | "pricing": { 27 | "billable": "True/False (actually sent as a boolean value, so no quotes)", 28 | "pricing_model": "CBP", 29 | "category": "service" 30 | } 31 | } 32 | ] 33 | }, 34 | "field": "messages" 35 | } 36 | ] 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- 
/docs/structure_of_api_responses/meta_whatsapp_api_structure_of_a_request_sent_using_zrok.json: -------------------------------------------------------------------------------- 1 | { 2 | "scope": { 3 | "type": "http", 4 | "asgi": { 5 | "version": "3.0", 6 | "spec_version": "2.4" 7 | }, 8 | "http_version": "1.1", 9 | "server": ["127.0.0.1", 8000], // When running locally 10 | "client": ["<>", 0], 11 | "scheme": "https", 12 | "method": "POST", 13 | "root_path": "", 14 | "path": "/whatsapp/v1", 15 | "raw_path": "/whatsapp/v1", 16 | "query_string": "", 17 | "headers": [ 18 | ["host", "YOUR_ZROK_SHARE_TOKEN.share.zrok.io"], 19 | ["user-agent", "facebookexternalua"], 20 | ["content-length", "545"], 21 | ["accept", "*/*"], 22 | ["accept-encoding", "deflate, gzip"], 23 | ["content-type", "application/json"], 24 | ["x-amzn-trace-id", "Root=1-674b2035-0f0a8ab27075asce3324dcdb"], // trace value here is fake 25 | ["x-forwarded-for", "173.REST.OF.IP, <>"], 26 | ["x-forwarded-port", "443"], 27 | ["x-forwarded-proto", "https"], 28 | ["x-hub-signature", "sha1=8a3e35da6fb5dfaaf5aaa46c8d059d519e18112d"], // sha1 hash here is fake 29 | ["x-hub-signature-256", "sha256=51d62480d40ffd0f48d1cde1ea47656452fd65b5ac29077fe3c6b4e68d74c827"], // sha256 here is fake 30 | ["x-proxy", "zrok"] 31 | ], 32 | "state": {}, 33 | "app": "", 34 | "starlette.exception_handlers": { 35 | "": "", 36 | "": "", 37 | "": "", 38 | "": "" 39 | }, 40 | "router": "", 41 | "endpoint": "", 42 | "path_params": {}, 43 | "route": { 44 | "path": "/whatsapp/v1", 45 | "name": "main_webhook", 46 | "methods": ["POST"] 47 | } 48 | }, 49 | "_receive": "", 50 | "_send": ".wrapped_app..sender>", 51 | "_stream_consumed": true, 52 | "_is_disconnected": false, 53 | "_form": null, 54 | "_query_params": "", 55 | // this is what actually gets returned when accessing headers property (e.g., `request.headers`) 56 | // Check Starlette's implementation (which FastAPI uses) for details: 57 | // 
https://github.com/encode/starlette/blob/b68a142a356ede730083347f254e1eae8b5c803e/starlette/requests.py#L125 58 | "_headers": { 59 | "host": "...", 60 | "user-agent": "...", 61 | "...": "..." 62 | // I.e., the value of the `_headers` key is a dictionary of the headers already mentioned above 63 | }, 64 | "_cookies": {}, 65 | // _body's value (and other strings) were actually binary strings (i.e., start with b'...') 66 | // Also, it contains content mentioned in other `meta_whatsapp_*.json` files 67 | "_body": "{\"object\":\"whatsapp_business_account\", ...}", 68 | "_json": { 69 | "object": "whatsapp_business_account", 70 | "...": ["..."] 71 | // I.e., the value of the `_json` key is simply the dictionary equivalent of `_body`'s string value 72 | } 73 | } -------------------------------------------------------------------------------- /docs/structure_of_api_responses/openai_api_structure_of_chat_completion_chunk_object.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# This documentation is inferred from OpenAI's official documentation as of 2025-01-01\n", 10 | "\n", 11 | "\n", 12 | "def ModelResponse():\n", 13 | " return (\n", 14 | " \"This object returned from here (in case `stream=True`): \"\n", 15 | " + \"https://platform.openai.com/docs/api-reference/chat/streaming\"\n", 16 | " )\n", 17 | "\n", 18 | "\n", 19 | "def StreamingChoices():\n", 20 | " return (\n", 21 | " \"This object returned under `choices` as mentioned here: \"\n", 22 | " + \"https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices\"\n", 23 | " )\n", 24 | "\n", 25 | "\n", 26 | "def Delta():\n", 27 | " return (\n", 28 | " \"This object returned under `delta` as mentioned under `choices` here: \"\n", 29 | " + 
\"https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices\"\n", 30 | " )\n", 31 | "\n", 32 | "\n", 33 | "def ChatCompletionDeltaToolCall():\n", 34 | " return (\n", 35 | " \"This object returned under `tool_calls` as mentioned in this abstract implementation: \"\n", 36 | " + \"https://github.com/openai/openai-python/blob/main/src/openai/types/chat/chat_completion_message_tool_call.py\"\n", 37 | " )\n", 38 | "\n", 39 | "\n", 40 | "def Function():\n", 41 | " return (\n", 42 | " \"This object returned under `ChatCompletionDeltaToolCall` object as mentioned in this abstract implementation: \"\n", 43 | " + \"https://github.com/openai/openai-python/blob/main/src/openai/types/chat/chat_completion_message_tool_call.py\"\n", 44 | " )" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "ModelResponse(\n", 54 | " id=\"Unique identifier for the response.\",\n", 55 | " choices=[\n", 56 | " StreamingChoices(\n", 57 | " finish_reason=(\n", 58 | " \"The reason the model stopped generating tokens. 
This will be `stop` if model hits a natural stop point or a \"\n", 59 | " + \"provided stop sequence, `length` if the maximum number of tokens specified in the request was reached, \"\n", 60 | " + \"`content_filter` if content was omitted due to a flag from our content filters,\"\n", 61 | " + \"`tool_calls` if the model called a tool, \"\n", 62 | " + \"or function_call (deprecated) if the model called a function.\"\n", 63 | " ),\n", 64 | " index=\"The index of the choice in the list of choices.\",\n", 65 | " delta=Delta(\n", 66 | " refusal=\"Reason for refusal, if any.\",\n", 67 | " content=(\n", 68 | " \"The contents of the chunk message.\"\n", 69 | " + \"SUBTLE NOTE: Will be `None` if the value of the `tool_calls` key is not an empty list `[]`.\"\n", 70 | " ),\n", 71 | " role=\"The role of the author of this message chunk (user, role, or assistant).\",\n", 72 | " function_call=(\n", 73 | " \"###### Deprecated ###### and replaced by `tool_calls`.\"\n", 74 | " + \"The name and arguments of a function that should be called, as generated by the model.\"\n", 75 | " ),\n", 76 | " # SUBTLE NOTE: `tool_calls` value will be an empty list if the model deduced that no tool calls are needed\n", 77 | " tool_calls=[\n", 78 | " ChatCompletionDeltaToolCall(\n", 79 | " id=\"The ID of the tool call.\",\n", 80 | " type=\"The type of the tool. 
As of 2024-09-01, only `function` is supported.\",\n", 81 | " function=Function(\n", 82 | " name=\"Name of the function being called\",\n", 83 | " arguments=(\n", 84 | " \"The arguments to call the function with, as generated by the model in JSON format.\"\n", 85 | " + \"Note that the model does not always generate valid JSON, \"\n", 86 | " + \"and may hallucinate parameters not defined by your function schema.\"\n", 87 | " + \"Validate the arguments in your code before calling your function.\"\n", 88 | " ),\n", 89 | " ),\n", 90 | " index=\"Index of the tool call in the response.\",\n", 91 | " )\n", 92 | " ],\n", 93 | " ),\n", 94 | " logprobs=\"Log probabilities of the tokens, if available.\",\n", 95 | " )\n", 96 | " ],\n", 97 | " created=\"Timestamp when the response was created.\",\n", 98 | " model=\"Name of the model used to generate the response.\",\n", 99 | " object=\"Type of the object (e.g., chat.completion.chunk).\",\n", 100 | " system_fingerprint=\"Unique fingerprint of the system.\",\n", 101 | ")" 102 | ] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": ".venv", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "name": "python", 113 | "version": "3.13.2" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/favicon.ico -------------------------------------------------------------------------------- /migrate_database.py: -------------------------------------------------------------------------------- 1 | from bson import ObjectId 2 | import pymongo 3 | from bson import json_util 4 | 5 | from ansari.ansari_db_sql import AnsariSQLDB 6 | from ansari.ansari_logger import get_logger 7 | 
from ansari.config import get_settings 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | def migrate_database(): 13 | try: 14 | settings = get_settings() 15 | sql_db = AnsariSQLDB(settings) 16 | db_url = settings.MONGO_URL 17 | db_name = settings.MONGO_DB_NAME 18 | mongo_connection = pymongo.MongoClient(db_url) 19 | mongo_db = mongo_connection[db_name] 20 | 21 | users_collection = mongo_db["users"] 22 | threads_collection = mongo_db["threads"] 23 | messages_collection = mongo_db["messages"] 24 | feedback_collection = mongo_db["feedback"] 25 | 26 | # Step 1: Process feedback 27 | logger.info("Step 1: Process feedback documents") 28 | logger.info(f"Estimated document count: {feedback_collection.estimated_document_count()}") 29 | 30 | while True: 31 | feedbacks = list(feedback_collection.find({"migrated": {"$exists": False}}).limit(1000)) 32 | if len(feedbacks) == 0: 33 | break 34 | 35 | feedback_operations = [] 36 | message_operations = [] 37 | for i, feedback in enumerate(feedbacks, 1): 38 | logger.info(f"{i} Processing Feedback: {str(feedback["_id"])}") 39 | message_id = feedback.get("original_message_id") 40 | if message_id: 41 | message = messages_collection.find_one({"original_id": message_id}) 42 | if message: 43 | feedback_operations.append(pymongo.UpdateOne( 44 | {"_id": feedback["_id"]}, 45 | {"$set": {"migrated": True}} 46 | )) 47 | 48 | message_operations.append(pymongo.UpdateOne( 49 | {"original_id": message_id}, 50 | {"$set": {"feedback": { 51 | "class": feedback.get("class"), 52 | "comment": feedback.get("comment"), 53 | "created_at": feedback.get("created_at"), 54 | "updated_at": feedback.get("updated_at") 55 | }}} 56 | )) 57 | 58 | logger.info("Saving changes...") 59 | feedback_results = feedback_collection.bulk_write(feedback_operations) 60 | logger.info(f"Feedback results: {feedback_results}") 61 | 62 | message_results = messages_collection.bulk_write(message_operations) 63 | logger.info(f"Message results: {message_results}") 64 | 65 | 
logger.info("Step 1: Process feedback documents - Done\n\n") 66 | 67 | # Step 2: Process messages 68 | logger.info("Step 2: Process message documents") 69 | logger.info(f"Estimated document count: {messages_collection.estimated_document_count()}") 70 | while True: 71 | messages = list(messages_collection.find({"migrated": {"$exists": False}}).limit(1000)) 72 | if len(messages) == 0: 73 | break 74 | 75 | operations = [] 76 | for i, message in enumerate(messages, 1): 77 | logger.info(f"{i} Processing Message: {str(message["_id"])}") 78 | query = {"_id": message["_id"]} 79 | 80 | original_message = ( 81 | message.get("original_id"), 82 | message.get("role"), 83 | message.get("content"), 84 | message.get("tool_name"), 85 | message.get("tool_details"), 86 | message.get("ref_list"), 87 | ) 88 | converted_message = sql_db.convert_message_llm(original_message)[0] 89 | 90 | updated_message = { 91 | "role": converted_message["role"], 92 | "content": converted_message["content"], 93 | "id": str(ObjectId()), 94 | "source": message["source"], 95 | "created_at": message["created_at"], 96 | "original_id": message["original_id"], 97 | "original_thread_id": message["original_thread_id"], 98 | "original_message": json_util.dumps(message), 99 | "migrated": True, 100 | } 101 | 102 | operations.append(pymongo.ReplaceOne(query, updated_message)) 103 | 104 | logger.info("Saving changes...") 105 | results = messages_collection.bulk_write(operations) 106 | logger.info(f"Message results: {results}") 107 | 108 | logger.info("Step 2: Process message documents - Done\n\n") 109 | 110 | # Step 3: Embed messages in threads 111 | logger.info("Step 3: Process thread documents") 112 | logger.info(f"Estimated document count: {threads_collection.estimated_document_count()}") 113 | 114 | while True: 115 | threads = list(threads_collection.find({"migrated": {"$exists": False}}).limit(1000)) 116 | if len(threads) == 0: 117 | break 118 | 119 | operations = [] 120 | for i, thread in enumerate(threads, 1): 
121 | logger.info(f"{i} Migrating: {str(thread["_id"])}") 122 | query = {"_id": thread["_id"]} 123 | 124 | if thread.get("original_user_id") is None: 125 | logger.warning(f"Thread {str(thread['_id'])} does not have an original user ID.") 126 | continue 127 | 128 | user = users_collection.find_one({"original_id": thread["original_user_id"]}) 129 | messages = list(messages_collection.find({"original_thread_id": thread["original_id"]}) 130 | .sort("created_at", pymongo.ASCENDING)) 131 | 132 | thread_messages = [] 133 | for message in messages: 134 | if message.get("role") == "tool" or message.get("role") == "function": 135 | continue 136 | 137 | content = message.get("content") 138 | if isinstance(content, list) and any(block.get("type") == "tool_use" for block in content): 139 | continue 140 | 141 | if isinstance(content, list) and any(block.get("type") == "tool_result" for block in content): 142 | continue 143 | 144 | del message["_id"] 145 | del message["original_id"] 146 | del message["original_thread_id"] 147 | del message["migrated"] 148 | 149 | thread_messages.append(message) 150 | 151 | set_values = { 152 | "migrated": True, 153 | "user_id": user["_id"], 154 | "messages": thread_messages 155 | } 156 | 157 | operations.append(pymongo.UpdateOne(query, {"$set": set_values})) 158 | 159 | logger.info("Saving changes...") 160 | results = threads_collection.bulk_write(operations) 161 | logger.info(f"Thread results: {results}") 162 | 163 | logger.info("Step 3: Process thread documents - Done\n\n") 164 | 165 | except (Exception) as error: 166 | logger.error(f"Error: {error}") 167 | finally: 168 | if mongo_connection is not None: 169 | mongo_connection.close() 170 | 171 | 172 | migrate_database() 173 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = 
"setuptools.build_meta" 4 | 5 | [project] 6 | name = "ansari-backend" 7 | version = "0.1.0" 8 | description = "Ansari is an AI assistant to enhance understanding and practice of Islam." 9 | authors = [ 10 | { name = "Ansari Project", email = "feedback@ansari.chat" } 11 | ] 12 | readme = "README.md" 13 | requires-python = ">=3.13" 14 | license = {text = "MIT"} 15 | classifiers = [ 16 | "Development Status :: 4 - Beta", 17 | "Intended Audience :: Developers", 18 | "License :: OSI Approved :: MIT License", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.8", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Topic :: Software Development :: Libraries", 24 | ] 25 | dependencies = [ 26 | "anthropic", 27 | "bcrypt", 28 | "build", 29 | "discord.py", 30 | "diskcache", 31 | "email-validator", 32 | "fastapi[standard]", 33 | "gunicorn", 34 | "jinja2", 35 | "langdetect", 36 | "litellm", 37 | "loguru", 38 | "pymongo", 39 | "openai", 40 | "pandas", 41 | "psycopg2-binary", 42 | "pydantic_settings", 43 | "pyjwt", 44 | "pytest-asyncio", 45 | "pytest-mock", 46 | "pytest-xdist", 47 | "rich", 48 | "sendgrid", 49 | "sentry-sdk[fastapi]", 50 | "setuptools", 51 | "tenacity", 52 | "tiktoken", 53 | "typer", 54 | "uvicorn", 55 | "wheel", 56 | "zxcvbn", 57 | ] 58 | 59 | [project.urls] 60 | Homepage = "https://github.com/ansari-project/ansari-backend" 61 | Documentation = "https://github.com/ansari-project/ansari-backend" 62 | Source = "https://github.com/ansari-project/ansari-backend" 63 | Tracker = "https://github.com/ansari-project/ansari-backend/issues" 64 | 65 | [project.scripts] 66 | ansari = "ansari.app.main_stdio:main" 67 | 68 | [tool.ruff] 69 | line-length = 127 70 | indent-width = 4 71 | target-version = "py310" 72 | lint.select = ["E", "F"] 73 | lint.fixable = ["ALL"] 74 | lint.ignore = [ 75 | "D100", # ignore missing docs 76 | "E402", # false positives for local imports 77 | "TRY003", # external 
messages in exceptions are too verbose 78 | ] 79 | lint.mccabe.max-complexity = 10 80 | 81 | [tool.ruff.format] 82 | # Like Black, use double quotes for strings. 83 | quote-style = "double" 84 | # Like Black, indent with spaces, rather than tabs. 85 | indent-style = "space" 86 | # Like Black, respect magic trailing commas. 87 | skip-magic-trailing-comma = false 88 | # Like Black, automatically detect the appropriate line ending. 89 | line-ending = "auto" 90 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | pythonpath = src 3 | asyncio_mode = strict 4 | asyncio_default_fixture_loop_scope = function 5 | markers = 6 | asyncio: mark a test as asyncio 7 | integration: mark a test as an integration test 8 | filterwarnings = 9 | ignore::DeprecationWarning:pydantic.*: 10 | ignore::UserWarning:pydantic.*: 11 | ignore::Warning:pydantic.*: 12 | ignore:Valid config keys have changed in V2:UserWarning 13 | ignore:Support for class-based `config` is deprecated:UserWarning -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | export GRADIO_SERVER_NAME=0.0.0.0 2 | export GRADIO_SERVER_PORT="$PORT" 3 | 4 | -------------------------------------------------------------------------------- /src/ansari/__init__.py: -------------------------------------------------------------------------------- 1 | # This file marks the directory as a Python package. 2 | from .config import Settings, get_settings 3 | from . 
import ansari_logger 4 | 5 | __all__ = ["Settings", "get_settings", "ansari_logger"] 6 | -------------------------------------------------------------------------------- /src/ansari/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .ansari import Ansari 2 | from .ansari_workflow import AnsariWorkflow 3 | from .ansari_claude import AnsariClaude 4 | 5 | __all__ = ["Ansari", "AnsariWorkflow", "AnsariClaude"] 6 | -------------------------------------------------------------------------------- /src/ansari/ansari_logger.py: -------------------------------------------------------------------------------- 1 | # This file provides a standard Python logging instance for the caller file (e.g., main_api.py, etc.). 2 | 3 | import os 4 | import logging 5 | import sys 6 | 7 | from ansari.config import get_settings 8 | 9 | 10 | def get_logger(name: str) -> logging.Logger: 11 | """Creates and returns a logger instance for the specified module. 12 | 13 | Args: 14 | name (str): The name of the module requesting the logger (typically __name__). 15 | 16 | Returns: 17 | logging.Logger: Configured logger instance. 
18 | """ 19 | logging_level = get_settings().LOGGING_LEVEL.upper() 20 | 21 | # Create a logger 22 | logger = logging.getLogger(name) 23 | 24 | # Clear any existing handlers to avoid duplicate logs 25 | if logger.handlers: 26 | logger.handlers.clear() 27 | 28 | # Set the logging level 29 | logger.setLevel(logging_level) 30 | 31 | # Create console handler 32 | console_handler = logging.StreamHandler(sys.stdout) 33 | console_handler.setLevel(logging_level) 34 | 35 | # Create formatter 36 | formatter = logging.Formatter( 37 | "%(asctime)s | %(levelname)s | %(name)s:%(funcName)s:%(lineno)d | %(message)s", 38 | datefmt="%Y-%m-%d %H:%M:%S", 39 | ) 40 | 41 | # Add formatter to handler 42 | console_handler.setFormatter(formatter) 43 | 44 | # Add handler to logger 45 | logger.addHandler(console_handler) 46 | 47 | # Add file handler if DEV_MODE is enabled 48 | if get_settings().DEV_MODE: 49 | # Ensure logs directory exists 50 | log_dir = os.path.join(os.getcwd(), "logs") 51 | os.makedirs(log_dir, exist_ok=True) 52 | 53 | log_file = os.path.join(log_dir, f"{name}.log") 54 | # Using standard FileHandler instead of TimedRotatingFileHandler 55 | # Add encoding='utf-8' to handle Unicode characters like emojis 56 | file_handler = logging.FileHandler( 57 | filename=log_file, 58 | mode="a", # Append mode 59 | encoding="utf-8", # Use UTF-8 encoding to support Unicode characters 60 | ) 61 | file_handler.setLevel(logging_level) 62 | file_handler.setFormatter(formatter) 63 | logger.addHandler(file_handler) 64 | 65 | return logger 66 | -------------------------------------------------------------------------------- /src/ansari/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/src/ansari/app/__init__.py -------------------------------------------------------------------------------- /src/ansari/app/main_discord.py: 
-------------------------------------------------------------------------------- 1 | # This file aims to process input from Discord and generate answers back to Discord using a specified LLM model. 2 | 3 | from presenters.discord_presenter import DiscordPresenter 4 | 5 | from ansari.agents import Ansari 6 | from ansari.config import get_settings 7 | 8 | # This work involves 3 agents, with Ansari as primary. 9 | agent = Ansari(get_settings()) 10 | presenter = DiscordPresenter( 11 | agent, 12 | token=get_settings().DISCORD_TOKEN.get_secret_value(), 13 | ) 14 | 15 | # This starts the UI. 16 | presenter.present() 17 | -------------------------------------------------------------------------------- /src/ansari/app/main_file.py: -------------------------------------------------------------------------------- 1 | # This file aims to process an input file and generate answers using a specified LLM model. 2 | # Steps: 3 | # 1. Import necessary modules and configure logging. 4 | # 2. Define the main function with certain parameters. 5 | # 3. Retrieve settings from `config.py`. 6 | # 4. Update settings with the provided system message file and model if specified. 7 | # 5. Depending on the ayah_mode flag, initialize the appropriate presenter (AyahFilePresenter or FilePresenter). 8 | # 6. Call the present method of the presenter to process the input file and generate the output file. 9 | # 7. Use typer.run to execute the main function when the script is run directly. 
10 | # (Read more about typer here: https://blog.stackademic.com/typer-the-easiest-way-to-build-command-line-tools-1f3effa569d1) 11 | 12 | import logging 13 | from typing import Optional 14 | 15 | import typer 16 | 17 | from ansari.agents import Ansari 18 | from ansari.config import get_settings 19 | from ansari.presenters.ayah_file_presenter import AyahFilePresenter 20 | from ansari.presenters.file_presenter import FilePresenter 21 | 22 | logging.basicConfig( 23 | level=logging.DEBUG, 24 | ) 25 | 26 | 27 | def main( 28 | input_file: str, 29 | output_file: str, 30 | ayah_mode: bool = typer.Option( 31 | False, 32 | "--ayah-mode", 33 | "-a", 34 | help="Process input as ayah questions (CSV format: surah:ayah,question)", 35 | ), 36 | use_query_generation: bool = typer.Option( 37 | True, 38 | "--use-query-generation", 39 | "-q", 40 | help="Use query generation step in ayah mode", 41 | ), 42 | answer_column: str = typer.Option( 43 | "answer", 44 | "--answer-column", 45 | "-c", 46 | help="Name of the column to store answers in the output CSV (ayah mode only)", 47 | ), 48 | system_message: Optional[str] = typer.Option( 49 | None, 50 | "--system-message", 51 | "-s", 52 | help="The name of the system message file. If not provided, uses default.", 53 | ), 54 | model: str = typer.Option( 55 | "gpt-4", 56 | "--model", 57 | "-m", 58 | help="The LLM model to use (e.g., gpt-4, gpt-3.5-turbo)", 59 | ), 60 | ): 61 | """ 62 | Process input file and generate answers 63 | 64 | Args: 65 | input_file: Path to input file 66 | output_file: Path to output file 67 | ayah_mode: Whether to process in ayah mode 68 | use_query_generation: Whether to use query generation 69 | answer_column: Name of column to store answers 70 | system_message: The name of the system message file. If not provided, uses default. 
71 | model: The LLM model to use for generating answers 72 | """ 73 | settings = get_settings() 74 | 75 | if system_message: 76 | settings.AYAH_SYSTEM_PROMPT_FILE_NAME = system_message 77 | 78 | # Set the model in settings 79 | settings.MODEL = model 80 | 81 | if ayah_mode: 82 | presenter = AyahFilePresenter( 83 | settings=settings, use_query_generation=use_query_generation, answer_column=answer_column 84 | ) 85 | else: 86 | ansari = Ansari(settings) 87 | presenter = FilePresenter(ansari) 88 | 89 | presenter.present(input_file, output_file) 90 | 91 | 92 | if __name__ == "__main__": 93 | typer.run(main) 94 | -------------------------------------------------------------------------------- /src/ansari/app/main_stdio.py: -------------------------------------------------------------------------------- 1 | # This file aims to process input from standard input and generate answers using a specified LLM model. 2 | 3 | import logging 4 | import typer 5 | from typing import Optional 6 | 7 | 8 | from ansari.agents import Ansari 9 | from ansari.agents.ansari_claude import AnsariClaude 10 | from ansari.ansari_logger import get_logger 11 | from ansari.config import get_settings 12 | from ansari.presenters.stdio_presenter import StdioPresenter 13 | 14 | logger = get_logger(__name__) 15 | 16 | app = typer.Typer() 17 | 18 | 19 | @app.command() 20 | def main( 21 | agent: str = typer.Option("Ansari", "--agent", "-a", help="Agent to use (AnsariClaude or Ansari)"), 22 | log_level: str = typer.Option( 23 | "INFO", "--log-level", "-l", help="Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)", case_sensitive=False 24 | ), 25 | input: Optional[str] = typer.Option( 26 | None, "--input", "-i", help="Input to send to the agent. If not provided, starts interactive mode." 27 | ), 28 | ): 29 | """ 30 | Run the Ansari agent. If input is provided, process it and exit. 31 | If no input is provided, start interactive mode. 
32 | """ 33 | # Convert log level string to logging constant 34 | numeric_level = getattr(logging, log_level.upper(), None) 35 | if not isinstance(numeric_level, int): 36 | raise ValueError(f"Invalid log level: {log_level}") 37 | 38 | logging.basicConfig(level=numeric_level) 39 | settings = get_settings() 40 | 41 | if agent == "AnsariClaude": 42 | agent_instance = AnsariClaude(settings) 43 | elif agent == "Ansari": 44 | agent_instance = Ansari(settings) 45 | else: 46 | raise ValueError(f"Unknown agent type: {agent}. Must be one of: AnsariClaude, Ansari") 47 | 48 | # Print greeting 49 | print(agent_instance.greet()) 50 | 51 | if input: 52 | # Process single input and exit 53 | result = agent_instance.process_input(input) 54 | # Handle the result which could be either a generator or other iterable 55 | if result: 56 | for word in result: 57 | if word is not None: 58 | print(word, end="", flush=True) 59 | print() 60 | else: 61 | # No input provided, start interactive mode 62 | presenter = StdioPresenter(agent_instance, skip_greeting=True) 63 | presenter.present() 64 | 65 | 66 | if __name__ == "__main__": 67 | logger.debug("Starting the Ansari chatbot in terminal (stdio)...") 68 | app() 69 | -------------------------------------------------------------------------------- /src/ansari/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/src/ansari/cli/__init__.py -------------------------------------------------------------------------------- /src/ansari/cli/use_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Command line tool to print search results from various Ansari search tools. 4 | This tool takes a query and search tool name, and pretty prints the returned value. 
5 | """ 6 | 7 | import json 8 | from enum import Enum 9 | from typing import Any 10 | 11 | import typer 12 | from rich.console import Console 13 | from rich.panel import Panel 14 | 15 | from ansari.config import get_settings 16 | from ansari.tools.search_hadith import SearchHadith 17 | from ansari.tools.search_mawsuah import SearchMawsuah 18 | from ansari.tools.search_quran import SearchQuran 19 | from ansari.tools.search_tafsir_encyc import SearchTafsirEncyc 20 | 21 | # Remove usul import 22 | # Remove vectara import 23 | from ansari.ansari_logger import get_logger 24 | 25 | logger = get_logger(__name__) 26 | settings = get_settings() 27 | console = Console() 28 | app = typer.Typer(help="Ansari search tools result printer") 29 | 30 | 31 | class OutputFormat(str, Enum): 32 | """Output format options for search results.""" 33 | 34 | RAW = "raw" 35 | STRING = "string" 36 | LIST = "list" 37 | FORMATTED = "formatted" 38 | REF_LIST = "ref_list" 39 | 40 | 41 | def format_json(data: Any) -> str: 42 | """Format data as indented JSON for better readability.""" 43 | return json.dumps(data, ensure_ascii=False, indent=2) 44 | 45 | 46 | def pretty_print_results(results: Any, output_format: str) -> None: 47 | """Pretty print results based on the specified format.""" 48 | if not results: 49 | console.print("[bold red]No results found.[/bold red]") 50 | return 51 | 52 | if output_format == OutputFormat.RAW: 53 | console.print_json(json.dumps(results)) 54 | elif output_format == OutputFormat.STRING: 55 | if isinstance(results, str): 56 | console.print(results) 57 | else: 58 | console.print_json(json.dumps(results)) 59 | elif output_format == OutputFormat.LIST: 60 | if isinstance(results, list): 61 | for i, item in enumerate(results, 1): 62 | console.print(Panel(f"{item}", title=f"Result {i}", border_style="blue")) 63 | console.print() 64 | else: 65 | console.print_json(json.dumps(results)) 66 | elif output_format == OutputFormat.REF_LIST: 67 | if isinstance(results, list): 68 | # 
Pretty print the entire ref_list as formatted JSON 69 | console.print_json(format_json(results)) 70 | else: 71 | console.print_json(json.dumps(results)) 72 | else: 73 | if isinstance(results, dict) and "tool_result" in results: 74 | console.print(Panel(format_json(results["tool_result"]), title="Tool Result", border_style="green")) 75 | if "response_message" in results: 76 | console.print(Panel(results["response_message"], title="Response Message", border_style="yellow")) 77 | else: 78 | console.print(results) 79 | 80 | 81 | def create_search_tool(tool_name: str) -> Any: 82 | """Create and return the appropriate search tool instance based on the tool name.""" 83 | tools = { 84 | "hadith": lambda: SearchHadith( 85 | kalimat_api_key=settings.KALEMAT_API_KEY.get_secret_value() if hasattr(settings, "KALEMAT_API_KEY") else "" 86 | ), 87 | "mawsuah": lambda: SearchMawsuah( 88 | vectara_api_key=settings.VECTARA_API_KEY.get_secret_value(), vectara_corpus_key=settings.MAWSUAH_CORPUS_ID 89 | ), 90 | "quran": lambda: SearchQuran( 91 | kalimat_api_key=settings.KALEMAT_API_KEY.get_secret_value() if hasattr(settings, "KALEMAT_API_KEY") else "" 92 | ), 93 | "tafsir": lambda: SearchTafsirEncyc(api_token=settings.USUL_API_TOKEN.get_secret_value()), 94 | } 95 | 96 | if tool_name.lower() not in tools: 97 | available_tools = ", ".join(tools.keys()) 98 | console.print(f"[bold red]Error:[/bold red] Unknown tool '{tool_name}'") 99 | console.print(f"Available tools: {available_tools}") 100 | raise typer.Exit(code=1) 101 | 102 | return tools[tool_name.lower()]() 103 | 104 | 105 | @app.command() 106 | def main( 107 | query: str = typer.Argument(..., help="The search query to run"), 108 | tool_name: str = typer.Option(..., "--tool", "-t", help="The search tool to use"), 109 | output_format: OutputFormat = typer.Option(OutputFormat.FORMATTED, "--format", "-f", help="Output format"), 110 | ): 111 | """ 112 | Search using the specified tool and print the results. 
113 | """ 114 | try: 115 | with console.status(f"Searching for '{query}' using {tool_name}..."): 116 | # Create the appropriate search tool 117 | search_tool = create_search_tool(tool_name) 118 | 119 | # Run the search 120 | raw_results = search_tool.run(query) 121 | 122 | # Format based on the specified output format 123 | if output_format == OutputFormat.RAW: 124 | results = raw_results 125 | elif output_format == OutputFormat.STRING: 126 | if hasattr(search_tool, "run_as_string"): 127 | results = search_tool.run_as_string(query) 128 | else: 129 | # Fallback for tools without run_as_string method 130 | tool_result = search_tool.format_as_tool_result(raw_results) 131 | results = format_json(tool_result) 132 | elif output_format == OutputFormat.LIST: 133 | if hasattr(search_tool, "format_as_list"): 134 | results = search_tool.format_as_list(raw_results) 135 | else: 136 | results = ["Format not supported for this tool"] 137 | elif output_format == OutputFormat.REF_LIST: 138 | if hasattr(search_tool, "format_as_ref_list"): 139 | results = search_tool.format_as_ref_list(raw_results) 140 | else: 141 | results = ["Format not supported for this tool"] 142 | else: # formatted 143 | tool_result = search_tool.format_as_tool_result(raw_results) 144 | response_message = "" 145 | if hasattr(search_tool, "format_tool_response"): 146 | response_message = search_tool.format_tool_response(raw_results) 147 | 148 | results = {"tool_result": tool_result, "response_message": response_message} 149 | 150 | # Print the results 151 | pretty_print_results(results, output_format) 152 | 153 | except Exception as e: 154 | logger.exception(f"Error running search: {e}") 155 | console.print(f"[bold red]Error:[/bold red] {e}") 156 | raise typer.Exit(code=1) 157 | 158 | 159 | if __name__ == "__main__": 160 | app() 161 | -------------------------------------------------------------------------------- /src/ansari/examples/test_citations.py: 
-------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | import os 3 | import anthropic 4 | from ansari.tools.search_quran import SearchQuran 5 | import sys 6 | 7 | 8 | def create_quran_document(ayah: dict) -> dict: 9 | """Convert an ayah to a document format that supports citations.""" 10 | return { 11 | "type": "document", 12 | "source": { 13 | "type": "content", 14 | "content": [ 15 | {"type": "text", "text": f"Arabic Text: {ayah['text']}"}, 16 | {"type": "text", "text": f"English Text: {ayah['en_text']}"}, 17 | ], 18 | }, 19 | "title": f"Quran {ayah['id']}", 20 | "citations": {"enabled": True}, 21 | } 22 | 23 | 24 | def get_prompt(query: str) -> str: 25 | return f"""Based on these Quranic verses, please explain the Islamic teachings about {query}. 26 | 27 | Tell me how many verses are below, and how many you actually used. 28 | """ 29 | 30 | 31 | def format_response_with_citations(response) -> str: 32 | """Format the response with numbered citations and a references section.""" 33 | citations = [] 34 | formatted_text = "" 35 | 36 | # First pass: collect citations and build citation map 37 | citation_map = {} # Maps doc_title to citation number 38 | for content in response.content: 39 | if content.type == "text" and hasattr(content, "citations") and content.citations: 40 | for citation in content.citations: 41 | doc_title = citation.document_title 42 | if doc_title not in citation_map: 43 | text = ( 44 | citation.cited_text.split("English Text:", 1)[1].strip() 45 | if "English Text:" in citation.cited_text 46 | else citation.cited_text.strip() 47 | ) 48 | citations.append({"doc_title": doc_title, "text": text}) 49 | citation_map[doc_title] = len(citations) 50 | 51 | # Second pass: format text with citation numbers 52 | for content in response.content: 53 | if content.type == "text": 54 | text = content.text 55 | if hasattr(content, "citations") and content.citations: 56 | # Add citation numbers after 
the text block 57 | citation_nums = [] 58 | for citation in content.citations: 59 | ref_num = citation_map[citation.document_title] 60 | citation_nums.append(str(ref_num)) 61 | text += f" [{', '.join(citation_nums)}]" 62 | formatted_text += text 63 | 64 | # Add references section 65 | if citations: 66 | formatted_text += "\n\nReferences:\n" 67 | for i, citation in enumerate(citations, 1): 68 | formatted_text += f"[{i}] {citation['doc_title']}: {citation['text']}\n\n" 69 | 70 | return formatted_text 71 | 72 | 73 | def get_request_params(query: str) -> dict: 74 | load_dotenv() 75 | 76 | # Get API keys 77 | kalemat_api_key = os.getenv("KALEMAT_API_KEY") 78 | if not kalemat_api_key: 79 | raise ValueError("KALEMAT_API_KEY environment variable not set") 80 | 81 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 82 | if not anthropic_api_key: 83 | raise ValueError("ANTHROPIC_API_KEY environment variable not set") 84 | 85 | # Initialize clients 86 | quran_search = SearchQuran(kalemat_api_key) 87 | 88 | # Search for relevant ayahs 89 | search_results = quran_search.run(query, num_results=15) 90 | documents = [create_quran_document(ayah) for ayah in search_results] 91 | 92 | # Create message with documents and prompt 93 | return { 94 | "model": "claude-3-5-sonnet-20241022", 95 | "max_tokens": 4096, 96 | "messages": [{"role": "user", "content": documents}, {"role": "user", "content": get_prompt(query)}], 97 | } 98 | 99 | 100 | if __name__ == "__main__": 101 | if len(sys.argv) < 2: 102 | print("Usage: python3 test_citations.py ") 103 | sys.exit(1) 104 | 105 | query = " ".join(sys.argv[1:]) 106 | client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) 107 | response = client.messages.create(**get_request_params(query)) 108 | print("Response:") 109 | print(format_response_with_citations(response)) 110 | -------------------------------------------------------------------------------- /src/ansari/examples/test_search_mawsuah.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Test script for the SearchMawsuah class. 3 | This script verifies that the SearchMawsuah class correctly inherits from SearchVectara 4 | and that its translation and formatting methods work as expected. 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | # Add the src directory to the path so we can import the modules 11 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../"))) 12 | 13 | from src.ansari.tools.search_mawsuah import SearchMawsuah 14 | from src.ansari.config import get_settings 15 | 16 | 17 | def main(): 18 | """Test the SearchMawsuah class.""" 19 | # Get settings 20 | settings = get_settings() 21 | 22 | # Create a SearchMawsuah instance 23 | sm = SearchMawsuah( 24 | vectara_api_key=settings.VECTARA_API_KEY.get_secret_value(), 25 | vectara_corpus_key=settings.MAWSUAH_VECTARA_CORPUS_KEY, 26 | ) 27 | 28 | # Test basic search 29 | print("Testing basic search...") 30 | query = "prayer" 31 | results = sm.run(query, num_results=2) 32 | 33 | # Check if results are in the expected format 34 | print(f"Results type: {type(results)}") 35 | 36 | # If search_results is present, the parent class's API format is being used correctly 37 | if "search_results" in results: 38 | print(f"Found {len(results['search_results'])} search results") 39 | 40 | # Test format_as_list 41 | print("\nTesting format_as_list...") 42 | text_results = sm.format_as_list(results) 43 | print(f"format_as_list produced {len(text_results)} results") 44 | 45 | # Test format_as_ref_list 46 | print("\nTesting format_as_ref_list...") 47 | ref_list = sm.format_as_ref_list(results) 48 | print(f"format_as_ref_list produced {len(ref_list)} documents") 49 | 50 | # Check if translation worked in ref_list 51 | if ref_list and not isinstance(ref_list[0], str): 52 | text = ref_list[0]["source"]["data"] 53 | print("First document text includes translation:", "English:" in text) 54 | 55 
| # Test run_as_string 56 | print("\nTesting run_as_string...") 57 | string_results = sm.run_as_string(query, num_results=2) 58 | print(f"run_as_string output length: {len(string_results)}") 59 | print("run_as_string output includes translation:", "English Translation:" in string_results) 60 | 61 | print("\nSearchMawsuah works correctly and inherits properly from SearchVectara!") 62 | else: 63 | print("ERROR: Results don't have search_results key. API format mismatch.") 64 | print(f"Results keys: {results.keys()}") 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /src/ansari/presenters/api_presenter.py: -------------------------------------------------------------------------------- 1 | # Unlike other files, the presenter's role here is just to provide functions related to the LLM 2 | 3 | 4 | from fastapi.responses import StreamingResponse 5 | 6 | from ansari.agents import Ansari, AnsariClaude 7 | from ansari.ansari_db import MessageLogger 8 | 9 | 10 | class ApiPresenter: 11 | def __init__(self, app, agent: Ansari | AnsariClaude): 12 | self.app = app 13 | self.settings = agent.settings 14 | 15 | def complete(self, messages: dict, message_logger: MessageLogger = None): 16 | print("Complete called.") 17 | if self.settings.AGENT == "Ansari": 18 | agent = Ansari(settings=self.settings, message_logger=message_logger) 19 | elif self.settings.AGENT == "AnsariClaude": 20 | agent = AnsariClaude(settings=self.settings, message_logger=message_logger) 21 | 22 | return StreamingResponse(agent.replace_message_history(messages["messages"])) 23 | 24 | def present(self): 25 | pass 26 | -------------------------------------------------------------------------------- /src/ansari/presenters/ayah_file_presenter.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | import os 4 | from typing import Tuple 5 | 6 | from 
ansari.agents.ansari_workflow import AnsariWorkflow 7 | 8 | 9 | class AyahFilePresenter: 10 | def __init__(self, settings, use_query_generation: bool = False, answer_column: str = "answer"): 11 | self.settings = settings 12 | self.use_query_generation = use_query_generation 13 | self.answer_column = answer_column 14 | 15 | def _parse_ayah_reference(self, ayah_ref: str) -> Tuple[int, int]: 16 | """Parse a surah:ayah reference into separate numbers. 17 | 18 | Args: 19 | ayah_ref: String in format "surah:ayah" 20 | 21 | Returns: 22 | Tuple of (surah_num, ayah_num) 23 | 24 | Raises: 25 | ValueError: If format is invalid or empty 26 | """ 27 | if not ayah_ref or not ayah_ref.strip(): 28 | raise ValueError("Empty ayah reference") 29 | 30 | try: 31 | surah_str, ayah_str = ayah_ref.strip().split(":") 32 | return int(surah_str), int(ayah_str) 33 | except ValueError: 34 | raise ValueError(f"Invalid ayah reference format: {ayah_ref}. Expected format: surah:ayah (e.g. 1:1)") 35 | 36 | def present(self, input_file_path: str, output_file_path: str): 37 | try: 38 | # First pass: read header to get all field names 39 | with open(input_file_path, newline="") as input_file: 40 | # Skip empty lines and get header 41 | for line in input_file: 42 | if line.strip(): # First non-empty line is header 43 | reader = csv.reader([line]) 44 | header = next(reader) 45 | if len(header) < 2: 46 | logging.error("Input CSV must contain at least two columns") 47 | return 48 | break 49 | else: 50 | logging.error("Empty input file") 51 | return 52 | 53 | # Create fieldnames, preserving original names 54 | fieldnames = header 55 | if self.answer_column not in fieldnames: 56 | fieldnames = fieldnames + [self.answer_column] 57 | 58 | # Second pass: process all rows 59 | with open(input_file_path, newline="") as input_file: 60 | reader = csv.reader(input_file) 61 | 62 | # Open output file and write 63 | with open(output_file_path, "w", newline="") as output_file: 64 | writer = csv.writer(output_file) 65 | 
writer.writerow(fieldnames) 66 | 67 | for row in reader: 68 | # Skip empty lines 69 | if not any(row): 70 | continue 71 | 72 | try: 73 | # Get values from first and second columns using column positions 74 | ayah_ref = row[0] 75 | question = row[1] 76 | 77 | # Validate required fields 78 | if not ayah_ref or not question: 79 | raise ValueError("Missing required fields in first or second column") 80 | 81 | surah, ayah = self._parse_ayah_reference(ayah_ref) 82 | question = question.strip() 83 | 84 | print(f"Processing surah {surah}, ayah {ayah}, question: {question}") 85 | 86 | # Create a new workflow instance for each question 87 | workflow = AnsariWorkflow( 88 | self.settings, system_prompt_file=self.settings.AYAH_SYSTEM_PROMPT_FILE_NAME 89 | ) 90 | 91 | ayah_id = surah * 1000 + ayah 92 | workflow_steps = [ 93 | ( 94 | "search", 95 | { 96 | "query": question, 97 | "tool_name": "search_tafsir", 98 | "metadata_filter": f"part.from_ayah_int<={ayah_id} AND part.to_ayah_int>={ayah_id}", 99 | }, 100 | ), 101 | ] 102 | 103 | if self.use_query_generation: 104 | workflow_steps.append(("gen_query", {"input": question, "target_corpus": "tafsir"})) 105 | 106 | workflow_steps.append(("gen_answer", {"input": question, "search_results_indices": [0]})) 107 | 108 | # Execute the workflow 109 | workflow_output = workflow.execute_workflow(workflow_steps) 110 | # The answer is the last item in the workflow output 111 | answer = workflow_output[-1] 112 | 113 | # Add answer to row and write 114 | row.append(answer) 115 | writer.writerow(row) 116 | output_file.flush() 117 | 118 | except Exception as e: 119 | logging.error(f"Error processing row: {e}") 120 | row.append(f"ERROR: {str(e)}") 121 | writer.writerow(row) 122 | output_file.flush() 123 | continue 124 | 125 | print(f"Results saved to {os.path.abspath(output_file_path)}") 126 | 127 | except Exception as e: 128 | logging.error(f"Error processing file: {e}") 129 | return 130 | 
-------------------------------------------------------------------------------- /src/ansari/presenters/discord_presenter.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | 4 | import discord 5 | 6 | 7 | class MyClient(discord.Client): 8 | def __init__(self, agent, intents): 9 | super().__init__(intents=intents) 10 | self.agent = agent 11 | 12 | async def on_ready(self): 13 | print(f"We have logged in as {self.user}") 14 | 15 | async def on_message(self, message): 16 | if message.author == self.user: 17 | return 18 | agent = copy.deepcopy(self.agent) 19 | print(f"User said: {message.content} and mentioned {message.mentions}") 20 | st = time.time() 21 | if ( 22 | isinstance(message.channel, discord.channel.DMChannel) 23 | or message.content.startswith("<@&1150526640552673324>") 24 | or (message.mentions and message.mentions[0] and message.mentions[0].name == "Ansari") 25 | ): 26 | msg = await message.channel.send(f"Thinking, {message.author}...") 27 | msg_so_far = "" 28 | for token in agent.process_input(message.content): 29 | msg_so_far = msg_so_far + token 30 | print(f"Message so far: {msg_so_far}") 31 | et = time.time() - st 32 | print(f"Elapsed time: {et}") 33 | if et > 3: 34 | print("Enough time has passed. Sending message so far.") 35 | if msg_so_far: 36 | await msg.edit(content=msg_so_far) 37 | else: 38 | print(f"For some reason response was empty. {msg_so_far}, {et}") 39 | st = time.time() 40 | if msg_so_far: 41 | await msg.edit(content=msg_so_far) 42 | else: 43 | await msg.edit(content="Something went wrong. Flagging.") 44 | else: 45 | print(f"Got a message. 
class DiscordPresenter:
    """Wires an Ansari agent into a Discord client and runs it."""

    def __init__(self, agent, token):
        self.agent = agent
        self.token = token
        # The message-content intent must be enabled explicitly so the bot
        # can read the text of user messages.
        bot_intents = discord.Intents.default()
        bot_intents.message_content = True
        self.client = MyClient(agent=agent, intents=bot_intents)

    def present(self):
        """Start the Discord client; blocks until the client shuts down."""
        self.client.run(self.token)
20 | result = [tok for tok in agent.process_input(line) if tok] 21 | answer = "".join(result) 22 | (question, answer) = (line.strip(), answer) 23 | output_file.write(f"## {question}\n\n{answer}\n\n") 24 | output_file.flush() 25 | print(f"Result saved to {os.path.abspath(output_file_path)}") 26 | -------------------------------------------------------------------------------- /src/ansari/presenters/gradio_presenter.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import uuid 3 | 4 | import gradio as gr 5 | 6 | CSS = """ 7 | .contain { display: flex; flex-direction: column; } 8 | #component-0 { height: 100%; flex-grow: 1; } 9 | #chatbot { flex-grow: 1; overflow: auto;} 10 | """ 11 | 12 | 13 | class GradioPresenter: 14 | def __init__(self, agent, app_name, favicon_path): 15 | self.agent = agent 16 | self.app_name = app_name 17 | self.favicon_path = favicon_path 18 | 19 | def present(self): 20 | self.instances = {} 21 | self.histories = {} 22 | 23 | def generate_session_id(): 24 | return str(f"{uuid.uuid4()}") 25 | 26 | def append_flag(msg): 27 | msg = msg + "Please flag this. " 28 | 29 | def clear_contents(msg): 30 | return "" 31 | 32 | with gr.Blocks(title=self.app_name, css=CSS) as app: 33 | # Note: Gradio Presenter is incredibly confusing. 34 | # We can't pass agents because they are not serializable. 35 | # instead what we do is that we maintain a dictionary of 36 | # LangChainChatAgents by uuid. 
37 | my_uuid = gr.State(generate_session_id) 38 | 39 | chatbot = gr.Chatbot( 40 | [["", self.agent.greet()]], 41 | elem_id="chatbot", 42 | line_breaks=True, 43 | ) 44 | msg = gr.Textbox(show_label=False, scale=10) 45 | with gr.Row(): 46 | clr = gr.Button( 47 | value="Clear", 48 | size="sm", 49 | scale=1, 50 | variant="secondary", 51 | elem_id="clr", 52 | ) 53 | btn = gr.Button( 54 | value="Send", 55 | size="sm", 56 | scale=2, 57 | variant="primary", 58 | elem_id="btn", 59 | ) 60 | 61 | def user(user_message, history, my_uuid): 62 | if self.instances.get(my_uuid) is None: 63 | self.instances[my_uuid] = copy.deepcopy(self.agent) 64 | self.instances[my_uuid].session_tag = f"ses_{my_uuid}" 65 | self.histories[my_uuid] = [["", self.agent.greet()]] 66 | self.histories[my_uuid].append([user_message, None]) 67 | print("history is ", self.histories[my_uuid]) 68 | return "", self.histories[my_uuid], my_uuid 69 | 70 | def bot(history, my_uuid): 71 | # Check if we've seen this uuid before. If not, greet then add to instances 72 | if self.instances.get(my_uuid) is None: 73 | self.instances[my_uuid] = copy.deepcopy(self.agent) 74 | self.instances[my_uuid].session_tag = f"ses_{my_uuid}" 75 | self.histories[my_uuid] = [["", self.agent.greet()]] 76 | instance = self.instances[my_uuid] 77 | history = self.histories[my_uuid] 78 | 79 | history[-1][1] = "" 80 | print(f"history is {history}") 81 | for word in instance.process_input(history[-1][0]): 82 | if word is None: 83 | continue 84 | history[-1][1] += word 85 | yield history, my_uuid 86 | 87 | msg.submit( 88 | fn=user, 89 | inputs=[msg, chatbot, my_uuid], 90 | outputs=[msg, chatbot, my_uuid], 91 | queue=False, 92 | ).then(fn=bot, inputs=[chatbot, my_uuid], outputs=[chatbot, my_uuid]) 93 | 94 | # Clicking on the button does the same thing as submitting. 
class StdioPresenter:
    """REPL-style presenter: reads questions from stdin, streams answers to stdout."""

    def __init__(self, agent: Ansari, skip_greeting=False):
        self.agent = agent
        self.skip_greeting = skip_greeting

    def present(self):
        """Run the read/answer loop until stdin is exhausted (EOF)."""
        out = sys.stdout
        if not self.skip_greeting:
            out.write(self.agent.greet() + "\n")
        out.write("> ")
        out.flush()
        # readline() returns "" only at EOF, which ends the loop.
        for line in iter(sys.stdin.readline, ""):
            tokens = self.agent.process_input(line)
            # process_input may return a generator (possibly yielding None
            # between answers) or a falsy value; guard both cases.
            if tokens:
                for token in tokens:
                    if token is not None:
                        out.write(token)
                        out.flush()
            out.write("\n> ")
            out.flush()
It is always best to consult a real Islamic Scholar. 12 | 13 | 14 | *Important note*: If I say anything wrong, confusing, great, funny or interesting, please flag it. Anything flagged will be reviewed by humans. To flag a conversation, just say "I want to flag this conversation." 15 | 16 | 17 | I am multilingual. I can understand Arabic (including transliteration), Turkish, Urdu, Bahasa, Bosnian and many other languages. -------------------------------------------------------------------------------- /src/ansari/resources/prompts/news.txt: -------------------------------------------------------------------------------- 1 | **News 2023-07-23**: Major code rewrite. Much more flexible now. 2 | 3 | -------------------------------------------------------------------------------- /src/ansari/resources/prompts/system_msg_ayah.txt: -------------------------------------------------------------------------------- 1 | You are Ansari Qur'an, a multilingual Islamic digital assistant designed to answer 2 | Quran-related questions with accuracy and depth. 3 | 4 | Fluent in languages such as Arabic (including transliteration), 5 | Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, 6 | craft precise, evidence-based responses exclusively from the Sunni tradition. 7 | 8 | Here's how you work: You receive a an ayah and a question along with the 9 | desired response language and search results from any tafsirs available. 10 | Currently that includes Ibn Kathir. 11 | 12 | If you attribute a statement or opinion to a scholar, you will include EXACTLY 13 | the sentence in which the mufassir says so. 14 | 15 | If you say there is a hadith that says something, you will include the hadith 16 | EXACTLY as it was in the source text. 17 | 18 | Quoting from the source material is highly recommended when attributing 19 | statements or opinions to scholars or hadith, especially when the source text is 20 | weak or unverified. 
21 | 22 | Crucially, only attribute specific statements or opinions to these scholars if you 23 | have specific referenceable evidence to support that attribution. When referencing 24 | the Quran, you, Ansari, include the ayah number, Arabic text, and translation 25 | (if the user's language is different from Arabic). 26 | 27 | If you provide a translation, include the name of the translation (e.g. Saheeh 28 | International). Generally Ibn Kathir uses Saheeh International. 29 | 30 | The person reading your answer is a well informed scholar. You may use terms 31 | that an informed scholar would use. You should use more citations and references 32 | than a general member of the public would. 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/ansari/resources/prompts/system_msg_ayah_lay.txt: -------------------------------------------------------------------------------- 1 | You are Ansari Qur'an, a multilingual Islamic digital assistant designed to answer 2 | Quran-related questions with accuracy and depth. 3 | 4 | Fluent in languages such as Arabic (including transliteration), 5 | Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, 6 | craft precise, evidence-based responses exclusively from the Sunni tradition. 7 | 8 | Here's how you work: You receive a an ayah and a question along with the 9 | desired response language and search results from any tafsirs available. 10 | Currently that includes Ibn Kathir. 11 | 12 | If you attribute a statement or opinion to a scholar, you will include EXACTLY 13 | the sentence in which the mufassir says so. 14 | 15 | If you say there is a hadith that says something, you will include the hadith 16 | EXACTLY as it was in the source text. 17 | 18 | Quoting from the source material is highly recommended when attributing 19 | statements or opinions to scholars or hadith, especially when the source text is 20 | weak or unverified. 
21 | 22 | Crucially, only attribute specific statements or opinions to these scholars if you 23 | have specific referenceable evidence to support that attribution. When referencing 24 | the Quran, you, Ansari, include the ayah number, Arabic text, and translation 25 | (if the user's language is different from Arabic). 26 | 27 | If you provide a translation, include the name of the translation (e.g. Saheeh 28 | International). Generally Ibn Kathir uses Saheeh International. 29 | The person reading your answer is a general member of the public who 30 | may or may not be a Muslim. Assume the reader only has a basic knowledge of 31 | Islam. 32 | 33 | -------------------------------------------------------------------------------- /src/ansari/resources/prompts/system_msg_claude.txt: -------------------------------------------------------------------------------- 1 | You are Ansari, a multilingual Islamic bot designed to answer 2 | Islam-related questions with accuracy and depth. Fluent in languages such as 3 | Arabic (including transliteration), Bahasa, Bosnian, French, Turkish, Urdu, 4 | and more, craft precise, evidence-based responses exclusively 5 | from the Sunni tradition. Here's how you work: You receive a question along 6 | with the desired response language and search results from Hadith, Quran, and Mawsuah. 7 | 8 | Provide a concise, well-supported answer, citing classical 9 | scholars like Al Ghazali, Ibn Al Qayyim, Ibn Taymiyah, Imam Shafiee, Imam Nawawi, 10 | Imam Abu Hanifah, Ibn Hajr al Asqalani, Imam Ahmad bin Hanbal, Imam Malik, and Ibn Hazm, 11 | as well as modern scholars like Yusuf Al Qaradawi, Yasir Qadhi, 12 | Ma'in Al Qudah, Shu'aib Al Arnaout, Hamza Yusuf, Zaid Shakir, Taqiuddin Usmani, 13 | Muhammad Shinqeeti, Ismail Menk, Omar Suleiman, Salman Al-Awdah, Jamaaluddin Zarabozo, 14 | and Yaser Birjas. 
15 | 16 | Crucially, only attribute specific statements or opinions to these scholars 17 | if you have specific referenceable evidence to support that attribution. 18 | When referencing the Quran, include the ayah number, Arabic text, 19 | and translation (if the user's language is different from Arabic). 20 | 21 | For Hadith, only those found in the search results are used, complete with the collection, 22 | LK id, text, and grade. If unsure about a Hadith reference, 23 | indicate this clearly as 'I believe (though not 100% sure of the reference) 24 | there is a hadith that says: [text of hadith]'. 25 | 26 | Especially cautious about obligatory or prohibited matters, 27 | ensure all answers are backed by direct evidence. Instead of vague references, 28 | specific scholars are quoted for clarity. 29 | 30 | Answer questions with thorough, well-researched answers, 31 | grounded in the rich tradition of Sunni scholarship. Use 32 | extensive citations to support your opinions and statements. 33 | 34 | Engage with the Holy Quran, Hadith, and the Encyclopedia of Islamic jurisprudence 35 | (also known as al Mawsuah Al Fiqhiyyah) and the Encyclopedia of Evidence-based Tafseer 36 | to improve your knowledge. Reflect on diverse questions to craft Arabic 37 | search queries with increased accuracy and depth. Strive for a richer understanding 38 | and nuanced responses by exploring various topics consistently. 39 | 40 | When approaching controversial topics or disagreements among scholars: 41 | 1. Present the main scholarly positions objectively 42 | 2. Highlight areas of consensus first before discussing differences 43 | 3. Avoid presenting minority opinions as mainstream views 44 | 4. State the evidence and reasoning behind different positions 45 | 5. Refrain from declaring one position definitively correct when legitimate scholarly disagreement exists 46 | 47 | When using search tools, follow these strategies: 48 | 1. 
Start with broad searches to understand the topic scope 49 | 2. Refine search terms based on initial results 50 | 3. Use different tools strategically based on question type: 51 | - Quran search for scriptural basis 52 | - Hadith search for prophetic guidance 53 | - Mawsuah for juristic rulings and scholarly interpretations 54 | 4. Combine search results to create comprehensive answers 55 | 5. Only repeat the same tool if there is good reason to believe it will yield different results: 56 | - Vary search terms significantly when repeating searches 57 | - Do not search for the same terms in the same tools repeatedly 58 | - Consider different sources or approaches if initial searches are unproductive 59 | 6. Do not repeatedly use the same tool more than three times in a row 60 | 7. Do not use tools more than a total of 10 times per query (THIS IS A HARD LIMIT) 61 | 8. If you reach any tool usage limit, you MUST: 62 | - Stop using tools immediately 63 | - Synthesize a complete answer based on the information you already have 64 | - ALWAYS provide your answer in EXACTLY the format specified in the user's prompt 65 | - Make your best determination based on available information, even if incomplete 66 | - For questions requiring a specific format, maintain that format exactly as requested 67 | - If appropriate, you may include a brief note like "I attempted [number] searches, but couldn't find the exact references." ONLY AFTER providing your complete answer 68 | 9. ALWAYS complete your response with a direct answer to the user's question, even if your research is incomplete 69 | 70 | For questions outside Islamic knowledge domain: 71 | 1. Politely explain that you are specialized in Islamic topics 72 | 2. Suggest reformulating the question to relate to Islamic perspective if relevant 73 | 3. For purely secular topics, acknowledge the limits of your expertise 74 | 4. 
Avoid speculation on topics outside your knowledge base 75 | 76 | -------------------------------------------------------------------------------- /src/ansari/resources/prompts/system_msg_tool.txt: -------------------------------------------------------------------------------- 1 | You are Ansari, a multilingual Islamic bot designed to answer Islam-related questions with accuracy and depth. Fluent in languages such as Arabic (including transliteration), Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, craft precise, evidence-based responses exclusively from the Sunni tradition. Here's how you work: You receive a question along with the desired response language and search results from Hadith, Quran, and Mawsuah. You, Ansari, will then provide a concise, well-supported answer, citing classical scholars like Al Ghazali, Ibn Al Qayyim, Ibn Taymiyah, Imam Shafiee, Imam Nawawi, Imam Abu Hanifah, Ibn Hajr al Asqalani, Imam Ahmad bin Hanbal, Imam Malik, and Ibn Hazm, as well as modern scholars like Yusuf Al Qaradawi, Yasir Qadhi, Ma'in Al Qudah, Shu'aib Al Arnaout, Hamza Yusuf, Zaid Shakir, Taqiuddin Usmani, Muhammad Shinqeeti, Ismail Menk, Omar Suleiman, Salman Al-Awdah, Jamaaluddin Zarabozo, and Yaser Birjas. Crucially, only attribute specific statements or opinions to these scholars if you have specific referenceable evidence to support that attribution. When referencing the Quran, you, Ansari, include the ayah number, Arabic text, and translation (if the user's language is different from Arabic). For Hadith, only those found in the search results are used, complete with the collection, LK id, text, and grade. If unsure about a Hadith reference, you, Ansari, will indicate this clearly as 'I believe (though not 100% sure of the reference) there is a hadith that says: [text of hadith]'. Especially cautious about obligatory or prohibited matters, you, Ansari, ensure all answers are backed by direct evidence. 
Instead of vague references, specific scholars are quoted for clarity. You, Ansari, will answer questions with thorough, well-researched answers, grounded in the rich tradition of Sunni scholarship. 2 | 3 | Islamic Studies: Engage with the Holy Quran, Hadith, and Mawsuah regularly to optimize performance. Reflect on diverse questions to craft Arabic search queries with increased accuracy and depth. Strive for a richer understanding and nuanced responses by exploring various topics consistently. 4 | -------------------------------------------------------------------------------- /src/ansari/resources/templates/ask_question.txt: -------------------------------------------------------------------------------- 1 | Read the provided question, consider all the listed options after "OPTIONS:", and select the correct answer option. Provide an elaboration for your choice in the "explanation" field. Respond strictly in the specified JSON format with the keys "explanation" and "answer" (for the option text). Replace "Correct Option" with the exact text of the chosen option, without including any option letter or number. Ensure the response adheres to the JSON structure with the key "answer" and the correct option text as the value, enclosed in double quotes. Do not provide any additional explanations or comments outside the JSON format. 2 | 3 | --- 4 | 5 | Question: {{ question }} 6 | 7 | OPTIONS: 8 | {% for option in options -%} 9 | {{ option }}{% if not loop.last %}, {% endif %} 10 | {%- endfor %} 11 | 12 | ```json 13 | { 14 | "explanation": "explanation", 15 | "answer": "Correct Option" 16 | } 17 | ``` -------------------------------------------------------------------------------- /src/ansari/resources/templates/password_reset.html: -------------------------------------------------------------------------------- 1 |

Reset your Ansari Password

2 | 3 |

Click the link below to reset your password for Ansari.

4 | 5 |

If you did not request a reset of your Ansari password, you can safely ignore this.

6 | 7 |

Click on this link to reset your password.

8 | 9 | Or paste this link into your browser: 10 | 11 | {{frontend_url}}/reset-password?token={{reset_token}} 12 | -------------------------------------------------------------------------------- /src/ansari/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/src/ansari/tools/__init__.py -------------------------------------------------------------------------------- /src/ansari/tools/base_search.py: -------------------------------------------------------------------------------- 1 | import json 2 | from abc import ABC, abstractmethod 3 | from typing import Dict, List, Any, Union 4 | 5 | 6 | class BaseSearchTool(ABC): 7 | """Base class for all search tools.""" 8 | 9 | @abstractmethod 10 | def get_tool_name(self) -> str: 11 | """Get the name of the tool.""" 12 | pass 13 | 14 | @abstractmethod 15 | def get_tool_description(self) -> Dict[str, Any]: 16 | """Get the tool description in OpenAI function format.""" 17 | pass 18 | 19 | @abstractmethod 20 | def run(self, query: str, **kwargs) -> Dict[str, Any]: 21 | """Execute the search and return raw results. 22 | 23 | Args: 24 | query: The search query 25 | **kwargs: Additional search parameters 26 | 27 | Returns: 28 | Dict containing raw search results 29 | """ 30 | pass 31 | 32 | @abstractmethod 33 | def format_as_ref_list(self, results: Dict[str, Any]) -> List[Union[Dict[str, Any], str]]: 34 | """Format raw results as a list of document dictionaries. 35 | 36 | Args: 37 | results: Raw results from run() 38 | 39 | Returns: 40 | List of document dictionaries in the format: 41 | { 42 | "type": "document", 43 | "source": { 44 | "type": "text", 45 | "media_type": "text/plain", 46 | "data": str (JSON string representing language-text pairs) 47 | }, 48 | "title": str, 49 | "context": str, 50 | "citations": {"enabled": bool}, 51 | ... 
52 | } 53 | 54 | The data field should contain a JSON string in the format: 55 | [ 56 | {"lang": "ar", "text": "النص العربي"}, 57 | {"lang": "en", "text": "English translation"} # Optional 58 | ] 59 | 60 | Or a list containing a single string "No results found." if no results. 61 | """ 62 | pass 63 | 64 | @abstractmethod 65 | def format_as_tool_result(self, results: Dict[str, Any]) -> Dict[str, Any]: 66 | """Format raw results as a tool result for Claude. 67 | 68 | Args: 69 | results: Raw results from run() 70 | 71 | Returns: 72 | Dict containing formatted results for Claude 73 | """ 74 | pass 75 | 76 | def format_multilingual_data(self, text_entries: Dict[str, str]) -> str: 77 | """Convert a dictionary of language-text pairs to a JSON string. 78 | 79 | Args: 80 | text_entries: Dictionary mapping language codes to text 81 | e.g., {"ar": "النص العربي", "en": "English text"} 82 | 83 | Returns: 84 | JSON string representing language-text pairs 85 | """ 86 | result = [] 87 | for lang, text in text_entries.items(): 88 | if text: # Only include non-empty text 89 | result.append({"lang": lang, "text": text}) 90 | return json.dumps(result) 91 | 92 | def format_document_as_string(self, document: Dict[str, Any]) -> str: 93 | """Helper method to format a document object as a string. 
94 | 95 | Args: 96 | document: A document dictionary as returned by format_as_ref_list 97 | 98 | Returns: 99 | A string representation of the document 100 | """ 101 | if isinstance(document, str): 102 | return document 103 | 104 | if document.get("type") != "document" or "source" not in document: 105 | return str(document) 106 | 107 | # Use the title as is - it should already be trimmed by the individual search tools 108 | title = document.get("title", "") 109 | data = document["source"].get("data", "") 110 | context = document.get("context", "") 111 | 112 | result = f"{title}\n" 113 | if context: 114 | result += f"Context: {context}\n" 115 | 116 | # Try to parse data as JSON to extract multilingual content 117 | try: 118 | lang_entries = json.loads(data) 119 | if isinstance(lang_entries, list): 120 | for entry in lang_entries: 121 | if isinstance(entry, dict) and "lang" in entry and "text" in entry: 122 | result += f"\n{entry['lang'].upper()}: {entry['text']}" 123 | return result 124 | except (json.JSONDecodeError, TypeError): 125 | pass 126 | 127 | # Fallback to original data if not JSON 128 | result += f"{data}" 129 | 130 | return result 131 | -------------------------------------------------------------------------------- /src/ansari/tools/search_hadith.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from ansari.util.translation import format_multilingual_data 4 | from ansari.util.general_helpers import trim_citation_title 5 | 6 | # Set up logging 7 | logger = logging.getLogger(__name__) 8 | logger.setLevel(logging.INFO) 9 | 10 | KALEMAT_BASE_URL = "https://api.kalimat.dev/search" 11 | TOOL_NAME = "search_hadith" 12 | 13 | 14 | class SearchHadith: 15 | def __init__(self, kalimat_api_key): 16 | self.api_key = kalimat_api_key 17 | self.base_url = KALEMAT_BASE_URL 18 | 19 | def get_tool_description(self): 20 | return { 21 | "type": "function", 22 | "function": { 23 | "name": 
"search_hadith", 24 | "description": "Search for relevant Hadith narrations based on a specific topic.", 25 | "parameters": { 26 | "type": "object", 27 | "properties": { 28 | "query": { 29 | "type": "string", 30 | "description": "Topic or subject matter to search for in Hadith collections", 31 | }, 32 | }, 33 | "required": ["query"], 34 | }, 35 | }, 36 | } 37 | 38 | def get_tool_name(self): 39 | return TOOL_NAME 40 | 41 | def run(self, query: str, num_results: int = 10): 42 | headers = {"x-api-key": self.api_key} 43 | payload = { 44 | "query": query, 45 | "numResults": num_results, 46 | "indexes": '["sunnah_lk"]', 47 | "getText": 2, 48 | } 49 | 50 | response = requests.get(self.base_url, headers=headers, params=payload) 51 | 52 | if response.status_code != 200: 53 | print( 54 | f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}", 55 | ) 56 | response.raise_for_status() 57 | 58 | return response.json() 59 | 60 | def pp_hadith(self, h): 61 | en = h["en_text"] 62 | grade = h["grade_en"].strip() 63 | if grade: 64 | grade = f"\nGrade: {grade}\n" 65 | src = f"Collection: {h['source_book']} Chapter: {h['chapter_number']} Hadith: {h['hadith_number']} LK id: {h['id']}" 66 | result = f"{src}\n{en}\n{grade}" 67 | return result 68 | 69 | def format_as_list(self, results): 70 | """Format raw API results as a list of strings.""" 71 | return [self.pp_hadith(r) for r in results] 72 | 73 | def format_as_ref_list(self, results): 74 | """Format raw API results as a list of reference documents for Claude.""" 75 | documents = [] 76 | for result in results: 77 | source_book = result.get("source_book", "") 78 | chapter = result.get("chapter_number", "") 79 | chapter_name = result.get("chapter_english", "") 80 | hadith = result.get("hadith_number", "") 81 | section_number = result.get("section_number", "") 82 | section_name = result.get("section_english", "") 83 | id = result.get("id", "") 84 | text = result.get("en_text", "") 85 | ar_text = 
result.get("ar_text", "") 86 | grade = result.get("grade_en", "").strip() 87 | 88 | # Create citation title (including grade if available) 89 | title = ( 90 | f"{source_book} - Chapter {chapter}: {chapter_name}, " 91 | f"Section {section_number}: {section_name}, Hadith {hadith}, LK id {id}" 92 | ) 93 | if grade: 94 | title += f" (Grade: {grade})" 95 | 96 | # Trim title to prevent Anthropic API crashes with long titles 97 | title = trim_citation_title(title) 98 | 99 | # Format both Arabic and English texts in multilingual JSON format 100 | # This is expected by the base_search.py documentation 101 | text_entries = {} 102 | if ar_text: 103 | text_entries["ar"] = ar_text 104 | if text: 105 | text_entries["en"] = text 106 | 107 | # Format as multilingual JSON data 108 | doc_text = format_multilingual_data(text_entries) 109 | 110 | document = { 111 | "type": "document", 112 | "source": {"type": "text", "media_type": "text/plain", "data": doc_text}, 113 | "title": title, 114 | "context": "Retrieved from hadith collections", 115 | "citations": {"enabled": True}, 116 | } 117 | documents.append(document) 118 | 119 | return documents 120 | 121 | def format_as_tool_result(self, results): 122 | """Format raw API results as a tool result dictionary.""" 123 | formatted_results = [] 124 | for result in results: 125 | formatted_results.append( 126 | { 127 | "type": "text", 128 | "text": f""" 129 | Hadith: {result.get("en_text", "")} \n\n 130 | Source: {result.get("source_book", "")}, Hadith {result.get("hadith_number", "")}\n\n 131 | Grade: {result.get("grade_en", "")}\n 132 | """, 133 | } 134 | ) 135 | 136 | return formatted_results 137 | 138 | def run_as_list(self, query: str, num_results: int = 10): 139 | print(f'Searching hadith for "{query}"') 140 | results = self.run(query, num_results) 141 | return self.format_as_list(results) 142 | 143 | def run_as_string(self, query: str, num_results: int = 3): 144 | results = self.run(query, num_results) 145 | return 
"\n".join(self.format_as_list(results)) 146 | -------------------------------------------------------------------------------- /src/ansari/tools/search_mawsuah.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, List, Any 3 | from ansari.tools.search_vectara import SearchVectara 4 | from ansari.util.translation import format_multilingual_data 5 | from ansari.util.general_helpers import trim_citation_title 6 | 7 | TOOL_NAME = "search_mawsuah" 8 | 9 | # Set up logging 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class SearchMawsuah(SearchVectara): 14 | def __init__(self, vectara_api_key, vectara_corpus_key): 15 | # Initialize the SearchVectara parent with the necessary parameters 16 | super().__init__( 17 | vectara_api_key=vectara_api_key, 18 | vectara_corpus_key=vectara_corpus_key, 19 | fn_name=TOOL_NAME, 20 | fn_description=( 21 | "Queries an encyclopedia of Islamic jurisprudence (fiqh) for relevant rulings. " 22 | "You call this tool when you need to provide information about Islamic law. " 23 | "Regardless of the language used in the original conversation, you will translate " 24 | "the query into Arabic before searching the encyclopedia. The tool returns a list " 25 | "of **potentially** relevant matches, which may include multiple paragraphs." 26 | ), 27 | params=[ 28 | { 29 | "name": "query", 30 | "type": "string", 31 | "description": "The topic to search for in the fiqh encyclopedia. " 32 | "You will translate this query into Arabic.", 33 | } 34 | ], 35 | required_params=["query"], 36 | ) 37 | 38 | def format_as_ref_list(self, response: Dict[str, Any]) -> List[Dict[str, Any]]: 39 | """ 40 | Format raw API results as a list of reference documents for Claude. 41 | Each reference will include only the original Arabic text for efficiency. 42 | The English translation will be added later only for the parts that are cited. 
43 | 44 | Args: 45 | response: The raw API response from Vectara 46 | 47 | Returns: 48 | A list of reference documents formatted for Claude with Arabic text 49 | """ 50 | # Get base documents from parent class 51 | documents = super().format_as_ref_list(response) 52 | 53 | if not documents: 54 | return ["No results found."] 55 | 56 | # Update documents with just Arabic text and citation support 57 | for doc in documents: 58 | if isinstance(doc, str): 59 | continue 60 | 61 | # Keep only the Arabic text and remove HTML tags 62 | text = doc["source"]["data"] 63 | text = text.replace("", "").replace("", "") 64 | 65 | # Convert to multilingual format (Arabic only) 66 | # Note: Mawsuah only returns results in Arabic, so we only have Arabic text here. 67 | # The English translation will be added later by AnsariClaude when a citation is actually used. 68 | doc["source"]["data"] = format_multilingual_data({"ar": text}) 69 | doc["title"] = trim_citation_title("Encyclopedia of Islamic Jurisprudence") 70 | doc["citations"] = {"enabled": True} 71 | 72 | return documents 73 | 74 | def format_as_tool_result(self, response: Dict[str, Any]) -> Dict[str, Any]: 75 | """ 76 | Format raw API results as a tool result dictionary for Claude. 
77 | 78 | Args: 79 | response: The raw API response from Vectara 80 | 81 | Returns: 82 | A tool result dictionary with formatted results 83 | """ 84 | # Get base tool result from parent class 85 | result = super().format_as_tool_result(response) 86 | 87 | # If no results were found, return as is 88 | if not result.get("results", []): 89 | return {"type": "text", "text": "No results found."} 90 | 91 | return {"type": "text", "text": "Please see the references below."} 92 | 93 | def run_as_string(self, query: str, num_results: int = 10, **kwargs) -> str: 94 | """Return results as a human-readable string with Arabic text only.""" 95 | # Get the response using the parent's run method 96 | response = self.run(query, num_results, **kwargs) 97 | 98 | # Handle no results case 99 | if not response.get("search_results"): 100 | return "No results found." 101 | 102 | # Process results 103 | results = [] 104 | for i, result in enumerate(response.get("search_results", [])): 105 | arabic_text = result.get("text", "").replace("", "").replace("", "") 106 | 107 | entry = f"Entry {i + 1}:\n" 108 | entry += f"Arabic Text: {arabic_text}\n" 109 | 110 | results.append(entry) 111 | 112 | return "\n\n".join(results) 113 | -------------------------------------------------------------------------------- /src/ansari/tools/search_quran.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from ansari.ansari_logger import get_logger 3 | from ansari.util.translation import format_multilingual_data 4 | from ansari.util.general_helpers import trim_citation_title 5 | 6 | logger = get_logger(__name__) 7 | KALEMAT_BASE_URL = "https://api.kalimat.dev/search" 8 | TOOL_NAME = "search_quran" 9 | 10 | 11 | class SearchQuran: 12 | def __init__(self, kalimat_api_key): 13 | self.api_key = kalimat_api_key 14 | self.base_url = KALEMAT_BASE_URL 15 | 16 | def get_tool_description(self): 17 | return { 18 | "type": "function", 19 | "function": { 20 | "name": 
"search_quran", 21 | "description": """ 22 | Search and retrieve relevant ayahs based on a specific topic. 23 | Returns multiple ayahs when applicable.""", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "query": { 28 | "type": "string", 29 | "description": """ 30 | Topic or subject matter to search for within the Holy Quran. 31 | Make this as specific as possible. 32 | Do not include the word quran in the request. 33 | 34 | Returns results both as tool results and as 35 | references for citations. 36 | """, 37 | }, 38 | }, 39 | "required": ["query"], 40 | }, 41 | }, 42 | } 43 | 44 | def get_tool_name(self): 45 | return TOOL_NAME 46 | 47 | def run(self, query: str, num_results: int = 10): 48 | headers = {"x-api-key": self.api_key} 49 | payload = { 50 | "query": query, 51 | "numResults": num_results, 52 | "getText": 1, # 1 is the Qur'an 53 | } 54 | 55 | response = requests.get(self.base_url, headers=headers, params=payload) 56 | 57 | if response.status_code != 200: 58 | logger.error( 59 | f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}", 60 | ) 61 | response.raise_for_status() 62 | 63 | # Return the JSON response directly as in the original implementation 64 | return response.json() 65 | 66 | def pp_ayah(self, ayah): 67 | # Added debug logging to understand the ayah structure 68 | logger.debug(f"Ayah data type: {type(ayah)}") 69 | logger.debug(f"Ayah content: {str(ayah)[:200]}") 70 | 71 | # Handle if ayah is not a dictionary 72 | if not isinstance(ayah, dict): 73 | logger.error(f"Expected ayah to be a dict but got {type(ayah)}") 74 | return f"Error: Invalid ayah format - {str(ayah)[:100]}..." 
75 | 76 | try: 77 | ayah_num = ayah["id"] 78 | ayah_ar = ayah.get("text", "Not retrieved") 79 | ayah_en = ayah.get("en_text", "Not retrieved") 80 | result = f"Ayah: {ayah_num}\nArabic Text: {ayah_ar}\n\nEnglish Text: {ayah_en}\n\n" 81 | return result 82 | except Exception as e: 83 | logger.error(f"Error formatting ayah: {str(e)}") 84 | logger.error(f"Problematic ayah: {str(ayah)}") 85 | return f"Error processing ayah: {str(e)}" 86 | 87 | def format_as_list(self, results): 88 | """Format raw API results as a list of strings.""" 89 | return [self.pp_ayah(r) for r in results] 90 | 91 | def format_as_ref_list(self, results): 92 | """Format raw API results as a list of document objects for Claude. 93 | 94 | Args: 95 | results: Raw API results 96 | 97 | Returns: 98 | List of document objects formatted for Claude 99 | """ 100 | documents = [] 101 | for result in results: 102 | id = result.get("id", "") 103 | arabic = result.get("text", "") 104 | english = result.get("en_text", "") 105 | 106 | # Create citation title and trim to safe length 107 | title = trim_citation_title(f"Quran {id}") 108 | 109 | # Format both Arabic and English texts in multilingual JSON format 110 | # This is expected by the base_search.py documentation 111 | text_entries = {} 112 | if arabic: 113 | text_entries["ar"] = arabic 114 | if english: 115 | text_entries["en"] = english 116 | 117 | # Format as multilingual JSON data 118 | doc_text = format_multilingual_data(text_entries) 119 | 120 | documents.append( 121 | { 122 | "type": "document", 123 | "source": {"type": "text", "media_type": "text/plain", "data": doc_text}, 124 | "title": title, 125 | "context": "Retrieved from the Holy Quran", 126 | "citations": {"enabled": True}, 127 | } 128 | ) 129 | 130 | return documents 131 | 132 | def format_as_tool_result(self, results): 133 | """Format raw API results as a tool result dictionary.""" 134 | formatted_results = [] 135 | for result in results: 136 | formatted_results.append( 137 | { 138 | "type": 
"text", 139 | "text": f""" 140 | Arabic text: {result.get("text", "")} \n\n 141 | English text: {result.get("en_text", "")}\n\n 142 | Ayah number: {result.get("id", "")}\n 143 | """, 144 | } 145 | ) 146 | 147 | return formatted_results 148 | 149 | def run_as_list(self, query: str, num_results: int = 10): 150 | logger.info(f'Searching quran for "{query}"') 151 | results = self.run(query, num_results) 152 | logger.debug(f"Results from API: {type(results)}") 153 | try: 154 | # Use the direct approach from the original implementation 155 | formatted_results = [] 156 | for r in results: 157 | ayah_str = self.pp_ayah(r) 158 | formatted_results.append(ayah_str) 159 | return formatted_results 160 | except Exception as e: 161 | import traceback 162 | 163 | logger.error(f"Error formatting results: {str(e)}") 164 | logger.error(f"Full traceback: {traceback.format_exc()}") 165 | logger.error(f"Results that caused error: {results}") 166 | return [f"Error processing results: {str(e)} - {traceback.format_exc()}"] 167 | 168 | def run_as_string(self, query: str, num_results: int = 10): 169 | results = self.run(query, num_results) 170 | try: 171 | return "\n".join([self.pp_ayah(r) for r in results]) 172 | except Exception as e: 173 | logger.error(f"Error formatting results as string: {str(e)}") 174 | return f"Error processing results: {str(e)}" 175 | -------------------------------------------------------------------------------- /src/ansari/util/__init__.py: -------------------------------------------------------------------------------- 1 | # This file makes the 'util' directory a package. 
2 | from .prompt_mgr import PromptMgr 3 | from .translation import translate_text 4 | 5 | __all__ = ["PromptMgr", "translate_text"] 6 | -------------------------------------------------------------------------------- /src/ansari/util/prompt_mgr.py: -------------------------------------------------------------------------------- 1 | # This file aims to provide prompt-related functions that can be used across the codebase. 2 | # Specifically, it load prompts (from resources/) and manage them for Ansari agent. 3 | 4 | from pathlib import Path 5 | 6 | from pydantic import BaseModel 7 | 8 | 9 | class Prompt(BaseModel): 10 | file_path: str 11 | cached: str | None = None 12 | hot_reload: bool = True 13 | 14 | def render(self, **kwargs) -> str: 15 | if (self.cached is None) or (self.hot_reload): 16 | with open(self.file_path) as f: 17 | self.cached = f.read() 18 | return self.cached.format(**kwargs) 19 | 20 | 21 | class PromptMgr: 22 | def get_resource_path(filename): 23 | # Get the directory of the current script 24 | script_dir = Path(__file__).resolve() 25 | # Construct the path to the resources directory 26 | resources_dir = script_dir.parent.parent / "resources" 27 | # Construct the full path to the resource file 28 | path = resources_dir / filename 29 | return path 30 | 31 | def __init__(self, hot_reload: bool = True, src_dir: str = str(get_resource_path("prompts"))): 32 | """Creates a prompt manager. 33 | 34 | Args: 35 | hot_reload: If true, reloads the prompt every time it is called. 36 | src_dir: The directory where the prompts are stored. 
def parse_multilingual_data(data: str) -> Dict[str, str]:
    """Parse a JSON string of multilingual content into a language->text dict.

    This is an enhanced version of the original parse_multilingual_data
    function with more robust error handling: accepts either the JSON
    produced by format_multilingual_data or plain text. Plain text is
    classified by language detection, with a cheap fast-path check for
    Arabic. Malformed input never raises; it falls back to {"text": data}.

    Args:
        data: JSON string in the format returned by format_multilingual_data
              OR plain text that will be detected and handled

    Returns:
        Dictionary mapping language codes to text
        e.g., {"ar": "النص العربي", "en": "English text"}
    """
    try:
        entries = json.loads(data)
        if not isinstance(entries, list):
            logger.warning("Expected a JSON array but got something else")
            return {"text": data}

        collected = {}
        for entry in entries:
            if not isinstance(entry, dict) or "lang" not in entry or "text" not in entry:
                logger.warning("JSON item missing 'lang' or 'text' fields")
                continue
            collected[entry["lang"]] = entry["text"]

        if collected:
            return collected

        # Valid JSON array, but nothing usable in it — treat as plain text.
        logger.warning("No valid language entries found in JSON")
        return {"text": data}

    except json.JSONDecodeError:
        logger.debug("JSON parsing failed, attempting language detection")

        try:
            # Fast path: Arabic Unicode block in the first 50 characters.
            if any(0x0600 <= ord(ch) <= 0x06FF for ch in data[:50]):
                logger.debug("Detected Arabic text based on character range")
                return {"ar": data}

            detected = get_language_from_text(data)
            logger.debug(f"Detected language: {detected}")
            # {"ar": data} and {detected: data} coincide when detected == "ar",
            # so a single return covers both branches of the original code.
            return {detected: data}

        except Exception as e:
            logger.error(f"Error during language detection: {e}")
            return {"text": data}

    except Exception as e:
        logger.error(f"Unexpected error in parse_multilingual_data: {e}")
        return {"text": data}
data} 105 | 106 | 107 | def process_document_source_data(doc: dict) -> dict: 108 | """Process a document's source data to ensure it's properly formatted. 109 | 110 | This function tries to parse the document's source data as JSON, and if that fails, 111 | it formats the text based on language detection. 112 | 113 | Args: 114 | doc: The document to process 115 | 116 | Returns: 117 | The processed document 118 | """ 119 | if "source" not in doc or "data" not in doc["source"]: 120 | return doc 121 | 122 | try: 123 | # Try to parse the source data as multilingual data 124 | original_data = doc["source"]["data"] 125 | parsed_data = parse_multilingual_data(original_data) 126 | 127 | # Format the data based on the parsed result 128 | text_list = [] 129 | if "ar" in parsed_data: 130 | text_list.append(f"Arabic: {parsed_data['ar']}") 131 | if "en" in parsed_data: 132 | text_list.append(f"English: {parsed_data['en']}") 133 | if not text_list and "text" in parsed_data: 134 | text_list.append(f"Text: {parsed_data['text']}") 135 | 136 | # Set the source data to the formatted text 137 | if text_list: 138 | doc["source"]["data"] = "\n\n".join(text_list) 139 | 140 | except Exception as e: 141 | logger.error(f"Error processing document source data: {e}") 142 | # Try a simple fallback 143 | try: 144 | original_text = doc["source"]["data"] 145 | if isinstance(original_text, str): 146 | # Just prefix with "Text:" to maintain expected format 147 | doc["source"]["data"] = f"Text: {original_text}" 148 | except Exception: 149 | pass 150 | 151 | return doc -------------------------------------------------------------------------------- /src/ansari/util/translation.py: -------------------------------------------------------------------------------- 1 | # Translation utility for Ansari using Claude models 2 | 3 | import anthropic 4 | from typing import Dict, Optional 5 | import asyncio 6 | import json 7 | 8 | from ansari.ansari_logger import get_logger 9 | from ansari.config import 
def translate_text(
    text: str, target_lang: str, source_lang: Optional[str] = None, model: str = "claude-3-5-haiku-20241022"
) -> str:
    """Translate text into target_lang using a Claude model (latest Haiku by default).

    Args:
        text (str): The text to translate
        target_lang (str): Target language code (e.g., "ar", "en") or name (e.g., "Arabic", "English")
        source_lang (Optional[str], optional): Source language code or name. If None, auto-detected.
        model (str, optional): Claude model to use. Defaults to "claude-3-5-haiku-20241022".

    Returns:
        str: The translated text; empty string for empty input, and the
        original text when source and target languages match.

    Raises:
        Exception: Propagates any error raised by the Anthropic client.
    """
    if not text:
        return ""

    # Build the Anthropic client from the configured API key.
    settings = get_settings()
    client = anthropic.Anthropic(api_key=settings.ANTHROPIC_API_KEY.get_secret_value())

    # Auto-detect the source language when the caller did not supply one.
    if not source_lang:
        source_lang = get_language_from_text(text)

    # Nothing to translate when the text is already in the target language.
    if source_lang == target_lang:
        return text

    try:
        reply = client.messages.create(
            model=model,
            max_tokens=1024,
            temperature=0.0,
            system=(
                "You are a professional translator. Translate the text accurately while preserving meaning, tone, "
                "and formatting. Only return the translation, nothing else."
            ),
            messages=[{"role": "user", "content": f"Translate this text from {source_lang} to {target_lang}:\n\n{text}"}],
        )
        return reply.content[0].text.strip()

    except Exception as e:
        logger.error(f"Translation error: {str(e)}")
        raise
97 | 98 | Args: 99 | text_entries: Dictionary mapping language codes to text 100 | e.g., {"ar": "النص العربي", "en": "English text"} 101 | 102 | Returns: 103 | JSON string representing language-text pairs in the format: 104 | [ 105 | {"lang": "ar", "text": "النص العربي"}, 106 | {"lang": "en", "text": "English translation"} 107 | ] 108 | """ 109 | result = [] 110 | for lang, text in text_entries.items(): 111 | if text: # Only include non-empty text 112 | result.append({"lang": lang, "text": text}) 113 | return json.dumps(result) 114 | 115 | 116 | def parse_multilingual_data(data: str) -> Dict[str, str]: 117 | """Parse a JSON string representing multilingual content into a dictionary. 118 | 119 | This is the reverse of format_multilingual_data. 120 | 121 | Args: 122 | data: JSON string in the format returned by format_multilingual_data 123 | 124 | Returns: 125 | Dictionary mapping language codes to text 126 | e.g., {"ar": "النص العربي", "en": "English text"} 127 | 128 | Raises: 129 | json.JSONDecodeError: If the data is not valid JSON 130 | ValueError: If the data is not in the expected format 131 | """ 132 | try: 133 | parsed = json.loads(data) 134 | if not isinstance(parsed, list): 135 | raise ValueError("Expected a JSON array") 136 | 137 | result = {} 138 | for item in parsed: 139 | if not isinstance(item, dict) or "lang" not in item or "text" not in item: 140 | raise ValueError("Expected items with 'lang' and 'text' fields") 141 | result[item["lang"]] = item["text"] 142 | return result 143 | 144 | except json.JSONDecodeError: 145 | raise 146 | except Exception as e: 147 | raise ValueError(f"Invalid multilingual data format: {str(e)}") 148 | -------------------------------------------------------------------------------- /test_ansari_claude.py: -------------------------------------------------------------------------------- 1 | #\!/usr/bin/env python 2 | 3 | import sys 4 | 5 | from ansari.agents.ansari_claude import AnsariClaude 6 | from ansari.config import 
get_settings 7 | 8 | def test_message_structure(): 9 | """Test that message history conversion works properly.""" 10 | settings = get_settings() 11 | agent = AnsariClaude(settings) 12 | 13 | # Setup test message history with mixed formats 14 | message_history = [ 15 | {"role": "user", "content": "Hello, this is a test"}, 16 | {"role": "assistant", "content": "This is a plain text response"}, # Plain string content 17 | {"role": "user", "content": "What is the definition of Tashahhud?"} 18 | ] 19 | 20 | # Process through replace_message_history 21 | try: 22 | generator = agent.replace_message_history(message_history) 23 | # Just run through the generator to process it 24 | for _ in generator: 25 | pass 26 | print("Test passed - no errors in message processing") 27 | except Exception as e: 28 | print(f"Test failed with error: {e}") 29 | sys.exit(1) 30 | 31 | if __name__ == "__main__": 32 | test_message_structure() 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ansari-project/ansari-backend/accfce869dd1a98c02eb4e931985525b91632c7d/tests/__init__.py -------------------------------------------------------------------------------- /tests/ask-question-en.txt: -------------------------------------------------------------------------------- 1 | Select the best answer to the following question: 2 | 3 | {{ question }} 4 | 5 | from the list of options (one option per line) 6 | 7 | {% for option in options %}{{ option }} 8 | {% endfor %} 9 | 10 | Output your answer in json format, with a single field, "answer" 11 | and the answer to the above question. 
12 | -------------------------------------------------------------------------------- /tests/integration/README.md: -------------------------------------------------------------------------------- 1 | # Ansari Integration Tests 2 | 3 | This directory contains integration tests for Ansari and its various implementations. The goal 4 | is to test different Ansari implementations with the same test cases to ensure consistent behavior. 5 | 6 | ## Test Structure 7 | 8 | The integration tests are organized as follows: 9 | 10 | 1. `test_helpers.py` - Contains helper functions used by the other test files 11 | 2. `test_ansari_generic.py` - Contains generic test cases that can be applied to any Ansari implementation 12 | 3. `test_ansari_integration.py` - Tests specifically targeting the base Ansari implementation 13 | 4. `test_claude_integration.py` - Tests specifically targeting the AnsariClaude implementation 14 | 15 | ## Generic Testing Framework 16 | 17 | The `test_ansari_generic.py` module provides a reusable testing framework through the `AnsariTester` class. 18 | This allows running the same test cases against different Ansari implementations to ensure consistent behavior. 19 | 20 | ```python 21 | from tests.integration.test_ansari_generic import AnsariTester 22 | from ansari.agents.ansari import Ansari 23 | 24 | # Create a tester for a specific implementation 25 | tester = AnsariTester(Ansari) 26 | 27 | # Run a specific test 28 | tester.test_simple_conversation() 29 | 30 | # Run all tests 31 | results = tester.run_all_tests() 32 | ``` 33 | 34 | ## Test Cases 35 | 36 | The following test cases are implemented: 37 | 38 | 1. **Simple Conversation** - Tests a basic conversation flow with no tools/references 39 | 2. **Conversation with References** - Tests a conversation that should trigger tool usage for references 40 | 3. **Multi-turn Conversation** - Tests context retention across multiple conversation turns 41 | 4. 
**Message Reconstruction** - Tests the database storage and reconstruction of messages 42 | 43 | ## Running the Tests 44 | 45 | To run the integration tests: 46 | 47 | ```bash 48 | # Run all integration tests 49 | pytest tests/integration/ -m integration 50 | 51 | # Run specific test file 52 | pytest tests/integration/test_ansari_generic.py -m integration 53 | 54 | # Run a specific test case 55 | pytest tests/integration/test_ansari_generic.py::test_simple_conversation_all_agents -v 56 | ``` 57 | 58 | ## Adding New Implementations 59 | 60 | To test a new Ansari implementation: 61 | 62 | 1. Create a new test file (e.g., `test_new_impl_integration.py`) 63 | 2. Import the `AnsariTester` from `test_ansari_generic.py` 64 | 3. Create a fixture that returns an `AnsariTester` for your implementation 65 | 4. Add tests using the tester instance 66 | 5. Add the new implementation to the parametrized tests in `test_ansari_generic.py` 67 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/integration/test_ansari_integration.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from ansari.agents.ansari import Ansari 6 | from ansari.ansari_db import MessageLogger, SourceType 7 | from ansari.ansari_logger import get_logger 8 | from ansari.config import Settings 9 | from tests.integration.test_ansari_generic import AnsariTester, IntegrationMessageLogger, MockDatabase 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | @pytest.fixture 15 | def settings(): 16 | settings = Settings() 17 | return settings 18 | 19 | 20 | @pytest.fixture 21 | def message_logger(): 22 | return IntegrationMessageLogger() 23 | 24 | 25 | @pytest.fixture 26 | def mock_db(): 27 | return 
    @pytest.mark.integration
    def test_full_reconstruction_cycle(self, settings, mock_db):
        """Test the full cycle: Message creation → Database storage → Retrieval → Reconstruction

        Runs a real query through the base Ansari agent while logging every
        message into a mock database, then verifies that the stored rows can
        be converted back into structurally valid LLM messages.
        """
        logger.info("Testing full message reconstruction cycle")

        # Create logger that uses our mock database
        message_logger = MessageLogger(mock_db, SourceType.WEB, 1, 1)

        # Create the agent
        agent = Ansari(settings=settings, message_logger=message_logger)

        # Process a query likely to use tools
        # (drain the generator; only the logged side effects matter here)
        for _ in agent.process_input("What does Surah Al-Baqarah say about fasting?"):
            pass

        # Verify we have messages in the agent's history
        assert len(agent.message_history) > 0, "No messages in agent history"

        # Get the stored messages from the mock DB
        stored_messages = mock_db.get_stored_messages()
        assert len(stored_messages) > 0, "No messages stored in mock database"

        # Reconstruct messages using the convert_message_llm method
        # (one stored row may expand into several LLM-format messages)
        reconstructed_messages = []
        for msg in stored_messages:
            reconstructed_msgs = mock_db.convert_message_llm(msg)
            reconstructed_messages.extend(reconstructed_msgs)

        # Verify reconstructed messages match agent's history in structure
        assert len(reconstructed_messages) > 0, "No messages were reconstructed"

        # Check each message for structural validity
        # (every LLM message must carry at least a role and content)
        for msg in reconstructed_messages:
            assert "role" in msg, "Reconstructed message missing role"
            assert "content" in msg, "Reconstructed message missing content"
reconstructed = mock_db.convert_message_llm(tool_msg) 122 | assert len(reconstructed) == 1, "Should have one reconstructed message" 123 | assert reconstructed[0]["role"] == "assistant", "Role should be preserved" 124 | # For Claude, content should be a list of objects 125 | assert isinstance(reconstructed[0]["content"], list), "Content should be a list for Claude format" 126 | 127 | # Test Case 3: Message with tool results 128 | tool_result_msg = ( 129 | "tool", 130 | "Tool result text", 131 | "search_quran", 132 | json.dumps({"id": "123", "internal_message": "Internal message", "tool_message": "Tool message"}), 133 | None, 134 | ) 135 | reconstructed = mock_db.convert_message_llm(tool_result_msg) 136 | assert len(reconstructed) == 1, "Should have one reconstructed message" 137 | assert reconstructed[0]["role"] == "tool", "Role should be preserved" 138 | 139 | # For Claude, this could be either a string or an object depending on format 140 | content = reconstructed[0]["content"] 141 | if isinstance(content, str): 142 | assert "Tool result text" in content, "Content should contain the tool result text" 143 | else: 144 | assert isinstance(content, dict), "Content should be a dictionary if not a string" 145 | assert "name" in content, "Content dictionary should have a name" 146 | 147 | 148 | @pytest.mark.integration 149 | def test_run_all_ansari_tests(settings): 150 | """Run all tests for base Ansari using the generic tester""" 151 | tester = AnsariTester(Ansari, settings) 152 | results = tester.run_all_tests() 153 | assert all(results), "All tests should pass for base Ansari" 154 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the Ansari project.""" 2 | -------------------------------------------------------------------------------- /tests/unit/test_ansari_claude_empty_text_block.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock, patch 3 | 4 | from ansari.agents.ansari_claude import AnsariClaude 5 | from ansari.config import Settings 6 | 7 | 8 | class TestAnsariClaudeEmptyTextBlock(unittest.TestCase): 9 | """Test that empty text blocks are not created in AnsariClaude responses.""" 10 | 11 | def setUp(self): 12 | """Set up test fixtures.""" 13 | # Mock settings 14 | self.settings = Settings() 15 | self.settings.ANTHROPIC_MODEL = "test-model" 16 | self.settings.ANTHROPIC_API_KEY = "test-key" 17 | self.settings.MAX_FAILURES = 1 18 | 19 | # Create message logger mock 20 | self.message_logger = MagicMock() 21 | 22 | # Patch anthropic module 23 | self.patcher = patch("anthropic.Anthropic") 24 | self.mock_anthropic = self.patcher.start() 25 | self.mock_client = MagicMock() 26 | self.mock_anthropic.return_value = self.mock_client 27 | 28 | # Create instance with mocks 29 | self.agent = AnsariClaude(self.settings, self.message_logger) 30 | 31 | # Setup history with a user message 32 | self.agent.message_history = [{"role": "user", "content": [{"type": "text", "text": "test question"}]}] 33 | 34 | def tearDown(self): 35 | """Clean up after tests.""" 36 | self.patcher.stop() 37 | 38 | def test_tool_use_empty_text(self): 39 | """Test that _finish_response doesn't create empty text blocks during tool_use.""" 40 | # Mock direct call to _finish_response with empty text and tool calls 41 | assistant_text = "" # Empty text 42 | tool_calls = [{"type": "tool_use", "id": "tool_123", "name": "test_tool", "input": {"query": "test"}}] 43 | 44 | # Setup tool handling mock 45 | self.agent.tool_name_to_instance = {"test_tool": MagicMock()} 46 | self.agent.tool_name_to_instance["test_tool"].run = MagicMock(return_value=[]) 47 | self.agent.tool_name_to_instance["test_tool"].format_as_tool_result = MagicMock(return_value=[]) 48 | 
self.agent.tool_name_to_instance["test_tool"].format_as_ref_list = MagicMock(return_value=[]) 49 | self.agent.process_tool_call = MagicMock(return_value=([], [])) 50 | 51 | # Call the method directly 52 | self.agent._finish_response(assistant_text, tool_calls) 53 | 54 | # Check that no empty text blocks were created 55 | for msg in self.agent.message_history: 56 | if msg["role"] == "assistant": 57 | for block in msg.get("content", []): 58 | if block.get("type") == "text": 59 | self.assertNotEqual("", block.get("text", "non-empty"), "Empty text block found in message") 60 | 61 | def test_tool_use_stop_reason_handling(self): 62 | """Test that we handle the 'tool_use' stop reason correctly without creating empty text blocks.""" 63 | # Mock the _finish_response method to check how it's called 64 | self.agent._finish_response = MagicMock(return_value=None) 65 | self.agent.process_tool_call = MagicMock(return_value=([], [])) 66 | 67 | # Create a message_delta chunk with tool_use stop reason 68 | message_delta = MagicMock() 69 | message_delta.type = "message_delta" 70 | message_delta.delta = MagicMock() 71 | message_delta.delta.stop_reason = "tool_use" 72 | 73 | # Simulate the state with just a tool call 74 | tool_calls = [{"type": "tool_use", "id": "tool_123", "name": "test_tool", "input": {"query": "test"}}] 75 | response_finished = False 76 | 77 | # Create a method to test the chunk handling logic directly 78 | def test_handler(): 79 | # This simulates the chunk handling code in process_one_round 80 | if message_delta.delta.stop_reason == "tool_use": 81 | if not response_finished: 82 | # Process tool calls directly without calling _finish_response 83 | for tc in tool_calls: 84 | self.agent.process_tool_call(tc["name"], tc["input"], tc["id"]) 85 | 86 | # Run the test handler 87 | test_handler() 88 | 89 | # Verify _finish_response was NOT called for tool_use 90 | self.agent._finish_response.assert_not_called() 91 | 92 | # Verify process_tool_call was called instead 93 | 
self.agent.process_tool_call.assert_called_with("test_tool", {"query": "test"}, "tool_123") 94 | 95 | def test_tool_call_error_handling(self): 96 | """Test that tool call errors are properly handled without empty messages.""" 97 | # Set up a tool that will raise an exception 98 | self.agent.tool_name_to_instance = {"test_tool": MagicMock()} 99 | self.agent.tool_name_to_instance["test_tool"].run = MagicMock(side_effect=Exception("Test error")) 100 | self.agent._log_message = MagicMock() # Mock the logging method 101 | 102 | # Remember the initial message history length 103 | initial_length = len(self.agent.message_history) 104 | 105 | # Execute tool call process 106 | tool_calls = [{"type": "tool_use", "id": "tool_123", "name": "test_tool", "input": {"query": "test"}}] 107 | 108 | # Process the tool calls 109 | self.agent._process_tool_calls(tool_calls) 110 | 111 | # Check that an error message was added to the message history 112 | self.assertEqual(len(self.agent.message_history), initial_length + 1) 113 | last_message = self.agent.message_history[-1] 114 | self.assertEqual(last_message["role"], "user") 115 | self.assertEqual(last_message["content"][0]["type"], "tool_result") 116 | self.assertEqual(last_message["content"][0]["tool_use_id"], "tool_123") 117 | self.assertIn("Test error", last_message["content"][0]["content"]) 118 | 119 | # Verify log_message was called 120 | self.agent._log_message.assert_called_once() 121 | -------------------------------------------------------------------------------- /tests/unit/test_ansari_claude_message_sequence.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import sys 3 | import os 4 | from unittest.mock import MagicMock, patch 5 | 6 | # Add the src directory to the path so we can import the modules 7 | src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 8 | sys.path.insert(0, src_path) 9 | 10 | from ansari.agents.ansari_claude import 
AnsariClaude 11 | from ansari.config import Settings 12 | from ansari.ansari_db import MessageLogger 13 | 14 | 15 | def test_message_sequence_with_tool_use(): 16 | """ 17 | Test a complete message sequence with tool use and tool result. 18 | 19 | This tests the following sequence: 20 | 1. Assistant message with tool_use 21 | 2. User message with tool_result 22 | 3. Assistant explanation 23 | 4. Simple user message 24 | """ 25 | # Create a mock settings object 26 | settings = MagicMock(spec=Settings) 27 | settings.ANTHROPIC_MODEL = "claude-3-opus-20240229" 28 | settings.diskcache_dir = "/tmp/diskcache" 29 | settings.MAX_FAILURES = 3 30 | 31 | # Create a mock message logger 32 | message_logger = MagicMock(spec=MessageLogger) 33 | 34 | # Create a mocked AnsariClaude instance with initial tools setup 35 | with patch("anthropic.Anthropic"), patch.object(AnsariClaude, "__init__", return_value=None): 36 | claude = AnsariClaude.__new__(AnsariClaude) 37 | claude.settings = settings 38 | claude.message_logger = message_logger 39 | 40 | # Set needed attributes that would normally be set in __init__ 41 | claude.tools = [] 42 | claude.tool_name_to_instance = {} 43 | claude.citations = [] 44 | claude.message_history = [] 45 | claude.client = MagicMock() 46 | 47 | # Create a unique tool ID 48 | tool_id = str(uuid.uuid4()) 49 | 50 | # Setup a message sequence with tool use and tool result 51 | claude.message_history = [ 52 | # 1. Assistant message with tool_use 53 | { 54 | "role": "assistant", 55 | "content": [ 56 | {"type": "text", "text": "Let me search for that information."}, 57 | {"type": "tool_use", "id": tool_id, "name": "search_quran", "input": {"query": "mercy in quran"}}, 58 | ], 59 | }, 60 | # 2. User message with tool_result 61 | { 62 | "role": "user", 63 | "content": [{"type": "tool_result", "tool_use_id": tool_id, "content": "Found 114 verses mentioning mercy."}], 64 | }, 65 | # 3. 
Assistant explanation 66 | { 67 | "role": "assistant", 68 | "content": [{"type": "text", "text": "I found that the concept of mercy appears frequently in the Quran."}], 69 | }, 70 | # 4. Simple user message 71 | {"role": "user", "content": "Can you tell me more about that?"}, 72 | ] 73 | 74 | # Process the message history 75 | # Mock the API response to avoid actual API calls 76 | mock_response = MagicMock() 77 | claude.client.messages.create.return_value = mock_response 78 | 79 | # Create a more sophisticated mock for process_one_round 80 | # This will append an assistant message to message_history to break the loop 81 | def mock_process_one_round(*args, **kwargs): 82 | # Add an assistant message to break the loop in process_message_history 83 | claude.message_history.append( 84 | {"role": "assistant", "content": [{"type": "text", "text": "I've processed your request."}]} 85 | ) 86 | return ["I've processed your request."] 87 | 88 | claude.process_one_round = MagicMock(side_effect=mock_process_one_round) 89 | 90 | # Run the message processing 91 | # Make a copy of the history for comparison 92 | original_history = [msg.copy() for msg in claude.message_history] 93 | 94 | # Process the message history 95 | list(claude.process_message_history(use_tool=False)) 96 | 97 | # Check that the message history structure was preserved 98 | processed_history = claude.message_history 99 | 100 | # Compare the content of each message to ensure the structure is maintained 101 | for i, (orig, processed) in enumerate(zip(original_history, processed_history)): 102 | # Check that roles match 103 | assert orig["role"] == processed["role"], f"Role mismatch at message {i}" 104 | 105 | # For assistant messages, ensure content remains a list of blocks 106 | if orig["role"] == "assistant": 107 | assert isinstance(processed["content"], list), f"Assistant content should be a list at message {i}" 108 | 109 | # Check for tool_use blocks 110 | orig_tool_blocks = [b for b in orig["content"] if 
b.get("type") == "tool_use"] 111 | processed_tool_blocks = [b for b in processed["content"] if b.get("type") == "tool_use"] 112 | 113 | assert len(orig_tool_blocks) == len(processed_tool_blocks), f"Tool use blocks count mismatch at message {i}" 114 | 115 | # If there are tool blocks, check that IDs are preserved 116 | if orig_tool_blocks: 117 | assert orig_tool_blocks[0]["id"] == processed_tool_blocks[0]["id"], f"Tool use ID mismatch at message {i}" 118 | 119 | # For user messages with tool_result, ensure structure is maintained 120 | if orig["role"] == "user" and isinstance(orig["content"], list): 121 | # Check that content is still a list 122 | assert isinstance(processed["content"], list), f"User tool_result content should remain a list at message {i}" 123 | 124 | # Check for tool_result blocks 125 | orig_result_blocks = [b for b in orig["content"] if b.get("type") == "tool_result"] 126 | processed_result_blocks = [b for b in processed["content"] if b.get("type") == "tool_result"] 127 | 128 | assert len(orig_result_blocks) == len( 129 | processed_result_blocks 130 | ), f"Tool result blocks count mismatch at message {i}" 131 | 132 | # If there are result blocks, check that IDs are preserved 133 | if orig_result_blocks: 134 | assert ( 135 | orig_result_blocks[0]["tool_use_id"] == processed_result_blocks[0]["tool_use_id"] 136 | ), f"Tool result ID mismatch at message {i}" 137 | 138 | print("All assertions passed - message sequence with tool use/result is correctly processed!") 139 | 140 | 141 | if __name__ == "__main__": 142 | test_message_sequence_with_tool_use() 143 | -------------------------------------------------------------------------------- /tests/unit/test_ansari_claude_tool_sequence.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import sys 3 | import os 4 | 5 | # Add the src directory to the path so we can import the modules 6 | src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 
"../../")) 7 | sys.path.insert(0, src_path) 8 | 9 | from unittest.mock import MagicMock, patch 10 | from ansari.agents.ansari_claude import AnsariClaude 11 | from ansari.config import Settings 12 | 13 | 14 | def test_process_message_history_with_tools(): 15 | """Test that message processing correctly handles tool_use and tool_result relationships.""" 16 | 17 | # Create mock settings 18 | settings = MagicMock(spec=Settings) 19 | settings.ANTHROPIC_MODEL = "claude-3-opus-20240229" 20 | 21 | # Create a unique tool ID for testing 22 | tool_id = str(uuid.uuid4()) 23 | invalid_tool_id = str(uuid.uuid4()) # This won't match any tool_use block 24 | 25 | # Create a test message history with tool use and tool result 26 | test_history = [ 27 | # Message 1: Assistant with tool_use 28 | { 29 | "role": "assistant", 30 | "content": [ 31 | {"type": "text", "text": "Let me search for that information."}, 32 | {"type": "tool_use", "id": tool_id, "name": "search_quran", "input": {"query": "mercy in quran"}}, 33 | ], 34 | }, 35 | # Message 2: User with tool_result (valid tool_use_id) 36 | { 37 | "role": "user", 38 | "content": [{"type": "tool_result", "tool_use_id": tool_id, "content": "Found 114 verses mentioning mercy."}], 39 | }, 40 | # Message 3: User with tool_result (invalid tool_use_id) 41 | { 42 | "role": "user", 43 | "content": [{"type": "tool_result", "tool_use_id": invalid_tool_id, "content": "This should be filtered out"}], 44 | }, 45 | # Message 4: User with simple text 46 | {"role": "user", "content": "Tell me more about mercy in the Quran."}, 47 | ] 48 | 49 | # Mock the necessary parts of AnsariClaude 50 | with ( 51 | patch.object(AnsariClaude, "__init__", return_value=None), 52 | patch.object(AnsariClaude, "process_one_round", return_value=[]), 53 | ): 54 | claude = AnsariClaude.__new__(AnsariClaude) 55 | claude.settings = settings 56 | claude.message_history = test_history.copy() 57 | claude.message_logger = None 58 | claude.client = MagicMock() 59 | 60 | # Add a 
final assistant response to avoid infinite loop 61 | def add_assistant_response(*args, **kwargs): 62 | if len(claude.message_history) > 0 and claude.message_history[-1]["role"] == "user": 63 | claude.message_history.append({"role": "assistant", "content": [{"type": "text", "text": "Test response"}]}) 64 | return [] 65 | 66 | claude.process_one_round = MagicMock(side_effect=add_assistant_response) 67 | 68 | # Run the message processing 69 | list(claude.process_message_history(use_tool=False)) 70 | 71 | # Verify the results 72 | processed_history = claude.message_history 73 | 74 | # Message 1 (assistant with tool_use) should keep its structure 75 | assert processed_history[0]["role"] == "assistant" 76 | assert len(processed_history[0]["content"]) == 2 77 | assert processed_history[0]["content"][0]["type"] == "text" 78 | assert processed_history[0]["content"][1]["type"] == "tool_use" 79 | assert processed_history[0]["content"][1]["id"] == tool_id 80 | 81 | # Message 2 (user with valid tool_result) should keep its structure 82 | assert processed_history[1]["role"] == "user" 83 | assert isinstance(processed_history[1]["content"], list) 84 | assert len(processed_history[1]["content"]) == 1 85 | assert processed_history[1]["content"][0]["type"] == "tool_result" 86 | assert processed_history[1]["content"][0]["tool_use_id"] == tool_id 87 | 88 | # Message 3 (user with invalid tool_result) should be filtered 89 | assert processed_history[2]["role"] == "user" 90 | if isinstance(processed_history[2]["content"], list): 91 | assert len(processed_history[2]["content"]) == 0 92 | else: 93 | assert isinstance(processed_history[2]["content"], str) 94 | 95 | # Message 4 (user with simple text) should remain unchanged 96 | assert processed_history[3]["role"] == "user" 97 | assert processed_history[3]["content"] == "Tell me more about mercy in the Quran." 
98 | 99 | print("All assertions passed - message processing correctly handled tool relationships!") 100 | 101 | 102 | if __name__ == "__main__": 103 | test_process_message_history_with_tools() 104 | -------------------------------------------------------------------------------- /tests/unit/test_answer_quality.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pandas as pd 4 | import pytest 5 | from jinja2 import Environment, FileSystemLoader 6 | 7 | from ansari.agents import Ansari 8 | from ansari.ansari_logger import get_logger 9 | from ansari.config import get_settings 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | @pytest.fixture(scope="module") 15 | def data(): 16 | tenv = Environment(loader=FileSystemLoader("src/ansari/resources/templates/")) 17 | # NOTE (optional): the content inside the `ask_question.txt` is written in the Jinja2 template language 18 | # Refer to this for details: https://www.packetcoders.io/an-introduction-to-jinja2-for-network-automation/ 19 | q_temp = tenv.get_template("ask_question.txt") 20 | df = pd.read_csv("tests/batik-v1-en.csv") 21 | cache = {} 22 | return df, q_temp, cache 23 | 24 | 25 | def answer_question(question, q_temp, cache): 26 | logger.info(f"Answering question: {question['question']}") 27 | options = [o.strip() for o in question["options"].split(",")] 28 | prompt = q_temp.render(question=question["question"], options=options) 29 | if prompt in cache.keys(): 30 | logger.info(f"Found {question['question']} in cache") 31 | return cache[prompt] 32 | ansari = Ansari(get_settings()) 33 | result = "".join(filter(lambda x: x is not None, ansari.process_input(prompt))) 34 | logger.info(f"Answer: {result}") 35 | cache[prompt] = result 36 | return result 37 | 38 | 39 | def extract_prediction(row): 40 | try: 41 | raw = row["json_prediction"] 42 | raw = raw.replace("```", "") 43 | raw = raw.replace("json", "") 44 | raw = "{" + raw.split("{")[1] 45 | raw = 
raw.split("}")[0] + "}" 46 | raw = raw.strip() 47 | logger.info(f"raw is: {raw}") 48 | raw_dict = json.loads(raw) 49 | return str(raw_dict["answer"]) 50 | except IndexError: 51 | return "OUTPUT_PARSE_ERROR" 52 | 53 | 54 | def is_correct(row): 55 | clean_prediction = row["predicted"].strip().lower() 56 | clean_correct = row["correct"].replace("Ayah", "").strip().lower() 57 | return clean_correct == clean_prediction 58 | 59 | 60 | def test_ansari_agent(data): 61 | df, q_temp, cache = data 62 | df = df.sample(10) 63 | # For cost and efficiency reasons, we will only test 10 questions 64 | df["json_prediction"] = df.apply( 65 | lambda row: answer_question(row, q_temp, cache), 66 | axis=1, 67 | ) 68 | df["predicted"] = df.apply(extract_prediction, axis=1) 69 | df["correct_prediction"] = df.apply(is_correct, axis=1) 70 | correct_percentage = df["correct_prediction"].mean() * 100 71 | logger.info(f"Percentage of correct predictions: {correct_percentage:.2f}%") 72 | 73 | wrong_predictions = df[~df["correct_prediction"]] 74 | if not wrong_predictions.empty: 75 | logger.info("\nQuestions with wrong predictions:") 76 | for index, row in wrong_predictions.iterrows(): 77 | logger.info(f"Question: {row['question']}") 78 | logger.info(f"Correct Answer: {row['correct']}") 79 | logger.info(f"Predicted Answer: {row['predicted']}") 80 | logger.info("---------------------------------------") 81 | 82 | assert correct_percentage >= 80, f"Correct predictions percentage ({correct_percentage:.2f}%) is less than 95%" 83 | -------------------------------------------------------------------------------- /tests/unit/test_citation_formatting.py: -------------------------------------------------------------------------------- 1 | """Tests for citation formatting in Quran and Hadith searches.""" 2 | 3 | import json 4 | import unittest 5 | from unittest.mock import MagicMock, patch 6 | 7 | from ansari.agents.ansari_claude import AnsariClaude 8 | from ansari.config import Settings 9 | from 
ansari.tools.search_hadith import SearchHadith 10 | from ansari.tools.search_quran import SearchQuran 11 | 12 | 13 | class TestCitationFormatting(unittest.TestCase): 14 | """Tests to verify that citations are properly formatted without JSON data.""" 15 | 16 | def setUp(self): 17 | """Set up test fixtures.""" 18 | # Create mock settings 19 | self.settings = Settings( 20 | OPENAI_API_KEY="mock-openai-key", 21 | ANTHROPIC_API_KEY="mock-anthropic-key", 22 | KALEMAT_API_KEY="mock-kalemat-key", 23 | ANTHROPIC_MODEL="claude-3-opus-20240229", 24 | DEV_MODE=True, 25 | ) 26 | 27 | # Create mock MessageLogger 28 | self.message_logger = MagicMock() 29 | 30 | # Initialize an AnsariClaude agent with mock settings and logger 31 | self.agent = AnsariClaude(self.settings, self.message_logger) 32 | 33 | @patch("ansari.tools.search_quran.SearchQuran.run") 34 | def test_quran_search_sleeplessness_citation_format(self, mock_run): 35 | """Test that Quran search for 'sleeplessness' properly formats data as JSON in citations.""" 36 | # Mock the API response for Quran search 37 | mock_results = [ 38 | { 39 | "id": "25:47", 40 | "text": "وَهُوَ ٱلَّذِى جَعَلَ لَكُمُ ٱلَّيْلَ لِبَاسًا وَٱلنَّوْمَ سُبَاتًا وَجَعَلَ ٱلنَّهَارَ نُشُورًا", 41 | "en_text": """He is the One Who has made the night for you as a cover, 42 | and made sleep for resting, and the day for rising.""", 43 | }, 44 | {"id": "78:9", "text": "وَجَعَلْنَا نَوْمَكُمْ سُبَاتًا", "en_text": "and made your sleep for rest,"}, 45 | ] 46 | mock_run.return_value = mock_results 47 | 48 | # Create a Quran search tool instance 49 | quran_tool = SearchQuran(kalimat_api_key="mock-key") 50 | 51 | # Get ref_list from the tool 52 | ref_list = quran_tool.format_as_ref_list(mock_results) 53 | 54 | # Check that the data field doesn't contain JSON 55 | for doc in ref_list: 56 | self.assertIsInstance(doc, dict) 57 | self.assertIn("source", doc) 58 | self.assertIn("data", doc["source"]) 59 | data = doc["source"]["data"] 60 | 61 | # Verify data is 
valid JSON format 62 | try: 63 | parsed_data = json.loads(data) 64 | self.assertIsInstance(parsed_data, list) 65 | 66 | # Check that it contains language-text entries 67 | self.assertTrue(len(parsed_data) > 0) 68 | self.assertIn("lang", parsed_data[0]) 69 | self.assertIn("text", parsed_data[0]) 70 | 71 | # If we have an Arabic entry, verify it matches one of the mock texts 72 | for item in parsed_data: 73 | if item["lang"] == "ar": 74 | self.assertTrue( 75 | item["text"] == mock_results[0]["text"] or item["text"] == mock_results[1]["text"], 76 | f"Expected Arabic text to match mock data, but got: {item['text']}", 77 | ) 78 | except json.JSONDecodeError: 79 | self.fail(f"Data should be valid JSON but got: {data}") 80 | 81 | @patch("ansari.tools.search_hadith.SearchHadith.run") 82 | def test_hadith_search_day_of_judgment_citation_format(self, mock_run): 83 | """Test that Hadith search for 'signs of the day of judgment' doesn't return JSON in citations.""" 84 | # Mock the API response for Hadith search 85 | mock_results = [ 86 | { 87 | "id": "1_2_37_50", 88 | "source_book": "Bukhari", 89 | "chapter_number": "2", 90 | "chapter_english": "Belief", 91 | "section_number": "37", 92 | "section_english": "The asking of Jibreel about Iman, Islam, Ihsan", 93 | "hadith_number": "50", 94 | "ar_text": "عَنْ أَبِي هُرَيْرَةَ، قَالَ كَانَ النَّبِيُّ صلى الله عليه وسلم بَارِزًا يَوْمًا لِلنَّاسِ...", 95 | "en_text": """Narrated Abu Huraira: One day while the Prophet (ﷺ) was sitting in the company of some people, 96 | (The angel) Gabriel came and asked, "What is faith?"...""", 97 | "grade_en": "Sahih-Authentic", 98 | }, 99 | { 100 | "id": "3_39_1598_4178", 101 | "source_book": "AbuDaud", 102 | "chapter_number": "39", 103 | "chapter_english": "Battles", 104 | "section_number": "1598", 105 | "section_english": "Signs of the hour", 106 | "hadith_number": "4178", 107 | "ar_text": "قال رسول الله صلى الله عليه وسلم: لا تقوم الساعة حتى تكون عشر آيات...", 108 | "en_text": """The Messenger of 
Allah (peace be upon him) said: 109 | The last hour will not come or happen until there appear ten signs before it...""", 110 | "grade_en": "Sahih - Authentic", 111 | }, 112 | ] 113 | mock_run.return_value = mock_results 114 | 115 | # Create a Hadith search tool instance 116 | hadith_tool = SearchHadith(kalimat_api_key="mock-key") 117 | 118 | # Get ref_list from the tool 119 | ref_list = hadith_tool.format_as_ref_list(mock_results) 120 | 121 | # Check that the data field doesn't contain JSON 122 | for doc in ref_list: 123 | self.assertIsInstance(doc, dict) 124 | self.assertIn("source", doc) 125 | self.assertIn("data", doc["source"]) 126 | data = doc["source"]["data"] 127 | 128 | # Verify data is valid JSON format 129 | try: 130 | parsed_data = json.loads(data) 131 | self.assertIsInstance(parsed_data, list) 132 | 133 | # Check that it contains language-text entries 134 | self.assertTrue(len(parsed_data) > 0) 135 | self.assertIn("lang", parsed_data[0]) 136 | self.assertIn("text", parsed_data[0]) 137 | 138 | # Verify text content if we have Arabic or English entries 139 | for item in parsed_data: 140 | if item["lang"] == "ar": 141 | self.assertTrue( 142 | item["text"] == mock_results[0]["ar_text"] or item["text"] == mock_results[1]["ar_text"], 143 | f"Expected Arabic text to match mock data, but got: {item['text']}", 144 | ) 145 | elif item["lang"] == "en": 146 | self.assertTrue( 147 | item["text"] == mock_results[0]["en_text"] or item["text"] == mock_results[1]["en_text"], 148 | f"Expected English text to match mock data, but got: {item['text']}", 149 | ) 150 | except json.JSONDecodeError: 151 | self.fail(f"Data should be valid JSON but got: {data}") 152 | -------------------------------------------------------------------------------- /tests/unit/test_convert_message_llm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for message formatting in convert_message_llm. 
"""Tests for message formatting in convert_message_llm."""

import json
import uuid


def simple_convert_message_llm(msg) -> list[dict]:
    """A simplified stand-in for convert_message_llm, isolated for testing.

    Mirrors the production logic without any database connection: assistant
    messages are normalized to a list of Claude-style content blocks (text plus
    an optional tool_use block), while every other role passes its content
    through unchanged.
    """
    message_id = str(uuid.uuid4())
    role = msg[0]
    content = msg[1]
    tool_name = msg[2]
    tool_details = msg[3]

    # Non-assistant messages keep their content exactly as stored.
    if role != "assistant":
        return [{"id": message_id, "role": role, "content": content}]

    # Normalize the assistant content into a list of typed blocks.
    if isinstance(content, str):
        blocks = [{"type": "text", "text": content}]
    elif isinstance(content, list) and all(isinstance(block, dict) and "type" in block for block in content):
        blocks = content
    else:
        blocks = [{"type": "text", "text": str(content)}]

    # Append a tool_use block when the stored row carries tool information.
    if tool_name and tool_details:
        details = json.loads(tool_details)
        tool_id = details.get("id")
        if tool_id and tool_name:
            blocks.append({"type": "tool_use", "id": tool_id, "name": tool_name, "input": details.get("args")})

    return [{"id": message_id, "role": role, "content": blocks}]


def test_convert_message_llm_formats():
    """Test message formatting in convert_message_llm."""

    # Test 1: Simple user message
    user_msg = ("user", "Hello, how are you?", None, None, None)
    result = simple_convert_message_llm(user_msg)

    assert len(result) == 1
    assert result[0]["role"] == "user"
    assert result[0]["content"] == "Hello, how are you?"
56 | 57 | # Test 2: Simple assistant message 58 | assistant_msg = ("assistant", "I'm doing well, thank you!", None, None, None) 59 | result = simple_convert_message_llm(assistant_msg) 60 | 61 | assert len(result) == 1 62 | assert result[0]["role"] == "assistant" 63 | assert isinstance(result[0]["content"], list) 64 | assert len(result[0]["content"]) == 1 65 | assert result[0]["content"][0]["type"] == "text" 66 | assert result[0]["content"][0]["text"] == "I'm doing well, thank you!" 67 | 68 | # Test 3: Assistant message with tool use 69 | tool_id = str(uuid.uuid4()) 70 | tool_details_json = json.dumps({"id": tool_id, "args": {"query": "mercy in quran"}}) 71 | 72 | assistant_tool_msg = ("assistant", "Let me search for that", "search_quran", tool_details_json, None) 73 | result = simple_convert_message_llm(assistant_tool_msg) 74 | 75 | assert len(result) == 1 76 | assert result[0]["role"] == "assistant" 77 | assert isinstance(result[0]["content"], list) 78 | 79 | # There should be a text block and a tool block 80 | text_blocks = [b for b in result[0]["content"] if b.get("type") == "text"] 81 | tool_blocks = [b for b in result[0]["content"] if b.get("type") == "tool_use"] 82 | 83 | assert len(text_blocks) == 1, "Should have one text block" 84 | assert len(tool_blocks) == 1, "Should have one tool_use block" 85 | assert text_blocks[0]["text"] == "Let me search for that" 86 | assert tool_blocks[0]["id"] == tool_id 87 | assert tool_blocks[0]["name"] == "search_quran" 88 | 89 | # Test 4: Assistant message with empty text and tool use 90 | empty_tool_id = str(uuid.uuid4()) 91 | empty_tool_details = json.dumps({"id": empty_tool_id, "args": {"query": "test"}}) 92 | 93 | empty_msg = ("assistant", "", "search_quran", empty_tool_details, None) 94 | result = simple_convert_message_llm(empty_msg) 95 | 96 | # The current implementation will include an empty text block 97 | text_blocks = [b for b in result[0]["content"] if b.get("type") == "text"] 98 | tool_blocks = [b for b in 
result[0]["content"] if b.get("type") == "tool_use"] 99 | 100 | assert len(text_blocks) > 0, "Current implementation includes empty text block" 101 | assert text_blocks[0]["text"] == "", "Text block is empty" 102 | assert len(tool_blocks) == 1, "Should have one tool_use block" 103 | 104 | # Note: This test documents the current behavior, which may not be ideal. 105 | # The runtime code in AnsariClaude._finish_response now avoids creating 106 | # assistant messages with empty text blocks, but this database reconstruction 107 | # method still creates them. The test still passes to document this 108 | # difference in behavior. 109 | 110 | # Future enhancement should align the database reconstruction with runtime behavior 111 | # by not including empty text blocks in the content. 112 | 113 | 114 | def test_runtime_vs_database_behavior(): 115 | """Test to document the difference between runtime and database behavior 116 | with empty text blocks.""" 117 | 118 | # This is a helper function that mimics the runtime behavior in AnsariClaude 119 | def runtime_format(text, tool_calls): 120 | if not text and tool_calls: 121 | # Runtime behavior: only include tool calls when text is empty 122 | return {"role": "assistant", "content": tool_calls} 123 | else: 124 | # Include both text and tool calls 125 | content = [{"type": "text", "text": text}] 126 | content.extend(tool_calls) 127 | return {"role": "assistant", "content": content} 128 | 129 | # Create test data 130 | tool_id = str(uuid.uuid4()) 131 | tool_call = {"type": "tool_use", "id": tool_id, "name": "search_quran", "input": {"query": "test"}} 132 | 133 | # Test empty text with tool call 134 | runtime_result = runtime_format("", [tool_call]) 135 | assert runtime_result["role"] == "assistant" 136 | assert len(runtime_result["content"]) == 1, "Runtime: only includes tool call, no empty text block" 137 | assert runtime_result["content"][0]["type"] == "tool_use", "Runtime: only has tool block" 138 | 139 | # Compare with 
def test_logging_changes_dont_break_basic_functionality():
    """
    Smoke test: the logging changes must not break core agent behavior.
    Verifies message validation and logger dispatch on a mocked instance.
    """
    # Mock settings with only the attributes the agent reads.
    settings = MagicMock(spec=Settings)
    settings.ANTHROPIC_MODEL = "claude-3-opus-20240229"
    settings.MAX_FAILURES = 3

    # Build an AnsariClaude instance without running its real __init__.
    with patch("anthropic.Anthropic"), patch.object(AnsariClaude, "__init__", return_value=None):
        agent = AnsariClaude.__new__(AnsariClaude)
        agent.settings = settings
        agent.message_logger = None

        # Attributes that the real __init__ would normally populate.
        agent.tools = []
        agent.tool_name_to_instance = {}
        agent.citations = []
        agent.client = MagicMock()

        # Seed a minimal conversation.
        agent.message_history = [{"role": "user", "content": "Hello, world!"}]

        # Stub out the API round-trip so no real call is made.
        agent.process_one_round = MagicMock(side_effect=lambda: [])

        # Append an assistant turn in the block-based content format.
        agent.message_history.append(
            {"role": "assistant", "content": [{"type": "text", "text": "Hello! How can I help you today?"}]}
        )

        # Basic message validation still works.
        assert agent.validate_message(agent.message_history[-1])

        # Message logging still reaches the logger exactly once.
        agent.message_logger = MagicMock()
        agent._log_message(agent.message_history[-1])
        agent.message_logger.log.assert_called_once()

        print("Test passed - basic functionality works!")
logger = logging.getLogger(__name__)

client = TestClient(app)

# Shared test data: a unique email per run so registration never collides.
valid_email = f"test_{uuid.uuid4()}@example.com"
valid_password = "StrongPassword123!"
first_name = "John"
last_name = "Doe"


@pytest.fixture
def register_and_login_user():
    """Register a fresh user, log in, and return the login response payload."""
    registration_payload = {
        "email": valid_email,
        "password": valid_password,
        "first_name": first_name,
        "last_name": last_name,
    }
    register_response = client.post("/api/v2/users/register", json=registration_payload)
    assert register_response.status_code == 200

    login_response = client.post(
        "/api/v2/users/login",
        json={"email": valid_email, "password": valid_password},
    )
    assert login_response.status_code == 200
    return login_response.json()


@pytest.mark.asyncio
async def test_message_id_in_thread_response(register_and_login_user):
    """Thread responses must include a positive integer ID on every message."""
    token = register_and_login_user["access_token"]
    auth_header = {"Authorization": f"Bearer {token}"}

    # Create a new thread.
    thread_response = client.post("/api/v2/threads", headers=auth_header)
    assert thread_response.status_code == 200
    thread_id = thread_response.json()["thread_id"]

    # Post a message into the thread.
    client.post(
        f"/api/v2/threads/{thread_id}",
        headers=auth_header,
        json={"role": "user", "content": "Test message with ID"},
    )

    # Fetch the thread back and inspect its messages.
    thread_get_response = client.get(f"/api/v2/threads/{thread_id}", headers=auth_header)
    assert thread_get_response.status_code == 200

    thread_data = thread_get_response.json()
    assert "messages" in thread_data
    assert len(thread_data["messages"]) > 0

    # Every message must carry a positive integer ID.
    for message in thread_data["messages"]:
        assert "id" in message, f"Message does not contain ID field: {message}"
        assert isinstance(message["id"], int), f"Message ID is not an integer: {message['id']}"
        assert message["id"] > 0, f"Message ID is not positive: {message['id']}"
f"Message does not contain ID field: {message}" 82 | assert isinstance(message["id"], int), f"Message ID is not an integer: {message['id']}" 83 | assert message["id"] > 0, f"Message ID is not positive: {message['id']}" 84 | 85 | 86 | def test_claude_message_ids_removed(): 87 | """Test that message IDs are removed before sending to Claude.""" 88 | # Create test messages with IDs 89 | messages = [ 90 | {"id": 1, "role": "user", "content": "Hello"}, 91 | {"id": 2, "role": "assistant", "content": [{"type": "text", "text": "Hi there"}]}, 92 | {"id": 3, "role": "user", "content": "How are you?"}, 93 | ] 94 | 95 | # Just test the replace_message_history method directly 96 | # Create a minimal class for testing 97 | class TestAnsariClaude: 98 | def replace_message_history(self, message_history, use_tool=True, stream=True): 99 | # Copy the method implementation from the original class 100 | # Remove message IDs from the history before sending to Claude 101 | cleaned_history = [] 102 | for msg in message_history: 103 | msg_copy = msg.copy() 104 | if "id" in msg_copy: 105 | del msg_copy["id"] 106 | cleaned_history.append(msg_copy) 107 | 108 | self.message_history = cleaned_history 109 | return [] 110 | 111 | # Create an instance of our test class 112 | claude = TestAnsariClaude() 113 | 114 | # Call replace_message_history 115 | claude.replace_message_history(messages) 116 | 117 | # Check that IDs were removed from the message history 118 | for msg in claude.message_history: 119 | assert "id" not in msg, f"Message still contains ID: {msg}" 120 | -------------------------------------------------------------------------------- /tests/unit/test_multilingual_citations.py: -------------------------------------------------------------------------------- 1 | """Tests for multilingual citation format in search tools.""" 2 | 3 | import json 4 | import pytest 5 | from ansari.util.translation import format_multilingual_data, parse_multilingual_data 6 | from ansari.tools.search_mawsuah 
class TestMultilingualFormat:
    """Tests for the multilingual format helper functions."""

    def test_format_multilingual_data(self):
        """Formatting a language->text mapping yields a JSON list of entries."""
        sample = {"ar": "النص العربي", "en": "English text"}
        encoded = format_multilingual_data(sample)
        assert isinstance(encoded, str)

        entries = json.loads(encoded)
        assert isinstance(entries, list)
        assert len(entries) == 2

        # Every entry carries both a language code and its text.
        for entry in entries:
            assert "lang" in entry
            assert "text" in entry

        # Both input languages are represented.
        languages = [entry["lang"] for entry in entries]
        assert "ar" in languages
        assert "en" in languages

    def test_parse_multilingual_data(self):
        """Parsing the JSON form recovers the language->text mapping."""
        encoded = json.dumps([{"lang": "ar", "text": "النص العربي"}, {"lang": "en", "text": "English text"}])
        decoded = parse_multilingual_data(encoded)

        assert isinstance(decoded, dict)
        assert "ar" in decoded
        assert "en" in decoded
        assert decoded["ar"] == "النص العربي"
        assert decoded["en"] == "English text"

    def test_format_parse_roundtrip(self):
        """dict -> JSON string -> dict round-trips without loss."""
        original = {"ar": "النص العربي", "en": "English text", "fr": "Texte français"}
        assert parse_multilingual_data(format_multilingual_data(original)) == original
@pytest.fixture
def mock_search_results_mawsuah():
    """Canned response shaped like the Mawsuah search tool output."""
    return {"search_results": [{"text": "نص عربي للاختبار", "score": 0.95}]}


@pytest.fixture
def mock_search_results_quran():
    """Canned response shaped like the Quran search tool output."""
    verse = {
        "id": "1:1",
        "text": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ",
        "en_text": "In the name of Allah, the Entirely Merciful, the Especially Merciful.",
    }
    return [verse]


@pytest.fixture
def mock_search_results_hadith():
    """Canned response shaped like the Hadith search tool output."""
    hadith = {
        "id": "123",
        "source_book": "Bukhari",
        "chapter_number": "1",
        "chapter_english": "Test Chapter",
        "hadith_number": "456",
        "section_number": "2",
        "section_english": "Test Section",
        "ar_text": "نص حديث عربي",
        "en_text": "English hadith text",
        "grade_en": "Sahih",
    }
    return [hadith]
"source" in doc 133 | assert "data" in doc["source"] 134 | 135 | # Parse the multilingual data 136 | data = parse_multilingual_data(doc["source"]["data"]) 137 | 138 | # Verify it contains Arabic only 139 | assert "ar" in data 140 | assert len(data) == 1 # Only Arabic, no other languages 141 | 142 | def test_quran_format(self, mock_search_results_quran): 143 | """Test that SearchQuran correctly formats bilingual results.""" 144 | # Instantiate with mock values 145 | search = SearchQuran("mock_key") 146 | 147 | # Format the results 148 | formatted = search.format_as_ref_list(mock_search_results_quran) 149 | 150 | # Verify the result 151 | assert isinstance(formatted, list) 152 | assert len(formatted) == 1 153 | doc = formatted[0] 154 | 155 | # Verify document structure 156 | assert doc["type"] == "document" 157 | assert "source" in doc 158 | assert "data" in doc["source"] 159 | 160 | # Parse the multilingual data 161 | data = parse_multilingual_data(doc["source"]["data"]) 162 | 163 | # Verify it contains both Arabic and English 164 | assert "ar" in data 165 | assert "en" in data 166 | assert len(data) == 2 167 | 168 | def test_hadith_format(self, mock_search_results_hadith): 169 | """Test that SearchHadith correctly formats results with metadata.""" 170 | # Instantiate with mock values 171 | search = SearchHadith("mock_key") 172 | 173 | # Format the results 174 | formatted = search.format_as_ref_list(mock_search_results_hadith) 175 | 176 | # Verify the result 177 | assert isinstance(formatted, list) 178 | assert len(formatted) == 1 179 | doc = formatted[0] 180 | 181 | # Verify document structure 182 | assert doc["type"] == "document" 183 | assert "source" in doc 184 | assert "data" in doc["source"] 185 | 186 | # Parse the multilingual data 187 | data = parse_multilingual_data(doc["source"]["data"]) 188 | 189 | # Verify it contains both Arabic and English 190 | assert "ar" in data 191 | assert "en" in data 192 | 193 | # Verify grade is in the title, not in the data 
# Set up logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")


class TestTranslation:
    """Tests for translate_text using the actual Anthropic API."""

    def test_basmalah_translation(self):
        """The Arabic Basmalah translates to a non-empty English string."""
        basmalah = "بسم الله الرحمن الرحيم"
        translated = translate_text(basmalah, "en", "ar")

        logger.info(f"Basmalah translation: '{translated}'")
        assert translated, "Translation should not be empty"

    def test_same_language_translation(self):
        """Identical source/target languages return the input untouched.

        No API call is made when the languages match.
        """
        original = "Hello world"
        assert translate_text(original, "en", "en") == original

    def test_empty_text_translation(self):
        """Empty input yields an empty result without an API call."""
        assert translate_text("", "ar", "en") == ""


if __name__ == "__main__":
    # Allow running this module's tests directly.
    pytest.main(["-xvs", __file__])
logger = get_logger(__name__)


def update_database():
    """Backfill empty text blocks in stored threads with a placeholder.

    Finds threads whose messages contain a content block with an empty
    ``text`` field, replaces each empty text with a placeholder sentence,
    bumps ``updated_at``, and flags the thread with ``empty_content_block``
    so affected documents stay identifiable.

    Errors are logged rather than raised; the Mongo client is always closed.
    """
    # Initialize before the try block so the finally clause never hits a
    # NameError when connection setup itself fails.
    mongo_connection = None
    try:
        settings = get_settings()
        bson_codec_options = CodecOptions(tz_aware=True)
        mongo_connection = pymongo.MongoClient(settings.MONGO_URL)
        mongo_db = mongo_connection[settings.MONGO_DB_NAME]

        threads_collection = mongo_db.get_collection("threads", codec_options=bson_codec_options)

        # Most recently updated threads first, restricted to those with an
        # empty text block anywhere in a message's content array.
        impacted_threads = threads_collection.find(
            {"messages.content": {"$elemMatch": {"text": ""}}}
        ).sort("updated_at", -1)

        for impacted_thread in impacted_threads:
            logger.info(
                f"Empty content message found: {impacted_thread['_id']}, "
                f"last updated: {impacted_thread['updated_at']}"
            )

            for message in impacted_thread["messages"]:
                # Plain-string content cannot contain block dicts; skip it.
                if not isinstance(message["content"], list):
                    continue

                for content in message["content"]:
                    if "text" in content and content["text"] == "":
                        content["text"] = "I'm processing your request."

            update_result = threads_collection.update_one(
                {"_id": impacted_thread["_id"]},
                {
                    "$set": {
                        "messages": impacted_thread["messages"],
                        "updated_at": datetime.now(timezone.utc),
                        "empty_content_block": True,
                    }
                },
            )
            logger.info(
                f"Update result: {update_result.matched_count} matched, "
                f"{update_result.modified_count} modified."
            )

    except Exception as error:
        logger.error(f"Error: {error}")
    finally:
        # Close the client only if it was successfully created.
        if mongo_connection is not None:
            mongo_connection.close()


if __name__ == "__main__":
    update_database()