├── .flake8 ├── .github └── workflows │ ├── build-and-push.yml │ ├── gh-packages-parallel.yml │ └── gh-packages.yaml ├── .gitignore ├── Dockerfile ├── README.md ├── api.py ├── benchmarks └── __init__.py ├── config.json ├── contribution.md ├── deploy.sh ├── devops └── services │ ├── README.md │ ├── ai-tools │ └── setup_ai-tools_service.yml │ ├── nginx │ ├── Dockerfile │ ├── nginx.conf │ └── setup_nginx_service.yml │ └── vault │ ├── docker-compose.yml │ ├── setup_vault_service.yml │ └── vault.json ├── docker-compose-restructure.yml ├── docker-compose.yml ├── faq.md ├── flake8 ├── __init__.py ├── setup.py └── single_word_module.py ├── generate.sh ├── generate_independent_docker.sh ├── poetry.lock ├── pyproject.toml ├── repository_data.json ├── sample.env ├── src ├── __init__.py ├── asr │ ├── README.md │ ├── ai4bharat │ │ ├── streaming │ │ │ └── README.md │ │ └── url │ │ │ ├── README.md │ │ │ ├── deploy.sh │ │ │ ├── models_info.json │ │ │ └── support.py │ ├── fairseq_mms │ │ ├── README.md │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── google │ │ └── remote │ │ │ ├── 1.mp3 │ │ │ ├── 1.wav │ │ │ ├── Dockerfile │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── whisper_en │ │ ├── README.md │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ └── whisper_lang_rec │ │ └── local │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── chunking │ ├── MPNet │ │ ├── README.md │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ └── README.md ├── conversation_terminator │ └── remote │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── core.py ├── coref │ ├── README.md │ ├── __init__.py │ ├── bart │ │ ├── README.md │ │ ├── __init__.py │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── fcoref │ │ ├── README.md │ │ ├── __init__.py │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── spacy │ │ ├── __init__.py │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ └── tests │ │ ├── __init__.py │ │ ├── negative_tests.txt │ │ ├── parse_examples.py │ │ ├── positive_tests.txt │ │ ├── prompt.txt │ │ └── readme.md ├── data_generation │ ├── README.md │ └── dictionary_aug │ │ ├── README.md │ │ └── remote │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── dsp │ ├── README.md │ ├── __init__.py │ └── local │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ ├── requirements.txt │ │ └── utils.py ├── embeddings │ ├── README.md │ ├── colbert │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── instructor │ │ ├── 
README.md │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ └── openai │ │ ├── README.md │ │ └── remote │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── intent_recognition │ └── README.md ├── llm │ ├── __init__.py │ └── openai │ │ ├── __init__.py │ │ ├── chatgpt3 │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt │ │ ├── chatgpt4 │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt │ │ └── chatgpt4turbo_preview │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── ner │ ├── README.md │ └── agri_ner_akai │ │ ├── README.md │ │ └── local │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── bert_ner.py │ │ ├── model.py │ │ ├── regex_parse_ner.py │ │ ├── request.py │ │ └── requirements.txt ├── rerankers │ └── bge_base │ │ └── local │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── search │ ├── README.md │ ├── __init__.py │ ├── tf_search │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README copy.md │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ └── word_score │ │ ├── README.md │ │ ├── __init__.py │ │ └── local │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── speech_lang_detection │ ├── README.md │ ├── batch │ │ └── README.md │ └── streaming │ │ └── README.md ├── spell_check │ ├── README.md │ ├── kenlm │ │ ├── README.md │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ ├── requirements.txt │ │ │ └── update.py │ └── spello │ │ ├── README.md │ │ └── local │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── t2embedding │ ├── README.md │ ├── bert │ │ └── README.md │ └── openai │ │ ├── README.md │ │ └── remote │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── text2speech │ └── README.md ├── text_classification │ ├── README.md │ ├── convo_starter_orgbot │ │ ├── README.md │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── flow_classification │ │ └── local │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ └── grievance_recognition │ │ ├── README.md │ │ └── local │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── text_lang_detection │ ├── README.md │ └── bhashini │ │ ├── __init__.py │ │ └── remote │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── text_translation │ ├── README.md │ ├── ai4bharat │ │ ├── batch │ │ │ ├── __init__.py │ │ │ ├── batch.py │ │ │ └── batch_request.py │ │ ├── deploy.sh │ │ ├── readme copy.md │ │ └── remote │ │ │ ├── Dockerfile │ │ │ ├── 
README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── azure │ │ ├── README.md │ │ └── remote │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── azure_dict │ │ ├── README.md │ │ └── remote │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── bhashini │ │ ├── __init__.py │ │ └── remote │ │ │ ├── Dockerfile │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── model.py │ │ │ ├── request.py │ │ │ └── requirements.txt │ └── google │ │ └── remote │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── text_transliteration │ └── README.md ├── token_counter │ ├── README.md │ └── openai │ │ ├── README.md │ │ └── local │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt ├── topic_modelling │ ├── BERTopic │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── model.py │ │ ├── request.py │ │ └── requirements.txt │ └── README.md ├── utils.py └── vector_search │ ├── README.md │ ├── dotproduct │ └── README.md │ └── faiss │ └── README.md ├── template_batch_model.py ├── template_model_request.py └── test.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501 3 | max-line-length = 79 4 | max-complexity = 10 5 | select = C,E,F,W,B,B9 -------------------------------------------------------------------------------- /.github/workflows/build-and-push.yml: -------------------------------------------------------------------------------- 1 | name: Build and Push Docker Image 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | env: 7 | DOCKER_USERNAME: ${{ github.actor }} 8 | DOCKER_IMAGE_NAME: ${{ github.repository }} 9 | DOCKER_REGISTRY: ghcr.io 10 | DOCKER_IMAGE_TAG: ${{ github.ref_name }} 11 | 12 | jobs: 13 | build-and-push: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - id: lower-repo 17 | shell: pwsh 18 | run: | 19 | "::set-output name=repository::$($env:DOCKER_IMAGE_NAME.ToLowerInvariant())" 20 | 21 | - name: Checkout code 22 | uses: actions/checkout@v2 23 | 24 | - name: Set up Docker Buildx 25 | uses: docker/setup-buildx-action@v2 26 | 27 | - name: Login to Docker registry 28 | uses: docker/login-action@v1 29 | with: 30 | registry: ${{ env.DOCKER_REGISTRY }} 31 | username: ${{ env.DOCKER_USERNAME }} 32 | password: ${{ secrets.PAT }} 33 | 34 | - name: Build and Push Docker image 35 | uses: docker/build-push-action@v4 36 | with: 37 | context: . 
38 | push: true 39 | cache-from: type=gha 40 | cache-to: type=gha,mode=max 41 | tags: ${{ env.DOCKER_REGISTRY }}/${{ steps.lower-repo.outputs.repository }}:${{env.DOCKER_IMAGE_TAG}} 42 | labels: org.opencontainers.image.source=https://github.com/${{steps.lower-repo.outputs.repository}} 43 | -------------------------------------------------------------------------------- /.github/workflows/gh-packages.yaml: -------------------------------------------------------------------------------- 1 | name: Push Docker Images 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | env: 7 | REGISTRY: ghcr.io 8 | IMAGE_NAME: ${{ github.repository }} 9 | 10 | jobs: 11 | build-and-push-image: 12 | runs-on: ubuntu-latest 13 | permissions: 14 | contents: read 15 | packages: write 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v3 20 | with: 21 | fetch-depth: 2 22 | 23 | - name: Log in to the Container registry 24 | id: login 25 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 26 | with: 27 | registry: ${{ env.REGISTRY }} 28 | username: ${{ github.actor }} 29 | password: ${{ secrets.PAT }} 30 | 31 | - name: Build and push docker images 32 | run: | 33 | image_names=$(jq -r '.models[].serviceName' ./config.json) 34 | paths=$(jq -r '.models[].modelBasePath' ./config.json) 35 | readarray -t image_array <<< "$image_names" 36 | readarray -t paths_array <<< "$paths" 37 | lowercase_actor=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') 38 | for index in "${!image_array[@]}"; do 39 | image="${image_array[index]}" 40 | path="${paths_array[index]}" 41 | docker build "./$path" -t "${{ env.REGISTRY }}/$lowercase_actor/$image:latest" 42 | docker push "${{ env.REGISTRY }}/$lowercase_actor/$image:latest" 43 | docker image prune -a -f 44 | done 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | # Set the working directory to /app 5 | WORKDIR /app 6 | 7 | # Copy the poetry.lock and pyproject.toml files to the working directory 8 | COPY poetry.lock pyproject.toml /app/ 9 | 10 | # Install dependencies using Poetry 11 | RUN pip install poetry && \ 12 | poetry config virtualenvs.create false && \ 13 | poetry install --no-dev --no-root 14 | 15 | # Copy the rest of the application code to the working directory 16 | COPY . /app/ 17 | 18 | # Set the entrypoint for the container 19 | ENTRYPOINT ["hypercorn", "api", "-b", "0.0.0.0"] 20 | 21 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | from test import * -------------------------------------------------------------------------------- /contribution.md: -------------------------------------------------------------------------------- 1 | ## Style Guide 2 | 3 | ### PEP 8: Style Guide for Python Code 4 | 5 | - Use 4 spaces per indentation level. 6 | - Limit all lines to a maximum of 79 characters. 7 | - Separate top-level functions and class definitions with two blank lines. 8 | - Method definitions inside a class should be separated by one blank line. 9 | - Imports should be on separate lines and grouped in the following order: standard library, related third-party imports, and local application/library-specific imports. 
10 | - Use spaces around operators and after commas, but not directly inside parentheses or brackets. 11 | - Avoid extraneous whitespace. 12 | - Follow the naming conventions: modules and packages should have short, lowercase names, classes should use CapWords, function and variable names should be lowercase with words separated by underscores. 13 | 14 | ### PEP 257: Docstring Conventions 15 | 16 | - All modules, classes, and functions should have docstrings. 17 | - One-liners are for short, simple functions or methods. 18 | - Multi-line docstrings have a summary line, followed by a blank line and a more elaborate description. 19 | - Use triple double-quotes for docstrings, even for one-liners. 20 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python3 -m venv venv 4 | source venv/bin/activate 5 | cd ./src/text_translation/ai4bharat & sh deploy.sh & cd .. & cd .. 6 | gunicorn api:app --workers 5 --timeout 600 -------------------------------------------------------------------------------- /devops/services/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/devops/services/README.md -------------------------------------------------------------------------------- /devops/services/ai-tools/setup_ai-tools_service.yml: -------------------------------------------------------------------------------- 1 | - name: Deploy Docker Stack 2 | hosts: _manager 3 | become: yes 4 | vars: 5 | destination_directory: "{{ lookup('hashi_vault', 'secret=secret/data/ai-tools:destination_directory') }}" 6 | tasks: 7 | - name: Create a Vault directory if it does not exist already 8 | ansible.builtin.file: 9 | path: "{{ destination_directory }}" 10 | state: directory 11 | mode: "0755" 12 | 13 | - name: Retrieve environment file from Vault 14 | set_fact: 15 | env_file: "{{ lookup('hashi_vault', 'secret=secret/data/generate') }}" 16 | changed_when: false 17 | 18 | - name: Generate .generate.env from Vault data 19 | copy: 20 | dest: "{{ destination_directory }}/.generate.env" 21 | content: | 22 | {% for key, value in env_file.items() %} 23 | {{ key }}={{ value }} 24 | {% endfor %} 25 | vars: 26 | env_file: "{{ env_file }}" 27 | 28 | - name: Get config.json file 29 | get_url: 30 | url: "https://raw.githubusercontent.com/Samagra-Development/ai-tools/restructure/config.json" 31 | dest: "{{ destination_directory }}/config.json" 32 | 33 | - name: Get Generate.sh Script 34 | get_url: 35 | url: "https://raw.githubusercontent.com/Samagra-Development/ai-tools/restructure/generate_independent_docker.sh" 36 | dest: "{{ destination_directory }}/generate_independent_docker.sh" 37 | 38 | - name: Fetch GitHub Details from Vault 39 | set_fact: 40 | USERNAME: "{{ lookup('hashi_vault', 'secret=secret/data/github:USERNAME') }}" 41 | PAT: "{{ lookup('hashi_vault', 'secret=secret/data/github:PAT') }}" 42 | changed_when: false 43 | 44 | - name: Authenticate with GitHub Container Registry 45 | docker_login: 46 | registry_url: docker.pkg.github.com 47 | username: "{{ USERNAME }}" 48 | password: "{{ PAT }}" 49 | reauthorize: yes 50 | 51 | - name: Install jq 52 | apt: 53 | name: jq 54 | state: present 55 | 56 | - name: Generate Docker-Compose File 57 | shell: "./generate_independent_docker.sh" 58 | args: 59 | chdir: "{{ destination_directory }}" 60 | 61 | 
- name: Pull Docker Images 62 | docker_compose: 63 | project_src: "{{ destination_directory }}" 64 | files: docker-compose-independent-generated.yaml 65 | pull: yes 66 | 67 | - name: Deploy Docker Images to Docker Swarm 68 | docker_stack: 69 | compose_file: "{{ destination_directory }}/docker-compose-independent-generated.yaml" 70 | stack_name: aitools_stack 71 | state: present 72 | -------------------------------------------------------------------------------- /devops/services/nginx/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official nginx runtime as a parent image 2 | FROM nginx 3 | 4 | RUN rm -rf /etc/nginx/conf.d/ 5 | RUN rm /etc/nginx/nginx.conf 6 | 7 | # Copy the contents of the local conf.d folder to the image's conf.d directory 8 | COPY nginx.conf /etc/nginx/nginx.conf 9 | COPY conf.d /etc/nginx/conf.d/ 10 | COPY certificates /etc/nginx/certificates -------------------------------------------------------------------------------- /devops/services/nginx/nginx.conf: -------------------------------------------------------------------------------- 1 | user nginx; 2 | worker_processes auto; 3 | 4 | error_log /var/log/nginx/error.log notice; 5 | pid /var/run/nginx.pid; 6 | 7 | 8 | events { 9 | worker_connections 1024; 10 | } 11 | 12 | 13 | http { 14 | include /etc/nginx/mime.types; 15 | default_type application/octet-stream; 16 | 17 | ssl_session_cache shared:SSL:10m; 18 | 19 | log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 20 | '$status $body_bytes_sent "$http_referer" ' 21 | '"$http_user_agent" "$http_x_forwarded_for"'; 22 | 23 | access_log /var/log/nginx/access.log main; 24 | 25 | sendfile on; 26 | #tcp_nopush on; 27 | 28 | keepalive_timeout 65; 29 | 30 | #gzip on; 31 | 32 | include /etc/nginx/conf.d/*.conf; 33 | } -------------------------------------------------------------------------------- /devops/services/vault/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | vault: 4 | image: vault 5 | container_name: vault 6 | ports: 7 | - "8200:8200" 8 | restart: always 9 | volumes: 10 | - /vault/volumes/logs:/vault/logs 11 | - /vault/volumes/file:/vault/file 12 | - /vault/volumes/config:/vault/config 13 | cap_add: 14 | - IPC_LOCK 15 | entrypoint: vault server -config=/vault/config/vault.json 16 | logging: 17 | driver: syslog 18 | options: 19 | syslog-address: "udp://10.3.1.6:12201" 20 | tag: vault -------------------------------------------------------------------------------- /devops/services/vault/setup_vault_service.yml: -------------------------------------------------------------------------------- 1 | - name: Setup stateful services 2 | hosts: _manual 3 | become: true 4 | tasks: 5 | - name: Create a vault dir if it does not exist 6 | ansible.builtin.file: 7 | path: /vault 8 | state: directory 9 | mode: "0755" 10 | 11 | - name: Create a config dir if it does not exist 12 | ansible.builtin.file: 13 | path: /vault/volumes/config 14 | state: directory 15 | mode: "0755" 16 | 17 | - name: Create a file dir if it does not exist 18 | ansible.builtin.file: 19 | path: /vault/volumes/file 20 | state: directory 21 | mode: "0755" 22 | 23 | - name: Create a logs dir if it does not exist 24 | ansible.builtin.file: 25 | path: /vault/volumes/logs 26 | state: directory 27 | mode: "0755" 28 | 29 | - name: Copy file from host to machine 30 | copy: 31 | src: "{{ playbook_dir }}/docker-compose.yml" 32 | dest: 
/vault/docker-compose.yml 33 | 34 | - name: Copy file from host to machine 35 | copy: 36 | src: "{{ playbook_dir }}/vault.json" 37 | dest: /vault/volumes/config/vault.json 38 | 39 | - name: Create and start services 40 | community.docker.docker_compose: 41 | project_src: /vault/. 42 | register: output 43 | 44 | - debug: 45 | var: output 46 | -------------------------------------------------------------------------------- /devops/services/vault/vault.json: -------------------------------------------------------------------------------- 1 | { 2 | "backend": { 3 | "file": { 4 | "path": "/vault/file" 5 | } 6 | }, 7 | "listener": { 8 | "tcp": { 9 | "address": "0.0.0.0:8200", 10 | "tls_disable": 1 11 | } 12 | }, 13 | "ui": true 14 | } 15 | -------------------------------------------------------------------------------- /docker-compose-restructure.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | asr_google: 5 | build: 6 | context: src/asr/google/remote/. 7 | dockerfile: Dockerfile 8 | ports: 9 | - "8002:8000" 10 | conversation_terminator: 11 | build: 12 | context: src/conversation_terminator/remote/. 13 | dockerfile: Dockerfile 14 | ports: 15 | - "8003:8000" 16 | coref_spacy: 17 | build: 18 | context: src/coref/spacy/local/. 19 | dockerfile: Dockerfile 20 | ports: 21 | - "8004:8000" 22 | translation_bhasini: 23 | build: 24 | context: src/text_translation/bhashini/remote/. 25 | dockerfile: Dockerfile 26 | ports: 27 | - "8005:8000" 28 | lang_detection_bhasini: 29 | build: 30 | context: src/text_lang_detection/bhashini/remote/. 31 | dockerfile: Dockerfile 32 | ports: 33 | - "8006:8000" 34 | embedding_openai: 35 | build: 36 | context: src/embeddings/openai/remote/. 37 | dockerfile: Dockerfile 38 | ports: 39 | - "8007:8000" 40 | environment: 41 | - OPENAI_API_KEY=${OPENAI_API_KEY} 42 | llm_openai_gpt3: 43 | build: 44 | context: src/llm/openai/chatgpt3/. 45 | dockerfile: Dockerfile 46 | ports: 47 | - "8008:8000" 48 | environment: 49 | - OPENAI_API_KEY=${OPENAI_API_KEY} 50 | llm_openai_gpt4: 51 | build: 52 | context: src/llm/openai/chatgpt4/. 53 | dockerfile: Dockerfile 54 | ports: 55 | - "8009:8000" 56 | environment: 57 | - OPENAI_API_KEY=${OPENAI_API_KEY} 58 | t2embedding_openai: 59 | build: 60 | context: src/t2embedding/openai/remote/. 61 | dockerfile: Dockerfile 62 | ports: 63 | - "8010:8000" 64 | environment: 65 | - OPENAI_API_KEY=${OPENAI_API_KEY} 66 | translation_google: 67 | build: 68 | context: src/text_translation/google/remote/. 69 | dockerfile: Dockerfile 70 | ports: 71 | - "8011:8000" 72 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | app: 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile 8 | ports: 9 | - "8000:8000" 10 | restart: always 11 | environment: 12 | - PYTHONUNBUFFERED=1 13 | - PYTHONDONTWRITEBYTECODE=1 14 | - OPENAI_API_KEY=${OPENAI_API_KEY} 15 | - AUTH_HEADER=${AUTH_HEADER} 16 | - AUTH_HEADER_KEY=${AUTH_HEADER_KEY} -------------------------------------------------------------------------------- /faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | ## I'm not able to access Vault from my machine 4 | 5 | ### How to create a HashiCorp Vault 6 | Check https://developer.hashicorp.com/vault/tutorials/getting-started/getting-started-first-secret 7 | 8 | ### Server gave `http` response to `https` client 9 | Add the Vault address to your environment. You may want to make this setting persistent. 10 | ``` 11 | export VAULT_ADDR=http://x.y.x.z:8200 12 | ``` 13 | You may also want to disable TLS certificate verification. 14 | You can do that by opening the Vault config file (/etc/vaults, or create one in the working directory) and setting `tls_disable` to 1. Refer to [Vault Config](https://developer.hashicorp.com/vault/tutorials/operations/configure-vault) for more details. 15 | 16 | ### Secret Path does not exist in Vault 17 | You might have forgotten to enable the secret engine. 18 | ``` 19 | vault secrets enable -version=2 -path=secret kv 20 | ``` 21 | ### How to add secrets to Ansible 22 | To add secrets, you first need to enable the [secret engine](#secret-path-does-not-exist-in-vault). 23 | 24 | To add keys in a secret path: 25 | ``` 26 | vault kv put secret/my-app/ password=123 27 | ``` 28 | Here, you are adding the key `password` at the secret path `secret/my-app`. 29 | To add multiple keys to your secret path: 30 | ``` 31 | vault kv put secret/my-app/cred username=xyz password=123 32 | ``` 33 | Here, you are adding the keys `password` and `username` at the secret path `secret/my-app/cred`. 34 | For more examples, refer to https://blog.ruanbekker.com/blog/2019/05/06/setup-hashicorp-vault-server-on-docker-and-cli-guide/ 35 | 36 | ## How to Access the Ansible Target Machine 37 | You need to generate an [SSH key pair](https://docs.oracle.com/en/cloud/cloud-at-customer/occ-get-started/generate-ssh-key-pair.html#GUID-8B9E7FCB-CEA3-4FB3-BF1A-FD3406A2432F) on the local machine (where the Ansible scripts are present) and copy the public key onto the target machine.
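For example, a minimal sketch of that workflow (the key file name, user, and host below are placeholders, not values taken from this repository):
```
# On the machine that runs the Ansible playbooks, generate a key pair
ssh-keygen -t ed25519 -f ~/.ssh/ansible_key
# Copy the public key into the target machine's authorized_keys
ssh-copy-id -i ~/.ssh/ansible_key.pub <user>@<target-host>
```
Ansible can then be pointed at the private key, for example via `ansible_ssh_private_key_file` in the inventory or `--private-key` on the command line.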
38 | 39 | ## How to create a Github Personal Access Token 40 | Check https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -------------------------------------------------------------------------------- /flake8/__init__.py: -------------------------------------------------------------------------------- 1 | from single_word_module import * -------------------------------------------------------------------------------- /flake8/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="flake8-single-word-lowercase", 5 | version="0.1.0", 6 | description="A Flake8 plugin to enforce single-word lowercase module names", 7 | author="Your Name", 8 | author_email="your.email@example.com", 9 | url="https://github.com/yourusername/flake8-single-word-lowercase", 10 | packages=find_packages(), 11 | install_requires=["flake8", "flake8-plugin-utils"], 12 | entry_points={"flake8.extension": ["SWL = flake8_single_word_lowercase:get_plugin"]}, 13 | classifiers=[ 14 | "Development Status :: 4 - Beta", 15 | "Intended Audience :: Developers", 16 | "License :: OSI Approved :: MIT License", 17 | "Programming Language :: Python", 18 | "Programming Language :: Python :: 3", 19 | "Programming Language :: Python :: 3.6", 20 | "Programming Language :: Python :: 3.7", 21 | "Programming Language :: Python :: 3.8", 22 | ], 23 | ) -------------------------------------------------------------------------------- /flake8/single_word_module.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | from flake8_plugin_utils import Error, Visitor, Plugin 4 | 5 | 6 | class ModuleNameError(Error): 7 | code = "SWL100" 8 | message = "module name should be a single-word lowercase" 9 | 10 | 11 | class ModuleNameVisitor(Visitor): 12 | def _check_module_name(self, node): 13 | if not os.path.isabs(node.filename): 14 | return 15 | module_name = os.path.basename(node.filename).split('.')[0] 16 | if not module_name.islower() or '_' in module_name: 17 | self.error(node, ModuleNameError) 18 | 19 | def visit_Module(self, node): 20 | self._check_module_name(node) 21 | self.generic_visit(node) 22 | 23 | 24 | class SingleWordLowercasePlugin(Plugin): 25 | name = 'flake8-single-word-lowercase' 26 | version = '0.1.0' 27 | visitor_class = ModuleNameVisitor 28 | 29 | 30 | def get_plugin(): 31 | return SingleWordLowercasePlugin 32 | -------------------------------------------------------------------------------- /generate_independent_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source .generate.env 3 | # Install jq based on the operating system 4 | os_name=$(uname -s) 5 | if [ "$os_name" == "Darwin" ]; then 6 | brew install jq 7 | else 8 | sudo apt-get install jq 9 | fi 10 | 11 | # Get the number of models from the config.json file 12 | count=$(jq '.models | length' config.json) 13 | 14 | # Generate docker-compose.yaml file 15 | printf "version: '3'\nservices:\n" > docker-compose-independent-generated.yaml 16 | 17 | # Loop through each model 18 | for ((i=0; i<$count; i++)); do 19 | # Get model details from config.json 20 | serviceName=$(jq -r ".models[$i].serviceName" config.json) 21 | modelBasePath=$(jq -r ".models[$i].modelBasePath" config.json) 22 | apiBasePath=$(jq -r ".models[$i].apiBasePath" config.json) 23 | containerPort=$(jq -r 
".models[$i].containerPort" config.json) 24 | 25 | countConstraints=$(jq ".models[$i].constraints | length" config.json) 26 | 27 | # Calculate the exposed port for the model 28 | exposedPort=$((8000 + i)) 29 | 30 | # Get environment variables for the model 31 | environment=($(jq -r ".models[$i].environment | keys[]" config.json)) 32 | 33 | # Add service details to docker-compose.yaml 34 | printf " ${serviceName}:\n image: ${DOCKER_REGISTRY_URL}/${GITHUB_REPOSITORY_URL}/${serviceName}:latest\n container_name: ${serviceName}\n networks:\n - communication\n" >> docker-compose-independent-generated.yaml 35 | 36 | if [[ countConstraints -gt 0 ]]; then 37 | printf " deploy:\n placement:\n constraints:\n" >> docker-compose-independent-generated.yaml 38 | fi 39 | for ((j=0; j<$countConstraints; j++)); do 40 | constraintLine=$(jq -r ".models[$i].constraints[$j]" config.json) 41 | 42 | printf " - ${constraintLine}\n" >> docker-compose-independent-generated.yaml 43 | done 44 | 45 | # Add environment variables to docker-compose.yaml 46 | if [[ ${#environment[@]} -gt 0 ]]; then 47 | printf " environment:\n" >> docker-compose-independent-generated.yaml 48 | fi 49 | for key in "${environment[@]}"; do 50 | printf " - ${key}=\${${key}}\n" >> docker-compose-independent-generated.yaml 51 | done 52 | done 53 | 54 | printf "networks:\n communication:\n external: true\n" >> docker-compose-independent-generated.yaml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "aitools" 3 | version = "0.1.0" 4 | description = "AI Tooling to bootstrap applications fast" 5 | authors = ["Tushar "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.9, <3.11" 10 | asyncio = "^3.4.3" 11 | aiohttp = "^3.8.4" 12 | quart = "^0.18.3" 13 | flask = "^2.2.3" 14 | inference = "^0.1" 15 | flake8 = "^6.0.0" 16 | openai-async = "^0.0.3" 17 | openai = "^0.27.4" 18 | async-cache = "^1.1.1" 19 | python-dotenv = "^1.0.0" 20 | tiktoken = "^0.3.3" 21 | numpy = "^1.24.2" 22 | pandas = "^2.0.0" 23 | matplotlib = "^3.7.1" 24 | plotly = "^5.14.1" 25 | spacy-experimental = "0.6.2" 26 | en-coreference-web-trf = {url = "https://github.com/explosion/spacy-experimental/releases/download/v0.6.1/en_coreference_web_trf-3.4.0a2-py3-none-any.whl"} 27 | pipdeptree = "2.0.0" 28 | quart-compress = "^0.2.1" 29 | scipy = "^1.10.1" 30 | scikit-learn = "^1.2.2" 31 | google-cloud-speech = "^2.19.0" 32 | google-auth = "^2.17.3" 33 | pydub = "^0.25.1" 34 | google-cloud-translate = "^3.11.1" 35 | en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl"} 36 | fastcoref = {extras = ["train"], version = "^2.1.6"} 37 | dsp-ml = "^0.1.5" 38 | accelerate = "^0.21.0" 39 | 40 | 41 | [build-system] 42 | requires = ["poetry-core"] 43 | build-backend = "poetry.core.masonry.api" 44 | -------------------------------------------------------------------------------- /sample.env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=YourOpenAPIKey 2 | AUTH_HEADER=auth 3 | AUTH_HEADER_KEY=Authorization 4 | DOMAIN_NAME=default 5 | DOCKER_REGISTRY_URL=ghcr.io 6 | GITHUB_REPOSITORY=aitools 7 | USE_HTTPS=false 8 | -------------------------------------------------------------------------------- /src/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .core import * 2 | from .text_translation import * -------------------------------------------------------------------------------- /src/asr/README.md: -------------------------------------------------------------------------------- 1 | Automatic speech recognition (ASR) converts speech input to text (the input language and output language are provided) -------------------------------------------------------------------------------- /src/asr/ai4bharat/streaming/README.md: -------------------------------------------------------------------------------- 1 | Streaming speech recognition converts live speech input to text as the audio is received -------------------------------------------------------------------------------- /src/asr/ai4bharat/url/README.md: -------------------------------------------------------------------------------- 1 | Speech recognition converts speech input to text (the input language and output language are provided) for batch files (.wav files) -------------------------------------------------------------------------------- /src/asr/ai4bharat/url/deploy.sh: -------------------------------------------------------------------------------- 1 | sudo apt-get install liblzma-dev libbz2-dev libzstd-dev libsndfile1-dev libopenblas-dev libfftw3-dev libgflags-dev libgoogle-glog-dev 2 | sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev ffmpeg 3 | python3 -m venv venv 4 | source venv/bin/activate 5 | pip3 install packaging soundfile swifter joblib==1.0.0 indic-nlp-library tqdm==4.56.0 numpy==1.20.0 pandas==1.2.2 progressbar2==3.53.1 python_Levenshtein==0.12.2 editdistance==0.3.1 omegaconf==2.0.6 tensorboard==2.4.1 tensorboardX==2.1 wandb jiwer jupyterlab 6 | git clone https://github.com/AI4Bharat/fairseq.git 7 | pip3 install --editable fairseq/./ 8 | pip3 install flask flask-cors flask_sockets pydub webrtcvad nltk langdetect simpletransformers flashlight-text protobuf==3.20.1 9 | 10 | git clone https://github.com/kpu/kenlm 11 | cd kenlm 12 | mkdir build 13 | cd build 14 | cmake .. 15 | make -j 4 16 | sudo make install 17 | cd .. 18 | export KENLM_ROOT=$PWD 19 | cd .. 20 | 21 | git clone https://github.com/flashlight/text && cd text 22 | mkdir build && cd build 23 | cmake .. -DFL_TEXT_BUILD_TESTS=OFF 24 | make -j$(nproc) 25 | make test 26 | sudo make install 27 | 28 | 29 | mkdir data && cd data 30 | wget https://indic-asr-public.objectstore.e2enetworks.net/aaai_ckpts/models/or/or.pt 31 | mv or.pt odia.pt 32 | mkdir or && cd or 33 | wget https://indic-asr-public.objectstore.e2enetworks.net/aaai_ckpts/models/or/lexicon.lst 34 | wget https://indic-asr-public.objectstore.e2enetworks.net/aaai_ckpts/models/or/lm.binary 35 | cd .. 36 | cd ..
37 | git clone https://github.com/AI4Bharat/IndicWav2Vec.git 38 | cp models_info.json IndicWav2Vec/ULCA_Compliance/app/ 39 | cp flask_api IndicWav2Vec/ULCA_Compliance/app/ 40 | cp support.py IndicWav2Vec/ULCA_Compliance/app/ 41 | cd IndicWav2Vec/ULCA_Compliance/app/ 42 | gunicorn flask_api:app --workers 5 --timeout 600 -------------------------------------------------------------------------------- /src/asr/ai4bharat/url/models_info.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "or":{ 4 | "model_path":"/vol/bhashini_indicWav2vec/data/odia.pt", 5 | "lm_details" : 6 | { 7 | "nbest":1, 8 | "lexicon":"/vol/bhashini_indicWav2vec/data/or/lexicon.lst", 9 | "kenlm_model":"/vol/bhashini_indicWav2vec/data/or/lm.binary", 10 | "beam_size_token": 100, 11 | "beam":8, 12 | "beam_threshold":25, 13 | "lm_weight":1, 14 | "word_score":0.5, 15 | "sil_weight":0.0 16 | }, 17 | "lm_usage" : "True" 18 | 19 | } 20 | } 21 | 22 | -------------------------------------------------------------------------------- /src/asr/fairseq_mms/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/asr/fairseq_mms/README.md -------------------------------------------------------------------------------- /src/asr/fairseq_mms/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] -------------------------------------------------------------------------------- /src/asr/fairseq_mms/local/README.md: -------------------------------------------------------------------------------- 1 | ### Testing the model deployment : 2 | To run for testing you can follow the following steps : 3 | 4 | - Git clone the repo 5 | - Go to current folder location i.e. ``` cd /src/asr/fairseq_mms/local ``` 6 | - Create docker image file and test the api: 7 | ``` 8 | docker build -t testmodel . 
9 | docker run -p 8000:8000 testmodel 10 | curl -X POST -F "file=@anorexia.wav" http://localhost:8000/ 11 | ``` 12 | -------------------------------------------------------------------------------- /src/asr/fairseq_mms/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/asr/fairseq_mms/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | from quart_cors import cors # Import the cors function 5 | import aiohttp 6 | import os 7 | import tempfile 8 | 9 | app = Quart(__name__) 10 | app = cors(app) # Apply the cors function to your app to enable CORS for all routes 11 | 12 | model = None 13 | 14 | @app.before_serving 15 | async def startup(): 16 | app.client = aiohttp.ClientSession() 17 | global model 18 | model = Model(app) 19 | 20 | @app.route('/', methods=['POST']) 21 | async def embed(): 22 | global model 23 | 24 | temp_dir = tempfile.mkdtemp() 25 | data = await request.get_json() 26 | files = await request.files # await the coroutine 27 | uploaded_file = files.get('file') # now you can use .get() 28 | 29 | file_path = os.path.join(temp_dir, uploaded_file.name) 30 | await uploaded_file.save(file_path) 31 | 32 | req = ModelRequest(wav_file=file_path) 33 | response = await model.inference(req) 34 | 35 | os.remove(file_path) 36 | os.rmdir(temp_dir) 37 | 38 | return response 39 | 40 | if __name__ == "__main__": 41 | app.run() 42 | -------------------------------------------------------------------------------- /src/asr/fairseq_mms/local/model.py: -------------------------------------------------------------------------------- 1 | from transformers import Wav2Vec2ForCTC, AutoProcessor 2 | import torch 3 | from request import ModelRequest 4 | import librosa 5 | 6 | class Model(): 7 | def __new__(cls, context): 8 | cls.context = context 9 | if not hasattr(cls, 'instance'): 10 | cls.instance = super(Model, cls).__new__(cls) 11 | 12 | model_name = "facebook/mms-1b-all" 13 | target_lang = "ory" 14 | model = Wav2Vec2ForCTC.from_pretrained(model_name) 15 | model.load_adapter(target_lang) 16 | cls.model = model 17 | processor = AutoProcessor.from_pretrained(model_name) 18 | processor.tokenizer.set_target_lang(target_lang) 19 | cls.processor = processor 20 | cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 21 | cls.model.to(cls.device) 22 | return cls.instance 23 | 24 | 25 | async def inference(self, request: ModelRequest): 26 | wav_file = request.wav_file 27 | ory_sample, sr = librosa.load(wav_file, sr=16000) 28 | inputs = self.processor(ory_sample, sampling_rate=16_000, return_tensors="pt") 29 | inputs = inputs.to(self.device) 30 | with torch.no_grad(): 31 | outputs = self.model(**inputs).logits 32 | 33 | ids = torch.argmax(outputs, dim=-1)[0] 34 | transcription = self.processor.decode(ids) 35 | if transcription == '': 36 | transcription = 'ଦୟାକରି ପୁଣିଥରେ ଚେଷ୍ଟା କରନ୍ତୁ' 37 | 38 | return transcription 39 | -------------------------------------------------------------------------------- /src/asr/fairseq_mms/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, wav_file): 7 | self.wav_file = 
wav_file 8 | 9 | def to_json(self): 10 | return json.dumps(self, default=lambda o: o.__dict__, 11 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/asr/fairseq_mms/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu 2 | transformers 3 | quart 4 | aiohttp 5 | librosa 6 | quart-cors 7 | -------------------------------------------------------------------------------- /src/asr/google/remote/1.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/asr/google/remote/1.mp3 -------------------------------------------------------------------------------- /src/asr/google/remote/1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/asr/google/remote/1.wav -------------------------------------------------------------------------------- /src/asr/google/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirments 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/asr/google/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/asr/google/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def translate(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/asr/google/remote/model.py: -------------------------------------------------------------------------------- 1 | from cache import AsyncTTL 2 | from request import ModelRequest 3 | import io 4 | from google.oauth2.service_account import Credentials 5 | from google.cloud import speech_v1p1beta1 as speech 6 | from pydub import AudioSegment 7 | 8 | class Model: 9 | def __new__(cls, context): 10 | cls.context = context 11 | if not hasattr(cls, 'instance'): 12 | # Set up service account credentials 13 | cls.credentials = Credentials.from_service_account_file('google-creds.json') 14 | 15 | # Create a client for the Speech-to-Text API with the service account credentials 16 | cls.client = 
speech.SpeechClient(credentials=cls.credentials) 17 | 18 | cls.instance = super(Model, cls).__new__(cls) 19 | return cls.instance 20 | 21 | async def inference(self, request: ModelRequest): 22 | file_path = '1.mp3' 23 | audio = AudioSegment.from_file(file_path, format='mp3') 24 | 25 | wav_file_path = '1.wav' 26 | audio.export(wav_file_path, format='wav') 27 | 28 | # Load the audio file into memory 29 | with io.open(wav_file_path, 'rb') as audio_file: 30 | content = audio_file.read() 31 | 32 | # Set up the audio input 33 | audio = speech.RecognitionAudio(content=content) 34 | config = speech.RecognitionConfig( 35 | encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, 36 | language_code='or-IN', 37 | ) 38 | 39 | # Send the API request for speech recognition 40 | response = self.client.recognize(config=config, audio=audio) 41 | 42 | # Print the transcribed text 43 | transcript = '' 44 | for result in response.results: 45 | print('Transcript: {}'.format(result.alternatives[0].transcript)) 46 | transcript += result.alternatives[0].transcript 47 | 48 | return {"text": transcript} 49 | -------------------------------------------------------------------------------- /src/asr/google/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, file_url): 6 | self.text = file_url 7 | 8 | def to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) 11 | -------------------------------------------------------------------------------- /src/asr/google/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | google-cloud-speech 6 | google-auth 7 | pydub -------------------------------------------------------------------------------- /src/asr/whisper_en/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/asr/whisper_en/README.md -------------------------------------------------------------------------------- /src/asr/whisper_en/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] -------------------------------------------------------------------------------- /src/asr/whisper_en/local/README.md: -------------------------------------------------------------------------------- 1 | ### Testing the model deployment : 2 | To run for testing you can follow the following steps : 3 | 4 | - Git clone the repo 5 | - Go to current folder location i.e. ``` cd /src/asr/fairseq_mms/local ``` 6 | - Create docker image file and test the api: 7 | ``` 8 | docker build -t testmodel . 
9 | docker run -p 8000:8000 testmodel 10 | curl -X POST -F "file=@anorexia.wav" http://localhost:8000/ 11 | ``` 12 | -------------------------------------------------------------------------------- /src/asr/whisper_en/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/asr/whisper_en/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | from quart_cors import cors # Import the cors function 5 | import aiohttp 6 | import os 7 | import tempfile 8 | 9 | app = Quart(__name__) 10 | app = cors(app) # Apply the cors function to your app to enable CORS for all routes 11 | 12 | model = None 13 | 14 | @app.before_serving 15 | async def startup(): 16 | app.client = aiohttp.ClientSession() 17 | global model 18 | model = Model(app) 19 | 20 | @app.route('/', methods=['POST']) 21 | async def embed(): 22 | global model 23 | 24 | temp_dir = tempfile.mkdtemp() 25 | data = await request.get_json() 26 | files = await request.files # await the coroutine 27 | uploaded_file = files.get('file') # now you can use .get() 28 | 29 | file_path = os.path.join(temp_dir, uploaded_file.name) 30 | await uploaded_file.save(file_path) 31 | 32 | req = ModelRequest(wav_file=file_path) 33 | response = await model.inference(req) 34 | 35 | os.remove(file_path) 36 | os.rmdir(temp_dir) 37 | 38 | return response 39 | 40 | if __name__ == "__main__": 41 | app.run() 42 | -------------------------------------------------------------------------------- /src/asr/whisper_en/local/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | from transformers import pipeline 4 | from request import ModelRequest 5 | 6 | 7 | class Model(): 8 | def __new__(cls, context): 9 | cls.context = context 10 | if not hasattr(cls, 'instance'): 11 | cls.instance = super(Model, cls).__new__(cls) 12 | 13 | # Initialize Whisper ASR pipeline 14 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 15 | cls.pipe = pipeline( 16 | "automatic-speech-recognition", 17 | model="openai/whisper-base.en", 18 | chunk_length_s=10, 19 | device=device, 20 | ) 21 | return cls.instance 22 | 23 | def transcribe_audio(self, audio_path): 24 | audio_input, sampling_rate = torchaudio.load(audio_path) 25 | audio_data = { 26 | "array": audio_input.squeeze().numpy(), 27 | "sampling_rate": sampling_rate 28 | } 29 | 30 | # Get the transcription 31 | prediction = self.pipe(audio_data.copy(), batch_size=8)["text"] 32 | return prediction 33 | 34 | async def inference(self, request: ModelRequest): 35 | transcription = self.transcribe_audio(request.wav_file) 36 | if not transcription: 37 | transcription = 'Unable to transcribe the audio.' 
38 | return transcription 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/asr/whisper_en/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, wav_file): 7 | self.wav_file = wav_file 8 | 9 | def to_json(self): 10 | return json.dumps(self, default=lambda o: o.__dict__, 11 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/asr/whisper_en/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | quart 4 | aiohttp 5 | librosa 6 | quart-cors 7 | torchaudio -------------------------------------------------------------------------------- /src/asr/whisper_lang_rec/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | # Install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Update aptitude with new repo info, and install FFmpeg 11 | RUN apt-get update \ 12 | && apt-get install -y ffmpeg \ 13 | && apt-get clean \ 14 | && rm -rf /var/lib/apt/lists/* 15 | 16 | # Copy the rest of the application code to the working directory 17 | COPY . /app/ 18 | EXPOSE 8000 19 | 20 | # Set the entrypoint for the container 21 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 22 | -------------------------------------------------------------------------------- /src/asr/whisper_lang_rec/local/README.md: -------------------------------------------------------------------------------- 1 | ### Testing the model deployment : 2 | To run for testing you can follow the following steps : 3 | 4 | - Git clone the repo 5 | - Go to current folder location i.e. ``` cd /src/asr/whisper_lang_rec/local ``` 6 | - Create docker image file and test the api: 7 | ``` 8 | docker build -t testmodel . 
9 | docker run -p 8000:8000 testmodel 10 | curl -X POST -F "file=@male.wav" -F "n_seconds=5" http://localhost:8000/ 11 | ``` 12 | -------------------------------------------------------------------------------- /src/asr/whisper_lang_rec/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/asr/whisper_lang_rec/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | from quart_cors import cors # Import the cors function 5 | import aiohttp 6 | import os 7 | import tempfile 8 | import os 9 | 10 | 11 | app = Quart(__name__) 12 | app = cors(app) # Apply the cors function to your app to enable CORS for all routes 13 | 14 | model = None 15 | 16 | @app.before_serving 17 | async def startup(): 18 | app.client = aiohttp.ClientSession() 19 | global model 20 | model = Model(app) 21 | 22 | @app.route('/', methods=['POST']) 23 | async def embed(): 24 | global model 25 | 26 | temp_dir = tempfile.mkdtemp() 27 | data = await request.form 28 | files = await request.files 29 | uploaded_file = files.get('file') 30 | 31 | file_path = os.path.join(temp_dir, uploaded_file.filename) 32 | await uploaded_file.save(file_path) 33 | 34 | n_seconds = int(data.get('n_seconds')) 35 | req = ModelRequest(wav_file=file_path, n_seconds=n_seconds) 36 | response = await model.inference(req) # Removed n_seconds here 37 | 38 | os.remove(file_path) 39 | os.rmdir(temp_dir) 40 | 41 | return response 42 | 43 | 44 | if __name__ == "__main__": 45 | app.run() 46 | -------------------------------------------------------------------------------- /src/asr/whisper_lang_rec/local/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | import whisper 4 | from request import ModelRequest 5 | import tempfile 6 | import os 7 | 8 | class Model(): 9 | def __new__(cls, context): 10 | cls.context = context 11 | if not hasattr(cls, 'instance'): 12 | cls.instance = super(Model, cls).__new__(cls) 13 | 14 | # Load Whisper model 15 | cls.model = whisper.load_model("base") 16 | cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 17 | cls.model.to(cls.device) 18 | return cls.instance 19 | 20 | def trim_audio(self, audio_path, n_seconds): 21 | audio, sr = torchaudio.load(audio_path) 22 | total_duration = audio.shape[1] / sr # Total duration of the audio in seconds 23 | 24 | # If the audio duration is less than n_seconds, don't trim the audio 25 | if total_duration < n_seconds: 26 | print(f"The audio duration ({total_duration:.2f}s) is less than {n_seconds}s. 
Using the full audio.") 27 | return audio, sr 28 | 29 | num_samples = int(n_seconds * sr) 30 | audio = audio[:, :num_samples] 31 | return audio, sr 32 | 33 | async def inference(self, request: ModelRequest): 34 | # The n_seconds is now accessed from the request object 35 | n_seconds = request.n_seconds 36 | trimmed_audio, sr = self.trim_audio(request.wav_file, n_seconds) 37 | 38 | # Save the trimmed audio to a temporary file 39 | with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file: # Add a file extension 40 | torchaudio.save(temp_file.name, trimmed_audio, sr) 41 | 42 | # Process the audio with Whisper 43 | audio = whisper.load_audio(temp_file.name) 44 | audio = whisper.pad_or_trim(audio) 45 | 46 | # Clean up the temporary file 47 | os.unlink(temp_file.name) 48 | 49 | mel = whisper.log_mel_spectrogram(audio).to(self.device) 50 | # Detect the spoken language 51 | _, probs = self.model.detect_language(mel) 52 | detected_language = max(probs, key=probs.get) 53 | 54 | return detected_language 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/asr/whisper_lang_rec/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, wav_file,n_seconds): 7 | self.wav_file = wav_file 8 | self.n_seconds = n_seconds 9 | 10 | def to_json(self): 11 | return json.dumps(self, default=lambda o: o.__dict__, 12 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/asr/whisper_lang_rec/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchaudio 3 | transformers 4 | quart 5 | aiohttp 6 | librosa 7 | quart-cors 8 | openai-whisper 9 | -------------------------------------------------------------------------------- /src/chunking/MPNet/README.md: -------------------------------------------------------------------------------- 1 | MPnet uses entropy search to chunk free text 2 | -------------------------------------------------------------------------------- /src/chunking/MPNet/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 16 | -------------------------------------------------------------------------------- /src/chunking/MPNet/local/README.md: -------------------------------------------------------------------------------- 1 | ## Instructor Embedding model: 2 | 3 | ### Purpose : 4 | Model to Create Embeddings from given text using Instructor Large model. 5 | 6 | ### Testing the model deployment : 7 | To run for testing just the Hugging Face deployment for grievence recognition, you can follow the following steps : 8 | 9 | - Git clone the repo 10 | - Go to current folder location i.e. ``` cd src/chunking/MPNet/local ``` 11 | - Create docker image file and test the api: 12 | ``` 13 | docker build -t testmodel . 
14 | docker run -p 8000:8000 testmodel 15 | curl -X POST -H "Content-Type: application/json" -d '{"text": "The English Wikipedia is the primary[a] English-language edition of Wikipedia, an online encyclopedia. It was created by Jimmy Wales and Larry Sanger on January 15, 2001, as Wikipedia'\''s first edition.English Wikipedia is hosted alongside other language editions by the Wikimedia Foundation, an American non-profit organization. Its content is written independently of other editions[1] in various varieties of English, aiming to stay consistent within articles. Its internal newspaper is The Signpost.English Wikipedia is the most-read version of Wikipedia[2] and has the most articles of any edition, at 6,689,175 as of July 2023.[3] It contains 10.9% of articles in all Wikipedias,[3] although it lacks millions of articles found in other editions.[1] The edition'\''s one-billionth edit was made on January 13, 2021.[4]English Wikipedia, often as a stand-in for Wikipedia overall, has been praised for its enablement of the democratization of knowledge, extent of coverage, unique structure, culture, and reduced degree of commercial bias. It has been criticized for exhibiting systemic bias, particularly gender bias against women and ideological bias.[5][6] While its reliability was frequently criticized in the 2000s, it has improved over time, receiving greater praise in the late 2010s and early 2020s,[7][5][8][b] having become an important fact-checking site.[9][10] English Wikipedia has been characterized as having less cultural bias than other language editions due to its broader editor base.[2] "}' http://localhost:8000/ -o output.csv 16 | ``` 17 | 18 | 19 | ``` 20 | curl -X POST -F "file=@content_text.txt" http://localhost:8000/chunking/instructor/local -o output4.csv 21 | ``` 22 | -------------------------------------------------------------------------------- /src/chunking/MPNet/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import * 2 | from .model import * 3 | -------------------------------------------------------------------------------- /src/chunking/MPNet/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request, Response, send_file # <- Don't forget to import send_file 4 | import aiohttp 5 | import pandas as pd 6 | import io 7 | import fitz 8 | import os 9 | 10 | def extract_text_from_txt(txt_path): 11 | with open(txt_path, 'r', encoding='utf-8') as file: 12 | return file.read() 13 | 14 | def extract_text_from_pdf(pdf_path): 15 | doc = fitz.open(pdf_path) # open a document 16 | all_text = "" 17 | for page in doc: # iterate the document pages 18 | all_text += page.get_text("text") 19 | 20 | return all_text 21 | 22 | app = Quart(__name__) 23 | 24 | model = None 25 | 26 | @app.before_serving 27 | async def startup(): 28 | app.client = aiohttp.ClientSession() 29 | global model 30 | model = Model(app) 31 | 32 | 33 | @app.route('/', methods=['POST']) 34 | async def embed(): 35 | global model 36 | data = await request.get_json() 37 | files = await request.files # await the coroutine 38 | uploaded_file = files.get('file') # now you can use .get() 39 | 40 | if uploaded_file: 41 | print("1- File uploaded") 42 | 43 | if uploaded_file: 44 | file_extension = os.path.splitext(uploaded_file.filename)[1].lower() 45 | 46 | if file_extension == '.txt': 47 | text_data = 
uploaded_file.stream.read().decode('utf-8') 48 | elif file_extension == '.pdf': 49 | pdf_file_stream = io.BytesIO(uploaded_file.stream.read()) 50 | doc = fitz.open("pdf", pdf_file_stream.getvalue()) 51 | pages = [(i, page.get_text("text")) for i, page in enumerate(doc)] # Modified line 52 | text_data = pages 53 | else: 54 | return (print('Wrong format of file submitted')) 55 | 56 | req = ModelRequest(text = text_data) 57 | response = await model.inference(req) 58 | 59 | 60 | else : 61 | req = ModelRequest(**data) 62 | response = await model.inference(req) # Await the coroutine to get the actual response 63 | 64 | df = pd.read_csv(io.StringIO(response)) # Convert the CSV string back to a DataFrame 65 | 66 | # Save the DataFrame to a CSV file 67 | df.to_csv('output.csv', index=False) 68 | 69 | return await send_file('output.csv', mimetype='text/csv', as_attachment=True, attachment_filename='output.csv') # Updated line 70 | -------------------------------------------------------------------------------- /src/chunking/MPNet/local/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text): 6 | self.text = text # text of pdf 7 | 8 | def to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/chunking/MPNet/local/requirements.txt: -------------------------------------------------------------------------------- 1 | quart 2 | aiohttp 3 | pandas 4 | tqdm 5 | sentence_transformers 6 | segeval 7 | numpy 8 | nltk 9 | scipy 10 | PyMuPDF 11 | -------------------------------------------------------------------------------- /src/chunking/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/conversation_terminator/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . 
/app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/conversation_terminator/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/conversation_terminator/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | @app.before_serving 9 | async def startup(): 10 | app.client = aiohttp.ClientSession() 11 | 12 | @app.route('/', methods=['POST']) 13 | async def embed(): 14 | data = await request.get_json() 15 | req = ModelRequest(**data) 16 | model = Model(app) 17 | return await model.inference(req) 18 | -------------------------------------------------------------------------------- /src/conversation_terminator/remote/model.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, TFBertForSequenceClassification, BertTokenizer 2 | import tensorflow as tf 3 | from request import ModelRequest 4 | 5 | class Model: 6 | 7 | def __new__(cls, context): 8 | cls.context = context 9 | if not hasattr(cls, 'instance'): 10 | cls.instance = super(Model, cls).__new__(cls) 11 | model_name = 'Chakshu/conversation_terminator_classifier' 12 | cls.tokenizer = BertTokenizer.from_pretrained(model_name) 13 | cls.model = TFBertForSequenceClassification.from_pretrained(model_name) 14 | return cls.instance 15 | 16 | async def inference(self, request: ModelRequest): 17 | inputs = self.tokenizer(request.text,return_tensors="np", padding=True) 18 | outputs = self.model(inputs.input_ids, inputs.attention_mask) 19 | probabilities = tf.nn.sigmoid(outputs.logits) 20 | predicted_class = tf.round(probabilities) 21 | return {"ans":"'ENDED'" if int(predicted_class.numpy()) == 1 else "'NOT ENDED'"} 22 | -------------------------------------------------------------------------------- /src/conversation_terminator/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text): 6 | self.text = text 7 | 8 | def to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) 11 | -------------------------------------------------------------------------------- /src/conversation_terminator/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | tensorflow==2.12.* 4 | transformers -------------------------------------------------------------------------------- /src/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from .utils import async_request 4 | 5 | def translate(text: str, source_lang: str, target_lang: str): 6 | """ 7 | Translate a text using ai4bharat APIs 8 | """ 9 | url = "https://nmt-api.ai4bharat.org/translate_sentence" 10 | payload = json.dumps({ 11 | "text": text, 12 | "source_language": source_lang, 13 | "target_language": target_lang 14 | }) 15 | 16 | headers = { 17 | 'authority': 
'nmt-api.ai4bharat.org', 18 | 'accept': '*/*', 19 | 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', 20 | 'content-type': 'application/json', 21 | 'origin': 'https://models.ai4bharat.org', 22 | 'referer': 'https://models.ai4bharat.org/', 23 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"', 24 | 'sec-ch-ua-mobile': '?0', 25 | 'sec-ch-ua-platform': '"macOS"', 26 | 'sec-fetch-dest': 'empty', 27 | 'sec-fetch-mode': 'cors', 28 | 'sec-fetch-site': 'same-site', 29 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36' 30 | } 31 | 32 | return async_request('POST', url, headers, payload) 33 | 34 | 35 | def detect_lang(text: str, source_lang: str, target_lang: str): 36 | """ 37 | Detect the language of a text using the Bhashini/ULCA API 38 | """ 39 | url = "https://meity-auth.ulcacontrib.org/ulca/apis/v0/model/compute" 40 | 41 | payload = json.dumps({ 42 | "modelId": "631736990154d6459973318e", 43 | "task": "txt-lang-detection", 44 | "input": [ 45 | { 46 | "source": text 47 | } 48 | ], 49 | "userId": None 50 | }) 51 | 52 | headers = { 53 | 'authority': 'meity-auth.ulcacontrib.org', 54 | 'accept': '*/*', 55 | 'content-type': 'application/json', 56 | 'origin': 'https://bhashini.gov.in' 57 | } 58 | 59 | return async_request('POST', url, headers, payload) 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/coref/README.md: -------------------------------------------------------------------------------- 1 | # Purpose 2 | 3 | Common folder for all coreference resolution models. 4 | 5 | Neural coreference resolution is a natural language processing (NLP) task that involves identifying when two or more words or phrases in a text refer to the same entity or concept. 6 | 7 | For example:<br>
8 | Non-coreferenced conversation:
9 | User : 'Can you tell me where are the shops for paddy seeds?'
10 | User : 'What is the price for them?' 11 | 12 | Coreferenced conversation:
13 | User : 'Can you tell me where are the shops for paddy seeds?'
14 | User : 'What is the price for paddy seeds?' -------------------------------------------------------------------------------- /src/coref/__init__.py: -------------------------------------------------------------------------------- 1 | from spacy import * 2 | from fcoref import * -------------------------------------------------------------------------------- /src/coref/bart/README.md: -------------------------------------------------------------------------------- 1 | # Coreference Resolution 2 | 3 | The model being used in this folder is finetuned [bart-large](https://huggingface.co/facebook/bart-large) based on synthetically generated data by GPT. -------------------------------------------------------------------------------- /src/coref/bart/__init__.py: -------------------------------------------------------------------------------- 1 | from .local import * -------------------------------------------------------------------------------- /src/coref/bart/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] -------------------------------------------------------------------------------- /src/coref/bart/local/README.md: -------------------------------------------------------------------------------- 1 | # Coreference Resolution 2 | 3 | ## Test Deployment 4 | 5 | - Git clone the repo and cd to the project location. 6 | - cd to `local`, i.e., `cd ./src/coref/bart/local`. 7 | - Start your docker engine and `docker build -t bart .` 8 | - Do `docker run -p 8000:8000 bart`. 9 | - `curl -X POST -H "Content-Type: application/json" -d '{"text": TEXT}' http://0.0.0.0:8000`.
Replace `TEXT` with a sentence that needs coreference resolution, for example:<br>
"User: Can you give me more details on application process for the kisan credit card loan with State Bank of India? AI: Yes, I can provide some details on the application process for the Kisan Credit Card loan with State Bank of India. You would need to visit your nearest State Bank of India branch and submit your Aadhar Card and PAN card as mandatory documents along with other documents they may require. The loan has a tenure of 5 years subject to annual review and the effective rate of interest will be linked to One Year MCLR of the Bank. The present one year MCLR of Bank is 7.70% for loans up to 3.00 lakhs and 10.95% for loans above Rs.3.00 lakhs. User: Where is the bank located?" 10 | - The response for the above might be:<br>
11 | [ 12 | "User: Where is the State Bank of India located? 13 | ] 14 | - Additional optional parameters are `temperature`, `num_beams`, and `max_length` : `curl -X POST -H "Content-Type: application/json" -d '{"text": TEXT, "temperature": TEMPERATURE, "num_beams": NUM_BEAMS, "max_length": MAX_LENGTH}' http://0.0.0.0:8000`. 15 | 16 | -------------------------------------------------------------------------------- /src/coref/bart/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .model import Model 3 | -------------------------------------------------------------------------------- /src/coref/bart/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | model = None 9 | 10 | @app.before_serving 11 | async def startup(): 12 | app.client = aiohttp.ClientSession() 13 | global model 14 | model = Model(app) 15 | 16 | @app.route('/', methods=['POST']) 17 | async def embed(): 18 | global model 19 | data = await request.get_json() 20 | req = ModelRequest(**data) 21 | return model.inference(req) 22 | 23 | if __name__ == "__main__": 24 | app.run() -------------------------------------------------------------------------------- /src/coref/bart/local/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from request import ModelRequest 3 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM 4 | 5 | class Model(): 6 | def __new__(cls, context): 7 | cls.context = context 8 | if not hasattr(cls, 'instance'): 9 | cls.instance = super(Model, cls).__new__(cls) 10 | model_name = "ksgr5566/bartlg-coreference-resolution" 11 | cls.tokenizer = AutoTokenizer.from_pretrained(model_name) 12 | cls.model = AutoModelForSeq2SeqLM.from_pretrained(model_name) 13 | cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 14 | cls.model.to(cls.device) 15 | return cls.instance 16 | 17 | 18 | def inference(self, request: ModelRequest): 19 | encoded_prompt = self.tokenizer(request.text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device) 20 | with torch.no_grad(): 21 | output = self.model.generate( 22 | **encoded_prompt, 23 | max_length=request.max_length, 24 | num_beams=request.num_beams, 25 | temperature=request.temperature 26 | ) 27 | decode = self.tokenizer.batch_decode(output, skip_special_tokens=True) 28 | return { "text" : decode[0]} 29 | -------------------------------------------------------------------------------- /src/coref/bart/local/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text, temperature: float = 0.7, max_length: int = 512, num_beams: int = 5): 6 | self.text = text 7 | self.temperature = temperature 8 | self.max_length = max_length 9 | self.num_beams = num_beams 10 | 11 | def to_json(self): 12 | return json.dumps(self, default=lambda o: o.__dict__, 13 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/coref/bart/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu 2 | transformers 3 | quart 4 | aiohttp 
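For a quick smoke test of the bart coreference service from Python instead of curl, a minimal client sketch is shown below; it assumes the `bart` container described in the README above is already running on port 8000, and it simply mirrors the fields defined in `src/coref/bart/local/request.py` (only `text` is required; `temperature`, `num_beams`, and `max_length` are optional).
```python
# Minimal sketch: call a locally running bart coref endpoint.
# Assumes `docker run -p 8000:8000 bart` (see the README above) is active.
import requests

payload = {
    "text": (
        "User: Where can I buy paddy seeds? "
        "AI: Most licensed seed dealers stock them. "
        "User: What is their price?"
    ),
    "temperature": 0.7,  # optional; defaults come from ModelRequest
    "num_beams": 5,      # optional
    "max_length": 512,   # optional
}

resp = requests.post("http://0.0.0.0:8000/", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["text"])  # coreference-resolved conversation
```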
-------------------------------------------------------------------------------- /src/coref/fcoref/README.md: -------------------------------------------------------------------------------- 1 | # Coreference Resolution 2 | 3 | The model being used in this folder is [FastCoref](https://github.com/shon-otmazgin/fastcoref). -------------------------------------------------------------------------------- /src/coref/fcoref/__init__.py: -------------------------------------------------------------------------------- 1 | from .local import * -------------------------------------------------------------------------------- /src/coref/fcoref/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | -------------------------------------------------------------------------------- /src/coref/fcoref/local/README.md: -------------------------------------------------------------------------------- 1 | # Coreference Resolution 2 | 3 | ## Test Deployment 4 | 5 | - Git clone the repo and cd to the project location. 6 | - cd to `local`, i.e., `cd ./src/coref/fcoref/local`. 7 | - Start your docker engine and `docker build -t fcoref .`. 8 | - Do `docker run -p 8000:8000 fcoref`. 9 | - `curl -X POST -H "Content-Type: application/json" -d '{"text": TEXT}' http://0.0.0.0:8000`.
Replace `TEXT` with a sentence that needs coreference resolution, for example:<br>
`"User: Can you give me more details on application process for the kisan credit card loan with State Bank of India? AI: Yes, I can provide some details on the application process for the Kisan Credit Card loan with State Bank of India. You would need to visit your nearest State Bank of India branch and submit your Aadhar Card and PAN card as mandatory documents along with other documents they may require. The loan has a tenure of 5 years subject to annual review and the effective rate of interest will be linked to One Year MCLR of the Bank. The present one year MCLR of Bank is 7.70% for loans up to 3.00 lakhs and 10.95% for loans above Rs.3.00 lakhs. User: Where is the bank located?"` 10 | - The response for the above would be:<br>
11 | ` 12 | { 13 | "text": "User: Can you give me more details on application process for the kisan credit card loan with State Bank of India? AI: Yes, I can provide some details on the application process for the Kisan Credit Card loan with State Bank of India. You would need to visit your nearest State Bank of India branch and submit your Aadhar Card and PAN card as mandatory documents along with other documents your Aadhar Card and PAN card may require. the kisan credit card loan with State Bank of India has a tenure of 5 years subject to annual review and the effective rate of interest will be linked to One Year MCLR of State Bank of India. The present one year MCLR of State Bank of India is 7.70% for loans up to 3.00 lakhs and 10.95% for loans above Rs.3.00 lakhs. User: Where is State Bank of India located?" 14 | } 15 | ` 16 | 17 | -------------------------------------------------------------------------------- /src/coref/fcoref/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import * 2 | from .model import * -------------------------------------------------------------------------------- /src/coref/fcoref/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def translate(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | 22 | @app.route('/', methods=['GET']) 23 | async def hi(): 24 | return "hi" 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/coref/fcoref/local/model.py: -------------------------------------------------------------------------------- 1 | from cache import AsyncTTL 2 | from request import ModelRequest 3 | from fastcoref import spacy_component 4 | import spacy 5 | 6 | 7 | class Model: 8 | def __new__(cls, context): 9 | cls.context = context 10 | if not hasattr(cls, 'instance'): 11 | cls.nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "ner", "textcat"]) 12 | cls.nlp.add_pipe("fastcoref") 13 | cls.instance = super(Model, cls).__new__(cls) 14 | return cls.instance 15 | 16 | @AsyncTTL(time_to_live=600000, maxsize=1024) 17 | async def inference(self, request: ModelRequest): 18 | text = request.text 19 | doc = self.nlp(text, component_cfg={"fastcoref": {'resolve_text': True}}) 20 | text = doc._.resolved_text 21 | 22 | return {"text": text} 23 | -------------------------------------------------------------------------------- /src/coref/fcoref/local/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text): 6 | self.text = text 7 | 8 | def to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) 11 | -------------------------------------------------------------------------------- /src/coref/fcoref/local/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | 
async-cache==1.1.1 4 | requests 5 | spacy 6 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl 7 | fastcoref[train]==2.1.6 -------------------------------------------------------------------------------- /src/coref/spacy/__init__.py: -------------------------------------------------------------------------------- 1 | from .local import * -------------------------------------------------------------------------------- /src/coref/spacy/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | RUN python3 -m spacy download en_core_web_trf 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 16 | 17 | -------------------------------------------------------------------------------- /src/coref/spacy/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/coref/spacy/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def translate(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/coref/spacy/local/model.py: -------------------------------------------------------------------------------- 1 | from cache import AsyncTTL 2 | from request import ModelRequest 3 | import spacy 4 | 5 | 6 | class Model: 7 | def __new__(cls, context): 8 | cls.context = context 9 | if not hasattr(cls, 'instance'): 10 | cls.nlp = spacy.load("en_core_web_trf") 11 | cls.instance = super(Model, cls).__new__(cls) 12 | return cls.instance 13 | 14 | @AsyncTTL(time_to_live=600000, maxsize=1024) 15 | async def inference(self, request: ModelRequest): 16 | text = request.text 17 | doc = self.nlp(text) 18 | offset = 0 19 | reindex = [] 20 | for chain in doc.spans: 21 | for idx, span in enumerate(doc.spans[chain]): 22 | if idx > 0: 23 | reindex.append([span.start_char, span.end_char, doc.spans[chain][0].text]) 24 | 25 | for span in sorted(reindex, key=lambda x: x[0]): 26 | text = text[0:span[0] + offset] + span[2] + text[span[1] + offset:] 27 | offset += len(span[2]) - (span[1] - span[0]) 28 | 29 | return {"text": text} 30 | -------------------------------------------------------------------------------- /src/coref/spacy/local/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text): 6 | self.text = text 7 | 8 | def 
to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) 11 | -------------------------------------------------------------------------------- /src/coref/spacy/local/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | spacy 6 | -------------------------------------------------------------------------------- /src/coref/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/coref/tests/__init__.py -------------------------------------------------------------------------------- /src/coref/tests/parse_examples.py: -------------------------------------------------------------------------------- 1 | # Split the text by examples 2 | examples = text.strip().split('Example ')[1:] 3 | 4 | # Function to parse input and output 5 | def parse_example(example): 6 | input_start = example.find("Input") + 5 7 | output_start = example.find("Output") + 6 8 | input_text = example[input_start:output_start].strip() 9 | output_text = example[output_start:].strip() 10 | return (input_text, output_text) 11 | 12 | # Create tuples for each example 13 | tuples = [parse_example(example) for example in examples] -------------------------------------------------------------------------------- /src/coref/tests/prompt.txt: -------------------------------------------------------------------------------- 1 | I am writing a model to verify co-reference resolution for a conversation between a chatbot that helps farmers in Odisha India. The format of input is something like below. 2 | 3 | Input 4 | Q: What is the best month to sow paddy? 5 | A: ... 6 | Q: Which schemes can I leverage for it? 7 | 8 | Output 9 | Q: What is the best month to sow paddy? 10 | A: ... 11 | Q: Which schemes can I leverage for paddy? 12 | 13 | You can see that "paddy" was resolved and updated instead of it. Feel free to choose any answer (A) coming from the AI bot. There could be co-reference resolutions from that as well. 14 | 15 | The co-reference resolution should also be in the same format. You are free to choose anywhere between 1 and 3 questions for samples and always terminates at a Question (Q). 16 | 17 | Can you share 20 +ve examples of it? -------------------------------------------------------------------------------- /src/coref/tests/readme.md: -------------------------------------------------------------------------------- 1 | ## Steps 2 | 3 | 1. Use GPT4 and the prompt in prompt.txt to generate more test cases for this model. 4 | 2. Parse the test cases using the model in the spacy directory. 5 | 3. 
Publish results in a results.txt in the PR 6 | 7 | ```json 8 | { 9 | "total_test_cases": 100, 10 | "total_correct": 50, 11 | "total_incorrect": 50, 12 | } 13 | ``` -------------------------------------------------------------------------------- /src/data_generation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/data_generation/README.md -------------------------------------------------------------------------------- /src/data_generation/dictionary_aug/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/data_generation/dictionary_aug/README.md -------------------------------------------------------------------------------- /src/data_generation/dictionary_aug/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/data_generation/dictionary_aug/remote/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### Testing the model deployment : 4 | To run for testing just the Hugging Face deployment for grievence recognition, you can follow the following steps : 5 | 6 | - Git clone the repo 7 | - Go to current folder location i.e. ``` cd /src/text_classification/grievance_recognition/local ``` 8 | - Create docker image file and test the api: 9 | ``` 10 | docker build -t testmodel . 11 | docker run -p 5000:5000 testmodel 12 | curl -X POST -H "Content-Type: application/json" -d '{"text": "Where is my money? 
"}' http://localhost:5000/ 13 | ``` 14 | 15 | 16 | 17 | 18 | 19 | Required curls request : 20 | 21 | curl -X POST http://localhost:8000/ -H 'Content-Type: application/json' -d '{ 22 | "source": "ନଡ଼ିଆ", 23 | "translation": "coconut" 24 | }' 25 | -------------------------------------------------------------------------------- /src/data_generation/dictionary_aug/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/data_generation/dictionary_aug/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | from quart_cors import cors 6 | 7 | #from fastapi import FastAPI, Body 8 | app = Quart(__name__) 9 | app = cors(app) 10 | #app.client = aiohttp.ClientSession() 11 | #app = FastAPI() 12 | 13 | @app.before_serving 14 | async def startup(): 15 | app.client = aiohttp.ClientSession() 16 | 17 | @app.route('/', methods=['POST']) 18 | async def answer(): 19 | data = await request.get_json() 20 | req = ModelRequest(**data) 21 | model = Model(app) 22 | return await model.inference(req) 23 | -------------------------------------------------------------------------------- /src/data_generation/dictionary_aug/remote/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | import openai_async 4 | from cache import AsyncTTL 5 | from request import ModelRequest 6 | from tenacity import retry, wait_random_exponential, stop_after_attempt 7 | openai.api_key = os.getenv("OPENAI_API_KEY") 8 | 9 | 10 | class Model: 11 | def __new__(cls, context): 12 | cls.context = context 13 | if not hasattr(cls, 'instance'): 14 | cls.instance = super(Model, cls).__new__(cls) 15 | return cls.instance 16 | 17 | @AsyncTTL(time_to_live=600000, maxsize=1024) 18 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6)) 19 | async def inference(self, request: ModelRequest): 20 | messages = [ 21 | { 22 | "role": "user", 23 | "content": f"""{{Answer as succinctly as possible}} 24 | Generate 3 sentences in Odia containing the word {request.source} assuming it means the word {request.translation}. The word must be in different positions in the sentence (middle, beginning, end, etc.). Also provide translations for each sentence to English. Give the output in this format: 25 | [["sentence1", "translation1"], 26 | ["sentence2", "translation2"], 27 | . 28 | . 29 | . 
30 | ["sentenceN", "translationN"]]""" 31 | } 32 | ] 33 | response = await openai_async.chat_complete( 34 | openai.api_key, 35 | timeout=20000, 36 | payload={ 37 | "model": "gpt-3.5-turbo-0301", 38 | "temperature": 0.5, 39 | "messages": messages, 40 | }, 41 | ) 42 | try: 43 | gpt_text = response.json()["choices"][0]["message"]["content"] 44 | gpt_text = ''.join(gpt_text.split('\n')) 45 | sentence_translations = json.loads(gpt_text) 46 | return {"sentence_translations": sentence_translations} 47 | except: 48 | return response.json() -------------------------------------------------------------------------------- /src/data_generation/dictionary_aug/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, source, translation): 6 | self.source = source 7 | self.translation = translation 8 | 9 | def to_json(self): 10 | return json.dumps(self, default=lambda o: o.__dict__, 11 | sort_keys=True, indent=4) 12 | -------------------------------------------------------------------------------- /src/data_generation/dictionary_aug/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | openai 6 | openai_async 7 | tenacity 8 | quart-cors -------------------------------------------------------------------------------- /src/dsp/README.md: -------------------------------------------------------------------------------- 1 | # DSP (Demonstrate, Search, Predict) 2 | 3 | [DSP](https://github.com/stanfordnlp/dspy) to further augment RAG. This module is to specifically make DSP work for long passage answers required for questions rather than short factoid answers. -------------------------------------------------------------------------------- /src/dsp/__init__.py: -------------------------------------------------------------------------------- 1 | from .local import * -------------------------------------------------------------------------------- /src/dsp/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] -------------------------------------------------------------------------------- /src/dsp/local/README.md: -------------------------------------------------------------------------------- 1 | # DSP 2 | 3 | ## Test Deployment 4 | 5 | - Git clone the repo and cd to the project location. 6 | - cd to `local`, i.e., `cd ./src/dsp/local`. 7 | - Use openai api key. 8 | - Start your docker engine and `docker build -t dsp .`. 9 | - Do `docker run -p 8000:8000 dsp`. 10 | - `curl -X POST -H "Content-Type: application/json" -d '{"text": TEXT, "train": TRAIN, "server": SERVER, "model": MODEL}' http://0.0.0.0:8000`. 11 | 12 | `TEXT` is the question. `TRAIN` is the labeled samples required in list format. Ex: `[("Question1", ["Answer1"]), ("Question2", ["Answer2"])]`. `SERVER` is the retrieval model server's api endpoint. 
Make sure to implement the server so the endpoints work as required for [this](https://github.com/stanfordnlp/dspy/blob/main/dsp/modules/colbertv2.py). `MODEL` is the hugging face model that you may want to use instead of gpt-3+, it is optional. Leave it blank if you want to use gpt-3+. 13 | -------------------------------------------------------------------------------- /src/dsp/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .model import Model 3 | -------------------------------------------------------------------------------- /src/dsp/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | model = None 9 | 10 | @app.before_serving 11 | async def startup(): 12 | app.client = aiohttp.ClientSession() 13 | global model 14 | model = Model(app) 15 | 16 | @app.route('/', methods=['POST']) 17 | async def embed(): 18 | global model 19 | data = await request.get_json() 20 | req = ModelRequest(**data) 21 | return model.inference(req) 22 | 23 | if __name__ == "__main__": 24 | app.run() -------------------------------------------------------------------------------- /src/dsp/local/model.py: -------------------------------------------------------------------------------- 1 | from request import ModelRequest 2 | import dsp 3 | from utils import DSP 4 | 5 | 6 | class Model(): 7 | def __new__(cls, context): 8 | cls.context = context 9 | if not hasattr(cls, 'instance'): 10 | cls.dsp = DSP() 11 | cls.instance = super(Model, cls).__new__(cls) 12 | return cls.instance 13 | 14 | 15 | def inference(self, request: ModelRequest): 16 | train = [dsp.Example(question=question, answer=answer) for question, answer in request.train] 17 | answer, history = self.dsp(request.text, train, request.server, request.hf_model) 18 | return {"text": answer, "history": history} 19 | -------------------------------------------------------------------------------- /src/dsp/local/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text, train, server: str, model: str=None): 6 | self.text = text 7 | self.train = train 8 | self.server = server 9 | self.hf_model = model 10 | 11 | def to_json(self): 12 | return json.dumps(self, default=lambda o: o.__dict__, 13 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/dsp/local/requirements.txt: -------------------------------------------------------------------------------- 1 | dsp-ml 2 | accelerate 3 | torch 4 | tiktoken -------------------------------------------------------------------------------- /src/embeddings/README.md: -------------------------------------------------------------------------------- 1 | Embedding text content such that they can be searched using vector search techniques -------------------------------------------------------------------------------- /src/embeddings/colbert/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | RUN apt-get update && apt-get install -y \ 7 | build-essential \ 8 | git \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | # 
Install requirements 12 | COPY requirements.txt requirements.txt 13 | RUN pip3 install -r requirements.txt 14 | 15 | RUN apt-get update && apt-get install -y wget 16 | # Download necessary files 17 | RUN gdown "https://drive.google.com/uc?id=1VlLcGWmDKAoK3aUthVXOFxzOdgzf-SNo" -O Testing1.csv 18 | 19 | # Clone necessary repositories 20 | RUN apt-get update && apt-get install -y git 21 | RUN git clone https://huggingface.co/GautamR/colbert_akai 22 | 23 | # Copy the rest of the application code to the working directory 24 | COPY . /app/ 25 | EXPOSE 8000 26 | # Set the entrypoint for the container 27 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 28 | -------------------------------------------------------------------------------- /src/embeddings/colbert/local/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/embeddings/colbert/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/embeddings/colbert/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | import pandas as pd 6 | import gdown 7 | 8 | app = Quart(__name__) 9 | 10 | model = None 11 | 12 | @app.before_serving 13 | async def startup(): 14 | app.client = aiohttp.ClientSession() 15 | global model 16 | model = Model(app) 17 | 18 | @app.route('/', methods=['POST']) 19 | async def embed(): 20 | global model 21 | data = await request.get_json() 22 | req = ModelRequest(**data) 23 | return await model.inference(req) 24 | 25 | if __name__ == "__main__": 26 | app.run() 27 | -------------------------------------------------------------------------------- /src/embeddings/colbert/local/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from ragatouille import RAGPretrainedModel 3 | from request import ModelRequest 4 | from colbert import Indexer, Searcher 5 | from colbert.infra import Run, RunConfig, ColBERTConfig 6 | from colbert.data import Queries, Collection 7 | 8 | 9 | 10 | class Model(): 11 | def __new__(cls, context): 12 | cls.context = context 13 | if not hasattr(cls, 'instance'): 14 | cls.instance = super(Model, cls).__new__(cls) 15 | # Initialize Colbert 16 | cls.df = pd.read_csv('/app/Testing1.csv') 17 | cls.df['PID'] = cls.df.index.astype(str) 18 | with Run().context(RunConfig(experiment='notebook')): 19 | cls.searcher = Searcher(index='/app/colbert_akai/', collection=cls.df['content'].to_list()) 20 | print(cls.df.columns) 21 | 22 | return cls.instance 23 | 24 | async def inference(self, request: ModelRequest): 25 | query = request.text 26 | k = int( request.k ) 27 | column_returned = 'id' 28 | results = self.searcher.search(query, k) 29 | searched_ids = self.df.loc[results[0], column_returned].to_list() 30 | searched_content = self.df.loc[results[0], 'content'].to_list() 31 | return {"ids": searched_ids, "content": searched_content, "scores": results[2]} 32 | -------------------------------------------------------------------------------- /src/embeddings/colbert/local/request.py: -------------------------------------------------------------------------------- 1 
| import requests 2 | import json 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, text, k ): 7 | self.text = text 8 | self.k = k 9 | 10 | def to_json(self): 11 | return json.dumps(self, default=lambda o: o.__dict__, 12 | sort_keys=True, indent=4) 13 | -------------------------------------------------------------------------------- /src/embeddings/colbert/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | scikit-learn 3 | quart 4 | aiohttp 5 | pandas 6 | faiss-gpu 7 | datasets 8 | gdown 9 | ragatouille 10 | langchain-openai 11 | colbert-ai 12 | gdown 13 | -------------------------------------------------------------------------------- /src/embeddings/instructor/README.md: -------------------------------------------------------------------------------- 1 | ## Instructor model for generating embedding -------------------------------------------------------------------------------- /src/embeddings/instructor/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] -------------------------------------------------------------------------------- /src/embeddings/instructor/local/README.md: -------------------------------------------------------------------------------- 1 | ## Instructor Embedding model: 2 | 3 | ### Purpose : 4 | Model to Create Embeddings from given text using Instructor Large model. 5 | 6 | ### Testing the model deployment : 7 | To run for testing just the Hugging Face deployment for grievence recognition, you can follow the following steps : 8 | 9 | - Git clone the repo 10 | - Go to current folder location i.e. ``` cd src/embeddings/instructor/local ``` 11 | - Create docker image file and test the api: 12 | ``` 13 | docker build -t testmodel . 14 | docker run -p 8000:8000 testmodel 15 | curl -X POST -H "Content-Type: application/json" -d '{"query": "Where is my money? 
"}' http://localhost:8000/ 16 | 17 | curl -X POST -F "file=@input.csv" http://localhost:8000/embeddings/instructor/local -o output.csv 18 | ``` 19 | -------------------------------------------------------------------------------- /src/embeddings/instructor/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import * 2 | from .model import * 3 | -------------------------------------------------------------------------------- /src/embeddings/instructor/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request,Response, send_file 4 | import aiohttp 5 | import pandas as pd 6 | import io 7 | 8 | app = Quart(__name__) 9 | 10 | model = None 11 | 12 | @app.before_serving 13 | async def startup(): 14 | app.client = aiohttp.ClientSession() 15 | global model 16 | model = Model(app) 17 | 18 | @app.route('/', methods=['POST']) 19 | async def embed(): 20 | global model 21 | data = await request.get_json() 22 | files = await request.files # await the coroutine 23 | uploaded_file = files.get('file') # now you can use .get() 24 | 25 | if uploaded_file: 26 | df = pd.read_csv(uploaded_file.stream) 27 | req = ModelRequest(df=df) # Pass the DataFrame to ModelRequest 28 | response = await model.inference(req) 29 | df = pd.read_csv(io.StringIO(response)) # Convert the CSV string back to a DataFrame 30 | # Save the DataFrame to a CSV file 31 | df.to_csv('output.csv', index=False) 32 | 33 | return await send_file('output.csv', mimetype='text/csv', as_attachment=True, attachment_filename='output.csv') 34 | 35 | else: 36 | req = ModelRequest(**data) 37 | return await model.inference(req) 38 | -------------------------------------------------------------------------------- /src/embeddings/instructor/local/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from request import ModelRequest 3 | from InstructorEmbedding import INSTRUCTOR 4 | import wget 5 | import pandas as pd 6 | import os 7 | 8 | class Model(): 9 | def __new__(cls, context): 10 | cls.context = context 11 | if not hasattr(cls, 'instance'): 12 | cls.instance = super(Model, cls).__new__(cls) 13 | model_name = "hkunlp/instructor-large" 14 | cls.model = INSTRUCTOR(model_name) 15 | return cls.instance 16 | 17 | async def inference(self, request: ModelRequest): 18 | # Modify this function according to model requirements such that inputs and output remains the same 19 | corpus_instruction = "Represent the Wikipedia document for retrieval:" 20 | query_instruction = 'Represent the Wikipedia question for retrieving supporting documents: ' 21 | query = request.query 22 | 23 | if(query != None): 24 | # print('Query Encoding Process :-') 25 | query_embeddings = self.model.encode( 26 | [[query_instruction, query]], 27 | show_progress_bar=False, 28 | batch_size=32, 29 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 30 | ) 31 | return query_embeddings.tolist() 32 | 33 | if not request.df.empty: 34 | # print('Text corpus Encoding Process :-') 35 | data = request.df 36 | 37 | text_corpus = data.loc[:,'content'].to_list() 38 | corpus_embeddings = self.model.encode( 39 | [[corpus_instruction, text] for text in text_corpus], 40 | show_progress_bar=False, 41 | batch_size=32, 42 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 43 | ) 44 | data['embeddings'] = 
corpus_embeddings.tolist() 45 | csv_string = data.to_csv(index=False) 46 | 47 | return str(csv_string) 48 | -------------------------------------------------------------------------------- /src/embeddings/instructor/local/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, query=None, df = pd.DataFrame()): 7 | # Optional query string; a DataFrame of documents is passed instead when a CSV file is uploaded 8 | self.query = query # String 9 | self.df = df 10 | 11 | def to_json(self): 12 | return json.dumps(self, default=lambda o: o.__dict__, 13 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/embeddings/instructor/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | quart 3 | aiohttp 4 | InstructorEmbedding 5 | wget 6 | pandas 7 | tqdm 8 | sentence-transformers==2.2.2 -------------------------------------------------------------------------------- /src/embeddings/openai/README.md: -------------------------------------------------------------------------------- 1 | Using OpenAI's embeddings to embed text into numeric vectors that can be searched using vector search -------------------------------------------------------------------------------- /src/embeddings/openai/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . 
/app/ 12 | 13 | # Set the entrypoint for the container 14 | ENTRYPOINT ["hypercorn", "api", "-b", "0.0.0.0"] 15 | 16 | -------------------------------------------------------------------------------- /src/embeddings/openai/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/embeddings/openai/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def translate(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/embeddings/openai/remote/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | from openai.embeddings_utils import get_embedding 4 | from cache import AsyncTTL 5 | from request import ModelRequest 6 | import numpy as np 7 | import pandas as pd 8 | import tiktoken 9 | import ast 10 | from sklearn.metrics.pairwise import cosine_similarity 11 | 12 | openai.api_key = os.getenv("OPENAI_API_KEY") 13 | 14 | 15 | class Model: 16 | embedding_df = None 17 | embedding_model = "text-embedding-ada-002" 18 | embedding_encoding = "cl100k_base" # this the encoding for text-embedding-ada-002 19 | max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191 20 | 21 | def __new__(cls, context): 22 | cls.context = context 23 | if not hasattr(cls, 'instance'): 24 | cls.embedding_df = pd.read_csv('src/embeddings/openai/remote/akai.csv') 25 | cls.embedding_df['embedding'] = cls.embedding_df['embedding'].apply(ast.literal_eval) 26 | cls.instance = super(Model, cls).__new__(cls) 27 | return cls.instance 28 | 29 | @AsyncTTL(time_to_live=600000, maxsize=1024) 30 | async def inference(self, request: ModelRequest): 31 | print("request.prompt", request.prompt) 32 | new_prompt_embedding = get_embedding(request.prompt, engine=self.embedding_model) 33 | similarity_scores = cosine_similarity( 34 | [new_prompt_embedding], np.stack(self.embedding_df['embedding'], axis=0))[0] 35 | most_similar_indices = np.argsort(similarity_scores)[::-1] 36 | most_similar_prompts = self.embedding_df.loc[most_similar_indices, ['combined_prompt', 'combined_content']] 37 | most_similar_prompts['similarity_score'] = np.sort(similarity_scores)[::-1] 38 | similar_content = most_similar_prompts.iloc[0:20] 39 | sim_cutoff_range = np.max(similar_content['similarity_score']) - request.similarity_score_range 40 | similar_content_df = similar_content.loc[similar_content['similarity_score'] >= sim_cutoff_range, :] 41 | similar_content_df1 = similar_content_df.drop(columns='similarity_score') 42 | similar_content_dict = similar_content_df1.to_dict('records') 43 | # modified_content_dict = remove_content_tags_from_dic(similar_content_dict) 44 | print("similar_content_dict", similar_content_dict) 45 | return (similar_content_dict) 46 | 47 | async def 
create_embeddings(self, embedding_df): 48 | encoding = tiktoken.get_encoding(self.embedding_encoding) 49 | embedding_df["n_tokens"] = embedding_df.combined_prompt.apply(lambda x: len(encoding.encode(x))) 50 | embedding_df["embedding"] = embedding_df.combined_prompt.apply( 51 | lambda x: get_embedding(x, engine=self.embedding_model)) 52 | return embedding_df -------------------------------------------------------------------------------- /src/embeddings/openai/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, prompt, similarity_score_range=0): 6 | self.prompt = prompt 7 | self.similarity_score_range = similarity_score_range 8 | 9 | def to_json(self): 10 | return json.dumps(self, default=lambda o: o.__dict__, 11 | sort_keys=True, indent=4) 12 | -------------------------------------------------------------------------------- /src/embeddings/openai/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | openai 6 | numpy 7 | pandas 8 | tiktoken 9 | sklearn -------------------------------------------------------------------------------- /src/intent_recognition/README.md: -------------------------------------------------------------------------------- 1 | Intent recognition is to recognize the intent for a chat message for a user -------------------------------------------------------------------------------- /src/llm/__init__.py: -------------------------------------------------------------------------------- 1 | from .openai import * -------------------------------------------------------------------------------- /src/llm/openai/__init__.py: -------------------------------------------------------------------------------- 1 | from .chatgpt3 import * -------------------------------------------------------------------------------- /src/llm/openai/chatgpt3/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . 
/app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt3/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/llm/openai/chatgpt3/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def answer(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt3/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | import openai_async 4 | from cache import AsyncTTL 5 | from request import ModelRequest 6 | from tenacity import retry, wait_random_exponential, stop_after_attempt 7 | 8 | openai.api_key = os.getenv("OPENAI_API_KEY") 9 | 10 | 11 | class Model: 12 | def __new__(cls, context): 13 | cls.context = context 14 | if not hasattr(cls, 'instance'): 15 | cls.instance = super(Model, cls).__new__(cls) 16 | return cls.instance 17 | 18 | @AsyncTTL(time_to_live=600000, maxsize=1024) 19 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6)) 20 | async def inference(self, request: ModelRequest): 21 | response = await openai_async.chat_complete( 22 | openai.api_key, 23 | timeout=20000, 24 | payload={ 25 | "model": "gpt-3.5-turbo-0301", 26 | "temperature": 0, 27 | "messages": request.prompt, 28 | }, 29 | ) 30 | return response.json() 31 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt3/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, prompt): 6 | self.prompt = prompt 7 | 8 | def to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) 11 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt3/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart 3 | async-cache==1.1.1 4 | requests 5 | openai==0.28 6 | openai_async 7 | tenacity 8 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . 
/app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def answer(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | import openai_async 4 | from cache import AsyncTTL 5 | from request import ModelRequest 6 | from tenacity import retry, wait_random_exponential, stop_after_attempt 7 | 8 | openai.api_key = os.getenv("OPENAI_API_KEY") 9 | 10 | 11 | class Model: 12 | def __new__(cls, context): 13 | cls.context = context 14 | if not hasattr(cls, 'instance'): 15 | cls.instance = super(Model, cls).__new__(cls) 16 | return cls.instance 17 | 18 | @AsyncTTL(time_to_live=600000, maxsize=1024) 19 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6)) 20 | async def inference(self, request: ModelRequest): 21 | response = await openai_async.chat_complete( 22 | openai.api_key, 23 | timeout=20000, 24 | payload={ 25 | "model": "gpt-4", 26 | "temperature": 0, 27 | "messages": [{"role":"user","content" : request.prompt}], 28 | }, 29 | ) 30 | try: 31 | ans = response.json()["choices"][0]["message"]["content"] 32 | return {"ans":ans} 33 | except: 34 | return response.json() 35 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, prompt): 6 | self.prompt = prompt 7 | 8 | def to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) 11 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | openai==0.28 6 | openai_async 7 | tenacity 8 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4turbo_preview/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r 
requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4turbo_preview/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4turbo_preview/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def answer(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4turbo_preview/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | import openai_async 4 | from cache import AsyncTTL 5 | from request import ModelRequest 6 | from tenacity import retry, wait_random_exponential, stop_after_attempt 7 | 8 | openai.api_key = os.getenv("OPENAI_API_KEY") 9 | 10 | class Model: 11 | def __new__(cls, context): 12 | cls.context = context 13 | if not hasattr(cls, 'instance'): 14 | cls.instance = super(Model, cls).__new__(cls) 15 | return cls.instance 16 | 17 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6)) 18 | async def inference(self, request: ModelRequest): 19 | response = await openai_async.chat_complete( 20 | openai.api_key, 21 | timeout=20000, 22 | payload={ 23 | "model": "gpt-4-1106-preview", 24 | "temperature": 0, 25 | "messages": request.prompt, 26 | }, 27 | ) 28 | return response.json() 29 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4turbo_preview/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, prompt): 6 | self.prompt = prompt 7 | 8 | def to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) 11 | -------------------------------------------------------------------------------- /src/llm/openai/chatgpt4turbo_preview/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart 3 | async-cache==1.1.1 4 | requests 5 | openai==0.28 6 | openai_async==0.0.3 7 | tenacity -------------------------------------------------------------------------------- /src/ner/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/ner/README.md -------------------------------------------------------------------------------- /src/ner/agri_ner_akai/README.md: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/ner/agri_ner_akai/README.md -------------------------------------------------------------------------------- /src/ner/agri_ner_akai/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | RUN python -m spacy download en_core_web_sm 11 | 12 | # Copy the rest of the application code to the working directory 13 | COPY . /app/ 14 | EXPOSE 8000 15 | # Set the entrypoint for the container 16 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 17 | -------------------------------------------------------------------------------- /src/ner/agri_ner_akai/local/README.md: -------------------------------------------------------------------------------- 1 | ## NER: 2 | 3 | ### Purpose : 4 | 5 | Model to detect 6 | 7 | - crops 8 | - pests 9 | - seed type 10 | - email 11 | - time 12 | - phone numbers 13 | - numbers with units 14 | - dates 15 | 16 | ### Testing the model deployment : 17 | 18 | To run for testing just the Hugging Face deployment for grievence recognition, you can follow the following steps : 19 | 20 | - Git clone the repo 21 | - Go to current folder location i.e. ``cd /src/ner/agri_ner_akai/local`` 22 | - Create docker image file and test the api: 23 | 24 | ``` 25 | docker build -t testmodel . 26 | docker run -p 8000:8000 testmodel 27 | ``` 28 | 29 | ### **Request** 30 | 31 | ``` 32 | curl -X POST -H "Content-Type: application/json" -d '{ 33 | "text": "What are tomatoes and potaotes that are being attacked by aphids will be treated next monday?", 34 | "type": ["email", "CROP"] 35 | }' http://localhost:8000/ 36 | ``` 37 | 38 | ``` 39 | curl -X POST -H "Content-Type: application/json" -d '{ 40 | "text": "What are tomatoes and potaotes that are being attacked by aphids? 
" 41 | }' http://localhost:8000/ 42 | ``` 43 | -------------------------------------------------------------------------------- /src/ner/agri_ner_akai/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/ner/agri_ner_akai/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request, jsonify 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | model = None 9 | 10 | @app.before_serving 11 | async def startup(): 12 | app.client = aiohttp.ClientSession() 13 | global model 14 | model = Model(app) 15 | 16 | @app.route('/', methods=['POST']) 17 | async def embed(): 18 | global model 19 | data = await request.get_json() 20 | req = ModelRequest(**data) 21 | entities = await model.inference(req) 22 | return jsonify(entities) # Convert the list of entities to JSON format 23 | 24 | if __name__ == "__main__": 25 | app.run() -------------------------------------------------------------------------------- /src/ner/agri_ner_akai/local/model.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | from request import ModelRequest 3 | from regex_parse_ner import RegNERModel 4 | from bert_ner import BertNERModel 5 | 6 | class Model(): 7 | def __init__(self, context): 8 | self.context = context 9 | print("Loading models...") 10 | self.regex_model = RegNERModel() 11 | print("Regex model loaded successfully") 12 | self.bert_model = BertNERModel() 13 | print("Bert model loaded successfully") 14 | 15 | def combine_entities(self, reg_entities, bert_entities): 16 | combined_entities = reg_entities 17 | 18 | for entity in bert_entities: 19 | if entity['entity_group'] not in combined_entities: 20 | combined_entities[entity['entity_group']] = [] 21 | 22 | entity_info = { 23 | 'name': entity['word'], 24 | 'start': entity['start'], 25 | 'end': entity['end'], 26 | 'score': entity['score'] 27 | } 28 | 29 | combined_entities[entity['entity_group']].append(entity_info) 30 | 31 | return combined_entities 32 | 33 | async def inference(self, request: ModelRequest): 34 | sentence = request.text 35 | types = request.type 36 | 37 | reg_entities = self.regex_model.inference(sentence) 38 | bert_entities = self.bert_model.inference(sentence) 39 | 40 | combined_entities = self.combine_entities(reg_entities, bert_entities) 41 | 42 | final_entities = {} 43 | 44 | if types is None: 45 | return combined_entities 46 | 47 | for entity_group in combined_entities: 48 | if entity_group in types: 49 | final_entities[entity_group] = combined_entities[entity_group] 50 | 51 | return final_entities -------------------------------------------------------------------------------- /src/ner/agri_ner_akai/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, text, type=None): 7 | self.text = text 8 | self.type = type 9 | 10 | def to_json(self): 11 | return json.dumps(self, default=lambda o: o.__dict__, 12 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/ner/agri_ner_akai/local/requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | quart 4 | aiohttp 5 | spacy 6 | tensorflow==2.10.0 -------------------------------------------------------------------------------- /src/rerankers/bge_base/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 16 | -------------------------------------------------------------------------------- /src/rerankers/bge_base/local/README.md: -------------------------------------------------------------------------------- 1 | ## Grievance classification: 2 | 3 | 4 | ### Purpose : 5 | Re rank given a question and a list of contetn 6 | 7 | 8 | ### Testing the model deployment : 9 | To run for testing just the Hugging Face deployment for grievence recognition, you can follow the following steps : 10 | 11 | - Git clone the repo 12 | - Go to current folder location i.e. ``` cd /src/rerankers/bge_base/local ``` 13 | - Create docker image file and test the api: 14 | ``` 15 | docker build -t testmodel . 16 | docker run -p 8000:8000 testmodel 17 | curl -X POST -H "Content-Type: application/json" \ 18 | -d '{"question": "What is agriculture ?", "content_chunks": ["Farming is a practice of growing crops to sell them to generate money", "LLM are the present day hype machine but will they be useful until you can truly reason with them? 
", "Things are generally better than what people deep into it feel"]}' \ 19 | http://localhost:8000/ 20 | ``` 21 | -------------------------------------------------------------------------------- /src/rerankers/bge_base/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/rerankers/bge_base/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request,jsonify 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | model = None 9 | 10 | @app.before_serving 11 | async def startup(): 12 | app.client = aiohttp.ClientSession() 13 | global model 14 | model = Model(app) 15 | 16 | @app.route('/', methods=['POST']) 17 | async def embed(): 18 | global model 19 | data = await request.get_json() 20 | req = ModelRequest(**data) 21 | prediction = await model.inference(req) 22 | # Convert the NumPy array to a list (or another serializable format) and return as JSON 23 | if prediction is not None: 24 | return jsonify(prediction.tolist()) # Assuming 'prediction' is a NumPy array 25 | else: 26 | # Return a meaningful error message if prediction is None 27 | return jsonify({'error': 'Prediction failed'}), 500 28 | 29 | if __name__ == "__main__": 30 | app.run() -------------------------------------------------------------------------------- /src/rerankers/bge_base/local/model.py: -------------------------------------------------------------------------------- 1 | from request import ModelRequest 2 | from sentence_transformers.cross_encoder import CrossEncoder 3 | import torch 4 | 5 | class Model(): 6 | def __new__(cls, context): 7 | cls.context = context 8 | if not hasattr(cls, 'instance'): 9 | cls.instance = super(Model, cls).__new__(cls) 10 | model_name = "BAAI/bge-reranker-base" 11 | cls.model = CrossEncoder(model_name) 12 | return cls.instance 13 | 14 | 15 | async def inference(self, request: ModelRequest): 16 | predict_array = request.predict_array 17 | predictions = self.model.predict(predict_array) 18 | return (predictions) 19 | -------------------------------------------------------------------------------- /src/rerankers/bge_base/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | class ModelRequest(): 5 | def __init__(self, question, content_chunks): 6 | self.question = question 7 | self.content_chunks = content_chunks 8 | self.predict_array = [[question, content] for content in content_chunks] 9 | 10 | def to_json(self): 11 | return json.dumps(self, default=lambda o: o.__dict__, 12 | sort_keys=True, indent=4) 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/rerankers/bge_base/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | sentence_transformers 3 | quart 4 | aiohttp 5 | -------------------------------------------------------------------------------- /src/search/README.md: -------------------------------------------------------------------------------- 1 | # Purpose 2 | 3 | Common folder for scoring methods required for augmenting search and retrieval of documents. 
-------------------------------------------------------------------------------- /src/search/__init__.py: -------------------------------------------------------------------------------- 1 | from word_score import * -------------------------------------------------------------------------------- /src/search/tf_search/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Download the CSV from Google Drive and store it in the "content" directory 11 | RUN apt-get update && apt-get install -y curl && \ 12 | mkdir content && \ 13 | curl -L 'https://drive.google.com/uc?export=download&id=13aDWCvj7PqFw7aPvPK_Qli3-Ei9mVwaO' -o content/data.csv && \ 14 | apt-get remove -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/* 15 | 16 | # Copy the rest of the application code to the working directory 17 | COPY . /app/ 18 | EXPOSE 8000 19 | 20 | # Set the entrypoint for the container 21 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 22 | -------------------------------------------------------------------------------- /src/search/tf_search/local/README copy.md: -------------------------------------------------------------------------------- 1 | # Word Score 2 | 3 | ## Test Deployment 4 | 5 | - Git clone the repo and cd to the project location. 6 | - cd to `local`, i.e., `cd ./src/search/word_score/local`. 7 | - Replace the link in the Dockerfile to a downloadable csv file of your choice, but the data column should be named `tags`. 8 | - Start your docker engine and `docker build -t word_score .`. 9 | - Do `docker run -p 8000:8000 word_score`. 10 | - `curl -X POST -H "Content-Type: application/json" -d '{"query": "leave policy planned leaves", "k": "6"}' http://localhost:8000/`. 
< 11 | ` 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/search/tf_search/local/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/search/tf_search/local/README.md -------------------------------------------------------------------------------- /src/search/tf_search/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import * 2 | from .model import * -------------------------------------------------------------------------------- /src/search/tf_search/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | import os 6 | import pandas as pd 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.metrics.pairwise import linear_kernel 9 | 10 | app = Quart(__name__) 11 | 12 | # Global variable for the dataframe 13 | df = None 14 | vectorizer = None 15 | content_matrix = None 16 | 17 | files = os.listdir("./content") 18 | df = pd.read_csv(os.path.join("./content", files[0])) 19 | print(df.columns) 20 | 21 | 22 | @app.before_serving 23 | async def startup(): 24 | app.client = aiohttp.ClientSession() 25 | 26 | # Load the dataframe during startup 27 | global df 28 | global content_matrix 29 | global vectorizer 30 | files = os.listdir("./content") 31 | df = pd.read_csv(os.path.join("./content", files[0])) 32 | print(df.columns) 33 | # Initialize a CountVectorizer with additional parameters 34 | vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), binary=True, stop_words='english') 35 | 36 | # Fit the vectorizer on the content column and transform it 37 | content_matrix = vectorizer.fit_transform(df['heading'] + df['content']) 38 | print("Type of vectorizer:", type(vectorizer)) 39 | 40 | @app.route('/', methods=['POST']) 41 | async def translate(): 42 | global vectorizer, content_matrix 43 | data = await request.get_json() 44 | req = ModelRequest(**data) 45 | 46 | print("Inside translate function, type of vectorizer:", type(vectorizer)) 47 | 48 | # Pass the dataframe as an argument to the Model class 49 | model = Model(df, content_matrix, vectorizer, req) 50 | return await model.inference(req) 51 | 52 | @app.route('/', methods=['GET']) 53 | async def hi(): 54 | return df.columns 55 | 56 | 57 | if __name__ == "__main__": 58 | app.run(debug=True, port=8000) 59 | -------------------------------------------------------------------------------- /src/search/tf_search/local/model.py: -------------------------------------------------------------------------------- 1 | from request import ModelRequest 2 | 3 | import numpy as np 4 | from cache import AsyncTTL 5 | import pandas as pd 6 | from sklearn.feature_extraction.text import CountVectorizer 7 | from sklearn.metrics.pairwise import linear_kernel 8 | 9 | 10 | class Model: 11 | def __init__(self, df1, matrix, vec, req: ModelRequest): 12 | self.df = df1 13 | self.content_matrix = matrix 14 | self.vectorizer = vec 15 | 16 | print("Inside Model's constructor, type of vectorizer:", type(self.vectorizer)) 17 | 18 | 19 | @AsyncTTL(time_to_live=600000, maxsize=1024) 20 | async def inference(self, request: ModelRequest): 21 | 22 | query = [request.query] 23 | query_vector = 
self.vectorizer.transform(query) 24 | k = int(request.k) # k is the number of top k words to consider for the score 25 | 26 | count_scores = linear_kernel(query_vector, self.content_matrix).flatten() 27 | 28 | # Create a new DataFrame with content and count scores 29 | result_df = pd.DataFrame({ 30 | 'content': self.df['content'], 31 | 'count_score': count_scores, 32 | 'chunk_id' : self.df['chunkId'] 33 | }) 34 | 35 | # Sort the DataFrame based on count scores in descending order 36 | sorted_df = result_df.sort_values(by='count_score', ascending=False) 37 | sorted_df = sorted_df.head(k) 38 | 39 | return sorted_df.to_dict() 40 | -------------------------------------------------------------------------------- /src/search/tf_search/local/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, query, k=4): 6 | self.query = query 7 | self.k = k 8 | 9 | 10 | def to_json(self): 11 | return json.dumps(self, default=lambda o: o.__dict__, 12 | sort_keys=True, indent=4) 13 | -------------------------------------------------------------------------------- /src/search/tf_search/local/requirements.txt: -------------------------------------------------------------------------------- 1 | quart 2 | aiohttp 3 | async-cache==1.1.1 4 | pandas 5 | scikit-learn -------------------------------------------------------------------------------- /src/search/word_score/README.md: -------------------------------------------------------------------------------- 1 | # Word Score 2 | 3 | This folder consists of an API that scores documents based on an approach that combines IDF and Fuzzy word matching. 4 | 5 | For a given query, it calculates fuzzy matching scores for words in query (max score for a word from entire row), weights them with IDF, takes average of the scores of all words in the query to give a score for the entire query, sorts them, and returns the top n matches. -------------------------------------------------------------------------------- /src/search/word_score/__init__.py: -------------------------------------------------------------------------------- 1 | from .local import * -------------------------------------------------------------------------------- /src/search/word_score/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Download the CSV from Google Drive and store it in the "content" directory 11 | RUN apt-get update && apt-get install -y curl && \ 12 | mkdir content && \ 13 | curl -L 'https://drive.google.com/uc?export=download&id=1Ka6cyCCHbRy6h8Ej075_Nk9mMICp_xS6' -o content/data.csv && \ 14 | apt-get remove -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/* 15 | 16 | # Copy the rest of the application code to the working directory 17 | COPY . /app/ 18 | EXPOSE 8000 19 | 20 | # Set the entrypoint for the container 21 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 22 | -------------------------------------------------------------------------------- /src/search/word_score/local/README.md: -------------------------------------------------------------------------------- 1 | # Word Score 2 | 3 | ## Test Deployment 4 | 5 | - Git clone the repo and cd to the project location. 
6 | - cd to `local`, i.e., `cd ./src/search/word_score/local`. 7 | - Replace the link in the Dockerfile with a link to a downloadable CSV file of your choice; the data column should be named `tags`. 8 | - Start your Docker engine and run `docker build -t word_score .`. 9 | - Run `docker run -p 8000:8000 word_score`. 10 | - `curl -X POST -H "Content-Type: application/json" -d '{"query": "seed procurement district", "n": "5", "search_category" : "seed", "threshold": "0.8", "k": "6"}' http://localhost:8000/`.
Replace `seed procurement district` with the query you want to search and `5` with the number of rows you want to retrieve. Change the `threshold` value (0 to 1) to retrieve only documents whose scores cross that threshold. `k` is the number of top-scoring words considered when thresholding the score. 11 | - The response for the above would be:
12 | ` 13 | { 14 | "docs": ["row1", "row2", ... , "rowN"] 15 | } 16 | ` 17 | The list of strings contains the top N rows. 18 | 19 | -------------------------------------------------------------------------------- /src/search/word_score/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import * 2 | from .model import * -------------------------------------------------------------------------------- /src/search/word_score/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | import os 6 | import pandas as pd 7 | 8 | app = Quart(__name__) 9 | 10 | # Global variable for the dataframe 11 | global_df = None 12 | 13 | @app.before_serving 14 | async def startup(): 15 | app.client = aiohttp.ClientSession() 16 | 17 | # Load the dataframe during startup 18 | global global_df 19 | global seed_df 20 | global pesticide_df 21 | global fertilizer_df 22 | files = os.listdir("./content") 23 | global_df = pd.read_csv(os.path.join("./content", files[0])) 24 | global_df['tags'] = global_df['tags'].str.lower() 25 | seed_df = global_df.loc[global_df.category == 'seed',: ] 26 | pesticide_df = global_df.loc[global_df.category == 'pesticide',: ] 27 | fertilizer_df = global_df.loc[global_df.category == 'fertilizer',: ] 28 | 29 | @app.route('/', methods=['POST']) 30 | async def translate(): 31 | data = await request.get_json() 32 | req = ModelRequest(**data) 33 | # Pass the dataframe as an argument to the Model class 34 | model = Model(seed_df,pesticide_df, fertilizer_df, global_df , req) 35 | return await model.inference(req) 36 | 37 | @app.route('/', methods=['GET']) 38 | async def hi(): 39 | return "hi" 40 | -------------------------------------------------------------------------------- /src/search/word_score/local/model.py: -------------------------------------------------------------------------------- 1 | from request import ModelRequest 2 | from thefuzz import fuzz 3 | import numpy as np 4 | from cache import AsyncTTL 5 | from tqdm import tqdm 6 | 7 | 8 | class Model: 9 | def __init__(self, seed_df,pesticide_df, fertilizer_df, global_df, request: ModelRequest, search_category= 'others'): 10 | self.search_category = request.search_category 11 | if self.search_category == 'seed': 12 | self.df = seed_df 13 | elif self.search_category == 'fertilizer': 14 | self.df = fertilizer_df 15 | elif self.search_category == 'pesticide': 16 | self.df = pesticide_df 17 | else : 18 | self.df = global_df 19 | 20 | 21 | def __fuzzy_match(self, query_tokens, doc_tokens, k): 22 | fuzzy_scores = [] 23 | query_set = set(query_tokens) 24 | doc_set = set(doc_tokens) 25 | 26 | for q_token in query_set: 27 | max_ratio = None 28 | # max_token = None 29 | for token in doc_set: 30 | ratio = fuzz.ratio(token, q_token) 31 | if max_ratio == None or ratio > max_ratio: 32 | max_ratio = ratio 33 | # max_token = token 34 | 35 | fuzzy_scores.append((max_ratio / 100)) 36 | 37 | fuzzy_scores = sorted(fuzzy_scores, reverse=True) 38 | 39 | return np.mean(fuzzy_scores), np.mean(fuzzy_scores[:k]) 40 | 41 | 42 | @AsyncTTL(time_to_live=600000, maxsize=1024) 43 | async def inference(self, request: ModelRequest): 44 | scores = [] 45 | top_k_scores = [] 46 | query = request.query 47 | threshold = float(request.threshold) 48 | k = int(request.k) # k is the number of top k words to consider for the score 49 | n = int(request.n) # 
n is the number of documents to return 50 | query_tokens = query.lower().split() 51 | 52 | for _, row in tqdm(self.df.iterrows()): 53 | doc_tokens = str(row['tags']).split() 54 | fuzzy_score, top_k_score = self.__fuzzy_match(query_tokens, doc_tokens, k) 55 | scores.append(fuzzy_score) 56 | top_k_scores.append(top_k_score) 57 | 58 | new_df = self.df.copy(deep=True) 59 | new_df['scores'] = scores 60 | new_df['top_k_scores'] = top_k_scores 61 | new_df = new_df[new_df['top_k_scores'] > threshold] 62 | new_df_sorted = new_df.sort_values(by=['scores'], ascending=False).head(n) 63 | return {"docs": new_df_sorted['tags'].to_list()} 64 | -------------------------------------------------------------------------------- /src/search/word_score/local/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, query, n, search_category, threshold, k=5): 6 | self.query = query 7 | self.n = n 8 | self.search_category = search_category 9 | self.threshold = threshold 10 | self.k = k 11 | 12 | def to_json(self): 13 | return json.dumps(self, default=lambda o: o.__dict__, 14 | sort_keys=True, indent=4) 15 | -------------------------------------------------------------------------------- /src/search/word_score/local/requirements.txt: -------------------------------------------------------------------------------- 1 | thefuzz 2 | quart 3 | aiohttp 4 | async-cache==1.1.1 5 | pandas 6 | tqdm -------------------------------------------------------------------------------- /src/speech_lang_detection/README.md: -------------------------------------------------------------------------------- 1 | This covers language detection for langauges without any translation etc 2 | -------------------------------------------------------------------------------- /src/speech_lang_detection/batch/README.md: -------------------------------------------------------------------------------- 1 | This covers language detection for langauges without any translation etc for batch files (.wav files) 2 | -------------------------------------------------------------------------------- /src/speech_lang_detection/streaming/README.md: -------------------------------------------------------------------------------- 1 | This covers language detection for langauges without any translation etc for streaming data -------------------------------------------------------------------------------- /src/spell_check/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/spell_check/README.md -------------------------------------------------------------------------------- /src/spell_check/kenlm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/spell_check/kenlm/README.md -------------------------------------------------------------------------------- /src/spell_check/kenlm/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | # Install system packages required for building kenlm 7 | RUN apt-get update && apt-get install -y cmake g++ zlib1g-dev 8 | 9 | # Install requirements 10 | COPY requirements.txt 
requirements.txt 11 | RUN pip3 install -r requirements.txt 12 | 13 | RUN echo "Downloading the language model files" 14 | # Install kenlm using pip 15 | RUN pip3 install https://github.com/kpu/kenlm/archive/master.zip 16 | RUN apt-get update && apt-get install -y wget 17 | 18 | 19 | RUN echo "Downloading the language model files" 20 | RUN apt-get install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev 21 | 22 | RUN echo "Downloading the language model files" 23 | # Download the files using wget 24 | RUN wget "https://drive.google.com/uc?export=download&id=1frSw5-qfRMgrYs4QL961s2yYuq2KplEM" -O '5gram_model.bin' 25 | RUN wget "https://drive.google.com/uc?export=download&id=1o31Z4TZbAOEt6E8Rx7VMONJOGJH-5Mwk" -O 'lexicon.txt' 26 | RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt' 27 | RUN wget "https://drive.google.com/uc?export=download&id=1eVWwarCm8Wqq3vYqsE9f2jvrp-rvr6QZ" -O 'texts.txt' 28 | 29 | RUN wget "https://drive.google.com/uc?export=download&id=1-Dtk5socjYdeGyqhbQzG-rvWJfWVFGqv" -O '5gram_model_eng.bin' 30 | RUN wget "https://drive.google.com/uc?export=download&id=1-59pDTvEXCMUZ-NQ8BwmCnHQZh4Eg6Gw" -O 'lexicon_eng.txt' 31 | RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt' 32 | RUN wget "https://drive.google.com/uc?export=download&id=1-iZvej7L92Aga9VZ33BM5ybUTiR0hMF8" -O 'texts_eng.txt' 33 | 34 | RUN echo "Downloading the language model files" 35 | # Dowload the kenlm training files 36 | RUN wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz 37 | RUN mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2 38 | 39 | # Copy the rest of the application code to the working directory 40 | COPY . 
/app/ 41 | 42 | EXPOSE 8000 43 | 44 | # Set the entrypoint for the container 45 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 46 | -------------------------------------------------------------------------------- /src/spell_check/kenlm/local/README.md: -------------------------------------------------------------------------------- 1 | .curl request : 2 | 3 | curl -X POST -H "Content-Type: application/json" -d '{ 4 | "text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି", 5 | "BEAM_WIDTH": 5, 6 | "SCORE_THRESHOLD": 1.5, 7 | "max_distance": 1, 8 | "lang" : "ory" 9 | }' http://localhost:8000/ 10 | 11 | curl -X POST -H "Content-Type: application/json" -d '{ 12 | "text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି", 13 | "BEAM_WIDTH": 5, 14 | "SCORE_THRESHOLD": 1.5, 15 | "max_distance": 1 16 | }' http://localhost:8000/ 17 | 18 | curl -X POST -H "Content-Type: application/json" -d '{ 19 | "text": "how to apply for go-sugem scheme for my paddi crop", 20 | "BEAM_WIDTH": 5, 21 | "SCORE_THRESHOLD": 1.5, 22 | "max_distance": 1, 23 | "lang" : "eng" 24 | }' http://localhost:8000/ 25 | 26 | 27 | 28 | **curl request for update:** 29 | 30 | curl -X PUT -H "Content-Type: application/json" -d '{ 31 | "text": "ମିଶନରୀ", 32 | "lang" : "ory" 33 | }' http://localhost:8000/ 34 | 35 | curl -X PUT -H "Content-Type: application/json" -d '{ 36 | "text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"] 37 | }' http://localhost:8000/ 38 | 39 | curl -X PUT -H "Content-Type: application/json" -d '{ 40 | "text": "go-sugem", 41 | "lang" : "eng" 42 | }' http://localhost:8000/ 43 | 44 | curl -X PUT -H "Content-Type: application/json" -d '{ 45 | "text": ["how to apply for", "scheme for my paddi crop"], 46 | "lang" : "eng" 47 | }' http://localhost:8000/ 48 | -------------------------------------------------------------------------------- /src/spell_check/kenlm/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import * 2 | from .model import * 3 | -------------------------------------------------------------------------------- /src/spell_check/kenlm/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from update import UpdationModel 3 | from request import ModelRequest, ModelUpdateRequest 4 | from quart import Quart, request 5 | import aiohttp 6 | 7 | app = Quart(__name__) 8 | 9 | model = None 10 | 11 | model_paths = { 12 | 'ory': '5gram_model.bin', 13 | 'eng': '5gram_model_eng.bin' 14 | } 15 | 16 | vocab_paths = { 17 | 'ory': 'lexicon.txt', 18 | 'eng': 'lexicon_eng.txt' 19 | } 20 | 21 | freq_dict_paths = { 22 | 'ory': 'freq_dict.txt', 23 | 'eng': 'freq_dict_eng.txt' 24 | } 25 | 26 | texts_paths = { 27 | 'ory': 'texts.txt', 28 | 'eng': 'texts_eng.txt' 29 | } 30 | 31 | 32 | @app.before_serving 33 | async def startup(): 34 | app.client = aiohttp.ClientSession() 35 | global model 36 | model = Model(app, model_paths, vocab_paths, freq_dict_paths) 37 | 38 | print("Model loaded successfully") 39 | 40 | @app.route('/', methods=['POST']) 41 | async def embed(): 42 | global model 43 | data = await request.get_json() 44 | req = ModelRequest(**data) 45 | result = await model.inference(req) 46 | return result 47 | 48 | @app.route('/', methods=['PUT']) 49 | async def update(): 50 | global model 51 | data = await request.get_json() 52 | req = ModelUpdateRequest(**data) 53 | result = await UpdationModel(model_paths, vocab_paths, freq_dict_paths, texts_paths).update(req) 54 | 55 | if result: 56 | model 
= Model(app, model_paths, vocab_paths, freq_dict_paths) 57 | 58 | return result 59 | 60 | if __name__ == "__main__": 61 | app.run() 62 | -------------------------------------------------------------------------------- /src/spell_check/kenlm/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | class ModelRequest(): 5 | def __init__(self, text, BEAM_WIDTH, SCORE_THRESHOLD, max_distance, lang='ory'): 6 | self.text = text 7 | self.BEAM_WIDTH = BEAM_WIDTH 8 | self.SCORE_THRESHOLD = SCORE_THRESHOLD 9 | self.max_distance = max_distance 10 | self.lang = lang 11 | 12 | def to_json(self): 13 | return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4) 14 | 15 | class ModelUpdateRequest(): 16 | def __init__(self, text, lang='ory'): 17 | self.text = text 18 | self.lang = lang 19 | 20 | def to_json(self): 21 | return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4) 22 | -------------------------------------------------------------------------------- /src/spell_check/kenlm/local/requirements.txt: -------------------------------------------------------------------------------- 1 | quart 2 | aiohttp 3 | python-Levenshtein 4 | requests 5 | symspellpy -------------------------------------------------------------------------------- /src/spell_check/spello/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/spell_check/spello/README.md -------------------------------------------------------------------------------- /src/spell_check/spello/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | # Install system packages required for building kenlm 7 | RUN apt-get update && apt-get install -y cmake g++ zlib1g-dev 8 | 9 | # Install requirements 10 | COPY requirements.txt requirements.txt 11 | RUN pip3 install -r requirements.txt 12 | 13 | # Install wget 14 | RUN apt-get update && apt-get install -y wget 15 | 16 | # Download the files using wget 17 | RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt' 18 | RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt' 19 | 20 | # Copy the rest of the application code to the working directory 21 | COPY . 
/app/ 22 | 23 | EXPOSE 8000 24 | 25 | # Set the entrypoint for the container 26 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 27 | -------------------------------------------------------------------------------- /src/spell_check/spello/local/README.md: -------------------------------------------------------------------------------- 1 | **curl request for inferenece:** 2 | 3 | curl -X POST -H "Content-Type: application/json" -d '{ 4 | "text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି", 5 | "lang" : "ory" 6 | }' http://localhost:8000/ 7 | 8 | curl -X POST -H "Content-Type: application/json" -d '{ 9 | "text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି" 10 | }' http://localhost:8000/ 11 | 12 | curl -X POST -H "Content-Type: application/json" -d '{ 13 | "text": "how to apply for go-sugem scheme for my paddi crop", 14 | "lang" : "eng" 15 | }' http://localhost:8000/ 16 | 17 | **curl request for update:** 18 | 19 | curl -X PUT -H "Content-Type: application/json" -d '{ 20 | "text": "ମିଶନରୀ", 21 | "lang" : "ory" 22 | }' http://localhost:8000/ 23 | 24 | curl -X PUT -H "Content-Type: application/json" -d '{ 25 | "text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"] 26 | }' http://localhost:8000/ 27 | 28 | curl -X PUT -H "Content-Type: application/json" -d '{ 29 | "text": "go-sugem", 30 | "lang" : "eng" 31 | }' http://localhost:8000/ 32 | 33 | curl -X PUT -H "Content-Type: application/json" -d '{ 34 | "text": ["how to apply for", "scheme for my paddy crop"], 35 | "lang" : "eng" 36 | }' http://localhost:8000/ 37 | -------------------------------------------------------------------------------- /src/spell_check/spello/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import * 2 | from .model import * 3 | -------------------------------------------------------------------------------- /src/spell_check/spello/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | model = None 9 | 10 | freq_dict_paths = { 11 | 'ory': 'freq_dict.txt', 12 | 'eng': 'freq_dict_eng.txt' 13 | } 14 | 15 | spello_model_paths = { 16 | 'ory': 'spello_model.pkl', 17 | 'eng': 'spello_model_eng.pkl' 18 | } 19 | 20 | 21 | @app.before_serving 22 | async def startup(): 23 | app.client = aiohttp.ClientSession() 24 | global model 25 | model = Model(app, freq_dict_paths) 26 | 27 | @app.route('/', methods=['POST']) 28 | async def infer(): 29 | global model 30 | data = await request.get_json() 31 | req = ModelRequest(**data) 32 | result = await model.inference(req) 33 | return result 34 | 35 | @app.route('/', methods=['PUT']) 36 | async def update(): 37 | # print("PUT") 38 | global model 39 | data = await request.get_json() 40 | req = ModelRequest(**data) 41 | result = await model.update(req) 42 | return result 43 | 44 | 45 | if __name__ == "__main__": 46 | app.run() 47 | -------------------------------------------------------------------------------- /src/spell_check/spello/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | class ModelRequest(): 5 | def __init__(self, text, lang='ory'): 6 | self.text = text 7 | self.lang = lang 8 | 9 | def to_json(self): 10 | return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4) 11 | 
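12 | 
13 | # Illustrative usage only (added sketch, not part of the original service flow):
14 | # api.py builds ModelRequest(**data) from the POSTed JSON body, so the payload keys
15 | # must match the constructor arguments -- required `text`, optional `lang` (defaults to 'ory').
16 | if __name__ == "__main__":
17 |     sample = ModelRequest("how to apply for go-sugem scheme for my paddi crop", lang="eng")
18 |     print(sample.to_json())  # serializes the instance __dict__ as sorted, 4-space-indented JSON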
-------------------------------------------------------------------------------- /src/spell_check/spello/local/requirements.txt: -------------------------------------------------------------------------------- 1 | quart 2 | aiohttp 3 | requests 4 | spello -------------------------------------------------------------------------------- /src/t2embedding/README.md: -------------------------------------------------------------------------------- 1 | Embedding text content such that they can be searched using vector search techniques -------------------------------------------------------------------------------- /src/t2embedding/bert/README.md: -------------------------------------------------------------------------------- 1 | BERT is used to embed text content through emebedding that can be searched using vector search techniques -------------------------------------------------------------------------------- /src/t2embedding/openai/README.md: -------------------------------------------------------------------------------- 1 | Using Open AI's mebeddings to embed text to numeric vectors that can be search using vector search -------------------------------------------------------------------------------- /src/t2embedding/openai/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.11-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/t2embedding/openai/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/t2embedding/openai/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def embed(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/t2embedding/openai/remote/model.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import openai 4 | from request import ModelRequest 5 | from tenacity import retry, wait_random_exponential, stop_after_attempt 6 | 7 | openai.api_key = os.getenv("OPENAI_API_KEY") 8 | 9 | 10 | class Model: 11 | embedding_df = None 12 | embedding_model = "text-embedding-ada-002" 13 | embedding_encoding = "cl100k_base" # this the encoding for text-embedding-ada-002 14 | max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191 15 | 16 | def __new__(cls, context): 17 | cls.context = 
context 18 | if not hasattr(cls, 'instance'): 19 | cls.instance = super(Model, cls).__new__(cls) 20 | return cls.instance 21 | 22 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6)) 23 | async def get_embedding(self, text, model): 24 | embedding = await openai.Embedding.acreate(input=[text], model=model) 25 | return {"text": text, "embedding": embedding["data"][0]["embedding"]} 26 | 27 | async def inference(self, request: ModelRequest): 28 | tasks = [self.get_embedding(t, self.embedding_model) for t in request.text] 29 | e = await asyncio.gather(*tasks) 30 | return e 31 | -------------------------------------------------------------------------------- /src/t2embedding/openai/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text): 6 | self.text = text 7 | 8 | def to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) 11 | -------------------------------------------------------------------------------- /src/t2embedding/openai/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | openai==0.26.5 6 | openai_async 7 | tenacity -------------------------------------------------------------------------------- /src/text2speech/README.md: -------------------------------------------------------------------------------- 1 | Converting text input to speech given the input language -------------------------------------------------------------------------------- /src/text_classification/README.md: -------------------------------------------------------------------------------- 1 | ## Purpose 2 | Common folder for all small NLP model that are are being pulled from Hugging Face and deployed on server -------------------------------------------------------------------------------- /src/text_classification/convo_starter_orgbot/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/text_classification/convo_starter_orgbot/README.md -------------------------------------------------------------------------------- /src/text_classification/convo_starter_orgbot/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . 
/app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 16 | -------------------------------------------------------------------------------- /src/text_classification/convo_starter_orgbot/local/README.md: -------------------------------------------------------------------------------- 1 | ## Conversation starter classification: 2 | 3 | 4 | ### Purpose : 5 | Model to classify incoming messages into 2 buckets : 6 | - Label 0: 'General questions' 7 | - Label 1: 'Starter: Hi, hello etc' 8 | 9 | 10 | 11 | ### Testing the model deployment : 12 | To test just the Hugging Face deployment for conversation starter classification, follow these steps : 13 | 14 | - Git clone the repo 15 | - Go to current folder location i.e. ``` cd src/text_classification/convo_starter_orgbot/local ``` 16 | - Create docker image file and test the api: 17 | ``` 18 | docker build -t testmodel . 19 | docker run -p 8000:8000 testmodel 20 | curl -X POST -H "Content-Type: application/json" -d '{"text": "Where is my money? "}' http://localhost:8000/ 21 | ``` 22 | -------------------------------------------------------------------------------- /src/text_classification/convo_starter_orgbot/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/text_classification/convo_starter_orgbot/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | model = None 9 | 10 | @app.before_serving 11 | async def startup(): 12 | app.client = aiohttp.ClientSession() 13 | global model 14 | model = Model(app) 15 | 16 | @app.route('/', methods=['POST']) 17 | async def embed(): 18 | global model 19 | data = await request.get_json() 20 | req = ModelRequest(**data) 21 | return await model.inference(req) 22 | 23 | if __name__ == "__main__": 24 | app.run() -------------------------------------------------------------------------------- /src/text_classification/convo_starter_orgbot/local/model.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 2 | import torch 3 | from request import ModelRequest 4 | 5 | class Model(): 6 | def __new__(cls, context): 7 | cls.context = context 8 | if not hasattr(cls, 'instance'): 9 | cls.instance = super(Model, cls).__new__(cls) 10 | model_name = "GautamR/convo_beginner_orgbot" 11 | cls.tokenizer = AutoTokenizer.from_pretrained(model_name) 12 | cls.model = AutoModelForSequenceClassification.from_pretrained(model_name) 13 | cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 14 | cls.model.to(cls.device) 15 | return cls.instance 16 | 17 | 18 | async def inference(self, request: ModelRequest): 19 | inputs = self.tokenizer(request.text, return_tensors="pt") 20 | inputs = {key: value.to(self.device) for key, value in inputs.items()} 21 | with torch.no_grad(): 22 | logits = self.model(**inputs).logits 23 | predicted_class_id = logits.argmax().item() 24 | return self.model.config.id2label[predicted_class_id] -------------------------------------------------------------------------------- 
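The classification services in this folder all expose the same single POST `/` route, so they can also be exercised programmatically rather than only with curl. Below is a minimal client sketch (not part of the repository; the file name and helper are hypothetical). It assumes the convo_starter_orgbot container built from the Dockerfile above is listening on localhost:8000 and mirrors the JSON payload shown in the README's curl example.

```
# classify_client.py (illustrative only, not a file in this repo)
# Assumes the convo_starter_orgbot container is running locally on port 8000.
import asyncio

import aiohttp


async def classify(text: str) -> str:
    async with aiohttp.ClientSession() as session:
        # Same payload shape as the curl example: {"text": "..."}
        async with session.post("http://localhost:8000/", json={"text": text}) as resp:
            # model.inference() returns the predicted label as a plain string
            return await resp.text()


if __name__ == "__main__":
    print(asyncio.run(classify("Hi, good morning")))
```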
/src/text_classification/convo_starter_orgbot/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, text): 7 | self.text = text 8 | 9 | def to_json(self): 10 | return json.dumps(self, default=lambda o: o.__dict__, 11 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/text_classification/convo_starter_orgbot/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu 2 | transformers 3 | quart 4 | aiohttp -------------------------------------------------------------------------------- /src/text_classification/flow_classification/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 16 | -------------------------------------------------------------------------------- /src/text_classification/flow_classification/local/README.md: -------------------------------------------------------------------------------- 1 | ## Flow classification: 2 | 3 | 4 | ### Purpose : 5 | Model to classify agricultural queries into 4 buckets : 6 | - Label 0: 'Agri scheme' 7 | - Label 1: 'Other agri content' 8 | - Label 2: 'pest flow' 9 | - Label 3: 'seed flow' 10 | 11 | 12 | ### Testing the model deployment : 13 | To test just the Hugging Face deployment for flow classification, follow these steps : 14 | 15 | - Git clone the repo 16 | - Go to current folder location i.e. ``` cd src/text_classification/flow_classification/local ``` 17 | - Create docker image file and test the api: 18 | ``` 19 | docker build -t testmodel . 20 | docker run -p 8000:8000 testmodel 21 | curl -X POST -H "Content-Type: application/json" -d '{"text": "Where is my money? 
"}' http://localhost:8000/ 22 | ``` 23 | -------------------------------------------------------------------------------- /src/text_classification/flow_classification/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/text_classification/flow_classification/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | model = None 9 | 10 | @app.before_serving 11 | async def startup(): 12 | app.client = aiohttp.ClientSession() 13 | global model 14 | model = Model(app) 15 | 16 | @app.route('/', methods=['POST']) 17 | async def embed(): 18 | global model 19 | data = await request.get_json() 20 | req = ModelRequest(**data) 21 | return await model.inference(req) 22 | 23 | if __name__ == "__main__": 24 | app.run() -------------------------------------------------------------------------------- /src/text_classification/flow_classification/local/model.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 2 | import torch 3 | from request import ModelRequest 4 | from torch.nn.functional import softmax 5 | 6 | class Model(): 7 | def __new__(cls, context): 8 | cls.context = context 9 | if not hasattr(cls, 'instance'): 10 | cls.instance = super(Model, cls).__new__(cls) 11 | model_name = "GautamR/akai_flow_classifier_pest_seed_scheme" 12 | cls.tokenizer = AutoTokenizer.from_pretrained(model_name) 13 | cls.model = AutoModelForSequenceClassification.from_pretrained(model_name) 14 | cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 15 | cls.model.to(cls.device) 16 | return cls.instance 17 | 18 | 19 | async def inference(self, request: ModelRequest): 20 | inputs = self.tokenizer(request.text, return_tensors="pt") 21 | inputs = {key: value.to(self.device) for key, value in inputs.items()} 22 | with torch.no_grad(): 23 | logits = self.model(**inputs).logits 24 | 25 | probabilities = softmax(logits, dim=1) 26 | 27 | output = [] 28 | for idx, score in enumerate(probabilities[0]): 29 | label = self.model.config.id2label[idx] 30 | output.append({"label": label, "score": score.item()}) 31 | 32 | sorted_output = sorted(output, key=lambda x: x['score'], reverse=True) 33 | 34 | return [[item for item in sorted_output]] 35 | -------------------------------------------------------------------------------- /src/text_classification/flow_classification/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, text): 7 | self.text = text 8 | 9 | def to_json(self): 10 | return json.dumps(self, default=lambda o: o.__dict__, 11 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/text_classification/flow_classification/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | quart 4 | aiohttp 5 | -------------------------------------------------------------------------------- /src/text_classification/grievance_recognition/README.md: 
-------------------------------------------------------------------------------- 1 | ## Classification of grievances -------------------------------------------------------------------------------- /src/text_classification/grievance_recognition/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 5000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:5000", "api:app"] -------------------------------------------------------------------------------- /src/text_classification/grievance_recognition/local/README.md: -------------------------------------------------------------------------------- 1 | ## Grievance classification: 2 | 3 | 4 | ### Purpose : 5 | Model to classify grievances into 3 buckets : 6 | - Label 0: 'Account number is not Correct' 7 | - Label 1: 'Installment not received' 8 | - Label 2: 'Others' 9 | 10 | **'Account number not correct'**: Grievances/feedback saying that the farmer's bank account on the portal is not correct. 11 | 12 | **'Installment not received':** Feedback saying that the farmer has not received the installment/money for that month. 13 | 14 | **'Others':** This covers all other types of grievances, including 'Gender being wrong', 'Online application pending approval', 'Other payment related issues', 'Problem in Aadhaar correction', 'Problem in biometric-based e-KYC', 'Problem in OTP-based e-KYC', 'Transaction failed', etc. 15 | 16 | 17 | ### Testing the model deployment : 18 | To test just the Hugging Face deployment for grievance recognition, follow these steps : 19 | 20 | - Git clone the repo 21 | - Go to current folder location i.e. ``` cd src/text_classification/grievance_recognition/local ``` 22 | - Create docker image file and test the api: 23 | ``` 24 | docker build -t testmodel . 25 | docker run -p 5000:5000 testmodel 26 | curl -X POST -H "Content-Type: application/json" -d '{"text": "Where is my money? 
"}' http://localhost:5000/ 27 | ``` 28 | -------------------------------------------------------------------------------- /src/text_classification/grievance_recognition/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/text_classification/grievance_recognition/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | model = None 9 | 10 | @app.before_serving 11 | async def startup(): 12 | app.client = aiohttp.ClientSession() 13 | global model 14 | model = Model(app) 15 | 16 | @app.route('/', methods=['POST']) 17 | async def embed(): 18 | global model 19 | data = await request.get_json() 20 | req = ModelRequest(**data) 21 | return await model.inference(req) 22 | 23 | if __name__ == "__main__": 24 | app.run() -------------------------------------------------------------------------------- /src/text_classification/grievance_recognition/local/model.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 2 | import torch 3 | from request import ModelRequest 4 | 5 | class Model(): 6 | def __new__(cls, context): 7 | cls.context = context 8 | if not hasattr(cls, 'instance'): 9 | cls.instance = super(Model, cls).__new__(cls) 10 | model_name = "GautamR/model_grievance_class" 11 | cls.tokenizer = AutoTokenizer.from_pretrained(model_name) 12 | cls.model = AutoModelForSequenceClassification.from_pretrained(model_name) 13 | cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 14 | cls.model.to(cls.device) 15 | return cls.instance 16 | 17 | 18 | async def inference(self, request: ModelRequest): 19 | inputs = self.tokenizer(request.text, return_tensors="pt") 20 | inputs = {key: value.to(self.device) for key, value in inputs.items()} 21 | with torch.no_grad(): 22 | logits = self.model(**inputs).logits 23 | predicted_class_id = logits.argmax().item() 24 | return self.model.config.id2label[predicted_class_id] -------------------------------------------------------------------------------- /src/text_classification/grievance_recognition/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, text): 7 | self.text = text 8 | 9 | def to_json(self): 10 | return json.dumps(self, default=lambda o: o.__dict__, 11 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/text_classification/grievance_recognition/local/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu 2 | transformers 3 | quart 4 | aiohttp -------------------------------------------------------------------------------- /src/text_lang_detection/README.md: -------------------------------------------------------------------------------- 1 | Detecting language for text (without translations) -------------------------------------------------------------------------------- /src/text_lang_detection/bhashini/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/text_lang_detection/bhashini/__init__.py -------------------------------------------------------------------------------- /src/text_lang_detection/bhashini/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/text_lang_detection/bhashini/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/text_lang_detection/bhashini/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def detect(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/text_lang_detection/bhashini/remote/model.py: -------------------------------------------------------------------------------- 1 | from cache import AsyncTTL 2 | from request import ModelRequest 3 | import json 4 | from tenacity import retry, wait_random_exponential, stop_after_attempt 5 | import requests 6 | 7 | 8 | 9 | class Model: 10 | def __new__(cls, context): 11 | cls.context = context 12 | if not hasattr(cls, 'instance'): 13 | cls.instance = super(Model, cls).__new__(cls) 14 | return cls.instance 15 | 16 | @AsyncTTL(time_to_live=600000, maxsize=1024) 17 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6)) 18 | async def inference(self, request: ModelRequest): 19 | url = "https://meity-auth.ulcacontrib.org/ulca/apis/v0/model/compute" 20 | 21 | payload = json.dumps({ 22 | "modelId": "631736990154d6459973318e", 23 | "task": "txt-lang-detection", 24 | "input": [ 25 | { 26 | "source": request.text 27 | } 28 | ], 29 | "userId": None 30 | }) 31 | headers = { 32 | 'authority': 'meity-auth.ulcacontrib.org', 33 | 'accept': '*/*', 34 | 'content-type': 'application/json', 35 | 'origin': 'https://bhashini.gov.in' 36 | } 37 | 38 | response = requests.post(url, headers=headers, data=payload) 39 | 40 | # { 41 | # "output": [ 42 | # { 43 | # "source": "महात्मा गांधी का जन्म कहाँ हुआ था?", 44 | # "langPrediction": [ 45 | # { 46 | # "langCode": "hi", 47 | # "ScriptCode": null, 48 | # "langScore": 100 49 | # } 50 | # ] 51 | # } 52 | # ], 53 | # "config": null 54 | # } 55 | resp = 
response.json() 56 | return {"language": resp["output"][0]["langPrediction"][0]["langCode"], "success": True} 57 | -------------------------------------------------------------------------------- /src/text_lang_detection/bhashini/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text): 6 | self.text = text 7 | 8 | def to_json(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) 11 | -------------------------------------------------------------------------------- /src/text_lang_detection/bhashini/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | tenacity -------------------------------------------------------------------------------- /src/text_translation/README.md: -------------------------------------------------------------------------------- 1 | Translating from one langauge to another given input and output languages -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/batch/__init__.py: -------------------------------------------------------------------------------- 1 | from .batch import * 2 | from .batch_request import * -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/batch/batch.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # sys.path.insert(0,".") 3 | from . import batch_request 4 | 5 | 6 | class AI4BharatBatchModel(): 7 | def __new__(cls): 8 | if not hasattr(cls, 'instance'): 9 | cls.instance = super(AI4BharatBatchModel, cls).__new__(cls) 10 | cls.load_model(cls) 11 | return cls.instance 12 | 13 | def load_model(self): 14 | from inference.engine import Model 15 | self.model = Model(expdir='../indic-en') 16 | 17 | def inference(self, request: batch_request.AI4BharatBatchModelRequest): 18 | return self.model.batch_translate(request.batch, request.source, request.target) 19 | -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/batch/batch_request.py: -------------------------------------------------------------------------------- 1 | class AI4BharatBatchModelRequest(): 2 | def __init__(self, batch, source, target): 3 | self.batch = batch 4 | self.source = source 5 | self.target = target 6 | 7 | def to_json(self): 8 | import json 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git clone https://github.com/AI4Bharat/indicTrans.git 4 | git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git 5 | git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git 6 | git clone https://github.com/rsennrich/subword-nmt.git 7 | git clone https://github.com/pytorch/fairseq.git 8 | wget https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/indic2en.zip 9 | unzip indic2en.zip 10 | cd indicTrans 11 | pip install --ignore-installed pexpect 12 | pip install poetry 13 | poetry add gunicorn flask_cors sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library 
mosestokenizer subword-nmt xformers torch==2.0.0 triton flask_cors webvtt-py 14 | poetry add fairseq 15 | -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/readme copy.md: -------------------------------------------------------------------------------- 1 | In any nvidia gpu machine, create a folder, drop two files(api_main.py and deploy.sh). \ 2 | Run the 'deploy' shell script. 3 | Script will gather all the dependencies and start the API. 4 | 5 | There are 3 APIs: 6 | 7 | 1. Test API to see if API is running 8 | 9 | > curl --location 'http://127.0.0.1:8000/' 10 | 11 | 2. translate_paragraph from any Indian language to English 12 | 13 | > curl --location 'http://127.0.0.1:8000/translate_paragraph' \ 14 | --header 'Content-Type: application/json' \ 15 | --data '{ 16 | "source":"or", 17 | "target":"en", 18 | "paragraph": "ବହୁତ ଦିନ ହେଲାଣି ଦେଖା ନାହିଁ" 19 | }' 20 | 21 | 3. translate batch of sentences from any Indian language to English 22 | 23 | > curl --location 'http://127.0.0.1:8000/batch_translate' \ 24 | --header 'Content-Type: application/json' \ 25 | --header 'Cookie: refresh_token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6MTY4ODA2Nzc2MywiaWF0IjoxNjc5NDI3NzYzLCJqdGkiOiIzYWViYTliYjc1MzE0NWFlODBlZGQwNTA0MDdmOGVmYyIsInVzZXJfaWQiOjF9.gE9ln1fZrS6CCjWNzr67dk263PiVGVPLqK2DNmw1zX4' \ 26 | --data '{ 27 | "source":"or", 28 | "target":"en", 29 | "batch": ["ବହୁତ ଦିନ ହେଲାଣି ଦେଖା", "ନାହିଁ"] 30 | }' -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . 
/app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/remote/README.md: -------------------------------------------------------------------------------- 1 | curl -X POST -H "Content-Type: application/json" -d '{"text": "मेरा पैसा कहाँ है?", "source_language": "hi", "target_language": "en"}' -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def translate(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/remote/model.py: -------------------------------------------------------------------------------- 1 | from cache import AsyncTTL 2 | from request import ModelRequest 3 | import json 4 | import requests 5 | import os 6 | 7 | authorization_key = os.getenv("AI4BHARAT_KEY") 8 | 9 | 10 | class Model: 11 | def __new__(cls, context): 12 | cls.context = context 13 | if not hasattr(cls, 'instance'): 14 | cls.instance = super(Model, cls).__new__(cls) 15 | return cls.instance 16 | 17 | @AsyncTTL(time_to_live=600000, maxsize=1024) 18 | async def inference(self, request: ModelRequest): 19 | 20 | url = "https://api.dhruva.ai4bharat.org/services/inference/translation?serviceId=ai4bharat%2Findictrans-v2-all-gpu--t4" 21 | headers = { 22 | "Content-Type": "application/json", 23 | "authorization": authorization_key 24 | } 25 | payload = { 26 | "config": { 27 | "language": { 28 | "sourceLanguage": request.source_language, 29 | "targetLanguage": request.target_language 30 | } 31 | }, 32 | "input": [ 33 | { 34 | "source": request.text 35 | } 36 | ] 37 | } 38 | 39 | response = requests.post(url, headers=headers, json=payload) 40 | resp = response.json() 41 | 42 | return {"translated": resp['output'][0]['target'], "success": True} 43 | -------------------------------------------------------------------------------- /src/text_translation/ai4bharat/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text, source_language, target_language): 6 | self.text = text 7 | self.source_language = source_language 8 | self.target_language = target_language 9 | 10 | def to_json(self): 11 | return json.dumps(self, default=lambda o: o.__dict__, 12 | sort_keys=True, indent=4) 13 | -------------------------------------------------------------------------------- 
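The ai4bharat remote service above reads AI4BHARAT_KEY from the environment (see model.py), so the container has to be started with that variable set; the README's curl example can then be reproduced from Python. A minimal sketch follows (illustrative only, not part of the repository; the endpoint and payload are taken from the README and request.py above).

```
# translate_client.py (illustrative only, not a file in this repo)
# Assumes the ai4bharat remote container is running on localhost:8000 and was started
# with AI4BHARAT_KEY set, since model.py reads that variable for the Dhruva API call.
import asyncio

import aiohttp


async def translate(text: str, source: str, target: str) -> dict:
    payload = {"text": text, "source_language": source, "target_language": target}
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/", json=payload) as resp:
            return await resp.json()  # {"translated": "...", "success": True}


if __name__ == "__main__":
    print(asyncio.run(translate("मेरा पैसा कहाँ है?", "hi", "en")))
```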
/src/text_translation/ai4bharat/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | -------------------------------------------------------------------------------- /src/text_translation/azure/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/text_translation/azure/README.md -------------------------------------------------------------------------------- /src/text_translation/azure/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/text_translation/azure/remote/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | docker build -t testmodel1 . 4 | docker run -p 8000:8000 testmodel1 5 | 6 | curl --location 'http://localhost:8000/' --header 'accept: */*' --header 'content-type: application/json' --header 'Authorization: ' --data '{"source_language": "or", "target_language": "en", "text": "ଚାଷୀମାନଙ୍କୁ କିପରୀ ଘର ପାଖରେ ବିହନ ସୁବିଧାରେ ପାଇପାରିବେ"}' -------------------------------------------------------------------------------- /src/text_translation/azure/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/text_translation/azure/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def translate(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/text_translation/azure/remote/model.py: -------------------------------------------------------------------------------- 1 | from cache import AsyncTTL 2 | from request import ModelRequest 3 | import io 4 | import requests 5 | import json 6 | import os 7 | import uuid 8 | 9 | 10 | class Model: 11 | def __new__(cls, context): 12 | cls.context = context 13 | if not hasattr(cls, 'instance'): 14 | # Set up service account credentials 15 | cls.endpoint = 'https://api.cognitive.microsofttranslator.com/translate?api-version=3.0' 16 | cls.subscription_key = os.getenv("AZURE_TRANSLATE_KEY") 17 | cls.headers = { 18 | 'Ocp-Apim-Subscription-Key': cls.subscription_key, 19 
| 'Content-type': 'application/json', 20 | 'Ocp-Apim-Subscription-Region': 'southeastasia', 21 | 'X-ClientTraceId': str(uuid.uuid4()) 22 | } 23 | 24 | cls.instance = super(Model, cls).__new__(cls) 25 | return cls.instance 26 | 27 | async def inference(self, request: ModelRequest): 28 | params = '&to=' + request.target_language 29 | body = [{'text': request.text}] 30 | request = requests.post(self.endpoint + params, headers=self.headers, data=json.dumps(body)) 31 | print(request.text) 32 | response = request.json() 33 | 34 | print(response) 35 | 36 | translated_text = response[0]['translations'][0]['text'] 37 | print(translated_text) 38 | 39 | return { 40 | "success": True, 41 | "translated": translated_text 42 | } -------------------------------------------------------------------------------- /src/text_translation/azure/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text, source_language, target_language): 6 | self.text = text 7 | self.source_language = source_language 8 | self.target_language = target_language 9 | 10 | def to_json(self): 11 | return json.dumps(self, default=lambda o: o.__dict__, 12 | sort_keys=True, indent=4) 13 | -------------------------------------------------------------------------------- /src/text_translation/azure/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | -------------------------------------------------------------------------------- /src/text_translation/azure_dict/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/text_translation/azure_dict/README.md -------------------------------------------------------------------------------- /src/text_translation/azure_dict/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/text_translation/azure_dict/remote/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | docker build -t testmodel1 . 
4 | docker run -p 8000:8000 testmodel1 5 | 6 | curl --location 'http://localhost:8000/' --header 'accept: */*' --header 'content-type: application/json' --header 'Authorization: ' --data '{"source_language": "or", "target_language": "en", "text": "ଚାଷୀମାନଙ୍କୁ କିପରୀ ଘର ପାଖରେ ବିହନ ସୁବିଧାରେ ପାଇପାରିବେ"}' -------------------------------------------------------------------------------- /src/text_translation/azure_dict/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/text_translation/azure_dict/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | import asyncio 6 | from quart_cors import cors 7 | 8 | app = Quart(__name__) 9 | app = cors(app) 10 | 11 | @app.before_serving 12 | async def startup(): 13 | print("Startup function is being called!") 14 | app.client = aiohttp.ClientSession() 15 | app.model_instance = Model() # instantiate the model 16 | app.update_task = asyncio.create_task(app.model_instance.update_translation_dictionary()) # update the dictionary 17 | 18 | @app.after_serving 19 | async def cleanup(): 20 | app.update_task.cancel() 21 | 22 | @app.route('/', methods=['POST']) 23 | async def translate(): 24 | data = await request.get_json() 25 | req = ModelRequest(**data) 26 | 27 | if req.source and req.translation: 28 | app.model_instance.data_dict[req.source] = req.translation 29 | 30 | return await app.model_instance.inference(req) 31 | -------------------------------------------------------------------------------- /src/text_translation/azure_dict/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | class ModelRequest(): 4 | def __init__(self, text, source_language, target_language, source=None, translation=None): 5 | self.text = text 6 | self.source_language = source_language 7 | self.target_language = target_language 8 | self.source = source 9 | self.translation = translation 10 | 11 | def to_json(self): 12 | return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4) 13 | -------------------------------------------------------------------------------- /src/text_translation/azure_dict/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | Werkzeug==2.2.2 2 | aiohttp==3.8.4 3 | quart==0.18.3 4 | async-cache==1.1.1 5 | asyncio 6 | requests 7 | quart-cors 8 | -------------------------------------------------------------------------------- /src/text_translation/bhashini/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/text_translation/bhashini/__init__.py -------------------------------------------------------------------------------- /src/text_translation/bhashini/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to 
the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/text_translation/bhashini/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/text_translation/bhashini/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def translate(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/text_translation/bhashini/remote/model.py: -------------------------------------------------------------------------------- 1 | from cache import AsyncTTL 2 | from request import ModelRequest 3 | import json 4 | import requests 5 | 6 | 7 | 8 | class Model: 9 | def __new__(cls, context): 10 | cls.context = context 11 | if not hasattr(cls, 'instance'): 12 | cls.instance = super(Model, cls).__new__(cls) 13 | return cls.instance 14 | 15 | @AsyncTTL(time_to_live=600000, maxsize=1024) 16 | async def inference(self, request: ModelRequest): 17 | url = "https://nmt-api.ai4bharat.org/translate_sentence" 18 | payload = json.dumps({ 19 | "text": request.text, 20 | "source_language": request.source_language, 21 | "target_language": request.target_language 22 | }) 23 | headers = { 24 | 'authority': 'nmt-api.ai4bharat.org', 25 | 'accept': '*/*', 26 | 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', 27 | 'content-type': 'application/json', 28 | 'origin': 'https://models.ai4bharat.org', 29 | 'referer': 'https://models.ai4bharat.org/', 30 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"', 31 | 'sec-ch-ua-mobile': '?0', 32 | 'sec-ch-ua-platform': '"macOS"', 33 | 'sec-fetch-dest': 'empty', 34 | 'sec-fetch-mode': 'cors', 35 | 'sec-fetch-site': 'same-site', 36 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36' 37 | } 38 | 39 | response = requests.post(url, headers=headers, data=payload) 40 | resp = response.json() 41 | print(resp) 42 | return {"translated": resp["text"], "success": True} 43 | -------------------------------------------------------------------------------- /src/text_translation/bhashini/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ModelRequest(): 5 | def __init__(self, text, source_language, target_language): 6 | self.text = text 7 | self.source_language = source_language 8 | self.target_language = target_language 9 | 10 | def to_json(self): 11 | return json.dumps(self, default=lambda o: o.__dict__, 12 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- 
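The bhashini remote handler above is fully async, so a client can translate several sentences concurrently from a single session, mirroring the asyncio.gather pattern used in test.py at the end of this listing. A sketch of that pattern (illustrative only, not part of the repository; it assumes the container built from the Dockerfile above is listening on localhost:8000):

```
# bhashini_translate_client.py (illustrative only, not a file in this repo)
# Fires several translation requests at the locally running bhashini remote service
# concurrently, in the same spirit as test.py at the end of this listing.
import asyncio

import aiohttp

SENTENCES = ["मेरा पैसा कहाँ है?", "बीज कहाँ मिलेगा?"]


async def translate(session: aiohttp.ClientSession, text: str) -> dict:
    payload = {"text": text, "source_language": "hi", "target_language": "en"}
    async with session.post("http://localhost:8000/", json=payload) as resp:
        return await resp.json()  # {"translated": "...", "success": True}


async def main():
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(translate(session, s) for s in SENTENCES))
    for sentence, result in zip(SENTENCES, results):
        print(sentence, "->", result["translated"])


asyncio.run(main())
```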
/src/text_translation/bhashini/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | -------------------------------------------------------------------------------- /src/text_translation/google/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | #install requirements 7 | COPY requirements.txt requirements.txt 8 | RUN pip3 install -r requirements.txt 9 | 10 | # Copy the rest of the application code to the working directory 11 | COPY . /app/ 12 | EXPOSE 8000 13 | # Set the entrypoint for the container 14 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 15 | 16 | -------------------------------------------------------------------------------- /src/text_translation/google/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model -------------------------------------------------------------------------------- /src/text_translation/google/remote/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | #from fastapi import FastAPI, Body 7 | app = Quart(__name__) 8 | #app.client = aiohttp.ClientSession() 9 | #app = FastAPI() 10 | 11 | @app.before_serving 12 | async def startup(): 13 | app.client = aiohttp.ClientSession() 14 | 15 | @app.route('/', methods=['POST']) 16 | async def translate(): 17 | data = await request.get_json() 18 | req = ModelRequest(**data) 19 | model = Model(app) 20 | return await model.inference(req) 21 | -------------------------------------------------------------------------------- /src/text_translation/google/remote/model.py: -------------------------------------------------------------------------------- 1 | from cache import AsyncTTL 2 | from request import ModelRequest 3 | from google.oauth2.service_account import Credentials 4 | from google.cloud import translate 5 | 6 | class Model: 7 | def __new__(cls, context): 8 | cls.context = context 9 | if not hasattr(cls, 'instance'): 10 | # Set up service account credentials 11 | cls.credentials = Credentials.from_service_account_file('google-creds.json') 12 | 13 | # Create a client for the Speech-to-Text API with the service account credentials 14 | cls.client = translate.TranslationServiceClient(credentials=cls.credentials) 15 | 16 | cls.instance = super(Model, cls).__new__(cls) 17 | return cls.instance 18 | 19 | async def inference(self, request: ModelRequest): 20 | translation = self.client.translate_text( 21 | request={ 22 | "parent": "projects/samagragovernance-in-new", 23 | "mime_type": "text/plain", 24 | "source_language_code": request.source_language, 25 | "target_language_code": request.target_language, 26 | "contents": [request.text], 27 | } 28 | ) 29 | 30 | response = translation.translations[0].translated_text 31 | print(response) 32 | 33 | return { 34 | "success": True, 35 | "translated": response 36 | } 37 | -------------------------------------------------------------------------------- /src/text_translation/google/remote/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class 
ModelRequest(): 5 | def __init__(self, text, source_language, target_language): 6 | self.text = text 7 | self.source_language = source_language 8 | self.target_language = target_language 9 | 10 | def to_json(self): 11 | return json.dumps(self, default=lambda o: o.__dict__, 12 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/text_translation/google/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | quart==0.18.3 3 | async-cache==1.1.1 4 | requests 5 | google-cloud-translate 6 | google-auth -------------------------------------------------------------------------------- /src/text_transliteration/README.md: -------------------------------------------------------------------------------- 1 | Transliteration of text from one language to another given input and output languages (without language detection) -------------------------------------------------------------------------------- /src/token_counter/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/token_counter/README.md -------------------------------------------------------------------------------- /src/token_counter/openai/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samagra-Development/ai-tools/bdb71a281f52b8c3d6a086b9c6e8ad347f103553/src/token_counter/openai/README.md -------------------------------------------------------------------------------- /src/token_counter/openai/local/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | 7 | #install requirements 8 | COPY requirements.txt requirements.txt 9 | RUN pip3 install -r requirements.txt 10 | 11 | # Copy the rest of the application code to the working directory 12 | COPY . /app/ 13 | EXPOSE 8000 14 | # Set the entrypoint for the container 15 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] 16 | -------------------------------------------------------------------------------- /src/token_counter/openai/local/README.md: -------------------------------------------------------------------------------- 1 | ## Token counter: 2 | Simple API to count tokens (GPT 3.5) 3 | 4 | 5 | ### Testing the model deployment : 6 | To test just the token counter deployment, follow these steps : 7 | 8 | - Git clone the repo 9 | - Go to current folder location i.e. ``` cd src/token_counter/openai/local ``` 10 | - Create docker image file and test the api: 11 | ``` 12 | docker build -t testmodel . 13 | docker run -p 8000:8000 testmodel 14 | curl -X POST -H "Content-Type: application/json" -d '{"text": "Where is my money? "}' http://localhost:8000/ 15 | curl -X POST -H "Content-Type: application/json" -d '{"query": "Where is my money? 
"}' http://aitools.v2.akai.samagra.io/token_counter/openai/local/ 16 | ``` 17 | -------------------------------------------------------------------------------- /src/token_counter/openai/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import ModelRequest 2 | from .request import Model 3 | -------------------------------------------------------------------------------- /src/token_counter/openai/local/api.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | from request import ModelRequest 3 | from quart import Quart, request 4 | import aiohttp 5 | 6 | app = Quart(__name__) 7 | 8 | model = None 9 | 10 | @app.before_serving 11 | async def startup(): 12 | app.client = aiohttp.ClientSession() 13 | global model 14 | model = Model(app) 15 | 16 | @app.route('/', methods=['POST']) 17 | async def embed(): 18 | global model 19 | data = await request.get_json() 20 | req = ModelRequest(**data) 21 | return await model.inference(req) 22 | 23 | if __name__ == "__main__": 24 | app.run() -------------------------------------------------------------------------------- /src/token_counter/openai/local/model.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | from request import ModelRequest 3 | 4 | class Model(): 5 | def __new__(cls, context): 6 | cls.context = context 7 | if not hasattr(cls, 'instance'): 8 | cls.instance = super(Model, cls).__new__(cls) 9 | model_name = "gpt-3.5-turbo" 10 | cls.encoding = tiktoken.encoding_for_model(model_name) 11 | return cls.instance 12 | 13 | 14 | async def inference(self, request: ModelRequest): 15 | num_tokens = len(self.encoding.encode(request.text)) 16 | return str(num_tokens) -------------------------------------------------------------------------------- /src/token_counter/openai/local/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | 5 | class ModelRequest(): 6 | def __init__(self, text): 7 | self.text = text 8 | 9 | def to_json(self): 10 | return json.dumps(self, default=lambda o: o.__dict__, 11 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/token_counter/openai/local/requirements.txt: -------------------------------------------------------------------------------- 1 | tiktoken 2 | quart 3 | aiohttp -------------------------------------------------------------------------------- /src/topic_modelling/BERTopic/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image 2 | FROM python:3.9 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Install dependencies 8 | COPY requirements.txt . 9 | RUN pip install -r requirements.txt 10 | 11 | # Copy all source code 12 | COPY . . 13 | COPY . 
/app/ 14 | 15 | # Expose port for the server 16 | EXPOSE 8000 17 | 18 | # Command to run the server 19 | CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"] -------------------------------------------------------------------------------- /src/topic_modelling/BERTopic/README.md: -------------------------------------------------------------------------------- 1 | ## BERTopic Topic Extraction Model 2 | 3 | ### Purpose : 4 | Model to extract meaningful segmentations of a query dataset 5 | 6 | ### Testing the model deployment : 7 | To test the model for topic head generation, follow the steps below: 8 | 9 | - Git clone the repo 10 | - Go to current folder location i.e. ``` cd src/topic_modelling/BERTopic ``` 11 | - Create docker image file and test the api: 12 | #### (IMP) The input .csv file must have a single column of preprocessed text, with the column named 'text' 13 | ``` 14 | docker build -t testmodel . 15 | docker run -p 8000:8000 testmodel 16 | curl -X POST -F "file=@test.csv" http://localhost:8000/embed -o output4.csv 17 | ``` 18 | -------------------------------------------------------------------------------- /src/topic_modelling/BERTopic/__init__.py: -------------------------------------------------------------------------------- 1 | from .request import * 2 | from .model import * -------------------------------------------------------------------------------- /src/topic_modelling/BERTopic/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import json 4 | import pandas as pd 5 | from quart import Quart, request, Response, send_file 6 | from model import Model 7 | from request import ModelRequest 8 | 9 | app = Quart(__name__) 10 | 11 | # Initialize the model to be used for inference. 
12 | model = None 13 | 14 | @app.before_serving 15 | async def startup(): 16 | """This function is called once before the server starts to initialize the model.""" 17 | global model 18 | model = Model(app) 19 | 20 | @app.route('/embed', methods=['POST']) 21 | async def embed(): 22 | """This endpoint receives a CSV file, extracts text data from it, and uses the model to generate embeddings and topic information.""" 23 | global model 24 | 25 | files = await request.files # Get the uploaded files 26 | uploaded_file = files.get('file') # Get the uploaded CSV file 27 | 28 | if not uploaded_file: 29 | return Response(json.dumps({"error": "No file uploaded"}), status=400, mimetype='application/json') 30 | 31 | # Read the CSV file into a DataFrame 32 | csv_data = pd.read_csv(io.BytesIO(uploaded_file.stream.read())) 33 | 34 | # Extract the text data 35 | text_data = csv_data['text'].tolist() 36 | 37 | # Create a ModelRequest object with the extracted text data 38 | req = ModelRequest(text=text_data) 39 | 40 | # Call the model's inference method and get the response 41 | response = await model.inference(req) 42 | 43 | if response is None: 44 | # If an error occurred during inference, return an error response 45 | return Response(json.dumps({"error": "Inference error"}), status=500, mimetype='application/json') 46 | 47 | # Convert the CSV string from the response into a DataFrame 48 | df = pd.read_csv(io.StringIO(response)) 49 | 50 | # Save the DataFrame to a CSV file 51 | output_file_path = 'output.csv' 52 | df.to_csv(output_file_path, index=False) 53 | 54 | # Send the CSV file back as a download response 55 | return await send_file(output_file_path, mimetype='text/csv', as_attachment=True, attachment_filename='output.csv') -------------------------------------------------------------------------------- /src/topic_modelling/BERTopic/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sentence_transformers import SentenceTransformer 3 | from bertopic import BERTopic 4 | from umap import UMAP 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | import json 7 | import nltk 8 | from request import ModelRequest 9 | 10 | nltk.download("punkt") 11 | 12 | class Model: 13 | def __init__(self, context): 14 | self.context = context 15 | self.sentence_model = SentenceTransformer("all-MiniLM-L6-v2") 16 | self.vectorizer_model = CountVectorizer(stop_words="english") 17 | self.umap_model = UMAP(n_neighbors=15, min_dist=0.0, metric="cosine", random_state=69) 18 | # self.hdbscan_model = HDBSCAN(min_cluster_size=15, metric="euclidean", prediction_data=True) 19 | self.topic_model = BERTopic( 20 | umap_model = self.umap_model, 21 | # hdbscan_model = self.hdbscan_model, 22 | vectorizer_model = self.vectorizer_model, 23 | ) 24 | 25 | async def inference(self, request: ModelRequest): 26 | text = request.text 27 | try: 28 | # Encode the text using SentenceTransformer 29 | corpus_embeddings = self.sentence_model.encode(text) 30 | 31 | # Fit the topic model 32 | topics, probabilities = self.topic_model.fit_transform(text, corpus_embeddings) 33 | 34 | # Get topic information and cluster labels 35 | df_classes = self.topic_model.get_topic_info() 36 | cluster_labels, _ = self.topic_model.transform(text, corpus_embeddings) 37 | 38 | df_result = pd.DataFrame({ 39 | "document_text": text, 40 | "predicted_class_label": cluster_labels, 41 | "probabilities": probabilities, 42 | }) 43 | 44 | # Mapping cluster names to topic labels 45 | 
cluster_names_map = dict(zip(df_classes["Topic"], df_classes["Name"])) 46 | df_result["predicted_class_name"] = df_result["predicted_class_label"].map(cluster_names_map) 47 | 48 | csv_string = df_result.to_csv(index=False) 49 | 50 | except Exception as e: 51 | # Log & print the error 52 | print(f"Error during inference: {e}") 53 | return None 54 | 55 | return csv_string 56 | 57 | -------------------------------------------------------------------------------- /src/topic_modelling/BERTopic/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | class ModelRequest(): 4 | def __init__(self, text): 5 | self.text = text 6 | 7 | def to_json(self): 8 | return json.dumps(self, default=lambda o: o.__dict__, 9 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /src/topic_modelling/BERTopic/requirements.txt: -------------------------------------------------------------------------------- 1 | quart 2 | aiohttp 3 | pandas 4 | bertopic 5 | sentence_transformers 6 | numpy 7 | nltk 8 | scikit-learn -------------------------------------------------------------------------------- /src/topic_modelling/README.md: -------------------------------------------------------------------------------- 1 | Being able to extract the subject or topic of chunks of text -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | from functools import wraps 4 | from time import time 5 | 6 | 7 | async def _request(method, url, headers, payload): 8 | async with aiohttp.ClientSession() as session: 9 | async with session.request(method=method, url=url, headers=headers, data=payload) as resp: 10 | return await resp.json() 11 | 12 | 13 | def async_request(method: str, url: str, headers, payload): 14 | """ Make async request calls with aiohttp 15 | 16 | Args: 17 | method (str): HTTP method - POST | GET | PUT 18 | """ 19 | return asyncio.run(_request(method, url, headers, payload)) 20 | 21 | 22 | def timing(f): 23 | @wraps(f) 24 | def wrap(*args, **kw): 25 | ts = time() 26 | result = f(*args, **kw) 27 | te = time() 28 | print('func:%r args:[%r, %r] took: %2.4f sec' % \ 29 | (f.__name__, args, kw, te-ts)) 30 | return result 31 | return wrap 32 | 33 | -------------------------------------------------------------------------------- /src/vector_search/README.md: -------------------------------------------------------------------------------- 1 | Being able to search embeddings created -------------------------------------------------------------------------------- /src/vector_search/dotproduct/README.md: -------------------------------------------------------------------------------- 1 | Being able to search embedding created through simple dot product 2 | -------------------------------------------------------------------------------- /src/vector_search/faiss/README.md: -------------------------------------------------------------------------------- 1 | Using Facebook's FAISS to carry out vector search -------------------------------------------------------------------------------- /template_batch_model.py: -------------------------------------------------------------------------------- 1 | class AI4BharatBatchModel(): 2 | def __new__(cls): 3 | if not hasattr(cls, 'instance'): 4 | cls.instance = super(AI4BharatBatchModel, cls).__new__(cls) 5 | 
cls.load_model(cls) 6 | return cls.instance 7 | 8 | def load_model(self): 9 | """ Loads the model. This method is called only once when the model is first loaded.""" 10 | pass 11 | 12 | def inference(self, request): 13 | """ Performs inference on the given request. This method is called for every request.""" 14 | pass -------------------------------------------------------------------------------- /template_model_request.py: -------------------------------------------------------------------------------- 1 | class BatchModelRequest(): 2 | def __init__(self): 3 | """ Initializes the request object with the given parameters""" 4 | pass 5 | 6 | def to_json(self): 7 | """ Returns the json representation of the object""" 8 | import json 9 | return json.dumps(self, default=lambda o: o.__dict__, 10 | sort_keys=True, indent=4) -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from src.text_lang_detection.bhashini.remote import * 2 | import asyncio, aiohttp, time 3 | from quart import Quart 4 | 5 | str_to_test = "ତୁମର କୋଡ୍ ରିଫାକ୍ଟର୍ କରିବାକୁ ଯାହା ଦ୍ you ାରା ତୁମେ ଲୁପ୍ ପାଇଁ ପ୍ରତିକ୍ରିୟାକୁ ଅପେକ୍ଷା କରୁନାହଁ, ତୁମେ ଏକାସାଙ୍ଗରେ ଇନ୍ଫରେନ୍ସ ଫଙ୍କସନ୍ ର ଏକାଧିକ ଉଦାହରଣ ଚଲାଇବାକୁ ବ୍ୟବହାର କରିପାରିବ | ଏଠାରେ ଅପଡେଟ୍ କୋଡ୍ ଅଛି |" 6 | 7 | app = Quart(__name__) 8 | 9 | 10 | async def single_inference(app, text, index): 11 | m = Model(app) 12 | resp = await m.inference(ModelRequest(text=text)) 13 | print(f"{index}: {resp}") 14 | 15 | 16 | async def bench_text_lang_detection(app): 17 | tasks = [] 18 | 19 | for i in range(len(str_to_test)): 20 | task = single_inference(app, str_to_test[0:i], i) 21 | tasks.append(task) 22 | 23 | await asyncio.gather(*tasks) 24 | 25 | 26 | async def main(): 27 | app.client = aiohttp.ClientSession() 28 | start_time = time.perf_counter() 29 | await bench_text_lang_detection(app) 30 | end_time = time.perf_counter() 31 | 32 | print(f"Time taken: {end_time - start_time:.4f} seconds for {len(str_to_test)} characters") 33 | await app.client.close() 34 | 35 | 36 | asyncio.run(main()) 37 | # Time taken: 4.2715 seconds for 104 characters --------------------------------------------------------------------------------
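test.py above benchmarks the language detection model by importing it directly; the same measurement can also be pointed at any of the HTTP services in this repository. A small sketch of that variant (illustrative only, not part of the repository; it assumes the token counter container from src/token_counter/openai/local is running on localhost:8000 and reuses the prefix-based workload idea from test.py):

```
# bench_http.py (illustrative only, not a file in this repo)
# HTTP variant of test.py: times concurrent POSTs against a locally running service,
# here the token counter, which returns the token count as a plain string.
import asyncio
import time

import aiohttp

TEXT = "Where is my money? " * 20


async def count_tokens(session: aiohttp.ClientSession, text: str) -> str:
    async with session.post("http://localhost:8000/", json={"text": text}) as resp:
        return await resp.text()


async def main():
    async with aiohttp.ClientSession() as session:
        start = time.perf_counter()
        # Same prefix-based workload idea as test.py: one request per prefix length.
        results = await asyncio.gather(
            *(count_tokens(session, TEXT[:i]) for i in range(1, len(TEXT), 10))
        )
        elapsed = time.perf_counter() - start
    print(f"{len(results)} requests in {elapsed:.4f} seconds")


asyncio.run(main())
```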