├── .coveragerc ├── .dockerignore ├── .editorconfig ├── .flake8 ├── .github ├── FUNDING.yml └── workflows │ ├── build-bothub-nlp-push-tag-dockerhub.yaml │ └── build-bothub-nlp-push-tag-gcr.yaml ├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── __init__.py ├── ai_platform ├── aiplatform_app.py ├── aiplatform_requirements.txt └── settings.py ├── aiplatform.Dockerfile ├── bothub ├── __init__.py ├── nlu_worker │ ├── __init__.py │ ├── interpreter_manager.py │ └── task │ │ ├── __init__.py │ │ ├── debug_parse.py │ │ ├── evaluate.py │ │ ├── intent_sentence_suggestion.py │ │ ├── parse.py │ │ ├── sentence_suggestion.py │ │ ├── word_suggestion.py │ │ └── words_distribution.py └── shared │ ├── __init__.py │ ├── evaluate_crossval.py │ ├── settings.py │ ├── train.py │ └── utils │ ├── __init__.py │ ├── backend.py │ ├── helpers.py │ ├── lookup_tables │ ├── en │ │ ├── country.txt │ │ └── email.txt │ └── pt_br │ │ ├── brand.txt │ │ ├── cep.txt │ │ ├── country.txt │ │ ├── cpf.txt │ │ └── email.txt │ ├── persistor.py │ ├── pipeline_builder.py │ ├── pipeline_components │ ├── __init__.py │ ├── diet_classifier.py │ ├── hf_transformer.py │ ├── lm_featurizer.py │ ├── lm_tokenizer.py │ ├── microsoft_recognizers_extractor.py │ ├── preprocessing.py │ ├── regex_entity_extractor.py │ └── spacy_nlp.py │ ├── poke_logging.py │ ├── preprocessing │ ├── __init__.py │ ├── preprocessing_base.py │ ├── preprocessing_english.py │ ├── preprocessing_factory.py │ ├── preprocessing_portuguese.py │ └── preprocessing_spanish.py │ ├── rasa_components │ ├── __init__.py │ ├── bothub_interpreter.py │ └── registry.py │ └── scripts │ ├── download_models.py │ └── link_lang_spacy.py ├── celery_app.py ├── docker-compose.yml ├── nlp.Dockerfile ├── requirements.txt ├── start_celery.py └── tests ├── README.md ├── __init__.py ├── example_bert_pt_br.tar.gz ├── example_generic_language.tar.gz ├── shared ├── __init__.py ├── test_pipeline_builder.py └── test_preprocesing.py ├── test_debug_parse.py ├── test_evaluate.py ├── test_parse.py ├── test_train.py └── test_words_distribution.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | ./bothub_nlp_nlu_worker/bothub_nlp_nlu 4 | 5 | omit = 6 | ./bothub_nlp_nlu_worker/bothub_nlp_nlu/scripts/* 7 | ./bothub_nlp_nlu_worker/bothub_nlp_nlu/tests/* 8 | ./bothub_nlp_nlu_worker/bothub_nlp_nlu/pipeline_components/* 9 | 10 | [report] 11 | fail_under = 70 -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | env/ 2 | venv/ 3 | .vscode 4 | .ipynb_checkpoints 5 | .~* 6 | *.pyc 7 | .DS_Store 8 | app/dump.rdb 9 | etc/ 10 | bothub-nlp.log 11 | 12 | # file-based project format: 13 | *.iws 14 | 15 | # tests 16 | tests.db 17 | .coverage 18 | 19 | # env vars files 20 | .env 21 | settings.ini 22 | 23 | # dev 24 | db.sqlite3 25 | 26 | # spacy-langs 27 | bothub-nlp-nlu-worker/spacy-langs 28 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = false 10 | 11 | [*.yml] 12 | indent_size = 2 13 | 14 | [.flake8] 15 | indent_size = 2 16 | 17 | [*.{py,yml,sh}] 18 | insert_final_newline = true 19 | 20 | [Makefile] 21 | 
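# make requires recipe lines to be indented with real tabs, hence the override below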
indent_style = tab 22 | 23 | [{Makefile,Dockerfile,.editorconfig,README.md}] 24 | insert_final_newline = true 25 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 119 3 | ignore = E501,W503,E203,E402 4 | exclude = 5 | ./spacy-langs 6 | ./env 7 | ./venv 8 | ./scripts -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | custom: ['https://www.ilhasoft.com.br/en/contact/'] 4 | -------------------------------------------------------------------------------- /.github/workflows/build-bothub-nlp-push-tag-dockerhub.yaml: -------------------------------------------------------------------------------- 1 | name: Build Bothub NLP in Dockerhub 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*.*.*-develop' 7 | - '*.*.*-staging' 8 | - '*.*.*' 9 | 10 | jobs: 11 | docker: 12 | runs-on: ubuntu-latest 13 | 14 | strategy: 15 | matrix: 16 | deployment: [deployment-bert-english.json, deployment-bert-multilang.json, deployment-bert-ptbr.json, deployment-internal-en.json, deployment-internal-multilang.json, deployment-internal-ptbr.json, deployment-spacy-en.json, deployment-spacy-es.json, deployment-spacy-fr.json, deployment-spacy-ptbr.json, deployment-spacy-ru.json] 17 | 18 | steps: 19 | - name: Set variables 20 | run: | 21 | TAG="$( echo "${GITHUB_REF}" | cut -d'/' -f3 )" 22 | if grep -qs -e '^.*.*-develop' <<< "${TAG}" ; then 23 | echo "Found environment: DEVELOP - ${TAG}" 24 | echo "MANIFESTS_ENVIRONMENT=develop" | tee -a "${GITHUB_ENV}" 25 | elif grep -qs -e '^.*.*-staging' <<< "${TAG}" ; then 26 | echo "Found environment: STAGING - ${TAG}" 27 | echo "MANIFESTS_ENVIRONMENT=staging" | tee -a "${GITHUB_ENV}" 28 | elif grep -qs -e '^.*.*' <<< "${TAG}" ; then 29 | echo "No environment found, assuming: PRODUCTION - ${TAG}" 30 | echo "MANIFESTS_ENVIRONMENT=production" | tee -a "${GITHUB_ENV}" 31 | else 32 | echo 'Not a valid tag. Skipping...' 
33 | exit 1 34 | fi 35 | echo "TAG=$TAG" | tee -a "${GITHUB_ENV}" 36 | VERSION="${TAG}" 37 | echo "VERSION=${VERSION}" | tee -a "${GITHUB_ENV}" 38 | echo "COMMIT_SHA=$GITHUB_SHA" | tee -a "${GITHUB_ENV}" 39 | echo "MATRIXIN=${{ matrix.deployment }}" 40 | echo "MANIFESTS_PATCH_TARGET=${{ matrix.deployment }}" | tee -a "${GITHUB_ENV}" 41 | if [[ "${{ matrix.deployment }}" == "deployment-bert-english.json" ]]; then 42 | MODEL=en-BERT 43 | echo "MODEL=en-BERT" | tee -a "${GITHUB_ENV}" 44 | elif [[ "${{ matrix.deployment }}" == "deployment-bert-multilang.json" ]]; then 45 | MODEL=xx-BERT 46 | echo "MODEL=xx-BERT" | tee -a "${GITHUB_ENV}" 47 | elif [[ "${{ matrix.deployment }}" == "deployment-bert-ptbr.json" ]]; then 48 | MODEL=pt_br-BERT 49 | echo "MODEL=pt_br-BERT" | tee -a "${GITHUB_ENV}" 50 | elif [[ "${{ matrix.deployment }}" == "deployment-internal-en.json" ]]; then 51 | MODEL=xx-NONE 52 | echo "MODEL=xx-NONE" | tee -a "${GITHUB_ENV}" 53 | elif [[ "${{ matrix.deployment }}" == "deployment-internal-multilang.json" ]]; then 54 | MODEL=xx-NONE 55 | echo "MODEL=xx-NONE" | tee -a "${GITHUB_ENV}" 56 | elif [[ "${{ matrix.deployment }}" == "deployment-internal-ptbr.json" ]]; then 57 | MODEL=xx-NONE 58 | echo "MODEL=xx-NONE" | tee -a "${GITHUB_ENV}" 59 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-en.json" ]]; then 60 | MODEL=en-SPACY 61 | echo "MODEL=en-SPACY" | tee -a "${GITHUB_ENV}" 62 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-es.json" ]]; then 63 | MODEL=es-SPACY 64 | echo "MODEL=es-SPACY" | tee -a "${GITHUB_ENV}" 65 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-fr.json" ]]; then 66 | MODEL=fr-SPACY 67 | echo "MODEL=fr-SPACY" | tee -a "${GITHUB_ENV}" 68 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-ptbr.json" ]]; then 69 | MODEL=pt_br-SPACY 70 | echo "MODEL=pt_br-SPACY" | tee -a "${GITHUB_ENV}" 71 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-ru.json" ]]; then 72 | MODEL=ru-SPACY 73 | echo "MODEL=ru-SPACY" | tee -a "${GITHUB_ENV}" 74 | else 75 | echo "Unknown model" 76 | exit 1 77 | fi 78 | 79 | echo "${MODEL}" 80 | echo "IMAGE_TAG=bothubit/bothub-nlp:${TAG}-${MODEL}" | tee -a "${GITHUB_ENV}" 81 | 82 | echo "IMAGE_SOURCE_URL=https://github.com/weni-ai/bothub-nlp" | tee -a "${GITHUB_ENV}" 83 | echo "MANIFESTS_REPOSITORY=weni-ai/kubernetes-manifests-artificial-intelligence" | tee -a "${GITHUB_ENV}" 84 | echo "MANIFESTS_APPLICATION=nlp-workers" | tee -a "${GITHUB_ENV}" 85 | - name: Check out the repo 86 | uses: actions/checkout@v3 87 | with: 88 | ref: "${{env.GITHUB_SHA}}" 89 | 90 | - name: Set up QEMU 91 | uses: docker/setup-qemu-action@v2 92 | 93 | - name: Set up Docker Buildx 94 | uses: docker/setup-buildx-action@v2 95 | 96 | - name: Login to Dockerhub 97 | uses: docker/login-action@v2 98 | with: 99 | username: ${{ secrets.DOCKERHUB_USERNAME }} 100 | password: ${{ secrets.DOCKERHUB_TOKEN }} 101 | 102 | - name: Build and push - Bothub NLP Model ( ${{ env.MODEL }} ) Image 103 | if: ${{ !( matrix.deployment == 'deployment-internal-ptbr.json' || matrix.deployment == 'deployment-internal-multilang.json' ) }} 104 | uses: docker/build-push-action@v3 105 | with: 106 | context: . 
107 | labels: | 108 | tag=${{env.TAG}} 109 | commit=${{env.COMMIT_SHA}} 110 | repository=${{env.IMAGE_SOURCE_URL}} 111 | file: ./nlp.Dockerfile 112 | push: true 113 | tags: "${{env.IMAGE_TAG}}" 114 | no-cache: true 115 | build-args: | 116 | DOWNLOAD_MODELS=${{ env.MODEL }} 117 | 118 | - name: Check out Kubernetes Manifests 119 | uses: actions/checkout@master 120 | with: 121 | ref: main 122 | repository: "${{ env.MANIFESTS_REPOSITORY }}" 123 | token: "${{ secrets.DEVOPS_GITHUB_PERMANENT_TOKEN }}" 124 | path: ./kubernetes-manifests/ 125 | 126 | - name: Update image on deployment 127 | run: | 128 | which jq > /dev/null 2>&1 || ( sudo apt update ; sudo apt install -y jq ) 129 | # Dep: coreutils 130 | verlte() { 131 | [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] 132 | } 133 | verlt(){ 134 | [ "$1" = "$2" ] && return 1 || verlte $1 $2 135 | } 136 | export PROJECT_DIR="${{ env.MANIFESTS_APPLICATION }}" 137 | ENV_DIR="kubernetes-manifests/${{ env.MANIFESTS_APPLICATION }}/${MANIFESTS_ENVIRONMENT}" 138 | for e in ${ENV_DIR}; do 139 | echo "Update ${e}:" 140 | if [ ! -d "${e}" ] ; then 141 | echo "${e}: Does not exist, skipping" 142 | elif [ ! -r "${e}/kustomization.yaml" ] ; then 143 | echo "${e}/kustomization.yaml: Does not readable, skipping" 144 | elif [ ! -r "${e}/${{ env.MANIFESTS_PATCH_TARGET }}" ] ; then 145 | echo "${e}/${{ env.MANIFESTS_PATCH_TARGET }}: Does not readable, skipping" 146 | else 147 | OLD_IMAGE=$( 148 | cat "${e}/${{ env.MANIFESTS_PATCH_TARGET }}" \ 149 | | jq '.[] | select(.path == "/spec/template/spec/containers/0/image") | .value' 150 | ) 151 | echo "Old image to replace: ${OLD_IMAGE}" 152 | OLD_VERSION=$( 153 | echo "${OLD_IMAGE}" \ 154 | | sed s'/^.*[v:-]\([0-9]*\.[0-9]*\.[0-9]*\).*$/\1/'g \ 155 | | head -n1 156 | ) 157 | echo "Old image version to compare: ${OLD_VERSION}<=${{env.VERSION}}" 158 | if verlte "${OLD_VERSION}" "${VERSION}" || [[ ! "${OLD_VERSION}" =~ [0-9]+\.[0-9]+\.[0-9]+ ]] ; then 159 | echo 'New configurations:' 160 | new_configuration=$( 161 | cat "${e}/${{ env.MANIFESTS_PATCH_TARGET }}" \ 162 | | jq '(..|select(.path == "/spec/template/spec/containers/0/image")?) 
+= {value: "'"${{env.IMAGE_TAG}}"'"}' 163 | ) 164 | echo "${new_configuration}" 165 | echo "${new_configuration}" > "${e}/${{ env.MANIFESTS_PATCH_TARGET }}" 166 | else 167 | echo "Version in file is greater than build, skipping update yaml" 168 | fi 169 | fi 170 | done 171 | - name: Commit & Push changes 172 | uses: actions-js/push@master 173 | with: 174 | github_token: "${{ secrets.DEVOPS_GITHUB_PERMANENT_TOKEN }}" 175 | repository: "${{ env.MANIFESTS_REPOSITORY }}" 176 | directory: ./kubernetes-manifests/ 177 | branch: main 178 | message: "From Bothub NLP Build (Push Tag ${{ env.MANIFESTS_ENVIRONMENT }})" 179 | 180 | 181 | -------------------------------------------------------------------------------- /.github/workflows/build-bothub-nlp-push-tag-gcr.yaml: -------------------------------------------------------------------------------- 1 | name: Build AI-Platform Bothub NLP in GCR 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*.*.*-develop' 7 | - '*.*.*-staging' 8 | - '*.*.*' 9 | 10 | jobs: 11 | docker: 12 | runs-on: ubuntu-latest 13 | 14 | strategy: 15 | matrix: 16 | model: [ xx-NONE, xx-BERT, en-BERT, pt_br-BERT, es-SPACY, fr-SPACY, pt_br-SPACY , ru-SPACY, en-SPACY] 17 | 18 | steps: 19 | - name: Set variables 20 | run: | 21 | TAG="$( echo "${GITHUB_REF}" | cut -d'/' -f3 )" 22 | if grep -qs -e '^.*.*-develop' <<< "${TAG}" ; then 23 | echo "Found environment: DEVELOP - ${TAG}" 24 | echo "ENVIRONMENT=develop" | tee -a "${GITHUB_ENV}" 25 | elif grep -qs -e '^.*.*-staging' <<< "${TAG}" ; then 26 | echo "Found environment: STAGING - ${TAG}" 27 | echo "ENVIRONMENT=staging" | tee -a "${GITHUB_ENV}" 28 | elif grep -qs -e '^.*.*' <<< "${TAG}" ; then 29 | echo "No environment found, assuming: PRODUCTION - ${TAG}" 30 | echo "ENVIRONMENT=production" | tee -a "${GITHUB_ENV}" 31 | else 32 | echo 'Not a valid tag. Skipping...' 33 | exit 1 34 | fi 35 | echo "TAG=$TAG" | tee -a "${GITHUB_ENV}" 36 | VERSION="${TAG}" 37 | echo "VERSION=${VERSION}" | tee -a "${GITHUB_ENV}" 38 | echo "COMMIT_SHA=$GITHUB_SHA" | tee -a "${GITHUB_ENV}" 39 | echo "IMAGE_TAG=us.gcr.io/bothub-273521/bothub-nlp-ai-platform:${TAG}-${{ matrix.model }}" | tee -a "${GITHUB_ENV}" 40 | echo "IMAGE_SOURCE_URL=https://github.com/weni-ai/bothub-nlp" | tee -a "${GITHUB_ENV}" 41 | 42 | 43 | - name: Check out the repo 44 | uses: actions/checkout@v3 45 | with: 46 | ref: "${{env.GITHUB_SHA}}" 47 | 48 | - name: Set up QEMU 49 | uses: docker/setup-qemu-action@v2 50 | 51 | - name: Set up Docker Buildx 52 | uses: docker/setup-buildx-action@v2 53 | 54 | - name: Login to GCR 55 | uses: docker/login-action@v1 56 | with: 57 | registry: us.gcr.io 58 | username: _json_key 59 | password: ${{ secrets.GCR_JSON_KEY }} 60 | 61 | - name: Build and push - AI-Platform Bothub NLP Image 62 | uses: docker/build-push-action@v3 63 | with: 64 | context: . 
65 | labels: | 66 | tag=${{env.TAG}} 67 | commit=${{env.COMMIT_SHA}} 68 | repository=${{env.IMAGE_SOURCE_URL}} 69 | file: ./aiplatform.Dockerfile 70 | push: true 71 | tags: "${{env.IMAGE_TAG}}" 72 | no-cache: true 73 | build-args: | 74 | DOWNLOAD_MODELS=${{ matrix.model }} 75 | 76 | 77 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | db.sqlite3-journal 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # IDEs 115 | .vscode/ 116 | .idea/ 117 | .DS_Store 118 | .idea/ 119 | .env 120 | spacy-langs/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | services: 5 | - docker 6 | install: 7 | - pip install -r requirements.txt 8 | - pip install coveralls 9 | env: 10 | global: 11 | - BOTHUB_NLP_LANGUAGE_QUEUE="en" 12 | - BOTHUB_NLP_SERVICE_WORKER=true 13 | before_script: 14 | - python bothub/shared/utils/scripts/download_models.py en-BERT 15 | script: 16 | - flake8 17 | - travis_wait coverage run -m unittest discover tests 18 | after_success: 19 | - coveralls 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | init_development_env: 2 | @echo "${INFO}Starting init environment...${NC}" 3 | @echo "BOTHUB_ENGINE_URL=http://localhost" >> .env 4 | @echo "BOTHUB_NLP_SERVICE_WORKER=True" >> .env 5 | @echo "BOTHUB_NLP_LANGUAGE_QUEUE=en" >> .env 6 | @echo "BOTHUB_LANGUAGE_MODEL=BERT" >> .env 7 | @echo "${SUCCESS}Finish...${NC}" 8 | 9 | 
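# Builds every docker-compose service with the default en-BERT model
# (passed through the DOWNLOAD_MODELS build argument) and starts them detached.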
start_development: 10 | @echo "${INFO}Starting Build all project (Docker)...${NC}" 11 | @docker-compose build --build-arg DOWNLOAD_MODELS=en-BERT 12 | @docker-compose up -d 13 | @echo "${SUCCESS}Finish...${NC}" 14 | 15 | 16 | install_development_requirements: 17 | @echo "${INFO}Installing development requirements...${NC}" 18 | @git clone --branch master --depth 1 --single-branch https://github.com/Ilhasoft/spacy-lang-models spacy-langs 19 | @python bothub/shared/utils/scripts/link_lang_spacy.py pt_br ./spacy-langs/pt_br/ 20 | @python bothub/shared/utils/scripts/download_models.py en-BERT 21 | @echo "${SUCCESS}✔${NC} Development requirements installed" 22 | 23 | 24 | start_celery: 25 | @python start_celery.py 26 | 27 | # Utils 28 | 29 | ## Colors 30 | SUCCESS = \033[0;32m 31 | INFO = \033[0;36m 32 | WARNING = \033[0;33m 33 | DANGER = \033[0;31m 34 | NC = \033[0m 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bothub NLP - Natural Language Processing services 2 | 3 | [![Build Status](https://travis-ci.org/bothub-it/bothub-nlp.svg?branch=master)](https://travis-ci.org/bothub-it/bothub-nlp) [![Coverage Status](https://coveralls.io/repos/github/bothub-it/bothub-nlp/badge.svg?branch=master)](https://coveralls.io/github/bothub-it/bothub-nlp?branch=master) ![version 3.0.1](https://img.shields.io/badge/version-3.0.1-blue.svg) [![python 3.6](https://img.shields.io/badge/python-3.6-green.svg)](https://docs.python.org/3.6/whatsnew/changelog.html) [![license AGPL-3.0](https://img.shields.io/badge/license-AGPL--3.0-red.svg)](https://github.com/bothub-it/bothub-nlp/blob/master/LICENSE) 4 | 5 | 6 | 7 | ## Services 8 | 9 | ### bothub-nlp-nlu-worker 10 | 11 | ### [bothub-nlp-api](https://github.com/bothub-it/bothub-nlp-api) 12 | 13 | ## Packages 14 | 15 | ### [bothub-backend](https://github.com/bothub-it/bothub-backend) (python 3.6) 16 | 17 | ### [bothub-nlp-celery](https://github.com/bothub-it/bothub-nlp-celery) (python 3.6) 18 | 19 | 20 | # Requirements 21 | 22 | * Python (3.6) 23 | * Docker 24 | * Docker-Compose 25 | 26 | ## Development 27 | 28 | Use ```make``` commands 29 | 30 | | Command | Description | 31 | |--|--| 32 | | make init_development_env | Init file .env with variables environment | 33 | | make start_development | Start build docker | 34 | | make install_development_requirements | Install some default models | 35 | | make start_celery | Run celery application | 36 | 37 | 38 | ## Environment Variables 39 | 40 | You can set environment variables in your OS, write on ```.env``` file or pass via Docker config. 41 | 42 | ### bothub-backend 43 | 44 | | Variable | Type | Default | Description | 45 | |--|--|--|--| 46 | | BOTHUB_ENGINE_URL | `str` | `https://api.bothub.it` | Web service url | 47 | 48 | ### nlp-nlu-worker / nlp-ai-platform 49 | 50 | You can set environment variables in your OS, write on ```.env``` file or pass via Docker config. 
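
For local development, ```make init_development_env``` writes a minimal ```.env``` like the one below (values copied from the Makefile); any variable from the tables in this document can be appended to the same file:

```
BOTHUB_ENGINE_URL=http://localhost
BOTHUB_NLP_SERVICE_WORKER=True
BOTHUB_NLP_LANGUAGE_QUEUE=en
BOTHUB_LANGUAGE_MODEL=BERT
```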
51 | 52 | | Variable | Type | Default | Description | 53 | |--|--|--|--| 54 | | WORKER_CACHE_CLEANING_PERIOD | `float` | `3*3600` | Period of time (seconds) the worker will look for idle interpreters to clean cache | 55 | | INTERPRETER_CACHE_IDLE_LIMIT | `float` | `24*3600` | Idle limit of time (seconds) the interpreter cache will keep cache | 56 | | DYNAMIC_EPOCHS_THRESHOLD | `int` | `10000` | Minimum number of sentences to start decreasing training number of epochs | 57 | | BOTHUB_NLP_AWS_ACCESS_KEY_ID | `str` | | AWS bucket access to save trained models and evaluation results | 58 | | BOTHUB_NLP_AWS_SECRET_ACCESS_KEY | `str` | | AWS bucket access to save trained models and evaluation results | 59 | | BOTHUB_NLP_AWS_S3_BUCKET_NAME | `str` | | AWS bucket access to save trained models and evaluation results | 60 | | BOTHUB_NLP_AWS_REGION_NAME | `str` | | AWS bucket access to save trained models and evaluation results | 61 | 62 | ### bothub-celery 63 | 64 | | Variable | Type | Default | Description | 65 | |--|--|--|--| 66 | | BOTHUB_NLP_CELERY_BROKER_URL | `string` | `redis://localhost:6379/0` | Celery Broker URL, check usage instructions in [Celery Docs](http://docs.celeryproject.org/en/latest/index.html) | 67 | | BOTHUB_NLP_CELERY_BACKEND_URL | `string` | `BOTHUB_NLP_CELERY_BROKER_URL` value | Celery Backend URL, check usage instructions in [Celery Docs](http://docs.celeryproject.org/en/latest/index.html) | 68 | | BOTHUB_NLP_CELERY_SENTRY_CLIENT | `bool` | `False` | Enable Sentry | 69 | | BOTHUB_NLP_CELERY_SENTRY | `str` | `None` | Set URL Sentry Server | 70 | | BOTHUB_NLP_LANGUAGE_QUEUE | `string` | `en` | Set language of model that will be loaded in celery and will define its queue | 71 | | BOTHUB_LANGUAGE_MODEL | `string` | `None` | Set type of model (BERT/SPACY/NONE) | 72 | | TASK_GENERAL_TIME_LIMIT | `int` | `120` | Time limit of celery tasks | 73 | | TASK_PARSE_TIME_LIMIT | `int` | `10` | Time limit of parse task | 74 | 75 | ## Docker Arguments 76 | 77 | You need to set --build-arg when you are building docker-compose 78 | 79 | | Argument | Type | Default | Description | 80 | |--|--|--|--| 81 | | DOWNLOAD_MODELS | ```string```| ```en-BERT``` | Set language and model in build time. Following the format: ```[LANGUAGE_CODE]-[LANGUAGE_MODEL]```. 
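
For example, to build and start the worker locally with the Brazilian Portuguese BERT model (the same commands the Makefile's ```start_development``` target runs with ```en-BERT```):

```
docker-compose build --build-arg DOWNLOAD_MODELS=pt_br-BERT
docker-compose up -d
```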
82 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/__init__.py -------------------------------------------------------------------------------- /ai_platform/aiplatform_app.py: -------------------------------------------------------------------------------- 1 | from bothub.shared.train import train_update as train 2 | from bothub.shared.evaluate_crossval import ( 3 | evaluate_crossval_update as evaluate_crossval, 4 | ) 5 | 6 | if __name__ == "__main__": 7 | from settings import ( 8 | operation, 9 | repository_version_language, 10 | by_id, 11 | repository_authorization, 12 | aws_bucket_authentication, 13 | language 14 | ) 15 | 16 | # Run the job 17 | if operation == "train": 18 | train( 19 | repository_version_language, 20 | by_id, 21 | repository_authorization, 22 | from_queue="ai-platform", 23 | ) 24 | elif operation == "evaluate": 25 | evaluate_crossval( 26 | repository_version_language, 27 | repository_authorization, 28 | aws_bucket_authentication, 29 | language 30 | ) 31 | -------------------------------------------------------------------------------- /ai_platform/aiplatform_requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/bothub-it/bothub-backend.git@1.0.22 2 | git+https://github.com/bothub-it/bothub-nlp-celery.git@0.1.38 3 | rasa==1.10.6 4 | transformers==2.11.0 5 | emoji==0.6.0 6 | recognizers-text-suite 7 | plac==0.9.6 8 | spacy==2.1.9 9 | Unidecode==1.1.1 10 | urllib3==1.24.3 11 | tensorflow-gpu==2.1.2 12 | requests==2.23.0 13 | pymorphy2==0.8 14 | python-decouple==3.3 15 | h5py==2.10.0 -------------------------------------------------------------------------------- /ai_platform/settings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | PARSER = argparse.ArgumentParser() 4 | 5 | # Input Arguments 6 | PARSER.add_argument( 7 | "--operation", help='What operation will be done, "train" or "evaluate"' 8 | ) 9 | PARSER.add_argument( 10 | "--repository-version", help="The id of repository-version.", type=int 11 | ) 12 | PARSER.add_argument( 13 | "--by-id", help="User id sending the job", type=int 14 | ) 15 | PARSER.add_argument( 16 | "--repository-authorization", help="Repository authorization string." 
17 | ) 18 | PARSER.add_argument( 19 | "--AIPLATFORM_LANGUAGE_QUEUE", type=str, default="" 20 | ) 21 | 22 | PARSER.add_argument( 23 | "--BOTHUB_NLP_AWS_S3_BUCKET_NAME", type=str, default="" 24 | ) 25 | 26 | PARSER.add_argument( 27 | "--BOTHUB_NLP_AWS_ACCESS_KEY_ID", type=str, default="" 28 | ) 29 | 30 | PARSER.add_argument( 31 | "--BOTHUB_NLP_AWS_SECRET_ACCESS_KEY", type=str, default="" 32 | ) 33 | 34 | PARSER.add_argument( 35 | "--BOTHUB_NLP_AWS_REGION_NAME", type=str, default="us-east-1" 36 | ) 37 | 38 | ARGUMENTS, _ = PARSER.parse_known_args() 39 | 40 | operation = ARGUMENTS.operation 41 | repository_version_language = ARGUMENTS.repository_version 42 | by_id = ARGUMENTS.by_id 43 | repository_authorization = ARGUMENTS.repository_authorization 44 | language = ARGUMENTS.AIPLATFORM_LANGUAGE_QUEUE 45 | 46 | aws_bucket_authentication = { 47 | "BOTHUB_NLP_AWS_S3_BUCKET_NAME": ARGUMENTS.BOTHUB_NLP_AWS_S3_BUCKET_NAME, 48 | "BOTHUB_NLP_AWS_ACCESS_KEY_ID": ARGUMENTS.BOTHUB_NLP_AWS_ACCESS_KEY_ID, 49 | "BOTHUB_NLP_AWS_SECRET_ACCESS_KEY": ARGUMENTS.BOTHUB_NLP_AWS_SECRET_ACCESS_KEY, 50 | "BOTHUB_NLP_AWS_REGION_NAME": ARGUMENTS.BOTHUB_NLP_AWS_REGION_NAME, 51 | } 52 | -------------------------------------------------------------------------------- /aiplatform.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=18.04 2 | 3 | ARG ARCH= 4 | ARG CUDA=10.1 5 | FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base 6 | # ARCH and CUDA are specified again because the FROM directive resets ARGs 7 | # (but their default value is retained if set previously) 8 | ARG ARCH 9 | ARG CUDA 10 | ARG CUDNN=7.6.4.38-1 11 | ARG CUDNN_MAJOR_VERSION=7 12 | ARG LIB_DIR_PREFIX=x86_64 13 | ARG LIBNVINFER=6.0.1-1 14 | ARG LIBNVINFER_MAJOR_VERSION=6 15 | 16 | # Needed for string substitution 17 | SHELL ["/bin/bash", "-c"] 18 | # Pick up some TF dependencies 19 | RUN apt-get update && apt-get install -y --no-install-recommends \ 20 | build-essential \ 21 | cuda-command-line-tools-${CUDA/./-} \ 22 | # There appears to be a regression in libcublas10=10.2.2.89-1 which 23 | # prevents cublas from initializing in TF. See 24 | # https://github.com/tensorflow/tensorflow/issues/9489#issuecomment-562394257 25 | libcublas10=10.2.1.243-1 \ 26 | cuda-nvrtc-${CUDA/./-} \ 27 | cuda-cufft-${CUDA/./-} \ 28 | cuda-curand-${CUDA/./-} \ 29 | cuda-cusolver-${CUDA/./-} \ 30 | cuda-cusparse-${CUDA/./-} \ 31 | curl \ 32 | git \ 33 | wget \ 34 | libcudnn7=${CUDNN}+cuda${CUDA} \ 35 | libfreetype6-dev \ 36 | libhdf5-serial-dev \ 37 | libzmq3-dev \ 38 | pkg-config \ 39 | software-properties-common \ 40 | unzip 41 | 42 | # Install TensorRT if not building for PowerPC 43 | RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \ 44 | apt-get install -y --no-install-recommends libnvinfer${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \ 45 | libnvinfer-plugin${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \ 46 | && apt-get clean \ 47 | && rm -rf /var/lib/apt/lists/*; } 48 | 49 | # For CUDA profiling, TensorFlow requires CUPTI. 
50 | ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH 51 | 52 | # Link the libcuda stub to the location where tensorflow is searching for it and reconfigure 53 | # dynamic linker run-time bindings 54 | RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ 55 | && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ 56 | && ldconfig 57 | 58 | # See http://bugs.python.org/issue19846 59 | ENV LANG C.UTF-8 60 | ENV LC_ALL C.UTF-8 61 | 62 | RUN apt-get update && apt-get install -y \ 63 | python3 \ 64 | python3-pip 65 | 66 | RUN python3 -m pip --no-cache-dir install --upgrade \ 67 | pip \ 68 | setuptools 69 | 70 | # Some TF tools expect a "python" binary 71 | RUN ln -s $(which python3) /usr/local/bin/python 72 | 73 | WORKDIR /home/root/app 74 | 75 | RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections 76 | RUN apt-get install -y ttf-mscorefonts-installer \ 77 | && apt-get autoremove -y \ 78 | && apt-get clean -y \ 79 | && rm -rf /var/lib/apt/lists/* 80 | 81 | COPY ai_platform/aiplatform_requirements.txt . 82 | 83 | FROM base as builder 84 | 85 | RUN pip3 wheel --wheel-dir=/wheels -r aiplatform_requirements.txt 86 | 87 | FROM base 88 | 89 | COPY --from=builder /wheels /wheels 90 | 91 | RUN pip3 install --find-links=/wheels -r aiplatform_requirements.txt 92 | 93 | COPY ai_platform/aiplatform_app.py . 94 | COPY ai_platform/settings.py . 95 | COPY bothub/shared /home/root/app/bothub/shared 96 | COPY bothub/__init__.py /home/root/app/bothub 97 | 98 | ARG DOWNLOAD_MODELS 99 | #Install torch with cuda 10.1 100 | RUN if [ "${DOWNLOAD_MODELS}" = "pt_br-BERT" ]; then \ 101 | pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html; \ 102 | fi 103 | 104 | RUN if [ ${DOWNLOAD_MODELS} ]; then \ 105 | python3.6 bothub/shared/utils/scripts/download_models.py ${DOWNLOAD_MODELS}; \ 106 | fi 107 | 108 | 109 | ENTRYPOINT ["python3.6", "aiplatform_app.py"] 110 | -------------------------------------------------------------------------------- /bothub/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/__init__.py -------------------------------------------------------------------------------- /bothub/nlu_worker/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig( 4 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.DEBUG 5 | ) 6 | -------------------------------------------------------------------------------- /bothub/nlu_worker/interpreter_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import threading 3 | import time 4 | import gc 5 | 6 | from typing import Callable, Union 7 | from rasa.nlu import components 8 | from tempfile import mkdtemp 9 | from datetime import datetime 10 | 11 | from bothub.shared import settings 12 | from bothub.shared.utils.persistor import BothubPersistor 13 | from bothub.shared.utils.backend import backend 14 | from bothub.shared.utils.rasa_components.bothub_interpreter import BothubInterpreter 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class SetInterval: 20 | """ 21 | Creates a thread that execute a function every x seconds 22 | """ 
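    # InterpreterManager (below) uses this to run _clean_cache every
    # settings.WORKER_CACHE_CLEANING_PERIOD seconds.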
23 | def __init__(self, interval: Union[int, float], action: Callable): 24 | """ 25 | :param interval: Period in seconds 26 | :param action: Callable function 27 | """ 28 | self.interval = interval 29 | self.action = action 30 | self.stopEvent = threading.Event() 31 | thread = threading.Thread(target=self._set_interval, daemon=True) 32 | thread.start() 33 | 34 | def _set_interval(self): 35 | next_time = time.time() + self.interval 36 | while not self.stopEvent.wait(next_time - time.time()): 37 | next_time += self.interval 38 | self.action() 39 | 40 | def cancel(self): 41 | self.stopEvent.set() 42 | 43 | 44 | class InterpreterManager: 45 | def __init__(self): 46 | self.cached_interpreters = {} 47 | SetInterval(settings.WORKER_CACHE_CLEANING_PERIOD, self._clean_cache) 48 | 49 | def get_interpreter( 50 | self, 51 | repository_version, 52 | repository_authorization, 53 | rasa_version, 54 | use_cache=True 55 | ) -> BothubInterpreter: 56 | 57 | update_request = backend().request_backend_parse_nlu_persistor( 58 | repository_version, repository_authorization, rasa_version, no_bot_data=True 59 | ) 60 | 61 | repository_name = ( 62 | f"{update_request.get('version_id')}_{update_request.get('language')}" 63 | ) 64 | last_training = f"{update_request.get('total_training_end')}" 65 | 66 | # tries to fetch cache 67 | retrieved_cache = self.cached_interpreters.get(repository_name) 68 | if retrieved_cache and use_cache: 69 | # retrieve cache only if it's the same training 70 | if retrieved_cache["last_training"] == last_training: 71 | retrieved_cache["last_request"] = datetime.now() 72 | return retrieved_cache["interpreter_data"] 73 | 74 | persistor = BothubPersistor( 75 | repository_version, repository_authorization, rasa_version 76 | ) 77 | model_directory = mkdtemp() 78 | persistor.retrieve(str(update_request.get("repository_uuid")), model_directory) 79 | 80 | interpreter = BothubInterpreter( 81 | None, {"language": update_request.get("language")} 82 | ) 83 | interpreter = interpreter.load( 84 | model_directory, components.ComponentBuilder(use_cache=False) 85 | ) 86 | 87 | if use_cache: # update/creates cache 88 | self.cached_interpreters[repository_name] = { 89 | "last_training": last_training, 90 | "interpreter_data": interpreter, 91 | "last_request": datetime.now() 92 | } 93 | 94 | return interpreter 95 | 96 | def _clean_cache(self) -> None: 97 | logger.info("Cleaning repositories cache") 98 | cur_time = datetime.now() 99 | 100 | to_remove = [] 101 | for interpreter in self.cached_interpreters: 102 | last_request = self.cached_interpreters[interpreter]['last_request'] 103 | idle_time = (cur_time - last_request).total_seconds() 104 | if idle_time > settings.INTERPRETER_CACHE_IDLE_LIMIT: 105 | to_remove.append(interpreter) 106 | 107 | for interpreter in to_remove: 108 | del self.cached_interpreters[interpreter] 109 | 110 | logger.info(f"{len(to_remove)} interpreters cleaned") 111 | objects_collected = gc.collect() 112 | logger.info(f"{objects_collected} objects collected") 113 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/nlu_worker/task/__init__.py -------------------------------------------------------------------------------- /bothub/nlu_worker/task/debug_parse.py: -------------------------------------------------------------------------------- 1 | 
import numpy as np 2 | 3 | from collections import OrderedDict 4 | from lime.lime_text import LimeTextExplainer 5 | from rasa.nlu.test import remove_pretrained_extractors 6 | from rasa.nlu import __version__ as rasa_version 7 | 8 | from bothub.shared.utils.backend import backend 9 | 10 | 11 | class DebugSentenceLime: 12 | def __init__(self, interpreter, intention_names): 13 | self.interpreter = interpreter 14 | self.interpreter.pipeline = remove_pretrained_extractors( 15 | self.interpreter.pipeline 16 | ) 17 | self.intention_names = intention_names 18 | 19 | def classifier(self, text_list): 20 | result_list = [] 21 | for text in text_list: 22 | result_json = self.interpreter.parse(text) 23 | 24 | idx_dict = ( 25 | {} 26 | ) # fixing intent name to a index ex: {'violence': 0, 'immigration': 1, ... } 27 | size = len(self.intention_names) 28 | for i in range(size): 29 | idx_dict[self.intention_names[i]] = i 30 | 31 | intent_list = [0] * len(self.intention_names) 32 | intent_name_list = [""] * len(self.intention_names) 33 | size = len(result_json.get("intent_ranking", [])) 34 | for i in range(size): 35 | intent_name = result_json.get("intent_ranking")[i].get("name") 36 | intent_list[idx_dict[intent_name]] = result_json.get("intent_ranking")[ 37 | i 38 | ].get("confidence") 39 | intent_name_list[idx_dict[intent_name]] = result_json.get( 40 | "intent_ranking" 41 | )[i].get("name") 42 | 43 | prob_array = np.array(intent_list) 44 | prob_array = prob_array.reshape((1, len(intent_list))) 45 | result_list.append(prob_array) 46 | 47 | result_array = result_list[0] 48 | for i in range(1, len(result_list)): 49 | result_array = np.vstack([result_array, result_list[i]]) 50 | return result_array 51 | 52 | def get_result_per_word(self, text, num_samples): 53 | if not self.intention_names: 54 | return {} 55 | explainer = LimeTextExplainer(class_names=self.intention_names) 56 | labels = list(range(len(self.intention_names))) # List 57 | try: 58 | exp = explainer.explain_instance( 59 | text, self.classifier, num_features=6, labels=labels, num_samples=num_samples 60 | ) 61 | except ValueError: 62 | labels = [] 63 | result_per_word = {} 64 | for label in labels: 65 | for j in exp.as_list(label=label): 66 | if j[0] not in result_per_word: 67 | result_per_word[j[0]] = [] 68 | result_per_word[j[0]].append( 69 | {"intent": self.intention_names[label], "relevance": j[1] * 100} 70 | ) 71 | for word in result_per_word: 72 | result_per_word[word] = sorted( 73 | result_per_word[word], key=lambda k: k.get("relevance"), reverse=True 74 | ) 75 | return result_per_word 76 | 77 | def get_result_per_intent(self, text, num_samples): 78 | explainer = LimeTextExplainer(class_names=self.intention_names) 79 | labels = list(range(len(self.intention_names))) # List 80 | exp = explainer.explain_instance( 81 | text, self.classifier, num_features=6, labels=labels, num_samples=num_samples 82 | ) 83 | result_per_intent = {} 84 | for intent in self.intention_names: 85 | result_per_intent[intent] = [] 86 | for i in labels: 87 | intent_sum = 0 88 | for j in exp.as_list(label=i): 89 | result_per_intent[self.intention_names[i]].append( 90 | {"word": j[0], "relevance": j[1] * 100} 91 | ) 92 | intent_sum += j[1] 93 | result_per_intent[self.intention_names[i]].append( 94 | {"sum": intent_sum, "relevance": -1} 95 | ) 96 | for intent in result_per_intent: 97 | result_per_intent[intent] = sorted( 98 | result_per_intent[intent], 99 | key=lambda k: k.get("relevance"), 100 | reverse=True, 101 | ) 102 | 103 | return result_per_intent 104 | 105 | 106 | 
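# Keeps only the entity fields returned in the debug_parse response.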
def minimal_entity(entity, self_flag=False): # pragma: no cover 107 | out = { 108 | "value": entity.get("value"), 109 | "entity": entity.get("entity"), 110 | "confidence": entity.get("confidence"), 111 | "start": entity.get("start"), 112 | "end": entity.get("end"), 113 | } 114 | 115 | if self_flag: 116 | out.update({"self": True}) 117 | 118 | return out 119 | 120 | 121 | def get_intention_list(repository_authorization, repository_version_language_id): 122 | info = backend().request_backend_info( 123 | repository_authorization, repository_version_language_id=repository_version_language_id 124 | ) 125 | return info.get("intents", []) 126 | 127 | 128 | def format_debug_parse_output(result_per_word, r): 129 | entities = r.get("entities") 130 | formatted_entities = [] 131 | for entity in entities: 132 | formatted_entities.append(minimal_entity(entity)) 133 | for word in result_per_word: 134 | result_per_word[word] = sorted( 135 | result_per_word[word], key=lambda k: k["relevance"], reverse=True 136 | ) 137 | result_per_word = OrderedDict( 138 | sorted( 139 | result_per_word.items(), key=lambda t: t[1][0]["relevance"], reverse=True 140 | ) 141 | ) 142 | out = OrderedDict( 143 | [ 144 | ("intent", r.get("intent", None)), 145 | ("words", result_per_word), 146 | ("entities", formatted_entities), 147 | ] 148 | ) 149 | return out 150 | 151 | 152 | def n_samples_by_sentence_lenght(sentence): 153 | word_count = len(sentence.split(" ")) 154 | n_samples = min(int(1.8 ** word_count), 128) 155 | return n_samples 156 | 157 | 158 | def debug_parse_text( 159 | repository_version_language_id, 160 | repository_authorization, 161 | interpreter_manager, 162 | text, 163 | use_cache=True, 164 | ): 165 | interpreter = interpreter_manager.get_interpreter( 166 | repository_version_language_id, repository_authorization, rasa_version, use_cache 167 | ) 168 | r = interpreter.parse(text) 169 | 170 | intention_names = get_intention_list(repository_authorization, repository_version_language_id) 171 | result_per_word = DebugSentenceLime( 172 | interpreter, intention_names 173 | ).get_result_per_word(text, n_samples_by_sentence_lenght(text)) 174 | 175 | return format_debug_parse_output(result_per_word, r) 176 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/intent_sentence_suggestion.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from bothub_nlp_celery.app import nlp_language 3 | from bothub.shared.utils.preprocessing.preprocessing_factory import PreprocessingFactory 4 | 5 | import random 6 | 7 | from .sentence_suggestion import SentenceSuggestion 8 | from bothub.shared.utils.helpers import get_examples_request 9 | 10 | 11 | class NonexistentIntentError(Exception): 12 | pass 13 | 14 | 15 | def intent_sentence_suggestion_text( 16 | repository_version, repository_authorization, intent, percentage_to_replace, n 17 | ): 18 | if nlp_language is None: 19 | return "spacy model not loaded in this language" 20 | if nlp_language.vocab.vectors_length == 0: 21 | return "language not supported for this feature" 22 | 23 | intent_sentences = get_examples_request(repository_version, repository_authorization, intent=intent) 24 | intent_sentences = [el['text'] for el in intent_sentences] 25 | if len(intent_sentences) == 0: 26 | raise NonexistentIntentError() 27 | intent_sentences_sample = random.sample(intent_sentences, min(n, len(intent_sentences))) 28 | factor = n / len(intent_sentences_sample) 29 | 30 | 
preprocessor1 = PreprocessingFactory(remove_accent=False).factory() 31 | preprocessor2 = PreprocessingFactory(remove_accent=True).factory() 32 | 33 | suggested_sentences = [] 34 | count = 0 35 | while len(suggested_sentences) < n: 36 | if count > n or count >= len(intent_sentences_sample): 37 | break 38 | generated_sentences = SentenceSuggestion().get_suggestions( 39 | preprocessor1.preprocess_text(intent_sentences_sample[count]), 40 | percentage_to_replace, 41 | random.randint(int(1 * factor), int(3 * factor)) 42 | ) 43 | for generated_sentence in generated_sentences: 44 | preprocessed_sentence = preprocessor2.preprocess_text(generated_sentence) 45 | if preprocessed_sentence not in intent_sentences: 46 | suggested_sentences.append(preprocessed_sentence) 47 | count += 1 48 | 49 | suggested_sentences = suggested_sentences[:n] 50 | 51 | return OrderedDict([("intent", intent), ("suggested_sentences", suggested_sentences)]) 52 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/parse.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from rasa.nlu import __version__ as rasa_version 3 | 4 | 5 | def format_parse_output( 6 | repository_version, r, repository_authorization 7 | ): # pragma: no cover 8 | intent = r.get("intent", None) 9 | intent_ranking = r.get("intent_ranking", []) 10 | entities = r.get("entities", []) 11 | 12 | out = OrderedDict( 13 | [ 14 | ("intent", intent), 15 | ("intent_ranking", intent_ranking), 16 | ( 17 | "entities_list", 18 | list(OrderedDict.fromkeys([x.get("entity", None) for x in entities])), 19 | ), 20 | ("entities", entities), 21 | ] 22 | ) 23 | return out 24 | 25 | 26 | def parse_text( 27 | repository_version, 28 | repository_authorization, 29 | interpreter_manager, 30 | text, 31 | rasa_format=False, 32 | use_cache=True, 33 | ): 34 | interpreter = interpreter_manager.get_interpreter( 35 | repository_version, repository_authorization, rasa_version, use_cache 36 | ) 37 | r = interpreter.parse(text) 38 | 39 | if rasa_format: 40 | return r 41 | 42 | return format_parse_output(repository_version, r, repository_authorization) 43 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/sentence_suggestion.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from bothub_nlp_celery.app import nlp_language 3 | import random 4 | import numpy as np 5 | 6 | 7 | class SentenceSuggestion: 8 | def __init__(self): 9 | self.nlp = nlp_language 10 | self.to_replace_tags = ["VERB", "NOUN", "ADJ", "ADV", "INTJ", "PROPN"] 11 | self.n_highest = 50 12 | self.row2key = {row: key for key, row in self.nlp.vocab.vectors.key2row.items()} 13 | 14 | def most_similar(self, input_words, *, batch_size=1024, topn=1, sort=True): 15 | words_similar_list = [] 16 | similar_list = [] 17 | words = input_words 18 | if isinstance(input_words, str): 19 | words = [input_words] 20 | for word in words: 21 | input_vector = self.nlp(word).vector.reshape( 22 | 1, self.nlp.vocab.vectors.shape[1] 23 | ) 24 | best_rows = np.zeros((1, self.n_highest), dtype="i") 25 | scores = np.zeros((1, self.n_highest), dtype="f") 26 | 27 | # Work in batches, to avoid memory problems. 
28 | for i in range(0, input_vector.shape[0], batch_size): 29 | batch = input_vector[i : i + batch_size] 30 | batch_norms = np.linalg.norm(batch, axis=1, keepdims=True) 31 | batch_norms[batch_norms == 0] = 1 32 | batch /= batch_norms 33 | sims = np.dot(batch, self.nlp.vocab.vectors.data.T) 34 | best_rows[i : i + batch_size] = np.argpartition( 35 | sims, -self.n_highest, axis=1 36 | )[ 37 | :, -self.n_highest : 38 | ] # get n_highest scores rows in O(n) 39 | scores[i : i + batch_size] = np.partition( 40 | sims, -self.n_highest, axis=1 41 | )[ 42 | :, -self.n_highest : 43 | ] # get n_highest scores in O(n) 44 | 45 | # sort the n_highest scores and best_rows 46 | if sort and topn >= 2: 47 | sorted_index = ( 48 | np.arange(scores.shape[0])[:, None][i : i + batch_size], 49 | np.argsort(scores[i : i + batch_size], axis=1)[:, ::-1], 50 | ) 51 | scores[i : i + batch_size] = scores[sorted_index] 52 | best_rows[i : i + batch_size] = best_rows[sorted_index] 53 | 54 | scores = np.around(scores, decimals=4, out=scores) 55 | scores = np.clip(scores, a_min=-1, a_max=1, out=scores) 56 | 57 | # get similar list of tuple (word, score) only if both input and candidate word is lower or large case 58 | similar_list = [] 59 | for i in range(self.n_highest): 60 | row = best_rows[0][i] 61 | score = scores[0][i] 62 | candidate_word_vocab = self.nlp.vocab[self.row2key[row]] 63 | candidate_word = candidate_word_vocab.text 64 | if ( 65 | candidate_word_vocab.is_lower == word.islower() 66 | and candidate_word != word 67 | ): 68 | similar_list.append((candidate_word, score)) 69 | if len(similar_list) >= topn: 70 | break 71 | words_similar_list.append(similar_list) 72 | if isinstance(input_words, str): 73 | return similar_list 74 | return words_similar_list 75 | 76 | @staticmethod # get the indexes of the replaceable words 77 | def get_words_to_replace_idx(similar_words_json, word_list, percentage_to_replace): 78 | percentage_to_replace = np.clip(percentage_to_replace, 0, 1) 79 | word_list_size = len(word_list) 80 | for idx in list(similar_words_json): 81 | if len(similar_words_json[idx].get("similar_words")) == 0: 82 | del similar_words_json[idx] 83 | words_to_replace_idx = [] 84 | # number of words to replace 85 | n_words_to_replace = int(word_list_size * percentage_to_replace) 86 | replaceable_idx_list = list(similar_words_json) 87 | if n_words_to_replace < len(replaceable_idx_list): 88 | to_replace_idx_list = random.sample( 89 | range(len(replaceable_idx_list)), n_words_to_replace 90 | ) 91 | for idx in to_replace_idx_list: 92 | words_to_replace_idx.append(replaceable_idx_list[idx]) 93 | else: 94 | words_to_replace_idx = replaceable_idx_list 95 | return words_to_replace_idx 96 | 97 | def similar_words_json(self, sentence): 98 | similar_words_json = {} 99 | word_list = sentence.split(" ") 100 | sentence_size = len(word_list) 101 | for i in range(sentence_size): 102 | try: 103 | word_pos = self.nlp(word_list[i])[0].pos_ 104 | word_json = { 105 | "word": word_list[i], 106 | "type": word_pos, 107 | "similar_words": [], 108 | } 109 | if word_pos in self.to_replace_tags: 110 | similar_words = self.most_similar(word_list[i], topn=6) 111 | similar_words_size = len(similar_words) 112 | for j in range(similar_words_size): 113 | nlp_similar = self.nlp(similar_words[j][0]) 114 | if len(nlp_similar) > 0 and nlp_similar[0].pos_ == word_pos: 115 | similar_json = { 116 | "word": str(similar_words[j][0]), 117 | "type": str(nlp_similar[0].pos_), 118 | "relevance": str(similar_words[j][1]), 119 | } 120 | 
word_json["similar_words"].append(similar_json) 121 | similar_words_json[i] = word_json 122 | except KeyError: 123 | pass 124 | return similar_words_json 125 | 126 | def get_suggestions(self, sentence, percentage_to_replace, n): # main method 127 | similar_words_json = self.similar_words_json(sentence) 128 | suggested_sentences = [] 129 | for _ in range(n): 130 | word_list = sentence.split(" ") 131 | words_to_replace_idx = self.get_words_to_replace_idx( 132 | similar_words_json, word_list, percentage_to_replace 133 | ) 134 | for replace_idx in words_to_replace_idx: 135 | similar_words_len = len( 136 | similar_words_json[replace_idx].get("similar_words") 137 | ) 138 | word_list[replace_idx] = ( 139 | similar_words_json[replace_idx] 140 | .get("similar_words")[random.randint(0, similar_words_len - 1)] 141 | .get("word") 142 | ) 143 | suggested_sentences.append(" ".join(word_list)) 144 | suggested_sentences = list(set(suggested_sentences)) # Remove duplicates 145 | return suggested_sentences 146 | 147 | 148 | def sentence_suggestion_text(text, percentage_to_replace, n): 149 | if nlp_language is None: 150 | return "spacy model not loaded in this language" 151 | if nlp_language.vocab.vectors_length == 0: 152 | return "language not supported for this feature" 153 | 154 | similar_sentences = SentenceSuggestion().get_suggestions( 155 | text, percentage_to_replace, n 156 | ) 157 | return OrderedDict([("text", text), ("suggested_sentences", similar_sentences)]) 158 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/word_suggestion.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from bothub_nlp_celery.app import nlp_language 3 | from bothub.shared.utils.preprocessing.preprocessing_factory import PreprocessingFactory 4 | import numpy as np 5 | 6 | 7 | class WordSuggestion: 8 | def __init__(self): 9 | self.nlp = nlp_language 10 | self.to_replace_tags = ["VERB", "NOUN", "ADJ", "ADV", "INTJ", "PROPN"] 11 | self.n_highest = 50 12 | self.row2key = {row: key for key, row in self.nlp.vocab.vectors.key2row.items()} 13 | 14 | def most_similar(self, word, *, batch_size=1024, topn=1, sort=True): 15 | input_vector = self.nlp(word).vector.reshape(1, self.nlp.vocab.vectors.shape[1]) 16 | best_rows = np.zeros((1, self.n_highest), dtype="i") 17 | scores = np.zeros((1, self.n_highest), dtype="f") 18 | 19 | # Work in batches, to avoid memory problems. 
20 | for i in range(0, input_vector.shape[0], batch_size): 21 | batch = input_vector[i : i + batch_size] 22 | batch_norms = np.linalg.norm(batch, axis=1, keepdims=True) 23 | batch_norms[batch_norms == 0] = 1 24 | batch /= batch_norms 25 | sims = np.dot(batch, self.nlp.vocab.vectors.data.T) 26 | best_rows[i : i + batch_size] = np.argpartition( 27 | sims, -self.n_highest, axis=1 28 | )[ 29 | :, -self.n_highest : 30 | ] # get n_highest scores rows in O(n) 31 | scores[i : i + batch_size] = np.partition(sims, -self.n_highest, axis=1)[ 32 | :, -self.n_highest : 33 | ] # get n_highest scores in O(n) 34 | 35 | # sort the n_highest scores and best_rows 36 | if sort and topn >= 2: 37 | sorted_index = ( 38 | np.arange(scores.shape[0])[:, None][i : i + batch_size], 39 | np.argsort(scores[i : i + batch_size], axis=1)[:, ::-1], 40 | ) 41 | scores[i : i + batch_size] = scores[sorted_index] 42 | best_rows[i : i + batch_size] = best_rows[sorted_index] 43 | 44 | scores = np.around(scores, decimals=4, out=scores) 45 | scores = np.clip(scores, a_min=-1, a_max=1, out=scores) 46 | 47 | # get similar list of tuple (word, score) only if both input and candidate word is lower or large case 48 | similar_list = [] 49 | for i in range(self.n_highest): 50 | row = best_rows[0][i] 51 | score = scores[0][i] 52 | candidate_word_vocab = self.nlp.vocab[self.row2key[row]] 53 | candidate_word = candidate_word_vocab.text 54 | if ( 55 | candidate_word_vocab.is_lower == word.islower() 56 | and candidate_word != word 57 | ): 58 | similar_list.append((candidate_word, str(score))) 59 | if len(similar_list) >= topn: 60 | break 61 | return similar_list 62 | 63 | 64 | def word_suggestion_text(text, n): 65 | if nlp_language is None: 66 | return "spacy model not loaded in this language" 67 | if nlp_language.vocab.vectors_length == 0: 68 | return "language not supported for this feature" 69 | 70 | preprocessor = PreprocessingFactory(remove_accent=False).factory() 71 | text = preprocessor.preprocess_text(text) 72 | similar_words = WordSuggestion().most_similar(text, topn=n) 73 | preprocessor = PreprocessingFactory(remove_accent=True).factory() 74 | similar_words = [(preprocessor.preprocess_text(word[0]), word[1]) for word in similar_words] 75 | 76 | return OrderedDict([("text", text), ("similar_words", similar_words)]) 77 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/words_distribution.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, OrderedDict 2 | 3 | from bothub.shared.utils.helpers import get_examples_request 4 | 5 | 6 | def words_distribution_text(repository_version, language, repository_authorization): 7 | examples_list = get_examples_request(repository_version, repository_authorization) 8 | 9 | all_intents = [] # the list of all words 10 | intents = {} # all the words separated by intent 11 | all_frequencies = {} # the count of all words 12 | frequencies = {} # the count of words separated by intent 13 | 14 | for example in examples_list: 15 | text = example.get("text") 16 | intent = example.get("intent") 17 | for word in text.split(): 18 | all_intents.append(word.lower()) 19 | if intent in intents: 20 | intents[intent].append(word.lower()) 21 | else: 22 | intents[intent] = [word.lower()] 23 | 24 | all_frequencies = Counter(all_intents) 25 | 26 | for intent in intents: 27 | frequencies[intent] = Counter(intents[intent]) 28 | 29 | for intent in frequencies: 30 | for n_tuple in 
frequencies[intent].most_common(): 31 | word = n_tuple[0] 32 | try: 33 | frequencies[intent][word] = ( 34 | frequencies[intent][word] / all_frequencies[word] * 100 35 | ) 36 | except ZeroDivisionError: # pragma: no cover 37 | continue # pragma: no cover 38 | 39 | ordered_frequencies = {} 40 | 41 | for intent in frequencies: 42 | if intent not in ordered_frequencies: 43 | ordered_frequencies[intent] = OrderedDict() 44 | for n_tuple in frequencies[intent].most_common(): 45 | word = n_tuple[0] 46 | ordered_frequencies[intent][word] = frequencies[intent][word] 47 | 48 | return {"words": ordered_frequencies} 49 | -------------------------------------------------------------------------------- /bothub/shared/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/__init__.py -------------------------------------------------------------------------------- /bothub/shared/settings.py: -------------------------------------------------------------------------------- 1 | from decouple import config 2 | 3 | # Period of time (seconds) the worker will look for idle interpreters to free space 4 | WORKER_CACHE_CLEANING_PERIOD = config( 5 | "WORKER_CACHE_CLEANING_PERIOD", cast=float, default=3*3600 6 | ) 7 | # Idle limit of time (seconds) the interpreter will be cached 8 | INTERPRETER_CACHE_IDLE_LIMIT = config( 9 | "INTERPRETER_CACHE_IDLE_LIMIT", cast=float, default=24*3600 10 | ) 11 | # Minimum number of sentences to start decreasing number of epochs 12 | DYNAMIC_EPOCHS_THRESHOLD = config( 13 | "DYNAMIC_EPOCHS_THRESHOLD", cast=int, default=10000 14 | ) 15 | -------------------------------------------------------------------------------- /bothub/shared/train.py: -------------------------------------------------------------------------------- 1 | from tempfile import mkdtemp 2 | import os 3 | import logging 4 | from rasa.nlu import __version__ as rasa_version 5 | from rasa.nlu.model import Trainer 6 | from rasa.nlu.training_data import Message, TrainingData 7 | from rasa.nlu.components import ComponentBuilder 8 | 9 | from bothub.shared.utils.poke_logging import PokeLogging 10 | from bothub.shared.utils.backend import backend 11 | from bothub.shared.utils.helpers import get_examples_request 12 | from bothub.shared.utils.persistor import BothubPersistor 13 | from bothub.shared.utils.pipeline_builder import PipelineBuilder 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def intersection(lst1, lst2): 19 | lst3 = [value for value in lst1 if value in lst2] 20 | return lst3 21 | 22 | 23 | def train_update( 24 | repository_version_language_id, by_user, repository_authorization, from_queue="celery" 25 | ): # pragma: no cover 26 | 27 | update_request = backend().request_backend_start_training_nlu( 28 | repository_version_language_id, by_user, repository_authorization, from_queue 29 | ) 30 | 31 | examples_list = get_examples_request(repository_version_language_id, repository_authorization) 32 | 33 | with PokeLogging() as pl: 34 | try: 35 | examples = [] 36 | 37 | for example in examples_list: 38 | examples.append( 39 | Message.build( 40 | text=example.get("text"), 41 | intent=example.get("intent"), 42 | entities=example.get("entities"), 43 | ) 44 | ) 45 | 46 | update_request["dataset_size"] = len(examples) 47 | 48 | pipeline_builder = PipelineBuilder(update_request) 49 | pipeline_builder.print_pipeline() 50 | rasa_nlu_config = pipeline_builder.get_nlu_model() 51 
| 52 | trainer = Trainer(rasa_nlu_config, ComponentBuilder(use_cache=False)) 53 | training_data = TrainingData( 54 | training_examples=examples, lookup_tables=None 55 | ) 56 | 57 | trainer.train(training_data) 58 | 59 | persistor = BothubPersistor( 60 | repository_version_language_id, repository_authorization, rasa_version 61 | ) 62 | trainer.persist( 63 | mkdtemp(), 64 | persistor=persistor, 65 | fixed_model_name=f"{update_request.get('repository_version')}_" 66 | f"{update_request.get('total_training_end') + 1}_" 67 | f"{update_request.get('language')}", 68 | ) 69 | except Exception as e: 70 | logger.exception(e) 71 | backend().request_backend_trainfail_nlu( 72 | repository_version_language_id, repository_authorization 73 | ) 74 | raise e 75 | finally: 76 | backend().request_backend_traininglog_nlu( 77 | repository_version_language_id, pl.getvalue(), repository_authorization 78 | ) 79 | -------------------------------------------------------------------------------- /bothub/shared/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/utils/__init__.py -------------------------------------------------------------------------------- /bothub/shared/utils/backend.py: -------------------------------------------------------------------------------- 1 | import bothub_backend 2 | import argparse 3 | from decouple import config 4 | 5 | 6 | def backend(): 7 | PARSER = argparse.ArgumentParser() 8 | 9 | # Input Arguments 10 | PARSER.add_argument( 11 | "--base_url", help="Base URL API Engine.", type=str, default=None 12 | ) 13 | 14 | ARGUMENTS, _ = PARSER.parse_known_args() 15 | 16 | return bothub_backend.get_backend( 17 | "bothub_backend.bothub.BothubBackend", 18 | ARGUMENTS.base_url 19 | if ARGUMENTS.base_url 20 | else config("BOTHUB_ENGINE_URL", default="https://api.bothub.it"), 21 | ) 22 | -------------------------------------------------------------------------------- /bothub/shared/utils/helpers.py: -------------------------------------------------------------------------------- 1 | from bothub.shared.utils.backend import backend 2 | 3 | 4 | ALGORITHM_TO_LANGUAGE_MODEL = { 5 | "neural_network_internal": None, 6 | "neural_network_external": "SPACY", 7 | "transformer_network_diet": None, 8 | "transformer_network_diet_word_embedding": "SPACY", 9 | "transformer_network_diet_bert": "BERT", 10 | } 11 | 12 | 13 | def get_examples_request(repository_version_language, repository_authorization, intent=""): # pragma: no cover 14 | 15 | start_examples = backend().request_backend_get_examples( 16 | repository_version_language, None, repository_authorization, intent=intent 17 | ) 18 | 19 | examples = start_examples.get("results") 20 | page = start_examples.get("next") 21 | 22 | if page: 23 | while True: 24 | request_examples_page = backend().request_backend_get_examples( 25 | repository_version_language, page, repository_authorization, intent=intent 26 | ) 27 | 28 | examples += request_examples_page.get("results") 29 | 30 | if request_examples_page.get("next") is None: 31 | break 32 | 33 | page = request_examples_page.get("next") 34 | 35 | return examples 36 | 37 | 38 | def examples_request(repository_authorization, language, repository_version): # pragma: no cover 39 | 40 | start_examples = backend().request_backend_examples( 41 | repository_authorization, language, repository_version, page=None 42 | ) 43 | 44 | examples = start_examples.get("results") 45 
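    # The backend pages its results: the loop below follows the "next" cursor and
    # keeps extending `examples` until the backend reports no further page.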
| page = start_examples.get("next") 46 | 47 | if page: 48 | while True: 49 | request_examples_page = backend().request_backend_get_examples( 50 | repository_authorization, language, repository_version, page=page 51 | ) 52 | 53 | examples += request_examples_page.get("results") 54 | 55 | if request_examples_page.get("next") is None: 56 | break 57 | 58 | page = request_examples_page.get("next") 59 | 60 | return examples 61 | 62 | 63 | def get_algorithm_info(): 64 | # todo: get data from config file / populate languages 65 | 66 | # Sorted by priority 67 | # last element -> default algorithm 68 | return [ 69 | {"name": "transformer_network_diet_bert", "supported_languages": ["all"]}, 70 | {"name": "transformer_network_diet_word_embedding", "supported_languages": []}, 71 | {"name": "transformer_network_diet", "supported_languages": ["all"]}, 72 | ] 73 | 74 | 75 | def choose_best_algorithm(language): 76 | supported_algorithms = get_algorithm_info() 77 | 78 | for model in supported_algorithms[:-1]: 79 | if language in model["supported_languages"]: 80 | return model["name"] 81 | 82 | # default algorithm 83 | return supported_algorithms[len(supported_algorithms) - 1]["name"] 84 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/en/country.txt: -------------------------------------------------------------------------------- 1 | Afghanistan 2 | Albania 3 | Algeria 4 | Andorra 5 | Angola 6 | Antigua and Deps 7 | Argentina 8 | Armenia 9 | Australia 10 | Austria 11 | Azerbaijan 12 | Bahamas 13 | Bahrain 14 | Bangladesh 15 | Barbados 16 | Belarus 17 | Belgium 18 | Belize 19 | Benin 20 | Bhutan 21 | Bolivia 22 | Bosnia Herzegovina 23 | Botswana 24 | Brazil 25 | Brunei 26 | Bulgaria 27 | Burkina 28 | Burundi 29 | Cambodia 30 | Cameroon 31 | Canada 32 | Cape Verde 33 | Central African Republic 34 | Chad 35 | Chile 36 | China 37 | Colombia 38 | Comoros 39 | Congo 40 | Congo 41 | Costa Rica 42 | Croatia 43 | Cuba 44 | Cyprus 45 | Czech Republic 46 | Denmark 47 | Djibouti 48 | Dominica 49 | Dominican Republic 50 | East Timor 51 | Ecuador 52 | Egypt 53 | El Salvador 54 | Equatorial Guinea 55 | Eritrea 56 | Estonia 57 | Ethiopia 58 | Fiji 59 | Finland 60 | France 61 | Gabon 62 | Gambia 63 | Georgia 64 | Germany 65 | Ghana 66 | Greece 67 | Grenada 68 | Guatemala 69 | Guinea 70 | Guinea-Bissau 71 | Guyana 72 | Haiti 73 | Honduras 74 | Hungary 75 | Iceland 76 | India 77 | Indonesia 78 | Iran 79 | Iraq 80 | Ireland 81 | Israel 82 | Italy 83 | Ivory Coast 84 | Jamaica 85 | Japan 86 | Jordan 87 | Kazakhstan 88 | Kenya 89 | Kiribati 90 | Korea North 91 | Korea South 92 | Kosovo 93 | Kuwait 94 | Kyrgyzstan 95 | Laos 96 | Latvia 97 | Lebanon 98 | Lesotho 99 | Liberia 100 | Libya 101 | Liechtenstein 102 | Lithuania 103 | Luxembourg 104 | Macedonia 105 | Madagascar 106 | Malawi 107 | Malaysia 108 | Maldives 109 | Mali 110 | Malta 111 | Marshall Islands 112 | Mauritania 113 | Mauritius 114 | Mexico 115 | Micronesia 116 | Moldova 117 | Monaco 118 | Mongolia 119 | Montenegro 120 | Morocco 121 | Mozambique 122 | Myanmar 123 | Namibia 124 | Nauru 125 | Nepal 126 | Netherlands 127 | New Zealand 128 | Nicaragua 129 | Niger 130 | Nigeria 131 | Norway 132 | Oman 133 | Pakistan 134 | Palau 135 | Panama 136 | Papua New Guinea 137 | Paraguay 138 | Peru 139 | Philippines 140 | Poland 141 | Portugal 142 | Qatar 143 | Romania 144 | Russian Federation 145 | Rwanda 146 | St Kitts and Nevis 147 | St Lucia 148 | Saint Vincent and the Grenadines 149 | Samoa 150 | San Marino 
151 | Sao Tome and Principe 152 | Saudi Arabia 153 | Senegal 154 | Serbia 155 | Seychelles 156 | Sierra Leone 157 | Singapore 158 | Slovakia 159 | Slovenia 160 | Solomon Islands 161 | Somalia 162 | South Africa 163 | South Sudan 164 | Spain 165 | Sri Lanka 166 | Sudan 167 | Suriname 168 | Swaziland 169 | Sweden 170 | Switzerland 171 | Syria 172 | Taiwan 173 | Tajikistan 174 | Tanzania 175 | Thailand 176 | Togo 177 | Tonga 178 | Trinidad and Tobago 179 | Tunisia 180 | Turkey 181 | Turkmenistan 182 | Tuvalu 183 | Uganda 184 | Ukraine 185 | United Arab Emirates 186 | United Kingdom 187 | United States 188 | Uruguay 189 | Uzbekistan 190 | Vanuatu 191 | Vatican City 192 | Venezuela 193 | Vietnam 194 | Yemen 195 | Zambia 196 | Zimbabwe 197 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/en/email.txt: -------------------------------------------------------------------------------- 1 | regex [\w\-.]+@([\w\-]+\.)+[\w\-]{2,4} 2 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/brand.txt: -------------------------------------------------------------------------------- 1 | sorriso 2 | omo 3 | inter 4 | sazon 5 | blu 6 | volvo 7 | vigor 8 | sul america 9 | lanix 10 | estacio 11 | chery 12 | itau 13 | smiles 14 | seda 15 | ford 16 | rexona 17 | kfc 18 | arezzo 19 | vitarella 20 | ikea 21 | honda 22 | siemens 23 | pampers 24 | philips 25 | cartier 26 | petrobras 27 | santander 28 | tirol 29 | embratel 30 | fanta 31 | msi 32 | lego 33 | skol 34 | mercedes-benz 35 | lg 36 | brahma 37 | lancome 38 | bauducco 39 | lux 40 | ninho 41 | nescau 42 | hp 43 | club social 44 | orange 45 | zte 46 | renault 47 | qualy 48 | fox 49 | home depot 50 | acer 51 | pwc 52 | accenture 53 | netflix 54 | ebay 55 | santa amalia 56 | lojas americanas 57 | ypioca 58 | at&t 59 | t-mobile 60 | land rover 61 | caterpillar 62 | sap 63 | net 64 | danone 65 | huawei 66 | starbucks 67 | allianz 68 | liza 69 | piraque 70 | uniqlo 71 | ponto frio 72 | chase 73 | cisco 74 | microsoft 75 | rolex 76 | fleury 77 | ype 78 | porsche 79 | lexus 80 | nestle 81 | dell 82 | seara 83 | zara 84 | marlboro 85 | marilan 86 | sbt 87 | intel 88 | porto seguro 89 | sadia 90 | tang 91 | natura 92 | asus 93 | totvs 94 | itambe 95 | chevrolet 96 | cielo 97 | colgate 98 | amazon 99 | nike 100 | ibm 101 | quero 102 | limpol 103 | ipiranga 104 | chanel 105 | gucci 106 | caixa 107 | santa clara 108 | american express 109 | hsbc 110 | renner 111 | italac 112 | marata 113 | perdigao 114 | hyundai 115 | arisco 116 | elege 117 | personal 118 | peugeot 119 | nubank 120 | iguatemi 121 | fiat 122 | hering 123 | htc 124 | paypal 125 | john deere 126 | mcdonald's 127 | ups 128 | vivo 129 | tixan 130 | assai 131 | google 132 | fedex 133 | budweiser 134 | bohemia 135 | toyota 136 | mabel 137 | sony 138 | h&m 139 | lowe's 140 | espn 141 | nissan 142 | piracanjuba 143 | havaianas 144 | oracle 145 | cvs 146 | basf 147 | dove 148 | bombril 149 | gol 150 | pepsi 151 | samsung 152 | audi 153 | miojo 154 | corona 155 | sulamerica 156 | palmolive 157 | magazine luiza 158 | multiplus 159 | hellmann's 160 | bradesco 161 | volkswagen 162 | magalu 163 | walmart 164 | droga raia 165 | casas bahia 166 | globo 167 | disney 168 | kia 169 | veja 170 | nokia 171 | costco 172 | heineken 173 | citi 174 | buscape 175 | lenovo 176 | bank of america 177 | banco do brasil 178 | adobe 179 | hermes 180 | frito-lay 181 | b3 182 | visa 183 | deloitte 184 | 
nescafe 185 | suvinil 186 | extra 187 | citroen 188 | blackberry 189 | cvc 190 | apple 191 | riachuelo 192 | localiza 193 | xiaomi 194 | rbc 195 | tigre 196 | panco 197 | coca-cola 198 | btg pactual 199 | soya 200 | amil 201 | louis vuitton 202 | camponesa 203 | pao de acucar 204 | facebook 205 | antarctica 206 | gillette 207 | atacadao 208 | netshoes 209 | red bull 210 | verizon 211 | mastercard 212 | bmw 213 | ge 214 | jeep 215 | minuano 216 | schin 217 | mitsubishi 218 | anhanguera 219 | alcatel 220 | l'oreal 221 | sony ericsson 222 | motorola 223 | knorr 224 | dorflex 225 | nivea 226 | kellogg's 227 | drogasil 228 | adidas 229 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/cep.txt: -------------------------------------------------------------------------------- 1 | regex [0-9]{5}-?[0-9]{3} 2 | 3 | 4 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/country.txt: -------------------------------------------------------------------------------- 1 | Afeganistão 2 | Albânia 3 | Argélia 4 | Andorra 5 | Angola 6 | Antigua e Deps 7 | Argentina 8 | Armênia 9 | Austrália 10 | Áustria 11 | Azerbaijão 12 | Bahamas 13 | Bahrain 14 | Bangladesh 15 | Barbados 16 | Bielo-Rússia 17 | Bélgica 18 | Belize 19 | Benin 20 | Butão 21 | Bolívia 22 | Bósnia e Herzegovina 23 | Botswana 24 | Brasil 25 | Brunei 26 | Bulgária 27 | Burkina 28 | Burundi 29 | Camboja 30 | Camarões 31 | Canadá 32 | cabo Verde 33 | Republica Centro-Africano 34 | Chade 35 | Chile 36 | China 37 | Colômbia 38 | Comores 39 | Congo 40 | Congo 41 | Costa Rica 42 | Croácia 43 | Cuba 44 | Chipre 45 | República Checa 46 | Dinamarca 47 | Djibouti 48 | Dominica 49 | República Dominicana 50 | Timor Leste 51 | Equador 52 | Egito 53 | El Salvador 54 | Guiné Equatorial 55 | Eritreia 56 | Estônia 57 | Etiópia 58 | Fiji 59 | Finlândia 60 | França 61 | Gabão 62 | Gâmbia 63 | Georgia 64 | Alemanha 65 | Gana 66 | Grécia 67 | Grenada 68 | Guatemala 69 | Guiné 70 | Guiné-bissau 71 | Guiana 72 | Haiti 73 | Honduras 74 | Hungria 75 | Islândia 76 | Índia 77 | Indonésia 78 | Irã 79 | Iraque 80 | Irlanda 81 | Israel 82 | Itália 83 | Costa do Marfim 84 | Jamaica 85 | Japão 86 | Jordânia 87 | Cazaquistão 88 | Quênia 89 | Kiribati 90 | Coreia do Norte 91 | Coreia do Sul 92 | Kosovo 93 | Kuwait 94 | Quirguistão 95 | Laos 96 | Letônia 97 | Líbano 98 | Lesoto 99 | Libéria 100 | Líbia 101 | Liechtenstein 102 | Lituânia 103 | Luxemburgo 104 | Macedonia 105 | Madagáscar 106 | Malawi 107 | Malásia 108 | Maldivas 109 | Mali 110 | Malta 111 | Ilhas Marshall 112 | Mauritânia 113 | Maurício 114 | México 115 | Micronésia 116 | Moldova 117 | Mônaco 118 | Mongólia 119 | Montenegro 120 | Marrocos 121 | Moçambique 122 | Mianmar 123 | Namibia 124 | Nauru 125 | Nepal 126 | Países Baixos 127 | Nova Zelândia 128 | Nicarágua 129 | Níger 130 | Nigéria 131 | Noruega 132 | Omã 133 | Paquistão 134 | Palau 135 | Panamá 136 | Papua Nova Guiné 137 | Paraguai 138 | Peru 139 | Filipinas 140 | Polônia 141 | Portugal 142 | Catar 143 | Romênia 144 | Federação Russa 145 | Ruanda 146 | São Cristóvão e Neves 147 | Santa Lúcia 148 | São Vicente e Granadinas 149 | Samoa 150 | San Marino 151 | São Tomé e Príncipe 152 | Arábia Saudita 153 | Senegal 154 | Sérvia 155 | Seychelles 156 | Serra Leoa 157 | Cingapura 158 | Eslováquia 159 | Eslovênia 160 | Ilhas Salomão 161 | Somália 162 | África do Sul 163 | Sudão do Sul 164 | Espanha 165 | Sri Lanka 166 | Sudão 
167 | Suriname 168 | Suazilândia 169 | Suécia 170 | Suíça 171 | Síria 172 | Taiwan 173 | Tajiquistão 174 | Tanzânia 175 | Tailândia 176 | Togo 177 | Tonga 178 | Trinidad e Tobago 179 | Tunísia 180 | Peru 181 | Turcomenistão 182 | Tuvalu 183 | Uganda 184 | Ucrânia 185 | Emirados Árabes Unidos 186 | Reino Unido 187 | Estados Unidos 188 | Uruguai 189 | Uzbequistão 190 | Vanuatu 191 | Vaticano 192 | Venezuela 193 | Vietnã 194 | Iémen 195 | Zâmbia 196 | Zimbábue 197 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/cpf.txt: -------------------------------------------------------------------------------- 1 | regex [0-9]{3}.[0-9]{3}.[0-9]{3}-[0-9]{2} 2 | regex [0-9]{11} 3 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/email.txt: -------------------------------------------------------------------------------- 1 | regex [\w\-.]+@([\w\-]+\.)+[\w\-]{2,4} 2 | -------------------------------------------------------------------------------- /bothub/shared/utils/persistor.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import bothub_backend 3 | import argparse 4 | from tempfile import NamedTemporaryFile 5 | 6 | import requests 7 | from rasa.nlu.persistor import Persistor 8 | from decouple import config 9 | 10 | 11 | class BothubPersistor(Persistor): 12 | def __init__( 13 | self, 14 | repository_version=None, 15 | repository_authorization=None, 16 | rasa_version=None, 17 | *args, 18 | **kwargs 19 | ): 20 | super().__init__(*args, **kwargs) 21 | self.repository_version = repository_version 22 | self.repository_authorization = repository_authorization 23 | self.rasa_version = rasa_version 24 | 25 | def backend(self): 26 | PARSER = argparse.ArgumentParser() 27 | 28 | # Input Arguments 29 | PARSER.add_argument( 30 | "--base_url", help="Base URL API Engine.", type=str, default=None 31 | ) 32 | 33 | ARGUMENTS, _ = PARSER.parse_known_args() 34 | 35 | return bothub_backend.get_backend( 36 | "bothub_backend.bothub.BothubBackend", 37 | ARGUMENTS.base_url 38 | if ARGUMENTS.base_url 39 | else config("BOTHUB_ENGINE_URL", default="https://api.bothub.it"), 40 | ) 41 | 42 | def _persist_tar(self, filekey, tarname): 43 | with open(tarname, "rb") as tar_file: 44 | data = tar_file.read() 45 | 46 | self.backend().send_training_backend_nlu_persistor( 47 | self.repository_version, 48 | data, 49 | self.repository_authorization, 50 | self.rasa_version, 51 | ) 52 | 53 | def retrieve(self, model_name, target_path): 54 | tar_name = self._tar_name(model_name) 55 | 56 | train = self.backend().request_backend_parse_nlu_persistor( 57 | self.repository_version, self.repository_authorization, self.rasa_version 58 | ) 59 | 60 | if train.get("from_aws"): 61 | tar_data = requests.get(train.get("bot_data")).content 62 | else: 63 | tar_data = base64.b64decode(train.get("bot_data")) # pragma: no cover 64 | 65 | tar_file = NamedTemporaryFile(suffix=tar_name, delete=False) 66 | tar_file.write(tar_data) 67 | tar_file.close() 68 | 69 | self._decompress(tar_file.name, target_path) 70 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_builder.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable, Optional 2 | 3 | from bothub.shared import settings 4 | from bothub.shared.utils.helpers import 
ALGORITHM_TO_LANGUAGE_MODEL 5 | from bothub_nlp_celery import settings as celery_settings 6 | from bothub.shared.utils.rasa_components.registry import language_to_model 7 | from rasa.nlu.config import RasaNLUModelConfig 8 | 9 | 10 | class PipelineBuilder: 11 | def __init__(self, update): 12 | self.language = update.get("language") 13 | self.algorithm = update.get("algorithm") 14 | self.use_name_entities = update.get("use_name_entities") 15 | self.dataset_size = update.get("dataset_size") 16 | self.use_competing_intents = update.get("use_competing_intents") 17 | self.use_analyze_char = update.get("use_analyze_char") 18 | self.prebuilt_entities = update.get("prebuilt_entities", []) 19 | self.model = self._build_model_requirements() 20 | self.pipeline = self._build_pipeline() 21 | 22 | @staticmethod 23 | def _add_spacy_nlp() -> dict: 24 | return {"name": "bothub.shared.utils.pipeline_components.spacy_nlp.SpacyNLP"} 25 | 26 | @staticmethod 27 | def _add_whitespace_tokenizer() -> dict: 28 | return {"name": "WhitespaceTokenizer"} 29 | 30 | def _add_preprocessing(self) -> dict: 31 | return { 32 | "name": "bothub.shared.utils.pipeline_components.preprocessing.Preprocessing", 33 | "language": self.language, 34 | } 35 | 36 | @staticmethod 37 | def _add_regex_entity_extractor() -> dict: 38 | return { 39 | "name": "bothub.shared.utils.pipeline_components.regex_entity_extractor.RegexEntityExtractorCustom" 40 | } 41 | 42 | def _add_countvectors_featurizer(self) -> List[dict]: 43 | featurizers = [] 44 | 45 | if self.use_analyze_char: 46 | featurizers.append( 47 | { 48 | "name": "CountVectorsFeaturizer", 49 | "analyzer": "char_wb", 50 | "min_ngram": 3, 51 | "max_ngram": 3, 52 | } 53 | ) 54 | 55 | featurizers.append( 56 | {"name": "CountVectorsFeaturizer", "token_pattern": r"(?u)\b\w+\b"} 57 | ) 58 | 59 | return featurizers 60 | 61 | def _add_legacy_countvectors_featurizer(self) -> dict: 62 | if self.use_analyze_char: 63 | return { 64 | "name": "CountVectorsFeaturizer", 65 | "analyzer": "char_wb", 66 | "min_ngram": 3, 67 | "max_ngram": 3, 68 | } 69 | else: 70 | return {"name": "CountVectorsFeaturizer", "token_pattern": r"(?u)\b\w+\b"} 71 | 72 | def _add_microsoft_entity_extractor(self) -> dict: 73 | return { 74 | "name": "bothub.shared.utils.pipeline_components.microsoft_recognizers_extractor.MicrosoftRecognizersExtractor", 75 | "dimensions": self.prebuilt_entities, 76 | "language": self.language, 77 | } 78 | 79 | @staticmethod 80 | def _add_embedding_intent_classifier() -> dict: 81 | return { 82 | "name": "bothub.shared.utils.pipeline_components.diet_classifier.DIETClassifierCustom", 83 | "hidden_layers_sizes": {"text": [256, 128]}, 84 | "number_of_transformer_layers": 0, 85 | "weight_sparsity": 0, 86 | "intent_classification": True, 87 | "entity_recognition": True, 88 | "use_masked_language_model": False, 89 | "BILOU_flag": False, 90 | } 91 | 92 | @staticmethod 93 | def _epoch_factor_function1(examples_qnt: int, min_threshold: int) -> float: 94 | """ 95 | :param examples_qnt: Number of examples in dataset 96 | :param min_threshold: Minimum number of examples needed to have a factor > 1 97 | :return: Division factor of defined maximum epochs 98 | 99 | Example: 100 | min_threshold = 10000 101 | examples_qnt = 10000 -> (25*(10000-10000) + 100*10000)//10000 = 100,0 -> 100/100,0 = 1.00 (base case) 102 | examples_qnt = 15000 -> (25*(15000-10000) + 100*10000)//15000 = 75,00 -> 100/75,00 = 1,33 103 | examples_qnt = 30000 -> (25*(30000-10000) + 100*10000)//30000 = 50,00 -> 100/50,00 = 2,00 104 | examples_qnt = 
60000 -> (25*(60000-10000) + 100*10000)//60000 = 37,50 -> 100/37,50 = 2,66 105 | examples_qnt = 90000 -> (25*(90000-10000) + 100*10000)//90000 = 33,33 -> 100/33,33 = 3,00 106 | 107 | """ 108 | if examples_qnt <= min_threshold: 109 | return 1.0 110 | 111 | over_qnt = examples_qnt - min_threshold 112 | epochs_ratio = ((25*over_qnt) + (100*min_threshold)) / examples_qnt 113 | factor = 100 / epochs_ratio 114 | 115 | return factor 116 | 117 | def _calculate_epochs_number( 118 | self, 119 | max_epochs: int, 120 | factor_function: Callable[[int, int], float] 121 | ) -> int: 122 | """ 123 | :param max_epochs: Maximum number of epochs to be considered 124 | :param factor_function: Function that returns the division factor 125 | :return: Calculated number of epochs (max_epochs/calculated_factor) 126 | """ 127 | min_threshold = settings.DYNAMIC_EPOCHS_THRESHOLD 128 | 129 | if self.dataset_size < min_threshold: 130 | return max_epochs 131 | 132 | factor = factor_function(self.dataset_size, min_threshold) 133 | epochs = int(max_epochs // factor) 134 | return epochs 135 | 136 | def _add_diet_classifier(self, max_epochs=300, bert=False) -> dict: 137 | epochs = self._calculate_epochs_number(max_epochs, self._epoch_factor_function1) 138 | 139 | model = { 140 | "name": "bothub.shared.utils.pipeline_components.diet_classifier.DIETClassifierCustom", 141 | "entity_recognition": True, 142 | "BILOU_flag": False, 143 | "epochs": epochs, 144 | } 145 | 146 | if bert: 147 | model["hidden_layer_sizes"] = {"text": [256, 64]} 148 | 149 | return model 150 | 151 | def _legacy_internal_config(self) -> List[dict]: 152 | partial_pipeline = [ 153 | self._add_whitespace_tokenizer(), # Tokenizer 154 | self._add_legacy_countvectors_featurizer(), # Featurizer 155 | self._add_embedding_intent_classifier(), # Intent Classifier 156 | ] 157 | return partial_pipeline 158 | 159 | def _legacy_external_config(self) -> List[dict]: 160 | partial_pipeline = [ 161 | {"name": "SpacyTokenizer"}, # Tokenizer 162 | {"name": "SpacyFeaturizer"}, # Spacy Featurizer 163 | self._add_legacy_countvectors_featurizer(), # Bag of Words Featurizer 164 | self._add_embedding_intent_classifier(), # intent classifier 165 | ] 166 | return partial_pipeline 167 | 168 | def _transformer_network_diet_config(self) -> List[dict]: 169 | partial_pipeline = [self._add_whitespace_tokenizer()] 170 | 171 | # partial_pipeline.append(add_regex_entity_extractor()) 172 | # if self.prebuilt_entities: 173 | # partial_pipeline.append(add_microsoft_entity_extractor(update)) # Microsoft Entity Extractor) 174 | partial_pipeline.extend( 175 | self._add_countvectors_featurizer() 176 | ) # Bag of Words Featurizer 177 | partial_pipeline.append( 178 | self._add_diet_classifier(max_epochs=150) 179 | ) # Intent Classifier 180 | 181 | return partial_pipeline 182 | 183 | def _transformer_network_diet_word_embedding_config(self) -> List[dict]: 184 | partial_pipeline = [ 185 | {"name": "SpacyTokenizer"}, # Tokenizer 186 | {"name": "SpacyFeaturizer"}, # Spacy Featurizer 187 | ] 188 | partial_pipeline.extend( 189 | self._add_countvectors_featurizer() 190 | ) # Bag of Words Featurizer 191 | partial_pipeline.append( 192 | self._add_diet_classifier(max_epochs=200) 193 | ) # Intent Classifier 194 | 195 | return partial_pipeline 196 | 197 | def _transformer_network_diet_bert_config(self) -> List[dict]: 198 | partial_pipeline = [ 199 | { # NLP 200 | "name": "bothub.shared.utils.pipeline_components.hf_transformer.HFTransformersNLPCustom", 201 | "model_name": language_to_model.get(self.language, 
"bert_multilang"), 202 | }, 203 | { # Tokenizer 204 | "name": "bothub.shared.utils.pipeline_components.lm_tokenizer.LanguageModelTokenizerCustom", 205 | "intent_tokenization_flag": False, 206 | "intent_split_symbol": "_", 207 | }, 208 | { # Bert Featurizer 209 | "name": "bothub.shared.utils.pipeline_components.lm_featurizer.LanguageModelFeaturizerCustom" 210 | }, 211 | ] 212 | # partial_pipeline.append(add_regex_entity_extractor()) 213 | # if self.prebuilt_entities: 214 | # partial_pipeline.append(add_microsoft_entity_extractor(update)) # Microsoft Entity Extractor) 215 | 216 | partial_pipeline.extend( 217 | self._add_countvectors_featurizer() 218 | ) # Bag of Words Featurizers 219 | partial_pipeline.append( 220 | self._add_diet_classifier(max_epochs=100, bert=True) 221 | ) # Intent Classifier 222 | 223 | return partial_pipeline 224 | 225 | def _build_model_requirements(self) -> Optional[str]: 226 | model = ALGORITHM_TO_LANGUAGE_MODEL[self.algorithm] 227 | if model == "SPACY" and self.language not in celery_settings.AVAILABLE_SPACY_MODELS: 228 | model = None 229 | if self.algorithm == "neural_network_external": 230 | self.algorithm = "neural_network_internal" 231 | else: 232 | self.algorithm = "transformer_network_diet" 233 | 234 | return model 235 | 236 | def _build_pipeline(self) -> List[dict]: 237 | pipeline = [self._add_preprocessing()] 238 | 239 | if ( 240 | self.use_name_entities 241 | and self.algorithm != "transformer_network_diet_bert" 242 | and self.language in celery_settings.AVAILABLE_SPACY_MODELS 243 | ) or self.algorithm in [ 244 | "neural_network_external", 245 | "transformer_network_diet_word_embedding", 246 | ]: 247 | pipeline.append(self._add_spacy_nlp()) 248 | 249 | if self.algorithm == "neural_network_internal": 250 | pipeline.extend(self._legacy_internal_config()) 251 | elif self.algorithm == "neural_network_external": 252 | pipeline.extend(self._legacy_external_config()) 253 | elif self.algorithm == "transformer_network_diet_bert": 254 | pipeline.extend(self._transformer_network_diet_bert_config()) 255 | elif self.algorithm == "transformer_network_diet_word_embedding": 256 | pipeline.extend(self._transformer_network_diet_word_embedding_config()) 257 | else: 258 | pipeline.extend(self._transformer_network_diet_config()) 259 | 260 | if ( 261 | self.use_name_entities 262 | and self.algorithm != "transformer_network_diet_bert" 263 | and self.language in celery_settings.AVAILABLE_SPACY_MODELS 264 | ): 265 | pipeline.append({"name": "SpacyEntityExtractor"}) 266 | 267 | return pipeline 268 | 269 | def print_pipeline(self) -> None: 270 | import json 271 | 272 | print(f"Pipeline Config:") 273 | for component in self.pipeline: 274 | print(json.dumps(component, indent=2)) 275 | 276 | def get_nlu_model(self) -> RasaNLUModelConfig: 277 | return RasaNLUModelConfig( 278 | {"language": self.language, "pipeline": self.pipeline} 279 | ) 280 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/utils/pipeline_components/__init__.py -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/diet_classifier.py: -------------------------------------------------------------------------------- 1 | import rasa.utils.common as common_utils 2 | from 
rasa.nlu.classifiers.diet_classifier import DIETClassifier 3 | from rasa.constants import DOCS_URL_TRAINING_DATA_NLU 4 | from rasa.nlu.training_data import TrainingData 5 | from rasa.nlu.constants import ( 6 | ENTITIES, 7 | TOKENS_NAMES, 8 | TEXT, 9 | ENTITY_ATTRIBUTE_START, 10 | ENTITY_ATTRIBUTE_END, 11 | INTENT, 12 | ) 13 | 14 | 15 | class DIETClassifierCustom(DIETClassifier): 16 | @staticmethod 17 | def check_correct_entity_annotations(training_data: TrainingData) -> None: 18 | """Check if entities are correctly annotated in the training data. 19 | If the start and end values of an entity do not match any start and end values 20 | of the respected token, we define an entity as misaligned and log a warning. 21 | Args: 22 | training_data: The training data. 23 | """ 24 | for example in training_data.entity_examples: 25 | entity_boundaries = [ 26 | (entity[ENTITY_ATTRIBUTE_START], entity[ENTITY_ATTRIBUTE_END]) 27 | for entity in example.get(ENTITIES) 28 | ] 29 | token_start_positions = [ 30 | t.start for t in example.get(TOKENS_NAMES[TEXT], []) 31 | ] 32 | token_end_positions = [t.end for t in example.get(TOKENS_NAMES[TEXT], [])] 33 | 34 | for entity_start, entity_end in entity_boundaries: 35 | if ( 36 | entity_start not in token_start_positions 37 | or entity_end not in token_end_positions 38 | ): 39 | common_utils.raise_warning( 40 | f"Misaligned entity annotation in message '{example.text}' " 41 | f"with intent '{example.get(INTENT)}'. Make sure the start and " 42 | f"end values of entities in the training data match the token " 43 | f"boundaries (e.g. entities don't include trailing whitespaces " 44 | f"or punctuation).", 45 | docs=DOCS_URL_TRAINING_DATA_NLU, 46 | ) 47 | break 48 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/hf_transformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Dict, List, Text, Tuple, Optional 3 | 4 | import numpy as np 5 | import rasa.utils.train_utils as train_utils 6 | 7 | from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer 8 | from rasa.nlu.training_data import Message 9 | from rasa.nlu.tokenizers.tokenizer import Token 10 | from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class HFTransformersNLPCustom(HFTransformersNLP): 16 | """Utility Component for interfacing between Transformers library and Rasa OS. 17 | The transformers(https://github.com/huggingface/transformers) library 18 | is used to load pre-trained language models like BERT, GPT-2, etc. 19 | The component also tokenizes and featurizes dense featurizable attributes of each 20 | message. 21 | """ 22 | 23 | def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: 24 | super(HFTransformersNLP, self).__init__(component_config) 25 | 26 | self._load_model() 27 | self.whitespace_tokenizer = WhitespaceTokenizer() 28 | 29 | def _load_model(self) -> None: 30 | """Try loading the model""" 31 | 32 | from bothub.shared.utils.rasa_components.registry import ( 33 | model_class_dict, 34 | model_weights_defaults, 35 | model_tokenizer_dict, 36 | from_pt_dict, 37 | ) 38 | 39 | self.model_name = self.component_config["model_name"] 40 | 41 | if self.model_name not in model_class_dict: 42 | raise KeyError( 43 | f"'{self.model_name}' not a valid model name. 
Choose from " 44 | f"{str(list(model_class_dict.keys()))}or create" 45 | f"a new class inheriting from this class to support your model." 46 | ) 47 | 48 | self.model_weights = self.component_config["model_weights"] 49 | self.cache_dir = self.component_config["cache_dir"] 50 | 51 | if not self.model_weights: 52 | logger.info( 53 | f"Model weights not specified. Will choose default model weights: " 54 | f"{model_weights_defaults[self.model_name]}" 55 | ) 56 | self.model_weights = model_weights_defaults[self.model_name] 57 | 58 | logger.debug(f"Loading Tokenizer and Model for {self.model_name}") 59 | 60 | try: 61 | from bothub_nlp_celery.app import nlp_language 62 | 63 | self.tokenizer, self.model = nlp_language 64 | except TypeError: 65 | logger.info( 66 | f"Model could not be retrieved from celery cache " 67 | f"Loading model {self.model_name} in memory" 68 | ) 69 | self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained( 70 | model_weights_defaults[self.model_name], cache_dir=None 71 | ) 72 | self.model = model_class_dict[self.model_name].from_pretrained( 73 | self.model_name, 74 | cache_dir=None, 75 | from_pt=from_pt_dict.get(self.model_name, False), 76 | ) 77 | 78 | # Use a universal pad token since all transformer architectures do not have a 79 | # consistent token. Instead of pad_token_id we use unk_token_id because 80 | # pad_token_id is not set for all architectures. We can't add a new token as 81 | # well since vocabulary resizing is not yet supported for TF classes. 82 | # Also, this does not hurt the model predictions since we use an attention mask 83 | # while feeding input. 84 | self.pad_token_id = self.tokenizer.unk_token_id 85 | logger.debug(f"Loaded Tokenizer and Model for {self.model_name}") 86 | 87 | def _add_lm_specific_special_tokens( 88 | self, token_ids: List[List[int]] 89 | ) -> List[List[int]]: 90 | """Add language model specific special tokens which were used during their training. 91 | Args: 92 | token_ids: List of token ids for each example in the batch. 93 | Returns: 94 | Augmented list of token ids for each example in the batch. 95 | """ 96 | from bothub.shared.utils.rasa_components.registry import ( 97 | model_special_tokens_pre_processors, 98 | ) 99 | 100 | augmented_tokens = [ 101 | model_special_tokens_pre_processors[self.model_name](example_token_ids) 102 | for example_token_ids in token_ids 103 | ] 104 | return augmented_tokens 105 | 106 | def _lm_specific_token_cleanup( 107 | self, split_token_ids: List[int], token_strings: List[Text] 108 | ) -> Tuple[List[int], List[Text]]: 109 | """Clean up special chars added by tokenizers of language models. 110 | Many language models add a special char in front/back of (some) words. We clean up those chars as they are not 111 | needed once the features are already computed. 112 | Args: 113 | split_token_ids: List of token ids received as output from the language model specific tokenizer. 114 | token_strings: List of token strings received as output from the language model specific tokenizer. 115 | Returns: 116 | Cleaned up token ids and token strings. 117 | """ 118 | from bothub.shared.utils.rasa_components.registry import model_tokens_cleaners 119 | 120 | return model_tokens_cleaners[self.model_name](split_token_ids, token_strings) 121 | 122 | def _post_process_sequence_embeddings( 123 | self, sequence_embeddings: np.ndarray 124 | ) -> Tuple[np.ndarray, np.ndarray]: 125 | """Compute sentence level representations and sequence level representations for relevant tokens. 
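        Each model's registry post-processor derives a single sentence-level vector
        (e.g. from a classifier token or by pooling, depending on the architecture)
        and removes embeddings of any special tokens from the sequence output.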
126 | Args: 127 | sequence_embeddings: Sequence level dense features received as output from language model. 128 | Returns: 129 | Sentence and sequence level representations. 130 | """ 131 | 132 | from bothub.shared.utils.rasa_components.registry import ( 133 | model_embeddings_post_processors, 134 | ) 135 | 136 | sentence_embeddings = [] 137 | post_processed_sequence_embeddings = [] 138 | 139 | for example_embedding in sequence_embeddings: 140 | ( 141 | example_sentence_embedding, 142 | example_post_processed_embedding, 143 | ) = model_embeddings_post_processors[self.model_name](example_embedding) 144 | 145 | sentence_embeddings.append(example_sentence_embedding) 146 | post_processed_sequence_embeddings.append(example_post_processed_embedding) 147 | 148 | return ( 149 | np.array(sentence_embeddings), 150 | np.array(post_processed_sequence_embeddings), 151 | ) 152 | 153 | def _tokenize_example( 154 | self, message: Message, attribute: Text, model_size: int = 384 155 | ) -> Tuple[List[Token], List[int]]: 156 | """Tokenize a single message example. 157 | 158 | Many language models add a special char in front of (some) words and split words into 159 | sub-words. To ensure the entity start and end values matches the token values, 160 | tokenize the text first using the whitespace tokenizer. If individual tokens 161 | are split up into multiple tokens, we make sure that the start and end value 162 | of the first and last respective tokens stay the same. 163 | 164 | Args: 165 | message: Single message object to be processed. 166 | attribute: Property of message to be processed, one of ``TEXT`` or ``RESPONSE``. 167 | model_size: Limit of tokens the model can handle (BERT = 512) 168 | 169 | Returns: 170 | List of token strings and token ids for the corresponding attribute of the message. 171 | """ 172 | 173 | tokens_in = self.whitespace_tokenizer.tokenize(message, attribute) 174 | 175 | tokens_out = [] 176 | 177 | token_ids_out = [] 178 | 179 | for token in tokens_in: 180 | # use lm specific tokenizer to further tokenize the text 181 | split_token_ids, split_token_strings = self._lm_tokenize(token.text) 182 | 183 | split_token_ids, split_token_strings = self._lm_specific_token_cleanup( 184 | split_token_ids, split_token_strings 185 | ) 186 | 187 | if len(tokens_out) + len(split_token_strings) >= model_size: 188 | logger.warning( 189 | f"Sentence number of tokens overflowing model size. Skipping sentence exceeded tokens... " 190 | f"Sentence text: '{message.text[:50]} ...' 
" 191 | ) 192 | break 193 | 194 | token_ids_out += split_token_ids 195 | 196 | tokens_out += train_utils.align_tokens( 197 | split_token_strings, token.end, token.start 198 | ) 199 | 200 | return tokens_out, token_ids_out 201 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/lm_featurizer.py: -------------------------------------------------------------------------------- 1 | from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer 2 | from typing import List, Type 3 | from rasa.nlu.components import Component 4 | 5 | 6 | class LanguageModelFeaturizerCustom(LanguageModelFeaturizer): 7 | @classmethod 8 | def required_components(cls) -> List[Type[Component]]: 9 | return [] 10 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/lm_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Type 2 | from rasa.nlu.components import Component 3 | from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer 4 | 5 | 6 | class LanguageModelTokenizerCustom(LanguageModelTokenizer): 7 | """Tokenizer using transformer based language models. 8 | Uses the output of HFTransformersNLP component to set the tokens 9 | for dense featurizable attributes of each message object. 10 | """ 11 | 12 | @classmethod 13 | def required_components(cls) -> List[Type[Component]]: 14 | return [] 15 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/microsoft_recognizers_extractor.py: -------------------------------------------------------------------------------- 1 | from recognizers_suite import ( 2 | recognize_number, 3 | recognize_ordinal, 4 | recognize_age, 5 | recognize_currency, 6 | recognize_dimension, 7 | recognize_temperature, 8 | recognize_datetime, 9 | recognize_phone_number, 10 | recognize_email, 11 | ) 12 | from recognizers_suite import Culture 13 | 14 | from typing import Any, Dict, Text, Optional 15 | from rasa.nlu.constants import ENTITIES 16 | from rasa.nlu.config import RasaNLUModelConfig 17 | from rasa.nlu.extractors.extractor import EntityExtractor 18 | from rasa.nlu.training_data import Message 19 | 20 | recognizers = { 21 | "number": recognize_number, 22 | "ordinal": recognize_ordinal, 23 | "age": recognize_age, 24 | "currency": recognize_currency, 25 | "dimension": recognize_dimension, 26 | "temperature": recognize_temperature, 27 | "datetime": recognize_datetime, 28 | "phone_number": recognize_phone_number, 29 | "email": recognize_email, 30 | } 31 | 32 | cultures = { 33 | "zh": Culture.Chinese, 34 | "nl": Culture.Dutch, 35 | "en": Culture.English, 36 | "fr": Culture.French, 37 | "it": Culture.Italian, 38 | "jp": Culture.Japanese, 39 | "ko": Culture.Korean, 40 | "pt_br": Culture.Portuguese, 41 | "es": Culture.Spanish, 42 | "tr": Culture.Turkish, 43 | } 44 | 45 | 46 | def rasa_format(entity): 47 | return { 48 | "entity": entity.type_name, 49 | "start": entity.start, 50 | "end": entity.end + 1, 51 | "value": entity.text, 52 | } 53 | 54 | 55 | class MicrosoftRecognizersExtractor(EntityExtractor): 56 | defaults = {"dimensions": None} 57 | 58 | def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: 59 | super(MicrosoftRecognizersExtractor, self).__init__(component_config) 60 | self.language = self.component_config["language"] 61 | 62 | @classmethod 63 | def create( 
64 | cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig 65 | ) -> "MicrosoftRecognizersExtractor": 66 | return cls(component_config) 67 | 68 | def process(self, message: Message, **kwargs: Any) -> None: 69 | dimensions = self.component_config["dimensions"] 70 | extracted = self.add_extractor_name( 71 | self.extract_entities(message.text, self.language, dimensions) 72 | ) 73 | message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True) 74 | 75 | @staticmethod 76 | def extract_entities(user_input: str, language: str, selected_dimensions): 77 | entities_group = [] 78 | for dimension in recognizers: 79 | if dimension in selected_dimensions: 80 | entities = recognizers[dimension]( 81 | user_input, cultures.get(language, Culture.English) 82 | ) 83 | if entities: 84 | for entity in entities: 85 | entities_group.append(rasa_format(entity)) 86 | 87 | return entities_group 88 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/preprocessing.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Text, Dict, List, Type 2 | 3 | from rasa.nlu.components import Component 4 | from rasa.nlu.config import RasaNLUModelConfig 5 | from rasa.nlu.training_data import Message, TrainingData 6 | 7 | from bothub.shared.utils.preprocessing.preprocessing_factory import PreprocessingFactory 8 | 9 | 10 | class Preprocessing(Component): 11 | 12 | # Which components are required by this component. 13 | # Listed components should appear before the component itself in the pipeline. 14 | @classmethod 15 | def required_components(cls) -> List[Type[Component]]: 16 | """Specify which components need to be present in the pipeline.""" 17 | 18 | return [] 19 | 20 | # Defines the default configuration parameters of a component 21 | # these values can be overwritten in the pipeline configuration 22 | # of the model. The component should choose sensible defaults 23 | # and should be able to create reasonable results with the defaults. 
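    # The only value consumed here is "language"; PipelineBuilder fills it in
    # explicitly via _add_preprocessing(), e.g. (illustrative language code):
    #   {"name": "bothub.shared.utils.pipeline_components.preprocessing.Preprocessing",
    #    "language": "pt_br"}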
24 | defaults = {"language": None} 25 | 26 | def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: 27 | super(Preprocessing, self).__init__(component_config) 28 | self.language = self.component_config["language"] 29 | 30 | @classmethod 31 | def create( 32 | cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig 33 | ) -> "Preprocessing": 34 | return cls(component_config) 35 | 36 | def provide_context(self) -> Dict[Text, Any]: 37 | return {"language": self.language} 38 | 39 | @staticmethod 40 | def do_entities_overlap(entities: List[Dict]): 41 | sorted_entities = sorted(entities, key=lambda e: e["start"]) 42 | for i in range(len(sorted_entities) - 1): 43 | curr_ent = sorted_entities[i] 44 | next_ent = sorted_entities[i + 1] 45 | if ( 46 | next_ent["start"] < curr_ent["end"] 47 | and next_ent["entity"] != curr_ent["entity"] 48 | ): 49 | return True 50 | return False 51 | 52 | @staticmethod 53 | def remove_overlapping_entities(entities): 54 | new_entities = [] 55 | for i in range(len(entities)): 56 | overlap = False 57 | for j in range(len(entities)): 58 | if i != j and ( 59 | entities[i]["start"] >= entities[j]["start"] 60 | and entities[i]["end"] <= entities[j]["end"] 61 | ): 62 | overlap = True 63 | elif i != j and ( 64 | ( 65 | entities[i]["end"] > entities[j]["start"] 66 | and entities[i]["start"] < entities[j]["end"] 67 | ) 68 | and not ( 69 | entities[j]["start"] >= entities[i]["start"] 70 | and entities[j]["end"] <= entities[i]["end"] 71 | ) 72 | ): 73 | overlap = True 74 | if not overlap: 75 | new_entities.append(entities[i]) 76 | return new_entities 77 | 78 | def train( 79 | self, 80 | training_data: TrainingData, 81 | config: Optional[RasaNLUModelConfig] = None, 82 | **kwargs: Any, 83 | ) -> None: 84 | """Train this component""" 85 | not_repeated_phrases = set() 86 | size = len(training_data.training_examples) 87 | subtract_idx = 0 88 | language_preprocessor = PreprocessingFactory(self.language).factory() 89 | 90 | for idx in range(size): 91 | example = training_data.training_examples[idx - subtract_idx] 92 | 93 | if "entities" in example.data and self.do_entities_overlap( 94 | example.data["entities"] 95 | ): 96 | example.data["entities"] = self.remove_overlapping_entities( 97 | example.data["entities"] 98 | ) 99 | 100 | example = language_preprocessor.preprocess(example) 101 | 102 | if example.text in not_repeated_phrases: 103 | # remove example at this index from training_examples 104 | training_data.training_examples.pop(idx - subtract_idx) 105 | subtract_idx += 1 106 | else: 107 | not_repeated_phrases.add(example.text) 108 | training_data.training_examples[idx - subtract_idx].text = example.text 109 | 110 | def process(self, message: Message, **kwargs: Any) -> None: 111 | """Process an incoming message.""" 112 | 113 | language_preprocessor = PreprocessingFactory(self.language).factory() 114 | _message = language_preprocessor.preprocess(message) 115 | message.text = _message.text 116 | message.data = _message.data 117 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/regex_entity_extractor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | from typing import Any, Dict, List, Optional, Text, Union 5 | 6 | import rasa.utils.common 7 | import rasa.utils.io 8 | 9 | from rasa.nlu.model import Metadata 10 | from rasa.nlu.config import RasaNLUModelConfig 11 | from rasa.nlu.training_data 
import TrainingData 12 | from rasa.nlu.training_data.message import Message 13 | from rasa.nlu.constants import ( 14 | ENTITIES, 15 | ENTITY_ATTRIBUTE_VALUE, 16 | ENTITY_ATTRIBUTE_START, 17 | ENTITY_ATTRIBUTE_END, 18 | TEXT, 19 | ENTITY_ATTRIBUTE_TYPE, 20 | ) 21 | from rasa.nlu.extractors.extractor import EntityExtractor 22 | from ..preprocessing.preprocessing_base import PreprocessingBase 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | def read_lookup_table_file(lookup_table_file: Text) -> List[Text]: 28 | """Read the lookup table file. 29 | 30 | Args: 31 | lookup_table_file: the file path to the lookup table 32 | 33 | Returns: 34 | Elements listed in the lookup table file. 35 | """ 36 | try: 37 | f = open(lookup_table_file, "r", encoding=rasa.utils.io.DEFAULT_ENCODING) 38 | except OSError: 39 | raise ValueError( 40 | f"Could not load lookup table {lookup_table_file}. " 41 | f"Please make sure you've provided the correct path." 42 | ) 43 | 44 | elements_to_regex = [] 45 | with f: 46 | for line in f: 47 | new_element = line.strip() 48 | if new_element: 49 | elements_to_regex.append(new_element) 50 | return elements_to_regex 51 | 52 | 53 | def _generate_lookup_regex(lookup_table: Dict[Text, Union[Text, List[Text]]]) -> Text: 54 | """Creates a regex pattern from the given lookup table. 55 | 56 | The lookup table is either a file or a list of entries. 57 | 58 | Args: 59 | lookup_table: The lookup table. 60 | 61 | Returns: 62 | The regex pattern. 63 | """ 64 | lookup_elements = lookup_table["elements"] 65 | 66 | # if it's a list, it should be the elements directly 67 | if isinstance(lookup_elements, list): 68 | elements_to_regex = lookup_elements 69 | # otherwise it's a file path. 70 | else: 71 | elements_to_regex = read_lookup_table_file(lookup_elements) 72 | 73 | # sanitize the regex, escape special characters 74 | preprocessor = PreprocessingBase() 75 | elements_sanitized = [ 76 | re.escape(preprocessor.preprocess(e)) 77 | if not e.startswith("regex ") 78 | else e.split("regex ")[1] 79 | for e in elements_to_regex 80 | ] 81 | 82 | # regex matching elements with word boundaries on either side 83 | return "(\\b" + "\\b|\\b".join(elements_sanitized) + "\\b)" 84 | 85 | 86 | def _convert_lookup_tables_to_regex( 87 | training_data: TrainingData, use_only_entities: bool = False 88 | ) -> List[Dict[Text, Text]]: 89 | """Convert the lookup tables from the training data to regex patterns. 90 | Args: 91 | training_data: The training data. 92 | use_only_entities: If True only regex features with a name equal to a entity 93 | are considered. 94 | 95 | Returns: 96 | A list of regex patterns. 97 | """ 98 | patterns = [] 99 | for table in training_data.lookup_tables: 100 | if use_only_entities and table["name"] not in training_data.entities: 101 | continue 102 | regex_pattern = _generate_lookup_regex(table) 103 | # if file is empty 104 | if regex_pattern == r"(\b\b)": 105 | continue 106 | lookup_regex = {"name": table["name"], "pattern": regex_pattern} 107 | patterns.append(lookup_regex) 108 | 109 | return patterns 110 | 111 | 112 | def _collect_regex_features( 113 | training_data: TrainingData, use_only_entities: bool = False 114 | ) -> List[Dict[Text, Text]]: 115 | """Get regex features from training data. 116 | 117 | Args: 118 | training_data: The training data 119 | use_only_entities: If True only regex features with a name equal to a entity 120 | are considered. 121 | 122 | Returns: 123 | Regex features. 
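        Each regex feature is a dict of the form {"name": ..., "pattern": ...},
        exactly as declared in the training data.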
124 | """ 125 | if not use_only_entities: 126 | return training_data.regex_features 127 | 128 | return [ 129 | regex 130 | for regex in training_data.regex_features 131 | if regex["name"] in training_data.entities 132 | ] 133 | 134 | 135 | def extract_patterns( 136 | training_data: TrainingData, 137 | use_lookup_tables: bool = True, 138 | use_regexes: bool = True, 139 | use_only_entities: bool = False, 140 | ) -> List[Dict[Text, Text]]: 141 | """Extract a list of patterns from the training data. 142 | 143 | The patterns are constructed using the regex features and lookup tables defined 144 | in the training data. 145 | 146 | Args: 147 | training_data: The training data. 148 | use_only_entities: If True only lookup tables and regex features with a name 149 | equal to a entity are considered. 150 | use_regexes: Boolean indicating whether to use regex features or not. 151 | use_lookup_tables: Boolean indicating whether to use lookup tables or not. 152 | 153 | Returns: 154 | The list of regex patterns. 155 | """ 156 | if not training_data.lookup_tables and not training_data.regex_features: 157 | return [] 158 | 159 | patterns = [] 160 | 161 | if use_regexes: 162 | patterns.extend(_collect_regex_features(training_data, use_only_entities)) 163 | if use_lookup_tables: 164 | patterns.extend( 165 | _convert_lookup_tables_to_regex(training_data, use_only_entities) 166 | ) 167 | 168 | return patterns 169 | 170 | 171 | class RegexEntityExtractorCustom(EntityExtractor): 172 | """Searches for entities in the user's message using the lookup tables and regexes 173 | defined in the training data.""" 174 | 175 | defaults = { 176 | # text will be processed with case insensitive as default 177 | "case_sensitive": False, 178 | # use lookup tables to extract entities 179 | "use_lookup_tables": True, 180 | # use regexes to extract entities 181 | "use_regexes": True, 182 | } 183 | 184 | def __init__( 185 | self, 186 | component_config: Optional[Dict[Text, Any]] = None, 187 | patterns: Optional[List[Dict[Text, Text]]] = None, 188 | ): 189 | super(RegexEntityExtractorCustom, self).__init__(component_config) 190 | 191 | self.case_sensitive = self.component_config["case_sensitive"] 192 | self.patterns = patterns or [] 193 | 194 | def train( 195 | self, 196 | training_data: TrainingData, 197 | config: Optional[RasaNLUModelConfig] = None, 198 | **kwargs: Any, 199 | ) -> None: 200 | self.patterns = extract_patterns( 201 | training_data, 202 | use_lookup_tables=self.component_config["use_lookup_tables"], 203 | use_regexes=self.component_config["use_regexes"], 204 | use_only_entities=False, 205 | ) 206 | 207 | if not self.patterns: 208 | rasa.utils.common.raise_warning( 209 | "No lookup tables or regexes defined in the training data that have " 210 | "a name equal to any entity in the training data. In order for this " 211 | "component to work you need to define valid lookup tables or regexes " 212 | "in the training data." 
213 | ) 214 | 215 | def process(self, message: Message, **kwargs: Any) -> None: 216 | if not self.patterns: 217 | return 218 | 219 | extracted_entities = self._extract_entities(message) 220 | extracted_entities = self.add_extractor_name(extracted_entities) 221 | 222 | message.set( 223 | ENTITIES, message.get(ENTITIES, []) + extracted_entities, add_to_output=True 224 | ) 225 | 226 | def _extract_entities(self, message: Message) -> List[Dict[Text, Any]]: 227 | """Extract entities of the given type from the given user message.""" 228 | entities = [] 229 | 230 | flags = 0 # default flag 231 | if not self.case_sensitive: 232 | flags = re.IGNORECASE 233 | 234 | for pattern in self.patterns: 235 | matches = re.finditer(pattern["pattern"], message.get(TEXT), flags=flags) 236 | matches = list(matches) 237 | 238 | for match in matches: 239 | start_index = match.start() 240 | end_index = match.end() 241 | entities.append( 242 | { 243 | ENTITY_ATTRIBUTE_TYPE: pattern["name"], 244 | ENTITY_ATTRIBUTE_START: start_index, 245 | ENTITY_ATTRIBUTE_END: end_index, 246 | ENTITY_ATTRIBUTE_VALUE: message.get(TEXT)[ 247 | start_index:end_index 248 | ], 249 | } 250 | ) 251 | 252 | return entities 253 | 254 | @classmethod 255 | def load( 256 | cls, 257 | meta: Dict[Text, Any], 258 | model_dir: Optional[Text] = None, 259 | model_metadata: Optional[Metadata] = None, 260 | cached_component: Optional["RegexEntityExtractor"] = None, 261 | **kwargs: Any, 262 | ) -> "RegexEntityExtractorCustom": 263 | 264 | file_name = meta.get("file") 265 | regex_file = os.path.join(model_dir, file_name) 266 | 267 | if os.path.exists(regex_file): 268 | patterns = rasa.utils.io.read_json_file(regex_file) 269 | return RegexEntityExtractorCustom(meta, patterns=patterns) 270 | 271 | return RegexEntityExtractorCustom(meta) 272 | 273 | def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: 274 | """Persist this model into the passed directory. 
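        The learned patterns are dumped to a JSON file alongside the rest of the
        model files.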
275 | Return the metadata necessary to load the model again.""" 276 | file_name = f"{file_name}.json" 277 | regex_file = os.path.join(model_dir, file_name) 278 | rasa.utils.io.dump_obj_as_json_to_file(regex_file, self.patterns) 279 | 280 | return {"file": file_name} 281 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/spacy_nlp.py: -------------------------------------------------------------------------------- 1 | from bothub_nlp_celery.app import nlp_language 2 | from rasa.nlu.config import override_defaults 3 | from rasa.nlu.utils.spacy_utils import SpacyNLP as RasaNLUSpacyNLP 4 | 5 | 6 | class SpacyNLP(RasaNLUSpacyNLP): 7 | @classmethod 8 | def load( 9 | cls, meta, model_dir=None, model_metadata=None, cached_component=None, **kwargs 10 | ): 11 | if cached_component: 12 | return cached_component 13 | 14 | cls.ensure_proper_language_model(nlp_language) 15 | return cls(meta, nlp_language) 16 | 17 | @classmethod 18 | def create(cls, component_config, config): 19 | component_config = override_defaults(cls.defaults, component_config) 20 | 21 | spacy_model_name = component_config.get("model") 22 | 23 | # if no model is specified, we fall back to the language string 24 | if not spacy_model_name: 25 | component_config["model"] = config.language 26 | 27 | cls.ensure_proper_language_model(nlp_language) 28 | return cls(component_config, nlp_language) 29 | -------------------------------------------------------------------------------- /bothub/shared/utils/poke_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import contextvars 3 | import io 4 | 5 | 6 | class PokeLoggingHandler(logging.StreamHandler): 7 | def __init__(self, pl, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | self.pl = pl 10 | 11 | def emit(self, record): 12 | if self.pl.cxt.get(default=None) is self.pl: 13 | super().emit(record) 14 | 15 | 16 | class PokeLogging: 17 | def __init__(self, loggingLevel=logging.DEBUG): 18 | self.loggingLevel = loggingLevel 19 | 20 | def __enter__(self): 21 | self.cxt = contextvars.ContextVar(self.__class__.__name__) 22 | self.cxt.set(self) 23 | logging.captureWarnings(True) 24 | self.logger = logging.getLogger() 25 | self.logger.setLevel(self.loggingLevel) 26 | self.stream = io.StringIO() 27 | self.handler = PokeLoggingHandler(self, self.stream) 28 | self.formatter = logging.Formatter( 29 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 30 | ) 31 | self.handler.setLevel(self.loggingLevel) 32 | self.handler.setFormatter(self.formatter) 33 | self.logger.addHandler(self.handler) 34 | return self.stream 35 | 36 | def __exit__(self, *args): 37 | self.logger.removeHandler(self.logger) 38 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/utils/preprocessing/__init__.py -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from unidecode import unidecode 3 | import emoji 4 | import re 5 | from rasa.nlu.training_data import Message 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class 
PreprocessingBase(object): 11 | emoji_contractions = {} 12 | apostrophes = ["'", "`", "’"] 13 | 14 | def __init__(self, remove_accent=True): 15 | self.remove_accent = remove_accent 16 | 17 | def preprocess_text(self, phrase: str) -> str: 18 | phrase = self.emoji_handling(phrase) 19 | phrase, _ = self.default_preprocessing(phrase) 20 | return phrase 21 | 22 | def preprocess(self, example: Message) -> Message: 23 | phrase = example.text 24 | entities = example.data.get('entities') 25 | 26 | phrase = self.emoji_handling(phrase) 27 | phrase, entities = self.default_preprocessing(phrase, entities) 28 | 29 | example.text = phrase 30 | if entities: 31 | example.data['entities'] = entities 32 | 33 | return example 34 | 35 | def _handle_entities(self, phrase, entities): 36 | # Remove apostrophe from the phrase (important to do before s_regex regex) 37 | positions = [] # mark removal positions 38 | for i, char in enumerate(phrase): 39 | if char in self.apostrophes: 40 | positions.append(i) 41 | 42 | for pos in positions: 43 | # check if before or in entity 44 | for entity in entities: 45 | if pos < entity.get('end'): 46 | entity['end'] -= 1 47 | if pos < entity.get('start'): 48 | entity['start'] -= 1 49 | 50 | for entity in entities: 51 | for apostrophe in self.apostrophes: 52 | entity['value'] = entity['value'].replace(apostrophe, "") 53 | 54 | return entities 55 | 56 | def default_preprocessing(self, phrase: str = None, entities=None): 57 | 58 | if phrase is None: 59 | raise ValueError 60 | 61 | if entities: 62 | entities = self._handle_entities(phrase, entities) 63 | 64 | for apostrophe in self.apostrophes: 65 | phrase = phrase.replace(apostrophe, "") 66 | 67 | # lowercasing characters 68 | phrase = phrase.lower() 69 | if entities: 70 | for entity in entities: 71 | entity['value'] = entity['value'].lower() 72 | 73 | if self.remove_accent: 74 | phrase = unidecode(phrase) 75 | if entities: 76 | for entity in entities: 77 | entity['value'] = unidecode(entity['value']) 78 | 79 | return phrase, entities 80 | 81 | @staticmethod 82 | def extract_emoji_text(code): 83 | """ 84 | :param code: is a emoji_code string ex: :smile_face: 85 | :return: "smile face" 86 | """ 87 | if code is None or code[0] != ':' or code[-1] != ':': 88 | raise ValueError 89 | 90 | code = code[1:len(code) - 1] 91 | text = ' '.join(code.split('_')) 92 | return text 93 | 94 | def emoji_handling(self, phrase: str = None): 95 | # turn emojis into text codes 96 | phrase = emoji.demojize(phrase) 97 | 98 | regex_emoji = r":[A-Za-z0-9\-_]+:" 99 | emoji_codes = re.findall(regex_emoji, phrase) 100 | for code in emoji_codes: 101 | try: 102 | phrase = re.sub(code, self.emoji_contractions[code], phrase) 103 | except KeyError: 104 | phrase = re.sub(code, self.extract_emoji_text(code), phrase) 105 | 106 | return phrase 107 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_english.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 3 | 4 | 5 | class PreprocessingEnglish(PreprocessingBase): 6 | emoji_contractions = { 7 | ":face_with_tears_of_joy:": "hahaha", # 😂 8 | ":red_heart_selector:": "love", # ❤️ 9 | ":smiling_face_with_heart-eyes:": "loved it", # 😍 10 | ":rolling_on_the_floor_laughing:": "hahaha", # 🤣 11 | ":smiling_face_with_smiling_eyes:": "happy", # 😊 12 | ":folded_hands:": "amen", # 🙏 13 | ":two_hearts:": "affection", # 💕 14 
| ":loudly_crying_face:": "sad", # 😭 15 | ":face_blowing_a_kiss:": "kiss", # 😘 16 | ":thumbs_up:": "ok", # 👍 17 | ":grinning_face_with_sweat:": "hehehe", # 😅 18 | ":clapping_hands:": "congratulations", # 👏 19 | ":beaming_face_with_smiling_eyes:": "happy", # 😁 20 | ":heart_suit_selector:": "love", # ♥️ 21 | ":fire:": "hot", # 🔥 22 | ":broken_heart:": "hurt", # 💔 23 | ":sparkling_heart:": "affection", # 💖 24 | ":blue_heart:": "friendship", # 💙 25 | ":crying_face:": "sad", # 😢 26 | ":thinking_face:": "thinking", # 🤔 27 | ":grinning_squinting_face:": "laughs", # 😆 28 | ":face_with_rolling_eyes:": "doubt", # 🙄 29 | ":flexed_biceps:": "strong", # 💪 30 | ":winking_face:": "wink", # 😉 31 | ":smiling_face_selector:": "happy", # ☺️ 32 | ":OK_hand:": "ok", # 👌 33 | ":hugging_face:": "hug", # 🤗 34 | ":purple_heart:": "love", # 💜 35 | ":pensive_face:": "sad", # 😔 36 | ":smiling_face_with_sunglasses:": "proud", # 😎 37 | ":smiling_face_with_halo:": "saint", # 😇 38 | ":rose:": "rose", # 🌹 39 | ":person_facepalming:": "facepalm", # 🤦 40 | ":party_popper:": "party", # 🎉 41 | ":double_exclamation_mark_selector:": "exclamation", # ‼️ 42 | ":revolving_hearts:": "affection", # 💞 43 | ":victory_hand_selector:": "vitory", # ✌️ 44 | ":sparkles:": "sparkles", # ✨ 45 | ":person_shrugging:": "indiferent", # 🤷 46 | ":face_screaming_in_fear:": "fear", # 😱 47 | ":relieved_face:": "relieved", # 😌 48 | ":cherry_blossom:": "cherry blossom", # 🌸 49 | ":raising_hands:": "glad", # 🙌 50 | ":face_savoring_food:": "face_savoring_food", # 😋 51 | ":growing_heart:": "heart", # 💗 52 | ":green_heart:": "friendship", # 💚 53 | ":smirking_face:": "smirk", # 😏 54 | ":yellow_heart:": "friendship", # 💛 55 | ":slightly_smiling_face:": "smile", # 🙂 56 | ":beating_heart:": "love", # 💓 57 | ":star-struck:": "fabulous", # 🤩 58 | ":grinning_face_with_smiling_eyes:": "happy", # 😄 59 | ":grinning_face:": "happy", # 😀 60 | ":grinning_face_with_big_eyes:": "happy", # 😃 61 | ":hundred_points:": "hundred points", # 💯 62 | ":see-no-evil_monkey:": "joke", # 🙈 63 | ":backhand_index_pointing_down:": "point down", # 👇 64 | ":musical_notes:": "music", # 🎶 65 | ":unamused_face:": "unamused", # 😒 66 | ":face_with_hand_over_mouth:": "laughs", # 🤭 67 | ":heart_exclamation:": "heart", # ❣️ 68 | ":exclamation_mark:": "!", # ❗ 69 | ":winking_face_with_tongue:": "wink", # 😜 70 | ":kiss_mark:": "kiss", # 💋 71 | ":eyes:": "curious", # 👀 72 | ":sleepy_face:": "sleepy", # 😪 73 | ":expressionless_face:": "indiferent", # 😑 74 | ":collision:": "hit", # 💥 75 | ":person_raising_hand:": "raise hand", # 🙋 76 | ":disappointed_face:": "disappointed", # 😞 77 | ":weary_face:": "weary", # 😩 78 | ":pouting_face:": "furious", # 😡 79 | ":zany_face:": "zany", # 🤪 80 | ":oncoming_fist:": "oncoming fist", # 👊 81 | ":sun_selector:": "sun", # ☀️ 82 | ":sad_but_relieved_face:": "sad", # 😥 83 | ":drooling_face:": "drooling", # 🤤 84 | ":backhand_index_pointing_right:": "point right", # 👉 85 | ":woman_dancing:": "dancing", # 💃 86 | ":flushed_face:": "flushed", # 😳 87 | ":raised_hand:": "raised hand", # ✋ 88 | ":kissing_face_with_closed_eyes:": "kiss", # 😚 89 | ":squinting_face_with_tongue:": "joke", # 😝 90 | ":sleeping_face:": "sleepy", # 😴 91 | ":glowing_star:": "glow", # 🌟 92 | ":grimacing_face:": "grimacing", # 😬 93 | ":upside-down_face:": "playful", # 🙃 94 | ":four_leaf_clover:": "clover", # 🍀 95 | ":tulip:": "tulip", # 🌷 96 | ":smiling_cat_face_with_heart-eyes:": "love", # 😻 97 | ":downcast_face_with_sweat:": "disappointed", # 😓 98 | ":white_medium_star:": "star", # ⭐ 99 | 
":white_heavy_check_mark:": "check mark", # ✅ 100 | ":rainbow:": "rainbow", # 🌈 101 | ":smiling_face_with_horns:": "evil", # 😈 102 | ":sign_of_the_horns:": "metal", # 🤘 103 | ":sweat_droplets:": "droplets", # 💦 104 | ":check_mark:": "check mark", # ✔️ 105 | ":persevering_face:": "persevering", # 😣 106 | ":person_running:": "running", # 🏃 107 | ":bouquet:": "bouquet", # 💐 108 | ":frowning_face_selector:": "frowning", # ☹️ 109 | ":confetti_ball:": "confetti", # 🎊 110 | ":heart_with_arrow:": "love", # 💘 111 | ":angry_face:": "angry", # 😠 112 | ":index_pointing_up_selector:": "point up", # ☝️ 113 | ":confused_face:": "confused", # 😕 114 | ":hibiscus:": "hibiscus", # 🌺 115 | ":birthday_cake:": "birthday", # 🎂 116 | ":sunflower:": "sunflower", # 🌻 117 | ":neutral_face:": "indiferent", # 😐 118 | ":middle_finger:": "angry", # 🖕 119 | ":heart_with_ribbon:": "heart", # 💝 120 | ":speak-no-evil_monkey:": "secret", # 🙊 121 | ":cat_face_with_tears_of_joy:": "hahaha", # 😹 122 | ":speaking_head_selector:": "talk", # 🗣️ 123 | ":dizzy:": "dizzy", # 💫 124 | ":skull:": "skull", # 💀 125 | ":crown:": "crown", # 👑 126 | ":musical_note:": "music", # 🎵 127 | ":crossed_fingers:": "wishful", # 🤞 128 | ":face_with_tongue:": "joke", # 😛 129 | ":red_circle:": "red circle", # 🔴 130 | ":face_with_steam_from_nose:": "angry", # 😤 131 | ":blossom:": "blossom", # 🌼 132 | ":tired_face:": "tired", # 😫 133 | ":soccer_ball:": "ball", # ⚽ 134 | ":call_me_hand:": "cool", # 🤙 135 | ":hot_beverage:": "hot beverage", # ☕ 136 | ":trophy:": "winner", # 🏆 137 | ":orange_heart:": "heart", # 🧡 138 | ":wrapped_gift:": "gift", # 🎁 139 | ":high_voltage:": "high voltage", # ⚡ 140 | ":sun_with_face:": "sun", # 🌞 141 | ":balloon:": "balloon", # 🎈 142 | ":cross_mark:": "wrong", # ❌ 143 | ":raised_fist:": "fist", # ✊ 144 | ":waving_hand:": "goodbye", # 👋 145 | ":astonished_face:": "astonished", # 😲 146 | ":herb:": "herb", # 🌿 147 | ":shushing_face:": "shush", # 🤫 148 | ":backhand_index_pointing_left:": "point left", # 👈 149 | ":face_with_open_mouth:": "astonished", # 😮 150 | ":person_gesturing_OK:": "ok", # 🙆 151 | ":clinking_beer_mugs:": "toast", # 🍻 152 | ":dog_face:": "dog", # 🐶 153 | ":anxious_face_with_sweat:": "anxious", # 😰 154 | ":face_with_raised_eyebrow:": "doubt", # 🤨 155 | ":face_without_mouth:": "speachless", # 😶 156 | ":handshake:": "deal", # 🤝 157 | ":person_walking:": "walk", # 🚶 158 | ":money_bag:": "money", # 💰 159 | ":strawberry:": "strawberry", # 🍓 160 | ":anger_symbol:": "hit", # 💢 161 | } 162 | 163 | def __init__(self, remove_accent=True): 164 | super(PreprocessingEnglish, self).__init__(remove_accent=remove_accent) 165 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from bothub.shared.utils.preprocessing.preprocessing_english import PreprocessingEnglish 3 | from bothub.shared.utils.preprocessing.preprocessing_portuguese import PreprocessingPortuguese 4 | from bothub.shared.utils.preprocessing.preprocessing_spanish import PreprocessingSpanish 5 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class PreprocessingFactory(object): 11 | 12 | def __init__(self, language=None, remove_accent=True): 13 | self.language = language 14 | self.remove_accent = remove_accent 15 | 16 | def factory(self): 17 | """ 18 | Implements Factory Method 19 | 
:return: Preprocessing Class respective to its language 20 | """ 21 | try: 22 | if self.language == "en": 23 | return PreprocessingEnglish(self.remove_accent) 24 | elif self.language == "pt_br": 25 | return PreprocessingPortuguese(self.remove_accent) 26 | elif self.language == "es": 27 | return PreprocessingSpanish(self.remove_accent) 28 | else: 29 | return PreprocessingBase(self.remove_accent) 30 | 31 | except AssertionError as e: 32 | logger.exception(e) 33 | 34 | return None 35 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_portuguese.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 3 | 4 | 5 | class PreprocessingPortuguese(PreprocessingBase): 6 | emoji_contractions = { 7 | ":face_with_tears_of_joy:": "hahaha", # 😂 8 | ":red_heart_selector:": "amor", # ❤️ 9 | ":smiling_face_with_heart-eyes:": "amei", # 😍 10 | ":rolling_on_the_floor_laughing:": "hahaha", # 🤣 11 | ":smiling_face_with_smiling_eyes:": "feliz", # 😊 12 | ":folded_hands:": "amem", # 🙏 13 | ":two_hearts:": "carinho", # 💕 14 | ":loudly_crying_face:": "triste", # 😭 15 | ":face_blowing_a_kiss:": "beijo", # 😘 16 | ":thumbs_up:": "ok", # 👍 17 | ":grinning_face_with_sweat:": "hehehe", # 😅 18 | ":clapping_hands:": "parabens", # 👏 19 | ":beaming_face_with_smiling_eyes:": "feliz", # 😁 20 | ":heart_suit_selector:": "amor", # ♥️ 21 | ":fire:": "quente", # 🔥 22 | ":broken_heart:": "magoado", # 💔 23 | ":sparkling_heart:": "carinho", # 💖 24 | ":blue_heart:": "amigo", # 💙 25 | ":crying_face:": "triste", # 😢 26 | ":thinking_face:": "pensar", # 🤔 27 | ":grinning_squinting_face:": "risos", # 😆 28 | ":face_with_rolling_eyes:": "duvida", # 🙄 29 | ":flexed_biceps:": "forte", # 💪 30 | ":winking_face:": "piscar", # 😉 31 | ":smiling_face_selector:": "feliz", # ☺️ 32 | ":OK_hand:": "ok", # 👌 33 | ":hugging_face:": "abraco", # 🤗 34 | ":purple_heart:": "amor", # 💜 35 | ":pensive_face:": "triste", # 😔 36 | ":smiling_face_with_sunglasses:": "orgulhoso", # 😎 37 | ":smiling_face_with_halo:": "santo", # 😇 38 | ":rose:": "rosa", # 🌹 39 | ":person_facepalming:": "inacreditavel", # 🤦 40 | ":party_popper:": "festa", # 🎉 41 | ":double_exclamation_mark_selector:": "urgente", # ‼️ 42 | ":revolving_hearts:": "carinho", # 💞 43 | ":victory_hand_selector:": "vitoria", # ✌️ 44 | ":sparkles:": "brilho", # ✨ 45 | ":person_shrugging:": "indiferenca", # 🤷 46 | ":face_screaming_in_fear:": "medo", # 😱 47 | ":relieved_face:": "alivio", # 😌 48 | ":cherry_blossom:": "rosa", # 🌸 49 | ":raising_hands:": "ainda bem", # 🙌 50 | ":face_savoring_food:": "brincadeira", # 😋 51 | ":growing_heart:": "amizade", # 💗 52 | ":green_heart:": "amizade", # 💚 53 | ":smirking_face:": "flertar", # 😏 54 | ":yellow_heart:": "amizade", # 💛 55 | ":slightly_smiling_face:": "feliz", # 🙂 56 | ":beating_heart:": "amor", # 💓 57 | ":star-struck:": "fabuloso", # 🤩 58 | ":grinning_face_with_smiling_eyes:": "sorriso", # 😄 59 | ":grinning_face:": "sorriso", # 😀 60 | ":grinning_face_with_big_eyes:": "feliz", # 😃 61 | ":hundred_points:": "pontuacao maxima", # 💯 62 | ":see-no-evil_monkey:": "brincadeira", # 🙈 63 | ":backhand_index_pointing_down:": "apontar", # 👇 64 | ":musical_notes:": "musica", # 🎶 65 | ":unamused_face:": "chateado", # 😒 66 | ":face_with_hand_over_mouth:": "risada", # 🤭 67 | ":heart_exclamation:": "coracao", # ❣️ 68 | ":exclamation_mark:": "importante", # ❗ 69 | 
":winking_face_with_tongue:": "brincalhao", # 😜 70 | ":kiss_mark:": "beijo", # 💋 71 | ":eyes:": "curiosidade", # 👀 72 | ":sleepy_face:": "sono", # 😪 73 | ":expressionless_face:": "indiferente", # 😑 74 | ":collision:": "batida", # 💥 75 | ":person_raising_hand:": "atencao", # 🙋 76 | ":disappointed_face:": "desapontado", # 😞 77 | ":weary_face:": "cansado", # 😩 78 | ":pouting_face:": "furioso", # 😡 79 | ":zany_face:": "brincadeira", # 🤪 80 | ":oncoming_fist:": "firme", # 👊 81 | ":sun_selector:": "sol", # ☀️ 82 | ":sad_but_relieved_face:": "triste", # 😥 83 | ":drooling_face:": "desejo", # 🤤 84 | ":backhand_index_pointing_right:": "apontar", # 👉 85 | ":woman_dancing:": "danca", # 💃 86 | ":flushed_face:": "envergonhado", # 😳 87 | ":raised_hand:": "atencao", # ✋ 88 | ":kissing_face_with_closed_eyes:": "beijo", # 😚 89 | ":squinting_face_with_tongue:": "brincadeira", # 😝 90 | ":sleeping_face:": "sono", # 😴 91 | ":glowing_star:": "estrela", # 🌟 92 | ":grimacing_face:": "sem graca", # 😬 93 | ":upside-down_face:": "brincalhao", # 🙃 94 | ":four_leaf_clover:": "trevo", # 🍀 95 | ":tulip:": "tulipa", # 🌷 96 | ":smiling_cat_face_with_heart-eyes:": "apaixonado", # 😻 97 | ":downcast_face_with_sweat:": "desapontado", # 😓 98 | ":white_medium_star:": "estrela", # ⭐ 99 | ":white_heavy_check_mark:": "concluido", # ✅ 100 | ":rainbow:": "arco-iris", # 🌈 101 | ":smiling_face_with_horns:": "malvado", # 😈 102 | ":sign_of_the_horns:": "metal", # 🤘 103 | ":sweat_droplets:": "respingo", # 💦 104 | ":check_mark:": "concluido", # ✔️ 105 | ":persevering_face:": "exausto", # 😣 106 | ":person_running:": "corrida", # 🏃 107 | ":bouquet:": "flores", # 💐 108 | ":frowning_face_selector:": "triste", # ☹️ 109 | ":confetti_ball:": "festa", # 🎊 110 | ":heart_with_arrow:": "apaixonado", # 💘 111 | ":angry_face:": "furioso", # 😠 112 | ":index_pointing_up_selector:": "atencao", # ☝️ 113 | ":confused_face:": "confuso", # 😕 114 | ":hibiscus:": "flor", # 🌺 115 | ":birthday_cake:": "aniversario", # 🎂 116 | ":sunflower:": "girassol", # 🌻 117 | ":neutral_face:": "indiferente", # 😐 118 | ":middle_finger:": "raiva", # 🖕 119 | ":heart_with_ribbon:": "presente coracao", # 💝 120 | ":speak-no-evil_monkey:": "segredo", # 🙊 121 | ":cat_face_with_tears_of_joy:": "hahaha", # 😹 122 | ":speaking_head_selector:": "falar", # 🗣️ 123 | ":dizzy:": "tontura", # 💫 124 | ":skull:": "caveira", # 💀 125 | ":crown:": "coroa", # 👑 126 | ":musical_note:": "musica", # 🎵 127 | ":crossed_fingers:": "ansioso", # 🤞 128 | ":face_with_tongue:": "pegadinha", # 😛 129 | ":red_circle:": "circulo vermelho", # 🔴 130 | ":face_with_steam_from_nose:": "bravo", # 😤 131 | ":blossom:": "flor", # 🌼 132 | ":tired_face:": "cansado", # 😫 133 | ":soccer_ball:": "bola", # ⚽ 134 | ":call_me_hand:": "maneiro", # 🤙 135 | ":hot_beverage:": "bebida quente", # ☕ 136 | ":trophy:": "vencedor", # 🏆 137 | ":orange_heart:": "amizade", # 🧡 138 | ":wrapped_gift:": "presente", # 🎁 139 | ":high_voltage:": "eletricidade", # ⚡ 140 | ":sun_with_face:": "sol", # 🌞 141 | ":balloon:": "balao", # 🎈 142 | ":cross_mark:": "negacao", # ❌ 143 | ":raised_fist:": "punho", # ✊ 144 | ":waving_hand:": "adeus", # 👋 145 | ":astonished_face:": "perplexo", # 😲 146 | ":herb:": "planta", # 🌿 147 | ":shushing_face:": "segredo", # 🤫 148 | ":backhand_index_pointing_left:": "apontar", # 👈 149 | ":face_with_open_mouth:": "perplexo", # 😮 150 | ":person_gesturing_OK:": "ok", # 🙆 151 | ":clinking_beer_mugs:": "brinde", # 🍻 152 | ":dog_face:": "cachorro", # 🐶 153 | ":anxious_face_with_sweat:": "ansiedade", # 😰 154 | 
":face_with_raised_eyebrow:": "duvida", # 🤨 155 | ":face_without_mouth:": "mudo", # 😶 156 | ":handshake:": "acordo", # 🤝 157 | ":person_walking:": "caminhar", # 🚶 158 | ":money_bag:": "dinheiro", # 💰 159 | ":strawberry:": "morango", # 🍓 160 | ":anger_symbol:": "batida", # 💢 161 | } 162 | 163 | def __init__(self, remove_accent=True): 164 | super(PreprocessingPortuguese, self).__init__(remove_accent=remove_accent) 165 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_spanish.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 3 | 4 | 5 | class PreprocessingSpanish(PreprocessingBase): 6 | emoji_contractions = { 7 | ":face_with_tears_of_joy:": "hahaha", # 😂 8 | ":red_heart_selector:": "amor", # ❤️ 9 | ":smiling_face_with_heart-eyes:": "me gusto", # 😍 10 | ":rolling_on_the_floor_laughing:": "hahaha", # 🤣 11 | ":smiling_face_with_smiling_eyes:": "felíz", # 😊 12 | ":folded_hands:": "amén", # 🙏 13 | ":two_hearts:": "afecto", # 💕 14 | ":loudly_crying_face:": "triste", # 😭 15 | ":face_blowing_a_kiss:": "beso", # 😘 16 | ":thumbs_up:": "ok", # 👍 17 | ":grinning_face_with_sweat:": "hehehe", # 😅 18 | ":clapping_hands:": "parabens", # 👏 19 | ":beaming_face_with_smiling_eyes:": "muy felíz", # 😁 20 | ":heart_suit_selector:": "amor", # ♥️ 21 | ":fire:": "caliente", # 🔥 22 | ":broken_heart:": "lastimar", # 💔 23 | ":sparkling_heart:": "afecto", # 💖 24 | ":blue_heart:": "amigo", # 💙 25 | ":crying_face:": "triste", # 😢 26 | ":thinking_face:": "pensar", # 🤔 27 | ":grinning_squinting_face:": "se rié", # 😆 28 | ":face_with_rolling_eyes:": "duda", # 🙄 29 | ":flexed_biceps:": "fuerte", # 💪 30 | ":winking_face:": "parpadear", # 😉 31 | ":smiling_face_selector:": "felíz", # ☺️ 32 | ":OK_hand:": "ok", # 👌 33 | ":hugging_face:": "abrazo", # 🤗 34 | ":purple_heart:": "amor", # 💜 35 | ":pensive_face:": "triste", # 😔 36 | ":smiling_face_with_sunglasses:": "orgulloso", # 😎 37 | ":smiling_face_with_halo:": "santo", # 😇 38 | ":rose:": "rosa", # 🌹 39 | ":person_facepalming:": "increíble", # 🤦 40 | ":party_popper:": "fiesta", # 🎉 41 | ":double_exclamation_mark_selector:": "urgente", # ‼️ 42 | ":revolving_hearts:": "afecto", # 💞 43 | ":victory_hand_selector:": "victoria", # ✌️ 44 | ":sparkles:": "brillo", # ✨ 45 | ":person_shrugging:": "indiferencia", # 🤷 46 | ":face_screaming_in_fear:": "miedo", # 😱 47 | ":relieved_face:": "alivio", # 😌 48 | ":cherry_blossom:": "rosa", # 🌸 49 | ":raising_hands:": "menos mal", # 🙌 50 | ":face_savoring_food:": "es una broma", # 😋 51 | ":growing_heart:": "amistad", # 💗 52 | ":green_heart:": "amistad", # 💚 53 | ":smirking_face:": "flirtear", # 😏 54 | ":yellow_heart:": "amistad", # 💛 55 | ":slightly_smiling_face:": "feliz", # 🙂 56 | ":beating_heart:": "amor", # 💓 57 | ":star-struck:": "fabuloso", # 🤩 58 | ":grinning_face_with_smiling_eyes:": "sonreír", # 😄 59 | ":grinning_face:": "sonreír", # 😀 60 | ":grinning_face_with_big_eyes:": "feliz", # 😃 61 | ":hundred_points:": "puntuación máxima", # 💯 62 | ":see-no-evil_monkey:": "es una broma", # 🙈 63 | ":backhand_index_pointing_down:": "apuntar", # 👇 64 | ":musical_notes:": "musica", # 🎶 65 | ":unamused_face:": "disgustado", # 😒 66 | ":face_with_hand_over_mouth:": "la risa", # 🤭 67 | ":heart_exclamation:": "corazon", # ❣️ 68 | ":exclamation_mark:": "importante", # ❗ 69 | ":winking_face_with_tongue:": "juguetón", # 😜 70 | ":kiss_mark:": "beso", # 
💋 71 | ":eyes:": "curiosidad", # 👀 72 | ":sleepy_face:": "sueno", # 😪 73 | ":expressionless_face:": "indiferente", # 😑 74 | ":collision:": "batida", # 💥 75 | ":person_raising_hand:": "atencion", # 🙋 76 | ":disappointed_face:": "decepcionado", # 😞 77 | ":weary_face:": "cansado", # 😩 78 | ":pouting_face:": "furioso", # 😡 79 | ":zany_face:": "es una broma", # 🤪 80 | ":oncoming_fist:": "golpeo", # 👊 81 | ":sun_selector:": "sol", # ☀️ 82 | ":sad_but_relieved_face:": "triste", # 😥 83 | ":drooling_face:": "deseo", # 🤤 84 | ":backhand_index_pointing_right:": "apuntar", # 👉 85 | ":woman_dancing:": "baile", # 💃 86 | ":flushed_face:": "avergonzado", # 😳 87 | ":raised_hand:": "atencion", # ✋ 88 | ":kissing_face_with_closed_eyes:": "beso", # 😚 89 | ":squinting_face_with_tongue:": "es una broma", # 😝 90 | ":sleeping_face:": "sueno", # 😴 91 | ":glowing_star:": "estrella", # 🌟 92 | ":grimacing_face:": "desangelado", # 😬 93 | ":upside-down_face:": "bromista", # 🙃 94 | ":four_leaf_clover:": "trébol", # 🍀 95 | ":tulip:": "tulipan", # 🌷 96 | ":smiling_cat_face_with_heart-eyes:": "enamorado", # 😻 97 | ":downcast_face_with_sweat:": "decepcionado", # 😓 98 | ":white_medium_star:": "estrella", # ⭐ 99 | ":white_heavy_check_mark:": "terminado", # ✅ 100 | ":rainbow:": "arcoiris", # 🌈 101 | ":smiling_face_with_horns:": "malvado", # 😈 102 | ":sign_of_the_horns:": "metal", # 🤘 103 | ":sweat_droplets:": "churrete", # 💦 104 | ":check_mark:": "terminado", # ✔️ 105 | ":persevering_face:": "exhausto ", # 😣 106 | ":person_running:": "carrera", # 🏃 107 | ":bouquet:": "flores", # 💐 108 | ":frowning_face_selector:": "triste", # ☹️ 109 | ":confetti_ball:": "fiesta", # 🎊 110 | ":heart_with_arrow:": "enamorado", # 💘 111 | ":angry_face:": "enfurecido", # 😠 112 | ":index_pointing_up_selector:": "atencion", # ☝️ 113 | ":confused_face:": "lioso", # 😕 114 | ":hibiscus:": "flor", # 🌺 115 | ":birthday_cake:": "cumpleanos", # 🎂 116 | ":sunflower:": "girasol", # 🌻 117 | ":neutral_face:": "indiferente", # 😐 118 | ":middle_finger:": "rabia", # 🖕 119 | ":heart_with_ribbon:": "regalo corazon", # 💝 120 | ":speak-no-evil_monkey:": "secreto", # 🙊 121 | ":cat_face_with_tears_of_joy:": "hahaha", # 😹 122 | ":speaking_head_selector:": "hablar", # 🗣️ 123 | ":dizzy:": "mareo", # 💫 124 | ":skull:": "calavera", # 💀 125 | ":crown:": "corona", # 👑 126 | ":musical_note:": "musica", # 🎵 127 | ":crossed_fingers:": "ansioso", # 🤞 128 | ":face_with_tongue:": "es una broma", # 😛 129 | ":red_circle:": "circulo rojo", # 🔴 130 | ":face_with_steam_from_nose:": "bravo", # 😤 131 | ":blossom:": "flor", # 🌼 132 | ":tired_face:": "cansado", # 😫 133 | ":soccer_ball:": "pelota", # ⚽ 134 | ":call_me_hand:": "chachi", # 🤙 135 | ":hot_beverage:": "bebida caliente", # ☕ 136 | ":trophy:": "vencedor", # 🏆 137 | ":orange_heart:": "amistad", # 🧡 138 | ":wrapped_gift:": "regalo", # 🎁 139 | ":high_voltage:": "electricidad", # ⚡ 140 | ":sun_with_face:": "sol", # 🌞 141 | ":balloon:": "globo", # 🎈 142 | ":cross_mark:": "negacion", # ❌ 143 | ":raised_fist:": "puno", # ✊ 144 | ":waving_hand:": "adiós", # 👋 145 | ":astonished_face:": "perplejo", # 😲 146 | ":herb:": "planta", # 🌿 147 | ":shushing_face:": "secreto", # 🤫 148 | ":backhand_index_pointing_left:": "apuntar", # 👈 149 | ":face_with_open_mouth:": "perplejo", # 😮 150 | ":person_gesturing_OK:": "ok", # 🙆 151 | ":clinking_beer_mugs:": "brindis", # 🍻 152 | ":dog_face:": "perro", # 🐶 153 | ":anxious_face_with_sweat:": "ansiedad", # 😰 154 | ":face_with_raised_eyebrow:": "duda", # 🤨 155 | ":face_without_mouth:": "mudo", # 😶 156 | 
":handshake:": "acuerdo", # 🤝 157 | ":person_walking:": "caminar", # 🚶 158 | ":money_bag:": "dinero", # 💰 159 | ":strawberry:": "fresa", # 🍓 160 | ":anger_symbol:": "batida", # 💢 161 | } 162 | 163 | def __init__(self, remove_accent=True): 164 | super(PreprocessingSpanish, self).__init__(remove_accent=remove_accent) 165 | -------------------------------------------------------------------------------- /bothub/shared/utils/rasa_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/utils/rasa_components/__init__.py -------------------------------------------------------------------------------- /bothub/shared/utils/rasa_components/bothub_interpreter.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from rasa.nlu.model import Metadata, Interpreter 3 | from rasa.nlu.components import Component, ComponentBuilder 4 | from rasa.nlu import components 5 | from rasa.nlu.training_data import Message 6 | from typing import Any, Dict, List, Text, Optional 7 | 8 | 9 | class BothubInterpreter(Interpreter): 10 | """Use a trained pipeline of components to parse text messages.""" 11 | 12 | def __init__( 13 | self, 14 | pipeline: List[Component], 15 | context: Optional[Dict[Text, Any]], 16 | model_metadata: Optional[Metadata] = None, 17 | ) -> None: 18 | 19 | super().__init__(pipeline, context, model_metadata) 20 | 21 | @staticmethod 22 | def load( 23 | model_dir: Text, 24 | component_builder: Optional[ComponentBuilder] = None, 25 | skip_validation: bool = False, 26 | ) -> "BothubInterpreter": 27 | """Create an interpreter based on a persisted model. 28 | 29 | Args: 30 | skip_validation: If set to `True`, tries to check that all 31 | required packages for the components are installed 32 | before loading them. 33 | model_dir: The path of the model to load 34 | component_builder: The 35 | :class:`rasa.nlu.components.ComponentBuilder` to use. 36 | 37 | Returns: 38 | An interpreter that uses the loaded model. 39 | """ 40 | 41 | model_metadata = Metadata.load(model_dir) 42 | 43 | # Adapt Loader to accept new component-name (changed) with older models 44 | metadata = model_metadata.__dict__["metadata"] 45 | for i in range(len(metadata["pipeline"])): 46 | component_name = metadata["pipeline"][i]["class"] 47 | if "bothub_nlp_rasa_utils" in component_name: 48 | metadata["pipeline"][i]["class"] = component_name.replace( 49 | "bothub_nlp_rasa_utils", "bothub.shared.utils", 1 50 | ) 51 | 52 | model_metadata = Metadata(metadata, model_dir) 53 | 54 | BothubInterpreter.ensure_model_compatibility(model_metadata) 55 | return BothubInterpreter.create(model_metadata, component_builder, skip_validation) 56 | 57 | @staticmethod 58 | def create( 59 | model_metadata: Metadata, 60 | component_builder: Optional[ComponentBuilder] = None, 61 | skip_validation: bool = False, 62 | ) -> "BothubInterpreter": 63 | """Load stored model and components defined by the provided metadata.""" 64 | 65 | context = {} 66 | 67 | if component_builder is None: 68 | # If no builder is passed, every interpreter creation will result 69 | # in a new builder. hence, no components are reused. 
70 | component_builder = components.ComponentBuilder() 71 | 72 | pipeline = [] 73 | 74 | # Before instantiating the component classes, 75 | # lets check if all required packages are available 76 | if not skip_validation: 77 | components.validate_requirements(model_metadata.component_classes) 78 | 79 | for i in range(model_metadata.number_of_components): 80 | component_meta = model_metadata.for_component(i) 81 | component = component_builder.load_component( 82 | component_meta, model_metadata.model_dir, model_metadata, **context 83 | ) 84 | try: 85 | updates = component.provide_context() 86 | if updates: 87 | context.update(updates) 88 | pipeline.append(component) 89 | except components.MissingArgumentError as e: 90 | raise Exception( 91 | "Failed to initialize component '{}'. " 92 | "{}".format(component.name, e) 93 | ) 94 | 95 | return BothubInterpreter(pipeline, context, model_metadata) 96 | 97 | def parse( 98 | self, 99 | text: Text, 100 | time: Optional[datetime.datetime] = None, 101 | only_output_properties: bool = True, 102 | ) -> Dict[Text, Any]: 103 | """Parse the input text, classify it and return pipeline result. 104 | The pipeline result usually contains intent and entities.""" 105 | 106 | if not text.replace(" ", ""): 107 | # Not all components are able to handle empty strings. So we need 108 | # to prevent that... This default return will not contain all 109 | # output attributes of all components, but in the end, no one 110 | # should pass an empty string in the first place. 111 | output = self.default_output_attributes() 112 | output['intent_ranking'] = [] 113 | output["text"] = "" 114 | 115 | return output 116 | 117 | message = Message(text, self.default_output_attributes(), time=time) 118 | 119 | for component in self.pipeline: 120 | component.process(message, **self.context) 121 | 122 | output = self.default_output_attributes() 123 | output.update(message.as_dict(only_output_properties=only_output_properties)) 124 | 125 | return output 126 | -------------------------------------------------------------------------------- /bothub/shared/utils/rasa_components/registry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | # Explicitly set logging level for this module before any import 4 | # because otherwise it logs tensorflow/pytorch versions 5 | logging.getLogger("transformers.file_utils").setLevel(logging.WARNING) 6 | 7 | from transformers import ( 8 | TFBertModel, 9 | # TFOpenAIGPTModel, 10 | # TFGPT2Model, 11 | # TFXLNetModel, 12 | # TFXLMModel, 13 | # TFDistilBertModel, 14 | # TFRobertaModel, 15 | BertTokenizer, 16 | # OpenAIGPTTokenizer, 17 | # GPT2Tokenizer, 18 | # XLNetTokenizer, 19 | # XLMTokenizer, 20 | # DistilBertTokenizer, 21 | # RobertaTokenizer, 22 | ) 23 | 24 | from rasa.nlu.utils.hugging_face.transformers_pre_post_processors import ( 25 | bert_tokens_pre_processor, 26 | # gpt_tokens_pre_processor, 27 | # xlnet_tokens_pre_processor, 28 | # roberta_tokens_pre_processor, 29 | bert_embeddings_post_processor, 30 | # gpt_embeddings_post_processor, 31 | # xlnet_embeddings_post_processor, 32 | # roberta_embeddings_post_processor, 33 | bert_tokens_cleaner, 34 | # openaigpt_tokens_cleaner, 35 | # gpt2_tokens_cleaner, 36 | # xlnet_tokens_cleaner, 37 | ) 38 | 39 | language_to_model = { 40 | "en": "bert_english", 41 | "pt_br": "bert_portuguese", 42 | "multilang": "bert_multilang" 43 | } 44 | 45 | from_pt_dict = { 46 | "bert_portuguese": True 47 | } 48 | 49 | model_class_dict = { 50 | "bert_english": TFBertModel, 
51 | "bert_portuguese": TFBertModel, 52 | "bert_multilang": TFBertModel, 53 | } 54 | model_tokenizer_dict = { 55 | "bert_english": BertTokenizer, 56 | "bert_portuguese": BertTokenizer, 57 | "bert_multilang": BertTokenizer, 58 | } 59 | model_weights_defaults = { 60 | "bert_english": "bert-base-uncased", 61 | "bert_portuguese": "neuralmind/bert-base-portuguese-cased", 62 | "bert_multilang": "bert-base-multilingual-uncased" 63 | } 64 | 65 | model_special_tokens_pre_processors = { 66 | "bert_english": bert_tokens_pre_processor, 67 | "bert_portuguese": bert_tokens_pre_processor, 68 | "bert_multilang": bert_tokens_pre_processor, 69 | } 70 | 71 | model_tokens_cleaners = { 72 | "bert_english": bert_tokens_cleaner, 73 | "bert_portuguese": bert_tokens_cleaner, 74 | "bert_multilang": bert_tokens_cleaner, 75 | } 76 | 77 | model_embeddings_post_processors = { 78 | "bert_english": bert_embeddings_post_processor, 79 | "bert_portuguese": bert_embeddings_post_processor, 80 | "bert_multilang": bert_embeddings_post_processor, 81 | } 82 | 83 | model_url = { 84 | "bert_portuguese": "https://bothub-nlp-models.s3.amazonaws.com/bert/bert_portuguese.zip", 85 | "bert_english": "https://bothub-nlp-models.s3.amazonaws.com/bert/bert_english.zip", 86 | "bert_multilang": "https://bothub-nlp-models.s3.amazonaws.com/bert/bert_multilang.zip" 87 | } 88 | -------------------------------------------------------------------------------- /bothub/shared/utils/scripts/download_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to download language models on demand 3 | Usage example: 4 | !python download_models.py pt_br-BERT 5 | """ 6 | 7 | # !/usr/bin/env python 8 | import os 9 | import sys 10 | import subprocess 11 | import logging 12 | import plac 13 | import requests 14 | import zipfile 15 | 16 | from decouple import config 17 | from spacy.cli import download 18 | from spacy.cli import link 19 | from spacy.util import get_package_path 20 | 21 | sys.path.insert( 22 | 1, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) 23 | ) 24 | 25 | from bothub.shared.utils.rasa_components.registry import model_url 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | lang_to_model = { 30 | "en": {"SPACY": "en_core_web_lg", "BERT": "bert_english"}, 31 | "pt_br": { 32 | "SPACY": "pip+pt_nilc_word2vec_cbow_600:https://bothub-nlp-models.s3.amazonaws.com/pt_br-spacy/pt_nilc_word2vec_cbow_600-1.0.0.tar.gz", 33 | "SPACY_SUGGESTION": "pip+pt_nilc_wang2vec_cbow_300:https://bothub-nlp-models.s3.amazonaws.com/pt_br-spacy/pt_nilc_wang2vec_cbow_300-1.0.0.tar.gz", 34 | "BERT": "bert_portuguese", 35 | }, 36 | "es": {"SPACY": "es_core_news_md"}, 37 | "fr": {"SPACY": "fr_core_news_md"}, 38 | "ru": { 39 | "SPACY": "pip+ru_vectors_web_md:https://bothub-nlp-models.s3.amazonaws.com/ru-spacy/ru_vectors_web_md-1.1.0.tar.gz" 40 | }, 41 | "xx": {"BERT": "bert_multilang"}, 42 | } 43 | 44 | 45 | def download_file(url, file_name): 46 | with requests.get(url, stream=True) as r: 47 | r.raise_for_status() 48 | with open(file_name, "wb") as f: 49 | for chunk in r.iter_content(chunk_size=8192): 50 | f.write(chunk) 51 | return file_name 52 | 53 | 54 | def download_bert(model_name): 55 | model_dir = model_name 56 | os.makedirs(model_dir, exist_ok=True) 57 | 58 | zipped_file_name = "temp.zip" 59 | url = model_url.get(model_name) 60 | logger.info(f"downloading {model_name} . . .") 61 | download_file(url, zipped_file_name) 62 | 63 | logger.info(f"extracting {model_name} . . 
.") 64 | with zipfile.ZipFile(zipped_file_name, 'r') as zip_ref: 65 | zip_ref.extractall(model_dir) 66 | os.remove(zipped_file_name) 67 | 68 | 69 | def cast_supported_languages(languages): 70 | return languages.split("|") 71 | 72 | 73 | @plac.annotations( 74 | languages=plac.Annotation(help="Languages to download"), 75 | debug=plac.Annotation(help="Enable debug", kind="flag", abbrev="D"), 76 | ) 77 | def download_models(languages="", debug=False): 78 | logging.basicConfig( 79 | format="%(name)s - %(levelname)s - %(message)s", 80 | level=logging.DEBUG if debug else logging.INFO, 81 | ) 82 | 83 | languages = cast_supported_languages(languages) 84 | 85 | for lang in languages: 86 | lang = lang.split("-") 87 | 88 | lang_slug = lang[0] 89 | model = lang[1] if len(lang) > 1 else None 90 | 91 | if not model or model == "NONE": 92 | continue 93 | 94 | value = lang_to_model.get(lang_slug, {}).get(model, None) 95 | if model.startswith("SPACY"): 96 | if value.startswith("pip+"): 97 | model_name, pip_package = value[4:].split(":", 1) 98 | logger.debug("model name: {}".format(model_name)) 99 | logger.debug("pip package: {}".format(pip_package)) 100 | cmd = [ 101 | sys.executable, 102 | "-m", 103 | "pip", 104 | "install", 105 | "--no-deps", 106 | "--no-cache-dir", 107 | pip_package, 108 | ] 109 | logger.debug(" ".join(cmd)) 110 | if subprocess.call(cmd, env=os.environ.copy()) == 0: 111 | logger.debug("linking: {} to {}".format(model_name, lang_slug)) 112 | package_path = get_package_path(model_name) 113 | link(model_name, lang_slug, force=True, model_path=package_path) 114 | else: 115 | raise Exception("Error to download {}".format(lang_slug)) 116 | elif lang_slug != value: 117 | logger.debug("downloading {}".format(value)) 118 | download(value) 119 | logger.debug("linking: {} to {}".format(value, lang_slug)) 120 | package_path = get_package_path(value) 121 | link(value, lang_slug, force=True, model_path=package_path) 122 | else: 123 | logger.debug("downloading {}".format(value)) 124 | download(value) 125 | elif model == "BERT": 126 | download_bert(value) 127 | 128 | 129 | if __name__ == "__main__": 130 | plac.call(download_models, sys.argv[1:]) 131 | -------------------------------------------------------------------------------- /bothub/shared/utils/scripts/link_lang_spacy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import plac 5 | import importlib 6 | 7 | from pathlib import Path 8 | from spacy.util import get_package_path 9 | from spacy.compat import symlink_to 10 | 11 | 12 | @plac.annotations( 13 | lang=plac.Annotation(help="Language code"), 14 | lang_path=plac.Annotation(help="Language path"), 15 | ) 16 | def link_lang_spacy(lang, lang_path): 17 | origin_path = os.path.join(str(get_package_path("spacy").resolve()), "lang", lang) 18 | try: 19 | symlink_to(Path(origin_path), os.path.abspath(lang_path)) 20 | try: 21 | importlib.import_module("spacy.lang.{}".format(lang)) 22 | print("link created") 23 | except Exception as e: 24 | print("link not created") 25 | raise e 26 | except Exception as e: 27 | print("error to create link to {} from {}".format(lang, lang_path)) 28 | raise e 29 | 30 | 31 | if __name__ == "__main__": 32 | plac.call(link_lang_spacy, sys.argv[1:]) 33 | -------------------------------------------------------------------------------- /celery_app.py: -------------------------------------------------------------------------------- 1 | from bothub_nlp_celery.app import celery_app 2 | 3 | 
from bothub_nlp_celery.tasks import ( 4 | TASK_NLU_PARSE_TEXT, 5 | TASK_NLU_EVALUATE_UPDATE, 6 | TASK_NLU_INTENT_SENTENCE_SUGGESTION_TEXT, 7 | TASK_NLU_TRAIN_UPDATE, 8 | TASK_NLU_WORDS_DISTRIBUTION, 9 | TASK_NLU_DEBUG_PARSE_TEXT, 10 | TASK_NLU_SENTENCE_SUGGESTION_TEXT, 11 | TASK_NLU_WORD_SUGGESTION_TEXT, 12 | ) 13 | 14 | from bothub.shared.utils.backend import backend 15 | 16 | from bothub.nlu_worker.task.parse import parse_text 17 | from bothub.nlu_worker.task.debug_parse import debug_parse_text 18 | from bothub.nlu_worker.task.sentence_suggestion import sentence_suggestion_text 19 | from bothub.nlu_worker.task.word_suggestion import word_suggestion_text 20 | from bothub.nlu_worker.task.intent_sentence_suggestion import ( 21 | intent_sentence_suggestion_text, 22 | ) 23 | from bothub.nlu_worker.task.words_distribution import words_distribution_text 24 | from bothub.nlu_worker.task.evaluate import evaluate_update 25 | 26 | from bothub.shared.evaluate_crossval import evaluate_crossval_update 27 | from bothub.shared.train import train_update 28 | 29 | from bothub.nlu_worker.interpreter_manager import InterpreterManager 30 | 31 | interpreter_manager = InterpreterManager() 32 | 33 | 34 | @celery_app.task(name=TASK_NLU_PARSE_TEXT) 35 | def celery_parse_text(repository_version, repository_authorization, *args, **kwargs): 36 | return parse_text( 37 | repository_version, 38 | repository_authorization, 39 | interpreter_manager, 40 | *args, 41 | **kwargs 42 | ) 43 | 44 | 45 | @celery_app.task(name=TASK_NLU_DEBUG_PARSE_TEXT) 46 | def celery_debug_parse_text( 47 | repository_version, repository_authorization, *args, **kwargs 48 | ): 49 | return debug_parse_text( 50 | repository_version, 51 | repository_authorization, 52 | interpreter_manager, 53 | *args, 54 | **kwargs 55 | ) 56 | 57 | 58 | @celery_app.task(name=TASK_NLU_SENTENCE_SUGGESTION_TEXT) 59 | def celery_sentence_suggestion_text(*args, **kwargs): 60 | return sentence_suggestion_text(*args, **kwargs) 61 | 62 | 63 | @celery_app.task(name=TASK_NLU_INTENT_SENTENCE_SUGGESTION_TEXT) 64 | def celery_intent_sentence_suggestion_text( 65 | repository_version, repository_authorization, *args, **kwargs 66 | ): 67 | return intent_sentence_suggestion_text( 68 | repository_version, repository_authorization, *args, **kwargs 69 | ) 70 | 71 | 72 | @celery_app.task(name=TASK_NLU_WORD_SUGGESTION_TEXT) 73 | def celery_word_suggestion_text(*args, **kwargs): 74 | return word_suggestion_text(*args, **kwargs) 75 | 76 | 77 | @celery_app.task(name=TASK_NLU_TRAIN_UPDATE) 78 | def celery_train_update(repository_version, by_id, repository_authorization): 79 | backend().request_backend_save_queue_id( 80 | update_id=repository_version, 81 | repository_authorization=repository_authorization, 82 | task_id=celery_app.current_task.request.id, 83 | from_queue=1, 84 | type_processing=0, 85 | ) 86 | return train_update(repository_version, by_id, repository_authorization) 87 | 88 | 89 | @celery_app.task(name=TASK_NLU_EVALUATE_UPDATE) 90 | def celery_evaluate_update( 91 | repository_version_id, repository_version_language_id, repository_authorization, cross_validation, language 92 | ): 93 | if cross_validation: 94 | return evaluate_crossval_update( 95 | repository_version_language_id, repository_authorization, {}, language 96 | ) 97 | return evaluate_update( 98 | repository_version_id, repository_version_language_id, repository_authorization, interpreter_manager, language 99 | ) 100 | 101 | 102 | @celery_app.task(name=TASK_NLU_WORDS_DISTRIBUTION) 103 | def 
celery_words_distribution(repository_version, language, repository_authorization): 104 | return words_distribution_text( 105 | repository_version, language, repository_authorization 106 | ) 107 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Attention: 2 | # Use this docker-compose to: 3 | # - Up development environment: docker-compose up 4 | # - Build docker images: docker-compose build 5 | 6 | version: '3.6' 7 | 8 | services: 9 | bothub-nlp-nlu-worker: 10 | image: ${BOTHUB_NLP_NLU_WORKER_DOCKER_IMAGE_NAME:-ilha/bothub-nlp-nlu-worker}:${BOTHUB_NLP_NLU_WORKER_DOCKER_IMAGE_TAG:-latest} 11 | build: 12 | context: . 13 | dockerfile: nlp.Dockerfile 14 | args: 15 | DOWNLOAD_MODELS: xx-NONE 16 | depends_on: 17 | - bothub-nlp-celery-redis 18 | networks: 19 | - default 20 | environment: 21 | # bothub-nlp aws to save charts 22 | - BOTHUB_NLP_AWS_S3_BUCKET_NAME=${BOTHUB_NLP_AWS_S3_BUCKET_NAME} 23 | - BOTHUB_NLP_AWS_ACCESS_KEY_ID=${BOTHUB_NLP_AWS_ACCESS_KEY_ID} 24 | - BOTHUB_NLP_AWS_SECRET_ACCESS_KEY=${BOTHUB_NLP_AWS_SECRET_ACCESS_KEY} 25 | # bothub-nlp env vars 26 | - BOTHUB_NLP_SENTRY_CLIENT=${BOTHUB_NLP_SENTRY_CLIENT} 27 | # bothub-nlp-celery env vars 28 | - BOTHUB_NLP_CELERY_BROKER_URL=${BOTHUB_NLP_CELERY_BROKER_URL:-redis://bothub-nlp-celery-redis:6379/0} 29 | - BOTHUB_NLP_CELERY_BACKEND_URL=${BOTHUB_NLP_CELERY_BACKEND_URL:-redis://bothub-nlp-celery-redis:6379/0} 30 | - BOTHUB_ENGINE_URL=${BOTHUB_ENGINE_URL:-https://api.bothub.it} 31 | - BOTHUB_NLP_LANGUAGE_QUEUE=${BOTHUB_NLP_LANGUAGE_QUEUE:-en} 32 | bothub-ai-platform: 33 | image: ${BOTHUB_NLP_NLU_WORKER_DOCKER_IMAGE_NAME:-ilha/bothub-ai-platform}:${BOTHUB_NLP_NLU_WORKER_DOCKER_IMAGE_TAG:-latest} 34 | build: 35 | context: . 36 | dockerfile: aiplatform.Dockerfile 37 | args: 38 | DOWNLOAD_MODELS: xx-NONE 39 | networks: 40 | - default 41 | 42 | bothub-nlp-celery-redis: 43 | image: redis 44 | ports: 45 | - 6379:6379 46 | -------------------------------------------------------------------------------- /nlp.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 as base 2 | 3 | ENV WORKDIR /home/root/app 4 | ENV LC_ALL C.UTF-8 5 | ENV LANG C.UTF-8 6 | ENV PYTHON_WHEELS_PATH /wheels 7 | ENV PYTHON_BUILD_PACKAGES "software-properties-common curl" 8 | ENV PIP_REQUIREMENTS "-r requirements.txt" 9 | 10 | WORKDIR ${WORKDIR} 11 | 12 | RUN apt-get update && apt-get install --no-install-recommends -y ${PYTHON_BUILD_PACKAGES} git 13 | RUN apt-get install -y python3 python3-pip python3-venv 14 | RUN apt-get install build-essential 15 | 16 | RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections 17 | RUN apt-get install -y ttf-mscorefonts-installer \ 18 | && apt-get autoremove -y \ 19 | && apt-get clean -y \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | 23 | RUN bash -c "ln -s /usr/bin/python3 /usr/bin/python; ln -s /usr/bin/pip3 /usr/bin/pip" 24 | 25 | COPY requirements.txt . 
26 | 27 | FROM base as builder 28 | 29 | ENV BUILD_PACKAGES "build-essential" 30 | 31 | RUN apt-get update && apt-get install --no-install-recommends -y ${BUILD_PACKAGES} 32 | 33 | RUN pip install --upgrade pip 34 | 35 | RUN pip install -U pip setuptools 36 | 37 | RUN pip wheel --wheel-dir=${PYTHON_WHEELS_PATH} ${PIP_REQUIREMENTS} 38 | 39 | FROM base 40 | 41 | COPY --from=builder ${PYTHON_WHEELS_PATH} ${PYTHON_WHEELS_PATH} 42 | 43 | RUN pip install --upgrade pip 44 | 45 | RUN pip install -U pip setuptools 46 | 47 | RUN pip install --find-links=${PYTHON_WHEELS_PATH} ${PIP_REQUIREMENTS} 48 | 49 | COPY bothub ${WORKDIR}/bothub 50 | 51 | COPY start_celery.py . 52 | COPY celery_app.py . 53 | 54 | ARG DOWNLOAD_MODELS 55 | #Install torch with cuda 10.1 56 | RUN if [ "${DOWNLOAD_MODELS}" = "pt_br-BERT" ]; then \ 57 | pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html; \ 58 | fi 59 | 60 | RUN if [ ${DOWNLOAD_MODELS} ]; then \ 61 | python3.6 bothub/shared/utils/scripts/download_models.py ${DOWNLOAD_MODELS}; \ 62 | fi 63 | 64 | ENTRYPOINT [ "celery", "worker", "--autoscale", "1,1", "-O", "fair", "--workdir", ".", "-A", "celery_app", "-c", "5", "-l", "INFO", "-E", "--pool", "threads" ] 65 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | aiofiles==0.4.0 3 | aiohttp==3.6.2 4 | amqp==5.0.6 5 | appdirs==1.4.3 6 | APScheduler==3.6.3 7 | astor==0.8.1 8 | async-generator==1.10 9 | async-timeout==3.0.1 10 | attrs==19.3.0 11 | billiard==3.6.4.0 12 | black==19.3b0 13 | blis==0.2.4 14 | git+https://github.com/bothub-it/bothub-nlp-celery.git@0.1.38 15 | git+https://github.com/bothub-it/bothub-backend.git@1.0.22 16 | boto3==1.12.25 17 | botocore==1.15.25 18 | bz2file==0.98 19 | cachetools==4.0.0 20 | celery==5.1.2 21 | certifi==2019.11.28 22 | cffi==1.14.0 23 | chardet==3.0.4 24 | click==7.1.1 25 | cloudpickle==1.2.2 26 | colorclass==2.2.0 27 | coloredlogs==10.0 28 | colorhash==1.0.2 29 | ConfigArgParse==1.1 30 | contextvars==2.3 31 | cryptography==2.8 32 | cycler==0.10.0 33 | cymem==2.0.3 34 | DAWG-Python==0.7.2 35 | decorator==4.4.2 36 | dill==0.3.1.1 37 | dnspython==1.16.0 38 | docopt==0.6.2 39 | docutils==0.15.2 40 | dopamine-rl==3.0.1 41 | entrypoints==0.3 42 | fbmessenger==6.0.0 43 | flake8==3.7.9 44 | Flask==1.1.1 45 | Flask-Cors==3.0.8 46 | future==0.18.2 47 | gast==0.2.2 48 | gevent==1.4.0 49 | gin-config==0.3.0 50 | google-api-core==1.16.0 51 | google-api-python-client==1.8.0 52 | google-auth==1.11.3 53 | google-auth-httplib2==0.0.3 54 | google-auth-oauthlib==0.4.1 55 | google-pasta==0.2.0 56 | googleapis-common-protos==1.51.0 57 | greenlet==0.4.15 58 | grpcio==1.27.2 59 | gunicorn==20.0.4 60 | gym==0.17.1 61 | h11==0.8.1 62 | h2==3.2.0 63 | h5py==2.10.0 64 | hpack==3.0.0 65 | hstspreload==2020.3.17 66 | httplib2==0.17.0 67 | httptools==0.1.1 68 | httpx==0.9.3 69 | humanfriendly==8.1 70 | hyperframe==5.2.0 71 | idna==2.7 72 | idna-ssl==1.1.0 73 | imageio==2.8.0 74 | immutables==0.6 75 | importlib-metadata==1.6.0 76 | itsdangerous==1.1.0 77 | Jinja2==2.11.1 78 | jmespath==0.9.5 79 | joblib==0.14.1 80 | jsonpickle==1.3 81 | jsonschema==3.2.0 82 | kafka-python==1.4.7 83 | Keras-Applications==1.0.8 84 | Keras-Preprocessing==1.1.0 85 | kfac==0.2.0 86 | kiwisolver==1.1.0 87 | kombu==5.1.0 88 | lime==0.1.1.36 89 | Markdown==3.2.1 90 | MarkupSafe==1.1.1 91 | matplotlib==3.1.2 92 | 
mattermostwrapper==2.2 93 | mccabe==0.6.1 94 | mesh-tensorflow==0.1.12 95 | mpmath==1.1.0 96 | multidict==4.7.5 97 | murmurhash==1.0.2 98 | networkx==2.4 99 | nltk==3.4.5 100 | numpy==1.18.1 101 | oauth2client==4.1.3 102 | oauthlib==3.1.0 103 | opencv-python==4.2.0.32 104 | opt-einsum==3.2.1 105 | packaging==20.0 106 | pika==1.1.0 107 | Pillow==7.0.0 108 | plac==0.9.6 109 | preshed==2.0.1 110 | promise==2.3 111 | prompt-toolkit==2.0.10 112 | protobuf==3.11.3 113 | psycopg2-binary==2.8.5 114 | pyasn1==0.4.8 115 | pyasn1-modules==0.2.8 116 | pycodestyle==2.5.0 117 | pycparser==2.20 118 | pydot==1.4.1 119 | pyflakes==2.1.1 120 | pyglet==1.5.0 121 | PyJWT==1.7.1 122 | pykwalify==1.7.0 123 | pymongo==3.8.0 124 | pymorphy2==0.8 125 | pymorphy2-dicts==2.4.393442.3710985 126 | pyparsing==2.4.6 127 | pypng==0.0.20 128 | pyrsistent==0.16.0 129 | PySocks==1.7.1 130 | pythainlp==2.1.4 131 | python-crfsuite==0.9.7 132 | python-dateutil==2.8.1 133 | python-decouple==3.3 134 | python-engineio==3.11.2 135 | python-socketio==4.4.0 136 | python-telegram-bot==11.1.0 137 | pytz==2019.3 138 | PyWavelets==1.1.1 139 | PyYAML==5.3.1 140 | questionary==1.5.1 141 | redis==3.4.1 142 | requests==2.23.0 143 | requests-oauthlib==1.3.0 144 | requests-toolbelt==0.9.1 145 | rfc3986==1.3.2 146 | rocketchat-API==0.6.36 147 | rsa==4.0 148 | ruamel.yaml==0.16.10 149 | ruamel.yaml.clib==0.2.0 150 | s3transfer==0.3.3 151 | sanic==19.12.2 152 | Sanic-Cors==0.10.0.post3 153 | sanic-jwt==1.3.2 154 | Sanic-Plugins-Framework==0.9.2 155 | scikit-image==0.16.2 156 | scikit-learn==0.22.2.post1 157 | scipy==1.4.1 158 | sentry-sdk==0.13.2 159 | six==1.14.0 160 | sklearn-crfsuite==0.3.6 161 | slackclient==2.5.0 162 | sniffio==1.1.0 163 | spacy==2.1.9 164 | SQLAlchemy==1.3.15 165 | srsly==1.0.2 166 | sympy==1.5.1 167 | tabulate==0.8.6 168 | tensor2tensor==1.14.1 169 | tensorboard==2.1.1 170 | tensorflow==2.1.2 171 | tensorflow-addons==0.7.1 172 | tensorflow-datasets==2.1.0 173 | tensorflow-estimator==2.1.0 174 | tensorflow-gan==2.0.0 175 | tensorflow-hub==0.7.0 176 | tensorflow-metadata==0.21.1 177 | tensorflow-probability==0.7.0 178 | termcolor==1.1.0 179 | terminaltables==3.1.0 180 | thinc==7.0.8 181 | tinydb==3.15.2 182 | toml==0.10.0 183 | tqdm==4.31.1 184 | twilio==6.26.3 185 | typeguard==2.7.1 186 | typing-extensions==3.7.4.1 187 | tzlocal==2.0.0 188 | ujson==1.35 189 | Unidecode==1.1.1 190 | uritemplate==3.0.1 191 | urllib3==1.24.3 192 | uvloop==0.14.0 193 | vine==5.0.0 194 | wasabi==0.6.0 195 | wcwidth==0.1.8 196 | webexteamssdk==1.1.1 197 | websocket-client==0.54.0 198 | websockets==8.1 199 | Werkzeug==1.0.0 200 | wrapt==1.12.1 201 | yarl==1.4.2 202 | zipp==3.1.0 203 | rasa==1.10.6 204 | transformers==2.11.0 205 | emoji==0.6.0 206 | recognizers-text-suite -------------------------------------------------------------------------------- /start_celery.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from bothub_nlp_celery.actions import queue_name 3 | from bothub_nlp_celery import settings 4 | 5 | if settings.BOTHUB_LANGUAGE_MODEL: 6 | queue = queue_name(settings.BOTHUB_NLP_LANGUAGE_QUEUE, model_name=settings.BOTHUB_LANGUAGE_MODEL,) 7 | else: 8 | queue = settings.BOTHUB_NLP_LANGUAGE_QUEUE 9 | 10 | 11 | subprocess.run( 12 | [ 13 | "celery", 14 | "-A", 15 | "celery_app", 16 | "worker", 17 | "-O", 18 | "fair", 19 | "-c", 20 | "1", 21 | "-l", 22 | "INFO", 23 | "-E", 24 | "--pool", 25 | "gevent", 26 | "-Q", 27 | queue, 28 | ] 29 | ) 30 | 
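For reference, a minimal sketch of the queue-selection logic that start_celery.py delegates to bothub_nlp_celery: the real queue_name lives in bothub_nlp_celery.actions, so the colon-joined format below is only an illustrative assumption, not the package's actual behaviour.

import os

def queue_name(language_queue, model_name=None):
    # Hypothetical stand-in for bothub_nlp_celery.actions.queue_name;
    # the real separator/format is defined in that package.
    return f"{language_queue}:{model_name}" if model_name else language_queue

# Mirrors start_celery.py: each worker consumes a single language/model-specific queue,
# which is how the per-deployment images (e.g. deployment-bert-ptbr.json) stay isolated.
language_queue = os.environ.get("BOTHUB_NLP_LANGUAGE_QUEUE", "en")
language_model = os.environ.get("BOTHUB_LANGUAGE_MODEL")  # e.g. "BERT", "SPACY" or unset

queue = queue_name(language_queue, model_name=language_model) if language_model else language_queue
print(queue)  # e.g. "pt_br:BERT" under the assumption above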
-------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ### Testing 2 | - test_train.py is configured to test en-BERT by default 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/tests/__init__.py -------------------------------------------------------------------------------- /tests/example_bert_pt_br.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/tests/example_bert_pt_br.tar.gz -------------------------------------------------------------------------------- /tests/example_generic_language.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/tests/example_generic_language.tar.gz -------------------------------------------------------------------------------- /tests/shared/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/tests/shared/__init__.py -------------------------------------------------------------------------------- /tests/shared/test_pipeline_builder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | import sys 5 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) 6 | 7 | from bothub.shared.utils.pipeline_builder import PipelineBuilder 8 | from rasa.nlu.registry import class_from_module_path 9 | 10 | 11 | class TestPipelineBuilder(unittest.TestCase): 12 | def setUp(self, *args): 13 | self.update = { 14 | 'language': 'en', 15 | 'repository_version': 47, 16 | 'repository_uuid': '1d8e0d6f-1941-42a3-84c5-788706c7072e', 17 | 'intent': [4, 5], 18 | 'algorithm': 'transformer_network_diet_bert', 19 | 'use_name_entities': False, 20 | 'use_competing_intents': False, 21 | 'use_analyze_char': False, 22 | 'total_training_end': 0, 23 | 'dataset_size': 15000, 24 | } 25 | 26 | self.pipeline_builder = PipelineBuilder(self.update) 27 | 28 | list_dir = os.listdir() 29 | while 'bert_english' not in list_dir: 30 | os.chdir("../") 31 | list_dir = os.listdir() 32 | 33 | def test__add_spacy_nlp(self): 34 | component_name = self.pipeline_builder._add_spacy_nlp().get('name') 35 | if '.' in component_name: 36 | class_from_module_path(component_name) 37 | 38 | def test__add_whitespace_tokenizer(self): 39 | component_name = self.pipeline_builder._add_whitespace_tokenizer().get('name') 40 | if '.' in component_name: 41 | class_from_module_path(component_name) 42 | 43 | def test__add_preprocessing(self): 44 | component_name = self.pipeline_builder._add_preprocessing().get('name') 45 | if '.' in component_name: 46 | class_from_module_path(component_name) 47 | 48 | def test__add_regex_entity_extractor(self): 49 | component_name = self.pipeline_builder._add_regex_entity_extractor().get('name') 50 | if '.' 
in component_name: 51 | class_from_module_path(component_name) 52 | 53 | def test__add_countvectors_featurizer(self): 54 | components_list = self.pipeline_builder._add_countvectors_featurizer() 55 | for component in components_list: 56 | component_name = component.get('name') 57 | if '.' in component_name: 58 | class_from_module_path(component_name) 59 | 60 | def test__add_legacy_countvectors_featurizer(self): 61 | component_name = self.pipeline_builder._add_legacy_countvectors_featurizer().get('name') 62 | if '.' in component_name: 63 | class_from_module_path(component_name) 64 | 65 | def test__add_microsoft_entity_extractor(self): 66 | component_name = self.pipeline_builder._add_microsoft_entity_extractor().get('name') 67 | if '.' in component_name: 68 | class_from_module_path(component_name) 69 | 70 | def test__add_embedding_intent_classifier(self): 71 | component_name = self.pipeline_builder._add_embedding_intent_classifier().get('name') 72 | if '.' in component_name: 73 | class_from_module_path(component_name) 74 | 75 | def test__add_diet_classifier(self): 76 | component_name = self.pipeline_builder._add_diet_classifier().get('name') 77 | if '.' in component_name: 78 | class_from_module_path(component_name) 79 | 80 | def test__legacy_internal_config(self): 81 | components_list = self.pipeline_builder._legacy_internal_config() 82 | for component in components_list: 83 | component_name = component.get('name') 84 | if '.' in component_name: 85 | class_from_module_path(component_name) 86 | 87 | def test__legacy_external_config(self): 88 | components_list = self.pipeline_builder._legacy_external_config() 89 | for component in components_list: 90 | component_name = component.get('name') 91 | if '.' in component_name: 92 | class_from_module_path(component_name) 93 | 94 | def test__transformer_network_diet_config(self): 95 | components_list = self.pipeline_builder._transformer_network_diet_config() 96 | for component in components_list: 97 | component_name = component.get('name') 98 | if '.' in component_name: 99 | class_from_module_path(component_name) 100 | 101 | def test__transformer_network_diet_word_embedding_config(self): 102 | components_list = self.pipeline_builder._transformer_network_diet_word_embedding_config() 103 | for component in components_list: 104 | component_name = component.get('name') 105 | if '.' in component_name: 106 | class_from_module_path(component_name) 107 | 108 | def test__transformer_network_diet_bert_config(self): 109 | components_list = self.pipeline_builder._transformer_network_diet_bert_config() 110 | for component in components_list: 111 | component_name = component.get('name') 112 | if '.' 
in component_name: 113 | class_from_module_path(component_name) 114 | 115 | def test_unexisting_model_language(self): 116 | update = { 117 | 'language': 'unexisting', 118 | 'algorithm': 'neural_network_external', 119 | 'use_name_entities': False, 120 | 'dataset_size': 15000, 121 | } 122 | pipeline_builder = PipelineBuilder(update) 123 | self.assertEqual(pipeline_builder.model, None) 124 | 125 | update['algorithm'] = 'transformer_network_diet' 126 | pipeline_builder = PipelineBuilder(update) 127 | self.assertEqual(pipeline_builder.model, None) 128 | 129 | update['algorithm'] = 'neural_network_internal' 130 | pipeline_builder = PipelineBuilder(update) 131 | self.assertEqual(pipeline_builder.model, None) 132 | 133 | update = { 134 | 'language': 'en', 135 | 'algorithm': 'transformer_network_diet_bert', 136 | 'use_name_entities': True, 137 | 'dataset_size': 15000, 138 | } 139 | pipeline_builder = PipelineBuilder(update) 140 | self.assertEqual(pipeline_builder.model, 'BERT') 141 | 142 | def test__dynamic_epochs(self): 143 | self.update["dataset_size"] = 10000 144 | self.pipeline_builder = PipelineBuilder(self.update) 145 | result_epochs = self.pipeline_builder._calculate_epochs_number( 146 | 100, 147 | self.pipeline_builder._epoch_factor_function1 148 | ) 149 | self.assertEqual(result_epochs, 100) 150 | 151 | self.update["dataset_size"] = 15000 152 | self.pipeline_builder = PipelineBuilder(self.update) 153 | result_epochs = self.pipeline_builder._calculate_epochs_number( 154 | 100, 155 | self.pipeline_builder._epoch_factor_function1 156 | ) 157 | self.assertLess(result_epochs, 100) 158 | self.assertGreater(result_epochs, 0) 159 | 160 | self.update["dataset_size"] = 0 161 | self.pipeline_builder = PipelineBuilder(self.update) 162 | result_epochs = self.pipeline_builder._calculate_epochs_number( 163 | 100, 164 | self.pipeline_builder._epoch_factor_function1 165 | ) 166 | self.assertEqual(result_epochs, 100) 167 | 168 | -------------------------------------------------------------------------------- /tests/shared/test_preprocesing.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import emoji 4 | 5 | import sys 6 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) 7 | 8 | from bothub.shared.utils.preprocessing.preprocessing_factory import PreprocessingFactory 9 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 10 | from bothub.shared.utils.preprocessing.preprocessing_english import PreprocessingEnglish 11 | from bothub.shared.utils.preprocessing.preprocessing_portuguese import PreprocessingPortuguese 12 | from bothub.shared.utils.preprocessing.preprocessing_spanish import PreprocessingSpanish 13 | from rasa.nlu.training_data import Message 14 | 15 | 16 | class TestPreprocessing(unittest.TestCase): 17 | def setUp(self, *args): 18 | self.base = PreprocessingFactory().factory() 19 | self.portuguese = PreprocessingFactory('pt_br').factory() 20 | self.english = PreprocessingFactory('en').factory() 21 | self.spanish = PreprocessingFactory('es').factory() 22 | 23 | def test__factory(self): 24 | base = PreprocessingFactory().factory() 25 | self.assertIsInstance(base, PreprocessingBase) 26 | base = PreprocessingFactory('unexisting_language').factory() 27 | self.assertIsInstance(base, PreprocessingBase) 28 | portuguese = PreprocessingFactory('pt_br').factory() 29 | self.assertIsInstance(portuguese, PreprocessingPortuguese) 30 | english = 
PreprocessingFactory('en').factory() 31 | self.assertIsInstance(english, PreprocessingEnglish) 32 | spanish = PreprocessingFactory('es').factory() 33 | self.assertIsInstance(spanish, PreprocessingSpanish) 34 | 35 | def test__default_preprocessing(self): 36 | phrase = "i'`m GOING não tô é the gym" 37 | expected = "im going nao to e the gym" 38 | self.assertEqual(self.base.default_preprocessing(phrase), (expected, None)) 39 | self.assertEqual(self.portuguese.default_preprocessing(phrase), (expected, None)) 40 | self.assertEqual(self.english.default_preprocessing(phrase), (expected, None)) 41 | self.assertEqual(self.spanish.default_preprocessing(phrase), (expected, None)) 42 | 43 | self.assertRaises(ValueError, self.base.default_preprocessing, None) 44 | 45 | phrase = "i'`m GOING não tô é the 'gym" 46 | expected = "im going nao to e the gym" 47 | entities = [ 48 | { 49 | "start": 0, 50 | "end": 4, 51 | "value": "i'`m", 52 | "entity": "me" 53 | }, 54 | { 55 | "start": 24, 56 | "end": 28, 57 | "value": "'gym", 58 | "entity": "gym" 59 | }, 60 | ] 61 | expected_entities = [ 62 | { 63 | "start": 0, 64 | "end": 2, 65 | "value": "im", 66 | "entity": "me" 67 | }, 68 | { 69 | "start": 22, 70 | "end": 25, 71 | "value": "gym", 72 | "entity": "gym" 73 | }, 74 | ] 75 | self.assertEqual( 76 | self.base.default_preprocessing(phrase, entities), 77 | (expected, expected_entities) 78 | ) 79 | self.assertEqual( 80 | self.base.default_preprocessing(phrase, None), 81 | (expected, None) 82 | ) 83 | 84 | def test__extract_emoji_text(self): 85 | emoji_code = ':smile_face:' 86 | emoji_text = 'smile face' 87 | self.assertEqual(self.base.extract_emoji_text(emoji_code), emoji_text) 88 | self.assertRaises(ValueError, self.base.extract_emoji_text, None) 89 | self.assertRaises(ValueError, self.base.extract_emoji_text, 'not a emoji code') 90 | 91 | def test__emoji_handling(self): 92 | self.assertEqual(self.base.emoji_handling('😂'), "face with tears of joy") 93 | self.assertEqual(self.base.emoji_handling(''), '') 94 | 95 | for emoji_code in self.portuguese.emoji_contractions.keys(): 96 | # transform code to emoji 97 | emj = emoji.emojize(emoji_code) 98 | 99 | self.assertEqual(self.portuguese.emoji_handling(emj), self.portuguese.emoji_contractions[emoji_code]) 100 | self.assertEqual(self.english.emoji_handling(emj), self.english.emoji_contractions[emoji_code]) 101 | self.assertEqual(self.spanish.emoji_handling(emj), self.spanish.emoji_contractions[emoji_code]) 102 | 103 | def test__parse_preprocess(self): 104 | 105 | phrase = "i'`m GOING não tô é the gym 😂" 106 | 107 | self.assertEqual(self.base.preprocess(Message(text=phrase)).text, "im going nao to e the gym face with tears of joy") 108 | self.assertEqual(self.portuguese.preprocess(Message(text=phrase)).text, "im going nao to e the gym hahaha") 109 | self.assertEqual(self.english.preprocess(Message(text=phrase)).text, "im going nao to e the gym hahaha") 110 | self.assertEqual(self.spanish.preprocess(Message(text=phrase)).text, "im going nao to e the gym hahaha") 111 | 112 | pp = PreprocessingFactory(remove_accent=False).factory() 113 | self.assertEqual(pp.preprocess(Message(text=phrase)).text, "im going não tô é the gym face with tears of joy") 114 | pp = PreprocessingFactory('pt_br', remove_accent=False).factory() 115 | self.assertEqual(pp.preprocess(Message(text=phrase)).text, "im going não tô é the gym hahaha") 116 | pp = PreprocessingFactory('en', remove_accent=False).factory() 117 | self.assertEqual(pp.preprocess(Message(text=phrase)).text, "im going não tô é the gym 
hahaha") 118 | pp = PreprocessingFactory('es', remove_accent=False).factory() 119 | self.assertEqual(pp.preprocess(Message(text=phrase)).text, "im going não tô é the gym hahaha") 120 | 121 | def test__training_preprocess(self): 122 | preprocessors = [ 123 | PreprocessingFactory(remove_accent=False).factory(), 124 | PreprocessingFactory('pt_br', remove_accent=False).factory(), 125 | PreprocessingFactory('en', remove_accent=False).factory(), 126 | PreprocessingFactory('es', remove_accent=False).factory() 127 | ] 128 | for preprocessor in preprocessors: 129 | phrase = "i'`m GOING não tô é the 'gym" 130 | expected_phrase = "im going não tô é the gym" 131 | entities = [ 132 | { 133 | "start": 0, 134 | "end": 4, 135 | "value": "i'`m", 136 | "entity": "me" 137 | }, 138 | { 139 | "start": 24, 140 | "end": 28, 141 | "value": "'gym", 142 | "entity": "gym" 143 | }, 144 | ] 145 | expected_entities = [ 146 | { 147 | "start": 0, 148 | "end": 2, 149 | "value": "im", 150 | "entity": "me" 151 | }, 152 | { 153 | "start": 22, 154 | "end": 25, 155 | "value": "gym", 156 | "entity": "gym" 157 | }, 158 | ] 159 | message = Message.build( 160 | text=phrase, 161 | intent='test', 162 | entities=entities, 163 | ) 164 | 165 | self.assertEqual( 166 | preprocessor.preprocess(message).text, 167 | expected_phrase 168 | ) 169 | self.assertEqual( 170 | preprocessor.preprocess(message).data.get('entities'), 171 | expected_entities 172 | ) 173 | 174 | message = Message.build( 175 | text=phrase, 176 | intent='test', 177 | entities=None, 178 | ) 179 | self.assertEqual( 180 | preprocessor.preprocess(message).text, 181 | expected_phrase 182 | ) 183 | with self.assertRaises(KeyError): 184 | _ = preprocessor.preprocess(message).data['entities'] 185 | 186 | def test_example(self): 187 | example = { 188 | "text": "The new coronavirus doesn\u2019t affect young people.", 189 | "intent": "myth", 190 | "entities": [ 191 | { 192 | "start": 8, 193 | "end": 19, 194 | "value": "coronavirus", 195 | "entity": "coronavirus" 196 | }, 197 | { 198 | "start": 35, 199 | "end": 40, 200 | "value": "young", 201 | "entity": "young" 202 | } 203 | ] 204 | } 205 | message = Message.build( 206 | text=example['text'], 207 | intent=example['intent'], 208 | entities=example['entities'], 209 | ) 210 | 211 | result = PreprocessingFactory('en', remove_accent=False).factory().preprocess(message) 212 | result2 = PreprocessingFactory('en', remove_accent=False).factory().preprocess(Message(text=example['text'])) 213 | 214 | self.assertEqual(result.text, result2.text) 215 | 216 | def test__preprocess_text(self): 217 | phrase = "i'`m GOING não tô é the gym" 218 | expected = "im going nao to e the gym" 219 | self.assertEqual(self.base.preprocess_text(phrase), expected) 220 | self.assertEqual(self.portuguese.preprocess_text(phrase), expected) 221 | self.assertEqual(self.english.preprocess_text(phrase), expected) 222 | self.assertEqual(self.spanish.preprocess_text(phrase), expected) 223 | -------------------------------------------------------------------------------- /tests/test_debug_parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import unittest 3 | import uuid 4 | import base64 5 | import os 6 | from unittest.mock import patch 7 | 8 | import sys 9 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 10 | 11 | from bothub.nlu_worker.task.debug_parse import debug_parse_text 12 | from bothub.nlu_worker.interpreter_manager import InterpreterManager 13 | 14 | 15 | class 
TestDebugParseTask(unittest.TestCase): 16 | def setUp(self, *args): 17 | self.repository_authorization = uuid.uuid4() 18 | self.current_update = { 19 | "ready_for_train": True, 20 | "current_version_id": 6647, 21 | "repository_authorization_user_id": 303, 22 | } 23 | self.interpreter_manager = InterpreterManager() 24 | 25 | # change directory to /tests 26 | try: 27 | os.chdir("tests") 28 | except Exception: 29 | pass 30 | 31 | @patch( 32 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 33 | return_value={ 34 | "version_id": 49, 35 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 36 | "total_training_end": 3, 37 | "language": "en", 38 | "bot_data": base64.b64encode( 39 | open("example_generic_language.tar.gz", "rb").read() 40 | ), 41 | }, 42 | ) 43 | @patch( 44 | "bothub_backend.bothub.BothubBackend.request_backend_info", 45 | return_value={"intents": ["affirmative", "negative", "doubt", "bias"]}, 46 | ) 47 | def test_debug_parse_without_rasa_format(self, *args): 48 | result = debug_parse_text( 49 | self.current_update.get("current_version_id"), 50 | self.repository_authorization, 51 | self.interpreter_manager, 52 | "ok", 53 | ) 54 | print(json.dumps(result, indent=2)) 55 | 56 | @patch( 57 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 58 | return_value={ 59 | "version_id": 49, 60 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 61 | "total_training_end": 3, 62 | "language": "en", 63 | "bot_data": base64.b64encode( 64 | open("example_generic_language.tar.gz", "rb").read() 65 | ), 66 | }, 67 | ) 68 | @patch( 69 | "bothub_backend.bothub.BothubBackend.request_backend_info", 70 | return_value={"intents": ["affirmative", "negative", "doubt", "bias"]}, 71 | ) 72 | def test_debug_parse_with_rasa_format(self, *args): 73 | 74 | result = debug_parse_text( 75 | self.current_update.get("current_version_id"), 76 | self.repository_authorization, 77 | self.interpreter_manager, 78 | "ok", 79 | True, 80 | ) 81 | print(json.dumps(result, indent=2)) 82 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import uuid 3 | import base64 4 | import os 5 | from unittest.mock import patch 6 | 7 | import sys 8 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | from bothub.nlu_worker.task.evaluate import evaluate_update 11 | from bothub.nlu_worker.interpreter_manager import InterpreterManager 12 | 13 | 14 | class TestEvaluateTask(unittest.TestCase): 15 | def setUp(self, *args): 16 | self.repository_authorization = uuid.uuid4() 17 | self.repository_version = 1 18 | self.current_update = { 19 | "ready_for_train": True, 20 | "repository_version": 2, 21 | "current_version_id": 6647, 22 | "repository_authorization_user_id": 303, 23 | } 24 | self.language = "pt_br" 25 | self.interpreter_manager = InterpreterManager() 26 | 27 | # change directory to /tests 28 | try: 29 | os.chdir("tests") 30 | except Exception: 31 | pass 32 | 33 | @patch( 34 | "bothub_backend.bothub.BothubBackend.request_backend_start_evaluation", 35 | return_value=[ 36 | {"text": "nops", "intent": "negative", "entities": []}, 37 | {"text": "nope", "intent": "negative", "entities": []}, 38 | {"text": "nem", "intent": "negative", "entities": []}, 39 | {"text": "no", "intent": "negative", "entities": []}, 40 | {"text": "nn", "intent": "negative", "entities": []}, 41 | {"text": 
"n", "intent": "negative", "entities": []}, 42 | {"text": "ja namorei", "intent": "affirmative", "entities": []}, 43 | {"text": "já namorei", "intent": "affirmative", "entities": []}, 44 | {"text": "simn", "intent": "affirmative", "entities": []}, 45 | { 46 | "text": "aceito sim, muito obrigado", 47 | "intent": "affirmative", 48 | "entities": [], 49 | }, 50 | {"text": "sim, quero o documento", "intent": "affirmative", "entities": []}, 51 | {"text": "não posso fazer isso", "intent": "negative", "entities": []}, 52 | {"text": "não gostei", "intent": "negative", "entities": []}, 53 | {"text": "deixa para lá", "intent": "negative", "entities": []}, 54 | {"text": "não inventa história", "intent": "negative", "entities": []}, 55 | { 56 | "text": "não queria ter que dizer isso", 57 | "intent": "negative", 58 | "entities": [], 59 | }, 60 | {"text": "não gostei daquele dia", "intent": "negative", "entities": []}, 61 | { 62 | "text": "nem deve ser tão bom assim", 63 | "intent": "negative", 64 | "entities": [], 65 | }, 66 | {"text": "não aceito", "intent": "negative", "entities": []}, 67 | {"text": "nop deixa de onda", "intent": "negative", "entities": []}, 68 | {"text": "melhor nao falar nada", "intent": "negative", "entities": []}, 69 | {"text": "n gosto disso", "intent": "negative", "entities": []}, 70 | {"text": "para com isso, não pode", "intent": "negative", "entities": []}, 71 | {"text": "melhor não", "intent": "negative", "entities": []}, 72 | {"text": "quero mais não", "intent": "negative", "entities": []}, 73 | {"text": "negativo cara", "intent": "negative", "entities": []}, 74 | {"text": "vamo não", "intent": "negative", "entities": []}, 75 | {"text": "vou nem mentir", "intent": "negative", "entities": []}, 76 | {"text": "nem queria dizer isso", "intent": "negative", "entities": []}, 77 | {"text": "funcionou não", "intent": "negative", "entities": []}, 78 | {"text": "nem rola", "intent": "negative", "entities": []}, 79 | {"text": "não posso", "intent": "negative", "entities": []}, 80 | {"text": "não quero", "intent": "negative", "entities": []}, 81 | {"text": "conta comigo", "intent": "affirmative", "entities": []}, 82 | {"text": "sim, preciso de ajuda", "intent": "affirmative", "entities": []}, 83 | {"text": "é, você está certo sim", "intent": "affirmative", "entities": []}, 84 | {"text": "muito bom, aceito", "intent": "affirmative", "entities": []}, 85 | {"text": "sim, gostei disso", "intent": "affirmative", "entities": []}, 86 | {"text": "conte comigo sempre", "intent": "affirmative", "entities": []}, 87 | {"text": "afirmativo", "intent": "affirmative", "entities": []}, 88 | {"text": "ótima ideia, concordo", "intent": "affirmative", "entities": []}, 89 | {"text": "podemos marcar sim", "intent": "affirmative", "entities": []}, 90 | {"text": "quero sim", "intent": "affirmative", "entities": []}, 91 | {"text": "pode contar comigo", "intent": "affirmative", "entities": []}, 92 | { 93 | "text": "posso sim! 
me confirma a data", 94 | "intent": "affirmative", 95 | "entities": [], 96 | }, 97 | { 98 | "text": "claro que estou disponivel", 99 | "intent": "affirmative", 100 | "entities": [], 101 | }, 102 | {"text": "ótima ideia", "intent": "affirmative", "entities": []}, 103 | { 104 | "text": "seria legal se fossemos", 105 | "intent": "affirmative", 106 | "entities": [], 107 | }, 108 | {"text": "que legal, gosto sim", "intent": "affirmative", "entities": []}, 109 | {"text": "é possivel", "intent": "affirmative", "entities": []}, 110 | {"text": "pode me mandar sim", "intent": "affirmative", "entities": []}, 111 | {"text": "aceito", "intent": "affirmative", "entities": []}, 112 | {"text": "dá sim", "intent": "affirmative", "entities": []}, 113 | { 114 | "text": "adorei a ideia vamos sim", 115 | "intent": "affirmative", 116 | "entities": [], 117 | }, 118 | {"text": "quero", "intent": "affirmative", "entities": []}, 119 | {"text": "vamos sim", "intent": "affirmative", "entities": []}, 120 | {"text": "claro", "intent": "affirmative", "entities": []}, 121 | {"text": "com certeza", "intent": "affirmative", "entities": []}, 122 | {"text": "estou", "intent": "affirmative", "entities": []}, 123 | {"text": "consigu", "intent": "affirmative", "entities": []}, 124 | {"text": "consigo", "intent": "affirmative", "entities": []}, 125 | {"text": "não tenho", "intent": "negative", "entities": []}, 126 | {"text": "nem tenho", "intent": "negative", "entities": []}, 127 | {"text": "pior que não tenho", "intent": "negative", "entities": []}, 128 | {"text": "não tenho email", "intent": "negative", "entities": []}, 129 | {"text": "voces fazem coroa dentaria ?", "intent": "bias", "entities": []}, 130 | {"text": "o plano inclui ceromero?", "intent": "bias", "entities": []}, 131 | {"text": "e buco maxilar facial?", "intent": "bias", "entities": []}, 132 | {"text": "varias vezes", "intent": "affirmative", "entities": []}, 133 | {"text": "um pouco", "intent": "affirmative", "entities": []}, 134 | {"text": "acho que faço isso", "intent": "doubt", "entities": []}, 135 | {"text": "quero sim", "intent": "affirmative", "entities": []}, 136 | {"text": "Não estou bem hoje", "intent": "negative", "entities": []}, 137 | {"text": "não quero mais isso", "intent": "negative", "entities": []}, 138 | {"text": "não estou namorando", "intent": "negative", "entities": []}, 139 | {"text": "a ta sei", "intent": "affirmative", "entities": []}, 140 | {"text": "Nunca namorei", "intent": "negative", "entities": []}, 141 | { 142 | "text": "não, como faço para reconhecer?", 143 | "intent": "negative", 144 | "entities": [], 145 | }, 146 | {"text": "mais ou menos, pq?", "intent": "doubt", "entities": []}, 147 | { 148 | "text": "já mas não foi muito bom", 149 | "intent": "affirmative", 150 | "entities": [], 151 | }, 152 | {"text": "tudo ótimo", "intent": "affirmative", "entities": []}, 153 | {"text": "tudo otimo", "intent": "affirmative", "entities": []}, 154 | {"text": "tudo", "intent": "affirmative", "entities": []}, 155 | {"text": "tudo bem", "intent": "affirmative", "entities": []}, 156 | {"text": "eu estou bem", "intent": "affirmative", "entities": []}, 157 | {"text": "eu estou bem", "intent": "affirmative", "entities": []}, 158 | {"text": "tudo uma merda", "intent": "negative", "entities": []}, 159 | {"text": "tudo horrivel", "intent": "negative", "entities": []}, 160 | {"text": "tudo pessimo", "intent": "negative", "entities": []}, 161 | { 162 | "text": "eu também estou num relacionamento abusivo", 163 | "intent": "bias", 164 | "entities": [], 165 
| }, 166 | {"text": "já", "intent": "affirmative", "entities": []}, 167 | { 168 | "text": "hoje já estou num relacionamento abusivo", 169 | "intent": "bias", 170 | "entities": [], 171 | }, 172 | { 173 | "text": "hoje estou num relacionamento abusivo", 174 | "intent": "bias", 175 | "entities": [], 176 | }, 177 | {"text": "nunca passei por isso", "intent": "negative", "entities": []}, 178 | {"text": "as vezes", "intent": "doubt", "entities": []}, 179 | {"text": "sofro abuso emocional", "intent": "bias", "entities": []}, 180 | { 181 | "text": "to naum... mas ja namorei um porquinho?", 182 | "intent": "negative", 183 | "entities": [], 184 | }, 185 | {"text": "estou namorando", "intent": "affirmative", "entities": []}, 186 | {"text": "to namorando", "intent": "affirmative", "entities": []}, 187 | {"text": "pior que ja", "intent": "affirmative", "entities": []}, 188 | {"text": "entendi", "intent": "affirmative", "entities": []}, 189 | {"text": "não entendi", "intent": "doubt", "entities": []}, 190 | {"text": "eu quero", "intent": "affirmative", "entities": []}, 191 | {"text": "gosto de futebol", "intent": "bias", "entities": []}, 192 | { 193 | "text": "meu namorado bateu na minha cara", 194 | "intent": "bias", 195 | "entities": [], 196 | }, 197 | { 198 | "text": "não ne!! meu namorado bateu na minha cara", 199 | "intent": "bias", 200 | "entities": [], 201 | }, 202 | {"text": "eu fui estruprada", "intent": "affirmative", "entities": []}, 203 | {"text": "tenho que pensar", "intent": "doubt", "entities": []}, 204 | {"text": "mais ou menos", "intent": "doubt", "entities": []}, 205 | {"text": "talvez", "intent": "doubt", "entities": []}, 206 | {"text": "nunca", "intent": "negative", "entities": []}, 207 | {"text": "não", "intent": "negative", "entities": []}, 208 | {"text": "tenho", "intent": "affirmative", "entities": []}, 209 | {"text": "meu namorado me bateu", "intent": "bias", "entities": []}, 210 | {"text": "quero", "intent": "affirmative", "entities": []}, 211 | {"text": "não", "intent": "negative", "entities": []}, 212 | {"text": "sim", "intent": "affirmative", "entities": []}, 213 | {"text": "fui agredida", "intent": "bias", "entities": []}, 214 | {"text": "estuprada", "intent": "bias", "entities": []}, 215 | {"text": "tou namorando", "intent": "affirmative", "entities": []}, 216 | {"text": "sim, tou namorando", "intent": "affirmative", "entities": []}, 217 | ], 218 | ) 219 | @patch( 220 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 221 | return_value={ 222 | "version_id": 49, 223 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 224 | "total_training_end": 3, 225 | "language": "pt_br", 226 | "bot_data": base64.b64encode( 227 | open("example_generic_language.tar.gz", "rb").read() 228 | ), 229 | }, 230 | ) 231 | @patch( 232 | "bothub_backend.bothub.BothubBackend.request_backend_info", 233 | return_value={"intents": ['affirmative', 'negative', 'doubt', 'bias']}, 234 | ) 235 | @patch( 236 | "bothub_backend.bothub.BothubBackend.request_backend_create_evaluate_results", 237 | return_value={"evaluate_id": 1787, "evaluate_version": 189}, 238 | ) 239 | @patch( 240 | "bothub_backend.bothub.BothubBackend.request_backend_create_evaluate_results_intent", 241 | return_value={}, 242 | ) 243 | @patch( 244 | "bothub_backend.bothub.BothubBackend.request_backend_create_evaluate_results_score", 245 | return_value={}, 246 | ) 247 | def test_evaluate_ok(self, *args): 248 | result = evaluate_update( 249 | self.repository_version, 250 | 
self.current_update.get("repository_version"), 251 | self.repository_authorization, 252 | self.interpreter_manager, 253 | self.language 254 | ) 255 | 256 | self.assertEqual(1787, result.get("id")) 257 | self.assertEqual(189, result.get("version")) 258 | -------------------------------------------------------------------------------- /tests/test_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import uuid 3 | import base64 4 | import os 5 | from unittest.mock import patch 6 | 7 | import sys 8 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | from bothub.nlu_worker.task.parse import parse_text 11 | from bothub.nlu_worker.interpreter_manager import InterpreterManager 12 | 13 | 14 | class TestParseTask(unittest.TestCase): 15 | def setUp(self, *args): 16 | self.repository_authorization = uuid.uuid4() 17 | self.current_update = { 18 | "ready_for_train": True, 19 | "current_version_id": 6647, 20 | "repository_authorization_user_id": 303, 21 | } 22 | self.local_path = os.getcwd() 23 | self.interpreter_manager = InterpreterManager() 24 | 25 | # change directory to /tests 26 | try: 27 | os.chdir("tests") 28 | except Exception: 29 | pass 30 | 31 | @patch( 32 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 33 | return_value={ 34 | "version_id": 49, 35 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 36 | "total_training_end": 3, 37 | "language": "pt_br", 38 | "bot_data": base64.b64encode( 39 | open("example_generic_language.tar.gz", "rb").read() 40 | ), 41 | "from_aws": False, 42 | }, 43 | ) 44 | def test_parse_without_rasa_format(self, *args): 45 | 46 | parse_text( 47 | self.current_update.get("current_version_id"), 48 | self.repository_authorization, 49 | self.interpreter_manager, 50 | "ok", 51 | ) 52 | 53 | @patch( 54 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 55 | return_value={ 56 | "version_id": 49, 57 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 58 | "total_training_end": 3, 59 | "language": "pt_br", 60 | "bot_data": base64.b64encode( 61 | open("example_generic_language.tar.gz", "rb").read() 62 | ), 63 | "from_aws": False, 64 | }, 65 | ) 66 | def test_parse_with_rasa_format(self, *args): 67 | 68 | parse_text( 69 | self.current_update.get("current_version_id"), 70 | self.repository_authorization, 71 | self.interpreter_manager, 72 | "ok", 73 | True, 74 | ) 75 | -------------------------------------------------------------------------------- /tests/test_train.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from unittest import TestCase 3 | from unittest.mock import patch 4 | 5 | import os 6 | 7 | import sys 8 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | from bothub.shared import train 11 | 12 | 13 | class TestTrainTask(TestCase): 14 | 15 | # # bert_language = "pt_br" 16 | bert_language = "en" 17 | 18 | def setUp(self, *args): 19 | self.repository_authorization = uuid.uuid4() 20 | self.current_update = { 21 | "ready_for_train": True, 22 | "current_version_id": 6647, 23 | "language": "en", 24 | "algorithm": "transformer_network_diet_bert", 25 | "repository_authorization_user_id": 303, 26 | } 27 | 28 | # change directory to load bert /bert_english 29 | 30 | list_dir = os.listdir() 31 | while 'bert_english' not in list_dir: 32 | os.chdir("../") 33 | list_dir = os.listdir() 34 | print("Current Working 
Directory ", os.getcwd()) 35 | 36 | @patch( 37 | "bothub_backend.bothub.BothubBackend.request_backend_start_training_nlu", 38 | return_value={ 39 | "language": bert_language, 40 | "repository_version": 6647, 41 | "repository_uuid": "e1e8a0fa-625c-4ba3-8b91-4c9f308db791", 42 | "intent": [], 43 | "algorithm": "transformer_network_diet_bert", 44 | "total_training_end": 4, 45 | "use_name_entities": False, 46 | "use_competing_intents": False, 47 | "use_analyze_char": False, 48 | }, 49 | ) 50 | @patch( 51 | "bothub_backend.bothub.BothubBackend.request_backend_get_examples", 52 | return_value={ 53 | "count": 358, 54 | "next": None, 55 | "previous": None, 56 | "results": [ 57 | {"text": "ss", "intent": "affirmative", "entities": []}, 58 | {"text": "okay", "intent": "affirmative", "entities": []}, 59 | {"text": "afirmativo", "intent": "affirmative", "entities": []}, 60 | {"text": "okk", "intent": "affirmative", "entities": []}, 61 | {"text": "okayy", "intent": "affirmative", "entities": []}, 62 | {"text": "certo", "intent": "affirmative", "entities": []}, 63 | {"text": "nops", "intent": "negative", "entities": []}, 64 | {"text": "no", "intent": "negative", "entities": []}, 65 | {"text": "nope", "intent": "negative", "entities": []}, 66 | {"text": "não sei", "intent": "doubt", "entities": []}, 67 | {"text": "naa", "intent": "negative", "entities": []}, 68 | {"text": "na", "intent": "negative", "entities": []}, 69 | {"text": "não", "intent": "negative", "entities": []}, 70 | {"text": "talvez nao", "intent": "negative", "entities": []}, 71 | {"text": "nnn", "intent": "negative", "entities": []}, 72 | {"text": "nn", "intent": "negative", "entities": []}, 73 | {"text": "isso", "intent": "affirmative", "entities": []}, 74 | { 75 | "text": "sim, preciso daquilo", 76 | "intent": "affirmative", 77 | "entities": [], 78 | }, 79 | {"text": "sim, desejo isso", "intent": "affirmative", "entities": []}, 80 | {"text": "sim, quero isso", "intent": "affirmative", "entities": []}, 81 | {"text": "não ne", "intent": "negative", "entities": []}, 82 | {"text": "tenho que pensar", "intent": "doubt", "entities": []}, 83 | {"text": "talvez", "intent": "doubt", "entities": []}, 84 | {"text": "é", "intent": "affirmative", "entities": []}, 85 | {"text": "quero", "intent": "affirmative", "entities": []}, 86 | {"text": "quero sim", "intent": "affirmative", "entities": []}, 87 | {"text": "negativo", "intent": "negative", "entities": []}, 88 | {"text": "siim", "intent": "affirmative", "entities": []}, 89 | {"text": "boa sim", "intent": "affirmative", "entities": []}, 90 | ], 91 | }, 92 | ) 93 | @patch( 94 | "bothub_backend.bothub.BothubBackend.send_training_backend_nlu_persistor", 95 | return_value={}, 96 | ) 97 | @patch( 98 | "bothub_backend.bothub.BothubBackend.request_backend_traininglog_nlu", 99 | return_value={}, 100 | ) 101 | @patch( 102 | "bothub_backend.bothub.BothubBackend.request_backend_trainfail_nlu", 103 | return_value={}, 104 | ) 105 | def test_train_bert(self, *args): 106 | train.train_update( 107 | self.current_update.get("current_version_id"), 108 | self.current_update.get("repository_authorization_user_id"), 109 | self.repository_authorization, 110 | ) 111 | 112 | @patch( 113 | "bothub_backend.bothub.BothubBackend.request_backend_start_training_nlu", 114 | return_value={ 115 | "language": "pt_br", 116 | "repository_version": 6647, 117 | "repository_uuid": "e1e8a0fa-625c-4ba3-8b91-4c9f308db791", 118 | "intent": [], 119 | "algorithm": "transformer_network_diet", 120 | "total_training_end": 4, 121 | 
"use_name_entities": False, 122 | "use_competing_intents": False, 123 | "use_analyze_char": False, 124 | }, 125 | ) 126 | @patch( 127 | "bothub_backend.bothub.BothubBackend.request_backend_get_examples", 128 | return_value={ 129 | "count": 358, 130 | "next": None, 131 | "previous": None, 132 | "results": [ 133 | {"text": "ss", "intent": "affirmative", "entities": []}, 134 | {"text": "okay", "intent": "affirmative", "entities": []}, 135 | {"text": "afirmativo", "intent": "affirmative", "entities": []}, 136 | {"text": "okk", "intent": "affirmative", "entities": []}, 137 | {"text": "okayy", "intent": "affirmative", "entities": []}, 138 | {"text": "certo", "intent": "affirmative", "entities": []}, 139 | {"text": "nops", "intent": "negative", "entities": []}, 140 | {"text": "no", "intent": "negative", "entities": []}, 141 | {"text": "nope", "intent": "negative", "entities": []}, 142 | {"text": "não sei", "intent": "doubt", "entities": []}, 143 | {"text": "naa", "intent": "negative", "entities": []}, 144 | {"text": "na", "intent": "negative", "entities": []}, 145 | {"text": "não", "intent": "negative", "entities": []}, 146 | {"text": "talvez nao", "intent": "negative", "entities": []}, 147 | {"text": "nnn", "intent": "negative", "entities": []}, 148 | ], 149 | }, 150 | ) 151 | @patch( 152 | "bothub_backend.bothub.BothubBackend.send_training_backend_nlu_persistor", 153 | return_value={}, 154 | ) 155 | @patch( 156 | "bothub_backend.bothub.BothubBackend.request_backend_traininglog_nlu", 157 | return_value={}, 158 | ) 159 | @patch( 160 | "bothub_backend.bothub.BothubBackend.request_backend_trainfail_nlu", 161 | return_value={}, 162 | ) 163 | def test_train_transformer_diet(self, *args): 164 | train.train_update( 165 | self.current_update.get("current_version_id"), 166 | self.current_update.get("repository_authorization_user_id"), 167 | self.repository_authorization, 168 | ) 169 | --------------------------------------------------------------------------------