├── .coveragerc ├── .dockerignore ├── .editorconfig ├── .flake8 ├── .github ├── FUNDING.yml └── workflows │ ├── build-bothub-nlp-push-tag-dockerhub.yaml │ └── build-bothub-nlp-push-tag-gcr.yaml ├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── __init__.py ├── ai_platform ├── aiplatform_app.py ├── aiplatform_requirements.txt └── settings.py ├── aiplatform.Dockerfile ├── bothub ├── __init__.py ├── nlu_worker │ ├── __init__.py │ ├── interpreter_manager.py │ └── task │ │ ├── __init__.py │ │ ├── debug_parse.py │ │ ├── evaluate.py │ │ ├── intent_sentence_suggestion.py │ │ ├── parse.py │ │ ├── sentence_suggestion.py │ │ ├── word_suggestion.py │ │ └── words_distribution.py └── shared │ ├── __init__.py │ ├── evaluate_crossval.py │ ├── settings.py │ ├── train.py │ └── utils │ ├── __init__.py │ ├── backend.py │ ├── helpers.py │ ├── lookup_tables │ ├── en │ │ ├── country.txt │ │ └── email.txt │ └── pt_br │ │ ├── brand.txt │ │ ├── cep.txt │ │ ├── country.txt │ │ ├── cpf.txt │ │ └── email.txt │ ├── persistor.py │ ├── pipeline_builder.py │ ├── pipeline_components │ ├── __init__.py │ ├── diet_classifier.py │ ├── hf_transformer.py │ ├── lm_featurizer.py │ ├── lm_tokenizer.py │ ├── microsoft_recognizers_extractor.py │ ├── preprocessing.py │ ├── regex_entity_extractor.py │ └── spacy_nlp.py │ ├── poke_logging.py │ ├── preprocessing │ ├── __init__.py │ ├── preprocessing_base.py │ ├── preprocessing_english.py │ ├── preprocessing_factory.py │ ├── preprocessing_portuguese.py │ └── preprocessing_spanish.py │ ├── rasa_components │ ├── __init__.py │ ├── bothub_interpreter.py │ └── registry.py │ └── scripts │ ├── download_models.py │ └── link_lang_spacy.py ├── celery_app.py ├── docker-compose.yml ├── nlp.Dockerfile ├── requirements.txt ├── start_celery.py └── tests ├── README.md ├── __init__.py ├── example_bert_pt_br.tar.gz ├── example_generic_language.tar.gz ├── shared ├── __init__.py ├── test_pipeline_builder.py └── test_preprocesing.py ├── test_debug_parse.py ├── test_evaluate.py ├── test_parse.py ├── test_train.py └── test_words_distribution.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | ./bothub_nlp_nlu_worker/bothub_nlp_nlu 4 | 5 | omit = 6 | ./bothub_nlp_nlu_worker/bothub_nlp_nlu/scripts/* 7 | ./bothub_nlp_nlu_worker/bothub_nlp_nlu/tests/* 8 | ./bothub_nlp_nlu_worker/bothub_nlp_nlu/pipeline_components/* 9 | 10 | [report] 11 | fail_under = 70 -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | env/ 2 | venv/ 3 | .vscode 4 | .ipynb_checkpoints 5 | .~* 6 | *.pyc 7 | .DS_Store 8 | app/dump.rdb 9 | etc/ 10 | bothub-nlp.log 11 | 12 | # file-based project format: 13 | *.iws 14 | 15 | # tests 16 | tests.db 17 | .coverage 18 | 19 | # env vars files 20 | .env 21 | settings.ini 22 | 23 | # dev 24 | db.sqlite3 25 | 26 | # spacy-langs 27 | bothub-nlp-nlu-worker/spacy-langs 28 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = false 10 | 11 | [*.yml] 12 | indent_size = 2 13 | 14 | [.flake8] 15 | indent_size = 2 16 | 17 | [*.{py,yml,sh}] 18 | insert_final_newline = true 19 | 20 | [Makefile] 21 | 
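# make requires recipe lines to be indented with real tabs, hence the override below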
indent_style = tab 22 | 23 | [{Makefile,Dockerfile,.editorconfig,README.md}] 24 | insert_final_newline = true 25 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 119 3 | ignore = E501,W503,E203,E402 4 | exclude = 5 | ./spacy-langs 6 | ./env 7 | ./venv 8 | ./scripts -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | custom: ['https://www.ilhasoft.com.br/en/contact/'] 4 | -------------------------------------------------------------------------------- /.github/workflows/build-bothub-nlp-push-tag-dockerhub.yaml: -------------------------------------------------------------------------------- 1 | name: Build Bothub NLP in Dockerhub 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*.*.*-develop' 7 | - '*.*.*-staging' 8 | - '*.*.*' 9 | 10 | jobs: 11 | docker: 12 | runs-on: ubuntu-latest 13 | 14 | strategy: 15 | matrix: 16 | deployment: [deployment-bert-english.json, deployment-bert-multilang.json, deployment-bert-ptbr.json, deployment-internal-en.json, deployment-internal-multilang.json, deployment-internal-ptbr.json, deployment-spacy-en.json, deployment-spacy-es.json, deployment-spacy-fr.json, deployment-spacy-ptbr.json, deployment-spacy-ru.json] 17 | 18 | steps: 19 | - name: Set variables 20 | run: | 21 | TAG="$( echo "${GITHUB_REF}" | cut -d'/' -f3 )" 22 | if grep -qs -e '^.*.*-develop' <<< "${TAG}" ; then 23 | echo "Found environment: DEVELOP - ${TAG}" 24 | echo "MANIFESTS_ENVIRONMENT=develop" | tee -a "${GITHUB_ENV}" 25 | elif grep -qs -e '^.*.*-staging' <<< "${TAG}" ; then 26 | echo "Found environment: STAGING - ${TAG}" 27 | echo "MANIFESTS_ENVIRONMENT=staging" | tee -a "${GITHUB_ENV}" 28 | elif grep -qs -e '^.*.*' <<< "${TAG}" ; then 29 | echo "No environment found, assuming: PRODUCTION - ${TAG}" 30 | echo "MANIFESTS_ENVIRONMENT=production" | tee -a "${GITHUB_ENV}" 31 | else 32 | echo 'Not a valid tag. Skipping...' 
33 | exit 1 34 | fi 35 | echo "TAG=$TAG" | tee -a "${GITHUB_ENV}" 36 | VERSION="${TAG}" 37 | echo "VERSION=${VERSION}" | tee -a "${GITHUB_ENV}" 38 | echo "COMMIT_SHA=$GITHUB_SHA" | tee -a "${GITHUB_ENV}" 39 | echo "MATRIXIN=${{ matrix.deployment }}" 40 | echo "MANIFESTS_PATCH_TARGET=${{ matrix.deployment }}" | tee -a "${GITHUB_ENV}" 41 | if [[ "${{ matrix.deployment }}" == "deployment-bert-english.json" ]]; then 42 | MODEL=en-BERT 43 | echo "MODEL=en-BERT" | tee -a "${GITHUB_ENV}" 44 | elif [[ "${{ matrix.deployment }}" == "deployment-bert-multilang.json" ]]; then 45 | MODEL=xx-BERT 46 | echo "MODEL=xx-BERT" | tee -a "${GITHUB_ENV}" 47 | elif [[ "${{ matrix.deployment }}" == "deployment-bert-ptbr.json" ]]; then 48 | MODEL=pt_br-BERT 49 | echo "MODEL=pt_br-BERT" | tee -a "${GITHUB_ENV}" 50 | elif [[ "${{ matrix.deployment }}" == "deployment-internal-en.json" ]]; then 51 | MODEL=xx-NONE 52 | echo "MODEL=xx-NONE" | tee -a "${GITHUB_ENV}" 53 | elif [[ "${{ matrix.deployment }}" == "deployment-internal-multilang.json" ]]; then 54 | MODEL=xx-NONE 55 | echo "MODEL=xx-NONE" | tee -a "${GITHUB_ENV}" 56 | elif [[ "${{ matrix.deployment }}" == "deployment-internal-ptbr.json" ]]; then 57 | MODEL=xx-NONE 58 | echo "MODEL=xx-NONE" | tee -a "${GITHUB_ENV}" 59 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-en.json" ]]; then 60 | MODEL=en-SPACY 61 | echo "MODEL=en-SPACY" | tee -a "${GITHUB_ENV}" 62 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-es.json" ]]; then 63 | MODEL=es-SPACY 64 | echo "MODEL=es-SPACY" | tee -a "${GITHUB_ENV}" 65 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-fr.json" ]]; then 66 | MODEL=fr-SPACY 67 | echo "MODEL=fr-SPACY" | tee -a "${GITHUB_ENV}" 68 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-ptbr.json" ]]; then 69 | MODEL=pt_br-SPACY 70 | echo "MODEL=pt_br-SPACY" | tee -a "${GITHUB_ENV}" 71 | elif [[ "${{ matrix.deployment }}" == "deployment-spacy-ru.json" ]]; then 72 | MODEL=ru-SPACY 73 | echo "MODEL=ru-SPACY" | tee -a "${GITHUB_ENV}" 74 | else 75 | echo "Unknown model" 76 | exit 1 77 | fi 78 | 79 | echo "${MODEL}" 80 | echo "IMAGE_TAG=bothubit/bothub-nlp:${TAG}-${MODEL}" | tee -a "${GITHUB_ENV}" 81 | 82 | echo "IMAGE_SOURCE_URL=https://github.com/weni-ai/bothub-nlp" | tee -a "${GITHUB_ENV}" 83 | echo "MANIFESTS_REPOSITORY=weni-ai/kubernetes-manifests-artificial-intelligence" | tee -a "${GITHUB_ENV}" 84 | echo "MANIFESTS_APPLICATION=nlp-workers" | tee -a "${GITHUB_ENV}" 85 | - name: Check out the repo 86 | uses: actions/checkout@v3 87 | with: 88 | ref: "${{env.GITHUB_SHA}}" 89 | 90 | - name: Set up QEMU 91 | uses: docker/setup-qemu-action@v2 92 | 93 | - name: Set up Docker Buildx 94 | uses: docker/setup-buildx-action@v2 95 | 96 | - name: Login to Dockerhub 97 | uses: docker/login-action@v2 98 | with: 99 | username: ${{ secrets.DOCKERHUB_USERNAME }} 100 | password: ${{ secrets.DOCKERHUB_TOKEN }} 101 | 102 | - name: Build and push - Bothub NLP Model ( ${{ env.MODEL }} ) Image 103 | if: ${{ !( matrix.deployment == 'deployment-internal-ptbr.json' || matrix.deployment == 'deployment-internal-multilang.json' ) }} 104 | uses: docker/build-push-action@v3 105 | with: 106 | context: . 
107 | labels: | 108 | tag=${{env.TAG}} 109 | commit=${{env.COMMIT_SHA}} 110 | repository=${{env.IMAGE_SOURCE_URL}} 111 | file: ./nlp.Dockerfile 112 | push: true 113 | tags: "${{env.IMAGE_TAG}}" 114 | no-cache: true 115 | build-args: | 116 | DOWNLOAD_MODELS=${{ env.MODEL }} 117 | 118 | - name: Check out Kubernetes Manifests 119 | uses: actions/checkout@master 120 | with: 121 | ref: main 122 | repository: "${{ env.MANIFESTS_REPOSITORY }}" 123 | token: "${{ secrets.DEVOPS_GITHUB_PERMANENT_TOKEN }}" 124 | path: ./kubernetes-manifests/ 125 | 126 | - name: Update image on deployment 127 | run: | 128 | which jq > /dev/null 2>&1 || ( sudo apt update ; sudo apt install -y jq ) 129 | # Dep: coreutils 130 | verlte() { 131 | [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] 132 | } 133 | verlt(){ 134 | [ "$1" = "$2" ] && return 1 || verlte $1 $2 135 | } 136 | export PROJECT_DIR="${{ env.MANIFESTS_APPLICATION }}" 137 | ENV_DIR="kubernetes-manifests/${{ env.MANIFESTS_APPLICATION }}/${MANIFESTS_ENVIRONMENT}" 138 | for e in ${ENV_DIR}; do 139 | echo "Update ${e}:" 140 | if [ ! -d "${e}" ] ; then 141 | echo "${e}: Does not exist, skipping" 142 | elif [ ! -r "${e}/kustomization.yaml" ] ; then 143 | echo "${e}/kustomization.yaml: Does not readable, skipping" 144 | elif [ ! -r "${e}/${{ env.MANIFESTS_PATCH_TARGET }}" ] ; then 145 | echo "${e}/${{ env.MANIFESTS_PATCH_TARGET }}: Does not readable, skipping" 146 | else 147 | OLD_IMAGE=$( 148 | cat "${e}/${{ env.MANIFESTS_PATCH_TARGET }}" \ 149 | | jq '.[] | select(.path == "/spec/template/spec/containers/0/image") | .value' 150 | ) 151 | echo "Old image to replace: ${OLD_IMAGE}" 152 | OLD_VERSION=$( 153 | echo "${OLD_IMAGE}" \ 154 | | sed s'/^.*[v:-]\([0-9]*\.[0-9]*\.[0-9]*\).*$/\1/'g \ 155 | | head -n1 156 | ) 157 | echo "Old image version to compare: ${OLD_VERSION}<=${{env.VERSION}}" 158 | if verlte "${OLD_VERSION}" "${VERSION}" || [[ ! "${OLD_VERSION}" =~ [0-9]+\.[0-9]+\.[0-9]+ ]] ; then 159 | echo 'New configurations:' 160 | new_configuration=$( 161 | cat "${e}/${{ env.MANIFESTS_PATCH_TARGET }}" \ 162 | | jq '(..|select(.path == "/spec/template/spec/containers/0/image")?) 
+= {value: "'"${{env.IMAGE_TAG}}"'"}' 163 | ) 164 | echo "${new_configuration}" 165 | echo "${new_configuration}" > "${e}/${{ env.MANIFESTS_PATCH_TARGET }}" 166 | else 167 | echo "Version in file is greater than build, skipping update yaml" 168 | fi 169 | fi 170 | done 171 | - name: Commit & Push changes 172 | uses: actions-js/push@master 173 | with: 174 | github_token: "${{ secrets.DEVOPS_GITHUB_PERMANENT_TOKEN }}" 175 | repository: "${{ env.MANIFESTS_REPOSITORY }}" 176 | directory: ./kubernetes-manifests/ 177 | branch: main 178 | message: "From Bothub NLP Build (Push Tag ${{ env.MANIFESTS_ENVIRONMENT }})" 179 | 180 | 181 | -------------------------------------------------------------------------------- /.github/workflows/build-bothub-nlp-push-tag-gcr.yaml: -------------------------------------------------------------------------------- 1 | name: Build AI-Platform Bothub NLP in GCR 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*.*.*-develop' 7 | - '*.*.*-staging' 8 | - '*.*.*' 9 | 10 | jobs: 11 | docker: 12 | runs-on: ubuntu-latest 13 | 14 | strategy: 15 | matrix: 16 | model: [ xx-NONE, xx-BERT, en-BERT, pt_br-BERT, es-SPACY, fr-SPACY, pt_br-SPACY , ru-SPACY, en-SPACY] 17 | 18 | steps: 19 | - name: Set variables 20 | run: | 21 | TAG="$( echo "${GITHUB_REF}" | cut -d'/' -f3 )" 22 | if grep -qs -e '^.*.*-develop' <<< "${TAG}" ; then 23 | echo "Found environment: DEVELOP - ${TAG}" 24 | echo "ENVIRONMENT=develop" | tee -a "${GITHUB_ENV}" 25 | elif grep -qs -e '^.*.*-staging' <<< "${TAG}" ; then 26 | echo "Found environment: STAGING - ${TAG}" 27 | echo "ENVIRONMENT=staging" | tee -a "${GITHUB_ENV}" 28 | elif grep -qs -e '^.*.*' <<< "${TAG}" ; then 29 | echo "No environment found, assuming: PRODUCTION - ${TAG}" 30 | echo "ENVIRONMENT=production" | tee -a "${GITHUB_ENV}" 31 | else 32 | echo 'Not a valid tag. Skipping...' 33 | exit 1 34 | fi 35 | echo "TAG=$TAG" | tee -a "${GITHUB_ENV}" 36 | VERSION="${TAG}" 37 | echo "VERSION=${VERSION}" | tee -a "${GITHUB_ENV}" 38 | echo "COMMIT_SHA=$GITHUB_SHA" | tee -a "${GITHUB_ENV}" 39 | echo "IMAGE_TAG=us.gcr.io/bothub-273521/bothub-nlp-ai-platform:${TAG}-${{ matrix.model }}" | tee -a "${GITHUB_ENV}" 40 | echo "IMAGE_SOURCE_URL=https://github.com/weni-ai/bothub-nlp" | tee -a "${GITHUB_ENV}" 41 | 42 | 43 | - name: Check out the repo 44 | uses: actions/checkout@v3 45 | with: 46 | ref: "${{env.GITHUB_SHA}}" 47 | 48 | - name: Set up QEMU 49 | uses: docker/setup-qemu-action@v2 50 | 51 | - name: Set up Docker Buildx 52 | uses: docker/setup-buildx-action@v2 53 | 54 | - name: Login to GCR 55 | uses: docker/login-action@v1 56 | with: 57 | registry: us.gcr.io 58 | username: _json_key 59 | password: ${{ secrets.GCR_JSON_KEY }} 60 | 61 | - name: Build and push - AI-Platform Bothub NLP Image 62 | uses: docker/build-push-action@v3 63 | with: 64 | context: . 
65 | labels: | 66 | tag=${{env.TAG}} 67 | commit=${{env.COMMIT_SHA}} 68 | repository=${{env.IMAGE_SOURCE_URL}} 69 | file: ./aiplatform.Dockerfile 70 | push: true 71 | tags: "${{env.IMAGE_TAG}}" 72 | no-cache: true 73 | build-args: | 74 | DOWNLOAD_MODELS=${{ matrix.model }} 75 | 76 | 77 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | db.sqlite3-journal 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # IDEs 115 | .vscode/ 116 | .idea/ 117 | .DS_Store 118 | .idea/ 119 | .env 120 | spacy-langs/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | services: 5 | - docker 6 | install: 7 | - pip install -r requirements.txt 8 | - pip install coveralls 9 | env: 10 | global: 11 | - BOTHUB_NLP_LANGUAGE_QUEUE="en" 12 | - BOTHUB_NLP_SERVICE_WORKER=true 13 | before_script: 14 | - python bothub/shared/utils/scripts/download_models.py en-BERT 15 | script: 16 | - flake8 17 | - travis_wait coverage run -m unittest discover tests 18 | after_success: 19 | - coveralls 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | init_development_env: 2 | @echo "${INFO}Starting init environment...${NC}" 3 | @echo "BOTHUB_ENGINE_URL=http://localhost" >> .env 4 | @echo "BOTHUB_NLP_SERVICE_WORKER=True" >> .env 5 | @echo "BOTHUB_NLP_LANGUAGE_QUEUE=en" >> .env 6 | @echo "BOTHUB_LANGUAGE_MODEL=BERT" >> .env 7 | @echo "${SUCCESS}Finish...${NC}" 8 | 9 | 
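# Builds every docker-compose service with the default en-BERT model
# (passed through the DOWNLOAD_MODELS build argument) and starts them detached.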
start_development: 10 | @echo "${INFO}Starting Build all project (Docker)...${NC}" 11 | @docker-compose build --build-arg DOWNLOAD_MODELS=en-BERT 12 | @docker-compose up -d 13 | @echo "${SUCCESS}Finish...${NC}" 14 | 15 | 16 | install_development_requirements: 17 | @echo "${INFO}Installing development requirements...${NC}" 18 | @git clone --branch master --depth 1 --single-branch https://github.com/Ilhasoft/spacy-lang-models spacy-langs 19 | @python bothub/shared/utils/scripts/link_lang_spacy.py pt_br ./spacy-langs/pt_br/ 20 | @python bothub/shared/utils/scripts/download_models.py en-BERT 21 | @echo "${SUCCESS}✔${NC} Development requirements installed" 22 | 23 | 24 | start_celery: 25 | @python start_celery.py 26 | 27 | # Utils 28 | 29 | ## Colors 30 | SUCCESS = \033[0;32m 31 | INFO = \033[0;36m 32 | WARNING = \033[0;33m 33 | DANGER = \033[0;31m 34 | NC = \033[0m 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bothub NLP - Natural Language Processing services 2 | 3 | [![Build Status](https://travis-ci.org/bothub-it/bothub-nlp.svg?branch=master)](https://travis-ci.org/bothub-it/bothub-nlp) [![Coverage Status](https://coveralls.io/repos/github/bothub-it/bothub-nlp/badge.svg?branch=master)](https://coveralls.io/github/bothub-it/bothub-nlp?branch=master) ![version 3.0.1](https://img.shields.io/badge/version-3.0.1-blue.svg) [![python 3.6](https://img.shields.io/badge/python-3.6-green.svg)](https://docs.python.org/3.6/whatsnew/changelog.html) [![license AGPL-3.0](https://img.shields.io/badge/license-AGPL--3.0-red.svg)](https://github.com/bothub-it/bothub-nlp/blob/master/LICENSE) 4 | 5 | 6 | 7 | ## Services 8 | 9 | ### bothub-nlp-nlu-worker 10 | 11 | ### [bothub-nlp-api](https://github.com/bothub-it/bothub-nlp-api) 12 | 13 | ## Packages 14 | 15 | ### [bothub-backend](https://github.com/bothub-it/bothub-backend) (python 3.6) 16 | 17 | ### [bothub-nlp-celery](https://github.com/bothub-it/bothub-nlp-celery) (python 3.6) 18 | 19 | 20 | # Requirements 21 | 22 | * Python (3.6) 23 | * Docker 24 | * Docker-Compose 25 | 26 | ## Development 27 | 28 | Use ```make``` commands 29 | 30 | | Command | Description | 31 | |--|--| 32 | | make init_development_env | Init file .env with variables environment | 33 | | make start_development | Start build docker | 34 | | make install_development_requirements | Install some default models | 35 | | make start_celery | Run celery application | 36 | 37 | 38 | ## Environment Variables 39 | 40 | You can set environment variables in your OS, write on ```.env``` file or pass via Docker config. 41 | 42 | ### bothub-backend 43 | 44 | | Variable | Type | Default | Description | 45 | |--|--|--|--| 46 | | BOTHUB_ENGINE_URL | `str` | `https://api.bothub.it` | Web service url | 47 | 48 | ### nlp-nlu-worker / nlp-ai-platform 49 | 50 | You can set environment variables in your OS, write on ```.env``` file or pass via Docker config. 
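
For local development, ```make init_development_env``` writes a minimal ```.env``` like the one below (values copied from the Makefile); any variable from the tables in this document can be appended to the same file:

```
BOTHUB_ENGINE_URL=http://localhost
BOTHUB_NLP_SERVICE_WORKER=True
BOTHUB_NLP_LANGUAGE_QUEUE=en
BOTHUB_LANGUAGE_MODEL=BERT
```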
51 | 52 | | Variable | Type | Default | Description | 53 | |--|--|--|--| 54 | | WORKER_CACHE_CLEANING_PERIOD | `float` | `3*3600` | Period of time (seconds) the worker will look for idle interpreters to clean cache | 55 | | INTERPRETER_CACHE_IDLE_LIMIT | `float` | `24*3600` | Idle limit of time (seconds) the interpreter cache will keep cache | 56 | | DYNAMIC_EPOCHS_THRESHOLD | `int` | `10000` | Minimum number of sentences to start decreasing training number of epochs | 57 | | BOTHUB_NLP_AWS_ACCESS_KEY_ID | `str` | | AWS bucket access to save trained models and evaluation results | 58 | | BOTHUB_NLP_AWS_SECRET_ACCESS_KEY | `str` | | AWS bucket access to save trained models and evaluation results | 59 | | BOTHUB_NLP_AWS_S3_BUCKET_NAME | `str` | | AWS bucket access to save trained models and evaluation results | 60 | | BOTHUB_NLP_AWS_REGION_NAME | `str` | | AWS bucket access to save trained models and evaluation results | 61 | 62 | ### bothub-celery 63 | 64 | | Variable | Type | Default | Description | 65 | |--|--|--|--| 66 | | BOTHUB_NLP_CELERY_BROKER_URL | `string` | `redis://localhost:6379/0` | Celery Broker URL, check usage instructions in [Celery Docs](http://docs.celeryproject.org/en/latest/index.html) | 67 | | BOTHUB_NLP_CELERY_BACKEND_URL | `string` | `BOTHUB_NLP_CELERY_BROKER_URL` value | Celery Backend URL, check usage instructions in [Celery Docs](http://docs.celeryproject.org/en/latest/index.html) | 68 | | BOTHUB_NLP_CELERY_SENTRY_CLIENT | `bool` | `False` | Enable Sentry | 69 | | BOTHUB_NLP_CELERY_SENTRY | `str` | `None` | Set URL Sentry Server | 70 | | BOTHUB_NLP_LANGUAGE_QUEUE | `string` | `en` | Set language of model that will be loaded in celery and will define its queue | 71 | | BOTHUB_LANGUAGE_MODEL | `string` | `None` | Set type of model (BERT/SPACY/NONE) | 72 | | TASK_GENERAL_TIME_LIMIT | `int` | `120` | Time limit of celery tasks | 73 | | TASK_PARSE_TIME_LIMIT | `int` | `10` | Time limit of parse task | 74 | 75 | ## Docker Arguments 76 | 77 | You need to set --build-arg when you are building docker-compose 78 | 79 | | Argument | Type | Default | Description | 80 | |--|--|--|--| 81 | | DOWNLOAD_MODELS | ```string```| ```en-BERT``` | Set language and model in build time. Following the format: ```[LANGUAGE_CODE]-[LANGUAGE_MODEL]```. 
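
For example, to build and start the worker locally with the Brazilian Portuguese BERT model (the same commands the Makefile's ```start_development``` target runs with ```en-BERT```):

```
docker-compose build --build-arg DOWNLOAD_MODELS=pt_br-BERT
docker-compose up -d
```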
82 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/__init__.py -------------------------------------------------------------------------------- /ai_platform/aiplatform_app.py: -------------------------------------------------------------------------------- 1 | from bothub.shared.train import train_update as train 2 | from bothub.shared.evaluate_crossval import ( 3 | evaluate_crossval_update as evaluate_crossval, 4 | ) 5 | 6 | if __name__ == "__main__": 7 | from settings import ( 8 | operation, 9 | repository_version_language, 10 | by_id, 11 | repository_authorization, 12 | aws_bucket_authentication, 13 | language 14 | ) 15 | 16 | # Run the job 17 | if operation == "train": 18 | train( 19 | repository_version_language, 20 | by_id, 21 | repository_authorization, 22 | from_queue="ai-platform", 23 | ) 24 | elif operation == "evaluate": 25 | evaluate_crossval( 26 | repository_version_language, 27 | repository_authorization, 28 | aws_bucket_authentication, 29 | language 30 | ) 31 | -------------------------------------------------------------------------------- /ai_platform/aiplatform_requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/bothub-it/bothub-backend.git@1.0.22 2 | git+https://github.com/bothub-it/bothub-nlp-celery.git@0.1.38 3 | rasa==1.10.6 4 | transformers==2.11.0 5 | emoji==0.6.0 6 | recognizers-text-suite 7 | plac==0.9.6 8 | spacy==2.1.9 9 | Unidecode==1.1.1 10 | urllib3==1.24.3 11 | tensorflow-gpu==2.1.2 12 | requests==2.23.0 13 | pymorphy2==0.8 14 | python-decouple==3.3 15 | h5py==2.10.0 -------------------------------------------------------------------------------- /ai_platform/settings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | PARSER = argparse.ArgumentParser() 4 | 5 | # Input Arguments 6 | PARSER.add_argument( 7 | "--operation", help='What operation will be done, "train" or "evaluate"' 8 | ) 9 | PARSER.add_argument( 10 | "--repository-version", help="The id of repository-version.", type=int 11 | ) 12 | PARSER.add_argument( 13 | "--by-id", help="User id sending the job", type=int 14 | ) 15 | PARSER.add_argument( 16 | "--repository-authorization", help="Repository authorization string." 
17 | ) 18 | PARSER.add_argument( 19 | "--AIPLATFORM_LANGUAGE_QUEUE", type=str, default="" 20 | ) 21 | 22 | PARSER.add_argument( 23 | "--BOTHUB_NLP_AWS_S3_BUCKET_NAME", type=str, default="" 24 | ) 25 | 26 | PARSER.add_argument( 27 | "--BOTHUB_NLP_AWS_ACCESS_KEY_ID", type=str, default="" 28 | ) 29 | 30 | PARSER.add_argument( 31 | "--BOTHUB_NLP_AWS_SECRET_ACCESS_KEY", type=str, default="" 32 | ) 33 | 34 | PARSER.add_argument( 35 | "--BOTHUB_NLP_AWS_REGION_NAME", type=str, default="us-east-1" 36 | ) 37 | 38 | ARGUMENTS, _ = PARSER.parse_known_args() 39 | 40 | operation = ARGUMENTS.operation 41 | repository_version_language = ARGUMENTS.repository_version 42 | by_id = ARGUMENTS.by_id 43 | repository_authorization = ARGUMENTS.repository_authorization 44 | language = ARGUMENTS.AIPLATFORM_LANGUAGE_QUEUE 45 | 46 | aws_bucket_authentication = { 47 | "BOTHUB_NLP_AWS_S3_BUCKET_NAME": ARGUMENTS.BOTHUB_NLP_AWS_S3_BUCKET_NAME, 48 | "BOTHUB_NLP_AWS_ACCESS_KEY_ID": ARGUMENTS.BOTHUB_NLP_AWS_ACCESS_KEY_ID, 49 | "BOTHUB_NLP_AWS_SECRET_ACCESS_KEY": ARGUMENTS.BOTHUB_NLP_AWS_SECRET_ACCESS_KEY, 50 | "BOTHUB_NLP_AWS_REGION_NAME": ARGUMENTS.BOTHUB_NLP_AWS_REGION_NAME, 51 | } 52 | -------------------------------------------------------------------------------- /aiplatform.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=18.04 2 | 3 | ARG ARCH= 4 | ARG CUDA=10.1 5 | FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base 6 | # ARCH and CUDA are specified again because the FROM directive resets ARGs 7 | # (but their default value is retained if set previously) 8 | ARG ARCH 9 | ARG CUDA 10 | ARG CUDNN=7.6.4.38-1 11 | ARG CUDNN_MAJOR_VERSION=7 12 | ARG LIB_DIR_PREFIX=x86_64 13 | ARG LIBNVINFER=6.0.1-1 14 | ARG LIBNVINFER_MAJOR_VERSION=6 15 | 16 | # Needed for string substitution 17 | SHELL ["/bin/bash", "-c"] 18 | # Pick up some TF dependencies 19 | RUN apt-get update && apt-get install -y --no-install-recommends \ 20 | build-essential \ 21 | cuda-command-line-tools-${CUDA/./-} \ 22 | # There appears to be a regression in libcublas10=10.2.2.89-1 which 23 | # prevents cublas from initializing in TF. See 24 | # https://github.com/tensorflow/tensorflow/issues/9489#issuecomment-562394257 25 | libcublas10=10.2.1.243-1 \ 26 | cuda-nvrtc-${CUDA/./-} \ 27 | cuda-cufft-${CUDA/./-} \ 28 | cuda-curand-${CUDA/./-} \ 29 | cuda-cusolver-${CUDA/./-} \ 30 | cuda-cusparse-${CUDA/./-} \ 31 | curl \ 32 | git \ 33 | wget \ 34 | libcudnn7=${CUDNN}+cuda${CUDA} \ 35 | libfreetype6-dev \ 36 | libhdf5-serial-dev \ 37 | libzmq3-dev \ 38 | pkg-config \ 39 | software-properties-common \ 40 | unzip 41 | 42 | # Install TensorRT if not building for PowerPC 43 | RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \ 44 | apt-get install -y --no-install-recommends libnvinfer${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \ 45 | libnvinfer-plugin${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \ 46 | && apt-get clean \ 47 | && rm -rf /var/lib/apt/lists/*; } 48 | 49 | # For CUDA profiling, TensorFlow requires CUPTI. 
50 | ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH 51 | 52 | # Link the libcuda stub to the location where tensorflow is searching for it and reconfigure 53 | # dynamic linker run-time bindings 54 | RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ 55 | && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ 56 | && ldconfig 57 | 58 | # See http://bugs.python.org/issue19846 59 | ENV LANG C.UTF-8 60 | ENV LC_ALL C.UTF-8 61 | 62 | RUN apt-get update && apt-get install -y \ 63 | python3 \ 64 | python3-pip 65 | 66 | RUN python3 -m pip --no-cache-dir install --upgrade \ 67 | pip \ 68 | setuptools 69 | 70 | # Some TF tools expect a "python" binary 71 | RUN ln -s $(which python3) /usr/local/bin/python 72 | 73 | WORKDIR /home/root/app 74 | 75 | RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections 76 | RUN apt-get install -y ttf-mscorefonts-installer \ 77 | && apt-get autoremove -y \ 78 | && apt-get clean -y \ 79 | && rm -rf /var/lib/apt/lists/* 80 | 81 | COPY ai_platform/aiplatform_requirements.txt . 82 | 83 | FROM base as builder 84 | 85 | RUN pip3 wheel --wheel-dir=/wheels -r aiplatform_requirements.txt 86 | 87 | FROM base 88 | 89 | COPY --from=builder /wheels /wheels 90 | 91 | RUN pip3 install --find-links=/wheels -r aiplatform_requirements.txt 92 | 93 | COPY ai_platform/aiplatform_app.py . 94 | COPY ai_platform/settings.py . 95 | COPY bothub/shared /home/root/app/bothub/shared 96 | COPY bothub/__init__.py /home/root/app/bothub 97 | 98 | ARG DOWNLOAD_MODELS 99 | #Install torch with cuda 10.1 100 | RUN if [ "${DOWNLOAD_MODELS}" = "pt_br-BERT" ]; then \ 101 | pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html; \ 102 | fi 103 | 104 | RUN if [ ${DOWNLOAD_MODELS} ]; then \ 105 | python3.6 bothub/shared/utils/scripts/download_models.py ${DOWNLOAD_MODELS}; \ 106 | fi 107 | 108 | 109 | ENTRYPOINT ["python3.6", "aiplatform_app.py"] 110 | -------------------------------------------------------------------------------- /bothub/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/__init__.py -------------------------------------------------------------------------------- /bothub/nlu_worker/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig( 4 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.DEBUG 5 | ) 6 | -------------------------------------------------------------------------------- /bothub/nlu_worker/interpreter_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import threading 3 | import time 4 | import gc 5 | 6 | from typing import Callable, Union 7 | from rasa.nlu import components 8 | from tempfile import mkdtemp 9 | from datetime import datetime 10 | 11 | from bothub.shared import settings 12 | from bothub.shared.utils.persistor import BothubPersistor 13 | from bothub.shared.utils.backend import backend 14 | from bothub.shared.utils.rasa_components.bothub_interpreter import BothubInterpreter 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class SetInterval: 20 | """ 21 | Creates a thread that execute a function every x seconds 22 | """ 
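    # InterpreterManager (below) uses this to run _clean_cache every
    # settings.WORKER_CACHE_CLEANING_PERIOD seconds.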
23 | def __init__(self, interval: Union[int, float], action: Callable): 24 | """ 25 | :param interval: Period in seconds 26 | :param action: Callable function 27 | """ 28 | self.interval = interval 29 | self.action = action 30 | self.stopEvent = threading.Event() 31 | thread = threading.Thread(target=self._set_interval, daemon=True) 32 | thread.start() 33 | 34 | def _set_interval(self): 35 | next_time = time.time() + self.interval 36 | while not self.stopEvent.wait(next_time - time.time()): 37 | next_time += self.interval 38 | self.action() 39 | 40 | def cancel(self): 41 | self.stopEvent.set() 42 | 43 | 44 | class InterpreterManager: 45 | def __init__(self): 46 | self.cached_interpreters = {} 47 | SetInterval(settings.WORKER_CACHE_CLEANING_PERIOD, self._clean_cache) 48 | 49 | def get_interpreter( 50 | self, 51 | repository_version, 52 | repository_authorization, 53 | rasa_version, 54 | use_cache=True 55 | ) -> BothubInterpreter: 56 | 57 | update_request = backend().request_backend_parse_nlu_persistor( 58 | repository_version, repository_authorization, rasa_version, no_bot_data=True 59 | ) 60 | 61 | repository_name = ( 62 | f"{update_request.get('version_id')}_{update_request.get('language')}" 63 | ) 64 | last_training = f"{update_request.get('total_training_end')}" 65 | 66 | # tries to fetch cache 67 | retrieved_cache = self.cached_interpreters.get(repository_name) 68 | if retrieved_cache and use_cache: 69 | # retrieve cache only if it's the same training 70 | if retrieved_cache["last_training"] == last_training: 71 | retrieved_cache["last_request"] = datetime.now() 72 | return retrieved_cache["interpreter_data"] 73 | 74 | persistor = BothubPersistor( 75 | repository_version, repository_authorization, rasa_version 76 | ) 77 | model_directory = mkdtemp() 78 | persistor.retrieve(str(update_request.get("repository_uuid")), model_directory) 79 | 80 | interpreter = BothubInterpreter( 81 | None, {"language": update_request.get("language")} 82 | ) 83 | interpreter = interpreter.load( 84 | model_directory, components.ComponentBuilder(use_cache=False) 85 | ) 86 | 87 | if use_cache: # update/creates cache 88 | self.cached_interpreters[repository_name] = { 89 | "last_training": last_training, 90 | "interpreter_data": interpreter, 91 | "last_request": datetime.now() 92 | } 93 | 94 | return interpreter 95 | 96 | def _clean_cache(self) -> None: 97 | logger.info("Cleaning repositories cache") 98 | cur_time = datetime.now() 99 | 100 | to_remove = [] 101 | for interpreter in self.cached_interpreters: 102 | last_request = self.cached_interpreters[interpreter]['last_request'] 103 | idle_time = (cur_time - last_request).total_seconds() 104 | if idle_time > settings.INTERPRETER_CACHE_IDLE_LIMIT: 105 | to_remove.append(interpreter) 106 | 107 | for interpreter in to_remove: 108 | del self.cached_interpreters[interpreter] 109 | 110 | logger.info(f"{len(to_remove)} interpreters cleaned") 111 | objects_collected = gc.collect() 112 | logger.info(f"{objects_collected} objects collected") 113 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/nlu_worker/task/__init__.py -------------------------------------------------------------------------------- /bothub/nlu_worker/task/debug_parse.py: -------------------------------------------------------------------------------- 1 | 
import numpy as np 2 | 3 | from collections import OrderedDict 4 | from lime.lime_text import LimeTextExplainer 5 | from rasa.nlu.test import remove_pretrained_extractors 6 | from rasa.nlu import __version__ as rasa_version 7 | 8 | from bothub.shared.utils.backend import backend 9 | 10 | 11 | class DebugSentenceLime: 12 | def __init__(self, interpreter, intention_names): 13 | self.interpreter = interpreter 14 | self.interpreter.pipeline = remove_pretrained_extractors( 15 | self.interpreter.pipeline 16 | ) 17 | self.intention_names = intention_names 18 | 19 | def classifier(self, text_list): 20 | result_list = [] 21 | for text in text_list: 22 | result_json = self.interpreter.parse(text) 23 | 24 | idx_dict = ( 25 | {} 26 | ) # fixing intent name to a index ex: {'violence': 0, 'immigration': 1, ... } 27 | size = len(self.intention_names) 28 | for i in range(size): 29 | idx_dict[self.intention_names[i]] = i 30 | 31 | intent_list = [0] * len(self.intention_names) 32 | intent_name_list = [""] * len(self.intention_names) 33 | size = len(result_json.get("intent_ranking", [])) 34 | for i in range(size): 35 | intent_name = result_json.get("intent_ranking")[i].get("name") 36 | intent_list[idx_dict[intent_name]] = result_json.get("intent_ranking")[ 37 | i 38 | ].get("confidence") 39 | intent_name_list[idx_dict[intent_name]] = result_json.get( 40 | "intent_ranking" 41 | )[i].get("name") 42 | 43 | prob_array = np.array(intent_list) 44 | prob_array = prob_array.reshape((1, len(intent_list))) 45 | result_list.append(prob_array) 46 | 47 | result_array = result_list[0] 48 | for i in range(1, len(result_list)): 49 | result_array = np.vstack([result_array, result_list[i]]) 50 | return result_array 51 | 52 | def get_result_per_word(self, text, num_samples): 53 | if not self.intention_names: 54 | return {} 55 | explainer = LimeTextExplainer(class_names=self.intention_names) 56 | labels = list(range(len(self.intention_names))) # List 57 | try: 58 | exp = explainer.explain_instance( 59 | text, self.classifier, num_features=6, labels=labels, num_samples=num_samples 60 | ) 61 | except ValueError: 62 | labels = [] 63 | result_per_word = {} 64 | for label in labels: 65 | for j in exp.as_list(label=label): 66 | if j[0] not in result_per_word: 67 | result_per_word[j[0]] = [] 68 | result_per_word[j[0]].append( 69 | {"intent": self.intention_names[label], "relevance": j[1] * 100} 70 | ) 71 | for word in result_per_word: 72 | result_per_word[word] = sorted( 73 | result_per_word[word], key=lambda k: k.get("relevance"), reverse=True 74 | ) 75 | return result_per_word 76 | 77 | def get_result_per_intent(self, text, num_samples): 78 | explainer = LimeTextExplainer(class_names=self.intention_names) 79 | labels = list(range(len(self.intention_names))) # List 80 | exp = explainer.explain_instance( 81 | text, self.classifier, num_features=6, labels=labels, num_samples=num_samples 82 | ) 83 | result_per_intent = {} 84 | for intent in self.intention_names: 85 | result_per_intent[intent] = [] 86 | for i in labels: 87 | intent_sum = 0 88 | for j in exp.as_list(label=i): 89 | result_per_intent[self.intention_names[i]].append( 90 | {"word": j[0], "relevance": j[1] * 100} 91 | ) 92 | intent_sum += j[1] 93 | result_per_intent[self.intention_names[i]].append( 94 | {"sum": intent_sum, "relevance": -1} 95 | ) 96 | for intent in result_per_intent: 97 | result_per_intent[intent] = sorted( 98 | result_per_intent[intent], 99 | key=lambda k: k.get("relevance"), 100 | reverse=True, 101 | ) 102 | 103 | return result_per_intent 104 | 105 | 106 | 
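# Keeps only the entity fields returned in the debug_parse response.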
def minimal_entity(entity, self_flag=False): # pragma: no cover 107 | out = { 108 | "value": entity.get("value"), 109 | "entity": entity.get("entity"), 110 | "confidence": entity.get("confidence"), 111 | "start": entity.get("start"), 112 | "end": entity.get("end"), 113 | } 114 | 115 | if self_flag: 116 | out.update({"self": True}) 117 | 118 | return out 119 | 120 | 121 | def get_intention_list(repository_authorization, repository_version_language_id): 122 | info = backend().request_backend_info( 123 | repository_authorization, repository_version_language_id=repository_version_language_id 124 | ) 125 | return info.get("intents", []) 126 | 127 | 128 | def format_debug_parse_output(result_per_word, r): 129 | entities = r.get("entities") 130 | formatted_entities = [] 131 | for entity in entities: 132 | formatted_entities.append(minimal_entity(entity)) 133 | for word in result_per_word: 134 | result_per_word[word] = sorted( 135 | result_per_word[word], key=lambda k: k["relevance"], reverse=True 136 | ) 137 | result_per_word = OrderedDict( 138 | sorted( 139 | result_per_word.items(), key=lambda t: t[1][0]["relevance"], reverse=True 140 | ) 141 | ) 142 | out = OrderedDict( 143 | [ 144 | ("intent", r.get("intent", None)), 145 | ("words", result_per_word), 146 | ("entities", formatted_entities), 147 | ] 148 | ) 149 | return out 150 | 151 | 152 | def n_samples_by_sentence_lenght(sentence): 153 | word_count = len(sentence.split(" ")) 154 | n_samples = min(int(1.8 ** word_count), 128) 155 | return n_samples 156 | 157 | 158 | def debug_parse_text( 159 | repository_version_language_id, 160 | repository_authorization, 161 | interpreter_manager, 162 | text, 163 | use_cache=True, 164 | ): 165 | interpreter = interpreter_manager.get_interpreter( 166 | repository_version_language_id, repository_authorization, rasa_version, use_cache 167 | ) 168 | r = interpreter.parse(text) 169 | 170 | intention_names = get_intention_list(repository_authorization, repository_version_language_id) 171 | result_per_word = DebugSentenceLime( 172 | interpreter, intention_names 173 | ).get_result_per_word(text, n_samples_by_sentence_lenght(text)) 174 | 175 | return format_debug_parse_output(result_per_word, r) 176 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/intent_sentence_suggestion.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from bothub_nlp_celery.app import nlp_language 3 | from bothub.shared.utils.preprocessing.preprocessing_factory import PreprocessingFactory 4 | 5 | import random 6 | 7 | from .sentence_suggestion import SentenceSuggestion 8 | from bothub.shared.utils.helpers import get_examples_request 9 | 10 | 11 | class NonexistentIntentError(Exception): 12 | pass 13 | 14 | 15 | def intent_sentence_suggestion_text( 16 | repository_version, repository_authorization, intent, percentage_to_replace, n 17 | ): 18 | if nlp_language is None: 19 | return "spacy model not loaded in this language" 20 | if nlp_language.vocab.vectors_length == 0: 21 | return "language not supported for this feature" 22 | 23 | intent_sentences = get_examples_request(repository_version, repository_authorization, intent=intent) 24 | intent_sentences = [el['text'] for el in intent_sentences] 25 | if len(intent_sentences) == 0: 26 | raise NonexistentIntentError() 27 | intent_sentences_sample = random.sample(intent_sentences, min(n, len(intent_sentences))) 28 | factor = n / len(intent_sentences_sample) 29 | 30 | 
preprocessor1 = PreprocessingFactory(remove_accent=False).factory() 31 | preprocessor2 = PreprocessingFactory(remove_accent=True).factory() 32 | 33 | suggested_sentences = [] 34 | count = 0 35 | while len(suggested_sentences) < n: 36 | if count > n or count >= len(intent_sentences_sample): 37 | break 38 | generated_sentences = SentenceSuggestion().get_suggestions( 39 | preprocessor1.preprocess_text(intent_sentences_sample[count]), 40 | percentage_to_replace, 41 | random.randint(int(1 * factor), int(3 * factor)) 42 | ) 43 | for generated_sentence in generated_sentences: 44 | preprocessed_sentence = preprocessor2.preprocess_text(generated_sentence) 45 | if preprocessed_sentence not in intent_sentences: 46 | suggested_sentences.append(preprocessed_sentence) 47 | count += 1 48 | 49 | suggested_sentences = suggested_sentences[:n] 50 | 51 | return OrderedDict([("intent", intent), ("suggested_sentences", suggested_sentences)]) 52 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/parse.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from rasa.nlu import __version__ as rasa_version 3 | 4 | 5 | def format_parse_output( 6 | repository_version, r, repository_authorization 7 | ): # pragma: no cover 8 | intent = r.get("intent", None) 9 | intent_ranking = r.get("intent_ranking", []) 10 | entities = r.get("entities", []) 11 | 12 | out = OrderedDict( 13 | [ 14 | ("intent", intent), 15 | ("intent_ranking", intent_ranking), 16 | ( 17 | "entities_list", 18 | list(OrderedDict.fromkeys([x.get("entity", None) for x in entities])), 19 | ), 20 | ("entities", entities), 21 | ] 22 | ) 23 | return out 24 | 25 | 26 | def parse_text( 27 | repository_version, 28 | repository_authorization, 29 | interpreter_manager, 30 | text, 31 | rasa_format=False, 32 | use_cache=True, 33 | ): 34 | interpreter = interpreter_manager.get_interpreter( 35 | repository_version, repository_authorization, rasa_version, use_cache 36 | ) 37 | r = interpreter.parse(text) 38 | 39 | if rasa_format: 40 | return r 41 | 42 | return format_parse_output(repository_version, r, repository_authorization) 43 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/sentence_suggestion.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from bothub_nlp_celery.app import nlp_language 3 | import random 4 | import numpy as np 5 | 6 | 7 | class SentenceSuggestion: 8 | def __init__(self): 9 | self.nlp = nlp_language 10 | self.to_replace_tags = ["VERB", "NOUN", "ADJ", "ADV", "INTJ", "PROPN"] 11 | self.n_highest = 50 12 | self.row2key = {row: key for key, row in self.nlp.vocab.vectors.key2row.items()} 13 | 14 | def most_similar(self, input_words, *, batch_size=1024, topn=1, sort=True): 15 | words_similar_list = [] 16 | similar_list = [] 17 | words = input_words 18 | if isinstance(input_words, str): 19 | words = [input_words] 20 | for word in words: 21 | input_vector = self.nlp(word).vector.reshape( 22 | 1, self.nlp.vocab.vectors.shape[1] 23 | ) 24 | best_rows = np.zeros((1, self.n_highest), dtype="i") 25 | scores = np.zeros((1, self.n_highest), dtype="f") 26 | 27 | # Work in batches, to avoid memory problems. 
28 | for i in range(0, input_vector.shape[0], batch_size): 29 | batch = input_vector[i : i + batch_size] 30 | batch_norms = np.linalg.norm(batch, axis=1, keepdims=True) 31 | batch_norms[batch_norms == 0] = 1 32 | batch /= batch_norms 33 | sims = np.dot(batch, self.nlp.vocab.vectors.data.T) 34 | best_rows[i : i + batch_size] = np.argpartition( 35 | sims, -self.n_highest, axis=1 36 | )[ 37 | :, -self.n_highest : 38 | ] # get n_highest scores rows in O(n) 39 | scores[i : i + batch_size] = np.partition( 40 | sims, -self.n_highest, axis=1 41 | )[ 42 | :, -self.n_highest : 43 | ] # get n_highest scores in O(n) 44 | 45 | # sort the n_highest scores and best_rows 46 | if sort and topn >= 2: 47 | sorted_index = ( 48 | np.arange(scores.shape[0])[:, None][i : i + batch_size], 49 | np.argsort(scores[i : i + batch_size], axis=1)[:, ::-1], 50 | ) 51 | scores[i : i + batch_size] = scores[sorted_index] 52 | best_rows[i : i + batch_size] = best_rows[sorted_index] 53 | 54 | scores = np.around(scores, decimals=4, out=scores) 55 | scores = np.clip(scores, a_min=-1, a_max=1, out=scores) 56 | 57 | # get similar list of tuple (word, score) only if both input and candidate word is lower or large case 58 | similar_list = [] 59 | for i in range(self.n_highest): 60 | row = best_rows[0][i] 61 | score = scores[0][i] 62 | candidate_word_vocab = self.nlp.vocab[self.row2key[row]] 63 | candidate_word = candidate_word_vocab.text 64 | if ( 65 | candidate_word_vocab.is_lower == word.islower() 66 | and candidate_word != word 67 | ): 68 | similar_list.append((candidate_word, score)) 69 | if len(similar_list) >= topn: 70 | break 71 | words_similar_list.append(similar_list) 72 | if isinstance(input_words, str): 73 | return similar_list 74 | return words_similar_list 75 | 76 | @staticmethod # get the indexes of the replaceable words 77 | def get_words_to_replace_idx(similar_words_json, word_list, percentage_to_replace): 78 | percentage_to_replace = np.clip(percentage_to_replace, 0, 1) 79 | word_list_size = len(word_list) 80 | for idx in list(similar_words_json): 81 | if len(similar_words_json[idx].get("similar_words")) == 0: 82 | del similar_words_json[idx] 83 | words_to_replace_idx = [] 84 | # number of words to replace 85 | n_words_to_replace = int(word_list_size * percentage_to_replace) 86 | replaceable_idx_list = list(similar_words_json) 87 | if n_words_to_replace < len(replaceable_idx_list): 88 | to_replace_idx_list = random.sample( 89 | range(len(replaceable_idx_list)), n_words_to_replace 90 | ) 91 | for idx in to_replace_idx_list: 92 | words_to_replace_idx.append(replaceable_idx_list[idx]) 93 | else: 94 | words_to_replace_idx = replaceable_idx_list 95 | return words_to_replace_idx 96 | 97 | def similar_words_json(self, sentence): 98 | similar_words_json = {} 99 | word_list = sentence.split(" ") 100 | sentence_size = len(word_list) 101 | for i in range(sentence_size): 102 | try: 103 | word_pos = self.nlp(word_list[i])[0].pos_ 104 | word_json = { 105 | "word": word_list[i], 106 | "type": word_pos, 107 | "similar_words": [], 108 | } 109 | if word_pos in self.to_replace_tags: 110 | similar_words = self.most_similar(word_list[i], topn=6) 111 | similar_words_size = len(similar_words) 112 | for j in range(similar_words_size): 113 | nlp_similar = self.nlp(similar_words[j][0]) 114 | if len(nlp_similar) > 0 and nlp_similar[0].pos_ == word_pos: 115 | similar_json = { 116 | "word": str(similar_words[j][0]), 117 | "type": str(nlp_similar[0].pos_), 118 | "relevance": str(similar_words[j][1]), 119 | } 120 | 
word_json["similar_words"].append(similar_json) 121 | similar_words_json[i] = word_json 122 | except KeyError: 123 | pass 124 | return similar_words_json 125 | 126 | def get_suggestions(self, sentence, percentage_to_replace, n): # main method 127 | similar_words_json = self.similar_words_json(sentence) 128 | suggested_sentences = [] 129 | for _ in range(n): 130 | word_list = sentence.split(" ") 131 | words_to_replace_idx = self.get_words_to_replace_idx( 132 | similar_words_json, word_list, percentage_to_replace 133 | ) 134 | for replace_idx in words_to_replace_idx: 135 | similar_words_len = len( 136 | similar_words_json[replace_idx].get("similar_words") 137 | ) 138 | word_list[replace_idx] = ( 139 | similar_words_json[replace_idx] 140 | .get("similar_words")[random.randint(0, similar_words_len - 1)] 141 | .get("word") 142 | ) 143 | suggested_sentences.append(" ".join(word_list)) 144 | suggested_sentences = list(set(suggested_sentences)) # Remove duplicates 145 | return suggested_sentences 146 | 147 | 148 | def sentence_suggestion_text(text, percentage_to_replace, n): 149 | if nlp_language is None: 150 | return "spacy model not loaded in this language" 151 | if nlp_language.vocab.vectors_length == 0: 152 | return "language not supported for this feature" 153 | 154 | similar_sentences = SentenceSuggestion().get_suggestions( 155 | text, percentage_to_replace, n 156 | ) 157 | return OrderedDict([("text", text), ("suggested_sentences", similar_sentences)]) 158 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/word_suggestion.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from bothub_nlp_celery.app import nlp_language 3 | from bothub.shared.utils.preprocessing.preprocessing_factory import PreprocessingFactory 4 | import numpy as np 5 | 6 | 7 | class WordSuggestion: 8 | def __init__(self): 9 | self.nlp = nlp_language 10 | self.to_replace_tags = ["VERB", "NOUN", "ADJ", "ADV", "INTJ", "PROPN"] 11 | self.n_highest = 50 12 | self.row2key = {row: key for key, row in self.nlp.vocab.vectors.key2row.items()} 13 | 14 | def most_similar(self, word, *, batch_size=1024, topn=1, sort=True): 15 | input_vector = self.nlp(word).vector.reshape(1, self.nlp.vocab.vectors.shape[1]) 16 | best_rows = np.zeros((1, self.n_highest), dtype="i") 17 | scores = np.zeros((1, self.n_highest), dtype="f") 18 | 19 | # Work in batches, to avoid memory problems. 
20 | for i in range(0, input_vector.shape[0], batch_size): 21 | batch = input_vector[i : i + batch_size] 22 | batch_norms = np.linalg.norm(batch, axis=1, keepdims=True) 23 | batch_norms[batch_norms == 0] = 1 24 | batch /= batch_norms 25 | sims = np.dot(batch, self.nlp.vocab.vectors.data.T) 26 | best_rows[i : i + batch_size] = np.argpartition( 27 | sims, -self.n_highest, axis=1 28 | )[ 29 | :, -self.n_highest : 30 | ] # get n_highest scores rows in O(n) 31 | scores[i : i + batch_size] = np.partition(sims, -self.n_highest, axis=1)[ 32 | :, -self.n_highest : 33 | ] # get n_highest scores in O(n) 34 | 35 | # sort the n_highest scores and best_rows 36 | if sort and topn >= 2: 37 | sorted_index = ( 38 | np.arange(scores.shape[0])[:, None][i : i + batch_size], 39 | np.argsort(scores[i : i + batch_size], axis=1)[:, ::-1], 40 | ) 41 | scores[i : i + batch_size] = scores[sorted_index] 42 | best_rows[i : i + batch_size] = best_rows[sorted_index] 43 | 44 | scores = np.around(scores, decimals=4, out=scores) 45 | scores = np.clip(scores, a_min=-1, a_max=1, out=scores) 46 | 47 | # get similar list of tuple (word, score) only if both input and candidate word is lower or large case 48 | similar_list = [] 49 | for i in range(self.n_highest): 50 | row = best_rows[0][i] 51 | score = scores[0][i] 52 | candidate_word_vocab = self.nlp.vocab[self.row2key[row]] 53 | candidate_word = candidate_word_vocab.text 54 | if ( 55 | candidate_word_vocab.is_lower == word.islower() 56 | and candidate_word != word 57 | ): 58 | similar_list.append((candidate_word, str(score))) 59 | if len(similar_list) >= topn: 60 | break 61 | return similar_list 62 | 63 | 64 | def word_suggestion_text(text, n): 65 | if nlp_language is None: 66 | return "spacy model not loaded in this language" 67 | if nlp_language.vocab.vectors_length == 0: 68 | return "language not supported for this feature" 69 | 70 | preprocessor = PreprocessingFactory(remove_accent=False).factory() 71 | text = preprocessor.preprocess_text(text) 72 | similar_words = WordSuggestion().most_similar(text, topn=n) 73 | preprocessor = PreprocessingFactory(remove_accent=True).factory() 74 | similar_words = [(preprocessor.preprocess_text(word[0]), word[1]) for word in similar_words] 75 | 76 | return OrderedDict([("text", text), ("similar_words", similar_words)]) 77 | -------------------------------------------------------------------------------- /bothub/nlu_worker/task/words_distribution.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, OrderedDict 2 | 3 | from bothub.shared.utils.helpers import get_examples_request 4 | 5 | 6 | def words_distribution_text(repository_version, language, repository_authorization): 7 | examples_list = get_examples_request(repository_version, repository_authorization) 8 | 9 | all_intents = [] # the list of all words 10 | intents = {} # all the words separated by intent 11 | all_frequencies = {} # the count of all words 12 | frequencies = {} # the count of words separated by intent 13 | 14 | for example in examples_list: 15 | text = example.get("text") 16 | intent = example.get("intent") 17 | for word in text.split(): 18 | all_intents.append(word.lower()) 19 | if intent in intents: 20 | intents[intent].append(word.lower()) 21 | else: 22 | intents[intent] = [word.lower()] 23 | 24 | all_frequencies = Counter(all_intents) 25 | 26 | for intent in intents: 27 | frequencies[intent] = Counter(intents[intent]) 28 | 29 | for intent in frequencies: 30 | for n_tuple in 
frequencies[intent].most_common(): 31 | word = n_tuple[0] 32 | try: 33 | frequencies[intent][word] = ( 34 | frequencies[intent][word] / all_frequencies[word] * 100 35 | ) 36 | except ZeroDivisionError: # pragma: no cover 37 | continue # pragma: no cover 38 | 39 | ordered_frequencies = {} 40 | 41 | for intent in frequencies: 42 | if intent not in ordered_frequencies: 43 | ordered_frequencies[intent] = OrderedDict() 44 | for n_tuple in frequencies[intent].most_common(): 45 | word = n_tuple[0] 46 | ordered_frequencies[intent][word] = frequencies[intent][word] 47 | 48 | return {"words": ordered_frequencies} 49 | -------------------------------------------------------------------------------- /bothub/shared/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/__init__.py -------------------------------------------------------------------------------- /bothub/shared/settings.py: -------------------------------------------------------------------------------- 1 | from decouple import config 2 | 3 | # Period of time (seconds) the worker will look for idle interpreters to free space 4 | WORKER_CACHE_CLEANING_PERIOD = config( 5 | "WORKER_CACHE_CLEANING_PERIOD", cast=float, default=3*3600 6 | ) 7 | # Idle limit of time (seconds) the interpreter will be cached 8 | INTERPRETER_CACHE_IDLE_LIMIT = config( 9 | "INTERPRETER_CACHE_IDLE_LIMIT", cast=float, default=24*3600 10 | ) 11 | # Minimum number of sentences to start decreasing number of epochs 12 | DYNAMIC_EPOCHS_THRESHOLD = config( 13 | "DYNAMIC_EPOCHS_THRESHOLD", cast=int, default=10000 14 | ) 15 | -------------------------------------------------------------------------------- /bothub/shared/train.py: -------------------------------------------------------------------------------- 1 | from tempfile import mkdtemp 2 | import os 3 | import logging 4 | from rasa.nlu import __version__ as rasa_version 5 | from rasa.nlu.model import Trainer 6 | from rasa.nlu.training_data import Message, TrainingData 7 | from rasa.nlu.components import ComponentBuilder 8 | 9 | from bothub.shared.utils.poke_logging import PokeLogging 10 | from bothub.shared.utils.backend import backend 11 | from bothub.shared.utils.helpers import get_examples_request 12 | from bothub.shared.utils.persistor import BothubPersistor 13 | from bothub.shared.utils.pipeline_builder import PipelineBuilder 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def intersection(lst1, lst2): 19 | lst3 = [value for value in lst1 if value in lst2] 20 | return lst3 21 | 22 | 23 | def train_update( 24 | repository_version_language_id, by_user, repository_authorization, from_queue="celery" 25 | ): # pragma: no cover 26 | 27 | update_request = backend().request_backend_start_training_nlu( 28 | repository_version_language_id, by_user, repository_authorization, from_queue 29 | ) 30 | 31 | examples_list = get_examples_request(repository_version_language_id, repository_authorization) 32 | 33 | with PokeLogging() as pl: 34 | try: 35 | examples = [] 36 | 37 | for example in examples_list: 38 | examples.append( 39 | Message.build( 40 | text=example.get("text"), 41 | intent=example.get("intent"), 42 | entities=example.get("entities"), 43 | ) 44 | ) 45 | 46 | update_request["dataset_size"] = len(examples) 47 | 48 | pipeline_builder = PipelineBuilder(update_request) 49 | pipeline_builder.print_pipeline() 50 | rasa_nlu_config = pipeline_builder.get_nlu_model() 51 
| 52 | trainer = Trainer(rasa_nlu_config, ComponentBuilder(use_cache=False)) 53 | training_data = TrainingData( 54 | training_examples=examples, lookup_tables=None 55 | ) 56 | 57 | trainer.train(training_data) 58 | 59 | persistor = BothubPersistor( 60 | repository_version_language_id, repository_authorization, rasa_version 61 | ) 62 | trainer.persist( 63 | mkdtemp(), 64 | persistor=persistor, 65 | fixed_model_name=f"{update_request.get('repository_version')}_" 66 | f"{update_request.get('total_training_end') + 1}_" 67 | f"{update_request.get('language')}", 68 | ) 69 | except Exception as e: 70 | logger.exception(e) 71 | backend().request_backend_trainfail_nlu( 72 | repository_version_language_id, repository_authorization 73 | ) 74 | raise e 75 | finally: 76 | backend().request_backend_traininglog_nlu( 77 | repository_version_language_id, pl.getvalue(), repository_authorization 78 | ) 79 | -------------------------------------------------------------------------------- /bothub/shared/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/utils/__init__.py -------------------------------------------------------------------------------- /bothub/shared/utils/backend.py: -------------------------------------------------------------------------------- 1 | import bothub_backend 2 | import argparse 3 | from decouple import config 4 | 5 | 6 | def backend(): 7 | PARSER = argparse.ArgumentParser() 8 | 9 | # Input Arguments 10 | PARSER.add_argument( 11 | "--base_url", help="Base URL API Engine.", type=str, default=None 12 | ) 13 | 14 | ARGUMENTS, _ = PARSER.parse_known_args() 15 | 16 | return bothub_backend.get_backend( 17 | "bothub_backend.bothub.BothubBackend", 18 | ARGUMENTS.base_url 19 | if ARGUMENTS.base_url 20 | else config("BOTHUB_ENGINE_URL", default="https://api.bothub.it"), 21 | ) 22 | -------------------------------------------------------------------------------- /bothub/shared/utils/helpers.py: -------------------------------------------------------------------------------- 1 | from bothub.shared.utils.backend import backend 2 | 3 | 4 | ALGORITHM_TO_LANGUAGE_MODEL = { 5 | "neural_network_internal": None, 6 | "neural_network_external": "SPACY", 7 | "transformer_network_diet": None, 8 | "transformer_network_diet_word_embedding": "SPACY", 9 | "transformer_network_diet_bert": "BERT", 10 | } 11 | 12 | 13 | def get_examples_request(repository_version_language, repository_authorization, intent=""): # pragma: no cover 14 | 15 | start_examples = backend().request_backend_get_examples( 16 | repository_version_language, None, repository_authorization, intent=intent 17 | ) 18 | 19 | examples = start_examples.get("results") 20 | page = start_examples.get("next") 21 | 22 | if page: 23 | while True: 24 | request_examples_page = backend().request_backend_get_examples( 25 | repository_version_language, page, repository_authorization, intent=intent 26 | ) 27 | 28 | examples += request_examples_page.get("results") 29 | 30 | if request_examples_page.get("next") is None: 31 | break 32 | 33 | page = request_examples_page.get("next") 34 | 35 | return examples 36 | 37 | 38 | def examples_request(repository_authorization, language, repository_version): # pragma: no cover 39 | 40 | start_examples = backend().request_backend_examples( 41 | repository_authorization, language, repository_version, page=None 42 | ) 43 | 44 | examples = start_examples.get("results") 45 
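    # The backend pages its results: the loop below follows the "next" cursor and
    # keeps extending `examples` until the backend reports no further page.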
| page = start_examples.get("next") 46 | 47 | if page: 48 | while True: 49 | request_examples_page = backend().request_backend_get_examples( 50 | repository_authorization, language, repository_version, page=page 51 | ) 52 | 53 | examples += request_examples_page.get("results") 54 | 55 | if request_examples_page.get("next") is None: 56 | break 57 | 58 | page = request_examples_page.get("next") 59 | 60 | return examples 61 | 62 | 63 | def get_algorithm_info(): 64 | # todo: get data from config file / populate languages 65 | 66 | # Sorted by priority 67 | # last element -> default algorithm 68 | return [ 69 | {"name": "transformer_network_diet_bert", "supported_languages": ["all"]}, 70 | {"name": "transformer_network_diet_word_embedding", "supported_languages": []}, 71 | {"name": "transformer_network_diet", "supported_languages": ["all"]}, 72 | ] 73 | 74 | 75 | def choose_best_algorithm(language): 76 | supported_algorithms = get_algorithm_info() 77 | 78 | for model in supported_algorithms[:-1]: 79 | if language in model["supported_languages"]: 80 | return model["name"] 81 | 82 | # default algorithm 83 | return supported_algorithms[len(supported_algorithms) - 1]["name"] 84 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/en/country.txt: -------------------------------------------------------------------------------- 1 | Afghanistan 2 | Albania 3 | Algeria 4 | Andorra 5 | Angola 6 | Antigua and Deps 7 | Argentina 8 | Armenia 9 | Australia 10 | Austria 11 | Azerbaijan 12 | Bahamas 13 | Bahrain 14 | Bangladesh 15 | Barbados 16 | Belarus 17 | Belgium 18 | Belize 19 | Benin 20 | Bhutan 21 | Bolivia 22 | Bosnia Herzegovina 23 | Botswana 24 | Brazil 25 | Brunei 26 | Bulgaria 27 | Burkina 28 | Burundi 29 | Cambodia 30 | Cameroon 31 | Canada 32 | Cape Verde 33 | Central African Republic 34 | Chad 35 | Chile 36 | China 37 | Colombia 38 | Comoros 39 | Congo 40 | Congo 41 | Costa Rica 42 | Croatia 43 | Cuba 44 | Cyprus 45 | Czech Republic 46 | Denmark 47 | Djibouti 48 | Dominica 49 | Dominican Republic 50 | East Timor 51 | Ecuador 52 | Egypt 53 | El Salvador 54 | Equatorial Guinea 55 | Eritrea 56 | Estonia 57 | Ethiopia 58 | Fiji 59 | Finland 60 | France 61 | Gabon 62 | Gambia 63 | Georgia 64 | Germany 65 | Ghana 66 | Greece 67 | Grenada 68 | Guatemala 69 | Guinea 70 | Guinea-Bissau 71 | Guyana 72 | Haiti 73 | Honduras 74 | Hungary 75 | Iceland 76 | India 77 | Indonesia 78 | Iran 79 | Iraq 80 | Ireland 81 | Israel 82 | Italy 83 | Ivory Coast 84 | Jamaica 85 | Japan 86 | Jordan 87 | Kazakhstan 88 | Kenya 89 | Kiribati 90 | Korea North 91 | Korea South 92 | Kosovo 93 | Kuwait 94 | Kyrgyzstan 95 | Laos 96 | Latvia 97 | Lebanon 98 | Lesotho 99 | Liberia 100 | Libya 101 | Liechtenstein 102 | Lithuania 103 | Luxembourg 104 | Macedonia 105 | Madagascar 106 | Malawi 107 | Malaysia 108 | Maldives 109 | Mali 110 | Malta 111 | Marshall Islands 112 | Mauritania 113 | Mauritius 114 | Mexico 115 | Micronesia 116 | Moldova 117 | Monaco 118 | Mongolia 119 | Montenegro 120 | Morocco 121 | Mozambique 122 | Myanmar 123 | Namibia 124 | Nauru 125 | Nepal 126 | Netherlands 127 | New Zealand 128 | Nicaragua 129 | Niger 130 | Nigeria 131 | Norway 132 | Oman 133 | Pakistan 134 | Palau 135 | Panama 136 | Papua New Guinea 137 | Paraguay 138 | Peru 139 | Philippines 140 | Poland 141 | Portugal 142 | Qatar 143 | Romania 144 | Russian Federation 145 | Rwanda 146 | St Kitts and Nevis 147 | St Lucia 148 | Saint Vincent and the Grenadines 149 | Samoa 150 | San Marino 
151 | Sao Tome and Principe 152 | Saudi Arabia 153 | Senegal 154 | Serbia 155 | Seychelles 156 | Sierra Leone 157 | Singapore 158 | Slovakia 159 | Slovenia 160 | Solomon Islands 161 | Somalia 162 | South Africa 163 | South Sudan 164 | Spain 165 | Sri Lanka 166 | Sudan 167 | Suriname 168 | Swaziland 169 | Sweden 170 | Switzerland 171 | Syria 172 | Taiwan 173 | Tajikistan 174 | Tanzania 175 | Thailand 176 | Togo 177 | Tonga 178 | Trinidad and Tobago 179 | Tunisia 180 | Turkey 181 | Turkmenistan 182 | Tuvalu 183 | Uganda 184 | Ukraine 185 | United Arab Emirates 186 | United Kingdom 187 | United States 188 | Uruguay 189 | Uzbekistan 190 | Vanuatu 191 | Vatican City 192 | Venezuela 193 | Vietnam 194 | Yemen 195 | Zambia 196 | Zimbabwe 197 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/en/email.txt: -------------------------------------------------------------------------------- 1 | regex [\w\-.]+@([\w\-]+\.)+[\w\-]{2,4} 2 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/brand.txt: -------------------------------------------------------------------------------- 1 | sorriso 2 | omo 3 | inter 4 | sazon 5 | blu 6 | volvo 7 | vigor 8 | sul america 9 | lanix 10 | estacio 11 | chery 12 | itau 13 | smiles 14 | seda 15 | ford 16 | rexona 17 | kfc 18 | arezzo 19 | vitarella 20 | ikea 21 | honda 22 | siemens 23 | pampers 24 | philips 25 | cartier 26 | petrobras 27 | santander 28 | tirol 29 | embratel 30 | fanta 31 | msi 32 | lego 33 | skol 34 | mercedes-benz 35 | lg 36 | brahma 37 | lancome 38 | bauducco 39 | lux 40 | ninho 41 | nescau 42 | hp 43 | club social 44 | orange 45 | zte 46 | renault 47 | qualy 48 | fox 49 | home depot 50 | acer 51 | pwc 52 | accenture 53 | netflix 54 | ebay 55 | santa amalia 56 | lojas americanas 57 | ypioca 58 | at&t 59 | t-mobile 60 | land rover 61 | caterpillar 62 | sap 63 | net 64 | danone 65 | huawei 66 | starbucks 67 | allianz 68 | liza 69 | piraque 70 | uniqlo 71 | ponto frio 72 | chase 73 | cisco 74 | microsoft 75 | rolex 76 | fleury 77 | ype 78 | porsche 79 | lexus 80 | nestle 81 | dell 82 | seara 83 | zara 84 | marlboro 85 | marilan 86 | sbt 87 | intel 88 | porto seguro 89 | sadia 90 | tang 91 | natura 92 | asus 93 | totvs 94 | itambe 95 | chevrolet 96 | cielo 97 | colgate 98 | amazon 99 | nike 100 | ibm 101 | quero 102 | limpol 103 | ipiranga 104 | chanel 105 | gucci 106 | caixa 107 | santa clara 108 | american express 109 | hsbc 110 | renner 111 | italac 112 | marata 113 | perdigao 114 | hyundai 115 | arisco 116 | elege 117 | personal 118 | peugeot 119 | nubank 120 | iguatemi 121 | fiat 122 | hering 123 | htc 124 | paypal 125 | john deere 126 | mcdonald's 127 | ups 128 | vivo 129 | tixan 130 | assai 131 | google 132 | fedex 133 | budweiser 134 | bohemia 135 | toyota 136 | mabel 137 | sony 138 | h&m 139 | lowe's 140 | espn 141 | nissan 142 | piracanjuba 143 | havaianas 144 | oracle 145 | cvs 146 | basf 147 | dove 148 | bombril 149 | gol 150 | pepsi 151 | samsung 152 | audi 153 | miojo 154 | corona 155 | sulamerica 156 | palmolive 157 | magazine luiza 158 | multiplus 159 | hellmann's 160 | bradesco 161 | volkswagen 162 | magalu 163 | walmart 164 | droga raia 165 | casas bahia 166 | globo 167 | disney 168 | kia 169 | veja 170 | nokia 171 | costco 172 | heineken 173 | citi 174 | buscape 175 | lenovo 176 | bank of america 177 | banco do brasil 178 | adobe 179 | hermes 180 | frito-lay 181 | b3 182 | visa 183 | deloitte 184 | 
nescafe 185 | suvinil 186 | extra 187 | citroen 188 | blackberry 189 | cvc 190 | apple 191 | riachuelo 192 | localiza 193 | xiaomi 194 | rbc 195 | tigre 196 | panco 197 | coca-cola 198 | btg pactual 199 | soya 200 | amil 201 | louis vuitton 202 | camponesa 203 | pao de acucar 204 | facebook 205 | antarctica 206 | gillette 207 | atacadao 208 | netshoes 209 | red bull 210 | verizon 211 | mastercard 212 | bmw 213 | ge 214 | jeep 215 | minuano 216 | schin 217 | mitsubishi 218 | anhanguera 219 | alcatel 220 | l'oreal 221 | sony ericsson 222 | motorola 223 | knorr 224 | dorflex 225 | nivea 226 | kellogg's 227 | drogasil 228 | adidas 229 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/cep.txt: -------------------------------------------------------------------------------- 1 | regex [0-9]{5}-?[0-9]{3} 2 | 3 | 4 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/country.txt: -------------------------------------------------------------------------------- 1 | Afeganistão 2 | Albânia 3 | Argélia 4 | Andorra 5 | Angola 6 | Antigua e Deps 7 | Argentina 8 | Armênia 9 | Austrália 10 | Áustria 11 | Azerbaijão 12 | Bahamas 13 | Bahrain 14 | Bangladesh 15 | Barbados 16 | Bielo-Rússia 17 | Bélgica 18 | Belize 19 | Benin 20 | Butão 21 | Bolívia 22 | Bósnia e Herzegovina 23 | Botswana 24 | Brasil 25 | Brunei 26 | Bulgária 27 | Burkina 28 | Burundi 29 | Camboja 30 | Camarões 31 | Canadá 32 | cabo Verde 33 | Republica Centro-Africano 34 | Chade 35 | Chile 36 | China 37 | Colômbia 38 | Comores 39 | Congo 40 | Congo 41 | Costa Rica 42 | Croácia 43 | Cuba 44 | Chipre 45 | República Checa 46 | Dinamarca 47 | Djibouti 48 | Dominica 49 | República Dominicana 50 | Timor Leste 51 | Equador 52 | Egito 53 | El Salvador 54 | Guiné Equatorial 55 | Eritreia 56 | Estônia 57 | Etiópia 58 | Fiji 59 | Finlândia 60 | França 61 | Gabão 62 | Gâmbia 63 | Georgia 64 | Alemanha 65 | Gana 66 | Grécia 67 | Grenada 68 | Guatemala 69 | Guiné 70 | Guiné-bissau 71 | Guiana 72 | Haiti 73 | Honduras 74 | Hungria 75 | Islândia 76 | Índia 77 | Indonésia 78 | Irã 79 | Iraque 80 | Irlanda 81 | Israel 82 | Itália 83 | Costa do Marfim 84 | Jamaica 85 | Japão 86 | Jordânia 87 | Cazaquistão 88 | Quênia 89 | Kiribati 90 | Coreia do Norte 91 | Coreia do Sul 92 | Kosovo 93 | Kuwait 94 | Quirguistão 95 | Laos 96 | Letônia 97 | Líbano 98 | Lesoto 99 | Libéria 100 | Líbia 101 | Liechtenstein 102 | Lituânia 103 | Luxemburgo 104 | Macedonia 105 | Madagáscar 106 | Malawi 107 | Malásia 108 | Maldivas 109 | Mali 110 | Malta 111 | Ilhas Marshall 112 | Mauritânia 113 | Maurício 114 | México 115 | Micronésia 116 | Moldova 117 | Mônaco 118 | Mongólia 119 | Montenegro 120 | Marrocos 121 | Moçambique 122 | Mianmar 123 | Namibia 124 | Nauru 125 | Nepal 126 | Países Baixos 127 | Nova Zelândia 128 | Nicarágua 129 | Níger 130 | Nigéria 131 | Noruega 132 | Omã 133 | Paquistão 134 | Palau 135 | Panamá 136 | Papua Nova Guiné 137 | Paraguai 138 | Peru 139 | Filipinas 140 | Polônia 141 | Portugal 142 | Catar 143 | Romênia 144 | Federação Russa 145 | Ruanda 146 | São Cristóvão e Neves 147 | Santa Lúcia 148 | São Vicente e Granadinas 149 | Samoa 150 | San Marino 151 | São Tomé e Príncipe 152 | Arábia Saudita 153 | Senegal 154 | Sérvia 155 | Seychelles 156 | Serra Leoa 157 | Cingapura 158 | Eslováquia 159 | Eslovênia 160 | Ilhas Salomão 161 | Somália 162 | África do Sul 163 | Sudão do Sul 164 | Espanha 165 | Sri Lanka 166 | Sudão 
167 | Suriname 168 | Suazilândia 169 | Suécia 170 | Suíça 171 | Síria 172 | Taiwan 173 | Tajiquistão 174 | Tanzânia 175 | Tailândia 176 | Togo 177 | Tonga 178 | Trinidad e Tobago 179 | Tunísia 180 | Peru 181 | Turcomenistão 182 | Tuvalu 183 | Uganda 184 | Ucrânia 185 | Emirados Árabes Unidos 186 | Reino Unido 187 | Estados Unidos 188 | Uruguai 189 | Uzbequistão 190 | Vanuatu 191 | Vaticano 192 | Venezuela 193 | Vietnã 194 | Iémen 195 | Zâmbia 196 | Zimbábue 197 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/cpf.txt: -------------------------------------------------------------------------------- 1 | regex [0-9]{3}.[0-9]{3}.[0-9]{3}-[0-9]{2} 2 | regex [0-9]{11} 3 | -------------------------------------------------------------------------------- /bothub/shared/utils/lookup_tables/pt_br/email.txt: -------------------------------------------------------------------------------- 1 | regex [\w\-.]+@([\w\-]+\.)+[\w\-]{2,4} 2 | -------------------------------------------------------------------------------- /bothub/shared/utils/persistor.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import bothub_backend 3 | import argparse 4 | from tempfile import NamedTemporaryFile 5 | 6 | import requests 7 | from rasa.nlu.persistor import Persistor 8 | from decouple import config 9 | 10 | 11 | class BothubPersistor(Persistor): 12 | def __init__( 13 | self, 14 | repository_version=None, 15 | repository_authorization=None, 16 | rasa_version=None, 17 | *args, 18 | **kwargs 19 | ): 20 | super().__init__(*args, **kwargs) 21 | self.repository_version = repository_version 22 | self.repository_authorization = repository_authorization 23 | self.rasa_version = rasa_version 24 | 25 | def backend(self): 26 | PARSER = argparse.ArgumentParser() 27 | 28 | # Input Arguments 29 | PARSER.add_argument( 30 | "--base_url", help="Base URL API Engine.", type=str, default=None 31 | ) 32 | 33 | ARGUMENTS, _ = PARSER.parse_known_args() 34 | 35 | return bothub_backend.get_backend( 36 | "bothub_backend.bothub.BothubBackend", 37 | ARGUMENTS.base_url 38 | if ARGUMENTS.base_url 39 | else config("BOTHUB_ENGINE_URL", default="https://api.bothub.it"), 40 | ) 41 | 42 | def _persist_tar(self, filekey, tarname): 43 | with open(tarname, "rb") as tar_file: 44 | data = tar_file.read() 45 | 46 | self.backend().send_training_backend_nlu_persistor( 47 | self.repository_version, 48 | data, 49 | self.repository_authorization, 50 | self.rasa_version, 51 | ) 52 | 53 | def retrieve(self, model_name, target_path): 54 | tar_name = self._tar_name(model_name) 55 | 56 | train = self.backend().request_backend_parse_nlu_persistor( 57 | self.repository_version, self.repository_authorization, self.rasa_version 58 | ) 59 | 60 | if train.get("from_aws"): 61 | tar_data = requests.get(train.get("bot_data")).content 62 | else: 63 | tar_data = base64.b64decode(train.get("bot_data")) # pragma: no cover 64 | 65 | tar_file = NamedTemporaryFile(suffix=tar_name, delete=False) 66 | tar_file.write(tar_data) 67 | tar_file.close() 68 | 69 | self._decompress(tar_file.name, target_path) 70 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_builder.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable, Optional 2 | 3 | from bothub.shared import settings 4 | from bothub.shared.utils.helpers import 
ALGORITHM_TO_LANGUAGE_MODEL 5 | from bothub_nlp_celery import settings as celery_settings 6 | from bothub.shared.utils.rasa_components.registry import language_to_model 7 | from rasa.nlu.config import RasaNLUModelConfig 8 | 9 | 10 | class PipelineBuilder: 11 | def __init__(self, update): 12 | self.language = update.get("language") 13 | self.algorithm = update.get("algorithm") 14 | self.use_name_entities = update.get("use_name_entities") 15 | self.dataset_size = update.get("dataset_size") 16 | self.use_competing_intents = update.get("use_competing_intents") 17 | self.use_analyze_char = update.get("use_analyze_char") 18 | self.prebuilt_entities = update.get("prebuilt_entities", []) 19 | self.model = self._build_model_requirements() 20 | self.pipeline = self._build_pipeline() 21 | 22 | @staticmethod 23 | def _add_spacy_nlp() -> dict: 24 | return {"name": "bothub.shared.utils.pipeline_components.spacy_nlp.SpacyNLP"} 25 | 26 | @staticmethod 27 | def _add_whitespace_tokenizer() -> dict: 28 | return {"name": "WhitespaceTokenizer"} 29 | 30 | def _add_preprocessing(self) -> dict: 31 | return { 32 | "name": "bothub.shared.utils.pipeline_components.preprocessing.Preprocessing", 33 | "language": self.language, 34 | } 35 | 36 | @staticmethod 37 | def _add_regex_entity_extractor() -> dict: 38 | return { 39 | "name": "bothub.shared.utils.pipeline_components.regex_entity_extractor.RegexEntityExtractorCustom" 40 | } 41 | 42 | def _add_countvectors_featurizer(self) -> List[dict]: 43 | featurizers = [] 44 | 45 | if self.use_analyze_char: 46 | featurizers.append( 47 | { 48 | "name": "CountVectorsFeaturizer", 49 | "analyzer": "char_wb", 50 | "min_ngram": 3, 51 | "max_ngram": 3, 52 | } 53 | ) 54 | 55 | featurizers.append( 56 | {"name": "CountVectorsFeaturizer", "token_pattern": r"(?u)\b\w+\b"} 57 | ) 58 | 59 | return featurizers 60 | 61 | def _add_legacy_countvectors_featurizer(self) -> dict: 62 | if self.use_analyze_char: 63 | return { 64 | "name": "CountVectorsFeaturizer", 65 | "analyzer": "char_wb", 66 | "min_ngram": 3, 67 | "max_ngram": 3, 68 | } 69 | else: 70 | return {"name": "CountVectorsFeaturizer", "token_pattern": r"(?u)\b\w+\b"} 71 | 72 | def _add_microsoft_entity_extractor(self) -> dict: 73 | return { 74 | "name": "bothub.shared.utils.pipeline_components.microsoft_recognizers_extractor.MicrosoftRecognizersExtractor", 75 | "dimensions": self.prebuilt_entities, 76 | "language": self.language, 77 | } 78 | 79 | @staticmethod 80 | def _add_embedding_intent_classifier() -> dict: 81 | return { 82 | "name": "bothub.shared.utils.pipeline_components.diet_classifier.DIETClassifierCustom", 83 | "hidden_layers_sizes": {"text": [256, 128]}, 84 | "number_of_transformer_layers": 0, 85 | "weight_sparsity": 0, 86 | "intent_classification": True, 87 | "entity_recognition": True, 88 | "use_masked_language_model": False, 89 | "BILOU_flag": False, 90 | } 91 | 92 | @staticmethod 93 | def _epoch_factor_function1(examples_qnt: int, min_threshold: int) -> float: 94 | """ 95 | :param examples_qnt: Number of examples in dataset 96 | :param min_threshold: Minimum number of examples needed to have a factor > 1 97 | :return: Division factor of defined maximum epochs 98 | 99 | Example: 100 | min_threshold = 10000 101 | examples_qnt = 10000 -> (25*(10000-10000) + 100*10000)//10000 = 100,0 -> 100/100,0 = 1.00 (base case) 102 | examples_qnt = 15000 -> (25*(15000-10000) + 100*10000)//15000 = 75,00 -> 100/75,00 = 1,33 103 | examples_qnt = 30000 -> (25*(30000-10000) + 100*10000)//30000 = 50,00 -> 100/50,00 = 2,00 104 | examples_qnt = 
60000 -> (25*(60000-10000) + 100*10000)//60000 = 37,50 -> 100/37,50 = 2,66 105 | examples_qnt = 90000 -> (25*(90000-10000) + 100*10000)//90000 = 33,33 -> 100/33,33 = 3,00 106 | 107 | """ 108 | if examples_qnt <= min_threshold: 109 | return 1.0 110 | 111 | over_qnt = examples_qnt - min_threshold 112 | epochs_ratio = ((25*over_qnt) + (100*min_threshold)) / examples_qnt 113 | factor = 100 / epochs_ratio 114 | 115 | return factor 116 | 117 | def _calculate_epochs_number( 118 | self, 119 | max_epochs: int, 120 | factor_function: Callable[[int, int], float] 121 | ) -> int: 122 | """ 123 | :param max_epochs: Maximum number of epochs to be considered 124 | :param factor_function: Function that returns the division factor 125 | :return: Calculated number of epochs (max_epochs/calculated_factor) 126 | """ 127 | min_threshold = settings.DYNAMIC_EPOCHS_THRESHOLD 128 | 129 | if self.dataset_size < min_threshold: 130 | return max_epochs 131 | 132 | factor = factor_function(self.dataset_size, min_threshold) 133 | epochs = int(max_epochs // factor) 134 | return epochs 135 | 136 | def _add_diet_classifier(self, max_epochs=300, bert=False) -> dict: 137 | epochs = self._calculate_epochs_number(max_epochs, self._epoch_factor_function1) 138 | 139 | model = { 140 | "name": "bothub.shared.utils.pipeline_components.diet_classifier.DIETClassifierCustom", 141 | "entity_recognition": True, 142 | "BILOU_flag": False, 143 | "epochs": epochs, 144 | } 145 | 146 | if bert: 147 | model["hidden_layer_sizes"] = {"text": [256, 64]} 148 | 149 | return model 150 | 151 | def _legacy_internal_config(self) -> List[dict]: 152 | partial_pipeline = [ 153 | self._add_whitespace_tokenizer(), # Tokenizer 154 | self._add_legacy_countvectors_featurizer(), # Featurizer 155 | self._add_embedding_intent_classifier(), # Intent Classifier 156 | ] 157 | return partial_pipeline 158 | 159 | def _legacy_external_config(self) -> List[dict]: 160 | partial_pipeline = [ 161 | {"name": "SpacyTokenizer"}, # Tokenizer 162 | {"name": "SpacyFeaturizer"}, # Spacy Featurizer 163 | self._add_legacy_countvectors_featurizer(), # Bag of Words Featurizer 164 | self._add_embedding_intent_classifier(), # intent classifier 165 | ] 166 | return partial_pipeline 167 | 168 | def _transformer_network_diet_config(self) -> List[dict]: 169 | partial_pipeline = [self._add_whitespace_tokenizer()] 170 | 171 | # partial_pipeline.append(add_regex_entity_extractor()) 172 | # if self.prebuilt_entities: 173 | # partial_pipeline.append(add_microsoft_entity_extractor(update)) # Microsoft Entity Extractor) 174 | partial_pipeline.extend( 175 | self._add_countvectors_featurizer() 176 | ) # Bag of Words Featurizer 177 | partial_pipeline.append( 178 | self._add_diet_classifier(max_epochs=150) 179 | ) # Intent Classifier 180 | 181 | return partial_pipeline 182 | 183 | def _transformer_network_diet_word_embedding_config(self) -> List[dict]: 184 | partial_pipeline = [ 185 | {"name": "SpacyTokenizer"}, # Tokenizer 186 | {"name": "SpacyFeaturizer"}, # Spacy Featurizer 187 | ] 188 | partial_pipeline.extend( 189 | self._add_countvectors_featurizer() 190 | ) # Bag of Words Featurizer 191 | partial_pipeline.append( 192 | self._add_diet_classifier(max_epochs=200) 193 | ) # Intent Classifier 194 | 195 | return partial_pipeline 196 | 197 | def _transformer_network_diet_bert_config(self) -> List[dict]: 198 | partial_pipeline = [ 199 | { # NLP 200 | "name": "bothub.shared.utils.pipeline_components.hf_transformer.HFTransformersNLPCustom", 201 | "model_name": language_to_model.get(self.language, 
"bert_multilang"), 202 | }, 203 | { # Tokenizer 204 | "name": "bothub.shared.utils.pipeline_components.lm_tokenizer.LanguageModelTokenizerCustom", 205 | "intent_tokenization_flag": False, 206 | "intent_split_symbol": "_", 207 | }, 208 | { # Bert Featurizer 209 | "name": "bothub.shared.utils.pipeline_components.lm_featurizer.LanguageModelFeaturizerCustom" 210 | }, 211 | ] 212 | # partial_pipeline.append(add_regex_entity_extractor()) 213 | # if self.prebuilt_entities: 214 | # partial_pipeline.append(add_microsoft_entity_extractor(update)) # Microsoft Entity Extractor) 215 | 216 | partial_pipeline.extend( 217 | self._add_countvectors_featurizer() 218 | ) # Bag of Words Featurizers 219 | partial_pipeline.append( 220 | self._add_diet_classifier(max_epochs=100, bert=True) 221 | ) # Intent Classifier 222 | 223 | return partial_pipeline 224 | 225 | def _build_model_requirements(self) -> Optional[str]: 226 | model = ALGORITHM_TO_LANGUAGE_MODEL[self.algorithm] 227 | if model == "SPACY" and self.language not in celery_settings.AVAILABLE_SPACY_MODELS: 228 | model = None 229 | if self.algorithm == "neural_network_external": 230 | self.algorithm = "neural_network_internal" 231 | else: 232 | self.algorithm = "transformer_network_diet" 233 | 234 | return model 235 | 236 | def _build_pipeline(self) -> List[dict]: 237 | pipeline = [self._add_preprocessing()] 238 | 239 | if ( 240 | self.use_name_entities 241 | and self.algorithm != "transformer_network_diet_bert" 242 | and self.language in celery_settings.AVAILABLE_SPACY_MODELS 243 | ) or self.algorithm in [ 244 | "neural_network_external", 245 | "transformer_network_diet_word_embedding", 246 | ]: 247 | pipeline.append(self._add_spacy_nlp()) 248 | 249 | if self.algorithm == "neural_network_internal": 250 | pipeline.extend(self._legacy_internal_config()) 251 | elif self.algorithm == "neural_network_external": 252 | pipeline.extend(self._legacy_external_config()) 253 | elif self.algorithm == "transformer_network_diet_bert": 254 | pipeline.extend(self._transformer_network_diet_bert_config()) 255 | elif self.algorithm == "transformer_network_diet_word_embedding": 256 | pipeline.extend(self._transformer_network_diet_word_embedding_config()) 257 | else: 258 | pipeline.extend(self._transformer_network_diet_config()) 259 | 260 | if ( 261 | self.use_name_entities 262 | and self.algorithm != "transformer_network_diet_bert" 263 | and self.language in celery_settings.AVAILABLE_SPACY_MODELS 264 | ): 265 | pipeline.append({"name": "SpacyEntityExtractor"}) 266 | 267 | return pipeline 268 | 269 | def print_pipeline(self) -> None: 270 | import json 271 | 272 | print(f"Pipeline Config:") 273 | for component in self.pipeline: 274 | print(json.dumps(component, indent=2)) 275 | 276 | def get_nlu_model(self) -> RasaNLUModelConfig: 277 | return RasaNLUModelConfig( 278 | {"language": self.language, "pipeline": self.pipeline} 279 | ) 280 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/utils/pipeline_components/__init__.py -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/diet_classifier.py: -------------------------------------------------------------------------------- 1 | import rasa.utils.common as common_utils 2 | from 
rasa.nlu.classifiers.diet_classifier import DIETClassifier 3 | from rasa.constants import DOCS_URL_TRAINING_DATA_NLU 4 | from rasa.nlu.training_data import TrainingData 5 | from rasa.nlu.constants import ( 6 | ENTITIES, 7 | TOKENS_NAMES, 8 | TEXT, 9 | ENTITY_ATTRIBUTE_START, 10 | ENTITY_ATTRIBUTE_END, 11 | INTENT, 12 | ) 13 | 14 | 15 | class DIETClassifierCustom(DIETClassifier): 16 | @staticmethod 17 | def check_correct_entity_annotations(training_data: TrainingData) -> None: 18 | """Check if entities are correctly annotated in the training data. 19 | If the start and end values of an entity do not match any start and end values 20 | of the respected token, we define an entity as misaligned and log a warning. 21 | Args: 22 | training_data: The training data. 23 | """ 24 | for example in training_data.entity_examples: 25 | entity_boundaries = [ 26 | (entity[ENTITY_ATTRIBUTE_START], entity[ENTITY_ATTRIBUTE_END]) 27 | for entity in example.get(ENTITIES) 28 | ] 29 | token_start_positions = [ 30 | t.start for t in example.get(TOKENS_NAMES[TEXT], []) 31 | ] 32 | token_end_positions = [t.end for t in example.get(TOKENS_NAMES[TEXT], [])] 33 | 34 | for entity_start, entity_end in entity_boundaries: 35 | if ( 36 | entity_start not in token_start_positions 37 | or entity_end not in token_end_positions 38 | ): 39 | common_utils.raise_warning( 40 | f"Misaligned entity annotation in message '{example.text}' " 41 | f"with intent '{example.get(INTENT)}'. Make sure the start and " 42 | f"end values of entities in the training data match the token " 43 | f"boundaries (e.g. entities don't include trailing whitespaces " 44 | f"or punctuation).", 45 | docs=DOCS_URL_TRAINING_DATA_NLU, 46 | ) 47 | break 48 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/hf_transformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Dict, List, Text, Tuple, Optional 3 | 4 | import numpy as np 5 | import rasa.utils.train_utils as train_utils 6 | 7 | from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer 8 | from rasa.nlu.training_data import Message 9 | from rasa.nlu.tokenizers.tokenizer import Token 10 | from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class HFTransformersNLPCustom(HFTransformersNLP): 16 | """Utility Component for interfacing between Transformers library and Rasa OS. 17 | The transformers(https://github.com/huggingface/transformers) library 18 | is used to load pre-trained language models like BERT, GPT-2, etc. 19 | The component also tokenizes and featurizes dense featurizable attributes of each 20 | message. 21 | """ 22 | 23 | def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: 24 | super(HFTransformersNLP, self).__init__(component_config) 25 | 26 | self._load_model() 27 | self.whitespace_tokenizer = WhitespaceTokenizer() 28 | 29 | def _load_model(self) -> None: 30 | """Try loading the model""" 31 | 32 | from bothub.shared.utils.rasa_components.registry import ( 33 | model_class_dict, 34 | model_weights_defaults, 35 | model_tokenizer_dict, 36 | from_pt_dict, 37 | ) 38 | 39 | self.model_name = self.component_config["model_name"] 40 | 41 | if self.model_name not in model_class_dict: 42 | raise KeyError( 43 | f"'{self.model_name}' not a valid model name. 
Choose from " 44 | f"{str(list(model_class_dict.keys()))}or create" 45 | f"a new class inheriting from this class to support your model." 46 | ) 47 | 48 | self.model_weights = self.component_config["model_weights"] 49 | self.cache_dir = self.component_config["cache_dir"] 50 | 51 | if not self.model_weights: 52 | logger.info( 53 | f"Model weights not specified. Will choose default model weights: " 54 | f"{model_weights_defaults[self.model_name]}" 55 | ) 56 | self.model_weights = model_weights_defaults[self.model_name] 57 | 58 | logger.debug(f"Loading Tokenizer and Model for {self.model_name}") 59 | 60 | try: 61 | from bothub_nlp_celery.app import nlp_language 62 | 63 | self.tokenizer, self.model = nlp_language 64 | except TypeError: 65 | logger.info( 66 | f"Model could not be retrieved from celery cache " 67 | f"Loading model {self.model_name} in memory" 68 | ) 69 | self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained( 70 | model_weights_defaults[self.model_name], cache_dir=None 71 | ) 72 | self.model = model_class_dict[self.model_name].from_pretrained( 73 | self.model_name, 74 | cache_dir=None, 75 | from_pt=from_pt_dict.get(self.model_name, False), 76 | ) 77 | 78 | # Use a universal pad token since all transformer architectures do not have a 79 | # consistent token. Instead of pad_token_id we use unk_token_id because 80 | # pad_token_id is not set for all architectures. We can't add a new token as 81 | # well since vocabulary resizing is not yet supported for TF classes. 82 | # Also, this does not hurt the model predictions since we use an attention mask 83 | # while feeding input. 84 | self.pad_token_id = self.tokenizer.unk_token_id 85 | logger.debug(f"Loaded Tokenizer and Model for {self.model_name}") 86 | 87 | def _add_lm_specific_special_tokens( 88 | self, token_ids: List[List[int]] 89 | ) -> List[List[int]]: 90 | """Add language model specific special tokens which were used during their training. 91 | Args: 92 | token_ids: List of token ids for each example in the batch. 93 | Returns: 94 | Augmented list of token ids for each example in the batch. 95 | """ 96 | from bothub.shared.utils.rasa_components.registry import ( 97 | model_special_tokens_pre_processors, 98 | ) 99 | 100 | augmented_tokens = [ 101 | model_special_tokens_pre_processors[self.model_name](example_token_ids) 102 | for example_token_ids in token_ids 103 | ] 104 | return augmented_tokens 105 | 106 | def _lm_specific_token_cleanup( 107 | self, split_token_ids: List[int], token_strings: List[Text] 108 | ) -> Tuple[List[int], List[Text]]: 109 | """Clean up special chars added by tokenizers of language models. 110 | Many language models add a special char in front/back of (some) words. We clean up those chars as they are not 111 | needed once the features are already computed. 112 | Args: 113 | split_token_ids: List of token ids received as output from the language model specific tokenizer. 114 | token_strings: List of token strings received as output from the language model specific tokenizer. 115 | Returns: 116 | Cleaned up token ids and token strings. 117 | """ 118 | from bothub.shared.utils.rasa_components.registry import model_tokens_cleaners 119 | 120 | return model_tokens_cleaners[self.model_name](split_token_ids, token_strings) 121 | 122 | def _post_process_sequence_embeddings( 123 | self, sequence_embeddings: np.ndarray 124 | ) -> Tuple[np.ndarray, np.ndarray]: 125 | """Compute sentence level representations and sequence level representations for relevant tokens. 
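        Each model's registry post-processor derives a single sentence-level vector
        (e.g. from a classifier token or by pooling, depending on the architecture)
        and removes embeddings of any special tokens from the sequence output.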
126 | Args: 127 | sequence_embeddings: Sequence level dense features received as output from language model. 128 | Returns: 129 | Sentence and sequence level representations. 130 | """ 131 | 132 | from bothub.shared.utils.rasa_components.registry import ( 133 | model_embeddings_post_processors, 134 | ) 135 | 136 | sentence_embeddings = [] 137 | post_processed_sequence_embeddings = [] 138 | 139 | for example_embedding in sequence_embeddings: 140 | ( 141 | example_sentence_embedding, 142 | example_post_processed_embedding, 143 | ) = model_embeddings_post_processors[self.model_name](example_embedding) 144 | 145 | sentence_embeddings.append(example_sentence_embedding) 146 | post_processed_sequence_embeddings.append(example_post_processed_embedding) 147 | 148 | return ( 149 | np.array(sentence_embeddings), 150 | np.array(post_processed_sequence_embeddings), 151 | ) 152 | 153 | def _tokenize_example( 154 | self, message: Message, attribute: Text, model_size: int = 384 155 | ) -> Tuple[List[Token], List[int]]: 156 | """Tokenize a single message example. 157 | 158 | Many language models add a special char in front of (some) words and split words into 159 | sub-words. To ensure the entity start and end values matches the token values, 160 | tokenize the text first using the whitespace tokenizer. If individual tokens 161 | are split up into multiple tokens, we make sure that the start and end value 162 | of the first and last respective tokens stay the same. 163 | 164 | Args: 165 | message: Single message object to be processed. 166 | attribute: Property of message to be processed, one of ``TEXT`` or ``RESPONSE``. 167 | model_size: Limit of tokens the model can handle (BERT = 512) 168 | 169 | Returns: 170 | List of token strings and token ids for the corresponding attribute of the message. 171 | """ 172 | 173 | tokens_in = self.whitespace_tokenizer.tokenize(message, attribute) 174 | 175 | tokens_out = [] 176 | 177 | token_ids_out = [] 178 | 179 | for token in tokens_in: 180 | # use lm specific tokenizer to further tokenize the text 181 | split_token_ids, split_token_strings = self._lm_tokenize(token.text) 182 | 183 | split_token_ids, split_token_strings = self._lm_specific_token_cleanup( 184 | split_token_ids, split_token_strings 185 | ) 186 | 187 | if len(tokens_out) + len(split_token_strings) >= model_size: 188 | logger.warning( 189 | f"Sentence number of tokens overflowing model size. Skipping sentence exceeded tokens... " 190 | f"Sentence text: '{message.text[:50]} ...' 
" 191 | ) 192 | break 193 | 194 | token_ids_out += split_token_ids 195 | 196 | tokens_out += train_utils.align_tokens( 197 | split_token_strings, token.end, token.start 198 | ) 199 | 200 | return tokens_out, token_ids_out 201 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/lm_featurizer.py: -------------------------------------------------------------------------------- 1 | from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer 2 | from typing import List, Type 3 | from rasa.nlu.components import Component 4 | 5 | 6 | class LanguageModelFeaturizerCustom(LanguageModelFeaturizer): 7 | @classmethod 8 | def required_components(cls) -> List[Type[Component]]: 9 | return [] 10 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/lm_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Type 2 | from rasa.nlu.components import Component 3 | from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer 4 | 5 | 6 | class LanguageModelTokenizerCustom(LanguageModelTokenizer): 7 | """Tokenizer using transformer based language models. 8 | Uses the output of HFTransformersNLP component to set the tokens 9 | for dense featurizable attributes of each message object. 10 | """ 11 | 12 | @classmethod 13 | def required_components(cls) -> List[Type[Component]]: 14 | return [] 15 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/microsoft_recognizers_extractor.py: -------------------------------------------------------------------------------- 1 | from recognizers_suite import ( 2 | recognize_number, 3 | recognize_ordinal, 4 | recognize_age, 5 | recognize_currency, 6 | recognize_dimension, 7 | recognize_temperature, 8 | recognize_datetime, 9 | recognize_phone_number, 10 | recognize_email, 11 | ) 12 | from recognizers_suite import Culture 13 | 14 | from typing import Any, Dict, Text, Optional 15 | from rasa.nlu.constants import ENTITIES 16 | from rasa.nlu.config import RasaNLUModelConfig 17 | from rasa.nlu.extractors.extractor import EntityExtractor 18 | from rasa.nlu.training_data import Message 19 | 20 | recognizers = { 21 | "number": recognize_number, 22 | "ordinal": recognize_ordinal, 23 | "age": recognize_age, 24 | "currency": recognize_currency, 25 | "dimension": recognize_dimension, 26 | "temperature": recognize_temperature, 27 | "datetime": recognize_datetime, 28 | "phone_number": recognize_phone_number, 29 | "email": recognize_email, 30 | } 31 | 32 | cultures = { 33 | "zh": Culture.Chinese, 34 | "nl": Culture.Dutch, 35 | "en": Culture.English, 36 | "fr": Culture.French, 37 | "it": Culture.Italian, 38 | "jp": Culture.Japanese, 39 | "ko": Culture.Korean, 40 | "pt_br": Culture.Portuguese, 41 | "es": Culture.Spanish, 42 | "tr": Culture.Turkish, 43 | } 44 | 45 | 46 | def rasa_format(entity): 47 | return { 48 | "entity": entity.type_name, 49 | "start": entity.start, 50 | "end": entity.end + 1, 51 | "value": entity.text, 52 | } 53 | 54 | 55 | class MicrosoftRecognizersExtractor(EntityExtractor): 56 | defaults = {"dimensions": None} 57 | 58 | def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: 59 | super(MicrosoftRecognizersExtractor, self).__init__(component_config) 60 | self.language = self.component_config["language"] 61 | 62 | @classmethod 63 | def create( 
64 | cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig 65 | ) -> "MicrosoftRecognizersExtractor": 66 | return cls(component_config) 67 | 68 | def process(self, message: Message, **kwargs: Any) -> None: 69 | dimensions = self.component_config["dimensions"] 70 | extracted = self.add_extractor_name( 71 | self.extract_entities(message.text, self.language, dimensions) 72 | ) 73 | message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True) 74 | 75 | @staticmethod 76 | def extract_entities(user_input: str, language: str, selected_dimensions): 77 | entities_group = [] 78 | for dimension in recognizers: 79 | if dimension in selected_dimensions: 80 | entities = recognizers[dimension]( 81 | user_input, cultures.get(language, Culture.English) 82 | ) 83 | if entities: 84 | for entity in entities: 85 | entities_group.append(rasa_format(entity)) 86 | 87 | return entities_group 88 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/preprocessing.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Text, Dict, List, Type 2 | 3 | from rasa.nlu.components import Component 4 | from rasa.nlu.config import RasaNLUModelConfig 5 | from rasa.nlu.training_data import Message, TrainingData 6 | 7 | from bothub.shared.utils.preprocessing.preprocessing_factory import PreprocessingFactory 8 | 9 | 10 | class Preprocessing(Component): 11 | 12 | # Which components are required by this component. 13 | # Listed components should appear before the component itself in the pipeline. 14 | @classmethod 15 | def required_components(cls) -> List[Type[Component]]: 16 | """Specify which components need to be present in the pipeline.""" 17 | 18 | return [] 19 | 20 | # Defines the default configuration parameters of a component 21 | # these values can be overwritten in the pipeline configuration 22 | # of the model. The component should choose sensible defaults 23 | # and should be able to create reasonable results with the defaults. 
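    # The only value consumed here is "language"; PipelineBuilder fills it in
    # explicitly via _add_preprocessing(), e.g. (illustrative language code):
    #   {"name": "bothub.shared.utils.pipeline_components.preprocessing.Preprocessing",
    #    "language": "pt_br"}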
24 | defaults = {"language": None} 25 | 26 | def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: 27 | super(Preprocessing, self).__init__(component_config) 28 | self.language = self.component_config["language"] 29 | 30 | @classmethod 31 | def create( 32 | cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig 33 | ) -> "Preprocessing": 34 | return cls(component_config) 35 | 36 | def provide_context(self) -> Dict[Text, Any]: 37 | return {"language": self.language} 38 | 39 | @staticmethod 40 | def do_entities_overlap(entities: List[Dict]): 41 | sorted_entities = sorted(entities, key=lambda e: e["start"]) 42 | for i in range(len(sorted_entities) - 1): 43 | curr_ent = sorted_entities[i] 44 | next_ent = sorted_entities[i + 1] 45 | if ( 46 | next_ent["start"] < curr_ent["end"] 47 | and next_ent["entity"] != curr_ent["entity"] 48 | ): 49 | return True 50 | return False 51 | 52 | @staticmethod 53 | def remove_overlapping_entities(entities): 54 | new_entities = [] 55 | for i in range(len(entities)): 56 | overlap = False 57 | for j in range(len(entities)): 58 | if i != j and ( 59 | entities[i]["start"] >= entities[j]["start"] 60 | and entities[i]["end"] <= entities[j]["end"] 61 | ): 62 | overlap = True 63 | elif i != j and ( 64 | ( 65 | entities[i]["end"] > entities[j]["start"] 66 | and entities[i]["start"] < entities[j]["end"] 67 | ) 68 | and not ( 69 | entities[j]["start"] >= entities[i]["start"] 70 | and entities[j]["end"] <= entities[i]["end"] 71 | ) 72 | ): 73 | overlap = True 74 | if not overlap: 75 | new_entities.append(entities[i]) 76 | return new_entities 77 | 78 | def train( 79 | self, 80 | training_data: TrainingData, 81 | config: Optional[RasaNLUModelConfig] = None, 82 | **kwargs: Any, 83 | ) -> None: 84 | """Train this component""" 85 | not_repeated_phrases = set() 86 | size = len(training_data.training_examples) 87 | subtract_idx = 0 88 | language_preprocessor = PreprocessingFactory(self.language).factory() 89 | 90 | for idx in range(size): 91 | example = training_data.training_examples[idx - subtract_idx] 92 | 93 | if "entities" in example.data and self.do_entities_overlap( 94 | example.data["entities"] 95 | ): 96 | example.data["entities"] = self.remove_overlapping_entities( 97 | example.data["entities"] 98 | ) 99 | 100 | example = language_preprocessor.preprocess(example) 101 | 102 | if example.text in not_repeated_phrases: 103 | # remove example at this index from training_examples 104 | training_data.training_examples.pop(idx - subtract_idx) 105 | subtract_idx += 1 106 | else: 107 | not_repeated_phrases.add(example.text) 108 | training_data.training_examples[idx - subtract_idx].text = example.text 109 | 110 | def process(self, message: Message, **kwargs: Any) -> None: 111 | """Process an incoming message.""" 112 | 113 | language_preprocessor = PreprocessingFactory(self.language).factory() 114 | _message = language_preprocessor.preprocess(message) 115 | message.text = _message.text 116 | message.data = _message.data 117 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/regex_entity_extractor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | from typing import Any, Dict, List, Optional, Text, Union 5 | 6 | import rasa.utils.common 7 | import rasa.utils.io 8 | 9 | from rasa.nlu.model import Metadata 10 | from rasa.nlu.config import RasaNLUModelConfig 11 | from rasa.nlu.training_data 
import TrainingData 12 | from rasa.nlu.training_data.message import Message 13 | from rasa.nlu.constants import ( 14 | ENTITIES, 15 | ENTITY_ATTRIBUTE_VALUE, 16 | ENTITY_ATTRIBUTE_START, 17 | ENTITY_ATTRIBUTE_END, 18 | TEXT, 19 | ENTITY_ATTRIBUTE_TYPE, 20 | ) 21 | from rasa.nlu.extractors.extractor import EntityExtractor 22 | from ..preprocessing.preprocessing_base import PreprocessingBase 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | def read_lookup_table_file(lookup_table_file: Text) -> List[Text]: 28 | """Read the lookup table file. 29 | 30 | Args: 31 | lookup_table_file: the file path to the lookup table 32 | 33 | Returns: 34 | Elements listed in the lookup table file. 35 | """ 36 | try: 37 | f = open(lookup_table_file, "r", encoding=rasa.utils.io.DEFAULT_ENCODING) 38 | except OSError: 39 | raise ValueError( 40 | f"Could not load lookup table {lookup_table_file}. " 41 | f"Please make sure you've provided the correct path." 42 | ) 43 | 44 | elements_to_regex = [] 45 | with f: 46 | for line in f: 47 | new_element = line.strip() 48 | if new_element: 49 | elements_to_regex.append(new_element) 50 | return elements_to_regex 51 | 52 | 53 | def _generate_lookup_regex(lookup_table: Dict[Text, Union[Text, List[Text]]]) -> Text: 54 | """Creates a regex pattern from the given lookup table. 55 | 56 | The lookup table is either a file or a list of entries. 57 | 58 | Args: 59 | lookup_table: The lookup table. 60 | 61 | Returns: 62 | The regex pattern. 63 | """ 64 | lookup_elements = lookup_table["elements"] 65 | 66 | # if it's a list, it should be the elements directly 67 | if isinstance(lookup_elements, list): 68 | elements_to_regex = lookup_elements 69 | # otherwise it's a file path. 70 | else: 71 | elements_to_regex = read_lookup_table_file(lookup_elements) 72 | 73 | # sanitize the regex, escape special characters 74 | preprocessor = PreprocessingBase() 75 | elements_sanitized = [ 76 | re.escape(preprocessor.preprocess(e)) 77 | if not e.startswith("regex ") 78 | else e.split("regex ")[1] 79 | for e in elements_to_regex 80 | ] 81 | 82 | # regex matching elements with word boundaries on either side 83 | return "(\\b" + "\\b|\\b".join(elements_sanitized) + "\\b)" 84 | 85 | 86 | def _convert_lookup_tables_to_regex( 87 | training_data: TrainingData, use_only_entities: bool = False 88 | ) -> List[Dict[Text, Text]]: 89 | """Convert the lookup tables from the training data to regex patterns. 90 | Args: 91 | training_data: The training data. 92 | use_only_entities: If True only regex features with a name equal to a entity 93 | are considered. 94 | 95 | Returns: 96 | A list of regex patterns. 97 | """ 98 | patterns = [] 99 | for table in training_data.lookup_tables: 100 | if use_only_entities and table["name"] not in training_data.entities: 101 | continue 102 | regex_pattern = _generate_lookup_regex(table) 103 | # if file is empty 104 | if regex_pattern == r"(\b\b)": 105 | continue 106 | lookup_regex = {"name": table["name"], "pattern": regex_pattern} 107 | patterns.append(lookup_regex) 108 | 109 | return patterns 110 | 111 | 112 | def _collect_regex_features( 113 | training_data: TrainingData, use_only_entities: bool = False 114 | ) -> List[Dict[Text, Text]]: 115 | """Get regex features from training data. 116 | 117 | Args: 118 | training_data: The training data 119 | use_only_entities: If True only regex features with a name equal to a entity 120 | are considered. 121 | 122 | Returns: 123 | Regex features. 
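        Each regex feature is a dict of the form {"name": ..., "pattern": ...},
        exactly as declared in the training data.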
124 | """ 125 | if not use_only_entities: 126 | return training_data.regex_features 127 | 128 | return [ 129 | regex 130 | for regex in training_data.regex_features 131 | if regex["name"] in training_data.entities 132 | ] 133 | 134 | 135 | def extract_patterns( 136 | training_data: TrainingData, 137 | use_lookup_tables: bool = True, 138 | use_regexes: bool = True, 139 | use_only_entities: bool = False, 140 | ) -> List[Dict[Text, Text]]: 141 | """Extract a list of patterns from the training data. 142 | 143 | The patterns are constructed using the regex features and lookup tables defined 144 | in the training data. 145 | 146 | Args: 147 | training_data: The training data. 148 | use_only_entities: If True only lookup tables and regex features with a name 149 | equal to a entity are considered. 150 | use_regexes: Boolean indicating whether to use regex features or not. 151 | use_lookup_tables: Boolean indicating whether to use lookup tables or not. 152 | 153 | Returns: 154 | The list of regex patterns. 155 | """ 156 | if not training_data.lookup_tables and not training_data.regex_features: 157 | return [] 158 | 159 | patterns = [] 160 | 161 | if use_regexes: 162 | patterns.extend(_collect_regex_features(training_data, use_only_entities)) 163 | if use_lookup_tables: 164 | patterns.extend( 165 | _convert_lookup_tables_to_regex(training_data, use_only_entities) 166 | ) 167 | 168 | return patterns 169 | 170 | 171 | class RegexEntityExtractorCustom(EntityExtractor): 172 | """Searches for entities in the user's message using the lookup tables and regexes 173 | defined in the training data.""" 174 | 175 | defaults = { 176 | # text will be processed with case insensitive as default 177 | "case_sensitive": False, 178 | # use lookup tables to extract entities 179 | "use_lookup_tables": True, 180 | # use regexes to extract entities 181 | "use_regexes": True, 182 | } 183 | 184 | def __init__( 185 | self, 186 | component_config: Optional[Dict[Text, Any]] = None, 187 | patterns: Optional[List[Dict[Text, Text]]] = None, 188 | ): 189 | super(RegexEntityExtractorCustom, self).__init__(component_config) 190 | 191 | self.case_sensitive = self.component_config["case_sensitive"] 192 | self.patterns = patterns or [] 193 | 194 | def train( 195 | self, 196 | training_data: TrainingData, 197 | config: Optional[RasaNLUModelConfig] = None, 198 | **kwargs: Any, 199 | ) -> None: 200 | self.patterns = extract_patterns( 201 | training_data, 202 | use_lookup_tables=self.component_config["use_lookup_tables"], 203 | use_regexes=self.component_config["use_regexes"], 204 | use_only_entities=False, 205 | ) 206 | 207 | if not self.patterns: 208 | rasa.utils.common.raise_warning( 209 | "No lookup tables or regexes defined in the training data that have " 210 | "a name equal to any entity in the training data. In order for this " 211 | "component to work you need to define valid lookup tables or regexes " 212 | "in the training data." 
213 | ) 214 | 215 | def process(self, message: Message, **kwargs: Any) -> None: 216 | if not self.patterns: 217 | return 218 | 219 | extracted_entities = self._extract_entities(message) 220 | extracted_entities = self.add_extractor_name(extracted_entities) 221 | 222 | message.set( 223 | ENTITIES, message.get(ENTITIES, []) + extracted_entities, add_to_output=True 224 | ) 225 | 226 | def _extract_entities(self, message: Message) -> List[Dict[Text, Any]]: 227 | """Extract entities of the given type from the given user message.""" 228 | entities = [] 229 | 230 | flags = 0 # default flag 231 | if not self.case_sensitive: 232 | flags = re.IGNORECASE 233 | 234 | for pattern in self.patterns: 235 | matches = re.finditer(pattern["pattern"], message.get(TEXT), flags=flags) 236 | matches = list(matches) 237 | 238 | for match in matches: 239 | start_index = match.start() 240 | end_index = match.end() 241 | entities.append( 242 | { 243 | ENTITY_ATTRIBUTE_TYPE: pattern["name"], 244 | ENTITY_ATTRIBUTE_START: start_index, 245 | ENTITY_ATTRIBUTE_END: end_index, 246 | ENTITY_ATTRIBUTE_VALUE: message.get(TEXT)[ 247 | start_index:end_index 248 | ], 249 | } 250 | ) 251 | 252 | return entities 253 | 254 | @classmethod 255 | def load( 256 | cls, 257 | meta: Dict[Text, Any], 258 | model_dir: Optional[Text] = None, 259 | model_metadata: Optional[Metadata] = None, 260 | cached_component: Optional["RegexEntityExtractor"] = None, 261 | **kwargs: Any, 262 | ) -> "RegexEntityExtractorCustom": 263 | 264 | file_name = meta.get("file") 265 | regex_file = os.path.join(model_dir, file_name) 266 | 267 | if os.path.exists(regex_file): 268 | patterns = rasa.utils.io.read_json_file(regex_file) 269 | return RegexEntityExtractorCustom(meta, patterns=patterns) 270 | 271 | return RegexEntityExtractorCustom(meta) 272 | 273 | def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: 274 | """Persist this model into the passed directory. 
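        The learned patterns are dumped to a JSON file alongside the rest of the
        model files.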
275 | Return the metadata necessary to load the model again.""" 276 | file_name = f"{file_name}.json" 277 | regex_file = os.path.join(model_dir, file_name) 278 | rasa.utils.io.dump_obj_as_json_to_file(regex_file, self.patterns) 279 | 280 | return {"file": file_name} 281 | -------------------------------------------------------------------------------- /bothub/shared/utils/pipeline_components/spacy_nlp.py: -------------------------------------------------------------------------------- 1 | from bothub_nlp_celery.app import nlp_language 2 | from rasa.nlu.config import override_defaults 3 | from rasa.nlu.utils.spacy_utils import SpacyNLP as RasaNLUSpacyNLP 4 | 5 | 6 | class SpacyNLP(RasaNLUSpacyNLP): 7 | @classmethod 8 | def load( 9 | cls, meta, model_dir=None, model_metadata=None, cached_component=None, **kwargs 10 | ): 11 | if cached_component: 12 | return cached_component 13 | 14 | cls.ensure_proper_language_model(nlp_language) 15 | return cls(meta, nlp_language) 16 | 17 | @classmethod 18 | def create(cls, component_config, config): 19 | component_config = override_defaults(cls.defaults, component_config) 20 | 21 | spacy_model_name = component_config.get("model") 22 | 23 | # if no model is specified, we fall back to the language string 24 | if not spacy_model_name: 25 | component_config["model"] = config.language 26 | 27 | cls.ensure_proper_language_model(nlp_language) 28 | return cls(component_config, nlp_language) 29 | -------------------------------------------------------------------------------- /bothub/shared/utils/poke_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import contextvars 3 | import io 4 | 5 | 6 | class PokeLoggingHandler(logging.StreamHandler): 7 | def __init__(self, pl, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | self.pl = pl 10 | 11 | def emit(self, record): 12 | if self.pl.cxt.get(default=None) is self.pl: 13 | super().emit(record) 14 | 15 | 16 | class PokeLogging: 17 | def __init__(self, loggingLevel=logging.DEBUG): 18 | self.loggingLevel = loggingLevel 19 | 20 | def __enter__(self): 21 | self.cxt = contextvars.ContextVar(self.__class__.__name__) 22 | self.cxt.set(self) 23 | logging.captureWarnings(True) 24 | self.logger = logging.getLogger() 25 | self.logger.setLevel(self.loggingLevel) 26 | self.stream = io.StringIO() 27 | self.handler = PokeLoggingHandler(self, self.stream) 28 | self.formatter = logging.Formatter( 29 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 30 | ) 31 | self.handler.setLevel(self.loggingLevel) 32 | self.handler.setFormatter(self.formatter) 33 | self.logger.addHandler(self.handler) 34 | return self.stream 35 | 36 | def __exit__(self, *args): 37 | self.logger.removeHandler(self.logger) 38 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/utils/preprocessing/__init__.py -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from unidecode import unidecode 3 | import emoji 4 | import re 5 | from rasa.nlu.training_data import Message 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class 
PreprocessingBase(object): 11 | emoji_contractions = {} 12 | apostrophes = ["'", "`", "’"] 13 | 14 | def __init__(self, remove_accent=True): 15 | self.remove_accent = remove_accent 16 | 17 | def preprocess_text(self, phrase: str) -> str: 18 | phrase = self.emoji_handling(phrase) 19 | phrase, _ = self.default_preprocessing(phrase) 20 | return phrase 21 | 22 | def preprocess(self, example: Message) -> Message: 23 | phrase = example.text 24 | entities = example.data.get('entities') 25 | 26 | phrase = self.emoji_handling(phrase) 27 | phrase, entities = self.default_preprocessing(phrase, entities) 28 | 29 | example.text = phrase 30 | if entities: 31 | example.data['entities'] = entities 32 | 33 | return example 34 | 35 | def _handle_entities(self, phrase, entities): 36 | # Remove apostrophe from the phrase (important to do before s_regex regex) 37 | positions = [] # mark removal positions 38 | for i, char in enumerate(phrase): 39 | if char in self.apostrophes: 40 | positions.append(i) 41 | 42 | for pos in positions: 43 | # check if before or in entity 44 | for entity in entities: 45 | if pos < entity.get('end'): 46 | entity['end'] -= 1 47 | if pos < entity.get('start'): 48 | entity['start'] -= 1 49 | 50 | for entity in entities: 51 | for apostrophe in self.apostrophes: 52 | entity['value'] = entity['value'].replace(apostrophe, "") 53 | 54 | return entities 55 | 56 | def default_preprocessing(self, phrase: str = None, entities=None): 57 | 58 | if phrase is None: 59 | raise ValueError 60 | 61 | if entities: 62 | entities = self._handle_entities(phrase, entities) 63 | 64 | for apostrophe in self.apostrophes: 65 | phrase = phrase.replace(apostrophe, "") 66 | 67 | # lowercasing characters 68 | phrase = phrase.lower() 69 | if entities: 70 | for entity in entities: 71 | entity['value'] = entity['value'].lower() 72 | 73 | if self.remove_accent: 74 | phrase = unidecode(phrase) 75 | if entities: 76 | for entity in entities: 77 | entity['value'] = unidecode(entity['value']) 78 | 79 | return phrase, entities 80 | 81 | @staticmethod 82 | def extract_emoji_text(code): 83 | """ 84 | :param code: is a emoji_code string ex: :smile_face: 85 | :return: "smile face" 86 | """ 87 | if code is None or code[0] != ':' or code[-1] != ':': 88 | raise ValueError 89 | 90 | code = code[1:len(code) - 1] 91 | text = ' '.join(code.split('_')) 92 | return text 93 | 94 | def emoji_handling(self, phrase: str = None): 95 | # turn emojis into text codes 96 | phrase = emoji.demojize(phrase) 97 | 98 | regex_emoji = r":[A-Za-z0-9\-_]+:" 99 | emoji_codes = re.findall(regex_emoji, phrase) 100 | for code in emoji_codes: 101 | try: 102 | phrase = re.sub(code, self.emoji_contractions[code], phrase) 103 | except KeyError: 104 | phrase = re.sub(code, self.extract_emoji_text(code), phrase) 105 | 106 | return phrase 107 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_english.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 3 | 4 | 5 | class PreprocessingEnglish(PreprocessingBase): 6 | emoji_contractions = { 7 | ":face_with_tears_of_joy:": "hahaha", # 😂 8 | ":red_heart_selector:": "love", # ❤️ 9 | ":smiling_face_with_heart-eyes:": "loved it", # 😍 10 | ":rolling_on_the_floor_laughing:": "hahaha", # 🤣 11 | ":smiling_face_with_smiling_eyes:": "happy", # 😊 12 | ":folded_hands:": "amen", # 🙏 13 | ":two_hearts:": "affection", # 💕 14 
| ":loudly_crying_face:": "sad", # 😭 15 | ":face_blowing_a_kiss:": "kiss", # 😘 16 | ":thumbs_up:": "ok", # 👍 17 | ":grinning_face_with_sweat:": "hehehe", # 😅 18 | ":clapping_hands:": "congratulations", # 👏 19 | ":beaming_face_with_smiling_eyes:": "happy", # 😁 20 | ":heart_suit_selector:": "love", # ♥️ 21 | ":fire:": "hot", # 🔥 22 | ":broken_heart:": "hurt", # 💔 23 | ":sparkling_heart:": "affection", # 💖 24 | ":blue_heart:": "friendship", # 💙 25 | ":crying_face:": "sad", # 😢 26 | ":thinking_face:": "thinking", # 🤔 27 | ":grinning_squinting_face:": "laughs", # 😆 28 | ":face_with_rolling_eyes:": "doubt", # 🙄 29 | ":flexed_biceps:": "strong", # 💪 30 | ":winking_face:": "wink", # 😉 31 | ":smiling_face_selector:": "happy", # ☺️ 32 | ":OK_hand:": "ok", # 👌 33 | ":hugging_face:": "hug", # 🤗 34 | ":purple_heart:": "love", # 💜 35 | ":pensive_face:": "sad", # 😔 36 | ":smiling_face_with_sunglasses:": "proud", # 😎 37 | ":smiling_face_with_halo:": "saint", # 😇 38 | ":rose:": "rose", # 🌹 39 | ":person_facepalming:": "facepalm", # 🤦 40 | ":party_popper:": "party", # 🎉 41 | ":double_exclamation_mark_selector:": "exclamation", # ‼️ 42 | ":revolving_hearts:": "affection", # 💞 43 | ":victory_hand_selector:": "vitory", # ✌️ 44 | ":sparkles:": "sparkles", # ✨ 45 | ":person_shrugging:": "indiferent", # 🤷 46 | ":face_screaming_in_fear:": "fear", # 😱 47 | ":relieved_face:": "relieved", # 😌 48 | ":cherry_blossom:": "cherry blossom", # 🌸 49 | ":raising_hands:": "glad", # 🙌 50 | ":face_savoring_food:": "face_savoring_food", # 😋 51 | ":growing_heart:": "heart", # 💗 52 | ":green_heart:": "friendship", # 💚 53 | ":smirking_face:": "smirk", # 😏 54 | ":yellow_heart:": "friendship", # 💛 55 | ":slightly_smiling_face:": "smile", # 🙂 56 | ":beating_heart:": "love", # 💓 57 | ":star-struck:": "fabulous", # 🤩 58 | ":grinning_face_with_smiling_eyes:": "happy", # 😄 59 | ":grinning_face:": "happy", # 😀 60 | ":grinning_face_with_big_eyes:": "happy", # 😃 61 | ":hundred_points:": "hundred points", # 💯 62 | ":see-no-evil_monkey:": "joke", # 🙈 63 | ":backhand_index_pointing_down:": "point down", # 👇 64 | ":musical_notes:": "music", # 🎶 65 | ":unamused_face:": "unamused", # 😒 66 | ":face_with_hand_over_mouth:": "laughs", # 🤭 67 | ":heart_exclamation:": "heart", # ❣️ 68 | ":exclamation_mark:": "!", # ❗ 69 | ":winking_face_with_tongue:": "wink", # 😜 70 | ":kiss_mark:": "kiss", # 💋 71 | ":eyes:": "curious", # 👀 72 | ":sleepy_face:": "sleepy", # 😪 73 | ":expressionless_face:": "indiferent", # 😑 74 | ":collision:": "hit", # 💥 75 | ":person_raising_hand:": "raise hand", # 🙋 76 | ":disappointed_face:": "disappointed", # 😞 77 | ":weary_face:": "weary", # 😩 78 | ":pouting_face:": "furious", # 😡 79 | ":zany_face:": "zany", # 🤪 80 | ":oncoming_fist:": "oncoming fist", # 👊 81 | ":sun_selector:": "sun", # ☀️ 82 | ":sad_but_relieved_face:": "sad", # 😥 83 | ":drooling_face:": "drooling", # 🤤 84 | ":backhand_index_pointing_right:": "point right", # 👉 85 | ":woman_dancing:": "dancing", # 💃 86 | ":flushed_face:": "flushed", # 😳 87 | ":raised_hand:": "raised hand", # ✋ 88 | ":kissing_face_with_closed_eyes:": "kiss", # 😚 89 | ":squinting_face_with_tongue:": "joke", # 😝 90 | ":sleeping_face:": "sleepy", # 😴 91 | ":glowing_star:": "glow", # 🌟 92 | ":grimacing_face:": "grimacing", # 😬 93 | ":upside-down_face:": "playful", # 🙃 94 | ":four_leaf_clover:": "clover", # 🍀 95 | ":tulip:": "tulip", # 🌷 96 | ":smiling_cat_face_with_heart-eyes:": "love", # 😻 97 | ":downcast_face_with_sweat:": "disappointed", # 😓 98 | ":white_medium_star:": "star", # ⭐ 99 | 
":white_heavy_check_mark:": "check mark", # ✅ 100 | ":rainbow:": "rainbow", # 🌈 101 | ":smiling_face_with_horns:": "evil", # 😈 102 | ":sign_of_the_horns:": "metal", # 🤘 103 | ":sweat_droplets:": "droplets", # 💦 104 | ":check_mark:": "check mark", # ✔️ 105 | ":persevering_face:": "persevering", # 😣 106 | ":person_running:": "running", # 🏃 107 | ":bouquet:": "bouquet", # 💐 108 | ":frowning_face_selector:": "frowning", # ☹️ 109 | ":confetti_ball:": "confetti", # 🎊 110 | ":heart_with_arrow:": "love", # 💘 111 | ":angry_face:": "angry", # 😠 112 | ":index_pointing_up_selector:": "point up", # ☝️ 113 | ":confused_face:": "confused", # 😕 114 | ":hibiscus:": "hibiscus", # 🌺 115 | ":birthday_cake:": "birthday", # 🎂 116 | ":sunflower:": "sunflower", # 🌻 117 | ":neutral_face:": "indiferent", # 😐 118 | ":middle_finger:": "angry", # 🖕 119 | ":heart_with_ribbon:": "heart", # 💝 120 | ":speak-no-evil_monkey:": "secret", # 🙊 121 | ":cat_face_with_tears_of_joy:": "hahaha", # 😹 122 | ":speaking_head_selector:": "talk", # 🗣️ 123 | ":dizzy:": "dizzy", # 💫 124 | ":skull:": "skull", # 💀 125 | ":crown:": "crown", # 👑 126 | ":musical_note:": "music", # 🎵 127 | ":crossed_fingers:": "wishful", # 🤞 128 | ":face_with_tongue:": "joke", # 😛 129 | ":red_circle:": "red circle", # 🔴 130 | ":face_with_steam_from_nose:": "angry", # 😤 131 | ":blossom:": "blossom", # 🌼 132 | ":tired_face:": "tired", # 😫 133 | ":soccer_ball:": "ball", # ⚽ 134 | ":call_me_hand:": "cool", # 🤙 135 | ":hot_beverage:": "hot beverage", # ☕ 136 | ":trophy:": "winner", # 🏆 137 | ":orange_heart:": "heart", # 🧡 138 | ":wrapped_gift:": "gift", # 🎁 139 | ":high_voltage:": "high voltage", # ⚡ 140 | ":sun_with_face:": "sun", # 🌞 141 | ":balloon:": "balloon", # 🎈 142 | ":cross_mark:": "wrong", # ❌ 143 | ":raised_fist:": "fist", # ✊ 144 | ":waving_hand:": "goodbye", # 👋 145 | ":astonished_face:": "astonished", # 😲 146 | ":herb:": "herb", # 🌿 147 | ":shushing_face:": "shush", # 🤫 148 | ":backhand_index_pointing_left:": "point left", # 👈 149 | ":face_with_open_mouth:": "astonished", # 😮 150 | ":person_gesturing_OK:": "ok", # 🙆 151 | ":clinking_beer_mugs:": "toast", # 🍻 152 | ":dog_face:": "dog", # 🐶 153 | ":anxious_face_with_sweat:": "anxious", # 😰 154 | ":face_with_raised_eyebrow:": "doubt", # 🤨 155 | ":face_without_mouth:": "speachless", # 😶 156 | ":handshake:": "deal", # 🤝 157 | ":person_walking:": "walk", # 🚶 158 | ":money_bag:": "money", # 💰 159 | ":strawberry:": "strawberry", # 🍓 160 | ":anger_symbol:": "hit", # 💢 161 | } 162 | 163 | def __init__(self, remove_accent=True): 164 | super(PreprocessingEnglish, self).__init__(remove_accent=remove_accent) 165 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from bothub.shared.utils.preprocessing.preprocessing_english import PreprocessingEnglish 3 | from bothub.shared.utils.preprocessing.preprocessing_portuguese import PreprocessingPortuguese 4 | from bothub.shared.utils.preprocessing.preprocessing_spanish import PreprocessingSpanish 5 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class PreprocessingFactory(object): 11 | 12 | def __init__(self, language=None, remove_accent=True): 13 | self.language = language 14 | self.remove_accent = remove_accent 15 | 16 | def factory(self): 17 | """ 18 | Implements Factory Method 19 | 
:return: Preprocessing Class respective to its language 20 | """ 21 | try: 22 | if self.language == "en": 23 | return PreprocessingEnglish(self.remove_accent) 24 | elif self.language == "pt_br": 25 | return PreprocessingPortuguese(self.remove_accent) 26 | elif self.language == "es": 27 | return PreprocessingSpanish(self.remove_accent) 28 | else: 29 | return PreprocessingBase(self.remove_accent) 30 | 31 | except AssertionError as e: 32 | logger.exception(e) 33 | 34 | return None 35 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_portuguese.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 3 | 4 | 5 | class PreprocessingPortuguese(PreprocessingBase): 6 | emoji_contractions = { 7 | ":face_with_tears_of_joy:": "hahaha", # 😂 8 | ":red_heart_selector:": "amor", # ❤️ 9 | ":smiling_face_with_heart-eyes:": "amei", # 😍 10 | ":rolling_on_the_floor_laughing:": "hahaha", # 🤣 11 | ":smiling_face_with_smiling_eyes:": "feliz", # 😊 12 | ":folded_hands:": "amem", # 🙏 13 | ":two_hearts:": "carinho", # 💕 14 | ":loudly_crying_face:": "triste", # 😭 15 | ":face_blowing_a_kiss:": "beijo", # 😘 16 | ":thumbs_up:": "ok", # 👍 17 | ":grinning_face_with_sweat:": "hehehe", # 😅 18 | ":clapping_hands:": "parabens", # 👏 19 | ":beaming_face_with_smiling_eyes:": "feliz", # 😁 20 | ":heart_suit_selector:": "amor", # ♥️ 21 | ":fire:": "quente", # 🔥 22 | ":broken_heart:": "magoado", # 💔 23 | ":sparkling_heart:": "carinho", # 💖 24 | ":blue_heart:": "amigo", # 💙 25 | ":crying_face:": "triste", # 😢 26 | ":thinking_face:": "pensar", # 🤔 27 | ":grinning_squinting_face:": "risos", # 😆 28 | ":face_with_rolling_eyes:": "duvida", # 🙄 29 | ":flexed_biceps:": "forte", # 💪 30 | ":winking_face:": "piscar", # 😉 31 | ":smiling_face_selector:": "feliz", # ☺️ 32 | ":OK_hand:": "ok", # 👌 33 | ":hugging_face:": "abraco", # 🤗 34 | ":purple_heart:": "amor", # 💜 35 | ":pensive_face:": "triste", # 😔 36 | ":smiling_face_with_sunglasses:": "orgulhoso", # 😎 37 | ":smiling_face_with_halo:": "santo", # 😇 38 | ":rose:": "rosa", # 🌹 39 | ":person_facepalming:": "inacreditavel", # 🤦 40 | ":party_popper:": "festa", # 🎉 41 | ":double_exclamation_mark_selector:": "urgente", # ‼️ 42 | ":revolving_hearts:": "carinho", # 💞 43 | ":victory_hand_selector:": "vitoria", # ✌️ 44 | ":sparkles:": "brilho", # ✨ 45 | ":person_shrugging:": "indiferenca", # 🤷 46 | ":face_screaming_in_fear:": "medo", # 😱 47 | ":relieved_face:": "alivio", # 😌 48 | ":cherry_blossom:": "rosa", # 🌸 49 | ":raising_hands:": "ainda bem", # 🙌 50 | ":face_savoring_food:": "brincadeira", # 😋 51 | ":growing_heart:": "amizade", # 💗 52 | ":green_heart:": "amizade", # 💚 53 | ":smirking_face:": "flertar", # 😏 54 | ":yellow_heart:": "amizade", # 💛 55 | ":slightly_smiling_face:": "feliz", # 🙂 56 | ":beating_heart:": "amor", # 💓 57 | ":star-struck:": "fabuloso", # 🤩 58 | ":grinning_face_with_smiling_eyes:": "sorriso", # 😄 59 | ":grinning_face:": "sorriso", # 😀 60 | ":grinning_face_with_big_eyes:": "feliz", # 😃 61 | ":hundred_points:": "pontuacao maxima", # 💯 62 | ":see-no-evil_monkey:": "brincadeira", # 🙈 63 | ":backhand_index_pointing_down:": "apontar", # 👇 64 | ":musical_notes:": "musica", # 🎶 65 | ":unamused_face:": "chateado", # 😒 66 | ":face_with_hand_over_mouth:": "risada", # 🤭 67 | ":heart_exclamation:": "coracao", # ❣️ 68 | ":exclamation_mark:": "importante", # ❗ 69 | 
":winking_face_with_tongue:": "brincalhao", # 😜 70 | ":kiss_mark:": "beijo", # 💋 71 | ":eyes:": "curiosidade", # 👀 72 | ":sleepy_face:": "sono", # 😪 73 | ":expressionless_face:": "indiferente", # 😑 74 | ":collision:": "batida", # 💥 75 | ":person_raising_hand:": "atencao", # 🙋 76 | ":disappointed_face:": "desapontado", # 😞 77 | ":weary_face:": "cansado", # 😩 78 | ":pouting_face:": "furioso", # 😡 79 | ":zany_face:": "brincadeira", # 🤪 80 | ":oncoming_fist:": "firme", # 👊 81 | ":sun_selector:": "sol", # ☀️ 82 | ":sad_but_relieved_face:": "triste", # 😥 83 | ":drooling_face:": "desejo", # 🤤 84 | ":backhand_index_pointing_right:": "apontar", # 👉 85 | ":woman_dancing:": "danca", # 💃 86 | ":flushed_face:": "envergonhado", # 😳 87 | ":raised_hand:": "atencao", # ✋ 88 | ":kissing_face_with_closed_eyes:": "beijo", # 😚 89 | ":squinting_face_with_tongue:": "brincadeira", # 😝 90 | ":sleeping_face:": "sono", # 😴 91 | ":glowing_star:": "estrela", # 🌟 92 | ":grimacing_face:": "sem graca", # 😬 93 | ":upside-down_face:": "brincalhao", # 🙃 94 | ":four_leaf_clover:": "trevo", # 🍀 95 | ":tulip:": "tulipa", # 🌷 96 | ":smiling_cat_face_with_heart-eyes:": "apaixonado", # 😻 97 | ":downcast_face_with_sweat:": "desapontado", # 😓 98 | ":white_medium_star:": "estrela", # ⭐ 99 | ":white_heavy_check_mark:": "concluido", # ✅ 100 | ":rainbow:": "arco-iris", # 🌈 101 | ":smiling_face_with_horns:": "malvado", # 😈 102 | ":sign_of_the_horns:": "metal", # 🤘 103 | ":sweat_droplets:": "respingo", # 💦 104 | ":check_mark:": "concluido", # ✔️ 105 | ":persevering_face:": "exausto", # 😣 106 | ":person_running:": "corrida", # 🏃 107 | ":bouquet:": "flores", # 💐 108 | ":frowning_face_selector:": "triste", # ☹️ 109 | ":confetti_ball:": "festa", # 🎊 110 | ":heart_with_arrow:": "apaixonado", # 💘 111 | ":angry_face:": "furioso", # 😠 112 | ":index_pointing_up_selector:": "atencao", # ☝️ 113 | ":confused_face:": "confuso", # 😕 114 | ":hibiscus:": "flor", # 🌺 115 | ":birthday_cake:": "aniversario", # 🎂 116 | ":sunflower:": "girassol", # 🌻 117 | ":neutral_face:": "indiferente", # 😐 118 | ":middle_finger:": "raiva", # 🖕 119 | ":heart_with_ribbon:": "presente coracao", # 💝 120 | ":speak-no-evil_monkey:": "segredo", # 🙊 121 | ":cat_face_with_tears_of_joy:": "hahaha", # 😹 122 | ":speaking_head_selector:": "falar", # 🗣️ 123 | ":dizzy:": "tontura", # 💫 124 | ":skull:": "caveira", # 💀 125 | ":crown:": "coroa", # 👑 126 | ":musical_note:": "musica", # 🎵 127 | ":crossed_fingers:": "ansioso", # 🤞 128 | ":face_with_tongue:": "pegadinha", # 😛 129 | ":red_circle:": "circulo vermelho", # 🔴 130 | ":face_with_steam_from_nose:": "bravo", # 😤 131 | ":blossom:": "flor", # 🌼 132 | ":tired_face:": "cansado", # 😫 133 | ":soccer_ball:": "bola", # ⚽ 134 | ":call_me_hand:": "maneiro", # 🤙 135 | ":hot_beverage:": "bebida quente", # ☕ 136 | ":trophy:": "vencedor", # 🏆 137 | ":orange_heart:": "amizade", # 🧡 138 | ":wrapped_gift:": "presente", # 🎁 139 | ":high_voltage:": "eletricidade", # ⚡ 140 | ":sun_with_face:": "sol", # 🌞 141 | ":balloon:": "balao", # 🎈 142 | ":cross_mark:": "negacao", # ❌ 143 | ":raised_fist:": "punho", # ✊ 144 | ":waving_hand:": "adeus", # 👋 145 | ":astonished_face:": "perplexo", # 😲 146 | ":herb:": "planta", # 🌿 147 | ":shushing_face:": "segredo", # 🤫 148 | ":backhand_index_pointing_left:": "apontar", # 👈 149 | ":face_with_open_mouth:": "perplexo", # 😮 150 | ":person_gesturing_OK:": "ok", # 🙆 151 | ":clinking_beer_mugs:": "brinde", # 🍻 152 | ":dog_face:": "cachorro", # 🐶 153 | ":anxious_face_with_sweat:": "ansiedade", # 😰 154 | 
":face_with_raised_eyebrow:": "duvida", # 🤨 155 | ":face_without_mouth:": "mudo", # 😶 156 | ":handshake:": "acordo", # 🤝 157 | ":person_walking:": "caminhar", # 🚶 158 | ":money_bag:": "dinheiro", # 💰 159 | ":strawberry:": "morango", # 🍓 160 | ":anger_symbol:": "batida", # 💢 161 | } 162 | 163 | def __init__(self, remove_accent=True): 164 | super(PreprocessingPortuguese, self).__init__(remove_accent=remove_accent) 165 | -------------------------------------------------------------------------------- /bothub/shared/utils/preprocessing/preprocessing_spanish.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 3 | 4 | 5 | class PreprocessingSpanish(PreprocessingBase): 6 | emoji_contractions = { 7 | ":face_with_tears_of_joy:": "hahaha", # 😂 8 | ":red_heart_selector:": "amor", # ❤️ 9 | ":smiling_face_with_heart-eyes:": "me gusto", # 😍 10 | ":rolling_on_the_floor_laughing:": "hahaha", # 🤣 11 | ":smiling_face_with_smiling_eyes:": "felíz", # 😊 12 | ":folded_hands:": "amén", # 🙏 13 | ":two_hearts:": "afecto", # 💕 14 | ":loudly_crying_face:": "triste", # 😭 15 | ":face_blowing_a_kiss:": "beso", # 😘 16 | ":thumbs_up:": "ok", # 👍 17 | ":grinning_face_with_sweat:": "hehehe", # 😅 18 | ":clapping_hands:": "parabens", # 👏 19 | ":beaming_face_with_smiling_eyes:": "muy felíz", # 😁 20 | ":heart_suit_selector:": "amor", # ♥️ 21 | ":fire:": "caliente", # 🔥 22 | ":broken_heart:": "lastimar", # 💔 23 | ":sparkling_heart:": "afecto", # 💖 24 | ":blue_heart:": "amigo", # 💙 25 | ":crying_face:": "triste", # 😢 26 | ":thinking_face:": "pensar", # 🤔 27 | ":grinning_squinting_face:": "se rié", # 😆 28 | ":face_with_rolling_eyes:": "duda", # 🙄 29 | ":flexed_biceps:": "fuerte", # 💪 30 | ":winking_face:": "parpadear", # 😉 31 | ":smiling_face_selector:": "felíz", # ☺️ 32 | ":OK_hand:": "ok", # 👌 33 | ":hugging_face:": "abrazo", # 🤗 34 | ":purple_heart:": "amor", # 💜 35 | ":pensive_face:": "triste", # 😔 36 | ":smiling_face_with_sunglasses:": "orgulloso", # 😎 37 | ":smiling_face_with_halo:": "santo", # 😇 38 | ":rose:": "rosa", # 🌹 39 | ":person_facepalming:": "increíble", # 🤦 40 | ":party_popper:": "fiesta", # 🎉 41 | ":double_exclamation_mark_selector:": "urgente", # ‼️ 42 | ":revolving_hearts:": "afecto", # 💞 43 | ":victory_hand_selector:": "victoria", # ✌️ 44 | ":sparkles:": "brillo", # ✨ 45 | ":person_shrugging:": "indiferencia", # 🤷 46 | ":face_screaming_in_fear:": "miedo", # 😱 47 | ":relieved_face:": "alivio", # 😌 48 | ":cherry_blossom:": "rosa", # 🌸 49 | ":raising_hands:": "menos mal", # 🙌 50 | ":face_savoring_food:": "es una broma", # 😋 51 | ":growing_heart:": "amistad", # 💗 52 | ":green_heart:": "amistad", # 💚 53 | ":smirking_face:": "flirtear", # 😏 54 | ":yellow_heart:": "amistad", # 💛 55 | ":slightly_smiling_face:": "feliz", # 🙂 56 | ":beating_heart:": "amor", # 💓 57 | ":star-struck:": "fabuloso", # 🤩 58 | ":grinning_face_with_smiling_eyes:": "sonreír", # 😄 59 | ":grinning_face:": "sonreír", # 😀 60 | ":grinning_face_with_big_eyes:": "feliz", # 😃 61 | ":hundred_points:": "puntuación máxima", # 💯 62 | ":see-no-evil_monkey:": "es una broma", # 🙈 63 | ":backhand_index_pointing_down:": "apuntar", # 👇 64 | ":musical_notes:": "musica", # 🎶 65 | ":unamused_face:": "disgustado", # 😒 66 | ":face_with_hand_over_mouth:": "la risa", # 🤭 67 | ":heart_exclamation:": "corazon", # ❣️ 68 | ":exclamation_mark:": "importante", # ❗ 69 | ":winking_face_with_tongue:": "juguetón", # 😜 70 | ":kiss_mark:": "beso", # 
💋 71 | ":eyes:": "curiosidad", # 👀 72 | ":sleepy_face:": "sueno", # 😪 73 | ":expressionless_face:": "indiferente", # 😑 74 | ":collision:": "batida", # 💥 75 | ":person_raising_hand:": "atencion", # 🙋 76 | ":disappointed_face:": "decepcionado", # 😞 77 | ":weary_face:": "cansado", # 😩 78 | ":pouting_face:": "furioso", # 😡 79 | ":zany_face:": "es una broma", # 🤪 80 | ":oncoming_fist:": "golpeo", # 👊 81 | ":sun_selector:": "sol", # ☀️ 82 | ":sad_but_relieved_face:": "triste", # 😥 83 | ":drooling_face:": "deseo", # 🤤 84 | ":backhand_index_pointing_right:": "apuntar", # 👉 85 | ":woman_dancing:": "baile", # 💃 86 | ":flushed_face:": "avergonzado", # 😳 87 | ":raised_hand:": "atencion", # ✋ 88 | ":kissing_face_with_closed_eyes:": "beso", # 😚 89 | ":squinting_face_with_tongue:": "es una broma", # 😝 90 | ":sleeping_face:": "sueno", # 😴 91 | ":glowing_star:": "estrella", # 🌟 92 | ":grimacing_face:": "desangelado", # 😬 93 | ":upside-down_face:": "bromista", # 🙃 94 | ":four_leaf_clover:": "trébol", # 🍀 95 | ":tulip:": "tulipan", # 🌷 96 | ":smiling_cat_face_with_heart-eyes:": "enamorado", # 😻 97 | ":downcast_face_with_sweat:": "decepcionado", # 😓 98 | ":white_medium_star:": "estrella", # ⭐ 99 | ":white_heavy_check_mark:": "terminado", # ✅ 100 | ":rainbow:": "arcoiris", # 🌈 101 | ":smiling_face_with_horns:": "malvado", # 😈 102 | ":sign_of_the_horns:": "metal", # 🤘 103 | ":sweat_droplets:": "churrete", # 💦 104 | ":check_mark:": "terminado", # ✔️ 105 | ":persevering_face:": "exhausto ", # 😣 106 | ":person_running:": "carrera", # 🏃 107 | ":bouquet:": "flores", # 💐 108 | ":frowning_face_selector:": "triste", # ☹️ 109 | ":confetti_ball:": "fiesta", # 🎊 110 | ":heart_with_arrow:": "enamorado", # 💘 111 | ":angry_face:": "enfurecido", # 😠 112 | ":index_pointing_up_selector:": "atencion", # ☝️ 113 | ":confused_face:": "lioso", # 😕 114 | ":hibiscus:": "flor", # 🌺 115 | ":birthday_cake:": "cumpleanos", # 🎂 116 | ":sunflower:": "girasol", # 🌻 117 | ":neutral_face:": "indiferente", # 😐 118 | ":middle_finger:": "rabia", # 🖕 119 | ":heart_with_ribbon:": "regalo corazon", # 💝 120 | ":speak-no-evil_monkey:": "secreto", # 🙊 121 | ":cat_face_with_tears_of_joy:": "hahaha", # 😹 122 | ":speaking_head_selector:": "hablar", # 🗣️ 123 | ":dizzy:": "mareo", # 💫 124 | ":skull:": "calavera", # 💀 125 | ":crown:": "corona", # 👑 126 | ":musical_note:": "musica", # 🎵 127 | ":crossed_fingers:": "ansioso", # 🤞 128 | ":face_with_tongue:": "es una broma", # 😛 129 | ":red_circle:": "circulo rojo", # 🔴 130 | ":face_with_steam_from_nose:": "bravo", # 😤 131 | ":blossom:": "flor", # 🌼 132 | ":tired_face:": "cansado", # 😫 133 | ":soccer_ball:": "pelota", # ⚽ 134 | ":call_me_hand:": "chachi", # 🤙 135 | ":hot_beverage:": "bebida caliente", # ☕ 136 | ":trophy:": "vencedor", # 🏆 137 | ":orange_heart:": "amistad", # 🧡 138 | ":wrapped_gift:": "regalo", # 🎁 139 | ":high_voltage:": "electricidad", # ⚡ 140 | ":sun_with_face:": "sol", # 🌞 141 | ":balloon:": "globo", # 🎈 142 | ":cross_mark:": "negacion", # ❌ 143 | ":raised_fist:": "puno", # ✊ 144 | ":waving_hand:": "adiós", # 👋 145 | ":astonished_face:": "perplejo", # 😲 146 | ":herb:": "planta", # 🌿 147 | ":shushing_face:": "secreto", # 🤫 148 | ":backhand_index_pointing_left:": "apuntar", # 👈 149 | ":face_with_open_mouth:": "perplejo", # 😮 150 | ":person_gesturing_OK:": "ok", # 🙆 151 | ":clinking_beer_mugs:": "brindis", # 🍻 152 | ":dog_face:": "perro", # 🐶 153 | ":anxious_face_with_sweat:": "ansiedad", # 😰 154 | ":face_with_raised_eyebrow:": "duda", # 🤨 155 | ":face_without_mouth:": "mudo", # 😶 156 | 
":handshake:": "acuerdo", # 🤝 157 | ":person_walking:": "caminar", # 🚶 158 | ":money_bag:": "dinero", # 💰 159 | ":strawberry:": "fresa", # 🍓 160 | ":anger_symbol:": "batida", # 💢 161 | } 162 | 163 | def __init__(self, remove_accent=True): 164 | super(PreprocessingSpanish, self).__init__(remove_accent=remove_accent) 165 | -------------------------------------------------------------------------------- /bothub/shared/utils/rasa_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/bothub/shared/utils/rasa_components/__init__.py -------------------------------------------------------------------------------- /bothub/shared/utils/rasa_components/bothub_interpreter.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from rasa.nlu.model import Metadata, Interpreter 3 | from rasa.nlu.components import Component, ComponentBuilder 4 | from rasa.nlu import components 5 | from rasa.nlu.training_data import Message 6 | from typing import Any, Dict, List, Text, Optional 7 | 8 | 9 | class BothubInterpreter(Interpreter): 10 | """Use a trained pipeline of components to parse text messages.""" 11 | 12 | def __init__( 13 | self, 14 | pipeline: List[Component], 15 | context: Optional[Dict[Text, Any]], 16 | model_metadata: Optional[Metadata] = None, 17 | ) -> None: 18 | 19 | super().__init__(pipeline, context, model_metadata) 20 | 21 | @staticmethod 22 | def load( 23 | model_dir: Text, 24 | component_builder: Optional[ComponentBuilder] = None, 25 | skip_validation: bool = False, 26 | ) -> "BothubInterpreter": 27 | """Create an interpreter based on a persisted model. 28 | 29 | Args: 30 | skip_validation: If set to `True`, tries to check that all 31 | required packages for the components are installed 32 | before loading them. 33 | model_dir: The path of the model to load 34 | component_builder: The 35 | :class:`rasa.nlu.components.ComponentBuilder` to use. 36 | 37 | Returns: 38 | An interpreter that uses the loaded model. 39 | """ 40 | 41 | model_metadata = Metadata.load(model_dir) 42 | 43 | # Adapt Loader to accept new component-name (changed) with older models 44 | metadata = model_metadata.__dict__["metadata"] 45 | for i in range(len(metadata["pipeline"])): 46 | component_name = metadata["pipeline"][i]["class"] 47 | if "bothub_nlp_rasa_utils" in component_name: 48 | metadata["pipeline"][i]["class"] = component_name.replace( 49 | "bothub_nlp_rasa_utils", "bothub.shared.utils", 1 50 | ) 51 | 52 | model_metadata = Metadata(metadata, model_dir) 53 | 54 | BothubInterpreter.ensure_model_compatibility(model_metadata) 55 | return BothubInterpreter.create(model_metadata, component_builder, skip_validation) 56 | 57 | @staticmethod 58 | def create( 59 | model_metadata: Metadata, 60 | component_builder: Optional[ComponentBuilder] = None, 61 | skip_validation: bool = False, 62 | ) -> "BothubInterpreter": 63 | """Load stored model and components defined by the provided metadata.""" 64 | 65 | context = {} 66 | 67 | if component_builder is None: 68 | # If no builder is passed, every interpreter creation will result 69 | # in a new builder. hence, no components are reused. 
70 | component_builder = components.ComponentBuilder() 71 | 72 | pipeline = [] 73 | 74 | # Before instantiating the component classes, 75 | # lets check if all required packages are available 76 | if not skip_validation: 77 | components.validate_requirements(model_metadata.component_classes) 78 | 79 | for i in range(model_metadata.number_of_components): 80 | component_meta = model_metadata.for_component(i) 81 | component = component_builder.load_component( 82 | component_meta, model_metadata.model_dir, model_metadata, **context 83 | ) 84 | try: 85 | updates = component.provide_context() 86 | if updates: 87 | context.update(updates) 88 | pipeline.append(component) 89 | except components.MissingArgumentError as e: 90 | raise Exception( 91 | "Failed to initialize component '{}'. " 92 | "{}".format(component.name, e) 93 | ) 94 | 95 | return BothubInterpreter(pipeline, context, model_metadata) 96 | 97 | def parse( 98 | self, 99 | text: Text, 100 | time: Optional[datetime.datetime] = None, 101 | only_output_properties: bool = True, 102 | ) -> Dict[Text, Any]: 103 | """Parse the input text, classify it and return pipeline result. 104 | The pipeline result usually contains intent and entities.""" 105 | 106 | if not text.replace(" ", ""): 107 | # Not all components are able to handle empty strings. So we need 108 | # to prevent that... This default return will not contain all 109 | # output attributes of all components, but in the end, no one 110 | # should pass an empty string in the first place. 111 | output = self.default_output_attributes() 112 | output['intent_ranking'] = [] 113 | output["text"] = "" 114 | 115 | return output 116 | 117 | message = Message(text, self.default_output_attributes(), time=time) 118 | 119 | for component in self.pipeline: 120 | component.process(message, **self.context) 121 | 122 | output = self.default_output_attributes() 123 | output.update(message.as_dict(only_output_properties=only_output_properties)) 124 | 125 | return output 126 | -------------------------------------------------------------------------------- /bothub/shared/utils/rasa_components/registry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | # Explicitly set logging level for this module before any import 4 | # because otherwise it logs tensorflow/pytorch versions 5 | logging.getLogger("transformers.file_utils").setLevel(logging.WARNING) 6 | 7 | from transformers import ( 8 | TFBertModel, 9 | # TFOpenAIGPTModel, 10 | # TFGPT2Model, 11 | # TFXLNetModel, 12 | # TFXLMModel, 13 | # TFDistilBertModel, 14 | # TFRobertaModel, 15 | BertTokenizer, 16 | # OpenAIGPTTokenizer, 17 | # GPT2Tokenizer, 18 | # XLNetTokenizer, 19 | # XLMTokenizer, 20 | # DistilBertTokenizer, 21 | # RobertaTokenizer, 22 | ) 23 | 24 | from rasa.nlu.utils.hugging_face.transformers_pre_post_processors import ( 25 | bert_tokens_pre_processor, 26 | # gpt_tokens_pre_processor, 27 | # xlnet_tokens_pre_processor, 28 | # roberta_tokens_pre_processor, 29 | bert_embeddings_post_processor, 30 | # gpt_embeddings_post_processor, 31 | # xlnet_embeddings_post_processor, 32 | # roberta_embeddings_post_processor, 33 | bert_tokens_cleaner, 34 | # openaigpt_tokens_cleaner, 35 | # gpt2_tokens_cleaner, 36 | # xlnet_tokens_cleaner, 37 | ) 38 | 39 | language_to_model = { 40 | "en": "bert_english", 41 | "pt_br": "bert_portuguese", 42 | "multilang": "bert_multilang" 43 | } 44 | 45 | from_pt_dict = { 46 | "bert_portuguese": True 47 | } 48 | 49 | model_class_dict = { 50 | "bert_english": TFBertModel, 
51 | "bert_portuguese": TFBertModel, 52 | "bert_multilang": TFBertModel, 53 | } 54 | model_tokenizer_dict = { 55 | "bert_english": BertTokenizer, 56 | "bert_portuguese": BertTokenizer, 57 | "bert_multilang": BertTokenizer, 58 | } 59 | model_weights_defaults = { 60 | "bert_english": "bert-base-uncased", 61 | "bert_portuguese": "neuralmind/bert-base-portuguese-cased", 62 | "bert_multilang": "bert-base-multilingual-uncased" 63 | } 64 | 65 | model_special_tokens_pre_processors = { 66 | "bert_english": bert_tokens_pre_processor, 67 | "bert_portuguese": bert_tokens_pre_processor, 68 | "bert_multilang": bert_tokens_pre_processor, 69 | } 70 | 71 | model_tokens_cleaners = { 72 | "bert_english": bert_tokens_cleaner, 73 | "bert_portuguese": bert_tokens_cleaner, 74 | "bert_multilang": bert_tokens_cleaner, 75 | } 76 | 77 | model_embeddings_post_processors = { 78 | "bert_english": bert_embeddings_post_processor, 79 | "bert_portuguese": bert_embeddings_post_processor, 80 | "bert_multilang": bert_embeddings_post_processor, 81 | } 82 | 83 | model_url = { 84 | "bert_portuguese": "https://bothub-nlp-models.s3.amazonaws.com/bert/bert_portuguese.zip", 85 | "bert_english": "https://bothub-nlp-models.s3.amazonaws.com/bert/bert_english.zip", 86 | "bert_multilang": "https://bothub-nlp-models.s3.amazonaws.com/bert/bert_multilang.zip" 87 | } 88 | -------------------------------------------------------------------------------- /bothub/shared/utils/scripts/download_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to download language models on demand 3 | Usage example: 4 | !python download_models.py pt_br-BERT 5 | """ 6 | 7 | # !/usr/bin/env python 8 | import os 9 | import sys 10 | import subprocess 11 | import logging 12 | import plac 13 | import requests 14 | import zipfile 15 | 16 | from decouple import config 17 | from spacy.cli import download 18 | from spacy.cli import link 19 | from spacy.util import get_package_path 20 | 21 | sys.path.insert( 22 | 1, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) 23 | ) 24 | 25 | from bothub.shared.utils.rasa_components.registry import model_url 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | lang_to_model = { 30 | "en": {"SPACY": "en_core_web_lg", "BERT": "bert_english"}, 31 | "pt_br": { 32 | "SPACY": "pip+pt_nilc_word2vec_cbow_600:https://bothub-nlp-models.s3.amazonaws.com/pt_br-spacy/pt_nilc_word2vec_cbow_600-1.0.0.tar.gz", 33 | "SPACY_SUGGESTION": "pip+pt_nilc_wang2vec_cbow_300:https://bothub-nlp-models.s3.amazonaws.com/pt_br-spacy/pt_nilc_wang2vec_cbow_300-1.0.0.tar.gz", 34 | "BERT": "bert_portuguese", 35 | }, 36 | "es": {"SPACY": "es_core_news_md"}, 37 | "fr": {"SPACY": "fr_core_news_md"}, 38 | "ru": { 39 | "SPACY": "pip+ru_vectors_web_md:https://bothub-nlp-models.s3.amazonaws.com/ru-spacy/ru_vectors_web_md-1.1.0.tar.gz" 40 | }, 41 | "xx": {"BERT": "bert_multilang"}, 42 | } 43 | 44 | 45 | def download_file(url, file_name): 46 | with requests.get(url, stream=True) as r: 47 | r.raise_for_status() 48 | with open(file_name, "wb") as f: 49 | for chunk in r.iter_content(chunk_size=8192): 50 | f.write(chunk) 51 | return file_name 52 | 53 | 54 | def download_bert(model_name): 55 | model_dir = model_name 56 | os.makedirs(model_dir, exist_ok=True) 57 | 58 | zipped_file_name = "temp.zip" 59 | url = model_url.get(model_name) 60 | logger.info(f"downloading {model_name} . . .") 61 | download_file(url, zipped_file_name) 62 | 63 | logger.info(f"extracting {model_name} . . 
.") 64 | with zipfile.ZipFile(zipped_file_name, 'r') as zip_ref: 65 | zip_ref.extractall(model_dir) 66 | os.remove(zipped_file_name) 67 | 68 | 69 | def cast_supported_languages(languages): 70 | return languages.split("|") 71 | 72 | 73 | @plac.annotations( 74 | languages=plac.Annotation(help="Languages to download"), 75 | debug=plac.Annotation(help="Enable debug", kind="flag", abbrev="D"), 76 | ) 77 | def download_models(languages="", debug=False): 78 | logging.basicConfig( 79 | format="%(name)s - %(levelname)s - %(message)s", 80 | level=logging.DEBUG if debug else logging.INFO, 81 | ) 82 | 83 | languages = cast_supported_languages(languages) 84 | 85 | for lang in languages: 86 | lang = lang.split("-") 87 | 88 | lang_slug = lang[0] 89 | model = lang[1] if len(lang) > 1 else None 90 | 91 | if not model or model == "NONE": 92 | continue 93 | 94 | value = lang_to_model.get(lang_slug, {}).get(model, None) 95 | if model.startswith("SPACY"): 96 | if value.startswith("pip+"): 97 | model_name, pip_package = value[4:].split(":", 1) 98 | logger.debug("model name: {}".format(model_name)) 99 | logger.debug("pip package: {}".format(pip_package)) 100 | cmd = [ 101 | sys.executable, 102 | "-m", 103 | "pip", 104 | "install", 105 | "--no-deps", 106 | "--no-cache-dir", 107 | pip_package, 108 | ] 109 | logger.debug(" ".join(cmd)) 110 | if subprocess.call(cmd, env=os.environ.copy()) == 0: 111 | logger.debug("linking: {} to {}".format(model_name, lang_slug)) 112 | package_path = get_package_path(model_name) 113 | link(model_name, lang_slug, force=True, model_path=package_path) 114 | else: 115 | raise Exception("Error to download {}".format(lang_slug)) 116 | elif lang_slug != value: 117 | logger.debug("downloading {}".format(value)) 118 | download(value) 119 | logger.debug("linking: {} to {}".format(value, lang_slug)) 120 | package_path = get_package_path(value) 121 | link(value, lang_slug, force=True, model_path=package_path) 122 | else: 123 | logger.debug("downloading {}".format(value)) 124 | download(value) 125 | elif model == "BERT": 126 | download_bert(value) 127 | 128 | 129 | if __name__ == "__main__": 130 | plac.call(download_models, sys.argv[1:]) 131 | -------------------------------------------------------------------------------- /bothub/shared/utils/scripts/link_lang_spacy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import plac 5 | import importlib 6 | 7 | from pathlib import Path 8 | from spacy.util import get_package_path 9 | from spacy.compat import symlink_to 10 | 11 | 12 | @plac.annotations( 13 | lang=plac.Annotation(help="Language code"), 14 | lang_path=plac.Annotation(help="Language path"), 15 | ) 16 | def link_lang_spacy(lang, lang_path): 17 | origin_path = os.path.join(str(get_package_path("spacy").resolve()), "lang", lang) 18 | try: 19 | symlink_to(Path(origin_path), os.path.abspath(lang_path)) 20 | try: 21 | importlib.import_module("spacy.lang.{}".format(lang)) 22 | print("link created") 23 | except Exception as e: 24 | print("link not created") 25 | raise e 26 | except Exception as e: 27 | print("error to create link to {} from {}".format(lang, lang_path)) 28 | raise e 29 | 30 | 31 | if __name__ == "__main__": 32 | plac.call(link_lang_spacy, sys.argv[1:]) 33 | -------------------------------------------------------------------------------- /celery_app.py: -------------------------------------------------------------------------------- 1 | from bothub_nlp_celery.app import celery_app 2 | 3 | 
from bothub_nlp_celery.tasks import ( 4 | TASK_NLU_PARSE_TEXT, 5 | TASK_NLU_EVALUATE_UPDATE, 6 | TASK_NLU_INTENT_SENTENCE_SUGGESTION_TEXT, 7 | TASK_NLU_TRAIN_UPDATE, 8 | TASK_NLU_WORDS_DISTRIBUTION, 9 | TASK_NLU_DEBUG_PARSE_TEXT, 10 | TASK_NLU_SENTENCE_SUGGESTION_TEXT, 11 | TASK_NLU_WORD_SUGGESTION_TEXT, 12 | ) 13 | 14 | from bothub.shared.utils.backend import backend 15 | 16 | from bothub.nlu_worker.task.parse import parse_text 17 | from bothub.nlu_worker.task.debug_parse import debug_parse_text 18 | from bothub.nlu_worker.task.sentence_suggestion import sentence_suggestion_text 19 | from bothub.nlu_worker.task.word_suggestion import word_suggestion_text 20 | from bothub.nlu_worker.task.intent_sentence_suggestion import ( 21 | intent_sentence_suggestion_text, 22 | ) 23 | from bothub.nlu_worker.task.words_distribution import words_distribution_text 24 | from bothub.nlu_worker.task.evaluate import evaluate_update 25 | 26 | from bothub.shared.evaluate_crossval import evaluate_crossval_update 27 | from bothub.shared.train import train_update 28 | 29 | from bothub.nlu_worker.interpreter_manager import InterpreterManager 30 | 31 | interpreter_manager = InterpreterManager() 32 | 33 | 34 | @celery_app.task(name=TASK_NLU_PARSE_TEXT) 35 | def celery_parse_text(repository_version, repository_authorization, *args, **kwargs): 36 | return parse_text( 37 | repository_version, 38 | repository_authorization, 39 | interpreter_manager, 40 | *args, 41 | **kwargs 42 | ) 43 | 44 | 45 | @celery_app.task(name=TASK_NLU_DEBUG_PARSE_TEXT) 46 | def celery_debug_parse_text( 47 | repository_version, repository_authorization, *args, **kwargs 48 | ): 49 | return debug_parse_text( 50 | repository_version, 51 | repository_authorization, 52 | interpreter_manager, 53 | *args, 54 | **kwargs 55 | ) 56 | 57 | 58 | @celery_app.task(name=TASK_NLU_SENTENCE_SUGGESTION_TEXT) 59 | def celery_sentence_suggestion_text(*args, **kwargs): 60 | return sentence_suggestion_text(*args, **kwargs) 61 | 62 | 63 | @celery_app.task(name=TASK_NLU_INTENT_SENTENCE_SUGGESTION_TEXT) 64 | def celery_intent_sentence_suggestion_text( 65 | repository_version, repository_authorization, *args, **kwargs 66 | ): 67 | return intent_sentence_suggestion_text( 68 | repository_version, repository_authorization, *args, **kwargs 69 | ) 70 | 71 | 72 | @celery_app.task(name=TASK_NLU_WORD_SUGGESTION_TEXT) 73 | def celery_word_suggestion_text(*args, **kwargs): 74 | return word_suggestion_text(*args, **kwargs) 75 | 76 | 77 | @celery_app.task(name=TASK_NLU_TRAIN_UPDATE) 78 | def celery_train_update(repository_version, by_id, repository_authorization): 79 | backend().request_backend_save_queue_id( 80 | update_id=repository_version, 81 | repository_authorization=repository_authorization, 82 | task_id=celery_app.current_task.request.id, 83 | from_queue=1, 84 | type_processing=0, 85 | ) 86 | return train_update(repository_version, by_id, repository_authorization) 87 | 88 | 89 | @celery_app.task(name=TASK_NLU_EVALUATE_UPDATE) 90 | def celery_evaluate_update( 91 | repository_version_id, repository_version_language_id, repository_authorization, cross_validation, language 92 | ): 93 | if cross_validation: 94 | return evaluate_crossval_update( 95 | repository_version_language_id, repository_authorization, {}, language 96 | ) 97 | return evaluate_update( 98 | repository_version_id, repository_version_language_id, repository_authorization, interpreter_manager, language 99 | ) 100 | 101 | 102 | @celery_app.task(name=TASK_NLU_WORDS_DISTRIBUTION) 103 | def 
celery_words_distribution(repository_version, language, repository_authorization): 104 | return words_distribution_text( 105 | repository_version, language, repository_authorization 106 | ) 107 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Attention: 2 | # Use this docker-compose to: 3 | # - Up development environment: docker-compose up 4 | # - Build docker images: docker-compose build 5 | 6 | version: '3.6' 7 | 8 | services: 9 | bothub-nlp-nlu-worker: 10 | image: ${BOTHUB_NLP_NLU_WORKER_DOCKER_IMAGE_NAME:-ilha/bothub-nlp-nlu-worker}:${BOTHUB_NLP_NLU_WORKER_DOCKER_IMAGE_TAG:-latest} 11 | build: 12 | context: . 13 | dockerfile: nlp.Dockerfile 14 | args: 15 | DOWNLOAD_MODELS: xx-NONE 16 | depends_on: 17 | - bothub-nlp-celery-redis 18 | networks: 19 | - default 20 | environment: 21 | # bothub-nlp aws to save charts 22 | - BOTHUB_NLP_AWS_S3_BUCKET_NAME=${BOTHUB_NLP_AWS_S3_BUCKET_NAME} 23 | - BOTHUB_NLP_AWS_ACCESS_KEY_ID=${BOTHUB_NLP_AWS_ACCESS_KEY_ID} 24 | - BOTHUB_NLP_AWS_SECRET_ACCESS_KEY=${BOTHUB_NLP_AWS_SECRET_ACCESS_KEY} 25 | # bothub-nlp env vars 26 | - BOTHUB_NLP_SENTRY_CLIENT=${BOTHUB_NLP_SENTRY_CLIENT} 27 | # bothub-nlp-celery env vars 28 | - BOTHUB_NLP_CELERY_BROKER_URL=${BOTHUB_NLP_CELERY_BROKER_URL:-redis://bothub-nlp-celery-redis:6379/0} 29 | - BOTHUB_NLP_CELERY_BACKEND_URL=${BOTHUB_NLP_CELERY_BACKEND_URL:-redis://bothub-nlp-celery-redis:6379/0} 30 | - BOTHUB_ENGINE_URL=${BOTHUB_ENGINE_URL:-https://api.bothub.it} 31 | - BOTHUB_NLP_LANGUAGE_QUEUE=${BOTHUB_NLP_LANGUAGE_QUEUE:-en} 32 | bothub-ai-platform: 33 | image: ${BOTHUB_NLP_NLU_WORKER_DOCKER_IMAGE_NAME:-ilha/bothub-ai-platform}:${BOTHUB_NLP_NLU_WORKER_DOCKER_IMAGE_TAG:-latest} 34 | build: 35 | context: . 36 | dockerfile: aiplatform.Dockerfile 37 | args: 38 | DOWNLOAD_MODELS: xx-NONE 39 | networks: 40 | - default 41 | 42 | bothub-nlp-celery-redis: 43 | image: redis 44 | ports: 45 | - 6379:6379 46 | -------------------------------------------------------------------------------- /nlp.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 as base 2 | 3 | ENV WORKDIR /home/root/app 4 | ENV LC_ALL C.UTF-8 5 | ENV LANG C.UTF-8 6 | ENV PYTHON_WHEELS_PATH /wheels 7 | ENV PYTHON_BUILD_PACKAGES "software-properties-common curl" 8 | ENV PIP_REQUIREMENTS "-r requirements.txt" 9 | 10 | WORKDIR ${WORKDIR} 11 | 12 | RUN apt-get update && apt-get install --no-install-recommends -y ${PYTHON_BUILD_PACKAGES} git 13 | RUN apt-get install -y python3 python3-pip python3-venv 14 | RUN apt-get install build-essential 15 | 16 | RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections 17 | RUN apt-get install -y ttf-mscorefonts-installer \ 18 | && apt-get autoremove -y \ 19 | && apt-get clean -y \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | 23 | RUN bash -c "ln -s /usr/bin/python3 /usr/bin/python; ln -s /usr/bin/pip3 /usr/bin/pip" 24 | 25 | COPY requirements.txt . 
26 | 27 | FROM base as builder 28 | 29 | ENV BUILD_PACKAGES "build-essential" 30 | 31 | RUN apt-get update && apt-get install --no-install-recommends -y ${BUILD_PACKAGES} 32 | 33 | RUN pip install --upgrade pip 34 | 35 | RUN pip install -U pip setuptools 36 | 37 | RUN pip wheel --wheel-dir=${PYTHON_WHEELS_PATH} ${PIP_REQUIREMENTS} 38 | 39 | FROM base 40 | 41 | COPY --from=builder ${PYTHON_WHEELS_PATH} ${PYTHON_WHEELS_PATH} 42 | 43 | RUN pip install --upgrade pip 44 | 45 | RUN pip install -U pip setuptools 46 | 47 | RUN pip install --find-links=${PYTHON_WHEELS_PATH} ${PIP_REQUIREMENTS} 48 | 49 | COPY bothub ${WORKDIR}/bothub 50 | 51 | COPY start_celery.py . 52 | COPY celery_app.py . 53 | 54 | ARG DOWNLOAD_MODELS 55 | #Install torch with cuda 10.1 56 | RUN if [ "${DOWNLOAD_MODELS}" = "pt_br-BERT" ]; then \ 57 | pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html; \ 58 | fi 59 | 60 | RUN if [ ${DOWNLOAD_MODELS} ]; then \ 61 | python3.6 bothub/shared/utils/scripts/download_models.py ${DOWNLOAD_MODELS}; \ 62 | fi 63 | 64 | ENTRYPOINT [ "celery", "worker", "--autoscale", "1,1", "-O", "fair", "--workdir", ".", "-A", "celery_app", "-c", "5", "-l", "INFO", "-E", "--pool", "threads" ] 65 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | aiofiles==0.4.0 3 | aiohttp==3.6.2 4 | amqp==5.0.6 5 | appdirs==1.4.3 6 | APScheduler==3.6.3 7 | astor==0.8.1 8 | async-generator==1.10 9 | async-timeout==3.0.1 10 | attrs==19.3.0 11 | billiard==3.6.4.0 12 | black==19.3b0 13 | blis==0.2.4 14 | git+https://github.com/bothub-it/bothub-nlp-celery.git@0.1.38 15 | git+https://github.com/bothub-it/bothub-backend.git@1.0.22 16 | boto3==1.12.25 17 | botocore==1.15.25 18 | bz2file==0.98 19 | cachetools==4.0.0 20 | celery==5.1.2 21 | certifi==2019.11.28 22 | cffi==1.14.0 23 | chardet==3.0.4 24 | click==7.1.1 25 | cloudpickle==1.2.2 26 | colorclass==2.2.0 27 | coloredlogs==10.0 28 | colorhash==1.0.2 29 | ConfigArgParse==1.1 30 | contextvars==2.3 31 | cryptography==2.8 32 | cycler==0.10.0 33 | cymem==2.0.3 34 | DAWG-Python==0.7.2 35 | decorator==4.4.2 36 | dill==0.3.1.1 37 | dnspython==1.16.0 38 | docopt==0.6.2 39 | docutils==0.15.2 40 | dopamine-rl==3.0.1 41 | entrypoints==0.3 42 | fbmessenger==6.0.0 43 | flake8==3.7.9 44 | Flask==1.1.1 45 | Flask-Cors==3.0.8 46 | future==0.18.2 47 | gast==0.2.2 48 | gevent==1.4.0 49 | gin-config==0.3.0 50 | google-api-core==1.16.0 51 | google-api-python-client==1.8.0 52 | google-auth==1.11.3 53 | google-auth-httplib2==0.0.3 54 | google-auth-oauthlib==0.4.1 55 | google-pasta==0.2.0 56 | googleapis-common-protos==1.51.0 57 | greenlet==0.4.15 58 | grpcio==1.27.2 59 | gunicorn==20.0.4 60 | gym==0.17.1 61 | h11==0.8.1 62 | h2==3.2.0 63 | h5py==2.10.0 64 | hpack==3.0.0 65 | hstspreload==2020.3.17 66 | httplib2==0.17.0 67 | httptools==0.1.1 68 | httpx==0.9.3 69 | humanfriendly==8.1 70 | hyperframe==5.2.0 71 | idna==2.7 72 | idna-ssl==1.1.0 73 | imageio==2.8.0 74 | immutables==0.6 75 | importlib-metadata==1.6.0 76 | itsdangerous==1.1.0 77 | Jinja2==2.11.1 78 | jmespath==0.9.5 79 | joblib==0.14.1 80 | jsonpickle==1.3 81 | jsonschema==3.2.0 82 | kafka-python==1.4.7 83 | Keras-Applications==1.0.8 84 | Keras-Preprocessing==1.1.0 85 | kfac==0.2.0 86 | kiwisolver==1.1.0 87 | kombu==5.1.0 88 | lime==0.1.1.36 89 | Markdown==3.2.1 90 | MarkupSafe==1.1.1 91 | matplotlib==3.1.2 92 | 
mattermostwrapper==2.2 93 | mccabe==0.6.1 94 | mesh-tensorflow==0.1.12 95 | mpmath==1.1.0 96 | multidict==4.7.5 97 | murmurhash==1.0.2 98 | networkx==2.4 99 | nltk==3.4.5 100 | numpy==1.18.1 101 | oauth2client==4.1.3 102 | oauthlib==3.1.0 103 | opencv-python==4.2.0.32 104 | opt-einsum==3.2.1 105 | packaging==20.0 106 | pika==1.1.0 107 | Pillow==7.0.0 108 | plac==0.9.6 109 | preshed==2.0.1 110 | promise==2.3 111 | prompt-toolkit==2.0.10 112 | protobuf==3.11.3 113 | psycopg2-binary==2.8.5 114 | pyasn1==0.4.8 115 | pyasn1-modules==0.2.8 116 | pycodestyle==2.5.0 117 | pycparser==2.20 118 | pydot==1.4.1 119 | pyflakes==2.1.1 120 | pyglet==1.5.0 121 | PyJWT==1.7.1 122 | pykwalify==1.7.0 123 | pymongo==3.8.0 124 | pymorphy2==0.8 125 | pymorphy2-dicts==2.4.393442.3710985 126 | pyparsing==2.4.6 127 | pypng==0.0.20 128 | pyrsistent==0.16.0 129 | PySocks==1.7.1 130 | pythainlp==2.1.4 131 | python-crfsuite==0.9.7 132 | python-dateutil==2.8.1 133 | python-decouple==3.3 134 | python-engineio==3.11.2 135 | python-socketio==4.4.0 136 | python-telegram-bot==11.1.0 137 | pytz==2019.3 138 | PyWavelets==1.1.1 139 | PyYAML==5.3.1 140 | questionary==1.5.1 141 | redis==3.4.1 142 | requests==2.23.0 143 | requests-oauthlib==1.3.0 144 | requests-toolbelt==0.9.1 145 | rfc3986==1.3.2 146 | rocketchat-API==0.6.36 147 | rsa==4.0 148 | ruamel.yaml==0.16.10 149 | ruamel.yaml.clib==0.2.0 150 | s3transfer==0.3.3 151 | sanic==19.12.2 152 | Sanic-Cors==0.10.0.post3 153 | sanic-jwt==1.3.2 154 | Sanic-Plugins-Framework==0.9.2 155 | scikit-image==0.16.2 156 | scikit-learn==0.22.2.post1 157 | scipy==1.4.1 158 | sentry-sdk==0.13.2 159 | six==1.14.0 160 | sklearn-crfsuite==0.3.6 161 | slackclient==2.5.0 162 | sniffio==1.1.0 163 | spacy==2.1.9 164 | SQLAlchemy==1.3.15 165 | srsly==1.0.2 166 | sympy==1.5.1 167 | tabulate==0.8.6 168 | tensor2tensor==1.14.1 169 | tensorboard==2.1.1 170 | tensorflow==2.1.2 171 | tensorflow-addons==0.7.1 172 | tensorflow-datasets==2.1.0 173 | tensorflow-estimator==2.1.0 174 | tensorflow-gan==2.0.0 175 | tensorflow-hub==0.7.0 176 | tensorflow-metadata==0.21.1 177 | tensorflow-probability==0.7.0 178 | termcolor==1.1.0 179 | terminaltables==3.1.0 180 | thinc==7.0.8 181 | tinydb==3.15.2 182 | toml==0.10.0 183 | tqdm==4.31.1 184 | twilio==6.26.3 185 | typeguard==2.7.1 186 | typing-extensions==3.7.4.1 187 | tzlocal==2.0.0 188 | ujson==1.35 189 | Unidecode==1.1.1 190 | uritemplate==3.0.1 191 | urllib3==1.24.3 192 | uvloop==0.14.0 193 | vine==5.0.0 194 | wasabi==0.6.0 195 | wcwidth==0.1.8 196 | webexteamssdk==1.1.1 197 | websocket-client==0.54.0 198 | websockets==8.1 199 | Werkzeug==1.0.0 200 | wrapt==1.12.1 201 | yarl==1.4.2 202 | zipp==3.1.0 203 | rasa==1.10.6 204 | transformers==2.11.0 205 | emoji==0.6.0 206 | recognizers-text-suite -------------------------------------------------------------------------------- /start_celery.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from bothub_nlp_celery.actions import queue_name 3 | from bothub_nlp_celery import settings 4 | 5 | if settings.BOTHUB_LANGUAGE_MODEL: 6 | queue = queue_name(settings.BOTHUB_NLP_LANGUAGE_QUEUE, model_name=settings.BOTHUB_LANGUAGE_MODEL,) 7 | else: 8 | queue = settings.BOTHUB_NLP_LANGUAGE_QUEUE 9 | 10 | 11 | subprocess.run( 12 | [ 13 | "celery", 14 | "-A", 15 | "celery_app", 16 | "worker", 17 | "-O", 18 | "fair", 19 | "-c", 20 | "1", 21 | "-l", 22 | "INFO", 23 | "-E", 24 | "--pool", 25 | "gevent", 26 | "-Q", 27 | queue, 28 | ] 29 | ) 30 | 
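For reference, a minimal sketch of the queue-selection logic that start_celery.py delegates to bothub_nlp_celery: the real queue_name lives in bothub_nlp_celery.actions, so the colon-joined format below is only an illustrative assumption, not the package's actual behaviour.

import os

def queue_name(language_queue, model_name=None):
    # Hypothetical stand-in for bothub_nlp_celery.actions.queue_name;
    # the real separator/format is defined in that package.
    return f"{language_queue}:{model_name}" if model_name else language_queue

# Mirrors start_celery.py: each worker consumes a single language/model-specific queue,
# which is how the per-deployment images (e.g. deployment-bert-ptbr.json) stay isolated.
language_queue = os.environ.get("BOTHUB_NLP_LANGUAGE_QUEUE", "en")
language_model = os.environ.get("BOTHUB_LANGUAGE_MODEL")  # e.g. "BERT", "SPACY" or unset

queue = queue_name(language_queue, model_name=language_model) if language_model else language_queue
print(queue)  # e.g. "pt_br:BERT" under the assumption above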
-------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ### Testing 2 | - test_train.py is configured to test en-BERT by default 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/tests/__init__.py -------------------------------------------------------------------------------- /tests/example_bert_pt_br.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/tests/example_bert_pt_br.tar.gz -------------------------------------------------------------------------------- /tests/example_generic_language.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/tests/example_generic_language.tar.gz -------------------------------------------------------------------------------- /tests/shared/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weni-ai/bothub-nlp/d43b7657aff01544769b71db74adce5f7d366879/tests/shared/__init__.py -------------------------------------------------------------------------------- /tests/shared/test_pipeline_builder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | import sys 5 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) 6 | 7 | from bothub.shared.utils.pipeline_builder import PipelineBuilder 8 | from rasa.nlu.registry import class_from_module_path 9 | 10 | 11 | class TestPipelineBuilder(unittest.TestCase): 12 | def setUp(self, *args): 13 | self.update = { 14 | 'language': 'en', 15 | 'repository_version': 47, 16 | 'repository_uuid': '1d8e0d6f-1941-42a3-84c5-788706c7072e', 17 | 'intent': [4, 5], 18 | 'algorithm': 'transformer_network_diet_bert', 19 | 'use_name_entities': False, 20 | 'use_competing_intents': False, 21 | 'use_analyze_char': False, 22 | 'total_training_end': 0, 23 | 'dataset_size': 15000, 24 | } 25 | 26 | self.pipeline_builder = PipelineBuilder(self.update) 27 | 28 | list_dir = os.listdir() 29 | while 'bert_english' not in list_dir: 30 | os.chdir("../") 31 | list_dir = os.listdir() 32 | 33 | def test__add_spacy_nlp(self): 34 | component_name = self.pipeline_builder._add_spacy_nlp().get('name') 35 | if '.' in component_name: 36 | class_from_module_path(component_name) 37 | 38 | def test__add_whitespace_tokenizer(self): 39 | component_name = self.pipeline_builder._add_whitespace_tokenizer().get('name') 40 | if '.' in component_name: 41 | class_from_module_path(component_name) 42 | 43 | def test__add_preprocessing(self): 44 | component_name = self.pipeline_builder._add_preprocessing().get('name') 45 | if '.' in component_name: 46 | class_from_module_path(component_name) 47 | 48 | def test__add_regex_entity_extractor(self): 49 | component_name = self.pipeline_builder._add_regex_entity_extractor().get('name') 50 | if '.' 
in component_name: 51 | class_from_module_path(component_name) 52 | 53 | def test__add_countvectors_featurizer(self): 54 | components_list = self.pipeline_builder._add_countvectors_featurizer() 55 | for component in components_list: 56 | component_name = component.get('name') 57 | if '.' in component_name: 58 | class_from_module_path(component_name) 59 | 60 | def test__add_legacy_countvectors_featurizer(self): 61 | component_name = self.pipeline_builder._add_legacy_countvectors_featurizer().get('name') 62 | if '.' in component_name: 63 | class_from_module_path(component_name) 64 | 65 | def test__add_microsoft_entity_extractor(self): 66 | component_name = self.pipeline_builder._add_microsoft_entity_extractor().get('name') 67 | if '.' in component_name: 68 | class_from_module_path(component_name) 69 | 70 | def test__add_embedding_intent_classifier(self): 71 | component_name = self.pipeline_builder._add_embedding_intent_classifier().get('name') 72 | if '.' in component_name: 73 | class_from_module_path(component_name) 74 | 75 | def test__add_diet_classifier(self): 76 | component_name = self.pipeline_builder._add_diet_classifier().get('name') 77 | if '.' in component_name: 78 | class_from_module_path(component_name) 79 | 80 | def test__legacy_internal_config(self): 81 | components_list = self.pipeline_builder._legacy_internal_config() 82 | for component in components_list: 83 | component_name = component.get('name') 84 | if '.' in component_name: 85 | class_from_module_path(component_name) 86 | 87 | def test__legacy_external_config(self): 88 | components_list = self.pipeline_builder._legacy_external_config() 89 | for component in components_list: 90 | component_name = component.get('name') 91 | if '.' in component_name: 92 | class_from_module_path(component_name) 93 | 94 | def test__transformer_network_diet_config(self): 95 | components_list = self.pipeline_builder._transformer_network_diet_config() 96 | for component in components_list: 97 | component_name = component.get('name') 98 | if '.' in component_name: 99 | class_from_module_path(component_name) 100 | 101 | def test__transformer_network_diet_word_embedding_config(self): 102 | components_list = self.pipeline_builder._transformer_network_diet_word_embedding_config() 103 | for component in components_list: 104 | component_name = component.get('name') 105 | if '.' in component_name: 106 | class_from_module_path(component_name) 107 | 108 | def test__transformer_network_diet_bert_config(self): 109 | components_list = self.pipeline_builder._transformer_network_diet_bert_config() 110 | for component in components_list: 111 | component_name = component.get('name') 112 | if '.' 
in component_name: 113 | class_from_module_path(component_name) 114 | 115 | def test_unexisting_model_language(self): 116 | update = { 117 | 'language': 'unexisting', 118 | 'algorithm': 'neural_network_external', 119 | 'use_name_entities': False, 120 | 'dataset_size': 15000, 121 | } 122 | pipeline_builder = PipelineBuilder(update) 123 | self.assertEqual(pipeline_builder.model, None) 124 | 125 | update['algorithm'] = 'transformer_network_diet' 126 | pipeline_builder = PipelineBuilder(update) 127 | self.assertEqual(pipeline_builder.model, None) 128 | 129 | update['algorithm'] = 'neural_network_internal' 130 | pipeline_builder = PipelineBuilder(update) 131 | self.assertEqual(pipeline_builder.model, None) 132 | 133 | update = { 134 | 'language': 'en', 135 | 'algorithm': 'transformer_network_diet_bert', 136 | 'use_name_entities': True, 137 | 'dataset_size': 15000, 138 | } 139 | pipeline_builder = PipelineBuilder(update) 140 | self.assertEqual(pipeline_builder.model, 'BERT') 141 | 142 | def test__dynamic_epochs(self): 143 | self.update["dataset_size"] = 10000 144 | self.pipeline_builder = PipelineBuilder(self.update) 145 | result_epochs = self.pipeline_builder._calculate_epochs_number( 146 | 100, 147 | self.pipeline_builder._epoch_factor_function1 148 | ) 149 | self.assertEqual(result_epochs, 100) 150 | 151 | self.update["dataset_size"] = 15000 152 | self.pipeline_builder = PipelineBuilder(self.update) 153 | result_epochs = self.pipeline_builder._calculate_epochs_number( 154 | 100, 155 | self.pipeline_builder._epoch_factor_function1 156 | ) 157 | self.assertLess(result_epochs, 100) 158 | self.assertGreater(result_epochs, 0) 159 | 160 | self.update["dataset_size"] = 0 161 | self.pipeline_builder = PipelineBuilder(self.update) 162 | result_epochs = self.pipeline_builder._calculate_epochs_number( 163 | 100, 164 | self.pipeline_builder._epoch_factor_function1 165 | ) 166 | self.assertEqual(result_epochs, 100) 167 | 168 | -------------------------------------------------------------------------------- /tests/shared/test_preprocesing.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import emoji 4 | 5 | import sys 6 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) 7 | 8 | from bothub.shared.utils.preprocessing.preprocessing_factory import PreprocessingFactory 9 | from bothub.shared.utils.preprocessing.preprocessing_base import PreprocessingBase 10 | from bothub.shared.utils.preprocessing.preprocessing_english import PreprocessingEnglish 11 | from bothub.shared.utils.preprocessing.preprocessing_portuguese import PreprocessingPortuguese 12 | from bothub.shared.utils.preprocessing.preprocessing_spanish import PreprocessingSpanish 13 | from rasa.nlu.training_data import Message 14 | 15 | 16 | class TestPreprocessing(unittest.TestCase): 17 | def setUp(self, *args): 18 | self.base = PreprocessingFactory().factory() 19 | self.portuguese = PreprocessingFactory('pt_br').factory() 20 | self.english = PreprocessingFactory('en').factory() 21 | self.spanish = PreprocessingFactory('es').factory() 22 | 23 | def test__factory(self): 24 | base = PreprocessingFactory().factory() 25 | self.assertIsInstance(base, PreprocessingBase) 26 | base = PreprocessingFactory('unexisting_language').factory() 27 | self.assertIsInstance(base, PreprocessingBase) 28 | portuguese = PreprocessingFactory('pt_br').factory() 29 | self.assertIsInstance(portuguese, PreprocessingPortuguese) 30 | english = 
PreprocessingFactory('en').factory() 31 | self.assertIsInstance(english, PreprocessingEnglish) 32 | spanish = PreprocessingFactory('es').factory() 33 | self.assertIsInstance(spanish, PreprocessingSpanish) 34 | 35 | def test__default_preprocessing(self): 36 | phrase = "i'`m GOING não tô é the gym" 37 | expected = "im going nao to e the gym" 38 | self.assertEqual(self.base.default_preprocessing(phrase), (expected, None)) 39 | self.assertEqual(self.portuguese.default_preprocessing(phrase), (expected, None)) 40 | self.assertEqual(self.english.default_preprocessing(phrase), (expected, None)) 41 | self.assertEqual(self.spanish.default_preprocessing(phrase), (expected, None)) 42 | 43 | self.assertRaises(ValueError, self.base.default_preprocessing, None) 44 | 45 | phrase = "i'`m GOING não tô é the 'gym" 46 | expected = "im going nao to e the gym" 47 | entities = [ 48 | { 49 | "start": 0, 50 | "end": 4, 51 | "value": "i'`m", 52 | "entity": "me" 53 | }, 54 | { 55 | "start": 24, 56 | "end": 28, 57 | "value": "'gym", 58 | "entity": "gym" 59 | }, 60 | ] 61 | expected_entities = [ 62 | { 63 | "start": 0, 64 | "end": 2, 65 | "value": "im", 66 | "entity": "me" 67 | }, 68 | { 69 | "start": 22, 70 | "end": 25, 71 | "value": "gym", 72 | "entity": "gym" 73 | }, 74 | ] 75 | self.assertEqual( 76 | self.base.default_preprocessing(phrase, entities), 77 | (expected, expected_entities) 78 | ) 79 | self.assertEqual( 80 | self.base.default_preprocessing(phrase, None), 81 | (expected, None) 82 | ) 83 | 84 | def test__extract_emoji_text(self): 85 | emoji_code = ':smile_face:' 86 | emoji_text = 'smile face' 87 | self.assertEqual(self.base.extract_emoji_text(emoji_code), emoji_text) 88 | self.assertRaises(ValueError, self.base.extract_emoji_text, None) 89 | self.assertRaises(ValueError, self.base.extract_emoji_text, 'not a emoji code') 90 | 91 | def test__emoji_handling(self): 92 | self.assertEqual(self.base.emoji_handling('😂'), "face with tears of joy") 93 | self.assertEqual(self.base.emoji_handling(''), '') 94 | 95 | for emoji_code in self.portuguese.emoji_contractions.keys(): 96 | # transform code to emoji 97 | emj = emoji.emojize(emoji_code) 98 | 99 | self.assertEqual(self.portuguese.emoji_handling(emj), self.portuguese.emoji_contractions[emoji_code]) 100 | self.assertEqual(self.english.emoji_handling(emj), self.english.emoji_contractions[emoji_code]) 101 | self.assertEqual(self.spanish.emoji_handling(emj), self.spanish.emoji_contractions[emoji_code]) 102 | 103 | def test__parse_preprocess(self): 104 | 105 | phrase = "i'`m GOING não tô é the gym 😂" 106 | 107 | self.assertEqual(self.base.preprocess(Message(text=phrase)).text, "im going nao to e the gym face with tears of joy") 108 | self.assertEqual(self.portuguese.preprocess(Message(text=phrase)).text, "im going nao to e the gym hahaha") 109 | self.assertEqual(self.english.preprocess(Message(text=phrase)).text, "im going nao to e the gym hahaha") 110 | self.assertEqual(self.spanish.preprocess(Message(text=phrase)).text, "im going nao to e the gym hahaha") 111 | 112 | pp = PreprocessingFactory(remove_accent=False).factory() 113 | self.assertEqual(pp.preprocess(Message(text=phrase)).text, "im going não tô é the gym face with tears of joy") 114 | pp = PreprocessingFactory('pt_br', remove_accent=False).factory() 115 | self.assertEqual(pp.preprocess(Message(text=phrase)).text, "im going não tô é the gym hahaha") 116 | pp = PreprocessingFactory('en', remove_accent=False).factory() 117 | self.assertEqual(pp.preprocess(Message(text=phrase)).text, "im going não tô é the gym 
hahaha") 118 | pp = PreprocessingFactory('es', remove_accent=False).factory() 119 | self.assertEqual(pp.preprocess(Message(text=phrase)).text, "im going não tô é the gym hahaha") 120 | 121 | def test__training_preprocess(self): 122 | preprocessors = [ 123 | PreprocessingFactory(remove_accent=False).factory(), 124 | PreprocessingFactory('pt_br', remove_accent=False).factory(), 125 | PreprocessingFactory('en', remove_accent=False).factory(), 126 | PreprocessingFactory('es', remove_accent=False).factory() 127 | ] 128 | for preprocessor in preprocessors: 129 | phrase = "i'`m GOING não tô é the 'gym" 130 | expected_phrase = "im going não tô é the gym" 131 | entities = [ 132 | { 133 | "start": 0, 134 | "end": 4, 135 | "value": "i'`m", 136 | "entity": "me" 137 | }, 138 | { 139 | "start": 24, 140 | "end": 28, 141 | "value": "'gym", 142 | "entity": "gym" 143 | }, 144 | ] 145 | expected_entities = [ 146 | { 147 | "start": 0, 148 | "end": 2, 149 | "value": "im", 150 | "entity": "me" 151 | }, 152 | { 153 | "start": 22, 154 | "end": 25, 155 | "value": "gym", 156 | "entity": "gym" 157 | }, 158 | ] 159 | message = Message.build( 160 | text=phrase, 161 | intent='test', 162 | entities=entities, 163 | ) 164 | 165 | self.assertEqual( 166 | preprocessor.preprocess(message).text, 167 | expected_phrase 168 | ) 169 | self.assertEqual( 170 | preprocessor.preprocess(message).data.get('entities'), 171 | expected_entities 172 | ) 173 | 174 | message = Message.build( 175 | text=phrase, 176 | intent='test', 177 | entities=None, 178 | ) 179 | self.assertEqual( 180 | preprocessor.preprocess(message).text, 181 | expected_phrase 182 | ) 183 | with self.assertRaises(KeyError): 184 | _ = preprocessor.preprocess(message).data['entities'] 185 | 186 | def test_example(self): 187 | example = { 188 | "text": "The new coronavirus doesn\u2019t affect young people.", 189 | "intent": "myth", 190 | "entities": [ 191 | { 192 | "start": 8, 193 | "end": 19, 194 | "value": "coronavirus", 195 | "entity": "coronavirus" 196 | }, 197 | { 198 | "start": 35, 199 | "end": 40, 200 | "value": "young", 201 | "entity": "young" 202 | } 203 | ] 204 | } 205 | message = Message.build( 206 | text=example['text'], 207 | intent=example['intent'], 208 | entities=example['entities'], 209 | ) 210 | 211 | result = PreprocessingFactory('en', remove_accent=False).factory().preprocess(message) 212 | result2 = PreprocessingFactory('en', remove_accent=False).factory().preprocess(Message(text=example['text'])) 213 | 214 | self.assertEqual(result.text, result2.text) 215 | 216 | def test__preprocess_text(self): 217 | phrase = "i'`m GOING não tô é the gym" 218 | expected = "im going nao to e the gym" 219 | self.assertEqual(self.base.preprocess_text(phrase), expected) 220 | self.assertEqual(self.portuguese.preprocess_text(phrase), expected) 221 | self.assertEqual(self.english.preprocess_text(phrase), expected) 222 | self.assertEqual(self.spanish.preprocess_text(phrase), expected) 223 | -------------------------------------------------------------------------------- /tests/test_debug_parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import unittest 3 | import uuid 4 | import base64 5 | import os 6 | from unittest.mock import patch 7 | 8 | import sys 9 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 10 | 11 | from bothub.nlu_worker.task.debug_parse import debug_parse_text 12 | from bothub.nlu_worker.interpreter_manager import InterpreterManager 13 | 14 | 15 | class 
TestDebugParseTask(unittest.TestCase): 16 | def setUp(self, *args): 17 | self.repository_authorization = uuid.uuid4() 18 | self.current_update = { 19 | "ready_for_train": True, 20 | "current_version_id": 6647, 21 | "repository_authorization_user_id": 303, 22 | } 23 | self.interpreter_manager = InterpreterManager() 24 | 25 | # change directory to /tests 26 | try: 27 | os.chdir("tests") 28 | except Exception: 29 | pass 30 | 31 | @patch( 32 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 33 | return_value={ 34 | "version_id": 49, 35 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 36 | "total_training_end": 3, 37 | "language": "en", 38 | "bot_data": base64.b64encode( 39 | open("example_generic_language.tar.gz", "rb").read() 40 | ), 41 | }, 42 | ) 43 | @patch( 44 | "bothub_backend.bothub.BothubBackend.request_backend_info", 45 | return_value={"intents": ["affirmative", "negative", "doubt", "bias"]}, 46 | ) 47 | def test_debug_parse_without_rasa_format(self, *args): 48 | result = debug_parse_text( 49 | self.current_update.get("current_version_id"), 50 | self.repository_authorization, 51 | self.interpreter_manager, 52 | "ok", 53 | ) 54 | print(json.dumps(result, indent=2)) 55 | 56 | @patch( 57 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 58 | return_value={ 59 | "version_id": 49, 60 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 61 | "total_training_end": 3, 62 | "language": "en", 63 | "bot_data": base64.b64encode( 64 | open("example_generic_language.tar.gz", "rb").read() 65 | ), 66 | }, 67 | ) 68 | @patch( 69 | "bothub_backend.bothub.BothubBackend.request_backend_info", 70 | return_value={"intents": ["affirmative", "negative", "doubt", "bias"]}, 71 | ) 72 | def test_debug_parse_with_rasa_format(self, *args): 73 | 74 | result = debug_parse_text( 75 | self.current_update.get("current_version_id"), 76 | self.repository_authorization, 77 | self.interpreter_manager, 78 | "ok", 79 | True, 80 | ) 81 | print(json.dumps(result, indent=2)) 82 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import uuid 3 | import base64 4 | import os 5 | from unittest.mock import patch 6 | 7 | import sys 8 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | from bothub.nlu_worker.task.evaluate import evaluate_update 11 | from bothub.nlu_worker.interpreter_manager import InterpreterManager 12 | 13 | 14 | class TestEvaluateTask(unittest.TestCase): 15 | def setUp(self, *args): 16 | self.repository_authorization = uuid.uuid4() 17 | self.repository_version = 1 18 | self.current_update = { 19 | "ready_for_train": True, 20 | "repository_version": 2, 21 | "current_version_id": 6647, 22 | "repository_authorization_user_id": 303, 23 | } 24 | self.language = "pt_br" 25 | self.interpreter_manager = InterpreterManager() 26 | 27 | # change directory to /tests 28 | try: 29 | os.chdir("tests") 30 | except Exception: 31 | pass 32 | 33 | @patch( 34 | "bothub_backend.bothub.BothubBackend.request_backend_start_evaluation", 35 | return_value=[ 36 | {"text": "nops", "intent": "negative", "entities": []}, 37 | {"text": "nope", "intent": "negative", "entities": []}, 38 | {"text": "nem", "intent": "negative", "entities": []}, 39 | {"text": "no", "intent": "negative", "entities": []}, 40 | {"text": "nn", "intent": "negative", "entities": []}, 41 | {"text": 
"n", "intent": "negative", "entities": []}, 42 | {"text": "ja namorei", "intent": "affirmative", "entities": []}, 43 | {"text": "já namorei", "intent": "affirmative", "entities": []}, 44 | {"text": "simn", "intent": "affirmative", "entities": []}, 45 | { 46 | "text": "aceito sim, muito obrigado", 47 | "intent": "affirmative", 48 | "entities": [], 49 | }, 50 | {"text": "sim, quero o documento", "intent": "affirmative", "entities": []}, 51 | {"text": "não posso fazer isso", "intent": "negative", "entities": []}, 52 | {"text": "não gostei", "intent": "negative", "entities": []}, 53 | {"text": "deixa para lá", "intent": "negative", "entities": []}, 54 | {"text": "não inventa história", "intent": "negative", "entities": []}, 55 | { 56 | "text": "não queria ter que dizer isso", 57 | "intent": "negative", 58 | "entities": [], 59 | }, 60 | {"text": "não gostei daquele dia", "intent": "negative", "entities": []}, 61 | { 62 | "text": "nem deve ser tão bom assim", 63 | "intent": "negative", 64 | "entities": [], 65 | }, 66 | {"text": "não aceito", "intent": "negative", "entities": []}, 67 | {"text": "nop deixa de onda", "intent": "negative", "entities": []}, 68 | {"text": "melhor nao falar nada", "intent": "negative", "entities": []}, 69 | {"text": "n gosto disso", "intent": "negative", "entities": []}, 70 | {"text": "para com isso, não pode", "intent": "negative", "entities": []}, 71 | {"text": "melhor não", "intent": "negative", "entities": []}, 72 | {"text": "quero mais não", "intent": "negative", "entities": []}, 73 | {"text": "negativo cara", "intent": "negative", "entities": []}, 74 | {"text": "vamo não", "intent": "negative", "entities": []}, 75 | {"text": "vou nem mentir", "intent": "negative", "entities": []}, 76 | {"text": "nem queria dizer isso", "intent": "negative", "entities": []}, 77 | {"text": "funcionou não", "intent": "negative", "entities": []}, 78 | {"text": "nem rola", "intent": "negative", "entities": []}, 79 | {"text": "não posso", "intent": "negative", "entities": []}, 80 | {"text": "não quero", "intent": "negative", "entities": []}, 81 | {"text": "conta comigo", "intent": "affirmative", "entities": []}, 82 | {"text": "sim, preciso de ajuda", "intent": "affirmative", "entities": []}, 83 | {"text": "é, você está certo sim", "intent": "affirmative", "entities": []}, 84 | {"text": "muito bom, aceito", "intent": "affirmative", "entities": []}, 85 | {"text": "sim, gostei disso", "intent": "affirmative", "entities": []}, 86 | {"text": "conte comigo sempre", "intent": "affirmative", "entities": []}, 87 | {"text": "afirmativo", "intent": "affirmative", "entities": []}, 88 | {"text": "ótima ideia, concordo", "intent": "affirmative", "entities": []}, 89 | {"text": "podemos marcar sim", "intent": "affirmative", "entities": []}, 90 | {"text": "quero sim", "intent": "affirmative", "entities": []}, 91 | {"text": "pode contar comigo", "intent": "affirmative", "entities": []}, 92 | { 93 | "text": "posso sim! 
me confirma a data", 94 | "intent": "affirmative", 95 | "entities": [], 96 | }, 97 | { 98 | "text": "claro que estou disponivel", 99 | "intent": "affirmative", 100 | "entities": [], 101 | }, 102 | {"text": "ótima ideia", "intent": "affirmative", "entities": []}, 103 | { 104 | "text": "seria legal se fossemos", 105 | "intent": "affirmative", 106 | "entities": [], 107 | }, 108 | {"text": "que legal, gosto sim", "intent": "affirmative", "entities": []}, 109 | {"text": "é possivel", "intent": "affirmative", "entities": []}, 110 | {"text": "pode me mandar sim", "intent": "affirmative", "entities": []}, 111 | {"text": "aceito", "intent": "affirmative", "entities": []}, 112 | {"text": "dá sim", "intent": "affirmative", "entities": []}, 113 | { 114 | "text": "adorei a ideia vamos sim", 115 | "intent": "affirmative", 116 | "entities": [], 117 | }, 118 | {"text": "quero", "intent": "affirmative", "entities": []}, 119 | {"text": "vamos sim", "intent": "affirmative", "entities": []}, 120 | {"text": "claro", "intent": "affirmative", "entities": []}, 121 | {"text": "com certeza", "intent": "affirmative", "entities": []}, 122 | {"text": "estou", "intent": "affirmative", "entities": []}, 123 | {"text": "consigu", "intent": "affirmative", "entities": []}, 124 | {"text": "consigo", "intent": "affirmative", "entities": []}, 125 | {"text": "não tenho", "intent": "negative", "entities": []}, 126 | {"text": "nem tenho", "intent": "negative", "entities": []}, 127 | {"text": "pior que não tenho", "intent": "negative", "entities": []}, 128 | {"text": "não tenho email", "intent": "negative", "entities": []}, 129 | {"text": "voces fazem coroa dentaria ?", "intent": "bias", "entities": []}, 130 | {"text": "o plano inclui ceromero?", "intent": "bias", "entities": []}, 131 | {"text": "e buco maxilar facial?", "intent": "bias", "entities": []}, 132 | {"text": "varias vezes", "intent": "affirmative", "entities": []}, 133 | {"text": "um pouco", "intent": "affirmative", "entities": []}, 134 | {"text": "acho que faço isso", "intent": "doubt", "entities": []}, 135 | {"text": "quero sim", "intent": "affirmative", "entities": []}, 136 | {"text": "Não estou bem hoje", "intent": "negative", "entities": []}, 137 | {"text": "não quero mais isso", "intent": "negative", "entities": []}, 138 | {"text": "não estou namorando", "intent": "negative", "entities": []}, 139 | {"text": "a ta sei", "intent": "affirmative", "entities": []}, 140 | {"text": "Nunca namorei", "intent": "negative", "entities": []}, 141 | { 142 | "text": "não, como faço para reconhecer?", 143 | "intent": "negative", 144 | "entities": [], 145 | }, 146 | {"text": "mais ou menos, pq?", "intent": "doubt", "entities": []}, 147 | { 148 | "text": "já mas não foi muito bom", 149 | "intent": "affirmative", 150 | "entities": [], 151 | }, 152 | {"text": "tudo ótimo", "intent": "affirmative", "entities": []}, 153 | {"text": "tudo otimo", "intent": "affirmative", "entities": []}, 154 | {"text": "tudo", "intent": "affirmative", "entities": []}, 155 | {"text": "tudo bem", "intent": "affirmative", "entities": []}, 156 | {"text": "eu estou bem", "intent": "affirmative", "entities": []}, 157 | {"text": "eu estou bem", "intent": "affirmative", "entities": []}, 158 | {"text": "tudo uma merda", "intent": "negative", "entities": []}, 159 | {"text": "tudo horrivel", "intent": "negative", "entities": []}, 160 | {"text": "tudo pessimo", "intent": "negative", "entities": []}, 161 | { 162 | "text": "eu também estou num relacionamento abusivo", 163 | "intent": "bias", 164 | "entities": [], 165 
| }, 166 | {"text": "já", "intent": "affirmative", "entities": []}, 167 | { 168 | "text": "hoje já estou num relacionamento abusivo", 169 | "intent": "bias", 170 | "entities": [], 171 | }, 172 | { 173 | "text": "hoje estou num relacionamento abusivo", 174 | "intent": "bias", 175 | "entities": [], 176 | }, 177 | {"text": "nunca passei por isso", "intent": "negative", "entities": []}, 178 | {"text": "as vezes", "intent": "doubt", "entities": []}, 179 | {"text": "sofro abuso emocional", "intent": "bias", "entities": []}, 180 | { 181 | "text": "to naum... mas ja namorei um porquinho?", 182 | "intent": "negative", 183 | "entities": [], 184 | }, 185 | {"text": "estou namorando", "intent": "affirmative", "entities": []}, 186 | {"text": "to namorando", "intent": "affirmative", "entities": []}, 187 | {"text": "pior que ja", "intent": "affirmative", "entities": []}, 188 | {"text": "entendi", "intent": "affirmative", "entities": []}, 189 | {"text": "não entendi", "intent": "doubt", "entities": []}, 190 | {"text": "eu quero", "intent": "affirmative", "entities": []}, 191 | {"text": "gosto de futebol", "intent": "bias", "entities": []}, 192 | { 193 | "text": "meu namorado bateu na minha cara", 194 | "intent": "bias", 195 | "entities": [], 196 | }, 197 | { 198 | "text": "não ne!! meu namorado bateu na minha cara", 199 | "intent": "bias", 200 | "entities": [], 201 | }, 202 | {"text": "eu fui estruprada", "intent": "affirmative", "entities": []}, 203 | {"text": "tenho que pensar", "intent": "doubt", "entities": []}, 204 | {"text": "mais ou menos", "intent": "doubt", "entities": []}, 205 | {"text": "talvez", "intent": "doubt", "entities": []}, 206 | {"text": "nunca", "intent": "negative", "entities": []}, 207 | {"text": "não", "intent": "negative", "entities": []}, 208 | {"text": "tenho", "intent": "affirmative", "entities": []}, 209 | {"text": "meu namorado me bateu", "intent": "bias", "entities": []}, 210 | {"text": "quero", "intent": "affirmative", "entities": []}, 211 | {"text": "não", "intent": "negative", "entities": []}, 212 | {"text": "sim", "intent": "affirmative", "entities": []}, 213 | {"text": "fui agredida", "intent": "bias", "entities": []}, 214 | {"text": "estuprada", "intent": "bias", "entities": []}, 215 | {"text": "tou namorando", "intent": "affirmative", "entities": []}, 216 | {"text": "sim, tou namorando", "intent": "affirmative", "entities": []}, 217 | ], 218 | ) 219 | @patch( 220 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 221 | return_value={ 222 | "version_id": 49, 223 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 224 | "total_training_end": 3, 225 | "language": "pt_br", 226 | "bot_data": base64.b64encode( 227 | open("example_generic_language.tar.gz", "rb").read() 228 | ), 229 | }, 230 | ) 231 | @patch( 232 | "bothub_backend.bothub.BothubBackend.request_backend_info", 233 | return_value={"intents": ['affirmative', 'negative', 'doubt', 'bias']}, 234 | ) 235 | @patch( 236 | "bothub_backend.bothub.BothubBackend.request_backend_create_evaluate_results", 237 | return_value={"evaluate_id": 1787, "evaluate_version": 189}, 238 | ) 239 | @patch( 240 | "bothub_backend.bothub.BothubBackend.request_backend_create_evaluate_results_intent", 241 | return_value={}, 242 | ) 243 | @patch( 244 | "bothub_backend.bothub.BothubBackend.request_backend_create_evaluate_results_score", 245 | return_value={}, 246 | ) 247 | def test_evaluate_ok(self, *args): 248 | result = evaluate_update( 249 | self.repository_version, 250 | 
self.current_update.get("repository_version"), 251 | self.repository_authorization, 252 | self.interpreter_manager, 253 | self.language 254 | ) 255 | 256 | self.assertEqual(1787, result.get("id")) 257 | self.assertEqual(189, result.get("version")) 258 | -------------------------------------------------------------------------------- /tests/test_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import uuid 3 | import base64 4 | import os 5 | from unittest.mock import patch 6 | 7 | import sys 8 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | from bothub.nlu_worker.task.parse import parse_text 11 | from bothub.nlu_worker.interpreter_manager import InterpreterManager 12 | 13 | 14 | class TestParseTask(unittest.TestCase): 15 | def setUp(self, *args): 16 | self.repository_authorization = uuid.uuid4() 17 | self.current_update = { 18 | "ready_for_train": True, 19 | "current_version_id": 6647, 20 | "repository_authorization_user_id": 303, 21 | } 22 | self.local_path = os.getcwd() 23 | self.interpreter_manager = InterpreterManager() 24 | 25 | # change directory to /tests 26 | try: 27 | os.chdir("tests") 28 | except Exception: 29 | pass 30 | 31 | @patch( 32 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 33 | return_value={ 34 | "version_id": 49, 35 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 36 | "total_training_end": 3, 37 | "language": "pt_br", 38 | "bot_data": base64.b64encode( 39 | open("example_generic_language.tar.gz", "rb").read() 40 | ), 41 | "from_aws": False, 42 | }, 43 | ) 44 | def test_parse_without_rasa_format(self, *args): 45 | 46 | parse_text( 47 | self.current_update.get("current_version_id"), 48 | self.repository_authorization, 49 | self.interpreter_manager, 50 | "ok", 51 | ) 52 | 53 | @patch( 54 | "bothub_backend.bothub.BothubBackend.request_backend_parse_nlu_persistor", 55 | return_value={ 56 | "version_id": 49, 57 | "repository_uuid": "0f6b9644-db55-49a2-a20d-2af74106d892", 58 | "total_training_end": 3, 59 | "language": "pt_br", 60 | "bot_data": base64.b64encode( 61 | open("example_generic_language.tar.gz", "rb").read() 62 | ), 63 | "from_aws": False, 64 | }, 65 | ) 66 | def test_parse_with_rasa_format(self, *args): 67 | 68 | parse_text( 69 | self.current_update.get("current_version_id"), 70 | self.repository_authorization, 71 | self.interpreter_manager, 72 | "ok", 73 | True, 74 | ) 75 | -------------------------------------------------------------------------------- /tests/test_train.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from unittest import TestCase 3 | from unittest.mock import patch 4 | 5 | import os 6 | 7 | import sys 8 | sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | from bothub.shared import train 11 | 12 | 13 | class TestTrainTask(TestCase): 14 | 15 | # # bert_language = "pt_br" 16 | bert_language = "en" 17 | 18 | def setUp(self, *args): 19 | self.repository_authorization = uuid.uuid4() 20 | self.current_update = { 21 | "ready_for_train": True, 22 | "current_version_id": 6647, 23 | "language": "en", 24 | "algorithm": "transformer_network_diet_bert", 25 | "repository_authorization_user_id": 303, 26 | } 27 | 28 | # change directory to load bert /bert_english 29 | 30 | list_dir = os.listdir() 31 | while 'bert_english' not in list_dir: 32 | os.chdir("../") 33 | list_dir = os.listdir() 34 | print("Current Working 
Directory ", os.getcwd()) 35 | 36 | @patch( 37 | "bothub_backend.bothub.BothubBackend.request_backend_start_training_nlu", 38 | return_value={ 39 | "language": bert_language, 40 | "repository_version": 6647, 41 | "repository_uuid": "e1e8a0fa-625c-4ba3-8b91-4c9f308db791", 42 | "intent": [], 43 | "algorithm": "transformer_network_diet_bert", 44 | "total_training_end": 4, 45 | "use_name_entities": False, 46 | "use_competing_intents": False, 47 | "use_analyze_char": False, 48 | }, 49 | ) 50 | @patch( 51 | "bothub_backend.bothub.BothubBackend.request_backend_get_examples", 52 | return_value={ 53 | "count": 358, 54 | "next": None, 55 | "previous": None, 56 | "results": [ 57 | {"text": "ss", "intent": "affirmative", "entities": []}, 58 | {"text": "okay", "intent": "affirmative", "entities": []}, 59 | {"text": "afirmativo", "intent": "affirmative", "entities": []}, 60 | {"text": "okk", "intent": "affirmative", "entities": []}, 61 | {"text": "okayy", "intent": "affirmative", "entities": []}, 62 | {"text": "certo", "intent": "affirmative", "entities": []}, 63 | {"text": "nops", "intent": "negative", "entities": []}, 64 | {"text": "no", "intent": "negative", "entities": []}, 65 | {"text": "nope", "intent": "negative", "entities": []}, 66 | {"text": "não sei", "intent": "doubt", "entities": []}, 67 | {"text": "naa", "intent": "negative", "entities": []}, 68 | {"text": "na", "intent": "negative", "entities": []}, 69 | {"text": "não", "intent": "negative", "entities": []}, 70 | {"text": "talvez nao", "intent": "negative", "entities": []}, 71 | {"text": "nnn", "intent": "negative", "entities": []}, 72 | {"text": "nn", "intent": "negative", "entities": []}, 73 | {"text": "isso", "intent": "affirmative", "entities": []}, 74 | { 75 | "text": "sim, preciso daquilo", 76 | "intent": "affirmative", 77 | "entities": [], 78 | }, 79 | {"text": "sim, desejo isso", "intent": "affirmative", "entities": []}, 80 | {"text": "sim, quero isso", "intent": "affirmative", "entities": []}, 81 | {"text": "não ne", "intent": "negative", "entities": []}, 82 | {"text": "tenho que pensar", "intent": "doubt", "entities": []}, 83 | {"text": "talvez", "intent": "doubt", "entities": []}, 84 | {"text": "é", "intent": "affirmative", "entities": []}, 85 | {"text": "quero", "intent": "affirmative", "entities": []}, 86 | {"text": "quero sim", "intent": "affirmative", "entities": []}, 87 | {"text": "negativo", "intent": "negative", "entities": []}, 88 | {"text": "siim", "intent": "affirmative", "entities": []}, 89 | {"text": "boa sim", "intent": "affirmative", "entities": []}, 90 | ], 91 | }, 92 | ) 93 | @patch( 94 | "bothub_backend.bothub.BothubBackend.send_training_backend_nlu_persistor", 95 | return_value={}, 96 | ) 97 | @patch( 98 | "bothub_backend.bothub.BothubBackend.request_backend_traininglog_nlu", 99 | return_value={}, 100 | ) 101 | @patch( 102 | "bothub_backend.bothub.BothubBackend.request_backend_trainfail_nlu", 103 | return_value={}, 104 | ) 105 | def test_train_bert(self, *args): 106 | train.train_update( 107 | self.current_update.get("current_version_id"), 108 | self.current_update.get("repository_authorization_user_id"), 109 | self.repository_authorization, 110 | ) 111 | 112 | @patch( 113 | "bothub_backend.bothub.BothubBackend.request_backend_start_training_nlu", 114 | return_value={ 115 | "language": "pt_br", 116 | "repository_version": 6647, 117 | "repository_uuid": "e1e8a0fa-625c-4ba3-8b91-4c9f308db791", 118 | "intent": [], 119 | "algorithm": "transformer_network_diet", 120 | "total_training_end": 4, 121 | 
"use_name_entities": False, 122 | "use_competing_intents": False, 123 | "use_analyze_char": False, 124 | }, 125 | ) 126 | @patch( 127 | "bothub_backend.bothub.BothubBackend.request_backend_get_examples", 128 | return_value={ 129 | "count": 358, 130 | "next": None, 131 | "previous": None, 132 | "results": [ 133 | {"text": "ss", "intent": "affirmative", "entities": []}, 134 | {"text": "okay", "intent": "affirmative", "entities": []}, 135 | {"text": "afirmativo", "intent": "affirmative", "entities": []}, 136 | {"text": "okk", "intent": "affirmative", "entities": []}, 137 | {"text": "okayy", "intent": "affirmative", "entities": []}, 138 | {"text": "certo", "intent": "affirmative", "entities": []}, 139 | {"text": "nops", "intent": "negative", "entities": []}, 140 | {"text": "no", "intent": "negative", "entities": []}, 141 | {"text": "nope", "intent": "negative", "entities": []}, 142 | {"text": "não sei", "intent": "doubt", "entities": []}, 143 | {"text": "naa", "intent": "negative", "entities": []}, 144 | {"text": "na", "intent": "negative", "entities": []}, 145 | {"text": "não", "intent": "negative", "entities": []}, 146 | {"text": "talvez nao", "intent": "negative", "entities": []}, 147 | {"text": "nnn", "intent": "negative", "entities": []}, 148 | ], 149 | }, 150 | ) 151 | @patch( 152 | "bothub_backend.bothub.BothubBackend.send_training_backend_nlu_persistor", 153 | return_value={}, 154 | ) 155 | @patch( 156 | "bothub_backend.bothub.BothubBackend.request_backend_traininglog_nlu", 157 | return_value={}, 158 | ) 159 | @patch( 160 | "bothub_backend.bothub.BothubBackend.request_backend_trainfail_nlu", 161 | return_value={}, 162 | ) 163 | def test_train_transformer_diet(self, *args): 164 | train.train_update( 165 | self.current_update.get("current_version_id"), 166 | self.current_update.get("repository_authorization_user_id"), 167 | self.repository_authorization, 168 | ) 169 | --------------------------------------------------------------------------------