├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── compose.yaml ├── data ├── en_agents.json ├── en_calls_batch.parquet ├── en_clients.json ├── en_conversations.zip ├── en_dataframe.parquet ├── en_ground_truths.parquet ├── en_metadata.parquet ├── es_agents.json ├── es_calls_batch.parquet ├── es_clients.json ├── es_conversations.zip ├── es_dataframe.parquet ├── es_ground_truths.parquet ├── es_metadata.parquet └── sqlite.db ├── dev-requirements.txt ├── example_data ├── agents.zip ├── batch_creation │ └── calls_batch.zip ├── clients.zip ├── conversation_generation │ ├── conversations.zip │ ├── ground_truths.zip │ └── metadata.zip └── text_to_audio │ ├── audio_files.zip │ └── dataframe.zip ├── images ├── call-center-readme.png └── call-center-workflow.png ├── mlrun.env ├── notebook.ipynb ├── project_setup.py ├── pyproject.toml ├── requirements.txt ├── setup.py ├── src ├── __init__.py ├── calls_analysis │ ├── __init__.py │ ├── db_management.py │ └── postprocessing.py ├── calls_generation │ ├── __init__.py │ ├── conversations_generator.py │ └── skip.py ├── common.py ├── vizro.py └── workflows │ ├── __init__.py │ ├── calls_analysis.py │ └── calls_generation.py └── vizro ├── app.py ├── assets └── vizro_dashboard_styles.css ├── custom_charts.py └── custom_components.py /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mlrun/mlrun-gpu:1.7.0 2 | 3 | # Update apt-get to install ffmpeg (support audio file formats): 4 | RUN apt-get update -y 5 | RUN apt-get install ffmpeg -y 6 | 7 | # Install demo requirements: 8 | 9 | RUN pip install transformers==4.44.1 10 | RUN pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118 11 | RUN pip install bitsandbytes==0.41.1 accelerate==0.24.1 datasets==2.14.6 peft==0.5.0 optimum==1.13.2 12 | RUN pip install auto-gptq==0.4.2 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ 13 | RUN pip install 
langchain==0.0.327 openai==0.28.1 14 | RUN pip install git+https://github.com/suno-ai/bark.git 15 | RUN pip install streamlit==1.28.0 st-annotated-text==4.0.1 spacy==3.7.2 librosa==0.10.1 presidio-anonymizer==2.2.34 presidio-analyzer==2.2.34 nltk==3.8.1 flair==0.13.0 16 | RUN python -m spacy download en_core_web_lg 17 | RUN pip install -U SQLAlchemy 18 | 19 | # Align onnxruntime to use gpu: 20 | RUN pip uninstall -y onnxruntime-gpu 21 | RUN pip uninstall -y onnxruntime 22 | RUN pip install onnxruntime-gpu 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | PYTHON_INTERPRETER = python3 3 | SHARED_DIR ?= ~/mlrun-data 4 | MLRUN_TAG ?= 1.4.0 5 | HOST_IP ?=$$(ip route get 1.2.3.4 | awk '{print $$7}') 6 | CONDA_ENV ?= mlrun 7 | SHELL=/bin/bash 8 | CONDA_PY_VER ?= 3.9 9 | CONDA_ACTIVATE = source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate 10 | 11 | ################################################################################# 12 | # COMMANDS # 13 | ################################################################################# 14 | 15 | .PHONY: help 16 | help: ## Display available commands 17 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 18 | 19 | .PHONY: all 20 | all: 21 | $(error please pick a target) 22 | 23 | .PHONY: install-requirements 24 | install-requirements: ## Install all requirements needed for development 25 | $(PYTHON_INTERPRETER) -m pip install -r 
requirements.txt -r dev-requirements.txt 26 | 27 | 28 | .PHONY: package-wheel 29 | package-wheel: clean ## Build python package wheel 30 | python setup.py bdist_wheel 31 | 32 | .PHONY: clean 33 | clean: ## Clean python package build artifacts 34 | rm -rf build 35 | rm -rf dist 36 | find . -type f -name "*.py[co]" -delete 37 | find . -type d -name "__pycache__" -delete 38 | 39 | .PHONY: fmt 40 | fmt: ## Format the code (using black and isort) 41 | @echo "Running black fmt..." 42 | $(PYTHON_INTERPRETER) -m black src 43 | $(PYTHON_INTERPRETER) -m isort src 44 | 45 | .PHONY: lint 46 | lint: fmt-check flake8 ## Run lint on the code 47 | 48 | .PHONY: fmt-check 49 | fmt-check: ## Format and check the code (using black and isort) 50 | @echo "Running black+isort fmt check..." 51 | $(PYTHON_INTERPRETER) -m black --check --diff src 52 | $(PYTHON_INTERPRETER) -m isort --check --diff src 53 | 54 | .PHONY: flake8 55 | flake8: ## Run flake8 lint 56 | @echo "Running flake8 lint..." 57 | $(PYTHON_INTERPRETER) -m flake8 src 58 | 59 | .PHONY: mlrun-docker 60 | mlrun-docker: ## Start MLRun & Nuclio containers (using Docker compose) 61 | mkdir $(SHARED_DIR) -p 62 | @echo "HOST_IP=$(HOST_IP)" > .env 63 | SHARED_DIR=$(SHARED_DIR) TAG=$(MLRUN_TAG) docker-compose -f compose.yaml up -d 64 | @echo "use docker-compose stop / logs commands to stop or view logs" 65 | 66 | .PHONY: mlrun-api 67 | mlrun-api: ## Run MLRun DB locally (as process) 68 | @echo "Installing MLRun API dependencies ..." 69 | $(PYTHON_INTERPRETER) -m pip install uvicorn~=0.17.0 dask-kubernetes~=0.11.0 apscheduler~=3.6 sqlite3-to-mysql~=1.4 70 | @echo "Starting local mlrun..." 71 | MLRUN_ARTIFACT_PATH=$$(realpath ./artifacts) MLRUN_ENV_FILE= mlrun db -b 72 | 73 | .PHONY: conda-env 74 | conda-env: ## Create a conda environment 75 | @echo "Creating new conda environment $(CONDA_ENV)..." 
76 | conda create -n $(CONDA_ENV) -y python=$(CONDA_PY_VER) ipykernel graphviz pip 77 | test -s ./mlrun.env && conda env config vars set -n $(CONDA_ENV) MLRUN_ENV_FILE=$$(realpath ./mlrun.env) 78 | @echo "Installing requirements.txt..." 79 | $(CONDA_ACTIVATE) $(CONDA_ENV); pip install -r requirements.txt 80 | @echo -e "\nTo run mlrun API as a local process type:\n conda activate $(CONDA_ENV) && make mlrun-api" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLRun's Call Center Demo 2 | 3 | huggingface-mlrun 4 | 5 | This demo showcases how to use LLMs to turn audio files from call center conversations between customers and agents into valuable data, all in a single workflow orchestrated by MLRun. 6 | 7 | MLRun automates the entire workflow, auto-scales resources as needed, and automatically logs and parses values between the different workflow steps. 8 | 9 | By the end of this demo you will see the potential power of LLMs for feature extraction, and how easily you can do this with MLRun! 10 | 11 | This demo uses: 12 | * [**OpenAI's Whisper**](https://openai.com/research/whisper) — To transcribe the audio calls into text. 13 | * [**Flair**](https://flairnlp.github.io/) and [**Microsoft's Presidio**](https://microsoft.github.io/presidio/) - To recognize PII so it can be filtered out. 14 | * [**HuggingFace**](https://huggingface.co/) — The main machine-learning framework to get the model and tokenizer for the features extraction. 15 | * and [**MLRun**](https://www.mlrun.org/) — as the orchestrator to operationalize the workflow. 16 | 17 | The demo contains a single [notebook](./notebook.ipynb) that encompasses the entire demo. 
18 | 19 | 20 | Most of the functions are imported from [MLRun's function hub](https://docs.mlrun.org/en/stable/runtimes/load-from-hub.html), which contains a wide range of functions that can be used for a variety of use cases. All functions used in the demo include links to their source in the hub. All of the python source code is under [/src](./src). 21 | Enjoy! 22 | 23 | ___ 24 | 25 | ## Installation 26 | 27 | This project can run in different development environments: 28 | * Local computer (using PyCharm, VSCode, Jupyter, etc.) 29 | * Inside GitHub Codespaces 30 | * Other managed Jupyter environments 31 | 32 | ### Install the code and the mlrun client 33 | 34 | To get started, fork this repo into your GitHub account and clone it into your development environment. 35 | 36 | To install the package dependencies (not required in GitHub codespaces) use: 37 | 38 | make install-requirements 39 | 40 | If you prefer to use Conda, use this instead (to create and configure a conda env): 41 | 42 | make conda-env 43 | 44 | > Make sure you open the notebooks and select the `mlrun` conda environment 45 | 46 | ### Install or connect to the MLRun service/cluster 47 | 48 | The MLRun service and computation can run locally (minimal setup) or over a remote Kubernetes environment. 49 | 50 | If your development environment supports Docker and there are sufficient CPU resources, run: 51 | 52 | make mlrun-docker 53 | 54 | > MLRun UI can be viewed in: http://localhost:8060 55 | 56 | If your environment is minimal, run mlrun as a process (no UI): 57 | 58 | [conda activate mlrun &&] make mlrun-api 59 | 60 | For MLRun to run properly you should set your client environment. This is not required when using **codespaces**, the mlrun **conda** environment, or **iguazio** managed notebooks. 
61 | 62 | Your environment should include `MLRUN_ENV_FILE= ` (point to the mlrun .env file 63 | in this repo); see [mlrun client setup](https://docs.mlrun.org/en/latest/install/remote.html) instructions for details. 64 | 65 | > Note: You can also use a remote MLRun service (over Kubernetes): instead of starting a local mlrun: 66 | > edit the [mlrun.env](./mlrun.env) and specify its address and credentials. 67 | -------------------------------------------------------------------------------- /compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | init_nuclio: 3 | image: alpine:3.16 4 | command: 5 | - "/bin/sh" 6 | - "-c" 7 | - | 8 | mkdir -p /etc/nuclio/config/platform; \ 9 | cat << EOF | tee /etc/nuclio/config/platform/platform.yaml 10 | runtime: 11 | common: 12 | env: 13 | MLRUN_DBPATH: http://${HOST_IP:?err}:8080 14 | local: 15 | defaultFunctionContainerNetworkName: mlrun 16 | defaultFunctionRestartPolicy: 17 | name: always 18 | maxRetryCount: 0 19 | defaultFunctionVolumes: 20 | - volume: 21 | name: mlrun-stuff 22 | hostPath: 23 | path: ${SHARED_DIR:?err} 24 | volumeMount: 25 | name: mlrun-stuff 26 | mountPath: /home/jovyan/data/ 27 | logger: 28 | sinks: 29 | myStdoutLoggerSink: 30 | kind: stdout 31 | system: 32 | - level: debug 33 | sink: myStdoutLoggerSink 34 | functions: 35 | - level: debug 36 | sink: myStdoutLoggerSink 37 | EOF 38 | volumes: 39 | - nuclio-platform-config:/etc/nuclio/config 40 | 41 | mlrun-api: 42 | image: "mlrun/mlrun-api:${TAG:-1.1.2}" 43 | ports: 44 | - "8080:8080" 45 | environment: 46 | MLRUN_ARTIFACT_PATH: "${SHARED_DIR}/{{project}}" 47 | # using local storage, meaning files / artifacts are stored locally, so we want to allow access to them 48 | MLRUN_HTTPDB__REAL_PATH: /data 49 | MLRUN_HTTPDB__DATA_VOLUME: "${SHARED_DIR}" 50 | MLRUN_LOG_LEVEL: DEBUG 51 | MLRUN_NUCLIO_DASHBOARD_URL: http://nuclio:8070 52 | MLRUN_HTTPDB__DSN: "sqlite:////data/mlrun.db?check_same_thread=false" 53 | 
MLRUN_UI__URL: http://localhost:8060 54 | # not running on k8s meaning no need to store secrets 55 | MLRUN_SECRET_STORES__KUBERNETES__AUTO_ADD_PROJECT_SECRETS: "false" 56 | # let mlrun control nuclio resources 57 | MLRUN_HTTPDB__PROJECTS__FOLLOWERS: "nuclio" 58 | volumes: 59 | - "${SHARED_DIR:?err}:/data" 60 | networks: 61 | - mlrun 62 | 63 | mlrun-ui: 64 | image: "mlrun/mlrun-ui:${TAG:-1.1.2}" 65 | ports: 66 | - "8060:8090" 67 | environment: 68 | MLRUN_API_PROXY_URL: http://mlrun-api:8080 69 | MLRUN_NUCLIO_MODE: enable 70 | MLRUN_NUCLIO_API_URL: http://nuclio:8070 71 | MLRUN_NUCLIO_UI_URL: http://localhost:8070 72 | networks: 73 | - mlrun 74 | 75 | nuclio: 76 | image: "quay.io/nuclio/dashboard:${NUCLIO_TAG:-stable-amd64}" 77 | ports: 78 | - "8070:8070" 79 | environment: 80 | NUCLIO_DASHBOARD_EXTERNAL_IP_ADDRESSES: "${HOST_IP:?err}" 81 | volumes: 82 | - /var/run/docker.sock:/var/run/docker.sock 83 | - nuclio-platform-config:/etc/nuclio/config 84 | depends_on: 85 | - init_nuclio 86 | networks: 87 | - mlrun 88 | 89 | volumes: 90 | nuclio-platform-config: {} 91 | 92 | networks: 93 | mlrun: 94 | name: mlrun 95 | -------------------------------------------------------------------------------- /data/en_agents.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "first_name": "Michael", 4 | "last_name": "Johnson", 5 | "agent_id": "A001" 6 | }, 7 | { 8 | "first_name": "Emma", 9 | "last_name": "Williams", 10 | "agent_id": "A002" 11 | }, 12 | { 13 | "first_name": "Daniel", 14 | "last_name": "Miller", 15 | "agent_id": "A003" 16 | }, 17 | { 18 | "first_name": "Sophia", 19 | "last_name": "Brown", 20 | "agent_id": "A004" 21 | }, 22 | { 23 | "first_name": "David", 24 | "last_name": "Davis", 25 | "agent_id": "A005" 26 | }, 27 | { 28 | "first_name": "Olivia", 29 | "last_name": "Garcia", 30 | "agent_id": "A006" 31 | }, 32 | { 33 | "first_name": "James", 34 | "last_name": "Rodriguez", 35 | "agent_id": "A007" 36 | }, 37 | { 38 | 
"first_name": "Mia", 39 | "last_name": "Martinez", 40 | "agent_id": "A008" 41 | }, 42 | { 43 | "first_name": "John", 44 | "last_name": "Hernandez", 45 | "agent_id": "A009" 46 | }, 47 | { 48 | "first_name": "Isabella", 49 | "last_name": "Lopez", 50 | "agent_id": "A010" 51 | } 52 | ] -------------------------------------------------------------------------------- /data/en_calls_batch.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_calls_batch.parquet -------------------------------------------------------------------------------- /data/en_clients.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "first_name": "Emily", 4 | "last_name": "Smith", 5 | "phone_number": "123-456-7890", 6 | "email": "emilysmith@example.com", 7 | "client_id": "12345" 8 | }, 9 | { 10 | "first_name": "John", 11 | "last_name": "Doe", 12 | "phone_number": "098-765-4321", 13 | "email": "johndoe@example.com", 14 | "client_id": "67890" 15 | }, 16 | { 17 | "first_name": "Jane", 18 | "last_name": "Doe", 19 | "phone_number": "456-789-0123", 20 | "email": "janedoe@example.com", 21 | "client_id": "23456" 22 | }, 23 | { 24 | "first_name": "Robert", 25 | "last_name": "Johnson", 26 | "phone_number": "789-012-3456", 27 | "email": "robertjohnson@example.com", 28 | "client_id": "78901" 29 | }, 30 | { 31 | "first_name": "Mary", 32 | "last_name": "Davis", 33 | "phone_number": "012-345-6789", 34 | "email": "marydavis@example.com", 35 | "client_id": "34567" 36 | }, 37 | { 38 | "first_name": "James", 39 | "last_name": "Miller", 40 | "phone_number": "987-654-3210", 41 | "email": "jamesmiller@example.com", 42 | "client_id": "89012" 43 | }, 44 | { 45 | "first_name": "Patricia", 46 | "last_name": "Wilson", 47 | "phone_number": "654-321-0987", 48 | "email": "patriciawilson@example.com", 49 | "client_id": "45678" 50 | }, 51 
| { 52 | "first_name": "Michael", 53 | "last_name": "Moore", 54 | "phone_number": "321-098-7654", 55 | "email": "michaelmoore@example.com", 56 | "client_id": "90123" 57 | }, 58 | { 59 | "first_name": "Elizabeth", 60 | "last_name": "Taylor", 61 | "phone_number": "234-567-8901", 62 | "email": "elizabethtaylor@example.com", 63 | "client_id": "56789" 64 | }, 65 | { 66 | "first_name": "David", 67 | "last_name": "Anderson", 68 | "phone_number": "567-890-1234", 69 | "email": "davidanderson@example.com", 70 | "client_id": "23459" 71 | } 72 | ] -------------------------------------------------------------------------------- /data/en_conversations.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_conversations.zip -------------------------------------------------------------------------------- /data/en_dataframe.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_dataframe.parquet -------------------------------------------------------------------------------- /data/en_ground_truths.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_ground_truths.parquet -------------------------------------------------------------------------------- /data/en_metadata.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_metadata.parquet -------------------------------------------------------------------------------- /data/es_agents.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 
3 | "first_name": "Carlos", 4 | "last_name": "Gomez", 5 | "agent_id": "A5432" 6 | }, 7 | { 8 | "first_name": "Marta", 9 | "last_name": "Rodriguez", 10 | "agent_id": "B7658" 11 | }, 12 | { 13 | "first_name": "Francisco", 14 | "last_name": "Lopez", 15 | "agent_id": "C3421" 16 | }, 17 | { 18 | "first_name": "Ana", 19 | "last_name": "Perez", 20 | "agent_id": "D5463" 21 | }, 22 | { 23 | "first_name": "Luis", 24 | "last_name": "Martinez", 25 | "agent_id": "E7654" 26 | }, 27 | { 28 | "first_name": "Maria", 29 | "last_name": "Hernandez", 30 | "agent_id": "F3214" 31 | }, 32 | { 33 | "first_name": "Pedro", 34 | "last_name": "Gonzalez", 35 | "agent_id": "G9876" 36 | }, 37 | { 38 | "first_name": "Josefa", 39 | "last_name": "Ramirez", 40 | "agent_id": "H6543" 41 | }, 42 | { 43 | "first_name": "Antonio", 44 | "last_name": "Sanchez", 45 | "agent_id": "I4321" 46 | }, 47 | { 48 | "first_name": "Isabel", 49 | "last_name": "Torres", 50 | "agent_id": "J7658" 51 | } 52 | ] -------------------------------------------------------------------------------- /data/es_calls_batch.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_calls_batch.parquet -------------------------------------------------------------------------------- /data/es_clients.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "first_name": "Carlos", 4 | "last_name": "Gomez", 5 | "phone_number": "678-234-5678", 6 | "email": "CarlosGomez@email.com", 7 | "client_id": "ID001" 8 | }, 9 | { 10 | "first_name": "Maria", 11 | "last_name": "Hernandez", 12 | "phone_number": "789-345-6789", 13 | "email": "MariaHernandez@email.com", 14 | "client_id": "ID002" 15 | }, 16 | { 17 | "first_name": "Luis", 18 | "last_name": "Rodriguez", 19 | "phone_number": "890-456-7890", 20 | "email": "LuisRodriguez@email.com", 21 | "client_id": "ID003" 22 | }, 23 
| { 24 | "first_name": "Ana", 25 | "last_name": "Sanchez", 26 | "phone_number": "901-567-8901", 27 | "email": "AnaSanchez@email.com", 28 | "client_id": "ID004" 29 | }, 30 | { 31 | "first_name": "Jose", 32 | "last_name": "Martinez", 33 | "phone_number": "012-678-9012", 34 | "email": "JoseMartinez@email.com", 35 | "client_id": "ID005" 36 | }, 37 | { 38 | "first_name": "Isabel", 39 | "last_name": "Lopez", 40 | "phone_number": "123-789-0123", 41 | "email": "IsabelLopez@email.com", 42 | "client_id": "ID006" 43 | }, 44 | { 45 | "first_name": "Miguel", 46 | "last_name": "Gonzalez", 47 | "phone_number": "234-890-1234", 48 | "email": "MiguelGonzalez@email.com", 49 | "client_id": "ID007" 50 | }, 51 | { 52 | "first_name": "Sofia", 53 | "last_name": "Perez", 54 | "phone_number": "345-901-2345", 55 | "email": "SofiaPerez@email.com", 56 | "client_id": "ID008" 57 | }, 58 | { 59 | "first_name": "Antonio", 60 | "last_name": "Ramirez", 61 | "phone_number": "456-012-3456", 62 | "email": "AntonioRamirez@email.com", 63 | "client_id": "ID009" 64 | }, 65 | { 66 | "first_name": "Carmen", 67 | "last_name": "Torres", 68 | "phone_number": "567-123-4567", 69 | "email": "CarmenTorres@email.com", 70 | "client_id": "ID010" 71 | } 72 | ] -------------------------------------------------------------------------------- /data/es_conversations.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_conversations.zip -------------------------------------------------------------------------------- /data/es_dataframe.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_dataframe.parquet -------------------------------------------------------------------------------- /data/es_ground_truths.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_ground_truths.parquet -------------------------------------------------------------------------------- /data/es_metadata.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_metadata.parquet -------------------------------------------------------------------------------- /data/sqlite.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/sqlite.db -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest~=5.4 2 | black~=24.8 3 | isort~=5.7 4 | flake8~=5.0 5 | -------------------------------------------------------------------------------- /example_data/agents.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/agents.zip -------------------------------------------------------------------------------- /example_data/batch_creation/calls_batch.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/batch_creation/calls_batch.zip -------------------------------------------------------------------------------- /example_data/clients.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/clients.zip -------------------------------------------------------------------------------- /example_data/conversation_generation/conversations.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/conversation_generation/conversations.zip -------------------------------------------------------------------------------- /example_data/conversation_generation/ground_truths.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/conversation_generation/ground_truths.zip -------------------------------------------------------------------------------- /example_data/conversation_generation/metadata.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/conversation_generation/metadata.zip -------------------------------------------------------------------------------- /example_data/text_to_audio/audio_files.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/text_to_audio/audio_files.zip -------------------------------------------------------------------------------- /example_data/text_to_audio/dataframe.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/text_to_audio/dataframe.zip 
-------------------------------------------------------------------------------- /images/call-center-readme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/images/call-center-readme.png -------------------------------------------------------------------------------- /images/call-center-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/images/call-center-workflow.png -------------------------------------------------------------------------------- /mlrun.env: -------------------------------------------------------------------------------- 1 | # default env vars, will be loaded once MLRun imports/starts 2 | # write here remote cluster credentials, addresses, etc. 3 | # uncomment the relevant lines and set with proper parameters 4 | 5 | # local/remote MLRun service address 6 | MLRUN_DBPATH=http://localhost:8080 7 | 8 | # if Nuclio not detected simulate it with mock 9 | MLRUN_MOCK_NUCLIO_DEPLOYMENT=auto 10 | 11 | # Iguazio cluster and V3IO credentials (for remote cluster) 12 | # V3IO_USERNAME= 13 | # V3IO_ACCESS_KEY= 14 | 15 | # AWS S3/services credentials 16 | # AWS_ACCESS_KEY_ID= 17 | # AWS_SECRET_ACCESS_KEY= 18 | 19 | # The Azure connection string which points at a storage account. For example: 20 | # DefaultEndpointsProtocol=https;AccountName=myAcct;AccountKey=XXXX;EndpointSuffix=core.windows.net 21 | # AZURE_STORAGE_CONNECTION_STRING= 22 | -------------------------------------------------------------------------------- /project_setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import os 15 | from pathlib import Path 16 | import boto3 17 | import mlrun 18 | 19 | from src.calls_analysis.db_management import create_tables 20 | from src.common import ProjectSecrets 21 | 22 | CE_MODE = mlrun.mlconf.is_ce_mode() 23 | 24 | def setup( 25 | project: mlrun.projects.MlrunProject, 26 | ) -> mlrun.projects.MlrunProject: 27 | """ 28 | Creating the project for the demo. This function is expected to call automatically when calling the function 29 | `mlrun.get_or_create_project`. 30 | 31 | :param project: The project to set up. 32 | 33 | :returns: A fully prepared project for this demo. 
34 | """ 35 | # Unpack secrets from environment variables: 36 | openai_key = os.getenv(ProjectSecrets.OPENAI_API_KEY) 37 | openai_base = os.getenv(ProjectSecrets.OPENAI_API_BASE) 38 | mysql_url = os.getenv(ProjectSecrets.MYSQL_URL, "") 39 | 40 | # Unpack parameters: 41 | source = project.get_param(key="source") 42 | default_image = project.get_param(key="default_image", default=None) 43 | build_image = project.get_param(key="build_image", default=False) 44 | gpus = project.get_param(key="gpus", default=0) 45 | node_name = project.get_param(key="node_name", default=None) 46 | node_selector = project.get_param(key="node_selector", default=None) 47 | use_sqlite = project.get_param(key="use_sqlite", default=False) 48 | 49 | # Update sqlite data: 50 | if use_sqlite: 51 | # uploading db file to s3: 52 | if CE_MODE: 53 | s3 = boto3.client("s3") if not os.getenv("S3_ENDPOINT_URL") else boto3.client('s3', endpoint_url=os.getenv("S3_ENDPOINT_URL")) 54 | bucket_name = Path(mlrun.mlconf.artifact_path).parts[1] 55 | # Upload the file 56 | s3.upload_file( 57 | Filename="data/sqlite.db", 58 | Bucket=bucket_name, 59 | Key="sqlite.db", 60 | ) 61 | os.environ["S3_BUCKET_NAME"] = bucket_name 62 | else: 63 | os.environ["MYSQL_URL"] = f"sqlite:///{os.path.abspath('.')}/data/sqlite.db" 64 | mysql_url = os.environ["MYSQL_URL"] 65 | 66 | # Set the project git source: 67 | if source: 68 | print(f"Project Source: {source}") 69 | project.set_source(source=source, pull_at_runtime=True) 70 | 71 | # Set default image: 72 | if default_image: 73 | project.set_default_image(default_image) 74 | 75 | # Build the image: 76 | if build_image: 77 | print("Building default image for the demo:") 78 | _build_image(project=project, with_gpu=gpus) 79 | 80 | # Set the secrets: 81 | _set_secrets( 82 | project=project, 83 | openai_key=openai_key, 84 | openai_base=openai_base, 85 | mysql_url=mysql_url, 86 | bucket_name=os.getenv(ProjectSecrets.S3_BUCKET_NAME), 87 | ) 88 | 89 | # Refresh MLRun hub to the most 
up-to-date version: 90 | mlrun.get_run_db().get_hub_catalog(source_name="default", force_refresh=True) 91 | 92 | # Set the functions: 93 | _set_calls_generation_functions(project=project, node_name=node_name) 94 | _set_calls_analysis_functions(project=project, gpus=gpus, node_name=node_name, node_selector=node_selector) 95 | 96 | # Set the workflows: 97 | _set_workflows(project=project) 98 | 99 | # Set UI application: 100 | app = project.set_function( 101 | name="call-center-ui", 102 | kind="application", 103 | requirements=["vizro==0.1.38", "gunicorn"] 104 | ) 105 | # Set the internal application port to Vizro's default port 106 | app.set_internal_application_port(8050) 107 | 108 | # Set the command to run the Vizro application 109 | app.spec.command = "gunicorn" 110 | app.spec.args = [ 111 | "app:app", 112 | "--bind", 113 | "0.0.0.0:8050", 114 | "--chdir", 115 | f"home/mlrun_code/vizro" 116 | ] 117 | app.save() 118 | 119 | # Create the DB tables: 120 | create_tables() 121 | 122 | # Save and return the project: 123 | project.save() 124 | return project 125 | 126 | def _build_image(project: mlrun.projects.MlrunProject, with_gpu: bool): 127 | config = { 128 | "base_image": "mlrun/mlrun-gpu" if with_gpu else "mlrun/mlrun", 129 | "torch_index": "https://download.pytorch.org/whl/cu118" if with_gpu else "https://download.pytorch.org/whl/cpu", 130 | "onnx_package": "onnxruntime-gpu" if with_gpu else "onnxruntime" 131 | } 132 | # Define commands in logical groups while maintaining order 133 | system_commands = [ 134 | # Update apt-get to install ffmpeg (support audio file formats): 135 | "apt-get update -y && apt-get install ffmpeg -y" 136 | ] 137 | 138 | infrastructure_requirements = [ 139 | "pip install transformers==4.44.1", 140 | f"pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url {config['torch_index']}" 141 | ] 142 | 143 | huggingface_requirements = [ 144 | "pip install bitsandbytes==0.41.1 accelerate==0.24.1 datasets==2.14.6 peft==0.5.0 
optimum==1.13.2" 145 | ] 146 | 147 | gpu_specific_requirements = [ 148 | "pip install auto-gptq==0.4.2 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/" 149 | ] if with_gpu else [] 150 | 151 | other_requirements = [ 152 | "pip install mlrun langchain==0.2.17 openai==1.58.1 langchain_community==0.2.19 pydub==0.25.1 streamlit==1.28.0 st-annotated-text==4.0.1 spacy==3.7.2 librosa==0.10.1 presidio-anonymizer==2.2.34 presidio-analyzer==2.2.34 nltk==3.8.1 flair==0.13.0 htbuilder==0.6.2", 153 | "python -m spacy download en_core_web_lg", 154 | "pip install -U SQLAlchemy", 155 | "pip uninstall -y onnxruntime-gpu onnxruntime", 156 | f"pip install {config['onnx_package']}", 157 | ] 158 | 159 | # Combine commands in the required order 160 | commands = ( 161 | system_commands + 162 | infrastructure_requirements + 163 | huggingface_requirements + 164 | gpu_specific_requirements + 165 | other_requirements 166 | ) 167 | 168 | # Build the image 169 | assert project.build_image( 170 | base_image=config["base_image"], 171 | commands=commands, 172 | set_as_default=True, 173 | ) 174 | 175 | def _set_secrets( 176 | project: mlrun.projects.MlrunProject, 177 | openai_key: str, 178 | openai_base: str, 179 | mysql_url: str, 180 | bucket_name: str = None, 181 | ): 182 | # Must have secrets: 183 | project.set_secrets( 184 | secrets={ 185 | ProjectSecrets.OPENAI_API_KEY: openai_key, 186 | ProjectSecrets.OPENAI_API_BASE: openai_base, 187 | ProjectSecrets.MYSQL_URL: mysql_url, 188 | } 189 | ) 190 | if bucket_name: 191 | project.set_secrets( 192 | secrets={ 193 | ProjectSecrets.S3_BUCKET_NAME: bucket_name, 194 | } 195 | ) 196 | 197 | 198 | def _set_function( 199 | project: mlrun.projects.MlrunProject, 200 | func: str, 201 | name: str, 202 | kind: str, 203 | gpus: int = 0, 204 | node_name: str = None, 205 | with_repo: bool = None, 206 | image: str = None, 207 | node_selector: dict = None, 208 | apply_auto_mount: bool = True, 209 | ): 210 | # Set the given function: 211 | 
if with_repo is None: 212 | with_repo = not func.startswith("hub://") 213 | mlrun_function = project.set_function( 214 | func=func, name=name, kind=kind, with_repo=with_repo, image=image, 215 | ) 216 | 217 | # Configure GPUs according to the given kind: 218 | if gpus >= 1: 219 | if node_selector: 220 | mlrun_function.with_node_selection(node_selector=node_selector) 221 | if kind == "mpijob": 222 | # 1 GPU for each rank: 223 | mlrun_function.with_limits(gpus=1) 224 | mlrun_function.spec.replicas = gpus 225 | else: 226 | # All GPUs for the single job: 227 | mlrun_function.with_limits(gpus=gpus) 228 | # Set the node selection: 229 | elif node_name: 230 | mlrun_function.with_node_selection(node_name=node_name) 231 | 232 | if not CE_MODE and apply_auto_mount: 233 | # Apply auto mount: 234 | mlrun_function.apply(mlrun.auto_mount()) 235 | # Save: 236 | mlrun_function.save() 237 | 238 | 239 | def _set_calls_generation_functions( 240 | project: mlrun.projects.MlrunProject, 241 | node_name: str = None, 242 | ): 243 | # Client and agent data generator 244 | _set_function( 245 | project=project, 246 | func="hub://structured_data_generator", 247 | name="structured-data-generator", 248 | kind="job", 249 | node_name=node_name, 250 | apply_auto_mount=True, 251 | ) 252 | 253 | # Conversation generator: 254 | _set_function( 255 | project=project, 256 | func="./src/calls_generation/conversations_generator.py", 257 | name="conversations-generator", 258 | kind="job", 259 | node_name=node_name, 260 | apply_auto_mount=True, 261 | ) 262 | 263 | # Text to audio generator: 264 | _set_function( 265 | project=project, 266 | func="hub://text_to_audio_generator", 267 | name="text-to-audio-generator", 268 | kind="job", 269 | with_repo=False, 270 | apply_auto_mount=True, 271 | ) 272 | 273 | 274 | def _set_calls_analysis_functions( 275 | project: mlrun.projects.MlrunProject, 276 | gpus: int, 277 | node_name: str = None, 278 | node_selector: dict = None, 279 | ): 280 | # DB management: 281 | 
_set_function( 282 | project=project, 283 | func="./src/calls_analysis/db_management.py", 284 | name="db-management", 285 | kind="job", 286 | node_name=node_name, 287 | apply_auto_mount=True, 288 | ) 289 | 290 | # Speech diarization: 291 | _set_function( 292 | project=project, 293 | func="hub://silero_vad", 294 | name="silero-vad", 295 | kind="job", 296 | node_name=node_name, 297 | ) 298 | 299 | # Transcription: 300 | _set_function( 301 | project=project, 302 | func="hub://transcribe", 303 | name="transcription", 304 | kind="mpijob" if gpus > 1 else "job", 305 | gpus=gpus, 306 | node_name=node_name, 307 | node_selector=node_selector, 308 | ) 309 | 310 | # PII recognition: 311 | _set_function( 312 | project=project, 313 | func="hub://pii_recognizer", 314 | name="pii-recognition", 315 | kind="job", 316 | node_name=node_name, 317 | ) 318 | 319 | # Question answering: 320 | _set_function( 321 | project=project, 322 | func="hub://question_answering", 323 | name="question-answering", 324 | kind="job", 325 | gpus=gpus, 326 | node_name=node_name, 327 | node_selector=node_selector, 328 | ) 329 | 330 | # Postprocessing: 331 | _set_function( 332 | project=project, 333 | func="./src/calls_analysis/postprocessing.py", 334 | name="postprocessing", 335 | with_repo=False, 336 | kind="job", 337 | node_name=node_name, 338 | ) 339 | 340 | 341 | def _set_workflows(project: mlrun.projects.MlrunProject): 342 | project.set_workflow( 343 | name="calls-generation", workflow_path="./src/workflows/calls_generation.py" 344 | ) 345 | project.set_workflow( 346 | name="calls-analysis", workflow_path="./src/workflows/calls_analysis.py" 347 | ) 348 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | multi_line_output = 3 4 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | mlrun 2 | SQLAlchemy -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | project_name = "myproj" 4 | with open("README.md", "r", encoding="utf-8") as fh: 5 | long_description = fh.read() 6 | 7 | setup( 8 | name=project_name, 9 | packages=[project_name], 10 | package_dir={project_name: "src"}, 11 | version="0.1.0", 12 | description="my desc", 13 | author="Yaron", 14 | author_email="author@example.com", 15 | license="MIT", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | python_requires=">=3.9", 19 | ) 20 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/calls_analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/calls_analysis/db_management.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import datetime 15 | import os 16 | import tempfile 17 | from typing import List, Optional, Tuple 18 | 19 | import boto3 20 | import mlrun 21 | import pandas as pd 22 | from sqlalchemy import ( 23 | Boolean, 24 | Date, 25 | Enum, 26 | ForeignKey, 27 | Integer, 28 | String, 29 | Time, 30 | bindparam, 31 | create_engine, 32 | insert, 33 | select, 34 | update, 35 | ) 36 | from sqlalchemy.orm import ( 37 | Mapped, 38 | declarative_base, 39 | mapped_column, 40 | relationship, 41 | sessionmaker, 42 | ) 43 | 44 | from src.common import CallStatus, ProjectSecrets 45 | 46 | ID_LENGTH = 32 47 | FILE_PATH_LENGTH = 500 48 | 49 | Base = declarative_base() 50 | 51 | 52 | class Client(Base): 53 | __tablename__ = "client" 54 | 55 | # Columns: 56 | client_id: Mapped[str] = mapped_column(String(length=ID_LENGTH), primary_key=True) 57 | first_name: Mapped[str] = mapped_column(String(length=30)) 58 | last_name: Mapped[str] = mapped_column(String(length=30)) 59 | phone_number: Mapped[str] = mapped_column(String(length=20)) 60 | email: Mapped[str] = mapped_column(String(length=50)) 61 | client_city: Mapped[str] = mapped_column(String(length=30)) 62 | latitude: Mapped[str] = mapped_column(String(length=20)) 63 | longitude: Mapped[str] = mapped_column(String(length=20)) 64 | 65 | # Many-to-one relationship: 66 | calls: Mapped[List["Call"]] = relationship(back_populates="client", lazy=True) 67 | 68 | 69 | class Agent(Base): 70 | __tablename__ = "agent" 71 | 72 | # Columns: 73 | agent_id: Mapped[str] = mapped_column(String(length=ID_LENGTH), primary_key=True) 74 | first_name: Mapped[str] = mapped_column(String(length=30)) 75 | last_name: Mapped[str] = mapped_column(String(length=30)) 76 | # phone: Mapped[str] = mapped_column(String(length=20)) 77 | # email: Mapped[str] = mapped_column(String(length=50)) 78 | 79 | # Many-to-one relationship: 80 | calls: Mapped[List["Call"]] = relationship(back_populates="agent", lazy=True) 81 | 82 | 83 | class Call(Base): 84 | __tablename__ = "call" 85 
| 86 | # Metadata: 87 | call_id: Mapped[str] = mapped_column(String(length=ID_LENGTH), primary_key=True) 88 | client_id: Mapped[str] = mapped_column( 89 | String(length=ID_LENGTH), ForeignKey("client.client_id") 90 | ) 91 | agent_id: Mapped[str] = mapped_column( 92 | String(length=ID_LENGTH), ForeignKey("agent.agent_id") 93 | ) 94 | date: Mapped[datetime.date] = mapped_column(Date()) 95 | time: Mapped[datetime.time] = mapped_column(Time()) 96 | status: Mapped[CallStatus] = mapped_column(Enum(CallStatus), nullable=True) 97 | # Files: 98 | audio_file: Mapped[str] = mapped_column(String(length=FILE_PATH_LENGTH)) 99 | # TODO: processed_audio_file: Mapped[Optional[str]] = mapped_column(String(length=FILE_PATH_LENGTH)) 100 | transcription_file: Mapped[Optional[str]] = mapped_column( 101 | String(length=FILE_PATH_LENGTH), 102 | nullable=True, 103 | default=None, 104 | ) 105 | anonymized_file: Mapped[Optional[str]] = mapped_column( 106 | String(length=FILE_PATH_LENGTH), 107 | nullable=True, 108 | default=None, 109 | ) 110 | # Analysis: 111 | topic: Mapped[Optional[str]] = mapped_column( 112 | String(length=50), 113 | nullable=True, 114 | default=None, 115 | ) 116 | summary: Mapped[Optional[str]] = mapped_column( 117 | String(length=1000), 118 | nullable=True, 119 | default=None, 120 | ) 121 | concern_addressed: Mapped[Optional[bool]] = mapped_column( 122 | Boolean(), 123 | nullable=True, 124 | default=None, 125 | ) 126 | client_tone: Mapped[Optional[str]] = mapped_column( 127 | String(length=20), 128 | nullable=True, 129 | default=None, 130 | ) 131 | agent_tone: Mapped[Optional[str]] = mapped_column( 132 | String(length=20), 133 | nullable=True, 134 | default=None, 135 | ) 136 | upsale_attempted: Mapped[Optional[bool]] = mapped_column( 137 | Boolean(), 138 | nullable=True, 139 | default=None, 140 | ) 141 | upsale_success: Mapped[Optional[bool]] = mapped_column( 142 | Boolean(), 143 | nullable=True, 144 | default=None, 145 | ) 146 | empathy: Mapped[Optional[int]] = 
mapped_column( 147 | Integer(), 148 | nullable=True, 149 | default=None, 150 | ) 151 | professionalism: Mapped[Optional[int]] = mapped_column( 152 | Integer(), 153 | nullable=True, 154 | default=None, 155 | ) 156 | kindness: Mapped[Optional[int]] = mapped_column( 157 | Integer(), 158 | nullable=True, 159 | default=None, 160 | ) 161 | effective_communication: Mapped[Optional[int]] = mapped_column( 162 | Integer(), 163 | nullable=True, 164 | default=None, 165 | ) 166 | active_listening: Mapped[Optional[int]] = mapped_column( 167 | Integer(), 168 | nullable=True, 169 | default=None, 170 | ) 171 | customization: Mapped[Optional[int]] = mapped_column( 172 | Integer(), 173 | nullable=True, 174 | default=None, 175 | ) 176 | 177 | # One-to-many relationships: 178 | client: Mapped["Client"] = relationship(back_populates="calls", lazy=True) 179 | agent: Mapped["Agent"] = relationship(back_populates="calls", lazy=True) 180 | 181 | 182 | class DBEngine: 183 | def __init__(self, context: mlrun.MLClientCtx): 184 | self.bucket_name = context.get_secret(key=ProjectSecrets.S3_BUCKET_NAME) 185 | self.db_url = context.get_secret(key=ProjectSecrets.MYSQL_URL) 186 | self.temp_file = None 187 | self.engine = self._create_engine() 188 | 189 | def get_session(self): 190 | return sessionmaker(self.engine) 191 | 192 | def update_db(self): 193 | if self.bucket_name: 194 | s3 = boto3.client("s3") 195 | s3.upload_file(self.temp_file.name, self.bucket_name, "sqlite.db") 196 | 197 | def _create_engine(self): 198 | if self.bucket_name: 199 | # Create a temporary file that will persist throughout the object's lifetime 200 | self.temp_file = tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) 201 | self.temp_file.close() # Close the file but keep the name 202 | 203 | s3 = boto3.client("s3") 204 | try: 205 | s3.download_file(self.bucket_name, "sqlite.db", self.temp_file.name) 206 | except Exception as e: 207 | print(f"Warning: Could not download database from S3: {e}") 208 | 209 | return 
create_engine(f"sqlite:///{self.temp_file.name}") 210 | else: 211 | return create_engine(url=self.db_url) 212 | 213 | def __del__(self): 214 | # Clean up the temporary file when the object is destroyed 215 | if self.temp_file: 216 | try: 217 | os.unlink(self.temp_file.name) 218 | except: 219 | pass 220 | 221 | 222 | def create_tables(): 223 | """ 224 | Create the call center schema tables for when creating or loading the MLRun project. 225 | """ 226 | # Create an engine: 227 | engine = DBEngine(mlrun.get_or_create_ctx("create_tables")) 228 | 229 | # Create the schema's tables: 230 | Base.metadata.create_all(engine.engine) 231 | 232 | engine.update_db() 233 | 234 | 235 | def insert_clients(context: mlrun.MLClientCtx, clients: list): 236 | # Create an engine: 237 | engine = DBEngine(context) 238 | 239 | # Initialize a session maker: 240 | session = engine.get_session() 241 | 242 | # Insert the new calls into the table and commit: 243 | with session.begin() as sess: 244 | sess.execute(insert(Client), clients) 245 | 246 | engine.update_db() 247 | 248 | 249 | def insert_agents(context: mlrun.MLClientCtx, agents: list): 250 | # Create an engine: 251 | engine = DBEngine(context) 252 | 253 | # Initialize a session maker: 254 | session = engine.get_session() 255 | 256 | # Insert the new calls into the table and commit: 257 | with session.begin() as sess: 258 | sess.execute(insert(Agent), agents) 259 | 260 | engine.update_db() 261 | 262 | 263 | def insert_calls( 264 | context: mlrun.MLClientCtx, calls: pd.DataFrame 265 | ) -> Tuple[pd.DataFrame, List[str]]: 266 | # Create an engine: 267 | engine = DBEngine(context) 268 | 269 | # Initialize a session maker: 270 | session = engine.get_session() 271 | 272 | # Cast data from dataframe to a list of dictionaries: 273 | records = calls.to_dict(orient="records") 274 | 275 | # Insert the new calls into the table and commit: 276 | with session.begin() as sess: 277 | sess.execute(insert(Call), records) 278 | 279 | engine.update_db() 
280 | 281 | # Return the metadata and audio files: 282 | audio_files = list(calls["audio_file"]) 283 | return calls, audio_files 284 | 285 | 286 | def update_calls( 287 | context: mlrun.MLClientCtx, 288 | status: str, 289 | table_key: str, 290 | data_key: str, 291 | data: pd.DataFrame, 292 | ): 293 | # Create an engine: 294 | engine = DBEngine(context) 295 | 296 | # Initialize a session maker: 297 | session = engine.get_session() 298 | 299 | # Add the status to the dataframe: 300 | data["status"] = [CallStatus(status)] * len(data) 301 | 302 | # Make sure keys are not duplicates (so we can update by the key with `bindparam`): 303 | if data_key == table_key: 304 | data_key += "_2" 305 | data.rename(columns={table_key: data_key}, inplace=True) 306 | 307 | # Cast data from dataframe to a list of dictionaries: 308 | data = data.to_dict(orient="records") 309 | 310 | # Insert the new calls into the table and commit: 311 | with session.begin() as sess: 312 | sess.connection().execute( 313 | update(Call).where(getattr(Call, table_key) == bindparam(data_key)), data 314 | ) 315 | 316 | engine.update_db() 317 | 318 | 319 | def get_calls() -> pd.DataFrame: 320 | context = mlrun.get_or_create_ctx("get_calls") 321 | # Create an engine: 322 | engine = DBEngine(context) 323 | 324 | # Initialize a session maker: 325 | session = engine.get_session() 326 | 327 | # Select all calls: 328 | with session.begin() as sess: 329 | calls = pd.read_sql(select(Call), sess.connection()) 330 | 331 | return calls 332 | 333 | 334 | def get_agents(context: mlrun.MLClientCtx) -> list: 335 | # Create an engine: 336 | engine = DBEngine(context) 337 | 338 | # Initialize a session maker: 339 | session = engine.get_session() 340 | 341 | # Select all calls: 342 | with session.begin() as sess: 343 | agents = pd.read_sql(select(Agent), sess.connection()) 344 | return agents 345 | 346 | 347 | def get_clients(context: mlrun.MLClientCtx) -> list: 348 | # Create an engine: 349 | engine = DBEngine(context) 350 | 
351 | # Initialize a session maker: 352 | session = engine.get_session() 353 | 354 | # Select all calls: 355 | with session.begin() as sess: 356 | clients = pd.read_sql(select(Client), sess.connection()) 357 | return clients 358 | -------------------------------------------------------------------------------- /src/calls_analysis/postprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def postprocess_answers(answers: pd.DataFrame): 5 | for column in ["concern_addressed", "upsale_attempted", "upsale_success"]: 6 | answers[column] = answers[column].apply(lambda x: "yes" in x.casefold()) 7 | for column in ["client_tone", "agent_tone"]: 8 | answers[column] = answers[column].apply( 9 | lambda x: "Positive" if "Positive" in x else x 10 | ) 11 | answers[column] = answers[column].apply( 12 | lambda x: "Negative" if "Negative" in x else x 13 | ) 14 | answers[column] = answers[column].apply( 15 | lambda x: "Neutral" if "Neutral" in x else x 16 | ) 17 | return answers 18 | -------------------------------------------------------------------------------- /src/calls_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from .skip import skip_and_import_local_data 15 | -------------------------------------------------------------------------------- /src/calls_generation/conversations_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import datetime 15 | import hashlib 16 | import os 17 | import pathlib 18 | import random 19 | import tempfile 20 | from typing import Tuple 21 | 22 | import mlrun 23 | import pandas as pd 24 | import tqdm 25 | from langchain.chat_models import ChatOpenAI 26 | 27 | from src.common import TONES, TOPICS, ProjectSecrets 28 | 29 | #: The approximate amount of words in one minute. 30 | WORDS_IN_1_MINUTE = 240 31 | 32 | 33 | def generate_conversations( 34 | context: mlrun.MLClientCtx, 35 | amount: int, 36 | agent_data: pd.DataFrame, 37 | client_data: pd.DataFrame, 38 | output_directory: str = None, 39 | model_name: str = "gpt-3.5-turbo", 40 | language: str = "en", 41 | min_time: int = 2, 42 | max_time: int = 5, 43 | from_date: str = "01.01.2023", 44 | to_date: str = "01.03.2023", 45 | from_time: str = "09:00", 46 | to_time: str = "17:00", 47 | ) -> Tuple[str, pd.DataFrame, pd.DataFrame]: 48 | """ 49 | Generates a list of conversations between an internet provider call center and a customer. 50 | 51 | :param context: The MLRun context. 
52 | :param amount: The number of conversations to generate. 53 | :param agent_data: The agent data to use for the conversations. 54 | :param client_data: The client data to use for the conversations. 55 | :param output_directory: The directory to save the conversations to. 56 | :param model_name: The name of the model to use for conversation generation. 57 | You should choose one of GPT-4 or GPT-3.5 from the list here: 58 | https://platform.openai.com/docs/models. Default: 'gpt-3.5-turbo'. 59 | :param language: The language to use for the generated conversation text. 60 | :param min_time: Minimum time of conversation in minutes. 61 | Will be used approximately to generate the minimum words with the following assessment: 62 | 240 words are equal to one minute. Default: 2. 63 | :param max_time: Maximum time of conversation in minutes. 64 | Will be used approximately to generate the maximum words with the following assessment: 65 | 240 words are equal to one minute. Default: 5. 66 | :param from_date: The minimum date of the conversation. 67 | :param to_date: The maximum date of the conversation. 68 | :param from_time: The minimum time (HH:MM) of the conversation. 69 | :param to_time: The maximum time (HH:MM) of the conversation. 
70 | """ 71 | # Get the minimum and maximum amount of words: 72 | min_words = WORDS_IN_1_MINUTE * min_time 73 | max_words = WORDS_IN_1_MINUTE * max_time 74 | 75 | # Get the minimum and maximum dates and times: 76 | min_time = datetime.datetime.strptime(from_time, "%H:%M") 77 | max_time = datetime.datetime.strptime(to_time, "%H:%M") 78 | min_date = datetime.datetime.strptime(from_date, "%m.%d.%Y").date() 79 | max_date = datetime.datetime.strptime(to_date, "%m.%d.%Y").date() 80 | 81 | # Create the concern addressed options: 82 | concern_addressed_options = { 83 | True: "", 84 | False: "Don't", 85 | } 86 | 87 | # Create the agent upsales options: 88 | agent_upsales_options = { 89 | "Doesn't try": "Doesn't try to upsale the customer on more services.", 90 | "Tries and doesn't succeed": "Tries to upsale the customer on more services, and doesn't succeed", 91 | "Tries and succeeds": "Tries to upsale the customer on more services, and succeeds", 92 | } 93 | 94 | # Create the upsale mapping: 95 | upsale_mapping = { 96 | "Doesn't try": [False, False], 97 | "Tries and doesn't succeed": [True, False], 98 | "Tries and succeeds": [True, True], 99 | } 100 | 101 | # Create the prompt structure: 102 | prompt_structure = ( 103 | "Generate a conversation between an internet provider call center agent named {agent_name} from (“Iguazio Internet”) and " 104 | "a client named {client_name} with email: {client_email} and phone number: {client_phone} in {language} except 'Agent' and 'Client' prefixes which are constants.\n" 105 | "Format the conversation as follow:\n" 106 | "Agent: \n" 107 | "Client: \n" 108 | "The conversations has to include at least {min_words} words and no more than {max_words} words.\n" 109 | "The call must include the following steps: \n" 110 | "1. Opening (greeting and customer details validation and confirmation)\n" 111 | "2. Presenting the problem by the customer" 112 | "3. The agent {concern_addressed} address the client's concern.\n" 113 | "4. 
The Agent {agent_upsales}" 114 | "5. Summerizing and closing the call" 115 | "It has to be about a client who is calling to discuss about {topic}.\n" 116 | "Do not add any descriptive tag and do not mark the end of the conversation with [End of conversation].\n" 117 | "Use ... for hesitation.\n" 118 | "The client needs to have a {client_tone} tone.\n" 119 | "The agent needs to have a {agent_tone}.\n" 120 | "Remove from the final output any word inside parentheses of all types. \n" 121 | "use the following levels of these attributes while describing the agent's role: \n" 122 | "Empathy {empathy}, Professionalism {professionalism}, Kindness {kindness}, \n" 123 | "Effective Communication {effective_communication}, Active listening {active_listening}, Customization {customization}." 124 | ) 125 | 126 | # Load the OpenAI model using langchain: 127 | os.environ["OPENAI_API_KEY"] = context.get_secret(key=ProjectSecrets.OPENAI_API_KEY) 128 | os.environ["OPENAI_API_BASE"] = context.get_secret( 129 | key=ProjectSecrets.OPENAI_API_BASE 130 | ) 131 | llm = ChatOpenAI(model=model_name) 132 | 133 | # Create the output directory: 134 | if output_directory is None: 135 | output_directory = tempfile.mkdtemp() 136 | output_directory = pathlib.Path(output_directory) 137 | if not output_directory.exists(): 138 | output_directory.mkdir(parents=True, exist_ok=True) 139 | 140 | # Start generating conversations: 141 | conversations = [] 142 | ground_truths = [] 143 | for _ in tqdm.tqdm(range(amount), desc="Generating"): 144 | # Randomize the conversation metadata: 145 | conversation_id = _generate_id() 146 | date = _get_random_date(min_date=min_date, max_date=max_date) 147 | time = _get_random_time(min_time=min_time, max_time=max_time) 148 | 149 | # Randomly select the conversation parameters: 150 | concern_addressed = random.choice(list(concern_addressed_options.keys())) 151 | agent_upsales = random.choice(list(agent_upsales_options.keys())) 152 | client_tone = random.choice(TONES) 153 | 
agent_tone = random.choice(TONES) 154 | topic = random.choice(TOPICS) 155 | agent = agent_data.sample().to_dict(orient="records")[0] 156 | client = client_data.sample().to_dict(orient="records")[0] 157 | 158 | # Generate levels os different agent attributes: 159 | empathy = random.randint(1, 5) 160 | professionalism = random.randint(1, 5) 161 | kindness = random.randint(1, 5) 162 | effective_communication = random.randint(1, 5) 163 | active_listening = random.randint(1, 5) 164 | customization = random.randint(1, 5) 165 | 166 | # Create the prompt: 167 | prompt = prompt_structure.format( 168 | language=language, 169 | min_words=min_words, 170 | max_words=max_words, 171 | topic=topic, 172 | concern_addressed=concern_addressed_options[concern_addressed], 173 | agent_upsales=agent_upsales_options[agent_upsales], 174 | client_tone=client_tone, 175 | agent_tone=agent_tone, 176 | agent_name=f"{agent['first_name']} {agent['last_name']}", 177 | client_name=f"{client['first_name']} {client['last_name']}", 178 | client_email=client["email"], 179 | client_phone=client["phone_number"], 180 | empathy=empathy, 181 | professionalism=professionalism, 182 | kindness=kindness, 183 | effective_communication=effective_communication, 184 | active_listening=active_listening, 185 | customization=customization, 186 | ) 187 | 188 | # Generate the conversation: 189 | conversation = llm.predict(text=prompt) 190 | # Remove redundant newlines and spaces: 191 | conversation = "".join( 192 | [ 193 | line 194 | for line in conversation.strip().splitlines(keepends=True) 195 | if line.strip("\n").strip() 196 | ] 197 | ) 198 | # Save to file: 199 | conversation_text_path = output_directory / f"{conversation_id}.txt" 200 | with open(conversation_text_path, "w") as fp: 201 | fp.write(conversation) 202 | 203 | # Collect to the conversations and ground truths lists: 204 | conversations.append( 205 | [ 206 | conversation_id, 207 | conversation_text_path.name, 208 | client["client_id"], 209 | 
agent["agent_id"], 210 | date, 211 | time, 212 | ] 213 | ) 214 | ground_truths.append( 215 | [ 216 | conversation_id, 217 | language, 218 | topic, 219 | concern_addressed, 220 | upsale_mapping[agent_upsales][0], 221 | upsale_mapping[agent_upsales][1], 222 | client_tone, 223 | agent_tone, 224 | client["client_id"], 225 | agent["agent_id"], 226 | empathy, 227 | professionalism, 228 | kindness, 229 | effective_communication, 230 | active_listening, 231 | customization, 232 | ] 233 | ) 234 | 235 | # Cast the conversations and ground truths into a dataframe: 236 | conversations = pd.DataFrame( 237 | conversations, 238 | columns=["call_id", "text_file", "client_id", "agent_id", "date", "time"], 239 | ) 240 | ground_truths = pd.DataFrame( 241 | ground_truths, 242 | columns=[ 243 | "call_id", 244 | "language", 245 | "topic", 246 | "concern_addressed", 247 | "agent_tries_upsale", 248 | "agent_succeeds_upsale", 249 | "client_tone", 250 | "agent_tone", 251 | "agent_id", 252 | "client_id", 253 | "empathy", 254 | "professionalism", 255 | "kindness", 256 | "effective_communication", 257 | "active_listening", 258 | "customization", 259 | ], 260 | ) 261 | 262 | return str(output_directory), conversations, ground_truths 263 | 264 | 265 | def _get_random_time( 266 | min_time: datetime.datetime, max_time: datetime.datetime 267 | ) -> datetime.time: 268 | if max_time.hour <= min_time.hour: 269 | max_time += datetime.timedelta(days=1) 270 | return ( 271 | min_time 272 | + datetime.timedelta( 273 | seconds=random.randint(0, int((max_time - min_time).total_seconds())), 274 | ) 275 | ).time() 276 | 277 | 278 | def _get_random_date(min_date, max_date) -> datetime.date: 279 | return min_date + datetime.timedelta( 280 | days=random.randint(0, int((max_date - min_date).days)), 281 | ) 282 | 283 | 284 | def create_batch_for_analysis( 285 | conversations_data: pd.DataFrame, audio_files: pd.DataFrame 286 | ) -> pd.DataFrame: 287 | conversations_data = conversations_data.join( 288 | 
other=audio_files.set_index(keys="text_file"), on="text_file" 289 | ) 290 | conversations_data.drop(columns="text_file", inplace=True) 291 | conversations_data.dropna(inplace=True) 292 | return conversations_data 293 | 294 | 295 | def _generate_id() -> str: 296 | return hashlib.md5(str(datetime.datetime.now()).encode("utf-8")).hexdigest() 297 | -------------------------------------------------------------------------------- /src/calls_generation/skip.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from pathlib import Path 15 | 16 | import mlrun 17 | import pandas as pd 18 | import yaml 19 | from mlrun.artifacts import ArtifactSpec, DatasetArtifact 20 | from sqlalchemy import insert 21 | 22 | from src.calls_analysis.db_management import Agent, Call, Client, create_tables, DBEngine 23 | 24 | 25 | def skip_and_import_local_data(language: str): 26 | """ 27 | This function logs example data to the database and to the project. 28 | Call this function from the notebook in order to skip the calls generation workflow. 
29 | """ 30 | # Get the example data directory: 31 | example_data_dir = Path("data") 32 | # Get the project: 33 | project = mlrun.get_current_project() 34 | 35 | # clean and recreate database tables: 36 | engine = DBEngine(mlrun.get_or_create_ctx("skip")) 37 | Call.__table__.drop(engine.engine) 38 | Client.__table__.drop(engine.engine) 39 | Agent.__table__.drop(engine.engine) 40 | create_tables() 41 | print("- Initialized tables") 42 | 43 | # log agents and clients data 44 | json_spec = ArtifactSpec( 45 | unpackaging_instructions={ 46 | "packager_name": "ListPackager", 47 | "object_type": "builtins.list", 48 | "artifact_type": "file", 49 | "instructions": {"file_format": "json"}, 50 | } 51 | ) 52 | zip_spec = ArtifactSpec( 53 | unpackaging_instructions={ 54 | "packager_name": "StrPackager", 55 | "object_type": "builtins.str", 56 | "artifact_type": "path", 57 | "instructions": {"archive_format": "zip", "is_directory": "true"}, 58 | } 59 | ) 60 | parquet_spec = ArtifactSpec( 61 | unpackaging_instructions={ 62 | "packager_name": "PandasDataFramePackager", 63 | "object_type": "pandas.core.frame.DataFrame", 64 | "artifact_type": "dataset", 65 | "instructions": {}, 66 | } 67 | ) 68 | # load agent and client data: 69 | agents = project.log_artifact( 70 | item="agent-data-generator_agents", 71 | spec=json_spec, 72 | local_path=str(example_data_dir / f"{language}_agents.json"), 73 | db_key="agent-data-generator_agents", 74 | ) 75 | agents = agents.to_dataitem() 76 | agents = yaml.load(agents.get(), Loader=yaml.FullLoader) 77 | clients = project.log_artifact( 78 | item="client-data-generator_clients", 79 | spec=json_spec, 80 | local_path=str(example_data_dir / f"{language}_clients.json"), 81 | db_key="client-data-generator_clients", 82 | ) 83 | clients = clients.to_dataitem() 84 | clients = yaml.load(clients.get(), Loader=yaml.FullLoader) 85 | 86 | # insert agent and client data to database: 87 | _insert_agents_and_clients_to_db(agents, clients) 88 | print("- agents and 
clients inserted") 89 | 90 | # log zip files 91 | remote_zip_path = mlrun.get_sample_path(f"call-demo/{language}_audio_files.zip") 92 | conversations_art = project.log_artifact( 93 | item="conversation-generation_conversations", 94 | spec=zip_spec, 95 | local_path=str(example_data_dir / f"{language}_conversations.zip"), 96 | db_key="conversation-generation_conversations", 97 | ) 98 | audio_files_art = project.log_artifact( 99 | item="text-to-audio_audio_files", 100 | spec=zip_spec, 101 | target_path=remote_zip_path, 102 | db_key="text-to-audio_audio_files", 103 | ) 104 | # log parquet files 105 | calls_batch_df = pd.read_parquet( 106 | str(example_data_dir / f"{language}_calls_batch.parquet") 107 | ) 108 | dataframe_df = pd.read_parquet( 109 | str(example_data_dir / f"{language}_dataframe.parquet") 110 | ) 111 | ground_truths_df = pd.read_parquet( 112 | str(example_data_dir / f"{language}_ground_truths.parquet") 113 | ) 114 | metadata_df = pd.read_parquet( 115 | str(example_data_dir / f"{language}_metadata.parquet") 116 | ) 117 | 118 | project.log_artifact( 119 | item=DatasetArtifact(key="batch-creation_calls_batch", df=calls_batch_df), 120 | spec=parquet_spec, 121 | local_path=str(example_data_dir / f"{language}_calls_batch.parquet"), 122 | ) 123 | project.log_artifact( 124 | item=DatasetArtifact(key="text-to-audio_dataframe", df=dataframe_df), 125 | spec=parquet_spec, 126 | ) 127 | project.log_artifact( 128 | item=DatasetArtifact( 129 | key="conversation-generation_ground_truths", df=ground_truths_df 130 | ), 131 | spec=parquet_spec, 132 | ) 133 | project.log_artifact( 134 | item=DatasetArtifact(key="conversation-generation_metadata", df=metadata_df), 135 | spec=parquet_spec, 136 | ) 137 | print("*** first workflow skipped successfully ***") 138 | 139 | 140 | def _insert_agents_and_clients_to_db(agents: list, clients: list): 141 | # Create an engine: 142 | engine = DBEngine(mlrun.get_or_create_ctx("skip")) 143 | 144 | # Initialize a session maker: 145 | session = 
engine.get_session() 146 | 147 | # Insert the new calls into the table and commit: 148 | with session.begin() as sess: 149 | sess.execute(insert(Agent), agents) 150 | sess.execute(insert(Client), clients) 151 | 152 | 153 | # TODO: change to export the actual data and not the artifacts 154 | def save_current_example_data(): 155 | project = mlrun.get_current_project() 156 | export_dir = Path("example_data") 157 | if not export_dir.exists(): 158 | export_dir.mkdir(parents=True, exist_ok=True) 159 | 160 | for artifact_name, target_path in [ 161 | ("client-data-generator_clients", "clients.zip"), 162 | ("agent-data-generator_agents", "agents.zip"), 163 | ( 164 | "conversation-generation_conversations", 165 | "conversation_generation/conversations.zip", 166 | ), 167 | ("conversation-generation_metadata", "conversation_generation/metadata.zip"), 168 | ( 169 | "conversation-generation_ground_truths", 170 | "conversation_generation/ground_truths.zip", 171 | ), 172 | ("text-to-audio_audio_files", "text_to_audio/audio_files.zip"), 173 | ("text-to-audio_dataframe", "text_to_audio/dataframe.zip"), 174 | ("batch-creation_calls_batch", "batch_creation/calls_batch.zip"), 175 | ]: 176 | export_path = export_dir / target_path 177 | if not export_path.exists(): 178 | export_path.parent.mkdir(parents=True, exist_ok=True) 179 | project.get_artifact(artifact_name).export(f"example_data/{target_path}") 180 | print(f"- exported {artifact_name} to {target_path}") 181 | print("*** all artifacts exported successfully ***") 182 | -------------------------------------------------------------------------------- /src/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum


class ProjectSecrets:
    """Names of the MLRun project secrets used across the demo."""

    OPENAI_API_KEY = "OPENAI_API_KEY"
    OPENAI_API_BASE = "OPENAI_API_BASE"
    MYSQL_URL = "MYSQL_URL"
    MYSQL_CONNECT_ARGS = "MYSQL_CONNECT_ARGS"
    S3_BUCKET_NAME = "S3_BUCKET_NAME"


class CallStatus(enum.Enum):
    """Lifecycle states a call row moves through during the analysis workflow."""

    CREATED = "Created"
    AUDIO_PROCESSED = "Audio processed"
    SPEECH_DIARIZED = "Speech diarized"
    TRANSCRIBED = "Transcribed"
    TRANSLATED = "Translated"
    ANONYMIZED = "Anonymized"
    ANALYZED = "Analyzed"


#: Topics used both for generating conversations and for classifying them.
TOPICS = [
    "slow internet speed",
    "billing discrepancies",
    "account login problems",
    "setting up a new device",
    "phishing or malware concerns",
    "scheduled maintenance notifications",
    "service upgrades",
    "negotiating pricing",
    "canceling service",
    "customer service feedback",
]

#: Canonical tone labels — analysis postprocessing matches these case-sensitively.
TONES = [
    "Positive",
    "Neutral",
    "Negative",
]
--------------------------------------------------------------------------------
/src/vizro.py:
--------------------------------------------------------------------------------
import os
import shutil
import tarfile
from pathlib import Path

import boto3
import mlrun
import mlrun.common.schemas
import pandas as pd

from src.calls_analysis.db_management import get_calls, get_clients

# Mapping from database column names to the display names used in the Vizro dashboard.
# NOTE(review): "anonymized_file" maps to "text_file" (a column name, not a display
# name) — presumably the dashboard expects that exact key; confirm before changing.
COLUMNS_MAPPING = {
    "active_listening": "Active Listening",
    "agent_id": "Agent ID",
    "agent_tone": "Agent Tone",
    "date": "Call Date",
"client_id": "Caller ID", 19 | "client_tone": "Client Tone", 20 | "concern_addressed": "Concern Addressed", 21 | "customization": "Customization", 22 | "effective_communication": "Effective Communication", 23 | "empathy": "Empathy", 24 | "kindness": "Kindness", 25 | "professionalism": "Professionalism", 26 | "summary": "Summary", 27 | "time": "Time", 28 | "topic": "Topic", 29 | "upsale_attempted": "Upsale Attempted", 30 | "upsale_success": "Upsale Success", 31 | "client_city": "Caller City", 32 | "anonymized_file": "text_file", 33 | } 34 | 35 | 36 | def deploy_vizro_application(): 37 | dir_name = "vizro" 38 | 39 | # Prepare the dataframe for vizro: 40 | _prepare_vizro_source(dir_name) 41 | print("Application source code ready for deployment.") 42 | 43 | # Archive 44 | bucket_name = os.getenv("S3_BUCKET_NAME") 45 | if bucket_name: 46 | _upload_to_s3(dir_name) 47 | # Add the source code to the application 48 | src_path = f"s3://{bucket_name}/{dir_name}.tar.gz" 49 | print(f"Uploading {src_path} to {bucket_name}") 50 | else: 51 | # Set the source path to V3IO 52 | src_path = f'v3io:///users/{os.environ["V3IO_USERNAME"]}/{os.getcwd().replace("/User/", "")}/{dir_name}.tar.gz' 53 | print(f"Configuring V3IO {src_path} to UI") 54 | project = mlrun.get_current_project() 55 | app = project.get_function("call-center-ui") 56 | app.with_source_archive(src_path, pull_at_runtime=False) 57 | 58 | # Deploy the application 59 | app.deploy(force_build=True, create_default_api_gateway=False, with_mlrun=False) 60 | app.create_api_gateway( 61 | name="call-center-ui", 62 | direct_port_access=True, 63 | set_as_default=True, 64 | authentication_mode=mlrun.common.schemas.api_gateway.APIGatewayAuthenticationMode.none, 65 | ) 66 | print("Application deployed successfully!") 67 | 68 | 69 | def _prepare_vizro_source(dir_name: str): 70 | clients_df = get_clients(mlrun.get_or_create_ctx("mlrun")) 71 | calls_df = get_calls() 72 | vizro_df = pd.merge( 73 | calls_df, 74 | clients_df[["client_id", 
"client_city", "latitude", "longitude"]], 75 | on="client_id", 76 | ) 77 | vizro_df = vizro_df.rename(columns=COLUMNS_MAPPING) 78 | vizro_df.to_csv("vizro/data.csv") 79 | 80 | # add text and audio files to vizro: 81 | shutil.copytree("outputs", "vizro/outputs", dirs_exist_ok=True) 82 | 83 | # Write the application code to a file 84 | app_dir = "vizro" 85 | 86 | # Create an archive of the application code 87 | archive_name = f"{dir_name}.tar.gz" 88 | with tarfile.open(archive_name, "w:gz") as tar: 89 | tar.add(app_dir) 90 | 91 | 92 | def _upload_to_s3(dir_name: str): 93 | # uploading db file to s3: 94 | s3 = boto3.client("s3") 95 | bucket_name = Path(mlrun.mlconf.artifact_path).parts[1] 96 | 97 | # Upload the file 98 | s3.upload_file( 99 | Filename=f"{dir_name}.tar.gz", 100 | Bucket=bucket_name, 101 | Key=f"{dir_name}.tar.gz", 102 | ) 103 | -------------------------------------------------------------------------------- /src/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /src/workflows/calls_analysis.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Iguazio 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import List 15 | 16 | import kfp 17 | import mlrun 18 | from kfp import dsl 19 | 20 | from src.common import TONES, TOPICS, CallStatus 21 | 22 | QUESTIONS = [ 23 | [ 24 | f"1. Classify the topic of the text from the following list (choose one): {TOPICS}", 25 | "2. Write a long summary of the text, focus on the topic (max 50 words).", 26 | "3. Was the Client's concern addressed, (choose only one) [Yes, No]?", 27 | f"4. Was the Client tone (choose only one, if not sure choose Neutral) {TONES}? ", 28 | f"5. Was the Call Center Agent tone (choose only one, if not sure choose Neutral) {TONES}?", 29 | ], 30 | [ 31 | "1. Did the agent try to upsale the customer (choose only one) [Yes, No]? (sell any additional product or service)", 32 | "2. If the agent indeed try to upsale the client, did he succeed (choose only one) [Yes, No]? if he didn't try' answer No", 33 | "3. Rate the agent's level of empathy (The ability to understand and share the feelings of others) on a scale of 1-5.", 34 | "4. Rate the agent's level of professionalism (Conducting oneself in a way that is appropriate for the workplace) on a scale of 1-5.", 35 | "5. 
Rate the agent's level of kindness (The quality of being friendly, generous, and considerate) on a scale of 1-5.", 36 | "6. Rate the agent's level of effective communication (The ability to convey information clearly and concisely) on a scale of 1-5.", 37 | "7. Rate the agent's level of active listening (The process of paying attention to and understanding what someone is saying) on a scale of 1-5.", 38 | "8. Rate the agent's level of customization (The process of tailoring something to the specific needs or preferences of an individual) on a scale of 1-5.", 39 | ], 40 | ] 41 | DEMO_CALL = ( 42 | "Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist " 43 | "you today?\n" 44 | "Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\n" 45 | "Agent: I'm sorry to hear that, Lisa. I'd be happy to help you with that. Could you please provide me with your " 46 | "account number or phone number associated with your account?\n" 47 | "Customer: Of course, my account number is 123456789.\n" 48 | "Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears " 49 | "there was an error in the charges. I apologize for the inconvenience.\n" 50 | "Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\n" 51 | "Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department " 52 | "for investigation and correction. You should see the adjustments on your next statement.\n" 53 | "Customer: That sounds good, Megan. I appreciate your help.\n" 54 | "Agent: You're welcome, Lisa. If you have any more questions or concerns in the future, please don't hesitate to " 55 | "reach out. Is there anything else I can assist you with today?\n" 56 | "Customer: No, that's all. Thank you for your assistance, Megan.\n" 57 | "Agent: Not a problem, Lisa. 
Have a wonderful day, and we'll get this sorted out for you.\n" 58 | "Customer: You too! Goodbye, Megan.\n" 59 | "Agent: Goodbye, Lisa!" 60 | ) 61 | DEMO_ANSWERS = [ 62 | ( 63 | "1. billing discrepancies\n" 64 | "2. The customer, contacted the call center regarding billing discrepancies on her statement. The agent, " 65 | "acknowledged the issue, assured The customer it would be resolved, and escalated it to the billing department for " 66 | "correction.\n" 67 | "3. Yes.\n" 68 | "4. Natural.\n" 69 | "5. positive.\n" 70 | ), 71 | ("1. No\n" "2. No\n" "3. 4\n" "4. 5\n" "5. 4\n" "6. 5\n" "7. 4\n" "8. 3"), 72 | ] 73 | TEXT_WRAPPER = [ 74 | ( 75 | f"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\n" 76 | f"<|im_start|>user: Given the following text:\n" 77 | f"{DEMO_CALL}\n" 78 | f"answer the questions as accurately as you can:\n" 79 | f"{QUESTIONS[i]}<|im_end|>\n" 80 | f"<|im_start|>assistant:\n" 81 | f"{DEMO_ANSWERS[i]}<|im_end|>\n" 82 | f"<|im_start|>user: Given the following text:\n" 83 | "{}" 84 | ) 85 | for i in range(len(QUESTIONS)) 86 | ] 87 | QUESTIONS_WRAPPER = ( 88 | " answer the given questions as accurately as you can, do not write more answers the questions:\n" 89 | "{}<|im_end|>\n" 90 | "<|im_start|>assistant:\n" 91 | ) 92 | 93 | 94 | @kfp.dsl.pipeline() 95 | def pipeline( 96 | batch: str, 97 | calls_audio_files: str, 98 | transcribe_model: str, 99 | translate_to_english: bool, 100 | pii_recognition_model: str, 101 | pii_recognition_entities: List[str], 102 | pii_recognition_entity_operator_map: List[str], 103 | question_answering_model: str, 104 | batch_size: int = 2, 105 | auto_gptq_exllama_max_input_length: int = None, 106 | insert_calls_db: bool = True, 107 | ): 108 | # Get the project: 109 | project = mlrun.get_current_project() 110 | db_management_function = project.get_function("db-management") 111 | with dsl.Condition(insert_calls_db == True) as insert_calls_condition: 112 | # Insert new calls: 
113 | insert_calls_run = project.run_function( 114 | db_management_function, 115 | handler="insert_calls", 116 | name="insert-calls", 117 | inputs={"calls": batch}, 118 | returns=[ 119 | "calls_batch: dataset", 120 | "audio_files: file", 121 | ], 122 | ) 123 | 124 | # Speech diarize: 125 | speech_diarization_function = project.get_function("silero-vad") 126 | diarize_run = project.run_function( 127 | speech_diarization_function, 128 | handler="diarize", 129 | name="diarization", 130 | inputs={"data_path": calls_audio_files}, 131 | params={ 132 | "speaker_labels": ["Agent", "Client"], 133 | "verbose": True, 134 | }, 135 | returns=["speech_diarization: file", "diarize_errors: file"], 136 | ).after(insert_calls_condition) 137 | 138 | # Update diarization state: 139 | update_calls_post_speech_diarization_run = project.run_function( 140 | db_management_function, 141 | handler="update_calls", 142 | name="update-calls", 143 | inputs={"data": batch}, 144 | params={ 145 | "status": CallStatus.SPEECH_DIARIZED.value, 146 | "table_key": "call_id", 147 | "data_key": "call_id", 148 | }, 149 | ).after(diarize_run) 150 | 151 | # Transcribe: 152 | transcription_function = project.get_function("transcription") 153 | transcribe_run = project.run_function( 154 | transcription_function, 155 | handler="transcribe", 156 | name="transcription", 157 | inputs={ 158 | "data_path": calls_audio_files, 159 | "speech_diarization": diarize_run.outputs["speech_diarization"], 160 | }, 161 | params={ 162 | "model_name": transcribe_model, 163 | "device": "cuda", 164 | "use_better_transformers": True, 165 | "batch_size": batch_size, 166 | "translate_to_english": translate_to_english, 167 | }, 168 | returns=[ 169 | "transcriptions: path", 170 | "transcriptions_dataframe: dataset", 171 | "transcriptions_errors: file", 172 | ], 173 | ) 174 | 175 | # Update transcription state: 176 | update_calls_post_transcription_run = project.run_function( 177 | db_management_function, 178 | handler="update_calls", 179 
| name="update-calls-2", 180 | inputs={"data": transcribe_run.outputs["transcriptions_dataframe"]}, 181 | params={ 182 | "status": CallStatus.TRANSCRIBED.value, 183 | "table_key": "audio_file", 184 | "data_key": "audio_file", 185 | }, 186 | ) 187 | 188 | # Recognize PII: 189 | pii_recognition_function = project.get_function("pii-recognition") 190 | recognize_pii_run = project.run_function( 191 | pii_recognition_function, 192 | handler="recognize_pii", 193 | name="pii-recognition", 194 | inputs={"input_path": transcribe_run.outputs["transcriptions"]}, 195 | params={ 196 | "model": pii_recognition_model, 197 | "html_key": "highlighted", 198 | "entities": pii_recognition_entities, 199 | "entity_operator_map": pii_recognition_entity_operator_map, 200 | "score_threshold": 0.8, 201 | "is_full_report": False, 202 | }, 203 | returns=[ 204 | "anonymized_files: path", 205 | "anonymized_files_dataframe: dataset", 206 | "anonymized_files_errors: file", 207 | "anonymized_files_report: file", 208 | ], 209 | ) 210 | 211 | # Update PII state: 212 | update_calls_post_pii_recognition_run = project.run_function( 213 | db_management_function, 214 | handler="update_calls", 215 | name="update-calls-3", 216 | inputs={"data": recognize_pii_run.outputs["anonymized_files_dataframe"]}, 217 | params={ 218 | "status": CallStatus.ANONYMIZED.value, 219 | "table_key": "transcription_file", 220 | "data_key": "original_file", 221 | }, 222 | ) 223 | 224 | # Question-answering: 225 | question_answering_function = project.get_function("question-answering") 226 | question_answering_function.with_requests(mem="20G") 227 | answer_questions_run = project.run_function( 228 | question_answering_function, 229 | handler="answer_questions", 230 | name="analysis", 231 | inputs={"data_path": recognize_pii_run.outputs["anonymized_files"]}, 232 | params={ 233 | "verbose": True, 234 | "model_name": question_answering_model, 235 | # We don't need the auto_gptq_exllama if using CPU, we do need it if using GPU 236 | 
"auto_gptq_exllama_max_input_length": auto_gptq_exllama_max_input_length, 237 | "device_map": "auto", 238 | "text_wrapper": TEXT_WRAPPER, 239 | "questions": QUESTIONS, 240 | "questions_wrapper": QUESTIONS_WRAPPER, 241 | "questions_columns": [ 242 | "topic", 243 | "summary", 244 | "concern_addressed", 245 | "client_tone", 246 | "agent_tone", 247 | "upsale_attempted", 248 | "upsale_success", 249 | "empathy", 250 | "professionalism", 251 | "kindness", 252 | "effective_communication", 253 | "active_listening", 254 | "customization", 255 | ], 256 | "questions_config": [ 257 | {}, 258 | {"type": "poll", "poll_count": 3, "poll_strategy": "most_common"}, 259 | ], 260 | "generation_config": { 261 | "max_new_tokens": 250, 262 | "do_sample": True, 263 | "temperature": 0.7, 264 | "top_p": 0.95, 265 | "top_k": 40, 266 | "repetition_penalty": 1.1, 267 | }, 268 | "batch_size": 1, 269 | "model_kwargs": {}, 270 | }, 271 | returns=[ 272 | "question_answering_dataframe: dataset", 273 | "question_answering_errors: file", 274 | ], 275 | ) 276 | 277 | # Postprocess answers: 278 | postprocessing_function = project.get_function("postprocessing") 279 | postprocess_answers_run = project.run_function( 280 | postprocessing_function, 281 | handler="postprocess_answers", 282 | name="answers-postprocessing", 283 | inputs={ 284 | "answers": answer_questions_run.outputs["question_answering_dataframe"] 285 | }, 286 | returns=["processed_answers: dataset"], 287 | ) 288 | 289 | # Update question answering state: 290 | update_calls_post_question_answering_run = project.run_function( 291 | db_management_function, 292 | handler="update_calls", 293 | name="update-calls-4", 294 | inputs={"data": postprocess_answers_run.outputs["processed_answers"]}, 295 | params={ 296 | "status": CallStatus.ANALYZED.value, 297 | "table_key": "anonymized_file", 298 | "data_key": "text_file", 299 | }, 300 | ) 301 | -------------------------------------------------------------------------------- 
/src/workflows/calls_generation.py:
--------------------------------------------------------------------------------
# Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

import kfp
import mlrun
from kfp import dsl


@kfp.dsl.pipeline()
def pipeline(
    amount: int,
    generation_model: str,
    tts_model: str,
    language: str,
    available_voices: List[str],
    min_time: int,
    max_time: int,
    from_date: str,
    to_date: str,
    from_time: str,
    to_time: str,
    num_clients: int,
    num_agents: int,
    generate_clients_and_agents: bool = True,
):
    """Workflow that generates a synthetic batch of call-center calls.

    Steps: optionally generate client and agent records and insert them into
    the database, fetch them back, generate conversation texts, synthesize
    multi-speaker audio from those texts, and finally assemble the calls
    batch consumed by the analysis workflow.

    Args:
        amount: Number of conversations to generate.
        generation_model: Model name used by the text-generation steps.
        tts_model: Text-to-speech model name.
        language: Language of the generated data (interpolated into prompts).
        available_voices: Voice names the TTS step may pick from.
        min_time: Minimum call time (semantics defined by the conversations
            generator — TODO confirm units).
        max_time: Maximum call time (see ``min_time``).
        from_date: Earliest call date to sample.
        to_date: Latest call date to sample.
        from_time: Earliest call time-of-day to sample.
        to_time: Latest call time-of-day to sample.
        num_clients: How many clients to generate (only when
            ``generate_clients_and_agents`` is enabled).
        num_agents: How many agents to generate (only when
            ``generate_clients_and_agents`` is enabled).
        generate_clients_and_agents: Whether to run the client/agent
            generation branch at all.
    """
    # Get the project:
    project = mlrun.get_current_project()

    # NOTE: the "== True" comparison is deliberate — kfp's dsl.Condition
    # needs a comparison expression over the pipeline parameter to build the
    # conditional DAG node; a bare truthiness check would not compile.
    with dsl.Condition(generate_clients_and_agents == True) as generate_data_condition:
        # Generate client data:
        client_data_generator_function = project.get_function(
            "structured_data_generator"
        )
        client_data_run = project.run_function(
            client_data_generator_function,
            handler="generate_data",
            name="client-data-generator",
            params={
                "amount": num_clients,
                "model_name": generation_model,
                "language": language,
                # Free-text field specs are prompts for the LLM generator:
                "fields": [
                    f"first_name: in {language}, no special characters",
                    f"last_name: in {language}, no special characters",
                    "phone_number",
                    "email",
                    "client_id: no leading zeros",
                    "client_city: Enter city, state in the US (e.g., Austin, TX), Not only Texas",
                    "latitude: That correspond to the city",
                    "longitude: That correspond to the city",
                ],
            },
            returns=["clients: file"],
        )

        # Insert client data to database
        db_management_function = project.get_function("db-management")
        project.run_function(
            db_management_function,
            handler="insert_clients",
            name="insert-clients",
            inputs={
                "clients": client_data_run.outputs["clients"],
            },
        )

        # Generate agent data:
        agent_data_generator_function = project.get_function(
            "structured_data_generator"
        )
        agent_data_run = project.run_function(
            agent_data_generator_function,
            handler="generate_data",
            name="agent-data-generator",
            params={
                "amount": num_agents,
                "model_name": generation_model,
                "language": language,
                "fields": [
                    f"first_name: in {language}, no special characters",
                    f"last_name: in {language}, no special characters",
                    "agent_id: no leading zeros",
                ],
            },
            returns=["agents: file"],
        )

        # Insert agent data to database
        db_management_function = project.get_function("db-management")
        project.run_function(
            db_management_function,
            handler="insert_agents",
            name="insert-agents",
            inputs={
                "agents": agent_data_run.outputs["agents"],
            },
        )

    # Get agents from database
    # (.after(generate_data_condition) ensures reads happen only once the
    # optional generation branch — if taken — has completed.)
    db_management_function = project.get_function("db-management")
    get_agents_run = project.run_function(
        db_management_function,
        handler="get_agents",
        name="get-agents",
        returns=["agents: file"],
    ).after(generate_data_condition)

    # Get clients from database
    db_management_function = project.get_function("db-management")
    get_clients_run = project.run_function(
        db_management_function,
        handler="get_clients",
        name="get-clients",
        returns=["clients: file"],
    ).after(generate_data_condition)

    # Generate conversations texts:
    conversations_generator_function = project.get_function("conversations-generator")
    generate_conversations_run = project.run_function(
        conversations_generator_function,
        handler="generate_conversations",
        name="conversation-generation",
        params={
            "amount": amount,
            "model_name": generation_model,
            "language": language,
            "min_time": min_time,
            "max_time": max_time,
            "from_date": from_date,
            "to_date": to_date,
            "from_time": from_time,
            "to_time": to_time,
        },
        inputs={
            "agent_data": get_agents_run.outputs["agents"],
            "client_data": get_clients_run.outputs["clients"],
        },
        returns=[
            "conversations: path",
            "metadata: dataset",
            "ground_truths: dataset",
        ],
    )

    # Text to audio:
    text_to_audio_generator_function = project.get_function("text-to-audio-generator")
    generate_multi_speakers_audio_run = project.run_function(
        text_to_audio_generator_function,
        handler="generate_multi_speakers_audio",
        name="text-to-audio",
        inputs={"data_path": generate_conversations_run.outputs["conversations"]},
        params={
            # Speaker name -> channel/speaker index mapping for the TTS step:
            "speakers": {"Agent": 0, "Client": 1},
            "available_voices": available_voices,
            "model": tts_model,
            "speed": 1,
        },
        returns=[
            "audio_files: path",
            "dataframe: dataset",
            "errors: file",
        ],
    )

    # Create the input batch:
    create_batch_for_analysis_run = project.run_function(
        conversations_generator_function,
        handler="create_batch_for_analysis",
        name="batch-creation",
        inputs={
            "conversations_data": generate_conversations_run.outputs["metadata"],
            "audio_files": generate_multi_speakers_audio_run.outputs["dataframe"],
        },
        returns=["calls_batch: dataset"],
    )
-------------------------------------------------------------------------------- /vizro/app.py: -------------------------------------------------------------------------------- 1 | """Main app entry point for Vizro dashboard.""" 2 | 3 | # DEFINE IMPORTS 4 | import pandas as pd 5 | from custom_charts import ( 6 | plot_bar_concerns, 7 | plot_bar_quality, 8 | plot_bar_upsales, 9 | plot_box_communication, 10 | plot_butterfly_upsales_concerns, 11 | plot_donut_concerns, 12 | plot_donut_upsales, 13 | plot_line_calls_over_time, 14 | plot_map_call_locations, 15 | plot_radar_quality, 16 | ) 17 | from custom_components import Audio, make_tabs_with_title, update_from_selected_row 18 | from dash import html 19 | 20 | import vizro.models as vm 21 | from vizro import Vizro 22 | from vizro.figures import kpi_card, kpi_card_reference 23 | from vizro.tables import dash_ag_grid 24 | 25 | # DEFINE CONSTANTS 26 | MIN_ROW_HEIGHT = 420 27 | CONCERN_LABELS = ["Concerns Not Addressed", "Concerns Addressed"] 28 | 29 | 30 | def px(val: int) -> str: 31 | """Convert integer value to pixel string.""" 32 | return f"{int(val)}px" 33 | 34 | 35 | # DEFINE DATA 36 | try: 37 | df = pd.read_csv("/home/mlrun_code/vizro/data.csv") 38 | except FileNotFoundError: 39 | raise RuntimeError("The data file 'fake_data.csv' was not found.") 40 | df["Call Date"] = pd.to_datetime(df["Call Date"]) 41 | df["Upsale Success Reference"] = 0.25 42 | df["Concern Reference"] = 0.50 43 | 44 | # DEFINE DASHBOARD 45 | kpi_container = vm.Container( 46 | layout=vm.Grid(grid=[[0, 1, 2, 3, 4]], row_gap="0px", col_gap="20px"), 47 | components=[ 48 | vm.Figure( 49 | figure=kpi_card_reference( 50 | data_frame=df, 51 | value_column="Upsale Success", 52 | reference_column="Upsale Success Reference", 53 | title="Upsale Success", 54 | value_format="{value:.0%}", 55 | reference_format="{delta_relative:+.1%} vs. 
target", 56 | icon="more_up", 57 | agg_func="mean", 58 | ) 59 | ), 60 | vm.Figure( 61 | figure=kpi_card_reference( 62 | data_frame=df, 63 | value_column="Concern Addressed", 64 | reference_column="Concern Reference", 65 | title="Concerns Addressed", 66 | value_format="{value:.0%}", 67 | reference_format="{delta_relative:+.1%} vs. target", 68 | agg_func="mean", 69 | icon="recommend", 70 | ) 71 | ), 72 | vm.Figure( 73 | figure=kpi_card( 74 | data_frame=df, 75 | agg_func="count", 76 | value_column="Caller ID", 77 | title="Number of Calls", 78 | icon="call", 79 | ) 80 | ), 81 | vm.Figure( 82 | figure=kpi_card( 83 | data_frame=df, 84 | agg_func="nunique", 85 | value_column="Agent ID", 86 | title="Number of Agents", 87 | icon="support_agent", 88 | ) 89 | ), 90 | vm.Figure( 91 | figure=kpi_card( 92 | data_frame=df, 93 | agg_func="nunique", 94 | value_column="Caller ID", 95 | title="Number of Callers", 96 | icon="person", 97 | ) 98 | ), 99 | ], 100 | ) 101 | 102 | call_summary_container = vm.Container( 103 | title="Calls Summary", 104 | layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT), row_gap="0px"), 105 | components=[ 106 | vm.Container( 107 | title="", 108 | layout=vm.Grid( 109 | grid=[[0], [1]], row_min_height=px(MIN_ROW_HEIGHT // 2), row_gap="0px" 110 | ), 111 | components=[ 112 | vm.Graph( 113 | title="Calls over time", 114 | figure=plot_line_calls_over_time(df), 115 | ), 116 | vm.Graph( 117 | title="Upsales and Concerns Addressed", 118 | figure=plot_butterfly_upsales_concerns(df), 119 | ), 120 | ], 121 | variant="filled", 122 | ), 123 | vm.Container( 124 | title="", 125 | layout=vm.Grid( 126 | grid=[[0]], row_min_height=px(MIN_ROW_HEIGHT), row_gap="0px" 127 | ), 128 | components=[ 129 | vm.Graph( 130 | title="Call Locations", 131 | header="Showing actual number of calls per city", 132 | figure=plot_map_call_locations(df), 133 | ) 134 | ], 135 | variant="filled", 136 | ), 137 | ], 138 | ) 139 | 140 | upsales_container = make_tabs_with_title( 141 | 
title="Upsales", 142 | tabs=[ 143 | vm.Container( 144 | title="Percentage", 145 | layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)), 146 | components=[ 147 | vm.Graph( 148 | title="Average Across Agents", 149 | header="Showing percentage of calls", 150 | figure=plot_donut_upsales( 151 | data_frame=df, 152 | group_column="Agent ID", 153 | mode="average", 154 | ), 155 | ), 156 | vm.Graph( 157 | title="Per Agent", 158 | header="Showing percentage of calls", 159 | figure=plot_donut_upsales( 160 | data_frame=df, 161 | group_column="Agent ID", 162 | mode="comparison", 163 | ), 164 | footer="(The Agent ID is shown inside each donut)", 165 | ), 166 | ], 167 | ), 168 | vm.Container( 169 | title="Absolute", 170 | layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)), 171 | components=[ 172 | vm.Graph( 173 | title="Average Across Agents", 174 | header="Showing actual number of calls", 175 | figure=plot_bar_upsales( 176 | data_frame=df, 177 | group_column="Agent ID", 178 | mode="average", 179 | ), 180 | ), 181 | vm.Graph( 182 | title="Per Agent", 183 | header="Showing actual number of calls", 184 | figure=plot_bar_upsales( 185 | data_frame=df, 186 | group_column="Agent ID", 187 | mode="comparison", 188 | ), 189 | ), 190 | ], 191 | ), 192 | ], 193 | ) 194 | 195 | concerns_container = make_tabs_with_title( 196 | title="Concerns", 197 | tabs=[ 198 | vm.Container( 199 | title="Percentage", 200 | layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)), 201 | components=[ 202 | vm.Graph( 203 | title="Average Across Agents", 204 | header="Showing percentage of calls", 205 | figure=plot_donut_concerns( 206 | data_frame=df, 207 | group_column="Agent ID", 208 | count_column="Concern Addressed", 209 | label_names=CONCERN_LABELS, 210 | mode="average", 211 | ), 212 | ), 213 | vm.Graph( 214 | title="Per Agent", 215 | header="Showing percentage of calls", 216 | figure=plot_donut_concerns( 217 | data_frame=df, 218 | group_column="Agent ID", 219 | 
count_column="Concern Addressed", 220 | label_names=CONCERN_LABELS, 221 | mode="comparison", 222 | ), 223 | footer="(The Agent ID is shown inside each donut)", 224 | ), 225 | ], 226 | ), 227 | vm.Container( 228 | title="Absolute", 229 | layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)), 230 | components=[ 231 | vm.Graph( 232 | title="Average Across Agents", 233 | header="Showing actual number of calls", 234 | figure=plot_bar_concerns( 235 | data_frame=df, 236 | group_column="Agent ID", 237 | mode="average", 238 | ), 239 | ), 240 | vm.Graph( 241 | title="Per Agent", 242 | header="Showing actual number of calls", 243 | figure=plot_bar_concerns( 244 | data_frame=df, 245 | group_column="Agent ID", 246 | mode="comparison", 247 | ), 248 | ), 249 | ], 250 | ), 251 | ], 252 | ) 253 | 254 | quality_scores_container = make_tabs_with_title( 255 | title="Quality Scores", 256 | tabs=[ 257 | vm.Container( 258 | title="Absolute", 259 | layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)), 260 | components=[ 261 | vm.Graph( 262 | title="Average Across Agents", 263 | header="Showing actual score", 264 | figure=plot_radar_quality(df, "average"), 265 | ), 266 | vm.Graph( 267 | title="Per Agent", 268 | header="Showing actual score", 269 | figure=plot_radar_quality(df, "comparison"), 270 | footer="(View the tooltips to see the Agent ID)", 271 | ), 272 | ], 273 | ), 274 | vm.Container( 275 | title="Comparison", 276 | layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)), 277 | components=[ 278 | vm.Graph( 279 | title="Average Across Agents", 280 | header="Showing actual score", 281 | figure=plot_bar_quality(df, "average"), 282 | ), 283 | vm.Graph( 284 | title="Per Agent", 285 | header="Showing actual score", 286 | figure=plot_bar_quality(df, "comparison"), 287 | ), 288 | ], 289 | ), 290 | ], 291 | ) 292 | 293 | effective_communication_container = vm.Container( 294 | title="Effective Communication", 295 | layout=vm.Grid(grid=[[0, 1]], 
row_min_height=px(MIN_ROW_HEIGHT)), 296 | collapsed=False, 297 | components=[ 298 | vm.Graph( 299 | title="Average Across Agents", 300 | header="Showing actual score", 301 | figure=plot_box_communication(data_frame=df, mode="average"), 302 | ), 303 | vm.Graph( 304 | title="Per Agent", 305 | header="Showing actual score", 306 | figure=plot_box_communication(data_frame=df, mode="comparison"), 307 | ), 308 | ], 309 | variant="filled", 310 | ) 311 | 312 | transcripts_and_audio_container = vm.Container( 313 | title="Call transcripts", 314 | layout=vm.Flex(gap="40px"), 315 | components=[ 316 | vm.AgGrid( 317 | id="outer_grid", 318 | figure=dash_ag_grid( 319 | id="inner_grid", 320 | data_frame=df[ 321 | [ 322 | "Agent ID", 323 | "Caller ID", 324 | "Topic", 325 | "Summary", 326 | "audio_file", 327 | "text_file", 328 | ] 329 | ], 330 | dashGridOptions={ 331 | "rowSelection": "single", 332 | "suppressRowDeselection": True, 333 | }, 334 | columnState=[ 335 | {"colId": "audio_file", "hide": True}, 336 | {"colId": "text_file", "hide": True}, 337 | ], 338 | columnSize="responsiveSizeToFit", 339 | ), 340 | actions=[ 341 | vm.Action( 342 | function=update_from_selected_row(), 343 | inputs=["inner_grid.selectedRows"], 344 | outputs=["transcript.children", "audio.src"], 345 | ) 346 | ], 347 | ), 348 | vm.Container( 349 | layout=vm.Grid(grid=[[0, 0, 1]]), 350 | components=[ 351 | vm.Card( 352 | id="transcript", 353 | text="Select a row from the above table to see a transcript", 354 | extra={"style": {"height": "450px"}}, 355 | ), 356 | Audio(id="audio"), 357 | ], 358 | ), 359 | ], 360 | ) 361 | 362 | call_center_summary_page = vm.Page( 363 | title="Call Center Summary", 364 | layout=vm.Flex(gap="20px"), 365 | components=[ 366 | kpi_container, 367 | call_summary_container, 368 | upsales_container, 369 | concerns_container, 370 | quality_scores_container, 371 | effective_communication_container, 372 | ], 373 | controls=[ 374 | vm.Filter(column="Agent ID", 
selector=vm.Dropdown(title="Agent ID")), 375 | vm.Filter(column="Caller ID", selector=vm.Dropdown(title="Caller ID")), 376 | vm.Filter(column="Client Tone"), 377 | vm.Filter( 378 | column="Effective Communication", 379 | selector=vm.RangeSlider(title="Effective Communication Score", step=1), 380 | ), 381 | vm.Filter(column="Caller City", selector=vm.Dropdown(title="Caller City")), 382 | ], 383 | ) 384 | 385 | call_transcripts_page = vm.Page( 386 | title="Call Transcripts", 387 | components=[transcripts_and_audio_container], 388 | controls=[ 389 | vm.Filter(column="Agent ID", selector=vm.Dropdown(title="Agent ID")), 390 | vm.Filter(column="Caller ID", selector=vm.Dropdown(title="Caller ID")), 391 | ], 392 | ) 393 | 394 | dashboard = vm.Dashboard(pages=[call_center_summary_page, call_transcripts_page]) 395 | 396 | app = Vizro().build(dashboard) 397 | 398 | if __name__ == "__main__": 399 | app.run() 400 | -------------------------------------------------------------------------------- /vizro/assets/vizro_dashboard_styles.css: -------------------------------------------------------------------------------- 1 | audio::-webkit-media-controls-panel, audio::-webkit-media-controls-enclosure { 2 | border-radius: 0; 3 | background: var(--surfaces-bg-card); 4 | } 5 | 6 | #outer_grid { 7 | width: unset; 8 | } 9 | 10 | #transcript { 11 | line-height: unset; 12 | } -------------------------------------------------------------------------------- /vizro/custom_charts.py: -------------------------------------------------------------------------------- 1 | """Custom charts for Vizro dashboard. 
2 | """ 3 | 4 | import math 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import plotly.graph_objects as go 9 | from plotly.subplots import make_subplots 10 | 11 | import vizro.plotly.express as px 12 | from vizro.models.types import capture 13 | 14 | CONCERN_LABELS = ["Concerns Not Addressed", "Concerns Addressed"] 15 | UPSALE_LABELS = ["Failed Upsales", "No Upsale Attempted", "Successful Upsales"] 16 | 17 | 18 | @capture("graph") 19 | def plot_donut_concerns( 20 | data_frame: pd.DataFrame, 21 | group_column: str, 22 | count_column: str, 23 | label_names: list[str], 24 | mode: str, 25 | ) -> go.Figure: 26 | """Create a donut chart for concerns addressed, by agent or average. 27 | 28 | Args: 29 | data_frame (pd.DataFrame): Input data containing agent and concern columns. 30 | group_column (str): Column name for grouping (e.g., agent ID). 31 | count_column (str): Column name for concern addressed (boolean). 32 | label_names (list[str]): List of label names for the donut chart. 33 | mode (str): 'comparison' for agent subplots, 'average' for overall. 34 | 35 | Returns: 36 | go.Figure: Plotly Figure object representing the donut chart(s). 
37 | """ 38 | if mode == "comparison": 39 | 40 | agent_count = data_frame[group_column].nunique() 41 | 42 | num_rows = math.ceil(agent_count / 4) 43 | num_cols = 4 44 | 45 | fig = make_subplots( 46 | rows=num_rows, 47 | cols=num_cols, 48 | subplot_titles=None, 49 | horizontal_spacing=0.08, 50 | vertical_spacing=0.02, 51 | specs=[[{"type": "pie"}] * num_cols for _ in range(num_rows)], 52 | ) 53 | 54 | agent_list = data_frame[group_column].unique().tolist() 55 | 56 | for i in range(0, len(agent_list)): 57 | chart_data = data_frame.copy() 58 | chart_data = chart_data[chart_data[group_column] == agent_list[i]] 59 | 60 | counts = chart_data[count_column].value_counts() 61 | labels = label_names 62 | 63 | chart_data = pd.DataFrame( 64 | { 65 | "Labels": labels, 66 | "Counts": [counts.get(False, 0), counts.get(True, 0)], 67 | } 68 | ) 69 | 70 | chart_data.sort_values(by="Labels", ascending=True, inplace=True) 71 | 72 | labels = chart_data["Labels"] 73 | values = chart_data["Counts"] 74 | 75 | color_discrete_map = { 76 | "Concerns Addressed": "#00b4ff", 77 | "Concerns Not Addressed": "#ff9222", 78 | } 79 | colors = [color_discrete_map[label] for label in labels] 80 | 81 | fig.add_trace( 82 | go.Pie( 83 | labels=labels, 84 | values=values, 85 | hole=0.6, 86 | title=str(agent_list[i]), 87 | marker=dict(colors=colors), 88 | sort=False, 89 | hovertemplate="Category: %{label}
Count: %{value}
Percent: %{percent}", 90 | ), 91 | row=i // num_cols + 1, 92 | col=i % num_cols + 1, 93 | ) 94 | 95 | fig.update_traces( 96 | textposition="outside", 97 | textinfo="percent+label", 98 | opacity=0.9, 99 | ) 100 | 101 | fig.update_traces(textinfo="none") 102 | 103 | fig.update_layout( 104 | margin_t=0, margin_b=0, margin_l=0, margin_r=0, showlegend=False 105 | ) 106 | 107 | if mode == "average": 108 | chart_data = data_frame.copy() 109 | 110 | counts = chart_data[count_column].value_counts() 111 | labels = label_names 112 | 113 | chart_data = pd.DataFrame( 114 | {"Labels": labels, "Counts": [counts.get(False, 0), counts.get(True, 0)]} 115 | ) 116 | 117 | chart_data.sort_values(by="Labels", ascending=True, inplace=True) 118 | 119 | labels = chart_data["Labels"] 120 | values = chart_data["Counts"] 121 | 122 | color_discrete_map = { 123 | "Concerns Addressed": "#00b4ff", 124 | "Concerns Not Addressed": "#ff9222", 125 | } 126 | colors = [color_discrete_map[label] for label in labels] 127 | 128 | fig = go.Figure() 129 | 130 | fig.add_trace( 131 | go.Pie( 132 | labels=labels, 133 | values=values, 134 | hole=0.6, 135 | marker=dict(colors=colors), 136 | sort=False, 137 | hovertemplate="Category: %{label}
Count: %{value}
Percent: %{percent}", 138 | ) 139 | ) 140 | 141 | fig.update_layout(margin_t=0, margin_b=0, margin_l=0, margin_r=0) 142 | fig.update_traces(textposition="outside", textinfo="percent", opacity=0.9) 143 | 144 | return fig 145 | 146 | 147 | @capture("graph") 148 | def plot_donut_upsales( 149 | data_frame: pd.DataFrame, 150 | group_column: str, 151 | mode: str, 152 | ) -> go.Figure: 153 | """Create a donut chart for upsales outcomes, by agent or average. 154 | 155 | Args: 156 | data_frame (pd.DataFrame): Input data containing agent and upsale columns. 157 | group_column (str): Column name for grouping (e.g., agent ID). 158 | mode (str): 'comparison' for agent subplots, 'average' for overall. 159 | 160 | Returns: 161 | go.Figure: Plotly Figure object representing the donut chart(s). 162 | """ 163 | color_discrete_map = { 164 | "Failed Upsales": "#FF9222", 165 | "No Upsale Attempted": "#3949AB", 166 | "Successful Upsales": "#00B4FF", 167 | } 168 | 169 | labels = ["Failed Upsales", "No Upsale Attempted", "Successful Upsales"] 170 | 171 | if mode == "comparison": 172 | 173 | agent_count = data_frame[group_column].nunique() 174 | 175 | num_rows = math.ceil(agent_count / 4) 176 | num_cols = 4 177 | 178 | fig = make_subplots( 179 | rows=num_rows, 180 | cols=num_cols, 181 | subplot_titles=None, 182 | horizontal_spacing=0.08, 183 | vertical_spacing=0.02, 184 | specs=[[{"type": "pie"}] * num_cols for _ in range(num_rows)], 185 | ) 186 | 187 | agent_list = data_frame[group_column].unique().tolist() 188 | 189 | for i in range(0, len(agent_list)): 190 | 191 | chart_data = data_frame.copy() 192 | upsale_outcomes = chart_data[chart_data[group_column] == agent_list[i]] 193 | 194 | upsale_outcomes = ( 195 | upsale_outcomes.groupby(["Upsale Attempted", "Upsale Success"]) 196 | .size() 197 | .reset_index(name="counts") 198 | ) 199 | 200 | def categorize(row: pd.Series) -> str: 201 | """Categorize upsale outcome for a row. 
202 | 203 | Args: 204 | row (pd.Series): Row of DataFrame with 'Upsale Attempted' and 'Upsale Success'. 205 | Returns: 206 | str: Category label for the upsale outcome. 207 | """ 208 | if not row["Upsale Attempted"]: 209 | return "No Upsale Attempted" 210 | elif row["Upsale Success"]: 211 | return "Successful Upsales" 212 | else: 213 | return "Failed Upsales" 214 | 215 | upsale_outcomes["category"] = upsale_outcomes.apply(categorize, axis=1) 216 | 217 | counts = upsale_outcomes["category"].value_counts() 218 | 219 | chart_data = pd.DataFrame( 220 | { 221 | "Labels": labels, 222 | "Counts": [ 223 | counts.get("Failed Upsales", 0), 224 | counts.get("No Upsale Attempted", 0), 225 | counts.get("Successful Upsales", 0), 226 | ], 227 | } 228 | ) 229 | 230 | chart_data.sort_values(by="Labels", ascending=True, inplace=True) 231 | 232 | labels = chart_data["Labels"] 233 | values = chart_data["Counts"] 234 | 235 | colors = [color_discrete_map[label] for label in labels] 236 | 237 | fig.add_trace( 238 | go.Pie( 239 | labels=labels, 240 | values=values, 241 | hole=0.6, 242 | title=str(agent_list[i]), 243 | marker=dict(colors=colors), 244 | sort=False, 245 | hovertemplate="Category: %{label}
Count: %{value}
Percent: %{percent}", 246 | ), 247 | row=i // num_cols + 1, 248 | col=i % num_cols + 1, 249 | ) 250 | 251 | fig.update_traces( 252 | textposition="outside", 253 | textinfo="percent+label", 254 | opacity=0.9, 255 | ) 256 | 257 | fig.update_traces(textinfo="none") 258 | 259 | fig.update_layout( 260 | margin_t=0, margin_b=0, margin_l=0, margin_r=0, showlegend=False 261 | ) 262 | 263 | if mode == "average": 264 | 265 | upsale_outcomes = data_frame.copy() 266 | 267 | labels = ["Failed Upsales", "No Upsale Attempted", "Successful Upsales"] 268 | 269 | upsale_outcomes = ( 270 | upsale_outcomes.groupby(["Upsale Attempted", "Upsale Success"]) 271 | .size() 272 | .reset_index(name="counts") 273 | ) 274 | 275 | def categorize(row: pd.Series) -> str: 276 | """Categorize upsale outcome for a row. 277 | 278 | Args: 279 | row (pd.Series): Row of DataFrame with 'Upsale Attempted' and 'Upsale Success'. 280 | Returns: 281 | str: Category label for the upsale outcome. 282 | """ 283 | if not row["Upsale Attempted"]: 284 | return "No Upsale Attempted" 285 | elif row["Upsale Success"]: 286 | return "Successful Upsales" 287 | else: 288 | return "Failed Upsales" 289 | 290 | upsale_outcomes["category"] = upsale_outcomes.apply(categorize, axis=1) 291 | category_counts = ( 292 | upsale_outcomes.groupby("category")["counts"].sum().reset_index() 293 | ) 294 | 295 | counts = dict(zip(category_counts["category"], category_counts["counts"])) 296 | 297 | chart_data = pd.DataFrame( 298 | { 299 | "Labels": labels, 300 | "Counts": [ 301 | counts.get("Failed Upsales", 0), 302 | counts.get("No Upsale Attempted", 0), 303 | counts.get("Successful Upsales", 0), 304 | ], 305 | } 306 | ) 307 | 308 | chart_data.sort_values(by="Labels", ascending=True, inplace=True) 309 | 310 | labels = chart_data["Labels"] 311 | values = chart_data["Counts"] 312 | 313 | colors = [color_discrete_map[label] for label in labels] 314 | 315 | fig = go.Figure() 316 | 317 | fig.add_trace( 318 | go.Pie( 319 | labels=labels, 320 | 
values=values, 321 | hole=0.6, 322 | marker=dict(colors=colors), 323 | sort=False, 324 | hovertemplate="Category: %{label}
Count: %{value}
@capture("graph")
def plot_bar_concerns(
    data_frame: pd.DataFrame,
    group_column: str,
    mode: str,
) -> go.Figure:
    """Create a bar chart for concerns addressed, by agent or average.

    Args:
        data_frame (pd.DataFrame): Input data containing agent and concern columns.
        group_column (str): Column name for grouping (e.g., agent ID).
        mode (str): 'comparison' for per-agent stacked bars, 'average' for overall.

    Returns:
        go.Figure: Plotly Figure object representing the bar chart(s).

    Raises:
        ValueError: If ``mode`` is neither 'comparison' nor 'average'.
    """
    color_discrete_map = {
        "Concerns Addressed": "#00b4ff",
        "Concerns Not Addressed": "#ff9222",
    }
    # The original passed category_orders={"category": [...]}, but the color
    # column here is named "Concern Addressed", so the ordering was silently
    # ignored. Keyed correctly, the stacking order now takes effect.
    category_orders = {
        "Concern Addressed": ["Concerns Not Addressed", "Concerns Addressed"]
    }

    def _count_outcomes(df: pd.DataFrame) -> pd.DataFrame:
        """Count rows per outcome label; returns columns 'Concern Addressed' and 'counts'."""
        chart_data = df.copy()
        chart_data["Concern Addressed"] = chart_data["Concern Addressed"].replace(
            {True: "Concerns Addressed", False: "Concerns Not Addressed"}
        )
        # A single groupby suffices; the original grouped twice on the same key.
        return (
            chart_data.groupby("Concern Addressed").size().reset_index(name="counts")
        )

    if mode == "comparison":
        agent_list = data_frame[group_column].unique().tolist()
        per_agent = []
        for i, agent in enumerate(agent_list):
            counts = _count_outcomes(data_frame[data_frame[group_column] == agent])
            counts["agent_id"] = i  # numeric slot on the x axis; tick text shows the real ID
            per_agent.append(counts)
        data = (
            pd.concat(per_agent)
            if per_agent
            else pd.DataFrame(columns=["Concern Addressed", "counts", "agent_id"])
        )

        fig = px.bar(
            data,
            x="agent_id",
            y="counts",
            color="Concern Addressed",
            title="",
            color_discrete_map=color_discrete_map,
            category_orders=category_orders,
        )
        fig.update_layout(
            showlegend=False,
            xaxis=dict(
                tickmode="array",
                tickvals=list(range(len(agent_list))),
                ticktext=agent_list,
            ),
            xaxis_title="Agent ID",
            yaxis_title=None,
        )
        fig.update_traces(hovertemplate="Category: %{fullData.name}<br>Count: %{y}")
        return fig

    if mode == "average":
        category_counts = _count_outcomes(data_frame)
        category_counts["PLACEHOLDER"] = 1  # single dummy row so the bar has a y position

        fig = px.bar(
            category_counts,
            y="PLACEHOLDER",
            x="counts",
            color="Concern Addressed",
            title="",
            orientation="h",
            text="counts",
            color_discrete_map=color_discrete_map,
            category_orders=category_orders,
        )
        fig.update_layout(
            xaxis=dict(visible=False),
            yaxis=dict(visible=False),
            showlegend=True,
            legend_title=None,
            margin=dict(t=60),
        )
        fig.update_traces(
            textposition="inside",
            insidetextanchor="middle",
            width=0.2,
            hovertemplate="Category: %{fullData.name}<br>Count: %{x}",
        )
        return fig

    # Previously an unknown mode fell through and raised UnboundLocalError.
    raise ValueError(f"Unknown mode: {mode!r}; expected 'comparison' or 'average'.")
@capture("graph")
def plot_bar_upsales(
    data_frame: pd.DataFrame,
    group_column: str,
    mode: str,
) -> go.Figure:
    """Create a bar chart for upsales outcomes, by agent or average.

    Args:
        data_frame (pd.DataFrame): Input data containing agent and upsale columns.
        group_column (str): Column name for grouping (e.g., agent ID).
        mode (str): 'comparison' for per-agent stacked bars, 'average' for overall.

    Returns:
        go.Figure: Plotly Figure object representing the bar chart(s).

    Raises:
        ValueError: If ``mode`` is neither 'comparison' nor 'average'.
    """
    color_discrete_map = {
        "Failed Upsales": "#FF9222",
        "No Upsale Attempted": "#3949AB",
        "Successful Upsales": "#00B4FF",
    }
    category_orders = {
        "category": [
            "Successful Upsales",
            "No Upsale Attempted",
            "Failed Upsales",
        ]
    }

    def categorize(row: pd.Series) -> str:
        """Map an (Upsale Attempted, Upsale Success) pair to its outcome label."""
        if not row["Upsale Attempted"]:
            return "No Upsale Attempted"
        if row["Upsale Success"]:
            return "Successful Upsales"
        return "Failed Upsales"

    def _count_outcomes(df: pd.DataFrame) -> pd.DataFrame:
        """Count rows per outcome; returns columns 'category' and 'counts'."""
        outcomes = (
            df.groupby(["Upsale Attempted", "Upsale Success"])
            .size()
            .reset_index(name="counts")
        )
        outcomes["category"] = outcomes.apply(categorize, axis=1)
        return outcomes.groupby("category")["counts"].sum().reset_index()

    if mode == "comparison":
        agent_list = data_frame[group_column].unique().tolist()
        per_agent = []
        for i, agent in enumerate(agent_list):
            counts = _count_outcomes(data_frame[data_frame[group_column] == agent])
            counts["agent_id"] = i
            per_agent.append(counts)
        data = (
            pd.concat(per_agent)
            if per_agent
            else pd.DataFrame(columns=["category", "counts", "agent_id"])
        )

        fig = px.bar(
            data,
            x="agent_id",
            y="counts",
            color="category",
            title="",
            color_discrete_map=color_discrete_map,
            category_orders=category_orders,
        )
        fig.update_traces(hovertemplate="Category: %{fullData.name}<br>Count: %{y}")
        return fig

    if mode == "average":
        category_counts = _count_outcomes(data_frame)
        category_counts["PLACEHOLDER"] = 1  # single dummy row so the bar has a y position

        fig = px.bar(
            category_counts,
            y="PLACEHOLDER",
            x="counts",
            color="category",
            title="",
            orientation="h",
            text="counts",
            color_discrete_map=color_discrete_map,
            category_orders=category_orders,
        )
        fig.update_traces(
            # BUG FIX: the bar is horizontal, so counts are on x. The original
            # hovered "Count: %{y}", which displayed the PLACEHOLDER value 1.
            hovertemplate="Category: %{fullData.name}<br>Count: %{x}",
            textposition="inside",
            insidetextanchor="middle",
            width=0.2,
        )
        fig.update_layout(
            xaxis=dict(visible=False),
            yaxis=dict(visible=False),
            showlegend=True,
            legend_title=None,
        )
        return fig

    raise ValueError(f"Unknown mode: {mode!r}; expected 'comparison' or 'average'.")
@capture("graph")
def plot_radar_quality(
    data_frame: pd.DataFrame,
    mode: str,
) -> go.Figure:
    """Create a radar (polar) chart for agent communication quality metrics.

    Args:
        data_frame (pd.DataFrame): Input data with agent communication metrics.
        mode (str): 'comparison' for agent subplots, 'average' for overall.

    Returns:
        go.Figure: Plotly Figure object representing the radar chart(s).

    Raises:
        ValueError: If ``mode`` is neither 'comparison' nor 'average'.
    """
    colors = ["#00B4FF", "#FF9222", "#3949AB", "#FF5267", "#08BDBA", "#FDC935"]

    melted_df = pd.melt(
        data_frame.copy(),
        id_vars=["Agent ID"],
        value_vars=[
            "Empathy",
            "Professionalism",
            "Kindness",
            "Effective Communication",
            "Active Listening",
        ],
        var_name="Communication Metric",
        value_name="Value",
    )
    # Mean score per (agent, metric); both modes derive from this table.
    grouped_avg_df = melted_df.groupby(
        ["Agent ID", "Communication Metric"], as_index=False
    )["Value"].mean()

    if mode == "comparison":
        num_cols = 4
        num_rows = math.ceil(data_frame["Agent ID"].nunique() / num_cols)
        fig = make_subplots(
            rows=num_rows,
            cols=num_cols,
            subplot_titles=None,
            horizontal_spacing=0.02,
            vertical_spacing=0.02,
            specs=[[{"type": "polar"}] * num_cols for _ in range(num_rows)],
        )

        for i, agent in enumerate(grouped_avg_df["Agent ID"].unique().tolist()):
            chart_data = grouped_avg_df[grouped_avg_df["Agent ID"] == agent]
            fig.add_trace(
                go.Barpolar(
                    r=chart_data["Value"],
                    theta=chart_data["Communication Metric"],
                    marker_color=colors,
                    hovertemplate=f"Agent ID: {agent}<br>Metric: %{{theta}}<br>Score: %{{r}}",
                ),
                row=i // num_cols + 1,
                col=i % num_cols + 1,
            )

        # Hide axes/grids on every polar subplot. NOTE(review): relies on
        # plotly accepting "polar1" as an alias for the first "polar" layout
        # key — confirm against the plotly version in use.
        for i in range(num_rows * num_cols):
            fig.update_layout(
                **{
                    f"polar{i + 1}": dict(
                        radialaxis=dict(visible=False, showgrid=False),
                        angularaxis=dict(visible=False, showgrid=False),
                        bgcolor="rgba(0, 0, 0, 0)",
                    )
                }
            )
        fig.update_layout(
            showlegend=False,
            paper_bgcolor="rgba(0, 0, 0, 0)",
            plot_bgcolor="rgba(0, 0, 0, 0)",
        )
        return fig

    if mode == "average":
        # Average the per-agent means across agents. The original rebuilt
        # grouped_avg_df from melted_df here, duplicating the work done above.
        overall_avg = grouped_avg_df.groupby(
            ["Communication Metric"], as_index=False
        )["Value"].mean()

        fig = go.Figure()
        fig.add_trace(
            go.Barpolar(
                r=overall_avg["Value"],
                theta=overall_avg["Communication Metric"],
                marker_color=colors,
                hovertemplate="Metric: %{theta}<br>Score: %{r}",
            )
        )
        fig.update_layout(
            polar=dict(
                angularaxis=dict(),
                radialaxis=dict(
                    dtick=1,
                    showgrid=False,
                ),
                bgcolor="rgba(0, 0, 0, 0)",
            ),
            showlegend=False,
        )
        return fig

    raise ValueError(f"Unknown mode: {mode!r}; expected 'comparison' or 'average'.")
@capture("graph")
def plot_bar_quality(
    data_frame: pd.DataFrame,
    mode: str,
) -> go.Figure:
    """Create a bar (lollipop) chart for agent communication quality metrics.

    Args:
        data_frame (pd.DataFrame): Input data with agent communication metrics.
        mode (str): 'comparison' for agent subplots, 'average' for overall.

    Returns:
        go.Figure: Plotly Figure object representing the chart(s).

    Raises:
        ValueError: If ``mode`` is neither 'comparison' nor 'average'.
    """
    colors = ["#00B4FF", "#FF9222", "#3949AB", "#FF5267", "#08BDBA", "#FDC935"]

    melted_df = pd.melt(
        data_frame.copy(),
        id_vars=["Agent ID"],
        value_vars=[
            "Empathy",
            "Professionalism",
            "Kindness",
            "Effective Communication",
            "Active Listening",
        ],
        var_name="Communication Metric",
        value_name="Value",
    )
    grouped_avg_df = melted_df.groupby(
        ["Agent ID", "Communication Metric"], as_index=False
    )["Value"].mean()

    # Fixed metric -> color mapping (alphabetical, matching groupby's sort, so
    # the 'average' mode keeps its original colors). BUG FIX: the original
    # indexed `colors` with the DataFrame row index from iterrows(), which
    # keeps counting across agents — the same metric got a different color in
    # every subplot, and the legend only matched the first agent.
    metric_order = sorted(melted_df["Communication Metric"].unique())
    metric_colors = {m: colors[j % len(colors)] for j, m in enumerate(metric_order)}

    if mode == "comparison":
        num_cols = 4
        num_rows = math.ceil(data_frame["Agent ID"].nunique() / num_cols)
        fig = make_subplots(
            rows=num_rows,
            cols=num_cols,
            subplot_titles=None,
            horizontal_spacing=0.04,
            vertical_spacing=0.02,
            specs=[[{"type": "xy"}] * num_cols for _ in range(num_rows)],
        )

        for i, agent in enumerate(grouped_avg_df["Agent ID"].unique().tolist()):
            row_pos = i // num_cols + 1
            col_pos = i % num_cols + 1
            chart_data = grouped_avg_df[grouped_avg_df["Agent ID"] == agent]
            for _, row in chart_data.iterrows():
                metric = row["Communication Metric"]
                color = metric_colors[metric]
                # Stem of the lollipop.
                fig.add_trace(
                    go.Scatter(
                        x=[metric, metric],
                        y=[0, row["Value"]],
                        mode="lines",
                        line=dict(color=color, width=3),
                        showlegend=False,
                    ),
                    row=row_pos,
                    col=col_pos,
                )
                # Head of the lollipop; legend entries come from the first agent only.
                fig.add_trace(
                    go.Scatter(
                        x=[metric],
                        y=[row["Value"]],
                        mode="markers",
                        marker=dict(color=color, size=8),
                        name=metric if i == 0 else None,
                        showlegend=(i == 0),
                        hovertemplate=f"Agent ID: {agent}<br>Metric: %{{x}}<br>Score: %{{y}}",
                    ),
                    row=row_pos,
                    col=col_pos,
                )
            fig.update_xaxes(
                showgrid=False,
                visible=True,
                showticklabels=False,
                ticks="",
                title=dict(text=str(agent), font=dict(size=10), standoff=2),
                row=row_pos,
                col=col_pos,
                zeroline=True,
            )
            fig.update_yaxes(
                showgrid=False,
                visible=False,
                zeroline=False,
                row=row_pos,
                col=col_pos,
            )

        fig.update_layout(
            showlegend=False,
            paper_bgcolor="rgba(0, 0, 0, 0)",
            plot_bgcolor="rgba(0, 0, 0, 0)",
            margin=dict(t=10),
        )
        return fig

    if mode == "average":
        # Average the per-agent means across agents (the original rebuilt the
        # per-agent table from melted_df here; that recompute was redundant).
        overall_avg = grouped_avg_df.groupby(
            ["Communication Metric"], as_index=False
        )["Value"].mean()

        fig = go.Figure()
        for _, row in overall_avg.iterrows():
            metric = row["Communication Metric"]
            fig.add_trace(
                go.Bar(
                    y=[row["Value"]],
                    x=[metric],
                    name=metric,
                    marker=dict(color=metric_colors[metric]),
                    text=[round(row["Value"], 1)],
                    textposition="inside",
                    hovertemplate="Metric: %{x}<br>Score: %{y}",
                    width=0.6,
                )
            )
        fig.update_layout(
            showlegend=True,
            paper_bgcolor="rgba(0, 0, 0, 0)",
            plot_bgcolor="rgba(0, 0, 0, 0)",
            xaxis=dict(
                showgrid=False,
                visible=True,
                zeroline=True,
                zerolinecolor="rgba(150,150,150,0.7)",
                zerolinewidth=2,
                showticklabels=False,
                ticks="",
            ),
            yaxis=dict(
                showgrid=False,
                visible=False,
            ),
            barmode="group",
        )
        return fig

    raise ValueError(f"Unknown mode: {mode!r}; expected 'comparison' or 'average'.")
@capture("graph")
def plot_box_communication(
    data_frame: pd.DataFrame,
    mode: str,
) -> go.Figure:
    """Box plot of 'Effective Communication' scores, per agent or overall.

    Args:
        data_frame (pd.DataFrame): Input data with agent and communication scores.
        mode (str): 'comparison' for one box per agent, 'average' for a single box.

    Returns:
        go.Figure: Plotly Figure object representing the box plot(s).
    """
    scores = data_frame.loc[:, ["Agent ID", "Effective Communication"]].copy()
    # Dummy constant column: gives the single 'average' box a y position.
    scores["PLACEHOLDER"] = 1

    if mode == "comparison":
        fig = px.box(scores, x="Agent ID", y="Effective Communication")
        fig.update_layout(xaxis=dict(tickvals=scores["Agent ID"], tickangle=90))
    if mode == "average":
        fig = px.box(
            scores, y="PLACEHOLDER", x="Effective Communication", orientation="h"
        )
        fig.update_layout(
            yaxis=dict(range=[0, 2], visible=False), boxmode="group", bargap=0.5
        )
    return fig
@capture("graph")
def plot_map_call_locations(
    data_frame: pd.DataFrame,
) -> go.Figure:
    """Create a map of call locations with bubble size by call count.

    Args:
        data_frame (pd.DataFrame): Input data with city, latitude, longitude, and call info.

    Returns:
        go.Figure: Plotly Figure object representing the map.
    """
    aggregated_df = (
        data_frame.groupby(["Caller City", "latitude", "longitude"])
        .agg(
            Call_Count=("Caller ID", "count"),
            # BUG FIX: count *distinct* agents per city. The original used
            # "count", which is the row count and merely duplicated
            # Call_Count in the "Agents" hover label.
            Agent_IDs=("Agent ID", "nunique"),
            Caller_Count=("Caller ID", "nunique"),
        )
        .reset_index()
    )

    # Scale bubble sizes linearly between min_size and max_size.
    min_size, max_size = 10, 50
    populations = aggregated_df["Call_Count"]
    if populations.min() == populations.max():
        # np.interp requires increasing sample points; with a single distinct
        # call count, give every city the same mid-sized bubble.
        sizes = np.full(len(populations), (min_size + max_size) / 2)
    else:
        sizes = np.interp(
            populations,
            (populations.min(), populations.max()),
            (min_size, max_size),
        )

    fig = go.Figure(
        go.Scattergeo(
            lat=aggregated_df["latitude"],
            lon=aggregated_df["longitude"],
            mode="markers",
            marker=dict(
                size=sizes,
                color="#00B4FF",
                opacity=0.6,
                line=dict(width=0),
            ),
            hovertemplate=(
                "City: %{text}<br>Calls: %{customdata[0]:,}"
                "<br>Agents: %{customdata[1]:,}<br>Callers: %{customdata[2]:,}"
            ),
            customdata=aggregated_df[["Call_Count", "Agent_IDs", "Caller_Count"]],
            text=aggregated_df["Caller City"],
        )
    )
    fig.update_geos(
        visible=False,
        resolution=110,
        scope="usa",
        showcountries=True,
        countrycolor="rgb(150, 150, 150)",
        showsubunits=True,
        subunitcolor="rgb(150, 150, 150)",
    )
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0}, showlegend=False)
    return fig
@capture("graph")
def plot_line_calls_over_time(
    data_frame: pd.DataFrame,
) -> go.Figure:
    """Create a line chart of the number of calls per month.

    Args:
        data_frame (pd.DataFrame): Input data with call dates.

    Returns:
        go.Figure: Plotly Figure object representing the line chart.
    """
    # Bucket calls into monthly periods and count rows per bucket.
    monthly = (
        data_frame.groupby(data_frame["Call Date"].dt.to_period("M"))
        .size()
        .reset_index(name="Count")
    )
    monthly["TickLabel"] = monthly["Call Date"].dt.strftime("%b %y")
    monthly["Call Date"] = monthly["Call Date"].dt.strftime("%Y-%m")

    trace = go.Scatter(
        x=monthly["Call Date"],
        y=monthly["Count"],
        mode="lines+markers+text",
        text=monthly["Count"],
        textposition="top center",
        hovertemplate="Month: %{x}<br>Count: %{y}",
        marker=dict(size=6, color="#00B4FF"),
        line=dict(color="#00B4FF", width=2),
        showlegend=False,
        cliponaxis=False,
    )
    fig = go.Figure(trace)
    fig.update_layout(
        showlegend=False,
        title=None,
        yaxis=dict(visible=False),
        xaxis=dict(
            title=None,
            tickangle=90,
            tickmode="array",
            tickvals=monthly["Call Date"],
            ticktext=monthly["TickLabel"],
            tickfont=dict(size=12),
            showgrid=False,
        ),
        margin=dict(t=10, b=60),
    )
    return fig
@capture("graph")
def plot_butterfly_upsales_concerns(
    data_frame: pd.DataFrame,
) -> go.Figure:
    """Create a butterfly chart comparing upsales and concerns addressed percentages per month.

    Upsales success rates point up; concerns-addressed rates are negated so
    they hang below the shared zero line.

    Args:
        data_frame (pd.DataFrame): Input data with call dates, upsale, and concern columns.

    Returns:
        go.Figure: Plotly Figure object representing the butterfly chart.
    """
    df = data_frame.copy()
    df["Month"] = df["Call Date"].dt.to_period("M")

    # % of attempted upsales that succeeded, per month.
    upsales = (
        df[df["Upsale Attempted"]]
        .groupby("Month")["Upsale Success"]
        .mean()
        .reset_index()
    )
    upsales["Metric"] = "Upsales Success"
    upsales["Value"] = upsales["Upsale Success"] * 100

    # % of concerns addressed per month, negated to draw below the axis.
    concerns = df.groupby("Month")["Concern Addressed"].mean().reset_index()
    concerns["Metric"] = "Concerns Addressed"
    concerns["Value"] = -concerns["Concern Addressed"] * 100

    plot_df = pd.concat(
        [upsales[["Month", "Metric", "Value"]], concerns[["Month", "Metric", "Value"]]]
    )
    plot_df["MonthLabel"] = plot_df["Month"].dt.strftime("%b %y")

    # One row per month. pivot() sorts its index, so the pre-pivot sorts the
    # original performed were redundant; one explicit sort keeps intent clear.
    pivot_df = plot_df.pivot(
        index=["Month", "MonthLabel"], columns="Metric", values="Value"
    ).reset_index()
    pivot_df = pivot_df.sort_values("Month")
    month_labels = pivot_df["MonthLabel"]

    # A metric can be entirely absent (e.g. no upsale ever attempted) or have
    # month gaps. BUG FIX: plain column assignment replaces the original
    # chained `pivot_df[col].fillna(value=0, inplace=True)`, which is
    # deprecated and a silent no-op under pandas copy-on-write.
    for col in ("Upsales Success", "Concerns Addressed"):
        if col not in pivot_df.columns:
            pivot_df[col] = 0.0
        else:
            pivot_df[col] = pivot_df[col].fillna(0)
    upsales_y = pivot_df["Upsales Success"]
    concerns_y = pivot_df["Concerns Addressed"]

    fig = go.Figure()
    fig.add_traces(
        [
            go.Bar(
                x=month_labels,
                y=upsales_y,
                name="% Upsales Success",
                marker_color="#00B4FF",
                text=[f"{int(round(v))}%" if v != 0 else "" for v in upsales_y],
                textposition="inside",
                insidetextanchor="start",
                # NOTE(review): 2pt text is nearly unreadable — confirm intended.
                textfont=dict(size=2, color="white"),
                textangle=90,
                offsetgroup=1,
                cliponaxis=False,
                width=0.6,
                hovertemplate="Month: %{x}<br>Upsales Success: %{y:.0f}%",
            ),
            go.Bar(
                x=month_labels,
                y=concerns_y,
                name="% Concerns Addressed",
                marker_color="#FF9222",
                text=[f"{int(round(abs(v)))}%" if v != 0 else "" for v in concerns_y],
                textposition="inside",
                insidetextanchor="end",
                textfont=dict(size=2, color="white"),
                textangle=90,
                offsetgroup=1,
                cliponaxis=False,
                width=0.6,
                # Hover shows the positive magnitude, not the negated y value.
                hovertemplate="Month: %{x}<br>Concerns Addressed: %{customdata:.0f}%",
                customdata=[abs(v) for v in concerns_y],
            ),
        ]
    )
    fig.update_layout(
        barmode="relative",
        bargap=0,
        showlegend=False,
        xaxis=dict(
            visible=True,
            showline=False,
            showticklabels=True,
            ticks="",
            showgrid=False,
            zeroline=False,
            tickangle=90,
            tickfont=dict(size=12),
        ),
        yaxis=dict(visible=False),
        margin=dict(t=0, b=0),
    )
    fig.add_hline(y=0, line_width=1, line_color="rgba(150,150,150,0.7)")
    return fig
@capture("action")
def update_from_selected_row(
    selected_rows: Sequence[dict[str, Any]]
) -> tuple[str, str]:
    """Update transcript and audio from the selected row in the grid.

    Args:
        selected_rows (Sequence[dict[str, Any]]):
            List of selected row dictionaries from the grid, each containing
            'text_file' and 'audio_file' keys.

    Returns:
        tuple[str, str]:
            A tuple containing:
            - The transcript as markdown-formatted string.
            - The audio source as a base64-encoded data URI suitable for HTML audio playback.

    Raises:
        PreventUpdate: If no row is selected, the required files are not found,
            or the files cannot be read.
    """
    # Grids emit an empty selection when a row is deselected; the original
    # raised IndexError here instead of skipping the update.
    if not selected_rows:
        raise PreventUpdate
    selected_row = selected_rows[0]

    text_dir = Path("outputs/anonymized_files")
    audio_dir = Path("outputs/audio_files")
    text_file_path = text_dir / selected_row["text_file"]
    audio_file_path = audio_dir / selected_row["audio_file"]
    # is_file() replaces the original membership test against iterdir(), which
    # scanned both directories on every selection and raised if a directory
    # was missing. The resolve().parent check keeps the traversal protection
    # that the iterdir() comparison provided (row values are grid input, so
    # "../..." names must not escape the output directories).
    if (
        not text_file_path.is_file()
        or not audio_file_path.is_file()
        or text_file_path.resolve().parent != text_dir.resolve()
        or audio_file_path.resolve().parent != audio_dir.resolve()
    ):
        raise PreventUpdate

    # Best effort: any read failure just skips the update.
    try:
        call_transcript = text_file_path.read_text()
    except Exception as e:
        raise PreventUpdate from e
    # NOTE(review): markdown hard line breaks need two trailing spaces; this
    # inserts one, preserved from the original — confirm intended.
    call_transcript = call_transcript.replace("\n", " \n")
    # Bold the speaker name at the start of each line.
    call_transcript = re.sub(r"^(\w+)", r"**\1**", call_transcript, flags=re.MULTILINE)

    try:
        call_audio_src = base64.b64encode(audio_file_path.read_bytes())
    except Exception as e:
        raise PreventUpdate from e
    call_audio_src = f"data:audio/wav;base64,{call_audio_src.decode('utf-8')}"
    return call_transcript, call_audio_src
class Audio(vm.VizroBaseModel):
    """Vizro dashboard component that renders an HTML audio player.

    Used for playback of call recordings or other audio content.
    """

    # Discriminator so Vizro can dispatch on this component type.
    type: Literal["audio"] = "audio"

    def build(self) -> html.Audio:
        """Build the Dash audio element for playback.

        Returns:
            html.Audio: Dash HTML audio component with controls enabled.
        """
        return html.Audio(controls=True, id=self.id)


# Register the custom component so it is accepted in Container.components.
vm.Container.add_type("components", Audio)
def make_tabs_with_title(title: str, tabs: list[vm.Container]) -> vm.Container:
    """Wrap tabbed content in a titled, filled container for the Vizro dashboard.

    Args:
        title (str):
            The title to display above the tabbed content.
        tabs (list[vm.Container]):
            List of vm.Container objects, each representing a tab.

    Returns:
        vm.Container: A filled, expanded container holding the tabs under the title.
    """
    tab_group = vm.Tabs(tabs=tabs)
    return vm.Container(
        title=title,
        components=[tab_group],
        variant="filled",
        collapsed=False,
    )