├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── compose.yaml
├── data
├── en_agents.json
├── en_calls_batch.parquet
├── en_clients.json
├── en_conversations.zip
├── en_dataframe.parquet
├── en_ground_truths.parquet
├── en_metadata.parquet
├── es_agents.json
├── es_calls_batch.parquet
├── es_clients.json
├── es_conversations.zip
├── es_dataframe.parquet
├── es_ground_truths.parquet
├── es_metadata.parquet
└── sqlite.db
├── dev-requirements.txt
├── example_data
├── agents.zip
├── batch_creation
│ └── calls_batch.zip
├── clients.zip
├── conversation_generation
│ ├── conversations.zip
│ ├── ground_truths.zip
│ └── metadata.zip
└── text_to_audio
│ ├── audio_files.zip
│ └── dataframe.zip
├── images
├── call-center-readme.png
└── call-center-workflow.png
├── mlrun.env
├── notebook.ipynb
├── project_setup.py
├── pyproject.toml
├── requirements.txt
├── setup.py
├── src
├── __init__.py
├── calls_analysis
│ ├── __init__.py
│ ├── db_management.py
│ └── postprocessing.py
├── calls_generation
│ ├── __init__.py
│ ├── conversations_generator.py
│ └── skip.py
├── common.py
├── vizro.py
└── workflows
│ ├── __init__.py
│ ├── calls_analysis.py
│ └── calls_generation.py
└── vizro
├── app.py
├── assets
└── vizro_dashboard_styles.css
├── custom_charts.py
└── custom_components.py
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM mlrun/mlrun-gpu:1.7.0
2 |
3 | # Update apt-get to install ffmpeg (support audio file formats):
4 | RUN apt-get update -y
5 | RUN apt-get install ffmpeg -y
6 |
7 | # Install demo requirements:
8 |
9 | RUN pip install transformers==4.44.1
10 | RUN pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118
11 | RUN pip install bitsandbytes==0.41.1 accelerate==0.24.1 datasets==2.14.6 peft==0.5.0 optimum==1.13.2
12 | RUN pip install auto-gptq==0.4.2 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
13 | RUN pip install langchain==0.0.327 openai==0.28.1
14 | RUN pip install git+https://github.com/suno-ai/bark.git
15 | RUN pip install streamlit==1.28.0 st-annotated-text==4.0.1 spacy==3.7.2 librosa==0.10.1 presidio-anonymizer==2.2.34 presidio-analyzer==2.2.34 nltk==3.8.1 flair==0.13.0
16 | RUN python -m spacy download en_core_web_lg
17 | RUN pip install -U SQLAlchemy
18 |
19 | # Align onnxruntime to use gpu:
20 | RUN pip uninstall -y onnxruntime-gpu
21 | RUN pip uninstall -y onnxruntime
22 | RUN pip install onnxruntime-gpu
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 |
2 | PYTHON_INTERPRETER = python3
3 | SHARED_DIR ?= ~/mlrun-data
4 | MLRUN_TAG ?= 1.4.0
5 | HOST_IP ?=$$(ip route get 1.2.3.4 | awk '{print $$7}')
6 | CONDA_ENV ?= mlrun
7 | SHELL=/bin/bash
8 | CONDA_PY_VER ?= 3.9
9 | CONDA_ACTIVATE = source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate
10 |
11 | #################################################################################
12 | # COMMANDS #
13 | #################################################################################
14 |
15 | .PHONY: help
16 | help: ## Display available commands
17 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
18 |
19 | .PHONY: all
20 | all:
21 | $(error please pick a target)
22 |
23 | .PHONY: install-requirements
24 | install-requirements: ## Install all requirements needed for development
25 | $(PYTHON_INTERPRETER) -m pip install -r requirements.txt -r dev-requirements.txt
26 |
27 |
28 | .PHONY: package-wheel
29 | package-wheel: clean ## Build python package wheel
30 | python setup.py bdist_wheel
31 |
32 | .PHONY: clean
33 | clean: ## Clean python package build artifacts
34 | rm -rf build
35 | rm -rf dist
36 | find . -type f -name "*.py[co]" -delete
37 | find . -type d -name "__pycache__" -delete
38 |
39 | .PHONY: fmt
40 | fmt: ## Format the code (using black and isort)
41 | @echo "Running black fmt..."
42 | $(PYTHON_INTERPRETER) -m black src
43 | $(PYTHON_INTERPRETER) -m isort src
44 |
45 | .PHONY: lint
46 | lint: fmt-check flake8 ## Run lint on the code
47 |
48 | .PHONY: fmt-check
49 | fmt-check: ## Format and check the code (using black and isort)
50 | @echo "Running black+isort fmt check..."
51 | $(PYTHON_INTERPRETER) -m black --check --diff src
52 | $(PYTHON_INTERPRETER) -m isort --check --diff src
53 |
54 | .PHONY: flake8
55 | flake8: ## Run flake8 lint
56 | @echo "Running flake8 lint..."
57 | $(PYTHON_INTERPRETER) -m flake8 src
58 |
59 | .PHONY: mlrun-docker
60 | mlrun-docker: ## Start MLRun & Nuclio containers (using Docker compose)
61 | mkdir $(SHARED_DIR) -p
62 | @echo "HOST_IP=$(HOST_IP)" > .env
63 | SHARED_DIR=$(SHARED_DIR) TAG=$(MLRUN_TAG) docker-compose -f compose.yaml up -d
64 | @echo "use docker-compose stop / logs commands to stop or view logs"
65 |
66 | .PHONY: mlrun-api
67 | mlrun-api: ## Run MLRun DB locally (as process)
68 | @echo "Installing MLRun API dependencies ..."
69 | $(PYTHON_INTERPRETER) -m pip install uvicorn~=0.17.0 dask-kubernetes~=0.11.0 apscheduler~=3.6 sqlite3-to-mysql~=1.4
70 | @echo "Starting local mlrun..."
71 | MLRUN_ARTIFACT_PATH=$$(realpath ./artifacts) MLRUN_ENV_FILE= mlrun db -b
72 |
73 | .PHONY: conda-env
74 | conda-env: ## Create a conda environment
75 | @echo "Creating new conda environment $(CONDA_ENV)..."
76 | conda create -n $(CONDA_ENV) -y python=$(CONDA_PY_VER) ipykernel graphviz pip
77 | test -s ./mlrun.env && conda env config vars set -n $(CONDA_ENV) MLRUN_ENV_FILE=$$(realpath ./mlrun.env)
78 | @echo "Installing requirements.txt..."
79 | $(CONDA_ACTIVATE) $(CONDA_ENV); pip install -r requirements.txt
80 | @echo -e "\nTo run mlrun API as a local process type:\n conda activate $(CONDA_ENV) && make mlrun-api"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MLRun's Call Center Demo
2 |
3 |
4 |
5 | This demo showcases how to use LLMs to turn audio files from call center conversations between customers and agents into valuable data, all in a single workflow orchestrated by MLRun.
6 |
7 | MLRun automates the entire workflow, auto-scales resources as needed, and automatically logs and parses values between the different workflow steps.
8 |
9 | By the end of this demo you will see the potential power of LLMs for feature extraction, and how easily you can do this with MLRun!
10 |
11 | This demo uses:
12 | * [**OpenAI's Whisper**](https://openai.com/research/whisper) — To transcribe the audio calls into text.
13 | * [**Flair**](https://flairnlp.github.io/) and [**Microsoft's Presidio**](https://microsoft.github.io/presidio/) — To recognize PII so it can be filtered out.
14 | * [**HuggingFace**](https://huggingface.co/) — The main machine-learning framework to get the model and tokenizer for the features extraction.
15 | * [**MLRun**](https://www.mlrun.org/) — As the orchestrator to operationalize the workflow.
16 |
17 | The demo contains a single [notebook](./notebook.ipynb) that encompasses the entire demo.
18 |
19 |
20 | Most of the functions are imported from [MLRun's function hub](https://docs.mlrun.org/en/stable/runtimes/load-from-hub.html), which contains a wide range of functions that can be used for a variety of use cases. All functions used in the demo include links to their source in the hub. All of the Python source code is under [/src](./src).
21 | Enjoy!
22 |
23 | ___
24 |
25 | ## Installation
26 |
27 | This project can run in different development environments:
28 | * Local computer (using PyCharm, VSCode, Jupyter, etc.)
29 | * Inside GitHub Codespaces
30 | * Other managed Jupyter environments
31 |
32 | ### Install the code and the mlrun client
33 |
34 | To get started, fork this repo into your GitHub account and clone it into your development environment.
35 |
36 | To install the package dependencies (not required in GitHub codespaces) use:
37 |
38 | make install-requirements
39 |
40 | If you prefer to use Conda, use this instead (to create and configure a conda env):
41 |
42 | make conda-env
43 |
44 | > Make sure you open the notebooks and select the `mlrun` conda environment
45 |
46 | ### Install or connect to the MLRun service/cluster
47 |
48 | The MLRun service and computation can run locally (minimal setup) or over a remote Kubernetes environment.
49 |
50 | If your development environment supports Docker and there are sufficient CPU resources, run:
51 |
52 | make mlrun-docker
53 |
54 | > MLRun UI can be viewed in: http://localhost:8060
55 |
56 | If your environment is minimal, run mlrun as a process (no UI):
57 |
58 | [conda activate mlrun &&] make mlrun-api
59 |
60 | For MLRun to run properly you should set your client environment. This is not required when using **codespaces**, the mlrun **conda** environment, or **iguazio** managed notebooks.
61 |
62 | Your environment should include `MLRUN_ENV_FILE=<absolute path to the mlrun .env file
63 | in this repo>`; see [mlrun client setup](https://docs.mlrun.org/en/latest/install/remote.html) instructions for details.
64 |
65 | > Note: You can also use a remote MLRun service (over Kubernetes) instead of starting a local mlrun:
66 | > edit the [mlrun.env](./mlrun.env) and specify its address and credentials.
67 |
--------------------------------------------------------------------------------
/compose.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | init_nuclio:
3 | image: alpine:3.16
4 | command:
5 | - "/bin/sh"
6 | - "-c"
7 | - |
8 | mkdir -p /etc/nuclio/config/platform; \
9 | cat << EOF | tee /etc/nuclio/config/platform/platform.yaml
10 | runtime:
11 | common:
12 | env:
13 | MLRUN_DBPATH: http://${HOST_IP:?err}:8080
14 | local:
15 | defaultFunctionContainerNetworkName: mlrun
16 | defaultFunctionRestartPolicy:
17 | name: always
18 | maxRetryCount: 0
19 | defaultFunctionVolumes:
20 | - volume:
21 | name: mlrun-stuff
22 | hostPath:
23 | path: ${SHARED_DIR:?err}
24 | volumeMount:
25 | name: mlrun-stuff
26 | mountPath: /home/jovyan/data/
27 | logger:
28 | sinks:
29 | myStdoutLoggerSink:
30 | kind: stdout
31 | system:
32 | - level: debug
33 | sink: myStdoutLoggerSink
34 | functions:
35 | - level: debug
36 | sink: myStdoutLoggerSink
37 | EOF
38 | volumes:
39 | - nuclio-platform-config:/etc/nuclio/config
40 |
41 | mlrun-api:
42 | image: "mlrun/mlrun-api:${TAG:-1.1.2}"
43 | ports:
44 | - "8080:8080"
45 | environment:
46 | MLRUN_ARTIFACT_PATH: "${SHARED_DIR}/{{project}}"
47 | # using local storage, meaning files / artifacts are stored locally, so we want to allow access to them
48 | MLRUN_HTTPDB__REAL_PATH: /data
49 | MLRUN_HTTPDB__DATA_VOLUME: "${SHARED_DIR}"
50 | MLRUN_LOG_LEVEL: DEBUG
51 | MLRUN_NUCLIO_DASHBOARD_URL: http://nuclio:8070
52 | MLRUN_HTTPDB__DSN: "sqlite:////data/mlrun.db?check_same_thread=false"
53 | MLRUN_UI__URL: http://localhost:8060
54 | # not running on k8s meaning no need to store secrets
55 | MLRUN_SECRET_STORES__KUBERNETES__AUTO_ADD_PROJECT_SECRETS: "false"
56 | # let mlrun control nuclio resources
57 | MLRUN_HTTPDB__PROJECTS__FOLLOWERS: "nuclio"
58 | volumes:
59 | - "${SHARED_DIR:?err}:/data"
60 | networks:
61 | - mlrun
62 |
63 | mlrun-ui:
64 | image: "mlrun/mlrun-ui:${TAG:-1.1.2}"
65 | ports:
66 | - "8060:8090"
67 | environment:
68 | MLRUN_API_PROXY_URL: http://mlrun-api:8080
69 | MLRUN_NUCLIO_MODE: enable
70 | MLRUN_NUCLIO_API_URL: http://nuclio:8070
71 | MLRUN_NUCLIO_UI_URL: http://localhost:8070
72 | networks:
73 | - mlrun
74 |
75 | nuclio:
76 | image: "quay.io/nuclio/dashboard:${NUCLIO_TAG:-stable-amd64}"
77 | ports:
78 | - "8070:8070"
79 | environment:
80 | NUCLIO_DASHBOARD_EXTERNAL_IP_ADDRESSES: "${HOST_IP:?err}"
81 | volumes:
82 | - /var/run/docker.sock:/var/run/docker.sock
83 | - nuclio-platform-config:/etc/nuclio/config
84 | depends_on:
85 | - init_nuclio
86 | networks:
87 | - mlrun
88 |
89 | volumes:
90 | nuclio-platform-config: {}
91 |
92 | networks:
93 | mlrun:
94 | name: mlrun
95 |
--------------------------------------------------------------------------------
/data/en_agents.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "first_name": "Michael",
4 | "last_name": "Johnson",
5 | "agent_id": "A001"
6 | },
7 | {
8 | "first_name": "Emma",
9 | "last_name": "Williams",
10 | "agent_id": "A002"
11 | },
12 | {
13 | "first_name": "Daniel",
14 | "last_name": "Miller",
15 | "agent_id": "A003"
16 | },
17 | {
18 | "first_name": "Sophia",
19 | "last_name": "Brown",
20 | "agent_id": "A004"
21 | },
22 | {
23 | "first_name": "David",
24 | "last_name": "Davis",
25 | "agent_id": "A005"
26 | },
27 | {
28 | "first_name": "Olivia",
29 | "last_name": "Garcia",
30 | "agent_id": "A006"
31 | },
32 | {
33 | "first_name": "James",
34 | "last_name": "Rodriguez",
35 | "agent_id": "A007"
36 | },
37 | {
38 | "first_name": "Mia",
39 | "last_name": "Martinez",
40 | "agent_id": "A008"
41 | },
42 | {
43 | "first_name": "John",
44 | "last_name": "Hernandez",
45 | "agent_id": "A009"
46 | },
47 | {
48 | "first_name": "Isabella",
49 | "last_name": "Lopez",
50 | "agent_id": "A010"
51 | }
52 | ]
--------------------------------------------------------------------------------
/data/en_calls_batch.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_calls_batch.parquet
--------------------------------------------------------------------------------
/data/en_clients.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "first_name": "Emily",
4 | "last_name": "Smith",
5 | "phone_number": "123-456-7890",
6 | "email": "emilysmith@example.com",
7 | "client_id": "12345"
8 | },
9 | {
10 | "first_name": "John",
11 | "last_name": "Doe",
12 | "phone_number": "098-765-4321",
13 | "email": "johndoe@example.com",
14 | "client_id": "67890"
15 | },
16 | {
17 | "first_name": "Jane",
18 | "last_name": "Doe",
19 | "phone_number": "456-789-0123",
20 | "email": "janedoe@example.com",
21 | "client_id": "23456"
22 | },
23 | {
24 | "first_name": "Robert",
25 | "last_name": "Johnson",
26 | "phone_number": "789-012-3456",
27 | "email": "robertjohnson@example.com",
28 | "client_id": "78901"
29 | },
30 | {
31 | "first_name": "Mary",
32 | "last_name": "Davis",
33 | "phone_number": "012-345-6789",
34 | "email": "marydavis@example.com",
35 | "client_id": "34567"
36 | },
37 | {
38 | "first_name": "James",
39 | "last_name": "Miller",
40 | "phone_number": "987-654-3210",
41 | "email": "jamesmiller@example.com",
42 | "client_id": "89012"
43 | },
44 | {
45 | "first_name": "Patricia",
46 | "last_name": "Wilson",
47 | "phone_number": "654-321-0987",
48 | "email": "patriciawilson@example.com",
49 | "client_id": "45678"
50 | },
51 | {
52 | "first_name": "Michael",
53 | "last_name": "Moore",
54 | "phone_number": "321-098-7654",
55 | "email": "michaelmoore@example.com",
56 | "client_id": "90123"
57 | },
58 | {
59 | "first_name": "Elizabeth",
60 | "last_name": "Taylor",
61 | "phone_number": "234-567-8901",
62 | "email": "elizabethtaylor@example.com",
63 | "client_id": "56789"
64 | },
65 | {
66 | "first_name": "David",
67 | "last_name": "Anderson",
68 | "phone_number": "567-890-1234",
69 | "email": "davidanderson@example.com",
70 | "client_id": "23459"
71 | }
72 | ]
--------------------------------------------------------------------------------
/data/en_conversations.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_conversations.zip
--------------------------------------------------------------------------------
/data/en_dataframe.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_dataframe.parquet
--------------------------------------------------------------------------------
/data/en_ground_truths.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_ground_truths.parquet
--------------------------------------------------------------------------------
/data/en_metadata.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/en_metadata.parquet
--------------------------------------------------------------------------------
/data/es_agents.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "first_name": "Carlos",
4 | "last_name": "Gomez",
5 | "agent_id": "A5432"
6 | },
7 | {
8 | "first_name": "Marta",
9 | "last_name": "Rodriguez",
10 | "agent_id": "B7658"
11 | },
12 | {
13 | "first_name": "Francisco",
14 | "last_name": "Lopez",
15 | "agent_id": "C3421"
16 | },
17 | {
18 | "first_name": "Ana",
19 | "last_name": "Perez",
20 | "agent_id": "D5463"
21 | },
22 | {
23 | "first_name": "Luis",
24 | "last_name": "Martinez",
25 | "agent_id": "E7654"
26 | },
27 | {
28 | "first_name": "Maria",
29 | "last_name": "Hernandez",
30 | "agent_id": "F3214"
31 | },
32 | {
33 | "first_name": "Pedro",
34 | "last_name": "Gonzalez",
35 | "agent_id": "G9876"
36 | },
37 | {
38 | "first_name": "Josefa",
39 | "last_name": "Ramirez",
40 | "agent_id": "H6543"
41 | },
42 | {
43 | "first_name": "Antonio",
44 | "last_name": "Sanchez",
45 | "agent_id": "I4321"
46 | },
47 | {
48 | "first_name": "Isabel",
49 | "last_name": "Torres",
50 | "agent_id": "J7658"
51 | }
52 | ]
--------------------------------------------------------------------------------
/data/es_calls_batch.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_calls_batch.parquet
--------------------------------------------------------------------------------
/data/es_clients.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "first_name": "Carlos",
4 | "last_name": "Gomez",
5 | "phone_number": "678-234-5678",
6 | "email": "CarlosGomez@email.com",
7 | "client_id": "ID001"
8 | },
9 | {
10 | "first_name": "Maria",
11 | "last_name": "Hernandez",
12 | "phone_number": "789-345-6789",
13 | "email": "MariaHernandez@email.com",
14 | "client_id": "ID002"
15 | },
16 | {
17 | "first_name": "Luis",
18 | "last_name": "Rodriguez",
19 | "phone_number": "890-456-7890",
20 | "email": "LuisRodriguez@email.com",
21 | "client_id": "ID003"
22 | },
23 | {
24 | "first_name": "Ana",
25 | "last_name": "Sanchez",
26 | "phone_number": "901-567-8901",
27 | "email": "AnaSanchez@email.com",
28 | "client_id": "ID004"
29 | },
30 | {
31 | "first_name": "Jose",
32 | "last_name": "Martinez",
33 | "phone_number": "012-678-9012",
34 | "email": "JoseMartinez@email.com",
35 | "client_id": "ID005"
36 | },
37 | {
38 | "first_name": "Isabel",
39 | "last_name": "Lopez",
40 | "phone_number": "123-789-0123",
41 | "email": "IsabelLopez@email.com",
42 | "client_id": "ID006"
43 | },
44 | {
45 | "first_name": "Miguel",
46 | "last_name": "Gonzalez",
47 | "phone_number": "234-890-1234",
48 | "email": "MiguelGonzalez@email.com",
49 | "client_id": "ID007"
50 | },
51 | {
52 | "first_name": "Sofia",
53 | "last_name": "Perez",
54 | "phone_number": "345-901-2345",
55 | "email": "SofiaPerez@email.com",
56 | "client_id": "ID008"
57 | },
58 | {
59 | "first_name": "Antonio",
60 | "last_name": "Ramirez",
61 | "phone_number": "456-012-3456",
62 | "email": "AntonioRamirez@email.com",
63 | "client_id": "ID009"
64 | },
65 | {
66 | "first_name": "Carmen",
67 | "last_name": "Torres",
68 | "phone_number": "567-123-4567",
69 | "email": "CarmenTorres@email.com",
70 | "client_id": "ID010"
71 | }
72 | ]
--------------------------------------------------------------------------------
/data/es_conversations.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_conversations.zip
--------------------------------------------------------------------------------
/data/es_dataframe.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_dataframe.parquet
--------------------------------------------------------------------------------
/data/es_ground_truths.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_ground_truths.parquet
--------------------------------------------------------------------------------
/data/es_metadata.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/es_metadata.parquet
--------------------------------------------------------------------------------
/data/sqlite.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/data/sqlite.db
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | pytest~=5.4
2 | black~=24.8
3 | isort~=5.7
4 | flake8~=5.0
5 |
--------------------------------------------------------------------------------
/example_data/agents.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/agents.zip
--------------------------------------------------------------------------------
/example_data/batch_creation/calls_batch.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/batch_creation/calls_batch.zip
--------------------------------------------------------------------------------
/example_data/clients.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/clients.zip
--------------------------------------------------------------------------------
/example_data/conversation_generation/conversations.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/conversation_generation/conversations.zip
--------------------------------------------------------------------------------
/example_data/conversation_generation/ground_truths.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/conversation_generation/ground_truths.zip
--------------------------------------------------------------------------------
/example_data/conversation_generation/metadata.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/conversation_generation/metadata.zip
--------------------------------------------------------------------------------
/example_data/text_to_audio/audio_files.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/text_to_audio/audio_files.zip
--------------------------------------------------------------------------------
/example_data/text_to_audio/dataframe.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/example_data/text_to_audio/dataframe.zip
--------------------------------------------------------------------------------
/images/call-center-readme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/images/call-center-readme.png
--------------------------------------------------------------------------------
/images/call-center-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-call-center/fac6cc4a5661ba469c28638a97af401a666ab031/images/call-center-workflow.png
--------------------------------------------------------------------------------
/mlrun.env:
--------------------------------------------------------------------------------
1 | # default env vars, will be loaded once MLRun imports/starts
2 | # write here remote cluster credentials, addresses, etc.
3 | # uncomment the relevant lines and set with proper parameters
4 |
5 | # local/remote MLRun service address
6 | MLRUN_DBPATH=http://localhost:8080
7 |
8 | # if Nuclio not detected simulate it with mock
9 | MLRUN_MOCK_NUCLIO_DEPLOYMENT=auto
10 |
11 | # Iguazio cluster and V3IO credentials (for remote cluster)
12 | # V3IO_USERNAME=
13 | # V3IO_ACCESS_KEY=
14 |
15 | # AWS S3/services credentials
16 | # AWS_ACCESS_KEY_ID=
17 | # AWS_SECRET_ACCESS_KEY=
18 |
19 | # The Azure connection string which points at a storage account. For example:
20 | # DefaultEndpointsProtocol=https;AccountName=myAcct;AccountKey=XXXX;EndpointSuffix=core.windows.net
21 | # AZURE_STORAGE_CONNECTION_STRING=
22 |
--------------------------------------------------------------------------------
/project_setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import os
15 | from pathlib import Path
16 | import boto3
17 | import mlrun
18 |
19 | from src.calls_analysis.db_management import create_tables
20 | from src.common import ProjectSecrets
21 |
# Whether MLRun runs in Community Edition mode (affects auto-mount and DB storage below):
CE_MODE = mlrun.mlconf.is_ce_mode()

def setup(
    project: mlrun.projects.MlrunProject,
) -> mlrun.projects.MlrunProject:
    """
    Create and configure the project for the demo. This function is expected to be called
    automatically when calling `mlrun.get_or_create_project`.

    :param project: The project to set up.

    :returns: A fully prepared project for this demo.
    """
    # Unpack secrets from environment variables:
    openai_key = os.getenv(ProjectSecrets.OPENAI_API_KEY)
    openai_base = os.getenv(ProjectSecrets.OPENAI_API_BASE)
    mysql_url = os.getenv(ProjectSecrets.MYSQL_URL, "")

    # Unpack parameters:
    source = project.get_param(key="source")
    default_image = project.get_param(key="default_image", default=None)
    build_image = project.get_param(key="build_image", default=False)
    gpus = project.get_param(key="gpus", default=0)
    node_name = project.get_param(key="node_name", default=None)
    node_selector = project.get_param(key="node_selector", default=None)
    use_sqlite = project.get_param(key="use_sqlite", default=False)

    # Update sqlite data:
    if use_sqlite:
        if CE_MODE:
            # In CE mode the sqlite file must be shared between pods - upload it to S3:
            endpoint_url = os.getenv("S3_ENDPOINT_URL")
            s3 = (
                boto3.client("s3", endpoint_url=endpoint_url)
                if endpoint_url
                else boto3.client("s3")
            )
            # The bucket name is the first component of the artifact path (s3://<bucket>/...):
            bucket_name = Path(mlrun.mlconf.artifact_path).parts[1]
            # Upload the file:
            s3.upload_file(
                Filename="data/sqlite.db",
                Bucket=bucket_name,
                Key="sqlite.db",
            )
            os.environ["S3_BUCKET_NAME"] = bucket_name
        else:
            # Point SQLAlchemy directly at the local sqlite file:
            os.environ["MYSQL_URL"] = f"sqlite:///{os.path.abspath('.')}/data/sqlite.db"
            mysql_url = os.environ["MYSQL_URL"]

    # Set the project git source:
    if source:
        print(f"Project Source: {source}")
        project.set_source(source=source, pull_at_runtime=True)

    # Set default image:
    if default_image:
        project.set_default_image(default_image)

    # Build the image:
    if build_image:
        print("Building default image for the demo:")
        _build_image(project=project, with_gpu=gpus)

    # Set the secrets:
    _set_secrets(
        project=project,
        openai_key=openai_key,
        openai_base=openai_base,
        mysql_url=mysql_url,
        bucket_name=os.getenv(ProjectSecrets.S3_BUCKET_NAME),
    )

    # Refresh MLRun hub to the most up-to-date version:
    mlrun.get_run_db().get_hub_catalog(source_name="default", force_refresh=True)

    # Set the functions:
    _set_calls_generation_functions(project=project, node_name=node_name)
    _set_calls_analysis_functions(
        project=project, gpus=gpus, node_name=node_name, node_selector=node_selector
    )

    # Set the workflows:
    _set_workflows(project=project)

    # Set UI application:
    app = project.set_function(
        name="call-center-ui",
        kind="application",
        requirements=["vizro==0.1.38", "gunicorn"],
    )
    # Set the internal application port to Vizro's default port:
    app.set_internal_application_port(8050)

    # Set the command to run the Vizro application:
    app.spec.command = "gunicorn"
    app.spec.args = [
        "app:app",
        "--bind",
        "0.0.0.0:8050",
        "--chdir",
        # NOTE(review): this is a *relative* path (no leading '/'); it only resolves to
        # /home/mlrun_code/vizro when the container working directory is '/'. Confirm
        # whether an absolute path was intended. (The original was an f-string with no
        # placeholders; a plain literal is byte-identical.)
        "home/mlrun_code/vizro",
    ]
    app.save()

    # Create the DB tables:
    create_tables()

    # Save and return the project:
    project.save()
    return project
125 |
def _build_image(project: mlrun.projects.MlrunProject, with_gpu: bool):
    """
    Build and set the demo's default image, with or without GPU support.

    :param project:  The project to build the image for.
    :param with_gpu: Whether to include GPU wheels. Note: the caller (`setup`) passes the
                     `gpus` *count* here, so any value >= 1 is treated as truthy.

    :raises RuntimeError: If the image build did not succeed.
    """
    # The packages / package indexes that differ between CPU and GPU builds:
    config = {
        "base_image": "mlrun/mlrun-gpu" if with_gpu else "mlrun/mlrun",
        "torch_index": "https://download.pytorch.org/whl/cu118"
        if with_gpu
        else "https://download.pytorch.org/whl/cpu",
        "onnx_package": "onnxruntime-gpu" if with_gpu else "onnxruntime",
    }

    # Define commands in logical groups while maintaining order:
    system_commands = [
        # Update apt-get to install ffmpeg (support audio file formats):
        "apt-get update -y && apt-get install ffmpeg -y"
    ]

    infrastructure_requirements = [
        "pip install transformers==4.44.1",
        f"pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url {config['torch_index']}",
    ]

    huggingface_requirements = [
        "pip install bitsandbytes==0.41.1 accelerate==0.24.1 datasets==2.14.6 peft==0.5.0 optimum==1.13.2"
    ]

    # auto-gptq ships CUDA kernels, so it is only installed on GPU builds:
    gpu_specific_requirements = [
        "pip install auto-gptq==0.4.2 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/"
    ] if with_gpu else []

    other_requirements = [
        "pip install mlrun langchain==0.2.17 openai==1.58.1 langchain_community==0.2.19 pydub==0.25.1 streamlit==1.28.0 st-annotated-text==4.0.1 spacy==3.7.2 librosa==0.10.1 presidio-anonymizer==2.2.34 presidio-analyzer==2.2.34 nltk==3.8.1 flair==0.13.0 htbuilder==0.6.2",
        "python -m spacy download en_core_web_lg",
        "pip install -U SQLAlchemy",
        # Make sure only the flavor matching this build is installed:
        "pip uninstall -y onnxruntime-gpu onnxruntime",
        f"pip install {config['onnx_package']}",
    ]

    # Combine commands in the required order:
    commands = (
        system_commands +
        infrastructure_requirements +
        huggingface_requirements +
        gpu_specific_requirements +
        other_requirements
    )

    # Build the image. Raise explicitly instead of `assert` - asserts are stripped
    # when Python runs with -O, which would silently skip the failure check:
    if not project.build_image(
        base_image=config["base_image"],
        commands=commands,
        set_as_default=True,
    ):
        raise RuntimeError("Failed to build the demo's default image")
174 |
def _set_secrets(
    project: mlrun.projects.MlrunProject,
    openai_key: str,
    openai_base: str,
    mysql_url: str,
    bucket_name: str = None,
):
    """
    Store the demo's credentials as project secrets.

    :param project:     The project to set the secrets on.
    :param openai_key:  OpenAI API key.
    :param openai_base: OpenAI API base URL.
    :param mysql_url:   SQLAlchemy connection URL for the calls DB.
    :param bucket_name: Optional S3 bucket holding the sqlite DB file (CE mode only).
    """
    # Mandatory secrets:
    required_secrets = {
        ProjectSecrets.OPENAI_API_KEY: openai_key,
        ProjectSecrets.OPENAI_API_BASE: openai_base,
        ProjectSecrets.MYSQL_URL: mysql_url,
    }
    project.set_secrets(secrets=required_secrets)

    # Optional S3 bucket secret (only set when provided):
    if bucket_name:
        project.set_secrets(secrets={ProjectSecrets.S3_BUCKET_NAME: bucket_name})
196 |
197 |
def _set_function(
    project: mlrun.projects.MlrunProject,
    func: str,
    name: str,
    kind: str,
    gpus: int = 0,
    node_name: str = None,
    with_repo: bool = None,
    image: str = None,
    node_selector: dict = None,
    apply_auto_mount: bool = True,
):
    """
    Register a single function on the project and configure its resources.

    :param project:          The project to register the function on.
    :param func:             Function source - a local path or a "hub://" URI.
    :param name:             Function name inside the project.
    :param kind:             MLRun runtime kind (e.g. "job", "mpijob").
    :param gpus:             Number of GPUs to assign (0 for a CPU-only run).
    :param node_name:        Specific node to schedule on (applied to CPU runs only).
    :param with_repo:        Whether to load the project repo with the function. Defaults
                             to True for local files and False for hub functions.
    :param image:            Image to run the function with.
    :param node_selector:    Node selector, applied to GPU runs only.
    :param apply_auto_mount: Whether to apply auto-mount (skipped in CE mode).
    """
    # Hub functions ship their own code; local files need the project repo:
    repo_required = (
        with_repo if with_repo is not None else not func.startswith("hub://")
    )
    fn = project.set_function(
        func=func, name=name, kind=kind, with_repo=repo_required, image=image,
    )

    # Configure GPUs according to the given kind:
    if gpus >= 1:
        if node_selector:
            fn.with_node_selection(node_selector=node_selector)
        if kind == "mpijob":
            # 1 GPU for each rank:
            fn.with_limits(gpus=1)
            fn.spec.replicas = gpus
        else:
            # All GPUs for the single job:
            fn.with_limits(gpus=gpus)
    elif node_name:
        # CPU run - optionally pin to a specific node:
        fn.with_node_selection(node_name=node_name)

    # Apply auto mount (not needed in CE mode):
    if not CE_MODE and apply_auto_mount:
        fn.apply(mlrun.auto_mount())

    # Save:
    fn.save()
237 |
238 |
def _set_calls_generation_functions(
    project: mlrun.projects.MlrunProject,
    node_name: str = None,
):
    """
    Register the calls-generation functions: structured data generation, conversation
    generation and text-to-audio.

    :param project:   The project to register the functions on.
    :param node_name: Specific node to schedule the generation jobs on.
    """
    function_specs = [
        # Client and agent data generator:
        {
            "func": "hub://structured_data_generator",
            "name": "structured-data-generator",
            "node_name": node_name,
        },
        # Conversation generator:
        {
            "func": "./src/calls_generation/conversations_generator.py",
            "name": "conversations-generator",
            "node_name": node_name,
        },
        # Text to audio generator (hub code is self-contained - no repo needed):
        {
            "func": "hub://text_to_audio_generator",
            "name": "text-to-audio-generator",
            "with_repo": False,
        },
    ]
    for spec in function_specs:
        _set_function(project=project, kind="job", apply_auto_mount=True, **spec)
272 |
273 |
def _set_calls_analysis_functions(
    project: mlrun.projects.MlrunProject,
    gpus: int,
    node_name: str = None,
    node_selector: dict = None,
):
    """
    Register the calls-analysis pipeline functions.

    :param project:       The project to register the functions on.
    :param gpus:          Number of GPUs for the heavy (transcription / QA) steps.
    :param node_name:     Specific node to schedule CPU jobs on.
    :param node_selector: Node selector for GPU jobs.
    """
    function_specs = [
        # DB management:
        {
            "func": "./src/calls_analysis/db_management.py",
            "name": "db-management",
            "kind": "job",
            "apply_auto_mount": True,
        },
        # Speech diarization:
        {"func": "hub://silero_vad", "name": "silero-vad", "kind": "job"},
        # Transcription - distributed over MPI when more than one GPU is available:
        {
            "func": "hub://transcribe",
            "name": "transcription",
            "kind": "mpijob" if gpus > 1 else "job",
            "gpus": gpus,
            "node_selector": node_selector,
        },
        # PII recognition:
        {"func": "hub://pii_recognizer", "name": "pii-recognition", "kind": "job"},
        # Question answering:
        {
            "func": "hub://question_answering",
            "name": "question-answering",
            "kind": "job",
            "gpus": gpus,
            "node_selector": node_selector,
        },
        # Postprocessing:
        {
            "func": "./src/calls_analysis/postprocessing.py",
            "name": "postprocessing",
            "kind": "job",
            "with_repo": False,
        },
    ]
    for spec in function_specs:
        _set_function(project=project, node_name=node_name, **spec)
339 |
340 |
def _set_workflows(project: mlrun.projects.MlrunProject):
    """Register the demo's two workflows (generation and analysis) on the project."""
    for workflow_name, workflow_path in (
        ("calls-generation", "./src/workflows/calls_generation.py"),
        ("calls-analysis", "./src/workflows/calls_analysis.py"),
    ):
        project.set_workflow(name=workflow_name, workflow_path=workflow_path)
348 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.isort]
2 | profile = "black"
3 | multi_line_output = 3
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | mlrun
2 | SQLAlchemy
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Packaging script for installing the demo's `src` package.
# NOTE(review): `find_packages` is imported but never used.
from setuptools import find_packages, setup

# NOTE(review): the metadata below ("myproj", "my desc", author) looks like template
# placeholders - confirm before publishing this package anywhere.
project_name = "myproj"
# Use the README as the long description shown on package indexes:
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name=project_name,
    packages=[project_name],
    # Map the package name onto the local `src` directory:
    package_dir={project_name: "src"},
    version="0.1.0",
    description="my desc",
    author="Yaron",
    author_email="author@example.com",
    license="MIT",
    long_description=long_description,
    long_description_content_type="text/markdown",
    python_requires=">=3.9",
)
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/src/calls_analysis/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/src/calls_analysis/db_management.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import datetime
15 | import os
16 | import tempfile
17 | from typing import List, Optional, Tuple
18 |
19 | import boto3
20 | import mlrun
21 | import pandas as pd
22 | from sqlalchemy import (
23 | Boolean,
24 | Date,
25 | Enum,
26 | ForeignKey,
27 | Integer,
28 | String,
29 | Time,
30 | bindparam,
31 | create_engine,
32 | insert,
33 | select,
34 | update,
35 | )
36 | from sqlalchemy.orm import (
37 | Mapped,
38 | declarative_base,
39 | mapped_column,
40 | relationship,
41 | sessionmaker,
42 | )
43 |
44 | from src.common import CallStatus, ProjectSecrets
45 |
# Maximum lengths for the DB string columns below:
ID_LENGTH = 32  # primary / foreign key ids
FILE_PATH_LENGTH = 500  # stored file paths (audio, transcription, anonymized)

# Declarative base class that all ORM models in this module derive from:
Base = declarative_base()
50 |
51 |
class Client(Base):
    """ORM model of a call-center client - one row per client."""

    __tablename__ = "client"

    # Columns:
    client_id: Mapped[str] = mapped_column(String(length=ID_LENGTH), primary_key=True)
    first_name: Mapped[str] = mapped_column(String(length=30))
    last_name: Mapped[str] = mapped_column(String(length=30))
    phone_number: Mapped[str] = mapped_column(String(length=20))
    email: Mapped[str] = mapped_column(String(length=50))
    client_city: Mapped[str] = mapped_column(String(length=30))
    latitude: Mapped[str] = mapped_column(String(length=20))
    longitude: Mapped[str] = mapped_column(String(length=20))

    # One-to-many relationship: a client can have many calls (`Call.client` is the
    # other side). NOTE(review): the original comment said "many-to-one" - swapped.
    calls: Mapped[List["Call"]] = relationship(back_populates="client", lazy=True)
67 |
68 |
class Agent(Base):
    """ORM model of a call-center agent - one row per agent."""

    __tablename__ = "agent"

    # Columns:
    agent_id: Mapped[str] = mapped_column(String(length=ID_LENGTH), primary_key=True)
    first_name: Mapped[str] = mapped_column(String(length=30))
    last_name: Mapped[str] = mapped_column(String(length=30))
    # phone: Mapped[str] = mapped_column(String(length=20))
    # email: Mapped[str] = mapped_column(String(length=50))

    # One-to-many relationship: an agent can handle many calls (`Call.agent` is the
    # other side). NOTE(review): the original comment said "many-to-one" - swapped.
    calls: Mapped[List["Call"]] = relationship(back_populates="agent", lazy=True)
81 |
82 |
class Call(Base):
    """
    ORM model of a single call: identifying metadata, the files produced along the
    analysis workflow, and the analysis results filled in by the later workflow steps
    (all analysis columns are nullable and default to None until their step runs).
    """

    __tablename__ = "call"

    # Metadata:
    call_id: Mapped[str] = mapped_column(String(length=ID_LENGTH), primary_key=True)
    client_id: Mapped[str] = mapped_column(
        String(length=ID_LENGTH), ForeignKey("client.client_id")
    )
    agent_id: Mapped[str] = mapped_column(
        String(length=ID_LENGTH), ForeignKey("agent.agent_id")
    )
    date: Mapped[datetime.date] = mapped_column(Date())
    time: Mapped[datetime.time] = mapped_column(Time())
    # Workflow progress marker (see `CallStatus`); set by `update_calls`:
    status: Mapped[CallStatus] = mapped_column(Enum(CallStatus), nullable=True)
    # Files:
    audio_file: Mapped[str] = mapped_column(String(length=FILE_PATH_LENGTH))
    # TODO: processed_audio_file: Mapped[Optional[str]] = mapped_column(String(length=FILE_PATH_LENGTH))
    transcription_file: Mapped[Optional[str]] = mapped_column(
        String(length=FILE_PATH_LENGTH),
        nullable=True,
        default=None,
    )
    # Transcription with PII removed by the pii-recognition step:
    anonymized_file: Mapped[Optional[str]] = mapped_column(
        String(length=FILE_PATH_LENGTH),
        nullable=True,
        default=None,
    )
    # Analysis (answers produced by the question-answering step and normalized in
    # `postprocessing.py`):
    topic: Mapped[Optional[str]] = mapped_column(
        String(length=50),
        nullable=True,
        default=None,
    )
    summary: Mapped[Optional[str]] = mapped_column(
        String(length=1000),
        nullable=True,
        default=None,
    )
    concern_addressed: Mapped[Optional[bool]] = mapped_column(
        Boolean(),
        nullable=True,
        default=None,
    )
    # Tone labels - "Positive" / "Negative" / "Neutral" after postprocessing:
    client_tone: Mapped[Optional[str]] = mapped_column(
        String(length=20),
        nullable=True,
        default=None,
    )
    agent_tone: Mapped[Optional[str]] = mapped_column(
        String(length=20),
        nullable=True,
        default=None,
    )
    upsale_attempted: Mapped[Optional[bool]] = mapped_column(
        Boolean(),
        nullable=True,
        default=None,
    )
    upsale_success: Mapped[Optional[bool]] = mapped_column(
        Boolean(),
        nullable=True,
        default=None,
    )
    # Agent soft-skill scores. NOTE(review): the scale is not visible in this module
    # (presumably defined by the QA prompts) - confirm against the workflow config.
    empathy: Mapped[Optional[int]] = mapped_column(
        Integer(),
        nullable=True,
        default=None,
    )
    professionalism: Mapped[Optional[int]] = mapped_column(
        Integer(),
        nullable=True,
        default=None,
    )
    kindness: Mapped[Optional[int]] = mapped_column(
        Integer(),
        nullable=True,
        default=None,
    )
    effective_communication: Mapped[Optional[int]] = mapped_column(
        Integer(),
        nullable=True,
        default=None,
    )
    active_listening: Mapped[Optional[int]] = mapped_column(
        Integer(),
        nullable=True,
        default=None,
    )
    customization: Mapped[Optional[int]] = mapped_column(
        Integer(),
        nullable=True,
        default=None,
    )

    # Many-to-one relationships: each call belongs to exactly one client and one agent.
    # NOTE(review): the original comment said "one-to-many" - swapped.
    client: Mapped["Client"] = relationship(back_populates="calls", lazy=True)
    agent: Mapped["Agent"] = relationship(back_populates="calls", lazy=True)
180 |
181 |
class DBEngine:
    """
    Wraps the SQLAlchemy engine for the calls DB, in one of two modes selected by the
    project secrets:

    * S3 mode (an `S3_BUCKET_NAME` secret is set, used in CE): the sqlite DB file is
      downloaded from S3 into a local temporary file, and `update_db` uploads it back
      after writes.
    * Direct mode: connect straight to the URL in the `MYSQL_URL` secret.
    """

    def __init__(self, context: mlrun.MLClientCtx):
        self.bucket_name = context.get_secret(key=ProjectSecrets.S3_BUCKET_NAME)
        self.db_url = context.get_secret(key=ProjectSecrets.MYSQL_URL)
        self.temp_file = None
        self.engine = self._create_engine()

    def get_session(self):
        """Return a session factory bound to this engine."""
        return sessionmaker(self.engine)

    def update_db(self):
        """Upload the local sqlite file back to S3 (no-op when not in S3 mode)."""
        if self.bucket_name:
            s3 = self._s3_client()
            s3.upload_file(self.temp_file.name, self.bucket_name, "sqlite.db")

    @staticmethod
    def _s3_client():
        # Honor a custom S3 endpoint (e.g. minio) the same way `project_setup.py` does.
        # The original created the client without `endpoint_url`, breaking CE setups
        # that rely on S3_ENDPOINT_URL:
        endpoint_url = os.getenv("S3_ENDPOINT_URL")
        if endpoint_url:
            return boto3.client("s3", endpoint_url=endpoint_url)
        return boto3.client("s3")

    def _create_engine(self):
        if self.bucket_name:
            # Create a temporary file that will persist throughout the object's lifetime:
            self.temp_file = tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False)
            self.temp_file.close()  # Close the file but keep the name

            try:
                self._s3_client().download_file(
                    self.bucket_name, "sqlite.db", self.temp_file.name
                )
            except Exception as e:
                # Best effort - a fresh (empty) sqlite file will be used instead:
                print(f"Warning: Could not download database from S3: {e}")

            return create_engine(f"sqlite:///{self.temp_file.name}")
        else:
            return create_engine(url=self.db_url)

    def __del__(self):
        # Clean up the temporary file when the object is destroyed:
        if self.temp_file:
            try:
                os.unlink(self.temp_file.name)
            except OSError:
                # Narrowed from a bare `except:` - only swallow filesystem errors
                # (file already gone, permission issues at interpreter shutdown):
                pass
220 |
221 |
def create_tables():
    """
    Create the call center schema tables for when creating or loading the MLRun project.
    """
    # Build an engine from the project secrets:
    db_engine = DBEngine(mlrun.get_or_create_ctx("create_tables"))

    # Emit CREATE TABLE for every model registered on the declarative base:
    Base.metadata.create_all(db_engine.engine)

    # Push the (possibly S3-backed) sqlite file back to storage:
    db_engine.update_db()
233 |
234 |
def insert_clients(context: mlrun.MLClientCtx, clients: list):
    """
    Insert the given client records (list of dicts matching `Client` columns) into the
    `client` table.

    :param context: The MLRun context (used to read the DB secrets).
    :param clients: The client records to insert.
    """
    db_engine = DBEngine(context)
    session_factory = db_engine.get_session()

    # Bulk-insert and commit on exiting the transaction block:
    with session_factory.begin() as db_session:
        db_session.execute(insert(Client), clients)

    db_engine.update_db()
247 |
248 |
def insert_agents(context: mlrun.MLClientCtx, agents: list):
    """
    Insert the given agent records (list of dicts matching `Agent` columns) into the
    `agent` table.

    :param context: The MLRun context (used to read the DB secrets).
    :param agents:  The agent records to insert.
    """
    db_engine = DBEngine(context)
    session_factory = db_engine.get_session()

    # Bulk-insert and commit on exiting the transaction block:
    with session_factory.begin() as db_session:
        db_session.execute(insert(Agent), agents)

    db_engine.update_db()
261 |
262 |
def insert_calls(
    context: mlrun.MLClientCtx, calls: pd.DataFrame
) -> Tuple[pd.DataFrame, List[str]]:
    """
    Insert the given calls into the `call` table.

    :param context: The MLRun context (used to read the DB secrets).
    :param calls:   The calls metadata dataframe; columns must match `Call` columns.

    :returns: The same dataframe and the list of audio file paths, for the next
              workflow steps.
    """
    db_engine = DBEngine(context)
    session_factory = db_engine.get_session()

    # SQLAlchemy's bulk insert expects a list of per-row dictionaries:
    call_records = calls.to_dict(orient="records")

    # Insert and commit on exiting the transaction block:
    with session_factory.begin() as db_session:
        db_session.execute(insert(Call), call_records)

    db_engine.update_db()

    # Return the metadata and audio files:
    return calls, list(calls["audio_file"])
284 |
285 |
def update_calls(
    context: mlrun.MLClientCtx,
    status: str,
    table_key: str,
    data_key: str,
    data: pd.DataFrame,
):
    """
    Bulk-update rows of the `call` table from a dataframe, stamping them with a new
    workflow status.

    :param context:   The MLRun context (used to read the DB secrets).
    :param status:    The new status; must be a valid `CallStatus` value.
    :param table_key: The `Call` column to match rows by (e.g. "call_id").
    :param data_key:  The dataframe column holding the matching key values.
    :param data:      The columns to update. NOTE(review): mutated in place - a
                      "status" column is added and the key column may be renamed.
    """
    # Create an engine:
    engine = DBEngine(context)

    # Initialize a session maker:
    session = engine.get_session()

    # Add the status to the dataframe:
    data["status"] = [CallStatus(status)] * len(data)

    # Make sure keys are not duplicates (so we can update by the key with `bindparam`):
    # `bindparam(name)` would clash with a SET-clause column of the same name, so the
    # key column gets a "_2" suffix before binding.
    if data_key == table_key:
        data_key += "_2"
        data.rename(columns={table_key: data_key}, inplace=True)

    # Cast data from dataframe to a list of dictionaries:
    data = data.to_dict(orient="records")

    # Update the matched calls in the table and commit (executemany over `data`):
    with session.begin() as sess:
        sess.connection().execute(
            update(Call).where(getattr(Call, table_key) == bindparam(data_key)), data
        )

    engine.update_db()
317 |
318 |
def get_calls() -> pd.DataFrame:
    """
    Read the entire `call` table.

    :returns: All calls as a dataframe.
    """
    ctx = mlrun.get_or_create_ctx("get_calls")
    db_engine = DBEngine(ctx)
    session_factory = db_engine.get_session()

    # Select all calls within a transaction and load into a dataframe:
    with session_factory.begin() as db_session:
        all_calls = pd.read_sql(select(Call), db_session.connection())

    return all_calls
332 |
333 |
def get_agents(context: mlrun.MLClientCtx) -> pd.DataFrame:
    """
    Read the entire `agent` table.

    :param context: The MLRun context (used to read the DB secrets).

    :returns: All agents as a dataframe. (Return annotation fixed: `pd.read_sql`
              yields a `pd.DataFrame`, not a `list` as previously annotated.)
    """
    # Create an engine:
    engine = DBEngine(context)

    # Initialize a session maker:
    session = engine.get_session()

    # Select all agents:
    with session.begin() as sess:
        agents = pd.read_sql(select(Agent), sess.connection())
    return agents
345 |
346 |
def get_clients(context: mlrun.MLClientCtx) -> pd.DataFrame:
    """
    Read the entire `client` table.

    :param context: The MLRun context (used to read the DB secrets).

    :returns: All clients as a dataframe. (Return annotation fixed: `pd.read_sql`
              yields a `pd.DataFrame`, not a `list` as previously annotated.)
    """
    # Create an engine:
    engine = DBEngine(context)

    # Initialize a session maker:
    session = engine.get_session()

    # Select all clients:
    with session.begin() as sess:
        clients = pd.read_sql(select(Client), sess.connection())
    return clients
358 |
--------------------------------------------------------------------------------
/src/calls_analysis/postprocessing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
def postprocess_answers(answers: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the question-answering output columns in place.

    * Yes/no columns become booleans: True iff the answer contains "yes"
      (case-insensitive via `casefold`).
    * Tone columns are collapsed to "Positive" / "Negative" / "Neutral" when the answer
      contains one of those words (checked in that order, case-sensitive, matching the
      original chained passes); otherwise the raw answer is kept.

    :param answers: The raw answers dataframe (mutated in place).

    :returns: The same dataframe, for chaining.
    """
    # Yes/no questions -> booleans:
    for column in ["concern_addressed", "upsale_attempted", "upsale_success"]:
        answers[column] = answers[column].apply(lambda x: "yes" in x.casefold())

    # Tone questions -> canonical labels. A single pass replaces the original three
    # chained `apply` calls; the first matching label wins, preserving the original
    # Positive -> Negative -> Neutral precedence:
    def _canonical_tone(answer: str) -> str:
        for tone in ("Positive", "Negative", "Neutral"):
            if tone in answer:
                return tone
        return answer

    for column in ["client_tone", "agent_tone"]:
        answers[column] = answers[column].apply(_canonical_tone)

    return answers
18 |
--------------------------------------------------------------------------------
/src/calls_generation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .skip import skip_and_import_local_data
15 |
--------------------------------------------------------------------------------
/src/calls_generation/conversations_generator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import datetime
15 | import hashlib
16 | import os
17 | import pathlib
18 | import random
19 | import tempfile
20 | from typing import Tuple
21 |
22 | import mlrun
23 | import pandas as pd
24 | import tqdm
25 | from langchain.chat_models import ChatOpenAI
26 |
27 | from src.common import TONES, TOPICS, ProjectSecrets
28 |
29 | #: The approximate amount of words in one minute.
30 | WORDS_IN_1_MINUTE = 240
31 |
32 |
def generate_conversations(
    context: mlrun.MLClientCtx,
    amount: int,
    agent_data: pd.DataFrame,
    client_data: pd.DataFrame,
    output_directory: str = None,
    model_name: str = "gpt-3.5-turbo",
    language: str = "en",
    min_time: int = 2,
    max_time: int = 5,
    from_date: str = "01.01.2023",
    to_date: str = "01.03.2023",
    from_time: str = "09:00",
    to_time: str = "17:00",
) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
    """
    Generates a list of conversations between an internet provider call center and a customer.

    :param context: The MLRun context.
    :param amount: The number of conversations to generate.
    :param agent_data: The agent data to use for the conversations.
    :param client_data: The client data to use for the conversations.
    :param output_directory: The directory to save the conversations to.
    :param model_name: The name of the model to use for conversation generation.
                       You should choose one of GPT-4 or GPT-3.5 from the list here:
                       https://platform.openai.com/docs/models. Default: 'gpt-3.5-turbo'.
    :param language: The language to use for the generated conversation text.
    :param min_time: Minimum time of conversation in minutes.
                     Will be used approximately to generate the minimum words with the following assessment:
                     240 words are equal to one minute. Default: 2.
    :param max_time: Maximum time of conversation in minutes.
                     Will be used approximately to generate the maximum words with the following assessment:
                     240 words are equal to one minute. Default: 5.
    :param from_date: The minimum date of the conversation (format: MM.DD.YYYY).
    :param to_date: The maximum date of the conversation (format: MM.DD.YYYY).
    :param from_time: The minimum time (HH:MM) of the conversation.
    :param to_time: The maximum time (HH:MM) of the conversation.

    :returns: A tuple of: the output directory path, the conversations metadata
              dataframe and the ground truths dataframe.
    """
    # Get the minimum and maximum amount of words:
    min_words = WORDS_IN_1_MINUTE * min_time
    max_words = WORDS_IN_1_MINUTE * max_time

    # Get the minimum and maximum dates and times (separate names so the
    # `min_time` / `max_time` minute parameters are not shadowed):
    min_clock = datetime.datetime.strptime(from_time, "%H:%M")
    max_clock = datetime.datetime.strptime(to_time, "%H:%M")
    min_date = datetime.datetime.strptime(from_date, "%m.%d.%Y").date()
    max_date = datetime.datetime.strptime(to_date, "%m.%d.%Y").date()

    # Create the concern addressed options:
    concern_addressed_options = {
        True: "",
        False: "Don't",
    }

    # Create the agent upsales options:
    agent_upsales_options = {
        "Doesn't try": "Doesn't try to upsale the customer on more services.",
        "Tries and doesn't succeed": "Tries to upsale the customer on more services, and doesn't succeed",
        "Tries and succeeds": "Tries to upsale the customer on more services, and succeeds",
    }

    # Create the upsale mapping ([attempted, succeeded] per option):
    upsale_mapping = {
        "Doesn't try": [False, False],
        "Tries and doesn't succeed": [True, False],
        "Tries and succeeds": [True, True],
    }

    # Create the prompt structure:
    prompt_structure = (
        "Generate a conversation between an internet provider call center agent named {agent_name} from (“Iguazio Internet”) and "
        "a client named {client_name} with email: {client_email} and phone number: {client_phone} in {language} except 'Agent' and 'Client' prefixes which are constants.\n"
        "Format the conversation as follow:\n"
        "Agent: \n"
        "Client: \n"
        "The conversations has to include at least {min_words} words and no more than {max_words} words.\n"
        "The call must include the following steps: \n"
        "1. Opening (greeting and customer details validation and confirmation)\n"
        "2. Presenting the problem by the customer"
        "3. The agent {concern_addressed} address the client's concern.\n"
        "4. The Agent {agent_upsales}"
        "5. Summerizing and closing the call"
        "It has to be about a client who is calling to discuss about {topic}.\n"
        "Do not add any descriptive tag and do not mark the end of the conversation with [End of conversation].\n"
        "Use ... for hesitation.\n"
        "The client needs to have a {client_tone} tone.\n"
        "The agent needs to have a {agent_tone}.\n"
        "Remove from the final output any word inside parentheses of all types. \n"
        "use the following levels of these attributes while describing the agent's role: \n"
        "Empathy {empathy}, Professionalism {professionalism}, Kindness {kindness}, \n"
        "Effective Communication {effective_communication}, Active listening {active_listening}, Customization {customization}."
    )

    # Load the OpenAI model using langchain:
    os.environ["OPENAI_API_KEY"] = context.get_secret(key=ProjectSecrets.OPENAI_API_KEY)
    os.environ["OPENAI_API_BASE"] = context.get_secret(
        key=ProjectSecrets.OPENAI_API_BASE
    )
    llm = ChatOpenAI(model=model_name)

    # Create the output directory:
    if output_directory is None:
        output_directory = tempfile.mkdtemp()
    output_directory = pathlib.Path(output_directory)
    if not output_directory.exists():
        output_directory.mkdir(parents=True, exist_ok=True)

    # Start generating conversations:
    conversations = []
    ground_truths = []
    for _ in tqdm.tqdm(range(amount), desc="Generating"):
        # Randomize the conversation metadata:
        conversation_id = _generate_id()
        date = _get_random_date(min_date=min_date, max_date=max_date)
        time = _get_random_time(min_time=min_clock, max_time=max_clock)

        # Randomly select the conversation parameters:
        concern_addressed = random.choice(list(concern_addressed_options.keys()))
        agent_upsales = random.choice(list(agent_upsales_options.keys()))
        client_tone = random.choice(TONES)
        agent_tone = random.choice(TONES)
        topic = random.choice(TOPICS)
        agent = agent_data.sample().to_dict(orient="records")[0]
        client = client_data.sample().to_dict(orient="records")[0]

        # Generate levels of different agent attributes:
        empathy = random.randint(1, 5)
        professionalism = random.randint(1, 5)
        kindness = random.randint(1, 5)
        effective_communication = random.randint(1, 5)
        active_listening = random.randint(1, 5)
        customization = random.randint(1, 5)

        # Create the prompt:
        prompt = prompt_structure.format(
            language=language,
            min_words=min_words,
            max_words=max_words,
            topic=topic,
            concern_addressed=concern_addressed_options[concern_addressed],
            agent_upsales=agent_upsales_options[agent_upsales],
            client_tone=client_tone,
            agent_tone=agent_tone,
            agent_name=f"{agent['first_name']} {agent['last_name']}",
            client_name=f"{client['first_name']} {client['last_name']}",
            client_email=client["email"],
            client_phone=client["phone_number"],
            empathy=empathy,
            professionalism=professionalism,
            kindness=kindness,
            effective_communication=effective_communication,
            active_listening=active_listening,
            customization=customization,
        )

        # Generate the conversation:
        conversation = llm.predict(text=prompt)
        # Remove redundant newlines and spaces:
        conversation = "".join(
            [
                line
                for line in conversation.strip().splitlines(keepends=True)
                if line.strip("\n").strip()
            ]
        )
        # Save to file:
        conversation_text_path = output_directory / f"{conversation_id}.txt"
        with open(conversation_text_path, "w") as fp:
            fp.write(conversation)

        # Collect to the conversations and ground truths lists:
        conversations.append(
            [
                conversation_id,
                conversation_text_path.name,
                client["client_id"],
                agent["agent_id"],
                date,
                time,
            ]
        )
        ground_truths.append(
            [
                conversation_id,
                language,
                topic,
                concern_addressed,
                upsale_mapping[agent_upsales][0],
                upsale_mapping[agent_upsales][1],
                client_tone,
                agent_tone,
                client["client_id"],
                agent["agent_id"],
                empathy,
                professionalism,
                kindness,
                effective_communication,
                active_listening,
                customization,
            ]
        )

    # Cast the conversations and ground truths into a dataframe:
    conversations = pd.DataFrame(
        conversations,
        columns=["call_id", "text_file", "client_id", "agent_id", "date", "time"],
    )
    ground_truths = pd.DataFrame(
        ground_truths,
        # Note: the column order must match the appended row order above -
        # client id comes before agent id (the previous order swapped them):
        columns=[
            "call_id",
            "language",
            "topic",
            "concern_addressed",
            "agent_tries_upsale",
            "agent_succeeds_upsale",
            "client_tone",
            "agent_tone",
            "client_id",
            "agent_id",
            "empathy",
            "professionalism",
            "kindness",
            "effective_communication",
            "active_listening",
            "customization",
        ],
    )

    return str(output_directory), conversations, ground_truths
263 |
264 |
265 | def _get_random_time(
266 | min_time: datetime.datetime, max_time: datetime.datetime
267 | ) -> datetime.time:
268 | if max_time.hour <= min_time.hour:
269 | max_time += datetime.timedelta(days=1)
270 | return (
271 | min_time
272 | + datetime.timedelta(
273 | seconds=random.randint(0, int((max_time - min_time).total_seconds())),
274 | )
275 | ).time()
276 |
277 |
278 | def _get_random_date(min_date, max_date) -> datetime.date:
279 | return min_date + datetime.timedelta(
280 | days=random.randint(0, int((max_date - min_date).days)),
281 | )
282 |
283 |
def create_batch_for_analysis(
    conversations_data: pd.DataFrame, audio_files: pd.DataFrame
) -> pd.DataFrame:
    """
    Join the conversations metadata with the generated audio files to build the
    batch that is sent into the analysis workflow.

    :param conversations_data: The conversations metadata, containing a
                               "text_file" column.
    :param audio_files: The audio files dataframe, keyed by "text_file".

    :returns: The joined batch without the "text_file" column, and without rows
              that are missing a matching audio file.
    """
    indexed_audio = audio_files.set_index(keys="text_file")
    batch = conversations_data.join(other=indexed_audio, on="text_file")
    batch.drop(columns="text_file", inplace=True)
    batch.dropna(inplace=True)
    return batch
293 |
294 |
295 | def _generate_id() -> str:
296 | return hashlib.md5(str(datetime.datetime.now()).encode("utf-8")).hexdigest()
297 |
--------------------------------------------------------------------------------
/src/calls_generation/skip.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from pathlib import Path
15 |
16 | import mlrun
17 | import pandas as pd
18 | import yaml
19 | from mlrun.artifacts import ArtifactSpec, DatasetArtifact
20 | from sqlalchemy import insert
21 |
22 | from src.calls_analysis.db_management import Agent, Call, Client, create_tables, DBEngine
23 |
24 |
def skip_and_import_local_data(language: str):
    """
    This function logs example data to the database and to the project.
    Call this function from the notebook in order to skip the calls generation workflow.

    :param language: The example data language prefix to use (e.g. "en" / "es") -
                     selects which files under ./data are imported.
    """
    # Get the example data directory:
    example_data_dir = Path("data")
    # Get the project:
    project = mlrun.get_current_project()

    # clean and recreate database tables:
    engine = DBEngine(mlrun.get_or_create_ctx("skip"))
    Call.__table__.drop(engine.engine)
    Client.__table__.drop(engine.engine)
    Agent.__table__.drop(engine.engine)
    create_tables()
    print("- Initialized tables")

    # log agents and clients data
    # The specs below carry the unpackaging instructions MLRun needs in order to
    # unpack each logged artifact back into its original Python object:
    json_spec = ArtifactSpec(
        unpackaging_instructions={
            "packager_name": "ListPackager",
            "object_type": "builtins.list",
            "artifact_type": "file",
            "instructions": {"file_format": "json"},
        }
    )
    zip_spec = ArtifactSpec(
        unpackaging_instructions={
            "packager_name": "StrPackager",
            "object_type": "builtins.str",
            "artifact_type": "path",
            "instructions": {"archive_format": "zip", "is_directory": "true"},
        }
    )
    parquet_spec = ArtifactSpec(
        unpackaging_instructions={
            "packager_name": "PandasDataFramePackager",
            "object_type": "pandas.core.frame.DataFrame",
            "artifact_type": "dataset",
            "instructions": {},
        }
    )
    # load agent and client data:
    agents = project.log_artifact(
        item="agent-data-generator_agents",
        spec=json_spec,
        local_path=str(example_data_dir / f"{language}_agents.json"),
        db_key="agent-data-generator_agents",
    )
    # Read the logged artifact back as a Python list (yaml parses JSON content):
    agents = agents.to_dataitem()
    agents = yaml.load(agents.get(), Loader=yaml.FullLoader)
    clients = project.log_artifact(
        item="client-data-generator_clients",
        spec=json_spec,
        local_path=str(example_data_dir / f"{language}_clients.json"),
        db_key="client-data-generator_clients",
    )
    clients = clients.to_dataitem()
    clients = yaml.load(clients.get(), Loader=yaml.FullLoader)

    # insert agent and client data to database:
    _insert_agents_and_clients_to_db(agents, clients)
    print("- agents and clients inserted")

    # log zip files
    # The audio files artifact points at the MLRun sample bucket via
    # `target_path` instead of being uploaded from the local data directory:
    remote_zip_path = mlrun.get_sample_path(f"call-demo/{language}_audio_files.zip")
    # NOTE(review): `conversations_art` and `audio_files_art` are never used
    # after logging - presumably kept for debugging; confirm before removing.
    conversations_art = project.log_artifact(
        item="conversation-generation_conversations",
        spec=zip_spec,
        local_path=str(example_data_dir / f"{language}_conversations.zip"),
        db_key="conversation-generation_conversations",
    )
    audio_files_art = project.log_artifact(
        item="text-to-audio_audio_files",
        spec=zip_spec,
        target_path=remote_zip_path,
        db_key="text-to-audio_audio_files",
    )
    # log parquet files
    calls_batch_df = pd.read_parquet(
        str(example_data_dir / f"{language}_calls_batch.parquet")
    )
    dataframe_df = pd.read_parquet(
        str(example_data_dir / f"{language}_dataframe.parquet")
    )
    ground_truths_df = pd.read_parquet(
        str(example_data_dir / f"{language}_ground_truths.parquet")
    )
    metadata_df = pd.read_parquet(
        str(example_data_dir / f"{language}_metadata.parquet")
    )

    project.log_artifact(
        item=DatasetArtifact(key="batch-creation_calls_batch", df=calls_batch_df),
        spec=parquet_spec,
        local_path=str(example_data_dir / f"{language}_calls_batch.parquet"),
    )
    project.log_artifact(
        item=DatasetArtifact(key="text-to-audio_dataframe", df=dataframe_df),
        spec=parquet_spec,
    )
    project.log_artifact(
        item=DatasetArtifact(
            key="conversation-generation_ground_truths", df=ground_truths_df
        ),
        spec=parquet_spec,
    )
    project.log_artifact(
        item=DatasetArtifact(key="conversation-generation_metadata", df=metadata_df),
        spec=parquet_spec,
    )
    print("*** first workflow skipped successfully ***")
138 |
139 |
def _insert_agents_and_clients_to_db(agents: list, clients: list):
    """
    Bulk-insert the given agent and client records into the database.

    :param agents: The agent records to insert into the `Agent` table.
    :param clients: The client records to insert into the `Client` table.
    """
    # Connect to the database through the project's engine:
    db_engine = DBEngine(mlrun.get_or_create_ctx("skip"))
    session_maker = db_engine.get_session()

    # Run both bulk inserts inside a single committed transaction:
    with session_maker.begin() as db_session:
        db_session.execute(insert(Agent), agents)
        db_session.execute(insert(Client), clients)
151 |
152 |
# TODO: change to export the actual data and not the artifacts
def save_current_example_data():
    """
    Export the calls-generation artifacts of the current project into the
    ``example_data`` directory, mirroring the repository's example data layout.
    """
    current_project = mlrun.get_current_project()
    destination_root = Path("example_data")
    destination_root.mkdir(parents=True, exist_ok=True)

    # (artifact key in the project, export path relative to `example_data`):
    artifacts_to_export = [
        ("client-data-generator_clients", "clients.zip"),
        ("agent-data-generator_agents", "agents.zip"),
        (
            "conversation-generation_conversations",
            "conversation_generation/conversations.zip",
        ),
        ("conversation-generation_metadata", "conversation_generation/metadata.zip"),
        (
            "conversation-generation_ground_truths",
            "conversation_generation/ground_truths.zip",
        ),
        ("text-to-audio_audio_files", "text_to_audio/audio_files.zip"),
        ("text-to-audio_dataframe", "text_to_audio/dataframe.zip"),
        ("batch-creation_calls_batch", "batch_creation/calls_batch.zip"),
    ]
    for artifact_key, relative_path in artifacts_to_export:
        # Make sure the destination sub-directory exists before exporting:
        (destination_root / relative_path).parent.mkdir(parents=True, exist_ok=True)
        current_project.get_artifact(artifact_key).export(f"example_data/{relative_path}")
        print(f"- exported {artifact_key} to {relative_path}")
    print("*** all artifacts exported successfully ***")
182 |
--------------------------------------------------------------------------------
/src/common.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import enum
15 |
16 |
class ProjectSecrets:
    """Names of the secrets / configuration keys used across the project."""

    # OpenAI credentials (read by the conversation generation to configure the
    # langchain chat model):
    OPENAI_API_KEY = "OPENAI_API_KEY"
    OPENAI_API_BASE = "OPENAI_API_BASE"
    # Database connection settings - presumably consumed by the DB engine; confirm:
    MYSQL_URL = "MYSQL_URL"
    MYSQL_CONNECT_ARGS = "MYSQL_CONNECT_ARGS"
    # Bucket name used when deploying the UI application sources to S3:
    S3_BUCKET_NAME = "S3_BUCKET_NAME"
23 |
24 |
class CallStatus(enum.Enum):
    """The statuses a call advances through along the analysis workflow."""

    CREATED = "Created"
    AUDIO_PROCESSED = "Audio processed"
    SPEECH_DIARIZED = "Speech diarized"
    TRANSCRIBED = "Transcribed"
    TRANSLATED = "Translated"
    ANONYMIZED = "Anonymized"
    ANALYZED = "Analyzed"
33 |
34 |
#: The possible call topics - used both when generating conversations and as the
#: closed set the analysis model classifies each call into:
TOPICS = [
    "slow internet speed",
    "billing discrepancies",
    "account login problems",
    "setting up a new device",
    "phishing or malware concerns",
    "scheduled maintenance notifications",
    "service upgrades",
    "negotiating pricing",
    "canceling service",
    "customer service feedback",
]

#: The possible client / agent tones - used for generation and tone classification:
TONES = [
    "Positive",
    "Neutral",
    "Negative",
]
53 |
--------------------------------------------------------------------------------
/src/vizro.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import tarfile
4 | from pathlib import Path
5 |
6 | import boto3
7 | import mlrun
8 | import mlrun.common.schemas
9 | import pandas as pd
10 |
11 | from src.calls_analysis.db_management import get_calls, get_clients
12 |
#: Mapping from database/dataframe column names to the display names used in the
#: Vizro dashboard. NOTE(review): "anonymized_file" maps to the lowercase
#: "text_file" rather than a display name - presumably consumed programmatically
#: by the app; confirm before changing.
COLUMNS_MAPPING = {
    "active_listening": "Active Listening",
    "agent_id": "Agent ID",
    "agent_tone": "Agent Tone",
    "date": "Call Date",
    "client_id": "Caller ID",
    "client_tone": "Client Tone",
    "concern_addressed": "Concern Addressed",
    "customization": "Customization",
    "effective_communication": "Effective Communication",
    "empathy": "Empathy",
    "kindness": "Kindness",
    "professionalism": "Professionalism",
    "summary": "Summary",
    "time": "Time",
    "topic": "Topic",
    "upsale_attempted": "Upsale Attempted",
    "upsale_success": "Upsale Success",
    "client_city": "Caller City",
    "anonymized_file": "text_file",
}
34 |
35 |
def deploy_vizro_application():
    """
    Package the Vizro dashboard sources and deploy them as the project's
    "call-center-ui" application behind an unauthenticated API gateway.

    The source archive is uploaded to S3 when the ``S3_BUCKET_NAME`` environment
    variable is set; otherwise it is served from the user's V3IO home directory.
    """
    dir_name = "vizro"

    # Prepare the dataframe for vizro:
    _prepare_vizro_source(dir_name)
    print("Application source code ready for deployment.")

    # Archive
    bucket_name = os.getenv("S3_BUCKET_NAME")
    if bucket_name:
        _upload_to_s3(dir_name)
        # Add the source code to the application
        src_path = f"s3://{bucket_name}/{dir_name}.tar.gz"
        # NOTE(review): this message is printed after the upload has already
        # completed - consider moving it before `_upload_to_s3`.
        print(f"Uploading {src_path} to {bucket_name}")
    else:
        # Set the source path to V3IO
        src_path = f'v3io:///users/{os.environ["V3IO_USERNAME"]}/{os.getcwd().replace("/User/", "")}/{dir_name}.tar.gz'
        print(f"Configuring V3IO {src_path} to UI")
    project = mlrun.get_current_project()
    app = project.get_function("call-center-ui")
    # The source archive is loaded at build time (not pulled at runtime):
    app.with_source_archive(src_path, pull_at_runtime=False)

    # Deploy the application
    app.deploy(force_build=True, create_default_api_gateway=False, with_mlrun=False)
    app.create_api_gateway(
        name="call-center-ui",
        direct_port_access=True,
        set_as_default=True,
        authentication_mode=mlrun.common.schemas.api_gateway.APIGatewayAuthenticationMode.none,
    )
    print("Application deployed successfully!")
67 |
68 |
def _prepare_vizro_source(dir_name: str):
    """
    Build the Vizro application source archive ``{dir_name}.tar.gz``.

    Merges the calls with the clients' location columns, writes the result to
    ``vizro/data.csv``, copies the "outputs" directory into the app directory
    and tars the whole application directory.

    :param dir_name: The base name of the produced archive (the app directory
                     itself is the hard-coded "vizro").
    """
    clients_df = get_clients(mlrun.get_or_create_ctx("mlrun"))
    # NOTE(review): `get_calls` is called without a context while `get_clients`
    # receives one - confirm `get_calls`' signature allows a no-argument call.
    calls_df = get_calls()
    vizro_df = pd.merge(
        calls_df,
        clients_df[["client_id", "client_city", "latitude", "longitude"]],
        on="client_id",
    )
    vizro_df = vizro_df.rename(columns=COLUMNS_MAPPING)
    vizro_df.to_csv("vizro/data.csv")

    # add text and audio files to vizro:
    shutil.copytree("outputs", "vizro/outputs", dirs_exist_ok=True)

    # Write the application code to a file
    # NOTE(review): `app_dir` duplicates `dir_name` ("vizro") - consider reusing
    # the parameter instead of the hard-coded value.
    app_dir = "vizro"

    # Create an archive of the application code
    archive_name = f"{dir_name}.tar.gz"
    with tarfile.open(archive_name, "w:gz") as tar:
        tar.add(app_dir)
90 |
91 |
def _upload_to_s3(dir_name: str):
    """
    Upload the application archive ``{dir_name}.tar.gz`` to S3.

    The bucket is taken from the ``S3_BUCKET_NAME`` environment variable - the
    same variable the caller uses to build the application's source path - with
    the bucket encoded in the project's artifact path as a fallback.

    :param dir_name: The base name of the archive (without the ".tar.gz" suffix).
    """
    # uploading db file to s3:
    s3 = boto3.client("s3")
    # Prefer the explicitly configured bucket so the upload destination matches
    # the `s3://{S3_BUCKET_NAME}/...` source path configured by the caller:
    bucket_name = os.getenv("S3_BUCKET_NAME") or Path(mlrun.mlconf.artifact_path).parts[1]

    # Upload the file
    s3.upload_file(
        Filename=f"{dir_name}.tar.gz",
        Bucket=bucket_name,
        Key=f"{dir_name}.tar.gz",
    )
103 |
--------------------------------------------------------------------------------
/src/workflows/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/src/workflows/calls_analysis.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import List
15 |
16 | import kfp
17 | import mlrun
18 | from kfp import dsl
19 |
20 | from src.common import TONES, TOPICS, CallStatus
21 |
#: The two question batches sent to the question-answering model for every call
#: (batch 1: topic / summary / concern / tones; batch 2: upsale + agent grades):
QUESTIONS = [
    [
        f"1. Classify the topic of the text from the following list (choose one): {TOPICS}",
        "2. Write a long summary of the text, focus on the topic (max 50 words).",
        "3. Was the Client's concern addressed, (choose only one) [Yes, No]?",
        f"4. Was the Client tone (choose only one, if not sure choose Neutral) {TONES}? ",
        f"5. Was the Call Center Agent tone (choose only one, if not sure choose Neutral) {TONES}?",
    ],
    [
        "1. Did the agent try to upsale the customer (choose only one) [Yes, No]? (sell any additional product or service)",
        "2. If the agent indeed try to upsale the client, did he succeed (choose only one) [Yes, No]? if he didn't try' answer No",
        "3. Rate the agent's level of empathy (The ability to understand and share the feelings of others) on a scale of 1-5.",
        "4. Rate the agent's level of professionalism (Conducting oneself in a way that is appropriate for the workplace) on a scale of 1-5.",
        "5. Rate the agent's level of kindness (The quality of being friendly, generous, and considerate) on a scale of 1-5.",
        "6. Rate the agent's level of effective communication (The ability to convey information clearly and concisely) on a scale of 1-5.",
        "7. Rate the agent's level of active listening (The process of paying attention to and understanding what someone is saying) on a scale of 1-5.",
        "8. Rate the agent's level of customization (The process of tailoring something to the specific needs or preferences of an individual) on a scale of 1-5.",
    ],
]
#: An example call used as a one-shot demonstration inside the prompt:
DEMO_CALL = (
    "Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist "
    "you today?\n"
    "Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\n"
    "Agent: I'm sorry to hear that, Lisa. I'd be happy to help you with that. Could you please provide me with your "
    "account number or phone number associated with your account?\n"
    "Customer: Of course, my account number is 123456789.\n"
    "Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears "
    "there was an error in the charges. I apologize for the inconvenience.\n"
    "Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\n"
    "Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department "
    "for investigation and correction. You should see the adjustments on your next statement.\n"
    "Customer: That sounds good, Megan. I appreciate your help.\n"
    "Agent: You're welcome, Lisa. If you have any more questions or concerns in the future, please don't hesitate to "
    "reach out. Is there anything else I can assist you with today?\n"
    "Customer: No, that's all. Thank you for your assistance, Megan.\n"
    "Agent: Not a problem, Lisa. Have a wonderful day, and we'll get this sorted out for you.\n"
    "Customer: You too! Goodbye, Megan.\n"
    "Agent: Goodbye, Lisa!"
)
#: The demonstration answers matching DEMO_CALL - one string per question batch.
#: NOTE(review): "Natural" below is presumably a typo of "Neutral" - confirm
#: before changing, since it is part of the model prompt:
DEMO_ANSWERS = [
    (
        "1. billing discrepancies\n"
        "2. The customer, contacted the call center regarding billing discrepancies on her statement. The agent, "
        "acknowledged the issue, assured The customer it would be resolved, and escalated it to the billing department for "
        "correction.\n"
        "3. Yes.\n"
        "4. Natural.\n"
        "5. positive.\n"
    ),
    ("1. No\n" "2. No\n" "3. 4\n" "4. 5\n" "5. 4\n" "6. 5\n" "7. 4\n" "8. 3"),
]
#: ChatML-style prompt prefixes (one per question batch) embedding the one-shot
#: demonstration; the trailing "{}" placeholder receives the analyzed call text:
TEXT_WRAPPER = [
    (
        f"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\n"
        f"<|im_start|>user: Given the following text:\n"
        f"{DEMO_CALL}\n"
        f"answer the questions as accurately as you can:\n"
        f"{QUESTIONS[i]}<|im_end|>\n"
        f"<|im_start|>assistant:\n"
        f"{DEMO_ANSWERS[i]}<|im_end|>\n"
        f"<|im_start|>user: Given the following text:\n"
        "{}"
    )
    for i in range(len(QUESTIONS))
]
#: Appended after the analyzed text together with the actual questions batch:
QUESTIONS_WRAPPER = (
    " answer the given questions as accurately as you can, do not write more answers the questions:\n"
    "{}<|im_end|>\n"
    "<|im_start|>assistant:\n"
)
92 |
93 |
@kfp.dsl.pipeline()
def pipeline(
    batch: str,
    calls_audio_files: str,
    transcribe_model: str,
    translate_to_english: bool,
    pii_recognition_model: str,
    pii_recognition_entities: List[str],
    pii_recognition_entity_operator_map: List[str],
    question_answering_model: str,
    batch_size: int = 2,
    auto_gptq_exllama_max_input_length: int = None,
    insert_calls_db: bool = True,
):
    """Calls-analysis workflow: diarize -> transcribe -> anonymize (PII) -> analyze.

    Each processing stage is followed by a "db-management" update step that
    advances the calls' status in the DB, so progress is queryable while the
    pipeline runs.

    Args:
        batch: The batch of calls to analyze (artifact/dataset reference).
        calls_audio_files: Path of the calls' audio files.
        transcribe_model: Model name used by the transcription step.
        translate_to_english: Whether the transcription step should translate
            the transcripts to English.
        pii_recognition_model: Model name used by the PII-recognition step.
        pii_recognition_entities: The PII entity types to recognize.
        pii_recognition_entity_operator_map: Operator applied per recognized entity.
        question_answering_model: LLM used by the question-answering analysis step.
        batch_size: Transcription batch size.
        auto_gptq_exllama_max_input_length: Max input length for auto-gptq's
            exllama kernels; only needed on GPU (may stay None on CPU).
        insert_calls_db: Whether to first insert the batch as new calls in the DB.
    """
    # Get the project:
    project = mlrun.get_current_project()
    db_management_function = project.get_function("db-management")
    # NOTE: `== True` (not `is True`) is intentional - `insert_calls_db` is a KFP
    # pipeline parameter and the comparison builds the KFP condition operand.
    with dsl.Condition(insert_calls_db == True) as insert_calls_condition:
        # Insert new calls:
        insert_calls_run = project.run_function(
            db_management_function,
            handler="insert_calls",
            name="insert-calls",
            inputs={"calls": batch},
            returns=[
                "calls_batch: dataset",
                "audio_files: file",
            ],
        )

    # Speech diarize (explicitly ordered after the optional insertion condition):
    speech_diarization_function = project.get_function("silero-vad")
    diarize_run = project.run_function(
        speech_diarization_function,
        handler="diarize",
        name="diarization",
        inputs={"data_path": calls_audio_files},
        params={
            "speaker_labels": ["Agent", "Client"],
            "verbose": True,
        },
        returns=["speech_diarization: file", "diarize_errors: file"],
    ).after(insert_calls_condition)

    # Update diarization state:
    update_calls_post_speech_diarization_run = project.run_function(
        db_management_function,
        handler="update_calls",
        name="update-calls",
        inputs={"data": batch},
        params={
            "status": CallStatus.SPEECH_DIARIZED.value,
            "table_key": "call_id",
            "data_key": "call_id",
        },
    ).after(diarize_run)

    # Transcribe (ordered implicitly by consuming the diarization output):
    transcription_function = project.get_function("transcription")
    transcribe_run = project.run_function(
        transcription_function,
        handler="transcribe",
        name="transcription",
        inputs={
            "data_path": calls_audio_files,
            "speech_diarization": diarize_run.outputs["speech_diarization"],
        },
        params={
            "model_name": transcribe_model,
            "device": "cuda",
            "use_better_transformers": True,
            "batch_size": batch_size,
            "translate_to_english": translate_to_english,
        },
        returns=[
            "transcriptions: path",
            "transcriptions_dataframe: dataset",
            "transcriptions_errors: file",
        ],
    )

    # Update transcription state:
    update_calls_post_transcription_run = project.run_function(
        db_management_function,
        handler="update_calls",
        name="update-calls-2",
        inputs={"data": transcribe_run.outputs["transcriptions_dataframe"]},
        params={
            "status": CallStatus.TRANSCRIBED.value,
            "table_key": "audio_file",
            "data_key": "audio_file",
        },
    )

    # Recognize PII (anonymize the transcriptions before analysis):
    pii_recognition_function = project.get_function("pii-recognition")
    recognize_pii_run = project.run_function(
        pii_recognition_function,
        handler="recognize_pii",
        name="pii-recognition",
        inputs={"input_path": transcribe_run.outputs["transcriptions"]},
        params={
            "model": pii_recognition_model,
            "html_key": "highlighted",
            "entities": pii_recognition_entities,
            "entity_operator_map": pii_recognition_entity_operator_map,
            # Entities scored below this confidence threshold are ignored:
            "score_threshold": 0.8,
            "is_full_report": False,
        },
        returns=[
            "anonymized_files: path",
            "anonymized_files_dataframe: dataset",
            "anonymized_files_errors: file",
            "anonymized_files_report: file",
        ],
    )

    # Update PII state:
    update_calls_post_pii_recognition_run = project.run_function(
        db_management_function,
        handler="update_calls",
        name="update-calls-3",
        inputs={"data": recognize_pii_run.outputs["anonymized_files_dataframe"]},
        params={
            "status": CallStatus.ANONYMIZED.value,
            "table_key": "transcription_file",
            "data_key": "original_file",
        },
    )

    # Question-answering (LLM analysis of the anonymized transcriptions):
    question_answering_function = project.get_function("question-answering")
    question_answering_function.with_requests(mem="20G")
    answer_questions_run = project.run_function(
        question_answering_function,
        handler="answer_questions",
        name="analysis",
        inputs={"data_path": recognize_pii_run.outputs["anonymized_files"]},
        params={
            "verbose": True,
            "model_name": question_answering_model,
            # We don't need the auto_gptq_exllama if using CPU, we do need it if using GPU
            "auto_gptq_exllama_max_input_length": auto_gptq_exllama_max_input_length,
            "device_map": "auto",
            "text_wrapper": TEXT_WRAPPER,
            "questions": QUESTIONS,
            "questions_wrapper": QUESTIONS_WRAPPER,
            # One output column per question asked:
            "questions_columns": [
                "topic",
                "summary",
                "concern_addressed",
                "client_tone",
                "agent_tone",
                "upsale_attempted",
                "upsale_success",
                "empathy",
                "professionalism",
                "kindness",
                "effective_communication",
                "active_listening",
                "customization",
            ],
            "questions_config": [
                {},
                # Second question group is polled 3 times, majority answer wins:
                {"type": "poll", "poll_count": 3, "poll_strategy": "most_common"},
            ],
            "generation_config": {
                "max_new_tokens": 250,
                "do_sample": True,
                "temperature": 0.7,
                "top_p": 0.95,
                "top_k": 40,
                "repetition_penalty": 1.1,
            },
            "batch_size": 1,
            "model_kwargs": {},
        },
        returns=[
            "question_answering_dataframe: dataset",
            "question_answering_errors: file",
        ],
    )

    # Postprocess answers (normalize the raw LLM answers):
    postprocessing_function = project.get_function("postprocessing")
    postprocess_answers_run = project.run_function(
        postprocessing_function,
        handler="postprocess_answers",
        name="answers-postprocessing",
        inputs={
            "answers": answer_questions_run.outputs["question_answering_dataframe"]
        },
        returns=["processed_answers: dataset"],
    )

    # Update question answering state:
    update_calls_post_question_answering_run = project.run_function(
        db_management_function,
        handler="update_calls",
        name="update-calls-4",
        inputs={"data": postprocess_answers_run.outputs["processed_answers"]},
        params={
            "status": CallStatus.ANALYZED.value,
            "table_key": "anonymized_file",
            "data_key": "text_file",
        },
    )
301 |
--------------------------------------------------------------------------------
/src/workflows/calls_generation.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Iguazio
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import List
15 |
16 | import kfp
17 | import mlrun
18 | from kfp import dsl
19 |
20 |
@kfp.dsl.pipeline()
def pipeline(
    amount: int,
    generation_model: str,
    tts_model: str,
    language: str,
    available_voices: List[str],
    min_time: int,
    max_time: int,
    from_date: str,
    to_date: str,
    from_time: str,
    to_time: str,
    num_clients: int,
    num_agents: int,
    generate_clients_and_agents: bool = True,
):
    """Calls-generation workflow: synthesize clients/agents, conversations and audio.

    Optionally generates client and agent records via an LLM structured-data
    generator and stores them in the DB, then generates conversation texts,
    converts them to multi-speaker audio, and assembles the batch consumed by
    the calls-analysis workflow.

    Args:
        amount: Number of conversations to generate.
        generation_model: LLM used for structured-data and conversation generation.
        tts_model: Text-to-speech model used to synthesize the audio.
        language: Language of the generated names and conversations.
        available_voices: Voice names the TTS step may assign to speakers.
        min_time: Minimum call duration (unit as expected by the
            conversations-generator function).
        max_time: Maximum call duration (same unit as `min_time`).
        from_date: Earliest call date to sample.
        to_date: Latest call date to sample.
        from_time: Earliest call time of day to sample.
        to_time: Latest call time of day to sample.
        num_clients: Number of client records to generate.
        num_agents: Number of agent records to generate.
        generate_clients_and_agents: Whether to generate and insert new client
            and agent records before generating conversations.
    """
    # Get the project:
    project = mlrun.get_current_project()

    # NOTE: `== True` (not `is True`) is intentional - this is a KFP pipeline
    # parameter and the comparison builds the KFP condition operand.
    with dsl.Condition(generate_clients_and_agents == True) as generate_data_condition:
        # Generate client data:
        client_data_generator_function = project.get_function(
            "structured_data_generator"
        )
        client_data_run = project.run_function(
            client_data_generator_function,
            handler="generate_data",
            name="client-data-generator",
            params={
                "amount": num_clients,
                "model_name": generation_model,
                "language": language,
                # Free-text field specs interpreted by the generator's LLM:
                "fields": [
                    f"first_name: in {language}, no special characters",
                    f"last_name: in {language}, no special characters",
                    "phone_number",
                    "email",
                    "client_id: no leading zeros",
                    "client_city: Enter city, state in the US (e.g., Austin, TX), Not only Texas",
                    "latitude: That correspond to the city",
                    "longitude: That correspond to the city",
                ],
            },
            returns=["clients: file"],
        )

        # Insert client data to database
        db_management_function = project.get_function("db-management")
        project.run_function(
            db_management_function,
            handler="insert_clients",
            name="insert-clients",
            inputs={
                "clients": client_data_run.outputs["clients"],
            },
        )

        # Generate agent data:
        agent_data_generator_function = project.get_function(
            "structured_data_generator"
        )
        agent_data_run = project.run_function(
            agent_data_generator_function,
            handler="generate_data",
            name="agent-data-generator",
            params={
                "amount": num_agents,
                "model_name": generation_model,
                "language": language,
                "fields": [
                    f"first_name: in {language}, no special characters",
                    f"last_name: in {language}, no special characters",
                    "agent_id: no leading zeros",
                ],
            },
            returns=["agents: file"],
        )

        # Insert agent data to database
        db_management_function = project.get_function("db-management")
        project.run_function(
            db_management_function,
            handler="insert_agents",
            name="insert-agents",
            inputs={
                "agents": agent_data_run.outputs["agents"],
            },
        )

    # Get agents from database (ordered after the optional generation condition):
    db_management_function = project.get_function("db-management")
    get_agents_run = project.run_function(
        db_management_function,
        handler="get_agents",
        name="get-agents",
        returns=["agents: file"],
    ).after(generate_data_condition)

    # Get clients from database
    db_management_function = project.get_function("db-management")
    get_clients_run = project.run_function(
        db_management_function,
        handler="get_clients",
        name="get-clients",
        returns=["clients: file"],
    ).after(generate_data_condition)

    # Generate conversations texts:
    conversations_generator_function = project.get_function("conversations-generator")
    generate_conversations_run = project.run_function(
        conversations_generator_function,
        handler="generate_conversations",
        name="conversation-generation",
        params={
            "amount": amount,
            "model_name": generation_model,
            "language": language,
            "min_time": min_time,
            "max_time": max_time,
            "from_date": from_date,
            "to_date": to_date,
            "from_time": from_time,
            "to_time": to_time,
        },
        inputs={
            "agent_data": get_agents_run.outputs["agents"],
            "client_data": get_clients_run.outputs["clients"],
        },
        returns=[
            "conversations: path",
            "metadata: dataset",
            "ground_truths: dataset",
        ],
    )

    # Text to audio (two speakers mapped to audio channels 0/1):
    text_to_audio_generator_function = project.get_function("text-to-audio-generator")
    generate_multi_speakers_audio_run = project.run_function(
        text_to_audio_generator_function,
        handler="generate_multi_speakers_audio",
        name="text-to-audio",
        inputs={"data_path": generate_conversations_run.outputs["conversations"]},
        params={
            "speakers": {"Agent": 0, "Client": 1},
            "available_voices": available_voices,
            "model": tts_model,
            "speed": 1,
        },
        returns=[
            "audio_files: path",
            "dataframe: dataset",
            "errors: file",
        ],
    )

    # Create the input batch (joins conversation metadata with the audio files):
    create_batch_for_analysis_run = project.run_function(
        conversations_generator_function,
        handler="create_batch_for_analysis",
        name="batch-creation",
        inputs={
            "conversations_data": generate_conversations_run.outputs["metadata"],
            "audio_files": generate_multi_speakers_audio_run.outputs["dataframe"],
        },
        returns=["calls_batch: dataset"],
    )
188 |
--------------------------------------------------------------------------------
/vizro/app.py:
--------------------------------------------------------------------------------
1 | """Main app entry point for Vizro dashboard."""
2 |
3 | # DEFINE IMPORTS
4 | import pandas as pd
5 | from custom_charts import (
6 | plot_bar_concerns,
7 | plot_bar_quality,
8 | plot_bar_upsales,
9 | plot_box_communication,
10 | plot_butterfly_upsales_concerns,
11 | plot_donut_concerns,
12 | plot_donut_upsales,
13 | plot_line_calls_over_time,
14 | plot_map_call_locations,
15 | plot_radar_quality,
16 | )
17 | from custom_components import Audio, make_tabs_with_title, update_from_selected_row
18 | from dash import html
19 |
20 | import vizro.models as vm
21 | from vizro import Vizro
22 | from vizro.figures import kpi_card, kpi_card_reference
23 | from vizro.tables import dash_ag_grid
24 |
25 | # DEFINE CONSTANTS
26 | MIN_ROW_HEIGHT = 420
27 | CONCERN_LABELS = ["Concerns Not Addressed", "Concerns Addressed"]
28 |
29 |
def px(val: int) -> str:
    """Format *val* as a CSS pixel string, truncating to an integer (420 -> "420px")."""
    return "%dpx" % int(val)
33 |
34 |
# DEFINE DATA
DATA_PATH = "/home/mlrun_code/vizro/data.csv"
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # BUGFIX: the old message named 'fake_data.csv' while the code reads
    # 'data.csv' - report the path actually attempted, and chain the cause.
    raise RuntimeError(f"The data file '{DATA_PATH}' was not found.") from err
df["Call Date"] = pd.to_datetime(df["Call Date"])
# Static targets rendered by the KPI reference cards below.
df["Upsale Success Reference"] = 0.25
df["Concern Reference"] = 0.50
43 |
# DEFINE DASHBOARD
# Top KPI strip: five cards in one row (two target-reference cards + three counts).
kpi_container = vm.Container(
    layout=vm.Grid(grid=[[0, 1, 2, 3, 4]], row_gap="0px", col_gap="20px"),
    components=[
        # Mean upsale success rate vs. the 25% static target column.
        vm.Figure(
            figure=kpi_card_reference(
                data_frame=df,
                value_column="Upsale Success",
                reference_column="Upsale Success Reference",
                title="Upsale Success",
                value_format="{value:.0%}",
                reference_format="{delta_relative:+.1%} vs. target",
                icon="more_up",
                agg_func="mean",
            )
        ),
        # Mean of concerns addressed vs. the 50% static target column.
        vm.Figure(
            figure=kpi_card_reference(
                data_frame=df,
                value_column="Concern Addressed",
                reference_column="Concern Reference",
                title="Concerns Addressed",
                value_format="{value:.0%}",
                reference_format="{delta_relative:+.1%} vs. target",
                agg_func="mean",
                icon="recommend",
            )
        ),
        # Total number of call records.
        vm.Figure(
            figure=kpi_card(
                data_frame=df,
                agg_func="count",
                value_column="Caller ID",
                title="Number of Calls",
                icon="call",
            )
        ),
        # Distinct agents present in the data.
        vm.Figure(
            figure=kpi_card(
                data_frame=df,
                agg_func="nunique",
                value_column="Agent ID",
                title="Number of Agents",
                icon="support_agent",
            )
        ),
        # Distinct callers present in the data.
        vm.Figure(
            figure=kpi_card(
                data_frame=df,
                agg_func="nunique",
                value_column="Caller ID",
                title="Number of Callers",
                icon="person",
            )
        ),
    ],
)
101 |
# Summary row: two stacked half-height charts on the left, full-height map on the right.
call_summary_container = vm.Container(
    title="Calls Summary",
    layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT), row_gap="0px"),
    components=[
        # Left column: call volume over time + upsales/concerns butterfly chart.
        vm.Container(
            title="",
            layout=vm.Grid(
                grid=[[0], [1]], row_min_height=px(MIN_ROW_HEIGHT // 2), row_gap="0px"
            ),
            components=[
                vm.Graph(
                    title="Calls over time",
                    figure=plot_line_calls_over_time(df),
                ),
                vm.Graph(
                    title="Upsales and Concerns Addressed",
                    figure=plot_butterfly_upsales_concerns(df),
                ),
            ],
            variant="filled",
        ),
        # Right column: geographic distribution of the calls.
        vm.Container(
            title="",
            layout=vm.Grid(
                grid=[[0]], row_min_height=px(MIN_ROW_HEIGHT), row_gap="0px"
            ),
            components=[
                vm.Graph(
                    title="Call Locations",
                    header="Showing actual number of calls per city",
                    figure=plot_map_call_locations(df),
                )
            ],
            variant="filled",
        ),
    ],
)
139 |
# "Upsales" section: tabs for percentage (donut) vs. absolute (bar) views,
# each showing an overall average next to a per-agent comparison.
upsales_container = make_tabs_with_title(
    title="Upsales",
    tabs=[
        vm.Container(
            title="Percentage",
            layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)),
            components=[
                vm.Graph(
                    title="Average Across Agents",
                    header="Showing percentage of calls",
                    figure=plot_donut_upsales(
                        data_frame=df,
                        group_column="Agent ID",
                        mode="average",
                    ),
                ),
                vm.Graph(
                    title="Per Agent",
                    header="Showing percentage of calls",
                    figure=plot_donut_upsales(
                        data_frame=df,
                        group_column="Agent ID",
                        mode="comparison",
                    ),
                    footer="(The Agent ID is shown inside each donut)",
                ),
            ],
        ),
        vm.Container(
            title="Absolute",
            layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)),
            components=[
                vm.Graph(
                    title="Average Across Agents",
                    header="Showing actual number of calls",
                    figure=plot_bar_upsales(
                        data_frame=df,
                        group_column="Agent ID",
                        mode="average",
                    ),
                ),
                vm.Graph(
                    title="Per Agent",
                    header="Showing actual number of calls",
                    figure=plot_bar_upsales(
                        data_frame=df,
                        group_column="Agent ID",
                        mode="comparison",
                    ),
                ),
            ],
        ),
    ],
)
194 |
# "Concerns" section: same tab structure as the upsales section, driven by the
# boolean "Concern Addressed" column and the CONCERN_LABELS pair.
concerns_container = make_tabs_with_title(
    title="Concerns",
    tabs=[
        vm.Container(
            title="Percentage",
            layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)),
            components=[
                vm.Graph(
                    title="Average Across Agents",
                    header="Showing percentage of calls",
                    figure=plot_donut_concerns(
                        data_frame=df,
                        group_column="Agent ID",
                        count_column="Concern Addressed",
                        label_names=CONCERN_LABELS,
                        mode="average",
                    ),
                ),
                vm.Graph(
                    title="Per Agent",
                    header="Showing percentage of calls",
                    figure=plot_donut_concerns(
                        data_frame=df,
                        group_column="Agent ID",
                        count_column="Concern Addressed",
                        label_names=CONCERN_LABELS,
                        mode="comparison",
                    ),
                    footer="(The Agent ID is shown inside each donut)",
                ),
            ],
        ),
        vm.Container(
            title="Absolute",
            layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)),
            components=[
                vm.Graph(
                    title="Average Across Agents",
                    header="Showing actual number of calls",
                    figure=plot_bar_concerns(
                        data_frame=df,
                        group_column="Agent ID",
                        mode="average",
                    ),
                ),
                vm.Graph(
                    title="Per Agent",
                    header="Showing actual number of calls",
                    figure=plot_bar_concerns(
                        data_frame=df,
                        group_column="Agent ID",
                        mode="comparison",
                    ),
                ),
            ],
        ),
    ],
)
253 |
# "Quality Scores" section: radar view ("Absolute" tab) and bar view ("Comparison" tab).
quality_scores_container = make_tabs_with_title(
    title="Quality Scores",
    tabs=[
        vm.Container(
            title="Absolute",
            layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)),
            components=[
                vm.Graph(
                    title="Average Across Agents",
                    header="Showing actual score",
                    figure=plot_radar_quality(df, "average"),
                ),
                vm.Graph(
                    title="Per Agent",
                    header="Showing actual score",
                    figure=plot_radar_quality(df, "comparison"),
                    footer="(View the tooltips to see the Agent ID)",
                ),
            ],
        ),
        vm.Container(
            title="Comparison",
            layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)),
            components=[
                vm.Graph(
                    title="Average Across Agents",
                    header="Showing actual score",
                    figure=plot_bar_quality(df, "average"),
                ),
                vm.Graph(
                    title="Per Agent",
                    header="Showing actual score",
                    figure=plot_bar_quality(df, "comparison"),
                ),
            ],
        ),
    ],
)
292 |
# "Effective Communication" section: overall vs. per-agent score distribution charts.
effective_communication_container = vm.Container(
    title="Effective Communication",
    layout=vm.Grid(grid=[[0, 1]], row_min_height=px(MIN_ROW_HEIGHT)),
    collapsed=False,
    components=[
        vm.Graph(
            title="Average Across Agents",
            header="Showing actual score",
            figure=plot_box_communication(data_frame=df, mode="average"),
        ),
        vm.Graph(
            title="Per Agent",
            header="Showing actual score",
            figure=plot_box_communication(data_frame=df, mode="comparison"),
        ),
    ],
    variant="filled",
)
311 |
# Transcript browser: a single-select grid on top; selecting a row pushes that
# call's transcript text and audio source into the card/player below it.
transcripts_and_audio_container = vm.Container(
    title="Call transcripts",
    layout=vm.Flex(gap="40px"),
    components=[
        vm.AgGrid(
            id="outer_grid",
            figure=dash_ag_grid(
                id="inner_grid",
                data_frame=df[
                    [
                        "Agent ID",
                        "Caller ID",
                        "Topic",
                        "Summary",
                        # These two file columns are hidden via columnState below;
                        # they only feed the row-selection action.
                        "audio_file",
                        "text_file",
                    ]
                ],
                dashGridOptions={
                    "rowSelection": "single",
                    "suppressRowDeselection": True,
                },
                columnState=[
                    {"colId": "audio_file", "hide": True},
                    {"colId": "text_file", "hide": True},
                ],
                columnSize="responsiveSizeToFit",
            ),
            actions=[
                # On row selection, update the transcript card's text and the
                # audio player's source from the selected row's hidden columns.
                vm.Action(
                    function=update_from_selected_row(),
                    inputs=["inner_grid.selectedRows"],
                    outputs=["transcript.children", "audio.src"],
                )
            ],
        ),
        vm.Container(
            layout=vm.Grid(grid=[[0, 0, 1]]),
            components=[
                vm.Card(
                    id="transcript",
                    text="Select a row from the above table to see a transcript",
                    extra={"style": {"height": "450px"}},
                ),
                Audio(id="audio"),
            ],
        ),
    ],
)
361 |
# Page 1: aggregated KPIs and charts, with sidebar filters applied to all of them.
call_center_summary_page = vm.Page(
    title="Call Center Summary",
    layout=vm.Flex(gap="20px"),
    components=[
        kpi_container,
        call_summary_container,
        upsales_container,
        concerns_container,
        quality_scores_container,
        effective_communication_container,
    ],
    controls=[
        vm.Filter(column="Agent ID", selector=vm.Dropdown(title="Agent ID")),
        vm.Filter(column="Caller ID", selector=vm.Dropdown(title="Caller ID")),
        vm.Filter(column="Client Tone"),
        vm.Filter(
            column="Effective Communication",
            selector=vm.RangeSlider(title="Effective Communication Score", step=1),
        ),
        vm.Filter(column="Caller City", selector=vm.Dropdown(title="Caller City")),
    ],
)

# Page 2: per-call transcript browser with audio playback.
call_transcripts_page = vm.Page(
    title="Call Transcripts",
    components=[transcripts_and_audio_container],
    controls=[
        vm.Filter(column="Agent ID", selector=vm.Dropdown(title="Agent ID")),
        vm.Filter(column="Caller ID", selector=vm.Dropdown(title="Caller ID")),
    ],
)

dashboard = vm.Dashboard(pages=[call_center_summary_page, call_transcripts_page])

# Build the Vizro/Dash application object.
app = Vizro().build(dashboard)

if __name__ == "__main__":
    app.run()
400 |
--------------------------------------------------------------------------------
/vizro/assets/vizro_dashboard_styles.css:
--------------------------------------------------------------------------------
/* Flatten the native audio player chrome and match the dashboard card background. */
audio::-webkit-media-controls-panel, audio::-webkit-media-controls-enclosure {
    border-radius: 0;
    background: var(--surfaces-bg-card);
}

/* Let the transcripts grid size itself instead of the framework's fixed width. */
#outer_grid {
    width: unset;
}

/* Reset line-height inside the transcript card so paragraph text reads normally. */
#transcript {
    line-height: unset;
}
--------------------------------------------------------------------------------
/vizro/custom_charts.py:
--------------------------------------------------------------------------------
1 | """Custom charts for Vizro dashboard.
2 | """
3 |
4 | import math
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import plotly.graph_objects as go
9 | from plotly.subplots import make_subplots
10 |
11 | import vizro.plotly.express as px
12 | from vizro.models.types import capture
13 |
14 | CONCERN_LABELS = ["Concerns Not Addressed", "Concerns Addressed"]
15 | UPSALE_LABELS = ["Failed Upsales", "No Upsale Attempted", "Successful Upsales"]
16 |
17 |
@capture("graph")
def plot_donut_concerns(
    data_frame: pd.DataFrame,
    group_column: str,
    count_column: str,
    label_names: list[str],
    mode: str,
) -> go.Figure:
    """Create a donut chart for concerns addressed, by agent or average.

    Args:
        data_frame (pd.DataFrame): Input data containing agent and concern columns.
        group_column (str): Column name for grouping (e.g., agent ID).
        count_column (str): Column name for concern addressed (boolean).
        label_names (list[str]): Two labels ordered [False label, True label].
            NOTE(review): they must be exactly "Concerns Not Addressed" /
            "Concerns Addressed" to match the color map below - confirm callers.
        mode (str): 'comparison' for one donut per agent, 'average' for a single
            donut over the whole frame.

    Returns:
        go.Figure: Plotly Figure object representing the donut chart(s).

    Raises:
        ValueError: If ``mode`` is neither 'comparison' nor 'average'.
    """
    color_discrete_map = {
        "Concerns Addressed": "#00b4ff",
        "Concerns Not Addressed": "#ff9222",
    }
    # Plotly hovertemplates use HTML "<br>" for line breaks.
    hovertemplate = "Category: %{label}<br>Count: %{value}<br>Percent: %{percent}"

    def _chart_data(frame: pd.DataFrame) -> pd.DataFrame:
        """Build the (Labels, Counts) frame for one donut, sorted by label."""
        counts = frame[count_column].value_counts()
        chart_data = pd.DataFrame(
            {
                "Labels": label_names,
                # label_names[0] counts False values, label_names[1] counts True.
                "Counts": [counts.get(False, 0), counts.get(True, 0)],
            }
        )
        return chart_data.sort_values(by="Labels", ascending=True)

    if mode == "comparison":
        agent_list = data_frame[group_column].unique().tolist()
        num_cols = 4
        num_rows = math.ceil(len(agent_list) / num_cols)

        fig = make_subplots(
            rows=num_rows,
            cols=num_cols,
            subplot_titles=None,
            horizontal_spacing=0.08,
            vertical_spacing=0.02,
            specs=[[{"type": "pie"}] * num_cols for _ in range(num_rows)],
        )

        for i, agent in enumerate(agent_list):
            chart_data = _chart_data(data_frame[data_frame[group_column] == agent])
            fig.add_trace(
                go.Pie(
                    labels=chart_data["Labels"],
                    values=chart_data["Counts"],
                    hole=0.6,
                    # The agent ID is rendered inside the donut hole.
                    title=str(agent),
                    marker=dict(
                        colors=[
                            color_discrete_map[label]
                            for label in chart_data["Labels"]
                        ]
                    ),
                    sort=False,
                    hovertemplate=hovertemplate,
                ),
                row=i // num_cols + 1,
                col=i % num_cols + 1,
            )

        # Slice labels would be unreadable in a grid of small donuts: hide all
        # per-slice text and rely on the hover tooltips instead.
        fig.update_traces(textposition="outside", textinfo="none", opacity=0.9)
        fig.update_layout(
            margin_t=0, margin_b=0, margin_l=0, margin_r=0, showlegend=False
        )
    elif mode == "average":
        chart_data = _chart_data(data_frame)

        fig = go.Figure()
        fig.add_trace(
            go.Pie(
                labels=chart_data["Labels"],
                values=chart_data["Counts"],
                hole=0.6,
                marker=dict(
                    colors=[
                        color_discrete_map[label] for label in chart_data["Labels"]
                    ]
                ),
                sort=False,
                hovertemplate=hovertemplate,
            )
        )

        fig.update_layout(margin_t=0, margin_b=0, margin_l=0, margin_r=0)
        fig.update_traces(textposition="outside", textinfo="percent", opacity=0.9)
    else:
        # Previously an unknown mode fell through to `return fig` and crashed
        # with an UnboundLocalError; fail with a clear message instead.
        raise ValueError(f"mode must be 'comparison' or 'average', got {mode!r}")

    return fig
145 |
146 |
@capture("graph")
def plot_donut_upsales(
    data_frame: pd.DataFrame,
    group_column: str,
    mode: str,
) -> go.Figure:
    """Create a donut chart for upsales outcomes, by agent or average.

    Args:
        data_frame (pd.DataFrame): Input data containing agent and the boolean
            "Upsale Attempted" / "Upsale Success" columns.
        group_column (str): Column name for grouping (e.g., agent ID).
        mode (str): 'comparison' for one donut per agent, 'average' for a single
            donut over the whole frame.

    Returns:
        go.Figure: Plotly Figure object representing the donut chart(s).

    Raises:
        ValueError: If ``mode`` is neither 'comparison' nor 'average'.
    """
    color_discrete_map = {
        "Failed Upsales": "#FF9222",
        "No Upsale Attempted": "#3949AB",
        "Successful Upsales": "#00B4FF",
    }
    # Fixed, alphabetical label order shared by both modes.
    labels = ["Failed Upsales", "No Upsale Attempted", "Successful Upsales"]
    # Plotly hovertemplates use HTML "<br>" for line breaks.
    hovertemplate = "Category: %{label}<br>Count: %{value}<br>Percent: %{percent}"

    def _categorize(row: pd.Series) -> str:
        """Map one call row to its upsale outcome category."""
        if not row["Upsale Attempted"]:
            return "No Upsale Attempted"
        if row["Upsale Success"]:
            return "Successful Upsales"
        return "Failed Upsales"

    def _chart_data(frame: pd.DataFrame) -> pd.DataFrame:
        """Count calls per outcome category, in the fixed ``labels`` order."""
        if len(frame):
            counts = frame.apply(_categorize, axis=1).value_counts()
        else:
            counts = pd.Series(dtype=int)
        return pd.DataFrame(
            {
                "Labels": labels,
                "Counts": [int(counts.get(label, 0)) for label in labels],
            }
        )

    if mode == "comparison":
        agent_list = data_frame[group_column].unique().tolist()
        num_cols = 4
        num_rows = math.ceil(len(agent_list) / num_cols)

        fig = make_subplots(
            rows=num_rows,
            cols=num_cols,
            subplot_titles=None,
            horizontal_spacing=0.08,
            vertical_spacing=0.02,
            specs=[[{"type": "pie"}] * num_cols for _ in range(num_rows)],
        )

        for i, agent in enumerate(agent_list):
            # BUGFIX: the previous implementation grouped by the two boolean
            # columns first and then took value_counts() of the *grouped* frame,
            # which counted distinct (attempted, success) combinations per
            # category (always 0-2) instead of the actual number of calls.
            # Categorizing per row and counting rows gives the real call counts,
            # matching what the 'average' mode computed.
            chart_data = _chart_data(data_frame[data_frame[group_column] == agent])
            fig.add_trace(
                go.Pie(
                    labels=chart_data["Labels"],
                    values=chart_data["Counts"],
                    hole=0.6,
                    # The agent ID is rendered inside the donut hole.
                    title=str(agent),
                    marker=dict(
                        colors=[
                            color_discrete_map[label]
                            for label in chart_data["Labels"]
                        ]
                    ),
                    sort=False,
                    hovertemplate=hovertemplate,
                ),
                row=i // num_cols + 1,
                col=i % num_cols + 1,
            )

        # Hide per-slice text in the grid of small donuts; tooltips carry the detail.
        fig.update_traces(textposition="outside", textinfo="none", opacity=0.9)
        fig.update_layout(
            margin_t=0, margin_b=0, margin_l=0, margin_r=0, showlegend=False
        )
    elif mode == "average":
        chart_data = _chart_data(data_frame)

        fig = go.Figure()
        fig.add_trace(
            go.Pie(
                labels=chart_data["Labels"],
                values=chart_data["Counts"],
                hole=0.6,
                marker=dict(
                    colors=[
                        color_discrete_map[label] for label in chart_data["Labels"]
                    ]
                ),
                sort=False,
                hovertemplate=hovertemplate,
            )
        )

        fig.update_layout(
            margin_t=0, margin_b=0, margin_l=0, margin_r=0, legend_traceorder="reversed"
        )
        fig.update_traces(textposition="outside", textinfo="percent", opacity=0.9)
    else:
        # Previously an unknown mode fell through to `return fig` and crashed
        # with an UnboundLocalError; fail with a clear message instead.
        raise ValueError(f"mode must be 'comparison' or 'average', got {mode!r}")

    return fig
334 |
335 |
336 | @capture("graph")
337 | def plot_bar_concerns(
338 | data_frame: pd.DataFrame,
339 | group_column: str,
340 | mode: str,
341 | ) -> go.Figure:
342 | """Create a bar chart for concerns addressed, by agent or average.
343 |
344 | Args:
345 | data_frame (pd.DataFrame): Input data containing agent and concern columns.
346 | group_column (str): Column name for grouping (e.g., agent ID).
347 | mode (str): 'comparison' for agent subplots, 'average' for overall.
348 |
349 | Returns:
350 | go.Figure: Plotly Figure object representing the bar chart(s).
351 | """
352 | color_discrete_map = {
353 | "Concerns Addressed": "#00b4ff",
354 | "Concerns Not Addressed": "#ff9222",
355 | }
356 |
357 | if mode == "comparison":
358 |
359 | data = pd.DataFrame()
360 |
361 | agent_list = data_frame[group_column].unique().tolist()
362 |
363 | for i in range(0, len(agent_list)):
364 |
365 | chart_data = data_frame.copy()
366 | chart_data = chart_data[chart_data[group_column] == agent_list[i]]
367 | chart_data["Concern Addressed"] = chart_data["Concern Addressed"].replace(
368 | {True: "Concerns Addressed", False: "Concerns Not Addressed"}
369 | )
370 |
371 | outcomes = (
372 | chart_data.groupby(["Concern Addressed"])
373 | .size()
374 | .reset_index(name="counts")
375 | )
376 |
377 | category_counts = (
378 | outcomes.groupby("Concern Addressed")["counts"].sum().reset_index()
379 | )
380 |
381 | category_counts["agent_id"] = i
382 |
383 | data = pd.concat([data, category_counts])
384 |
385 | fig = px.bar(
386 | data,
387 | x="agent_id",
388 | y="counts",
389 | color="Concern Addressed",
390 | title="",
391 | color_discrete_map=color_discrete_map,
392 | category_orders={
393 | "category": [
394 | "Concerns Not Addressed",
395 | "Concerns Addressed",
396 | ]
397 | },
398 | )
399 |
400 | fig.update_layout(
401 | showlegend=False,
402 | xaxis=dict(
403 | tickmode="array",
404 | tickvals=list(range(0, len(agent_list))),
405 | ticktext=agent_list,
406 | ),
407 | xaxis_title="Agent ID",
408 | yaxis_title=None,
409 | )
410 | fig.update_traces(
411 | hovertemplate="Category: %{fullData.name}
Count: %{y}"
412 | )
413 |
414 | if mode == "average":
415 |
416 | chart_data = data_frame.copy()
417 | chart_data["Concern Addressed"] = chart_data["Concern Addressed"].replace(
418 | {True: "Concerns Addressed", False: "Concerns Not Addressed"}
419 | )
420 |
421 | outcomes = (
422 | chart_data.groupby(["Concern Addressed"]).size().reset_index(name="counts")
423 | )
424 |
425 | category_counts = (
426 | outcomes.groupby("Concern Addressed")["counts"].sum().reset_index()
427 | )
428 | category_counts["PLACEHOLDER"] = 1
429 |
430 | fig = px.bar(
431 | category_counts,
432 | y="PLACEHOLDER",
433 | x="counts",
434 | color="Concern Addressed",
435 | title="",
436 | orientation="h",
437 | text="counts",
438 | color_discrete_map=color_discrete_map,
439 | category_orders={
440 | "category": [
441 | "Concerns Not Addressed",
442 | "Concerns Addressed",
443 | ]
444 | },
445 | )
446 |
447 | fig.update_layout(
448 | xaxis=dict(visible=False),
449 | yaxis=dict(visible=False),
450 | showlegend=True,
451 | legend_title=None,
452 | margin=dict(t=60),
453 | )
454 |
455 | fig.update_traces(
456 | textposition="inside",
457 | insidetextanchor="middle",
458 | width=0.2,
459 | hovertemplate="Category: %{fullData.name}
Count: %{x}",
460 | )
461 |
462 | return fig
463 |
464 |
465 | @capture("graph")
466 | def plot_bar_upsales(
467 | data_frame: pd.DataFrame,
468 | group_column: str,
469 | mode: str,
470 | ) -> go.Figure:
471 | """Create a bar chart for upsales outcomes, by agent or average.
472 |
473 | Args:
474 | data_frame (pd.DataFrame): Input data containing agent and upsale columns.
475 | group_column (str): Column name for grouping (e.g., agent ID).
476 | mode (str): 'comparison' for agent subplots, 'average' for overall.
477 |
478 | Returns:
479 | go.Figure: Plotly Figure object representing the bar chart(s).
480 | """
481 | color_discrete_map = {
482 | "Failed Upsales": "#FF9222",
483 | "No Upsale Attempted": "#3949AB",
484 | "Successful Upsales": "#00B4FF",
485 | }
486 |
487 | if mode == "comparison":
488 |
489 | data = pd.DataFrame()
490 |
491 | agent_list = data_frame[group_column].unique().tolist()
492 |
493 | for i in range(0, len(agent_list)):
494 |
495 | chart_data = data_frame.copy()
496 | chart_data = chart_data[chart_data[group_column] == agent_list[i]]
497 | upsale_outcomes = (
498 | chart_data.groupby(["Upsale Attempted", "Upsale Success"])
499 | .size()
500 | .reset_index(name="counts")
501 | )
502 |
503 | def categorize(row: pd.Series) -> str:
504 | """Categorize upsale outcome for a row.
505 |
506 | Args:
507 | row (pd.Series): Row of DataFrame with 'Upsale Attempted' and 'Upsale Success'.
508 | Returns:
509 | str: Category label for the upsale outcome.
510 | """
511 | if not row["Upsale Attempted"]:
512 | return "No Upsale Attempted"
513 | elif row["Upsale Success"]:
514 | return "Successful Upsales"
515 | else:
516 | return "Failed Upsales"
517 |
518 | upsale_outcomes["category"] = upsale_outcomes.apply(categorize, axis=1)
519 | category_counts = (
520 | upsale_outcomes.groupby("category")["counts"].sum().reset_index()
521 | )
522 | category_counts["agent_id"] = i
523 |
524 | data = pd.concat([data, category_counts])
525 |
526 | fig = px.bar(
527 | data,
528 | x="agent_id",
529 | y="counts",
530 | color="category",
531 | title="",
532 | color_discrete_map=color_discrete_map,
533 | category_orders={
534 | "category": [
535 | "Successful Upsales",
536 | "No Upsale Attempted",
537 | "Failed Upsales",
538 | ]
539 | },
540 | )
541 |
542 | fig.update_traces(
543 | hovertemplate="Category: %{fullData.name}
Count: %{y}"
544 | )
545 |
546 | if mode == "average":
547 |
548 | upsale_outcomes = (
549 | data_frame.groupby(["Upsale Attempted", "Upsale Success"])
550 | .size()
551 | .reset_index(name="counts")
552 | )
553 |
554 | def categorize(row: pd.Series) -> str:
555 | """Categorize upsale outcome for a row.
556 |
557 | Args:
558 | row (pd.Series): Row of DataFrame with 'Upsale Attempted' and 'Upsale Success'.
559 | Returns:
560 | str: Category label for the upsale outcome.
561 | """
562 | if not row["Upsale Attempted"]:
563 | return "No Upsale Attempted"
564 | elif row["Upsale Success"]:
565 | return "Successful Upsales"
566 | else:
567 | return "Failed Upsales"
568 |
569 | upsale_outcomes["category"] = upsale_outcomes.apply(categorize, axis=1)
570 | category_counts = (
571 | upsale_outcomes.groupby("category")["counts"].sum().reset_index()
572 | )
573 | category_counts["PLACEHOLDER"] = 1
574 |
575 | fig = px.bar(
576 | category_counts,
577 | y="PLACEHOLDER",
578 | x="counts",
579 | color="category",
580 | title="",
581 | orientation="h",
582 | text="counts",
583 | color_discrete_map=color_discrete_map,
584 | category_orders={
585 | "category": [
586 | "Successful Upsales",
587 | "No Upsale Attempted",
588 | "Failed Upsales",
589 | ]
590 | },
591 | )
592 |
593 | fig.update_traces(
594 | hovertemplate="Category: %{fullData.name}
Count: %{y}",
595 | textposition="inside",
596 | insidetextanchor="middle",
597 | width=0.2,
598 | )
599 |
600 | fig.update_layout(
601 | xaxis=dict(visible=False),
602 | yaxis=dict(visible=False),
603 | showlegend=True,
604 | legend_title=None,
605 | )
606 |
607 | return fig
608 |
609 |
610 | @capture("graph")
611 | def plot_radar_quality(
612 | data_frame: pd.DataFrame,
613 | mode: str,
614 | ) -> go.Figure:
615 | """Create a radar (polar) chart for agent communication quality metrics.
616 |
617 | Args:
618 | data_frame (pd.DataFrame): Input data with agent communication metrics.
619 | mode (str): 'comparison' for agent subplots, 'average' for overall.
620 |
621 | Returns:
622 | go.Figure: Plotly Figure object representing the radar chart(s).
623 | """
624 | data = data_frame.copy()
625 | melted_df = pd.melt(
626 | data,
627 | id_vars=["Agent ID"],
628 | value_vars=[
629 | "Empathy",
630 | "Professionalism",
631 | "Kindness",
632 | "Effective Communication",
633 | "Active Listening",
634 | ],
635 | var_name="Communication Metric",
636 | value_name="Value",
637 | )
638 |
639 | grouped_avg_df = melted_df.groupby(
640 | ["Agent ID", "Communication Metric"], as_index=False
641 | )["Value"].mean()
642 |
643 | if mode == "comparison":
644 |
645 | agent_count = data_frame["Agent ID"].nunique()
646 |
647 | num_rows = math.ceil(agent_count / 4)
648 | num_cols = 4
649 |
650 | fig = make_subplots(
651 | rows=num_rows,
652 | cols=num_cols,
653 | subplot_titles=None,
654 | horizontal_spacing=0.02,
655 | vertical_spacing=0.02,
656 | specs=[[{"type": "polar"}] * num_cols for _ in range(num_rows)],
657 | )
658 |
659 | agent_list = grouped_avg_df["Agent ID"].unique().tolist()
660 |
661 | for i in range(0, len(agent_list)):
662 | chart_data = grouped_avg_df.copy()
663 | chart_data = chart_data[chart_data["Agent ID"] == agent_list[i]]
664 |
665 | fig.add_trace(
666 | go.Barpolar(
667 | r=chart_data["Value"],
668 | theta=chart_data["Communication Metric"],
669 | marker_color=[
670 | "#00B4FF",
671 | "#FF9222",
672 | "#3949AB",
673 | "#FF5267",
674 | "#08BDBA",
675 | "#FDC935",
676 | ],
677 | hovertemplate=f"Agent ID: {agent_list[i]}
Metric: %{{theta}}
Score: %{{r}}",
678 | ),
679 | row=i // num_cols + 1,
680 | col=i % num_cols + 1,
681 | )
682 |
683 | for i in range(num_rows * num_cols):
684 | fig.update_layout(
685 | **{
686 | f"polar{i + 1}": dict(
687 | radialaxis=dict(visible=False, showgrid=False),
688 | angularaxis=dict(visible=False, showgrid=False),
689 | bgcolor="rgba(0, 0, 0, 0)",
690 | )
691 | }
692 | )
693 |
694 | fig.update_layout(
695 | showlegend=False,
696 | paper_bgcolor="rgba(0, 0, 0, 0)",
697 | plot_bgcolor="rgba(0, 0, 0, 0)",
698 | )
699 | if mode == "average":
700 |
701 | grouped_avg_df = melted_df.groupby(
702 | ["Agent ID", "Communication Metric"], as_index=False
703 | )["Value"].mean()
704 | grouped_avg_df = grouped_avg_df.groupby(
705 | ["Communication Metric"], as_index=False
706 | )["Value"].mean()
707 |
708 | fig = go.Figure()
709 |
710 | fig.add_trace(
711 | go.Barpolar(
712 | r=grouped_avg_df["Value"],
713 | theta=grouped_avg_df["Communication Metric"],
714 | marker_color=[
715 | "#00B4FF",
716 | "#FF9222",
717 | "#3949AB",
718 | "#FF5267",
719 | "#08BDBA",
720 | "#FDC935",
721 | ],
722 | hovertemplate="Metric: %{theta}
Score: %{r}",
723 | )
724 | )
725 |
726 | fig.update_layout(
727 | polar=dict(
728 | angularaxis=dict(),
729 | radialaxis=dict(
730 | dtick=1,
731 | showgrid=False,
732 | ),
733 | bgcolor="rgba(0, 0, 0, 0)",
734 | ),
735 | showlegend=False,
736 | )
737 |
738 | return fig
739 |
740 |
741 | @capture("graph")
742 | def plot_bar_quality(
743 | data_frame: pd.DataFrame,
744 | mode: str,
745 | ) -> go.Figure:
746 | """Create a bar chart for agent communication quality metrics.
747 |
748 | Args:
749 | data_frame (pd.DataFrame): Input data with agent communication metrics.
750 | mode (str): 'comparison' for agent subplots, 'average' for overall.
751 |
752 | Returns:
753 | go.Figure: Plotly Figure object representing the bar chart(s).
754 | """
755 | data = data_frame.copy()
756 | melted_df = pd.melt(
757 | data,
758 | id_vars=["Agent ID"],
759 | value_vars=[
760 | "Empathy",
761 | "Professionalism",
762 | "Kindness",
763 | "Effective Communication",
764 | "Active Listening",
765 | ],
766 | var_name="Communication Metric",
767 | value_name="Value",
768 | )
769 |
770 | grouped_avg_df = melted_df.groupby(
771 | ["Agent ID", "Communication Metric"], as_index=False
772 | )["Value"].mean()
773 |
774 | if mode == "comparison":
775 | agent_count = data_frame["Agent ID"].nunique()
776 | num_rows = math.ceil(agent_count / 4)
777 | num_cols = 4
778 | fig = make_subplots(
779 | rows=num_rows,
780 | cols=num_cols,
781 | subplot_titles=None,
782 | horizontal_spacing=0.04,
783 | vertical_spacing=0.02,
784 | specs=[[{"type": "xy"}] * num_cols for _ in range(num_rows)],
785 | )
786 | agent_list = grouped_avg_df["Agent ID"].unique().tolist()
787 | colors = ["#00B4FF", "#FF9222", "#3949AB", "#FF5267", "#08BDBA", "#FDC935"]
788 | for i, agent in enumerate(agent_list):
789 | chart_data = grouped_avg_df[grouped_avg_df["Agent ID"] == agent]
790 | for idx, row in chart_data.iterrows():
791 | fig.add_trace(
792 | go.Scatter(
793 | x=[row["Communication Metric"], row["Communication Metric"]],
794 | y=[0, row["Value"]],
795 | mode="lines",
796 | line=dict(color=colors[idx % len(colors)], width=3),
797 | showlegend=False,
798 | ),
799 | row=i // num_cols + 1,
800 | col=i % num_cols + 1,
801 | )
802 | fig.add_trace(
803 | go.Scatter(
804 | x=[row["Communication Metric"]],
805 | y=[row["Value"]],
806 | mode="markers",
807 | marker=dict(color=colors[idx % len(colors)], size=8),
808 | name=row["Communication Metric"] if i == 0 else None,
809 | showlegend=(i == 0),
810 | hovertemplate=f"Agent ID: {agent}
Metric: %{{x}}
Score: %{{y}}",
811 | ),
812 | row=i // num_cols + 1,
813 | col=i % num_cols + 1,
814 | )
815 | fig.update_xaxes(
816 | showgrid=False,
817 | visible=True,
818 | showticklabels=False,
819 | ticks="",
820 | title=dict(text=str(agent), font=dict(size=10), standoff=2),
821 | row=i // num_cols + 1,
822 | col=i % num_cols + 1,
823 | zeroline=True,
824 | )
825 | fig.update_yaxes(
826 | showgrid=False,
827 | visible=False,
828 | zeroline=False,
829 | row=i // num_cols + 1,
830 | col=i % num_cols + 1,
831 | )
832 | fig.update_layout(
833 | showlegend=False,
834 | paper_bgcolor="rgba(0, 0, 0, 0)",
835 | plot_bgcolor="rgba(0, 0, 0, 0)",
836 | margin=dict(t=10),
837 | )
838 |
839 | if mode == "average":
840 | grouped_avg_df = melted_df.groupby(
841 | ["Agent ID", "Communication Metric"], as_index=False
842 | )["Value"].mean()
843 | grouped_avg_df = grouped_avg_df.groupby(
844 | ["Communication Metric"], as_index=False
845 | )["Value"].mean()
846 | fig = go.Figure()
847 | colors = ["#00B4FF", "#FF9222", "#3949AB", "#FF5267", "#08BDBA", "#FDC935"]
848 | for idx, row in grouped_avg_df.iterrows():
849 | fig.add_trace(
850 | go.Bar(
851 | y=[row["Value"]],
852 | x=[row["Communication Metric"]],
853 | name=row["Communication Metric"],
854 | marker=dict(color=colors[idx % len(colors)]),
855 | text=[round(row["Value"], 1)],
856 | textposition="inside",
857 | hovertemplate="Metric: %{x}
Score: %{y}",
858 | width=0.6,
859 | )
860 | )
861 | fig.update_layout(
862 | showlegend=True,
863 | paper_bgcolor="rgba(0, 0, 0, 0)",
864 | plot_bgcolor="rgba(0, 0, 0, 0)",
865 | xaxis=dict(
866 | showgrid=False,
867 | visible=True,
868 | zeroline=True,
869 | zerolinecolor="rgba(150,150,150,0.7)",
870 | zerolinewidth=2,
871 | showticklabels=False,
872 | ticks="",
873 | ),
874 | yaxis=dict(
875 | showgrid=False,
876 | visible=False,
877 | ),
878 | barmode="group",
879 | )
880 | return fig
881 |
882 |
883 | @capture("graph")
884 | def plot_box_communication(
885 | data_frame: pd.DataFrame,
886 | mode: str,
887 | ) -> go.Figure:
888 | """Create a box plot for Effective Communication scores, by agent or average.
889 |
890 | Args:
891 | data_frame (pd.DataFrame): Input data with agent and communication scores.
892 | mode (str): 'comparison' for agent subplots, 'average' for overall.
893 |
894 | Returns:
895 | go.Figure: Plotly Figure object representing the box plot(s).
896 | """
897 | data = data_frame[["Agent ID", "Effective Communication"]].copy()
898 | data["PLACEHOLDER"] = 1
899 | if mode == "comparison":
900 | fig = px.box(data, x="Agent ID", y="Effective Communication")
901 | fig.update_layout(xaxis=dict(tickvals=data["Agent ID"], tickangle=90))
902 | if mode == "average":
903 | fig = px.box(
904 | data, y="PLACEHOLDER", x="Effective Communication", orientation="h"
905 | )
906 | fig.update_layout(
907 | yaxis=dict(range=[0, 2], visible=False), boxmode="group", bargap=0.5
908 | )
909 | return fig
910 |
911 |
912 | @capture("graph")
913 | def plot_map_call_locations(
914 | data_frame: pd.DataFrame,
915 | ) -> go.Figure:
916 | """Create a map of call locations with bubble size by call count.
917 |
918 | Args:
919 | data_frame (pd.DataFrame): Input data with city, latitude, longitude, and call info.
920 |
921 | Returns:
922 | go.Figure: Plotly Figure object representing the map.
923 | """
924 | aggregated_df = (
925 | data_frame.groupby(["Caller City", "latitude", "longitude"])
926 | .agg(
927 | Call_Count=("Caller ID", "count"),
928 | Agent_IDs=("Agent ID", "count"),
929 | Caller_Count=("Caller ID", "nunique"),
930 | )
931 | .reset_index()
932 | )
933 | populations = aggregated_df["Call_Count"]
934 | min_size = 10
935 | max_size = 50
936 | sizes = np.interp(
937 | aggregated_df["Call_Count"],
938 | (populations.min(), populations.max()),
939 | (min_size, max_size),
940 | )
941 | fig = go.Figure(
942 | go.Scattergeo(
943 | lat=aggregated_df["latitude"],
944 | lon=aggregated_df["longitude"],
945 | mode="markers",
946 | marker=dict(
947 | size=sizes,
948 | color="#00B4FF",
949 | opacity=0.6,
950 | line=dict(width=0),
951 | ),
952 | hovertemplate="City: %{text}
Calls: %{customdata[0]:,}
Agents: %{customdata[1]:,}
Callers: %{customdata[2]:,}",
953 | customdata=aggregated_df[["Call_Count", "Agent_IDs", "Caller_Count"]],
954 | text=aggregated_df["Caller City"],
955 | )
956 | )
957 | fig.update_geos(
958 | visible=False,
959 | resolution=110,
960 | scope="usa",
961 | showcountries=True,
962 | countrycolor="rgb(150, 150, 150)",
963 | showsubunits=True,
964 | subunitcolor="rgb(150, 150, 150)",
965 | )
966 | fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0}, showlegend=False)
967 | return fig
968 |
969 |
970 | @capture("graph")
971 | def plot_line_calls_over_time(
972 | data_frame: pd.DataFrame,
973 | ) -> go.Figure:
974 | """Create a line chart of number of calls per month.
975 |
976 | Args:
977 | data_frame (pd.DataFrame): Input data with call dates.
978 |
979 | Returns:
980 | go.Figure: Plotly Figure object representing the line chart.
981 | """
982 | calls_per_month = (
983 | data_frame.groupby(data_frame["Call Date"].dt.to_period("M"))
984 | .size()
985 | .reset_index(name="Count")
986 | )
987 | calls_per_month["TickLabel"] = calls_per_month["Call Date"].dt.strftime("%b %y")
988 | calls_per_month["Call Date"] = calls_per_month["Call Date"].dt.strftime("%Y-%m")
989 | fig = go.Figure()
990 | fig.add_trace(
991 | go.Scatter(
992 | x=calls_per_month["Call Date"],
993 | y=calls_per_month["Count"],
994 | mode="lines+markers+text",
995 | text=calls_per_month["Count"],
996 | textposition="top center",
997 | hovertemplate="Month: %{x}
Count: %{y}",
998 | marker=dict(size=6, color="#00B4FF"),
999 | line=dict(color="#00B4FF", width=2),
1000 | showlegend=False,
1001 | cliponaxis=False,
1002 | )
1003 | )
1004 | fig.update_layout(
1005 | showlegend=False,
1006 | title=None,
1007 | yaxis=dict(visible=False),
1008 | xaxis=dict(
1009 | title=None,
1010 | tickangle=90,
1011 | tickmode="array",
1012 | tickvals=calls_per_month["Call Date"],
1013 | ticktext=calls_per_month["TickLabel"],
1014 | tickfont=dict(size=12),
1015 | showgrid=False,
1016 | ),
1017 | margin=dict(t=10, b=60),
1018 | )
1019 | return fig
1020 |
1021 |
1022 | @capture("graph")
1023 | def plot_butterfly_upsales_concerns(
1024 | data_frame: pd.DataFrame,
1025 | ) -> go.Figure:
1026 | """Create a butterfly chart comparing upsales and concerns addressed percentages per month.
1027 |
1028 | Args:
1029 | data_frame (pd.DataFrame): Input data with call dates, upsale, and concern columns.
1030 |
1031 | Returns:
1032 | go.Figure: Plotly Figure object representing the butterfly chart.
1033 | """
1034 | df = data_frame.copy()
1035 | df["Month"] = df["Call Date"].dt.to_period("M")
1036 | upsales = (
1037 | df[df["Upsale Attempted"]]
1038 | .groupby("Month")["Upsale Success"]
1039 | .mean()
1040 | .reset_index()
1041 | )
1042 | upsales["Metric"] = "Upsales Success"
1043 | upsales["Value"] = upsales["Upsale Success"] * 100
1044 | concerns = df.groupby("Month")["Concern Addressed"].mean().reset_index()
1045 | concerns["Metric"] = "Concerns Addressed"
1046 | concerns["Value"] = -concerns["Concern Addressed"] * 100
1047 | plot_df = pd.concat(
1048 | [upsales[["Month", "Metric", "Value"]], concerns[["Month", "Metric", "Value"]]]
1049 | )
1050 | plot_df = plot_df.sort_values(["Month", "Metric"])
1051 | plot_df["MonthLabel"] = plot_df["Month"].dt.strftime("%b %y")
1052 | plot_df = plot_df.sort_values("Month")
1053 | pivot_df = plot_df.pivot(
1054 | index=["Month", "MonthLabel"], columns="Metric", values="Value"
1055 | ).reset_index()
1056 | pivot_df = pivot_df.sort_values("Month")
1057 | month_labels = pivot_df["MonthLabel"]
1058 | if 'Upsales Success' not in pivot_df.columns:
1059 | pivot_df['Upsales Success'] = 0
1060 | else:
1061 | pivot_df['Upsales Success'].fillna(value=0, inplace=True)
1062 |
1063 |
1064 | if 'Concerns Addressed' not in pivot_df.columns:
1065 | pivot_df['Concerns Addressed'] = 0
1066 | else:
1067 | pivot_df['Concerns Addressed'].fillna(value=0, inplace=True)
1068 | upsales_y = pivot_df["Upsales Success"].fillna(0)
1069 | concerns_y = pivot_df["Concerns Addressed"].fillna(0)
1070 | fig = go.Figure()
1071 | fig.add_traces(
1072 | [
1073 | go.Bar(
1074 | x=month_labels,
1075 | y=upsales_y,
1076 | name="% Upsales Success",
1077 | marker_color="#00B4FF",
1078 | text=[f"{int(round(v))}%" if v != 0 else "" for v in upsales_y],
1079 | textposition="inside",
1080 | insidetextanchor="start",
1081 | textfont=dict(size=2, color="white"),
1082 | textangle=90,
1083 | offsetgroup=1,
1084 | cliponaxis=False,
1085 | width=0.6,
1086 | hovertemplate="Month: %{x}
Upsales Success: %{y:.0f}%",
1087 | ),
1088 | go.Bar(
1089 | x=month_labels,
1090 | y=concerns_y,
1091 | name="% Concerns Addressed",
1092 | marker_color="#FF9222",
1093 | text=[f"{int(round(abs(v)))}%" if v != 0 else "" for v in concerns_y],
1094 | textposition="inside",
1095 | insidetextanchor="end",
1096 | textfont=dict(size=2, color="white"),
1097 | textangle=90,
1098 | offsetgroup=1,
1099 | cliponaxis=False,
1100 | width=0.6,
1101 | hovertemplate="Month: %{x}
Concerns Addressed: %{customdata:.0f}%",
1102 | customdata=[abs(v) for v in concerns_y],
1103 | ),
1104 | ]
1105 | )
1106 | fig.update_layout(
1107 | barmode="relative",
1108 | bargap=0,
1109 | showlegend=False,
1110 | xaxis=dict(
1111 | visible=True,
1112 | showline=False,
1113 | showticklabels=True,
1114 | ticks="",
1115 | showgrid=False,
1116 | zeroline=False,
1117 | tickangle=90,
1118 | tickfont=dict(size=12),
1119 | ),
1120 | yaxis=dict(visible=False),
1121 | margin=dict(t=0, b=0),
1122 | )
1123 | fig.add_hline(y=0, line_width=1, line_color="rgba(150,150,150,0.7)")
1124 | return fig
1125 |
--------------------------------------------------------------------------------
/vizro/custom_components.py:
--------------------------------------------------------------------------------
1 | """Custom components for Vizro dashboard extensions.
2 | """
3 |
4 | import base64
5 | import re
6 | from pathlib import Path
7 | from typing import Any, Literal, Sequence
8 |
9 | from dash import html
10 | from dash.exceptions import PreventUpdate
11 |
12 | import vizro.models as vm
13 | from vizro.models.types import capture
14 |
15 |
16 | @capture("action")
17 | def update_from_selected_row(
18 | selected_rows: Sequence[dict[str, Any]]
19 | ) -> tuple[str, str]:
20 | """Update transcript and audio from the selected row in the grid.
21 |
22 | Args:
23 | selected_rows (Sequence[dict[str, Any]]):
24 | List of selected row dictionaries from the grid, each containing 'text_file' and 'audio_file' keys.
25 |
26 | Returns:
27 | tuple[str, str]:
28 | A tuple containing:
29 | - The transcript as markdown-formatted string.
30 | - The audio source as a base64-encoded string suitable for HTML audio playback.
31 |
32 | Raises:
33 | PreventUpdate: If the required files are not found or cannot be read.
34 | """
35 | selected_row = selected_rows[0]
36 | text_file_path = Path(f"outputs/anonymized_files/{selected_row['text_file']}")
37 | audio_file_path = Path(f"outputs/audio_files/{selected_row['audio_file']}")
38 | if (
39 | text_file_path not in Path("outputs/anonymized_files").iterdir()
40 | or audio_file_path not in Path("outputs/audio_files").iterdir()
41 | ):
42 | raise PreventUpdate
43 | try:
44 | call_transcript = text_file_path.read_text()
45 | except Exception as e:
46 | raise PreventUpdate from e
47 | call_transcript = call_transcript.replace("\n", " \n")
48 | call_transcript = re.sub(r"^(\w+)", r"**\1**", call_transcript, flags=re.MULTILINE)
49 | try:
50 | call_audio_src = base64.b64encode(audio_file_path.read_bytes())
51 | except Exception as e:
52 | raise PreventUpdate from e
53 | call_audio_src = f"data:audio/wav;base64,{call_audio_src.decode('utf-8')}"
54 | return call_transcript, call_audio_src
55 |
56 |
class Audio(vm.VizroBaseModel):
    """Custom Vizro component wrapping a Dash HTML audio player.

    Renders a native browser audio element, used for playing back call
    recordings inside the dashboard.
    """

    # Discriminator Vizro uses to identify this component type in configs.
    type: Literal["audio"] = "audio"

    def build(self) -> html.Audio:
        """Return the Dash ``html.Audio`` element, with playback controls shown.

        Returns:
            html.Audio: Dash HTML audio component with controls enabled.
        """
        return html.Audio(id=self.id, controls=True)
72 |
73 |
74 | vm.Container.add_type("components", Audio)
75 |
76 |
def make_tabs_with_title(title: str, tabs: list[vm.Container]) -> vm.Container:
    """Wrap a set of tab containers in a titled, filled container.

    Args:
        title (str):
            The title to display above the tabbed content.
        tabs (list[vm.Container]):
            One vm.Container per tab.

    Returns:
        vm.Container: A filled, always-expanded container holding the tabs
        under the given title.
    """
    tabbed_content = vm.Tabs(tabs=tabs)
    return vm.Container(
        title=title,
        components=[tabbed_content],
        variant="filled",
        collapsed=False,
    )
92 |
--------------------------------------------------------------------------------