├── .dockerignore ├── .env.docker ├── .env.local ├── .gitignore ├── .gitmodules ├── Dockerfile ├── README.md ├── build_docker.ps1 ├── chainlit.md ├── data ├── questionnaire.pkl ├── questionnaire_all.pkl ├── report_doc_classification_01JDYZ52AD3KYM2Q7VF5NHV3ER.json ├── report_item_count_01JDZBAA5T6Z0MHZ10KY0HF938.json ├── sample_questionnaire1.md ├── sample_questionnaire2.md └── sample_questionnaire3.md ├── data_questionnaire_agent ├── __init__.py ├── bootstrap │ ├── __init__.py │ ├── consultant_loader.py │ └── sql_script.py ├── build.py ├── cli │ └── data_questionnaire_agent_cli.py ├── config.py ├── config_support.py ├── log_init.py ├── model │ ├── application_schema.py │ ├── confidence_schema.py │ ├── consultant_rating.py │ ├── global_configuration.py │ ├── initial_question_data.py │ ├── jwt_token.py │ ├── languages.py │ ├── mail_data.py │ ├── ontology_schema.py │ ├── openai_schema.py │ ├── question_suggestion.py │ ├── questionnaire_status.py │ ├── report_aggregation_schema.py │ ├── server_model.py │ └── session_configuration.py ├── server │ ├── __init__.py │ ├── agent_session.py │ ├── questionnaire_server.py │ ├── questionnaire_server_backend.py │ ├── questionnaire_server_consultants.py │ ├── questionnaire_server_main.py │ └── server_support.py ├── service │ ├── advice_service.py │ ├── clarifications_agent.py │ ├── confidence_service.py │ ├── consultant_service.py │ ├── db_bootstrap_service.py │ ├── embedding_service.py │ ├── graph_service.py │ ├── graphrag_service.py │ ├── html_generator.py │ ├── initial_question_service.py │ ├── jwt_token_service.py │ ├── knowledge_base_service.py │ ├── language_adapter.py │ ├── mail_sender.py │ ├── ontology_service.py │ ├── persistence_service_async.py │ ├── persistence_service_consultants_async.py │ ├── persistence_service_questions_async.py │ ├── prompt_support.py │ ├── query_support.py │ ├── question_clarifications.py │ ├── question_generation_service.py │ ├── report_aggregation_main_service.py │ ├── 
report_aggregation_service.py │ ├── report_aggregation_summarization_service.py │ ├── report_enhancement_service.py │ ├── report_interactions_service.py │ ├── secondary_question_processor.py │ ├── similarity_search.py │ └── tagging_service.py ├── test │ ├── __init__.py │ ├── advice_service_test.py │ ├── conditional_advice_test.py │ ├── confidence_schema_test.py │ ├── confidence_service_test.py │ ├── consultant_service_test.py │ ├── graph_service_test.py │ ├── graphrag_service_test.py │ ├── initial_question_service_test.py │ ├── jwt_token_service_test.py │ ├── mail_sender_test.py │ ├── ontology_service_test.py │ ├── persistence_service_async_test.py │ ├── persistence_service_consultants_async_test.py │ ├── persistence_service_questions_async_test.py │ ├── provider │ │ ├── __init__.py │ │ ├── advice_provider.py │ │ ├── confidence_provider.py │ │ ├── consultant_provider.py │ │ ├── jwt_token_data_provider.py │ │ ├── knowledge_base_provider.py │ │ ├── multipart_provider.py │ │ ├── ontology_provider.py │ │ ├── question_answer_provider.py │ │ ├── question_answers_provider.py │ │ ├── questionnaire_provider.py │ │ ├── questionnaire_status_provider.py │ │ ├── session_configuration_provider.py │ │ └── session_properties_provider.py │ ├── question_answer_test.py │ ├── question_generation_service_test.py │ ├── questionnaire_server_test.py │ ├── questionnaire_status_test.py │ ├── report_aggregation_main_service_test.py │ ├── report_aggregation_service_test.py │ ├── report_aggregation_summarization_service_test.py │ ├── report_enhancement_service_test.py │ ├── server_message_test.py │ ├── session_configuration_test.py │ └── toml_support_test.py ├── toml_support.py ├── translation.py ├── ui │ ├── advice_processor.py │ ├── avatar_factory.py │ ├── chat_settings_factory.py │ ├── clarifications_chainlit.py │ ├── data_questionnaire_chainlit.py │ ├── image_support.py │ ├── mail_processor.py │ ├── model │ │ └── session_number_container.py │ ├── pdf_processor.py │ └── 
session_chainlit_playground.py └── utils │ ├── advice_parser.py │ ├── tracker_db_init.py │ ├── tracker_db_lister.py │ └── tracker_db_server.py ├── deploy_ui.sh ├── docs ├── advice_agent.bpmn ├── agent_interaction_flow.bpmn ├── data_wellness_workflow.bpmn ├── data_wellness_workflow.png ├── low_level_protocol │ ├── clarification.json │ ├── confidence_rating.json │ ├── generate_first_question.json │ ├── generate_secondary_question.json │ ├── ontology_creation.json │ └── report_generation.json └── prompt_examples │ ├── clarification.txt │ ├── confidence_rating.txt │ ├── ontology_creation.txt │ ├── question_generation.txt │ └── report_generation.txt ├── format.ps1 ├── i18n ├── messages.de.yml ├── messages.en.yml └── messages.fa.yml ├── notes.md ├── package-lock.json ├── poetry.lock ├── prompts_de.toml ├── prompts_en.toml ├── prompts_fa.toml ├── public ├── css │ └── styles.css ├── images │ ├── D-Well_Report_Banner.png │ ├── Hero_Image_with_Logo_and_Titles.jpg │ ├── banner_blank.png │ ├── banner_with_titles.png │ ├── companion_icon.png │ ├── data_ecosystem_areas.png │ └── natural-language-processing.png ├── js │ └── onepoint.js ├── logo_dark.png └── logo_light.png ├── pyproject.toml ├── run_app.sh ├── sql ├── db_creation_default.sql ├── db_creation_hypergility.sql ├── db_setup.sql ├── initial_data.sql ├── other_queries.sql └── verification_queries.sql ├── start.ps1 ├── start.sh └── templates ├── consultants-template.md ├── mail-template.html └── results-template.html /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | .chainlit 3 | .pytest_cache 4 | .env 5 | ./wheels 6 | .gitignore 7 | .gitmodules 8 | chainlit.md 9 | ./*.ps1 10 | ./report.csv 11 | src/phoenix/server/static/ 12 | .env.local -------------------------------------------------------------------------------- /.env.docker: -------------------------------------------------------------------------------- 1 | # OPENAI_API_KEY= 2 | OPENAI_MODEL=gpt-4o 3 
| REQUEST_TIMEOUT=300 4 | 5 | VERBOSE_LLM=true 6 | LANGCHAIN_CACHE=false 7 | CHATGPT_STREAMING=false 8 | 9 | UI_TIMEOUT = 60 10 | 11 | # Email related 12 | # Production Settings 13 | MAIL_FROM_PERSON=Onepoint Data Wellness 14 | # MAIL_USER= 15 | # MAIL_PASSWORD= 16 | # MAIL_FROM= 17 | # MAIL_SERVER= 18 | MAIL_SUBJECT=Your recommendations from the Onepoint Data Wellness Companion™ 19 | FEEDBACK_EMAIL= 20 | 21 | # General stuff 22 | PROJECT_ROOT=/app 23 | QUESTION_CACHE_FOLDER=/tmp/data-questionnaire-agent/cache 24 | 25 | # PDF Related 26 | WKHTMLTOPDF_BINARY=/usr/bin/wkhtmltopdf 27 | TEMPLATE_LOCATION=/app/templates 28 | PDF_FOLDER=/tmp/data_questionnaire_agent/pdfs 29 | PDF_BANNER=/app/public/images/Hero_Image_with_Logo_and_Titles.jpg 30 | 31 | # Embedding related 32 | RAW_TEXT_FOLDER=/app/docs/raw_text 33 | EMBEDDINGS_PERSISTENCE_DIR=/app/embeddings 34 | EMBEDDINGS_CHUNK_SIZE=2500 35 | SEARCH_RESULTS_HOW_MANY=2 36 | 37 | # Question generation related 38 | QUESTIONS_PER_BATCH=1 39 | MINIMUM_QUESTIONNAIRE_SIZE=4 40 | 41 | # Token limit for chatgpt 4. 
Important to extend the context as much as possible using the vector DB search 42 | TOKEN_LIMIT=6000 43 | 44 | IMAGE_LLM_TEMPERATURE=0.9 45 | 46 | # Show session cost 47 | SHOW_SESSION_COST=false 48 | OPENAI_RETRY_ATTEMPTS=3 49 | OPENAI_WAIT_FIXED=30 50 | 51 | # Related to the tracker DB download 52 | # TRACKER_DB_LOGS_PASSWORD= 53 | 54 | # Database related 55 | # DB_NAME=data_wellness_companion 56 | # DB_USER= 57 | # DB_HOST=127.0.0.1 58 | # DB_PORT=5432 59 | # DB_PASSWORD= 60 | # DB_CREATE=true 61 | 62 | # Websocket server related 63 | WEBSOCKET_SERVER=0.0.0.0 64 | WEBSOCKET_PORT=8085 65 | WEBSOCKET_CORS_ALLOWED_ORIGINS=* 66 | 67 | # Webserver related 68 | UI_FOLDER=/app/ui 69 | 70 | # Langsmith related 71 | LANGCHAIN_TRACING_V2=false 72 | LANGCHAIN_PROJECT=Data Questionnaire Agent 73 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com 74 | # LANGCHAIN_API_KEY= 75 | 76 | USE_GRAPHRAG=true 77 | GRAPHRAG_BASE_URL=http://176.34.128.143:8087 78 | GRAPHRAG_MODE=all 79 | GRAPHRAG_CONTEXT_SIZE=5000 -------------------------------------------------------------------------------- /.env.local: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | # Other options 3 | # OPENAI_MODEL=gpt-4-1106-preview 4 | # OPENAI_MODEL=gpt-4-turbo-2024-04-09 5 | # OPENAI_MODEL=gpt-4-0125-preview 6 | OPENAI_MODEL=gpt-4o 7 | OPENAI_API_TEMPERATURE=0.5 8 | REQUEST_TIMEOUT=300 9 | 10 | VERBOSE_LLM=true 11 | LANGCHAIN_CACHE=false 12 | CHATGPT_STREAMING=false 13 | 14 | UI_TIMEOUT = 60 15 | 16 | # Email related 17 | # Production Settings 18 | MAIL_FROM_PERSON=Onepoint Data Wellness 19 | MAIL_USER= 20 | MAIL_PASSWORD= 21 | MAIL_FROM= 22 | MAIL_SERVER= 23 | MAIL_SUBJECT=Your recommendations from the Onepoint Data Wellness Companion™ 24 | FEEDBACK_EMAIL= 25 | 26 | # General stuff 27 | PROJECT_ROOT=/development/playground/langchain/data_questionnaire_agent 28 | QUESTION_CACHE_FOLDER=/tmp/data_questionnaire_agent/cache 29 | 
TRANSLATION_PATH=/development/playground/langchain/data_questionnaire_agent/i18n 30 | 31 | # PDF Related 32 | WKHTMLTOPDF_BINARY=/Program Files/wkhtmltopdf/bin/wkhtmltopdf.exe 33 | TEMPLATE_LOCATION=/development/playground/langchain/data_questionnaire_agent/templates 34 | PDF_FOLDER=/tmp/data_questionnaire_agent/pdfs 35 | PDF_BANNER=/development/playground/langchain/data_questionnaire_agent/public/images/Hero_Image_with_Logo_and_Titles.jpg 36 | 37 | # Embedding related 38 | RAW_TEXT_FOLDER=/development/playground/langchain/data_questionnaire_agent/docs/raw_text 39 | EMBEDDINGS_PERSISTENCE_DIR=/development/playground/langchain/data_questionnaire_agent/embeddings 40 | EMBEDDINGS_CHUNK_SIZE=2500 41 | SEARCH_RESULTS_HOW_MANY=2 42 | 43 | # Question generation related 44 | QUESTIONS_PER_BATCH=1 45 | MINIMUM_QUESTIONNAIRE_SIZE=4 46 | 47 | # Token limit for chatgpt 4. Important to extend the context as much as possible using the vector DB search 48 | TOKEN_LIMIT=6000 49 | 50 | IMAGE_LLM_TEMPERATURE=0.9 51 | 52 | # Show session cost 53 | SHOW_SESSION_COST=false 54 | OPENAI_RETRY_ATTEMPTS=3 55 | OPENAI_WAIT_FIXED=30 56 | 57 | # Related to the tracker DB download 58 | TRACKER_DB_LOGS_PASSWORD= 59 | 60 | # Database related 61 | DB_NAME=data_wellness_languages 62 | DB_USER= 63 | DB_HOST=127.0.0.1 64 | DB_PORT=5432 65 | DB_PASSWORD= 66 | DB_CREATE=true 67 | 68 | # Websocket server related 69 | WEBSOCKET_SERVER=0.0.0.0 70 | WEBSOCKET_PORT=8085 71 | WEBSOCKET_CORS_ALLOWED_ORIGINS=* 72 | 73 | # Webserver related 74 | UI_FOLDER=/development/playground/langchain/data_questionnaire_agent/ui 75 | IMAGES_FOLDER=/development/playground/langchain/data_questionnaire_agent/public/images 76 | 77 | # Langsmith 78 | LANGCHAIN_TRACING_V2=true 79 | LANGCHAIN_PROJECT=Data Questionnaire Agent 80 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com 81 | LANGCHAIN_API_KEY= 82 | 83 | # JWT token related 84 | JWT_SECRET= 85 | JWT_ALGORITHM=HS256 86 | JWT_TIME_DELTA_MINUTES=120 87 | 
JWT_GEN_FOLDER=/tmp/data_wellness/jwt_gen 88 | DWELL_URL=http://localhost:5173 89 | DWISE_URL=http://localhost:5174 90 | 91 | 92 | # If Graph RAG being used, then extra parameters are needed 93 | USE_GRAPHRAG=true 94 | GRAPHRAG_BASE_URL=http://176.34.128.143:10000/protected/project 95 | GRAPHRAG_MODE=local 96 | GRAPHRAG_CONTEXT_SIZE=3000 97 | GRAPHRAG_JWT= 98 | GRAPHRAG_PROJECT=dwell 99 | GRAPHRAG_READ_TIMEOUT=20 100 | 101 | # Report Aggregation 102 | REPORT_TOKEN_LIMIT=30000 103 | AGGREGATOR_REPORT_FOLDER=/tmp/data_wellness 104 | 105 | LINKEDIN_USER= 106 | LINKEDIN_PASSWORD= 107 | 108 | GEMINI_API_KEY= 109 | GEMINI_MODEL=gemini-2.0-flash-lite-preview-02-05 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Avoid pushing the embedding database. 
163 | embeddings/ 164 | 165 | docker_start.bat -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "data_questionnaire_agent/data-wellness-companion-ui"] 2 | path = data_questionnaire_agent/data-wellness-companion-ui 3 | url = https://github.com/onepointconsulting/data-wellness-companion-ui.git 4 | [submodule "data-wellness-companion-ui"] 5 | path = data-wellness-companion-ui 6 | url = https://github.com/onepointconsulting/data-wellness-companion-ui.git 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Please note that this Docker file does not yet produce anything useful. Work In Progress 2 | FROM ubuntu:latest 3 | 4 | SHELL ["/bin/bash", "-c"] 5 | 6 | RUN apt update 7 | RUN apt install python3 -y 8 | RUN apt install python3-pip -y 9 | RUN apt install python3.12-venv -y 10 | RUN apt install swig -y 11 | 12 | WORKDIR /app 13 | 14 | RUN python3 -m venv venv 15 | RUN . venv/bin/activate 16 | RUN ./venv/bin/pip install poetry 17 | 18 | COPY . . 
19 | 20 | RUN ./venv/bin/poetry install 21 | 22 | RUN chmod +x ./start.sh 23 | RUN mv .env.docker .env 24 | 25 | # Install PDF converter 26 | RUN apt-get update && apt-get install -y wkhtmltopdf 27 | 28 | # Install node, npm and yarn for the JS side 29 | RUN apt install nodejs -y 30 | RUN apt install npm -y 31 | RUN npm install --global yarn 32 | 33 | # Make sure the PDF advice folder is available 34 | RUN mkdir -p /tmp/data_questionnaire_agent/pdfs 35 | 36 | CMD ["/bin/bash", "./start.sh"] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Wellness Q&A Chatbot 2 | 3 | This is a reverse chatbot that asks the users questions about data integration practices and then gives advice based on a body of knowledge. 4 | This version operates a bit like an agent which tries to gather enough information to be able to give advice. So it may ask an unspecified number of questions. 5 | 6 | ## Cloning the project 7 | 8 | You should clone the project and then initialize the UI module with these commands: 9 | 10 | ``` 11 | git submodule init 12 | git submodule update 13 | ``` 14 | 15 | ## Setup 16 | 17 | We suggest using [Conda](https://docs.conda.io/en/latest/) to manage the virtual environment and then install poetry. 18 | 19 | ``` 20 | conda activate base 21 | conda remove -n data_wellness_agent --all 22 | conda create -n data_wellness_agent python=3.13.1 23 | conda activate data_wellness_agent 24 | pip install poetry 25 | ``` 26 | 27 | ## Installation 28 | 29 | Change the directory to the root folder of the project. 30 | 31 | ``` 32 | poetry install 33 | ``` 34 | 35 | ## UI Installation 36 | 37 | Before you run, you should compile the UI with this command from the root folder of the project. 38 | Please note that this requires the installation of Yarn and node 18.18.0 or later. 
39 | 40 | ``` 41 | poetry run build-ui 42 | ``` 43 | 44 | The UI connects the websocket to port 8085 natively. If the Python server runs on a different port, please change the port accordingly. 45 | 46 | ## Running 47 | 48 | ### Prepare the Postgres DB 49 | 50 | Before you run for the first time, you will need to create the Postgres database. Execute this command to start psql (make sure Postgres is installed) 51 | 52 | On Linux you can start psql like this: 53 | 54 | ``` 55 | sudo -u postgres psql 56 | ``` 57 | 58 | On Windows you will need to download the distribution from here: https://www.postgresql.org/download/ 59 | 60 | Then create the database: 61 | 62 | ``` 63 | CREATE DATABASE data_wellness_companion 64 | WITH 65 | OWNER = postgres 66 | ENCODING = 'UTF8' 67 | CONNECTION LIMIT = -1 68 | IS_TEMPLATE = False; 69 | ``` 70 | 71 | and add the initial tables with some data after exiting psql: 72 | 73 | Run on the console: 74 | 75 | ``` 76 | sudo -u postgres psql data_wellness_companion 77 | ``` 78 | 79 | And run the script sql/db_setup.sql 80 | 81 | 82 | ### Knowledge base 83 | 84 | The knowledge base is based on a directory (see environment variable `RAW_TEXT_FOLDER`). The folder needs to have *.txt documents in it. 85 | 86 | The server fails if there are no documents in this folder. 87 | 88 | ## Configuration 89 | 90 | You will have to have a `.env` file. To get you started you can copy the `.env.local` to `.env` and then change the configuration parameters accordingly. 91 | 92 | You will need to have an OpenAI API key and a Gemini key. 93 | 94 | ### Running the main application 95 | 96 | Make sure the Postgres DB is available. 97 | 98 | ```bash 99 | python ./data_questionnaire_agent/server/questionnaire_server_main.py 100 | ``` 101 | 102 | You can then check the UI on http://localhost:8085/index.html 103 | 104 | On Windows you can use the `.\start.ps1` script to start the server. 
105 | 106 | ## Running Tests 107 | 108 | You can run tests like this: 109 | 110 | ```bash 111 | pytest 112 | ``` -------------------------------------------------------------------------------- /build_docker.ps1: -------------------------------------------------------------------------------- 1 | 2 | docker build -t onepoint-data-wellness-companion . -------------------------------------------------------------------------------- /chainlit.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/chainlit.md -------------------------------------------------------------------------------- /data/questionnaire.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/data/questionnaire.pkl -------------------------------------------------------------------------------- /data/questionnaire_all.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/data/questionnaire_all.pkl -------------------------------------------------------------------------------- /data/sample_questionnaire1.md: -------------------------------------------------------------------------------- 1 | # Questionnaire 2 | 3 | Q: Which area of your data ecosystem are you most concerned about? 4 | A: Data silos - Data trapped in departmental silos can be inaccessible to other parts. 5 | 6 | 7 | Q: What specific challenges are you facing with data silos in your organisation? 8 | A: Our departments use different systems that don't communicate well, leading to data being isolated. 
9 | 10 | 11 | Q: What are the main reasons for the lack of communication between the systems used by different departments in your organisation? 12 | A: The systems are built on different technologies that are not compatible with each other. 13 | 14 | 15 | Q: How do you currently manage data integration across the different systems within your organisation? 16 | A: We use manual processes to transfer data between systems, which is time-consuming and prone to errors. 17 | 18 | 19 | Q: What are the main consequences of using manual processes for data integration in your organisation? 20 | A: Manual processes are time-consuming and can lead to delays in accessing necessary data. 21 | Manual data integration is prone to human errors, which can result in data inaccuracies. 22 | 23 | 24 | Q: What are the most significant challenges you face in ensuring data quality during the manual data integration process? 25 | A: The manual nature of the process introduces frequent human errors, leading to inaccuracies. 26 | It is difficult to maintain data consistency across different systems during manual integration. 27 | 28 | 29 | Q: What are the main obstacles your organisation faces when trying to automate data integration processes across different systems? 30 | A: The lack of skilled personnel to design and implement automated processes. 31 | Budget constraints that limit the ability to invest in automation tools. 32 | 33 | 34 | Q: What are the key pain points your organisation faces due to data being trapped in departmental silos? 35 | A: Data silos lead to inefficiencies as teams cannot access the data they need promptly, affecting decision-making. 36 | The isolated data results in duplicated efforts across departments, wasting resources. 37 | 38 | 39 | Q: What are the specific types of data or systems most affected by the lack of integration and communication in your organisation? 
40 | A: Customer data is most affected because it is spread across multiple systems, leading to inconsistencies. 41 | Financial data is often delayed in reporting due to integration issues, impacting decision-making. 42 | 43 | 44 | # Recommendations 45 | 46 | - Consider implementing a **DataOps** strategy to automate data integration processes. This approach can help streamline workflows and reduce the manual errors currently affecting your data quality. By adopting DataOps, your organisation can enhance collaboration between data engineers, data scientists, and business teams, leading to more efficient data handling and integration. 47 | - Invest in **data integration tools** that support interoperability between different systems. These tools can bridge the technological gaps that currently exist between departmental systems, facilitating smoother data flow and reducing data silos. 48 | - Develop a **data governance framework** that includes policies and procedures for data sharing across departments. This framework will ensure that data is managed consistently and securely, reducing the risk of errors and enhancing data quality. 49 | - Implement **data quality training** for employees involved in data handling. Training can equip them with the necessary skills to manage data responsibly, thereby reducing the likelihood of errors during manual integration processes. 50 | - Explore the use of **self-service data platforms** to empower departments to access and manage their data independently. Such platforms can reduce the dependency on IT for data integration tasks, improving efficiency and accessibility. 51 | 52 | # What to avoid 53 | 54 | - Avoid continuing to rely solely on manual data integration processes, as this approach is prone to errors and inefficiencies. 55 | - Do not ignore the importance of data consistency across systems, as inconsistencies can lead to inaccurate reporting and decision-making. 
56 | - Avoid neglecting the need for skilled personnel in data integration and governance roles. Investing in training and hiring can significantly improve your data management capabilities. 57 | 58 | # Positive outcomes 59 | 60 | - Improved data accuracy and consistency, leading to more reliable decision-making processes. 61 | - Increased efficiency in data management and integration, reducing time delays and resource wastage. 62 | - Enhanced collaboration and data sharing across departments, breaking down silos and fostering innovation. 63 | -------------------------------------------------------------------------------- /data/sample_questionnaire2.md: -------------------------------------------------------------------------------- 1 | # Questionnaire 2 | 3 | Q: Which area of your data ecosystem are you most concerned about? 4 | A: Cost and complexity - A robust data analytics infrastructure requires significant investment of resources. 5 | 6 | 7 | Q: What specific challenges are you facing in managing the cost and complexity of your data analytics infrastructure? 8 | A: We struggle with the high costs of data storage and processing. 9 | 10 | 11 | Q: What strategies have you explored or implemented to manage the high costs associated with data storage and processing in your analytics infrastructure? 12 | A: We are considering data compression techniques to manage storage expenses. 13 | We have not yet implemented any specific strategies, as we are still in the research phase. 14 | 15 | 16 | Q: What types of data are you primarily dealing with in your analytics infrastructure, and how does this impact your storage and processing costs? 17 | A: Our infrastructure handles a lot of unstructured data, leading to increased storage costs due to its volume and complexity. 18 | 19 | 20 | Q: How do you prioritise your data governance initiatives within the organisation, especially when dealing with large volumes of unstructured data? 
21 | A: We focus on data quality and security as top priorities to ensure compliance and reliability. 22 | 23 | 24 | Q: What methods or technologies are you currently using to handle and process unstructured data, and how effective have they been in managing costs and complexity? 25 | A: We have implemented data lakes to store unstructured data, but managing them has proven complex. 26 | 27 | 28 | # Recommendations 29 | 30 | - **Implement data compression techniques** to reduce storage expenses. Given the high costs associated with storing unstructured data, employing data compression can significantly decrease the amount of space required, thereby lowering costs. 31 | - **Leverage a Modern Data Strategy** to handle unstructured data more effectively. This strategy can help consolidate unstructured data on a data lake, making it easier to manage and potentially reducing costs. 32 | - **Utilise data lakehouse architecture** to simplify data management and reduce complexity. By integrating the benefits of data lakes and data warehouses, a data lakehouse can provide a more streamlined approach to managing data, which can help in handling large volumes of unstructured data. 33 | - **Adopt DataOps practices** to enhance data processing efficiency. DataOps can streamline workflows and improve the speed and quality of data analytics, helping to manage the complexity and costs of your data infrastructure. 34 | - **Invest in active metadata management** to improve data governance. This can help in maintaining data quality and security, particularly for unstructured data, ensuring compliance and reliability. 35 | 36 | # What to avoid 37 | 38 | - Avoid neglecting the importance of metadata management, as it is crucial for maintaining data quality and governance. 39 | - Do not overlook the potential of a Modern Data Strategy to manage unstructured data effectively, as this can lead to inefficiencies and increased costs. 
40 | - Avoid relying solely on traditional data warehouses for unstructured data, as they may not be cost-effective or efficient for managing such data types. 41 | 42 | # Positive outcomes 43 | 44 | - Reduced storage costs through effective data compression and management strategies. 45 | - Enhanced data governance and quality, leading to improved compliance and reliability. 46 | - Streamlined data management processes that reduce complexity and improve efficiency. 47 | -------------------------------------------------------------------------------- /data/sample_questionnaire3.md: -------------------------------------------------------------------------------- 1 | # Questionnaire 2 | 3 | Q: Which area of your data ecosystem are you most concerned about? 4 | A: Lack of skilled personnel - Missing skills in data science, analytics, AI and ML can impede the effective use of data. 5 | 6 | 7 | Q: How do you currently address the skill gaps in data science, analytics, AI, and ML within your organisation? 8 | A: We do both: 9 | We provide internal training programmes and workshops to upskill our existing staff. 10 | We hire external consultants or temporary staff to fill the gaps in expertise. 11 | 12 | 13 | Q: What specific data quality issues are you currently facing that impact your organisation's decision-making process? 14 | A: We frequently encounter incomplete or outdated data, which affects the accuracy of our reports. 15 | 16 | 17 | Q: How does your organisation currently ensure data integrity across different systems and during various stages of the data lifecycle? 18 | A: We do not have a formal process in place, which sometimes leads to data integrity issues. 19 | We perform manual data checks whenever there is a need. 20 | 21 | 22 | Q: What steps has your organisation taken to address the issue of incomplete or outdated data? 23 | A: We rely on periodic manual reviews to identify and correct data issues. 
24 | 25 | 26 | Q: What are the main challenges you face in maintaining accurate and up-to-date documentation about your data sources and processes? 27 | A: We struggle with keeping documentation current due to rapid changes in our data systems. 28 | 29 | 30 | Q: How does your organisation currently handle data governance, particularly in terms of establishing roles, responsibilities, and processes for data management? 31 | A: We do not right now because we are planning to create a dedicated data governance team. Here we need to find candidates for the job. 32 | 33 | 34 | Q: How do you plan to address the challenges of keeping your data documentation current amidst rapid changes in your data systems? 35 | A: We plan to assign dedicated personnel to regularly update documentation as part of their responsibilities. 36 | 37 | 38 | # Recommendations 39 | 40 | - **Invest in Data Quality Training**: As your organisation is facing issues with incomplete or outdated data, it's vital to provide comprehensive data quality training to your staff. This will equip them with the necessary skills to identify and rectify data quality issues proactively, reducing the reliance on manual reviews. 41 | - **Establish Formal Data Governance Policies**: Since you are planning to create a dedicated data governance team, it is crucial to establish clear data governance policies. These should define roles, responsibilities, and processes for data management, ensuring a consistent approach to data governance across the organisation. 42 | - **Implement Automated Data Integrity Checks**: To address the lack of formal processes for ensuring data integrity, consider implementing automated data integrity checks. This will help maintain data consistency and reliability across different systems and stages of the data lifecycle, reducing the dependency on manual checks. 
43 | - **Utilise Data Cleansing Tools**: Given your challenges with incomplete or outdated data, employing data cleansing tools can be beneficial. These tools can automate the process of identifying and correcting data issues, ensuring that the data remains accurate and up-to-date. 44 | - **Regularly Update Data Documentation**: Assign dedicated personnel to maintain and update data documentation regularly. This will help keep your documentation current amidst rapid changes, ensuring that all stakeholders have access to accurate and up-to-date information about data sources and processes. 45 | 46 | # What to avoid 47 | 48 | - **Avoid Relying Solely on Manual Checks**: Manual checks for data integrity and quality are time-consuming and prone to human error. Relying solely on them can lead to inconsistencies and inaccuracies. 49 | - **Do Not Delay the Formation of a Data Governance Team**: Procrastinating on establishing a data governance team will only prolong the issues related to data management and governance. Swift action is necessary to address these challenges effectively. 50 | - **Avoid Neglecting Data Documentation**: Inadequate or outdated documentation can lead to misunderstandings and errors. Ensure that documentation is not neglected and is regularly updated to reflect any changes in data systems. 51 | 52 | # Positive outcomes 53 | 54 | - **Improved Data Quality and Accuracy**: By investing in training and automated tools, the quality and accuracy of your data will improve, leading to more reliable decision-making. 55 | - **Enhanced Data Governance**: Establishing a dedicated team and clear policies will streamline data management processes, ensuring better governance and compliance. 56 | - **Increased Operational Efficiency**: Automated processes for data integrity checks and cleansing will reduce the time and effort required for manual interventions, enhancing overall operational efficiency. 
57 | -------------------------------------------------------------------------------- /data_questionnaire_agent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/data_questionnaire_agent/__init__.py -------------------------------------------------------------------------------- /data_questionnaire_agent/bootstrap/__init__.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.config import db_cfg 2 | 3 | __all__ = [db_cfg] 4 | -------------------------------------------------------------------------------- /data_questionnaire_agent/bootstrap/consultant_loader.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import click 4 | from consultant_info_generator.consultant_info_tools import extract_consultant 5 | 6 | from data_questionnaire_agent.bootstrap import db_cfg 7 | from data_questionnaire_agent.service.persistence_service_consultants_async import ( 8 | save_consultant, 9 | ) 10 | 11 | 12 | @click.command() 13 | @click.option( 14 | "--linkedin_id", "-lid", help="The linked in ID", multiple=True, required=True 15 | ) 16 | def save_consultants(linkedin_id: list[str]): 17 | for id in linkedin_id: 18 | try: 19 | consultant = extract_consultant(id) 20 | click.echo(f"Processed {id}") 21 | asyncio.run(save_consultant(consultant)) 22 | except Exception as e: 23 | click.echo(f"Failed to retrieve {id}", err=True) 24 | click.echo(f"Error message {e}", err=True) 25 | 26 | 27 | if __name__ == "__main__": 28 | print(db_cfg.db_conn_str) 29 | save_consultants() 30 | -------------------------------------------------------------------------------- /data_questionnaire_agent/bootstrap/sql_script.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 
| import sys 3 | from pathlib import Path 4 | 5 | from data_questionnaire_agent.config import db_cfg 6 | from data_questionnaire_agent.log_init import logger 7 | from data_questionnaire_agent.service.db_bootstrap_service import execute_script 8 | 9 | if __name__ == "__main__": 10 | if len(sys.argv) < 2: 11 | logger.error("Please enter one or more scripts as arguments.") 12 | if db_cfg.db_create: 13 | for script in sys.argv[1:]: 14 | logger.info(f"Processing {script}") 15 | script_path = Path(script) 16 | assert script_path.exists(), f"Cannot find {script}." 17 | res = asyncio.run(execute_script(script_path)) 18 | if isinstance(res, bool): 19 | if res: 20 | logger.info(f"Script {script} executed successfully.") 21 | else: 22 | logger.error(f"Script {script} failed.") 23 | else: 24 | logger.error(f"An error has occurred: {res}") 25 | else: 26 | logger.info("DB creation turned off.") 27 | -------------------------------------------------------------------------------- /data_questionnaire_agent/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def ui(): 6 | os.chdir("./data-wellness-companion-ui") 7 | os.system("yarn") 8 | if os.path.exists("./dist"): 9 | shutil.rmtree("./dist") 10 | os.system("yarn run build") 11 | if os.path.exists("../ui"): 12 | shutil.rmtree("../ui") 13 | shutil.copytree("./dist", "../ui", dirs_exist_ok=True) 14 | 15 | 16 | def check(): 17 | os.system("black .") 18 | os.system("ruff check --fix .") 19 | -------------------------------------------------------------------------------- /data_questionnaire_agent/config_support.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def create_db_conn_str() -> str: 5 | db_name = os.getenv("DB_NAME") 6 | assert db_name is not None 7 | db_user = os.getenv("DB_USER") 8 | assert db_user is not None 9 | db_host = os.getenv("DB_HOST") 10 | assert db_host is not None 11 | 
db_port = os.getenv("DB_PORT") 12 | assert db_port is not None 13 | db_port = int(db_port) 14 | db_password = os.getenv("DB_PASSWORD") 15 | assert db_password is not None 16 | 17 | return f"dbname={db_name} user={db_user} password={db_password} host={db_host} port={db_port}" 18 | -------------------------------------------------------------------------------- /data_questionnaire_agent/log_init.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig(level="INFO") 4 | 5 | logger = logging.getLogger("data_integration_agent") 6 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/application_schema.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List 3 | 4 | from data_questionnaire_agent.model.openai_schema import ResponseQuestions 5 | 6 | 7 | @dataclass 8 | class QuestionAnswer: 9 | id: int | None 10 | question: str 11 | answer: str | dict 12 | clarification: str | None 13 | possible_answers: List[str] = field(default_factory=list) 14 | 15 | def answer_str(self): 16 | if not self.answer: 17 | return "" 18 | elif isinstance(self.answer, str): 19 | return self.answer 20 | else: 21 | return self.answer["content"] 22 | 23 | def __repr__(self) -> str: 24 | return f"""{self.question} 25 | {self.answer_str()}""" 26 | 27 | @staticmethod 28 | def question_answer_factory(question: str, answer: dict): 29 | return QuestionAnswer( 30 | id=None, question=question, answer=answer, clarification="" 31 | ) 32 | 33 | @staticmethod 34 | def question_factory(question: str): 35 | return QuestionAnswer(id=None, question=question, answer="", clarification="") 36 | 37 | 38 | @dataclass 39 | class Questionnaire: 40 | questions: List[QuestionAnswer] 41 | 42 | def __repr__(self) -> str: 43 | return "\n\n".join([str(qa) for qa in self.questions]) 44 | 45 | 
def __len__(self): 46 | return len(self.questions) 47 | 48 | def answers_str(self) -> str: 49 | return "\n\n".join( 50 | [ 51 | ( 52 | qa.answer["content"] or "" 53 | if isinstance(qa.answer, dict) 54 | else qa.answer or "" 55 | ) 56 | for qa in self.questions 57 | ] 58 | ) 59 | 60 | def to_html(self) -> str: 61 | html = """ 62 | """ 63 | for qa in self.questions: 64 | answer = qa.answer 65 | html += f""" 66 | 67 | 71 | 72 | 73 | 74 | 75 | """ 76 | html += "
68 |
69 | Q: {qa.question} 70 |
A: {answer}
" 77 | return html 78 | 79 | 80 | def convert_to_question_answers( 81 | response_questions: ResponseQuestions, 82 | ) -> List[QuestionAnswer]: 83 | question_answers = [] 84 | for i, q in enumerate(response_questions.questions): 85 | question_answer = QuestionAnswer.question_factory(q) 86 | question_answers.append(question_answer) 87 | if i == 0: 88 | question_answer.possible_answers = response_questions.possible_answers 89 | return question_answers 90 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/confidence_schema.py: -------------------------------------------------------------------------------- 1 | from enum import StrEnum 2 | from typing import Union 3 | 4 | from pydantic.v1 import BaseModel, Field 5 | 6 | from data_questionnaire_agent.translation import t 7 | 8 | 9 | class ConfidenceDegree(StrEnum): 10 | outstanding = "outstanding" 11 | high = "high" 12 | medium = "medium" 13 | mediocre = "mediocre" 14 | low = "low" 15 | 16 | 17 | CONFIDENCE_DEGREE_DICT = { 18 | ConfidenceDegree.outstanding: 5, 19 | ConfidenceDegree.high: 4, 20 | ConfidenceDegree.medium: 3, 21 | ConfidenceDegree.mediocre: 2, 22 | ConfidenceDegree.low: 1, 23 | } 24 | 25 | 26 | class ConfidenceRating(BaseModel): 27 | """Represents a rating of how confident the model is to give advice to a customer based on a questionnaire""" 28 | 29 | id: Union[int, None] = Field( 30 | default=None, description="The identifier of this session configuration" 31 | ) 32 | 33 | reasoning: str = Field( 34 | ..., description="The models's reasoning behind the confidence rating." 
35 | ) 36 | 37 | rating: ConfidenceDegree = Field( 38 | ..., 39 | description="The confidence rating of the model to give advice to a customer based on a questionnaire", 40 | ) 41 | 42 | def _value(self) -> int: 43 | return CONFIDENCE_DEGREE_DICT[self.rating] 44 | 45 | def __lt__(self, other): 46 | if isinstance(other, ConfidenceRating): 47 | return self._value() < other._value() 48 | return NotImplemented 49 | 50 | def __le__(self, other): 51 | if isinstance(other, ConfidenceRating): 52 | return self._value() <= other._value() 53 | return NotImplemented 54 | 55 | def __gt__(self, other): 56 | if isinstance(other, ConfidenceRating): 57 | return self._value() > other._value() 58 | return NotImplemented 59 | 60 | def __ge__(self, other): 61 | if isinstance(other, ConfidenceRating): 62 | return self._value() >= other._value() 63 | return NotImplemented 64 | 65 | def to_markdown(self, locale: str = "en") -> str: 66 | return f""" 67 | # {t("Confidence Degree", locale=locale)} 68 | 69 | `{t("confidence_degree_" + self.rating, locale=locale)}` 70 | 71 | ## {t("Reasoning", locale=locale)} 72 | 73 | {self.reasoning} 74 | """ 75 | 76 | def to_html(self, language: str = "en") -> str: 77 | return f""" 78 |
{t("confidence_degree_" + self.rating)}
79 | 80 |

{t("Reasoning", locale=language)}

81 | 82 |

{self.reasoning}

83 | """ 84 | 85 | 86 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/consultant_rating.py: -------------------------------------------------------------------------------- 1 | from enum import StrEnum 2 | from typing import Optional 3 | 4 | from pydantic.v1 import BaseModel, Field 5 | 6 | 7 | class AnalystRating(StrEnum): 8 | very_suitable = "very suitable" 9 | suitable = "suitable" 10 | moderately_suitable = "moderately suitable" 11 | hardly_suitable = "hardly suitable" 12 | unsuitable = "unsuitable" 13 | 14 | 15 | SCORES = { 16 | AnalystRating.very_suitable: 5, 17 | AnalystRating.suitable: 4, 18 | AnalystRating.moderately_suitable: 3, 19 | AnalystRating.hardly_suitable: 2, 20 | AnalystRating.unsuitable: 1, 21 | } 22 | 23 | 24 | class ConsultantRating(BaseModel): 25 | """Represents the degree to which a consultant with a specific profile can help a customer that answered a questionnaire.""" 26 | 27 | analyst_name: str = Field(..., description="The analyst's name.") 28 | 29 | analyst_linkedin_url: Optional[str] = Field( 30 | ..., description="The optional analyst LinkedIN URL" 31 | ) 32 | 33 | reasoning: str = Field( 34 | ..., 35 | description="""The models's reasoning behind the analyst rating. 36 | Why is this analyst suitable or not to support the client which answered the questionnnaire. 37 | You can use markdown to highlight the main keywords in the reasoning. 
38 | """, 39 | ) 40 | 41 | rating: AnalystRating = Field( 42 | ..., 43 | description="The rating of the consulting capabilities of the consultant to help the customer", 44 | ) 45 | 46 | 47 | class ScoredConsultantRating(ConsultantRating): 48 | """Rrepresents a scored consultant rating.""" 49 | 50 | score: int = Field( 51 | ..., 52 | description="The score applied to this consultant for the current customer.", 53 | ) 54 | 55 | 56 | class ConsultantRatings(BaseModel): 57 | consultant_ratings: list[ConsultantRating] = Field( 58 | ..., description="The list of consultant ratings" 59 | ) 60 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/global_configuration.py: -------------------------------------------------------------------------------- 1 | from pydantic.v1 import BaseModel, Field 2 | 3 | from data_questionnaire_agent.log_init import logger 4 | 5 | 6 | class GlobalConfigurationProperty(BaseModel): 7 | config_key: str = Field(..., description="The configuration key") 8 | config_value: str = Field(..., description="The configuration value") 9 | 10 | 11 | class GlobalConfiguration(BaseModel): 12 | properties: list[GlobalConfigurationProperty] = Field( 13 | ..., description="The list of properties in the configuration" 14 | ) 15 | 16 | def get_default_session_steps(self, default_steps: int): 17 | for p in self.properties: 18 | if p.config_key == "MESSAGE_LOWER_LIMIT": 19 | try: 20 | return int(p.config_value) 21 | except ValueError as e: 22 | logger.warn( 23 | f"Could not extract default steps from configuration: {e}" 24 | ) 25 | return default_steps 26 | return default_steps 27 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/initial_question_data.py: -------------------------------------------------------------------------------- 1 | special_question_data = [ 2 | { 3 | "img_src": "poor_data_quality.png", 4 | "img_alt": "Poor data 
quality", 5 | "title": "Poor data quality", 6 | "text": "Low-quality data can lead to incorrect insights and poor decision-making.", 7 | }, 8 | { 9 | "img_src": "compliance_risks.png", 10 | "img_alt": "Compliance and security risks", 11 | "title": "Compliance and security risks", 12 | "text": "Mishandling data can lead to legal troubles and reputational damage.", 13 | }, 14 | { 15 | "img_src": "data_silos.png", 16 | "img_alt": "Data silos", 17 | "title": "Data silos", 18 | "text": "Data trapped in departmental silos can be inaccessible to other parts.", 19 | }, 20 | { 21 | "img_src": "lack_of_skilled_personnel.png", 22 | "img_alt": "Lack of skilled personnel", 23 | "title": "Lack of skilled personnel", 24 | "text": "Missing skills in data science, analytics, AI and ML can impede the effective use of data.", 25 | }, 26 | { 27 | "img_src": "data_overload.png", 28 | "img_alt": "Data overload", 29 | "title": "Data overload", 30 | "text": '"Data glut" can slow down processes and make it difficult to identify what data is actually useful.', 31 | }, 32 | { 33 | "img_src": "cost_and_complexity.png", 34 | "img_alt": "Cost and complexity", 35 | "title": "Cost and complexity", 36 | "text": "A robust data analytics infrastructure requires significant investment of resources.", 37 | }, 38 | { 39 | "img_src": "inconsistent_data_strategies.png", 40 | "img_alt": "Inconsistent data strategies", 41 | "title": "Inconsistent data strategies", 42 | "text": "Difficult to align with modern concepts like Data Fabric, Mesh and Generative AI.", 43 | }, 44 | { 45 | "img_src": "resistence_to_change.png", 46 | "img_alt": "Resistance to change", 47 | "title": "Resistance to change", 48 | "text": "Employees need to adapt to new ways of operating to make data-driven transformation work.", 49 | }, 50 | ] 51 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/jwt_token.py: 
-------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic.v1 import BaseModel, Field 4 | 5 | 6 | class JWTToken(BaseModel): 7 | """JWT Token Data""" 8 | 9 | email: str = Field(..., description="The email") 10 | token: str = Field(..., description="The whole token") 11 | 12 | 13 | class JWTTokenData(BaseModel): 14 | name: str = Field(..., description="The subject on the token") 15 | email: str = Field(..., description="The email") 16 | time_delta_minutes: Optional[int] = Field( 17 | ..., description="Determines the expiry date of the token" 18 | ) 19 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/languages.py: -------------------------------------------------------------------------------- 1 | ENGLISH = "en" 2 | FARSI = "fa" 3 | 4 | DEFAULT_LANGUAGE = ENGLISH 5 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/mail_data.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | 6 | class MailData(BaseModel): 7 | person_name: str = Field(..., description="The name of the person") 8 | email: str = Field(..., description="The actual name of the person") 9 | 10 | 11 | class Email(BaseModel): 12 | recipient: str = Field(..., description="The recipient email") 13 | subject: str = Field(..., description="The email subject") 14 | html_body: str = Field(..., description="The html email body") 15 | files: list[Path] = Field(..., description="The attachment files") 16 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/ontology_schema.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | from pydantic.v1 import BaseModel, Field 4 
| 5 | 6 | class Edge(BaseModel): 7 | """Represents single triplet with source, target and relationship names""" 8 | 9 | source: str = Field(..., description="The source node") 10 | relationship: str = Field( 11 | ..., description="The relationship between the source and the target node" 12 | ) 13 | target: str = Field(..., description="The target node") 14 | 15 | 16 | class Ontology(BaseModel): 17 | """Represents a container with a list of source, target and relationship names""" 18 | 19 | relationships: List[Edge] = Field( 20 | ..., 21 | description="The list of all edges in the questionnaire", 22 | ) 23 | 24 | 25 | class AnalyzedOntology(BaseModel): 26 | """Represents a container with a list of source, target and relationship names with extended measurements""" 27 | 28 | relationships: List[Edge] = Field( 29 | ..., 30 | description="The list of all edges with extended information in the questionnaire", 31 | ) 32 | degree_centrality: Dict[str, float] = Field( 33 | ..., 34 | description="The dictionary of node to degree centrality", 35 | ) 36 | betweenness_centrality: Dict[str, float] = Field( 37 | ..., 38 | description="The dictionary of node to degree betweenness", 39 | ) 40 | connected_component_importance_dict: Dict[str, int] = Field( 41 | ..., 42 | description="Dictionary containing the number of nodes in the subgraph of the node which is the key", 43 | ) 44 | 45 | 46 | if __name__ == "__main__": 47 | print(Ontology.schema_json()) 48 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/question_suggestion.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from pydantic.v1 import BaseModel, Field 4 | 5 | 6 | class QuestionSuggestion(BaseModel): 7 | id: Union[int, None] = Field(default=None, description="The unique identifier") 8 | img_src: str = Field(..., description="The image associated to the suggestion") 9 | img_alt: str = 
Field(..., description="The alternative description of the image") 10 | title: str = Field(..., description="The suggestion title") 11 | main_text: str = Field(..., description="The suggestion text") 12 | svg_image: str = Field(..., description="The SVG image") 13 | 14 | 15 | class QuestionAndSuggestions(BaseModel): 16 | id: Union[int, None] = Field(default=None, description="The unique identifier") 17 | question: str = Field(default=None, description="The actual question") 18 | suggestions: list[QuestionSuggestion] = Field( 19 | ..., description="The list of suggested question suggestions" 20 | ) 21 | 22 | 23 | class QuestionInfo(BaseModel): 24 | question_and_suggestions: list[QuestionAndSuggestions] = Field( 25 | ..., description="Theh list of question and suggestions" 26 | ) 27 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/questionnaire_status.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Union 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | 7 | class QuestionnaireStatus(BaseModel): 8 | id: Union[int, None] = Field(default=None, description="The unique identifier") 9 | session_id: str = Field(..., description="The session identifier") 10 | question: str = Field(..., description="The question") 11 | answer: Union[str, None] = Field(default="", description="The optional answer") 12 | final_report: bool = Field(..., description="Whether this is the final report") 13 | created_at: Union[datetime, None] = Field( 14 | default=None, description="The created timestamp" 15 | ) 16 | updated_at: Union[datetime, None] = Field( 17 | default=None, description="The updated timestamp" 18 | ) 19 | total_cost: float = Field(default=0, description="") 20 | clarification: Union[str, None] = Field( 21 | default=None, description="The clarification or explanation of the question" 22 | ) 23 | question_id: int | None 
= Field( 24 | default=None, 25 | description="The question identifier in case a question is NOT generated", 26 | ) 27 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/server_model.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Union 2 | 3 | from pydantic.v1 import BaseModel, Field 4 | 5 | from data_questionnaire_agent.model.global_configuration import GlobalConfiguration 6 | from data_questionnaire_agent.model.questionnaire_status import QuestionnaireStatus 7 | from data_questionnaire_agent.model.session_configuration import SessionConfiguration 8 | 9 | 10 | class ServerMessage(BaseModel): 11 | session_id: str = Field(..., description="The application's source identifier") 12 | question: str = Field(..., description="The question") 13 | answer: str = Field( 14 | default="", description="The answer to the question. Should come from the user" 15 | ) 16 | final_report: bool = Field( 17 | default=False, 18 | description="Whether the question is to be seen as a final report", 19 | ) 20 | suggestions: List[Any] = Field( 21 | default=[], description="The list of suggested responses" 22 | ) 23 | clarification: Union[str, None] = Field( 24 | ..., description="The clarification or explanation of the question" 25 | ) 26 | question_id: int | None = Field( 27 | default=None, 28 | description="The question identifier in case a question is NOT generated", 29 | ) 30 | 31 | 32 | class ServerMessages(BaseModel): 33 | session_id: str = Field(..., description="The application's source identifier") 34 | server_messages: List[ServerMessage] = Field( 35 | ..., description="A list with server messages" 36 | ) 37 | session_configuration: Union[SessionConfiguration, None] = Field( 38 | default=None, description="The session configuration" 39 | ) 40 | global_configuration: Union[GlobalConfiguration, None] = Field( 41 | default=None, description="The global 
configuration with key value pairs" 42 | ) 43 | 44 | 45 | class ErrorMessage(BaseModel): 46 | session_id: str = Field(..., description="The application's source identifier") 47 | error: str = Field(..., description="The error message") 48 | 49 | 50 | def assert_server_messages_factory(questionnaire: List[QuestionnaireStatus]): 51 | assert questionnaire is not None, "The questionnaire is none" 52 | assert len(questionnaire) > 0, "Questionnaire is empty" 53 | 54 | 55 | def convert_questionnaire( 56 | questionnaire: List[QuestionnaireStatus], 57 | ) -> List[ServerMessage]: 58 | return [ 59 | ServerMessage( 60 | session_id=q.session_id, 61 | question=q.question, 62 | answer="" if q.answer is None else q.answer, 63 | final_report=q.final_report, 64 | clarification=q.clarification, 65 | question_id=q.question_id, 66 | ) 67 | for q in questionnaire 68 | ] 69 | 70 | 71 | def server_messages_factory(questionnaire: List[QuestionnaireStatus]) -> ServerMessages: 72 | assert_server_messages_factory(questionnaire) 73 | session_id = questionnaire[0].session_id 74 | return ServerMessages( 75 | session_id=session_id, 76 | server_messages=convert_questionnaire(questionnaire), 77 | ) 78 | -------------------------------------------------------------------------------- /data_questionnaire_agent/model/session_configuration.py: -------------------------------------------------------------------------------- 1 | from enum import StrEnum 2 | from typing import List, Union 3 | 4 | from pydantic.v1 import BaseModel, Field 5 | 6 | DEFAULT_SESSION_STEPS = 6 7 | 8 | SESSION_STEPS_CONFIG_KEY = "session-steps" 9 | SESSION_STEPS_LANGUAGE_KEY = "session-language" 10 | SESSION_CHAT_TYPE = "session-chat-type" 11 | CLIENT_ID_KEY = "session-client-id" 12 | 13 | 14 | class ChatType(StrEnum): 15 | DIVERGING = "diverging" 16 | TO_THE_POINT = "to_the_point" 17 | 18 | 19 | DEFAULT_CHAT_TYPE = ChatType.DIVERGING 20 | 21 | 22 | def chat_type_factory(s: str) -> ChatType: 23 | try: 24 | return ChatType(s) 25 | 
except ValueError: 26 | return DEFAULT_CHAT_TYPE 27 | 28 | 29 | class SessionConfigurationEntry(BaseModel): 30 | id: Union[int, None] = Field( 31 | default=None, description="The identifier of this session configuration" 32 | ) 33 | session_id: str = Field(..., description="The application's source identifier") 34 | config_key: str = Field(..., description="The configuration key") 35 | config_value: str = Field(..., description="The configuration value") 36 | 37 | 38 | class SessionConfiguration(BaseModel): 39 | configuration_entries: List[SessionConfigurationEntry] = Field( 40 | ..., description="All session configuration entries" 41 | ) 42 | 43 | 44 | class SessionProperties(BaseModel): 45 | session_steps: int = Field(..., description="The session steps") 46 | session_language: str = Field(..., description="The session language") 47 | chat_type: ChatType = Field(..., description="The type of the chat") 48 | 49 | 50 | def create_session_configurations( 51 | session_id: str, 52 | session_properties: SessionProperties, 53 | client_id: str = "", 54 | ) -> List[SessionConfigurationEntry]: 55 | session_steps = session_properties.session_steps 56 | chat_type = session_properties.chat_type 57 | language = session_properties.session_language 58 | session_configuration_entry = SessionConfigurationEntry( 59 | session_id=session_id, 60 | config_key=SESSION_STEPS_CONFIG_KEY, 61 | config_value=str(session_steps), 62 | ) 63 | session_configuration_language = SessionConfigurationEntry( 64 | session_id=session_id, 65 | config_key=SESSION_STEPS_LANGUAGE_KEY, 66 | config_value=language, 67 | ) 68 | session_configuration_chat_type = SessionConfigurationEntry( 69 | session_id=session_id, 70 | config_key=SESSION_CHAT_TYPE, 71 | config_value=chat_type.value, 72 | ) 73 | session_keys = [ 74 | session_configuration_entry, 75 | session_configuration_language, 76 | session_configuration_chat_type, 77 | ] 78 | if client_id is not None and len(client_id.strip()) > 0: 79 | 
session_keys.append( 80 | SessionConfigurationEntry( 81 | session_id=session_id, 82 | config_key=CLIENT_ID_KEY, 83 | config_value=client_id, 84 | ) 85 | ) 86 | return session_keys 87 | -------------------------------------------------------------------------------- /data_questionnaire_agent/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/data_questionnaire_agent/server/__init__.py -------------------------------------------------------------------------------- /data_questionnaire_agent/server/agent_session.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from ulid import ULID 4 | 5 | 6 | class AgentSession: 7 | def __init__( 8 | self, 9 | # Id from the session cookie 10 | socket_id: str, 11 | session_id: Optional[str], 12 | ): 13 | self.socket_id = socket_id 14 | self.session_id = ( 15 | session_id 16 | if session_id is not None and len(session_id) > 0 17 | else str(ULID()) 18 | ) 19 | agent_sessions[self.session_id] = self 20 | 21 | 22 | agent_sessions: Dict[str, AgentSession] = {} 23 | -------------------------------------------------------------------------------- /data_questionnaire_agent/server/questionnaire_server_consultants.py: -------------------------------------------------------------------------------- 1 | from aiohttp import web 2 | 3 | from data_questionnaire_agent.log_init import logger 4 | from data_questionnaire_agent.server.server_support import ( 5 | CORS_HEADERS, 6 | extract_language, 7 | extract_session, 8 | routes, 9 | ) 10 | from data_questionnaire_agent.service.consultant_service import ( 11 | calculate_consultant_ratings_for, 12 | ) 13 | 14 | 15 | @routes.options("/consultant/ratings/{session_id}") 16 | async def consultant_ratings(_: web.Request) -> web.Response: 17 | return 
@routes.options("/consultant/ratings/{session_id}")
async def consultant_ratings_options(_: web.Request) -> web.Response:
    """CORS pre-flight handler: accept requests from any host."""
    # Renamed from `consultant_ratings`: the GET handler below used the same
    # name and silently shadowed this function at module level (flake8 F811),
    # which confuses tracebacks and tooling even though aiohttp had already
    # registered the route.
    return web.json_response({"message": "Accept all hosts"}, headers=CORS_HEADERS)


@routes.get("/consultant/ratings/{session_id}")
async def consultant_ratings(request: web.Request) -> web.Response:
    """Compute and return the consultant ratings for a session's final report.

    Raises:
        web.HTTPBadRequest: when the session is unknown or has no final report.
    """
    session_id = extract_session(request)
    language = extract_language(request)
    consultant_ratings = await calculate_consultant_ratings_for(session_id, language)
    if not consultant_ratings:
        raise web.HTTPBadRequest(
            text="Cannot find any consultant ratings for that report, because either the session does not exist or does not have a final report.",
            headers=CORS_HEADERS,
        )
    logger.info("Generating consultant advice for: %s", session_id)
    return web.json_response(consultant_ratings.dict(), headers=CORS_HEADERS)
def run_server():
    """Register all routes and static mounts, then start the aiohttp server."""
    # Client-side routing: every numeric step URL and each index link must
    # serve the SPA entry point.
    for step in range(MAX_SESSION_STEPS):
        app.router.add_get(f"/{step}", get_index)
    for link in INDEX_LINKS:
        app.router.add_get(link, get_index)
    app.add_routes(routes)
    app.router.add_static(
        "/images", path=web_server_cfg.images_folder.as_posix(), name="images"
    )
    app.router.add_static("/", path=web_server_cfg.ui_folder.as_posix(), name="ui")
    loop = asyncio.new_event_loop()

    web.run_app(
        app,
        host=websocket_cfg.websocket_server,
        port=websocket_cfg.websocket_port,
        loop=loop,
    )


if __name__ == "__main__":
    run_server()
async def handle_error(fun: "Callable[[web.Request], Awaitable]", **kwargs) -> "Any":
    """Invoke ``fun`` with the request found in ``kwargs`` and convert any
    failure into an HTTP 400 response.

    Parameters:
        fun: An async callable taking the web request. (The previous
            ``Awaitable`` annotation was wrong: a coroutine *function* is
            passed here, not an awaitable instance; likewise ``-> any`` used
            the builtin function instead of ``typing.Any``. String annotations
            avoid importing names not in this module's scope.)
        kwargs: Must contain the key ``request`` with the current web request.

    Raises:
        web.HTTPBadRequest: when the wrapped handler raises any exception.
    """
    try:
        return await fun(kwargs["request"])
    except Exception as e:
        logger.error(f"Error occurred: {e}", exc_info=True)
        raise web.HTTPBadRequest(
            text="Please make sure the JSON body is available and well formatted."
        )


def extract_session(request: web.Request) -> str:
    """Read the ``session_id`` path parameter or fail with 404."""
    session_id = request.match_info.get("session_id", None)
    logger.info("PDF session_id: %s", session_id)
    if session_id is None:
        raise web.HTTPNotFound(text="No session id specified")
    return session_id


def extract_language(request: web.Request) -> str:
    """Return the ``language`` query parameter, defaulting to English."""
    return request.rel_url.query.get("language", "en")
RESULT_LIMIT = 5


class SearchTermsInput(BaseModel):
    """Search terms used for the search"""

    search_terms: List[str] = Field(
        ...,
        description="Search terms used to get more information from the search engine",
    )


class DuckDuckGoTool(BaseTool):
    name = "get_search_duck_duck_go"
    description = "Useful to find search results on all possible topics. It produces searches for information via duck duck go search engine."

    def _run(self, search_terms: List[str]) -> Any:
        """Use the tool.

        Add run_manager: Optional[CallbackManagerForToolRun] = None
        to child implementations to enable tracing,
        """
        # Bug fix: join with a space. "".join(...) glued multiple search terms
        # into one unreadable token (e.g. ["dark", "data"] -> "darkdata"),
        # corrupting the query sent to the search engine.
        search_res = text_search(" ".join(search_terms), limit=RESULT_LIMIT)
        return "\n".join([r["body"] for r in search_res])

    args_schema: Optional[Type[BaseModel]] = SearchTermsInput


def create_clarification_agent() -> AgentExecutor:
    """Create an OpenAI-functions agent equipped with the DuckDuckGo tool."""
    tools = [DuckDuckGoTool()]
    return initialize_agent(
        tools, cfg.llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True
    )


def text_search(input: str, limit: int = 10) -> List[dict]:
    """Run a DuckDuckGo text search and return up to ``limit`` result dicts."""
    from itertools import islice

    res_list = []
    with DDGS() as ddgs:
        for r in islice(
            ddgs.text(input, region="wt-wt", safesearch="off", timelimit="y"), limit
        ):
            res_list.append(r)
    return res_list


def answers(input: str):
    """Print DuckDuckGo instant answers for ``input`` (debug helper)."""
    with DDGS() as ddgs:
        for r in ddgs.answers(input):
            print(r)


if __name__ == "__main__":

    def test_search():
        res_list = text_search("Dark data")
        for res in res_list:
            print(res)

    tool = DuckDuckGoTool(
        name="search_duck_duck_go",
        description="Used to find answers and explanations based on keywords",
    )
    res = tool.run({"search_terms": ["Weather in London tomorrow"]})
    from data_questionnaire_agent.log_init import logger

    logger.info(res)
    agent_executor = create_clarification_agent()
    response = agent_executor.run("What is dark data?")
    logger.info("Agent response: %s", response)
    logger.info("Agent response type: %s", type(response))
def prompt_factory_confidence(language: str) -> ChatPromptTemplate:
    """Build the chat prompt used to rate confidence in a questionnaire."""
    prompts = get_prompts(language)
    assert (
        "confidence_prompt" in prompts
    ), "Make sure that you have the confidence prompt in your prompts file."
    return prompt_factory_generic(
        section=prompts["confidence_prompt"],
        input_variables=[PARAM_QUESTIONS_ANSWERS],
        prompts=prompts,
    )


def create_structured_question_call(language: str) -> RunnableSequence:
    """Compose the confidence prompt with a structured-output model."""
    return prompt_factory_confidence(language) | cfg.llm.with_structured_output(
        ConfidenceRating
    )


def prepare_confidence_chain_call(questionnaire: Questionnaire) -> dict:
    """Map the questionnaire onto the prompt's input variables."""
    return {PARAM_QUESTIONS_ANSWERS: str(questionnaire)}


async def calculate_confidence_rating(
    questionnaire: Questionnaire, language: str
) -> Union[ConfidenceRating, None]:
    """Ask the LLM for a confidence rating on the questionnaire.

    Returns None (after logging) when the chain invocation fails.
    """
    assert questionnaire is not None, "Missing questionnaire"
    try:
        chain = create_structured_question_call(language)
        return await chain.ainvoke(prepare_confidence_chain_call(questionnaire))
    except Exception:
        logger.exception("Failed to calculate confidence rating.")
        return None
async def execute_script(path: Path) -> Union[str, bool]:
    """Execute the SQL bootstrap script at ``path`` against the database.

    The script only runs when the schema is missing (no
    ``tb_session_configuration`` table yet) or when the script itself creates
    a database.

    Returns:
        True when the script ran, False when it was skipped, or the error
        message (str) when execution failed.
    """
    assert path.exists(), f"Path {path} does not exist."
    with open(path, "r") as file:
        sql_script = file.read()
        assert len(sql_script) > 0, "SQL script seems to be empty"

    conn = None
    try:
        conn = await create_connection()
        await conn.set_autocommit(True)
        async with conn.cursor() as cursor:
            exists = await table_exists("tb_session_configuration", cursor)
            if not exists or "create database" in sql_script.lower():
                # If there are no table or no questions the script is executed.
                # (The previous `cursor = await cursor.execute(...)` rebinding
                # was pointless and shadowed the context-managed cursor.)
                await cursor.execute(sql_script)
            else:
                return False
        return True
    except Exception as e:
        logger.exception("Cannot execute script")
        return str(e)
    finally:
        if conn:
            await conn.close()


async def table_exists(table: str, cursor: AsyncCursor) -> bool:
    """Check whether ``table`` exists in the public schema."""
    await cursor.execute(
        """
SELECT EXISTS (
    SELECT FROM information_schema.tables
    WHERE table_schema = 'public'
    AND table_name = %(table)s
);
""",
        {"table": table},
    )
    row = await cursor.fetchone()
    # Defensive: fetchone() may in principle return no row.
    return bool(row) and bool(row[0])
VST = TypeVar("VST", bound="VectorStore")


def load_text(path: Path) -> List["Document"]:
    """
    Loads the text files and extracts chunks of a pre-defined size.
    It simply loads whole documents without any splitting.

    Parameters:
        path (Path): The path where the documents are saved.

    Returns:
        List[Document]: Returns a list of documents
    """
    assert path.exists(), f"Path {path} does not exist"
    all_pages = []
    for text_file in path.glob("*.txt"):
        loader = TextLoader(text_file.as_posix(), encoding="utf-8")
        pages: List["Document"] = loader.load()
        for i, p in enumerate(pages):
            # Strip the directory part, keep only "file page N" as the source.
            file_name = re.sub(r".+[\\/]", "", p.metadata["source"])
            p.metadata["source"] = f"{file_name} page {i + 1}"
        all_pages.extend(pages)
        logger.info(f"Processed {text_file}, all_pages size: {len(all_pages)}")
    log_stats(all_pages)
    return all_pages


def log_stats(documents: List["Document"]):
    """
    Logs statistics about a list of documents.

    Parameters:
        documents (List[Document]): The list of documents with the knowledge base.
    """
    logger.info(f"Total number of documents {len(documents)}")
    counts = [count_words(d) for d in documents]
    logger.info(f"Tokens Max {np.max(counts)}")
    logger.info(f"Tokens Min {np.min(counts)}")
    # Bug fix: this line previously repeated the "Tokens Min" label although
    # it logs the mean.
    logger.info(f"Tokens Mean {np.mean(counts)}")


def count_words(document: "Document") -> int:
    """Count the whitespace/comma/period separated tokens in the page text."""
    # Raw string: "[\s,.]" in a normal literal relies on an invalid escape.
    splits = [s for s in re.split(r"[\s,.]", document.page_content) if len(s) > 0]
    return len(splits)
def generate_embeddings(documents: List["Document"], persist_directory: str) -> "VST":
    """
    Receives a list of documents and generates the embeddings via OpenAI API.

    Parameters:
        documents (List[Document]): The document list with one page per document.
        persist_directory (str): The directory where the FAISS index is saved.

    Returns:
        VST: Returns a reference to the vector store, or None on failure.
    """
    try:
        docsearch = FAISS.from_documents(documents, cfg.embeddings)
        docsearch.save_local(persist_directory)
        logger.info("Vector database persisted")
        return docsearch
    except Exception:
        logger.exception("Failed to process documents")
        # Bug fix: the previous fallback called docsearch.persist(), but the
        # LangChain FAISS store has no persist() method (that is the Chroma
        # API), so a failed save_local() raised AttributeError here instead
        # of returning None as intended.
        return None


if __name__ == "__main__":
    from data_questionnaire_agent.config import cfg

    raw_text_folder = cfg.raw_text_folder
    document_list = load_text(raw_text_folder)
    logger.info("Amount of documents: %d", len(document_list))
    generate_embeddings(document_list, cfg.embeddings_persistence_dir.as_posix())
def extract_nodes(
    relationships: List["Relationship"],
) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Collect the unique node names from the relationships and assign ids.

    Parameters:
        relationships: Objects exposing ``source`` and ``target`` attributes
            (the previous ``List[Dict[str, str]]`` annotation was wrong: the
            code reads ``r.source`` / ``r.target`` as attributes, not keys).

    Returns:
        Tuple[Dict[str, int], Dict[int, str]]: name -> id and id -> name maps.
    """
    node_set = set()
    for r in relationships:
        node_set.add(r.source)
        node_set.add(r.target)
    name_id_dict = {name: i for i, name in enumerate(node_set)}
    id_name_dict = {i: name for i, name in enumerate(node_set)}
    return (name_id_dict, id_name_dict)


def extract_edges(
    relationships: List["Relationship"], node_map: Dict[str, int]
) -> List[Tuple[int, int]]:
    """Translate each relationship into a (source_id, target_id) edge.

    Parameters:
        relationships: Objects exposing ``source`` and ``target`` attributes.
        node_map: Mapping from node name to node id (the previous
            ``Dict[int, str]`` annotation had key and value types swapped).

    Returns:
        List[Tuple[int, int]]: One id pair per relationship, in input order.
    """
    edges = []
    for rel in relationships:
        source_id = node_map[rel.source]
        target_id = node_map[rel.target]
        edges.append((source_id, target_id))
    return edges
async def graphrag_context(question: str) -> Union[str, None]:
    """Fetch supporting context for ``question`` from the graphrag service.

    Returns the ``context_text`` field of the JSON response, or None when the
    request fails, the status is not 200 or the payload lacks that field.
    """
    raw_params = {
        "question": question,
        "use_context_records": False,
        "search": cfg.graphrag_mode,
        "context_size": cfg.graphrag_context_size,
        "project": cfg.graphrag_project,
    }
    # Drop unset parameters so they are not serialised as "None".
    params = {key: value for key, value in raw_params.items() if value is not None}

    headers = {
        "Authorization": f"Bearer {cfg.graphrag_jwt}"  # Add the JWT to the Authorization header
    }

    timeout = httpx.Timeout(
        connect=5.0,  # 5 seconds for connection establishment
        read=cfg.graphrag_read_timeout,  # 20 seconds for reading response
        write=10.0,  # 10 seconds for sending request data
        pool=5.0,  # 5 seconds for acquiring a connection from the pool
    )
    context_url = f"{cfg.graphrag_base_url}/context"
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(
                context_url, params=params, headers=headers, timeout=timeout
            )
        except httpx.RequestError:
            logger.exception(
                f"Failed to process query ({context_url}) to fetch graphrag context."
            )
            return None
        if response.status_code != 200:
            logger.warning(
                f"Failed with status code {response.status_code}: {response.text}"
            )
            return None
        json_result = response.json()
        if "context_text" not in json_result:
            logger.warning("Could not find context_text field in json")
            return None
        return json_result["context_text"]


if __name__ == "__main__":
    import asyncio

    res = asyncio.run(
        graphrag_context(
            "Which areas of your data ecosystem are you most converned about?"
        )
    )
    print(res)
def generate_html(
    questionnaire: Questionnaire, advices: ConditionalAdvice, language: str = "en"
) -> str:
    """Render the questionnaire results into the HTML report template."""
    template_loader = jinja2.FileSystemLoader(cfg.template_location)
    template_env = jinja2.Environment(loader=template_loader)
    template = template_env.get_template("results-template.html")
    context = {
        "banner": t("banner_link", locale=language),
        "questionnaire": questionnaire.to_html(),
        "advices": replace_bold_markdown(advices.to_advice_html()),
        "avoids": replace_bold_markdown(advices.to_avoid_html()),
        "title_confidence": t("Confidence Degree", locale=language),
        "confidence": replace_bold_markdown(advices.confidence_html(language)),
        "positive_outcomes": replace_bold_markdown(advices.positive_outcomes_html()),
        "timestamp": datetime.today().strftime("%A, %b %d %Y"),
        "big_thank_you": t(
            "A big thank you for completing a session with",
            name=cfg.product_title,
            locale=language,
        ),
        "intro_advice": t("intro_advice", locale=language),
        "offering_long": t("offering_long", locale=language),
        "personal_offer": t("A personal offer for you", locale=language),
        "produced_on": t("Produced on", locale=language),
        "love_feedback": t("We would love your feedback", locale=language),
        "for_more_info": t("for_more_info", locale=language),
        "title_potential_outcomes": t("Potential positive outcomes", locale=language),
        "title_transcript": t("Transcript", locale=language),
        "title_what_to_do": t("What you should do", locale=language),
        "title_what_to_avoid": t("What to avoid", locale=language),
    }
    return template.render(context)


def generate_pdf_from(
    questionnaire: Questionnaire, advices: ConditionalAdvice, language: str = "en"
) -> Path:
    """Render the report HTML and convert it to a PDF via wkhtmltopdf.

    Returns the path of the generated PDF, or None when no questionnaire is
    supplied.
    """
    if questionnaire is None:
        return None
    html = generate_html(questionnaire, advices, language)
    logger.info("PDF html: %s", html)
    target = cfg.pdf_folder / f"generated_advice_{generate_iso()}.pdf"
    logger.info("PDF to be created file name: %s", target)
    pdf_config = pdfkit.configuration(wkhtmltopdf=cfg.wkhtmltopdf_binary.as_posix())
    pdfkit.from_string(
        html,
        target,
        configuration=pdf_config,
        verbose=True,
        options={"--enable-local-file-access": True},
    )
    logger.info("Created PDF: %s", target)
    return target


def generate_iso() -> str:
    """File-name-safe timestamp: ISO format with ':' replaced by '.'."""
    return datetime.now().isoformat().replace(":", ".")


if __name__ == "__main__":
    from data_questionnaire_agent.test.provider.advice_provider import (
        create_simple_advice,
    )
    from data_questionnaire_agent.test.provider.questionnaire_provider import (
        create_questionnaire_2_questions,
    )

    questionnaire: Questionnaire = create_questionnaire_2_questions()
    advices: ConditionalAdvice = create_simple_advice()
    logger.info("PDF Path: %s", generate_pdf_from(questionnaire, advices))
def prompt_factory_initial_questions(language: str) -> ChatPromptTemplate:
    """Build the prompt used to generate the first batch of questions."""
    prompts = get_prompts(language)
    return prompt_factory_generic(
        prompts["questionnaire"]["initial"],
        [
            "knowledge_base",
            "question",
            "answer",
            "questions_per_batch",
        ],
        prompts,
    )


def chain_factory_initial_question(language: str) -> LLMChain:
    """Create the structured-output chain producing ResponseQuestions."""
    prompt = prompt_factory_initial_questions(language)
    return create_structured_output_chain(
        ResponseQuestions,
        cfg.llm,
        prompt,
        verbose=cfg.verbose_llm,
    )


def prepare_initial_question(
    question: str,
    answer: str,
    questions_per_batch: int = 1,
    knowledge_base: str = "",
) -> dict:
    """Assemble the input mapping expected by the initial-question prompt."""
    return {
        "knowledge_base": knowledge_base,
        "question": question,
        "answer": answer,
        "questions_per_batch": questions_per_batch,
    }
# Lazily initialised vector store (only used when graphrag is disabled).
docsearch = None


async def fetch_context(questionnaire: Union[str, Questionnaire]) -> str:
    """Fetch knowledge-base context for a questionnaire or a plain question.

    Uses the graphrag service when enabled, otherwise a local vector
    similarity search. Returns an empty string when nothing is found or any
    error occurs.
    """
    global docsearch

    try:
        if isinstance(questionnaire, Questionnaire):
            questionnaire_str = str(questionnaire)
        else:
            questionnaire_str = questionnaire
        if cfg.use_graphrag:
            knowledge_base = await graphrag_context(questionnaire_str)
        else:
            if docsearch is None:
                # First call: build the vector store once and cache it.
                docsearch = init_vector_search()
            knowledge_base = similarity_search(
                docsearch, questionnaire_str, how_many=cfg.search_results_how_many
            )
        return knowledge_base or ""
    except Exception as e:
        logger.exception("Could not fetch context.")
        logger.error(str(e))
        return ""
ACCEPTED_LANGUAGES = ["en", "de"]


def adapt_language(language: str) -> str:
    """Map a locale string (e.g. "en-GB", "de-AT") to a supported language.

    Matching is case-insensitive, so "DE-AT" also maps to "de" (previously
    upper-case locales always fell back to the default). Unsupported or empty
    locales fall back to the first accepted language.

    Parameters:
        language (str): A locale or language code.

    Returns:
        str: One of ACCEPTED_LANGUAGES.
    """
    # Lower-case first so upper/mixed-case locales are recognised too.
    first_chars = re.sub(r"^([a-z]{2}).*", r"\1", language.lower())
    if first_chars in ACCEPTED_LANGUAGES:
        return first_chars
    return ACCEPTED_LANGUAGES[0]


if __name__ == "__main__":
    assert adapt_language("en-GB") == "en"
    assert adapt_language("en") == "en"
    assert adapt_language("de-DB") == "de"
    assert adapt_language("de-AT") == "de"
    assert adapt_language("pt-PT") == "en"
def prompt_factory_ontology(language: str) -> ChatPromptTemplate:
    """Build the prompt that extracts an ontology from Q&A plus advice."""
    prompts = get_prompts(language)
    assert (
        "extract_ontology" in prompts
    ), "Make sure that you have the ontolgy prompt in your prompts file."
    return prompt_factory_generic(
        section=prompts["extract_ontology"],
        input_variables=[PARAM_ADVICE, PARAM_QUESTIONS_ANSWERS],
        prompts=prompts,
    )


def chain_factory_ontology(language: str) -> LLMChain:
    """Legacy structured-output chain for ontology extraction."""
    return create_structured_output_chain(
        Ontology,
        cfg.llm,
        prompt_factory_ontology(language),
        verbose=cfg.verbose_llm,
    )


def create_structured_question_call(language: str) -> RunnableSequence:
    """Runnable pipeline: ontology prompt piped into a structured-output model."""
    return prompt_factory_ontology(language) | cfg.llm.with_structured_output(Ontology)


def prepare_ontology_chain_call(
    questionnaire: Questionnaire, conditional_advice: ConditionalAdvice
) -> dict:
    """Map the questionnaire and advice onto the prompt input variables."""
    return {
        PARAM_QUESTIONS_ANSWERS: str(questionnaire),
        PARAM_ADVICE: str(conditional_advice),
    }


async def create_ontology(
    questionnaire: Questionnaire, conditional_advice: ConditionalAdvice, language: str
) -> Ontology:
    """Extract an ontology from the questionnaire and the generated advice."""
    assert conditional_advice is not None, "Missing conditional advice"
    assert questionnaire is not None, "Missing questionnaire"
    chain = create_structured_question_call(language)
    return await chain.ainvoke(
        prepare_ontology_chain_call(questionnaire, conditional_advice)
    )
def factory_prompt(
    find_prompt: Callable, params: list[str], language: str = "en"
) -> ChatPromptTemplate:
    """Locate a prompt section via ``find_prompt`` and build a chat prompt."""
    prompts = get_prompts(language)
    return prompt_factory_generic(find_prompt(prompts), params, prompts)


def prompt_factory_generic(
    section: dict,
    input_variables: List[str],
    prompts: object,
    prompt_transform: Union[Callable, None] = None,
) -> ChatPromptTemplate:
    """Assemble the standard four-message chat prompt.

    The prompt consists of the section's system message, its (optionally
    transformed) human message with the given input variables, and the two
    general formatting/language tips.
    """
    human_message = section["human_message"]
    if prompt_transform is not None:
        human_message = prompt_transform(human_message)

    def _fixed(template: str) -> HumanMessagePromptTemplate:
        # Human message without any template variables.
        return HumanMessagePromptTemplate(
            prompt=PromptTemplate(template=template, input_variables=[])
        )

    messages = [
        SystemMessagePromptTemplate(
            prompt=PromptTemplate(
                template=section["system_message"], input_variables=[]
            )
        ),
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                template=human_message,
                input_variables=input_variables,
            )
        ),
        _fixed(prompts["general_messages"]["tip_correct_format"]),
        _fixed(prompts["general_messages"]["tip_language"]),
    ]
    return ChatPromptTemplate(messages=messages)
async def use_connection(func: Coroutine, commit: bool = True) -> Any:
    """Run *func* with a fresh database connection, then commit and close.

    Parameters:
        func: awaitable callable receiving the open AsyncConnection.
        commit: whether to commit before closing (default True).

    Returns:
        Whatever *func* returns, or None when connecting or the callable
        fails (the error is logged, not re-raised).
    """
    # Pre-bind so `finally` cannot raise NameError when create_connection()
    # itself fails (previously `conn` was only assigned inside the try body).
    conn = None
    try:
        conn = await create_connection()
        return await func(conn)
    except Exception as e:
        logger.exception(f"Could not create database connection: {e}")
        return None  # explicit: callers receive None on failure
    finally:
        if conn is not None:
            if commit:
                await conn.commit()
            await conn.close()
def divergent_prompt_transformer(prompt: str, language: str = "en") -> str:
    """Strip the topic-anchoring lines from a secondary-question prompt.

    Diverging chats should not stay tied to the main questionnaire topic, so
    every line mentioning it (in the selected language) is removed.

    Parameters:
        prompt: the raw human-message template text.
        language: ISO language code selecting the exclusion phrases;
            unknown codes fall back to the English phrases.

    Returns:
        The prompt with every line containing an exclusion phrase removed.
    """
    # Renamed from camelCase `defaultExclusions` to follow PEP 8.
    default_exclusions = [
        "Main questionnaire topic:",
        "The questions should explore topics related to the main topic",
    ]
    exclusions_by_language = {
        "en": default_exclusions,
        "de": [
            "Hauptfragebogen-Thema:",
            "Die Fragen sollten Themen im Zusammenhang mit dem Hauptthema",
        ],
    }

    # Get the exclusions for the selected language (English fallback).
    exclusions = exclusions_by_language.get(language, default_exclusions)

    # Keep only the lines that mention none of the exclusion phrases.
    kept = [
        line
        for line in prompt.splitlines()
        if not any(phrase in line for phrase in exclusions)
    ]

    return "\n".join(kept)
def prepare_secondary_question(
    questionnaire: Questionnaire,
    knowledge_base: str,
    questions_per_batch: int = cfg.questions_per_batch,
    is_recreate: bool = False,
) -> dict:
    """Assemble the template variables for the secondary-question prompt.

    Parameters:
        questionnaire: the question/answer history so far.
        knowledge_base: retrieved context text to ground the questions.
        questions_per_batch: how many questions the LLM should produce.
        is_recreate: when True, also pass the question being regenerated.

    Returns:
        A dict keyed by the prompt's input variable names.
    """
    questions = questionnaire.questions
    # The first answer doubles as the questionnaire's overall topic.
    topic = questions[0].answer if questions else ""
    params = {
        "knowledge_base": knowledge_base,
        "questions_answers": str(questionnaire),
        "answers": questionnaire.answers_str(),
        "questions_per_batch": questions_per_batch,
        "questionnaire_topic": topic,
    }
    if is_recreate:
        # The regeneration prompt needs the question it is replacing.
        params["previous_question"] = questions[-1].question
    return params
def create_bullet_list(items: List[str]) -> str:
    """Render *items* as a markdown bullet list, one "- item" per line."""
    bullets = [f"- {item}" for item in items]
    return "\n".join(bullets)
def create_summarization_call(language: str = "en") -> RunnableSequence:
    """Build the prompt → structured-LLM runnable that summarizes one questionnaire."""
    structured_llm = cfg.llm.with_structured_output(ReportDocumentSummarization)
    return prompt_factory_summarization_prompt(language) | structured_llm
# Matches non-greedy **bold** spans; the text between the markers is captured
# in the named group "content".
PATTERN_MARKDOWN_BOLD = re.compile(r"\*\*(?P<content>.+?)\*\*")


def replace_bold_markdown(
    html: str,
    format_str="{content}",
) -> str:
    """Rewrite every markdown **bold** span of *html* using *format_str*.

    The format string may reference {content} (the raw bold text) and
    {quoted_content} (the same text, URL-quoted).
    """

    def render(match: re.Match) -> str:
        content = match.group("content")
        return format_str.format(quoted_content=quote(content), content=content)

    # re.sub with a callable splices the rendered replacements between the
    # untouched stretches, exactly like a manual finditer scan would.
    return PATTERN_MARKDOWN_BOLD.sub(render, html)
async def process_secondary_questions(
    questionnaire: Questionnaire,
    question_per_batch: int,
    session_properties: SessionProperties,
    session_id: str,
) -> List[QuestionAnswer]:
    """Generate the next batch of follow-up questions for a session.

    Fetches supporting context for the questionnaire, asks the LLM for new
    questions and retries (up to 3 outer attempts) when any generated
    question already exists for the session; on the final attempt the batch
    is returned even if it repeats an earlier question.
    """
    knowledge_base = await fetch_context(questionnaire)
    secondary_question_input = prepare_secondary_question(
        questionnaire, knowledge_base, question_per_batch
    )
    retries = 3
    while retries > 0:
        retries -= 1
        # cfg.retry_args drives tenacity's policy for transient LLM failures;
        # each `attempt` context re-raises only when the policy is exhausted.
        async for attempt in AsyncRetrying(**cfg.retry_args):
            with attempt:
                response_questions: ResponseQuestions = (
                    await create_structured_question_call(session_properties).ainvoke(
                        secondary_question_input
                    )
                )
        # Check every candidate question against the session history in parallel.
        tasks = [
            check_question_exists(response_question, session_id)
            for response_question in response_questions.questions
        ]
        results = await asyncio.gather(*tasks)
        has_repeated = any(results)
        if not has_repeated or retries == 0:
            # Accept the batch when it is fresh, or when retries are exhausted
            # (a repeated question is then preferred over returning nothing).
            return convert_to_question_answers(response_questions)
def similarity_search(
    docsearch: FAISS, input: str, how_many=cfg.search_results_how_many
) -> str:
    """
    Performs repeated searches, widening the result count each round, and
    returns the largest joined text whose token count stays below the
    configured threshold.

    Parameters:
        docsearch FAISS: The object used to access the vector database.
        input str: The input of the search.
        how_many int: The initial number of results to be retrieved.

    Returns:
        str: The maximum amount of text with the number of tokens below the
        threshold specified in the configuration.
    """
    max_attempts = 4
    best_below_limit = ""
    for attempt in range(max_attempts):
        # Widen the search by one extra result each round.
        doc_list = docsearch.similarity_search(input, k=how_many + attempt)
        logger.info("Similarity search results: %s", len(doc_list))
        joined = join_pages(doc_list)
        token_count = num_tokens_from_string(joined)
        logger.info("Token count: %d", token_count)
        if token_count > cfg.token_limit:
            # Over budget: fall back to the last result set that fit.
            return best_below_limit
        best_below_limit = joined
    return best_below_limit
def prompt_factory_sentiment() -> ChatPromptTemplate:
    """Build the tagging prompt: one system message plus two human messages
    that both expose the single template variable "answer"."""
    section = prompts["tagging"]

    def human(template: str) -> HumanMessagePromptTemplate:
        return HumanMessagePromptTemplate(
            prompt=PromptTemplate(template=template, input_variables=["answer"])
        )

    system = SystemMessagePromptTemplate(
        prompt=PromptTemplate(template=section["system_message"], input_variables=[])
    )
    return ChatPromptTemplate(
        messages=[
            system,
            human(section["human_message"]),
            human(section["human_message_extraction"]),
        ]
    )
def prepare_sentiment_input(question: str) -> dict:
    """Wrap the user's answer in the dict shape the tagging chain expects."""
    return dict(answer=question)
def test_has_advice_questionnaire():
    """End-to-end: the advice chain must produce advice for dummy fixtures."""
    advice_input = prepare_conditional_advice(
        provide_knowledge_base(), provide_dummy_questionnaire()
    )
    result: ConditionalAdvice = create_chain().invoke(advice_input)
    assert result.has_advice is True, f"{result}"
    print("************ ADVICE ***************")
    print(result.to_html())
def test_create_markdown():
    """Round-trip a simple advice object through markdown and JSON."""
    advice = create_simple_advice()
    markdown = advice.to_markdown()
    assert "- Monitor data quality metrics" in markdown
    print(markdown)
    serialized = advice.json()
    assert serialized is not None
    round_tripped = ConditionalAdvice.parse_raw(serialized)
    assert round_tripped is not None
    assert len(round_tripped.advices) == len(
        advice.advices
    ), "Advices lengths do not match"
def eval_questionnaire(
    questionnaire: Questionnaire, expected: List[str], language: str = "en"
):
    """Run the confidence rating on *questionnaire* and assert the outcome.

    Parameters:
        questionnaire: the questionnaire under evaluation.
        expected: the acceptable rating values.
        language: the session language forwarded to the rating service.
    """
    confidence_rating: ConfidenceRating = asyncio.run(
        # Bug fix: the language argument was previously hard-coded to "en",
        # so the German test case silently evaluated in English.
        calculate_confidence_rating(questionnaire, language)
    )
    assert confidence_rating is not None
    assert confidence_rating.rating is not None
    assert confidence_rating.reasoning is not None
    assert confidence_rating.rating in expected
def provide_dummy_data():
    """Build consultant-call prompt data from fixture questionnaire and advice."""
    prompt_data = asyncio.run(
        prepare_consultant_call(
            create_questionnaire_7_questions(), create_full_advice1()
        )
    )
    assert prompt_data is not None, "There is no prompt data."
    questions_answers = prompt_data["questions_answers"]
    assert questions_answers is not None, "There are no question and answers"
    return prompt_data
def test_initial_question():
    """Drive the initial-question chain end to end with a 'Data Quality' answer."""
    language = "en"
    initial_question = get_prompts_object(language).questionnaire["initial"]["question"]
    assert initial_question is not None

    answer = "Data Quality"
    knowledge_base = asyncio.run(fetch_context(answer))
    chain_input = prepare_initial_question(
        question=initial_question,
        answer=answer,
        questions_per_batch=1,
        knowledge_base=knowledge_base,
    )
    res: dict = chain_factory_initial_question("en").invoke(chain_input)
    assert res is not None
    response_questions: ResponseQuestions = res["function"]

    logger.info("Results: ")
    logger.info(response_questions)
/data_questionnaire_agent/test/jwt_token_service_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from data_questionnaire_agent.service.jwt_token_service import ( 4 | decode_token, 5 | generate_token_batch, 6 | generate_token_batch_file, 7 | ) 8 | from data_questionnaire_agent.test.provider.jwt_token_data_provider import ( 9 | generate_token_data, 10 | ) 11 | 12 | 13 | def test_decode_token(): 14 | token = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIwMUo3RzRHMlEyRUVTWTY2Q1BHUjZFR042SCIsIm5hbWUiOiJHaWwiLCJpYXQiOjE3MjYwNDQ3MDMsImV4cCI6MTcyNjA0NDc2M30.2IJ_DoSQ8hyU4DU3lgXZduYPvoaAgxP1WKqTHbphI8Y" 15 | decoded = decode_token(token) 16 | assert decoded is not None, "Decoded should not be none." 17 | 18 | 19 | def test_generate_token_batch(): 20 | jwt_token_data = generate_token_data() 21 | amount = 5 22 | jwt_tokens = asyncio.run(generate_token_batch(jwt_token_data, amount)) 23 | assert len(jwt_tokens) == amount 24 | for jwt_token in jwt_tokens: 25 | decoded = asyncio.run(decode_token(jwt_token.token)) 26 | assert decoded is not None, "Decoded should not be none." 
27 | 28 | 29 | def test_generate_token_batch_file(): 30 | jwt_token_data = generate_token_data() 31 | amount = 5 32 | file = asyncio.run(generate_token_batch_file(jwt_token_data, amount)) 33 | assert file.exists(), f"Cannot find file {file}" 34 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/mail_sender_test.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.service.mail_sender import ( 2 | validate_address, 3 | ) 4 | from data_questionnaire_agent.test.provider.multipart_provider import ( 5 | create_dummy_multipart, 6 | ) 7 | 8 | 9 | def test_validate_address_ok(): 10 | assert validate_address("john.doe@gmail.com") 11 | assert validate_address("mary.do@protonmail.com") 12 | 13 | 14 | def test_validate_address_not_ok(): 15 | assert not validate_address("john.doegmail.com") 16 | assert not validate_address("mary.doprotonmail.com") 17 | 18 | 19 | def test_create_attachment_email(): 20 | multipart = create_dummy_multipart() 21 | assert multipart is not None 22 | assert len(multipart.as_string()) > 0 23 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/ontology_service_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from data_questionnaire_agent.model.ontology_schema import Ontology 4 | from data_questionnaire_agent.service.ontology_service import ( 5 | create_structured_question_call, 6 | prepare_ontology_chain_call, 7 | ) 8 | from data_questionnaire_agent.test.provider.advice_provider import ( 9 | create_advice_with_questionnaire, 10 | ) 11 | 12 | 13 | def test_chain_factory_ontology(): 14 | chain = create_structured_question_call("en") 15 | assert chain is not None 16 | conditional_advice, questionnaire = create_advice_with_questionnaire() 17 | assert conditional_advice is not None 18 | assert 
questionnaire is not None 19 | call_params = prepare_ontology_chain_call(questionnaire, conditional_advice) 20 | res = asyncio.run(chain.ainvoke(call_params)) 21 | assert res is not None 22 | assert isinstance(res, Ontology) 23 | assert len(res.relationships) > 0 24 | with open("relationships.json", "w") as f: 25 | f.write(res.json()) 26 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/persistence_service_consultants_async_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from data_questionnaire_agent.service.persistence_service_consultants_async import ( 4 | delete_consultant, 5 | delete_session_consultant_ratings, 6 | delete_skill, 7 | read_consultants, 8 | read_session_consultant_ratings, 9 | save_consultant, 10 | save_session_consultant_ratings, 11 | upsert_skill, 12 | ) 13 | from data_questionnaire_agent.test.provider.consultant_provider import ( 14 | create_consultant_rating, 15 | create_simple_consultant, 16 | ) 17 | 18 | if __name__ == "__main__": 19 | 20 | def test_upsert_skill(): 21 | skill = "testing123" 22 | count = asyncio.run(upsert_skill(skill)) 23 | assert count == 1, "Count is expected to be 1" 24 | count = asyncio.run(delete_skill(skill)) 25 | assert count == 1, "Delete count is expected to be 1" 26 | 27 | async def test_save_consultant(): 28 | consultant = create_simple_consultant() 29 | await save_consultant(consultant) 30 | consultants = await read_consultants() 31 | assert len(consultants) > 0, "There should be at least one consultant" 32 | await delete_consultant(consultant) 33 | 34 | async def test_save_session_consultants(): 35 | dummy_session_id = "dummy1" 36 | consultant_ratings = create_consultant_rating() 37 | counter = await save_session_consultant_ratings( 38 | dummy_session_id, consultant_ratings 39 | ) 40 | assert counter == 1 41 | read_crs = await read_session_consultant_ratings(dummy_session_id) 42 | 
assert len(read_crs.consultant_ratings) == len( 43 | consultant_ratings.consultant_ratings 44 | ) 45 | await delete_session_consultant_ratings(dummy_session_id) 46 | 47 | # test_upsert_skill() 48 | # asyncio.run(test_save_consultant()) 49 | asyncio.run(test_save_session_consultants()) 50 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/persistence_service_questions_async_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from data_questionnaire_agent.service.persistence_service_questions_async import ( 4 | delete_question, 5 | insert_question, 6 | ) 7 | 8 | if __name__ == "__main__": 9 | 10 | def test_insert_question(): 11 | question = "Which is the meaning of life?" 12 | id = asyncio.run( 13 | insert_question( 14 | question, 15 | "en", 16 | [ 17 | { 18 | "img_src": "", 19 | "img_alt": "", 20 | "title": "Test Suggestion Title", 21 | "main_text": "This is the main text of the suggestion", 22 | "svg_image": "", 23 | } 24 | ], 25 | ) 26 | ) 27 | print("Inserted question id", id) 28 | assert id > 0, "Id should be bigger than 0" 29 | return id 30 | 31 | # I created seperate functions, so i can test them separately if needed. 
32 | 33 | def test_delete_question(id: int): 34 | count = asyncio.run(delete_question(id)) 35 | assert count == 1, "Delete count is expected to be 1" 36 | print("Delete count", count) 37 | 38 | def test_insert_delete(): 39 | question_id = test_insert_question() 40 | assert question_id is not None, "The question identifier is none" 41 | # You can also pass id manually to test_delete_question function 42 | test_delete_question(question_id) 43 | 44 | test_insert_delete() 45 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/data_questionnaire_agent/test/provider/__init__.py -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/confidence_provider.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.confidence_schema import ( 2 | ConfidenceDegree, 3 | ConfidenceRating, 4 | ) 5 | 6 | 7 | def create_confidence_rating() -> ConfidenceRating: 8 | return ConfidenceRating( 9 | rating=ConfidenceDegree.low, 10 | reasoning="""Based on the provided information, I only know the main problem of the customer, which is poor data quality. However, I do not have detailed information about the causes of the problem, the technological landscape, or the data governance strategies of the customer. 
Therefore, my confidence in giving advice is low.""", 11 | ) 12 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/consultant_provider.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from consultant_info_generator.model import Company, Consultant, Experience, Skill 4 | 5 | from data_questionnaire_agent.model.consultant_rating import ( 6 | AnalystRating, 7 | ConsultantRating, 8 | ConsultantRatings, 9 | ) 10 | 11 | 12 | def create_simple_consultant() -> Consultant: 13 | company = Company(name="Onepoint Consulting Ltd") 14 | experience1 = Experience( 15 | location="London, UK", 16 | title="Enterprise Architect", 17 | company=company, 18 | start=datetime.datetime(2023, 1, 1), 19 | end=None, 20 | ) 21 | consultant = Consultant( 22 | given_name="John", 23 | surname="Doe", 24 | email="john.doe@gmail.com", 25 | cv="General blabla", 26 | industry_name="IT", 27 | geo_location="London", 28 | linkedin_profile_url="john-doe", 29 | experiences=[experience1], 30 | skills=[Skill(name="Data Science"), Skill(name="Enterprise Architecture")], 31 | ) 32 | return consultant 33 | 34 | 35 | def create_consultant_rating() -> ConsultantRatings: 36 | consultant_rating = ConsultantRating( 37 | analyst_name="Alexander Polev", 38 | analyst_linkedin_url="https://www.linkedin.com/in/alexander-polev-cto", 39 | reasoning="Alexander Polev is an excellent choice, because of this and that.", 40 | rating=AnalystRating.suitable, 41 | ) 42 | return ConsultantRatings(consultant_ratings=[consultant_rating]) 43 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/jwt_token_data_provider.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.jwt_token import JWTTokenData 2 | 3 | 4 | def generate_token_data() -> JWTTokenData: 5 | return 
JWTTokenData( 6 | name="anonymous", email="anonymous@test.com", time_delta_minutes=None 7 | ) 8 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/knowledge_base_provider.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from data_questionnaire_agent.config import cfg 4 | 5 | 6 | def read_file(file_name: str) -> str: 7 | with open(cfg.raw_text_folder / file_name, "r", encoding="utf-8") as f: 8 | return f.read() 9 | 10 | 11 | # provide_data_quality throw an error if the file does not exist 12 | def provide_data_quality() -> str: 13 | # Check if "data quality.txt" exists in the raw_text_folder, if not, check if "AboutRefugees.txt" exists 14 | # return read_file("data quality.txt" if not "refugee" in str(cfg.raw_text_folder) else "AboutRefugees.txt") 15 | 16 | raw_text_folder = Path(cfg.raw_text_folder) 17 | assert raw_text_folder.exists() 18 | if (raw_text_folder / "data quality.txt").exists(): 19 | return read_file("data quality.txt") 20 | else: 21 | return read_file("AboutRefugees.txt") 22 | 23 | 24 | def provide_data_ops() -> str: 25 | # return read_file("dataops.txt" if not "refugee" in str(cfg.raw_text_folder) else "AsylumInAustria.txt") 26 | raw_text_folder = Path(cfg.raw_text_folder) 27 | assert raw_text_folder.exists() 28 | if (cfg.raw_text_folder / "dataops.txt").exists(): 29 | return read_file("dataops.txt") 30 | else: 31 | return read_file("AsylumInAustria.txt") 32 | 33 | 34 | def provide_knowledge_base() -> str: 35 | return f"{provide_data_quality()}\n\n{provide_data_ops()}" 36 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/multipart_provider.py: -------------------------------------------------------------------------------- 1 | from email.mime.multipart import MIMEMultipart 2 | 3 | from data_questionnaire_agent.config import cfg 4 | from 
data_questionnaire_agent.model.mail_data import Email 5 | from data_questionnaire_agent.service.mail_sender import create_attachment_email 6 | 7 | 8 | def create_dummy_email() -> Email: 9 | return Email( 10 | recipient="john.smith@gmail.com", 11 | subject="Test", 12 | html_body="

Testing

", 13 | files=[cfg.project_root / "README.md"], 14 | ) 15 | 16 | 17 | def create_dummy_email_2() -> Email: 18 | return Email( 19 | recipient="gil.fernandes@gmail.com", 20 | subject="Test", 21 | html_body="

Testing

", 22 | files=[cfg.project_root / "README.md"], 23 | ) 24 | 25 | 26 | def create_dummy_multipart() -> MIMEMultipart: 27 | return create_attachment_email(create_dummy_email()) 28 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/ontology_provider.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.ontology_schema import Ontology 2 | 3 | 4 | def create_ontology(): 5 | ontology_json = { 6 | "relationships": [ 7 | { 8 | "source": "Poor data quality", 9 | "relationship": "leads to", 10 | "target": "incorrect insights", 11 | }, 12 | { 13 | "source": "Poor data quality", 14 | "relationship": "leads to", 15 | "target": "poor decision-making", 16 | }, 17 | { 18 | "source": "Customer duplicates", 19 | "relationship": "is a challenge in", 20 | "target": "data quality", 21 | }, 22 | { 23 | "source": "Manual checks", 24 | "relationship": "used for", 25 | "target": "identifying duplicates", 26 | }, 27 | { 28 | "source": "Manual checks", 29 | "relationship": "leads to", 30 | "target": "time-consuming process", 31 | }, 32 | {"source": "Manual checks", "relationship": "leads to", "target": "errors"}, 33 | { 34 | "source": "CRM systems", 35 | "relationship": "contributes to", 36 | "target": "duplicate customer records", 37 | }, 38 | { 39 | "source": "E-commerce platforms", 40 | "relationship": "contributes to", 41 | "target": "duplicate customer records", 42 | }, 43 | { 44 | "source": "Customer service databases", 45 | "relationship": "contributes to", 46 | "target": "duplicate customer records", 47 | }, 48 | { 49 | "source": "Different systems", 50 | "relationship": "have", 51 | "target": "varying data formats and standards", 52 | }, 53 | { 54 | "source": "Automated Deduplication Tools", 55 | "relationship": "improves", 56 | "target": "data quality", 57 | }, 58 | { 59 | "source": "Automated Deduplication Tools", 60 | "relationship": 
"improves", 61 | "target": "operational efficiency", 62 | }, 63 | { 64 | "source": "Standardised Data Formats and Naming Conventions", 65 | "relationship": "reduces", 66 | "target": "duplicates", 67 | }, 68 | { 69 | "source": "Data Governance Policies", 70 | "relationship": "manages", 71 | "target": "data quality", 72 | }, 73 | { 74 | "source": "Data Profiling and Cleansing Tools", 75 | "relationship": "identifies and corrects", 76 | "target": "inconsistencies, inaccuracies, and duplicates", 77 | }, 78 | { 79 | "source": "Data Quality Metrics", 80 | "relationship": "detects and addresses", 81 | "target": "data quality issues", 82 | }, 83 | { 84 | "source": "Manual Processes", 85 | "relationship": "leads to", 86 | "target": "data quality problems", 87 | }, 88 | { 89 | "source": "Ignoring Data Governance", 90 | "relationship": "results in", 91 | "target": "poor data quality", 92 | }, 93 | { 94 | "source": "Overlooking Data Standardisation", 95 | "relationship": "leads to", 96 | "target": "data quality issues", 97 | }, 98 | { 99 | "source": "Automating deduplication and standardising data formats", 100 | "relationship": "leads to", 101 | "target": "improved operational efficiency", 102 | }, 103 | { 104 | "source": "High-quality, consistent data", 105 | "relationship": "enables", 106 | "target": "better-informed decision-making", 107 | }, 108 | { 109 | "source": "Accurate and up-to-date customer records", 110 | "relationship": "leads to", 111 | "target": "increased customer satisfaction", 112 | }, 113 | ] 114 | } 115 | ontology = Ontology.parse_obj(ontology_json) 116 | return ontology 117 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/question_answer_provider.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.application_schema import QuestionAnswer 2 | 3 | 4 | def create_question_answer_with_possible_answers() -> 
QuestionAnswer: 5 | return QuestionAnswer( 6 | id=None, 7 | question="What is the meaning of life?", 8 | answer="The meaning of life is 42", 9 | clarification=[], 10 | possible_answers=["The meaning of life is 43", "The meaning of life is 44"], 11 | ) 12 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/question_answers_provider.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List 3 | 4 | from data_questionnaire_agent.config import cfg 5 | 6 | 7 | def questionnaire_str_adapter(questions: List[str], answers: List[str]) -> str: 8 | return "\n\n".join([f"{q}\n{a}" for q, a in zip(questions, answers)]) 9 | 10 | 11 | def provide_data_silo_questionnaire() -> str: 12 | questions = [ 13 | "Which area of your data ecosystem are you most concerned about?", 14 | "What steps have you taken so far to address the issue of data silos in your organization?", 15 | "How are you currently sharing data between different departments, lines of business and subsidiaries in your organization?", 16 | "Considering the challenges you've faced with data silos and the partial success of your service-driven architecture, have you considered implementing a Data Mesh architecture to decentralize data ownership and accountability, thereby reducing data silos?", 17 | "You mentioned that not all departments have bought into the idea of using REST services for data sharing, particularly the billing department. What are the specific concerns or challenges they have raised, and how might these be addressed to improve data integration across your organization?", 18 | ] 19 | answers = [ 20 | "Data silos", 21 | "We have been trying to create a service driven architecture to share data across systems using Mulesoft.", 22 | "We have a series of well documented REST services that allow sharing data between departments. 
However not all departments have bought into this, specially the billing department.", 23 | "Yes, but we are facing political issues with some departments not wanting to share their data.", 24 | "Billing data is related to personal data and money flows and should therefore only be accessed by the department itself or selected controllers. This is the main concern.", 25 | ] 26 | return questionnaire_str_adapter(questions, answers) 27 | 28 | 29 | def provide_missing_documents_questionnaire() -> str: 30 | questions = [ 31 | "What challenges are you currently facing as a refugee?", 32 | "Have you reported the loss of your documents to the local authorities?", 33 | "Have you obtained the confirmation of the theft report from the local police, which is required to apply for the reissue of your ID card at the passport authority center?", 34 | "Do you need assistance in obtaining a replacement for your lost identity card?", 35 | ] 36 | answers = [ 37 | "I lost my documents", 38 | "Yes, I did.", 39 | "Yes, I have obtained it.", 40 | "Yes, I need help with the replacement process.", 41 | ] 42 | return questionnaire_str_adapter(questions, answers) 43 | 44 | 45 | def provide_dummy_questionnaire() -> str: 46 | raw_text_folder = Path(cfg.raw_text_folder) 47 | if (raw_text_folder / "data quality.txt").exists(): 48 | return provide_data_silo_questionnaire() 49 | else: 50 | return provide_missing_documents_questionnaire() 51 | 52 | 53 | def provide_incomplete_questionnaire() -> str: 54 | questions = ["Which area of your data ecosystem are you most concerned about?"] 55 | answers = ["Data quality"] 56 | return questionnaire_str_adapter(questions, answers) 57 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/questionnaire_status_provider.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from 
data_questionnaire_agent.model.questionnaire_status import QuestionnaireStatus 4 | 5 | 6 | def create_simple() -> QuestionnaireStatus: 7 | return QuestionnaireStatus( 8 | session_id="12312231231", 9 | question="What is the meaning of this?", 10 | answer="42", 11 | final_report=False, 12 | created_at=datetime.now(), 13 | updated_at=datetime.now(), 14 | ) 15 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/session_configuration_provider.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.session_configuration import ( 2 | DEFAULT_SESSION_STEPS, 3 | SESSION_STEPS_CONFIG_KEY, 4 | SessionConfigurationEntry, 5 | ) 6 | 7 | 8 | def create_session_configuration(): 9 | return SessionConfigurationEntry( 10 | session_id="test_id", 11 | config_key=SESSION_STEPS_CONFIG_KEY, 12 | config_value=str(DEFAULT_SESSION_STEPS), 13 | ) 14 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/provider/session_properties_provider.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.session_configuration import ( 2 | ChatType, 3 | SessionProperties, 4 | ) 5 | 6 | 7 | def create_session_properties() -> SessionProperties: 8 | return SessionProperties( 9 | session_steps=6, session_language="en", chat_type=ChatType.DIVERGING 10 | ) 11 | 12 | 13 | def create_session_properties_to_the_point() -> SessionProperties: 14 | return SessionProperties( 15 | session_steps=6, session_language="en", chat_type=ChatType.TO_THE_POINT 16 | ) 17 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/question_answer_test.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.application_schema import 
QuestionAnswer 2 | 3 | 4 | def test_validate_simple_question_answer(): 5 | question_answer = QuestionAnswer( 6 | id=None, question="What is the meaning of life?", answer="42", clarification="" 7 | ) 8 | assert question_answer.possible_answers is not None 9 | assert len(question_answer.possible_answers) == 0 10 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/questionnaire_server_test.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import AsyncMock, patch 2 | 3 | import pytest 4 | from aiohttp import web 5 | 6 | from data_questionnaire_agent.model.jwt_token import JWTTokenData 7 | from data_questionnaire_agent.server.questionnaire_server import routes 8 | 9 | CORS_HEADERS = {"Access-Control-Allow-Origin": "*"} 10 | 11 | 12 | @pytest.fixture 13 | def client(aiohttp_client): 14 | app = web.Application() 15 | app.add_routes(routes) 16 | return aiohttp_client(app) 17 | 18 | 19 | @pytest.mark.asyncio 20 | @patch("data_questionnaire_agent.server.questionnaire_server.generate_jwt_token") 21 | async def test_generate_jwt_token_success(mock_generate_token, client): 22 | # Mock the generate_token function 23 | mock_generate_token.return_value = AsyncMock( 24 | return_value=JWTTokenData( 25 | name="John Doe", email="john@example.com", time_delta_minutes=30 26 | ) 27 | ) 28 | 29 | # Make a POST request with valid JSON 30 | json_payload = { 31 | "name": "John Doe", 32 | "email": "john@example.com", 33 | "time_delta_minutes": 30, 34 | } 35 | c = await client 36 | resp = await c.post("/gen_jwt_token", json=json_payload) 37 | 38 | # Validate response 39 | assert resp.status == 200 40 | body = await resp.json() 41 | assert body["token"] is not None 42 | assert body["email"] == "john@example.com" 43 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/questionnaire_status_test.py: 
-------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.questionnaire_status import QuestionnaireStatus 2 | from data_questionnaire_agent.test.provider.questionnaire_status_provider import ( 3 | create_simple, 4 | ) 5 | 6 | 7 | def test_has_advice_questionnaire(): 8 | qs: QuestionnaireStatus = create_simple() 9 | assert qs is not None 10 | difference = qs.updated_at - qs.created_at 11 | assert difference.seconds < 2 12 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/report_aggregation_main_service_test.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Dict 3 | 4 | import pandas as pd 5 | 6 | from data_questionnaire_agent.config import cfg 7 | from data_questionnaire_agent.model.report_aggregation_schema import ( 8 | ReportDocumentClassificationContainer, 9 | ReportItemCount, 10 | ) 11 | from data_questionnaire_agent.service.report_aggregation_main_service import ( 12 | convert_to_dataframe, 13 | create_multiple_excel, 14 | group_reports, 15 | prompt_factory_document_classifier_prompt, 16 | prompt_factory_keyword_extraction_prompt, 17 | ) 18 | 19 | 20 | def test_prompt_factory_keyword_extraction_prompt(): 21 | template = prompt_factory_keyword_extraction_prompt(language="en") 22 | assert template is not None, "Template should not be none" 23 | 24 | 25 | def test_prompt_factory_document_classifier_prompt(): 26 | template = prompt_factory_document_classifier_prompt(language="en") 27 | assert template is not None, "Template should not be none" 28 | 29 | 30 | def test_group_reports(): 31 | sample_file = ( 32 | cfg.project_root 33 | / "data/report_doc_classification_01JDYZ52AD3KYM2Q7VF5NHV3ER.json" 34 | ) 35 | assert (sample_file).exists() 36 | document_classification = ReportDocumentClassificationContainer.model_validate_json( 37 | sample_file.read_text() 
38 | ) 39 | report_item_count = group_reports(document_classification) 40 | assert report_item_count is not None 41 | assert report_item_count.problem_count is not None 42 | assert report_item_count.problem_area_count is not None 43 | assert report_item_count.recommendation_count is not None 44 | 45 | 46 | def generate_df_dict() -> Dict[str, pd.DataFrame]: 47 | sample_file = ( 48 | cfg.project_root / "data/report_item_count_01JDZBAA5T6Z0MHZ10KY0HF938.json" 49 | ) 50 | assert (sample_file).exists() 51 | report_item_count = ReportItemCount.model_validate_json(sample_file.read_text()) 52 | return convert_to_dataframe(report_item_count) 53 | 54 | 55 | def test_convert_to_dataframe(): 56 | df_dict = generate_df_dict() 57 | assert df_dict is not None 58 | assert df_dict["problem_df"] is not None 59 | assert len(df_dict["problem_df"]["df"]) > 0 60 | assert df_dict["problem_area_df"] is not None 61 | assert len(df_dict["problem_area_df"]["df"]) > 0 62 | 63 | 64 | def test_create_multiple_excel(): 65 | df_dict = generate_df_dict() 66 | assert df_dict is not None 67 | excel_path = Path("./report_aggregation_counts.xlsx") 68 | create_multiple_excel(df_dict, excel_path) 69 | assert excel_path.exists(), "Cannot find multiple excel report" 70 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/report_aggregation_service_test.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from typing import List 3 | 4 | from data_questionnaire_agent.config import cfg 5 | from data_questionnaire_agent.model.questionnaire_status import QuestionnaireStatus 6 | from data_questionnaire_agent.service.report_aggregation_service import convert_to_str 7 | from data_questionnaire_agent.service.similarity_search import num_tokens_from_string 8 | 9 | 10 | def test_convert_to_str(): 11 | questionnaire_pkl = cfg.project_root / "data/questionnaire.pkl" 12 | assert questionnaire_pkl.exists() 
13 | with open(questionnaire_pkl, "rb") as f: 14 | questionnaire_data = pickle.load(f) 15 | questionnaire_statuses: List[QuestionnaireStatus] = convert_to_str( 16 | questionnaire_data 17 | ) 18 | assert len(questionnaire_statuses) > 0 19 | print(num_tokens_from_string("\n".join(questionnaire_statuses))) 20 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/report_aggregation_summarization_service_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from data_questionnaire_agent.config import cfg 4 | from data_questionnaire_agent.service.report_aggregation_summarization_service import ( 5 | aexecute_summarization_batch, 6 | create_summarization_call, 7 | prompt_factory_summarization_prompt, 8 | ) 9 | 10 | 11 | def test_prompt_factory_summarization_prompt(): 12 | chat_template = prompt_factory_summarization_prompt("en") 13 | assert chat_template is not None, "Chat template cannot be none" 14 | res = chat_template.format(full_questionnaire="Bla") 15 | assert res is not None, "No result from formatting" 16 | 17 | 18 | def test_create_summarization_call(): 19 | runnable = create_summarization_call("en") 20 | assert runnable is not None, "Runnable is none" 21 | 22 | 23 | def test_aexecute_summarization_batch(): 24 | files = [ 25 | "data/sample_questionnaire1.md", 26 | "data/sample_questionnaire2.md", 27 | "data/sample_questionnaire3.md", 28 | ] 29 | texts = [(cfg.project_root / f).read_text(encoding="utf-8") for f in files] 30 | summaries = asyncio.run(aexecute_summarization_batch(texts)) 31 | assert summaries is not None 32 | assert len(summaries) == len(files) 33 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/report_enhancement_service_test.py: -------------------------------------------------------------------------------- 1 | from 
data_questionnaire_agent.service.report_enhancement_service import ( 2 | replace_bold_markdown, 3 | replace_markdown_bold_with_links, 4 | ) 5 | 6 | 7 | def provider_string(): 8 | return "This is some **bold** content with some **important** message as you can imagine." 9 | 10 | 11 | def test_replace_bold_markdown(): 12 | test_str = provider_string() 13 | res = replace_bold_markdown(test_str) 14 | assert ( 15 | res 16 | == "This is some bold content with some important message as you can imagine." 17 | ), f"Unexpected output: {res}" 18 | 19 | 20 | def test_replace_markdown_bold_with_links(): 21 | test_str = provider_string() 22 | res = replace_markdown_bold_with_links(test_str) 23 | assert ( 24 | res 25 | == "This is some **[bold](https://www.google.com/search?q=bold)** content with some **[important](https://www.google.com/search?q=important)** message as you can imagine." 26 | ), f"Unexpected output: {res}" 27 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/server_message_test.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.server_model import ServerMessage 2 | 3 | 4 | def test_simple_server_message(): 5 | question = "test" 6 | session_id = "1231231231231" 7 | server_message = ServerMessage( 8 | question=question, session_id=session_id, clarification=None, question_id=None 9 | ) 10 | server_message_str = server_message.json() 11 | assert question in server_message_str 12 | assert session_id in server_message_str 13 | -------------------------------------------------------------------------------- /data_questionnaire_agent/test/session_configuration_test.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.session_configuration import ( 2 | SESSION_STEPS_CONFIG_KEY, 3 | SESSION_STEPS_LANGUAGE_KEY, 4 | ChatType, 5 | chat_type_factory, 6 | 
from data_questionnaire_agent.model.session_configuration import (
    SESSION_STEPS_CONFIG_KEY,
    SESSION_STEPS_LANGUAGE_KEY,
    ChatType,
    chat_type_factory,
    create_session_configurations,
)
from data_questionnaire_agent.test.provider.session_configuration_provider import (
    create_session_configuration,
)
from data_questionnaire_agent.test.provider.session_properties_provider import (
    create_session_properties,
)


def test_session_configuration():
    # The provider must produce a configuration keyed on the session steps key.
    configuration = create_session_configuration()
    assert configuration.config_key == SESSION_STEPS_CONFIG_KEY


def test_chat_type_factory():
    # Known values round-trip; unknown values fall back to DIVERGING.
    cases = [
        (ChatType.DIVERGING.value, ChatType.DIVERGING),
        (ChatType.TO_THE_POINT.value, ChatType.TO_THE_POINT),
        ("bla", ChatType.DIVERGING),
    ]
    for raw_value, expected in cases:
        assert chat_type_factory(raw_value) == expected


def test_create_session_configurations():
    # Four entries are expected, led by the steps and language keys.
    entries = create_session_configurations(
        "12345", create_session_properties(), "hi there"
    )
    assert len(entries) == 4
    assert entries[0].config_key == SESSION_STEPS_CONFIG_KEY
    assert entries[1].config_key == SESSION_STEPS_LANGUAGE_KEY
from pathlib import Path

import tomli

from data_questionnaire_agent.config import cfg
from data_questionnaire_agent.log_init import logger


def read_toml(file: Path) -> dict:
    """Parse a TOML file and return its contents as a dict."""
    with open(file, "rb") as f:
        return tomli.load(f)


DEFAULT_LANGUAGE = "en"
SUPPORTED_LANGUAGES = ["en", "de"]


def read_prompts_toml(language: str = DEFAULT_LANGUAGE) -> dict:
    """Read ``prompts_<language>.toml`` from the project root.

    Unsupported languages fall back to English (``DEFAULT_LANGUAGE``).
    """
    if language not in SUPPORTED_LANGUAGES:
        # logger.warn is a deprecated alias of logger.warning.
        logger.warning(
            f"Warning: language {language} not supported. Using default language."
        )
        language = DEFAULT_LANGUAGE

    # Informational message, not a warning.
    logger.info(f"Reading prompts from prompts_{language}.toml")
    return read_toml(cfg.project_root / f"prompts_{language}.toml")


# Eagerly cache the prompts for every supported language at import time,
# so lookups in get_prompts are dictionary reads only.
prompts_language = {}
for lang in SUPPORTED_LANGUAGES:
    prompts_language[lang] = read_prompts_toml(lang)


def get_prompts(language: str = DEFAULT_LANGUAGE) -> dict:
    """Return the cached prompts for *language*, defaulting to English."""
    if language in prompts_language:
        return prompts_language[language]
    return prompts_language[DEFAULT_LANGUAGE]


def get_prompts_object(language: str = DEFAULT_LANGUAGE) -> object:
    """Return the prompts wrapped so top-level sections read as attributes."""
    return objectview(get_prompts(language))


class objectview(object):
    """Thin wrapper exposing a dict's keys as object attributes."""

    def __init__(self, d):
        self.__dict__ = d
68 | def farsi_prompt_test(): 69 | test_prompts("fa") 70 | 71 | english_prompt_test() 72 | farsi_prompt_test() 73 | 74 | prompts_object = get_prompts("fa") 75 | print(type(prompts_object)) 76 | print(prompts_object) 77 | -------------------------------------------------------------------------------- /data_questionnaire_agent/translation.py: -------------------------------------------------------------------------------- 1 | import i18n 2 | from data_questionnaire_agent.config import cfg 3 | 4 | i18n.load_path.append(cfg.translation_path) 5 | 6 | 7 | def t(key: str, **kwargs): 8 | return i18n.t(f"messages.{key}", **kwargs) 9 | 10 | 11 | if __name__ == "__main__": 12 | print( 13 | t( 14 | "A big thank you for completing a session with", 15 | name=cfg.product_title, 16 | locale="de", 17 | ) 18 | ) 19 | print(t("db_insert_failed", locale="en")) 20 | -------------------------------------------------------------------------------- /data_questionnaire_agent/ui/advice_processor.py: -------------------------------------------------------------------------------- 1 | from langchain_core.runnables.base import RunnableSequence 2 | from tenacity import AsyncRetrying 3 | 4 | from data_questionnaire_agent.config import cfg 5 | from data_questionnaire_agent.log_init import logger 6 | from data_questionnaire_agent.model.application_schema import Questionnaire 7 | from data_questionnaire_agent.model.openai_schema import ConditionalAdvice 8 | from data_questionnaire_agent.service.advice_service import ( 9 | chain_factory_advice, 10 | prepare_conditional_advice, 11 | ) 12 | from data_questionnaire_agent.service.knowledge_base_service import fetch_context 13 | 14 | 15 | async def process_advice( 16 | questionnaire: Questionnaire, advice_chain: RunnableSequence 17 | ) -> ConditionalAdvice: 18 | questionnaire_str = str(questionnaire) 19 | 20 | knowledge_base = await fetch_context(questionnaire_str) 21 | 22 | advice_input = prepare_conditional_advice( 23 | knowledge_base=knowledge_base, 
import chainlit as cl
from chainlit.input_widget import Slider, TextInput

# toml_support exposes get_prompts(); it does not export a `prompts` object,
# so the previous `from ... import prompts` failed at import time.
from data_questionnaire_agent.toml_support import get_prompts

MINIMUM_NUMBER_OF_QUESTIONS = "Minimum number of questions"
QUESTION_PER_BATCH = "Questions per batch"
INITIAL_QUESTION = "Initial question"


async def create_chat_settings() -> cl.ChatSettings:
    """Build and send the Chainlit settings widget for the questionnaire.

    Returns the settings object produced by ``cl.ChatSettings(...).send()``.
    Defaults come from the English prompts configuration.
    """
    prompts = get_prompts()
    questions_per_batch = prompts["general_settings"]["questions_per_batch"]
    minimum_number_of_questions = prompts["general_settings"][
        "minimum_number_of_questions"
    ]
    initial_question = prompts["questionnaire"]["initial"]["question"]
    settings = await cl.ChatSettings(
        [
            TextInput(
                id=INITIAL_QUESTION, label=INITIAL_QUESTION, initial=initial_question
            ),
            Slider(
                id=MINIMUM_NUMBER_OF_QUESTIONS,
                label="Minimum number of questions",
                initial=minimum_number_of_questions,
                min=0,
                max=5,
                step=1,
            ),
            Slider(
                id=QUESTION_PER_BATCH,
                label="Number of question per batch",
                initial=questions_per_batch,
                min=0,
                max=5,
                step=1,
            ),
        ]
    ).send()
    return settings
29 | response_tags: ResponseTags = await has_questions_chain.arun( 30 | prepare_sentiment_input(content) 31 | ) 32 | if len(response_tags.extracted_questions) > 0: 33 | if response_tags.questions_related_to_data_analytics: 34 | for clarification_question in response_tags.extracted_questions: 35 | clarification = await clarification_agent.arun( 36 | clarification_question 37 | ) 38 | if use_chainlit: 39 | await cl.Message(content=clarification).send() 40 | else: 41 | logger.info(clarification) 42 | else: 43 | message = f"These questions: {response_tags.extracted_questions} are unrelated to our main topic." 44 | if use_chainlit: 45 | await cl.Message(content=message).send() 46 | logger.warn(message) 47 | 48 | 49 | if __name__ == "__main__": 50 | import asyncio 51 | 52 | from data_questionnaire_agent.service.tagging_service import ( 53 | prepare_sentiment_input, 54 | sentiment_chain_factory, 55 | ) 56 | from data_questionnaire_agent.test.provider.questionnaire_provider import ( 57 | create_questionnaire_initial_question, 58 | ) 59 | 60 | def deal_with_unrelated_questions(): 61 | questionnaire: Questionnaire = create_questionnaire_initial_question( 62 | "Which is the capital of India" 63 | ) 64 | has_questions_chain: LLMChain = sentiment_chain_factory() 65 | clarification_agent: AgentExecutor = create_clarification_agent() 66 | asyncio.run( 67 | process_clarifications_chainlit( 68 | questionnaire, 1, has_questions_chain, clarification_agent, False 69 | ) 70 | ) 71 | 72 | def deal_with_related_questions(): 73 | questionnaire: Questionnaire = create_questionnaire_initial_question( 74 | "What do you mean by data quality?" 
def display_image(image_path: str, alt: str, title: str):
    """Build a markdown image tag for a file under /public/images.

    :param image_path: file name relative to the public images folder.
    :param alt: alternative text shown when the image cannot load.
    :param title: tooltip title for the image.
    :return: the markdown string ``![alt](url "title")``.
    """
    public_url = f"/public/images/{image_path}"
    return f'![{alt}]({public_url} "{title}")'
If so, please enter your email address in the chat.", 19 | timeout=cfg.ui_timeout, 20 | author=AVATAR["CHATBOT"], 21 | ).send() 22 | has_replied = False 23 | if response and "content" in response: 24 | has_replied = True 25 | response_content = response["content"] 26 | if validate_address(response_content): 27 | feedback_email = "feedback@onepointltd.ai" 28 | logger.info("Sending email to %s", response_content) 29 | await asyncify(send_email)( 30 | "Dear customer", 31 | response_content, 32 | mail_config.mail_subject, 33 | create_mail_body(questionnaire, advices, feedback_email), 34 | ) 35 | await cl.Message( 36 | content="Thank you for submitting the query. We really appreciate that you have taken time to do this.", 37 | author=AVATAR["CHATBOT"], 38 | ).send() 39 | else: 40 | logger.warn("%s is not a valid email", response_content) 41 | await cl.ErrorMessage( 42 | content=f"Sorry, '{response_content}' does not seem to be an email address", 43 | author=AVATAR["CHATBOT"], 44 | ).send() 45 | 46 | extra_message = "" if has_replied else "We did not hear from you... " 47 | await cl.Message( 48 | content=f"{extra_message}The session is complete. 
class SessionNumberContainer:
    """Mutable counter used to number user-session runs.

    Starts at zero; the first call to :meth:`increment_and_get` returns 1.
    """

    def __init__(self) -> None:
        # `counter` is read by callers via current()/repr, so keep the name.
        self.counter = 0

    def current(self):
        """Return the counter value without modifying it."""
        return self.counter

    def increment_and_get(self):
        """Advance the counter by one and return the new value."""
        self.counter = self.counter + 1
        return self.counter

    def __repr__(self) -> str:
        # Render as the bare number, e.g. "3".
        return f"{self.counter}"
-------------------------------------------------------------------------------- 1 | import chainlit as cl 2 | from chainlit import context 3 | 4 | from data_questionnaire_agent.config import cfg 5 | from data_questionnaire_agent.log_init import logger 6 | from data_questionnaire_agent.ui.chat_settings_factory import create_chat_settings 7 | from data_questionnaire_agent.ui.model.session_number_container import ( 8 | SessionNumberContainer, 9 | ) 10 | 11 | 12 | @cl.on_chat_start 13 | async def init(): 14 | logger.info("Init") 15 | cl.user_session.set("session_counter", SessionNumberContainer()) 16 | settings = await create_chat_settings() 17 | await run_agent(settings, False) 18 | 19 | 20 | @cl.on_settings_update 21 | async def on_settings_update(settings: cl.ChatSettings): 22 | await run_agent(settings, True) 23 | 24 | 25 | async def run_agent(settings: cl.ChatSettings, from_settings: bool): 26 | session_counter = cl.user_session.get("session_counter") 27 | my_counter = session_counter.increment_and_get() 28 | local_context = context.get_context() 29 | logger.info("start id: %s", local_context.session.id) 30 | await cl.Message(content=f"Start {local_context.session.id}").send() 31 | response = None 32 | while response is None: 33 | latest_counter = cl.user_session.get("session_counter") 34 | response = await cl.AskUserMessage( 35 | content=f"Please reply something {from_settings} {my_counter} {latest_counter} ...", 36 | timeout=cfg.ui_timeout, 37 | ).send() 38 | 39 | 40 | @cl.on_chat_end 41 | async def end(): 42 | local_context = context.get_context() 43 | logger.info("end id: %s", local_context.session.id) 44 | cl.user_session.set("session_started", False) 45 | -------------------------------------------------------------------------------- /data_questionnaire_agent/utils/advice_parser.py: -------------------------------------------------------------------------------- 1 | from data_questionnaire_agent.model.openai_schema import ConditionalAdvice 2 | 3 | 4 | def 
def parse_advice(advice: str) -> ConditionalAdvice:
    """Parse a two-section markdown advice text into a ConditionalAdvice.

    The first ``# `` heading opens the "advice" section; any subsequent
    ``# `` heading opens the "avoid" section. List items start with ``- ``
    and end at a blank line (or at the end of the text).

    Fixes over the previous version:
    - no IndexError when the text ends with ``#`` or ``-`` (lookahead guarded);
    - a trailing item not followed by a blank line is no longer dropped;
    - missing sections yield empty lists instead of a KeyError;
    - removed the unreachable ``return None`` after the return statement.
    """
    sections = {}
    current_section = None
    section_count = 0
    consume_item = False
    current_item = ""
    last_index = len(advice) - 1
    for i, c in enumerate(advice):
        # Guarded one-character lookahead; empty at the end of the text.
        next_char = advice[i + 1] if i < last_index else ""
        if c == "#" and next_char == " ":
            section_count += 1
            current_section = "advice" if section_count == 1 else "avoid"
            sections[current_section] = []
            current_item = ""
        elif c == "-" and next_char == " ":
            consume_item = True
            current_item = ""
        elif c == "\n" and next_char == "\n" and len(current_item) > 0:
            # Blank line terminates the current list item.
            consume_item = False
            sections[current_section].append(current_item.strip())
            current_item = ""
        elif consume_item:
            current_item += c
    # Flush a final item that ran to the end of the text without a blank line.
    if consume_item and current_section is not None and current_item.strip():
        sections[current_section].append(current_item.strip())
    return ConditionalAdvice(
        has_advice=True,
        advices=sections.get("advice", []),
        what_you_should_avoid=sections.get("avoid", []),
    )
This would provide clear guidelines on data management practices within your organisation, promoting uniformity and improving overall data quality. 50 | 51 | - Regular monitoring of data quality metrics can help you identify and address issues promptly. This could involve setting up alerts for key metrics or datasets and implementing a systematic process for resolving identified issues. 52 | 53 | - Lastly, consider investing in data quality training for your employees. This would equip them with the necessary skills to handle data responsibly, thereby contributing to the overall quality of your data. 54 | 55 | # What you should avoid ... 56 | 57 | - Avoid relying solely on manual processes for data quality management. While human oversight is important, manual processes can be time-consuming and prone to errors. Leveraging automated tools, where appropriate, can enhance efficiency and accuracy. 58 | 59 | - Avoid neglecting the importance of data governance. Without clear policies and procedures in place, data management can become chaotic and inconsistent, leading to poor data quality. 60 | 61 | - Avoid ignoring the potential of dark data. While it may seem challenging to integrate dark data into your data ecosystem, it can provide valuable insights when properly managed and analysed. 
def write_log(file: Optional[Path] = None):
    """Dump the tracker activity log as CSV.

    :param file: destination path; when ``None`` the CSV goes to stdout.

    Fixes over the previous version: stdout is no longer closed (it is owned
    by the interpreter), and an opened file is always closed via ``with``,
    even if writing raises. ``newline=""`` follows the csv module's
    requirement for text-mode files so line endings are not doubled.
    """
    logger.info("Database location: %s", ONEPOINT_SQL_LITE_DB)
    logger.info("")

    def _dump(output) -> None:
        # Header row first, then one CSV row per activity-log record.
        logwriter = csv.writer(output, delimiter=",", lineterminator="\n")
        logwriter.writerow([f["name"] for f in TRACKING_FIELD_PROPERTIES])
        for row in list_activity_log():
            logwriter.writerow(row)

    if file is None:
        _dump(sys.stdout)
    else:
        with open(file, "w", newline="") as output:
            _dump(output)
@app.get("/onepoint/logs")
async def get_report(request: Request):
    """Return the tracker activity log as a downloadable CSV file.

    Access requires a ``security_key`` query parameter matching the
    configured ``tracker_db_logs_password``; otherwise a 403 is raised.
    """
    import secrets  # local import: constant-time secret comparison

    params = request.query_params
    security_key = params.get(PARAM_SECURITY_KEY)

    # compare_digest avoids leaking key prefixes through response timing;
    # a missing parameter short-circuits (compare_digest rejects None).
    authorized = security_key is not None and secrets.compare_digest(
        str(security_key), str(cfg.tracker_db_logs_password)
    )
    if not authorized:
        raise HTTPException(
            status_code=403,
            detail=f"Please add the {PARAM_SECURITY_KEY} query parameter with the correct security key",
        )

    # Regenerate the CSV snapshot on every request.
    csv_file = cfg.project_root / "report.csv"
    write_log(csv_file)

    media_type, _ = mimetypes.guess_type(csv_file)

    return FileResponse(csv_file, media_type=media_type, filename=csv_file.name)
/docs/low_level_protocol/clarification.json: -------------------------------------------------------------------------------- 1 | { 2 | "messages": [ 3 | { 4 | "content": "System: I am a helpful assistant. I explain the meaning of questions step by step. I highlight the main topics in bold markdown. I use British English.\\nHuman: Please explain the following question in a way that a layman can understand it:\\n\\nWhat specific aspects of data quality do you find most challenging to address in your organisation's data management processes?\\n", 5 | "role": "user" 6 | } 7 | ], 8 | "model": "gpt-4o-mini", 9 | "stream": true, 10 | "n": 1, 11 | "temperature": 0.5, 12 | "logprobs": false 13 | } -------------------------------------------------------------------------------- /docs/prompt_examples/clarification.txt: -------------------------------------------------------------------------------- 1 | System: I am a helpful assistant. I explain the meaning of questions step by step. I highlight the main topics in bold markdown. I use British English. 2 | Human: Please explain the following question in a way that a layman can understand it: 3 | 4 | What specific technical limitations or gaps in expertise do you believe are hindering your ability to manage and integrate streaming data effectively? 5 | -------------------------------------------------------------------------------- /docs/prompt_examples/confidence_rating.txt: -------------------------------------------------------------------------------- 1 | SYSTEM 2 | You are an expert at giving data governance advice based on a question answer dialogue with a customer 3 | 4 | HUMAN 5 | Determine how confident you are in terms giving advice to a customer based on a sequence of questions and answers that you can find here: 6 | 7 | ``` 8 | Which area of your data ecosystem are you most concerned about? 9 | Data silos - Data trapped in departmental silos can be inaccessible to other parts. 
10 | 11 | What types of data (e.g., structured, unstructured, streaming) are most commonly trapped in your departmental silos? 12 | - Streaming data such as social media feeds and IoT sensor data. 13 | ``` 14 | 15 | In order to be confident you should know about the difficulties of the customer. You should know about the following: 16 | * the customer's main problem 17 | * have some detailed information about his problem. Just having a high-level sentence on the problem of the customer like e.g: "Data Quality" is not enough to be confident. 18 | * you should have also knowledge about what is causing the problem. 19 | * ideally you know more than one cause behind the main problem of the customer. 20 | * you also need some more background information about the technologies used by the customer to be confident. 21 | * you should also know about the data governance strategies of the customer to be able to have a high degree of confidence. 22 | 23 | Please use the following classifications to this question about the degree of confidence with which you can give advice: 24 | 25 | - "outstanding" 26 | - "high" 27 | - "medium" 28 | - "mediocre" 29 | - "low" 30 | 31 | For example, you should report an "outstanding" confidence degree when: 32 | You know the main problem of the customer and the causes well. You also know about data governance aspects related to the customer's organisation. 33 | And you also know the technological landscape of the customer very well. 34 | 35 | For example, you should report a "high" confidence degree when: 36 | You know the main problem of the customer and the causes well. You also know about data governance aspects related to the customer's organisation. 37 | The only thing missing is more background information about the technological landscape of the customer. 38 | 39 | For example, you should report a "medium" confidence degree when: 40 | You know the main problem of the customer and the causes well. 
41 | You miss the background information about the technological landscape of the customer and also about data governance in the customer's company. 42 | 43 | For example, you should report a "mediocre" confidence degree when: 44 | You know the main problem of the customer and the cause not well. The information about the causes is very limited 45 | You miss the background information about the technological landscape of the customer and also about data governance in the customer's company. 46 | 47 | For example, you should report a "low" confidence degree when: 48 | You know the main problem of the customer and nothing else. Or you do not even know about the main problem of the user. 49 | 50 | HUMAN 51 | Tip: Make sure to answer in the correct format 52 | 53 | HUMAN 54 | Tip: Please make sure that you write all your answers in British English. -------------------------------------------------------------------------------- /docs/prompt_examples/ontology_creation.txt: -------------------------------------------------------------------------------- 1 | SYSTEM 2 | You are an expert at creating ontologies 3 | 4 | HUMAN 5 | # Ontology from text: 6 | 7 | Extract an ontology. Do not draw it, but create a markdown table of relations with 3 columns in this order: source, target, relation name. And also create a list with 2 columns: the first contains the term name and the second column contains the term definition. 8 | the relations should only be mapped to source, target, relations 9 | Use this text as a basis: 10 | 11 | ``` 12 | Which area of your data ecosystem are you most concerned about? 13 | Lack of skilled personnel - Missing skills in data science, analytics, AI and ML can impede the effective use of data. 14 | 15 | What specific challenges are you facing due to the lack of skilled personnel in data science, analytics, AI, and ML? 16 | - We struggle to build and deploy machine learning models effectively. 
17 | - Our data integration processes are inefficient and error-prone. 18 | - We have difficulty maintaining data quality and governance standards. 19 | 20 | What are the primary data sources you are currently integrating, and what challenges do you face with these sources? 21 | - We are working with a mix of structured and unstructured data from IoT devices and social media platforms, and we encounter difficulties in data processing and storage. 22 | - Our primary data sources include external APIs and third-party data providers, and we struggle with data quality and reliability issues. 23 | 24 | What specific issues do you encounter with data processing and storage when working with structured and unstructured data from IoT devices and social media platforms? 25 | - Data security and privacy concerns make it difficult to store and process data efficiently. 26 | - We lack the tools to effectively analyse and derive insights from unstructured data. 27 | 28 | What specific data quality and governance standards are you finding difficult to maintain, and how are these impacting your operations? 29 | - We face difficulties in implementing data privacy and security measures, which affects our compliance with regulations. 30 | 31 | What specific tools or technologies are you currently using for data integration, and what limitations or challenges have you encountered with them? 32 | - We are currently using custom scripts for data integration, which are error-prone and difficult to maintain. 33 | - We use Microsoft Azure Data Factory, but we struggle with real-time data processing and integration. 34 | 35 | What are the main pain points you experience with your current data integration processes, and how do they impact your overall data strategy? 36 | - We struggle with real-time data integration, which impacts our ability to make timely and informed decisions. 
37 | - The lack of automation in our data integration processes results in high manual effort and increased operational costs. 38 | 39 | Invest in a comprehensive **data governance framework** that includes data quality, privacy, and security measures. This will help ensure compliance with regulations and improve overall data management. 40 | 41 | Adopt **DataOps** practices to streamline and automate your data integration processes. This will reduce manual effort, minimise errors, and lower operational costs. 42 | 43 | Utilise advanced **metadata management** tools to maintain semantic consistency and improve data discoverability. This will aid in better data governance and integration. 44 | 45 | Implement **real-time data processing** capabilities using tools like Apache Kafka or Azure Stream Analytics. This will enhance your ability to make timely and informed decisions. 46 | 47 | Leverage **AI and machine learning** to analyse unstructured data from IoT devices and social media platforms. This will help derive valuable insights and improve decision-making. 48 | 49 | 50 | Avoid relying solely on custom scripts for data integration, as they are error-prone and difficult to maintain. 51 | 52 | Do not neglect the importance of data security and privacy, especially when dealing with sensitive data from IoT devices and social media platforms. 53 | 54 | Avoid underestimating the need for skilled personnel in data science, analytics, AI, and ML. Investing in training and hiring is crucial for effective data management. 55 | 56 | 57 | Improved compliance with data privacy and security regulations, reducing the risk of legal issues. 58 | 59 | Enhanced decision-making capabilities through real-time data integration and advanced analytics. 60 | 61 | Reduced operational costs and increased efficiency through automation and better data governance. 
62 | 63 | 64 | ``` 65 | 66 | HUMAN 67 | Tip: Make sure to answer in the correct format 68 | 69 | HUMAN 70 | Tip: Please make sure that you write all your answers in British English. -------------------------------------------------------------------------------- /format.ps1: -------------------------------------------------------------------------------- 1 | ruff check --fix 2 | black . -------------------------------------------------------------------------------- /i18n/messages.de.yml: -------------------------------------------------------------------------------- 1 | de: 2 | "A big thank you for completing a session with": "Ein großes Dankeschön für den Abschluss einer Sitzung mit %{name}." 3 | "Advice": "Beratung" 4 | "A personal offer for you": "Ein persönliches Angebot für Sie" 5 | "banner_link": https://d-well.onepointltd.ai/images/D-Well_Report_Banner.png 6 | "banner_link_email": https://cdn.discordapp.com/attachments/932953013683879957/1271015144666824704/D-Well_Report_Banner.png?ex=66b5cc98&is=66b47b18&hm=789480e3fae7c3c0f1893ad08538f9ccc82868f4aed2b058704f212ac5f3710a& 7 | "Confidence Degree": "Konfidenzgrad" 8 | "Data Wellness Aggregation Report": "Data Wellness Bericht" 9 | "for_more_info": "Für weitere Informationen besuchen Sie uns bitte unter Onepoint Data Wellness." 10 | "intro_advice": "Hier finden Sie die von uns gestellten Fragen und die Antworten, die Sie gegeben haben, gefolgt von den Empfehlungen und Handlungsvorschlägen." 11 | "offering_long": "Wir bieten einen kostenlosen Anruf zur Ergebnisinterpretation an, um die Empfehlungen und Handlungsvorschläge des Companion mit einem echten menschlichen Experten zu besprechen.

Wenn Sie dazu bereit sind, senden Sie uns bitte eine E-Mail an datawellness@onepointltd.com von Ihrer geschäftlichen E-Mail-Adresse aus mit Ihrer Anfrage, einen Anruf zu vereinbaren.

Sie können auch das kostenlose KI-gestützte Selbstdiagnose-Berichtstool von Onepoint ausprobieren, um eine kostenlose Selbstdiagnose zu erhalten." 12 | "Please check the attached report": "Bitte überprüfen Sie den beigefügten Bericht" 13 | "Potential positive outcomes": "Mögliche positive Ergebnisse (falls Sie den Rat befolgen)" 14 | "Positive outcomes (if you follow the advices)": "Positive Ergebnisse (wenn Sie den Ratschlägen folgen)" 15 | "Produced on": "Erstellt am" 16 | "Reasoning": "Gründe für den Vertrauensgrad von D-Well" 17 | "Transcript": "Transkript" 18 | "We would love your feedback": "Wir würden uns über Ihr Feedback freuen" 19 | "What to avoid": "Was man vermeiden sollte" 20 | "What you should avoid": "Was Sie vermeiden sollten" 21 | "What you should do": "Empfehlungen" 22 | confidence_degree_high: "Gut" 23 | confidence_degree_low: "Sehr niedrig" 24 | confidence_degree_mediocre: "Niedrig" 25 | confidence_degree_medium: "Mittel" 26 | confidence_degree_outstanding: "Sehr gut" 27 | db_insert_failed: "Fehler beim Hinzufügen der Abfrage" 28 | db_update_failed: "Fehler beim Hinzufügen der Antwort" 29 | failed_insert_question: "Fehler beim Einfügen der Frage in die Datenbank" 30 | no_answer_from_chatgpt: "Konnte keine Antworten von ChatGPT erhalten" 31 | no_report_id: "Berichts-ID ist nicht verfügbar" 32 | regeneration_failed: "Die Frage konnte nicht neu generiert werden." 
33 | -------------------------------------------------------------------------------- /i18n/messages.en.yml: -------------------------------------------------------------------------------- 1 | en: 2 | "A big thank you for completing a session with": "A big thank you for completing a session with the %{name}" 3 | "A personal offer for you": "A personal offer for you" 4 | "Advice": "Advice" 5 | "banner_link": /development/playground/langchain/data_questionnaire_agent/public/images/D-Well_Report_Banner.png 6 | "banner_link_email": https://d-well.onepointltd.ai/images/D-Well_Report_Banner.png 7 | "Confidence Degree": "D-Well confidence degree" 8 | "Data Wellness Aggregation Report": "Data Wellness Aggregation Report" 9 | "for_more_info": "For more information, please visit us at Onepoint Data Wellness." 10 | "intro_advice": "Here you have the questions we asked and the answers you gave, followed by the recommendations and suggested courses of action." 11 | "offering_long": "We are offering a free results interpretation call to talk through the Companion's recommendations and suggested courses of action with a real human expert.

If you are open to that, please email us at datawellness@onepointltd.com from your business email address with your request to schedule a call.

You can also try out Onepoint's free AI-powered Self-Diagnostic Report tool to get a free self-diagnostic." 12 | "Please check the attached report": "Please check the attached Data Wellness report." 13 | "Potential positive outcomes": "Benefits (in case you follow the advice)" 14 | "Positive outcomes (if you follow the advices)": "Benefits (if you follow the advices)" 15 | "Produced on": "Produced on" 16 | "Reasoning": "D-Well reasoning behind confidence degree" 17 | "Transcript": "Transcript" 18 | "We would love your feedback": "We would love your feedback" 19 | "What to avoid": "What to avoid" 20 | "What you should avoid": "What you should avoid" 21 | "What you should do": "Suggested courses of action" 22 | confidence_degree_high: "Good" 23 | confidence_degree_low: "Very low" 24 | confidence_degree_mediocre: "Low" 25 | confidence_degree_medium: "Medium" 26 | confidence_degree_outstanding: "Very good" 27 | db_insert_failed: "Failed to add query to database" 28 | db_update_failed: "Failed to update the answer in database" 29 | failed_insert_question: "Failed to insert question in database" 30 | no_answer_from_chatgpt: "Could not get any answers from ChatGPT" 31 | no_report_id: "Report ID is not available" 32 | regeneration_failed: "The question could not be re-generated." 
33 | 34 | -------------------------------------------------------------------------------- /i18n/messages.fa.yml: -------------------------------------------------------------------------------- 1 | fa: 2 | db_insert_failed: "عدم موفقیت در افزودن سوال به پایگاه داده" 3 | db_update_failed: "پاسخ در پایگاه داده به روز نشد" 4 | no_report_id: "شناسه گزارش در دسترس نیست" 5 | no_answer_from_chatgpt: "هیچ پاسخی از ChatGPT دریافت نشد" 6 | failed_insert_question: "عدم موفقیت در وارد کردن سوال در پایگاه داده" 7 | -------------------------------------------------------------------------------- /notes.md: -------------------------------------------------------------------------------- 1 | # Nots Demo 2 | 3 | - Tick all that apply 4 | - Fine tune the prompts to explain in more detail? 5 | - Give more information about certain topics (tweak knowledge base) 6 | - Show other options for confidence degree 7 | 8 | - What types of information is the company dealing with? 9 | 10 | Topics 11 | ====== 12 | 13 | cyber security 14 | data protection 15 | information compliance 16 | 17 | Try to figure out which information the user is dealing with. 18 | What regulation and standards are important 19 | Do you have a problem with data compliance? 
20 | 21 | Add in context help, tool tips 22 | 23 | 24 | Test generate report with one JWT token 25 | Report not in session when Restart not clicked 26 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "data-questionnaire-agent", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": {} 6 | } 7 | -------------------------------------------------------------------------------- /public/css/styles.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --text_blue: #0066A4; 3 | --experimental-blue: #0087d3 4 | } 5 | 6 | .markdown-body img { 7 | width: 100% 8 | } 9 | 10 | .MuiFormControl-root.MuiFormControl-fullWidth.MuiTextField-root { 11 | border: 1px var(--text_blue) solid; 12 | } 13 | 14 | .generated-message { 15 | color: var(--text_blue) 16 | } 17 | 18 | .MuiButtonBase-root.MuiButtonBase-root.css-1vhtqje { 19 | background-color: #edf3f8; 20 | color: #91b3d6 21 | } 22 | 23 | .MuiButtonBase-root.MuiButton-root.css-1ruedog { 24 | border: 1px solid var(--text_blue); 25 | color: #5285be 26 | } 27 | 28 | .MuiSvgIcon-root.MuiSvgIcon-fontSizeMedium.MuiAvatar-fallback.css-13y7ul3 { 29 | color: #4f83bd 30 | } 31 | 32 | .image-banner { 33 | background: url("/public/images/banner_blank.png") center center / cover no-repeat; 34 | color: white; 35 | min-height: 10em; 36 | } 37 | 38 | .image-banner h1 { 39 | margin: 0.5em 0 0 0; 40 | } 41 | 42 | .image-banner h2 { 43 | margin: 0.2em 0 0.5em 0; 44 | } 45 | 46 | .logo-box img { 47 | position: relative; 48 | right: -1em; 49 | max-width: 68% 50 | } 51 | 52 | .image-banner > .css-dvxtzn { 53 | align-items: start; 54 | } 55 | 56 | /* Hiding chat button */ 57 | .logo-box + .MuiStack-root.css-1mzerio { 58 | display: none; 59 | } 60 | 61 | /* Hiding the settings button */ 62 | svg[data-testid=TuneIcon] { 63 | display: none; 64 | } 65 | 
66 | /* Hiding the message date */ 67 | .message-date { 68 | display: none; 69 | } 70 | 71 | /* Hiding the settings button */ 72 | .MuiInputAdornment-root .settings-button { 73 | display: none; 74 | } 75 | 76 | /* Aavatar related */ 77 | .MuiAvatar-circular { 78 | margin: 0 auto 79 | } 80 | 81 | .css-kt4rn8 { 82 | text-align: center; 83 | } 84 | /* End Avatar */ 85 | 86 | /* Hide Github button */ 87 | a[href*='https://onepointltd.com'].css-1deacqj { 88 | display: none 89 | } 90 | 91 | button:has(> svg[data-testid=KeyboardDoubleArrowUpIcon]) { 92 | display: none; 93 | } 94 | 95 | /** Hide the side menu */ 96 | .experimental { 97 | background-color: var(--experimental-blue); 98 | padding: 0.2em 0.1em 99 | } 100 | 101 | /* Medium devices (tablets, 768px and up) */ 102 | @media (max-width: 1200px) { 103 | 104 | /* Mobile title */ 105 | .css-17yvo7f { 106 | font-size: 74%; 107 | padding: 0 0.5em; 108 | } 109 | } 110 | 111 | 112 | /* Medium devices (tablets, 768px and up) */ 113 | @media (min-width: 1200px) { 114 | 115 | /* Mobile title */ 116 | .css-17yvo7f { 117 | font-size: 84%; 118 | padding: 0 1em; 119 | } 120 | } 121 | 122 | 123 | /* Medium devices (tablets, 768px and up) */ 124 | @media (min-width: 1400px) { 125 | 126 | .image-banner { 127 | min-height: 15em; 128 | } 129 | } 130 | 131 | /* Areas of concern */ 132 | .markdown-body .init-options-row { 133 | font-size: small; 134 | } 135 | 136 | .markdown-body .img-cell > img { 137 | max-width: 4em; 138 | display: block; 139 | margin: 0 auto; 140 | cursor: pointer 141 | } 142 | 143 | .markdown-body .img-cell ~ div.col-9 { 144 | cursor: pointer 145 | } 146 | 147 | /* Micro framework for responsive layout */ 148 | .row { 149 | display: flex; 150 | flex-wrap: wrap; 151 | } 152 | 153 | .col-2 { 154 | width: 16.667% 155 | } 156 | 157 | .col-3 { 158 | width: 25% 159 | } 160 | 161 | .col-9 { 162 | width: 75% 163 | } 164 | 165 | .col-10 { 166 | width: 75% 167 | } 168 | 169 | /* Medium devices (tablets, 768px and up) */ 170 | 
@media (min-width: 768px) { 171 | 172 | /* Tablet */ 173 | 174 | .col-md-1 { 175 | width: 8.33% 176 | } 177 | 178 | .col-md-2 { 179 | width: 16.667% 180 | } 181 | 182 | .col-md-4 { 183 | width: 33.333% 184 | } 185 | 186 | .col-md-5 { 187 | width: 41.667% 188 | } 189 | } 190 | /* End micro framework */ 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /public/images/D-Well_Report_Banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/public/images/D-Well_Report_Banner.png -------------------------------------------------------------------------------- /public/images/Hero_Image_with_Logo_and_Titles.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/public/images/Hero_Image_with_Logo_and_Titles.jpg -------------------------------------------------------------------------------- /public/images/banner_blank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/public/images/banner_blank.png -------------------------------------------------------------------------------- /public/images/banner_with_titles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/public/images/banner_with_titles.png -------------------------------------------------------------------------------- /public/images/companion_icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/public/images/companion_icon.png -------------------------------------------------------------------------------- /public/images/data_ecosystem_areas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/public/images/data_ecosystem_areas.png -------------------------------------------------------------------------------- /public/images/natural-language-processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/public/images/natural-language-processing.png -------------------------------------------------------------------------------- /public/js/onepoint.js: -------------------------------------------------------------------------------- 1 | function setNativeValue(element, value) { 2 | let lastValue = element.value; 3 | element.value = value; 4 | let event = new Event("input", { target: element, bubbles: true }); 5 | // React 15 6 | event.simulated = true; 7 | // React 16 8 | let tracker = element._valueTracker; 9 | if (tracker) { 10 | tracker.setValue(lastValue); 11 | } 12 | element.dispatchEvent(event); 13 | } 14 | 15 | function injectTextListener(e) { 16 | const altText = e.target.getAttribute('alt') 17 | const textarea = document.querySelector("#chat-input") 18 | setNativeValue(textarea, altText) 19 | } 20 | 21 | 22 | function processEvent(e, injectFunc) { 23 | if (typeof (e.onclick) == "undefined" || e.onclick == null) { 24 | const listener = (e) => injectFunc(e) 25 | e.addEventListener("click", listener) 26 | e.onclick = listener 27 | console.info("Added event listener.") 28 | } else { 29 | console.info("Skipped event listener") 30 | } 31 
| } 32 | 33 | function addImgOnepointEventListener() { 34 | [...document.querySelectorAll(".img-cell img")].forEach(e => { 35 | processEvent(e, injectTextListener) 36 | }) 37 | } 38 | 39 | function injectInnerTextListener(e) { 40 | const text = e.target.innerText 41 | const textarea = document.querySelector("#chat-input") 42 | setNativeValue(textarea, text) 43 | } 44 | 45 | function addImgOnepointTextEventListener() { 46 | [...document.querySelectorAll(".img-cell ~ div.col-9")].forEach(e => { 47 | if (typeof (e.onclick) == "undefined" || e.onclick == null) { 48 | processEvent(e, injectInnerTextListener) 49 | } 50 | }) 51 | } 52 | 53 | function activateClickListeners(_records, _observer) { 54 | addImgOnepointEventListener() 55 | addImgOnepointTextEventListener() 56 | } 57 | 58 | const observerOptions = { 59 | childList: true, 60 | subtree: true, 61 | }; 62 | 63 | document.addEventListener('DOMContentLoaded', function () { 64 | const observer = new MutationObserver(activateClickListeners); 65 | observer.observe(document.querySelector("body"), observerOptions); 66 | }, false); 67 | 68 | 69 | -------------------------------------------------------------------------------- /public/logo_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/public/logo_dark.png -------------------------------------------------------------------------------- /public/logo_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onepointconsulting/data-questionnaire-agent/cc96c26bbd559b15b6e3aec4e081e572713b14d9/public/logo_light.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "data-questionnaire-agent" 3 | 
version = "0.2.0" 4 | description = "Implementation of an agent which asks questions and gives advices about a predefined question." 5 | authors = ["Gil Fernandes "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.13" 10 | openai = "^1.37.0" 11 | langchain = "^0.3.16" 12 | langchain-community = "0.3.16" 13 | prompt-toolkit = "^3.0.39" 14 | tomli = "^2.0.1" 15 | faiss-cpu = "^1.7.4" 16 | tiktoken = "^0.8.0" 17 | python-dotenv = "^1.0.0" 18 | pydantic = "2.10.6" 19 | duckduckgo-search = "^3.8.5" 20 | pdfkit = "^1.0.0" 21 | opencv-python = "^4.8.1.78" 22 | scikit-image = "^0.25.1" 23 | tenacity = "^8.2.3" 24 | psycopg = {extras = ["binary"], version = "^3.1.18"} 25 | aiohttp = "^3.9.3" 26 | langchain-openai = "^0.2.10" 27 | python-socketio = "^5.11.2" 28 | asyncer = "^0.0.5" 29 | jinja2 = "^3.1.3" 30 | python-i18n = {extras = ["yaml"], version = "^0.3.9"} 31 | python-ulid = "^2.7.0" 32 | pyjwt = "^2.9.0" 33 | pandas = "^2.2.3" 34 | xlsxwriter = "^3.2.0" 35 | openpyxl = "^3.1.5" 36 | consultant-info-generator = {git = "https://github.com/OnepointConsultingLtd/consultant_info_generator.git"} 37 | 38 | [tool.poetry.scripts] 39 | build-ui = "data_questionnaire_agent.build:ui" 40 | build-check = "data_questionnaire_agent.build:check" 41 | run = "data_questionnaire_agent.server.questionnaire_server_main:run_server" 42 | generate_tokens = "data_questionnaire_agent.service.jwt_token_service:generate_from_file_cmdline" 43 | 44 | [tool.poetry.group.dev.dependencies] 45 | pytest = "^7.4.2" 46 | black = "^23.11.0" 47 | ruff = "^0.4.8" 48 | pytest-aiohttp = "^1.0.5" 49 | 50 | [build-system] 51 | requires = ["poetry-core"] 52 | build-backend = "poetry.core.masonry.api" 53 | 54 | [tool.ruff] 55 | line-length = 120 56 | indent-width = 4 57 | target-version = "py39" 58 | lint.extend-select = ["I"] 59 | -------------------------------------------------------------------------------- /run_app.sh: 
-------------------------------------------------------------------------------- 1 | 2 | ENV_NAME=data_wellness_agent_staging 3 | source /home/ubuntu/miniconda3/etc/profile.d/conda.sh 4 | conda activate $ENV_NAME 5 | cd /home/ubuntu/projects/data-wellness-companion-staging 6 | 7 | python ./data_questionnaire_agent/server/questionnaire_server_main.py 8 | 9 | -------------------------------------------------------------------------------- /sql/db_creation_default.sql: -------------------------------------------------------------------------------- 1 | -- Hypergility database 2 | CREATE DATABASE data_wellness_companion 3 | WITH 4 | OWNER = postgres 5 | ENCODING = 'UTF8' 6 | LOCALE_PROVIDER = 'libc' 7 | CONNECTION LIMIT = -1 8 | IS_TEMPLATE = False; -------------------------------------------------------------------------------- /sql/db_creation_hypergility.sql: -------------------------------------------------------------------------------- 1 | -- Hypergility database 2 | CREATE DATABASE data_wellness_companion_hypergility 3 | WITH 4 | OWNER = postgres 5 | ENCODING = 'UTF8' 6 | CONNECTION LIMIT = -1 7 | IS_TEMPLATE = False; -------------------------------------------------------------------------------- /sql/initial_data.sql: -------------------------------------------------------------------------------- 1 | -- Add language 2 | INSERT INTO public.tb_language(language_code) VALUES ('en'); 3 | INSERT INTO public.tb_language(language_code) VALUES ('fa'); 4 | 5 | -- Initial question 6 | INSERT INTO TB_QUESTION(QUESTION, PREFERRED_QUESTION_ORDER, LANGUAGE_ID) 7 | VALUES('بیشتر نگران کدام ناحیه از اکوسیستم داده خود هستید؟', 1, (SELECT ID FROM TB_LANGUAGE WHERE language_code = 'en')); 8 | 9 | INSERT INTO TB_QUESTION(QUESTION, PREFERRED_QUESTION_ORDER, LANGUAGE_ID) 10 | VALUES('شما نگران کدام بخش از اکوسیستم داده‌های خود هستید؟', 1, (SELECT ID FROM TB_LANGUAGE WHERE language_code = 'fa')); 11 | 12 | -- Suggestions 13 | INSERT INTO TB_QUESTION_SUGGESTIONS(IMG_SRC, IMG_ALT, 
TITLE, MAIN_TEXT, QUESTION_ID) 14 | VALUES('poor_data_quality.png', 'کیفیت داده ضعیف', 'کیفیت داده ضعیف', '.داده های با کیفیت پایین می تواند منجر به بینش نادرست و تصمیم گیری ضعیف شود', 15 | (SELECT ID FROM TB_QUESTION WHERE QUESTION = 'بیشتر نگران کدام ناحیه از اکوسیستم داده خود هستید؟')); 16 | 17 | INSERT INTO TB_QUESTION_SUGGESTIONS(IMG_SRC, IMG_ALT, TITLE, MAIN_TEXT, QUESTION_ID) 18 | VALUES('compliance_risks.png', 'انطباق و خطرات امنیتی', 'انطباق و خطرات امنیتی', '.سوء استفاده از داده ها می تواند منجر به مشکلات قانونی و آسیب به اعتبار شود', 19 | (SELECT ID FROM TB_QUESTION WHERE QUESTION = 'Which area of your data ecosys, ()tem are you most concerned about?')); 20 | 21 | INSERT INTO TB_QUESTION_SUGGESTIONS(IMG_SRC, IMG_ALT, TITLE, MAIN_TEXT, QUESTION_ID) 22 | VALUES('data_silos.png', 'سیلوهای داده', 'سیلوهای داده', '.داده های به دام افتاده در سیلوهای دپارتمان می تواند برای سایر قسمت ها غیرقابل دسترسی باشد', 23 | (SELECT ID FROM TB_QUESTION WHERE QUESTION = 'بیشتر نگران کدام ناحیه از اکوسیستم داده خود هستید؟')); 24 | 25 | INSERT INTO TB_QUESTION_SUGGESTIONS(IMG_SRC, IMG_ALT, TITLE, MAIN_TEXT, QUESTION_ID) 26 | VALUES('lack_of_skilled_personnel.png', 'کمبود پرسنل ماهر', '.کمبود پرسنل ماهر', 'فقدان مهارت در علم داده، تجزیه و تحلیل، هوش مصنوعی و ML می تواند مانع استفاده موثر از داده ها شود.', 27 | (SELECT ID FROM TB_QUESTION WHERE QUESTION = 'بیشتر نگران کدام ناحیه از اکوسیستم داده خود هستید؟')); 28 | 29 | INSERT INTO TB_QUESTION_SUGGESTIONS(IMG_SRC, IMG_ALT, TITLE, MAIN_TEXT, QUESTION_ID) 30 | VALUES('data_overload.png', 'اضافه بار داده ها', 'اضافه بار داده ها', '."افزایش داده ها" می تواند فرآیندها را کند کند و تشخیص اینکه چه داده هایی واقعا مفید هستند را دشوار می کند', 31 | (SELECT ID FROM TB_QUESTION WHERE QUESTION = 'بیشتر نگران کدام ناحیه از اکوسیستم داده خود هستید؟')); 32 | 33 | INSERT INTO TB_QUESTION_SUGGESTIONS(IMG_SRC, IMG_ALT, TITLE, MAIN_TEXT, QUESTION_ID) 34 | VALUES('cost_and_complexity.png', 'هزینه و پیچیدگی', 'هزینه و پیچیدگی', '.یک زیرساخت قوی 
تجزیه و تحلیل داده ها به سرمایه گذاری قابل توجهی در منابع نیاز دارد', 35 | (SELECT ID FROM TB_QUESTION WHERE QUESTION = 'بیشتر نگران کدام ناحیه از اکوسیستم داده خود هستید؟')); 36 | 37 | INSERT INTO TB_QUESTION_SUGGESTIONS(IMG_SRC, IMG_ALT, TITLE, MAIN_TEXT, QUESTION_ID) 38 | VALUES('inconsistent_data_strategies.png', 'استراتژی های داده ناسازگار', 'استراتژی های داده ناسازگار', '.این موارد سخت است که با مفاهیم مدرن مانند ساختار داده، شبکه و هوش مصنوعی تولیدی هماهنگ شوند', 39 | (SELECT ID FROM TB_QUESTION WHERE QUESTION = 'بیشتر نگران کدام ناحیه از اکوسیستم داده خود هستید؟')); 40 | 41 | INSERT INTO TB_QUESTION_SUGGESTIONS(IMG_SRC, IMG_ALT, TITLE, MAIN_TEXT, QUESTION_ID) 42 | VALUES('resistence_to_change.png', 'مقاومت در برابر تغییرات', 'مقاومت در برابر تغییرات', '.کارمندان باید خود را با روش‌های جدید کار تطبیق دهند تا تحول مبتنی بر داده‌ها عملی شود', 43 | (SELECT ID FROM TB_QUESTION WHERE QUESTION = 'بیشتر نگران کدام ناحیه از اکوسیستم داده خود هستید؟')); 44 | 45 | 46 | INSERT INTO TB_QUESTION_SUGGESTIONS(IMG_SRC, IMG_ALT, TITLE, MAIN_TEXT, QUESTION_ID) 47 | VALUES('document-related-issues.jpg', 'مراحل درخواست پناهندگی', 'مراحل درخواست پناهندگی', 'من علاقه مند هستم در مورد روش درخواست پناهندگی بیشتر بدانم', 48 | (SELECT ID FROM TB_QUESTION WHERE QUESTION = 'بیشتر نگران کدام ناحیه از اکوسیستم داده خود هستید؟')); 49 | -------------------------------------------------------------------------------- /sql/other_queries.sql: -------------------------------------------------------------------------------- 1 | -- Extract the content of specific sessions 2 | select final_report, session_id, question, answer from tb_questionnaire_status where session_id in 3 | (select distinct c.session_id from tb_session_configuration c inner join tb_questionnaire_status s on s.session_id = c.session_id 4 | where c.config_key = 'session-client-id' 5 | and c.config_value in 
('eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIwMUpEMkpXTkhBNTZGN1lHRENDU1czRjJaQiIsIm5hbWUiOiJHaWwiLCJpYXQiOjE3MzIwMzI0ODR9.r8LTAiuORLPk2QnrS8YMcX7dHdlYKndHuXc3PEY6Msw') 6 | and s.final_report = true) 7 | order by id asc; 8 | 9 | -- Get some statistics on sessions 10 | select * from 11 | (select t.email, sc.session_id, count(*), min(created_at), max(created_at) from tb_session_configuration sc 12 | inner join tb_jwt_token t on sc.config_value = t.jwt_token 13 | inner join tb_questionnaire_status qs on qs.session_id = sc.session_id 14 | where t.id > 100 15 | group by sc.session_id, t.email 16 | order by count(*)) q where max > '2025-01-15'; -------------------------------------------------------------------------------- /sql/verification_queries.sql: -------------------------------------------------------------------------------- 1 | select min(ordinal), max(ordinal), session_id, question, min(created_at), max(created_at) from 2 | (select ROW_NUMBER() OVER (PARTITION BY session_id ORDER BY created_at) ordinal, session_id, question, created_at 3 | from tb_questionnaire_status 4 | where session_id in (select distinct session_id from tb_questionnaire_status where final_report is true)) q 5 | group by session_id, question having count(*) > 1 order by min(created_at); 6 | 7 | -- Check duplicate questions 8 | select lower(question), count(*) from tb_questionnaire_status where session_id = '01JD748MW7FKMZ2FY5H87WWQ8B' 9 | group by lower(question) having count(*) > 1; 10 | 11 | -- Select all questions in session 12 | select question from tb_questionnaire_status where session_id = '01JD748MW7FKMZ2FY5H87WWQ8B'; -------------------------------------------------------------------------------- /start.ps1: -------------------------------------------------------------------------------- 1 | conda activate data_wellness_agent 2 | $env:OPENAI_MODEL='gpt-4o' 3 | python .\data_questionnaire_agent\server\questionnaire_server_main.py 
-------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Activate environment 4 | . venv/bin/activate; 5 | 6 | # Install all libraries in the right environment 7 | ./venv/bin/poetry install 8 | 9 | # Build the UI 10 | echo Building the User Interface 11 | poetry run build-ui 12 | 13 | # Build the database if requested 14 | echo Building database 15 | python ./data_questionnaire_agent/bootstrap/sql_script.py ./sql/db_setup.sql 16 | 17 | # Run the main application 18 | /app/venv/bin/python ./data_questionnaire_agent/server/questionnaire_server_main.py 19 | # /app/venv/bin/python ./data_questionnaire_agent/toml_support.py -------------------------------------------------------------------------------- /templates/consultants-template.md: -------------------------------------------------------------------------------- 1 | {% for consultant in consultants %} 2 | # Consultant: {{consultant.given_name}} {{consultant.surname}} 3 | 4 | Email: {{consultant.email}}
5 | Location: {{consultant.geo_location}}
6 | Industry: {{consultant.industry_name}}
7 | linkedin_profile: [{{consultant.linkedin_profile_url}}](https://www.linkedin.com/in/{{consultant.linkedin_profile_url}}) 8 | 9 | ## Curriculum Vitae 10 | 11 | {{consultant.cv}} 12 | 13 | ## Experience 14 | 15 | {% for experience in consultant.experiences %} 16 | ### {{experience.title}} 17 | 18 | Company: {{experience.company.name}}
19 | {% if experience.location is not none %}Location: {{experience.location}}
{% endif %} 20 | From: {{experience.start.strftime('%Y-%m-%d')}}
21 | {% if experience.end is not none %}To: {{experience.end.strftime('%Y-%m-%d')}}
{% endif %} 22 | 23 | {% endfor %} 24 | ## Skills 25 | 26 | {% for skill in consultant.skills %} 27 | - {{skill.name}} 28 | {% endfor %} 29 | 30 | {% endfor %} -------------------------------------------------------------------------------- /templates/mail-template.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 25 | 26 | 27 |
6 | 7 | 8 | 9 | 21 | 22 | 23 |
10 | 12 | 13 | 14 | 17 | 18 | 19 |
15 | {text} 16 |
20 |
24 |
-------------------------------------------------------------------------------- /templates/results-template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{title}} 8 | 9 | 10 | 13 | 14 | 67 | 68 | 69 | 70 | 71 |
72 | 73 |

{{produced_on}}: {{timestamp}}

74 |

{{big_thank_you}}

75 |

{{intro_advice}}

76 |

{{title_transcript}}

77 | 78 | 79 | {{questionnaire}} 80 | 81 |
82 |

{{title_what_to_do}}

83 |
    84 | {{advices}} 85 |
86 |

{{title_what_to_avoid}}

87 |
    88 | {{avoids}} 89 |
90 |

{{title_potential_outcomes}}

91 |
    92 | {{positive_outcomes}} 93 |
94 |

{{title_confidence}}

95 | {{confidence}} 96 |

{{personal_offer}}

97 |

{{offering_long}}

98 |

{{love_feedback}}: feedback@onepointltd.ai.

99 |

{{for_more_info}}

100 |
101 | 102 | 103 | --------------------------------------------------------------------------------