├── .env.example ├── .github ├── actions │ └── poetry │ │ ├── pre-commit-lint │ │ └── action.yaml │ │ └── setup │ │ └── action.yaml └── workflows │ └── status-checks.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .streamlit └── config.toml ├── .yamllint ├── Dockerfile ├── Dockerfile.heroku ├── LICENSE ├── Makefile ├── README.md ├── app.py ├── build_pinecone_index.sh ├── charts ├── dm-app.png ├── dm-pinecone.png ├── eli3-chat.png ├── eli3.png ├── eyfs-kb-bbc-app.png ├── eyfs-kb-labels-and-vectors.png ├── eyfs.png ├── parenting-chatbot-app.png ├── parenting-chatbot-index.png └── whatsapp_bot.png ├── pip_reqs_for_reference.txt ├── poetry.lock ├── poetry.toml ├── public_activity_recommender_app.py ├── public_eli3_app.py ├── public_parenting_chatbot_app.py ├── pyproject.toml ├── signals_app.py ├── src ├── evals │ ├── charts │ │ ├── question_matrix.png │ │ └── win_matrix.png │ └── check_evals.ipynb ├── genai │ ├── __init__.py │ ├── dm │ │ ├── __init__.py │ │ ├── dm.json │ │ ├── prompts │ │ │ ├── dm_prompt.json │ │ │ └── dm_prompt_2.json │ │ └── run_dm_index.py │ ├── eli3 │ │ ├── __init__.py │ │ └── prompts │ │ │ ├── eli3.json │ │ │ ├── eli3_chat.json │ │ │ └── eli3_chat_2.json │ ├── eyfs │ │ ├── __init__.py │ │ ├── areas_of_learning.json │ │ ├── eyfs.py │ │ ├── prompts │ │ │ ├── choices_function.json │ │ │ ├── classifier.json │ │ │ ├── classifier_function.json │ │ │ ├── constraints.json │ │ │ ├── context_and_task.json │ │ │ ├── context_and_task_with_examples.json │ │ │ ├── situation.json │ │ │ └── system.json │ │ ├── run_classifier.py │ │ └── run_pinecone_index.py │ ├── message_history.py │ ├── parenting_chatbot │ │ ├── __init__.py │ │ ├── prodigy_eval │ │ │ ├── README.md │ │ │ ├── best_answer_recipe.py │ │ │ ├── create_eval_data.py │ │ │ ├── data │ │ │ │ ├── answers.jsonl │ │ │ │ ├── answers_gpt4.jsonl │ │ │ │ ├── answers_human.jsonl │ │ │ │ ├── answers_rag.jsonl │ │ │ │ ├── questions.jsonl │ │ │ │ └── system.json │ │ │ ├── figures │ │ │ │ ├── 
eval_parenting_chatbot.png │ │ │ │ └── prodigy_screenshot.png │ │ │ └── generate_gpt4_answers.py │ │ ├── prompts │ │ │ ├── filter_refs_function.json │ │ │ ├── filter_refs_system.json │ │ │ ├── filter_refs_user.json │ │ │ └── system.json │ │ ├── run_nhs_index.py │ │ └── run_nhs_index_full_page.py │ ├── prompt_template.py │ ├── sandbox │ │ └── signals │ │ │ ├── data │ │ │ ├── 00_system.jsonl │ │ │ ├── 01_intro.jsonl │ │ │ ├── 02_signal_impact.jsonl │ │ │ ├── 03_signal_choice.jsonl │ │ │ ├── 04_follow_up.jsonl │ │ │ ├── func_intent.json │ │ │ ├── func_top_signal.json │ │ │ ├── func_top_three_signals.json │ │ │ ├── intent_actions.json │ │ │ ├── prompt_intent.jsonl │ │ │ ├── prompt_top_signal.jsonl │ │ │ ├── prompt_top_three_signals.jsonl │ │ │ └── signals_2023.json │ │ │ └── signals_test.ipynb │ ├── setup.py │ ├── streamlit_pages │ │ ├── __init__.py │ │ ├── dm_page.py │ │ ├── eli3_page.py │ │ ├── eyfs_compare_page.py │ │ ├── eyfs_kb_bbc_page.py │ │ ├── eyfs_page.py │ │ ├── parenting_page.py │ │ └── utils.py │ ├── utils.py │ ├── vector_index.py │ ├── whatsapp_bot │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── prompts │ │ │ └── eli3 │ │ │ │ └── eli3.json │ │ └── screenshots │ │ │ ├── chat_1.png │ │ │ ├── chat_2.png │ │ │ ├── chat_3.png │ │ │ ├── chat_4.png │ │ │ └── twilio_ngrok.png │ └── whatsapp_parenting_bot │ │ └── Dockerfile └── scraping │ ├── activities │ ├── README.md │ ├── scrape_activities.py │ └── scrape_notebook.ipynb │ └── start_for_life │ ├── README.md │ ├── scrape_sitemap.py │ └── scrape_startforlife.py ├── tests └── __init__.py ├── whatsapp_bot.py └── whatsapp_parenting_bot.py /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | PINECONE_API_KEY= 3 | PINECONE_REGION= 4 | AWS_ACCESS_KEY_ID= 5 | AWS_SECRET_ACCESS_KEY= 6 | S3_BUCKET= 7 | PATH_TO_NHS_DATA= 8 | PATH_TO_BBC_ACTIVITIES_DATA= 9 | PATH_TO_LABELLED_BBC_DATA= 10 | 
-------------------------------------------------------------------------------- /.github/actions/poetry/pre-commit-lint/action.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Run Lint 3 | description: Run linters to verify code quality. Credits to Andre Sionek for writing the original GitHub Action. 4 | 5 | runs: 6 | using: "composite" 7 | steps: 8 | - name: Lint 9 | shell: bash 10 | run: poetry run pre-commit run --all-files 11 | -------------------------------------------------------------------------------- /.github/actions/poetry/setup/action.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Set Up Poetry Environment 3 | description: Install Python and Poetry. Credits to Andre Sionek for writing the original GitHub Action. 4 | 5 | inputs: 6 | python_version: 7 | description: Python version 8 | required: true 9 | default: '3.9.17' 10 | type: string 11 | 12 | poetry_version: 13 | description: Poetry version 14 | required: true 15 | default: 1.2.2 16 | type: string 17 | 18 | runs: 19 | using: "composite" 20 | steps: 21 | 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ inputs.python_version }} 26 | 27 | - name: Install and configure Poetry 28 | uses: snok/install-poetry@v1 29 | with: 30 | virtualenvs-create: true 31 | virtualenvs-in-project: true 32 | version: ${{ inputs.poetry_version }} 33 | 34 | - name: Set up Poetry cache 35 | uses: actions/setup-python@v4 36 | with: 37 | python-version: ${{ inputs.python_version }} 38 | cache: poetry 39 | 40 | - name: Install Python Dependencies with Poetry 41 | shell: bash 42 | run: poetry install 43 | -------------------------------------------------------------------------------- /.github/workflows/status-checks.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Status Checks 3 | 4 | on: 5 | push: 6 | 
branches-ignore: 7 | - production 8 | - main 9 | 10 | # Stops the currently running workflow if a new one has been triggered 11 | concurrency: 12 | group: "${{ github.ref }}" 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | lint: 17 | name: Lint 18 | runs-on: ubuntu-20.04 19 | steps: 20 | - uses: actions/checkout@v3 21 | 22 | - name: Set up 23 | uses: ./.github/actions/poetry/setup 24 | with: 25 | python_version: "3.9.17" 26 | 27 | - name: Install Python Lint Dependencies with Poetry 28 | shell: bash 29 | run: poetry install --with lint 30 | 31 | - name: Lint 32 | uses: ./.github/actions/poetry/pre-commit-lint 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | .streamlit/secrets.toml 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ 162 | 163 | data/ 164 | notebooks/ 165 | .DS_Store 166 | .vscode/ 167 | src/scraping/data/* 168 | 169 | src/evals/parenting_chatbot/* 170 | src/genai/parenting_chatbot/prodigy_eval/_scrap/* 171 | !src/genai/parenting_chatbot/prodigy_eval/data/ 172 | !src/genai/sandbox/signals/data/ 173 | src/genai/sandbox/signals/data/signals_2024.json 174 | src/genai/sandbox/signals/illustrations/* 175 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fail_fast: true 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.3.0 7 | hooks: 8 | - id: check-toml 9 | stages: [commit] 10 | 11 | - id: end-of-file-fixer 12 | stages: [commit] 13 | 14 | - id: trailing-whitespace 15 | stages: [commit] 16 | 17 | - id: check-yaml 18 | stages: [commit] 19 | 20 | - id: detect-private-key 21 | stages: [commit] 22 | 23 | - id: check-executables-have-shebangs 24 | stages: [commit] 25 | 26 | - id: check-shebang-scripts-are-executable 27 | stages: [commit] 28 | 29 | 30 | - repo: local 31 | hooks: 32 | 33 | - id: black 34 | name: Run Black 35 | entry: poetry run black 36 | language: system 37 | types: [file, python] 38 | stages: [commit] 39 | 40 | - id: isort 41 | name: Run Isort 42 | entry: poetry run isort 43 | language: system 44 | types: [file, python] 45 | stages: [commit] 46 | 47 | - id: flake8 48 | name: Run Flake8 49 | entry: poetry run pflake8 50 | language: system 51 | types: [file, python] 52 | stages: [commit] 53 | 54 | - id: yamllint 55 | name: Run Yamllint 56 | entry: poetry run yamllint 57 | language: system 58 | types: [file, yaml] 59 | stages: [commit] 60 | 61 | - id: bandit 62 | name: Run Bandit 63 | entry: poetry run bandit 64 | language: system 65 | types: [file, python] 66 | args: 67 | [ 68 | --configfile, 69 | pyproject.toml, 70 | --severity-level, 71 | all, 72 | --confidence-level, 73 | 
all, 74 | --quiet, 75 | --format, 76 | custom, 77 | ] 78 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.9.17 2 | -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base="light" 3 | -------------------------------------------------------------------------------- /.yamllint: -------------------------------------------------------------------------------- 1 | --- 2 | extends: default 3 | 4 | rules: 5 | braces: 6 | level: error 7 | max-spaces-inside: 1 8 | brackets: 9 | level: error 10 | max-spaces-inside: 1 11 | colons: 12 | level: error 13 | commas: 14 | level: error 15 | empty-lines: 16 | level: error 17 | hyphens: 18 | level: error 19 | document-start: 20 | level: error 21 | indentation: 22 | level: error 23 | indent-sequences: consistent 24 | new-lines: 25 | level: warning 26 | trailing-spaces: 27 | level: warning 28 | line-length: 29 | max: 159 30 | level: error 31 | allow-non-breakable-inline-mappings: true 32 | truthy: disable 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9.18-slim 3 | 4 | # Set environment varibles 5 | ENV PYTHONDONTWRITEBYTECODE 1 6 | ENV PYTHONUNBUFFERED 1 7 | 8 | # Install system dependencies 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends gcc python3-dev 11 | 12 | # Install poetry 13 | RUN pip install "poetry==1.5.1" 14 | 15 | # Set the working directory in the Docker image 16 | WORKDIR /app 17 | 18 | # Copy only requirements to cache them in docker layer 19 | COPY pyproject.toml poetry.lock ./ 20 | COPY src/genai 
/app/src/genai 21 | COPY README.md /app/README.md 22 | COPY app.py /app/app.py 23 | 24 | # Don't push the image to dockerhub 25 | COPY .env /app/.env 26 | COPY .streamlit /app/.streamlit 27 | 28 | # Project initialization: 29 | RUN poetry config virtualenvs.create false \ 30 | && poetry install --no-interaction --no-ansi 31 | 32 | EXPOSE 8501 33 | # Specify the command to run your application 34 | CMD ["streamlit", "run", "app.py"] 35 | -------------------------------------------------------------------------------- /Dockerfile.heroku: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM --platform=linux/amd64 python:3.9.18-slim 3 | 4 | # Set environment varibles 5 | ENV PYTHONDONTWRITEBYTECODE 1 6 | ENV PYTHONUNBUFFERED 1 7 | 8 | # Install system dependencies 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends gcc 11 | 12 | # Install poetry 13 | RUN pip install "poetry==1.5.1" 14 | 15 | # Set the working directory in the Docker image 16 | WORKDIR /app 17 | 18 | # Copy only requirements to cache them in docker layer 19 | COPY pyproject.toml poetry.lock ./ 20 | COPY src/genai /app/src/genai 21 | COPY README.md /app/README.md 22 | COPY app.py /app/app.py 23 | 24 | # Don't push the image to dockerhub 25 | COPY .env /app/.env 26 | COPY .streamlit /app/.streamlit 27 | 28 | # Project initialization: 29 | RUN poetry config virtualenvs.create false \ 30 | && poetry install --no-interaction --no-ansi 31 | 32 | # Specify the command to run your application 33 | CMD ["sh", "-c", "streamlit run --server.port $PORT app.py"] 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Nesta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated 
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | 3 | .DEFAULT_GOAL := help 4 | 5 | .PHONY: help 6 | help: ## Shows this help text 7 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 8 | 9 | .PHONY: init 10 | init: clean install ## Clean environment and reinstall all dependencies TODO: add `test` in the future 11 | 12 | .PHONY: clean 13 | clean: ## Removes project virtual env 14 | rm -rf .venv build dist **/*.egg-info .pytest_cache node_modules .coverage 15 | 16 | .PHONY: install 17 | install: ## Install the project dependencies and pre-commit using Poetry. 
18 | poetry install --with lint,test 19 | poetry run pre-commit install --hook-type pre-commit --hook-type commit-msg --hook-type pre-push 20 | 21 | # .PHONY: test 22 | # test: ## Run tests 23 | # poetry run python -m pytest 24 | 25 | .PHONY: lint 26 | lint: ## Apply linters to all files 27 | poetry run pre-commit run --all-files 28 | 29 | .PHONY: clean-poetry-lock 30 | clean-poetry-lock: ## Removes poetry.lock from all folders except .venv 31 | find . -name \poetry.lock -type f -not -path "./.venv/*" -delete 32 | 33 | .PHONY: build-pinecone 34 | build-pinecone: 35 | chmod +x build_pinecone_index.sh 36 | ./build_pinecone_index.sh 37 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | """Streamlit app for the Generative AI prototypes.""" 2 | 3 | import os 4 | 5 | import openai 6 | import streamlit as st 7 | 8 | from dotenv import load_dotenv 9 | from streamlit_option_menu import option_menu 10 | 11 | from genai.streamlit_pages import early_year_activity_plan 12 | from genai.streamlit_pages import eli3 13 | from genai.streamlit_pages import eyfs_compare 14 | from genai.streamlit_pages import eyfs_dm_kb 15 | from genai.streamlit_pages import eyfs_kb_bbc 16 | from genai.streamlit_pages import parenting_chatbot 17 | from genai.streamlit_pages.utils import reset_state 18 | 19 | 20 | load_dotenv() 21 | 22 | 23 | def auth_openai() -> None: 24 | """Authenticate with OpenAI.""" 25 | try: 26 | openai.api_key = os.environ["OPENAI_API_KEY"] 27 | except Exception: 28 | openai.api_key = st.secrets["OPENAI_API_KEY"] 29 | 30 | 31 | def s3_creds() -> None: 32 | """Get s3 creds.""" 33 | try: 34 | aws_key = os.environ["AWS_ACCESS_KEY_ID"] 35 | aws_secret = os.environ["AWS_SECRET_ACCESS_KEY"] 36 | s3_path = os.environ["S3_BUCKET"] 37 | except Exception: 38 | aws_key = st.secrets["AWS_ACCESS_KEY_ID"] 39 | aws_secret = st.secrets["AWS_SECRET_ACCESS_KEY"] 40 | 
s3_path = st.secrets["S3_BUCKET"] 41 | 42 | return aws_key, aws_secret, s3_path 43 | 44 | 45 | def check_password() -> bool: 46 | """Return `True` if the user had the correct password.""" 47 | 48 | def password_entered() -> None: 49 | """Check whether a password entered by the user is correct.""" 50 | if st.session_state["password"] == st.secrets["password"]: 51 | st.session_state["password_correct"] = True 52 | del st.session_state["password"] # don't store password 53 | else: 54 | st.session_state["password_correct"] = False 55 | 56 | if "password_correct" not in st.session_state: 57 | # First run, show input for password. 58 | st.text_input("Password", type="password", on_change=password_entered, key="password") 59 | return False 60 | elif not st.session_state["password_correct"]: 61 | # Password not correct, show input + error. 62 | st.text_input("Password", type="password", on_change=password_entered, key="password") 63 | st.error("😕 Password incorrect") 64 | return False 65 | else: 66 | # Password correct. 67 | 68 | return True 69 | 70 | 71 | def main() -> None: 72 | """Run the app.""" 73 | auth_openai() 74 | aws_key, aws_secret, s3_path = s3_creds() 75 | with st.sidebar: 76 | selected = option_menu( 77 | "Prototypes", 78 | [ 79 | "Home page", 80 | "ELI3", 81 | "EYFS-based activities", 82 | "EYFS-based activities + BBC activities", 83 | "EYFS prototypes comparison", 84 | "Development Matters prototype", 85 | "Parenting Chatbot", 86 | ], 87 | default_index=0, 88 | on_change=reset_state, 89 | key="menu_selection", 90 | ) 91 | if selected == "Home page": 92 | st.title("Nesta Discovery: Generative AI Prototypes") 93 | st.write("Welcome to the Nesta Discovery Generative AI prototypes. 
Please select a prototype from the menu.") 94 | elif selected == "ELI3": 95 | eli3() 96 | elif selected == "EYFS-based activities": 97 | early_year_activity_plan() 98 | elif selected == "EYFS-based activities + BBC activities": 99 | eyfs_kb_bbc() 100 | elif selected == "EYFS prototypes comparison": 101 | eyfs_compare() 102 | elif selected == "Development Matters prototype": 103 | eyfs_dm_kb() 104 | elif selected == "Parenting Chatbot": 105 | parenting_chatbot(aws_key, aws_secret, s3_path) 106 | 107 | 108 | if check_password(): 109 | main() 110 | -------------------------------------------------------------------------------- /build_pinecone_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python src/genai/eyfs/run_pinecone_index.py 3 | python src/genai/dm/run_dm_index.py 4 | python src/genai/parenting_chatbot/run_nhs_index_full_page.py 5 | -------------------------------------------------------------------------------- /charts/dm-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/dm-app.png -------------------------------------------------------------------------------- /charts/dm-pinecone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/dm-pinecone.png -------------------------------------------------------------------------------- /charts/eli3-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eli3-chat.png -------------------------------------------------------------------------------- /charts/eli3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eli3.png -------------------------------------------------------------------------------- /charts/eyfs-kb-bbc-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eyfs-kb-bbc-app.png -------------------------------------------------------------------------------- /charts/eyfs-kb-labels-and-vectors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eyfs-kb-labels-and-vectors.png -------------------------------------------------------------------------------- /charts/eyfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eyfs.png -------------------------------------------------------------------------------- /charts/parenting-chatbot-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/parenting-chatbot-app.png -------------------------------------------------------------------------------- /charts/parenting-chatbot-index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/parenting-chatbot-index.png -------------------------------------------------------------------------------- /charts/whatsapp_bot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/whatsapp_bot.png -------------------------------------------------------------------------------- /pip_reqs_for_reference.txt: -------------------------------------------------------------------------------- 1 | requests==2.28.1 2 | openai==0.27.8 3 | streamlit>=1.24.1 4 | python-dotenv>=1.0.0 5 | streamlit-option-menu>=0.3.6 6 | tiktoken>=0.4.0 7 | tenacity>=8.2.2 8 | beautifulsoup4>=4.12.2 9 | Flask>=2.3.2 10 | twilio>=8.5.0 11 | aiofiles>=23.1.0 12 | gunicorn>=21.2.0 13 | pinecone-client>=2.2.2 14 | streamlit-feedback==0.0.9 15 | boto3==1.28.17 16 | s3fs>=2023.9.0 17 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | create = true 3 | in-project = true 4 | -------------------------------------------------------------------------------- /public_activity_recommender_app.py: -------------------------------------------------------------------------------- 1 | """Streamlit app for the Generative AI prototypes.""" 2 | 3 | import os 4 | 5 | import openai 6 | import streamlit as st 7 | 8 | from dotenv import load_dotenv 9 | 10 | from genai.streamlit_pages import eyfs_dm_kb 11 | 12 | 13 | load_dotenv() 14 | 15 | 16 | def auth_openai() -> None: 17 | """Authenticate with OpenAI.""" 18 | try: 19 | openai.api_key = os.environ["OPENAI_API_KEY"] 20 | except Exception: 21 | openai.api_key = st.secrets["OPENAI_API_KEY"] 22 | 23 | 24 | def main() -> None: 25 | """Run the app.""" 26 | auth_openai() 27 | eyfs_dm_kb(sidebar=False) 28 | 29 | 30 | main() 31 | -------------------------------------------------------------------------------- /public_eli3_app.py: -------------------------------------------------------------------------------- 1 | 
"""Streamlit app for the Generative AI prototypes.""" 2 | 3 | import os 4 | 5 | import openai 6 | import streamlit as st 7 | 8 | from dotenv import load_dotenv 9 | 10 | from genai.streamlit_pages import eli3 11 | 12 | 13 | load_dotenv() 14 | 15 | 16 | def auth_openai() -> None: 17 | """Authenticate with OpenAI.""" 18 | try: 19 | openai.api_key = os.environ["OPENAI_API_KEY"] 20 | except Exception: 21 | openai.api_key = st.secrets["OPENAI_API_KEY"] 22 | 23 | 24 | def main() -> None: 25 | """Run the app.""" 26 | auth_openai() 27 | eli3(sidebar=False) 28 | 29 | 30 | main() 31 | -------------------------------------------------------------------------------- /public_parenting_chatbot_app.py: -------------------------------------------------------------------------------- 1 | """Streamlit app for the Generative AI prototypes.""" 2 | 3 | import os 4 | 5 | import openai 6 | import streamlit as st 7 | 8 | from dotenv import load_dotenv 9 | 10 | from genai.streamlit_pages import parenting_chatbot 11 | 12 | 13 | load_dotenv() 14 | 15 | 16 | def auth_openai() -> None: 17 | """Authenticate with OpenAI.""" 18 | try: 19 | openai.api_key = os.environ["OPENAI_API_KEY"] 20 | except Exception: 21 | openai.api_key = st.secrets["OPENAI_API_KEY"] 22 | 23 | 24 | def s3_creds() -> None: 25 | """Get s3 creds.""" 26 | try: 27 | aws_key = os.environ["AWS_ACCESS_KEY_ID"] 28 | aws_secret = os.environ["AWS_SECRET_ACCESS_KEY"] 29 | s3_path = os.environ["S3_BUCKET"] 30 | except Exception: 31 | aws_key = st.secrets["AWS_ACCESS_KEY_ID"] 32 | aws_secret = st.secrets["AWS_SECRET_ACCESS_KEY"] 33 | s3_path = st.secrets["S3_BUCKET"] 34 | 35 | return aws_key, aws_secret, s3_path 36 | 37 | 38 | def main() -> None: 39 | """Run the app.""" 40 | auth_openai() 41 | aws_key, aws_secret, s3_path = s3_creds() 42 | parenting_chatbot(aws_key, aws_secret, s3_path, sidebar=False) 43 | 44 | 45 | main() 46 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "discovery_generative_ai " 3 | version = "0.1.0" 4 | description = " We are exploring the potential impact of Generative AI on Nesta's Missions and work to uncover opportunities and risks that can inform Nesta strategy." 5 | authors = ["Nesta"] 6 | readme = "README.md" 7 | homepage = "https://github.com/nestauk/discovery_generative_ai" 8 | repository = "https://github.com/nestauk/discovery_generative_ai" 9 | keywords = ['llm', 'generative-ai'] # noqa 10 | packages = [ 11 | { include = "genai", from = "src" }, 12 | ] 13 | 14 | [tool.poetry.build] 15 | generate-setup-file = false 16 | 17 | 18 | [tool.poetry.dependencies] 19 | python = "3.9.17" 20 | requests = "2.28.1" 21 | openai = "0.27.8" 22 | streamlit = "^1.24.1" 23 | python-dotenv = "^1.0.0" 24 | streamlit-option-menu = "^0.3.6" 25 | tiktoken = "^0.4.0" 26 | tenacity = "^8.2.2" 27 | beautifulsoup4 = "^4.12.2" 28 | Flask = "^2.3.2" 29 | twilio = "^8.5.0" 30 | aiofiles = "23.1.0" 31 | gunicorn = "^21.2.0" 32 | pinecone-client = "^4.1" 33 | streamlit-feedback = "0.0.9" 34 | boto3= "1.28.17" 35 | s3fs = "^2023.9.0" 36 | seaborn = "^0.13.0" 37 | cchardet = "^2.1.7" 38 | chardet = "^5.2.0" 39 | 40 | [tool.poetry.group.test] 41 | optional = true 42 | 43 | 44 | [tool.poetry.group.test.dependencies] 45 | pytest = "^7.1.1" # Allows for testing of the project 46 | pytest-cov = "^4.0.0" # Allows to run coverage of the project 47 | moto = "^3.1.6" # Allows for mocking of AWS services 48 | 49 | [tool.poetry.group.lint] 50 | optional = true 51 | 52 | [tool.poetry.group.lint.dependencies] 53 | flake8 = "^4.0.1" # Style Guide Enforcement 54 | pyproject-flake8 = "^0.0.1-alpha.4" # Allows configuration of flake 8 from pyproject.toml 55 | flake8-bugbear = "^22.3.23" # Finding likely bugs and design problems in your program. 56 | flake8-print = "^4.0.0" # Check for print statements in python files. 
57 | flake8-docstrings = "^1.6.0" # Include checks provided by pep257. 58 | flake8-annotations = "^2.8.0" # check for presence of type annotations in function definitions. 59 | isort = "^5.10.1" # Sort imports 60 | yamllint = "^1.26.3" # Linter for YAML files 61 | bandit = {version = "^1.7.4", extras = ["toml"]} # Security linter 62 | pre-commit = "^2.18.1" # Runs a pipeline before commiting code 63 | black = "^22.6.0" # Python style checks 64 | 65 | 66 | 67 | [tool.poetry.group.dev.dependencies] 68 | ipykernel = "^6.24.0" 69 | 70 | [build-system] 71 | requires = ["poetry-core>=1.2.0rc1"] 72 | build-backend = "poetry.core.masonry.api" 73 | 74 | 75 | [tool.black] 76 | line-length = 119 77 | target-version = ['py39'] 78 | include = '\.pyi?$' 79 | extend-exclude = ''' 80 | /( 81 | \.eggs 82 | | \.git 83 | | \.venv 84 | | \.env 85 | | \.vscode 86 | | build 87 | | dist 88 | | \.mypy_cache 89 | )\ 90 | ''' 91 | 92 | [tool.isort] 93 | profile = 'black' 94 | force_single_line = true 95 | combine_as_imports = true 96 | lines_between_types = 1 97 | lines_after_imports = 2 98 | src_paths = ["src", "tests"] 99 | line_length = 119 100 | 101 | [tool.flake8] 102 | max-line-length = 119 103 | select = [ # Choose the flake8 errors to select globally here 104 | "C", # McCabe complexity 105 | "E", # pep8 errors 106 | "W", # pep8 warnings 107 | "F", # pyflakes errors 108 | "N", # naming conventions 109 | "B", # bugbear errors (needs plugin installed) 110 | "ANN", # flake8 annotations errors (needs plugin installed) 111 | "T", # flake8 print errors (needs plugin installed) 112 | "D", # flake8 doscstrings errors (needs plugin installed) 113 | "B950", # Line too long. It considers "max-line-length" but only triggers when exceeded by more than 10%. 
114 | ] 115 | ignore = [ # Choose the flake8 errors to ignore globally here 116 | "E501", # Line too long (using B950 instead, which has 10% tolerance) 117 | "D107", # Missing docstring in __init__ 118 | "D202", # No blank lines allowed after function docstring 119 | "D400", # First line should end with a period 120 | "D100", # Missing docstring in public module 121 | "D104", # Missing docstring in public package 122 | "ANN003", # Missing type annotation for **kwargs 123 | "ANN002", # Missing type annotation for *args 124 | "ANN1", # Missing type annotation for self in method or cls method 125 | "W503", # Line break occurred before a binary operator 126 | "E203", # Whitespace before ':' 127 | ] 128 | per-file-ignores = [ # Choose the flake8 errors to ignore per file here 129 | "*/__init__.py:F401", # Ignore imported but unused in __init__.py files 130 | "tests/*:ANN,D", # Ignore Docstring and annotations on tests 131 | ] 132 | exclude = [ 133 | ".venv/*", 134 | ".vscode/*", 135 | ] 136 | # See other flake8 extensions here: https://github.com/DmytroLitvinov/awesome-flake8-extensions 137 | 138 | 139 | [tool.bandit] 140 | skips = [ # Choose the bandit errors to ignore globally 141 | "B101", # Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. 142 | "B104", # Possible binding to all interfaces 143 | ] 144 | exclude_dirs = [ 145 | ".venv", 146 | ] 147 | 148 | [tool.pytest.ini_options] 149 | minversion = "7.1" 150 | addopts = """ 151 | --verbose 152 | --color=yes 153 | --assert=plain 154 | --cov-report term 155 | --cov=src 156 | --last-failed 157 | """ 158 | testpaths = [ 159 | "tests", 160 | ] 161 | 162 | [tool.coverage.run] 163 | omit = ["*/tests/*"] # Remove test files from coverage run. 164 | 165 | [tool.coverage.report] 166 | omit = ["*/tests/*"] # Remove test files from coverage report.
167 | -------------------------------------------------------------------------------- /src/evals/charts/question_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/evals/charts/question_matrix.png -------------------------------------------------------------------------------- /src/evals/charts/win_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/evals/charts/win_matrix.png -------------------------------------------------------------------------------- /src/genai/__init__.py: -------------------------------------------------------------------------------- 1 | from .prompt_template import FunctionTemplate 2 | from .prompt_template import MessageTemplate 3 | from .vector_index import PineconeIndex 4 | -------------------------------------------------------------------------------- /src/genai/dm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/dm/__init__.py -------------------------------------------------------------------------------- /src/genai/dm/prompts/dm_prompt.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions###\nI will describe a theme below. 
Generate fun and exciting activities related to UK's Early Years Foundation Stage (EYFS) framework and the Development Matters guidance.\n\n###Examples###\nThe activities you generated must be inspired by the examples below:\n{examples}\n\n###Constraints###\n- The activities must be suitable for kids in the age groups: {age_groups}.\n- The activities must be related to the areas of learning: {areas_of_learning}.\n- You must generate 5 activities.\n\n###Description###\n{description}\n\n"} 2 | -------------------------------------------------------------------------------- /src/genai/dm/prompts/dm_prompt_2.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions###\nI will describe a theme below. Generate fun and exciting activities related to UK's Early Years Foundation Stage (EYFS) framework and the Development Matters guidance.\n\n###Examples###\nThe activities you generate must be inspired by the examples below:\n{examples}\n\n###Constraints###\n- The activities must be suitable for kids in the age groups: {age_groups}.\n- The activities must be related to the areas of learning: {areas_of_learning}.\n- You must generate 5 activities.\n- You must describe each activity in 5-6 sentences.\n\n###Description###\n{description}\n\n###Formatting###\nYou must format the activities as follows:\n## \n\n\n## \n\n"} 2 | -------------------------------------------------------------------------------- /src/genai/dm/run_dm_index.py: -------------------------------------------------------------------------------- 1 | """Build a pinecone index with the Development Matters learning goals and examples.""" 2 | 3 | import os 4 | import uuid 5 | 6 | import openai 7 | 8 | from dotenv import load_dotenv 9 | 10 | from genai.eyfs import get_embedding 11 | from genai.utils import read_json 12 | from genai.vector_index import PineconeIndex 13 | 14 | 15 | load_dotenv() 16 | 17 | 18 | openai.api_key = 
os.environ["OPENAI_API_KEY"] 19 | PATH_TO_DM = "src/genai/dm/dm.json" 20 | INDEX_NAME = "eyfs-index" 21 | ENCODER_NAME = "text-embedding-ada-002" 22 | 23 | if "__main__" == __name__: 24 | data = read_json(PATH_TO_DM) 25 | # Temp hack to exclude the template 26 | data = [d for d in data if d["area_of_learning"] != ""] 27 | 28 | # Format the data to what pinecone needs and generate a temp uuid 29 | docs = [] 30 | for elem in data: 31 | aol = elem["area_of_learning"] 32 | d = elem["age_group"] 33 | for age, age_dict in d.items(): 34 | for k, items in age_dict.items(): 35 | for item in items: 36 | doc = tuple( 37 | ( 38 | str(uuid.uuid4()), 39 | get_embedding(item), 40 | { 41 | "age_group": age, 42 | "type_": k, 43 | "source": "dm", 44 | "text": item, 45 | "areas_of_learning": aol, 46 | }, 47 | ) 48 | ) 49 | docs.append(doc) 50 | 51 | # Build the index 52 | conn = PineconeIndex(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_REGION"]) 53 | 54 | conn.build_and_upsert( 55 | index_name=INDEX_NAME, 56 | dimension=1536, 57 | metric="euclidean", 58 | docs=docs, 59 | metadata_config={"indexed": ["areas_of_learning", "source", "type_", "age_group"]}, 60 | batch_size=40, 61 | ) 62 | -------------------------------------------------------------------------------- /src/genai/eli3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/eli3/__init__.py -------------------------------------------------------------------------------- /src/genai/eli3/prompts/eli3.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Instructions###\nYou are a helpful, kind, intelligent and polite early-year educator. Your task is to explain a concept to a 3 year old child. You must explain it in simple words that a young kid would understand. 
You must also be patient and never offend or be aggressive. Gendered language and any adjectives about the kid are strictly prohibited.\n\n###Question###\n{input}\n\n###Answer###\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eli3/prompts/eli3_chat.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Instructions###\nYou are a helpful, kind, intelligent and polite early-year educator. Your task is to explain a concept to a 3 year old child. You must explain it in simple words that a young kid would understand. You must also be patient and never offend or be aggressive. Gendered language and any adjectives about the kid are strictly prohibited.\n\n###Answer###\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eli3/prompts/eli3_chat_2.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "\n###Instructions###\nYou are a helpful, kind, intelligent and polite early-year educator. Your task is to discuss a concept with a 3 year old child. Your role as an EY educator is to build children's curiosity and develop their thinking process, not to provide an explicit answer to their question. You should provide an answer to the question and ask follow-up questions. You must also be patient and never offend or be aggressive.
Gendered language and any adjectives about the kid are strictly prohibited.\n\n###Answer###\n", "name": null} 2 | -------------------------------------------------------------------------------- /src/genai/eyfs/__init__.py: -------------------------------------------------------------------------------- 1 | from .eyfs import EYFSClassifier 2 | from .eyfs import TextGenerator 3 | from .eyfs import get_embedding 4 | -------------------------------------------------------------------------------- /src/genai/eyfs/areas_of_learning.json: -------------------------------------------------------------------------------- 1 | { 2 | "Communication and Language": "##Communication and Language##\nThe development of children’s spoken language underpins all seven areas of learning and development. Children’s back-and-forth interactions from an early age form the foundations for language and cognitive development. The number and quality of the conversations they have with adults and peers throughout the day in a language-rich environment is crucial. By commenting on what children are interested in or doing, and echoing back what they say with new vocabulary added, practitioners will build children's language effectively. Reading frequently to children, and engaging them actively in stories, non-fiction, rhymes and poems, and then providing them with extensive opportunities to use and embed new words in a range of contexts, will give children the opportunity to thrive. 
Through conversation, story-telling and role play, where children share their ideas with support and modelling from their teacher, and sensitive questioning that invites them to elaborate, children become comfortable using a rich range of vocabulary and language structures.", 3 | "Personal, Social and Emotional Development": "##Personal, Social and Emotional Development##\nChildren’s personal, social and emotional development (PSED) is crucial for children to lead healthy and happy lives, and is fundamental to their cognitive development. Underpinning their personal development are the important attachments that shape their social world. Strong, warm and supportive relationships with adults enable children to learn how to understand their own feelings and those of others. Children should be supported to manage emotions, develop a positive sense of self, set themselves simple goals, have confidence in their own abilities, to persist and wait for what they want and direct attention as necessary. Through adult modelling and guidance, they will learn how to look after their bodies, including healthy eating, and manage personal needs independently. Through supported interaction with other children, they learn how to make good friendships, co-operate and resolve conflicts peaceably. These attributes will provide a secure platform from which children can achieve at school and in later life.", 4 | "Physical Development": "##Physical Development##\nPhysical activity is vital in children’s all-round development, enabling them to pursue happy, healthy and active lives7. Gross and fine motor experiences develop incrementally throughout early childhood, starting with sensory explorations and the development of a child’s strength, co-ordination and positional awareness through tummy time, crawling and play movement with both objects and adults. 
By creating games and providing opportunities for play both indoors and outdoors, adults can support children to develop their core strength, stability, balance, spatial awareness, co-ordination and agility. Gross motor skills provide the foundation for developing healthy bodies and social and emotional well-being. Fine motor control and precision helps with hand-eye co-ordination, which is later linked to early literacy. Repeated and varied opportunities to explore and play with small world activities, puzzles, arts and crafts and the practice of using small tools, with feedback and support from adults, allow children to develop proficiency, control and confidence.", 5 | "Literacy": "##Literacy##\nIt is crucial for children to develop a life-long love of reading. Reading consists of two dimensions: language comprehension and word reading. Language comprehension (necessary for both reading and writing) starts from birth. It only develops when adults talk with children about the world around them and the books (stories and non-fiction) they read with them, and enjoy rhymes, poems and songs together. Skilled word reading, taught later, involves both the speedy working out of the pronunciation of unfamiliar printed words (decoding) and the speedy recognition of familiar printed words. Writing involves transcription (spelling and handwriting) and composition (articulating ideas and structuring them in speech, before writing).", 6 | "Mathematics": "##Mathematics##\nDeveloping a strong grounding in number is essential so that all children develop the necessary building blocks to excel mathematically. Children should be able to count confidently, develop a deep understanding of the numbers to 10, the relationships between them and the patterns within those numbers. 
By providing frequent and varied opportunities to build and apply this understanding - such as using manipulatives, including small pebbles and tens frames for organising counting - children will develop a secure base of knowledge and vocabulary from which mastery of mathematics is built. In addition, it is important that the curriculum includes rich opportunities for children to develop their spatial reasoning skills across all areas of mathematics including shape, space and measures. It is important that children develop positive attitudes and interests in mathematics, look for patterns and relationships, spot connections, ‘have a go’, talk to adults and peers about what they notice and not be afraid to make mistakes.", 7 | "Understanding the World": "##Understanding the World##\nUnderstanding the world involves guiding children to make sense of their physical world and their community. The frequency and range of children’s personal experiences increases their knowledge and sense of the world around them – from visiting parks, libraries and museums to meeting important members of society such as police officers, nurses and firefighters. In addition, listening to a broad selection of stories, non-fiction, rhymes and poems will foster their understanding of our culturally, socially, technologically and ecologically diverse world. As well as building important knowledge, this extends their familiarity with words that support understanding across domains. Enriching and widening children’s vocabulary will support later reading comprehension.", 8 | "Expressive Arts and Design": "##Expressive Arts and Design##\nThe development of children’s artistic and cultural awareness supports their imagination and creativity. It is important that children have regular opportunities to engage with the arts, enabling them to explore and play with a wide range of media and materials. 
The quality and variety of what children see, hear and participate in is crucial for developing their understanding, self-expression, vocabulary and ability to communicate through the arts. The frequency, repetition and depth of their experiences are fundamental to their progress in interpreting and appreciating what they hear, respond to and observe." 9 | } 10 | -------------------------------------------------------------------------------- /src/genai/eyfs/eyfs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import logging 4 | import string 5 | 6 | from typing import Dict 7 | from typing import List 8 | from typing import Optional 9 | from typing import Union 10 | 11 | import aiofiles 12 | import openai 13 | 14 | from openai.error import APIConnectionError 15 | from openai.error import APIError 16 | from openai.error import RateLimitError 17 | from openai.error import ServiceUnavailableError 18 | from openai.error import Timeout 19 | from tenacity import before_sleep_log 20 | from tenacity import retry 21 | from tenacity import retry_if_exception_type 22 | from tenacity import stop_after_attempt 23 | from tenacity import wait_exponential 24 | 25 | from genai import MessageTemplate 26 | 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | class TextGenerator: 32 | """Generate tokens using OpenAI's API.""" 33 | 34 | @classmethod 35 | def generate( 36 | cls, 37 | messages: List[Union[str, Dict]], 38 | message_kwargs: Optional[Dict] = None, 39 | model: str = "gpt-3.5-turbo", 40 | temperature: float = 0.0, 41 | **openai_kwargs, 42 | ) -> Union[Dict, str]: 43 | """Generate text using OpenAI's API. 44 | 45 | More details on the API and messages: https://platform.openai.com/docs/guides/gpt/chat-completions-api 46 | 47 | Args: 48 | messages 49 | A list of messages to send to the API. 
They can be: 50 | - dictionaries 51 | - str (JSON file path) 52 | - instances of classes that inherit from BasePromptTemplate 53 | 54 | message_kwargs 55 | A dictionary of keyword arguments to pass to the messages. 56 | 57 | model 58 | The OpenAI model to use. 59 | 60 | temperature 61 | The sampling temperature. 62 | 63 | openai_kwargs 64 | Keyword arguments to pass to the OpenAI API. 65 | 66 | Returns: 67 | A dictionary containing the response from the API. 68 | 69 | """ 70 | if not message_kwargs: 71 | message_kwargs = {} 72 | 73 | messages = [cls.prepare_message(message, **message_kwargs) for message in messages] 74 | 75 | response = cls._call( 76 | messages=messages, 77 | temperature=temperature, 78 | model=model, 79 | **openai_kwargs, 80 | ) 81 | 82 | return response 83 | 84 | @classmethod 85 | def prepare_message(cls, obj: Union[MessageTemplate, dict, str], **kwargs) -> Dict: 86 | """Process a message.""" 87 | if not isinstance(obj, MessageTemplate): 88 | prompt = MessageTemplate.load(obj) 89 | else: 90 | prompt = obj 91 | 92 | prompt.format_message(**kwargs) 93 | 94 | return prompt.to_prompt() 95 | 96 | @staticmethod 97 | @retry( 98 | retry( 99 | reraise=True, 100 | stop=stop_after_attempt(6), 101 | wait=wait_exponential(multiplier=1, min=1, max=60), 102 | retry=( 103 | retry_if_exception_type(Timeout) 104 | | retry_if_exception_type(APIError) 105 | | retry_if_exception_type(APIConnectionError) 106 | | retry_if_exception_type(RateLimitError) 107 | | retry_if_exception_type(ServiceUnavailableError) 108 | ), 109 | before_sleep=before_sleep_log(logger, logging.WARNING), 110 | ) 111 | ) 112 | def _call( 113 | messages: List[Dict], 114 | temperature: float = 0.0, 115 | **kwargs, 116 | ) -> Dict: 117 | response = openai.ChatCompletion.create( 118 | messages=messages, 119 | temperature=temperature, 120 | **kwargs, 121 | ) 122 | 123 | return response # type: ignore 124 | 125 | @staticmethod 126 | def _extract_placeholders(s: str) -> List[str]: 127 | """Extract 
placeholder variables that can be filled in an f-string.""" 128 | formatter = string.Formatter() 129 | return [field_name for _, field_name, _, _ in formatter.parse(s) if field_name is not None] 130 | 131 | 132 | class EYFSClassifier: 133 | """Classify text to EYFS areas of learning.""" 134 | 135 | @classmethod 136 | def generate( 137 | cls, 138 | messages: List[Union[str, Dict]], 139 | message_kwargs: Optional[Dict] = None, 140 | model: str = "gpt-3.5-turbo", 141 | temperature: float = 0.0, 142 | **openai_kwargs, 143 | ) -> Union[Dict, str]: 144 | """Generate text using OpenAI's API. 145 | 146 | More details on the API and messages: https://platform.openai.com/docs/guides/gpt/chat-completions-api 147 | 148 | Args: 149 | messages 150 | A list of messages to send to the API. They can be: 151 | - dictionaries 152 | - str (JSON file path) 153 | - instances of classes that inherit from BasePromptTemplate 154 | 155 | message_kwargs 156 | A dictionary of keyword arguments to pass to the messages. 157 | 158 | model 159 | The OpenAI model to use. 160 | 161 | temperature 162 | The sampling temperature. 163 | 164 | openai_kwargs 165 | Keyword arguments to pass to the OpenAI API. 166 | 167 | Returns: 168 | A dictionary containing the response from the API. 
169 | 170 | """ 171 | if not message_kwargs: 172 | message_kwargs = {} 173 | 174 | messages = [cls.prepare_message(message, **message_kwargs) for message in messages] 175 | 176 | response = cls._call( 177 | messages=messages, 178 | temperature=temperature, 179 | model=model, 180 | **openai_kwargs, 181 | ) 182 | 183 | parsed_response = json.loads(response["choices"][0]["message"]["function_call"]["arguments"]) 184 | if parsed_response: 185 | parsed_response["url"] = message_kwargs["url"] 186 | return parsed_response 187 | 188 | return message_kwargs["url"] 189 | 190 | @classmethod 191 | def prepare_message(cls, obj: Union[MessageTemplate, dict, str], **kwargs) -> Dict: 192 | """Process a message.""" 193 | if not isinstance(obj, MessageTemplate): 194 | prompt = MessageTemplate.load(obj) 195 | else: 196 | prompt = obj 197 | 198 | prompt.format_message(**kwargs) 199 | 200 | return prompt.to_prompt() 201 | 202 | @staticmethod 203 | @retry( 204 | retry( 205 | reraise=True, 206 | stop=stop_after_attempt(6), 207 | wait=wait_exponential(multiplier=1, min=1, max=60), 208 | retry=( 209 | retry_if_exception_type(Timeout) # noqa: W503 210 | | retry_if_exception_type(APIError) # noqa: W503 211 | | retry_if_exception_type(APIConnectionError) # noqa: W503 212 | | retry_if_exception_type(RateLimitError) # noqa: W503 213 | | retry_if_exception_type(ServiceUnavailableError) # noqa: W503 214 | ), 215 | before_sleep=before_sleep_log(logger, logging.WARNING), 216 | ) 217 | ) 218 | def _call( 219 | messages: List[Dict], 220 | model: str = "gpt-3.5-turbo", 221 | temperature: float = 0.0, 222 | **kwargs, 223 | ) -> Dict: 224 | response = openai.ChatCompletion.create( 225 | messages=messages, 226 | model=model, 227 | temperature=temperature, 228 | **kwargs, 229 | ) 230 | 231 | return response # type: ignore 232 | 233 | @classmethod 234 | async def agenerate( 235 | cls, 236 | messages: List[Union[str, Dict]], 237 | message_kwargs: Optional[Dict] = None, 238 | model: str = "gpt-3.5-turbo", 
239 | temperature: float = 0.0, 240 | concurrency: int = 10, 241 | **openai_kwargs, 242 | ) -> Dict: 243 | """Generate text using async OpenAI's API. 244 | 245 | More details on the API and messages: https://platform.openai.com/docs/guides/gpt/chat-completions-api 246 | 247 | Args: 248 | messages 249 | A list of messages to send to the API. They can be: 250 | - dictionaries 251 | - str (JSON file path) 252 | 253 | message_kwargs 254 | A dictionary of keyword arguments to pass to the messages. 255 | 256 | temperature 257 | The sampling temperature. 258 | 259 | openai_kwargs 260 | Keyword arguments to pass to the OpenAI API. 261 | 262 | concurrency: 263 | The number of concurrent requests to make. 264 | 265 | Returns: 266 | A dictionary containing the response from the API. 267 | 268 | """ 269 | semaphore = asyncio.Semaphore(concurrency) 270 | async with semaphore: 271 | if not message_kwargs: 272 | message_kwargs = {} 273 | 274 | messages = [cls.prepare_message(message, **message_kwargs) for message in messages] 275 | 276 | response = await cls._acall( 277 | messages=messages, 278 | temperature=temperature, 279 | model=model, 280 | **openai_kwargs, 281 | ) 282 | 283 | response = response["choices"][0]["message"]["function_call"]["arguments"] 284 | parsed_response = await cls._parse_json(response) 285 | if parsed_response: 286 | parsed_response["url"] = message_kwargs["url"] 287 | return parsed_response 288 | 289 | return message_kwargs["url"] 290 | 291 | @staticmethod 292 | @retry( 293 | retry( 294 | reraise=True, 295 | stop=stop_after_attempt(6), 296 | wait=wait_exponential(multiplier=1, min=1, max=60), 297 | retry=( 298 | retry_if_exception_type(Timeout) # noqa: W503 299 | | retry_if_exception_type(APIError) # noqa: W503 300 | | retry_if_exception_type(APIConnectionError) # noqa: W503 301 | | retry_if_exception_type(RateLimitError) # noqa: W503 302 | | retry_if_exception_type(ServiceUnavailableError) # noqa: W503 303 | ), 304 | 
before_sleep=before_sleep_log(logger, logging.WARNING), 305 | ) 306 | ) 307 | async def _acall( 308 | messages: List[Dict], 309 | model: str = "gpt-3.5-turbo", 310 | temperature: float = 0.0, 311 | **kwargs, 312 | ) -> Dict: 313 | response = await openai.ChatCompletion.acreate( 314 | messages=messages, 315 | model=model, 316 | temperature=temperature, 317 | **kwargs, 318 | ) 319 | 320 | return response # type: ignore 321 | 322 | @staticmethod 323 | async def _try_parse_json(item: str) -> Union[dict, None]: 324 | try: 325 | return json.loads(item) 326 | except json.JSONDecodeError as e: 327 | return e 328 | 329 | @staticmethod 330 | async def _parse_json(item: str) -> Union[dict, None]: 331 | result = await EYFSClassifier._try_parse_json(item) 332 | if isinstance(result, json.JSONDecodeError): 333 | result = await EYFSClassifier._try_parse_json(item.replace("'", '"')) 334 | if isinstance(result, json.JSONDecodeError): 335 | logging.error(f"Invalid JSON: Error: {str(result)}") 336 | return None 337 | return result 338 | 339 | @staticmethod 340 | async def write_line_to_file(item: dict, filename: str) -> None: 341 | """Write the item to a file.""" 342 | file = f"{filename}/invalid_json.txt" 343 | if isinstance(item, dict): 344 | file = f"{filename}/parsed_json.jsonl" 345 | 346 | async with aiofiles.open(file, "a") as f: 347 | await f.write(f"{json.dumps(item)}\n") 348 | 349 | 350 | def get_embedding(text: str, model: str = "text-embedding-ada-002") -> List[float]: 351 | """Encode text with OpenAI's text embedding model.""" 352 | text = text.replace("\n", " ") 353 | return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"] 354 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/choices_function.json: -------------------------------------------------------------------------------- 1 | {"name": "extract_activity_names", "description": "Extract all entities starting with three hashtags.",
"parameters": {"type": "object", "properties": {"activity_names": {"type": "array", "items": {"type": "string", "description": "The name of an activity. It usually starts with ###."}}}, "required": ["activity_names"]}} 2 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/classifier.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Areas of learning###\n{areas_of_learning}\n\n###Instructions###\nCategorise the following text to one or more areas of learning.\n{text}\n"} 2 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/classifier_function.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "predict_area_of_learning", 3 | "description": "Predict the EYFS area of learning for a given text", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "array", 9 | "items": { 10 | "type": "string", 11 | "enum": [ 12 | "Communication and Language", 13 | "Personal, Social and Emotional Development", 14 | "Physical Development", 15 | "Literacy", 16 | "Mathematics", 17 | "Understanding the World", 18 | "Expressive Arts and Design", 19 | "None" 20 | ] 21 | }, 22 | "description": "The predicted areas of learning." 23 | } 24 | }, 25 | "required": [ 26 | "prediction" 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/constraints.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Requirements for the activities###\n1. Your suggestions must be fun and engaging for the children.\n2. Your suggestions must be novel, inspiring and memorable.\n3. You must suggest topics for conversation with the children and questions to ask them.\n4. 
Your proposed activities engage children in the following Areas of Learning: {areas_of_learning}.\n5. You must generate {n_results} activities.\n6. Your proposed activities must be played {location}\n7. Describe the activities in 3-4 sentences." 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/context_and_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Context###The UK's Early Years Foundation Stage framework recommends that educational programmes must involve activities and experiences for children, as set out under each of the areas of learning described below.\n\n##Areas of Learning###\n{areas_of_learning_text}\n\n###Instructions###\nI am an early years educator and I am working with children 3-4 years old. I will describe you a situation in the ###Description### section. Please propose two conversation ideas and several games and crafts ideas I could do with the children to extend their learning.\n\nTypes of activities:\n- Conversations: Asking them questions about the topic\n- Games and Crafts: Puzzles, games, role play, arts and crafts\n\n###Formatting###\nReturn the proposed activities in the following format and always start with the two conversation ideas and then follow with all games and crafts ideas:\n\n## Conversations\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n## Games and Crafts\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n### \n\n**Activity description**:\n\n**Areas of learning**:\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/context_and_task_with_examples.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Context###The UK's Early Years Foundation Stage 
framework recommends that educational programmes must involve activities and experiences for children, as set out under each of the areas of learning described below.\n\n##Areas of Learning###\n{areas_of_learning_text}\n\n###Instructions###\nI am an early years educator and I am working with children 3-4 years old. I will describe you a situation in the ###Description### section. Please propose two conversation ideas and several games and crafts ideas I could do with the children to extend their learning.\n\nTypes of activities:\n- Conversations: Asking them questions about the topic\n- Games and Crafts: Puzzles, games, role play, arts and crafts\n\n###Activity examples###\nTake inspiration from the following examples:\n\n{activity_examples}\n\n###Formatting###\nReturn the proposed activities in the following format and always start with the two conversation ideas and then follow with all games and crafts ideas:\n\n## Conversations\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n## Games and Crafts\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n"} 2 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/situation.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Description###\n{description}\n\n###Activities###\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/system.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "system", 3 | "content": "You are a very creative and highly educated assistant who loves designing early year education programmes." 
def get_bbc_activities(path: str) -> pd.DataFrame:
    """Read and clean the BBC activities file and return a dataframe.

    Renames the age column, drops rows missing ``text`` or ``URL``, and
    removes duplicate URLs so each activity appears exactly once.
    """
    return (
        pd.read_csv(path)
        .rename(columns={"Age Range (if applicable)": "Age"})
        .dropna(subset=["text", "URL"])
        .drop_duplicates(subset=["URL"])
    )
async def main() -> None:
    """Create prompts for path selection and infer paths.

    Reads the BBC activities, batches them in groups of 20, and classifies
    each activity into EYFS areas of learning via concurrent OpenAI
    chat-completion calls. Each result is appended to ``OUTPUT_FILENAME``
    as soon as it completes.
    """
    openai.aiosession.set(ClientSession())

    # Fetch the BBC activities
    activities_df = get_bbc_activities(os.environ["PATH_TO_BBC_ACTIVITIES_DATA"])

    # Fetch the EYFS areas of learning (only the prompt text is needed here;
    # the class keys are used by the function-call schema, not this script)
    _, areas_of_learning_text = get_areas_of_learning(PATH_TO_AREAS_OF_LEARNING)

    print(f"Number of BBC activities: {len(activities_df)}")  # noqa: T001

    message = MessageTemplate.load(PATH_TO_MESSAGE_PROMPT)
    function = FunctionTemplate.load(PATH_TO_FUNCTION)
    model = "gpt-3.5-turbo"
    temperature = 0.6

    try:
        for i, batched_results in enumerate(batch(activities_df, 20)):
            print(f"Batch {i} / {len(activities_df) // 20}")  # noqa: T001
            tasks = [
                EYFSClassifier.agenerate(
                    model=model,
                    temperature=temperature,
                    messages=[message],
                    message_kwargs={
                        "areas_of_learning": areas_of_learning_text,
                        "text": tup.text,
                        "url": tup.URL,
                    },
                    functions=[function.to_prompt()],
                    function_call={"name": "predict_area_of_learning"},
                    max_tokens=100,
                    concurrency=5,
                )
                for tup in batched_results.itertuples()
            ]

            # Consume results as they complete and persist each one immediately
            # so a crash mid-run loses at most the in-flight batch.
            for future in asyncio.as_completed(tasks):
                result = await future  # Get the result (waits if not ready)
                await EYFSClassifier.write_line_to_file(result, OUTPUT_FILENAME)  # Write to the file

            # BUG FIX: the original called time.sleep(2), which blocks the
            # event loop; asyncio.sleep yields control so pending callbacks
            # (e.g. connection cleanup) can run during the pause.
            await asyncio.sleep(2)
    finally:
        # Always close the shared aiohttp session, even if a batch raises.
        await openai.aiosession.get().close()
def get_labelled_bbc_activities(path: str) -> pd.DataFrame:
    """Read and clean the labelled BBC activities file and return a dataframe.

    Parameters
    ----------
    path
        S3 path to a JSONL file with one prediction record per line.

    Returns
    -------
    pd.DataFrame
        One row per labelled activity with ``areas_of_learning`` and ``URL``
        columns; rows without a usable prediction are dropped.
    """
    data = read_jsonl_from_s3(path)
    # PERF FIX: build the frame in a single call instead of concatenating one
    # single-row DataFrame per line (O(n) frame allocations). The index
    # differs (RangeIndex vs repeated 0s) but the caller merges on columns,
    # which rebuilds the index anyway.
    df = pd.DataFrame(data)
    # Normalise falsy predictions ("" / None / empty list) to NaN, then drop them.
    df["prediction"] = df["prediction"].apply(lambda row: row if row else np.nan)
    df = df[~df.prediction.isnull()]
    # "None" is the classifier's explicit no-match label; exclude it too.
    df = df[df.prediction != "None"]
    df = df.rename({"prediction": "areas_of_learning", "url": "URL"}, axis=1)

    return df
get_labelled_bbc_activities(os.environ["PATH_TO_LABELLED_BBC_DATA"]) 53 | bbc = get_bbc_activities(os.environ["PATH_TO_BBC_ACTIVITIES_DATA"]) 54 | 55 | df = labels.merge(bbc[["SHORT DESCRIPTION", "text", "URL", "title"]], how="left", left_on="URL", right_on="URL") 56 | 57 | # Encode the BBC activities' text 58 | df["embedding"] = df["text"].apply(lambda row: get_embedding(row, model=ENCODER_NAME)) 59 | 60 | # Batch items 61 | items = [] 62 | for tup in df.itertuples(): 63 | item = ( 64 | tup.URL, 65 | tup.embedding, 66 | {"areas_of_learning": tup.areas_of_learning, "title": tup.title, "text": tup.text, "source": "BBC"}, 67 | ) 68 | items.append(item) 69 | 70 | # Build the index 71 | conn = PineconeIndex(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_REGION"]) 72 | conn.build_and_upsert( 73 | index_name=INDEX_NAME, 74 | dimension=len(df["embedding"].iloc[0]), 75 | metric="euclidean", 76 | docs=items, 77 | metadata_config={"indexed": ["areas_of_learning", "source", "type_", "age_group"]}, 78 | batch_size=40, 79 | delete_if_exists=True, 80 | ) 81 | 82 | 83 | if "__main__" == __name__: 84 | main() 85 | -------------------------------------------------------------------------------- /src/genai/message_history.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from abc import abstractmethod 3 | from typing import List 4 | 5 | import tiktoken 6 | 7 | from genai import MessageTemplate 8 | 9 | 10 | class TokenCounter: 11 | """Count tokens and buffer messages from memory.""" 12 | 13 | @staticmethod 14 | def _count_tokens_from_string(s: str, model_name: str) -> int: 15 | """Return the number of tokens in a text string.""" 16 | encoding = tiktoken.encoding_for_model(model_name) 17 | num_tokens = len(encoding.encode(s)) 18 | return num_tokens 19 | 20 | @classmethod 21 | def _count_tokens_from_messages(cls, messages: list, model_name: str) -> int: 22 | """Return the number of tokens in a list 
of messages.""" 23 | num_tokens = 0 24 | 25 | # every message follows <|start|>{role/name}\n{content}<|end|>\n 26 | tokens_per_message = 3 27 | tokens_per_name = 1 28 | for message in messages: 29 | num_tokens += tokens_per_message 30 | for k, v in message.items(): 31 | num_tokens += cls._count_tokens_from_string(v, model_name) 32 | if k == "name": # When role = function 33 | num_tokens += tokens_per_name 34 | 35 | num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> 36 | return num_tokens 37 | 38 | @staticmethod 39 | def _forget_messages(num_tokens: int, max_tokens: int) -> bool: 40 | if num_tokens > max_tokens: 41 | return True 42 | return False 43 | 44 | @classmethod 45 | def buffer( 46 | cls, 47 | messages: list, 48 | model_name: str = "gpt-3.5-turbo", 49 | max_tokens: int = 4096, 50 | keep_system_message: bool = True, 51 | ) -> List[dict]: 52 | """Return the number of tokens in a list of messages. 53 | 54 | Parameters 55 | ---------- 56 | model_name 57 | The name of the model to use for token counting. 58 | 59 | max_tokens 60 | The maximum number of tokens to keep in the history. 61 | 62 | keep_system_message 63 | Whether to keep the system message in the history. 64 | 65 | Returns 66 | ------- 67 | messages 68 | List of messages. 
class BaseMessageHistory(ABC):
    """Abstract interface for message-history backends.

    Concrete subclasses decide where messages live (e.g. in memory) and must
    implement all three operations below.
    """

    @abstractmethod
    def add_message(self, message: MessageTemplate) -> None:
        """Append a single message to the history."""

    @abstractmethod
    def get_messages(self) -> List[MessageTemplate]:
        """Return the messages currently stored in the history."""

    @abstractmethod
    def clear_messages(self) -> None:
        """Delete every message from the history."""
@prodigy.recipe(
    "best_answer",
    dataset=("The dataset to save to", "positional", None, str),
    file_path=("Path to the questions and answers file", "positional", None, str),
)
def best_answer(dataset: str, file_path: str) -> Dict:
    """
    Choose the best answer out of the given options.

    Arguments:
        dataset: The dataset to save to.
        file_path: Path to the questions and answers file.

    Returns:
        A dictionary containing the recipe configuration.

    """
    # Load the data up front so it can be shuffled in place
    examples = list(JSONL(file_path))

    def shuffled_choice_tasks(items: List) -> Generator:
        # Shuffle once so annotators see questions in a random order, then
        # emit one Prodigy "choice" task per question.
        random.shuffle(items)
        for item in items:
            yield {
                "html": item["question"],
                "options": [{"id": key, "html": value} for key, value in item["answers"].items()],
            }

    return {
        # Use the choice interface
        "view_id": "choice",
        # Name of the dataset
        "dataset": dataset,
        # The data stream
        "stream": shuffled_choice_tasks(examples),
        "config": {
            # Only allow one choice
            "choice_style": "single",
            "task_description": "Choose the best answer",
            "choice_auto_accept": False,
            # Define which buttons to show
            "buttons": ["accept", "ignore"],
            # Add custom css
            "global_css": GLOBAL_CSS,
            # If feed_overlap is True, the same example can be sent out to multiple users at the same time
            "feed_overlap": True,
            # Port to run the server on
            "port": 8080,
            # Important to set host to 0.0.0.0 when running on ec2
            "host": "0.0.0.0",
            # Setting instant_submit as True means that the user doesn't have to click the "save" button
            "instant_submit": True,
        },
    }
{"question": your-question, "answers": {"human": human-answer, "rag": rag-answer, "gpt4": gpt4-answer}} 9 | 10 | """ 11 | 12 | from itertools import combinations 13 | 14 | import pandas as pd 15 | 16 | 17 | # Path constants 18 | DATA_DIR = "src/genai/parenting_chatbot/prodigy_eval/data/" 19 | QUESTION_FILE = DATA_DIR + "questions.jsonl" 20 | ANSWER_FILE = DATA_DIR + "answers_{}.jsonl" 21 | OUTPUT_FILE = DATA_DIR + "answers.jsonl" 22 | # Define answer types, for loading corresponding answers 23 | ANSWER_TYPES = ["human", "rag", "gpt4"] 24 | # html formatting of prefix and suffix for questions 25 | QUESTION_PREFIX = "Which one is a better answer to this question:\n\n" 26 | QUESTION_SUFFIX = "" 27 | 28 | if __name__ == "__main__": 29 | # Load questions 30 | questions = pd.read_json(QUESTION_FILE, lines=True)["question"].to_list() 31 | # Load answers 32 | answers = [ 33 | pd.read_json(path_or_buf=ANSWER_FILE.format(answer_type), lines=True)[answer_type] 34 | for answer_type in ANSWER_TYPES 35 | ] 36 | 37 | answers_df = ( 38 | # Construct a dataframe with columns: question, human, rag, and gpt4 39 | pd.DataFrame({"question": questions, "human": answers[0], "rag": answers[1], "gpt4": answers[2]}) 40 | # Melt the dataframe for pairwise combinations and rename the resulting column 41 | .melt(id_vars=["question"], value_vars=ANSWER_TYPES).rename(columns={"value": "answer"}) 42 | # Add html formatting to the question 43 | .assign(question=lambda df: QUESTION_PREFIX + df["question"] + QUESTION_SUFFIX) 44 | # Format the answer as a dictionary {answer_type: answer} 45 | .assign(answer=lambda df: df.apply(lambda x: {x["variable"]: x["answer"]}, axis=1)) 46 | ) 47 | 48 | # Generate pairwise combinations of answer types 49 | answer_type_pairs = list(combinations(ANSWER_TYPES, 2)) 50 | 51 | # Aggregate answers based on the pairwise combinations of answer types 52 | dataframes = [] 53 | for answer_type_pair in answer_type_pairs: 54 | subset_df = 
answers_df[answers_df["variable"].isin(answer_type_pair)] 55 | aggregated_df = subset_df.groupby("question").agg(lambda x: x.tolist()).reset_index() 56 | dataframes.append(aggregated_df) 57 | 58 | # Combine the results, merge dictionaries and save the output 59 | ( 60 | pd.concat(dataframes, ignore_index=True) 61 | .assign(answers=lambda df: df.apply(lambda x: {k: v for d in x["answer"] for k, v in d.items()}, axis=1)) 62 | .drop(columns=["variable", "answer"]) 63 | .sort_values(by="question") 64 | .reset_index(drop=True) 65 | .to_json(path_or_buf=OUTPUT_FILE, orient="records", lines=True) 66 | ) 67 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/data/answers_gpt4.jsonl: -------------------------------------------------------------------------------- 1 | {"question":"How to change a nappy?","gpt4":"Changing a nappy can be a bit intimidating at first, but don't worry, you'll get the hang of it quickly. Here's a simple step-by-step guide:\n\n1. Gather your supplies: You'll need a clean nappy, baby wipes or a warm washcloth, and possibly nappy rash cream.\n\n2. Lay your baby down on a clean, safe surface: If you're using a changing table, be sure to use the safety straps.\n\n3. Remove the dirty nappy: Unfasten the tabs on the old nappy. Hold your baby's ankles and lift their bottom off the table, then pull the front of the nappy down. If there's a lot of mess, you can use the front of the nappy to wipe the bulk of it off.\n\n4. Clean your baby: Use baby wipes or a warm washcloth to clean your baby's bottom thoroughly. For a girl, wipe from front to back to avoid infection.\n\n5. Apply cream: If your baby has a rash, apply some nappy rash cream or ointment.\n\n6. Put on the clean nappy: Lift your baby's legs again and slide the clean nappy underneath. The back part with the tabs should go under your baby's bottom, and the front should come up between their legs.\n\n7. 
Fasten the nappy: Pull the front of the nappy up to your baby's belly, and open out the tabs. Make sure it's snug but not too tight - you should be able to fit two fingers between the nappy and your baby's belly.\n\n8. Dispose of the dirty nappy: Roll it up, secure it with the tabs, and put it in a nappy bag or a nappy bin.\n\nAlways remember to wash your hands thoroughly afterwards, and never leave your baby unattended on the changing table."} 2 | {"question":"When to stop breastfeeding?","gpt4":"The timing for stopping breastfeeding can vary greatly from mother to mother and baby to baby. The World Health Organization recommends exclusive breastfeeding up to 6 months of age, with continued breastfeeding along with appropriate complementary foods up to two years of age or beyond. However, the decision to stop breastfeeding is a personal one, and it's about what makes the most sense for you and your child. It's important to consider your baby's nutritional needs, your own health and well-being, and the emotional readiness of both you and your baby to wean. It's always a good idea to discuss this with your pediatrician or a lactation consultant."} 3 | {"question":"How to baby-proof?","gpt4":"Baby-proofing your home is an essential task to ensure your baby's safety as they start to explore. Here are some basic steps to follow:\n\n1. Start on the Floor: Since babies spend most of their time on the floor, start by removing any small objects that the baby could choke on. \n\n2. Secure Furniture: Secure any heavy furniture that could tip over, such as bookcases or TVs, to the wall. \n\n3. Install Gates: Install safety gates at the top and bottom of stairs, and in the doorways of rooms with hazards.\n\n4. Lock Cabinets: Install safety locks on cabinets, especially those containing cleaning supplies, medications, or other hazardous substances.\n\n5. Cover Outlets: Use safety plugs or outlet covers on all unused electrical outlets.\n\n6. 
Install Corner Guards: Use corner guards on furniture with sharp edges.\n\n7. Secure Windows: Install window guards and stoppers to prevent falls.\n\n8. Check Toys: Make sure all toys are safe for your baby's age group and that there are no small parts that could be a choking hazard.\n\n9. Adjust Water Heater: Set your water heater to 120 degrees Fahrenheit to prevent burns.\n\n10. Install Toilet Locks: To prevent drowning, install toilet locks to keep the lid down when not in use.\n\nRemember, baby-proofing should be an ongoing process. As your child grows and starts moving around more, you may need to make additional modifications. Always supervise your baby, even in a baby-proofed home."} 4 | {"question":"How do babies learn?","gpt4":"Babies learn primarily through their senses and by interacting with the people and environment around them. This process begins almost instantly after birth. For example, they learn to recognize their parents' faces and voices very quickly. They also begin to understand cause and effect through simple actions like shaking a rattle and hearing the sound it makes.\n\nAs they grow, their learning continues to be hands-on and experiential. They'll start exploring the world around them by touching, tasting, and playing with everything they can get their hands on. This is why providing a safe and stimulating environment is important.\n\nIn addition, babies learn a lot through imitation. They observe the people around them and try to replicate their actions and expressions. This is how they learn social cues, language, and various skills. It's fascinating to watch a baby's learning process unfold!"} 5 | {"question":"How to store breast milk?","gpt4":"Storing breast milk properly is important for maintaining its quality and safety for your baby. Here are some guidelines:\n\n1. Freshly Pumped\/Expressed Milk: At room temperature (up to 77\u00b0F or 25\u00b0C), it can be kept for 4 to 6 hours. 
In a cooler with ice packs, it can last up to one day. In the refrigerator (39\u00b0F or 4\u00b0C), it can be stored for up to 4 days. \n\n2. Frozen Milk: In a freezer compartment inside a refrigerator, it can last for 2 weeks. In a freezer with its own door (0\u00b0F or -18\u00b0C), it can be kept for 6 to 12 months. \n\n3. Thawed Milk: It can be kept in the refrigerator for 24 hours. Do not refreeze thawed milk.\n\nRemember to store the milk in clean containers, such as hard plastic or glass containers, or special plastic bags designed for milk storage. Don't forget to label each container with the date. Always use the oldest milk first."} 6 | {"question":"How to potty train?","gpt4":"Potty training is a big step for both parents and children. Here are some steps you can follow to make the process easier:\n\n1. **Choose the Right Time**: Begin potty training when your child shows readiness signs, such as staying dry for at least two hours, regular bowel movements, and showing interest in the potty.\n\n2. **Buy the Right Equipment**: Get a child-sized potty or a child's toilet seat and step for the regular toilet. This will make the child more comfortable and less scared.\n\n3. **Explain the Process**: Talk to your child about the toilet process. You can use a children's book about potty training to make it more understandable.\n\n4. **Set a Routine**: Set regular times to sit your child on the potty, such as after meals or before bed. This will help them understand when they need to go.\n\n5. **Praise Success**: Whenever your child uses the potty correctly, praise them and show positive reinforcement. This can be verbal praise, a hug, or even a small reward.\n\n6. **Be Patient**: Accidents will happen. It's important to stay calm and patient, and reassure your child that it's okay. \n\nRemember, every child is unique and may take different lengths of time to get used to potty training. 
It's important to be patient and supportive throughout the process."} 7 | {"question":"How to breastfeed?","gpt4":"Breastfeeding can be a very intimate and rewarding experience for both you and your baby. Here are some steps to help you get started:\n\n1. Find a comfortable position: This can be sitting or lying down. Use pillows to support your arms and back if necessary. You'll want to be comfortable since feedings can take some time.\n\n2. Hold your baby correctly: Your baby should be facing you, with their body in a straight line. You can hold your baby in a cradle hold (across your lap), football hold (at your side with their legs towards your back), or lying down.\n\n3. Latch: Touch your baby's lips with your nipple until they open their mouth wide. Then, quickly bring them onto your breast, not your breast to them. Their mouth should cover not just the nipple but as much of the areola (the darker circle around the nipple) as possible.\n\n4. Check for a good latch: If your baby is latched on properly, their chin and nose will be touching your breast, their lips will be flared out (like a fish), and you'll hear them swallowing. It shouldn't be painful for you. If it is, gently insert your finger into the corner of your baby's mouth to break the suction and try again.\n\n5. Let your baby finish the first breast: This ensures they get the \"hindmilk\", which is richer in fats and calories. Once they slow down or release the nipple, you can offer the other breast.\n\n6. Burp your baby: After feeding, it's important to burp your baby to release any air they swallowed. Hold them upright against your chest or shoulder and gently pat or rub their back until you hear a burp.\n\nRemember, every mother and baby are different, so what works for one may not work for another. It may take some time and practice to get the hang of it. 
Don't hesitate to ask for help from a lactation consultant or your healthcare provider if you're having trouble."} 8 | {"question":"How much to feed baby?","gpt4":"The amount to feed your baby can vary based on their age, weight, and individual needs. However, here's a general guideline:\n\n- Newborns usually take about 1.5 to 3 ounces of breast milk or formula at each feeding for the first few weeks, then it increases to 4 ounces by six weeks. \n- From 1 to 6 months, your baby will likely consume an average of 2.5 to 4 ounces every three to four hours.\n- By 6 months, your baby may be taking 6 to 8 ounces every four to five hours.\n\nRemember, these are just averages. It's important to follow your baby's hunger cues and consult with your pediatrician if you have concerns about your baby's food intake."} 9 | {"question":"How to get baby to sleep?","gpt4":"Establishing a consistent bedtime routine can help your baby understand that it's time to sleep. This can include activities like a warm bath, a bedtime story, or a lullaby. Try to make the environment conducive to sleep by keeping the room dark and at a comfortable temperature. Some babies might also find white noise or a pacifier soothing. Remember, every baby is different, so it might take a bit of trial and error to find out what works best for your little one. It's also important to remember that newborns typically have different sleep schedules than older babies or adults, and it's normal for them to wake frequently during the night."} 10 | {"question":"What is colic?","gpt4":"Colic is a term used to describe severe, often fluctuating pain in the abdomen that is caused by the spasmodic contraction of the gut. When it comes to babies, colic is commonly used to refer to excessive, frequent crying in a baby who appears to be otherwise healthy. This can be extremely distressing for parents. 
The exact cause of colic is unknown but it's thought to be related to indigestion, trapped wind or a temporary gut sensitivity in young babies. The condition usually appears within the first few weeks of life and tends to resolve by the time the baby is four months old, but can last up to 12 months. It's a common condition and is thought to affect up to one in five babies."} 11 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/data/answers_human.jsonl: -------------------------------------------------------------------------------- 1 | {"question": "How to change a nappy", "human": "When arriving home with your newborn, you’re thrown into motherhood, and there are many things to consider. With many conversations around the best nappies to use and how to use them correctly, how, and how often, do you change a nappy? Young babies may need changing as many as ten or twelve times a day, while older babies must be changed around six to eight times. If your baby's nappy is dirty, it’s best to use it to clean off most of the excrement from their bottom. Once this is done, use either the cotton wool and warm water or baby wipes to remove the rest and ensure your baby is clean. A top tip is to clean inside the folds of the skin. If you're using disposable nappies, try not to get any liquid on the sticky tabs, as they won't stick if you do. If you're using cloth nappies, carefully place them in a nappy liner and then fasten the nappy. The nappy then needs to be adjusted to fit around the waist and legs comfortably. We recommend using reusable nappies from birth if you’d like to reduce the impact you’re having on the environment and the number of times you change your baby."} 2 | {"question": "When to stop breastfeeding", "human": "Breastfeeding is a very personal experience for mums, and if this is the chosen route for feeding your baby, it can be difficult to decide when to stop. 
Breastfeeding is recommended for the first six months of your baby's life. While many parents decide to stop breastfeeding before their child turns two, there's no reason why you should not continue breastfeeding your child into their second year. Breastfeeding can provide comfort for your baby when they’re unwell or upset, and it’s common for both the baby and mum to enjoy the benefits of breastfeeding. However, with that being said, mums shouldn’t feel pressured to breastfeed, and you should always do what you’re comfortable with."} 3 | {"question": "How to baby-proof", "human": "Baby-proofing your house is recommended for when your baby begins to explore, often when they’re beginning to crawl. Some baby-proofing precautions you can take include: Fitting safety devices to doors, windows or drawers; Covering sharp edges and corners around your home; Locking any dangerous items away, such as medication; Adding barriers, such as baby gates, to keep your child away from hazards such as stairs. We recommend looking at things from your baby’s perspective when baby-proofing, crawling around your home and seeing things from a lower perspective can help you spot any hazards you’d miss from adult height."} 4 | {"question": "How do babies learn", "human": "Babies constantly learn every day, discovering new abilities and expanding their senses. Play is the main way infants learn to understand their surroundings and communicate with others. During the first month, your baby will learn by interacting with you. The first thing a baby learns is how to use their body language to ensure their needs are met. Sensory details such as your touch, the sound of your voice and the sight of you will allow comfort through all learning experiences."} 5 | {"question": "How to store breast milk", "human": "Mums often choose to express their breast milk and store it for future use. 
This can be useful if you’re going to be away from your baby, if your breasts feel uncomfortably full, your partner is helping to feed the baby or if you simply want to boost your milk supply. You can express milk by hand or with a pump. The best way to store breast milk is in a sterilised container or in special breast milk storage bags. Once contained, you can store the milk in the fridge for up to eight days at 4C or lower. You can store it for two weeks in the ice compartment of a fridge and up to six months in a freezer if it's -18C or lower. Breast milk that's been cooled in the fridge can be carried in a cool bag with ice packs for up to 24 hours. We recommend labelling all breast milk when storing with dates of when it was expressed."} 6 | {"question": "How to potty train", "human": "Potty training your child takes patience, and you must go at your own pace with your little one. To potty train, the main key is communication, introduce your child to the potty and talk to them about nappies. Children learn by watching and copying what they see, so you can use toys to demonstrate how to use the potty. Keeping the potty in plain sight can be helpful, allowing them to sit on it and get used to the environment."} 7 | {"question": "How to breastfeed", "human": "Breastfeeding is an individual experience for every mum, it can take time to get used to. There are different positions you can breastfeed in, so you and your baby should use trial and error and see which works best for you. In any position, you should make sure you’re supporting your baby’s head and neck, allowing them to tilt their head and swallow. Bring your baby close to your breast and allow them to latch themselves, it can help to lean forward. 
Breastfeeding is also optional, and if you choose not to, the method of feeding your baby on formula is very similar."} 8 | {"question": "How much to feed baby", "human": "There is no correct amount to feed your baby, each little one will differ with how regularly they need feeding. Usually, most newborns eat every two to three hours or eight to twelve times every 24 hours. Babies usually drink one to two ounces at each feeding after the first few days of life. Your baby will have hunger cues and will let you know when they’d like feeding; this can be in the form of crying, sticking their tongue out, putting their hand to their mouth regularly, opening their mouth and sucking on other items."} 9 | {"question": "How to get baby to sleep", "human": "Babies can vary in their sleep patterns, especially in the first few months of life. Tips to get your baby to sleep involve encouraging a routine or sleep habits, including a bath, cuddling, singing and quiet music in a dimly lit room. Give your baby time to settle down after comforting, we also recommend putting them down drowsy but still awake, with a comforter such as a teddy bear or a pacifier, allowing them to be soothed to sleep."} 10 | {"question": "What is colic", "human": "Colic is when a baby cries continuously, but there is no obvious cause, it often gets better after three or four months of life. Although all babies cry, babies with colic may cry for three hours at a time but are otherwise healthy. Other signs of colic can include babies clenching their fists, getting red in the face and bringing their knees up to their stomachs. 
Steps to soothe colic include cuddling, sitting your baby upright during feeds, gently rocking your baby or giving them a warm bath."} 11 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/data/questions.jsonl: -------------------------------------------------------------------------------- 1 | {"question": "How to change a nappy?"} 2 | {"question": "When to stop breastfeeding?"} 3 | {"question": "How to baby-proof?"} 4 | {"question": "How do babies learn?"} 5 | {"question": "How to store breast milk?"} 6 | {"question": "How to potty train?"} 7 | {"question": "How to breastfeed?"} 8 | {"question": "How much to feed baby?"} 9 | {"question": "How to get baby to sleep?"} 10 | {"question": "What is colic?"} 11 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/data/system.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "system", 3 | "content": "You are an AI assistant for answering questions about parenting, babies and toddlers.\nProvide a conversational answer.\nIf the question is not about parenting, babies and toddlers, politely inform them that you are tuned to only answer questions about that." 
4 | } 5 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/figures/eval_parenting_chatbot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/parenting_chatbot/prodigy_eval/figures/eval_parenting_chatbot.png -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/figures/prodigy_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/parenting_chatbot/prodigy_eval/figures/prodigy_screenshot.png -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/generate_gpt4_answers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate answers to questions about raising babies using GPT-4 3 | 4 | Usage: Run the script from the repo root directory 5 | $ poetry run python src/genai/parenting_chatbot/prodigy_eval/data/generate_gpt4_answers.py 6 | 7 | The final output file is stored in data/answers_gpt4.jsonl file 8 | following the format: {"question": your-question, "gpt4": gpt4-answer} 9 | 10 | """ 11 | import os 12 | 13 | import dotenv 14 | import openai 15 | import pandas as pd 16 | 17 | from genai import MessageTemplate 18 | from genai.eyfs import TextGenerator 19 | 20 | 21 | DIR = "src/genai/parenting_chatbot/prodigy_eval/data/" 22 | SYSTEM_PROMPT = DIR + "system.json" 23 | QUESTIONS = DIR + "questions.jsonl" 24 | OUTPUT_FILE = DIR + "answers_gpt4.jsonl" 25 | 26 | dotenv.load_dotenv() 27 | openai.api_key = os.environ["OPENAI_API_KEY"] 28 | 29 | if __name__ == "__main__": 30 | # Load the system prompt 31 | 
system_prompt = MessageTemplate.load(SYSTEM_PROMPT) 32 | # Load the questions 33 | questions = pd.read_json(path_or_buf=QUESTIONS, lines=True).question.to_list() 34 | # Generate answers 35 | responses = [] 36 | for question in questions: 37 | prompt = MessageTemplate(role="user", content=question) 38 | response = TextGenerator.generate( 39 | model="gpt-4", 40 | temperature=0.6, 41 | messages=[system_prompt.to_prompt(), prompt.to_prompt()], 42 | message_kwargs=None, 43 | stream=False, 44 | ) 45 | responses.append(response) 46 | # Extract only texts from each response 47 | answer_text = [response["choices"][0]["message"]["content"] for response in responses] 48 | # Write the answers to jsonl file 49 | pd.DataFrame({"question": questions, "gpt4": answer_text}).to_json( 50 | path_or_buf=OUTPUT_FILE, orient="records", lines=True 51 | ) 52 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prompts/filter_refs_function.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "is_this_text_relevant_to_the_question", 3 | "description": "Determine whether the provided text contains information that is DIRECTLY related to the question", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "boolean", 9 | "description": "Determine whether the provided text contains information relevant to the question" 10 | } 11 | }, 12 | "required": [ 13 | "prediction" 14 | ] 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prompts/filter_refs_system.json: -------------------------------------------------------------------------------- 1 | {"role": "system", "content": "You are a smart assistant with proficiency in reading. 
You read the provided text and a question and you determine whether the text is relevant to the question.", "name": null} 2 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prompts/filter_refs_user.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions###\nPlease read the following text and determine whether it is relevant to the question.\n###Text###\n{text}\n###Question###\n{question}", "name": null} 2 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prompts/system.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "system", 3 | "content": "You are an AI assistant for answering questions about parenting, babies and toddlers.\nYou are given the following extracted documents from NHS Start For Life website and a question. Provide a conversational answer.\nIf you are not provided with any documents, say \"I did not get any relevant context for this but I will reply to the best of my knowledge\" and then write your answer\nIf you don't know the answer, just say \"Hmm, I'm not sure. It's better if you search online for this.\" Don't try to make up an answer.\nIf the question is not about parenting, babies and toddlers, politely inform them that you are tuned to only answer questions about that.\n\n", 4 | "name": null 5 | } 6 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/run_nhs_index.py: -------------------------------------------------------------------------------- 1 | """Build a pinecone index with the NHS Start for Life data. 2 | 3 | Note: This script indexes individual paragraphs; see run_nhs_index_full_page.py for the full-page variant.
4 | """ 5 | 6 | import os 7 | import uuid 8 | 9 | import openai 10 | import pandas as pd 11 | 12 | from dotenv import load_dotenv 13 | 14 | from genai.eyfs import get_embedding 15 | from genai.vector_index import PineconeIndex 16 | 17 | 18 | load_dotenv() 19 | 20 | 21 | openai.api_key = os.environ["OPENAI_API_KEY"] 22 | INDEX_NAME = "eyfs-index" 23 | ENCODER_NAME = "text-embedding-ada-002" 24 | 25 | if "__main__" == __name__: 26 | df = pd.read_csv(os.environ["PATH_TO_NHS_DATA"]) 27 | df = df.drop_duplicates(subset=["header", "content", "content_no"], keep="last") 28 | 29 | # Format the data to what pinecone needs and generate a temp uuid 30 | docs = [] 31 | for tup in df.itertuples(): 32 | doc = tuple( 33 | ( 34 | str(uuid.uuid4()), 35 | get_embedding(tup.content), 36 | { 37 | "source": "nhs", 38 | "text": tup.content, 39 | "header": tup.header, 40 | "content_no": tup.content_no, 41 | "url": "".join(["https://www.nhs.uk", tup.URL]), 42 | }, 43 | ) 44 | ) 45 | docs.append(doc) 46 | 47 | # Build the index 48 | conn = PineconeIndex(api_key=os.environ["PINECONE_API_KEY"], environment="us-west1-gcp") 49 | 50 | conn.build_and_upsert( 51 | index_name=INDEX_NAME, 52 | dimension=1536, 53 | metric="euclidean", 54 | docs=docs, 55 | metadata_config={"indexed": ["areas_of_learning", "source", "type_", "age_group"]}, 56 | batch_size=80, 57 | ) 58 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/run_nhs_index_full_page.py: -------------------------------------------------------------------------------- 1 | """Build a pinecone index with the NHS Start for Life data.""" 2 | 3 | import os 4 | import uuid 5 | 6 | import openai 7 | import pandas as pd 8 | 9 | from dotenv import load_dotenv 10 | 11 | from genai.eyfs import get_embedding 12 | from genai.vector_index import PineconeIndex 13 | 14 | 15 | load_dotenv() 16 | 17 | 18 | openai.api_key = os.environ["OPENAI_API_KEY"] 19 | INDEX_NAME = "eyfs-index" 20 | ENCODER_NAME = 
"text-embedding-ada-002" 21 | 22 | if "__main__" == __name__: 23 | df = pd.read_csv(os.environ["PATH_TO_NHS_DATA"]) 24 | df = df.groupby("URL").apply(lambda group: "\n\n".join(group["header"] + "\n" + group["content"])) 25 | df = df.reset_index() 26 | df.columns = ["URL", "content"] 27 | 28 | # Format the data to what pinecone needs and generate a temp uuid 29 | docs = [] 30 | for tup in df.itertuples(): 31 | doc = tuple( 32 | ( 33 | str(uuid.uuid4()), 34 | get_embedding(tup.content), 35 | { 36 | "source": "nhs_full_page", 37 | "text": tup.content, 38 | "url": "".join(["https://www.nhs.uk", tup.URL]), 39 | }, 40 | ) 41 | ) 42 | docs.append(doc) 43 | 44 | # Build the index 45 | conn = PineconeIndex(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_REGION"]) 46 | 47 | conn.build_and_upsert( 48 | index_name=INDEX_NAME, 49 | dimension=1536, 50 | metric="euclidean", 51 | docs=docs, 52 | metadata_config={"indexed": ["areas_of_learning", "source", "type_", "age_group"]}, 53 | batch_size=40, 54 | ) 55 | -------------------------------------------------------------------------------- /src/genai/prompt_template.py: -------------------------------------------------------------------------------- 1 | import json 2 | import string 3 | 4 | from abc import ABC 5 | from abc import abstractmethod 6 | from dataclasses import asdict 7 | from dataclasses import dataclass 8 | from dataclasses import field 9 | from typing import Dict 10 | from typing import List 11 | from typing import Optional 12 | from typing import Union 13 | 14 | 15 | @dataclass 16 | class BasePromptTemplate(ABC): 17 | """Base template prompts flexibly.""" 18 | 19 | initial_template: Dict[str, str] = field(default_factory=dict, init=False) 20 | 21 | def __post_init__(self) -> None: 22 | """Keep the initial template.""" 23 | self.initial_template = self._initialize_template() 24 | 25 | @abstractmethod 26 | def _initialize_template(self) -> None: 27 | """To be implemented by child classes""" 28 
| pass 29 | 30 | @staticmethod 31 | @abstractmethod 32 | def _from_dict(data: Dict) -> None: 33 | """Create a Template instance from a dictionary.""" 34 | pass 35 | 36 | def format_message(self, **kwargs) -> None: 37 | """Process a message and fill in any placeholders.""" 38 | 39 | def recursive_format(value: Union[str, dict]) -> Union[str, dict]: 40 | if isinstance(value, str): 41 | placeholders = self._extract_placeholders(value) 42 | if placeholders: 43 | return value.format(**kwargs) 44 | return value 45 | elif isinstance(value, dict): 46 | return {k: recursive_format(v) for k, v in value.items()} 47 | else: 48 | return value 49 | 50 | for k in self.__dict__.keys(): 51 | if k != "initial_template": 52 | self.__dict__[k] = recursive_format(self.initial_template[k]) 53 | 54 | @classmethod 55 | def load(cls, obj: Union[Dict, str]) -> "BasePromptTemplate": 56 | """Load a Template instance from a JSON file or a dictionary.""" 57 | if isinstance(obj, str): 58 | return cls._from_json(obj) 59 | elif isinstance(obj, Dict): 60 | return cls._from_dict(obj) 61 | else: 62 | raise TypeError(f"Expected a JSON file path or a dictionary, got {type(obj)}.") 63 | 64 | @staticmethod 65 | def _exclude_keys( 66 | d: dict, 67 | exclude: Optional[List[str]] = None, # noqa: B006 68 | ) -> dict: 69 | """Exclude keys from a dictionary.""" 70 | if not d["name"]: 71 | d.pop("name", None) 72 | 73 | if exclude: 74 | for item in exclude: 75 | d.pop(item, None) 76 | return d 77 | return d 78 | 79 | def to_prompt( 80 | self, 81 | exclude: Optional[List[str]] = ["initial_template"], # noqa: B006 82 | ) -> Dict: 83 | """Convert a Template instance to a JSON string.""" 84 | d = asdict(self) 85 | return self._exclude_keys(d, exclude=exclude) 86 | 87 | @staticmethod 88 | def _extract_placeholders(s: str) -> List[str]: 89 | """Extract placeholder variables that can be filled in an f-string.""" 90 | formatter = string.Formatter() 91 | return [field_name for _, field_name, _, _ in formatter.parse(s) if 
field_name is not None] 92 | 93 | @classmethod 94 | def _from_json(cls, json_path: str) -> "BasePromptTemplate": 95 | """Create a Template instance by providing a JSON path.""" 96 | return cls._from_dict(cls._read_json(json_path)) 97 | 98 | @staticmethod 99 | def _read_json(json_path: str) -> Dict: 100 | """Read a JSON file.""" 101 | with open(json_path, "r") as f: 102 | return json.load(f) 103 | 104 | def to_json(self, path: str) -> None: 105 | """Convert a Template instance to a JSON string.""" 106 | self._write_json(self.initial_template, path) 107 | 108 | def _write_json(self, data: Dict, path: str) -> None: 109 | """Write a JSON file.""" 110 | with open(path, "w") as f: 111 | json.dump(data, f) 112 | 113 | 114 | @dataclass 115 | class MessageTemplate(BasePromptTemplate): 116 | """Create a template for a message prompt.""" 117 | 118 | role: str 119 | content: str 120 | name: Optional[str] = None 121 | 122 | def __post_init__(self) -> None: 123 | """Keep the initial template and error when the role is function but not name was given.""" 124 | super().__post_init__() 125 | if self.role == "function" and not self.name: 126 | raise ValueError("The 'name' attribute is required when 'role' is 'function'.") 127 | 128 | def _initialize_template(self) -> Dict[str, str]: 129 | return {"role": self.role, "content": self.content, "name": self.name} 130 | 131 | @staticmethod 132 | def _from_dict(data: Dict) -> "MessageTemplate": 133 | instance = MessageTemplate(**data) 134 | # Validate after initialisation 135 | if instance.role == "function" and not instance.name: 136 | raise ValueError("The 'name' attribute is required when 'role' is 'function'.") 137 | return instance 138 | 139 | 140 | @dataclass 141 | class FunctionTemplate(BasePromptTemplate): 142 | """Create a template for an OpenAI function.""" 143 | 144 | name: str 145 | description: str 146 | parameters: Dict[str, Union[str, Dict[str, Dict[str, Union[str, List[str]]]], List[str]]] 147 | 148 | def 
_initialize_template(self) -> Dict[str, Union[str, Dict[str, Dict[str, Union[str, List[str]]]], List[str]]]: 149 | return { 150 | "name": self.name, 151 | "description": self.description, 152 | "parameters": self.parameters, 153 | } 154 | 155 | @staticmethod 156 | def _from_dict(data: Dict) -> "FunctionTemplate": 157 | """Create a Template instance from a dictionary.""" 158 | return FunctionTemplate(**data) 159 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/00_system.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "system", "content": "###Instructions###\nYou are a helpful, kind, intelligent and polite futurist. You work for the United Kingdom's innovation agency Nesta, and your task is to engage the user about the future signals and trends that Nesta has researched, by helping the user imagine and appreciate how the signals will impact their life. You will personalise the user experience by taking the information provided by the user and tailoring your explanation to the user background. You are also expert in equity, diversity and inclusion and your answers will be inclusive, the answers will never be based on negative stereotypes, you will never offend and you will exercise sensitivity about topics such as ethnicity and gender. Here are the future signals that you can talk about: {signals}. Do not discuss other future signals as this is not part of this year's Nesta's Signals edition."} 2 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/01_intro.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "assistant", "content": "Hi, I’m Scout, [Discovery Hub’s](https://www.nesta.org.uk/discovery/) experimental AI assistant which helps people explore and interpret signals about the future. 
✨"} 2 | {"role": "assistant", "content": "This year we have collected signals about a variety of topics, from 🏥 virtual hospital wards and 🧠 neuroprivacy to ⚡ space solar power and 🧪 data poisoning."} 3 | {"role": "assistant", "content": "Tell me one or two things about you and your interests, so that I can suggest which future signals might be the most relevant to you! You can always refresh your browser to restart the conversation."} 4 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/02_signal_impact.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "Start your answer by summarising the signal in one clear, concise sentence and then follow by explaining in another clear sentence how the selected future signal might be relevant to the user, given the user information and conversation history. Then describe three ways how the selected future signal might impact them. Keep these descriptions short, two-three sentences at most. Finish your answer by encouraging the user to ask questions about this signal (note that you will try your best to answer them) or suggest to ask about the other future signals. Remember that you must be patient and never offend or be aggressive. \n\n###Future signal###{signal}\n\n###User information### Here is what the user told you about themselves: {user_input}.\n\n###Answer###" 2 | } 3 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/03_signal_choice.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "Start your answer by explaining each of the signals in one clear sentence (use similar language to the signals descriptions). If possible, indicate how a signal might be relevant to the user, given the user information and conversation history. 
Finish your answer by asking the user to choose one of the signals to hear more about it. Remember that you must be patient and never offend or be aggressive. \n\n###Future signals###{signals}\n\n###User information### Here is what the user told you about themselves: {user_input}.\n\n###Answer###" 2 | } 3 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/04_follow_up.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "Answer to the user's most recent message. Be as concise or detailed as necessary. Use the information from the future signal description when relevant. Keep your answers conversational and three to four sentences long at most. \n\n###Future signal###{signal}\n\n###User information### Here is what the user told you: {user_input}.\n\n###Answer###" 2 | } 3 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/func_intent.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "predict_intent", 3 | "description": "Predict what is the user's intent", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "string", 9 | "enum": ["new_signal", "more_signals", "following_up"], 10 | "description": "The predicted intent" 11 | } 12 | }, 13 | "required": ["prediction"] 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/func_top_signal.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "predict_top_signal", 3 | "description": "Predict which one of the signal is the most relevant to user input", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "string", 9 | "enum": [], 10 | "description": "The predicted most 
relevant signal" 11 | } 12 | }, 13 | "required": ["prediction"] 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/func_top_three_signals.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "predict_top_signals", 3 | "description": "Predict which three signals are the most relevant to user input", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "array", 9 | "items": { 10 | "type": "string", 11 | "enum": [] 12 | }, 13 | "description": "The predicted most relevant signals" 14 | } 15 | }, 16 | "required": ["prediction"] 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/intent_actions.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "new_signal", 4 | "description": "User wishes to change the topic and talk about an different future signal. Alternatively, the user has been just presented with a set of future signal options by the assistant, and the user has now chosen which signal to talk about more." 5 | }, 6 | { 7 | "name": "more_signals", 8 | "description": "User has asked to hear more about other future signals" 9 | }, 10 | { 11 | "name": "following_up", 12 | "description": "User is following up with another question about the signal that's being discussed just now." 13 | } 14 | ] 15 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/prompt_intent.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "You are a helpful chatbot talking with the user about the articles of future signals that we wrote. 
###Instructions### Predict the intended action of the user, what the user wishes you to carry out based on the conversation history. Pay attention to the most recent messages\n\n###Possible intents###\n{intents}\n\n"} 2 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/prompt_top_signal.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions### Predict which of the following future signals is the most relevant to user input. You have to choose one of these signals. \n\n###Future signal summaries###\n{signals}\n\n###User input:\n{user_input}"} 2 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/prompt_top_three_signals.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions### Predict which three of the following future signals are the most relevant to user input. You have to choose three of these signals. 
\n\n###Future signal summaries###\n{signals}\n\n###User input:\n{user_input}"} 2 | -------------------------------------------------------------------------------- /src/genai/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | 5 | setup( 6 | name="genai", 7 | version="0.1", 8 | packages=find_packages(), 9 | ) 10 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/__init__.py: -------------------------------------------------------------------------------- 1 | from .dm_page import eyfs_dm_kb 2 | from .eli3_page import eli3 3 | from .eyfs_compare_page import eyfs_compare 4 | from .eyfs_kb_bbc_page import eyfs_kb_bbc 5 | from .eyfs_page import early_year_activity_plan 6 | from .parenting_page import parenting_chatbot 7 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/dm_page.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import streamlit as st 4 | 5 | from genai import MessageTemplate 6 | from genai.eyfs import TextGenerator 7 | from genai.eyfs import get_embedding 8 | from genai.streamlit_pages.utils import get_index 9 | from genai.streamlit_pages.utils import query_pinecone 10 | from genai.streamlit_pages.utils import reset_state 11 | from genai.streamlit_pages.utils import sample_docs 12 | from genai.utils import read_json 13 | 14 | 15 | def eyfs_dm_kb(index_name: str = "eyfs-index-1718673592-index", sidebar: bool = True) -> None: 16 | """Run the Development Matters app.""" 17 | 18 | # Define your custom CSS 19 | custom_css = """ 20 | 31 | """ 32 | 33 | # Apply the custom CSS 34 | st.markdown(custom_css, unsafe_allow_html=True) 35 | 36 | st.title("Generate activities anchored to the Development Matters guidance") 37 | areas_of_learning_desc = 
read_json("src/genai/eyfs/areas_of_learning.json") 38 | aol = list(areas_of_learning_desc.keys()) 39 | index = get_index(index_name=index_name) 40 | 41 | if "examples" not in st.session_state: 42 | st.session_state["examples"] = "" 43 | 44 | if "learning_goals" not in st.session_state: 45 | st.session_state["learning_goals"] = "" 46 | 47 | if "full_response" not in st.session_state: 48 | st.session_state["full_response"] = None 49 | 50 | message = MessageTemplate.load("src/genai/dm/prompts/dm_prompt_2.json") 51 | 52 | if sidebar: 53 | with st.sidebar: 54 | selected_model, temperature, n_examples = _sidebar() 55 | else: 56 | selected_model = "gpt-4" 57 | temperature = 0.6 58 | n_examples = 5 59 | 60 | choice = st.radio( 61 | label="**Select a learning goal**", 62 | options=["Pick a predefined learning goal", "Describe a learning goal"], 63 | on_change=reset_state, 64 | ) 65 | 66 | age_groups = st.selectbox( 67 | label="**Age group (in years)**", 68 | options=["0-3", "3-4", "4-5"], 69 | index=2, 70 | on_change=reset_state, 71 | ) 72 | 73 | if choice == "Pick a predefined learning goal": 74 | # Select the areas of learning 75 | areas_of_learning = st.selectbox( 76 | label="**Areas of learning**", 77 | options=aol, 78 | index=1, 79 | on_change=reset_state, 80 | ) 81 | 82 | if areas_of_learning and age_groups: 83 | predefined_learning_goals = get_data( 84 | path="src/genai/dm/dm.json", 85 | type_="learning_goals", 86 | areas_of_learning=[areas_of_learning], 87 | age_groups=[age_groups], 88 | ) 89 | # Hack till we bring in the rest of the AOL 90 | try: 91 | learning_goals = st.multiselect( 92 | label="**Predefined Learning Goals**", 93 | options=predefined_learning_goals, 94 | default=predefined_learning_goals[0], 95 | on_change=reset_state, 96 | ) 97 | except IndexError: 98 | st.write( 99 | """**ERROR**: You can only pick one of the following Areas of Learning for now: 100 | - `Communication and Language` 101 | - `Mathematics` 102 | - `Personal, Social and Emotional 
Development` """ 103 | ) 104 | 105 | if st.button("**Search for activity examples**"): 106 | results = [] 107 | for learning_goal in learning_goals: 108 | search_results = query_pinecone( 109 | index=index, 110 | encoded_query=get_embedding(learning_goal), 111 | filters={ 112 | "areas_of_learning": {"$in": [areas_of_learning]}, 113 | "source": {"$eq": "dm"}, 114 | "age_group": {"$in": [age_groups]}, 115 | "type_": {"$eq": "examples"}, 116 | }, 117 | top_n=n_examples, 118 | ) 119 | results.extend(search_results) 120 | 121 | results = list(set([result["metadata"]["text"] for result in results])) 122 | idx = sample_docs(num_docs=len(results), n=n_examples) 123 | results = [results[i] for i in idx] 124 | st.session_state["examples"] = "\n\n".join(results) 125 | 126 | elif choice == "Describe a learning goal": 127 | if age_groups: 128 | text_input = st.text_input(label="**Describe a learning goal**") 129 | if st.button("Search for learning goals"): 130 | results = query_pinecone( 131 | index=index, 132 | encoded_query=get_embedding(text_input), 133 | filters={ 134 | "source": {"$eq": "dm"}, 135 | "age_group": {"$in": [age_groups]}, 136 | "type_": {"$eq": "learning_goals"}, 137 | }, 138 | top_n=n_examples, 139 | ) 140 | 141 | results = list(set([result["metadata"]["text"] for result in results])) 142 | idx = sample_docs(num_docs=len(results), n=n_examples) 143 | results = [results[i] for i in idx] 144 | st.session_state["learning_goals"] = "\n\n".join(results) 145 | 146 | if st.session_state["learning_goals"]: 147 | st.write("## Development Matters guidance: Learning Goals") 148 | with st.expander("**Learning Goals**", expanded=True): 149 | for result in st.session_state["learning_goals"].split("\n\n"): 150 | st.write(f"- {result}\n") 151 | 152 | results = [] 153 | for learning_goal in st.session_state["learning_goals"].split("\n\n"): 154 | search_results = query_pinecone( 155 | index=index, 156 | encoded_query=get_embedding(learning_goal), 157 | filters={ 158 | 
"source": {"$eq": "dm"}, 159 | "age_group": {"$in": [age_groups]}, 160 | "type_": {"$eq": "examples"}, 161 | }, 162 | top_n=n_examples, 163 | ) 164 | results.extend(search_results) 165 | areas_of_learning = [result["metadata"]["areas_of_learning"] for result in results] 166 | results = list(set([result["metadata"]["text"] for result in results])) 167 | idx = sample_docs(num_docs=len(results), n=n_examples) 168 | results = [results[i] for i in idx] 169 | st.session_state["examples"] = "\n\n".join(results) 170 | 171 | if st.session_state["examples"]: 172 | st.write("## Development Matters guidance: Examples") 173 | with st.expander("**Examples**", expanded=True): 174 | for result in st.session_state["examples"].split("\n\n"): 175 | st.write(f"- {result}\n") 176 | 177 | text_input = st.text_input(label="**Describe a theme for the activity**") 178 | if st.button("Generate activities"): 179 | st.session_state["full_response"] = llm_call( 180 | selected_model=selected_model, 181 | temperature=temperature, 182 | message=message, 183 | messages_placeholders={ 184 | "description": text_input, 185 | "areas_of_learning": areas_of_learning, 186 | "examples": st.session_state["examples"], 187 | "age_groups": age_groups, 188 | }, 189 | ) 190 | 191 | if st.session_state["examples"] and st.session_state["full_response"]: 192 | with st.expander("**Suggested activities**"): 193 | st.write(st.session_state["full_response"]) 194 | text_input = st.text_input( 195 | "**Ask a follow-up question**", 196 | help="You can ask for details on how to play one of the games!", 197 | ) 198 | if text_input: 199 | _ = llm_call( 200 | selected_model=selected_model, 201 | temperature=temperature, 202 | message=MessageTemplate( 203 | role="user", content="###Activities###\n{activities} ###Task###\n{text_input}" 204 | ), 205 | messages_placeholders={ 206 | "text_input": text_input, 207 | "activities": st.session_state["full_response"], 208 | }, 209 | ) 210 | 211 | 212 | def get_data(path: str, type_: 
str, areas_of_learning: List[str], age_groups: List[str]) -> List[str]: 213 | """Get Learning Goals or Examples based on the selected areas of learning and age groups.""" 214 | data = read_json(path) 215 | # Temp hack to exclude the template 216 | data = [d for d in data if d["area_of_learning"] != ""] 217 | 218 | predefined_learning_goals = [] 219 | for elem in data: 220 | aol = elem["area_of_learning"] 221 | d = elem["age_group"] 222 | for age, age_dict in d.items(): 223 | for k, items in age_dict.items(): 224 | if k == type_ and aol in areas_of_learning and age in age_groups: 225 | for item in items: 226 | predefined_learning_goals.append(item) 227 | 228 | return predefined_learning_goals 229 | 230 | 231 | def _sidebar() -> tuple: 232 | """Select a model, temperature and number of results.""" 233 | selected_model = st.radio( 234 | label="**OpenAI model**", 235 | options=["gpt-3.5-turbo", "gpt-4"], 236 | index=1, 237 | on_change=reset_state, 238 | ) 239 | 240 | temperature = st.slider( 241 | label="**Temperature**", 242 | min_value=0.0, 243 | max_value=2.0, 244 | value=0.6, 245 | step=0.1, 246 | on_change=reset_state, 247 | ) 248 | 249 | n_examples = st.slider( 250 | label="**Examples**", 251 | help="Number of search results. 
Those are added to the prompt.", 252 | min_value=1, 253 | max_value=10, 254 | value=5, 255 | step=1, 256 | on_change=reset_state, 257 | ) 258 | 259 | st.button("Reset chat", on_click=reset_state, type="primary", help="Reset the chat history") 260 | 261 | return selected_model, temperature, n_examples 262 | 263 | 264 | def llm_call(selected_model: str, temperature: float, message: MessageTemplate, messages_placeholders: dict) -> str: 265 | """Call the LLM.""" 266 | message_placeholder = st.empty() 267 | full_response = "" 268 | for response in TextGenerator.generate( 269 | model=selected_model, 270 | temperature=temperature, 271 | messages=[message], 272 | message_kwargs=messages_placeholders, 273 | stream=True, 274 | ): 275 | full_response += response.choices[0].delta.get("content", "") 276 | message_placeholder.markdown(full_response + "▌") 277 | 278 | message_placeholder.markdown(full_response) 279 | 280 | return full_response 281 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/eli3_page.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | from genai import MessageTemplate 4 | from genai.eyfs import TextGenerator 5 | from genai.streamlit_pages.utils import reset_state 6 | 7 | 8 | def eli3(sidebar: bool = True) -> None: 9 | """Explain me a concept like I'm 3.""" 10 | 11 | # Define your custom CSS 12 | custom_css = """ 13 | 24 | """ 25 | 26 | # Apply the custom CSS 27 | st.markdown(custom_css, unsafe_allow_html=True) 28 | 29 | st.title("Explain-Like-I'm-3") 30 | 31 | # Create the generator 32 | if sidebar: 33 | with st.sidebar: 34 | selected_model = st.radio( 35 | label="**OpenAI model**", 36 | options=["gpt-3.5-turbo", "gpt-4"], 37 | on_change=reset_state, 38 | ) 39 | temperature = st.slider( 40 | label="**Temperature**", 41 | min_value=0.0, 42 | max_value=2.0, 43 | value=0.6, 44 | step=0.1, 45 | on_change=reset_state, 46 | ) 47 | 48 | 
st.button("Reset chat", on_click=reset_state, type="primary", help="Reset the chat history") 49 | else: 50 | selected_model = "gpt-4" 51 | temperature = 0.6 52 | 53 | prompt_template = MessageTemplate.load("src/genai/eli3/prompts/eli3_chat_2.json") 54 | 55 | # Initialize chat history 56 | if "messages" not in st.session_state: 57 | st.session_state.messages = [{"role": prompt_template.role, "content": prompt_template.content}] 58 | 59 | # Display chat messages from history on app rerun. 60 | # The first message is the prompt, so we skip it. 61 | for message in st.session_state.messages[1:]: 62 | with st.chat_message(message["role"]): 63 | st.markdown(message["content"]) 64 | 65 | # Accept user input 66 | prompt = st.chat_input("How do whales breathe?") 67 | if prompt: 68 | # Display user message in chat message container 69 | with st.chat_message("user"): 70 | st.markdown(prompt) 71 | # Add user message to chat history 72 | st.session_state.messages.append({"role": "user", "content": prompt}) 73 | 74 | with st.chat_message("assistant"): 75 | message_placeholder = st.empty() 76 | full_response = "" 77 | for response in TextGenerator.generate( 78 | model=selected_model, 79 | temperature=temperature, 80 | messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], 81 | message_kwargs=None, 82 | stream=True, 83 | ): 84 | full_response += response.choices[0].delta.get("content", "") 85 | message_placeholder.markdown(full_response + "▌") 86 | message_placeholder.markdown(full_response) 87 | st.session_state.messages.append({"role": "assistant", "content": full_response}) 88 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/eyfs_compare_page.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | from genai.eyfs import TextGenerator 4 | from genai.eyfs import get_embedding 5 | from 
def eyfs_compare() -> None:
    """Compare the EYFS prototypes.

    Renders two side-by-side activity generators for the same user query:
    the plain EYFS prompt (left column) and the EYFS prompt augmented with
    BBC activity examples retrieved from a Pinecone index (right column).
    Both model responses are streamed into the page; the right column also
    lists the BBC sources it used.
    """
    st.title("Compare the EYFS prototypes")
    # Pinecone index holding the embedded BBC activities.
    index = get_index(index_name="eyfs-index-1718673592-index")
    areas_of_learning_desc = read_json("src/genai/eyfs/areas_of_learning.json")
    aol = list(areas_of_learning_desc.keys())

    with st.sidebar:
        # Select a model, temperature and number of results
        selected_model = st.radio(label="**OpenAI model**", options=["gpt-3.5-turbo", "gpt-4"], index=1)
        n_results = 10  # fills the {n_results} placeholder in the prompt templates
        temperature = st.slider(label="**Temperature**", min_value=0.0, max_value=2.0, value=0.6, step=0.1)

    # Select the areas of learning
    areas_of_learning = st.multiselect(label="**Areas of learning**", options=aol, default=aol)
    areas_of_learning_text = [v for k, v in areas_of_learning_desc.items() if k in areas_of_learning]

    # Describe each Area of Learning in an expanding window
    with st.expander("**Areas of Learning Description**"):
        for k, v in areas_of_learning_desc.items():
            if k in areas_of_learning:
                st.write(f"#### {k}")
                st.write(v.split("##")[-1])

    # Collapse the selected AoL descriptions into one prompt-ready string.
    areas_of_learning_text = "\n\n".join(areas_of_learning_text)
    location = st.selectbox(label="**Location**", options=["Indoor", "Outdoor", "Indoor or Outdoor"], index=2)

    # Get the user input
    query = st.text_input(
        label="**What's the topic you want activities for?**",
        value="Let's create activities educating children on how whales breathe",
        help="Prompt the large language model with a some text and it will generate an activity plan for you.",
    )
    if st.button(label="**Generate**", help="Generate an answer."):
        eyfs_prototype, eyfs_bbc_prototype = st.columns(2)
        eyfs_prototype.subheader("EYFS-related activities")
        eyfs_bbc_prototype.subheader("EYFS-related activities with ext KB (BBC)")
        # Left column: plain EYFS prompt, no retrieved examples.
        with eyfs_prototype:
            with st.spinner("Generating activities..."):
                res_box = st.empty()
                report = []
                # Create the messages
                paths = [
                    "src/genai/eyfs/prompts/system.json",
                    "src/genai/eyfs/prompts/context_and_task.json",
                    "src/genai/eyfs/prompts/constraints.json",
                    "src/genai/eyfs/prompts/situation.json",
                ]

                messages = [read_json(path) for path in paths]
                messages_placeholders = {
                    "description": query,
                    "areas_of_learning": areas_of_learning,
                    "n_results": n_results,
                    "location": location,
                    "areas_of_learning_text": areas_of_learning_text,
                }

                r = TextGenerator.generate(
                    model=selected_model,
                    temperature=temperature,
                    messages=messages,
                    message_kwargs=messages_placeholders,
                    stream=True,
                )

                # NOTE(review): a streamed chunk whose delta has no "content"
                # appends None; "".join would raise if such a chunk arrived
                # before the final ("stop") one — confirm against the OpenAI
                # streaming format in use.
                for chunk in r:
                    content = chunk["choices"][0].get("delta", {}).get("content")
                    report.append(content)
                    if chunk["choices"][0]["finish_reason"] != "stop":
                        result = "".join(report).strip()
                        res_box.markdown(f"{result}")

        # Right column: same prompt family, plus retrieved BBC examples.
        with eyfs_bbc_prototype:
            # Create the messages
            paths = [
                "src/genai/eyfs/prompts/system.json",
                "src/genai/eyfs/prompts/context_and_task_with_examples.json",
                "src/genai/eyfs/prompts/constraints.json",
                "src/genai/eyfs/prompts/situation.json",
            ]

            messages = [read_json(path) for path in paths]
            with st.spinner("Searching for relevant BBC activities..."):
                # Encode the query
                encoded_query = get_embedding(query)

                # Search with Pinecone
                similar_docs = query_pinecone(
                    index,
                    encoded_query,
                    areas_of_learning=areas_of_learning,
                    top_n=4,
                    max_n=4,
                )

            with st.spinner("Generating activities..."):
                res_box = st.empty()
                report = []
                # Create the prompt
                messages_placeholders = {
                    "description": query,
                    "areas_of_learning": areas_of_learning,
                    "n_results": n_results,
                    "location": location,
                    "areas_of_learning_text": areas_of_learning_text,
                    "activity_examples": "\n======\n".join(
                        [similar_doc["metadata"]["text"] for similar_doc in similar_docs]
                    ),
                }

                r = TextGenerator.generate(
                    model=selected_model,
                    temperature=temperature,
                    messages=messages,
                    message_kwargs=messages_placeholders,
                    stream=True,
                )

                for chunk in r:
                    content = chunk["choices"][0].get("delta", {}).get("content")
                    report.append(content)
                    if chunk["choices"][0]["finish_reason"] != "stop":
                        result = "".join(report).strip()
                        res_box.markdown(f"{result}")

                # Cite the BBC pages whose text was injected into the prompt.
                st.subheader("Sources")

                for similar_doc in similar_docs:
                    title = similar_doc["metadata"]["title"]
                    url = similar_doc["id"]
                    category = similar_doc["metadata"]["areas_of_learning"]
                    st.write(f"""- [{title}]({url}) {category}""")
get_index(index_name=index_name) 19 | 20 | with st.sidebar: 21 | # Select a model, temperature and number of results 22 | selected_model = st.radio( 23 | label="**OpenAI model**", 24 | options=["gpt-3.5-turbo", "gpt-4"], 25 | index=1, 26 | on_change=reset_state, 27 | ) 28 | n_results = 10 29 | temperature = st.slider( 30 | label="**Temperature**", 31 | min_value=0.0, 32 | max_value=2.0, 33 | value=0.6, 34 | step=0.1, 35 | on_change=reset_state, 36 | ) 37 | 38 | st.button("Reset chat", on_click=reset_state, type="primary", help="Reset the chat history") 39 | 40 | # Select the areas of learning 41 | areas_of_learning = st.multiselect( 42 | label="**Areas of learning**", 43 | options=aol, 44 | default=aol, 45 | on_change=reset_state, 46 | ) 47 | areas_of_learning_text = [v for k, v in areas_of_learning_desc.items() if k in areas_of_learning] 48 | 49 | # Describe each Area of Learning in an expanding window 50 | with st.expander("**Areas of Learning Description**"): 51 | for k, v in areas_of_learning_desc.items(): 52 | if k in areas_of_learning: 53 | st.write(f"#### {k}") 54 | st.write(v.split("##")[-1]) 55 | 56 | areas_of_learning_text = "\n\n".join(areas_of_learning_text) 57 | location = st.selectbox(label="**Location**", options=["Indoor", "Outdoor", "Indoor or Outdoor"], index=2) 58 | 59 | # Create the messages 60 | paths = [ 61 | "src/genai/eyfs/prompts/system.json", 62 | "src/genai/eyfs/prompts/context_and_task_with_examples.json", 63 | "src/genai/eyfs/prompts/constraints.json", 64 | "src/genai/eyfs/prompts/situation.json", 65 | ] 66 | 67 | prompt_templates = [MessageTemplate.load(path) for path in paths] 68 | 69 | # Initialize chat history 70 | if "messages" not in st.session_state: 71 | st.session_state.messages = [ 72 | {"role": prompt_template.role, "content": prompt_template.content} for prompt_template in prompt_templates 73 | ] 74 | 75 | # Display chat messages from history on app rerun. 76 | # The first messages are the prompt, so we skip it. 
77 | for message in st.session_state.messages[len(prompt_templates) :]: 78 | with st.chat_message(message["role"]): 79 | st.markdown(message["content"]) 80 | 81 | prompt = st.chat_input("Let's create activities educating children on how whales breathe") 82 | if prompt: 83 | # Display user message in chat message container 84 | with st.chat_message("user"): 85 | st.markdown(prompt) 86 | 87 | # Add user message to chat history 88 | # The very first message will be used to fill in the prompt template 89 | # after that, we store the user messages in the chat history 90 | if len(st.session_state.messages) == len(prompt_templates): 91 | query = prompt 92 | with st.spinner("Searching for relevant BBC activities..."): 93 | # Encode the query 94 | encoded_query = get_embedding(query) 95 | 96 | # Search with Pinecone 97 | similar_docs = query_pinecone( 98 | index, 99 | encoded_query, 100 | areas_of_learning=areas_of_learning, 101 | top_n=4, 102 | max_n=4, 103 | ) 104 | 105 | if "similar_docs" not in st.session_state: 106 | st.session_state["similar_docs"] = similar_docs 107 | 108 | else: 109 | st.session_state.messages.append({"role": "user", "content": prompt}) 110 | query = "" 111 | 112 | with st.chat_message("assistant"): 113 | message_placeholder = st.empty() 114 | full_response = "" 115 | 116 | messages_placeholders = { 117 | "description": query, 118 | "areas_of_learning": areas_of_learning, 119 | "n_results": n_results, 120 | "location": location, 121 | "areas_of_learning_text": areas_of_learning_text, 122 | "activity_examples": "\n======\n".join( 123 | [similar_doc["metadata"]["text"] for similar_doc in st.session_state["similar_docs"]] 124 | ), 125 | } 126 | 127 | r = TextGenerator.generate( 128 | model=selected_model, 129 | temperature=temperature, 130 | messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], 131 | message_kwargs=messages_placeholders, 132 | stream=True, 133 | ) 134 | 135 | for response in r: 136 | full_response 
def query_pinecone(
    index: pinecone.index.Index,
    encoded_query: list,
    areas_of_learning: list,
    top_n: int = 4,
    max_n: int = 4,
) -> list:
    """Search the Pinecone index for BBC activities and subsample the matches.

    Parameters
    ----------
    index
        Pinecone index to search.

    encoded_query
        Embedding vector to search for.

    areas_of_learning
        Areas of learning to filter by.

    top_n
        Number of results to return.

    max_n
        Maximum number of results to keep as prompt examples.

    Returns
    -------
    docs
        List of documents.

    """
    bbc_filter = {
        "areas_of_learning": {"$in": areas_of_learning},
        "source": {"$eq": "BBC"},
    }
    matches = index.query(
        vector=encoded_query,
        top_k=top_n,
        include_metadata=True,
        filter=bbc_filter,
    )["matches"]

    # Randomly subsample so the examples fit within the prompt budget.
    keep = sample_docs(len(matches), max_n)
    return [matches[i] for i in keep]
default=aol, 60 | on_change=reset_state, 61 | ) 62 | areas_of_learning_text = [v for k, v in areas_of_learning_desc.items() if k in areas_of_learning] 63 | 64 | # Describe each Area of Learning in an expanding window 65 | with st.expander("**Areas of Learning Description**"): 66 | for k, v in areas_of_learning_desc.items(): 67 | if k in areas_of_learning: 68 | st.write(f"#### {k}") 69 | st.write(v.split("##")[-1]) 70 | 71 | areas_of_learning_text = "\n\n".join(areas_of_learning_text) 72 | location = st.selectbox(label="**Location**", options=["Indoor", "Outdoor", "Indoor or Outdoor"], index=2) 73 | 74 | # Create the messages 75 | paths = [ 76 | "src/genai/eyfs/prompts/system.json", 77 | "src/genai/eyfs/prompts/context_and_task.json", 78 | "src/genai/eyfs/prompts/constraints.json", 79 | "src/genai/eyfs/prompts/situation.json", 80 | ] 81 | 82 | prompt_templates = [MessageTemplate.load(path) for path in paths] 83 | 84 | # Initialize chat history 85 | if "messages" not in st.session_state: 86 | st.session_state.messages = [ 87 | {"role": prompt_template.role, "content": prompt_template.content} for prompt_template in prompt_templates 88 | ] 89 | 90 | # Display chat messages from history on app rerun. 91 | # The first messages are the prompt, so we skip it. 
92 | for message in st.session_state.messages[len(prompt_templates) :]: 93 | with st.chat_message(message["role"]): 94 | st.markdown(message["content"]) 95 | 96 | # Accept user input 97 | prompt = st.chat_input("Let's create activities educating children on how whales breathe") 98 | if prompt: 99 | # Display user message in chat message container 100 | with st.chat_message("user"): 101 | st.markdown(prompt) 102 | 103 | # Add user message to chat history 104 | if len(st.session_state.messages) == len(prompt_templates): 105 | description = prompt 106 | else: 107 | st.session_state.messages.append({"role": "user", "content": prompt}) 108 | description = "" 109 | 110 | with st.chat_message("assistant"): 111 | message_placeholder = st.empty() 112 | full_response = "" 113 | 114 | for response in TextGenerator.generate( 115 | model=selected_model, 116 | temperature=temperature, 117 | messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], 118 | message_kwargs={ 119 | "description": description, 120 | "areas_of_learning": areas_of_learning, 121 | "n_results": n_results, 122 | "location": location, 123 | "areas_of_learning_text": areas_of_learning_text, 124 | }, 125 | stream=True, 126 | ): 127 | full_response += response.choices[0].delta.get("content", "") 128 | message_placeholder.markdown(full_response + "▌") 129 | message_placeholder.markdown(full_response) 130 | st.session_state.messages.append({"role": "assistant", "content": full_response}) 131 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/parenting_page.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | 4 | from datetime import datetime 5 | 6 | import s3fs 7 | import streamlit as st 8 | 9 | from dotenv import load_dotenv 10 | from streamlit_feedback import streamlit_feedback 11 | 12 | from genai.eyfs import TextGenerator 13 | from genai.eyfs import 
get_embedding 14 | from genai.message_history import InMemoryMessageHistory 15 | from genai.prompt_template import FunctionTemplate 16 | from genai.prompt_template import MessageTemplate 17 | from genai.streamlit_pages.utils import get_index 18 | from genai.streamlit_pages.utils import query_pinecone 19 | from genai.streamlit_pages.utils import reset_state 20 | 21 | 22 | load_dotenv() 23 | 24 | 25 | def parenting_chatbot(aws_key: str, aws_secret: str, s3_path: str, sidebar: bool = True) -> None: 26 | """Early-years parenting chatbot.""" 27 | 28 | # Define your custom CSS 29 | custom_css = """ 30 | 41 | """ 42 | 43 | # Apply the custom CSS 44 | st.markdown(custom_css, unsafe_allow_html=True) 45 | 46 | st.title("Early-years parenting chatbot") 47 | st.write( 48 | "This is a chatbot based on information from the [NHS Start for Life](https://www.nhs.uk/start-for-life/) website. " 49 | "You can ask it questions about pregnancy, birth and parenthood. " 50 | "Please note that this is a prototype and the answers should not be used as medical advice." 
51 | ) 52 | st.write("---") 53 | 54 | selected_model = "gpt-3.5-turbo" 55 | temperature = 0.6 56 | pinecone_index = get_index(index_name="eyfs-index-1718673592-index") 57 | 58 | if sidebar: 59 | with st.sidebar: 60 | st.button("Reset chat", on_click=reset_state, type="primary", help="Reset the chat history") 61 | 62 | system_message = MessageTemplate.load("src/genai/parenting_chatbot/prompts/system.json") 63 | filter_refs_function = FunctionTemplate.load("src/genai/parenting_chatbot/prompts/filter_refs_function.json") 64 | filter_refs_user_message = MessageTemplate.load("src/genai/parenting_chatbot/prompts/filter_refs_user.json") 65 | filter_refs_system_message = MessageTemplate.load("src/genai/parenting_chatbot/prompts/filter_refs_system.json") 66 | 67 | if "session_uuid" not in st.session_state: 68 | st.session_state["session_uuid"] = f"{current_time()}-{str(uuid.uuid4())}" 69 | 70 | # Single submitted feedback 71 | if "feedback" not in st.session_state: 72 | st.session_state["feedback"] = None 73 | 74 | # st.session_state["memory"] controls the flow to OpenAI and logging 75 | if "memory" not in st.session_state: 76 | st.session_state["memory"] = InMemoryMessageHistory() 77 | st.session_state["memory"].add_message(system_message.to_prompt()) 78 | 79 | # st.session_state["messages"] shows the conversation in the UI 80 | if "messages" not in st.session_state: 81 | # instantiate the memory instead of None 82 | st.session_state["messages"] = [system_message.to_prompt()] 83 | 84 | for message in st.session_state.messages: 85 | if message["role"] in ["user", "assistant"]: 86 | with st.chat_message(message["role"]): 87 | st.markdown(message["content"]) 88 | 89 | # Accept user input 90 | prompt = st.chat_input("What's on your mind?") 91 | if prompt: 92 | # Display user message in chat message container 93 | with st.chat_message("user"): 94 | st.markdown(prompt) 95 | 96 | # Search the vector index 97 | search_results = query_pinecone( 98 | index=pinecone_index, 99 | 
encoded_query=get_embedding(prompt), 100 | top_n=3, 101 | filters={ 102 | "source": {"$eq": "nhs_full_page"}, 103 | }, 104 | ) 105 | 106 | nhs_texts = [] 107 | nhs_urls = [] 108 | for result in search_results: 109 | pred = TextGenerator.generate( 110 | temperature=0.0, 111 | messages=[filter_refs_system_message, filter_refs_user_message], 112 | message_kwargs={"text": result["metadata"]["text"], "question": prompt}, 113 | functions=[filter_refs_function.to_prompt()], 114 | function_call={"name": filter_refs_function.name}, 115 | ) 116 | 117 | pred = json.loads(pred["choices"][0]["message"]["function_call"]["arguments"])["prediction"] 118 | 119 | if pred: 120 | nhs_texts.append(result["metadata"]["text"]) 121 | nhs_urls.append(result["metadata"]["url"]) 122 | 123 | if nhs_texts: 124 | nhs_texts = "\n===\n".join(nhs_texts) 125 | 126 | # Log message for the UI before adding the references 127 | st.session_state["messages"].append({"role": "user", "content": prompt}) 128 | 129 | # Add user message to chat history 130 | prompt = f"""###NHS Start for Life references###\n{nhs_texts}\n\n###User message###\n{prompt}""" 131 | st.session_state["memory"].add_message({"role": "user", "content": prompt}) 132 | 133 | write_to_s3( 134 | aws_key, 135 | aws_secret, 136 | f"{s3_path}/session-logs/{st.session_state['session_uuid']}", 137 | "messages", 138 | st.session_state["memory"].messages[-1], 139 | how="a", 140 | ) 141 | 142 | with st.chat_message("assistant"): 143 | message_placeholder = st.empty() 144 | full_response = "" 145 | 146 | for response in TextGenerator.generate( 147 | model=selected_model, 148 | temperature=temperature, 149 | messages=st.session_state["memory"].get_messages(), 150 | message_kwargs=None, 151 | stream=True, 152 | ): 153 | full_response += response.choices[0].delta.get("content", "") 154 | message_placeholder.markdown(full_response + "▌") 155 | 156 | # Submit feedback 157 | streamlit_feedback( 158 | feedback_type="faces", 159 | single_submit=False, 160 
def write_to_s3(key: str, secret: str, s3_path: str, filename: str, data: dict, how: str = "a") -> None:
    """Write data to a jsonl file in S3.

    Parameters
    ----------
    key
        AWS access key ID.

    secret
        AWS secret access key.

    s3_path
        S3 bucket path.

    filename
        Name of the file to write to (without the ".jsonl" suffix).

    data
        Data to write to the file.

    how
        How to write to the file. Default is "a" for append. Use "w" to overwrite.

    """
    fs = s3fs.S3FileSystem(key=key, secret=secret)
    # Fix: the filename argument was previously ignored, so every caller's
    # records landed in the same hard-coded file.
    with fs.open(f"{s3_path}/{filename}.jsonl", how) as f:
        f.write(f"{json.dumps(data)}\n")


def current_time() -> str:
    """Return the current time as a "YYYYMMDDHHMMSS" string. Used as part of the session UUID."""
    return datetime.now().strftime("%Y%m%d%H%M%S")


def reset_state(key: Optional[str] = None) -> None:
    """Delete every known chat/session entry from Streamlit session state.

    The ``key`` parameter is accepted for interface compatibility with widget
    callbacks but is currently unused: all entries are cleared unconditionally.
    (Previously the loop variable shadowed this parameter.)
    """
    state_keys = [
        "similar_docs",
        "areas_of_learning_text",
        "areas_of_learning",
        "n_results",
        "location",
        "messages",
        "choice",
        "choices",
        "examples",
        "learning_goals",
        "full_response",
        "memory",
        "feedback",
        "user_feedback",
        "session_uuid",
    ]
    for state_key in state_keys:
        # Missing keys are fine: not every page populates every entry.
        try:
            del st.session_state[state_key]
        except KeyError:
            pass
def query_pinecone(
    index: "pinecone.index.Index",
    encoded_query: list,
    filters: dict,
    top_n: int = 5,
    max_n: int = 10,
) -> list:
    """Query the pinecone index and return the matching documents.

    Parameters
    ----------
    index
        Pinecone index to query.

    encoded_query
        Embedding vector of the query to search for.

    filters
        Metadata filters to apply (e.g. restricting the areas of learning).

    top_n
        Number of results to return.

    max_n
        Unused; kept only for backward compatibility with existing callers.
        (Historically: maximum number of results to keep as prompt examples.)

    Returns
    -------
    docs
        List of matching documents ("matches" from the Pinecone response).

    """
    # Doc fix: the previous docstring described non-existent `query` /
    # `areas_of_learning` parameters; the signature itself is unchanged.
    results = index.query(
        vector=encoded_query,
        top_k=top_n,
        include_metadata=True,
        filter=filters,
    )

    return results["matches"]
class PineconeIndex:
    """Wrap the Pinecone (v3+) client API.

    Note that this is a very thin and untested wrapper. It is not intended for
    production use. Its main purpose is to support this repo's prototypes and
    tiny indexes.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        environment: Optional[str] = None,
    ) -> None:
        """Create the Pinecone client.

        Parameters
        ----------
        api_key
            Pinecone API key; falls back to the ``PINECONE_API_KEY`` env var.

        environment
            Unused by the v3+ client; accepted for backward compatibility.
        """
        self.pc = Pinecone(api_key=api_key or os.environ["PINECONE_API_KEY"])

    def connect(self, index_name: str) -> "Pinecone.Index":
        """Connect to the index."""
        return self.pc.Index(index_name)

    def build_and_upsert(
        self,
        index_name: str,
        dimension: int,
        metadata_config: dict,
        metric: str,
        docs: list,
        batch_size: int = 100,
        delete_if_exists: bool = False,
        **kwargs,
    ) -> None:
        """Build the index (if it does not exist) and add docs.

        Parameters
        ----------
        index_name
            Name of the index.

        dimension
            Length of the indexed vectors.

        metadata_config
            Legacy (pre-v3) metadata config; not supported by the v3+ client,
            accepted and ignored for backward compatibility.

        metric
            The distance metric to use.

        docs
            The documents to index.

        batch_size
            The batch size to use when indexing.

        delete_if_exists
            Whether to delete the index if it already exists.

        kwargs
            Forwarded to ``Pinecone.create_index`` — e.g. the ``spec=``
            argument (ServerlessSpec/PodSpec) required by the v3+ client.

        """
        if delete_if_exists:
            self.delete(index_name)

        # Bug fix: the old code called module-level `pinecone.list_indexes()` /
        # `pinecone.create_index()`, which were never imported (the import was
        # commented out and the calls carried `# noqa: F821`) and no longer
        # exist in the v3+ client. Use the client instance instead.
        if index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=index_name,
                dimension=dimension,
                metric=metric,
                **kwargs,
            )
            # Give the freshly created index time to become ready
            # (potential fix to avoid error 403).
            time.sleep(30)

        index = self.connect(index_name)

        for batched_docs in batch(docs, batch_size):
            index.upsert(batched_docs)

    def delete(self, index_name: str) -> None:
        """Delete the index, printing (not raising) any failure.

        NOTE: this was a ``@staticmethod`` calling the non-existent module-level
        ``pinecone.delete_index`` (always a NameError); it is now an instance
        method using the v3+ client, so ``self.delete(...)`` callers still work.
        """
        try:
            self.pc.delete_index(index_name)
        except Exception as e:
            print(e)  # noqa: T001
virtualenvs.create false \ 29 | && poetry install --no-interaction --no-ansi 30 | 31 | # Specify the command to run your application 32 | CMD ["sh", "-c", "gunicorn --bind 0.0.0.0:$PORT whatsapp_bot:app"] 33 | -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/README.md: -------------------------------------------------------------------------------- 1 | # WhatsApp bot prototype 2 | 3 | This prototype provides a messaging front end (WhatsApp) to a large language model (LLM) using prompts related to early-years activities. The idea is that in some situations a widely-adopted messaging platform could be a more natural and accessible user interface for caregivers or educators than a web app. 4 | 5 | To set up the WhatsApp bot, you will need to sign up to [Twilio](https://www.twilio.com/), install [ngrok](https://ngrok.com/) and set up a [Heroku](https://www.heroku.com/) account. At the time of writing this, Twilio has a free tier that you can use to test the bot, and Heroku provides affordable plans (less than $10 per month) that will be sufficient for our needs. 6 | 7 | - [How it works](#how-it-works) 8 | - [Running the API locally](#running-the-api-locally) 9 | - [Deploying the API on Heroku](#deploying-the-api-on-heroku) 10 | 11 | 12 | ## How it works 13 | 14 | Essentially, we use Flask to create an API, it runs on Heroku and can be called via the `/text` endpoint. Twilio provides a WhatsApp contact that receives messages from users and calls the API endpoint when new messages come in. 15 | 16 | At the moment, the prototype is not exactly a chatbot as it only allows two types of queries and cannot answer follow-up questions. 17 | 18 | You can start the interaction by saying anything, eg 'Hi" 19 | 20 |

21 | 22 |

23 | 24 | The 'Explain ' message triggers the "Explain like I'm 3" prompt 25 | 26 |

27 | 28 |

29 | 30 | 31 | The 'Activities ' message triggers the personalised activity generation prompt. Note that this prompt is quite long and takes 20+ seconds to respond. This is longer than Twilio's [max timeout](https://www.twilio.com/docs/usage/webhooks/webhooks-connection-overrides) (15 seconds) and hence we first reply with a generic hold message, and then send the actual response when it's ready. 32 | 33 |

34 | 35 |

36 | 37 | This is implemented by the activity message triggering a new thread in which the app calls the OpenAI API and then use Twilio to send the response back to the same user. This should be seen as a quick hack for prototyping purposes, as this might result in a dangling thread that isn't destructed after sending the reply. A better solution would be to use a task queue (eg, Celery) to handle the long-running task. 38 | 39 | Note that the response for this prompt is too long for a single message (WhatsApp limit is 1600 characters), and hence we need to split it up in chunks and send as multiple separate messages. 40 | 41 |

42 | 43 |

44 | 45 | Overall, this prototype has been useful to explore the technical feasibility of using WhatsApp as a front end for interacting with a LLM. It also highlights differences between a messaging app and a web app user experience, where the messaging app will be probably better suited for shorter interactions and responses from the LLM (particularly because we can't stream the text character by character to WhatsApp). This means that the prompts that we used for our web app prototype would need to be optimised to provide shorter answers (eg, responding with only one idea at a time). 46 | 47 | Possible future iterations could explore using [message history](https://www.twilio.com/docs/sms/tutorials/how-to-retrieve-and-modify-message-history-python) to enable a more natural chatbot interaction and allow the user to configure our prompts (eg, the areas of learning or the age of the kids) using natural language. 48 | 49 | 50 | ## Running the API locally 51 | 52 | To run the API locally, navigate to the root of this repo and start the app 53 | 54 | ``` 55 | poetry run python whatsapp_bot.py 56 | ``` 57 | 58 | Start ngrok to expose the app to the internet (assuming the app is using port 5000). This will provide you with a URL that you can use to connect to your app. 59 | 60 | ``` 61 | ngrok http 5000 62 | ``` 63 | 64 | Then in Twilio you'll need to set up a Sandbox for WhatsApp and add the endpoint URL (created by ngrok) of your app under "Sandbox settings" in the box "When the message comes in". 65 | 66 | ![twilio_ngrok](screenshots/twilio_ngrok.png) 67 | 68 | You can now go to WhatsApp, connect with the phone number provided by Twilio and chat with your bot. 69 | 70 | ## Deploying the API on Heroku 71 | 72 | Testing your app locally is great, but what if you close your machine? To keep your WhatsApp bot running even when you rest, we'll deploy it on Heroku using Docker. 
73 | 74 | First, make sure you've set up heroku on your machine and log in the container registry 75 | 76 | ``` 77 | heroku container:login 78 | ``` 79 | 80 | Then you can create a new app 81 | 82 | ``` 83 | heroku create 84 | ``` 85 | 86 | This will create an app with a random name - this can be changed later. 87 | 88 | Set up your environment variables 89 | ``` 90 | heroku config:set OPENAI_API_KEY= 91 | heroku config:set TWILIO_ACCOUNT_SID= 92 | heroku config:set TWILIO_AUTH_TOKEN= 93 | ``` 94 | 95 | Navigate to your app's directory and build the container (because that's where we store the Dockerfile for this prototype) 96 | 97 | ``` 98 | cd src/genai/whatsapp_bot 99 | heroku container:push web --app --context-path ../../.. 100 | ``` 101 | 102 | Then release the container 103 | 104 | ``` 105 | heroku container:release web --app 106 | ``` 107 | 108 | Start the app 109 | 110 | ``` 111 | heroku ps:scale web=1 112 | ``` 113 | 114 | You can check that the app is working by opening it in your browser 115 | 116 | ``` 117 | heroku open 118 | ``` 119 | 120 | Finally, check the logs (useful if something isn't working) 121 | 122 | ``` 123 | heroku logs --tail 124 | ``` 125 | 126 | Once the app is running OK, you can configure the WhatsApp sandbox as described above, using your new Heroku URL. 127 | -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/prompts/eli3/eli3.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Instructions###\nYou are a helpful, kind, intelligent and polite early-years educator. Your task is to explain a concept to a 3 year old child. You must explain it in simple words that a young kid would understand. You must also be patient and never offend or be aggressive. 
Gendered language and any adjectives about the kid are strictly prohibited.\n\n###Question###\n{input}\n\n###Answer###\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/chat_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/chat_1.png -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/chat_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/chat_2.png -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/chat_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/chat_3.png -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/chat_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/chat_4.png -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/twilio_ngrok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/twilio_ngrok.png 
# Use an official Python runtime as a parent image
FROM --platform=linux/amd64 python:3.9.17-slim

# Set environment variables (key=value form; the space-separated ENV form is deprecated)
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Install system dependencies and clean the apt cache to keep the image small
RUN apt-get update \
    && apt-get install -y --no-install-recommends gcc \
    && rm -rf /var/lib/apt/lists/*

# Install poetry
RUN pip install "poetry==1.5.1"

# Set the working directory in the Docker image
WORKDIR /app

# Copy only requirements first to cache them in a docker layer
COPY pyproject.toml poetry.lock ./
COPY src/genai /app/src/genai
COPY README.md /app/README.md
COPY whatsapp_parenting_bot.py /app/whatsapp_parenting_bot.py

# Contains secrets - don't push the image to dockerhub
COPY .env /app/.env

# Project initialization: install into the system interpreter (no virtualenv)
RUN poetry config virtualenvs.create false \
    && poetry install --no-interaction --no-ansi

# Specify the command to run your application
CMD ["sh", "-c", "gunicorn --bind 0.0.0.0:$PORT whatsapp_parenting_bot:app"]
4 | 5 | The relevant urls were found in [this spreadsheet](https://docs.google.com/spreadsheets/d/1loK4_sCp35JRVM9z_xW8inF3eqb7pL-jEuFWuJ5fjNE/edit#gid=0) 6 | 7 | 8 | ## Usage 9 | 10 | First navigate to the scraping folder from repo's root 11 | ``` 12 | cd src/scraping/activities 13 | ``` 14 | 15 | Then run the script 16 | ``` 17 | poetry run python scrape_activities.py 18 | ``` 19 | 20 | The final output table is stored in `data/tiny_happy_people - final.csv` 21 | with the following columns: 22 | - CONTENT: Title of the activity 23 | - SHORT DESCRIPTION: Short description of the activity 24 | - Age Range (if applicable): Age range of the activity 25 | - Type: Type of the activity (there are 20+ different types used by BBC) 26 | - URL: URL of the activity 27 | - **text: Scraped text, describing the activity in greater detail** 28 | 29 | ## Notes 30 | 31 | You can use `scrape_notebook.ipynb` to test the scraping function for single URLs 32 | -------------------------------------------------------------------------------- /src/scraping/activities/scrape_activities.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get text data from the BBC's Tiny Happy People website about kids activities 3 | 4 | Usage: 5 | 6 | First navigate to the scraping folder 7 | $ cd src/scraping/activities 8 | 9 | Then run the script 10 | $ poetry run python scrape_activities.py 11 | 12 | The final output file is stored in data/tiny_happy_people - final.csv 13 | with the following columns: 14 | - CONTENT: Title of the activity 15 | - SHORT DESCRIPTION: Short description of the activity 16 | - Age Range (if applicable): Age range of the activity 17 | - Type: Type of the activity (there are 20+ different types used by BBC) 18 | - URL: URL of the activity 19 | - text: Scraped text, describing the activity in greater detail 20 | 21 | """ 22 | 23 | import csv 24 | import logging 25 | 26 | from pathlib import Path 27 | from time import sleep 28 | 29 | 
def pad_element(element: "bs4.element.Tag") -> str:
    """Recursively flatten an HTML element to text, padding block-level tags.

    Bare strings are returned unchanged. Paragraph and heading tags get a
    space before and after their (space-joined) children so that words from
    adjacent elements do not run together; any other tag simply concatenates
    its children's text.

    Args:
        element (bs4.element.Tag): HTML element (or plain string) to flatten

    Returns:
        str: Text with spaces
    """
    if isinstance(element, str):
        return element

    children_text = [pad_element(child) for child in element.children]
    if element.name in ("p", "h1", "h2", "h3", "h4", "h5", "h6"):
        return " " + " ".join(children_text) + " "
    return "".join(children_text)
BeautifulSoup(response.text, "html.parser") 99 | 100 | # Remove all tags 101 | for style_tag in soup.find_all("style"): 102 | style_tag.extract() 103 | 104 | # Find the headline 105 | headline = soup.find("h1", class_="blocks-article__headline") 106 | 107 | # Fetch all the content starting from headline 108 | content = [] 109 | curr_element = headline.find_next_sibling() 110 | while curr_element is not None: 111 | content.append(curr_element) 112 | curr_element = curr_element.find_next_sibling() 113 | 114 | # Clean the text from HTML tags and add spaces 115 | clean_content = "".join(pad_element(element) for element in content) 116 | 117 | # Remove the end of the webpage 118 | if any(marker in clean_content for marker in END_MARKERS): 119 | for marker in END_MARKERS: 120 | clean_content = clean_content.split(marker)[0] 121 | 122 | # Remove the boilerplate text 123 | for text in SCRAP_TEXT: 124 | clean_content = clean_content.replace(text, "") 125 | 126 | return clean_content.strip() 127 | 128 | 129 | if __name__ == "__main__": 130 | urls_df = pd.read_csv(URL_PATH) 131 | 132 | # Fetch the already scraped urls 133 | if SCRAPED_PATH.exists(): 134 | scraped_urls = pd.read_csv(SCRAPED_PATH, names=["URL", "text"]).URL.to_list() 135 | else: 136 | scraped_urls = [] 137 | 138 | new_urls_df = urls_df[~urls_df["URL"].isin(scraped_urls)] 139 | 140 | # Scrape the urls 141 | with open(SCRAPED_PATH, "a") as f: 142 | writer = csv.writer(f) 143 | for row in tqdm(new_urls_df.itertuples(), total=len(new_urls_df)): 144 | try: 145 | # if url starts with 'www' then add 'https://' 146 | url = "https://" + row.URL if row.URL.startswith("www") else row.URL 147 | text = web_scraper(url) 148 | writer.writerow([row.URL, text]) 149 | except Exception: 150 | logging.warning(f"Error scraping {row.URL}") 151 | sleep(0.5) 152 | 153 | # Create the final output table 154 | final_df = urls_df.merge( 155 | pd.read_csv(SCRAPED_PATH, names=["URL", "text"]), 156 | on="URL", 157 | how="left", 158 | ) 159 | 
final_df.to_csv(OUTPUTS_PATH, index=False) 160 | -------------------------------------------------------------------------------- /src/scraping/activities/scrape_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Notebook for testing the scraping functions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from scrape_activities import web_scraper\n", 17 | "import scrape_activities\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "pd.read_csv(scrape_activities.URL_PATH)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "url = 'https://www.bbc.co.uk/tiny-happy-people/night-night-teddy/znmspg8'\n", 37 | "web_scraper(url)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "pd.read_csv(scrape_activities.OUTPUTS_PATH)" 47 | ] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": ".venv", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 3 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython3", 66 | "version": "3.9.17" 67 | }, 68 | "orig_nbformat": 4 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 2 72 | } 73 | -------------------------------------------------------------------------------- /src/scraping/start_for_life/README.md: -------------------------------------------------------------------------------- 1 
| # A knowledge base for testing a parenting chatbot 2 | 3 | You can use `scrape_sitemap.py` and `scrape_startforlife.py` scripts to collect trusted information and advice about raising a child from the [NHS Start for Life](https://www.nhs.uk/start-for-life/) website 4 | 5 | ## Usage 6 | 7 | ### Getting the sitemap URLs 8 | 9 | First navigate to the scraping folder from repo's root 10 | ``` 11 | cd src/scraping/start_for_life 12 | ``` 13 | 14 | Then run the script 15 | ``` 16 | poetry run python scrape_sitemap.py 17 | ``` 18 | 19 | This will create a file in `data/sitemap.csv` with the following columns: 20 | - Title: Title of the page 21 | - URL: URL of the page 22 | 23 | ### Getting the text content 24 | 25 | After fetching the sitemap URLs, run the following script 26 | ``` 27 | poetry run python scrape_startforlife.py 28 | ``` 29 | 30 | The final output file will be stored in `data/startforlife.csv` with the following columns: 31 | - URL: URL of the page 32 | - content_type: Content type (ie, class tag of the div section) 33 | - header: Header of the page's section 34 | - content: Content of the page's section 35 | - content_no: Index of the section (ie, if there are multiple sections on the page, this is the integer index of the section) 36 | -------------------------------------------------------------------------------- /src/scraping/start_for_life/scrape_sitemap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get all URLs from the NHS https://www.nhs.uk/start-for-life/site-map/ 3 | 4 | Usage: 5 | 6 | First navigate to the scraping folder 7 | $ cd src/scraping/start_for_life 8 | 9 | Then run the script 10 | $ poetry run python scrape_sitemap.py 11 | 12 | The final output file is stored in data/sitemap.csv 13 | with the following columns: 14 | - Title: Title of the page 15 | - URL: URL of the page 16 | 17 | """ 18 | import csv 19 | import logging 20 | 21 | import requests 22 | 23 | from bs4 import BeautifulSoup 
def scrape_urls(base_url: str, csv_filename: str, timeout: float = 10) -> None:
    """
    Get all URLs from a website and save them to a CSV file

    Args:
        base_url:
            URL of the website to scrape

        csv_filename:
            Path to the CSV file to save the URLs to

        timeout:
            Timeout in seconds for the request to the website
    """
    # Collected (title, url) pairs, in page order.
    # NOTE: despite the log message, duplicates are NOT removed — preserved
    # from the original behaviour so downstream consumers are unaffected.
    url_rows = []

    # Make a request to the website
    response = requests.get(
        base_url,
        timeout=timeout,
        headers=HEADERS,
    )

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using Beautiful Soup
        soup = BeautifulSoup(response.text, "html.parser")
        # Find all anchor tags in the HTML
        for link in soup.find_all("a"):
            url = link.get("href")
            # Keep absolute http(s) URLs and site-relative ones.
            # ("https://..." also starts with "http", so one prefix tuple suffices.)
            if url and url.startswith(("http", "/")):
                # `link.text` is always a str for bs4 tags; guard kept defensive
                title = link.text.strip() if isinstance(link.text, str) else ""
                url_rows.append((title, url))

    # Write the collected URLs to a CSV file (header row first).
    # An empty file with just the header is written on HTTP failure,
    # matching the original behaviour.
    with open(csv_filename, "w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Title", "URL"])
        csv_writer.writerows(url_rows)

    logging.info(f"Scraping complete. {len(url_rows)} unique URLs have been saved to {csv_filename}.")
def merge_sections_based_on_headers(headers: List[str], sections: List[str]) -> (List[str], List[str], List[int]):
    """
    Merge sections based on headers, where sections without headers will be merged in the previous section that has a header

    Headerless sections that appear before any headed section are dropped.
    The indices of every merged/dropped section are reported so that any
    parallel lists can be filtered consistently by the caller.

    Args:
        headers
            List of headers (an empty value means "no header")

        sections
            List of sections, parallel to `headers`

    Returns:
        Tuple of lists with headers, merged sections and removed indices
    """
    kept_headers: List[str] = []
    merged_sections: List[str] = []
    removed: List[int] = []

    for idx, (header, section) in enumerate(zip(headers, sections)):
        if header:
            # A headed section starts a new output entry
            kept_headers.append(header)
            merged_sections.append(section)
        else:
            removed.append(idx)
            if merged_sections:
                # Fold the headerless content into the most recent headed section
                merged_sections[-1] += section

    return kept_headers, merged_sections, removed
def has_multiple_items(items: List[List[str]]) -> bool:
    """Check whether every second-level list holds at most one header.

    Bug fix: the previous version returned inside the loop's first iteration
    (`else: return True`), so only the first element was ever inspected and
    multi-header sections later in the list went undetected; an empty input
    returned None instead of a bool. Now every offending sub-list is logged
    and all elements are checked.

    Args:
        items
            List of per-section header lists

    Returns:
        True when every sub-list has at most one item, False otherwise
        (an empty input is vacuously True)
    """
    all_single = True
    for item in items:
        if len(item) > 1:
            logging.info(f"Multiple headers in {item}")
            all_single = False
    return all_single
if __name__ == "__main__":
    sitemap_df = pd.read_csv(SITEMAP_PATH)
    content_dfs = []
    # Go through each URL and scrape the content
    for url in tqdm(sitemap_df["URL"].to_list()):
        headers, content, div_classes = web_scraper(BASE_URL + url)
        # Just checking (and logging) the unlikely case of multiple headers in a div section
        has_multiple_items(headers)
        # Join lists of lists into a single list (unlikely that there is more than one header in a div class)
        headers = [". ".join(h) for h in headers]
        div_classes = ["; ".join(d) for d in div_classes]
        # Append to the list of dataframes
        content_dfs.append(
            pd.DataFrame(
                {
                    "URL": url,
                    "content_type": div_classes,
                    "header": headers,
                    "content": content,
                    "content_no": range(len(content)),
                }
            )
        )
        # Small delay between requests to be polite to the server
        sleep(0.1)
    content_df = pd.concat(content_dfs, ignore_index=True)

    # Some light post-processing
    content_processed_df = (
        content_df
        # Remove rows with empty content
        .loc[content_df["content"].str.len() > 0]
        # Remove rows with excluded headers
        .loc[~content_df["header"].isin(EXCLUDED_HEADERS)]
        # For each unique URL, reindex the content_no to follow subsequent integers
        .assign(content_no=lambda x: x.groupby("URL")["content_no"].rank(method="first").astype(int))
    )
    # Export as a separate step: DataFrame.to_csv(path) returns None, so chaining it
    # onto the expression above would have bound content_processed_df to None
    content_processed_df.to_csv(OUTPUT_PATH, index=False)
time import sleep 7 | from typing import Dict 8 | from typing import List 9 | 10 | import openai 11 | 12 | from dotenv import load_dotenv 13 | from flask import Flask 14 | from flask import request 15 | from twilio.rest import Client 16 | from twilio.twiml.messaging_response import MessagingResponse 17 | 18 | from genai.eyfs.eyfs import TextGenerator 19 | from genai.utils import read_json 20 | 21 | 22 | load_dotenv() 23 | # Twilio settings 24 | client = Client(os.environ["TWILIO_ACCOUNT_SID"], os.environ["TWILIO_AUTH_TOKEN"]) 25 | # OpenAI model 26 | LLM = "gpt-3.5-turbo" 27 | TEMPERATURE = 0.5 28 | openai.api_key = os.environ["OPENAI_API_KEY"] 29 | 30 | # Prepare ELI3 base prompt 31 | ELI3_MESSAGES = read_json("src/genai/whatsapp_bot/prompts/eli3/eli3.json") 32 | 33 | # Prepare EYFS base prompt and parameters 34 | areas_of_learning = [ 35 | "Communication and Language", 36 | "Personal, Social and Emotional Development", 37 | "Physical Development", 38 | "Literacy", 39 | "Mathematics", 40 | "Understanding the World", 41 | "Expressive Arts and Design", 42 | ] 43 | areas_of_learning_desc = read_json("src/genai/eyfs/areas_of_learning.json") 44 | areas_of_learning_text = [v for k, v in areas_of_learning_desc.items() if k in areas_of_learning] 45 | eyfs_paths = [ 46 | "src/genai/eyfs/prompts/system.json", 47 | "src/genai/eyfs/prompts/context_and_task.json", 48 | "src/genai/eyfs/prompts/constraints.json", 49 | "src/genai/eyfs/prompts/situation.json", 50 | ] 51 | EYFS_MESSAGES = [read_json(path) for path in eyfs_paths] 52 | EYFS_PARAMETERS = { 53 | "areas_of_learning": areas_of_learning, 54 | "n_results": 5, 55 | "location": "Indoors or Outdoors", 56 | "areas_of_learning_text": areas_of_learning_text, 57 | } 58 | 59 | # Initiate the Flask app 60 | app = Flask(__name__) 61 | 62 | 63 | def format_activities_text(text: str) -> str: 64 | """Format the response from the EYFS generator for better display in WhatsApp""" 65 | text = ( 66 | text.replace("## Conversations", 
def format_activities_text(text: str) -> str:
    """Format the response from the EYFS generator for better display in WhatsApp

    Maps the generator's markdown headings and bold markers to WhatsApp's
    *bold* / _italic_ conventions.
    """
    text = (
        text.replace("## Conversations", "*Conversations*\n")
        .replace("## Games and Crafts", "*Games and Crafts*\n")
        .replace("**Activity description**", "_Activity description_")
        .replace("**Areas of learning**", "_Areas of learning_")
    )
    # replace markdown subheadings with bold italics
    text = re.sub(r"###\s*(.+)", r"*_\1_*", text)
    return text


def generate_reply(incoming_message: str, sender_contact: str, receiver_contact: str) -> str:
    """Parse message text and return an appropriate response.

    Presently supports two types of responses: 'explain' and 'activities'.
    The activities response is threaded to allow for longer response times: a very
    basic workaround to the 15 second timeout limit imposed by Twilio.

    Args:
        incoming_message:
            Message text
        sender_contact:
            Sender's contact, follows a format 'whatsapp:+'
        receiver_contact:
            Receiver's contact (ie, my contact), follows a format 'whatsapp:+'

    Returns:
        Response text
    """
    text_message = incoming_message.lower()

    # 'explain' response
    if text_message.startswith("explain"):
        response = TextGenerator.generate(
            model=LLM,
            temperature=TEMPERATURE,
            messages=[ELI3_MESSAGES.copy()],
            message_kwargs={"input": text_message[7:].strip()},
        )
        return response["choices"][0]["message"]["content"]
    # 'activities' response
    elif "activities" in text_message[0:10]:
        # Build a per-request parameter dict instead of mutating the module-level
        # EYFS_PARAMETERS: the dict is handed to a background thread, so concurrent
        # requests would otherwise clobber each other's 'description'
        parameters = {**EYFS_PARAMETERS, "description": text_message}
        thread = Thread(
            target=send_text, args=[copy.deepcopy(EYFS_MESSAGES), parameters, receiver_contact, sender_contact]
        )
        thread.start()
        return "Thank you for your question. I am thinking..."
    else:
        # Return a default message
        return (
            'Write "Explain " to explain a concept to a 3-year old \n\n or'
            + '\n\n "Activities " to get activity ideas'
        )
def send_text(messages: List[Dict], message_kwargs: Dict, my_contact: str, receiver_contact: str) -> None:
    """Generate text messages and send them to a given contact

    Args:
        messages:
            List of messages to be used as prompts
        message_kwargs:
            Dictionary of keyword arguments to be passed to the TextGenerator
        my_contact:
            Sender's contact, follows a format 'whatsapp:+'
        receiver_contact:
            Receiver's contact (ie, my contact), follows a format 'whatsapp:+'
    """
    # Generate the response text for the given prompts
    completion = TextGenerator.generate(
        model=LLM,
        temperature=TEMPERATURE,
        messages=messages,
        message_kwargs=message_kwargs,
    )
    # Format the output for better display on WhatsApp
    body = format_activities_text(completion["choices"][0]["message"]["content"])
    # WhatsApp messages are capped at 1600 characters, so send 1500-character chunks
    for start in range(0, len(body), 1500):
        client.messages.create(body=body[start : start + 1500], from_=my_contact, to=receiver_contact)
        sleep(0.5)


@app.route("/")
def hello_world() -> str:
    """Information message"""
    return "Nesta generative AI prototype: WhatsApp bot for suggesting kids activities"


@app.route("/text", methods=["POST"])
def text_reply() -> str:
    """Respond to incoming messages"""
    form = request.form
    reply = generate_reply(
        incoming_message=form.get("Body"),
        sender_contact=form.get("From"),
        receiver_contact=form.get("To"),
    )
    twiml = MessagingResponse()
    twiml.message(reply)
    return str(twiml)


if __name__ == "__main__":
    app.run(debug=False, host="0.0.0.0", port=int(os.environ.get("PORT", 5000)))
# Initiate the Flask app
app = Flask(__name__)


def write_to_s3(key: str, secret: str, s3_path: str, filename: str, data: dict, how: str = "a") -> None:
    """
    Write data to a jsonl file in S3.

    Args:
        key (str): AWS access key ID.
        secret (str): AWS secret access key.
        s3_path (str): S3 bucket path.
        filename (str): Name of the file to write to (without the .jsonl extension).
        data (dict): Data to write to the file.
        how (str, optional): How to write to the file. Default is "a" for append. Use "w" to overwrite.

    """
    fs = s3fs.S3FileSystem(key=key, secret=secret)
    # Interpolate the filename argument into the path; previously it was ignored,
    # so every caller wrote to the same hard-coded file regardless of the name passed
    with fs.open(f"{s3_path}/{filename}.jsonl", how) as f:
        f.write(f"{json.dumps(data)}\n")


def read_from_s3(key: str, secret: str, s3_path: str, filename: str) -> list:
    """
    Read data from a jsonl file in S3 and return as a list of dictionaries.

    Args:
        key (str): AWS access key ID.
        secret (str): AWS secret access key.
        s3_path (str): S3 bucket path.
        filename (str): Name of the file to read from (without the .jsonl extension).

    Returns:
        list: A list of dictionaries, each representing a line in the jsonl file.
    """
    fs = s3fs.S3FileSystem(key=key, secret=secret)
    data_list = []

    # One JSON document per line (jsonl)
    with fs.open(f"{s3_path}/{filename}.jsonl", "r") as file:
        for line in file:
            data_list.append(json.loads(line))

    return data_list
def fetch_message_history(sender_contact: str, create_new: bool = True) -> InMemoryMessageHistory:
    """
    Find a folder on s3 corresponding to the sender; if one doesn't exist, optionally create it

    Args:
        sender_contact (str): Sender's contact, follows a format 'whatsapp:+'
        create_new (bool, optional): Whether to create a new folder if one doesn't exist. Defaults to True.

    Returns:
        InMemoryMessageHistory: Message history (empty when the sender is new)
    """
    message_history = InMemoryMessageHistory()
    try:
        messages = read_from_s3(
            AWS_KEY,
            AWS_SECRET,
            f"{S3_PATH}/{sender_contact}",
            "messages",
        )
        for message in messages:
            message_history.add_message(message)
        return message_history
    except FileNotFoundError:
        if not create_new:
            # Bare raise keeps the original exception and traceback intact
            raise
        # First contact from this sender: create their log file with a marker entry.
        # NOTE(review): the marker is written to S3 only, not added to the returned
        # in-memory history — confirm this asymmetry is intended.
        write_to_s3(
            AWS_KEY,
            AWS_SECRET,
            f"{S3_PATH}/{sender_contact}",
            "messages",
            {"role": "system", "content": "Welcome to the Parenting Chatbot!"},
            how="w",
        )
        return message_history


def send_links(link: str, my_contact: str, receiver_contact: str) -> None:
    """Send a 'Read more' link to a given contact

    Args:
        link:
            Url to share
        my_contact:
            Sender's contact, follows a format 'whatsapp:+'
        receiver_contact:
            Receiver's contact (ie, my contact), follows a format 'whatsapp:+'
    """
    text = f"Read more: {link}"
    client.messages.create(body=text, from_=my_contact, to=receiver_contact)


@app.route("/text", methods=["POST"])
def text_reply() -> str:
    """Respond to incoming messages"""
    receiver_contact = request.form.get("To")

    # Fetch message history for this sender
    sender_contact = request.form.get("From")
    message_history = fetch_message_history(sender_contact)

    # The incoming user message; may be augmented with references below
    prompt = request.form.get("Body")

    # Search the vector index for relevant NHS Start for Life content
    search_results = query_pinecone(
        index=pinecone_index,
        encoded_query=get_embedding(prompt),
        top_n=3,
        filters={
            "source": {"$eq": "nhs_full_page"},
        },
    )

    # Ask the LLM to keep only the references that are relevant to the question
    nhs_texts = []
    nhs_urls = []
    for result in search_results:
        # NOTE(review): unlike the main call below, no model= is passed here, so
        # TextGenerator's default model is used — confirm this is intended
        pred = TextGenerator.generate(
            temperature=0.0,
            messages=[filter_refs_system_message, filter_refs_user_message],
            message_kwargs={"text": result["metadata"]["text"], "question": prompt},
            functions=[filter_refs_function.to_prompt()],
            function_call={"name": filter_refs_function.name},
        )

        pred = json.loads(pred["choices"][0]["message"]["function_call"]["arguments"])["prediction"]

        if pred:
            nhs_texts.append(result["metadata"]["text"])
            nhs_urls.append(result["metadata"]["url"])

    if nhs_texts:
        # Keep the joined references in their own variable instead of rebinding
        # nhs_texts from list to str
        references = "\n===\n".join(nhs_texts)

        # Add references to the prompt
        prompt = f"""###NHS Start for Life references###\n{references}\n\n###User message###\n{prompt} \n\n###Additional instructions###\nAnswer in one or two sentences, not more."""  # noqa: B950

    # Log the (possibly augmented) user message
    message_history.add_message({"role": "user", "content": prompt})
    write_to_s3(
        AWS_KEY,
        AWS_SECRET,
        f"{S3_PATH}/{sender_contact}",
        "messages",
        message_history.messages[-1],
        how="a",
    )

    # Generate the assistant reply from the full conversation history
    response = TextGenerator.generate(
        model=LLM,
        temperature=TEMPERATURE,
        messages=message_history.get_messages(),
        message_kwargs=None,
    )
    response = response["choices"][0]["message"]["content"]
    message_history.add_message({"role": "assistant", "content": response})

    # Log the assistant reply
    write_to_s3(
        AWS_KEY,
        AWS_SECRET,
        f"{S3_PATH}/{sender_contact}",
        "messages",
        message_history.messages[-1],
        how="a",
    )
    resp = MessagingResponse()
    resp.message(response)

    # Share the top reference link in a separate, threaded message
    # (only when the incoming message came via WhatsApp)
    if nhs_urls and ("whatsapp" in receiver_contact):
        thread = Thread(target=send_links, args=[nhs_urls[0], receiver_contact, sender_contact])
        thread.start()

    return str(resp)


if __name__ == "__main__":
    port = int(os.environ.get("PORT", 5000))
    app.run(debug=False, host="0.0.0.0", port=port)