├── .env.example ├── .github ├── actions │ └── poetry │ │ ├── pre-commit-lint │ │ └── action.yaml │ │ └── setup │ │ └── action.yaml └── workflows │ └── status-checks.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .streamlit └── config.toml ├── .yamllint ├── Dockerfile ├── Dockerfile.heroku ├── LICENSE ├── Makefile ├── README.md ├── app.py ├── build_pinecone_index.sh ├── charts ├── dm-app.png ├── dm-pinecone.png ├── eli3-chat.png ├── eli3.png ├── eyfs-kb-bbc-app.png ├── eyfs-kb-labels-and-vectors.png ├── eyfs.png ├── parenting-chatbot-app.png ├── parenting-chatbot-index.png └── whatsapp_bot.png ├── pip_reqs_for_reference.txt ├── poetry.lock ├── poetry.toml ├── public_activity_recommender_app.py ├── public_eli3_app.py ├── public_parenting_chatbot_app.py ├── pyproject.toml ├── signals_app.py ├── src ├── evals │ ├── charts │ │ ├── question_matrix.png │ │ └── win_matrix.png │ └── check_evals.ipynb ├── genai │ ├── __init__.py │ ├── dm │ │ ├── __init__.py │ │ ├── dm.json │ │ ├── prompts │ │ │ ├── dm_prompt.json │ │ │ └── dm_prompt_2.json │ │ └── run_dm_index.py │ ├── eli3 │ │ ├── __init__.py │ │ └── prompts │ │ │ ├── eli3.json │ │ │ ├── eli3_chat.json │ │ │ └── eli3_chat_2.json │ ├── eyfs │ │ ├── __init__.py │ │ ├── areas_of_learning.json │ │ ├── eyfs.py │ │ ├── prompts │ │ │ ├── choices_function.json │ │ │ ├── classifier.json │ │ │ ├── classifier_function.json │ │ │ ├── constraints.json │ │ │ ├── context_and_task.json │ │ │ ├── context_and_task_with_examples.json │ │ │ ├── situation.json │ │ │ └── system.json │ │ ├── run_classifier.py │ │ └── run_pinecone_index.py │ ├── message_history.py │ ├── parenting_chatbot │ │ ├── __init__.py │ │ ├── prodigy_eval │ │ │ ├── README.md │ │ │ ├── best_answer_recipe.py │ │ │ ├── create_eval_data.py │ │ │ ├── data │ │ │ │ ├── answers.jsonl │ │ │ │ ├── answers_gpt4.jsonl │ │ │ │ ├── answers_human.jsonl │ │ │ │ ├── answers_rag.jsonl │ │ │ │ ├── questions.jsonl │ │ │ │ └── system.json │ │ │ ├── figures │ │ │ │ ├── 
eval_parenting_chatbot.png │ │ │ │ └── prodigy_screenshot.png │ │ │ └── generate_gpt4_answers.py │ │ ├── prompts │ │ │ ├── filter_refs_function.json │ │ │ ├── filter_refs_system.json │ │ │ ├── filter_refs_user.json │ │ │ └── system.json │ │ ├── run_nhs_index.py │ │ └── run_nhs_index_full_page.py │ ├── prompt_template.py │ ├── sandbox │ │ └── signals │ │ │ ├── data │ │ │ ├── 00_system.jsonl │ │ │ ├── 01_intro.jsonl │ │ │ ├── 02_signal_impact.jsonl │ │ │ ├── 03_signal_choice.jsonl │ │ │ ├── 04_follow_up.jsonl │ │ │ ├── func_intent.json │ │ │ ├── func_top_signal.json │ │ │ ├── func_top_three_signals.json │ │ │ ├── intent_actions.json │ │ │ ├── prompt_intent.jsonl │ │ │ ├── prompt_top_signal.jsonl │ │ │ ├── prompt_top_three_signals.jsonl │ │ │ └── signals_2023.json │ │ │ └── signals_test.ipynb │ ├── setup.py │ ├── streamlit_pages │ │ ├── __init__.py │ │ ├── dm_page.py │ │ ├── eli3_page.py │ │ ├── eyfs_compare_page.py │ │ ├── eyfs_kb_bbc_page.py │ │ ├── eyfs_page.py │ │ ├── parenting_page.py │ │ └── utils.py │ ├── utils.py │ ├── vector_index.py │ ├── whatsapp_bot │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── prompts │ │ │ └── eli3 │ │ │ │ └── eli3.json │ │ └── screenshots │ │ │ ├── chat_1.png │ │ │ ├── chat_2.png │ │ │ ├── chat_3.png │ │ │ ├── chat_4.png │ │ │ └── twilio_ngrok.png │ └── whatsapp_parenting_bot │ │ └── Dockerfile └── scraping │ ├── activities │ ├── README.md │ ├── scrape_activities.py │ └── scrape_notebook.ipynb │ └── start_for_life │ ├── README.md │ ├── scrape_sitemap.py │ └── scrape_startforlife.py ├── tests └── __init__.py ├── whatsapp_bot.py └── whatsapp_parenting_bot.py /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | PINECONE_API_KEY= 3 | PINECONE_REGION= 4 | AWS_ACCESS_KEY_ID= 5 | AWS_SECRET_ACCESS_KEY= 6 | S3_BUCKET= 7 | PATH_TO_NHS_DATA= 8 | PATH_TO_BBC_ACTIVITIES_DATA= 9 | PATH_TO_LABELLED_BBC_DATA= 10 | 
-------------------------------------------------------------------------------- /.github/actions/poetry/pre-commit-lint/action.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Run Lint 3 | description: Run linters to verify code quality. Credits to Andre Sionek for writing the original GitHub Action. 4 | 5 | runs: 6 | using: "composite" 7 | steps: 8 | - name: Lint 9 | shell: bash 10 | run: poetry run pre-commit run --all-files 11 | -------------------------------------------------------------------------------- /.github/actions/poetry/setup/action.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Set Up Poetry Environment 3 | description: Install Python and Poetry. Credits to Andre Sionek for writing the original GitHub Action. 4 | 5 | inputs: 6 | python_version: 7 | description: Python version 8 | required: true 9 | default: '3.9.17' 10 | type: string 11 | 12 | poetry_version: 13 | description: Poetry version 14 | required: true 15 | default: 1.2.2 16 | type: string 17 | 18 | runs: 19 | using: "composite" 20 | steps: 21 | 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ inputs.python_version }} 26 | 27 | - name: Install and configure Poetry 28 | uses: snok/install-poetry@v1 29 | with: 30 | virtualenvs-create: true 31 | virtualenvs-in-project: true 32 | version: ${{ inputs.poetry_version }} 33 | 34 | - name: Set up Poetry cache 35 | uses: actions/setup-python@v4 36 | with: 37 | python-version: ${{ inputs.python_version }} 38 | cache: poetry 39 | 40 | - name: Install Python Dependencies with Poetry 41 | shell: bash 42 | run: poetry install 43 | -------------------------------------------------------------------------------- /.github/workflows/status-checks.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Status Checks 3 | 4 | on: 5 | push: 6 | 
branches-ignore: 7 | - production 8 | - main 9 | 10 | # Stops the currently running workflow if a new one has been triggered 11 | concurrency: 12 | group: "${{ github.ref }}" 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | lint: 17 | name: Lint 18 | runs-on: ubuntu-20.04 19 | steps: 20 | - uses: actions/checkout@v3 21 | 22 | - name: Set up 23 | uses: ./.github/actions/poetry/setup 24 | with: 25 | python_version: "3.9.17" 26 | 27 | - name: Install Python Lint Dependencies with Poetry 28 | shell: bash 29 | run: poetry install --with lint 30 | 31 | - name: Lint 32 | uses: ./.github/actions/poetry/pre-commit-lint 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | .streamlit/secrets.toml 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ 162 | 163 | data/ 164 | notebooks/ 165 | .DS_Store 166 | .vscode/ 167 | src/scraping/data/* 168 | 169 | src/evals/parenting_chatbot/* 170 | src/genai/parenting_chatbot/prodigy_eval/_scrap/* 171 | !src/genai/parenting_chatbot/prodigy_eval/data/ 172 | !src/genai/sandbox/signals/data/ 173 | src/genai/sandbox/signals/data/signals_2024.json 174 | src/genai/sandbox/signals/illustrations/* 175 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fail_fast: true 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.3.0 7 | hooks: 8 | - id: check-toml 9 | stages: [commit] 10 | 11 | - id: end-of-file-fixer 12 | stages: [commit] 13 | 14 | - id: trailing-whitespace 15 | stages: [commit] 16 | 17 | - id: check-yaml 18 | stages: [commit] 19 | 20 | - id: detect-private-key 21 | stages: [commit] 22 | 23 | - id: check-executables-have-shebangs 24 | stages: [commit] 25 | 26 | - id: check-shebang-scripts-are-executable 27 | stages: [commit] 28 | 29 | 30 | - repo: local 31 | hooks: 32 | 33 | - id: black 34 | name: Run Black 35 | entry: poetry run black 36 | language: system 37 | types: [file, python] 38 | stages: [commit] 39 | 40 | - id: isort 41 | name: Run Isort 42 | entry: poetry run isort 43 | language: system 44 | types: [file, python] 45 | stages: [commit] 46 | 47 | - id: flake8 48 | name: Run Flake8 49 | entry: poetry run pflake8 50 | language: system 51 | types: [file, python] 52 | stages: [commit] 53 | 54 | - id: yamllint 55 | name: Run Yamllint 56 | entry: poetry run yamllint 57 | language: system 58 | types: [file, yaml] 59 | stages: [commit] 60 | 61 | - id: bandit 62 | name: Run Bandit 63 | entry: poetry run bandit 64 | language: system 65 | types: [file, python] 66 | args: 67 | [ 68 | --configfile, 69 | pyproject.toml, 70 | --severity-level, 71 | all, 72 | --confidence-level, 73 | 
all, 74 | --quiet, 75 | --format, 76 | custom, 77 | ] 78 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.9.17 2 | -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base="light" 3 | -------------------------------------------------------------------------------- /.yamllint: -------------------------------------------------------------------------------- 1 | --- 2 | extends: default 3 | 4 | rules: 5 | braces: 6 | level: error 7 | max-spaces-inside: 1 8 | brackets: 9 | level: error 10 | max-spaces-inside: 1 11 | colons: 12 | level: error 13 | commas: 14 | level: error 15 | empty-lines: 16 | level: error 17 | hyphens: 18 | level: error 19 | document-start: 20 | level: error 21 | indentation: 22 | level: error 23 | indent-sequences: consistent 24 | new-lines: 25 | level: warning 26 | trailing-spaces: 27 | level: warning 28 | line-length: 29 | max: 159 30 | level: error 31 | allow-non-breakable-inline-mappings: true 32 | truthy: disable 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9.18-slim 3 | 4 | # Set environment varibles 5 | ENV PYTHONDONTWRITEBYTECODE 1 6 | ENV PYTHONUNBUFFERED 1 7 | 8 | # Install system dependencies 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends gcc python3-dev 11 | 12 | # Install poetry 13 | RUN pip install "poetry==1.5.1" 14 | 15 | # Set the working directory in the Docker image 16 | WORKDIR /app 17 | 18 | # Copy only requirements to cache them in docker layer 19 | COPY pyproject.toml poetry.lock ./ 20 | COPY src/genai 
/app/src/genai 21 | COPY README.md /app/README.md 22 | COPY app.py /app/app.py 23 | 24 | # Don't push the image to dockerhub 25 | COPY .env /app/.env 26 | COPY .streamlit /app/.streamlit 27 | 28 | # Project initialization: 29 | RUN poetry config virtualenvs.create false \ 30 | && poetry install --no-interaction --no-ansi 31 | 32 | EXPOSE 8501 33 | # Specify the command to run your application 34 | CMD ["streamlit", "run", "app.py"] 35 | -------------------------------------------------------------------------------- /Dockerfile.heroku: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM --platform=linux/amd64 python:3.9.18-slim 3 | 4 | # Set environment varibles 5 | ENV PYTHONDONTWRITEBYTECODE 1 6 | ENV PYTHONUNBUFFERED 1 7 | 8 | # Install system dependencies 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends gcc 11 | 12 | # Install poetry 13 | RUN pip install "poetry==1.5.1" 14 | 15 | # Set the working directory in the Docker image 16 | WORKDIR /app 17 | 18 | # Copy only requirements to cache them in docker layer 19 | COPY pyproject.toml poetry.lock ./ 20 | COPY src/genai /app/src/genai 21 | COPY README.md /app/README.md 22 | COPY app.py /app/app.py 23 | 24 | # Don't push the image to dockerhub 25 | COPY .env /app/.env 26 | COPY .streamlit /app/.streamlit 27 | 28 | # Project initialization: 29 | RUN poetry config virtualenvs.create false \ 30 | && poetry install --no-interaction --no-ansi 31 | 32 | # Specify the command to run your application 33 | CMD ["sh", "-c", "streamlit run --server.port $PORT app.py"] 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Nesta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated 
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | 3 | .DEFAULT_GOAL := help 4 | 5 | .PHONY: help 6 | help: ## Shows this help text 7 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 8 | 9 | .PHONY: init 10 | init: clean install ## Clean environment and reinstall all dependencies TODO: add `test` in the future 11 | 12 | .PHONY: clean 13 | clean: ## Removes project virtual env 14 | rm -rf .venv build dist **/*.egg-info .pytest_cache node_modules .coverage 15 | 16 | .PHONY: install 17 | install: ## Install the project dependencies and pre-commit using Poetry. 
18 | poetry install --with lint,test 19 | poetry run pre-commit install --hook-type pre-commit --hook-type commit-msg --hook-type pre-push 20 | 21 | # .PHONY: test 22 | # test: ## Run tests 23 | # poetry run python -m pytest 24 | 25 | .PHONY: lint 26 | lint: ## Apply linters to all files 27 | poetry run pre-commit run --all-files 28 | 29 | .PHONY: clean-poetry-lock 30 | clean-poetry-lock: ## Removes poetry.lock from all folders except .venv 31 | find . -name \poetry.lock -type f -not -path "./.venv/*" -delete 32 | 33 | .PHONY: build-pinecone 34 | build-pinecone: 35 | chmod +x build_pinecone_index.sh 36 | ./build_pinecone_index.sh 37 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | """Streamlit app for the Generative AI prototypes.""" 2 | 3 | import os 4 | 5 | import openai 6 | import streamlit as st 7 | 8 | from dotenv import load_dotenv 9 | from streamlit_option_menu import option_menu 10 | 11 | from genai.streamlit_pages import early_year_activity_plan 12 | from genai.streamlit_pages import eli3 13 | from genai.streamlit_pages import eyfs_compare 14 | from genai.streamlit_pages import eyfs_dm_kb 15 | from genai.streamlit_pages import eyfs_kb_bbc 16 | from genai.streamlit_pages import parenting_chatbot 17 | from genai.streamlit_pages.utils import reset_state 18 | 19 | 20 | load_dotenv() 21 | 22 | 23 | def auth_openai() -> None: 24 | """Authenticate with OpenAI.""" 25 | try: 26 | openai.api_key = os.environ["OPENAI_API_KEY"] 27 | except Exception: 28 | openai.api_key = st.secrets["OPENAI_API_KEY"] 29 | 30 | 31 | def s3_creds() -> None: 32 | """Get s3 creds.""" 33 | try: 34 | aws_key = os.environ["AWS_ACCESS_KEY_ID"] 35 | aws_secret = os.environ["AWS_SECRET_ACCESS_KEY"] 36 | s3_path = os.environ["S3_BUCKET"] 37 | except Exception: 38 | aws_key = st.secrets["AWS_ACCESS_KEY_ID"] 39 | aws_secret = st.secrets["AWS_SECRET_ACCESS_KEY"] 40 | 
s3_path = st.secrets["S3_BUCKET"] 41 | 42 | return aws_key, aws_secret, s3_path 43 | 44 | 45 | def check_password() -> bool: 46 | """Return `True` if the user had the correct password.""" 47 | 48 | def password_entered() -> None: 49 | """Check whether a password entered by the user is correct.""" 50 | if st.session_state["password"] == st.secrets["password"]: 51 | st.session_state["password_correct"] = True 52 | del st.session_state["password"] # don't store password 53 | else: 54 | st.session_state["password_correct"] = False 55 | 56 | if "password_correct" not in st.session_state: 57 | # First run, show input for password. 58 | st.text_input("Password", type="password", on_change=password_entered, key="password") 59 | return False 60 | elif not st.session_state["password_correct"]: 61 | # Password not correct, show input + error. 62 | st.text_input("Password", type="password", on_change=password_entered, key="password") 63 | st.error("😕 Password incorrect") 64 | return False 65 | else: 66 | # Password correct. 67 | 68 | return True 69 | 70 | 71 | def main() -> None: 72 | """Run the app.""" 73 | auth_openai() 74 | aws_key, aws_secret, s3_path = s3_creds() 75 | with st.sidebar: 76 | selected = option_menu( 77 | "Prototypes", 78 | [ 79 | "Home page", 80 | "ELI3", 81 | "EYFS-based activities", 82 | "EYFS-based activities + BBC activities", 83 | "EYFS prototypes comparison", 84 | "Development Matters prototype", 85 | "Parenting Chatbot", 86 | ], 87 | default_index=0, 88 | on_change=reset_state, 89 | key="menu_selection", 90 | ) 91 | if selected == "Home page": 92 | st.title("Nesta Discovery: Generative AI Prototypes") 93 | st.write("Welcome to the Nesta Discovery Generative AI prototypes. 
Please select a prototype from the menu.") 94 | elif selected == "ELI3": 95 | eli3() 96 | elif selected == "EYFS-based activities": 97 | early_year_activity_plan() 98 | elif selected == "EYFS-based activities + BBC activities": 99 | eyfs_kb_bbc() 100 | elif selected == "EYFS prototypes comparison": 101 | eyfs_compare() 102 | elif selected == "Development Matters prototype": 103 | eyfs_dm_kb() 104 | elif selected == "Parenting Chatbot": 105 | parenting_chatbot(aws_key, aws_secret, s3_path) 106 | 107 | 108 | if check_password(): 109 | main() 110 | -------------------------------------------------------------------------------- /build_pinecone_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python src/genai/eyfs/run_pinecone_index.py 3 | python src/genai/dm/run_dm_index.py 4 | python src/genai/parenting_chatbot/run_nhs_index_full_page.py 5 | -------------------------------------------------------------------------------- /charts/dm-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/dm-app.png -------------------------------------------------------------------------------- /charts/dm-pinecone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/dm-pinecone.png -------------------------------------------------------------------------------- /charts/eli3-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eli3-chat.png -------------------------------------------------------------------------------- /charts/eli3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eli3.png -------------------------------------------------------------------------------- /charts/eyfs-kb-bbc-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eyfs-kb-bbc-app.png -------------------------------------------------------------------------------- /charts/eyfs-kb-labels-and-vectors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eyfs-kb-labels-and-vectors.png -------------------------------------------------------------------------------- /charts/eyfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/eyfs.png -------------------------------------------------------------------------------- /charts/parenting-chatbot-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/parenting-chatbot-app.png -------------------------------------------------------------------------------- /charts/parenting-chatbot-index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/parenting-chatbot-index.png -------------------------------------------------------------------------------- /charts/whatsapp_bot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/charts/whatsapp_bot.png -------------------------------------------------------------------------------- /pip_reqs_for_reference.txt: -------------------------------------------------------------------------------- 1 | requests==2.28.1 2 | openai==0.27.8 3 | streamlit>=1.24.1 4 | python-dotenv>=1.0.0 5 | streamlit-option-menu>=0.3.6 6 | tiktoken>=0.4.0 7 | tenacity>=8.2.2 8 | beautifulsoup4>=4.12.2 9 | Flask>=2.3.2 10 | twilio>=8.5.0 11 | aiofiles>=23.1.0 12 | gunicorn>=21.2.0 13 | pinecone-client>=2.2.2 14 | streamlit-feedback==0.0.9 15 | boto3==1.28.17 16 | s3fs>=2023.9.0 17 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | create = true 3 | in-project = true 4 | -------------------------------------------------------------------------------- /public_activity_recommender_app.py: -------------------------------------------------------------------------------- 1 | """Streamlit app for the Generative AI prototypes.""" 2 | 3 | import os 4 | 5 | import openai 6 | import streamlit as st 7 | 8 | from dotenv import load_dotenv 9 | 10 | from genai.streamlit_pages import eyfs_dm_kb 11 | 12 | 13 | load_dotenv() 14 | 15 | 16 | def auth_openai() -> None: 17 | """Authenticate with OpenAI.""" 18 | try: 19 | openai.api_key = os.environ["OPENAI_API_KEY"] 20 | except Exception: 21 | openai.api_key = st.secrets["OPENAI_API_KEY"] 22 | 23 | 24 | def main() -> None: 25 | """Run the app.""" 26 | auth_openai() 27 | eyfs_dm_kb(sidebar=False) 28 | 29 | 30 | main() 31 | -------------------------------------------------------------------------------- /public_eli3_app.py: -------------------------------------------------------------------------------- 1 | 
"""Streamlit app for the Generative AI prototypes.""" 2 | 3 | import os 4 | 5 | import openai 6 | import streamlit as st 7 | 8 | from dotenv import load_dotenv 9 | 10 | from genai.streamlit_pages import eli3 11 | 12 | 13 | load_dotenv() 14 | 15 | 16 | def auth_openai() -> None: 17 | """Authenticate with OpenAI.""" 18 | try: 19 | openai.api_key = os.environ["OPENAI_API_KEY"] 20 | except Exception: 21 | openai.api_key = st.secrets["OPENAI_API_KEY"] 22 | 23 | 24 | def main() -> None: 25 | """Run the app.""" 26 | auth_openai() 27 | eli3(sidebar=False) 28 | 29 | 30 | main() 31 | -------------------------------------------------------------------------------- /public_parenting_chatbot_app.py: -------------------------------------------------------------------------------- 1 | """Streamlit app for the Generative AI prototypes.""" 2 | 3 | import os 4 | 5 | import openai 6 | import streamlit as st 7 | 8 | from dotenv import load_dotenv 9 | 10 | from genai.streamlit_pages import parenting_chatbot 11 | 12 | 13 | load_dotenv() 14 | 15 | 16 | def auth_openai() -> None: 17 | """Authenticate with OpenAI.""" 18 | try: 19 | openai.api_key = os.environ["OPENAI_API_KEY"] 20 | except Exception: 21 | openai.api_key = st.secrets["OPENAI_API_KEY"] 22 | 23 | 24 | def s3_creds() -> None: 25 | """Get s3 creds.""" 26 | try: 27 | aws_key = os.environ["AWS_ACCESS_KEY_ID"] 28 | aws_secret = os.environ["AWS_SECRET_ACCESS_KEY"] 29 | s3_path = os.environ["S3_BUCKET"] 30 | except Exception: 31 | aws_key = st.secrets["AWS_ACCESS_KEY_ID"] 32 | aws_secret = st.secrets["AWS_SECRET_ACCESS_KEY"] 33 | s3_path = st.secrets["S3_BUCKET"] 34 | 35 | return aws_key, aws_secret, s3_path 36 | 37 | 38 | def main() -> None: 39 | """Run the app.""" 40 | auth_openai() 41 | aws_key, aws_secret, s3_path = s3_creds() 42 | parenting_chatbot(aws_key, aws_secret, s3_path, sidebar=False) 43 | 44 | 45 | main() 46 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "discovery_generative_ai " 3 | version = "0.1.0" 4 | description = " We are exploring the potential impact of Generative AI on Nesta's Missions and work to uncover opportunities and risks that can inform Nesta strategy." 5 | authors = ["Nesta"] 6 | readme = "README.md" 7 | homepage = "https://github.com/nestauk/discovery_generative_ai" 8 | repository = "https://github.com/nestauk/discovery_generative_ai" 9 | keywords = ['llm', 'generative-ai'] # noqa 10 | packages = [ 11 | { include = "genai", from = "src" }, 12 | ] 13 | 14 | [tool.poetry.build] 15 | generate-setup-file = false 16 | 17 | 18 | [tool.poetry.dependencies] 19 | python = "3.9.17" 20 | requests = "2.28.1" 21 | openai = "0.27.8" 22 | streamlit = "^1.24.1" 23 | python-dotenv = "^1.0.0" 24 | streamlit-option-menu = "^0.3.6" 25 | tiktoken = "^0.4.0" 26 | tenacity = "^8.2.2" 27 | beautifulsoup4 = "^4.12.2" 28 | Flask = "^2.3.2" 29 | twilio = "^8.5.0" 30 | aiofiles = "23.1.0" 31 | gunicorn = "^21.2.0" 32 | pinecone-client = "^4.1" 33 | streamlit-feedback = "0.0.9" 34 | boto3= "1.28.17" 35 | s3fs = "^2023.9.0" 36 | seaborn = "^0.13.0" 37 | cchardet = "^2.1.7" 38 | chardet = "^5.2.0" 39 | 40 | [tool.poetry.group.test] 41 | optional = true 42 | 43 | 44 | [tool.poetry.group.test.dependencies] 45 | pytest = "^7.1.1" # Allows for testing of the project 46 | pytest-cov = "^4.0.0" # Allows to run coverage of the project 47 | moto = "^3.1.6" # Allows for mocking of AWS services 48 | 49 | [tool.poetry.group.lint] 50 | optional = true 51 | 52 | [tool.poetry.group.lint.dependencies] 53 | flake8 = "^4.0.1" # Style Guide Enforcement 54 | pyproject-flake8 = "^0.0.1-alpha.4" # Allows configuration of flake 8 from pyproject.toml 55 | flake8-bugbear = "^22.3.23" # Finding likely bugs and design problems in your program. 56 | flake8-print = "^4.0.0" # Check for print statements in python files. 
57 | flake8-docstrings = "^1.6.0" # Include checks provided by pep257. 58 | flake8-annotations = "^2.8.0" # check for presence of type annotations in function definitions. 59 | isort = "^5.10.1" # Sort imports 60 | yamllint = "^1.26.3" # Linter for YAML files 61 | bandit = {version = "^1.7.4", extras = ["toml"]} # Security linter 62 | pre-commit = "^2.18.1" # Runs a pipeline before commiting code 63 | black = "^22.6.0" # Python style checks 64 | 65 | 66 | 67 | [tool.poetry.group.dev.dependencies] 68 | ipykernel = "^6.24.0" 69 | 70 | [build-system] 71 | requires = ["poetry-core>=1.2.0rc1"] 72 | build-backend = "poetry.core.masonry.api" 73 | 74 | 75 | [tool.black] 76 | line-length = 119 77 | target-version = ['py39'] 78 | include = '\.pyi?$' 79 | extend-exclude = ''' 80 | /( 81 | \.eggs 82 | | \.git 83 | | \.venv 84 | | \.env 85 | | \.vscode 86 | | build 87 | | dist 88 | | \.mypy_cache 89 | )\ 90 | ''' 91 | 92 | [tool.isort] 93 | profile = 'black' 94 | force_single_line = true 95 | combine_as_imports = true 96 | lines_between_types = 1 97 | lines_after_imports = 2 98 | src_paths = ["src", "tests"] 99 | line_length = 119 100 | 101 | [tool.flake8] 102 | max-line-length = 119 103 | select = [ # Choose the flake8 errors to select globally here 104 | "C", # McCabe complexity 105 | "E", # pep8 errors 106 | "W", # pep8 warnings 107 | "F", # pyflakes errors 108 | "N", # naming conventions 109 | "B", # bugbear errors (needs plugin installed) 110 | "ANN", # flake8 annotations errors (needs plugin installed) 111 | "T", # flake8 print errors (needs plugin installed) 112 | "D", # flake8 doscstrings errors (needs plugin installed) 113 | "B950", # Line too long. It considers "max-line-length" but only triggers when exceeded by more than 10%. 
114 | ] 115 | ignore = [ # Choose the flake8 errors to ignore globally here 116 | "E501", # Line too long (using B950 instead, which has 10% tolerance) 117 | "D107", # Missing docstring in __init__ 118 | "D202", # No blank lines allowed after function docstring 119 | "D400", # First line should end with a period 120 | "D100", # Missing docstring in public module 121 | "D104", # Missing docstring in public package 122 | "ANN003", # Missing type annotation for **kwargs 123 | "ANN002", # Missing type annotation for *args 124 | "ANN1", # Missing type annotation for self in method or cls method 125 | "W503", # Line break occurred before a binary operator 126 | "E203", # Whitespace before ':' 127 | ] 128 | per-file-ignores = [ # Choose the flake8 errors to ignore per file here 129 | "*/__init__.py:F401", # Ignore imported but unused in __init__.py files 130 | "tests/*:ANN,D", # Ignore Docstring and annotations on tests 131 | ] 132 | exclude = [ 133 | ".venv/*", 134 | ".vscode/*", 135 | ] 136 | # See other flake8 extensions here: https://github.com/DmytroLitvinov/awesome-flake8-extensions 137 | 138 | 139 | [tool.bandit] 140 | skips = [ # Choose the bandit errors to ignore globally 141 | "B101", # Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. 142 | "B104", # Possible binding to all interfaces 143 | ] 144 | exclude_dirs = [ 145 | ".venv", 146 | ] 147 | 148 | [tool.pytest.ini_options] 149 | minversion = "7.1" 150 | addopts = """ 151 | --verbose 152 | --color=yes 153 | --assert=plain 154 | --cov-report term 155 | --cov=src 156 | --last-failed 157 | """ 158 | testpaths = [ 159 | "tests", 160 | ] 161 | 162 | [tool.coverage.run] 163 | omit = ["*/tests/*"] # Remove test files from coverage run. 164 | 165 | [tool.coverage.report] 166 | omit = ["*/tests/*"] # Remove test files from coverage report.
167 | -------------------------------------------------------------------------------- /src/evals/charts/question_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/evals/charts/question_matrix.png -------------------------------------------------------------------------------- /src/evals/charts/win_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/evals/charts/win_matrix.png -------------------------------------------------------------------------------- /src/genai/__init__.py: -------------------------------------------------------------------------------- 1 | from .prompt_template import FunctionTemplate 2 | from .prompt_template import MessageTemplate 3 | from .vector_index import PineconeIndex 4 | -------------------------------------------------------------------------------- /src/genai/dm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/dm/__init__.py -------------------------------------------------------------------------------- /src/genai/dm/prompts/dm_prompt.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions###\nI will describe a theme below. 
Generate fun and exciting activities related to UK's Early Years Foundation Stage (EYFS) framework and the Development Matters guidance.\n\n###Examples###\nThe activities you generated must be inspired by the examples below:\n{examples}\n\n###Constraints###\n- The activities must be suitable for kids in the age groups: {age_groups}.\n- The activities must be related to the areas of learning: {areas_of_learning}.\n- You must generate 5 activities.\n\n###Description###\n{description}\n\n"} 2 | -------------------------------------------------------------------------------- /src/genai/dm/prompts/dm_prompt_2.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions###\nI will describe a theme below. Generate fun and exciting activities related to UK's Early Years Foundation Stage (EYFS) framework and the Development Matters guidance.\n\n###Examples###\nThe activities you generate must be inspired by the examples below:\n{examples}\n\n###Constraints###\n- The activities must be suitable for kids in the age groups: {age_groups}.\n- The activities must be related to the areas of learning: {areas_of_learning}.\n- You must generate 5 activities.\n- You must describe each activity in 5-6 sentences.\n\n###Description###\n{description}\n\n###Formatting###\nYou must format the activities as follows:\n## \n\n\n## \n\n"} 2 | -------------------------------------------------------------------------------- /src/genai/dm/run_dm_index.py: -------------------------------------------------------------------------------- 1 | """Build a pinecone index with the Development Matters learning goals and examples.""" 2 | 3 | import os 4 | import uuid 5 | 6 | import openai 7 | 8 | from dotenv import load_dotenv 9 | 10 | from genai.eyfs import get_embedding 11 | from genai.utils import read_json 12 | from genai.vector_index import PineconeIndex 13 | 14 | 15 | load_dotenv() 16 | 17 | 18 | openai.api_key = 
os.environ["OPENAI_API_KEY"] 19 | PATH_TO_DM = "src/genai/dm/dm.json" 20 | INDEX_NAME = "eyfs-index" 21 | ENCODER_NAME = "text-embedding-ada-002" 22 | 23 | if "__main__" == __name__: 24 | data = read_json(PATH_TO_DM) 25 | # Temp hack to exclude the template 26 | data = [d for d in data if d["area_of_learning"] != ""] 27 | 28 | # Format the data to what pinecone needs and generate a temp uuid 29 | docs = [] 30 | for elem in data: 31 | aol = elem["area_of_learning"] 32 | d = elem["age_group"] 33 | for age, age_dict in d.items(): 34 | for k, items in age_dict.items(): 35 | for item in items: 36 | doc = tuple( 37 | ( 38 | str(uuid.uuid4()), 39 | get_embedding(item), 40 | { 41 | "age_group": age, 42 | "type_": k, 43 | "source": "dm", 44 | "text": item, 45 | "areas_of_learning": aol, 46 | }, 47 | ) 48 | ) 49 | docs.append(doc) 50 | 51 | # Build the index 52 | conn = PineconeIndex(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_REGION"]) 53 | 54 | conn.build_and_upsert( 55 | index_name=INDEX_NAME, 56 | dimension=1536, 57 | metric="euclidean", 58 | docs=docs, 59 | metadata_config={"indexed": ["areas_of_learning", "source", "type_", "age_group"]}, 60 | batch_size=40, 61 | ) 62 | -------------------------------------------------------------------------------- /src/genai/eli3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/eli3/__init__.py -------------------------------------------------------------------------------- /src/genai/eli3/prompts/eli3.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Instructions###\nYou are a helpful, kind, intelligent and polite early-year educator. Your task is to explain a concept to a 3 year old child. You must explain it in simple words that a young kid would understand. 
You must also be patient and never offend or be aggressive. Gendered language and any adjectives about the kid are strictly prohibited.\n\n###Question###\n{input}\n\n###Answer###\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eli3/prompts/eli3_chat.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Instructions###\nYou are a helpful, kind, intelligent and polite early-year educator. Your task is to explain a concept to a 3 year old child. You must explain it in simple words that a young kid would understand. You must also be patient and never offend or be aggressive. Gendered language and any adjectives about the kid are strictly prohibited.\n\n###Answer###\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eli3/prompts/eli3_chat_2.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "\n###Instructions###\nYou are a helpful, kind, intelligent and polite early-year educator. Your task is to discuss a concept with a 3 year old child. Your role as an EY educator is to build children's curiosity and develop their thinking process, not to provide an explicit answer to their question. You should provide an answer to the question and ask follow-up questions. You must also be patient and never offend or be aggressive.
Gendered language and any adjectives about the kid are strictly prohibited.\n\n###Answer###\n", "name": null} 2 | -------------------------------------------------------------------------------- /src/genai/eyfs/__init__.py: -------------------------------------------------------------------------------- 1 | from .eyfs import EYFSClassifier 2 | from .eyfs import TextGenerator 3 | from .eyfs import get_embedding 4 | -------------------------------------------------------------------------------- /src/genai/eyfs/areas_of_learning.json: -------------------------------------------------------------------------------- 1 | { 2 | "Communication and Language": "##Communication and Language##\nThe development of children’s spoken language underpins all seven areas of learning and development. Children’s back-and-forth interactions from an early age form the foundations for language and cognitive development. The number and quality of the conversations they have with adults and peers throughout the day in a language-rich environment is crucial. By commenting on what children are interested in or doing, and echoing back what they say with new vocabulary added, practitioners will build children's language effectively. Reading frequently to children, and engaging them actively in stories, non-fiction, rhymes and poems, and then providing them with extensive opportunities to use and embed new words in a range of contexts, will give children the opportunity to thrive. 
Through conversation, story-telling and role play, where children share their ideas with support and modelling from their teacher, and sensitive questioning that invites them to elaborate, children become comfortable using a rich range of vocabulary and language structures.", 3 | "Personal, Social and Emotional Development": "##Personal, Social and Emotional Development##\nChildren’s personal, social and emotional development (PSED) is crucial for children to lead healthy and happy lives, and is fundamental to their cognitive development. Underpinning their personal development are the important attachments that shape their social world. Strong, warm and supportive relationships with adults enable children to learn how to understand their own feelings and those of others. Children should be supported to manage emotions, develop a positive sense of self, set themselves simple goals, have confidence in their own abilities, to persist and wait for what they want and direct attention as necessary. Through adult modelling and guidance, they will learn how to look after their bodies, including healthy eating, and manage personal needs independently. Through supported interaction with other children, they learn how to make good friendships, co-operate and resolve conflicts peaceably. These attributes will provide a secure platform from which children can achieve at school and in later life.", 4 | "Physical Development": "##Physical Development##\nPhysical activity is vital in children’s all-round development, enabling them to pursue happy, healthy and active lives7. Gross and fine motor experiences develop incrementally throughout early childhood, starting with sensory explorations and the development of a child’s strength, co-ordination and positional awareness through tummy time, crawling and play movement with both objects and adults. 
By creating games and providing opportunities for play both indoors and outdoors, adults can support children to develop their core strength, stability, balance, spatial awareness, co-ordination and agility. Gross motor skills provide the foundation for developing healthy bodies and social and emotional well-being. Fine motor control and precision helps with hand-eye co-ordination, which is later linked to early literacy. Repeated and varied opportunities to explore and play with small world activities, puzzles, arts and crafts and the practice of using small tools, with feedback and support from adults, allow children to develop proficiency, control and confidence.", 5 | "Literacy": "##Literacy##\nIt is crucial for children to develop a life-long love of reading. Reading consists of two dimensions: language comprehension and word reading. Language comprehension (necessary for both reading and writing) starts from birth. It only develops when adults talk with children about the world around them and the books (stories and non-fiction) they read with them, and enjoy rhymes, poems and songs together. Skilled word reading, taught later, involves both the speedy working out of the pronunciation of unfamiliar printed words (decoding) and the speedy recognition of familiar printed words. Writing involves transcription (spelling and handwriting) and composition (articulating ideas and structuring them in speech, before writing).", 6 | "Mathematics": "##Mathematics##\nDeveloping a strong grounding in number is essential so that all children develop the necessary building blocks to excel mathematically. Children should be able to count confidently, develop a deep understanding of the numbers to 10, the relationships between them and the patterns within those numbers. 
By providing frequent and varied opportunities to build and apply this understanding - such as using manipulatives, including small pebbles and tens frames for organising counting - children will develop a secure base of knowledge and vocabulary from which mastery of mathematics is built. In addition, it is important that the curriculum includes rich opportunities for children to develop their spatial reasoning skills across all areas of mathematics including shape, space and measures. It is important that children develop positive attitudes and interests in mathematics, look for patterns and relationships, spot connections, ‘have a go’, talk to adults and peers about what they notice and not be afraid to make mistakes.", 7 | "Understanding the World": "##Understanding the World##\nUnderstanding the world involves guiding children to make sense of their physical world and their community. The frequency and range of children’s personal experiences increases their knowledge and sense of the world around them – from visiting parks, libraries and museums to meeting important members of society such as police officers, nurses and firefighters. In addition, listening to a broad selection of stories, non-fiction, rhymes and poems will foster their understanding of our culturally, socially, technologically and ecologically diverse world. As well as building important knowledge, this extends their familiarity with words that support understanding across domains. Enriching and widening children’s vocabulary will support later reading comprehension.", 8 | "Expressive Arts and Design": "##Expressive Arts and Design##\nThe development of children’s artistic and cultural awareness supports their imagination and creativity. It is important that children have regular opportunities to engage with the arts, enabling them to explore and play with a wide range of media and materials. 
The quality and variety of what children see, hear and participate in is crucial for developing their understanding, self-expression, vocabulary and ability to communicate through the arts. The frequency, repetition and depth of their experiences are fundamental to their progress in interpreting and appreciating what they hear, respond to and observe." 9 | } 10 | -------------------------------------------------------------------------------- /src/genai/eyfs/eyfs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import logging 4 | import string 5 | 6 | from typing import Dict 7 | from typing import List 8 | from typing import Optional 9 | from typing import Union 10 | 11 | import aiofiles 12 | import openai 13 | 14 | from openai.error import APIConnectionError 15 | from openai.error import APIError 16 | from openai.error import RateLimitError 17 | from openai.error import ServiceUnavailableError 18 | from openai.error import Timeout 19 | from tenacity import before_sleep_log 20 | from tenacity import retry 21 | from tenacity import retry_if_exception_type 22 | from tenacity import stop_after_attempt 23 | from tenacity import wait_exponential 24 | 25 | from genai import MessageTemplate 26 | 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | class TextGenerator: 32 | """Generate tokens using OpenAI's API.""" 33 | 34 | @classmethod 35 | def generate( 36 | cls, 37 | messages: List[Union[str, Dict]], 38 | message_kwargs: Optional[Dict] = None, 39 | model: str = "gpt-3.5-turbo", 40 | temperature: float = 0.0, 41 | **openai_kwargs, 42 | ) -> Union[Dict, str]: 43 | """Generate text using OpenAI's API. 44 | 45 | More details on the API and messages: https://platform.openai.com/docs/guides/gpt/chat-completions-api 46 | 47 | Args: 48 | messages 49 | A list of messages to send to the API. 
They can be: 50 | - dictionaries 51 | - str (JSON file path) 52 | - instances of classes that inherit from BasePromptTemplate 53 | 54 | message_kwargs 55 | A dictionary of keyword arguments to pass to the messages. 56 | 57 | model 58 | The OpenAI model to use. 59 | 60 | temperature 61 | The sampling temperature. 62 | 63 | openai_kwargs 64 | Keyword arguments to pass to the OpenAI API. 65 | 66 | Returns: 67 | A dictionary containing the response from the API. 68 | 69 | """ 70 | if not message_kwargs: 71 | message_kwargs = {} 72 | 73 | messages = [cls.prepare_message(message, **message_kwargs) for message in messages] 74 | 75 | response = cls._call( 76 | messages=messages, 77 | temperature=temperature, 78 | model=model, 79 | **openai_kwargs, 80 | ) 81 | 82 | return response 83 | 84 | @classmethod 85 | def prepare_message(cls, obj: Union[MessageTemplate, dict, str], **kwargs) -> Dict: 86 | """Process a message.""" 87 | if not isinstance(obj, MessageTemplate): 88 | prompt = MessageTemplate.load(obj) 89 | else: 90 | prompt = obj 91 | 92 | prompt.format_message(**kwargs) 93 | 94 | return prompt.to_prompt() 95 | 96 | @staticmethod 97 | @retry( 98 | retry( 99 | reraise=True, 100 | stop=stop_after_attempt(6), 101 | wait=wait_exponential(multiplier=1, min=1, max=60), 102 | retry=( 103 | retry_if_exception_type(Timeout) 104 | | retry_if_exception_type(APIError) 105 | | retry_if_exception_type(APIConnectionError) 106 | | retry_if_exception_type(RateLimitError) 107 | | retry_if_exception_type(ServiceUnavailableError) 108 | ), 109 | before_sleep=before_sleep_log(logger, logging.WARNING), 110 | ) 111 | ) 112 | def _call( 113 | messages: List[Dict], 114 | temperature: float = 0.0, 115 | **kwargs, 116 | ) -> Dict: 117 | response = openai.ChatCompletion.create( 118 | messages=messages, 119 | temperature=temperature, 120 | **kwargs, 121 | ) 122 | 123 | return response # type: ignore 124 | 125 | @staticmethod 126 | def _extract_placeholders(s: str) -> List[str]: 127 | """Extract 
placeholder variables that can be filled in an f-string.""" 128 | formatter = string.Formatter() 129 | return [field_name for _, field_name, _, _ in formatter.parse(s) if field_name is not None] 130 | 131 | 132 | class EYFSClassifier: 133 | """Classify text to EYFS areas of learning.""" 134 | 135 | @classmethod 136 | def generate( 137 | cls, 138 | messages: List[Union[str, Dict]], 139 | message_kwargs: Optional[Dict] = None, 140 | model: str = "gpt-3.5-turbo", 141 | temperature: float = 0.0, 142 | **openai_kwargs, 143 | ) -> Union[Dict, str]: 144 | """Generate text using OpenAI's API. 145 | 146 | More details on the API and messages: https://platform.openai.com/docs/guides/gpt/chat-completions-api 147 | 148 | Args: 149 | messages 150 | A list of messages to send to the API. They can be: 151 | - dictionaries 152 | - str (JSON file path) 153 | - instances of classes that inherit from BasePromptTemplate 154 | 155 | message_kwargs 156 | A dictionary of keyword arguments to pass to the messages. 157 | 158 | model 159 | The OpenAI model to use. 160 | 161 | temperature 162 | The sampling temperature. 163 | 164 | openai_kwargs 165 | Keyword arguments to pass to the OpenAI API. 166 | 167 | Returns: 168 | A dictionary containing the response from the API. 
169 | 170 | """ 171 | if not message_kwargs: 172 | message_kwargs = {} 173 | 174 | messages = [cls.prepare_message(message, **message_kwargs) for message in messages] 175 | 176 | response = cls._call( 177 | messages=messages, 178 | temperature=temperature, 179 | model=model, 180 | **openai_kwargs, 181 | ) 182 | 183 | parsed_response = json.loads(response["choices"][0]["message"]["function_call"]["arguments"]) 184 | if parsed_response: 185 | parsed_response["url"] = message_kwargs["url"] 186 | return parsed_response 187 | 188 | return message_kwargs["url"] 189 | 190 | @classmethod 191 | def prepare_message(cls, obj: Union[MessageTemplate, dict, str], **kwargs) -> Dict: 192 | """Process a message.""" 193 | if not isinstance(obj, MessageTemplate): 194 | prompt = MessageTemplate.load(obj) 195 | else: 196 | prompt = obj 197 | 198 | prompt.format_message(**kwargs) 199 | 200 | return prompt.to_prompt() 201 | 202 | @staticmethod 203 | @retry( 204 | retry( 205 | reraise=True, 206 | stop=stop_after_attempt(6), 207 | wait=wait_exponential(multiplier=1, min=1, max=60), 208 | retry=( 209 | retry_if_exception_type(Timeout) # noqa: W503 210 | | retry_if_exception_type(APIError) # noqa: W503 211 | | retry_if_exception_type(APIConnectionError) # noqa: W503 212 | | retry_if_exception_type(RateLimitError) # noqa: W503 213 | | retry_if_exception_type(ServiceUnavailableError) # noqa: W503 214 | ), 215 | before_sleep=before_sleep_log(logger, logging.WARNING), 216 | ) 217 | ) 218 | def _call( 219 | messages: List[Dict], 220 | model: str = "gpt-3.5-turbo", 221 | temperature: float = 0.0, 222 | **kwargs, 223 | ) -> Dict: 224 | response = openai.ChatCompletion.create( 225 | messages=messages, 226 | model=model, 227 | temperature=temperature, 228 | **kwargs, 229 | ) 230 | 231 | return response # type: ignore 232 | 233 | @classmethod 234 | async def agenerate( 235 | cls, 236 | messages: List[Union[str, Dict]], 237 | message_kwargs: Optional[Dict] = None, 238 | model: str = "gpt-3.5-turbo", 
239 | temperature: float = 0.0, 240 | concurrency: int = 10, 241 | **openai_kwargs, 242 | ) -> Dict: 243 | """Generate text using async OpenAI's API. 244 | 245 | More details on the API and messages: https://platform.openai.com/docs/guides/gpt/chat-completions-api 246 | 247 | Args: 248 | messages 249 | A list of messages to send to the API. They can be: 250 | - dictionaries 251 | - str (JSON file path) 252 | 253 | message_kwargs 254 | A dictionary of keyword arguments to pass to the messages. 255 | 256 | temperature 257 | The sampling temperature. 258 | 259 | openai_kwargs 260 | Keyword arguments to pass to the OpenAI API. 261 | 262 | concurrency: 263 | The number of concurrent requests to make. 264 | 265 | Returns: 266 | A dictionary containing the response from the API. 267 | 268 | """ 269 | semaphore = asyncio.Semaphore(concurrency) 270 | async with semaphore: 271 | if not message_kwargs: 272 | message_kwargs = {} 273 | 274 | messages = [cls.prepare_message(message, **message_kwargs) for message in messages] 275 | 276 | response = await cls._acall( 277 | messages=messages, 278 | temperature=temperature, 279 | model=model, 280 | **openai_kwargs, 281 | ) 282 | 283 | response = response["choices"][0]["message"]["function_call"]["arguments"] 284 | parsed_response = await cls._parse_json(response) 285 | if parsed_response: 286 | parsed_response["url"] = message_kwargs["url"] 287 | return parsed_response 288 | 289 | return message_kwargs["url"] 290 | 291 | @staticmethod 292 | @retry( 293 | retry( 294 | reraise=True, 295 | stop=stop_after_attempt(6), 296 | wait=wait_exponential(multiplier=1, min=1, max=60), 297 | retry=( 298 | retry_if_exception_type(Timeout) # noqa: W503 299 | | retry_if_exception_type(APIError) # noqa: W503 300 | | retry_if_exception_type(APIConnectionError) # noqa: W503 301 | | retry_if_exception_type(RateLimitError) # noqa: W503 302 | | retry_if_exception_type(ServiceUnavailableError) # noqa: W503 303 | ), 304 | 
before_sleep=before_sleep_log(logger, logging.WARNING), 305 | ) 306 | ) 307 | async def _acall( 308 | messages: List[Dict], 309 | model: str = "gpt-3.5-turbo", 310 | temperature: float = 0.0, 311 | **kwargs, 312 | ) -> Dict: 313 | response = await openai.ChatCompletion.acreate( 314 | messages=messages, 315 | model=model, 316 | temperature=temperature, 317 | **kwargs, 318 | ) 319 | 320 | return response # type: ignore 321 | 322 | @staticmethod 323 | async def _try_parse_json(item: str) -> Union[dict, None]: 324 | try: 325 | return json.loads(item) 326 | except json.JSONDecodeError as e: 327 | return e 328 | 329 | @staticmethod 330 | async def _parse_json(item: str) -> Union[dict, None]: 331 | result = await EYFSClassifier._try_parse_json(item) 332 | if isinstance(result, json.JSONDecodeError): 333 | result = await EYFSClassifier._try_parse_json(item.replace("'", '"')) 334 | if isinstance(result, json.JSONDecodeError): 335 | logging.error(f"Invalid JSON: Error: {str(result)}") 336 | return None 337 | return result 338 | 339 | @staticmethod 340 | async def write_line_to_file(item: dict, filename: str) -> None: 341 | """Write the item to a file.""" 342 | file = f"{filename}/invalid_json.txt" 343 | if isinstance(item, dict): 344 | file = f"{filename}/parsed_json.jsonl" 345 | 346 | async with aiofiles.open(file, "a") as f: 347 | await f.write(f"{json.dumps(item)}\n") 348 | 349 | 350 | def get_embedding(text: str, model: str = "text-embedding-ada-002") -> List[float]: 351 | """Encode text with OpenAI's text embedding model.""" 352 | text = text.replace("\n", " ") 353 | return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"] 354 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/choices_function.json: -------------------------------------------------------------------------------- 1 | {"name": "extract_activity_names", "description": "Extract all entities starting with three hashtags.",
"parameters": {"type": "object", "properties": {"activity_names": {"type": "array", "items": {"type": "string", "description": "The name of an activity. It usually starts with ###."}}}, "required": ["activity_names"]}} 2 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/classifier.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Areas of learning###\n{areas_of_learning}\n\n###Instructions###\nCategorise the following text to one or more areas of learning.\n{text}\n"} 2 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/classifier_function.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "predict_area_of_learning", 3 | "description": "Predict the EYFS area of learning for a given text", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "array", 9 | "items": { 10 | "type": "string", 11 | "enum": [ 12 | "Communication and Language", 13 | "Personal, Social and Emotional Development", 14 | "Physical Development", 15 | "Literacy", 16 | "Mathematics", 17 | "Understanding the World", 18 | "Expressive Arts and Design", 19 | "None" 20 | ] 21 | }, 22 | "description": "The predicted areas of learning." 23 | } 24 | }, 25 | "required": [ 26 | "prediction" 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/constraints.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Requirements for the activities###\n1. Your suggestions must be fun and engaging for the children.\n2. Your suggestions must be novel, inspiring and memorable.\n3. You must suggest topics for conversation with the children and questions to ask them.\n4. 
Your proposed activities engage children in the following Areas of Learning: {areas_of_learning}.\n5. You must generate {n_results} activities.\n6. Your proposed activities must be played {location}\n7. Describe the activities in 3-4 sentences." 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/context_and_task.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Context###The UK's Early Years Foundation Stage framework recommends that educational programmes must involve activities and experiences for children, as set out under each of the areas of learning described below.\n\n##Areas of Learning###\n{areas_of_learning_text}\n\n###Instructions###\nI am an early years educator and I am working with children 3-4 years old. I will describe you a situation in the ###Description### section. Please propose two conversation ideas and several games and crafts ideas I could do with the children to extend their learning.\n\nTypes of activities:\n- Conversations: Asking them questions about the topic\n- Games and Crafts: Puzzles, games, role play, arts and crafts\n\n###Formatting###\nReturn the proposed activities in the following format and always start with the two conversation ideas and then follow with all games and crafts ideas:\n\n## Conversations\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n## Games and Crafts\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n### \n\n**Activity description**:\n\n**Areas of learning**:\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/context_and_task_with_examples.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Context###The UK's Early Years Foundation Stage 
framework recommends that educational programmes must involve activities and experiences for children, as set out under each of the areas of learning described below.\n\n##Areas of Learning###\n{areas_of_learning_text}\n\n###Instructions###\nI am an early years educator and I am working with children 3-4 years old. I will describe you a situation in the ###Description### section. Please propose two conversation ideas and several games and crafts ideas I could do with the children to extend their learning.\n\nTypes of activities:\n- Conversations: Asking them questions about the topic\n- Games and Crafts: Puzzles, games, role play, arts and crafts\n\n###Activity examples###\nTake inspiration from the following examples:\n\n{activity_examples}\n\n###Formatting###\nReturn the proposed activities in the following format and always start with the two conversation ideas and then follow with all games and crafts ideas:\n\n## Conversations\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n## Games and Crafts\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n### \n\n**Activity description**:\n\n**Areas of learning**:\n\n"} 2 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/situation.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Description###\n{description}\n\n###Activities###\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/eyfs/prompts/system.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "system", 3 | "content": "You are a very creative and highly educated assistant who loves designing early year education programmes." 
def get_bbc_activities(path: str) -> pd.DataFrame:
    """Read and clean the BBC activities file and return a dataframe.

    Renames the age column, drops rows missing ``text`` or ``URL``, and
    removes duplicate URLs so each activity appears exactly once.
    """
    return (
        pd.read_csv(path)
        .rename(columns={"Age Range (if applicable)": "Age"})
        .dropna(subset=["text", "URL"])
        .drop_duplicates(subset=["URL"])
    )
async def main() -> None:
    """Create prompts for path selection and infer paths.

    Reads the BBC activities, batches them in groups of 20, and classifies
    each activity into EYFS areas of learning via concurrent OpenAI
    chat-completion calls. Each result is appended to ``OUTPUT_FILENAME``
    as soon as it completes.
    """
    openai.aiosession.set(ClientSession())

    # Fetch the BBC activities
    activities_df = get_bbc_activities(os.environ["PATH_TO_BBC_ACTIVITIES_DATA"])

    # Fetch the EYFS areas of learning (only the prompt text is needed here;
    # the class keys are used by the function-call schema, not this script)
    _, areas_of_learning_text = get_areas_of_learning(PATH_TO_AREAS_OF_LEARNING)

    print(f"Number of BBC activities: {len(activities_df)}")  # noqa: T001

    message = MessageTemplate.load(PATH_TO_MESSAGE_PROMPT)
    function = FunctionTemplate.load(PATH_TO_FUNCTION)
    model = "gpt-3.5-turbo"
    temperature = 0.6

    try:
        for i, batched_results in enumerate(batch(activities_df, 20)):
            print(f"Batch {i} / {len(activities_df) // 20}")  # noqa: T001
            tasks = [
                EYFSClassifier.agenerate(
                    model=model,
                    temperature=temperature,
                    messages=[message],
                    message_kwargs={
                        "areas_of_learning": areas_of_learning_text,
                        "text": tup.text,
                        "url": tup.URL,
                    },
                    functions=[function.to_prompt()],
                    function_call={"name": "predict_area_of_learning"},
                    max_tokens=100,
                    concurrency=5,
                )
                for tup in batched_results.itertuples()
            ]

            # Consume results as they complete and persist each one immediately
            # so a crash mid-run loses at most the in-flight batch.
            for future in asyncio.as_completed(tasks):
                result = await future  # Get the result (waits if not ready)
                await EYFSClassifier.write_line_to_file(result, OUTPUT_FILENAME)  # Write to the file

            # BUG FIX: the original called time.sleep(2), which blocks the
            # event loop; asyncio.sleep yields control so pending callbacks
            # (e.g. connection cleanup) can run during the pause.
            await asyncio.sleep(2)
    finally:
        # Always close the shared aiohttp session, even if a batch raises.
        await openai.aiosession.get().close()
def get_labelled_bbc_activities(path: str) -> pd.DataFrame:
    """Read and clean the labelled BBC activities file and return a dataframe.

    Parameters
    ----------
    path
        S3 path to a JSONL file with one prediction record per line.

    Returns
    -------
    pd.DataFrame
        One row per labelled activity with ``areas_of_learning`` and ``URL``
        columns; rows without a usable prediction are dropped.
    """
    data = read_jsonl_from_s3(path)
    # PERF FIX: build the frame in a single call instead of concatenating one
    # single-row DataFrame per line (O(n) frame allocations). The index
    # differs (RangeIndex vs repeated 0s) but the caller merges on columns,
    # which rebuilds the index anyway.
    df = pd.DataFrame(data)
    # Normalise falsy predictions ("" / None / empty list) to NaN, then drop them.
    df["prediction"] = df["prediction"].apply(lambda row: row if row else np.nan)
    df = df[~df.prediction.isnull()]
    # "None" is the classifier's explicit no-match label; exclude it too.
    df = df[df.prediction != "None"]
    df = df.rename({"prediction": "areas_of_learning", "url": "URL"}, axis=1)

    return df
get_labelled_bbc_activities(os.environ["PATH_TO_LABELLED_BBC_DATA"]) 53 | bbc = get_bbc_activities(os.environ["PATH_TO_BBC_ACTIVITIES_DATA"]) 54 | 55 | df = labels.merge(bbc[["SHORT DESCRIPTION", "text", "URL", "title"]], how="left", left_on="URL", right_on="URL") 56 | 57 | # Encode the BBC activities' text 58 | df["embedding"] = df["text"].apply(lambda row: get_embedding(row, model=ENCODER_NAME)) 59 | 60 | # Batch items 61 | items = [] 62 | for tup in df.itertuples(): 63 | item = ( 64 | tup.URL, 65 | tup.embedding, 66 | {"areas_of_learning": tup.areas_of_learning, "title": tup.title, "text": tup.text, "source": "BBC"}, 67 | ) 68 | items.append(item) 69 | 70 | # Build the index 71 | conn = PineconeIndex(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_REGION"]) 72 | conn.build_and_upsert( 73 | index_name=INDEX_NAME, 74 | dimension=len(df["embedding"].iloc[0]), 75 | metric="euclidean", 76 | docs=items, 77 | metadata_config={"indexed": ["areas_of_learning", "source", "type_", "age_group"]}, 78 | batch_size=40, 79 | delete_if_exists=True, 80 | ) 81 | 82 | 83 | if "__main__" == __name__: 84 | main() 85 | -------------------------------------------------------------------------------- /src/genai/message_history.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from abc import abstractmethod 3 | from typing import List 4 | 5 | import tiktoken 6 | 7 | from genai import MessageTemplate 8 | 9 | 10 | class TokenCounter: 11 | """Count tokens and buffer messages from memory.""" 12 | 13 | @staticmethod 14 | def _count_tokens_from_string(s: str, model_name: str) -> int: 15 | """Return the number of tokens in a text string.""" 16 | encoding = tiktoken.encoding_for_model(model_name) 17 | num_tokens = len(encoding.encode(s)) 18 | return num_tokens 19 | 20 | @classmethod 21 | def _count_tokens_from_messages(cls, messages: list, model_name: str) -> int: 22 | """Return the number of tokens in a list 
of messages.""" 23 | num_tokens = 0 24 | 25 | # every message follows <|start|>{role/name}\n{content}<|end|>\n 26 | tokens_per_message = 3 27 | tokens_per_name = 1 28 | for message in messages: 29 | num_tokens += tokens_per_message 30 | for k, v in message.items(): 31 | num_tokens += cls._count_tokens_from_string(v, model_name) 32 | if k == "name": # When role = function 33 | num_tokens += tokens_per_name 34 | 35 | num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> 36 | return num_tokens 37 | 38 | @staticmethod 39 | def _forget_messages(num_tokens: int, max_tokens: int) -> bool: 40 | if num_tokens > max_tokens: 41 | return True 42 | return False 43 | 44 | @classmethod 45 | def buffer( 46 | cls, 47 | messages: list, 48 | model_name: str = "gpt-3.5-turbo", 49 | max_tokens: int = 4096, 50 | keep_system_message: bool = True, 51 | ) -> List[dict]: 52 | """Return the number of tokens in a list of messages. 53 | 54 | Parameters 55 | ---------- 56 | model_name 57 | The name of the model to use for token counting. 58 | 59 | max_tokens 60 | The maximum number of tokens to keep in the history. 61 | 62 | keep_system_message 63 | Whether to keep the system message in the history. 64 | 65 | Returns 66 | ------- 67 | messages 68 | List of messages. 
class BaseMessageHistory(ABC):
    """Abstract interface for message-history backends.

    Concrete subclasses decide where messages live (e.g. in memory) and must
    implement all three operations below.
    """

    @abstractmethod
    def add_message(self, message: MessageTemplate) -> None:
        """Append a single message to the history."""

    @abstractmethod
    def get_messages(self) -> List[MessageTemplate]:
        """Return the messages currently stored in the history."""

    @abstractmethod
    def clear_messages(self) -> None:
        """Delete every message from the history."""
@prodigy.recipe(
    "best_answer",
    dataset=("The dataset to save to", "positional", None, str),
    file_path=("Path to the questions and answers file", "positional", None, str),
)
def best_answer(dataset: str, file_path: str) -> Dict:
    """
    Choose the best answer out of the given options.

    Arguments:
        dataset: The dataset to save to.
        file_path: Path to the questions and answers file.

    Returns:
        A dictionary containing the recipe configuration.

    """
    # Load the data up front so it can be shuffled in place
    examples = list(JSONL(file_path))

    def shuffled_choice_tasks(items: List) -> Generator:
        # Shuffle once so annotators see questions in a random order, then
        # emit one Prodigy "choice" task per question.
        random.shuffle(items)
        for item in items:
            yield {
                "html": item["question"],
                "options": [{"id": key, "html": value} for key, value in item["answers"].items()],
            }

    return {
        # Use the choice interface
        "view_id": "choice",
        # Name of the dataset
        "dataset": dataset,
        # The data stream
        "stream": shuffled_choice_tasks(examples),
        "config": {
            # Only allow one choice
            "choice_style": "single",
            "task_description": "Choose the best answer",
            "choice_auto_accept": False,
            # Define which buttons to show
            "buttons": ["accept", "ignore"],
            # Add custom css
            "global_css": GLOBAL_CSS,
            # If feed_overlap is True, the same example can be sent out to multiple users at the same time
            "feed_overlap": True,
            # Port to run the server on
            "port": 8080,
            # Important to set host to 0.0.0.0 when running on ec2
            "host": "0.0.0.0",
            # Setting instant_submit as True means that the user doesn't have to click the "save" button
            "instant_submit": True,
        },
    }
{"question": your-question, "answers": {"human": human-answer, "rag": rag-answer, "gpt4": gpt4-answer}} 9 | 10 | """ 11 | 12 | from itertools import combinations 13 | 14 | import pandas as pd 15 | 16 | 17 | # Path constants 18 | DATA_DIR = "src/genai/parenting_chatbot/prodigy_eval/data/" 19 | QUESTION_FILE = DATA_DIR + "questions.jsonl" 20 | ANSWER_FILE = DATA_DIR + "answers_{}.jsonl" 21 | OUTPUT_FILE = DATA_DIR + "answers.jsonl" 22 | # Define answer types, for loading corresponding answers 23 | ANSWER_TYPES = ["human", "rag", "gpt4"] 24 | # html formatting of prefix and suffix for questions 25 | QUESTION_PREFIX = "Which one is a better answer to this question:\n\n" 26 | QUESTION_SUFFIX = "" 27 | 28 | if __name__ == "__main__": 29 | # Load questions 30 | questions = pd.read_json(QUESTION_FILE, lines=True)["question"].to_list() 31 | # Load answers 32 | answers = [ 33 | pd.read_json(path_or_buf=ANSWER_FILE.format(answer_type), lines=True)[answer_type] 34 | for answer_type in ANSWER_TYPES 35 | ] 36 | 37 | answers_df = ( 38 | # Construct a dataframe with columns: question, human, rag, and gpt4 39 | pd.DataFrame({"question": questions, "human": answers[0], "rag": answers[1], "gpt4": answers[2]}) 40 | # Melt the dataframe for pairwise combinations and rename the resulting column 41 | .melt(id_vars=["question"], value_vars=ANSWER_TYPES).rename(columns={"value": "answer"}) 42 | # Add html formatting to the question 43 | .assign(question=lambda df: QUESTION_PREFIX + df["question"] + QUESTION_SUFFIX) 44 | # Format the answer as a dictionary {answer_type: answer} 45 | .assign(answer=lambda df: df.apply(lambda x: {x["variable"]: x["answer"]}, axis=1)) 46 | ) 47 | 48 | # Generate pairwise combinations of answer types 49 | answer_type_pairs = list(combinations(ANSWER_TYPES, 2)) 50 | 51 | # Aggregate answers based on the pairwise combinations of answer types 52 | dataframes = [] 53 | for answer_type_pair in answer_type_pairs: 54 | subset_df = 
answers_df[answers_df["variable"].isin(answer_type_pair)] 55 | aggregated_df = subset_df.groupby("question").agg(lambda x: x.tolist()).reset_index() 56 | dataframes.append(aggregated_df) 57 | 58 | # Combine the results, merge dictionaries and save the output 59 | ( 60 | pd.concat(dataframes, ignore_index=True) 61 | .assign(answers=lambda df: df.apply(lambda x: {k: v for d in x["answer"] for k, v in d.items()}, axis=1)) 62 | .drop(columns=["variable", "answer"]) 63 | .sort_values(by="question") 64 | .reset_index(drop=True) 65 | .to_json(path_or_buf=OUTPUT_FILE, orient="records", lines=True) 66 | ) 67 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/data/answers_gpt4.jsonl: -------------------------------------------------------------------------------- 1 | {"question":"How to change a nappy?","gpt4":"Changing a nappy can be a bit intimidating at first, but don't worry, you'll get the hang of it quickly. Here's a simple step-by-step guide:\n\n1. Gather your supplies: You'll need a clean nappy, baby wipes or a warm washcloth, and possibly nappy rash cream.\n\n2. Lay your baby down on a clean, safe surface: If you're using a changing table, be sure to use the safety straps.\n\n3. Remove the dirty nappy: Unfasten the tabs on the old nappy. Hold your baby's ankles and lift their bottom off the table, then pull the front of the nappy down. If there's a lot of mess, you can use the front of the nappy to wipe the bulk of it off.\n\n4. Clean your baby: Use baby wipes or a warm washcloth to clean your baby's bottom thoroughly. For a girl, wipe from front to back to avoid infection.\n\n5. Apply cream: If your baby has a rash, apply some nappy rash cream or ointment.\n\n6. Put on the clean nappy: Lift your baby's legs again and slide the clean nappy underneath. The back part with the tabs should go under your baby's bottom, and the front should come up between their legs.\n\n7. 
Fasten the nappy: Pull the front of the nappy up to your baby's belly, and open out the tabs. Make sure it's snug but not too tight - you should be able to fit two fingers between the nappy and your baby's belly.\n\n8. Dispose of the dirty nappy: Roll it up, secure it with the tabs, and put it in a nappy bag or a nappy bin.\n\nAlways remember to wash your hands thoroughly afterwards, and never leave your baby unattended on the changing table."} 2 | {"question":"When to stop breastfeeding?","gpt4":"The timing for stopping breastfeeding can vary greatly from mother to mother and baby to baby. The World Health Organization recommends exclusive breastfeeding up to 6 months of age, with continued breastfeeding along with appropriate complementary foods up to two years of age or beyond. However, the decision to stop breastfeeding is a personal one, and it's about what makes the most sense for you and your child. It's important to consider your baby's nutritional needs, your own health and well-being, and the emotional readiness of both you and your baby to wean. It's always a good idea to discuss this with your pediatrician or a lactation consultant."} 3 | {"question":"How to baby-proof?","gpt4":"Baby-proofing your home is an essential task to ensure your baby's safety as they start to explore. Here are some basic steps to follow:\n\n1. Start on the Floor: Since babies spend most of their time on the floor, start by removing any small objects that the baby could choke on. \n\n2. Secure Furniture: Secure any heavy furniture that could tip over, such as bookcases or TVs, to the wall. \n\n3. Install Gates: Install safety gates at the top and bottom of stairs, and in the doorways of rooms with hazards.\n\n4. Lock Cabinets: Install safety locks on cabinets, especially those containing cleaning supplies, medications, or other hazardous substances.\n\n5. Cover Outlets: Use safety plugs or outlet covers on all unused electrical outlets.\n\n6. 
Install Corner Guards: Use corner guards on furniture with sharp edges.\n\n7. Secure Windows: Install window guards and stoppers to prevent falls.\n\n8. Check Toys: Make sure all toys are safe for your baby's age group and that there are no small parts that could be a choking hazard.\n\n9. Adjust Water Heater: Set your water heater to 120 degrees Fahrenheit to prevent burns.\n\n10. Install Toilet Locks: To prevent drowning, install toilet locks to keep the lid down when not in use.\n\nRemember, baby-proofing should be an ongoing process. As your child grows and starts moving around more, you may need to make additional modifications. Always supervise your baby, even in a baby-proofed home."} 4 | {"question":"How do babies learn?","gpt4":"Babies learn primarily through their senses and by interacting with the people and environment around them. This process begins almost instantly after birth. For example, they learn to recognize their parents' faces and voices very quickly. They also begin to understand cause and effect through simple actions like shaking a rattle and hearing the sound it makes.\n\nAs they grow, their learning continues to be hands-on and experiential. They'll start exploring the world around them by touching, tasting, and playing with everything they can get their hands on. This is why providing a safe and stimulating environment is important.\n\nIn addition, babies learn a lot through imitation. They observe the people around them and try to replicate their actions and expressions. This is how they learn social cues, language, and various skills. It's fascinating to watch a baby's learning process unfold!"} 5 | {"question":"How to store breast milk?","gpt4":"Storing breast milk properly is important for maintaining its quality and safety for your baby. Here are some guidelines:\n\n1. Freshly Pumped\/Expressed Milk: At room temperature (up to 77\u00b0F or 25\u00b0C), it can be kept for 4 to 6 hours. 
In a cooler with ice packs, it can last up to one day. In the refrigerator (39\u00b0F or 4\u00b0C), it can be stored for up to 4 days. \n\n2. Frozen Milk: In a freezer compartment inside a refrigerator, it can last for 2 weeks. In a freezer with its own door (0\u00b0F or -18\u00b0C), it can be kept for 6 to 12 months. \n\n3. Thawed Milk: It can be kept in the refrigerator for 24 hours. Do not refreeze thawed milk.\n\nRemember to store the milk in clean containers, such as hard plastic or glass containers, or special plastic bags designed for milk storage. Don't forget to label each container with the date. Always use the oldest milk first."} 6 | {"question":"How to potty train?","gpt4":"Potty training is a big step for both parents and children. Here are some steps you can follow to make the process easier:\n\n1. **Choose the Right Time**: Begin potty training when your child shows readiness signs, such as staying dry for at least two hours, regular bowel movements, and showing interest in the potty.\n\n2. **Buy the Right Equipment**: Get a child-sized potty or a child's toilet seat and step for the regular toilet. This will make the child more comfortable and less scared.\n\n3. **Explain the Process**: Talk to your child about the toilet process. You can use a children's book about potty training to make it more understandable.\n\n4. **Set a Routine**: Set regular times to sit your child on the potty, such as after meals or before bed. This will help them understand when they need to go.\n\n5. **Praise Success**: Whenever your child uses the potty correctly, praise them and show positive reinforcement. This can be verbal praise, a hug, or even a small reward.\n\n6. **Be Patient**: Accidents will happen. It's important to stay calm and patient, and reassure your child that it's okay. \n\nRemember, every child is unique and may take different lengths of time to get used to potty training. 
It's important to be patient and supportive throughout the process."} 7 | {"question":"How to breastfeed?","gpt4":"Breastfeeding can be a very intimate and rewarding experience for both you and your baby. Here are some steps to help you get started:\n\n1. Find a comfortable position: This can be sitting or lying down. Use pillows to support your arms and back if necessary. You'll want to be comfortable since feedings can take some time.\n\n2. Hold your baby correctly: Your baby should be facing you, with their body in a straight line. You can hold your baby in a cradle hold (across your lap), football hold (at your side with their legs towards your back), or lying down.\n\n3. Latch: Touch your baby's lips with your nipple until they open their mouth wide. Then, quickly bring them onto your breast, not your breast to them. Their mouth should cover not just the nipple but as much of the areola (the darker circle around the nipple) as possible.\n\n4. Check for a good latch: If your baby is latched on properly, their chin and nose will be touching your breast, their lips will be flared out (like a fish), and you'll hear them swallowing. It shouldn't be painful for you. If it is, gently insert your finger into the corner of your baby's mouth to break the suction and try again.\n\n5. Let your baby finish the first breast: This ensures they get the \"hindmilk\", which is richer in fats and calories. Once they slow down or release the nipple, you can offer the other breast.\n\n6. Burp your baby: After feeding, it's important to burp your baby to release any air they swallowed. Hold them upright against your chest or shoulder and gently pat or rub their back until you hear a burp.\n\nRemember, every mother and baby are different, so what works for one may not work for another. It may take some time and practice to get the hang of it. 
Don't hesitate to ask for help from a lactation consultant or your healthcare provider if you're having trouble."} 8 | {"question":"How much to feed baby?","gpt4":"The amount to feed your baby can vary based on their age, weight, and individual needs. However, here's a general guideline:\n\n- Newborns usually take about 1.5 to 3 ounces of breast milk or formula at each feeding for the first few weeks, then it increases to 4 ounces by six weeks. \n- From 1 to 6 months, your baby will likely consume an average of 2.5 to 4 ounces every three to four hours.\n- By 6 months, your baby may be taking 6 to 8 ounces every four to five hours.\n\nRemember, these are just averages. It's important to follow your baby's hunger cues and consult with your pediatrician if you have concerns about your baby's food intake."} 9 | {"question":"How to get baby to sleep?","gpt4":"Establishing a consistent bedtime routine can help your baby understand that it's time to sleep. This can include activities like a warm bath, a bedtime story, or a lullaby. Try to make the environment conducive to sleep by keeping the room dark and at a comfortable temperature. Some babies might also find white noise or a pacifier soothing. Remember, every baby is different, so it might take a bit of trial and error to find out what works best for your little one. It's also important to remember that newborns typically have different sleep schedules than older babies or adults, and it's normal for them to wake frequently during the night."} 10 | {"question":"What is colic?","gpt4":"Colic is a term used to describe severe, often fluctuating pain in the abdomen that is caused by the spasmodic contraction of the gut. When it comes to babies, colic is commonly used to refer to excessive, frequent crying in a baby who appears to be otherwise healthy. This can be extremely distressing for parents. 
The exact cause of colic is unknown but it's thought to be related to indigestion, trapped wind or a temporary gut sensitivity in young babies. The condition usually appears within the first few weeks of life and tends to resolve by the time the baby is four months old, but can last up to 12 months. It's a common condition and is thought to affect up to one in five babies."} 11 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/data/answers_human.jsonl: -------------------------------------------------------------------------------- 1 | {"question": "How to change a nappy", "human": "When arriving home with your newborn, you’re thrown into motherhood, and there are many things to consider. With many conversations around the best nappies to use and how to use them correctly, how, and how often, do you change a nappy? Young babies may need changing as many as ten or twelve times a day, while older babies must be changed around six to eight times. If your baby's nappy is dirty, it’s best to use it to clean off most of the excrement from their bottom. Once this is done, use either the cotton wool and warm water or baby wipes to remove the rest and ensure your baby is clean. A top tip is to clean inside the folds of the skin. If you're using disposable nappies, try not to get any liquid on the sticky tabs, as they won't stick if you do. If you're using cloth nappies, carefully place them in a nappy liner and then fasten the nappy. The nappy then needs to be adjusted to fit around the waist and legs comfortably. We recommend using reusable nappies from birth if you’d like to reduce the impact you’re having on the environment and the number of times you change your baby."} 2 | {"question": "When to stop breastfeeding", "human": "Breastfeeding is a very personal experience for mums, and if this is the chosen route for feeding your baby, it can be difficult to decide when to stop. 
Breastfeeding is recommended for the first six months of your baby's life. While many parents decide to stop breastfeeding before their child turns two, there's no reason why you should not continue breastfeeding your child into their second year. Breastfeeding can provide comfort for your baby when they’re unwell or upset, and it’s common for both the baby and mum to enjoy the benefits of breastfeeding. However, with that being said, mums shouldn’t feel pressured to breastfeed, and you should always do what you’re comfortable with."} 3 | {"question": "How to baby-proof", "human": "Baby-proofing your house is recommended for when your baby begins to explore, often when they’re beginning to crawl. Some baby-proofing precautions you can take include: Fitting safety devices to doors, windows or drawers; Covering sharp edges and corners around your home; Locking any dangerous items away, such as medication; Adding barriers, such as baby gates, to keep your child away from hazards such as stairs. We recommend looking at things from your baby’s perspective when baby-proofing, crawling around your home and seeing things from a lower perspective can help you spot any hazards you’d miss from adult height."} 4 | {"question": "How do babies learn", "human": "Babies constantly learn every day, discovering new abilities and expanding their senses. Play is the main way infants learn to understand their surroundings and communicate with others. During the first month, your baby will learn by interacting with you. The first thing a baby learns is how to use their body language to ensure their needs are met. Sensory details such as your touch, the sound of your voice and the sight of you will allow comfort through all learning experiences."} 5 | {"question": "How to store breast milk", "human": "Mums often choose to express their breast milk and store it for future use. 
This can be useful if you’re going to be away from your baby, if your breasts feel uncomfortably full, your partner is helping to feed the baby or if you simply want to boost your milk supply. You can express milk by hand or with a pump. The best way to store breast milk is in a sterilised container or in special breast milk storage bags. Once contained, you can store the milk in the fridge for up to eight days at 4C or lower. You can store it for two weeks in the ice compartment of a fridge and up to six months in a freezer if it's -18C or lower. Breast milk that's been cooled in the fridge can be carried in a cool bag with ice packs for up to 24 hours. We recommend labelling all breast milk when storing with dates of when it was expressed."} 6 | {"question": "How to potty train", "human": "Potty training your child takes patience, and you must go at your own pace with your little one. To potty train, the main key is communication, introduce your child to the potty and talk to them about nappies. Children learn by watching and copying what they see, so you can use toys to demonstrate how to use the potty. Keeping the potty in plain sight can be helpful, allowing them to sit on it and get used to the environment."} 7 | {"question": "How to breastfeed", "human": "Breastfeeding is an individual experience for every mum, it can take time to get used to. There are different positions you can breastfeed in, so you and your baby should use trial and error and see which works best for you. In any position, you should make sure you’re supporting your baby’s head and neck, allowing them to tilt their head and swallow. Bring your baby close to your breast and allow them to latch themselves, it can help to lean forward. 
Breastfeeding is also optional, and if you choose not to, the method of feeding your baby on formula is very similar."} 8 | {"question": "How much to feed baby", "human": "There is no correct amount to feed your baby, each little one will differ with how regularly they need feeding. Usually, most newborns eat every two to three hours or eight to twelve times every 24 hours. Babies usually drink one to two ounces at each feeding after the first few days of life. Your baby will have hunger cues and will let you know when they’d like feeding; this can be in the form of crying, sticking their tongue out, putting their hand to their mouth regularly, opening their mouth and sucking on other items."} 9 | {"question": "How to get baby to sleep", "human": "Babies can vary in their sleep patterns, especially in the first few months of life. Tips to get your baby to sleep involve encouraging a routine or sleep habits, including a bath, cuddling, singing and quiet music in a dimly lit room. Give your baby time to settle down after comforting, we also recommend putting them down drowsy but still awake, with a comforter such as a teddy bear or a pacifier, allowing them to be soothed to sleep."} 10 | {"question": "What is colic", "human": "Colic is when a baby cries continuously, but there is no obvious cause, it often gets better after three or four months of life. Although all babies cry, babies with colic may cry for three hours at a time but are otherwise healthy. Other signs of colic can include babies clenching their fists, getting red in the face and bringing their knees up to their stomachs. 
Steps to soothe colic include cuddling, sitting your baby upright during feeds, gently rocking your baby or giving them a warm bath."} 11 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/data/questions.jsonl: -------------------------------------------------------------------------------- 1 | {"question": "How to change a nappy?"} 2 | {"question": "When to stop breastfeeding?"} 3 | {"question": "How to baby-proof?"} 4 | {"question": "How do babies learn?"} 5 | {"question": "How to store breast milk?"} 6 | {"question": "How to potty train?"} 7 | {"question": "How to breastfeed?"} 8 | {"question": "How much to feed baby?"} 9 | {"question": "How to get baby to sleep?"} 10 | {"question": "What is colic?"} 11 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/data/system.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "system", 3 | "content": "You are an AI assistant for answering questions about parenting, babies and toddlers.\nProvide a conversational answer.\nIf the question is not about parenting, babies and toddlers, politely inform them that you are tuned to only answer questions about that." 
4 | } 5 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/figures/eval_parenting_chatbot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/parenting_chatbot/prodigy_eval/figures/eval_parenting_chatbot.png -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/figures/prodigy_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/parenting_chatbot/prodigy_eval/figures/prodigy_screenshot.png -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prodigy_eval/generate_gpt4_answers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate answers to questions about raising babies using GPT-4 3 | 4 | Usage: Run the script from the repo root directory 5 | $ poetry run python src/genai/parenting_chatbot/prodigy_eval/data/generate_gpt4_answers.py 6 | 7 | The final output file is stored in data/answers_gpt4.jsonl file 8 | following the format: {"question": your-question, "gpt4": gpt4-answer} 9 | 10 | """ 11 | import os 12 | 13 | import dotenv 14 | import openai 15 | import pandas as pd 16 | 17 | from genai import MessageTemplate 18 | from genai.eyfs import TextGenerator 19 | 20 | 21 | DIR = "src/genai/parenting_chatbot/prodigy_eval/data/" 22 | SYSTEM_PROMPT = DIR + "system.json" 23 | QUESTIONS = DIR + "questions.jsonl" 24 | OUTPUT_FILE = DIR + "answers_gpt4.jsonl" 25 | 26 | dotenv.load_dotenv() 27 | openai.api_key = os.environ["OPENAI_API_KEY"] 28 | 29 | if __name__ == "__main__": 30 | # Load the system prompt 31 | 
system_prompt = MessageTemplate.load(SYSTEM_PROMPT) 32 | # Load the questions 33 | questions = pd.read_json(path_or_buf=QUESTIONS, lines=True).question.to_list() 34 | # Generate answers 35 | responses = [] 36 | for question in questions: 37 | prompt = MessageTemplate(role="user", content=question) 38 | response = TextGenerator.generate( 39 | model="gpt-4", 40 | temperature=0.6, 41 | messages=[system_prompt.to_prompt(), prompt.to_prompt()], 42 | message_kwargs=None, 43 | stream=False, 44 | ) 45 | responses.append(response) 46 | # Extract only texts from each response 47 | answer_text = [response["choices"][0]["message"]["content"] for response in responses] 48 | # Write the answers to jsonl file 49 | pd.DataFrame({"question": questions, "gpt4": answer_text}).to_json( 50 | path_or_buf=OUTPUT_FILE, orient="records", lines=True 51 | ) 52 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prompts/filter_refs_function.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "is_this_text_relevant_to_the_question", 3 | "description": "Determine whether the provided text contains information that is DIRECTLY related to the question", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "boolean", 9 | "description": "Determine whether the provided text contains information relevant to the question" 10 | } 11 | }, 12 | "required": [ 13 | "prediction" 14 | ] 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prompts/filter_refs_system.json: -------------------------------------------------------------------------------- 1 | {"role": "system", "content": "You are a smart assistant with proficiency in reading. 
You read the provided text and a question and you determine whether the text is relevant to the question.", "name": null} 2 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prompts/filter_refs_user.json: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions###\nPlease read the following text and determine whether it is relevant to the question.\n###Text###\n{text}\n###Question###\n{question}", "name": null} 2 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/prompts/system.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "system", 3 | "content": "You are an AI assistant for answering questions about parenting, babies and toddlers.\nYou are given the following extracted documents from NHS Start For Life website and a question. Provide a conversational answer.\nIf you are not provided with any documents, say \"I did not get any relevant context for this but I will reply to the best of my knowledge\" and then write your answer\nIf you don't know the answer, just say \"Hmm, I'm not sure. It's better if you search online for this.\" Don't try to make up an answer.\nIf the question is not about parenting, babies and toddlers, politely inform them that you are tuned to only answer questions about that.\n\n", 4 | "name": null 5 | } 6 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/run_nhs_index.py: -------------------------------------------------------------------------------- 1 | """Build a pinecone index with the NHS Start for Life data. 2 | 3 | Note: This script indexes individual paragraphs; see run_nhs_index_full_page.py for the full-page variant.
4 | """ 5 | 6 | import os 7 | import uuid 8 | 9 | import openai 10 | import pandas as pd 11 | 12 | from dotenv import load_dotenv 13 | 14 | from genai.eyfs import get_embedding 15 | from genai.vector_index import PineconeIndex 16 | 17 | 18 | load_dotenv() 19 | 20 | 21 | openai.api_key = os.environ["OPENAI_API_KEY"] 22 | INDEX_NAME = "eyfs-index" 23 | ENCODER_NAME = "text-embedding-ada-002" 24 | 25 | if "__main__" == __name__: 26 | df = pd.read_csv(os.environ["PATH_TO_NHS_DATA"]) 27 | df = df.drop_duplicates(subset=["header", "content", "content_no"], keep="last") 28 | 29 | # Format the data to what pinecone needs and generate a temp uuid 30 | docs = [] 31 | for tup in df.itertuples(): 32 | doc = tuple( 33 | ( 34 | str(uuid.uuid4()), 35 | get_embedding(tup.content), 36 | { 37 | "source": "nhs", 38 | "text": tup.content, 39 | "header": tup.header, 40 | "content_no": tup.content_no, 41 | "url": "".join(["https://www.nhs.uk", tup.URL]), 42 | }, 43 | ) 44 | ) 45 | docs.append(doc) 46 | 47 | # Build the index 48 | conn = PineconeIndex(api_key=os.environ["PINECONE_API_KEY"], environment="us-west1-gcp") 49 | 50 | conn.build_and_upsert( 51 | index_name=INDEX_NAME, 52 | dimension=1536, 53 | metric="euclidean", 54 | docs=docs, 55 | metadata_config={"indexed": ["areas_of_learning", "source", "type_", "age_group"]}, 56 | batch_size=80, 57 | ) 58 | -------------------------------------------------------------------------------- /src/genai/parenting_chatbot/run_nhs_index_full_page.py: -------------------------------------------------------------------------------- 1 | """Build a pinecone index with the NHS Start for Life data.""" 2 | 3 | import os 4 | import uuid 5 | 6 | import openai 7 | import pandas as pd 8 | 9 | from dotenv import load_dotenv 10 | 11 | from genai.eyfs import get_embedding 12 | from genai.vector_index import PineconeIndex 13 | 14 | 15 | load_dotenv() 16 | 17 | 18 | openai.api_key = os.environ["OPENAI_API_KEY"] 19 | INDEX_NAME = "eyfs-index" 20 | ENCODER_NAME = 
"text-embedding-ada-002" 21 | 22 | if "__main__" == __name__: 23 | df = pd.read_csv(os.environ["PATH_TO_NHS_DATA"]) 24 | df = df.groupby("URL").apply(lambda group: "\n\n".join(group["header"] + "\n" + group["content"])) 25 | df = df.reset_index() 26 | df.columns = ["URL", "content"] 27 | 28 | # Format the data to what pinecone needs and generate a temp uuid 29 | docs = [] 30 | for tup in df.itertuples(): 31 | doc = tuple( 32 | ( 33 | str(uuid.uuid4()), 34 | get_embedding(tup.content), 35 | { 36 | "source": "nhs_full_page", 37 | "text": tup.content, 38 | "url": "".join(["https://www.nhs.uk", tup.URL]), 39 | }, 40 | ) 41 | ) 42 | docs.append(doc) 43 | 44 | # Build the index 45 | conn = PineconeIndex(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_REGION"]) 46 | 47 | conn.build_and_upsert( 48 | index_name=INDEX_NAME, 49 | dimension=1536, 50 | metric="euclidean", 51 | docs=docs, 52 | metadata_config={"indexed": ["areas_of_learning", "source", "type_", "age_group"]}, 53 | batch_size=40, 54 | ) 55 | -------------------------------------------------------------------------------- /src/genai/prompt_template.py: -------------------------------------------------------------------------------- 1 | import json 2 | import string 3 | 4 | from abc import ABC 5 | from abc import abstractmethod 6 | from dataclasses import asdict 7 | from dataclasses import dataclass 8 | from dataclasses import field 9 | from typing import Dict 10 | from typing import List 11 | from typing import Optional 12 | from typing import Union 13 | 14 | 15 | @dataclass 16 | class BasePromptTemplate(ABC): 17 | """Base template prompts flexibly.""" 18 | 19 | initial_template: Dict[str, str] = field(default_factory=dict, init=False) 20 | 21 | def __post_init__(self) -> None: 22 | """Keep the initial template.""" 23 | self.initial_template = self._initialize_template() 24 | 25 | @abstractmethod 26 | def _initialize_template(self) -> None: 27 | """To be implemented by child classes""" 28 
| pass 29 | 30 | @staticmethod 31 | @abstractmethod 32 | def _from_dict(data: Dict) -> None: 33 | """Create a Template instance from a dictionary.""" 34 | pass 35 | 36 | def format_message(self, **kwargs) -> None: 37 | """Process a message and fill in any placeholders.""" 38 | 39 | def recursive_format(value: Union[str, dict]) -> Union[str, dict]: 40 | if isinstance(value, str): 41 | placeholders = self._extract_placeholders(value) 42 | if placeholders: 43 | return value.format(**kwargs) 44 | return value 45 | elif isinstance(value, dict): 46 | return {k: recursive_format(v) for k, v in value.items()} 47 | else: 48 | return value 49 | 50 | for k in self.__dict__.keys(): 51 | if k != "initial_template": 52 | self.__dict__[k] = recursive_format(self.initial_template[k]) 53 | 54 | @classmethod 55 | def load(cls, obj: Union[Dict, str]) -> "BasePromptTemplate": 56 | """Load a Template instance from a JSON file or a dictionary.""" 57 | if isinstance(obj, str): 58 | return cls._from_json(obj) 59 | elif isinstance(obj, Dict): 60 | return cls._from_dict(obj) 61 | else: 62 | raise TypeError(f"Expected a JSON file path or a dictionary, got {type(obj)}.") 63 | 64 | @staticmethod 65 | def _exclude_keys( 66 | d: dict, 67 | exclude: Optional[List[str]] = None, # noqa: B006 68 | ) -> dict: 69 | """Exclude keys from a dictionary.""" 70 | if not d["name"]: 71 | d.pop("name", None) 72 | 73 | if exclude: 74 | for item in exclude: 75 | d.pop(item, None) 76 | return d 77 | return d 78 | 79 | def to_prompt( 80 | self, 81 | exclude: Optional[List[str]] = ["initial_template"], # noqa: B006 82 | ) -> Dict: 83 | """Convert a Template instance to a JSON string.""" 84 | d = asdict(self) 85 | return self._exclude_keys(d, exclude=exclude) 86 | 87 | @staticmethod 88 | def _extract_placeholders(s: str) -> List[str]: 89 | """Extract placeholder variables that can be filled in an f-string.""" 90 | formatter = string.Formatter() 91 | return [field_name for _, field_name, _, _ in formatter.parse(s) if 
field_name is not None] 92 | 93 | @classmethod 94 | def _from_json(cls, json_path: str) -> "BasePromptTemplate": 95 | """Create a Template instance by providing a JSON path.""" 96 | return cls._from_dict(cls._read_json(json_path)) 97 | 98 | @staticmethod 99 | def _read_json(json_path: str) -> Dict: 100 | """Read a JSON file.""" 101 | with open(json_path, "r") as f: 102 | return json.load(f) 103 | 104 | def to_json(self, path: str) -> None: 105 | """Convert a Template instance to a JSON string.""" 106 | self._write_json(self.initial_template, path) 107 | 108 | def _write_json(self, data: Dict, path: str) -> None: 109 | """Write a JSON file.""" 110 | with open(path, "w") as f: 111 | json.dump(data, f) 112 | 113 | 114 | @dataclass 115 | class MessageTemplate(BasePromptTemplate): 116 | """Create a template for a message prompt.""" 117 | 118 | role: str 119 | content: str 120 | name: Optional[str] = None 121 | 122 | def __post_init__(self) -> None: 123 | """Keep the initial template and error when the role is function but not name was given.""" 124 | super().__post_init__() 125 | if self.role == "function" and not self.name: 126 | raise ValueError("The 'name' attribute is required when 'role' is 'function'.") 127 | 128 | def _initialize_template(self) -> Dict[str, str]: 129 | return {"role": self.role, "content": self.content, "name": self.name} 130 | 131 | @staticmethod 132 | def _from_dict(data: Dict) -> "MessageTemplate": 133 | instance = MessageTemplate(**data) 134 | # Validate after initialisation 135 | if instance.role == "function" and not instance.name: 136 | raise ValueError("The 'name' attribute is required when 'role' is 'function'.") 137 | return instance 138 | 139 | 140 | @dataclass 141 | class FunctionTemplate(BasePromptTemplate): 142 | """Create a template for an OpenAI function.""" 143 | 144 | name: str 145 | description: str 146 | parameters: Dict[str, Union[str, Dict[str, Dict[str, Union[str, List[str]]]], List[str]]] 147 | 148 | def 
_initialize_template(self) -> Dict[str, Union[str, Dict[str, Dict[str, Union[str, List[str]]]], List[str]]]: 149 | return { 150 | "name": self.name, 151 | "description": self.description, 152 | "parameters": self.parameters, 153 | } 154 | 155 | @staticmethod 156 | def _from_dict(data: Dict) -> "FunctionTemplate": 157 | """Create a Template instance from a dictionary.""" 158 | return FunctionTemplate(**data) 159 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/00_system.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "system", "content": "###Instructions###\nYou are a helpful, kind, intelligent and polite futurist. You work for the United Kingdom's innovation agency Nesta, and your task is to engage the user about the future signals and trends that Nesta has researched, by helping the user imagine and appreciate how the signals will impact their life. You will personalise the user experience by taking the information provided by the user and tailoring your explanation to the user background. You are also expert in equity, diversity and inclusion and your answers will be inclusive, the answers will never be based on negative stereotypes, you will never offend and you will exercise sensitivity about topics such as ethnicity and gender. Here are the future signals that you can talk about: {signals}. Do not discuss other future signals as this is not part of this year's Nesta's Signals edition."} 2 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/01_intro.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "assistant", "content": "Hi, I’m Scout, [Discovery Hub’s](https://www.nesta.org.uk/discovery/) experimental AI assistant which helps people explore and interpret signals about the future. 
✨"} 2 | {"role": "assistant", "content": "This year we have collected signals about a variety of topics, from 🏥 virtual hospital wards and 🧠 neuroprivacy to ⚡ space solar power and 🧪 data poisoning."} 3 | {"role": "assistant", "content": "Tell me one or two things about you and your interests, so that I can suggest which future signals might be the most relevant to you! You can always refresh your browser to restart the conversation."} 4 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/02_signal_impact.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "Start your answer by summarising the signal in one clear, concise sentence and then follow by explaining in another clear sentence how the selected future signal might be relevant to the user, given the user information and conversation history. Then describe three ways how the selected future signal might impact them. Keep these descriptions short, two-three sentences at most. Finish your answer by encouraging the user to ask questions about this signal (note that you will try your best to answer them) or suggest to ask about the other future signals. Remember that you must be patient and never offend or be aggressive. \n\n###Future signal###{signal}\n\n###User information### Here is what the user told you about themselves: {user_input}.\n\n###Answer###" 2 | } 3 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/03_signal_choice.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "Start your answer by explaining each of the signals in one clear sentence (use similar language to the signals descriptions). If possible, indicate how a signal might be relevant to the user, given the user information and conversation history. 
Finish your answer by asking the user to choose one of the signals to hear more about it. Remember that you must be patient and never offend or be aggressive. \n\n###Future signals###{signals}\n\n###User information### Here is what the user told you about themselves: {user_input}.\n\n###Answer###" 2 | } 3 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/04_follow_up.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "Answer to the user's most recent message. Be as concise or detailed as necessary. Use the information from the future signal description when relevant. Keep your answers conversational and three to four sentences long at most. \n\n###Future signal###{signal}\n\n###User information### Here is what the user told you: {user_input}.\n\n###Answer###" 2 | } 3 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/func_intent.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "predict_intent", 3 | "description": "Predict what is the user's intent", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "string", 9 | "enum": ["new_signal", "more_signals", "following_up"], 10 | "description": "The predicted intent" 11 | } 12 | }, 13 | "required": ["prediction"] 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/func_top_signal.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "predict_top_signal", 3 | "description": "Predict which one of the signal is the most relevant to user input", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "string", 9 | "enum": [], 10 | "description": "The predicted most 
relevant signal" 11 | } 12 | }, 13 | "required": ["prediction"] 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/func_top_three_signals.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "predict_top_signals", 3 | "description": "Predict which three signals are the most relevant to user input", 4 | "parameters": { 5 | "type": "object", 6 | "properties": { 7 | "prediction": { 8 | "type": "array", 9 | "items": { 10 | "type": "string", 11 | "enum": [] 12 | }, 13 | "description": "The predicted most relevant signals" 14 | } 15 | }, 16 | "required": ["prediction"] 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/intent_actions.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "new_signal", 4 | "description": "User wishes to change the topic and talk about an different future signal. Alternatively, the user has been just presented with a set of future signal options by the assistant, and the user has now chosen which signal to talk about more." 5 | }, 6 | { 7 | "name": "more_signals", 8 | "description": "User has asked to hear more about other future signals" 9 | }, 10 | { 11 | "name": "following_up", 12 | "description": "User is following up with another question about the signal that's being discussed just now." 13 | } 14 | ] 15 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/prompt_intent.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "You are a helpful chatbot talking with the user about the articles of future signals that we wrote. 
###Instructions### Predict the intended action of the user, what the user wishes you to carry out based on the conversation history. Pay attention to the most recent messages\n\n###Possible intents###\n{intents}\n\n"} 2 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/prompt_top_signal.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions### Predict which of the following future signals is the most relevant to user input. You have to choose one of these signals. \n\n###Future signal summaries###\n{signals}\n\n###User input:\n{user_input}"} 2 | -------------------------------------------------------------------------------- /src/genai/sandbox/signals/data/prompt_top_three_signals.jsonl: -------------------------------------------------------------------------------- 1 | {"role": "user", "content": "###Instructions### Predict which three of the following future signals are the most relevant to user input. You have to choose three of these signals. 
\n\n###Future signal summaries###\n{signals}\n\n###User input:\n{user_input}"} 2 | -------------------------------------------------------------------------------- /src/genai/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | 5 | setup( 6 | name="genai", 7 | version="0.1", 8 | packages=find_packages(), 9 | ) 10 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/__init__.py: -------------------------------------------------------------------------------- 1 | from .dm_page import eyfs_dm_kb 2 | from .eli3_page import eli3 3 | from .eyfs_compare_page import eyfs_compare 4 | from .eyfs_kb_bbc_page import eyfs_kb_bbc 5 | from .eyfs_page import early_year_activity_plan 6 | from .parenting_page import parenting_chatbot 7 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/dm_page.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import streamlit as st 4 | 5 | from genai import MessageTemplate 6 | from genai.eyfs import TextGenerator 7 | from genai.eyfs import get_embedding 8 | from genai.streamlit_pages.utils import get_index 9 | from genai.streamlit_pages.utils import query_pinecone 10 | from genai.streamlit_pages.utils import reset_state 11 | from genai.streamlit_pages.utils import sample_docs 12 | from genai.utils import read_json 13 | 14 | 15 | def eyfs_dm_kb(index_name: str = "eyfs-index-1718673592-index", sidebar: bool = True) -> None: 16 | """Run the Development Matters app.""" 17 | 18 | # Define your custom CSS 19 | custom_css = """ 20 | 31 | """ 32 | 33 | # Apply the custom CSS 34 | st.markdown(custom_css, unsafe_allow_html=True) 35 | 36 | st.title("Generate activities anchored to the Development Matters guidance") 37 | areas_of_learning_desc = 
read_json("src/genai/eyfs/areas_of_learning.json") 38 | aol = list(areas_of_learning_desc.keys()) 39 | index = get_index(index_name=index_name) 40 | 41 | if "examples" not in st.session_state: 42 | st.session_state["examples"] = "" 43 | 44 | if "learning_goals" not in st.session_state: 45 | st.session_state["learning_goals"] = "" 46 | 47 | if "full_response" not in st.session_state: 48 | st.session_state["full_response"] = None 49 | 50 | message = MessageTemplate.load("src/genai/dm/prompts/dm_prompt_2.json") 51 | 52 | if sidebar: 53 | with st.sidebar: 54 | selected_model, temperature, n_examples = _sidebar() 55 | else: 56 | selected_model = "gpt-4" 57 | temperature = 0.6 58 | n_examples = 5 59 | 60 | choice = st.radio( 61 | label="**Select a learning goal**", 62 | options=["Pick a predefined learning goal", "Describe a learning goal"], 63 | on_change=reset_state, 64 | ) 65 | 66 | age_groups = st.selectbox( 67 | label="**Age group (in years)**", 68 | options=["0-3", "3-4", "4-5"], 69 | index=2, 70 | on_change=reset_state, 71 | ) 72 | 73 | if choice == "Pick a predefined learning goal": 74 | # Select the areas of learning 75 | areas_of_learning = st.selectbox( 76 | label="**Areas of learning**", 77 | options=aol, 78 | index=1, 79 | on_change=reset_state, 80 | ) 81 | 82 | if areas_of_learning and age_groups: 83 | predefined_learning_goals = get_data( 84 | path="src/genai/dm/dm.json", 85 | type_="learning_goals", 86 | areas_of_learning=[areas_of_learning], 87 | age_groups=[age_groups], 88 | ) 89 | # Hack till we bring in the rest of the AOL 90 | try: 91 | learning_goals = st.multiselect( 92 | label="**Predefined Learning Goals**", 93 | options=predefined_learning_goals, 94 | default=predefined_learning_goals[0], 95 | on_change=reset_state, 96 | ) 97 | except IndexError: 98 | st.write( 99 | """**ERROR**: You can only pick one of the following Areas of Learning for now: 100 | - `Communication and Language` 101 | - `Mathematics` 102 | - `Personal, Social and Emotional 
Development` """ 103 | ) 104 | 105 | if st.button("**Search for activity examples**"): 106 | results = [] 107 | for learning_goal in learning_goals: 108 | search_results = query_pinecone( 109 | index=index, 110 | encoded_query=get_embedding(learning_goal), 111 | filters={ 112 | "areas_of_learning": {"$in": [areas_of_learning]}, 113 | "source": {"$eq": "dm"}, 114 | "age_group": {"$in": [age_groups]}, 115 | "type_": {"$eq": "examples"}, 116 | }, 117 | top_n=n_examples, 118 | ) 119 | results.extend(search_results) 120 | 121 | results = list(set([result["metadata"]["text"] for result in results])) 122 | idx = sample_docs(num_docs=len(results), n=n_examples) 123 | results = [results[i] for i in idx] 124 | st.session_state["examples"] = "\n\n".join(results) 125 | 126 | elif choice == "Describe a learning goal": 127 | if age_groups: 128 | text_input = st.text_input(label="**Describe a learning goal**") 129 | if st.button("Search for learning goals"): 130 | results = query_pinecone( 131 | index=index, 132 | encoded_query=get_embedding(text_input), 133 | filters={ 134 | "source": {"$eq": "dm"}, 135 | "age_group": {"$in": [age_groups]}, 136 | "type_": {"$eq": "learning_goals"}, 137 | }, 138 | top_n=n_examples, 139 | ) 140 | 141 | results = list(set([result["metadata"]["text"] for result in results])) 142 | idx = sample_docs(num_docs=len(results), n=n_examples) 143 | results = [results[i] for i in idx] 144 | st.session_state["learning_goals"] = "\n\n".join(results) 145 | 146 | if st.session_state["learning_goals"]: 147 | st.write("## Development Matters guidance: Learning Goals") 148 | with st.expander("**Learning Goals**", expanded=True): 149 | for result in st.session_state["learning_goals"].split("\n\n"): 150 | st.write(f"- {result}\n") 151 | 152 | results = [] 153 | for learning_goal in st.session_state["learning_goals"].split("\n\n"): 154 | search_results = query_pinecone( 155 | index=index, 156 | encoded_query=get_embedding(learning_goal), 157 | filters={ 158 | 
"source": {"$eq": "dm"}, 159 | "age_group": {"$in": [age_groups]}, 160 | "type_": {"$eq": "examples"}, 161 | }, 162 | top_n=n_examples, 163 | ) 164 | results.extend(search_results) 165 | areas_of_learning = [result["metadata"]["areas_of_learning"] for result in results] 166 | results = list(set([result["metadata"]["text"] for result in results])) 167 | idx = sample_docs(num_docs=len(results), n=n_examples) 168 | results = [results[i] for i in idx] 169 | st.session_state["examples"] = "\n\n".join(results) 170 | 171 | if st.session_state["examples"]: 172 | st.write("## Development Matters guidance: Examples") 173 | with st.expander("**Examples**", expanded=True): 174 | for result in st.session_state["examples"].split("\n\n"): 175 | st.write(f"- {result}\n") 176 | 177 | text_input = st.text_input(label="**Describe a theme for the activity**") 178 | if st.button("Generate activities"): 179 | st.session_state["full_response"] = llm_call( 180 | selected_model=selected_model, 181 | temperature=temperature, 182 | message=message, 183 | messages_placeholders={ 184 | "description": text_input, 185 | "areas_of_learning": areas_of_learning, 186 | "examples": st.session_state["examples"], 187 | "age_groups": age_groups, 188 | }, 189 | ) 190 | 191 | if st.session_state["examples"] and st.session_state["full_response"]: 192 | with st.expander("**Suggested activities**"): 193 | st.write(st.session_state["full_response"]) 194 | text_input = st.text_input( 195 | "**Ask a follow-up question**", 196 | help="You can ask for details on how to play one of the games!", 197 | ) 198 | if text_input: 199 | _ = llm_call( 200 | selected_model=selected_model, 201 | temperature=temperature, 202 | message=MessageTemplate( 203 | role="user", content="###Activities###\n{activities} ###Task###\n{text_input}" 204 | ), 205 | messages_placeholders={ 206 | "text_input": text_input, 207 | "activities": st.session_state["full_response"], 208 | }, 209 | ) 210 | 211 | 212 | def get_data(path: str, type_: 
str, areas_of_learning: List[str], age_groups: List[str]) -> List[str]: 213 | """Get Learning Goals or Examples based on the selected areas of learning and age groups.""" 214 | data = read_json(path) 215 | # Temp hack to exclude the template 216 | data = [d for d in data if d["area_of_learning"] != ""] 217 | 218 | predefined_learning_goals = [] 219 | for elem in data: 220 | aol = elem["area_of_learning"] 221 | d = elem["age_group"] 222 | for age, age_dict in d.items(): 223 | for k, items in age_dict.items(): 224 | if k == type_ and aol in areas_of_learning and age in age_groups: 225 | for item in items: 226 | predefined_learning_goals.append(item) 227 | 228 | return predefined_learning_goals 229 | 230 | 231 | def _sidebar() -> tuple: 232 | """Select a model, temperature and number of results.""" 233 | selected_model = st.radio( 234 | label="**OpenAI model**", 235 | options=["gpt-3.5-turbo", "gpt-4"], 236 | index=1, 237 | on_change=reset_state, 238 | ) 239 | 240 | temperature = st.slider( 241 | label="**Temperature**", 242 | min_value=0.0, 243 | max_value=2.0, 244 | value=0.6, 245 | step=0.1, 246 | on_change=reset_state, 247 | ) 248 | 249 | n_examples = st.slider( 250 | label="**Examples**", 251 | help="Number of search results. 
Those are added to the prompt.", 252 | min_value=1, 253 | max_value=10, 254 | value=5, 255 | step=1, 256 | on_change=reset_state, 257 | ) 258 | 259 | st.button("Reset chat", on_click=reset_state, type="primary", help="Reset the chat history") 260 | 261 | return selected_model, temperature, n_examples 262 | 263 | 264 | def llm_call(selected_model: str, temperature: float, message: MessageTemplate, messages_placeholders: dict) -> str: 265 | """Call the LLM.""" 266 | message_placeholder = st.empty() 267 | full_response = "" 268 | for response in TextGenerator.generate( 269 | model=selected_model, 270 | temperature=temperature, 271 | messages=[message], 272 | message_kwargs=messages_placeholders, 273 | stream=True, 274 | ): 275 | full_response += response.choices[0].delta.get("content", "") 276 | message_placeholder.markdown(full_response + "▌") 277 | 278 | message_placeholder.markdown(full_response) 279 | 280 | return full_response 281 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/eli3_page.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | from genai import MessageTemplate 4 | from genai.eyfs import TextGenerator 5 | from genai.streamlit_pages.utils import reset_state 6 | 7 | 8 | def eli3(sidebar: bool = True) -> None: 9 | """Explain me a concept like I'm 3.""" 10 | 11 | # Define your custom CSS 12 | custom_css = """ 13 | 24 | """ 25 | 26 | # Apply the custom CSS 27 | st.markdown(custom_css, unsafe_allow_html=True) 28 | 29 | st.title("Explain-Like-I'm-3") 30 | 31 | # Create the generator 32 | if sidebar: 33 | with st.sidebar: 34 | selected_model = st.radio( 35 | label="**OpenAI model**", 36 | options=["gpt-3.5-turbo", "gpt-4"], 37 | on_change=reset_state, 38 | ) 39 | temperature = st.slider( 40 | label="**Temperature**", 41 | min_value=0.0, 42 | max_value=2.0, 43 | value=0.6, 44 | step=0.1, 45 | on_change=reset_state, 46 | ) 47 | 48 | 
st.button("Reset chat", on_click=reset_state, type="primary", help="Reset the chat history") 49 | else: 50 | selected_model = "gpt-4" 51 | temperature = 0.6 52 | 53 | prompt_template = MessageTemplate.load("src/genai/eli3/prompts/eli3_chat_2.json") 54 | 55 | # Initialize chat history 56 | if "messages" not in st.session_state: 57 | st.session_state.messages = [{"role": prompt_template.role, "content": prompt_template.content}] 58 | 59 | # Display chat messages from history on app rerun. 60 | # The first message is the prompt, so we skip it. 61 | for message in st.session_state.messages[1:]: 62 | with st.chat_message(message["role"]): 63 | st.markdown(message["content"]) 64 | 65 | # Accept user input 66 | prompt = st.chat_input("How do whales breathe?") 67 | if prompt: 68 | # Display user message in chat message container 69 | with st.chat_message("user"): 70 | st.markdown(prompt) 71 | # Add user message to chat history 72 | st.session_state.messages.append({"role": "user", "content": prompt}) 73 | 74 | with st.chat_message("assistant"): 75 | message_placeholder = st.empty() 76 | full_response = "" 77 | for response in TextGenerator.generate( 78 | model=selected_model, 79 | temperature=temperature, 80 | messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], 81 | message_kwargs=None, 82 | stream=True, 83 | ): 84 | full_response += response.choices[0].delta.get("content", "") 85 | message_placeholder.markdown(full_response + "▌") 86 | message_placeholder.markdown(full_response) 87 | st.session_state.messages.append({"role": "assistant", "content": full_response}) 88 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/eyfs_compare_page.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | from genai.eyfs import TextGenerator 4 | from genai.eyfs import get_embedding 5 | from 
def eyfs_compare() -> None:
    """Compare the EYFS prototypes.

    Renders two side-by-side activity generators for the same user query:
    the plain EYFS prompt (left column) and the EYFS prompt augmented with
    BBC activity examples retrieved from a Pinecone index (right column).
    Both model responses are streamed into the page; the right column also
    lists the BBC sources it used.
    """
    st.title("Compare the EYFS prototypes")
    # Pinecone index holding the embedded BBC activities.
    index = get_index(index_name="eyfs-index-1718673592-index")
    areas_of_learning_desc = read_json("src/genai/eyfs/areas_of_learning.json")
    aol = list(areas_of_learning_desc.keys())

    with st.sidebar:
        # Select a model, temperature and number of results
        selected_model = st.radio(label="**OpenAI model**", options=["gpt-3.5-turbo", "gpt-4"], index=1)
        n_results = 10  # fills the {n_results} placeholder in the prompt templates
        temperature = st.slider(label="**Temperature**", min_value=0.0, max_value=2.0, value=0.6, step=0.1)

    # Select the areas of learning
    areas_of_learning = st.multiselect(label="**Areas of learning**", options=aol, default=aol)
    areas_of_learning_text = [v for k, v in areas_of_learning_desc.items() if k in areas_of_learning]

    # Describe each Area of Learning in an expanding window
    with st.expander("**Areas of Learning Description**"):
        for k, v in areas_of_learning_desc.items():
            if k in areas_of_learning:
                st.write(f"#### {k}")
                st.write(v.split("##")[-1])

    # Collapse the selected AoL descriptions into one prompt-ready string.
    areas_of_learning_text = "\n\n".join(areas_of_learning_text)
    location = st.selectbox(label="**Location**", options=["Indoor", "Outdoor", "Indoor or Outdoor"], index=2)

    # Get the user input
    query = st.text_input(
        label="**What's the topic you want activities for?**",
        value="Let's create activities educating children on how whales breathe",
        help="Prompt the large language model with a some text and it will generate an activity plan for you.",
    )
    if st.button(label="**Generate**", help="Generate an answer."):
        eyfs_prototype, eyfs_bbc_prototype = st.columns(2)
        eyfs_prototype.subheader("EYFS-related activities")
        eyfs_bbc_prototype.subheader("EYFS-related activities with ext KB (BBC)")
        # Left column: plain EYFS prompt, no retrieved examples.
        with eyfs_prototype:
            with st.spinner("Generating activities..."):
                res_box = st.empty()
                report = []
                # Create the messages
                paths = [
                    "src/genai/eyfs/prompts/system.json",
                    "src/genai/eyfs/prompts/context_and_task.json",
                    "src/genai/eyfs/prompts/constraints.json",
                    "src/genai/eyfs/prompts/situation.json",
                ]

                messages = [read_json(path) for path in paths]
                messages_placeholders = {
                    "description": query,
                    "areas_of_learning": areas_of_learning,
                    "n_results": n_results,
                    "location": location,
                    "areas_of_learning_text": areas_of_learning_text,
                }

                r = TextGenerator.generate(
                    model=selected_model,
                    temperature=temperature,
                    messages=messages,
                    message_kwargs=messages_placeholders,
                    stream=True,
                )

                # NOTE(review): a streamed chunk whose delta has no "content"
                # appends None; "".join would raise if such a chunk arrived
                # before the final ("stop") one — confirm against the OpenAI
                # streaming format in use.
                for chunk in r:
                    content = chunk["choices"][0].get("delta", {}).get("content")
                    report.append(content)
                    if chunk["choices"][0]["finish_reason"] != "stop":
                        result = "".join(report).strip()
                        res_box.markdown(f"{result}")

        # Right column: same prompt family, plus retrieved BBC examples.
        with eyfs_bbc_prototype:
            # Create the messages
            paths = [
                "src/genai/eyfs/prompts/system.json",
                "src/genai/eyfs/prompts/context_and_task_with_examples.json",
                "src/genai/eyfs/prompts/constraints.json",
                "src/genai/eyfs/prompts/situation.json",
            ]

            messages = [read_json(path) for path in paths]
            with st.spinner("Searching for relevant BBC activities..."):
                # Encode the query
                encoded_query = get_embedding(query)

                # Search with Pinecone
                similar_docs = query_pinecone(
                    index,
                    encoded_query,
                    areas_of_learning=areas_of_learning,
                    top_n=4,
                    max_n=4,
                )

            with st.spinner("Generating activities..."):
                res_box = st.empty()
                report = []
                # Create the prompt
                messages_placeholders = {
                    "description": query,
                    "areas_of_learning": areas_of_learning,
                    "n_results": n_results,
                    "location": location,
                    "areas_of_learning_text": areas_of_learning_text,
                    "activity_examples": "\n======\n".join(
                        [similar_doc["metadata"]["text"] for similar_doc in similar_docs]
                    ),
                }

                r = TextGenerator.generate(
                    model=selected_model,
                    temperature=temperature,
                    messages=messages,
                    message_kwargs=messages_placeholders,
                    stream=True,
                )

                for chunk in r:
                    content = chunk["choices"][0].get("delta", {}).get("content")
                    report.append(content)
                    if chunk["choices"][0]["finish_reason"] != "stop":
                        result = "".join(report).strip()
                        res_box.markdown(f"{result}")

                # Cite the BBC pages whose text was injected into the prompt.
                st.subheader("Sources")

                for similar_doc in similar_docs:
                    title = similar_doc["metadata"]["title"]
                    url = similar_doc["id"]
                    category = similar_doc["metadata"]["areas_of_learning"]
                    st.write(f"""- [{title}]({url}) {category}""")
get_index(index_name=index_name) 19 | 20 | with st.sidebar: 21 | # Select a model, temperature and number of results 22 | selected_model = st.radio( 23 | label="**OpenAI model**", 24 | options=["gpt-3.5-turbo", "gpt-4"], 25 | index=1, 26 | on_change=reset_state, 27 | ) 28 | n_results = 10 29 | temperature = st.slider( 30 | label="**Temperature**", 31 | min_value=0.0, 32 | max_value=2.0, 33 | value=0.6, 34 | step=0.1, 35 | on_change=reset_state, 36 | ) 37 | 38 | st.button("Reset chat", on_click=reset_state, type="primary", help="Reset the chat history") 39 | 40 | # Select the areas of learning 41 | areas_of_learning = st.multiselect( 42 | label="**Areas of learning**", 43 | options=aol, 44 | default=aol, 45 | on_change=reset_state, 46 | ) 47 | areas_of_learning_text = [v for k, v in areas_of_learning_desc.items() if k in areas_of_learning] 48 | 49 | # Describe each Area of Learning in an expanding window 50 | with st.expander("**Areas of Learning Description**"): 51 | for k, v in areas_of_learning_desc.items(): 52 | if k in areas_of_learning: 53 | st.write(f"#### {k}") 54 | st.write(v.split("##")[-1]) 55 | 56 | areas_of_learning_text = "\n\n".join(areas_of_learning_text) 57 | location = st.selectbox(label="**Location**", options=["Indoor", "Outdoor", "Indoor or Outdoor"], index=2) 58 | 59 | # Create the messages 60 | paths = [ 61 | "src/genai/eyfs/prompts/system.json", 62 | "src/genai/eyfs/prompts/context_and_task_with_examples.json", 63 | "src/genai/eyfs/prompts/constraints.json", 64 | "src/genai/eyfs/prompts/situation.json", 65 | ] 66 | 67 | prompt_templates = [MessageTemplate.load(path) for path in paths] 68 | 69 | # Initialize chat history 70 | if "messages" not in st.session_state: 71 | st.session_state.messages = [ 72 | {"role": prompt_template.role, "content": prompt_template.content} for prompt_template in prompt_templates 73 | ] 74 | 75 | # Display chat messages from history on app rerun. 76 | # The first messages are the prompt, so we skip it. 
77 | for message in st.session_state.messages[len(prompt_templates) :]: 78 | with st.chat_message(message["role"]): 79 | st.markdown(message["content"]) 80 | 81 | prompt = st.chat_input("Let's create activities educating children on how whales breathe") 82 | if prompt: 83 | # Display user message in chat message container 84 | with st.chat_message("user"): 85 | st.markdown(prompt) 86 | 87 | # Add user message to chat history 88 | # The very first message will be used to fill in the prompt template 89 | # after that, we store the user messages in the chat history 90 | if len(st.session_state.messages) == len(prompt_templates): 91 | query = prompt 92 | with st.spinner("Searching for relevant BBC activities..."): 93 | # Encode the query 94 | encoded_query = get_embedding(query) 95 | 96 | # Search with Pinecone 97 | similar_docs = query_pinecone( 98 | index, 99 | encoded_query, 100 | areas_of_learning=areas_of_learning, 101 | top_n=4, 102 | max_n=4, 103 | ) 104 | 105 | if "similar_docs" not in st.session_state: 106 | st.session_state["similar_docs"] = similar_docs 107 | 108 | else: 109 | st.session_state.messages.append({"role": "user", "content": prompt}) 110 | query = "" 111 | 112 | with st.chat_message("assistant"): 113 | message_placeholder = st.empty() 114 | full_response = "" 115 | 116 | messages_placeholders = { 117 | "description": query, 118 | "areas_of_learning": areas_of_learning, 119 | "n_results": n_results, 120 | "location": location, 121 | "areas_of_learning_text": areas_of_learning_text, 122 | "activity_examples": "\n======\n".join( 123 | [similar_doc["metadata"]["text"] for similar_doc in st.session_state["similar_docs"]] 124 | ), 125 | } 126 | 127 | r = TextGenerator.generate( 128 | model=selected_model, 129 | temperature=temperature, 130 | messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], 131 | message_kwargs=messages_placeholders, 132 | stream=True, 133 | ) 134 | 135 | for response in r: 136 | full_response 
def query_pinecone(
    index: pinecone.index.Index,
    encoded_query: list,
    areas_of_learning: list,
    top_n: int = 4,
    max_n: int = 4,
) -> list:
    """Search the Pinecone index for BBC activities and subsample the matches.

    Parameters
    ----------
    index
        Pinecone index to search.

    encoded_query
        Embedding vector to search for.

    areas_of_learning
        Areas of learning to filter by.

    top_n
        Number of results to return.

    max_n
        Maximum number of results to keep as prompt examples.

    Returns
    -------
    docs
        List of documents.

    """
    bbc_filter = {
        "areas_of_learning": {"$in": areas_of_learning},
        "source": {"$eq": "BBC"},
    }
    matches = index.query(
        vector=encoded_query,
        top_k=top_n,
        include_metadata=True,
        filter=bbc_filter,
    )["matches"]

    # Randomly subsample so the examples fit within the prompt budget.
    keep = sample_docs(len(matches), max_n)
    return [matches[i] for i in keep]
default=aol, 60 | on_change=reset_state, 61 | ) 62 | areas_of_learning_text = [v for k, v in areas_of_learning_desc.items() if k in areas_of_learning] 63 | 64 | # Describe each Area of Learning in an expanding window 65 | with st.expander("**Areas of Learning Description**"): 66 | for k, v in areas_of_learning_desc.items(): 67 | if k in areas_of_learning: 68 | st.write(f"#### {k}") 69 | st.write(v.split("##")[-1]) 70 | 71 | areas_of_learning_text = "\n\n".join(areas_of_learning_text) 72 | location = st.selectbox(label="**Location**", options=["Indoor", "Outdoor", "Indoor or Outdoor"], index=2) 73 | 74 | # Create the messages 75 | paths = [ 76 | "src/genai/eyfs/prompts/system.json", 77 | "src/genai/eyfs/prompts/context_and_task.json", 78 | "src/genai/eyfs/prompts/constraints.json", 79 | "src/genai/eyfs/prompts/situation.json", 80 | ] 81 | 82 | prompt_templates = [MessageTemplate.load(path) for path in paths] 83 | 84 | # Initialize chat history 85 | if "messages" not in st.session_state: 86 | st.session_state.messages = [ 87 | {"role": prompt_template.role, "content": prompt_template.content} for prompt_template in prompt_templates 88 | ] 89 | 90 | # Display chat messages from history on app rerun. 91 | # The first messages are the prompt, so we skip it. 
92 | for message in st.session_state.messages[len(prompt_templates) :]: 93 | with st.chat_message(message["role"]): 94 | st.markdown(message["content"]) 95 | 96 | # Accept user input 97 | prompt = st.chat_input("Let's create activities educating children on how whales breathe") 98 | if prompt: 99 | # Display user message in chat message container 100 | with st.chat_message("user"): 101 | st.markdown(prompt) 102 | 103 | # Add user message to chat history 104 | if len(st.session_state.messages) == len(prompt_templates): 105 | description = prompt 106 | else: 107 | st.session_state.messages.append({"role": "user", "content": prompt}) 108 | description = "" 109 | 110 | with st.chat_message("assistant"): 111 | message_placeholder = st.empty() 112 | full_response = "" 113 | 114 | for response in TextGenerator.generate( 115 | model=selected_model, 116 | temperature=temperature, 117 | messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], 118 | message_kwargs={ 119 | "description": description, 120 | "areas_of_learning": areas_of_learning, 121 | "n_results": n_results, 122 | "location": location, 123 | "areas_of_learning_text": areas_of_learning_text, 124 | }, 125 | stream=True, 126 | ): 127 | full_response += response.choices[0].delta.get("content", "") 128 | message_placeholder.markdown(full_response + "▌") 129 | message_placeholder.markdown(full_response) 130 | st.session_state.messages.append({"role": "assistant", "content": full_response}) 131 | -------------------------------------------------------------------------------- /src/genai/streamlit_pages/parenting_page.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | 4 | from datetime import datetime 5 | 6 | import s3fs 7 | import streamlit as st 8 | 9 | from dotenv import load_dotenv 10 | from streamlit_feedback import streamlit_feedback 11 | 12 | from genai.eyfs import TextGenerator 13 | from genai.eyfs import 
get_embedding 14 | from genai.message_history import InMemoryMessageHistory 15 | from genai.prompt_template import FunctionTemplate 16 | from genai.prompt_template import MessageTemplate 17 | from genai.streamlit_pages.utils import get_index 18 | from genai.streamlit_pages.utils import query_pinecone 19 | from genai.streamlit_pages.utils import reset_state 20 | 21 | 22 | load_dotenv() 23 | 24 | 25 | def parenting_chatbot(aws_key: str, aws_secret: str, s3_path: str, sidebar: bool = True) -> None: 26 | """Early-years parenting chatbot.""" 27 | 28 | # Define your custom CSS 29 | custom_css = """ 30 | 41 | """ 42 | 43 | # Apply the custom CSS 44 | st.markdown(custom_css, unsafe_allow_html=True) 45 | 46 | st.title("Early-years parenting chatbot") 47 | st.write( 48 | "This is a chatbot based on information from the [NHS Start for Life](https://www.nhs.uk/start-for-life/) website. " 49 | "You can ask it questions about pregnancy, birth and parenthood. " 50 | "Please note that this is a prototype and the answers should not be used as medical advice." 
51 | ) 52 | st.write("---") 53 | 54 | selected_model = "gpt-3.5-turbo" 55 | temperature = 0.6 56 | pinecone_index = get_index(index_name="eyfs-index-1718673592-index") 57 | 58 | if sidebar: 59 | with st.sidebar: 60 | st.button("Reset chat", on_click=reset_state, type="primary", help="Reset the chat history") 61 | 62 | system_message = MessageTemplate.load("src/genai/parenting_chatbot/prompts/system.json") 63 | filter_refs_function = FunctionTemplate.load("src/genai/parenting_chatbot/prompts/filter_refs_function.json") 64 | filter_refs_user_message = MessageTemplate.load("src/genai/parenting_chatbot/prompts/filter_refs_user.json") 65 | filter_refs_system_message = MessageTemplate.load("src/genai/parenting_chatbot/prompts/filter_refs_system.json") 66 | 67 | if "session_uuid" not in st.session_state: 68 | st.session_state["session_uuid"] = f"{current_time()}-{str(uuid.uuid4())}" 69 | 70 | # Single submitted feedback 71 | if "feedback" not in st.session_state: 72 | st.session_state["feedback"] = None 73 | 74 | # st.session_state["memory"] controls the flow to OpenAI and logging 75 | if "memory" not in st.session_state: 76 | st.session_state["memory"] = InMemoryMessageHistory() 77 | st.session_state["memory"].add_message(system_message.to_prompt()) 78 | 79 | # st.session_state["messages"] shows the conversation in the UI 80 | if "messages" not in st.session_state: 81 | # instantiate the memory instead of None 82 | st.session_state["messages"] = [system_message.to_prompt()] 83 | 84 | for message in st.session_state.messages: 85 | if message["role"] in ["user", "assistant"]: 86 | with st.chat_message(message["role"]): 87 | st.markdown(message["content"]) 88 | 89 | # Accept user input 90 | prompt = st.chat_input("What's on your mind?") 91 | if prompt: 92 | # Display user message in chat message container 93 | with st.chat_message("user"): 94 | st.markdown(prompt) 95 | 96 | # Search the vector index 97 | search_results = query_pinecone( 98 | index=pinecone_index, 99 | 
encoded_query=get_embedding(prompt), 100 | top_n=3, 101 | filters={ 102 | "source": {"$eq": "nhs_full_page"}, 103 | }, 104 | ) 105 | 106 | nhs_texts = [] 107 | nhs_urls = [] 108 | for result in search_results: 109 | pred = TextGenerator.generate( 110 | temperature=0.0, 111 | messages=[filter_refs_system_message, filter_refs_user_message], 112 | message_kwargs={"text": result["metadata"]["text"], "question": prompt}, 113 | functions=[filter_refs_function.to_prompt()], 114 | function_call={"name": filter_refs_function.name}, 115 | ) 116 | 117 | pred = json.loads(pred["choices"][0]["message"]["function_call"]["arguments"])["prediction"] 118 | 119 | if pred: 120 | nhs_texts.append(result["metadata"]["text"]) 121 | nhs_urls.append(result["metadata"]["url"]) 122 | 123 | if nhs_texts: 124 | nhs_texts = "\n===\n".join(nhs_texts) 125 | 126 | # Log message for the UI before adding the references 127 | st.session_state["messages"].append({"role": "user", "content": prompt}) 128 | 129 | # Add user message to chat history 130 | prompt = f"""###NHS Start for Life references###\n{nhs_texts}\n\n###User message###\n{prompt}""" 131 | st.session_state["memory"].add_message({"role": "user", "content": prompt}) 132 | 133 | write_to_s3( 134 | aws_key, 135 | aws_secret, 136 | f"{s3_path}/session-logs/{st.session_state['session_uuid']}", 137 | "messages", 138 | st.session_state["memory"].messages[-1], 139 | how="a", 140 | ) 141 | 142 | with st.chat_message("assistant"): 143 | message_placeholder = st.empty() 144 | full_response = "" 145 | 146 | for response in TextGenerator.generate( 147 | model=selected_model, 148 | temperature=temperature, 149 | messages=st.session_state["memory"].get_messages(), 150 | message_kwargs=None, 151 | stream=True, 152 | ): 153 | full_response += response.choices[0].delta.get("content", "") 154 | message_placeholder.markdown(full_response + "▌") 155 | 156 | # Submit feedback 157 | streamlit_feedback( 158 | feedback_type="faces", 159 | single_submit=False, 160 
def write_to_s3(key: str, secret: str, s3_path: str, filename: str, data: dict, how: str = "a") -> None:
    """Write data to a jsonl file in S3.

    Parameters
    ----------
    key
        AWS access key ID.

    secret
        AWS secret access key.

    s3_path
        S3 bucket path.

    filename
        Name of the file to write to (without the ".jsonl" suffix).

    data
        Data to write to the file.

    how
        How to write to the file. Default is "a" for append. Use "w" to overwrite.

    """
    fs = s3fs.S3FileSystem(key=key, secret=secret)
    # Fix: the filename argument was previously ignored, so every caller's
    # records landed in the same hard-coded file.
    with fs.open(f"{s3_path}/{filename}.jsonl", how) as f:
        f.write(f"{json.dumps(data)}\n")


def current_time() -> str:
    """Return the current time as a "YYYYMMDDHHMMSS" string. Used as part of the session UUID."""
    return datetime.now().strftime("%Y%m%d%H%M%S")


def reset_state(key: Optional[str] = None) -> None:
    """Delete every known chat/session entry from Streamlit session state.

    The ``key`` parameter is accepted for interface compatibility with widget
    callbacks but is currently unused: all entries are cleared unconditionally.
    (Previously the loop variable shadowed this parameter.)
    """
    state_keys = [
        "similar_docs",
        "areas_of_learning_text",
        "areas_of_learning",
        "n_results",
        "location",
        "messages",
        "choice",
        "choices",
        "examples",
        "learning_goals",
        "full_response",
        "memory",
        "feedback",
        "user_feedback",
        "session_uuid",
    ]
    for state_key in state_keys:
        # Missing keys are fine: not every page populates every entry.
        try:
            del st.session_state[state_key]
        except KeyError:
            pass
def query_pinecone(
    index: "pinecone.index.Index",
    encoded_query: list,
    filters: dict,
    top_n: int = 5,
    max_n: int = 10,
) -> list:
    """Query the pinecone index and return the matching documents.

    Parameters
    ----------
    index
        Pinecone index to query.

    encoded_query
        Embedding vector of the query to search for.

    filters
        Metadata filters to apply (e.g. restricting the areas of learning).

    top_n
        Number of results to return.

    max_n
        Unused; kept only for backward compatibility with existing callers.
        (Historically: maximum number of results to keep as prompt examples.)

    Returns
    -------
    docs
        List of matching documents ("matches" from the Pinecone response).

    """
    # Doc fix: the previous docstring described non-existent `query` /
    # `areas_of_learning` parameters; the signature itself is unchanged.
    results = index.query(
        vector=encoded_query,
        top_k=top_n,
        include_metadata=True,
        filter=filters,
    )

    return results["matches"]
class PineconeIndex:
    """Wrap the Pinecone (v3+) client API.

    Note that this is a very thin and untested wrapper. It is not intended for
    production use. Its main purpose is to support this repo's prototypes and
    tiny indexes.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        environment: Optional[str] = None,
    ) -> None:
        """Create the Pinecone client.

        Parameters
        ----------
        api_key
            Pinecone API key; falls back to the ``PINECONE_API_KEY`` env var.

        environment
            Unused by the v3+ client; accepted for backward compatibility.
        """
        self.pc = Pinecone(api_key=api_key or os.environ["PINECONE_API_KEY"])

    def connect(self, index_name: str) -> "Pinecone.Index":
        """Connect to the index."""
        return self.pc.Index(index_name)

    def build_and_upsert(
        self,
        index_name: str,
        dimension: int,
        metadata_config: dict,
        metric: str,
        docs: list,
        batch_size: int = 100,
        delete_if_exists: bool = False,
        **kwargs,
    ) -> None:
        """Build the index (if it does not exist) and add docs.

        Parameters
        ----------
        index_name
            Name of the index.

        dimension
            Length of the indexed vectors.

        metadata_config
            Legacy (pre-v3) metadata config; not supported by the v3+ client,
            accepted and ignored for backward compatibility.

        metric
            The distance metric to use.

        docs
            The documents to index.

        batch_size
            The batch size to use when indexing.

        delete_if_exists
            Whether to delete the index if it already exists.

        kwargs
            Forwarded to ``Pinecone.create_index`` — e.g. the ``spec=``
            argument (ServerlessSpec/PodSpec) required by the v3+ client.

        """
        if delete_if_exists:
            self.delete(index_name)

        # Bug fix: the old code called module-level `pinecone.list_indexes()` /
        # `pinecone.create_index()`, which were never imported (the import was
        # commented out and the calls carried `# noqa: F821`) and no longer
        # exist in the v3+ client. Use the client instance instead.
        if index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=index_name,
                dimension=dimension,
                metric=metric,
                **kwargs,
            )
            # Give the freshly created index time to become ready
            # (potential fix to avoid error 403).
            time.sleep(30)

        index = self.connect(index_name)

        for batched_docs in batch(docs, batch_size):
            index.upsert(batched_docs)

    def delete(self, index_name: str) -> None:
        """Delete the index, printing (not raising) any failure.

        NOTE: this was a ``@staticmethod`` calling the non-existent module-level
        ``pinecone.delete_index`` (always a NameError); it is now an instance
        method using the v3+ client, so ``self.delete(...)`` callers still work.
        """
        try:
            self.pc.delete_index(index_name)
        except Exception as e:
            print(e)  # noqa: T001
virtualenvs.create false \ 29 | && poetry install --no-interaction --no-ansi 30 | 31 | # Specify the command to run your application 32 | CMD ["sh", "-c", "gunicorn --bind 0.0.0.0:$PORT whatsapp_bot:app"] 33 | -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/README.md: -------------------------------------------------------------------------------- 1 | # WhatsApp bot prototype 2 | 3 | This prototype provides a messaging front end (WhatsApp) to a large language model (LLM) using prompts related to early-years activities. The idea is that in some situations a widely-adopted messaging platform could be a more natural and accessible user interface for caregivers or educators than a web app. 4 | 5 | To set up the WhatsApp bot, you will need to sign up to [Twilio](https://www.twilio.com/), install [ngrok](https://ngrok.com/) and set up a [Heroku](https://www.heroku.com/) account. At the time of writing this, Twilio has a free tier that you can use to test the bot, and Heroku provides affordable plans (less than $10 per month) that will be sufficient for our needs. 6 | 7 | - [How it works](#how-it-works) 8 | - [Running the API locally](#running-the-api-locally) 9 | - [Deploying the API on Heroku](#deploying-the-api-on-heroku) 10 | 11 | 12 | ## How it works 13 | 14 | Essentially, we use Flask to create an API, it runs on Heroku and can be called via the `/text` endpoint. Twilio provides a WhatsApp contact that receives messages from users and calls the API endpoint when new messages come in. 15 | 16 | At the moment, the prototype is not exactly a chatbot as it only allows two types of queries and cannot answer follow-up questions. 17 | 18 | You can start the interaction by saying anything, eg 'Hi" 19 | 20 |

21 | 22 |

23 | 24 | The 'Explain ' message triggers the "Explain like I'm 3" prompt 25 | 26 |

27 | 28 |

29 | 30 | 31 | The 'Activities ' message triggers the personalised activity generation prompt. Note that this prompt is quite long and takes 20+ seconds to respond. This is longer than Twilio's [max timeout](https://www.twilio.com/docs/usage/webhooks/webhooks-connection-overrides) (15 seconds) and hence we first reply with a generic hold message, and then send the actual response when it's ready. 32 | 33 |

34 | 35 |

36 | 37 | This is implemented by the activity message triggering a new thread in which the app calls the OpenAI API and then use Twilio to send the response back to the same user. This should be seen as a quick hack for prototyping purposes, as this might result in a dangling thread that isn't destructed after sending the reply. A better solution would be to use a task queue (eg, Celery) to handle the long-running task. 38 | 39 | Note that the response for this prompt is too long for a single message (WhatsApp limit is 1600 characters), and hence we need to split it up in chunks and send as multiple separate messages. 40 | 41 |

42 | 43 |

44 | 45 | Overall, this prototype has been useful to explore the technical feasibility of using WhatsApp as a front end for interacting with a LLM. It also highlights differences between a messaging app and a web app user experience, where the messaging app will be probably better suited for shorter interactions and responses from the LLM (particularly because we can't stream the text character by character to WhatsApp). This means that the prompts that we used for our web app prototype would need to be optimised to provide shorter answers (eg, responding with only one idea at a time). 46 | 47 | Possible future iterations could explore using [message history](https://www.twilio.com/docs/sms/tutorials/how-to-retrieve-and-modify-message-history-python) to enable a more natural chatbot interaction and allow the user to configure our prompts (eg, the areas of learning or the age of the kids) using natural language. 48 | 49 | 50 | ## Running the API locally 51 | 52 | To run the API locally, navigate to the root of this repo and start the app 53 | 54 | ``` 55 | poetry run python whatsapp_bot.py 56 | ``` 57 | 58 | Start ngrok to expose the app to the internet (assuming the app is using port 5000). This will provide you with a URL that you can use to connect to your app. 59 | 60 | ``` 61 | ngrok http 5000 62 | ``` 63 | 64 | Then in Twilio you'll need to set up a Sandbox for WhatsApp and add the endpoint URL (created by ngrok) of your app under "Sandbox settings" in the box "When the message comes in". 65 | 66 | ![twilio_ngrok](screenshots/twilio_ngrok.png) 67 | 68 | You can now go to WhatsApp, connect with the phone number provided by Twilio and chat with your bot. 69 | 70 | ## Deploying the API on Heroku 71 | 72 | Testing your app locally is great, but what if you close your machine? To keep your WhatsApp bot running even when you rest, we'll deploy it on Heroku using Docker. 
73 | 74 | First, make sure you've set up heroku on your machine and log in the container registry 75 | 76 | ``` 77 | heroku container:login 78 | ``` 79 | 80 | Then you can create a new app 81 | 82 | ``` 83 | heroku create 84 | ``` 85 | 86 | This will create an app with a random name - this can be changed later. 87 | 88 | Set up your environment variables 89 | ``` 90 | heroku config:set OPENAI_API_KEY= 91 | heroku config:set TWILIO_ACCOUNT_SID= 92 | heroku config:set TWILIO_AUTH_TOKEN= 93 | ``` 94 | 95 | Navigate to your app's directory and build the container (because that's where we store the Dockerfile for this prototype) 96 | 97 | ``` 98 | cd src/genai/whatsapp_bot 99 | heroku container:push web --app --context-path ../../.. 100 | ``` 101 | 102 | Then release the container 103 | 104 | ``` 105 | heroku container:release web --app 106 | ``` 107 | 108 | Start the app 109 | 110 | ``` 111 | heroku ps:scale web=1 112 | ``` 113 | 114 | You can check that the app is working by opening it in your browser 115 | 116 | ``` 117 | heroku open 118 | ``` 119 | 120 | Finally, check the logs (useful if something isn't working) 121 | 122 | ``` 123 | heroku logs --tail 124 | ``` 125 | 126 | Once the app is running OK, you can configure the WhatsApp sandbox as described above, using your new Heroku URL. 127 | -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/prompts/eli3/eli3.json: -------------------------------------------------------------------------------- 1 | { 2 | "role": "user", 3 | "content": "###Instructions###\nYou are a helpful, kind, intelligent and polite early-years educator. Your task is to explain a concept to a 3 year old child. You must explain it in simple words that a young kid would understand. You must also be patient and never offend or be aggressive. 
Gendered language and any adjectives about the kid are strictly prohibited.\n\n###Question###\n{input}\n\n###Answer###\n" 4 | } 5 | -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/chat_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/chat_1.png -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/chat_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/chat_2.png -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/chat_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/chat_3.png -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/chat_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/chat_4.png -------------------------------------------------------------------------------- /src/genai/whatsapp_bot/screenshots/twilio_ngrok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nestauk/discovery_generative_ai/b5ab4b0bf1feda7cd4a85a18ba20aa996cfb9e0d/src/genai/whatsapp_bot/screenshots/twilio_ngrok.png 
# Use an official Python runtime as a parent image
FROM --platform=linux/amd64 python:3.9.17-slim

# Set environment variables (key=value form; the space-separated ENV form is deprecated)
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Install system dependencies and clean the apt cache to keep the image small
RUN apt-get update \
    && apt-get install -y --no-install-recommends gcc \
    && rm -rf /var/lib/apt/lists/*

# Install poetry
RUN pip install "poetry==1.5.1"

# Set the working directory in the Docker image
WORKDIR /app

# Copy only requirements first to cache them in a docker layer
COPY pyproject.toml poetry.lock ./
COPY src/genai /app/src/genai
COPY README.md /app/README.md
COPY whatsapp_parenting_bot.py /app/whatsapp_parenting_bot.py

# Contains secrets - don't push the image to dockerhub
COPY .env /app/.env

# Project initialization: install into the system interpreter (no virtualenv)
RUN poetry config virtualenvs.create false \
    && poetry install --no-interaction --no-ansi

# Specify the command to run your application
CMD ["sh", "-c", "gunicorn --bind 0.0.0.0:$PORT whatsapp_parenting_bot:app"]
4 | 5 | The relevant urls were found in [this spreadsheet](https://docs.google.com/spreadsheets/d/1loK4_sCp35JRVM9z_xW8inF3eqb7pL-jEuFWuJ5fjNE/edit#gid=0) 6 | 7 | 8 | ## Usage 9 | 10 | First navigate to the scraping folder from repo's root 11 | ``` 12 | cd src/scraping/activities 13 | ``` 14 | 15 | Then run the script 16 | ``` 17 | poetry run python scrape_activities.py 18 | ``` 19 | 20 | The final output table is stored in `data/tiny_happy_people - final.csv` 21 | with the following columns: 22 | - CONTENT: Title of the activity 23 | - SHORT DESCRIPTION: Short description of the activity 24 | - Age Range (if applicable): Age range of the activity 25 | - Type: Type of the activity (there are 20+ different types used by BBC) 26 | - URL: URL of the activity 27 | - **text: Scraped text, describing the activity in greater detail** 28 | 29 | ## Notes 30 | 31 | You can use `scrape_notebook.ipynb` to test the scraping function for single URLs 32 | -------------------------------------------------------------------------------- /src/scraping/activities/scrape_activities.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get text data from the BBC's Tiny Happy People website about kids activities 3 | 4 | Usage: 5 | 6 | First navigate to the scraping folder 7 | $ cd src/scraping/activities 8 | 9 | Then run the script 10 | $ poetry run python scrape_activities.py 11 | 12 | The final output file is stored in data/tiny_happy_people - final.csv 13 | with the following columns: 14 | - CONTENT: Title of the activity 15 | - SHORT DESCRIPTION: Short description of the activity 16 | - Age Range (if applicable): Age range of the activity 17 | - Type: Type of the activity (there are 20+ different types used by BBC) 18 | - URL: URL of the activity 19 | - text: Scraped text, describing the activity in greater detail 20 | 21 | """ 22 | 23 | import csv 24 | import logging 25 | 26 | from pathlib import Path 27 | from time import sleep 28 | 29 | 
def pad_element(element: "bs4.element.Tag") -> str:
    """Recursively flatten an HTML element to text, padding block-level tags.

    Bare strings are returned unchanged. Paragraph and heading tags get a
    space before and after their (space-joined) children so that words from
    adjacent elements do not run together; any other tag simply concatenates
    its children's text.

    Args:
        element (bs4.element.Tag): HTML element (or plain string) to flatten

    Returns:
        str: Text with spaces
    """
    if isinstance(element, str):
        return element

    children_text = [pad_element(child) for child in element.children]
    if element.name in ("p", "h1", "h2", "h3", "h4", "h5", "h6"):
        return " " + " ".join(children_text) + " "
    return "".join(children_text)
BeautifulSoup(response.text, "html.parser") 99 | 100 | # Remove all tags 101 | for style_tag in soup.find_all("style"): 102 | style_tag.extract() 103 | 104 | # Find the headline 105 | headline = soup.find("h1", class_="blocks-article__headline") 106 | 107 | # Fetch all the content starting from headline 108 | content = [] 109 | curr_element = headline.find_next_sibling() 110 | while curr_element is not None: 111 | content.append(curr_element) 112 | curr_element = curr_element.find_next_sibling() 113 | 114 | # Clean the text from HTML tags and add spaces 115 | clean_content = "".join(pad_element(element) for element in content) 116 | 117 | # Remove the end of the webpage 118 | if any(marker in clean_content for marker in END_MARKERS): 119 | for marker in END_MARKERS: 120 | clean_content = clean_content.split(marker)[0] 121 | 122 | # Remove the boilerplate text 123 | for text in SCRAP_TEXT: 124 | clean_content = clean_content.replace(text, "") 125 | 126 | return clean_content.strip() 127 | 128 | 129 | if __name__ == "__main__": 130 | urls_df = pd.read_csv(URL_PATH) 131 | 132 | # Fetch the already scraped urls 133 | if SCRAPED_PATH.exists(): 134 | scraped_urls = pd.read_csv(SCRAPED_PATH, names=["URL", "text"]).URL.to_list() 135 | else: 136 | scraped_urls = [] 137 | 138 | new_urls_df = urls_df[~urls_df["URL"].isin(scraped_urls)] 139 | 140 | # Scrape the urls 141 | with open(SCRAPED_PATH, "a") as f: 142 | writer = csv.writer(f) 143 | for row in tqdm(new_urls_df.itertuples(), total=len(new_urls_df)): 144 | try: 145 | # if url starts with 'www' then add 'https://' 146 | url = "https://" + row.URL if row.URL.startswith("www") else row.URL 147 | text = web_scraper(url) 148 | writer.writerow([row.URL, text]) 149 | except Exception: 150 | logging.warning(f"Error scraping {row.URL}") 151 | sleep(0.5) 152 | 153 | # Create the final output table 154 | final_df = urls_df.merge( 155 | pd.read_csv(SCRAPED_PATH, names=["URL", "text"]), 156 | on="URL", 157 | how="left", 158 | ) 159 | 
final_df.to_csv(OUTPUTS_PATH, index=False) 160 | -------------------------------------------------------------------------------- /src/scraping/activities/scrape_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Notebook for testing the scraping functions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from scrape_activities import web_scraper\n", 17 | "import scrape_activities\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "pd.read_csv(scrape_activities.URL_PATH)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "url = 'https://www.bbc.co.uk/tiny-happy-people/night-night-teddy/znmspg8'\n", 37 | "web_scraper(url)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "pd.read_csv(scrape_activities.OUTPUTS_PATH)" 47 | ] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": ".venv", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 3 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython3", 66 | "version": "3.9.17" 67 | }, 68 | "orig_nbformat": 4 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 2 72 | } 73 | -------------------------------------------------------------------------------- /src/scraping/start_for_life/README.md: -------------------------------------------------------------------------------- 1 
| # A knowledge base for testing a parenting chatbot 2 | 3 | You can use `scrape_sitemap.py` and `scrape_startforlife.py` scripts to collect trusted information and advice about raising a child from the [NHS Start for Life](https://www.nhs.uk/start-for-life/) website 4 | 5 | ## Usage 6 | 7 | ### Getting the sitemap URLs 8 | 9 | First navigate to the scraping folder from repo's root 10 | ``` 11 | cd src/scraping/start_for_life 12 | ``` 13 | 14 | Then run the script 15 | ``` 16 | poetry run python scrape_sitemap.py 17 | ``` 18 | 19 | This will create a file in `data/sitemap.csv` with the following columns: 20 | - Title: Title of the page 21 | - URL: URL of the page 22 | 23 | ### Getting the text content 24 | 25 | After fetching the sitemap URLs, run the following script 26 | ``` 27 | poetry run python scrape_startforlife.py 28 | ``` 29 | 30 | The final output file will be stored in `data/startforlife.csv` with the following columns: 31 | - URL: URL of the page 32 | - content_type: Content type (ie, class tag of the div section) 33 | - header: Header of the page's section 34 | - content: Content of the page's section 35 | - content_no: Index of the section (ie, if there are multiple sections on the page, this is the integer index of the section) 36 | -------------------------------------------------------------------------------- /src/scraping/start_for_life/scrape_sitemap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get all URLs from the NHS https://www.nhs.uk/start-for-life/site-map/ 3 | 4 | Usage: 5 | 6 | First navigate to the scraping folder 7 | $ cd src/scraping/start_for_life 8 | 9 | Then run the script 10 | $ poetry run python scrape_sitemap.py 11 | 12 | The final output file is stored in data/sitemap.csv 13 | with the following columns: 14 | - Title: Title of the page 15 | - URL: URL of the page 16 | 17 | """ 18 | import csv 19 | import logging 20 | 21 | import requests 22 | 23 | from bs4 import BeautifulSoup 
def scrape_urls(base_url: str, csv_filename: str, timeout: float = 10) -> None:
    """
    Get all URLs from a website and save them to a CSV file

    Args:
        base_url:
            URL of the website to scrape

        csv_filename:
            Path to the CSV file to save the URLs to

        timeout:
            Timeout in seconds for the request to the website
    """
    # Collected (title, url) pairs, in page order.
    # NOTE: despite the log message, duplicates are NOT removed — preserved
    # from the original behaviour so downstream consumers are unaffected.
    url_rows = []

    # Make a request to the website
    response = requests.get(
        base_url,
        timeout=timeout,
        headers=HEADERS,
    )

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using Beautiful Soup
        soup = BeautifulSoup(response.text, "html.parser")
        # Find all anchor tags in the HTML
        for link in soup.find_all("a"):
            url = link.get("href")
            # Keep absolute http(s) URLs and site-relative ones.
            # ("https://..." also starts with "http", so one prefix tuple suffices.)
            if url and url.startswith(("http", "/")):
                # `link.text` is always a str for bs4 tags; guard kept defensive
                title = link.text.strip() if isinstance(link.text, str) else ""
                url_rows.append((title, url))

    # Write the collected URLs to a CSV file (header row first).
    # An empty file with just the header is written on HTTP failure,
    # matching the original behaviour.
    with open(csv_filename, "w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Title", "URL"])
        csv_writer.writerows(url_rows)

    logging.info(f"Scraping complete. {len(url_rows)} unique URLs have been saved to {csv_filename}.")
def merge_sections_based_on_headers(headers: List[str], sections: List[str]) -> (List[str], List[str], List[int]):
    """
    Merge sections based on headers, where sections without headers will be merged in the previous section that has a header

    Headerless sections that appear before any headed section are dropped.
    The indices of every merged/dropped section are reported so that any
    parallel lists can be filtered consistently by the caller.

    Args:
        headers
            List of headers (an empty value means "no header")

        sections
            List of sections, parallel to `headers`

    Returns:
        Tuple of lists with headers, merged sections and removed indices
    """
    kept_headers: List[str] = []
    merged_sections: List[str] = []
    removed: List[int] = []

    for idx, (header, section) in enumerate(zip(headers, sections)):
        if header:
            # A headed section starts a new output entry
            kept_headers.append(header)
            merged_sections.append(section)
        else:
            removed.append(idx)
            if merged_sections:
                # Fold the headerless content into the most recent headed section
                merged_sections[-1] += section

    return kept_headers, merged_sections, removed
def has_multiple_items(items: List[List[str]]) -> bool:
    """Check whether every second-level list holds at most one header.

    Bug fix: the previous version returned inside the loop's first iteration
    (`else: return True`), so only the first element was ever inspected and
    multi-header sections later in the list went undetected; an empty input
    returned None instead of a bool. Now every offending sub-list is logged
    and all elements are checked.

    Args:
        items
            List of per-section header lists

    Returns:
        True when every sub-list has at most one item, False otherwise
        (an empty input is vacuously True)
    """
    all_single = True
    for item in items:
        if len(item) > 1:
            logging.info(f"Multiple headers in {item}")
            all_single = False
    return all_single
if __name__ == "__main__":
    sitemap_df = pd.read_csv(SITEMAP_PATH)
    content_dfs = []
    # Go through each URL and scrape the content
    for url in tqdm(sitemap_df["URL"].to_list()):
        headers, content, div_classes = web_scraper(BASE_URL + url)
        # Just checking (and logging) the unlikely case of multiple headers in a div section
        has_multiple_items(headers)
        # Join lists of lists into a single list (unlikely that there is more than one header in a div class)
        headers = [". ".join(h) for h in headers]
        div_classes = ["; ".join(d) for d in div_classes]
        # Append to the list of dataframes
        content_dfs.append(
            pd.DataFrame(
                {
                    "URL": url,
                    "content_type": div_classes,
                    "header": headers,
                    "content": content,
                    "content_no": range(len(content)),
                }
            )
        )
        # Small delay between requests to be polite to the server
        sleep(0.1)
    content_df = pd.concat(content_dfs, ignore_index=True)

    # Some light post-processing
    content_processed_df = (
        content_df
        # Remove rows with empty content
        .loc[content_df["content"].str.len() > 0]
        # Remove rows with excluded headers
        .loc[~content_df["header"].isin(EXCLUDED_HEADERS)]
        # For each unique URL, reindex the content_no to follow subsequent integers
        .assign(content_no=lambda x: x.groupby("URL")["content_no"].rank(method="first").astype(int))
    )
    # Export as a separate step: DataFrame.to_csv(path) returns None, so chaining it
    # onto the expression above would have bound content_processed_df to None
    content_processed_df.to_csv(OUTPUT_PATH, index=False)
time import sleep 7 | from typing import Dict 8 | from typing import List 9 | 10 | import openai 11 | 12 | from dotenv import load_dotenv 13 | from flask import Flask 14 | from flask import request 15 | from twilio.rest import Client 16 | from twilio.twiml.messaging_response import MessagingResponse 17 | 18 | from genai.eyfs.eyfs import TextGenerator 19 | from genai.utils import read_json 20 | 21 | 22 | load_dotenv() 23 | # Twilio settings 24 | client = Client(os.environ["TWILIO_ACCOUNT_SID"], os.environ["TWILIO_AUTH_TOKEN"]) 25 | # OpenAI model 26 | LLM = "gpt-3.5-turbo" 27 | TEMPERATURE = 0.5 28 | openai.api_key = os.environ["OPENAI_API_KEY"] 29 | 30 | # Prepare ELI3 base prompt 31 | ELI3_MESSAGES = read_json("src/genai/whatsapp_bot/prompts/eli3/eli3.json") 32 | 33 | # Prepare EYFS base prompt and parameters 34 | areas_of_learning = [ 35 | "Communication and Language", 36 | "Personal, Social and Emotional Development", 37 | "Physical Development", 38 | "Literacy", 39 | "Mathematics", 40 | "Understanding the World", 41 | "Expressive Arts and Design", 42 | ] 43 | areas_of_learning_desc = read_json("src/genai/eyfs/areas_of_learning.json") 44 | areas_of_learning_text = [v for k, v in areas_of_learning_desc.items() if k in areas_of_learning] 45 | eyfs_paths = [ 46 | "src/genai/eyfs/prompts/system.json", 47 | "src/genai/eyfs/prompts/context_and_task.json", 48 | "src/genai/eyfs/prompts/constraints.json", 49 | "src/genai/eyfs/prompts/situation.json", 50 | ] 51 | EYFS_MESSAGES = [read_json(path) for path in eyfs_paths] 52 | EYFS_PARAMETERS = { 53 | "areas_of_learning": areas_of_learning, 54 | "n_results": 5, 55 | "location": "Indoors or Outdoors", 56 | "areas_of_learning_text": areas_of_learning_text, 57 | } 58 | 59 | # Initiate the Flask app 60 | app = Flask(__name__) 61 | 62 | 63 | def format_activities_text(text: str) -> str: 64 | """Format the response from the EYFS generator for better display in WhatsApp""" 65 | text = ( 66 | text.replace("## Conversations", 
def format_activities_text(text: str) -> str:
    """Format the response from the EYFS generator for better display in WhatsApp

    Maps the generator's markdown headings and bold markers to WhatsApp's
    *bold* / _italic_ conventions.
    """
    text = (
        text.replace("## Conversations", "*Conversations*\n")
        .replace("## Games and Crafts", "*Games and Crafts*\n")
        .replace("**Activity description**", "_Activity description_")
        .replace("**Areas of learning**", "_Areas of learning_")
    )
    # replace markdown subheadings with bold italics
    text = re.sub(r"###\s*(.+)", r"*_\1_*", text)
    return text


def generate_reply(incoming_message: str, sender_contact: str, receiver_contact: str) -> str:
    """Parse message text and return an appropriate response.

    Presently supports two types of responses: 'explain' and 'activities'.
    The activities response is threaded to allow for longer response times: a very
    basic workaround to the 15 second timeout limit imposed by Twilio.

    Args:
        incoming_message:
            Message text
        sender_contact:
            Sender's contact, follows a format 'whatsapp:+'
        receiver_contact:
            Receiver's contact (ie, my contact), follows a format 'whatsapp:+'

    Returns:
        Response text
    """
    text_message = incoming_message.lower()

    # 'explain' response
    if text_message.startswith("explain"):
        response = TextGenerator.generate(
            model=LLM,
            temperature=TEMPERATURE,
            messages=[ELI3_MESSAGES.copy()],
            message_kwargs={"input": text_message[7:].strip()},
        )
        return response["choices"][0]["message"]["content"]
    # 'activities' response
    elif "activities" in text_message[0:10]:
        # Build a per-request parameter dict instead of mutating the module-level
        # EYFS_PARAMETERS: the dict is handed to a background thread, so concurrent
        # requests would otherwise clobber each other's 'description'
        parameters = {**EYFS_PARAMETERS, "description": text_message}
        thread = Thread(
            target=send_text, args=[copy.deepcopy(EYFS_MESSAGES), parameters, receiver_contact, sender_contact]
        )
        thread.start()
        return "Thank you for your question. I am thinking..."
    else:
        # Return a default message
        return (
            'Write "Explain " to explain a concept to a 3-year old \n\n or'
            + '\n\n "Activities " to get activity ideas'
        )
def send_text(messages: List[Dict], message_kwargs: Dict, my_contact: str, receiver_contact: str) -> None:
    """Generate text messages and send them to a given contact

    Args:
        messages:
            List of messages to be used as prompts
        message_kwargs:
            Dictionary of keyword arguments to be passed to the TextGenerator
        my_contact:
            Sender's contact, follows a format 'whatsapp:+'
        receiver_contact:
            Receiver's contact (ie, my contact), follows a format 'whatsapp:+'
    """
    # Generate the response text for the given prompts
    completion = TextGenerator.generate(
        model=LLM,
        temperature=TEMPERATURE,
        messages=messages,
        message_kwargs=message_kwargs,
    )
    # Format the output for better display on WhatsApp
    body = format_activities_text(completion["choices"][0]["message"]["content"])
    # WhatsApp messages are capped at 1600 characters, so send 1500-character chunks
    for start in range(0, len(body), 1500):
        client.messages.create(body=body[start : start + 1500], from_=my_contact, to=receiver_contact)
        sleep(0.5)


@app.route("/")
def hello_world() -> str:
    """Information message"""
    return "Nesta generative AI prototype: WhatsApp bot for suggesting kids activities"


@app.route("/text", methods=["POST"])
def text_reply() -> str:
    """Respond to incoming messages"""
    form = request.form
    reply = generate_reply(
        incoming_message=form.get("Body"),
        sender_contact=form.get("From"),
        receiver_contact=form.get("To"),
    )
    twiml = MessagingResponse()
    twiml.message(reply)
    return str(twiml)


if __name__ == "__main__":
    app.run(debug=False, host="0.0.0.0", port=int(os.environ.get("PORT", 5000)))
# Initiate the Flask app
app = Flask(__name__)


def write_to_s3(key: str, secret: str, s3_path: str, filename: str, data: dict, how: str = "a") -> None:
    """
    Write data to a jsonl file in S3.

    Args:
        key (str): AWS access key ID.
        secret (str): AWS secret access key.
        s3_path (str): S3 bucket path.
        filename (str): Name of the file to write to (without the .jsonl extension).
        data (dict): Data to write to the file.
        how (str, optional): How to write to the file. Default is "a" for append. Use "w" to overwrite.

    """
    fs = s3fs.S3FileSystem(key=key, secret=secret)
    # Interpolate the filename argument into the path; previously it was ignored,
    # so every caller wrote to the same hard-coded file regardless of the name passed
    with fs.open(f"{s3_path}/{filename}.jsonl", how) as f:
        f.write(f"{json.dumps(data)}\n")


def read_from_s3(key: str, secret: str, s3_path: str, filename: str) -> list:
    """
    Read data from a jsonl file in S3 and return as a list of dictionaries.

    Args:
        key (str): AWS access key ID.
        secret (str): AWS secret access key.
        s3_path (str): S3 bucket path.
        filename (str): Name of the file to read from (without the .jsonl extension).

    Returns:
        list: A list of dictionaries, each representing a line in the jsonl file.
    """
    fs = s3fs.S3FileSystem(key=key, secret=secret)
    data_list = []

    # One JSON document per line (jsonl)
    with fs.open(f"{s3_path}/{filename}.jsonl", "r") as file:
        for line in file:
            data_list.append(json.loads(line))

    return data_list
def fetch_message_history(sender_contact: str, create_new: bool = True) -> InMemoryMessageHistory:
    """
    Find a folder on s3 corresponding to the sender; if one doesn't exist, optionally create it

    Args:
        sender_contact (str): Sender's contact, follows a format 'whatsapp:+'
        create_new (bool, optional): Whether to create a new folder if one doesn't exist. Defaults to True.

    Returns:
        InMemoryMessageHistory: Message history (empty when the sender is new)
    """
    message_history = InMemoryMessageHistory()
    try:
        messages = read_from_s3(
            AWS_KEY,
            AWS_SECRET,
            f"{S3_PATH}/{sender_contact}",
            "messages",
        )
        for message in messages:
            message_history.add_message(message)
        return message_history
    except FileNotFoundError:
        if not create_new:
            # Bare raise keeps the original exception and traceback intact
            raise
        # First contact from this sender: create their log file with a marker entry.
        # NOTE(review): the marker is written to S3 only, not added to the returned
        # in-memory history — confirm this asymmetry is intended.
        write_to_s3(
            AWS_KEY,
            AWS_SECRET,
            f"{S3_PATH}/{sender_contact}",
            "messages",
            {"role": "system", "content": "Welcome to the Parenting Chatbot!"},
            how="w",
        )
        return message_history


def send_links(link: str, my_contact: str, receiver_contact: str) -> None:
    """Send a 'Read more' link to a given contact

    Args:
        link:
            Url to share
        my_contact:
            Sender's contact, follows a format 'whatsapp:+'
        receiver_contact:
            Receiver's contact (ie, my contact), follows a format 'whatsapp:+'
    """
    text = f"Read more: {link}"
    client.messages.create(body=text, from_=my_contact, to=receiver_contact)


@app.route("/text", methods=["POST"])
def text_reply() -> str:
    """Respond to incoming messages"""
    receiver_contact = request.form.get("To")

    # Fetch message history for this sender
    sender_contact = request.form.get("From")
    message_history = fetch_message_history(sender_contact)

    # The incoming user message; may be augmented with references below
    prompt = request.form.get("Body")

    # Search the vector index for relevant NHS Start for Life content
    search_results = query_pinecone(
        index=pinecone_index,
        encoded_query=get_embedding(prompt),
        top_n=3,
        filters={
            "source": {"$eq": "nhs_full_page"},
        },
    )

    # Ask the LLM to keep only the references that are relevant to the question
    nhs_texts = []
    nhs_urls = []
    for result in search_results:
        # NOTE(review): unlike the main call below, no model= is passed here, so
        # TextGenerator's default model is used — confirm this is intended
        pred = TextGenerator.generate(
            temperature=0.0,
            messages=[filter_refs_system_message, filter_refs_user_message],
            message_kwargs={"text": result["metadata"]["text"], "question": prompt},
            functions=[filter_refs_function.to_prompt()],
            function_call={"name": filter_refs_function.name},
        )

        pred = json.loads(pred["choices"][0]["message"]["function_call"]["arguments"])["prediction"]

        if pred:
            nhs_texts.append(result["metadata"]["text"])
            nhs_urls.append(result["metadata"]["url"])

    if nhs_texts:
        # Keep the joined references in their own variable instead of rebinding
        # nhs_texts from list to str
        references = "\n===\n".join(nhs_texts)

        # Add references to the prompt
        prompt = f"""###NHS Start for Life references###\n{references}\n\n###User message###\n{prompt} \n\n###Additional instructions###\nAnswer in one or two sentences, not more."""  # noqa: B950

    # Log the (possibly augmented) user message
    message_history.add_message({"role": "user", "content": prompt})
    write_to_s3(
        AWS_KEY,
        AWS_SECRET,
        f"{S3_PATH}/{sender_contact}",
        "messages",
        message_history.messages[-1],
        how="a",
    )

    # Generate the assistant reply from the full conversation history
    response = TextGenerator.generate(
        model=LLM,
        temperature=TEMPERATURE,
        messages=message_history.get_messages(),
        message_kwargs=None,
    )
    response = response["choices"][0]["message"]["content"]
    message_history.add_message({"role": "assistant", "content": response})

    # Log the assistant reply
    write_to_s3(
        AWS_KEY,
        AWS_SECRET,
        f"{S3_PATH}/{sender_contact}",
        "messages",
        message_history.messages[-1],
        how="a",
    )
    resp = MessagingResponse()
    resp.message(response)

    # Share the top reference link in a separate, threaded message
    # (only when the incoming message came via WhatsApp)
    if nhs_urls and ("whatsapp" in receiver_contact):
        thread = Thread(target=send_links, args=[nhs_urls[0], receiver_contact, sender_contact])
        thread.start()

    return str(resp)


if __name__ == "__main__":
    port = int(os.environ.get("PORT", 5000))
    app.run(debug=False, host="0.0.0.0", port=port)