├── .coveragerc ├── .github └── workflows │ └── test_package.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── api ├── __init__.py ├── api.py ├── final_svc.sav ├── requirements.txt ├── schemas.py └── test_api_locally.py ├── coverage.xml ├── current_best_model ├── final_bert │ ├── bert_perf.xlsx │ └── bert_summary.txt ├── final_ensemble │ ├── ensemble_perf.xlsx │ └── ensemble_summary.txt ├── final_svc │ ├── final_svc.sav │ ├── final_svc_perf.xlsx │ └── final_svc_summary.txt ├── final_xgb │ ├── final_xgb.sav │ ├── final_xgb_perf.xlsx │ └── final_xgb_summary.txt └── sentiment │ ├── bert_sentiment.txt │ ├── confusion_matrix_3_counts.png │ ├── confusion_matrix_3_percentages.png │ ├── confusion_matrix_5_counts.png │ └── confusion_matrix_5_percentages.png ├── datasets ├── README.md ├── phase_1 │ ├── README.md │ ├── co.csv │ ├── co_multi_label.csv │ └── text_data.csv ├── testing │ └── test_data.csv └── v6framework_230831.csv ├── docker_README.md ├── docker_data ├── data_in │ ├── file_01.json │ └── file_02.json └── data_out │ └── file_01.json ├── docker_run.py ├── docs ├── about.md ├── create_docs.py ├── getting started │ ├── install.md │ ├── package.md │ ├── training_new_model.md │ └── using_trained_model.md ├── index.md ├── main.css └── reference │ ├── API │ ├── API.md │ ├── quick_API.md │ └── slow_API.md │ ├── Docker │ └── docker_README.md │ └── pxtextmining │ ├── factories │ ├── factory_data_load_and_split.md │ ├── factory_model_performance.md │ ├── factory_pipeline.md │ ├── factory_predict_unlabelled_text.md │ └── factory_write_results.md │ ├── helpers │ └── text_preprocessor.md │ └── pipelines │ ├── multilabel_pipeline.md │ └── sentiment_pipeline.md ├── mkdocs.yml ├── poetry.lock ├── pxtextmining ├── __init__.py ├── factories │ ├── __init__.py │ ├── factory_data_load_and_split.py │ ├── factory_model_performance.py │ ├── factory_pipeline.py │ ├── factory_predict_unlabelled_text.py │ └── factory_write_results.py ├── helpers │ ├── __init__.py │ └── text_preprocessor.py ├── params.py └── pipelines │ ├── __init__.py │ ├── multilabel_pipeline.py │ └── sentiment_pipeline.py ├── pyproject.toml ├── setup.py ├── test_multilabel └── dummy_metrics.txt └── tests ├── __init__.py ├── conftest.py ├── test_api.py ├── test_data_load_and_split.py ├── test_docker_run.py ├── test_factory_pipeline.py ├── test_helpers.py ├── test_model_performance.py ├── test_multilabel_pipeline.py ├── test_predict_unlabelled_text.py ├── test_sentiment_pipeline.py └── test_write_results.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = tests\* 3 | *\__init__.py 4 | *\params.py 5 | api\test_api_locally.py 6 | setup.py 7 | test_rules.py 8 | 9 | source = api 10 | pxtextmining 11 | 12 | [report] 13 | exclude_lines = 14 | if __name__ == .__main__.: 15 | -------------------------------------------------------------------------------- /.github/workflows/test_package.yaml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | 3 | on: 4 | push: 5 | branches: [ $default-branch ] 6 | pull_request: 7 | branches: 8 | - development 9 | - main 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.9", "3.10"] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Install poetry 23 | run: pipx install poetry 24 | - name: Ruff 25 | uses: chartboost/ruff-action@v1 26 | - name: Set up 
Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v4 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | cache: 'poetry' 31 | - name: Install package 32 | run: poetry install --with dev 33 | - name: Run tests 34 | run: poetry run pytest tests/* -sx 35 | - name: Upload coverage reports to Codecov 36 | if: ${{ matrix.python-version == '3.10' }} 37 | uses: codecov/codecov-action@v3 38 | env: 39 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .python-version 2 | *__pycache__* 3 | notebooks/* 4 | my.conf 5 | site/ 6 | dist/ 7 | .vscode/ 8 | datasets/hidden/* 9 | test_multilabel/* 10 | .env 11 | api/rsconnect-python/* 12 | .coverage 13 | *_labels.xlsx 14 | current_best_model/final_bert/bert_multilabel 15 | current_best_model/sentiment/bert_sentiment 16 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '(build|datasets|current_best_multilabel|docs)/.*' 2 | 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v2.3.0 6 | hooks: 7 | - id: check-added-large-files 8 | name: Check for files larger than 75 MB 9 | args: [ "--maxkb=75000" ] 10 | - id: end-of-file-fixer 11 | name: Check for a blank line at the end of scripts (auto-fixes) 12 | exclude: 'json' 13 | - id: trailing-whitespace 14 | name: Check for trailing whitespaces (auto-fixes) 15 | - repo: https://github.com/pycqa/isort 16 | rev: 5.12.0 17 | hooks: 18 | - id: isort 19 | name: isort - Sort Python imports (auto-fixes) 20 | args: [ "--profile", "black", "--filter-files" ] 21 | - repo: https://github.com/astral-sh/ruff-pre-commit 22 | rev: v0.0.272 23 | hooks: 24 | - id: ruff 25 | name: Ruff linting 26 | - repo: https://github.com/psf/black 27 | rev: 22.10.0 28 | hooks: 29 | - id: black 30 | name: black - consistent Python code formatting (auto-fixes) 31 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.13-slim 2 | VOLUME /data 3 | 4 | COPY pxtextmining /pxtextmining 5 | COPY pyproject.toml /pyproject.toml 6 | COPY docker_README.md /README.md 7 | RUN pip install --upgrade pip setuptools \ 8 | && pip install . 
\ 9 | && rm -rf /root/.cache 10 | COPY current_best_model/sentiment/bert_sentiment bert_sentiment 11 | COPY current_best_model/final_bert/bert_multilabel bert_multilabel 12 | COPY current_best_model/final_svc/final_svc.sav /final_svc.sav 13 | COPY current_best_model/final_xgb/final_xgb.sav /final_xgb.sav 14 | COPY --chmod=755 docker_run.py docker_run.py 15 | 16 | LABEL org.opencontainers.image.source=https://github.com/the-strategy-unit/pxtextmining 17 | 18 | ENTRYPOINT ["python3", "docker_run.py"] 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 NHS England 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | coverage: 2 | pytest --cov=. tests/ --cov-report xml:coverage.xml --cov-report term 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pxtextmining: Text Classification of Patient Experience feedback 2 | 3 | ## Project description 4 | **pxtextmining** is a Python package for classifying patient feedback comments collected via the [NHS England Friends and Family Test](https://www.england.nhs.uk/fft/) (FFT). It is part of the [Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/), funded by NHS England and hosted by Nottinghamshire Healthcare NHS Foundation Trust. 5 | 6 | __We are working openly by [open-sourcing](https://github.com/The-Strategy-Unit/pxtextmining/blob/main/LICENSE) the analysis code and data where possible to promote replication, reproducibility and further developments. Pull requests are more than welcome.__ 7 | 8 | ## Documentation and installation 9 | 10 | Full documentation, including installation instructions, is available on our [documentation page](https://the-strategy-unit.github.io/pxtextmining/). 
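
## Quick example

For a quick illustration of what the package does, the sketch below mirrors how `api/api.py` loads the pickled SVC model and generates multilabel predictions. This is a minimal sketch, assuming the repository has been cloned so that `current_best_model/final_svc/final_svc.sav` is present locally; the example comments are made up.

```python
# Minimal sketch of multilabel prediction, mirroring api/api.py.
# Assumes a cloned repository with the trained SVC model available locally.
import pickle

import pandas as pd

from pxtextmining.factories.factory_predict_unlabelled_text import (
    predict_multilabel_sklearn,
)
from pxtextmining.params import minor_cats

with open("current_best_model/final_svc/final_svc.sav", "rb") as f:
    model = pickle.load(f)

# A pandas Series of comments, indexed by comment ID.
text_to_predict = pd.Series(
    ["The nurses were friendly.", "Parking was awful."],
    index=["1", "2"],
)
preds_df = predict_multilabel_sklearn(
    text_to_predict, model, labels=minor_cats, additional_features=False
)
print(preds_df["labels"])
```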
11 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/api/__init__.py -------------------------------------------------------------------------------- /api/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import List 4 | 5 | import pandas as pd 6 | import schemas 7 | from fastapi import FastAPI 8 | 9 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 10 | predict_multilabel_sklearn, 11 | ) 12 | from pxtextmining.params import minor_cats 13 | 14 | description = """ 15 | This API is for classifying patient experience qualitative data, 16 | utilising the models trained as part of the pxtextmining project. 17 | """ 18 | 19 | tags_metadata = [ 20 | {"name": "index", "description": "Basic page to test if API is working."}, 21 | { 22 | "name": "multilabel", 23 | "description": "Generate multilabel predictions for given text.", 24 | }, 25 | ] 26 | 27 | 28 | app = FastAPI( 29 | title="pxtextmining API", 30 | description=description, 31 | version="1.0.0", 32 | contact={ 33 | "name": "Patient Experience Qualitative Data Categorisation", 34 | "url": "https://the-strategy-unit.github.io/PatientExperience-QDC/", 35 | "email": "chris.beeley1@nhs.net", 36 | }, 37 | license_info={ 38 | "name": "MIT License", 39 | "url": "https://github.com/the-strategy-unit/pxtextmining/blob/main/LICENSE", 40 | }, 41 | openapi_tags=tags_metadata, 42 | ) 43 | 44 | 45 | @app.get("/", response_model=schemas.Test, tags=["index"]) 46 | def index(): 47 | return {"test": "Hello"} 48 | 49 | 50 | @app.post( 51 | "/predict_multilabel", 52 | response_model=List[schemas.MultilabelOut], 53 | tags=["multilabel"], 54 | ) 55 | async def predict_multilabel(items: List[schemas.ItemIn]): 56 | """Accepts comment ids and comment text as JSON in a POST request. Makes predictions using trained SVC model. 57 | 58 | Args: 59 | items (List[ItemIn]): JSON list of dictionaries with the following compulsory keys: 60 | - `comment_id` (str) 61 | - `comment_text` (str) 62 | 63 | Returns: 64 | (dict): Keys are: `comment_id` and predicted `labels`. 
65 | """ 66 | 67 | # Process received data 68 | df = pd.DataFrame([i.dict() for i in items], dtype=str) 69 | df_for_preds = df.copy().rename( 70 | columns={"comment_id": "Comment ID", "comment_text": "FFT answer"} 71 | ) 72 | df_for_preds = df_for_preds.set_index("Comment ID") 73 | if df_for_preds.index.duplicated().sum() != 0: 74 | raise ValueError("comment_id must all be unique values") 75 | text_to_predict = df_for_preds["FFT answer"] 76 | # Make predictions 77 | model_path = "final_svc.sav" 78 | if not os.path.isfile(model_path): 79 | model_path = os.path.join("api", model_path) 80 | with open(model_path, "rb") as model: 81 | loaded_model = pickle.load(model) 82 | preds_df = predict_multilabel_sklearn( 83 | text_to_predict, loaded_model, labels=minor_cats, additional_features=False 84 | ) 85 | # Join predicted labels with received data 86 | preds_df["comment_id"] = preds_df.index.astype(str) 87 | merged = pd.merge(df, preds_df, how="left", on="comment_id") 88 | merged["labels"] = merged["labels"].fillna("").apply(list) 89 | for i in merged["labels"].index: 90 | label_list = merged.loc[i, "labels"] 91 | if len(label_list) < 1: 92 | merged.loc[i, "labels"].append("Labelling not possible") 93 | return_dict = merged[["comment_id", "labels"]].to_dict(orient="records") 94 | return return_dict 95 | -------------------------------------------------------------------------------- /api/final_svc.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/api/final_svc.sav -------------------------------------------------------------------------------- /api/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.1.0 ; python_version >= "3.8" and python_version < "3.11" 2 | anyio==4.2.0 ; python_version >= "3.8" and python_version < "3.11" 3 | astunparse==1.6.3 ; python_version >= "3.8" and python_version < "3.11" 4 | cachetools==5.3.2 ; python_version >= "3.8" and python_version < "3.11" 5 | certifi==2024.2.2 ; python_version >= "3.8" and python_version < "3.11" 6 | cfgv==3.4.0 ; python_version >= "3.8" and python_version < "3.11" 7 | charset-normalizer==3.3.2 ; python_version >= "3.8" and python_version < "3.11" 8 | click==8.1.7 ; python_version >= "3.8" and python_version < "3.11" 9 | colorama==0.4.6 ; python_version >= "3.8" and python_version < "3.11" and (sys_platform == "win32" or platform_system == "Windows") 10 | contourpy==1.1.1 ; python_version >= "3.8" and python_version < "3.11" 11 | coverage[toml]==7.4.1 ; python_version >= "3.8" and python_version < "3.11" 12 | cycler==0.12.1 ; python_version >= "3.8" and python_version < "3.11" 13 | distlib==0.3.8 ; python_version >= "3.8" and python_version < "3.11" 14 | exceptiongroup==1.2.0 ; python_version >= "3.8" and python_version < "3.11" 15 | fastapi==0.101.1 ; python_version >= "3.8" and python_version < "3.11" 16 | filelock==3.13.1 ; python_version >= "3.8" and python_version < "3.11" 17 | flatbuffers==23.5.26 ; python_version >= "3.8" and python_version < "3.11" 18 | fonttools==4.48.1 ; python_version >= "3.8" and python_version < "3.11" 19 | fsspec==2024.2.0 ; python_version >= "3.8" and python_version < "3.11" 20 | gast==0.4.0 ; python_version >= "3.8" and python_version < "3.11" 21 | google-auth-oauthlib==1.0.0 ; python_version >= "3.8" and python_version < "3.11" 22 | google-auth==2.27.0 ; python_version >= "3.8" and python_version < "3.11" 23 | 
google-pasta==0.2.0 ; python_version >= "3.8" and python_version < "3.11" 24 | grpcio==1.60.1 ; python_version >= "3.8" and python_version < "3.11" 25 | h11==0.14.0 ; python_version >= "3.8" and python_version < "3.11" 26 | h5py==3.10.0 ; python_version >= "3.8" and python_version < "3.11" 27 | httpcore==0.16.3 ; python_version >= "3.8" and python_version < "3.11" 28 | httpx==0.23.3 ; python_version >= "3.8" and python_version < "3.11" 29 | huggingface-hub==0.20.3 ; python_version >= "3.8" and python_version < "3.11" 30 | identify==2.5.34 ; python_version >= "3.8" and python_version < "3.11" 31 | idna==3.6 ; python_version >= "3.8" and python_version < "3.11" 32 | importlib-metadata==7.0.1 ; python_version >= "3.8" and python_version < "3.10" 33 | importlib-resources==6.1.1 ; python_version >= "3.8" and python_version < "3.10" 34 | iniconfig==2.0.0 ; python_version >= "3.8" and python_version < "3.11" 35 | jax==0.4.13 ; python_version >= "3.8" and python_version < "3.11" 36 | joblib==1.3.2 ; python_version >= "3.8" and python_version < "3.11" 37 | keras==2.12.0 ; python_version >= "3.8" and python_version < "3.11" 38 | kiwisolver==1.4.5 ; python_version >= "3.8" and python_version < "3.11" 39 | libclang==16.0.6 ; python_version >= "3.8" and python_version < "3.11" 40 | markdown==3.5.2 ; python_version >= "3.8" and python_version < "3.11" 41 | markupsafe==2.1.5 ; python_version >= "3.8" and python_version < "3.11" 42 | matplotlib==3.7.4 ; python_version >= "3.8" and python_version < "3.11" 43 | ml-dtypes==0.2.0 ; python_version >= "3.8" and python_version < "3.11" 44 | nodeenv==1.8.0 ; python_version >= "3.8" and python_version < "3.11" 45 | numpy==1.23.5 ; python_version >= "3.8" and python_version < "3.11" 46 | oauthlib==3.2.2 ; python_version >= "3.8" and python_version < "3.11" 47 | opt-einsum==3.3.0 ; python_version >= "3.8" and python_version < "3.11" 48 | packaging==23.2 ; python_version >= "3.8" and python_version < "3.11" 49 | pandas==1.5.3 ; python_version >= "3.8" and python_version < "3.11" 50 | pillow==10.2.0 ; python_version >= "3.8" and python_version < "3.11" 51 | platformdirs==4.2.0 ; python_version >= "3.8" and python_version < "3.11" 52 | pluggy==1.4.0 ; python_version >= "3.8" and python_version < "3.11" 53 | pre-commit==3.5.0 ; python_version >= "3.8" and python_version < "3.11" 54 | protobuf==4.25.2 ; python_version >= "3.8" and python_version < "3.11" 55 | pyasn1-modules==0.3.0 ; python_version >= "3.8" and python_version < "3.11" 56 | pyasn1==0.5.1 ; python_version >= "3.8" and python_version < "3.11" 57 | pydantic==1.10.14 ; python_version >= "3.8" and python_version < "3.11" 58 | pyparsing==3.1.1 ; python_version >= "3.8" and python_version < "3.11" 59 | pytest-cov==4.1.0 ; python_version >= "3.8" and python_version < "3.11" 60 | pytest-mock==3.12.0 ; python_version >= "3.8" and python_version < "3.11" 61 | pytest==7.4.4 ; python_version >= "3.8" and python_version < "3.11" 62 | python-dateutil==2.8.2 ; python_version >= "3.8" and python_version < "3.11" 63 | pytz==2024.1 ; python_version >= "3.8" and python_version < "3.11" 64 | pyyaml==6.0.1 ; python_version >= "3.8" and python_version < "3.11" 65 | regex==2023.12.25 ; python_version >= "3.8" and python_version < "3.11" 66 | requests-oauthlib==1.3.1 ; python_version >= "3.8" and python_version < "3.11" 67 | requests==2.31.0 ; python_version >= "3.8" and python_version < "3.11" 68 | rfc3986[idna2008]==1.5.0 ; python_version >= "3.8" and python_version < "3.11" 69 | rsa==4.9 ; python_version >= "3.8" and 
python_version < "3.11" 70 | ruff==0.0.272 ; python_version >= "3.8" and python_version < "3.11" 71 | safetensors==0.4.2 ; python_version >= "3.8" and python_version < "3.11" 72 | scikit-learn==1.0.2 ; python_version >= "3.8" and python_version < "3.11" 73 | scipy==1.10.1 ; python_version >= "3.8" and python_version < "3.11" 74 | setuptools==69.1.0 ; python_version >= "3.8" and python_version < "3.11" 75 | six==1.16.0 ; python_version >= "3.8" and python_version < "3.11" 76 | sniffio==1.3.0 ; python_version >= "3.8" and python_version < "3.11" 77 | starlette==0.27.0 ; python_version >= "3.8" and python_version < "3.11" 78 | tensorboard-data-server==0.7.2 ; python_version >= "3.8" and python_version < "3.11" 79 | tensorboard==2.12.3 ; python_version >= "3.8" and python_version < "3.11" 80 | tensorflow-estimator==2.12.0 ; python_version >= "3.8" and python_version < "3.11" 81 | tensorflow-io-gcs-filesystem==0.36.0 ; python_version >= "3.8" and python_version < "3.11" and platform_machine != "arm64" or python_version >= "3.8" and python_version < "3.11" and platform_system != "Darwin" 82 | tensorflow==2.12.0 ; python_version >= "3.8" and python_version < "3.11" 83 | termcolor==2.4.0 ; python_version >= "3.8" and python_version < "3.11" 84 | threadpoolctl==3.3.0 ; python_version >= "3.8" and python_version < "3.11" 85 | tokenizers==0.15.2 ; python_version >= "3.8" and python_version < "3.11" 86 | tomli==2.0.1 ; python_version >= "3.8" and python_version < "3.11" 87 | tornado==6.4 ; python_version >= "3.8" and python_version < "3.11" 88 | tqdm==4.66.2 ; python_version >= "3.8" and python_version < "3.11" 89 | transformers==4.37.2 ; python_version >= "3.8" and python_version < "3.11" 90 | typing-extensions==4.9.0 ; python_version >= "3.8" and python_version < "3.11" 91 | urllib3==2.2.0 ; python_version >= "3.8" and python_version < "3.11" 92 | uvicorn==0.20.0 ; python_version >= "3.8" and python_version < "3.11" 93 | virtualenv==20.25.0 ; python_version >= "3.8" and python_version < "3.11" 94 | werkzeug==3.0.1 ; python_version >= "3.8" and python_version < "3.11" 95 | wheel==0.42.0 ; python_version >= "3.8" and python_version < "3.11" 96 | wrapt==1.14.1 ; python_version >= "3.8" and python_version < "3.11" 97 | xgboost==1.7.6 ; python_version >= "3.8" and python_version < "3.11" 98 | zipp==3.17.0 ; python_version >= "3.8" and python_version < "3.10" 99 | -------------------------------------------------------------------------------- /api/schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Test(BaseModel): 5 | test: str 6 | 7 | class Config: 8 | schema_extra = {"example": {"test": "Hello"}} 9 | 10 | 11 | class ItemIn(BaseModel): 12 | comment_id: str 13 | comment_text: str 14 | 15 | class Config: 16 | schema_extra = { 17 | "example": { 18 | "comment_id": "01", 19 | "comment_text": "Nurses were friendly. 
Parking was awful.", 20 | } 21 | } 22 | 23 | 24 | class MultilabelOut(BaseModel): 25 | comment_id: str 26 | labels: list 27 | 28 | class Config: 29 | schema_extra = { 30 | "example": { 31 | "comment_id": "01", 32 | "labels": ["Staff manner & personal attributes", "Parking"], 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /api/test_api_locally.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import pandas as pd 4 | import requests 5 | 6 | """ 7 | To test the API, first in terminal, run this command to launch uvicorn server on http://127.0.0.1:8000 8 | uvicorn api.api:app --reload 9 | Then you can run this test_api script to check if the API is behaving as it should locally 10 | """ 11 | 12 | 13 | def test_json_predictions(json): 14 | response = requests.post("http://127.0.0.1:8000/predict_multilabel", json=json) 15 | return response 16 | 17 | 18 | if __name__ == "__main__": 19 | start = time.time() 20 | df = pd.read_csv("datasets/hidden/merged_230612.csv")[["Comment ID", "FFT answer"]][ 21 | :2000 22 | ] 23 | df = df.rename( 24 | columns={"Comment ID": "row_id", "FFT answer": "comment_txt"} 25 | ).dropna() 26 | df = df[["row_id", "comment_txt"]].copy().set_index("row_id")[:1000] 27 | js = [] 28 | for i in df.index: 29 | js.append({"comment_id": str(i), "comment_text": df.loc[i]["comment_txt"]}) 30 | print("The JSON that was sent looks like:") 31 | print(js[:5]) 32 | print("The JSON that is returned is:") 33 | returned_json = test_json_predictions(js).json() 34 | finish = time.time() 35 | total = finish - start 36 | print(f"Time taken: {total} seconds") 37 | print(returned_json) 38 | # json_object = json.dumps(returned_json, indent=4) 39 | # with open("predictions.json", "w") as outfile: 40 | # outfile.write(json_object) 41 | -------------------------------------------------------------------------------- /current_best_model/final_bert/bert_perf.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_bert/bert_perf.xlsx -------------------------------------------------------------------------------- /current_best_model/final_bert/bert_summary.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 42 4 | 5 | 6 | Model: "DistilBERT" 7 | _________________________________________________________________ 8 | Layer (type) Output Shape Param # 9 | ================================================================= 10 | input_ids (InputLayer) [(None, 150)] 0 11 | 12 | distilbert (TFDistilBertMai TFBaseModelOutput(last_h 66362880 13 | nLayer) idden_state=(None, 150, 14 | 768), 15 | hidden_states=None, att 16 | entions=None) 17 | 18 | tf.__operators__.getitem (S (None, 768) 0 19 | licingOpLambda) 20 | 21 | pooled_output (Dropout) (None, 768) 0 22 | 23 | output (Dense) (None, 32) 24608 24 | 25 | ================================================================= 26 | Total params: 66,387,488 27 | Trainable params: 66,387,488 28 | Non-trainable params: 0 29 | _________________________________________________________________ 30 | 31 | 32 | Training time: 8:47:32 33 | 34 | exact_accuracy: 0.5759920139755428 35 | hamming_loss: 0.024114050411779386 36 | macro_jaccard_score: 0.528819440670572 37 | macro_roc_auc: 0.9619264849220406 38 | 
Label ranking average precision: 0.8684599465486575 39 | 40 | Classification report: 41 | precision recall f1-score support 42 | 43 | Organisation & efficiency 0.47 0.82 0.60 102 44 | Funding & use of financial resources 0.71 0.68 0.69 25 45 | Staff manner & personal attributes 0.94 0.86 0.90 1431 46 | Competence & training 0.61 0.52 0.57 164 47 | Unspecified communication 0.65 0.61 0.63 36 48 | Staff listening, understanding & involving patients 0.57 0.76 0.65 361 49 | Information directly from staff during care 0.78 0.76 0.77 390 50 | Information provision & guidance 0.68 0.38 0.49 90 51 | Being kept informed, clarity & consistency of information 0.49 0.59 0.53 183 52 | Contacting services 0.69 0.59 0.63 100 53 | Appointment arrangements 0.76 0.55 0.64 261 54 | Appointment method 0.63 0.61 0.62 31 55 | Timeliness of care 0.73 0.73 0.73 529 56 | Pain management 0.80 0.56 0.66 43 57 | Discharge 0.72 0.28 0.41 46 58 | Cleanliness, tidiness & infection control 0.87 0.73 0.79 107 59 | Service location 0.85 0.52 0.65 86 60 | Transport to/ from services 0.70 0.65 0.68 78 61 | Parking 0.94 0.89 0.91 18 62 | Electronic entertainment 0.94 0.74 0.83 23 63 | Feeling safe 0.75 0.78 0.77 23 64 | Mental Health Act 0.67 0.31 0.42 13 65 | Labelling not possible 1.00 1.00 1.00 238 66 | Supplying & understanding medication 0.75 0.69 0.72 59 67 | Activities & access to fresh air 0.92 0.63 0.75 54 68 | Food & drink provision & facilities 0.88 0.85 0.87 106 69 | Sensory experience 0.78 0.85 0.81 67 70 | Interaction with family/ carers 0.61 0.34 0.44 123 71 | Positive experience & gratitude 0.86 0.88 0.87 938 72 | Continuity of care 0.72 0.30 0.42 290 73 | Environment, facilities & equipment 0.77 0.58 0.66 202 74 | Staffing levels & responsiveness 0.47 0.44 0.45 194 75 | 76 | micro avg 0.78 0.72 0.75 6411 77 | macro avg 0.74 0.64 0.67 6411 78 | weighted avg 0.79 0.72 0.74 6411 79 | samples avg 0.81 0.78 0.78 6411 80 | -------------------------------------------------------------------------------- /current_best_model/final_ensemble/ensemble_perf.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_ensemble/ensemble_perf.xlsx -------------------------------------------------------------------------------- /current_best_model/final_ensemble/ensemble_summary.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 42 4 | 5 | 6 | Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()), 7 | ('multioutputclassifier', 8 | MultiOutputClassifier(estimator=SVC(C=15, cache_size=1000, 9 | class_weight='balanced', 10 | max_iter=1000, 11 | probability=True)))]) 12 | Model: "DistilBERT" 13 | _________________________________________________________________ 14 | Layer (type) Output Shape Param # 15 | ================================================================= 16 | input_ids (InputLayer) [(None, 150)] 0 17 | 18 | distilbert (TFDistilBertMai TFBaseModelOutput(last_h 66362880 19 | nLayer) idden_state=(None, 150, 20 | 768), 21 | hidden_states=None, att 22 | entions=None) 23 | 24 | tf.__operators__.getitem (S (None, 768) 0 25 | licingOpLambda) 26 | 27 | pooled_output (Dropout) (None, 768) 0 28 | 29 | output (Dense) (None, 32) 24608 30 | 31 | ================================================================= 32 | Total params: 66,387,488 33 | Trainable params: 
66,387,488 34 | Non-trainable params: 0 35 | _________________________________________________________________ 36 | 37 | Pipeline(steps=[('tfidfvectorizer', 38 | TfidfVectorizer(max_df=0.99, min_df=6, ngram_range=(1, 2))), 39 | ('xgbclassifier', 40 | XGBClassifier(base_score=None, booster=None, callbacks=None, 41 | colsample_bylevel=None, colsample_bynode=None, 42 | colsample_bytree=None, 43 | early_stopping_rounds=None, 44 | enable_categorical=False, eval_metric=None, 45 | feature_types=None, gamma=0.3, gpu_id=None, 46 | grow_policy=None, importance_type=None, 47 | interaction_constraints=None, learning_rate=None, 48 | max_bin=None, max_cat_threshold=None, 49 | max_cat_to_onehot=None, max_delta_step=None, 50 | max_depth=4, max_leaves=None, 51 | min_child_weight=0.5, missing=nan, 52 | monotone_constraints=None, n_estimators=200, 53 | n_jobs=None, num_parallel_tree=None, 54 | predictor=None, random_state=None, ...))]) 55 | 56 | 57 | Ensembling method: Average of predicted probabilities for each model taken. Threshold set at 0.3 58 | 59 | exact_accuracy: 0.560019965061143 60 | hamming_loss: 0.023146992762665335 61 | macro_jaccard_score: 0.5644485328931894 62 | macro_roc_auc: 0.9709795934716587 63 | Label ranking average precision: 0.8805306557959275 64 | 65 | Classification report: 66 | precision recall f1-score support 67 | 68 | Organisation & efficiency 0.49 0.75 0.59 102 69 | Funding & use of financial resources 0.64 0.64 0.64 25 70 | Staff manner & personal attributes 0.90 0.90 0.90 1431 71 | Competence & training 0.74 0.57 0.64 164 72 | Unspecified communication 0.61 0.64 0.62 36 73 | Staff listening, understanding & involving patients 0.64 0.75 0.69 361 74 | Information directly from staff during care 0.73 0.80 0.76 390 75 | Information provision & guidance 0.64 0.51 0.57 90 76 | Being kept informed, clarity & consistency of information 0.56 0.66 0.60 183 77 | Contacting services 0.72 0.65 0.68 100 78 | Appointment arrangements 0.73 0.69 0.71 261 79 | Appointment method 0.65 0.65 0.65 31 80 | Timeliness of care 0.67 0.82 0.74 529 81 | Pain management 0.86 0.72 0.78 43 82 | Discharge 0.80 0.52 0.63 46 83 | Cleanliness, tidiness & infection control 0.91 0.86 0.88 107 84 | Service location 0.85 0.59 0.70 86 85 | Transport to/ from services 0.74 0.64 0.68 78 86 | Parking 0.94 0.94 0.94 18 87 | Electronic entertainment 0.94 0.65 0.77 23 88 | Feeling safe 0.69 0.87 0.77 23 89 | Mental Health Act 0.75 0.23 0.35 13 90 | Labelling not possible 1.00 1.00 1.00 238 91 | Supplying & understanding medication 0.73 0.69 0.71 59 92 | Activities & access to fresh air 0.76 0.76 0.76 54 93 | Food & drink provision & facilities 0.87 0.85 0.86 106 94 | Sensory experience 0.80 0.79 0.80 67 95 | Interaction with family/ carers 0.59 0.45 0.51 123 96 | Positive experience & gratitude 0.80 0.91 0.85 938 97 | Continuity of care 0.65 0.58 0.61 290 98 | Environment, facilities & equipment 0.71 0.67 0.69 202 99 | Staffing levels & responsiveness 0.55 0.56 0.56 194 100 | 101 | micro avg 0.76 0.78 0.77 6411 102 | macro avg 0.74 0.70 0.71 6411 103 | weighted avg 0.76 0.78 0.77 6411 104 | samples avg 0.79 0.83 0.79 6411 105 | -------------------------------------------------------------------------------- /current_best_model/final_svc/final_svc.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_svc/final_svc.sav 
-------------------------------------------------------------------------------- /current_best_model/final_svc/final_svc_perf.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_svc/final_svc_perf.xlsx -------------------------------------------------------------------------------- /current_best_model/final_svc/final_svc_summary.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 42 4 | 5 | 6 | Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()), 7 | ('multioutputclassifier', 8 | MultiOutputClassifier(estimator=SVC(C=15, cache_size=1000, 9 | class_weight='balanced', 10 | max_iter=1000, 11 | probability=True)))]) 12 | 13 | 14 | Training time: 0:08:43 15 | 16 | exact_accuracy: 0.4634389817818817 17 | hamming_loss: 0.029105315697529322 18 | macro_jaccard_score: 0.48982370495986105 19 | macro_roc_auc: 0.9515989884263054 20 | Label ranking average precision: 0.8440659360100616 21 | 22 | Classification report: 23 | precision recall f1-score support 24 | 25 | Organisation & efficiency 0.65 0.50 0.57 102 26 | Funding & use of financial resources 0.62 0.64 0.63 25 27 | Staff manner & personal attributes 0.85 0.86 0.86 1431 28 | Competence & training 0.78 0.41 0.54 164 29 | Unspecified communication 0.70 0.44 0.54 36 30 | Staff listening, understanding & involving patients 0.65 0.66 0.65 361 31 | Information directly from staff during care 0.77 0.71 0.74 390 32 | Information provision & guidance 0.67 0.36 0.46 90 33 | Being kept informed, clarity & consistency of information 0.59 0.45 0.51 183 34 | Contacting services 0.72 0.59 0.65 100 35 | Appointment arrangements 0.71 0.59 0.65 261 36 | Appointment method 0.78 0.45 0.57 31 37 | Timeliness of care 0.60 0.71 0.65 529 38 | Pain management 0.88 0.67 0.76 43 39 | Discharge 0.77 0.37 0.50 46 40 | Cleanliness, tidiness & infection control 0.94 0.82 0.88 107 41 | Service location 0.84 0.56 0.67 86 42 | Transport to/ from services 0.67 0.46 0.55 78 43 | Parking 1.00 0.83 0.91 18 44 | Electronic entertainment 1.00 0.57 0.72 23 45 | Feeling safe 0.67 0.52 0.59 23 46 | Mental Health Act 0.67 0.15 0.25 13 47 | Labelling not possible 1.00 1.00 1.00 238 48 | Supplying & understanding medication 0.69 0.59 0.64 59 49 | Activities & access to fresh air 0.71 0.72 0.72 54 50 | Food & drink provision & facilities 0.88 0.70 0.78 106 51 | Sensory experience 0.81 0.69 0.74 67 52 | Interaction with family/ carers 0.57 0.32 0.41 123 53 | Positive experience & gratitude 0.60 0.85 0.70 938 54 | Continuity of care 0.52 0.59 0.55 290 55 | Environment, facilities & equipment 0.77 0.55 0.64 202 56 | Staffing levels & responsiveness 0.56 0.42 0.48 194 57 | 58 | micro avg 0.71 0.70 0.71 6411 59 | macro avg 0.74 0.59 0.64 6411 60 | weighted avg 0.72 0.70 0.70 6411 61 | samples avg 0.74 0.76 0.72 6411 62 | -------------------------------------------------------------------------------- /current_best_model/final_xgb/final_xgb.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_xgb/final_xgb.sav -------------------------------------------------------------------------------- /current_best_model/final_xgb/final_xgb_perf.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_xgb/final_xgb_perf.xlsx -------------------------------------------------------------------------------- /current_best_model/final_xgb/final_xgb_summary.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 42 4 | 5 | 6 | Pipeline(steps=[('tfidfvectorizer', 7 | TfidfVectorizer(max_df=0.99, min_df=6, ngram_range=(1, 2))), 8 | ('xgbclassifier', 9 | XGBClassifier(base_score=None, booster=None, callbacks=None, 10 | colsample_bylevel=None, colsample_bynode=None, 11 | colsample_bytree=None, 12 | early_stopping_rounds=None, 13 | enable_categorical=False, eval_metric=None, 14 | feature_types=None, gamma=0.3, gpu_id=None, 15 | grow_policy=None, importance_type=None, 16 | interaction_constraints=None, learning_rate=None, 17 | max_bin=None, max_cat_threshold=None, 18 | max_cat_to_onehot=None, max_delta_step=None, 19 | max_depth=4, max_leaves=None, 20 | min_child_weight=0.5, missing=nan, 21 | monotone_constraints=None, n_estimators=200, 22 | n_jobs=None, num_parallel_tree=None, 23 | predictor=None, random_state=None, ...))]) 24 | 25 | 26 | Training time: 4:15:32 27 | 28 | exact_accuracy: 0.5547791365111056 29 | hamming_loss: 0.02501871724482156 30 | macro_jaccard_score: 0.4596213577953742 31 | macro_roc_auc: 0.9340739201717683 32 | Label ranking average precision: 0.8480797054165633 33 | 34 | Classification report: 35 | precision recall f1-score support 36 | 37 | Organisation & efficiency 0.62 0.45 0.52 102 38 | Funding & use of financial resources 0.75 0.24 0.36 25 39 | Staff manner & personal attributes 0.91 0.87 0.89 1431 40 | Competence & training 0.79 0.39 0.52 164 41 | Unspecified communication 0.56 0.42 0.48 36 42 | Staff listening, understanding & involving patients 0.79 0.56 0.65 361 43 | Information directly from staff during care 0.78 0.69 0.73 390 44 | Information provision & guidance 0.63 0.36 0.45 90 45 | Being kept informed, clarity & consistency of information 0.61 0.32 0.42 183 46 | Contacting services 0.76 0.52 0.62 100 47 | Appointment arrangements 0.74 0.56 0.64 261 48 | Appointment method 0.62 0.42 0.50 31 49 | Timeliness of care 0.71 0.67 0.69 529 50 | Pain management 0.77 0.56 0.65 43 51 | Discharge 0.81 0.37 0.51 46 52 | Cleanliness, tidiness & infection control 0.95 0.78 0.86 107 53 | Service location 0.85 0.53 0.66 86 54 | Transport to/ from services 0.69 0.40 0.50 78 55 | Parking 0.94 0.89 0.91 18 56 | Electronic entertainment 0.92 0.52 0.67 23 57 | Feeling safe 0.73 0.70 0.71 23 58 | Mental Health Act 0.50 0.08 0.13 13 59 | Labelling not possible 1.00 1.00 1.00 238 60 | Supplying & understanding medication 0.77 0.58 0.66 59 61 | Activities & access to fresh air 0.88 0.52 0.65 54 62 | Food & drink provision & facilities 0.92 0.67 0.78 106 63 | Sensory experience 0.77 0.36 0.49 67 64 | Interaction with family/ carers 0.58 0.24 0.34 123 65 | Positive experience & gratitude 0.78 0.86 0.81 938 66 | Continuity of care 0.62 0.45 0.52 290 67 | Environment, facilities & equipment 0.75 0.45 0.56 202 68 | Staffing levels & responsiveness 0.68 0.40 0.50 194 69 | 70 | micro avg 0.80 0.67 0.73 6411 71 | macro avg 0.76 0.53 0.61 6411 72 | weighted avg 0.79 0.67 0.71 6411 73 | samples avg 0.81 0.73 0.75 6411 74 | 
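
The headline metrics reported in these summary files (exact_accuracy, hamming_loss, macro_jaccard_score, macro_roc_auc and label ranking average precision) are standard multilabel measures from `scikit-learn`. As a minimal sketch of how they relate, assuming `y_true` and `y_pred` are multi-hot indicator arrays and `y_probs` the corresponding predicted probabilities (all names here are illustrative):

```python
# Sketch: computing the summary-file metrics with scikit-learn.
# y_true/y_pred are (n_samples, n_labels) binary indicator arrays;
# y_probs holds predicted probabilities of the same shape.
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    hamming_loss,
    jaccard_score,
    label_ranking_average_precision_score,
    roc_auc_score,
)

y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 0, 0], [0, 1, 0]])
y_probs = np.array([[0.9, 0.2, 0.4], [0.1, 0.8, 0.3]])

# Exact accuracy: a sample counts as correct only if every label matches.
print("exact_accuracy:", accuracy_score(y_true, y_pred))
# Hamming loss: fraction of individual label assignments that are wrong.
print("hamming_loss:", hamming_loss(y_true, y_pred))
print("macro_jaccard_score:", jaccard_score(y_true, y_pred, average="macro"))
print("macro_roc_auc:", roc_auc_score(y_true, y_probs, average="macro"))
print(
    "Label ranking average precision:",
    label_ranking_average_precision_score(y_true, y_probs),
)
```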
-------------------------------------------------------------------------------- /current_best_model/sentiment/bert_sentiment.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 75 4 | 5 | 6 | Model: "model" 7 | __________________________________________________________________________________________________ 8 | Layer (type) Output Shape Param # Connected to 9 | ================================================================================================== 10 | input_ids (InputLayer) [(None, 150)] 0 [] 11 | 12 | distilbert (TFDistilBertMainLa TFBaseModelOutput(l 66362880 ['input_ids[0][0]'] 13 | yer) ast_hidden_state=(N 14 | one, 150, 768), 15 | hidden_states=None 16 | , attentions=None) 17 | 18 | input_cat (InputLayer) [(None, 1)] 0 [] 19 | 20 | tf.__operators__.getitem (Slic (None, 768) 0 ['distilbert[0][0]'] 21 | ingOpLambda) 22 | 23 | category_encoding (CategoryEnc (None, 3) 0 ['input_cat[0][0]'] 24 | oding) 25 | 26 | pooled_output (Dropout) (None, 768) 0 ['tf.__operators__.getitem[0][0]' 27 | ] 28 | 29 | dense (Dense) (None, 10) 40 ['category_encoding[0][0]'] 30 | 31 | concatenate (Concatenate) (None, 778) 0 ['pooled_output[0][0]', 32 | 'dense[0][0]'] 33 | 34 | output (Dense) (None, 5) 3895 ['concatenate[0][0]'] 35 | 36 | ================================================================================================== 37 | Total params: 66,366,815 38 | Trainable params: 66,366,815 39 | Non-trainable params: 0 40 | __________________________________________________________________________________________________ 41 | 42 | 43 | Training time: 5:20:56 44 | 45 | 46 | Classification report: 47 | precision recall f1-score support 48 | 49 | very positive 0.80 0.79 0.80 1746 50 | positive 0.63 0.52 0.57 841 51 | neutral 0.52 0.71 0.60 551 52 | negative 0.79 0.68 0.73 639 53 | very negative 0.52 0.64 0.57 166 54 | 55 | accuracy 0.70 3943 56 | macro avg 0.65 0.67 0.65 3943 57 | weighted avg 0.71 0.70 0.70 3943 58 | -------------------------------------------------------------------------------- /current_best_model/sentiment/confusion_matrix_3_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/sentiment/confusion_matrix_3_counts.png -------------------------------------------------------------------------------- /current_best_model/sentiment/confusion_matrix_3_percentages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/sentiment/confusion_matrix_3_percentages.png -------------------------------------------------------------------------------- /current_best_model/sentiment/confusion_matrix_5_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/sentiment/confusion_matrix_5_counts.png -------------------------------------------------------------------------------- /current_best_model/sentiment/confusion_matrix_5_percentages.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/sentiment/confusion_matrix_5_percentages.png -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | Please note that the Care Opinion data is being shared under the [CC BY-NC-SA 4.0 licence](https://creativecommons.org/licenses/by-nc-sa/4.0/) and is generated from the [Care Opinion API](https://www.careopinion.org.uk/info/api-v2). 2 | 3 | 4 | Two out of the six participating trusts have agreed to make their data available publicly. 5 | 6 | An explanation of the dataset columns for phase 2 is available below. 7 | 8 | 9 | 10 | Comment ID: ID for the specific comment. 11 | 12 | Trust: NHS Trust where comment originated. 13 | 14 | Respondent ID: ID for the specific respondent. Not linked to any personal identifiable information. 15 | 16 | Date: Date the comment was provided. 17 | 18 | Service type 1: Department relating to the comment. 19 | 20 | Service type 2: Subdepartment relating to the comment. 21 | 22 | FFT categorical answer: Quantitative score attached to the comment. 1 is "very good", 5 is "very poor". 23 | 24 | FFT question: The specific question asked by the NHS trust to elicit the qualitative text response. 25 | 26 | FFT answer: The qualitative text response provided by the respondent to the FFT question. 27 | 28 | Person identifiable info?: Whether or not the FFT answer contains any person identifiable info, as flagged by the labeller. 29 | 30 | Comment sentiment: The sentiment score applied to the FFT answer by the labeller. 1 is "very positive", 5 is "very negative". Mixed comments have been labelled as "3", neutral. 31 | 32 | All other columns are the qualitative framework labels, in one hot encoded format. The version of the framework being used is reflected in the filename. Full details of the framework are available on the [project documentation website](https://the-strategy-unit.github.io/PatientExperience-QDC/framework/framework3.html). 33 | -------------------------------------------------------------------------------- /datasets/phase_1/README.md: -------------------------------------------------------------------------------- 1 | Please note that the Care Opinion data is being shared under the [CC BY-NC-SA 4.0 licence](https://creativecommons.org/licenses/by-nc-sa/4.0/) and is generated from the [Care Opinion API](https://www.careopinion.org.uk/info/api-v2). 2 | 3 | The dataset for phase 1 is stored in this folder. It is no longer used for training the pxtextmining models but is provided for historical interest. 4 | 5 | The `co` and `co_multi_label` files are less useful, with fewer rows. 6 | 7 | The main dataset is the file `text_data`. The following is a description of the columns: 8 | 9 | code: 10 | The shortcode given for the subcategory applied to the comment. There is 1:1 relationship between codes and subcategories, listed below. 
11 | 12 | 'cc': 'Care received', 13 | 'xn': 'Nothing to improve', 14 | 'sa': 'Attitude Of Staff', 15 | 'ss': 'Staff: General', 16 | 'cs': 'Advice and support', 17 | 'mi': 'Amount/clarity of information', 18 | 'sp': 'Professionalism/Competence Of Staff', 19 | 'xe': 'Everything was good/bad', 20 | 'mm': 'Communication', 21 | 'cr': 'Rules/approach to care', 22 | 'ml': 'Listening', 23 | 'ef': 'Food', 24 | 'wa': 'Time spent waiting for first appt/referral/service', 25 | 'ap': 'Provision of services', 26 | 'eq': 'Facilities/equipment', 27 | 'ce': 'Emotional care', 28 | 'ee': 'Environment/ facilities', 29 | 'cp': 'Physical care', 30 | 'aa': 'General', 31 | 'ca': 'Activities', 32 | 'co': '1-2-1 care/Time spent with service user', 33 | 'cm': 'Medication ', 34 | 'tc': 'Consistency/Continuity of care', 35 | 'da': 'Respect For Diversity/ Person-Centeredness', 36 | 'ec': 'Cleanliness', 37 | 'sl': 'Staffing levels', 38 | 'ti': 'Coordination/Integration Of Care', 39 | 'cl': 'Made A Difference To My Life', 40 | 'ds': 'Feeling safe including bullying', 41 | 'tx': 'Transition And Discharge', 42 | 'wb': 'Time spent waiting between appointments', 43 | 'ct': 'Therapies', 44 | 'al': 'Location', 45 | 'dp': 'Involvement: Of Service Users/Patients', 46 | 'dd': 'Dignity: General', 47 | 'cf': 'Carer support', 48 | 'xm': 'Miscellaneous', 49 | 'tt': 'Transition/ coordination: General', 50 | 'xg': 'Nothing was good', 51 | 'ep': 'Parking/transport', 52 | 'xf': 'Funding', 53 | 'xl': 'Leave (under MHA)', 54 | 'dc': 'Involvement: Of Family And Carers', 55 | 'xs': 'Surveying' 56 | 57 | label: 58 | The overarching major category label for the text comment. 59 | 60 | subcategory: 61 | The subcategory label for the text comment. 62 | 63 | feedback: 64 | The actual text of the qualitative feedback comment. 65 | 66 | criticality: 67 | How critical the comment is towards the organisation. Can also be interpreted as a type of sentiment. Ranges from -5 to 5, with -5 being highly critical, or highly negative, and 5 being highly positive. 68 | 69 | organization: 70 | Which NHS Trust the feedback relates to. 71 | 72 | question: 73 | The question that the feedback relates to. 74 | 75 | row_index: 76 | row ID number for the feedback comment. 77 | -------------------------------------------------------------------------------- /docker_README.md: -------------------------------------------------------------------------------- 1 | # pxtextmining: Text Classification of Patient Experience feedback 2 | 3 | This Docker container contains the pxtextmining machine learning models trained as part of the [Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/). 4 | 5 | To use this Docker container to predict your unlabelled text: 6 | 7 | 1. Set up your folders. You will need to set up a folder containing two other folders, data_in and data_out, as below. 8 | ``` 9 | docker_data/ 10 | ├─ data_in/ 11 | ├─ data_out/ 12 | 13 | ``` 14 | 15 | 2. Prepare your data. Save the data you wish to pass through the machine learning models as json, in the data_in folder. The data should be in the following format: 16 | 17 | In Python, a `list` containing as many `dict`s as there are comments to be predicted. Each `dict` has three compulsory keys: 18 | 19 | * `comment_id`: Unique ID associated with the comment, in `str` format. Each Comment ID per API call must be unique. 20 | * `comment_text`: Text to be classified, in `str` format. 
21 | * `question_type`: The type of question asked to elicit the comment text. Questions are different from trust to trust, but they all fall into one of three categories: 22 | * `what_good`: Any variation on the question "What was good about the service?", or "What did we do well?" 23 | * `could_improve`: Any variation on the question "Please tell us about anything that we could have done better", or "How could we improve?" 24 | * `nonspecific`: Any other type of nonspecific question, e.g. "Please can you tell us why you gave your answer?", or "What were you satisfied and/or dissatisfied with?". 25 | 26 | ```python 27 | # In Python 28 | 29 | text_data = [ 30 | { 'comment_id': '1', # The comment_id values in each dict must be unique. 31 | 'comment_text': 'This is the first comment. Nurse was great.', 32 | 'question_type': 'what_good' }, 33 | { 'comment_id': '2', 34 | 'comment_text': 'This is the second comment. The ward was freezing.', 35 | 'question_type': 'could_improve' }, 36 | { 'comment_id': '3', 37 | 'comment_text': '', # This comment is an empty string. 38 | 'question_type': 'nonspecific' } 39 | ] 40 | 41 | ``` 42 | 43 | ```R 44 | # In R 45 | 46 | library(jsonlite) 47 | 48 | comment_id <- c("1", "2", "3") 49 | comment_text <- c( 50 | "This is the first comment. Nurse was great.", 51 | "This is the second comment. The ward was freezing.", 52 | "" 53 | ) 54 | question_type <- c("what_good", "could_improve", "nonspecific") 55 | df <- data.frame(comment_id, comment_text, question_type) 56 | text_data <- toJSON(df) 57 | ``` 58 | 59 | 3. Save the JSON data in the data_in folder, as follows: 60 | 61 | ```python 62 | # In Python 63 | 64 | json_data = json.dumps(text_data) 65 | with open("data_in/file_01.json", "w") as outfile: 66 | outfile.write(json_data) 67 | ``` 68 | 69 | ```R 70 | # In R 71 | 72 | json_data <- toJSON(text_data, pretty = TRUE) 73 | write(json_data, file = "data_in/file_01.json") 74 | ``` 75 | 76 | 4. Your file structure should now look like this: 77 | 78 | ``` 79 | docker_data/ 80 | ├─ data_in/ 81 | │ ├─ file_01.json 82 | ├─ data_out/ 83 | ``` 84 | 85 | 5. Mount the docker_data folder as the `data` volume for the Docker container and run the container. Pass the filename for the input JSON as the first argument. The following arguments are also available: 86 | - `--local-storage` or `-l` flag for local storage (does not delete the files in data_in after completing predictions) 87 | - `--target` or `-t` to select the machine learning models used. Options are `m` for multilabel, `s` for `sentiment`, or `ms` for both. Defaults to `ms` if nothing is selected. 88 | 89 | A sample command would be: 90 | `docker run --rm -it -v /docker_data:/data ghcr.io/the-strategy-unit/pxtextmining:latest file_01.json -l ` 91 | 92 | 6. The predictions will be outputted as a json file in the data_out folder, with the same filename. 
After running successfully, the final folder structure should be: 93 | 94 | ``` 95 | docker_data/ 96 | ├─ data_in/ 97 | │ ├─ file_01.json 98 | ├─ data_out/ 99 | ├─ file_01.json 100 | ``` 101 | -------------------------------------------------------------------------------- /docker_data/data_in/file_01.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "comment_id": "1", 4 | "comment_text": "The nurse was very rude and unhelpful", 5 | "question_type": "what_good" 6 | }, 7 | { 8 | "comment_id": "2", 9 | "comment_text": "The ward was freezing.", 10 | "question_type": "could_improve" 11 | }, 12 | { 13 | "comment_id": "3", 14 | "comment_text": "", 15 | "question_type": "nonspecific" 16 | }, 17 | { 18 | "comment_id": "4", 19 | "comment_text": "Thank you so much", 20 | "question_type": "nonspecific" 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /docker_data/data_out/file_01.json: -------------------------------------------------------------------------------- 1 | [{"comment_id": "1", "sentiment": 5.0, "labels": ["Staff manner & personal attributes"]}, {"comment_id": "2", "sentiment": 4.0, "labels": ["Sensory experience"]}, {"comment_id": "3", "sentiment": "Labelling not possible", "labels": ["Labelling not possible"]}, {"comment_id": "4", "sentiment": 1.0, "labels": ["Positive experience & gratitude"]}] -------------------------------------------------------------------------------- /docker_run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import pickle 5 | 6 | import pandas as pd 7 | from tensorflow.keras.saving import load_model 8 | 9 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 10 | combine_predictions, 11 | predict_multilabel_bert, 12 | predict_multilabel_sklearn, 13 | predict_sentiment_bert, 14 | ) 15 | from pxtextmining.params import minor_cats 16 | 17 | 18 | def load_bert_model(model_path): 19 | if not os.path.exists(f"bert_{model_path}"): 20 | if model_path == "sentiment": 21 | model_path = os.path.join( 22 | "current_best_model", model_path, f"bert_{model_path}" 23 | ) 24 | elif model_path == "multilabel": 25 | model_path = os.path.join( 26 | "current_best_model", "final_bert", f"bert_{model_path}" 27 | ) 28 | loaded_model = load_model(f"bert_{model_path}") 29 | return loaded_model 30 | 31 | 32 | def load_sklearn_model(model_name): 33 | model_path = f"{model_name}.sav" 34 | if not os.path.exists(model_path): 35 | model_path = os.path.join("current_best_model", model_name, model_path) 36 | with open(model_path, "rb") as model: 37 | loaded_model = pickle.load(model) 38 | return loaded_model 39 | 40 | 41 | def process_text(items): 42 | df = pd.DataFrame([i for i in items], dtype=str) 43 | df_newindex = df.set_index("comment_id") 44 | if df_newindex.index.duplicated().sum() != 0: 45 | raise ValueError("comment_id must all be unique values") 46 | df_newindex.index.rename("Comment ID", inplace=True) 47 | text_to_predict = df_newindex[["comment_text", "question_type"]] 48 | text_to_predict = text_to_predict.rename( 49 | columns={"comment_text": "FFT answer", "question_type": "FFT_q_standardised"} 50 | ) 51 | return df, text_to_predict 52 | 53 | 54 | def predict_multilabel_ensemble(items): 55 | # Function which gets preds_dfs for bert, svc, and xgb, and combines them all 56 | # Process the data 57 | df, text_to_predict = process_text(items) 58 | text_to_predict = 
text_to_predict["FFT answer"] 59 | # Load models 60 | bert_model = load_bert_model("multilabel") 61 | svc_model = load_sklearn_model("final_svc") 62 | xgb_model = load_sklearn_model("final_xgb") 63 | # Make preds 64 | bert_preds = predict_multilabel_bert( 65 | text_to_predict, 66 | bert_model, 67 | labels=minor_cats, 68 | additional_features=False, 69 | label_fix=False, 70 | ) 71 | svc_preds = predict_multilabel_sklearn( 72 | text_to_predict, 73 | svc_model, 74 | labels=minor_cats, 75 | additional_features=False, 76 | label_fix=False, 77 | ) 78 | xgb_preds = predict_multilabel_sklearn( 79 | text_to_predict, 80 | xgb_model, 81 | labels=minor_cats, 82 | additional_features=False, 83 | label_fix=False, 84 | ) 85 | # Combine preds 86 | preds_list = [bert_preds, svc_preds, xgb_preds] 87 | combined_preds = combine_predictions(preds_list, labels=minor_cats) 88 | # Join predicted labels with received data 89 | combined_preds["comment_id"] = combined_preds.index.astype(str) 90 | merged = pd.merge(df, combined_preds, how="left", on="comment_id") 91 | # Fill in anything that got cleaned in preprocessing step 92 | nulls = merged[merged.labels.isnull()].index 93 | lnp = pd.Series( 94 | [["Labelling not possible"]] * len(nulls), index=nulls, dtype=object 95 | ) 96 | merged.loc[nulls, "labels"] = lnp 97 | return_df = merged[["comment_id", "labels"]] 98 | return return_df 99 | 100 | 101 | def predict_sentiment(items): 102 | """Accepts comment ids, comment text and question type as JSON in a POST request. Makes predictions using trained Tensorflow Keras model. 103 | 104 | Args: 105 | items (List[ItemIn]): JSON list of dictionaries with the following compulsory keys: 106 | - `comment_id` (str) 107 | - `comment_text` (str) 108 | - `question_type` (str) 109 | The 'question_type' must be one of three values: 'nonspecific', 'what_good', and 'could_improve'. 110 | For example, `[{'comment_id': '1', 'comment_text': 'Thank you', 'question_type': 'what_good'}, 111 | {'comment_id': '2', 'comment_text': 'Food was cold', 'question_type': 'could_improve'}]` 112 | 113 | Returns: 114 | (dict): Keys are: `comment_id`, `comment_text`, and predicted `labels`. 115 | """ 116 | 117 | # Process received data 118 | df, text_to_predict = process_text(items) 119 | # Make predictions 120 | loaded_model = load_bert_model("sentiment") 121 | preds_df = predict_sentiment_bert( 122 | text_to_predict, loaded_model, preprocess_text=False, additional_features=True 123 | ) 124 | # Join predicted labels with received data 125 | preds_df["comment_id"] = preds_df.index.astype(str) 126 | merged = pd.merge(df, preds_df, how="left", on="comment_id") 127 | merged["sentiment"] = merged["sentiment"].fillna("Labelling not possible") 128 | return_df = merged[["comment_id", "sentiment"]] 129 | return return_df 130 | 131 | 132 | def parse_args(): 133 | """Parse command line arguments""" 134 | parser = argparse.ArgumentParser() 135 | parser.add_argument( 136 | "json_file", 137 | nargs=1, 138 | help="Name of the json file", 139 | ) 140 | parser.add_argument( 141 | "--local-storage", 142 | "-l", 143 | action="store_true", 144 | help="Use local storage (instead of Azure)", 145 | ) 146 | parser.add_argument( 147 | "--target", 148 | "-t", 149 | default="ms", 150 | help="Target of the predictions. m for multilabel, s for sentiment. 
151 |     )
152 |     args = parser.parse_args()
153 |     return args
154 | 
155 | 
156 | def main():
157 |     args = parse_args()
158 |     json_file = os.path.join("data", "data_in", args.json_file[0])
159 |     with open(json_file, "r") as jf:
160 |         json_in = json.load(jf)
161 |     preds_list = []
162 |     if "s" in args.target:
163 |         s_preds = predict_sentiment(json_in)
164 |         preds_list.append(s_preds)
165 |     if "m" in args.target:
166 |         m_preds = predict_multilabel_ensemble(json_in)
167 |         preds_list.append(m_preds)
168 |     if len(preds_list) == 2:
169 |         preds = pd.merge(preds_list[0], preds_list[1], on="comment_id")
170 |     else:
171 |         preds = preds_list[0]
172 |     if not args.local_storage:
173 |         os.remove(json_file)
174 |     json_out = preds.to_dict(orient="records")
175 |     out_path = os.path.join("data", "data_out", args.json_file[0])
176 |     with open(out_path, "w+") as jf:
177 |         json.dump(json_out, jf)
178 | 
179 | 
180 | if __name__ == "__main__":
181 |     main()
182 | 
--------------------------------------------------------------------------------
/docs/about.md:
--------------------------------------------------------------------------------
1 | # Project background
2 | 
3 | The `pxtextmining` package is part of [the Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/). This project is hosted by Nottinghamshire Healthcare NHS Foundation Trust's Clinical Development Unit Data Science Team, and funded by NHS England's Insight and Feedback Team.
4 | 
5 | The primary objective of the `pxtextmining` element is to create a machine learning model capable of categorising the free text data obtained through the [NHS England Friends and Family Test](https://www.england.nhs.uk/fft/) (FFT). It is a multilabel classification problem, with one or more categories applied to each patient feedback comment. In this way, we hope to support better use of qualitative patient experience feedback by NHS provider organisations.
6 | 
7 | This package works together with the [experiencesdashboard](https://github.com/the-strategy-unit/experiencesdashboard), a frontend coded in R/Shiny.
8 | 
--------------------------------------------------------------------------------
/docs/create_docs.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | 
4 | """
5 | Python script to automatically generate .md files in docs/reference based on
6 | contents of pxtextmining folders
7 | """
8 | 
9 | 
10 | modules = glob.glob('pxtextmining/*/')
11 | module_names = []
12 | for folder in modules:
13 |     if '__' not in folder:
14 |         module_name = folder.split('/')[-2]
15 |         print(f'MODULE: {module_name}')
16 |         pylist = glob.glob(f"{folder}/*.py")
17 |         for py in pylist:
18 |             if '__' not in py:
19 |                 py_name = os.path.basename(py)[:-3]
20 |                 print(py_name)
21 |                 with open(f'docs/reference/{module_name}/{py_name}.md', 'w') as f:
22 |                     if module_name == 'helpers':
23 |                         f.write(f"""::: pxtextmining.{module_name}.{py_name}
24 |     options:
25 |       show_source: true""")
26 |                     else:
27 |                         f.write(f'::: pxtextmining.{module_name}.{py_name}')
28 | 
--------------------------------------------------------------------------------
/docs/getting started/install.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 | 
3 | You can install `pxtextmining` from either [PyPI](https://pypi.org/project/pxtextmining/) or [GitHub](https://github.com/the-strategy-unit/pxtextmining).
4 | 
5 | The recommended method is to clone the repository from GitHub, as this will also include the models and datasets.
6 | 
7 | ### Option 1: Install from PyPI
8 | This option allows you to use the functions coded in pxtextmining.
9 | 
10 | 1. Install `pxtextmining` and its PyPI dependencies:
11 |     - `pip install pxtextmining`
12 | 
13 | 
14 | ### Option 2 (RECOMMENDED): Install from GitHub
15 | This option is recommended as it gives you access to the full datasets and already trained models.
16 | 
17 | 1. To begin with, [clone the repository from GitHub](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
18 | 
19 | 2. It is also recommended to [create a new virtual environment](https://docs.python.org/3/library/venv.html), using your chosen method of managing Python environments.
20 | 
21 | 3. The package uses `poetry` for dependency management. First, run `pip install poetry`.
22 | 
23 | 4. Then, run `poetry install --with dev`.
24 | 
--------------------------------------------------------------------------------
/docs/getting started/package.md:
--------------------------------------------------------------------------------
1 | # Package structure
2 | 
3 | ## pxtextmining
4 | 
5 | The `pxtextmining` package is constructed using the following elements:
6 | 
7 | - **`pxtextmining.factories`**
8 | This module contains the vast majority of the code in the package. There are five different stages, each corresponding to a different submodule.
9 | 
10 |     - `factory_data_load_and_split`: Loading of multilabel data, preprocessing, and splitting into train/test/validation sets as appropriate.
11 | 
12 |     - `factory_pipeline`: Construction and training of different models/estimators/algorithms using the `sklearn`, `tensorflow.keras` and `transformers` libraries.
13 | 
14 |     - `factory_model_performance`: Evaluation of a trained model, comparing predicted targets with real target values, to produce performance metrics. The decision-making process behind the performance metrics chosen can be seen on the [project documentation website](https://the-strategy-unit.github.io/PatientExperience-QDC/pxtextmining/performance_metrics.html). The performance metrics for the current best models utilised in the API can be found in the `current_best_model` folder in the main repository.
15 | 
16 |     - `factory_predict_unlabelled_text`: Prepares unlabelled text (with or without additional features such as question type) in a format suitable for each model type, and passes this through the selected models, to produce predicted labels.
17 | 
18 |     - `factory_write_results`: Saves trained models to disk, together with their performance metrics and spreadsheets breaking down their predictions.
19 | 
20 | - **`pxtextmining.helpers`**
21 | This module contains some helper functions which are used in `pxtextmining.factories`. Some of this is legacy code, so this may just be moved into the `factories` submodule in future versions of the package.
22 | 
23 | - **`pxtextmining.pipelines`**
24 | All of the processes in `pxtextmining.factories` are pulled together in `multilabel_pipeline`, to create the complete end-to-end process of data processing, model creation, training, evaluation, and saving.
25 | 
26 | There is also a `pxtextmining.params` file which is used to standardise specific variables that are used across the entire package. The aim of this is to reduce repetition across the package, for example when trying different targets or model types.
27 | 
28 | ## API
29 | 
30 | Separate from the `pxtextmining` package is the API, which can be found in the folder `api`. It is constructed using FastAPI and Uvicorn. The aim of the API is to make the trained machine learning models available publicly, so that predictions can be made on any text. The API is not currently publicly available and access is only for participating partner trusts. However, all the code and documentation is available on our GitHub repository.
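In outline, the serving pattern looks something like the sketch below. This is illustrative only, not the actual `api/api.py`: the endpoint name and input keys mirror the Quick API documentation, the model filename comes from the `api` folder, and everything else (schema names, response shape) is assumed.

```python
# Minimal sketch of the FastAPI serving pattern; not the actual api/api.py.
# ItemIn and the response shape are assumptions for illustration only.
import pickle
from typing import List

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class ItemIn(BaseModel):
    comment_id: str
    comment_text: str


# Load the pickled sklearn pipeline once, at startup
with open("final_svc.sav", "rb") as f:
    model = pickle.load(f)


@app.post("/predict_multilabel")
def predict_multilabel(items: List[ItemIn]):
    texts = [item.comment_text for item in items]
    binary_preds = model.predict(texts)  # one row of 0/1s per comment
    return [
        {"comment_id": item.comment_id, "labels": row.tolist()}
        for item, row in zip(items, binary_preds)
    ]
```

The app is then served with Uvicorn, e.g. `uvicorn api:app`.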
31 | 
--------------------------------------------------------------------------------
/docs/getting started/training_new_model.md:
--------------------------------------------------------------------------------
1 | # Training a new model
2 | 
3 | To train a new model to categorise patient feedback text, labelled data is required. Discussions are currently underway to enable the release of the data that the multilabel models in `pxtextmining` are trained on.
4 | 
5 | This page breaks down the steps in the function `pxtextmining.pipelines.multilabel_pipeline.run_sklearn_pipeline`, which outputs trained sklearn models. This is a high-level explanation of the processes; for more detailed technical information please see the relevant code reference pages for each function.
6 | 
7 | 
8 | ```python
9 | 
10 | # Step 1: Generate a random_state which is used for the train_test_split.
11 | # This means that the pipeline and evaluation should be reproducible.
12 | random_state = random.randint(1, 999)
13 | 
14 | # Step 2: Load the data and isolate the target columns from the dataframe.
15 | df = load_multilabel_data(filename = 'datasets/hidden/multilabeldata_2.csv',
16 |                           target = 'major_categories')
17 | 
18 | # Step 3: Conduct preprocessing: remove punctuation and numbers, clean whitespace and drop empty lines.
19 | # Split into train and test using the random_state above.
20 | X_train, X_test, Y_train, Y_test = process_and_split_data(
21 |                                     df, target = target,
22 |                                     random_state = random_state)
23 | 
24 | # Step 4: Instantiate a pipeline and hyperparameter grid for each estimator to be tried.
25 | # Conduct a cross-validated randomized search to identify the hyperparameters
26 | # producing the best results on the validation set.
27 | # For each estimator, returns the pipeline with the best hyperparameters,
28 | # together with the time taken to search the pipeline.
29 | models, training_times = search_sklearn_pipelines(X_train, Y_train,
30 |                                     models_to_try = models_to_try,
31 |                                     additional_features = additional_features)
32 | 
33 | # Step 5: Evaluate each pipeline using the test set, comparing predicted values with real values.
34 | # Performance metrics are recorded together with the time taken to search the pipeline.
35 | model_metrics = []
36 | for i in range(len(models)):
37 |     m = models[i]
38 |     t = training_times[i]
39 |     preds_df = predict_multilabel_sklearn(X_test, m, labels = target)
40 |     model_metrics.append(get_multilabel_metrics(preds_df, Y_test,
41 |                                     labels = target,
42 |                                     random_state = random_state,
43 |                                     model = m, training_time = t))
44 | 
45 | # Step 6: Save the models and performance metrics to the path specified
46 | write_multilabel_models_and_metrics(models, model_metrics, path=path)
47 | ```
48 | 
--------------------------------------------------------------------------------
/docs/getting started/using_trained_model.md:
--------------------------------------------------------------------------------
1 | # Using a trained model
2 | 
3 | The `current_best_model` folder should contain a fully trained `sklearn` model in .sav format, as well as performance metrics for the model.
4 | 
5 | The Transformer-based `tensorflow.keras` model is over 1GB and cannot be shared via GitHub. However, it will be made available via the API, which is forthcoming in a future release of this package.
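The `sklearn` model, by contrast, can be loaded directly from the repository. A minimal sketch, assuming the `current_best_model/final_svc` path from this repository's folder structure:

```python
# Load the trained sklearn model from its pickle file (path assumed from this repo's layout).
import pickle

with open("current_best_model/final_svc/final_svc.sav", "rb") as f:
    model = pickle.load(f)
```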
6 | 
7 | This page breaks down the steps in the function `pxtextmining.factories.factory_predict_unlabelled_text.predict_multilabel_sklearn`, which can make predictions using the `sklearn` model available via GitHub. This is a high-level explanation of the processes; for more detailed technical information please see the relevant code reference page.
8 | 
9 | ```python
10 | 
11 | # Step 1: Conduct preprocessing on text:
12 | # Remove trailing whitespaces, NULL values, NaNs, and punctuation. Convert to lowercase.
13 | text_no_whitespace = text.replace(r"^\s*$", np.nan, regex=True)
14 | text_no_nans = text_no_whitespace.dropna()
15 | text_cleaned = text_no_nans.astype(str).apply(remove_punc_and_nums)
16 | processed_text = text_cleaned.astype(str).apply(clean_empty_features)
17 | 
18 | # Step 2: Make predictions with the trained model
19 | binary_preds = model.predict(processed_text)
20 | 
21 | # Step 3: Get predicted probabilities for each label
22 | pred_probs = np.array(model.predict_proba(processed_text))
23 | 
24 | # Step 4: Some samples do not have any predicted labels.
25 | # For these, take the label with the highest predicted probability.
26 | predictions = fix_no_labels(binary_preds, pred_probs, model_type="sklearn")
27 | 
28 | # Step 5: Convert predictions to a dataframe.
29 | preds_df = pd.DataFrame(predictions, index=processed_text.index, columns=labels)
30 | preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
31 | 
32 | ```
33 | 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Home
2 | 
3 | This site contains the project documentation for the `pxtextmining` python package.
4 | This provides a technical overview of the package; for a non-technical overview and further information, visit the
5 | [Patient Experience Qualitative Data Categorisation website](https://the-strategy-unit.github.io/PatientExperience-QDC/pxtextmining/).
6 | 
7 | ## Table Of Contents
8 | 
9 | The documentation is split into three separate sections:
10 | 
11 | 1. [Project background](about.md)
12 | 2. Getting started, a simple approach to using the package:
13 |     - [Installation](getting%20started/install.md)
14 |     - [How the package works](getting%20started/package.md)
15 |     - [Training a new model](getting%20started/training_new_model.md)
16 |     - [Making predictions with a trained model](getting%20started/using_trained_model.md)
17 | 3. Code reference, a more technical overview of the functions and modules:
18 |     - [Factories](reference/pxtextmining/factories/factory_data_load_and_split.md)
19 |     - [Helpers](reference/pxtextmining/helpers/text_preprocessor.md)
20 |     - [Pipelines](reference/pxtextmining/pipelines/multilabel_pipeline.md)
21 | 
22 | ### Other repos that use `pxtextmining`
23 | - [nhs_fft_sentiment_analysis](https://github.com/yunus-m/nhs_fft_sentiment_analysis/blob/main/README.md)
24 |     - Exploratory analysis and sentiment modelling of FFT feedback using `scikit-learn`, TinyBERT, and hierarchical approaches.
--------------------------------------------------------------------------------
/docs/main.css:
--------------------------------------------------------------------------------
1 | /*CSS to make text in tables wrap rather than scrolling forever*/
2 | 
3 | .wy-table-responsive table td, .wy-table-responsive table th {
4 |     white-space: inherit;
5 | }
6 | 
--------------------------------------------------------------------------------
/docs/reference/API/API.md:
--------------------------------------------------------------------------------
1 | # pxtextmining API overview
2 | 
3 | We have created two different APIs for labelling patient experience feedback. Both APIs are free to use and completely open source. For help and support with using them, please contact [Chris Beeley](mailto:chris.beeley1@nhs.net).
4 | 
5 | The "Quick API" is faster and simpler, as it uses an sklearn model which is quicker to make predictions, though less accurate than the Slow API. The performance of predictions from this API can be seen on our project documentation website. This API is a more 'traditional' style of API.
6 | 
7 | The "Slow API" utilises sklearn models as well as the slower but more powerful transformer-based Distilbert model. Due to the demanding hardware requirements of this model, we have set up a slower and slightly more complex API which combines (ensembles) together these models but has higher performance overall.
8 | 
9 | ## Security
10 | 
11 | The data is submitted via a secure HTTPS connection. All data is encrypted in transit with HTTPS, using the SSL/TLS protocol for encryption and authentication. The data is stored in blob storage on a UK-based Azure container instance for the duration of the model predictions, and is then immediately deleted. Ad hoc support is provided where possible; no uptime or other guarantees exist.
12 | 
--------------------------------------------------------------------------------
/docs/reference/API/quick_API.md:
--------------------------------------------------------------------------------
1 | # Quick API
2 | 
3 | To facilitate the use of the models trained in this project, an API has been created using the FastAPI library. Users will be able to send their patient experience feedback comments to the model via the API, and will receive the predicted labels for those comments.
4 | 
5 | This API utilises the Support Vector Classifier model, which is less performant than the transformer-based Distilbert model. However, it is also much quicker and simpler. Performance metrics for this model can be seen on our [project documentation website](https://the-strategy-unit.github.io/PatientExperience-QDC/).
6 | 
7 | The API has been created using FastAPI and is deployed on Posit Connect. The URL is available on request. Full documentation for the API, automatically generated by FastAPI, is available at [API URL]/docs.
8 | 
9 | ## How to make an API call
10 | 
11 | 1\. Prepare the data in JSON format. In Python, this is a `list` containing as many `dict`s as there are comments to be predicted. Each `dict` has two compulsory keys:
12 | 
13 | * `comment_id`: Unique ID associated with the comment, in `str` format. Each Comment ID per API call must be unique.
14 | * `comment_text`: Text to be classified, in `str` format.
15 | 
16 | ```python
17 | # In Python
18 | 
19 | text_data = [
20 |   { 'comment_id': '1', # The comment_id values in each dict must be unique.
21 |     'comment_text': 'This is the first comment. Nurse was great.',
22 |   },
23 |   { 'comment_id': '2',
24 |     'comment_text': 'This is the second comment. The ward was freezing.',
25 |   },
26 |   { 'comment_id': '3',
27 |     'comment_text': '', # This comment is an empty string.
28 |   },
29 | ]
30 | ```
31 | 
32 | ```R
33 | # In R
34 | 
35 | library(jsonlite)
36 | 
37 | comment_id <- c("1", "2", "3")
38 | comment_text <- c(
39 |   "This is the first comment. Nurse was great.",
40 |   "This is the second comment. The ward was freezing.",
41 |   ""
42 | )
43 | df <- data.frame(comment_id, comment_text)
44 | text_data <- toJSON(df)
45 | ```
46 | 
47 | 
48 | 2\. Send the JSON containing the text data to the `predict_multilabel` endpoint. In Python, this can be done using the `requests` library.
49 | 
50 | ```python
51 | # In Python
52 | 
53 | import requests
54 | 
55 | url = "API_URL_GOES_HERE"
56 | 
57 | response = requests.post(f"{url}/predict_multilabel",
58 |                          json = text_data)
59 | ```
60 | 
61 | ```R
62 | # In R
63 | 
64 | library(httr)
65 | 
66 | r <- POST(
67 |   url = "API_URL_GOES_HERE/predict_multilabel",
68 |   body = text_data,
69 |   encode = "json",
70 |   add_headers(
71 |     "Content-Type" = "application/json"
72 |   )
73 | )
74 | ```
75 | 
76 | 3\. After waiting for the data to be processed and passed through the machine learning model, receive predicted labels at the same endpoint, in the example format below. Note that the comment with blank text, with comment_id 3, was assigned the label 'Labelling not possible' as it would have been stripped out during preprocessing.
77 | 
78 | ```python
79 | # In Python
80 | 
81 | print(response.json())
82 | # Output below
83 | [
84 |   { 'comment_id': '1',
85 |     'labels': ['Non-specific praise for staff']},
86 |   { 'comment_id': '2',
87 |     'labels': ['Sensory experience']},
88 |   { 'comment_id': '3',
89 |     'labels': ['Labelling not possible'] }
90 | ]
91 | ```
92 | 
93 | ```R
94 | # In R
95 | 
96 | r_parsed <- fromJSON(content(r, "text"))
97 | ```
98 | 
--------------------------------------------------------------------------------
/docs/reference/API/slow_API.md:
--------------------------------------------------------------------------------
1 | # Slow API
2 | 
3 | This API is slower but uses the best performing models. The transformer-based Distilbert model consumes a lot of hardware resource, and as such required a different approach.
4 | 
5 | ![Diagram showing Slow API architecture](https://the-strategy-unit.github.io/PatientExperience-QDC/pxtextmining/slow_API.png)
6 | 
7 | For predicting the multilabel categories, the API endpoint ensembles together Support Vector Classifier, Gradient Boosted Decision Trees (XGBoost), and Distilbert models.
8 | 
9 | For predicting text sentiment, the API endpoint utilises a Distilbert model.
10 | 
11 | The API URL endpoint is available on request. You will need an API key; please contact the project team to obtain one. The key should be passed as a `code` param with your API request.
12 | 
13 | ## How to make an API call
14 | 
15 | 1\. Prepare the data in JSON format. In Python, this is a `list` containing as many `dict`s as there are comments to be predicted. Each `dict` has three compulsory keys:
16 | 
17 | * `comment_id`: Unique ID associated with the comment, in `str` format. Each Comment ID per API call must be unique.
18 | * `comment_text`: Text to be classified, in `str` format.
19 | * `question_type`: The type of question asked to elicit the comment text. Questions are different from trust to trust, but they all fall into one of three categories:
20 |     * `what_good`: Any variation on the question "What was good about the service?", or "What did we do well?"
21 |     * `could_improve`: Any variation on the question "Please tell us about anything that we could have done better", or "How could we improve?"
22 |     * `nonspecific`: Any other type of nonspecific question, e.g. "Please can you tell us why you gave your answer?", or "What were you satisfied and/or dissatisfied with?".
23 | 
24 | ```python
25 | # In Python
26 | 
27 | text_data = [
28 |   { 'comment_id': '1', # The comment_id values in each dict must be unique.
29 |     'comment_text': 'This is the first comment. Nurse was great.',
30 |     'question_type': 'what_good' },
31 |   { 'comment_id': '2',
32 |     'comment_text': 'This is the second comment. The ward was freezing.',
33 |     'question_type': 'could_improve' },
34 |   { 'comment_id': '3',
35 |     'comment_text': '', # This comment is an empty string.
36 |     'question_type': 'nonspecific' }
37 | ]
38 | ```
39 | 
40 | ```R
41 | # In R
42 | 
43 | library(jsonlite)
44 | 
45 | comment_id <- c("1", "2", "3")
46 | comment_text <- c(
47 |   "This is the first comment. Nurse was great.",
48 |   "This is the second comment. The ward was freezing.",
49 |   ""
50 | )
51 | question_type <- c("what_good", "could_improve", "nonspecific")
52 | df <- data.frame(comment_id, comment_text, question_type)
53 | text_data <- toJSON(df)
54 | ```
55 | 
56 | 2\. Send the JSON containing the text data in a POST request to the API. Ensure that you include your API key, which should be stored securely.
57 | 
58 | The model(s) used to make predictions can be selected with the `target` param. The options for this param are:
59 | 
60 | - `m`: multilabel
61 | - `s`: sentiment
62 | - `ms`: both multilabel and sentiment.
63 | 
64 | ```python
65 | # In Python
66 | import os, requests
67 | 
68 | api_key = os.getenv('API_KEY')
69 | params_dict = {'code': api_key, 'target': 'ms'}
70 | url = os.getenv('API_URL')
71 | 
72 | response = requests.post(url, params = params_dict, json = text_data)
73 | ```
74 | 
75 | ```R
76 | # In R
77 | library(httr)
78 | 
79 | api_key <- Sys.getenv("API_KEY")
80 | params_dict <- list(code = api_key, target = "ms")
81 | url <- Sys.getenv("API_URL")
82 | 
83 | response <- POST(url, query = params_dict, body = text_data, encode = "json")
84 | ```
85 | 
86 | 3\. If the POST request is successful, you will receive a response with a 202 code, and a URL to retrieve your results, called the `results URL`. For example:
87 | 
88 | ```python
89 | # In Python
90 | 
91 | if response.status_code == 202:
92 |     results_url = response.text
93 | 
94 | print(f"URL for results is {results_url}")
95 | ```
96 | 
97 | ```R
98 | # In R
99 | 
100 | if (status_code(response) == 202) {
101 |   results_url <- content(response, as = "text")
102 | }
103 | print(results_url)
104 | ```
105 | 
106 | 4\. Use a GET request to check the results URL. If your predictions are not yet ready, you will receive a 202 response. If they are ready, you will receive a 200 response.
107 | 
108 | What is happening behind the scenes? The API has received your data and has started up a secure Azure container instance with your data stored in blob storage. The Docker container will install the pxtextmining package and make predictions using your data. Starting up a fresh container instance can take up to 5 minutes, and predictions using the slow transformer models can take some time, up to 5 further minutes per 1000 comments. Once the predictions are complete, it will delete your data and save the predictions in blob storage.
109 | 
110 | Once you receive a 200 response, your results are available in JSON format. Please note that this will only be available once; once you have collected the data, it will be deleted for security reasons and your results URL will no longer be valid.
111 | 
112 | You can set up a loop to check if your results are ready every 5 minutes, as follows.
113 | 
114 | ```python
115 | # In Python
116 | import time
117 | 
118 | while True:
119 |     results_response = requests.get(results_url)
120 |     if results_response.status_code == 200:
121 |         final_labels = results_response.json()
122 |         break
123 |     else:
124 |         print('Not ready! Trying again in 300 seconds...')
125 |         time.sleep(300)
126 | 
127 | print('Predicted labels:')
128 | print(final_labels)
129 | ```
130 | 
131 | ```R
132 | # In R
133 | 
134 | while (TRUE) {
135 |   results_response <- GET(results_url)
136 |   if (results_response$status_code == 200) {
137 |     final_labels <- fromJSON(content(results_response, "text"))
138 |     break
139 |   } else {
140 |     cat("Not ready! Trying again in 300 seconds...\n")
141 |     Sys.sleep(300)
142 |   }
143 | }
144 | 
145 | cat("Predicted labels:\n")
146 | print(final_labels)
147 | ```
148 | 
--------------------------------------------------------------------------------
/docs/reference/Docker/docker_README.md:
--------------------------------------------------------------------------------
1 | # Using our Docker container
2 | 
3 | This Docker container contains the pxtextmining machine learning models trained as part of the [Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/).
4 | 
5 | To use this Docker container to predict your unlabelled text:
6 | 
7 | 1\. Set up your folders. Create a folder containing two other folders, data_in and data_out, as below.
8 | ```
9 | docker_data/
10 | ├─ data_in/
11 | ├─ data_out/
12 | 
13 | ```
14 | 
15 | 2\. Prepare your data. Save the data you wish to pass through the machine learning models as json, in the data_in folder. The data should be in the following format:
16 | 
17 | In Python, a `list` containing as many `dict`s as there are comments to be predicted. Each `dict` has three compulsory keys:
18 | 
19 | * `comment_id`: Unique ID associated with the comment, in `str` format. Each Comment ID per API call must be unique.
20 | * `comment_text`: Text to be classified, in `str` format.
21 | * `question_type`: The type of question asked to elicit the comment text. Questions are different from trust to trust, but they all fall into one of three categories:
22 |     * `what_good`: Any variation on the question "What was good about the service?", or "What did we do well?"
23 |     * `could_improve`: Any variation on the question "Please tell us about anything that we could have done better", or "How could we improve?"
24 |     * `nonspecific`: Any other type of nonspecific question, e.g. "Please can you tell us why you gave your answer?", or "What were you satisfied and/or dissatisfied with?".
25 | 
26 | ```python
27 | # In Python
28 | 
29 | text_data = [
30 |   { 'comment_id': '1', # The comment_id values in each dict must be unique.
31 |     'comment_text': 'This is the first comment. Nurse was great.',
32 |     'question_type': 'what_good' },
33 |   { 'comment_id': '2',
34 |     'comment_text': 'This is the second comment. The ward was freezing.',
35 |     'question_type': 'could_improve' },
36 |   { 'comment_id': '3',
37 |     'comment_text': '', # This comment is an empty string.
38 |     'question_type': 'nonspecific' }
39 | ]
40 | 
41 | ```
42 | 
43 | ```R
44 | # In R
45 | 
46 | library(jsonlite)
47 | 
48 | comment_id <- c("1", "2", "3")
49 | comment_text <- c(
50 |   "This is the first comment. Nurse was great.",
51 |   "This is the second comment. The ward was freezing.",
52 |   ""
53 | )
54 | question_type <- c("what_good", "could_improve", "nonspecific")
55 | df <- data.frame(comment_id, comment_text, question_type)
56 | text_data <- toJSON(df)
57 | ```
58 | 
59 | 3\. Save the JSON data in the data_in folder, as follows:
60 | 
61 | ```python
62 | # In Python
63 | import json
64 | json_data = json.dumps(text_data)
65 | with open("data_in/file_01.json", "w") as outfile:
66 |     outfile.write(json_data)
67 | ```
68 | 
69 | ```R
70 | # In R
71 | 
72 | # text_data is already a JSON string (from toJSON above), so write it directly
73 | write(text_data, file = "data_in/file_01.json")
74 | ```
75 | 
76 | 4\. Your file structure should now look like this:
77 | 
78 | ```
79 | docker_data/
80 | ├─ data_in/
81 | │ ├─ file_01.json
82 | ├─ data_out/
83 | ```
84 | 
85 | 5\. Mount the docker_data folder as the `data` volume for the Docker container and run the container. Pass the filename for the input JSON as the first argument. The following arguments are also available:
86 | 
87 | - `--local-storage` or `-l` flag for local storage (does not delete the files in data_in after completing predictions)
88 | - `--target` or `-t` to select the machine learning models used. Options are `m` for multilabel, `s` for sentiment, or `ms` for both. Defaults to `ms` if nothing is selected.
89 | 
90 | A sample command would be:
91 | `docker run --rm -it -v /docker_data:/data ghcr.io/the-strategy-unit/pxtextmining:latest file_01.json -l`
92 | 
93 | 6\. The predictions will be outputted as a json file in the data_out folder, with the same filename.
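The output can be read back like any other JSON file. An illustrative example, using the keys shown in the sample data_out file elsewhere in this repository:

```python
# In Python — read the predictions back (keys match the sample data_out file).
import json

with open("docker_data/data_out/file_01.json", "r") as f:
    predictions = json.load(f)

for pred in predictions:
    print(pred["comment_id"], pred.get("sentiment"), pred.get("labels"))
```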
After running successfully, the final folder structure should be:
94 | 
95 | ```
96 | docker_data/
97 | ├─ data_in/
98 | │ ├─ file_01.json
99 | ├─ data_out/
100 | │ ├─ file_01.json
101 | ```
102 | 
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_data_load_and_split.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_data_load_and_split
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_model_performance.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_model_performance
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_pipeline.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_pipeline
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_predict_unlabelled_text.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_predict_unlabelled_text
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_write_results.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_write_results
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/helpers/text_preprocessor.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.helpers.text_preprocessor
2 |     options:
3 |       show_source: true
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/pipelines/multilabel_pipeline.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.pipelines.multilabel_pipeline
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/pipelines/sentiment_pipeline.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.pipelines.sentiment_pipeline
2 | 
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: pxtextmining
2 | site_url: https://the-strategy-unit.github.io/pxtextmining/
3 | theme: readthedocs
4 | watch:
5 |   - pxtextmining
6 | 
7 | extra_css:
8 |   - main.css
9 | 
10 | plugins:
11 |   - search
12 |   - mkdocstrings:
13 |       handlers:
14 |         python:
15 |           options:
16 |             docstring_style: google
17 |             show_root_heading: true
18 |             show_root_toc_entry: false
19 |             show_root_full_path: false
20 |             show_source: false
21 |             show_if_no_docstring: true
22 |             heading_level: 4
23 |             members_order: source
24 | 
--------------------------------------------------------------------------------
/pxtextmining/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/pxtextmining/__init__.py
--------------------------------------------------------------------------------
/pxtextmining/factories/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/pxtextmining/factories/__init__.py
--------------------------------------------------------------------------------
/pxtextmining/factories/factory_data_load_and_split.py:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | from sklearn.model_selection import train_test_split
7 | from sklearn.preprocessing import OneHotEncoder
8 | from tensorflow.data import Dataset
9 | from transformers import AutoTokenizer
10 | 
11 | from pxtextmining.params import dataset, major_cat_dict, minor_cats, q_map
12 | 
13 | 
14 | def merge_categories(df, new_cat, cats_to_merge):
15 |     """Merges categories together in a dataset. Assumes all categories are in the
16 |     right format, one-hot encoded with int values.
17 | 
18 |     Args:
19 |         df (pd.DataFrame): DataFrame with labelled data.
20 |         new_cat (str): Name for new column of merged data.
21 |         cats_to_merge (list): List containing columns to be merged.
22 | 
23 |     Returns:
24 |         (pd.DataFrame): DataFrame with the new merged column
25 |     """
26 |     df[new_cat] = np.NaN
27 |     for cat in cats_to_merge:
28 |         print(f"Number of {cat} labels: {df[cat].sum()}")
29 |         df[new_cat] = df[new_cat].mask(df[cat] == 1, other=1)
30 |     print(f"Number of new label {new_cat}: {df[new_cat].sum()}")
31 |     df = df.drop(columns=cats_to_merge)
32 |     return df
33 | 
34 | 
35 | def bert_data_to_dataset(
36 |     X,
37 |     Y=None,
38 |     max_length=150,
39 |     model_name="distilbert-base-uncased",
40 |     additional_features=False,
41 | ):
42 |     """This function converts a dataframe into a format that can be utilised by a transformer model.
43 |     If Y is provided then it returns a TensorFlow dataset for training the model.
44 |     If Y is not provided, then it returns a dict which can be used to make predictions by an already trained model.
45 | 
46 |     Args:
47 |         X (pd.DataFrame): Data to be converted to text data. Text should be in column 'FFT answer',
48 |             FFT question should be in column 'FFT_q_standardised'.
49 |         Y (pd.DataFrame, optional): One-hot encoded targets. Defaults to None.
50 |         max_length (int, optional): Maximum length of text to be encoded. Defaults to 150.
51 |         model_name (str, optional): Type of transformer model. Defaults to 'distilbert-base-uncased'.
52 |         additional_features (bool, optional): Whether additional features are to be included, currently this is only question type
53 |             in 'FFT_q_standardised' column. Defaults to False.
54 | 
55 |     Returns:
56 |         (tf.data.Dataset OR dict): `tf.data.Dataset` if Y is provided, `dict` otherwise.
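    Example (illustrative call pattern; the variable names are assumed):

        train_dataset = bert_data_to_dataset(X_train, Y_train, additional_features=True)
        prediction_input = bert_data_to_dataset(X_new)  # dict, ready for a trained model's predict method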
57 |     """
58 |     tokenizer = AutoTokenizer.from_pretrained(model_name)
59 |     if isinstance(X, pd.DataFrame):
60 |         data_encoded = dict(
61 |             tokenizer(
62 |                 list(X["FFT answer"]),
63 |                 truncation=True,
64 |                 padding="max_length",
65 |                 max_length=max_length,
66 |                 return_tensors="tf",
67 |             )
68 |         )
69 |     elif isinstance(X, pd.Series):
70 |         data_encoded = dict(
71 |             tokenizer(
72 |                 list(X),
73 |                 truncation=True,
74 |                 padding="max_length",
75 |                 max_length=max_length,
76 |                 return_tensors="tf",
77 |             )
78 |         )
79 |     data_encoded.pop("attention_mask", None)
80 |     if additional_features is True:
81 |         data_encoded["input_cat"] = X["FFT_q_standardised"].map(
82 |             {"what_good": 0, "could_improve": 1, "nonspecific": 2}
83 |         )
84 |     if Y is not None:
85 |         data_encoded = Dataset.from_tensor_slices((data_encoded, Y))
86 |     return data_encoded
87 | 
88 | 
89 | def load_multilabel_data(filename, target="major_categories"):
90 |     """Function for loading the multilabel dataset, converting it from csv to pd.DataFrame. Conducts some basic preprocessing,
91 |     including standardisation of the question types, calculation of text length, and drops rows with no labels. Depending on
92 |     selected `target`, returned dataframe contains different columns.
93 | 
94 |     Args:
95 |         filename (str): Path to file containing multilabel data, in csv format
96 |         target (str, optional): Options are 'minor_categories', 'major_categories', or 'sentiment'. Defaults to 'major_categories'.
97 | 
98 |     Returns:
99 |         (pd.DataFrame): DataFrame containing the columns 'FFT categorical answer', 'FFT question', and 'FFT answer', along with the standardised question type, text length, and selected target columns.
100 |     """
101 |     print("Loading multilabel dataset...")
102 |     raw_data = pd.read_csv(
103 |         filename,
104 |         na_values=" ",
105 |     )
106 |     print(f"Shape of raw data is {raw_data.shape}")
107 |     raw_data.columns = raw_data.columns.str.strip()
108 |     raw_data = raw_data.set_index("Comment ID").copy()
109 |     features = ["FFT categorical answer", "FFT question", "FFT answer"]
110 |     # For now the labels are hardcoded, these are subject to change as framework is in progress
111 |     if target in ["minor_categories", "major_categories"]:
112 |         cols = minor_cats
113 |     elif target == "sentiment":
114 |         cols = ["Comment sentiment"]
115 |     # Sort out the features first
116 |     features_df = raw_data.loc[:, features].copy()
117 |     # Standardize FFT qs
118 |     features_df["FFT question"] = features_df["FFT question"].fillna("nonspecific")
119 |     features_df.loc[:, "FFT_q_standardised"] = (
120 |         features_df.loc[:, "FFT question"].map(q_map).copy()
121 |     )
122 |     if features_df["FFT_q_standardised"].count() != features_df.shape[0]:
123 |         raise ValueError(
124 |             f'Check q_map is correct. features_df.shape[0] is {features_df.shape[0]}. \n \
125 |             features_df["FFT_q_standardised"].count() is {features_df["FFT_q_standardised"].count()}. \n\n\
126 |             Questions are: {features_df["FFT question"].value_counts()}'
127 |         )
128 |     features_df.loc[:, "text_length"] = features_df.loc[:, "FFT answer"].apply(
129 |         lambda x: len([word for word in str(x).split(" ") if word != ""])
130 |     )
131 |     features_df = clean_empty_features(features_df)
132 |     # Sort out the targets
133 |     targets_df = raw_data.loc[:, cols].copy()
134 |     targets_df = targets_df.replace("1", 1)
135 |     targets_df = targets_df.fillna(value=0)
136 |     if target == "major_categories":
137 |         for maj, min_list in major_cat_dict.items():
138 |             targets_df = merge_categories(targets_df, maj, min_list)
139 |         cols = list(major_cat_dict.keys())
140 |     targets_df.loc[:, "num_labels"] = targets_df.loc[:, cols].sum(axis=1)
141 |     targets_df = targets_df[targets_df["num_labels"] != 0]
142 |     targets_df = targets_df.fillna(value=0)
143 |     # merge two together
144 |     combined_df = pd.merge(features_df, targets_df, left_index=True, right_index=True)
145 |     combined_df = combined_df.reset_index()
146 |     combined_df = combined_df.drop_duplicates()
147 |     combined_df = combined_df.set_index("Comment ID")
148 |     print(f"Shape of cleaned data is {combined_df.shape}")
149 |     return combined_df
150 | 
151 | 
152 | def clean_empty_features(text_dataframe):
153 |     """Replaces all whitespace-only values in a dataframe with np.NaN, then drops any rows containing NaN.
154 | 
155 |     Args:
156 |         text_dataframe (pd.DataFrame): DataFrame containing text data with labels.
157 | 
158 |     Returns:
159 |         (pd.DataFrame): DataFrame with whitespace-only values replaced by np.NaN and the affected rows dropped.
160 |     """
161 |     clean_dataframe = text_dataframe.replace(r"^\s*$", np.nan, regex=True)
162 |     clean_dataframe = clean_dataframe.dropna()
163 |     return clean_dataframe
164 | 
165 | 
166 | def onehot(df, col_to_onehot):
167 |     """Function to one-hot encode a specified column in a dataframe.
168 | 
169 |     Args:
170 |         df (pd.DataFrame): DataFrame containing data to be one-hot encoded
171 |         col_to_onehot (str): Name of the column to be one-hot encoded
172 | 
173 |     Returns:
174 |         (np.ndarray): One-hot encoded data
175 |     """
176 |     encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
177 |     col_encoded = encoder.fit_transform(df[[col_to_onehot]])
178 |     return col_encoded
179 | 
180 | 
181 | def process_data(df, target, preprocess_text=True, additional_features=False):
182 |     """Utilises remove_punc_and_nums and clean_empty_features functions to clean the text data and
183 |     drop any rows that are only whitespace after cleaning. Also fills one-hot encoded columns with
184 |     0s rather than NaNs so that Y target is not sparse.
185 | 
186 |     Args:
187 |         df (pd.DataFrame): DataFrame containing text data, any additional features, and targets
188 |         target (list): List of column names of targets
189 |         preprocess_text (bool, optional): Whether or not text is to be processed with remove_punc_and_nums. If utilising
190 |             an sklearn model then should be True. If utilising transformer-based BERT model then should be set to False.
191 |             Defaults to True.
192 |         additional_features (bool, optional): Whether or not 'question type' feature should be included. Defaults to False.
193 | 
194 |     Returns:
195 |         (tuple): Tuple containing the X features and the Y targets. The first element contains the X features (text, with or without question type depending on additional_features); the second contains the Y targets (one-hot encoded, or an np.array of sentiment classes when target is 'sentiment').
196 |     """
197 | 
198 |     if preprocess_text is True:
199 |         X = df["FFT answer"].astype(str).apply(remove_punc_and_nums)
200 |         X = clean_empty_features(X)
201 |         print(f"After preprocessing, shape of X is {X.shape}")
202 |     if preprocess_text is False:
203 |         X_temp = df["FFT answer"].astype(str).apply(remove_punc_and_nums)
204 |         X_temp = clean_empty_features(X_temp)
205 |         print(f"After preprocessing, shape of X is {X_temp.shape}")
206 |         indices = X_temp.index
207 |         X = df["FFT answer"].astype(str).filter(indices)
208 |     if additional_features is True:
209 |         X = pd.merge(X, df[["FFT_q_standardised"]], left_index=True, right_index=True)
210 |     X = X.reset_index()
211 |     X = X.drop_duplicates()
212 |     X = X.set_index("Comment ID")
213 |     if target == "sentiment":
214 |         Y = df["Comment sentiment"].astype(int) - 1
215 |     else:
216 |         Y = df[target].fillna(value=0)
217 |     Y = Y.loc[X.index]
218 |     Y = Y.reset_index()
219 |     Y = Y.drop_duplicates()
220 |     Y = Y.set_index("Comment ID")
221 |     if target == "sentiment":
222 |         Y = Y["Comment sentiment"]
223 |         Y = np.array(Y).astype(int)
224 |     return X, Y
225 | 
226 | 
227 | def process_and_split_data(
228 |     df,
229 |     target,
230 |     preprocess_text=True,
231 |     additional_features=False,
232 |     random_state=42,
233 | ):
234 |     """Combines the process_data and train_test_split functions into one function
235 | 
236 |     Args:
237 |         df (pd.DataFrame): DataFrame containing text data, any additional features, and targets
238 |         target (list): List of column names of targets
239 |         preprocess_text (bool, optional): Whether or not text is to be processed with remove_punc_and_nums. If utilising
240 |             an sklearn model then should be True. If utilising transformer-based BERT model then should be set to False.
241 |             Defaults to True.
242 |         additional_features (bool, optional): Whether or not 'question type' feature should be included. Defaults to False.
243 |         random_state (int, optional): Controls the shuffling applied to the data before applying the split. Enables reproducible output across multiple function calls. Defaults to 42.
244 | 
245 |     Returns:
246 |         (list): List containing train-test split of preprocessed X features and Y targets.
247 |     """
248 |     X, Y = process_data(
249 |         df,
250 |         target,
251 |         preprocess_text=preprocess_text,
252 |         additional_features=additional_features,
253 |     )
254 |     X_train, X_test, Y_train, Y_test = train_test_split(
255 |         X, Y, test_size=0.2, random_state=random_state
256 |     )
257 |     return X_train, X_test, Y_train, Y_test
258 | 
259 | 
260 | def remove_punc_and_nums(text):
261 |     """Function to conduct basic preprocessing of text, removing punctuation and numbers, converting
262 |     all text to lowercase, removing trailing whitespace.
263 | 
264 |     Args:
265 |         text (str): Str containing the text to be cleaned
266 | 
267 |     Returns:
268 |         (str): Cleaned text, all lowercased with no punctuation, numbers or trailing whitespace.
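    Example (illustrative):

        >>> remove_punc_and_nums("Staff were AMAZING!!! 10/10")
        'staff were amazing'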
269 | """ 270 | text = re.sub("\\n", " ", text) 271 | text = re.sub("\\r", " ", text) 272 | text = re.sub("’", "'", text) 273 | text = "".join(char for char in text if not char.isdigit()) 274 | punc_list = string.punctuation 275 | for punctuation in punc_list: 276 | if punctuation in [",", ".", "-"]: 277 | text = text.replace(punctuation, " ") 278 | else: 279 | text = text.replace(punctuation, "") 280 | text_split = [word for word in text.split(" ") if word != ""] 281 | text_lower = [] 282 | for word in text_split: 283 | text_lower.append(word.lower()) 284 | cleaned_sentence = " ".join(word for word in text_lower) 285 | cleaned_sentence = cleaned_sentence.strip() 286 | return cleaned_sentence 287 | 288 | 289 | if __name__ == "__main__": 290 | df = load_multilabel_data(dataset, target="major_categories") 291 | print(df.shape) 292 | print(df.head()) 293 | for i in df.columns: 294 | print(f"{i}: {df[i].dtype}") 295 | -------------------------------------------------------------------------------- /pxtextmining/factories/factory_model_performance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics 4 | from sklearn.base import is_classifier 5 | from sklearn.dummy import DummyClassifier 6 | from sklearn.metrics import confusion_matrix 7 | from tensorflow.keras.models import Model 8 | 9 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 10 | predict_multiclass_bert, 11 | ) 12 | 13 | 14 | def get_dummy_model(x_train, y_train): 15 | """Creates dummy model that randomly predicts labels, fitted on the training data. 16 | 17 | Args: 18 | x_train (pd.DataFrame): Input features. 19 | y_train (pd.DataFrame): Target values. 20 | 21 | Returns: 22 | (sklearn.dummy.DummyClassifier): Trained dummy classifier. 23 | """ 24 | model = DummyClassifier(strategy="uniform") 25 | model.fit(x_train, y_train) 26 | return model 27 | 28 | 29 | def get_multiclass_metrics( 30 | x_test, y_test, labels, random_state, model, additional_features, training_time=None 31 | ): 32 | """Creates a string detailing various performance metrics for a multiclass model, which can then be written to 33 | a text file. 34 | 35 | Args: 36 | x_test (pd.DataFrame): DataFrame containing test dataset features 37 | y_test (pd.DataFrame): DataFrame containing test dataset true target values 38 | labels (list): List containing the target labels 39 | random_state (int): Seed used to control the shuffling of the data, to enable reproducible results. 40 | model (tf.keras or sklearn model): Trained estimator. 41 | additional_features (bool, optional): Whether or not additional features (e.g. question type) have been included in training the model. Defaults to False. 42 | training_time (str, optional): Amount of time taken for model to train. Defaults to None. 43 | 44 | Raises: 45 | ValueError: Only models built with sklearn or tensorflow are allowed. 46 | 47 | Returns: 48 | (str): String containing the model architecture/hyperparameters, random state used for the train test split, and classification report. 
49 | """ 50 | metrics_string = "\n *****************" 51 | metrics_string += ( 52 | f"\n Random state seed for train test split is: {random_state} \n\n" 53 | ) 54 | # TF Keras models output probabilities with model.predict, whilst sklearn models output binary outcomes 55 | # Get them both to output the same (binary outcomes) and take max prob as label if no labels predicted at all 56 | if isinstance(model, Model) is True: 57 | stringlist = [] 58 | model.summary(print_fn=lambda x: stringlist.append(x)) 59 | model_summary = "\n".join(stringlist) 60 | metrics_string += f"\n{model_summary}\n" 61 | y_pred = predict_multiclass_bert( 62 | x_test, 63 | model, 64 | additional_features=additional_features, 65 | ) 66 | elif is_classifier(model) is True: 67 | metrics_string += f"\n{model}\n" 68 | y_pred = model.predict(x_test) 69 | else: 70 | raise ValueError("Model type not recognised") 71 | # Calculate various metrics 72 | metrics_string += f"\n\nTraining time: {training_time}\n" 73 | # Classification report 74 | metrics_string += "\n\n Classification report:\n" 75 | c_report_str = metrics.classification_report( 76 | y_test, y_pred, target_names=labels, zero_division=0 77 | ) 78 | metrics_string += c_report_str 79 | return metrics_string 80 | 81 | 82 | def get_multilabel_metrics( 83 | preds_df, 84 | y_test, 85 | labels, 86 | random_state, 87 | model, 88 | training_time=None, 89 | ): 90 | """Creates a string detailing various performance metrics for a multilabel model, which can then be written to 91 | a text file. 92 | 93 | Args: 94 | preds_df (pd.DataFrame): DataFrame containing model predictions 95 | y_test (pd.DataFrame): DataFrame containing test dataset true target values 96 | labels (list): List containing the target labels 97 | random_state (int): Seed used to control the shuffling of the data, to enable reproducible results. 98 | model (tf.keras or sklearn model): Trained estimator. 99 | training_time (str, optional): Amount of time taken for model to train. Defaults to None. 100 | 101 | Raises: 102 | ValueError: Only sklearn and tensorflow keras models allowed. 103 | 104 | Returns: 105 | (str): String containing the model architecture/hyperparameters, random state used for the train test split, and performance metrics including: exact accuracy, hamming loss, macro jaccard score, and classification report. 
106 | """ 107 | 108 | metrics_string = "\n *****************" 109 | metrics_string += ( 110 | f"\n Random state seed for train test split is: {random_state} \n\n" 111 | ) 112 | model_metrics = {} 113 | if isinstance(model, Model) is True: 114 | stringlist = [] 115 | model.summary(print_fn=lambda x: stringlist.append(x)) 116 | model_summary = "\n".join(stringlist) 117 | elif is_classifier(model) is True: 118 | model_summary = model 119 | else: 120 | raise ValueError("invalid model type") 121 | y_pred = np.array(preds_df[labels]).astype("int64") 122 | # Calculate various metrics 123 | model_metrics["exact_accuracy"] = metrics.accuracy_score(y_test, y_pred) 124 | model_metrics["hamming_loss"] = metrics.hamming_loss(y_test, y_pred) 125 | model_metrics["macro_jaccard_score"] = metrics.jaccard_score( 126 | y_test, y_pred, average="macro" 127 | ) 128 | y_probs = preds_df.filter(like="Probability", axis=1) 129 | model_metrics["macro_roc_auc"] = metrics.roc_auc_score( 130 | y_test, y_probs, multi_class="ovr" 131 | ) 132 | model_metrics[ 133 | "Label ranking average precision" 134 | ] = metrics.label_ranking_average_precision_score( 135 | y_test, 136 | y_probs, 137 | ) 138 | # Model summary 139 | metrics_string += f"\n{model_summary}\n" 140 | metrics_string += f"\n\nTraining time: {training_time}\n" 141 | for k, v in model_metrics.items(): 142 | metrics_string += f"\n{k}: {v}" 143 | # Classification report 144 | metrics_string += "\n\n Classification report:\n" 145 | c_report_str = metrics.classification_report( 146 | y_test, y_pred, target_names=labels, zero_division=0 147 | ) 148 | metrics_string += c_report_str 149 | return metrics_string 150 | 151 | 152 | def get_accuracy_per_class(y_test, pred): 153 | """Function to produce accuracy per class for the predicted categories, compared against real values. 154 | 155 | Args: 156 | y_test (pd.Series): Test data (real target values). 157 | pred (pd.Series): Predicted target values. 158 | 159 | Returns: 160 | (pd.DataFrame): The computed accuracy per class metrics for the model. 161 | 162 | """ 163 | cm = confusion_matrix(y_test, pred) 164 | accuracy_per_class = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] 165 | accuracy_per_class = pd.DataFrame(accuracy_per_class.diagonal()) 166 | accuracy_per_class.columns = ["accuracy"] 167 | unique, frequency = np.unique(y_test, return_counts=True) 168 | accuracy_per_class["class"], accuracy_per_class["counts"] = unique, frequency 169 | accuracy_per_class = accuracy_per_class[["class", "counts", "accuracy"]] 170 | return accuracy_per_class 171 | 172 | 173 | def parse_metrics_file(metrics_file, labels): 174 | """Reads performance metrics files that are written by `factory_write_results.write_multilabel_models_and_metrics`. 175 | Creates a pd.DataFrame with the precision, recall, f1_score, and support for each label, which can be filtered and sorted more easily. 176 | 177 | Args: 178 | metrics_file (str): Path to the metrics file to be parsed. 179 | labels (list): List of the target labels used in the metrics file. 180 | 181 | Returns: 182 | (pd.DataFrame): DataFrame containing the precision, recall, f1_score, and support for each label, as detailed in the performance metrics file. 
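    Example (illustrative; the file path is an assumption, matching the naming used by
        write_multilabel_models_and_metrics, and minor_cats comes from pxtextmining.params):

        metrics_df = parse_metrics_file("test_multilabel/model_0.txt", labels=minor_cats)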
183 |     """
184 |     with open(metrics_file, "r") as file:
185 |         content = file.readlines()
186 |     for i, line in enumerate(content):
187 |         if line.strip().startswith(labels[0][:10]):
188 |             startline = i
189 |         if line.strip().startswith(labels[-1][:10]):
190 |             endline = i + 1
191 |     lines = [x.strip() for x in content[startline:endline]]
192 |     metrics_dict = {
193 |         "label": [],
194 |         "precision": [],
195 |         "recall": [],
196 |         "f1_score": [],
197 |         "support (label count in test data)": [],
198 |     }
199 |     for each in lines:
200 |         splitted = each.rsplit(None, 4)  # labels can contain spaces; the last 4 fields are the metrics
201 |         metrics_dict["label"].append(splitted[0].strip())
202 |         metrics_dict["precision"].append(splitted[1].strip())
203 |         metrics_dict["recall"].append(splitted[2].strip())
204 |         metrics_dict["f1_score"].append(splitted[3].strip())
205 |         metrics_dict["support (label count in test data)"].append(splitted[4].strip())
206 |     metrics_df = pd.DataFrame.from_dict(metrics_dict)
207 |     return metrics_df
208 | 
209 | 
210 | def get_y_score(probs):
211 |     """Converts probabilities into format (n_samples, n_classes) so they can be passed into sklearn roc_auc_score function
212 | 
213 |     Args:
214 |         probs (np.ndarray): Probability estimates outputted by model
215 | 
216 |     Returns:
217 |         (np.ndarray): Probability estimates in format (n_samples, n_classes)
218 |     """
219 |     if probs.ndim == 3:
220 |         score = np.transpose([pred[:, 1] for pred in probs])
221 |     elif probs.ndim == 2:
222 |         score = probs
223 |     return score
224 | 
225 | 
226 | def additional_analysis(preds_df, y_true, labels, custom_threshold_dict=None):
227 |     """For given predictions, returns a dataframe containing, per label: the average precision score, and the counts of True Positives, True Negatives, False Positives, and False Negatives.
228 | 
229 |     Args:
230 |         preds_df (pd.DataFrame): Dataframe containing predicted labels in one-hot encoded format
231 |         y_true (np.array): One-hot encoded real Y values
232 |         labels (List): List of the target labels
233 |         custom_threshold_dict (dict, optional): Custom classification thresholds per label. Defaults to None.
234 |     Returns:
235 |         (pd.DataFrame): DataFrame containing, per label: the average precision score, the counts of True Positives, True Negatives, False Positives and False Negatives, and any custom thresholds used.
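    Example (illustrative; preds_df and y_true are assumed, minor_cats comes from pxtextmining.params):

        analysis_df = additional_analysis(preds_df, y_true, minor_cats)
        print(analysis_df["average_precision_score"].sort_values(ascending=False))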
236 | """ 237 | y_score = np.array(preds_df.filter(like="Probability", axis=1)) 238 | cm = metrics.multilabel_confusion_matrix(y_true, np.array(preds_df[labels])) 239 | cm_dict = {} 240 | average_precision = {} 241 | for i, label in enumerate(labels): 242 | cm_meaning = {} 243 | tn, fp = cm[i][0] 244 | fn, tp = cm[i][1] 245 | cm_meaning["True Negative"] = tn 246 | cm_meaning["False Negative"] = fn 247 | cm_meaning["True Positive"] = tp 248 | cm_meaning["False Positive"] = fp 249 | cm_dict[label] = cm_meaning 250 | average_precision[label] = metrics.average_precision_score( 251 | y_true[:, i], y_score[:, i] 252 | ) 253 | df = pd.DataFrame.from_dict(cm_dict, orient="index") 254 | average_precision = pd.Series(average_precision) 255 | df["average_precision_score"] = average_precision 256 | if custom_threshold_dict is not None: 257 | df["custom_threshold"] = custom_threshold_dict 258 | return df 259 | -------------------------------------------------------------------------------- /pxtextmining/factories/factory_write_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from tensorflow.keras import Model, Sequential 7 | 8 | from pxtextmining.factories.factory_model_performance import ( 9 | additional_analysis, 10 | parse_metrics_file, 11 | ) 12 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 13 | get_labels, 14 | get_probabilities, 15 | ) 16 | 17 | 18 | def write_multilabel_models_and_metrics(models, model_metrics, path): 19 | """Saves models and their associated performance metrics into a specified folder 20 | 21 | Args: 22 | models (list): List containing the trained tf.keras or sklearn models to be saved. 23 | model_metrics (list): List containing the model metrics in `str` format 24 | path (str): Path where model is to be saved. 25 | """ 26 | for i in range(len(models)): 27 | model_name = f"model_{i}" 28 | if not os.path.exists(path): 29 | os.makedirs(path) 30 | fullpath = os.path.join(path, model_name) 31 | if isinstance(models[i], (Sequential, Model)): 32 | models[i].save(fullpath) 33 | else: 34 | modelpath = os.path.join(path, model_name + ".sav") 35 | pickle.dump(models[i], open(modelpath, "wb")) 36 | # Write performance metrics file 37 | txtpath = os.path.join(path, model_name + ".txt") 38 | with open(txtpath, "w") as file: 39 | file.write(model_metrics[i]) 40 | print(f"{len(models)} models have been written to {path}") 41 | 42 | 43 | def write_model_preds(x, y_true, preds_df, labels, path="labels.xlsx", return_df=False): 44 | """Writes an Excel file to enable easier analysis of model outputs using the test set. Columns of the Excel file are: comment_id, actual_labels, predicted_labels, actual_label_probs, and predicted_label_probs. 45 | 46 | Args: 47 | x (pd.Series OR pd.DataFrame): Text data used for predictions 48 | y_true (np.array): Onehot encoded targets 49 | preds_df (pd.DataFrame): DataFrame containing predictions, predicted probabilities, and labels. Should be produced by predict_multilabel_sklearn or predict_multilabel_bert 50 | labels (list): List containing target labels. 51 | path (str, optional): Filename and path for file to be saved. Defaults to "labels.xlsx". 52 | return_df (bool, optional): Whether or not the processed data should be returned as a DataFrame. Defaults to False. 
53 | 54 | Returns: 55 | (pd.DataFrame): DataFrame containing comment_id, comment text, actual_labels, predicted_labels, actual_label_probs, and predicted_label_probs. 56 | """ 57 | assert len(x) == len(y_true) == len(preds_df) 58 | actual_labels = pd.DataFrame(y_true, columns=labels).apply( 59 | get_labels, args=(labels,), axis=1 60 | ) 61 | actual_labels.name = "actual_labels" 62 | predicted_labels = preds_df["labels"] 63 | predicted_labels.name = "predicted_labels" 64 | df = x.reset_index() 65 | probabilities = np.array(preds_df.filter(like="Probability", axis=1)) 66 | probs_actual = get_probabilities(actual_labels, labels, probabilities) 67 | probs_predicted = get_probabilities(predicted_labels, labels, probabilities) 68 | df = df.merge(actual_labels, left_index=True, right_index=True) 69 | df = df.merge(predicted_labels, left_on="Comment ID", right_index=True) 70 | df = df.merge(probs_actual, left_index=True, right_index=True) 71 | df = df.merge(probs_predicted, left_on="Comment ID", right_index=True) 72 | # Deal with any rogue characters; assign the result, as applymap is not in-place 73 | df = df.applymap( 74 | lambda x: x.encode("unicode_escape").decode("utf-8") 75 | if isinstance(x, str) 76 | else x 77 | ) 78 | df.to_excel(path, index=False) 79 | if return_df is True: 80 | return df 81 | 82 | 83 | def write_model_analysis( 84 | model_name, 85 | labels, 86 | dataset, 87 | path, 88 | preds_df=None, 89 | y_true=None, 90 | custom_threshold_dict=None, 91 | ): 92 | """Writes an Excel file with the performance metrics of each label, as well as the counts of samples for each label. 93 | 94 | Args: 95 | model_name (str): Model name used in the performance metrics file 96 | labels (list): List of labels for the categories to be predicted. 97 | dataset (pd.DataFrame): Original dataset before train test split 98 | path (str): Filepath where model and performance metrics file are saved. 99 | preds_df (pd.DataFrame, optional): DataFrame containing the model's predictions, used for additional analysis. Defaults to None. 100 | y_true (np.array, optional): One-hot encoded real Y values, used for additional analysis. Defaults to None. 101 | custom_threshold_dict (dict, optional): Custom classification threshold for each label, included in the analysis if given. Defaults to None. 102 | """ 103 | metrics_df = parse_metrics_file(f"{path}/{model_name}.txt", labels=labels) 104 | label_counts = pd.DataFrame(dataset[labels].sum()) 105 | label_counts = label_counts.reset_index() 106 | label_counts = label_counts.rename( 107 | columns={"index": "label", 0: "label_count_in_full_dataset"} 108 | ) 109 | metrics_df = metrics_df.merge(label_counts, on="label").set_index("label") 110 | if preds_df is not None and y_true is not None: 111 | more_metrics = additional_analysis( 112 | preds_df, y_true, labels, custom_threshold_dict 113 | ) 114 | metrics_df = pd.concat([metrics_df, more_metrics], axis=1) 115 | metrics_df.to_excel(f"{path}/{model_name}_perf.xlsx", index=True) 116 | -------------------------------------------------------------------------------- /pxtextmining/helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/pxtextmining/helpers/__init__.py -------------------------------------------------------------------------------- /pxtextmining/helpers/text_preprocessor.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.preprocessing.sequence import pad_sequences 2 | from tensorflow.keras.preprocessing.text import Tokenizer 3 | 4 | 5 | def tf_preprocessing(X, max_sentence_length=150): 6 | """Conducts preprocessing with tensorflow tokenizer which vectorizes text and standardizes length. 7 | 8 | Args: 9 | X (pd.Series): Series containing the text to be processed 10 | max_sentence_length (int, optional): Maximum number of words. Defaults to 150. 11 | 12 | Returns: 13 | (tuple): Tuple containing `np.array` of padded, tokenized, vectorized texts, and `int` showing number of unique words in vocabulary. 14 | """ 15 | tk = Tokenizer() 16 | tk.fit_on_texts(X) 17 | vocab_size = len(tk.word_index) 18 | print(f"There are {vocab_size} different words in your corpus") 19 | X_token = tk.texts_to_sequences(X) 20 | ### Pad the inputs 21 | X_pad = pad_sequences(X_token, dtype="float32", padding="post", maxlen=max_sentence_length) 22 | return X_pad, vocab_size 23 |
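# Illustrative usage sketch (comment only, not part of the original module):
#
#     import pandas as pd
#     X = pd.Series(["the staff were great", "parking was difficult"])
#     X_pad, vocab_size = tf_preprocessing(X, max_sentence_length=10)
#     # X_pad has shape (2, 10); vocab_size counts the 7 unique words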
-------------------------------------------------------------------------------- /pxtextmining/params.py: -------------------------------------------------------------------------------- 1 | dataset = "datasets/hidden/v7_230925.csv" 2 | 3 | random_state = 42 4 | 5 | model_name = "distilbert-base-uncased" 6 | 7 | q_map = { 8 | "Please tell us why": "nonspecific", 9 | "Please tells us why you gave this answer?": "nonspecific", 10 | "FFT Why?": "nonspecific", 11 | "What was good?": "what_good", 12 | "Is there anything we could have done better?": "could_improve", 13 | "How could we improve?": "could_improve", 14 | "What could we do better?": "could_improve", 15 | "Please can you tell us why you gave your answer and what we could have done better?": "nonspecific", 16 | "Please describe any things about the 111 service that\r\nyou were particularly satisfied and/or dissatisfied with": "nonspecific", 17 | "Please describe any things about the 111 service that \nyou were particularly satisfied and/or dissatisfied with": "nonspecific", 18 | "Please describe any things about the 111 service that\nyou were particularly satisfied and/or dissatisfied with": "nonspecific", 19 | "Nonspecific": "nonspecific", 20 | "nonspecific": "nonspecific", 21 | } 22 | 23 | # v7 24 | major_cat_dict = { 25 | "General": [ 26 | "Positive experience & gratitude", 27 | "Organisation & efficiency", 28 | "Funding & use of financial resources", 29 | "Feeling safe", 30 | "Labelling not possible", 31 | ], 32 | "Staff": [ 33 | "Staff manner & personal attributes", 34 | "Competence & training", 35 | "Staffing levels & responsiveness", 36 | ], 37 | "Access to medical care & support": [ 38 | "Contacting services", 39 | "Appointment arrangements", 40 | "Appointment method", 41 | "Timeliness of care", 42 | ], 43 | "Communication & involvement": [ 44 | "Unspecified communication", 45 | "Staff listening, understanding & involving patients", 46 | "Information directly from staff during care", 47 | "Information provision & guidance", 48 | "Being kept informed, clarity & consistency of information", 49 | "Interaction with family/ carers", 50 | ], 51 | "Mental Health specifics": ["Mental Health Act"], 52 | "Patient journey & service coordination": ["Continuity of care", "Discharge"], 53 | "Medication & pain": ["Supplying & understanding medication", "Pain management"], 54 | "Activities": ["Activities & access to fresh air", "Electronic entertainment"], 55 | "Environment, equipment & catering": [ 56 | "Cleanliness, tidiness & infection control", 57 | "Sensory experience", 58 | "Environment, facilities & equipment", 59 | "Food & drink provision & facilities", 60 | ], 61 | "Service location, travel & transport": [ 62 | "Service location", 63 | "Transport to/ from services", 64 | "Parking", 65 | ], 66 | } 67 | 68 | major_cats = list(major_cat_dict.keys()) 69 | 70 | # v7 20230925 71 | minor_cats = [ 72 | "Organisation & efficiency", 73 | "Funding & use of financial resources", 74 | "Staff manner & personal attributes", 75
| "Competence & training", 76 | "Unspecified communication", 77 | "Staff listening, understanding & involving patients", 78 | "Information directly from staff during care", 79 | "Information provision & guidance", 80 | "Being kept informed, clarity & consistency of information", 81 | "Contacting services", 82 | "Appointment arrangements", 83 | "Appointment method", 84 | "Timeliness of care", 85 | "Pain management", 86 | "Discharge", 87 | "Cleanliness, tidiness & infection control", 88 | "Service location", 89 | "Transport to/ from services", 90 | "Parking", 91 | "Electronic entertainment", 92 | "Feeling safe", 93 | "Mental Health Act", 94 | "Labelling not possible", 95 | "Supplying & understanding medication", 96 | "Activities & access to fresh air", 97 | "Food & drink provision & facilities", 98 | "Sensory experience", 99 | "Interaction with family/ carers", 100 | "Positive experience & gratitude", 101 | "Continuity of care", 102 | "Environment, facilities & equipment", 103 | "Staffing levels & responsiveness", 104 | ] 105 | 106 | sentiment_dict = { 107 | 1: "very positive", 108 | 2: "positive", 109 | 3: "neutral", 110 | 4: "negative", 111 | 5: "very negative", 112 | } 113 | 114 | 115 | # Note that some of these categories no longer exist since v7 of the framework 116 | rules_dict = { 117 | "Care plans": [ 118 | "plan", 119 | "planning", 120 | "plans", 121 | "treatment", 122 | "care", 123 | "future", 124 | "forward", 125 | "forwards", 126 | "action", 127 | ], 128 | "Patient appearance & grooming": [ 129 | "basin", 130 | "bowl", 131 | "brush", 132 | "clothes", 133 | "comb", 134 | "dressed", 135 | "gown", 136 | "hair", 137 | "hairbrush", 138 | "mirror", 139 | "modesty", 140 | "hygien", 141 | "razor", 142 | "shampoo", 143 | "shower", 144 | "sink", 145 | "wear", 146 | "wash", 147 | ], 148 | "Equality, Diversity & Inclusion": [ 149 | "accessib", 150 | "adjustment", 151 | "adhd", 152 | "age", 153 | "autis", 154 | "cultur", 155 | "deaf", 156 | "disab", 157 | "wheelchair", 158 | "discriminat", 159 | "gender", 160 | "hearing", 161 | "language", 162 | "blind", 163 | "mobility", 164 | "race", 165 | "racis", 166 | "religio", 167 | "sexis", 168 | "trans ", 169 | "misgender", 170 | ], 171 | "Patient records": [ 172 | "accurate", 173 | "computer", 174 | "confidential", 175 | "data", 176 | "identifiable", 177 | "notes", 178 | "paperwork", 179 | "papers", 180 | "details", 181 | "record", 182 | "system", 183 | "updated", 184 | "app", 185 | ], 186 | # "Admission": [ 187 | # "admission", 188 | # "admit", 189 | # ], Model already good at picking this up where the words admission/admit are in the text 190 | "Referals & continuity of care": [ 191 | "refer", 192 | "same", 193 | "different", 194 | "continu", 195 | "transfer", 196 | "pass", 197 | "between", 198 | ], 199 | "Staff continuity": [ 200 | "same", 201 | "different", 202 | "retire", 203 | "handover", 204 | "relationship", 205 | "communication", 206 | "change", 207 | "transition", 208 | "passed", 209 | ], 210 | "Diagnosis & triage": [ 211 | "assess", 212 | "diagnos", 213 | "question", 214 | "scan", 215 | "test", 216 | "triage", 217 | "wrong", 218 | "figure", 219 | "identif", 220 | "call", 221 | ], 222 | "Mental Health Act": [ 223 | "leave", 224 | "leav", 225 | "allowed", 226 | "detain", 227 | "prisoner", 228 | "release", 229 | "restrict", 230 | "seclu", 231 | "section", 232 | ], 233 | "Interaction with family/ carers": [ 234 | "brother", 235 | "carer", 236 | "child", 237 | "dad", 238 | "father", 239 | "husband", 240 | "partner", 241 | "famil", 242 | "mam", 
243 | "mum", 244 | "wife", 245 | "mother", 246 | "parent", 247 | "relative", 248 | "visit", 249 | "sister", 250 | ], 251 | "Service location": [ 252 | "access", 253 | "direct", 254 | "away", 255 | "far", 256 | "distance", 257 | "local", 258 | "locat", 259 | "lost", 260 | "map", 261 | "miles", 262 | "place", 263 | "sign", 264 | "go to", 265 | "travel", 266 | "where", 267 | "get to", 268 | ], 269 | "Negative experience & dissatisfaction": [ 270 | "rubbish", 271 | "awful", 272 | "poor", 273 | "bad", 274 | "terrible", 275 | "unacceptable", 276 | ], 277 | } 278 | 279 | probs_dict = { 280 | "Negative experience & dissatisfaction": 0.4, 281 | "Diagnosis & triage": 0.4, 282 | "Equality, Diversity & Inclusion": 0.4, 283 | "Referals & continuity of care": 0.4, 284 | "Staff continuity": 0.4, 285 | } 286 | -------------------------------------------------------------------------------- /pxtextmining/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/pxtextmining/pipelines/__init__.py -------------------------------------------------------------------------------- /pxtextmining/pipelines/multilabel_pipeline.py: -------------------------------------------------------------------------------- 1 | from warnings import simplefilter 2 | 3 | from sklearn.exceptions import ConvergenceWarning 4 | from sklearn.model_selection import train_test_split 5 | 6 | from pxtextmining.factories.factory_data_load_and_split import ( 7 | bert_data_to_dataset, 8 | load_multilabel_data, 9 | process_and_split_data, 10 | ) 11 | from pxtextmining.factories.factory_model_performance import get_multilabel_metrics 12 | from pxtextmining.factories.factory_pipeline import ( 13 | calculating_class_weights, 14 | create_and_train_svc_model, 15 | create_bert_model, 16 | create_bert_model_additional_features, 17 | search_sklearn_pipelines, 18 | train_bert_model, 19 | ) 20 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 21 | get_thresholds, 22 | predict_multilabel_bert, 23 | predict_multilabel_sklearn, 24 | ) 25 | from pxtextmining.factories.factory_write_results import ( 26 | write_model_analysis, 27 | write_model_preds, 28 | write_multilabel_models_and_metrics, 29 | ) 30 | from pxtextmining.params import dataset, major_cats, minor_cats, random_state 31 | 32 | simplefilter("ignore", category=ConvergenceWarning) 33 | 34 | 35 | def run_sklearn_pipeline( 36 | additional_features=False, 37 | target=major_cats, 38 | models_to_try=("mnb", "knn", "svm", "rfc"), 39 | path="test_multilabel", 40 | include_analysis=False, 41 | custom_threshold=False, 42 | ): 43 | """Runs all the functions required to load multilabel data, preprocess it, and split it into training and test sets. 44 | Creates sklearn pipelines and hyperparameters to search, using specified estimators. 45 | For each estimator type selected, performs a randomized search across the hyperparameters to identify the parameters providing the best 46 | results on the holdout data within the randomized search. 47 | Evaluates the performance of the refitted estimator with the best hyperparameters on the test set, and saves the model 48 | and the performance metrics to a specified folder. 49 | 50 | Args: 51 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to False. 
52 | target (list, optional): The target labels, which should be columns in the dataset DataFrame. Defaults to major_cats. 53 | models_to_try (list, optional): List of the estimators to try. Defaults to ["mnb", "knn", "svm", "rfc"]. Permitted values are "mnb" (Multinomial Naive Bayes), "knn" (K Nearest Neighbours), "svm" (Support Vector Classifier), "rfc" (Random Forest Classifier), or "xgb" (XGBoost). 54 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel'. 55 | include_analysis (bool, optional): Whether or not additional Excel files showing model performance and predicted labels are generated. Defaults to False. 56 | custom_threshold (bool, optional): Whether or not a custom classification threshold maximising the F1 score is to be calculated. Defaults to False. 57 | """ 58 | # random_state = random.randint(1, 999) 59 | if target == major_cats: 60 | target_name = "major_categories" 61 | if target == minor_cats: 62 | target_name = "minor_categories" 63 | df = load_multilabel_data(filename=dataset, target=target_name) 64 | if custom_threshold is True: 65 | X_train_val, X_test, Y_train_val, Y_test = process_and_split_data( 66 | df, 67 | target=target, 68 | preprocess_text=False, 69 | additional_features=additional_features, 70 | random_state=random_state, 71 | ) 72 | X_train, X_val, Y_train, Y_val = train_test_split( 73 | X_train_val, Y_train_val, test_size=0.2, random_state=random_state 74 | ) 75 | else: 76 | X_train, X_test, Y_train, Y_test = process_and_split_data( 77 | df, 78 | target=target, 79 | additional_features=additional_features, 80 | random_state=random_state, 81 | ) 82 | models, training_times = search_sklearn_pipelines( 83 | X_train, 84 | Y_train, 85 | models_to_try=models_to_try, 86 | additional_features=additional_features, 87 | ) 88 | model_metrics = [] 89 | threshold_dicts = [] 90 | preds = [] 91 | for i in range(len(models)): 92 | m = models[i] 93 | t = training_times[i] 94 | if custom_threshold is True: 95 | val_probs = m.predict_proba(X_val) 96 | custom_threshold_dict = get_thresholds(Y_val, val_probs, labels=target) 97 | else: 98 | custom_threshold_dict = None 99 | threshold_dicts.append(custom_threshold_dict) 100 | preds_df = predict_multilabel_sklearn( 101 | X_test, 102 | m, 103 | labels=target, 104 | additional_features=additional_features, 105 | label_fix=True, 106 | custom_threshold_dict=custom_threshold_dict, 107 | ) 108 | preds.append(preds_df) 109 | model_metrics.append( 110 | get_multilabel_metrics( 111 | preds_df, 112 | Y_test, 113 | random_state=random_state, 114 | labels=target, 115 | model=m, 116 | training_time=t, 117 | ) 118 | ) 119 | write_multilabel_models_and_metrics(models, model_metrics, path=path) 120 | if include_analysis is True: 121 | for i in range(len(models)): 122 | model_name = f"model_{i}" 123 | write_model_preds( 124 | X_test, 125 | Y_test, 126 | preds[i], 127 | labels=target, 128 | path=f"{path}/{model_name}_labels.xlsx", 129 | ) 130 | write_model_analysis( 131 | model_name, 132 | labels=target, 133 | dataset=df, 134 | path=path, 135 | preds_df=preds[i], 136 | y_true=Y_test, 137 | custom_threshold_dict=threshold_dicts[i], 138 | ) 139 | print("Pipeline complete") 140 | 141 |
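# Illustrative usage sketch (comment only, not part of the library code):
# a typical invocation of run_sklearn_pipeline on the minor categories,
# assuming the labelled dataset configured in pxtextmining.params is
# available locally. The output path here is hypothetical.
#
#     run_sklearn_pipeline(
#         additional_features=True,
#         target=minor_cats,
#         models_to_try=["svm", "xgb"],
#         path="test_multilabel/example_run",
#         include_analysis=True,
#     )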
142 | def run_svc_pipeline( 143 | additional_features=False, 144 | target=major_cats, 145 | path="test_multilabel", 146 | include_analysis=False, 147 | custom_threshold=False, 148 | ): 149 | """Runs all the functions required to load multilabel data, preprocess it, and split it into training and test sets. 150 | Creates sklearn pipeline using a MultiOutputClassifier and Support Vector Classifier estimator, with specific hyperparameters. 151 | Fits the pipeline on the training data. 152 | Evaluates the performance of the trained pipeline on the test set, and saves the model and the performance metrics to a specified folder, together with optional further analysis in the form of Excel files. 153 | 154 | Args: 155 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to False. 156 | target (list, optional): The target labels, which should be columns in the dataset DataFrame. Defaults to major_cats. 157 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel'. 158 | include_analysis (bool, optional): Whether or not to create Excel files including further analysis of the model's performance. Defaults to False. If True, writes two Excel files to the specified folder, one containing the labels and the performance metrics for each label, and one containing the predicted labels and the actual labels for the test set, with the model's probabilities for both. 159 | custom_threshold (bool, optional): Whether or not a custom classification threshold maximising the F1 score is to be calculated. Defaults to False. 160 | 161 | """ 162 | # random_state = random.randint(1, 999) 163 | if target == major_cats: 164 | target_name = "major_categories" 165 | if target == minor_cats: 166 | target_name = "minor_categories" 167 | df = load_multilabel_data(filename=dataset, target=target_name) 168 | if custom_threshold is True: 169 | X_train_val, X_test, Y_train_val, Y_test = process_and_split_data( 170 | df, 171 | target=target, 172 | preprocess_text=False, 173 | additional_features=additional_features, 174 | random_state=random_state, 175 | ) 176 | X_train, X_val, Y_train, Y_val = train_test_split( 177 | X_train_val, Y_train_val, test_size=0.2, random_state=random_state 178 | ) 179 | else: 180 | X_train, X_test, Y_train, Y_test = process_and_split_data( 181 | df, 182 | target=target, 183 | additional_features=additional_features, 184 | random_state=random_state, 185 | ) 186 | model, training_time = create_and_train_svc_model( 187 | X_train, Y_train, additional_features=additional_features 188 | ) 189 | if custom_threshold is True: 190 | val_probs = model.predict_proba(X_val) 191 | custom_threshold_dict = get_thresholds(Y_val, val_probs, labels=target) 192 | else: 193 | custom_threshold_dict = None 194 | preds_df = predict_multilabel_sklearn( 195 | X_test, 196 | model=model, 197 | labels=target, 198 | additional_features=additional_features, 199 | label_fix=True, 200 | custom_threshold_dict=custom_threshold_dict, 201 | ) 202 | model_metrics = get_multilabel_metrics( 203 | preds_df, 204 | Y_test, 205 | labels=target, 206 | random_state=random_state, 207 | model=model, 208 | training_time=training_time, 209 | ) 210 | write_multilabel_models_and_metrics([model], [model_metrics], path=path) 211 | if include_analysis is True: 212 | write_model_preds( 213 | X_test, 214 | Y_test, 215 | preds_df, 216 | labels=target, 217 | path=f"{path}/labels.xlsx", 218 | ) 219 | write_model_analysis( 220 | model_name="model_0", 221 | labels=target, 222 | dataset=df, 223 | path=path, 224 | preds_df=preds_df, 225 | y_true=Y_test, 226 | custom_threshold_dict=custom_threshold_dict, 227 | ) 228 | print("Pipeline complete!") 229 | 230 | 231 | def
run_bert_pipeline( 232 | additional_features=False, 233 | path="test_multilabel/bert", 234 | target=major_cats, 235 | include_analysis=False, 236 | custom_threshold=False, 237 | ): 238 | """Runs all the functions required to load multilabel data, preprocess it, and split it into training, test and validation sets. 239 | Creates tf.keras Transformer model with additional layers specific to the classification task, and trains it on the train set. 240 | Evaluates the performance of the trained model on the test set, and saves the model 241 | and the performance metrics to a specified folder. 242 | 243 | Args: 244 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to False. 245 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel/bert'. 246 | target (list, optional): The target labels, which should be columns in the dataset DataFrame. Defaults to major_cats. 247 | include_analysis (bool, optional): Whether or not to create Excel files including further analysis of the model's performance. Defaults to False. If True, writes two Excel files to the specified folder, one containing the labels and the performance metrics for each label, and one containing the predicted labels and the actual labels for the test set, with the model's probabilities for both. 248 | custom_threshold (bool, optional): Whether or not a custom classification threshold maximising the F1 score is to be calculated. Defaults to False. 249 | 250 | """ 251 | # random_state = random.randint(1, 999) 252 | print(f"random_state is: {random_state}") 253 | if target == major_cats: 254 | target_name = "major_categories" 255 | if target == minor_cats: 256 | target_name = "minor_categories" 257 | df = load_multilabel_data(filename=dataset, target=target_name) 258 | X_train_val, X_test, Y_train_val, Y_test = process_and_split_data( 259 | df, 260 | target=target, 261 | preprocess_text=False, 262 | additional_features=additional_features, 263 | random_state=random_state, 264 | ) 265 | X_train, X_val, Y_train, Y_val = train_test_split( 266 | X_train_val, Y_train_val, test_size=0.2, random_state=random_state 267 | ) 268 | train_dataset = bert_data_to_dataset( 269 | X_train, Y_train, additional_features=additional_features 270 | ) 271 | val_dataset = bert_data_to_dataset( 272 | X_val, Y_val, additional_features=additional_features 273 | ) 274 | class_weights_dict = calculating_class_weights(Y_train_val) 275 | if additional_features is True: 276 | model = create_bert_model_additional_features(Y_train) 277 | else: 278 | model = create_bert_model(Y_train) 279 | model_trained, training_time = train_bert_model( 280 | train_dataset, 281 | val_dataset, 282 | model, 283 | class_weights_dict=class_weights_dict, 284 | epochs=25, 285 | ) 286 | if custom_threshold is True: 287 | val = bert_data_to_dataset(X_val, additional_features=additional_features) 288 | val_probs = model_trained.predict(val) 289 | custom_threshold_dict = get_thresholds(Y_val, val_probs, labels=target) 290 | else: 291 | custom_threshold_dict = None 292 | preds_df = predict_multilabel_bert( 293 | X_test, 294 | model=model_trained, 295 | labels=target, 296 | additional_features=additional_features, 297 | label_fix=True, 298 | custom_threshold_dict=custom_threshold_dict, 299 | ) 300 | model_metrics = get_multilabel_metrics( 301 | preds_df, 302 | Y_test, 303 | labels=target, 304 | random_state=random_state, 305 |
model=model_trained, 306 | training_time=training_time, 307 | ) 308 | write_multilabel_models_and_metrics([model_trained], [model_metrics], path=path) 309 | if include_analysis is True: 310 | write_model_preds( 311 | X_test, 312 | Y_test, 313 | preds_df, 314 | labels=target, 315 | path=f"{path}/labels.xlsx", 316 | ) 317 | write_model_analysis( 318 | model_name="model_0", 319 | labels=target, 320 | dataset=df, 321 | path=path, 322 | preds_df=preds_df, 323 | y_true=Y_test, 324 | custom_threshold_dict=custom_threshold_dict, 325 | ) 326 | print("Pipeline complete!") 327 | 328 | 329 | if __name__ == "__main__": 330 | # run_svc_pipeline( 331 | # additional_features=False, 332 | # target=minor_cats, 333 | # path="test_multilabel/v7_final/svc_noq", 334 | # include_analysis=True, 335 | # custom_threshold=False, 336 | # ) 337 | # run_svc_pipeline( 338 | # additional_features=True, 339 | # target=minor_cats, 340 | # path="test_multilabel/v7_final/svc", 341 | # include_analysis=True, 342 | # custom_threshold=False, 343 | # ) 344 | # run_sklearn_pipeline( 345 | # additional_features=True, 346 | # target=minor_cats, 347 | # models_to_try=["xgb"], 348 | # path="test_multilabel/v7_final/xgb", 349 | # include_analysis=True, 350 | # custom_threshold=False, 351 | # ) 352 | # run_sklearn_pipeline( 353 | # additional_features=False, 354 | # target=minor_cats, 355 | # models_to_try=["xgb"], 356 | # path="test_multilabel/v7_final/xgb_noq", 357 | # include_analysis=True, 358 | # custom_threshold=False, 359 | # ) 360 | # run_bert_pipeline( 361 | # additional_features=True, 362 | # path="test_multilabel/v7_final/bert", 363 | # target=minor_cats, 364 | # include_analysis=True, 365 | # custom_threshold=False, 366 | # ) 367 | # run_bert_pipeline( 368 | # additional_features=False, 369 | # path="test_multilabel/v7_final/bert_noq", 370 | # target=minor_cats, 371 | # include_analysis=True, 372 | # custom_threshold=False, 373 | # ) 374 | run_sklearn_pipeline( 375 | additional_features=False, 376 | target=minor_cats, 377 | models_to_try=["svm"], 378 | path="test_multilabel/v7_final/svc_gridsearch", 379 | include_analysis=True, 380 | custom_threshold=False, 381 | ) 382 | # run_two_layer_sklearn_pipeline() -------------------------------------------------------------------------------- /pxtextmining/pipelines/sentiment_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.utils.class_weight import compute_class_weight 4 | from tensorflow.keras.utils import to_categorical 5 | 6 | from pxtextmining.factories.factory_data_load_and_split import ( 7 | bert_data_to_dataset, 8 | load_multilabel_data, 9 | process_and_split_data, 10 | ) 11 | from pxtextmining.factories.factory_model_performance import get_multiclass_metrics 12 | from pxtextmining.factories.factory_pipeline import ( 13 | create_bert_model, 14 | create_bert_model_additional_features, 15 | search_sklearn_pipelines, 16 | train_bert_model, 17 | ) 18 | from pxtextmining.factories.factory_write_results import ( 19 | write_multilabel_models_and_metrics, 20 | ) 21 | from pxtextmining.params import dataset 22 | 23 | random_state = 75 24 | 25 | 26 | def run_sentiment_pipeline( 27 | additional_features=False, 28 | models_to_try=("svm", "xgb"), 29 | path="test_multilabel/sentiment", 30 | ): 31 | """Runs all the functions required to load multiclass data, preprocess it, and split it into training and test sets.
32 | Creates sklearn model and hyperparameter grid to search, and trains it on the train set. 33 | Evaluates the performance of the trained model with the best hyperparameters on the test set, and saves the model 34 | and the performance metrics to a specified folder. 35 | 36 | Args: 37 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to False. 38 | models_to_try (list, optional): Which model types to try. Defaults to ["svm", "xgb"]. 39 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel/sentiment'. 40 | """ 41 | target_names = ["very positive", "positive", "neutral", "negative", "very negative"] 42 | df = load_multilabel_data(filename=dataset, target="sentiment") 43 | X_train, X_test, Y_train, Y_test = process_and_split_data( 44 | df, 45 | target="sentiment", 46 | additional_features=additional_features, 47 | random_state=random_state, 48 | ) 49 | models, training_times = search_sklearn_pipelines( 50 | X_train, 51 | Y_train, 52 | target="sentiment", 53 | models_to_try=models_to_try, 54 | additional_features=additional_features, 55 | ) 56 | model_metrics = [] 57 | for i in range(len(models)): 58 | m = models[i] 59 | t = training_times[i] 60 | metrics = get_multiclass_metrics( 61 | X_test, 62 | Y_test, 63 | labels=target_names, 64 | random_state=random_state, 65 | model=m, 66 | training_time=t, 67 | additional_features=additional_features, 68 | ) 69 | model_metrics.append(metrics) 70 | write_multilabel_models_and_metrics(models, model_metrics, path) 71 | 72 | 73 | def run_sentiment_bert_pipeline( 74 | additional_features=True, path="test_multilabel/sentiment_bert" 75 | ): 76 | """Runs all the functions required to load multiclass data, preprocess it, and split it into training, test and validation sets. 77 | Creates tf.keras Transformer model with additional layers specific to the classification task, and trains it on the train set. 78 | Evaluates the performance of the trained model on the test set, and saves the model 79 | and the performance metrics to a specified folder. 80 | 81 | Args: 82 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to True. 83 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel/sentiment_bert'.
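Example (illustrative invocation; assumes the labelled dataset configured
in pxtextmining.params is available locally):

    run_sentiment_bert_pipeline(
        additional_features=True, path="test_multilabel/sentiment_bert"
    )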
84 | """ 85 | print(f"random_state is: {random_state}") 86 | target_names = ["very positive", "positive", "neutral", "negative", "very negative"] 87 | df = load_multilabel_data(filename=dataset, target="sentiment") 88 | X_train_val, X_test, Y_train_val, Y_test = process_and_split_data( 89 | df, 90 | target="sentiment", 91 | additional_features=additional_features, 92 | preprocess_text=True, 93 | random_state=random_state, 94 | ) 95 | Y_train_val_oh = to_categorical(Y_train_val) 96 | X_train, X_val, Y_train, Y_val = train_test_split( 97 | X_train_val, Y_train_val_oh, test_size=0.2, random_state=random_state 98 | ) 99 | train_dataset = bert_data_to_dataset( 100 | X_train, Y_train, additional_features=additional_features 101 | ) 102 | val_dataset = bert_data_to_dataset( 103 | X_val, Y_val, additional_features=additional_features 104 | ) 105 | cw = compute_class_weight("balanced", classes=np.unique(Y_train_val), y=Y_train_val) 106 | class_weights_dict = {} 107 | for k, v in enumerate(list(cw)): 108 | class_weights_dict[k] = v 109 | if additional_features is True: 110 | model = create_bert_model_additional_features(Y_train, multilabel=False) 111 | else: 112 | model = create_bert_model(Y_train, multilabel=False) 113 | model_trained, training_time = train_bert_model( 114 | train_dataset, 115 | val_dataset, 116 | model, 117 | class_weights_dict=class_weights_dict, 118 | epochs=25, 119 | ) 120 | model_metrics = get_multiclass_metrics( 121 | X_test, 122 | Y_test, 123 | random_state=random_state, 124 | labels=target_names, 125 | model=model_trained, 126 | training_time=training_time, 127 | additional_features=additional_features, 128 | ) 129 | write_multilabel_models_and_metrics([model_trained], [model_metrics], path=path) 130 | 131 | 132 | if __name__ == "__main__": 133 | # run_sentiment_pipeline(additional_features=False) 134 | run_sentiment_bert_pipeline( 135 | additional_features=True, path="test_multilabel/230908_sentiment_bert" 136 | ) 137 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pxtextmining" 3 | version = "1.0.1" 4 | description = "Text classification of patient experience feedback." 
5 | authors = ['CDU Data Science ', 6 | 'YiWen Hon '] 7 | readme = "README.md" 8 | license = "MIT" 9 | repository = "https://github.com/the-strategy-unit/pxtextmining" 10 | documentation = "https://the-strategy-unit.github.io/pxtextmining" 11 | 12 | [tool.poetry.dependencies] 13 | python = ">3.8, <3.11" 14 | joblib = "^1.2.0" 15 | matplotlib = "^3.3.2" 16 | numpy = ">=1.22" 17 | pandas = "^1.4.0" 18 | scikit-learn = "1.0.2" 19 | tensorflow = "2.12.0" 20 | transformers = "^4.26.1" 21 | scipy = "^1.10.1" 22 | xgboost = "^1.7.5" 23 | 24 | [tool.poetry.group.dev] 25 | optional = true 26 | 27 | [tool.poetry.group.dev.dependencies] 28 | uvicorn = "^0.20.0" 29 | pydantic = "^1.10.4" 30 | pytest = "^7.2.2" 31 | fastapi = "^0.101.0" 32 | httpx = "^0.23.3" 33 | pytest-cov = "^4.0.0" 34 | pytest-mock = "^3.10.0" 35 | requests = "^2.31.0" 36 | ruff = "^0.0.272" 37 | pre-commit = "^3.3.3" 38 | tornado = "^6.3.3" 39 | 40 | [tool.poetry.group.docs] 41 | optional = true 42 | 43 | [tool.poetry.group.docs.dependencies] 44 | mkdocs = "^1.4.2" 45 | mkdocstrings-python = "^0.8.2" 46 | mkdocstrings = "^0.19.1" 47 | 48 | [tool.pytest.ini_options] 49 | testpaths = ["tests"] 50 | pythonpath = ["api"] 51 | 52 | [tool.ruff] 53 | select = ["E", "F", "B"] 54 | ignore = ["E501"] 55 | 56 | [build-system] 57 | requires = ["poetry-core"] 58 | build-backend = "poetry.core.masonry.api" 59 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import setuptools 4 | 5 | if __name__ == "__main__": 6 | setuptools.setup() 7 | -------------------------------------------------------------------------------- /test_multilabel/dummy_metrics.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | DummyClassifier(strategy='uniform') 4 | 5 | 6 | Training time: None 7 | 8 | exact_accuracy: 0.0 9 | hamming_loss: 0.5030487804878049 10 | macro_jaccard_score: 0.0787901813020388 11 | 12 | Classification report: 13 | precision recall f1-score support 14 | 15 | Access to medical care & support 0.21 0.47 0.29 143 16 | Activities 0.01 0.36 0.02 11 17 | Additional 0.01 0.43 0.02 7 18 | Category TBC 0.00 0.00 0.00 1 19 | Communication & involvement 0.21 0.53 0.31 137 20 | Environment & equipment 0.03 0.52 0.06 21 21 | Food & diet 0.03 0.57 0.05 14 22 | General 0.36 0.46 0.40 248 23 | Medication 0.03 0.77 0.06 13 24 | Mental Health specifics 0.01 0.43 0.02 7 25 | Patient journey & service coordination 0.08 0.48 0.14 58 26 | Service location, travel & transport 0.01 0.33 0.02 12 27 | Staff 0.28 0.48 0.35 193 28 | 29 | micro avg 0.10 0.48 0.16 865 30 | macro avg 0.10 0.45 0.13 865 31 | weighted avg 0.24 0.48 0.30 865 32 | samples avg 0.10 0.48 0.15 865 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | from unittest.mock import Mock 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | 9 | from pxtextmining.params import minor_cats, q_map 10 | 11 | 
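# Shared fixtures for the test suite. These provide small synthetic
# stand-ins for the real survey data: a five-row feature frame, a mocked
# pandas.read_csv, randomly generated raw data in the expected column
# layout, and a dummy predictions DataFrame using the
# "Probability of <label>" column convention.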
12 | @pytest.fixture 13 | def grab_test_X_additional_feats(): 14 | data_dict = { 15 | "FFT answer": { 16 | "Q1": "Nurses were great", 17 | "Q2": "Communication was fantastic", 18 | "Q3": "Impossible to find parking, but pleased to get an appointment close to home", 19 | "Q4": "Food and drink selection very limited", 20 | "Q5": "The ward was boiling hot, although staff were great at explaining details", 21 | }, 22 | "FFT_q_standardised": { 23 | "Q1": "what_good", 24 | "Q2": "what_good", 25 | "Q3": "could_improve", 26 | "Q4": "could_improve", 27 | "Q5": "could_improve", 28 | }, 29 | } 30 | text_X_additional_feats = pd.DataFrame(data_dict) 31 | text_X_additional_feats.index.name = "Comment ID" 32 | return text_X_additional_feats 33 | 34 | 35 | @pytest.fixture 36 | def mock_read_csv(mocker, test_raw_data): 37 | mock = Mock() 38 | mocker.patch("pandas.read_csv", return_value=test_raw_data) 39 | return mock 40 | 41 | 42 | @pytest.fixture 43 | def test_raw_data(): 44 | cols = [ 45 | "Comment ID", 46 | "Trust", 47 | "Respondent ID", 48 | "Date", 49 | "Service Type 1", 50 | "Service type 2", 51 | "FFT categorical answer", 52 | "FFT question", 53 | "FFT answer", 54 | "Comment sentiment", 55 | ] 56 | cols.extend(minor_cats) 57 | data_dict = {} 58 | for col in cols: 59 | row = [] 60 | if col not in minor_cats: 61 | if col in ["FFT categorical answer", "Comment sentiment"]: 62 | for _i in range(5): 63 | row.append(random.randint(1, 5)) 64 | elif col == "FFT question": 65 | for _i in range(5): 66 | row.append(random.choice(list(q_map.keys()))) 67 | else: 68 | for _i in range(5): 69 | row.append( 70 | "".join( 71 | random.choices(string.ascii_uppercase + string.digits, k=5) 72 | ) 73 | ) 74 | else: 75 | for _i in range(5): 76 | row.append(random.choice([np.NaN, 1])) 77 | data_dict[col] = row 78 | data = pd.DataFrame(data_dict) 79 | return data 80 | 81 | 82 | @pytest.fixture 83 | def grab_preds_df(): 84 | labels = ["one", "two", "three", "four", "five"] 85 | probs_labels = ["Probability of " + x for x in labels] 86 | preds_df = pd.DataFrame( 87 | np.array( 88 | [ 89 | [0.0, 1.0, 0.0, 1.0, 0.0, 0.1, 0.6, 0.2, 0.7, 0.05], 90 | [1.0, 0.0, 0.0, 1.0, 0.0, 0.55, 0.2, 0.3, 0.8, 0.4], 91 | [1.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.3, 0.2, 0.3, 0.1], 92 | [1.0, 0.0, 1.0, 1.0, 0.0, 0.7, 0.2, 0.8, 0.9, 0.0], 93 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.2, 0.4, 0.2, 0.1, 0.6], 94 | ] 95 | ), 96 | columns=labels + probs_labels, 97 | ) 98 | preds_df["labels"] = [ 99 | ["two", "four"], 100 | ["one", "four"], 101 | ["one"], 102 | ["one", "three", "four"], 103 | ["five"], 104 | ] 105 | return preds_df 106 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fastapi.testclient import TestClient 3 | 4 | from api.api import app 5 | 6 | client = TestClient(app) 7 | 8 | 9 | def test_main(): 10 | response = client.get("/") 11 | assert response.status_code == 200 12 | assert response.json() == {"test": "Hello"} 13 | 14 | 15 | def test_multilabel_predictions(): 16 | test_json = [ 17 | { 18 | "comment_id": "99999", 19 | "comment_text": "I liked all of it", 20 | }, 21 | {"comment_id": "A55", "comment_text": "", "question_type": "nonspecific"}, 22 | { 23 | "comment_id": "A56", 24 | "comment_text": "Truly awful time finding parking", 25 | }, 26 | { 27 | "comment_id": "4", 28 | "comment_text": "I really enjoyed the session", 29 | }, 30 | {"comment_id": "5", "comment_text": "7482367"}, 31 | ] 32 | 
response = client.post("/predict_multilabel", json=test_json).json() 33 | assert len(test_json) == len(response) 34 | assert isinstance(response[0]["labels"], list) 35 | 36 | 37 | def test_comment_id_error(): 38 | with pytest.raises(ValueError): 39 | test_json = [ 40 | {"comment_id": "1", "comment_text": "I liked all of it"}, 41 | {"comment_id": "1", "comment_text": "I liked all of it"}, 42 | ] 43 | client.post("/predict_multilabel", json=test_json).json() 44 | -------------------------------------------------------------------------------- /tests/test_data_load_and_split.py: -------------------------------------------------------------------------------- 1 | import random 2 | from unittest.mock import patch 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from pxtextmining.factories import factory_data_load_and_split 9 | from pxtextmining.params import minor_cats 10 | 11 | 12 | @pytest.fixture 13 | def grab_test_df(grab_test_X_additional_feats): 14 | df = grab_test_X_additional_feats 15 | df["Comment sentiment"] = 0 16 | df[minor_cats] = 0 17 | for i in range(df.shape[0]): 18 | df.loc[i, "Comment sentiment"] = random.randint(1, 5) 19 | for cat in minor_cats: 20 | df.loc[i, cat] = random.randint(0, 1) 21 | return df 22 | 23 | 24 | @pytest.fixture 25 | def grab_test_Y(): 26 | Y_feats = np.array( 27 | [ 28 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 29 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 30 | [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], 31 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 32 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 33 | ] 34 | ) 35 | return Y_feats 36 | 37 | 38 | @pytest.mark.parametrize( 39 | "target", ["major_categories", "minor_categories", "sentiment"] 40 | ) 41 | def test_load_multilabel_data(mock_read_csv, target): 42 | filename = "None" 43 | df = factory_data_load_and_split.load_multilabel_data(filename, target) 44 | assert type(df) == pd.DataFrame 45 | 46 | 47 | @patch("pandas.read_csv") 48 | def test_load_multilabel_data_error(mock_bad_csv, test_raw_data): 49 | test_raw_data["FFT question"] = "Nonsense question" 50 | mock_bad_csv.return_value = test_raw_data 51 | filename = "None" 52 | with pytest.raises(ValueError): 53 | factory_data_load_and_split.load_multilabel_data( 54 | filename, target="minor_categories" 55 | ) 56 | 57 | 58 | def test_merge_categories(): 59 | test_df = pd.DataFrame( 60 | {"col_1": [0, 0, 0, 0, 1], "col_2": [0, 1, 0, 0, 1], "col_3": [1, 0, 0, 0, 0]} 61 | ) 62 | new_cat = "new_cat" 63 | cats_to_merge = ["col_1", "col_2"] 64 | merged_df = factory_data_load_and_split.merge_categories( 65 | test_df, new_cat, cats_to_merge 66 | ) 67 | assert list(merged_df.columns) == ["col_3", "new_cat"] 68 | assert merged_df["new_cat"].sum() == 2 69 | 70 | 71 | def test_remove_punc_and_nums(): 72 | text = "Here is.some TEXT?!?!?! 
12345 :)" 73 | cleaned_text = factory_data_load_and_split.remove_punc_and_nums(text) 74 | assert cleaned_text == "here is some text" 75 | 76 | 77 | def test_clean_empty_features(): 78 | df_with_empty_lines = pd.DataFrame({"text": ["Some text", "", " ", "More text"]}) 79 | clean_df = factory_data_load_and_split.clean_empty_features(df_with_empty_lines) 80 | assert clean_df.shape == (2, 1) 81 | 82 | 83 | def test_onehot(): 84 | df_to_onehot = pd.DataFrame({"Categories": ["A", "B", "C", "A", "A", "B"]}) 85 | df_onehotted = factory_data_load_and_split.onehot(df_to_onehot, "Categories") 86 | assert df_onehotted.shape == (6, 3) 87 | 88 | 89 | def test_bert_data_to_dataset_with_Y(grab_test_X_additional_feats, grab_test_Y): 90 | train_dataset = factory_data_load_and_split.bert_data_to_dataset( 91 | grab_test_X_additional_feats, grab_test_Y, additional_features=True 92 | ) 93 | assert isinstance(train_dataset._structure, tuple) 94 | 95 | 96 | def test_bert_data_to_dataset_without_Y(grab_test_X_additional_feats): 97 | test_dataset = factory_data_load_and_split.bert_data_to_dataset( 98 | grab_test_X_additional_feats, Y=None, additional_features=True 99 | ) 100 | assert isinstance(test_dataset, dict) 101 | 102 | 103 | @pytest.mark.parametrize("target", [minor_cats, "sentiment"]) 104 | @pytest.mark.parametrize("additional_features", [True, False]) 105 | @pytest.mark.parametrize("preprocess_text", [True, False]) 106 | def test_process_data(grab_test_df, target, preprocess_text, additional_features): 107 | X, Y = factory_data_load_and_split.process_data( 108 | grab_test_df, target, preprocess_text, additional_features 109 | ) 110 | assert X.shape[0] == Y.shape[0] 111 | 112 | 113 | def test_process_and_split_data(grab_test_df): 114 | ( 115 | X_train, 116 | X_test, 117 | Y_train, 118 | Y_test, 119 | ) = factory_data_load_and_split.process_and_split_data( 120 | grab_test_df, 121 | target=minor_cats, 122 | preprocess_text=False, 123 | additional_features=True, 124 | ) 125 | assert len(X_train) == len(Y_train) 126 | assert len(X_test) == len(Y_test) 127 | -------------------------------------------------------------------------------- /tests/test_docker_run.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from unittest.mock import mock_open, patch 4 | 5 | import pandas as pd 6 | import pytest 7 | 8 | import docker_run 9 | from pxtextmining.params import minor_cats 10 | 11 | 12 | @pytest.fixture 13 | def sentiment_output(): 14 | s_preds = pd.DataFrame( 15 | [ 16 | {"comment_id": "1", "sentiment": 1}, 17 | {"comment_id": "2", "sentiment": "Labelling not possible"}, 18 | ] 19 | ) 20 | return s_preds 21 | 22 | 23 | @pytest.fixture 24 | def multilabel_output(): 25 | m_preds = pd.DataFrame( 26 | [ 27 | {"comment_id": "1", "labels": ["Positive experience & gratitude"]}, 28 | {"comment_id": "2", "labels": ["Labelling not possible"]}, 29 | ] 30 | ) 31 | return m_preds 32 | 33 | 34 | @pytest.fixture 35 | def output_df(): 36 | indices = ["1"] 37 | df_list = [] 38 | for _ in range(len(indices)): 39 | data_dict = {} 40 | for cat in minor_cats: 41 | data_dict[cat] = random.randint(0, 1) 42 | key = f"Probability of '{cat}'" 43 | data_dict[key] = random.uniform(0.0, 0.99) 44 | df_list.append(data_dict) 45 | df = pd.DataFrame(df_list) 46 | df.index = indices 47 | assert len(df.columns) == 64 48 | return df 49 | 50 | 51 | @pytest.fixture 52 | def input_data(): 53 | input_text = [ 54 | { 55 | "comment_id": "1", 56 | "comment_text": "Nurse was great.", 57 
| "question_type": "what_good", 58 | }, 59 | {"comment_id": "2", "comment_text": "", "question_type": "could_improve"}, 60 | ] 61 | return input_text 62 | 63 | 64 | @patch("docker_run.load_model") 65 | def test_load_bert_model(mock_load): 66 | docker_run.load_bert_model("bert_sentiment") 67 | mock_load.assert_called_once() 68 | 69 | 70 | def test_process_text(input_data): 71 | df, text_to_predict = docker_run.process_text(input_data) 72 | assert len(df.columns) == 3 73 | assert len(text_to_predict.columns) == 2 74 | assert text_to_predict.index.name == "Comment ID" 75 | assert df.shape[0] == text_to_predict.shape[0] 76 | 77 | 78 | @patch("docker_run.pickle.load") 79 | def test_load_sklearn_model(mock_pickle_load): 80 | docker_run.load_sklearn_model("final_svc") 81 | mock_pickle_load.assert_called_once() 82 | 83 | 84 | @patch("docker_run.predict_sentiment_bert") 85 | @patch("docker_run.load_model") 86 | def test_predict_sentiment(mock_load_model, mock_get_predictions, input_data): 87 | input_text = input_data 88 | output = pd.DataFrame( 89 | [ 90 | { 91 | "Comment ID": "1", 92 | "FFT answer": "Nurse was great.", 93 | "FFT_q_standardised": "what_good", 94 | "sentiment": 1, 95 | } 96 | ] 97 | ).set_index("Comment ID") 98 | mock_get_predictions.return_value = output 99 | return_df = docker_run.predict_sentiment(input_text) 100 | mock_load_model.assert_called_once() 101 | mock_get_predictions.assert_called() 102 | assert len(return_df) == len(input_text) 103 | 104 | 105 | @patch("docker_run.predict_multilabel_sklearn") 106 | @patch("docker_run.predict_multilabel_bert") 107 | @patch("docker_run.pickle.load") 108 | @patch("docker_run.load_model") 109 | def test_predict_multilabel_ensemble( 110 | mock_load_model, 111 | mock_pickle_load, 112 | mock_predict_bert, 113 | mock_predict_sklearn, 114 | output_df, 115 | input_data, 116 | ): 117 | mock_predict_bert.return_value = output_df 118 | mock_predict_sklearn.return_value = output_df 119 | input_text = input_data 120 | return_df = docker_run.predict_multilabel_ensemble(input_text) 121 | mock_load_model.assert_called_once() 122 | mock_pickle_load.assert_called() 123 | mock_predict_bert.assert_called_once() 124 | mock_predict_sklearn.assert_called() 125 | assert len(return_df) == len(input_text) 126 | 127 | 128 | @pytest.mark.parametrize( 129 | "args", 130 | [ 131 | ["file_01.json"], 132 | ["file_01.json", "-l", "--target", "m"], 133 | ["file_01.json", "-t", "s"], 134 | ], 135 | ) 136 | def test_parse_args(mocker, args): 137 | mocker.patch("sys.argv", ["docker_run.py"] + args) 138 | args = docker_run.parse_args() 139 | assert args.json_file[0] == "file_01.json" 140 | if args.local_storage: 141 | assert args.local_storage is True 142 | assert args.target in "ms" 143 | 144 | 145 | def test_comment_id_error(): 146 | with pytest.raises(ValueError): 147 | test_json = [ 148 | { 149 | "comment_id": "1", 150 | "comment_text": "I liked all of it", 151 | "question_type": "nonspecific", 152 | }, 153 | { 154 | "comment_id": "1", 155 | "comment_text": "I liked all of it", 156 | "question_type": "nonspecific", 157 | }, 158 | ] 159 | docker_run.process_text(test_json) 160 | 161 | 162 | @patch("docker_run.predict_multilabel_ensemble") 163 | @patch("docker_run.predict_sentiment") 164 | @patch("docker_run.os.remove") 165 | @patch( 166 | "builtins.open", new_callable=mock_open, read_data=json.dumps([{"data": "Here"}]) 167 | ) 168 | @patch("sys.argv", ["docker_run.py"] + ["file_01.json"]) 169 | def test_main_not_local( 170 | mock_open, 171 | mock_remove, 172 | 
mock_predict_sentiment, 173 | mock_predict_ensemble, 174 | sentiment_output, 175 | multilabel_output, 176 | ): 177 | mock_predict_sentiment.return_value = sentiment_output 178 | mock_predict_ensemble.return_value = multilabel_output 179 | docker_run.main() 180 | mock_open.assert_called() 181 | mock_predict_sentiment.assert_called() 182 | mock_predict_ensemble.assert_called() 183 | mock_remove.assert_called_once() 184 | 185 | 186 | @patch("docker_run.predict_sentiment") 187 | @patch( 188 | "builtins.open", new_callable=mock_open, read_data=json.dumps([{"data": "Here"}]) 189 | ) 190 | @patch("sys.argv", ["docker_run.py"] + ["file_01.json", "-l", "-t", "s"]) 191 | def test_main_local(mock_open, mock_predict_sentiment, sentiment_output): 192 | mock_predict_sentiment.return_value = sentiment_output 193 | docker_run.main() 194 | mock_open.assert_called() 195 | mock_predict_sentiment.assert_called() 196 | -------------------------------------------------------------------------------- /tests/test_factory_pipeline.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, Mock, patch 2 | 3 | import numpy as np 4 | import pytest 5 | from keras.engine.functional import Functional 6 | from sklearn.base import is_classifier 7 | from sklearn.pipeline import Pipeline 8 | 9 | from pxtextmining.factories import factory_pipeline 10 | 11 | 12 | @pytest.mark.parametrize("model_type", ["svm", "xgb"]) 13 | @pytest.mark.parametrize("additional_features", [True, False]) 14 | def test_create_sklearn_pipeline_sentiment(model_type, additional_features): 15 | pipe, params = factory_pipeline.create_sklearn_pipeline_sentiment( 16 | model_type, 3, additional_features=additional_features 17 | ) 18 | assert isinstance(params, dict) is True 19 | assert is_classifier(pipe) is True 20 | 21 | 22 | @pytest.mark.parametrize("multilabel", [True, False]) 23 | def test_create_bert_model(multilabel): 24 | Y_train = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) 25 | model = factory_pipeline.create_bert_model(Y_train, multilabel=multilabel) 26 | assert isinstance(model, Functional) is True 27 | 28 | 29 | @pytest.mark.parametrize("multilabel", [True, False]) 30 | def test_create_bert_model_additional_features(multilabel): 31 | Y_train = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) 32 | model = factory_pipeline.create_bert_model_additional_features( 33 | Y_train, multilabel=multilabel 34 | ) 35 | assert isinstance(model, Functional) is True 36 | 37 | 38 | def test_train_bert_model(): 39 | train_dataset = Mock() 40 | test_dataset = Mock() 41 | model = Mock() 42 | model, training_time = factory_pipeline.train_bert_model( 43 | train_dataset, test_dataset, model 44 | ) 45 | model.fit.assert_called_once() 46 | assert isinstance(training_time, str) is True 47 | 48 | 49 | def test_calculating_class_weights(): 50 | Y_train = np.array( 51 | [[0, 1, 0], [1, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 0, 0]] 52 | ) 53 | class_weights_dict = factory_pipeline.calculating_class_weights(Y_train) 54 | assert isinstance(class_weights_dict, dict) is True 55 | 56 | 57 | @pytest.mark.parametrize("model_type", ["svm", "xgb", "rfc", "mnb", "knn"]) 58 | @pytest.mark.parametrize("additional_features", [True, False]) 59 | def test_create_sklearn_pipeline(model_type, additional_features): 60 | pipe, params = factory_pipeline.create_sklearn_pipeline( 61 | model_type, additional_features 62 | ) 63 | assert is_classifier(pipe) is True 64 | assert isinstance(params, dict) is True 65 | 66 | 67 | 
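# The two search tests below patch RandomizedSearchCV so that no real
# hyperparameter search runs: the mocked search returns a dummy Pipeline,
# keeping the tests fast while still exercising the wiring of
# search_sklearn_pipelines.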
@pytest.mark.parametrize("target", ["sentiment", None]) 68 | @pytest.mark.parametrize("model_type", [["svm"], ["xgb"]]) 69 | @patch("pxtextmining.factories.factory_pipeline.RandomizedSearchCV") 70 | def test_search_sklearn_pipelines( 71 | mock_randomsearch, target, model_type, grab_test_X_additional_feats 72 | ): 73 | mock_instance = MagicMock() 74 | mock_randomsearch.return_value = mock_instance 75 | X_train = grab_test_X_additional_feats 76 | Y_train = np.array( 77 | [ 78 | [0, 1, 0, 1, 0], 79 | [1, 0, 0, 1, 0], 80 | [1, 0, 0, 0, 0], 81 | [1, 0, 1, 1, 0], 82 | [0, 0, 0, 0, 1], 83 | ] 84 | ) 85 | mock_instance.best_estimator_ = Pipeline([("dummy", None)]) 86 | mock_instance.best_params_ = {"param1": 10, "param2": 20} 87 | 88 | models, training_times = factory_pipeline.search_sklearn_pipelines( 89 | X_train, 90 | Y_train, 91 | models_to_try=model_type, 92 | target=target, 93 | additional_features=True, 94 | ) 95 | 96 | mock_instance.fit.assert_called() 97 | assert len(models) == 1 98 | assert isinstance(models[0], Pipeline) is True 99 | assert models[0].steps[0][0] == "dummy" 100 | assert len(training_times) == 1 101 | 102 | with pytest.raises(ValueError): 103 | factory_pipeline.search_sklearn_pipelines( 104 | X_train, 105 | Y_train, 106 | models_to_try=["nonsense"], 107 | target=target, 108 | additional_features=True, 109 | ) 110 | 111 | 112 | @pytest.mark.parametrize("target", ["sentiment", None]) 113 | @patch("pxtextmining.factories.factory_pipeline.RandomizedSearchCV") 114 | def test_search_sklearn_pipelines_no_feats( 115 | mock_randomsearch, target, grab_test_X_additional_feats 116 | ): 117 | mock_instance = MagicMock() 118 | mock_randomsearch.return_value = mock_instance 119 | models_to_try = ["svm"] 120 | X_train = grab_test_X_additional_feats["FFT answer"] 121 | Y_train = np.array( 122 | [ 123 | [0, 1, 0, 1, 0], 124 | [1, 0, 0, 1, 0], 125 | [1, 0, 0, 0, 0], 126 | [1, 0, 1, 1, 0], 127 | [0, 0, 0, 0, 1], 128 | ] 129 | ) 130 | mock_instance.best_estimator_ = Pipeline([("dummy", None)]) 131 | mock_instance.best_params_ = {"param1": 10, "param2": 20} 132 | 133 | models, training_times = factory_pipeline.search_sklearn_pipelines( 134 | X_train, Y_train, models_to_try, target=target, additional_features=False 135 | ) 136 | 137 | mock_instance.fit.assert_called() 138 | assert len(models) == 1 139 | assert isinstance(models[0], Pipeline) is True 140 | assert models[0].steps[0][0] == "dummy" 141 | assert len(training_times) == 1 142 | 143 | 144 | @patch("pxtextmining.factories.factory_pipeline.make_pipeline") 145 | def test_create_and_train_svc_model(mock_pipeline, grab_test_X_additional_feats): 146 | mock_pipe = Mock() 147 | mock_pipeline.return_value = mock_pipe 148 | X_train = grab_test_X_additional_feats 149 | Y_train = np.array( 150 | [ 151 | [0, 1, 0, 1, 0], 152 | [1, 0, 0, 1, 0], 153 | [1, 0, 0, 0, 0], 154 | [1, 0, 1, 1, 0], 155 | [0, 0, 0, 0, 1], 156 | ] 157 | ) 158 | factory_pipeline.create_and_train_svc_model( 159 | X_train, Y_train, additional_features=True 160 | ) 161 | mock_pipe.fit.assert_called_with(X_train, Y_train) 162 | 163 | 164 | @patch("pxtextmining.factories.factory_pipeline.make_pipeline") 165 | def test_create_and_train_svc_model_no_feats( 166 | mock_pipeline, grab_test_X_additional_feats 167 | ): 168 | mock_pipe = Mock() 169 | mock_pipeline.return_value = mock_pipe 170 | X_train = grab_test_X_additional_feats["FFT answer"] 171 | Y_train = np.array( 172 | [ 173 | [0, 1, 0, 1, 0], 174 | [1, 0, 0, 1, 0], 175 | [1, 0, 0, 0, 0], 176 | [1, 0, 1, 1, 0], 177 | [0, 0, 0, 0, 


@patch("pxtextmining.factories.factory_pipeline.make_pipeline")
def test_create_and_train_svc_model_no_feats(
    mock_pipeline, grab_test_X_additional_feats
):
    mock_pipe = Mock()
    mock_pipeline.return_value = mock_pipe
    X_train = grab_test_X_additional_feats["FFT answer"]
    Y_train = np.array(
        [
            [0, 1, 0, 1, 0],
            [1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0],
            [1, 0, 1, 1, 0],
            [0, 0, 0, 0, 1],
        ]
    )
    factory_pipeline.create_and_train_svc_model(
        X_train, Y_train, additional_features=False
    )
    mock_pipe.fit.assert_called_with(X_train, Y_train)
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
import numpy as np

from pxtextmining.helpers.text_preprocessor import tf_preprocessing


def test_text_preprocessor(grab_test_X_additional_feats):
    data = grab_test_X_additional_feats["FFT answer"]
    X_pad, vocab_size = tf_preprocessing(data)
    assert isinstance(X_pad, np.ndarray) is True
    assert len(X_pad) == data.shape[0]
    assert isinstance(vocab_size, int) is True
--------------------------------------------------------------------------------
/tests/test_model_performance.py:
--------------------------------------------------------------------------------
from unittest.mock import Mock

import numpy as np
import pandas as pd
import pytest
from tensorflow.keras import Model

from pxtextmining.factories import factory_model_performance


@pytest.fixture
def grab_test_bert_multiclass():
    predicted_probs = np.array(
        [
            [0.9, 0.01, 0.07, 0.01, 0.01],
            [0.01, 0.07, 0.01, 0.01, 0.9],
            [0.07, 0.9, 0.01, 0.01, 0.01],
            [0.9, 0.01, 0.07, 0.01, 0.01],
            [0.9, 0.01, 0.01, 0.01, 0.07],
        ]
    )
    model = Mock(spec=Model, predict=Mock(return_value=predicted_probs))
    return model


@pytest.fixture
def grab_test_bert_multilabel():
    predicted_probs = np.array(
        [
            [6.2770307e-01, 2.3520987e-02, 1.3149388e-01, 2.7835215e-02, 1.8944685e-01],
            [9.8868138e-01, 1.9990385e-03, 5.4453085e-03, 9.0726715e-04, 2.9669846e-03],
            [4.2310607e-01, 5.6546849e-01, 9.3136989e-03, 1.3205722e-03, 7.9117226e-04],
            [2.0081511e-01, 7.0609129e-04, 1.1107661e-03, 7.9677838e-01, 5.8961433e-04],
            [1.4777037e-03, 5.1493715e-03, 2.8268427e-03, 7.4673461e-04, 9.8979920e-01],
        ]
    )
    model = Mock(spec=Model, predict=Mock(return_value=predicted_probs))
    return model


def test_multiclass_metrics_sklearn(grab_test_X_additional_feats):
    x = grab_test_X_additional_feats
    y = np.array([[0, 1, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [0, 0, 1]])
    labels = ["A", "B", "C"]
    model = factory_model_performance.get_dummy_model(x, y)
    random_state = 42
    additional_features = True
    metrics_string = factory_model_performance.get_multiclass_metrics(
        x, y, labels, random_state, model, additional_features
    )
    assert isinstance(metrics_string, str) is True


def test_multiclass_metrics_bert(
    grab_test_X_additional_feats, grab_test_bert_multiclass
):
    x = grab_test_X_additional_feats
    y = np.array([[0], [4], [1], [3], [3]])
    labels = ["A", "B", "C", "D"]
    model = grab_test_bert_multiclass
    random_state = 42
    additional_features = True
    metrics_string = factory_model_performance.get_multiclass_metrics(
        x, y, labels, random_state, model, additional_features
    )
    assert isinstance(metrics_string, str) is True
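

# Note on the test above: the mocked BERT model predicts five columns of
# probabilities, yet only four labels are passed. That is consistent because
# the classes actually observed (in y plus the argmax of the predictions)
# number four. A self-contained check of that arithmetic:
def test_multiclass_observed_classes_sketch(grab_test_bert_multiclass):
    probs = grab_test_bert_multiclass.predict(None)
    y_true = np.array([0, 4, 1, 3, 3])
    y_pred = probs.argmax(axis=1)  # -> [0, 4, 1, 0, 0]
    observed = set(y_true) | set(y_pred)
    assert len(observed) == 4  # matches the four labels used above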


def test_multilabel_metrics_sklearn(grab_preds_df, grab_test_X_additional_feats):
    preds_df = grab_preds_df
    x = grab_test_X_additional_feats
    y = np.array(
        [
            [0, 1, 0, 1, 0],
            [1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0],
            [1, 0, 1, 1, 0],
            [0, 0, 0, 0, 1],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    random_state = 42
    model = factory_model_performance.get_dummy_model(x, y)
    metrics_string = factory_model_performance.get_multilabel_metrics(
        preds_df,
        y,
        labels,
        random_state,
        model,
    )
    assert isinstance(metrics_string, str) is True


def test_multilabel_metrics_bert(grab_test_bert_multilabel, grab_preds_df):
    preds_df = grab_preds_df
    y = np.array(
        [
            [0, 1, 0, 1, 0],
            [1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0],
            [1, 0, 1, 1, 0],
            [0, 0, 0, 0, 1],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    random_state = 42
    model = grab_test_bert_multilabel
    metrics_string = factory_model_performance.get_multilabel_metrics(
        preds_df,
        y,
        labels,
        random_state,
        model,
    )
    assert isinstance(metrics_string, str) is True


def test_accuracy_per_class():
    y_test = pd.Series([0, 1, 0, 2, 1, 0])
    y_pred = pd.Series([0, 1, 0, 1, 1, 2])
    df = factory_model_performance.get_accuracy_per_class(y_test, y_pred)
    assert df.shape == (3, 3)


def test_parse_metrics_file():
    metrics_file = "current_best_model/sentiment/bert_sentiment.txt"
    labels = ["very positive", "positive", "neutral", "negative", "very negative"]
    metrics_df = factory_model_performance.parse_metrics_file(metrics_file, labels)
    assert metrics_df.shape == (5, 5)


@pytest.mark.parametrize(
    "custom_threshold_dict",
    [None, {"one": 0.6, "two": 0.5, "three": 0.75, "four": 0.6, "five": 0.5}],
)
def test_additional_analysis(custom_threshold_dict, grab_preds_df):
    y_true = np.array(
        [
            [0.0, 1.0, 0.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 1.0, 1.0],
            [1.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    preds_df = grab_preds_df
    analysis_df = factory_model_performance.additional_analysis(
        preds_df, y_true, labels, custom_threshold_dict
    )
    assert list(analysis_df.index) == labels
    if custom_threshold_dict is None:
        assert len(analysis_df.columns) == 5
    else:
        assert len(analysis_df.columns) == 6


def test_multiclass_metrics_valueerror(
    grab_test_X_additional_feats,
):
    x = grab_test_X_additional_feats
    y = np.array([[0], [4], [1], [3], [3]])
    labels = ["A", "B", "C", "D"]
    model = Mock(spec=None)
    random_state = 42
    additional_features = True
    with pytest.raises(ValueError):
        factory_model_performance.get_multiclass_metrics(
            x, y, labels, random_state, model, additional_features
        )
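

# The two ValueError tests in this module hinge on Mock specs: Mock(spec=Model)
# passes isinstance checks against keras' Model, while Mock(spec=None) is a
# bare Mock that matches nothing, which is presumably how the metrics
# functions reject unsupported model objects. A self-contained demonstration:
def test_mock_spec_isinstance_sketch():
    specced = Mock(spec=Model)
    bare = Mock(spec=None)
    assert isinstance(specced, Model)
    assert not isinstance(bare, Model)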


def test_multilabel_metrics_valueerror(
    grab_preds_df,
):
    preds_df = grab_preds_df
    y = np.array(
        [
            [0, 1, 0, 1, 0],
            [1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0],
            [1, 0, 1, 1, 0],
            [0, 0, 0, 0, 1],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    random_state = 42
    model = Mock(spec=None)
    with pytest.raises(ValueError):
        factory_model_performance.get_multilabel_metrics(
            preds_df,
            y,
            labels,
            random_state,
            model,
        )


def test_get_y_score_2d():
    test_probs = np.array(
        [
            [6.2770307e-01, 2.3520987e-02, 1.3149388e-01, 2.7835215e-02, 1.8944685e-01],
            [9.8868138e-01, 1.9990385e-03, 5.4453085e-03, 9.0726715e-04, 2.9669846e-03],
            [4.2310607e-01, 5.6546849e-01, 9.3136989e-03, 1.3205722e-03, 7.9117226e-04],
            [2.0081511e-01, 7.0609129e-04, 1.1107661e-03, 7.9677838e-01, 5.8961433e-04],
            [1.4777037e-03, 5.1493715e-03, 2.8268427e-03, 7.4673461e-04, 9.8979920e-01],
        ]
    )
    probs = factory_model_performance.get_y_score(test_probs)
    assert probs.ndim == 2


def test_get_y_score_3d():
    test_probs = np.array(
        [
            [
                [0.80465788, 0.19534212],
                [0.94292979, 0.05707021],
                [0.33439024, 0.66560976],
            ],
            [
                [0.33439024, 0.66560976],
                [0.9949298, 0.0050702],
                [0.99459238, 0.00540762],
            ],
            [
                [0.97472981, 0.02527019],
                [0.25069129, 0.74930871],
                [0.33439024, 0.66560976],
            ],
        ]
    )
    probs = factory_model_performance.get_y_score(test_probs)
    assert probs.ndim == 2
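

# Sketch of the contract the two tests above pin down: get_y_score reduces the
# (n_labels, n_samples, 2) stacks produced by sklearn multilabel predict_proba
# to a 2d (n_samples, n_labels) score matrix, and passes 2d input through.
# One plausible implementation -- an assumption for illustration, not
# necessarily the library's actual code -- keeps the positive-class column:
def _get_y_score_sketch(probs):
    if probs.ndim == 3:
        # P(label == 1) for each label, with samples moved onto axis 0
        return np.transpose(probs[:, :, 1])
    return probs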
--------------------------------------------------------------------------------
/tests/test_multilabel_pipeline.py:
--------------------------------------------------------------------------------
from unittest.mock import Mock, patch

import pytest

from pxtextmining.params import major_cats, minor_cats
from pxtextmining.pipelines import multilabel_pipeline


@pytest.mark.parametrize("target", [major_cats, minor_cats])
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_analysis")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_preds")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_multilabel_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.predict_multilabel_sklearn")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_thresholds")
@patch("pxtextmining.pipelines.multilabel_pipeline.search_sklearn_pipelines")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_test_split")
@patch("pxtextmining.pipelines.multilabel_pipeline.process_and_split_data", create=True)
@patch("pxtextmining.pipelines.multilabel_pipeline.load_multilabel_data")
@pytest.mark.parametrize("custom_threshold", [True, False])
def test_sklearn_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_traintestsplit,
    mock_skpipeline,
    mock_threshold,
    mock_predict,
    mock_metrics,
    mock_write,
    mock_writepreds,
    mock_writeanalysis,
    custom_threshold,
    target,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_traintestsplit.return_value = (1, 2, 3, 4)
    mock_skpipeline.return_value = ([Mock()], ["training_time"])

    # act
    multilabel_pipeline.run_sklearn_pipeline(
        target=target, include_analysis=True, custom_threshold=custom_threshold
    )

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_skpipeline.assert_called_once()
    mock_metrics.assert_called_once()
    mock_predict.assert_called_once()
    mock_write.assert_called_once()
    mock_writepreds.assert_called_once()
    mock_writeanalysis.assert_called_once()
    if custom_threshold is True:
        mock_traintestsplit.assert_called_once()
        mock_threshold.assert_called_once()


@pytest.mark.parametrize("target", [major_cats, minor_cats])
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_analysis")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_preds")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_multilabel_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.predict_multilabel_sklearn")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_thresholds")
@patch("pxtextmining.pipelines.multilabel_pipeline.create_and_train_svc_model")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_test_split")
@patch("pxtextmining.pipelines.multilabel_pipeline.process_and_split_data", create=True)
@patch("pxtextmining.pipelines.multilabel_pipeline.load_multilabel_data")
@pytest.mark.parametrize("custom_threshold", [True, False])
def test_svc_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_traintestsplit,
    mock_svc_model,  # patches create_and_train_svc_model
    mock_threshold,
    mock_predict,
    mock_metrics,
    mock_write,
    mock_writepreds,
    mock_writeanalysis,
    target,
    custom_threshold,
):
    # arrange mocks
    mock_traintestsplit.return_value = (1, 2, 3, 4)
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_svc_model.return_value = (Mock(), "training_time")

    # act
    multilabel_pipeline.run_svc_pipeline(
        target=target, include_analysis=True, custom_threshold=custom_threshold
    )

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_svc_model.assert_called_once()
    mock_predict.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()
    mock_writepreds.assert_called_once()
    mock_writeanalysis.assert_called_once()
    if custom_threshold is True:
        mock_traintestsplit.assert_called_once()
        mock_threshold.assert_called_once()
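

# Worth noting for the signatures in this module: @patch mocks arrive
# positionally in bottom-up decorator order, while @pytest.mark.parametrize
# values ("target", "custom_threshold") are bound by name, so they can sit at
# the end of the argument list. A self-contained illustration (the os.getcwd
# target is arbitrary, chosen only for this demo):
@pytest.mark.parametrize("flag", [True, False])
@patch("os.getcwd")
def test_patch_with_parametrize_sketch(mock_getcwd, flag):
    assert isinstance(flag, bool)  # parametrized value, matched by name
    mock_getcwd.assert_not_called()  # mock, filled positionally first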


@pytest.mark.parametrize("target", [major_cats, minor_cats])
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_analysis")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_preds")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_multilabel_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.predict_multilabel_bert")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_bert_model")
@patch("pxtextmining.pipelines.multilabel_pipeline.create_bert_model")
@patch("pxtextmining.pipelines.multilabel_pipeline.calculating_class_weights")
@patch("pxtextmining.pipelines.multilabel_pipeline.bert_data_to_dataset")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_test_split")
@patch("pxtextmining.pipelines.multilabel_pipeline.process_and_split_data")
@patch("pxtextmining.pipelines.multilabel_pipeline.load_multilabel_data")
def test_bert_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_traintest,
    mock_bertdata,
    mock_classweights,
    mock_createbert,
    mock_trainbert,
    mock_predict,
    mock_metrics,
    mock_write,
    mock_writepreds,
    mock_writeanalysis,
    target,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_traintest.return_value = ("X_train_val", "X_test", "Y_train_val", "Y_test")
    mock_trainbert.return_value = (1, 2)

    # act
    multilabel_pipeline.run_bert_pipeline(target=target, include_analysis=True)

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_traintest.assert_called_once()
    mock_bertdata.assert_called()
    mock_classweights.assert_called_once()
    mock_createbert.assert_called_once()
    mock_trainbert.assert_called_once()
    mock_predict.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()
    mock_writepreds.assert_called_once()
    mock_writeanalysis.assert_called_once()


@pytest.mark.parametrize("target", [major_cats, minor_cats])
@patch("pxtextmining.pipelines.multilabel_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_multilabel_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.predict_multilabel_bert")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_thresholds")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_bert_model")
@patch(
    "pxtextmining.pipelines.multilabel_pipeline.create_bert_model_additional_features"
)
@patch("pxtextmining.pipelines.multilabel_pipeline.calculating_class_weights")
@patch("pxtextmining.pipelines.multilabel_pipeline.bert_data_to_dataset")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_test_split")
@patch("pxtextmining.pipelines.multilabel_pipeline.process_and_split_data")
@patch("pxtextmining.pipelines.multilabel_pipeline.load_multilabel_data")
@pytest.mark.parametrize("custom_threshold", [True, False])
def test_bert_pipeline_additional_features(
    mock_dataload,
    mock_datasplit,
    mock_traintest,
    mock_bertdata,
    mock_classweights,
    mock_createbert,
    mock_trainbert,
    mock_thresholds,
    mock_predict,
    mock_metrics,
    mock_write,
    target,
    custom_threshold,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_traintest.return_value = ("X_train_val", "X_test", "Y_train_val", "Y_test")
    mock_trainbert.return_value = (Mock(), 2)

    # act
    multilabel_pipeline.run_bert_pipeline(
        target=target,
        additional_features=True,
        include_analysis=False,
        custom_threshold=custom_threshold,
    )

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_traintest.assert_called_once()
    mock_bertdata.assert_called()
    mock_classweights.assert_called_once()
    mock_createbert.assert_called_once()
    mock_trainbert.assert_called_once()
    mock_predict.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()
    if custom_threshold is True:
        mock_thresholds.assert_called_once()
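

# The custom_threshold assertions in this module all follow the same implied
# control flow: the extra train_test_split and get_thresholds calls happen
# only when a custom threshold is requested. The helper below is an
# illustrative sketch of that branching -- names and structure are
# assumptions, not pxtextmining's actual implementation:
def _custom_threshold_flow_sketch(custom_threshold, X, Y, split, fit_thresholds):
    if custom_threshold:
        # carve out a validation set and tune per-label thresholds on it
        X_train, X_val, Y_train, Y_val = split(X, Y)
        thresholds = fit_thresholds(X_val, Y_val)
    else:
        X_train, Y_train, thresholds = X, Y, None
    return X_train, Y_train, thresholds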
--------------------------------------------------------------------------------
/tests/test_sentiment_pipeline.py:
--------------------------------------------------------------------------------
from unittest.mock import patch

from pxtextmining.pipelines import sentiment_pipeline


@patch("pxtextmining.pipelines.sentiment_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.sentiment_pipeline.get_multiclass_metrics")
@patch("pxtextmining.pipelines.sentiment_pipeline.search_sklearn_pipelines")
@patch("pxtextmining.pipelines.sentiment_pipeline.process_and_split_data", create=True)
@patch("pxtextmining.pipelines.sentiment_pipeline.load_multilabel_data")
def test_sentiment_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_skpipeline,
    mock_metrics,
    mock_write,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_skpipeline.return_value = (["model"], ["training_time"])

    # act
    sentiment_pipeline.run_sentiment_pipeline()

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_skpipeline.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()


@patch("pxtextmining.pipelines.sentiment_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.sentiment_pipeline.get_multiclass_metrics")
@patch("pxtextmining.pipelines.sentiment_pipeline.train_bert_model")
@patch("pxtextmining.pipelines.sentiment_pipeline.create_bert_model")
@patch("pxtextmining.pipelines.sentiment_pipeline.compute_class_weight")
@patch("pxtextmining.pipelines.sentiment_pipeline.bert_data_to_dataset")
@patch("pxtextmining.pipelines.sentiment_pipeline.train_test_split")
@patch("pxtextmining.pipelines.sentiment_pipeline.to_categorical")
@patch("pxtextmining.pipelines.sentiment_pipeline.process_and_split_data")
@patch("pxtextmining.pipelines.sentiment_pipeline.load_multilabel_data")
def test_bert_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_categorical,
    mock_traintest,
    mock_bertdata,
    mock_classweights,
    mock_createbert,
    mock_trainbert,
    mock_metrics,
    mock_write,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_traintest.return_value = ("X_train_val", "X_test", "Y_train_val", "Y_test")
    mock_trainbert.return_value = (1, 2)

    # act
    sentiment_pipeline.run_sentiment_bert_pipeline(additional_features=False)

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_categorical.assert_called_once()
    mock_traintest.assert_called_once()
    mock_bertdata.assert_called()
    mock_classweights.assert_called_once()
    mock_createbert.assert_called_once()
    mock_trainbert.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()
@patch("pxtextmining.pipelines.sentiment_pipeline.to_categorical") 86 | @patch("pxtextmining.pipelines.sentiment_pipeline.process_and_split_data") 87 | @patch("pxtextmining.pipelines.sentiment_pipeline.load_multilabel_data") 88 | def test_bert_pipeline_additional_features( 89 | mock_dataload, 90 | mock_datasplit, 91 | mock_categorical, 92 | mock_traintest, 93 | mock_bertdata, 94 | mock_classweights, 95 | mock_createbert, 96 | mock_trainbert, 97 | mock_metrics, 98 | mock_write, 99 | ): 100 | # arrange mocks 101 | mock_datasplit.return_value = (1, 2, 3, 4) 102 | mock_traintest.return_value = ("X_train_val", "X_test", "Y_train_val", "Y_test") 103 | mock_trainbert.return_value = (1, 2) 104 | mock_classweights.return_value = [0.5, 0.2] 105 | 106 | # act 107 | sentiment_pipeline.run_sentiment_bert_pipeline(additional_features=True) 108 | 109 | # assert 110 | mock_dataload.assert_called_once() 111 | mock_datasplit.assert_called_once() 112 | mock_categorical.assert_called_once() 113 | mock_traintest.assert_called_once() 114 | mock_bertdata.assert_called() 115 | mock_classweights.assert_called_once() 116 | mock_createbert.assert_called_once() 117 | mock_trainbert.assert_called_once() 118 | mock_metrics.assert_called_once() 119 | mock_write.assert_called_once() 120 | -------------------------------------------------------------------------------- /tests/test_write_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.mock import Mock, mock_open, patch 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from sklearn.dummy import DummyClassifier 8 | from tensorflow.keras import Model 9 | 10 | from pxtextmining.factories import factory_write_results 11 | 12 | 13 | @patch("pxtextmining.factories.factory_write_results.pickle.dump", Mock()) 14 | @patch( 15 | "builtins.open", 16 | new_callable=mock_open, 17 | read_data="somestr", 18 | ) 19 | @pytest.mark.parametrize("models", [[Mock(spec=Model)], [Mock(spec=DummyClassifier)]]) 20 | def test_write_multilabel_models_and_metrics(mock_file, tmp_path_factory, models): 21 | # arrange 22 | models = models 23 | model_metrics = ["somestr"] 24 | path = tmp_path_factory.mktemp("somepath") 25 | # act 26 | factory_write_results.write_multilabel_models_and_metrics( 27 | models, model_metrics, path 28 | ) 29 | # assert 30 | if isinstance(models[0], Model): 31 | models[0].save.assert_called_once() 32 | mock_file.assert_called_with(os.path.join(path, "model_0.txt"), "w") 33 | assert open(os.path.join("somepath", "model_0.txt")).read() == "somestr" 34 | 35 | 36 | @patch( 37 | "builtins.open", 38 | new_callable=mock_open, 39 | read_data="somestr", 40 | ) 41 | @patch("pxtextmining.factories.factory_write_results.os.makedirs") 42 | def test_write_multilabel_models_and_metrics_nopath( 43 | mock_makedirs, mock_file_open, tmp_path 44 | ): 45 | # arrange 46 | models = [Mock(spec=Model)] 47 | model_metrics = ["somestr"] 48 | path = "somepath" 49 | # act 50 | factory_write_results.write_multilabel_models_and_metrics( 51 | models, model_metrics, path 52 | ) 53 | # assert 54 | mock_makedirs.assert_called_once_with(path) 55 | 56 | 57 | @patch("pxtextmining.factories.factory_write_results.pd.DataFrame.to_excel") 58 | def test_write_model_preds_sklearn(mock_toexcel, grab_test_X_additional_feats): 59 | x = grab_test_X_additional_feats["FFT answer"] 60 | # arrange 61 | y_true = np.array( 62 | [ 63 | [0.0, 1.0, 0.0, 0.0, 0.0], 64 | [1.0, 0.0, 0.0, 1.0, 1.0], 65 | [1.0, 0.0, 0.0, 0.0, 0.0], 66 | [0.0, 


@patch("pxtextmining.factories.factory_write_results.pd.DataFrame.to_excel")
def test_write_model_preds_sklearn(mock_toexcel, grab_test_X_additional_feats):
    x = grab_test_X_additional_feats["FFT answer"]
    # arrange
    y_true = np.array(
        [
            [0.0, 1.0, 0.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 1.0, 1.0],
            [1.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    probs_labels = ["Probability of " + label for label in labels]
    preds_df = pd.DataFrame(
        np.array(
            [
                [0.0, 1.0, 0.0, 1.0, 0.0, 0.1, 0.6, 0.2, 0.7, 0.05],
                [1.0, 0.0, 0.0, 1.0, 0.0, 0.55, 0.2, 0.3, 0.8, 0.4],
                [1.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.3, 0.2, 0.3, 0.1],
                [1.0, 0.0, 1.0, 1.0, 0.0, 0.7, 0.2, 0.8, 0.9, 0.0],
                [0.0, 0.0, 0.0, 0.0, 1.0, 0.2, 0.4, 0.2, 0.1, 0.6],
            ]
        ),
        columns=labels + probs_labels,
        index=grab_test_X_additional_feats.index,
    )
    preds_df["labels"] = [
        ["two", "four"],
        ["one", "four"],
        ["one"],
        ["one", "three", "four"],
        ["five"],
    ]
    path = "somepath.xlsx"
    # act
    df = factory_write_results.write_model_preds(
        x, y_true, preds_df, labels, path=path, return_df=True
    )
    # assert
    assert df.shape[0] == len(x)
    mock_toexcel.assert_called()


@patch("pxtextmining.factories.factory_write_results.pd.DataFrame.to_excel")
@patch("pxtextmining.factories.factory_write_results.parse_metrics_file")
def test_write_model_analysis(
    mock_parsemetrics,
    mock_toexcel,
    grab_preds_df,
):
    mock_parsemetrics.return_value = pd.DataFrame(
        {
            "label": {0: "one", 1: "two", 2: "three", 3: "four", 4: "five"},
            "precision": {0: 0.46, 1: 0.54, 2: 0.52, 3: 0.54, 4: 0.52},
            "recall": {0: 0.43, 1: 0.82, 2: 0.65, 3: 0.82, 4: 0.65},
            "f1_score": {0: 0.44, 1: 0.65, 2: 0.58, 3: 0.65, 4: 0.58},
            "support (label count in test data)": {
                0: 129,
                1: 115,
                2: 20,
                3: 115,
                4: 20,
            },
        }
    )
    labels = ["one", "two", "three", "four", "five"]
    dataset = grab_preds_df.copy()
    preds_df = grab_preds_df
    y_true = np.array(grab_preds_df[labels])

    factory_write_results.write_model_analysis(
        "model_name",
        labels=labels,
        dataset=dataset,
        path="somepath",
        preds_df=preds_df,
        y_true=y_true,
        custom_threshold_dict=None,
    )
    mock_toexcel.assert_called_once()
--------------------------------------------------------------------------------