├── .coveragerc ├── .github └── workflows │ └── test_package.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── api ├── __init__.py ├── api.py ├── final_svc.sav ├── requirements.txt ├── schemas.py └── test_api_locally.py ├── coverage.xml ├── current_best_model ├── final_bert │ ├── bert_perf.xlsx │ └── bert_summary.txt ├── final_ensemble │ ├── ensemble_perf.xlsx │ └── ensemble_summary.txt ├── final_svc │ ├── final_svc.sav │ ├── final_svc_perf.xlsx │ └── final_svc_summary.txt ├── final_xgb │ ├── final_xgb.sav │ ├── final_xgb_perf.xlsx │ └── final_xgb_summary.txt └── sentiment │ ├── bert_sentiment.txt │ ├── confusion_matrix_3_counts.png │ ├── confusion_matrix_3_percentages.png │ ├── confusion_matrix_5_counts.png │ └── confusion_matrix_5_percentages.png ├── datasets ├── README.md ├── phase_1 │ ├── README.md │ ├── co.csv │ ├── co_multi_label.csv │ └── text_data.csv ├── testing │ └── test_data.csv └── v6framework_230831.csv ├── docker_README.md ├── docker_data ├── data_in │ ├── file_01.json │ └── file_02.json └── data_out │ └── file_01.json ├── docker_run.py ├── docs ├── about.md ├── create_docs.py ├── getting started │ ├── install.md │ ├── package.md │ ├── training_new_model.md │ └── using_trained_model.md ├── index.md ├── main.css └── reference │ ├── API │ ├── API.md │ ├── quick_API.md │ └── slow_API.md │ ├── Docker │ └── docker_README.md │ └── pxtextmining │ ├── factories │ ├── factory_data_load_and_split.md │ ├── factory_model_performance.md │ ├── factory_pipeline.md │ ├── factory_predict_unlabelled_text.md │ └── factory_write_results.md │ ├── helpers │ └── text_preprocessor.md │ └── pipelines │ ├── multilabel_pipeline.md │ └── sentiment_pipeline.md ├── mkdocs.yml ├── poetry.lock ├── pxtextmining ├── __init__.py ├── factories │ ├── __init__.py │ ├── factory_data_load_and_split.py │ ├── factory_model_performance.py │ ├── factory_pipeline.py │ ├── factory_predict_unlabelled_text.py │ └── factory_write_results.py ├── helpers │ ├── __init__.py │ └── text_preprocessor.py ├── params.py └── pipelines │ ├── __init__.py │ ├── multilabel_pipeline.py │ └── sentiment_pipeline.py ├── pyproject.toml ├── setup.py ├── test_multilabel └── dummy_metrics.txt └── tests ├── __init__.py ├── conftest.py ├── test_api.py ├── test_data_load_and_split.py ├── test_docker_run.py ├── test_factory_pipeline.py ├── test_helpers.py ├── test_model_performance.py ├── test_multilabel_pipeline.py ├── test_predict_unlabelled_text.py ├── test_sentiment_pipeline.py └── test_write_results.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = tests\* 3 | *\__init__.py 4 | *\params.py 5 | api\test_api_locally.py 6 | setup.py 7 | test_rules.py 8 | 9 | source = api 10 | pxtextmining 11 | 12 | [report] 13 | exclude_lines = 14 | if __name__ == .__main__.: 15 | -------------------------------------------------------------------------------- /.github/workflows/test_package.yaml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | 3 | on: 4 | push: 5 | branches: [ $default-branch ] 6 | pull_request: 7 | branches: 8 | - development 9 | - main 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.9", "3.10"] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Install poetry 23 | run: pipx install poetry 24 | - name: Ruff 25 | uses: chartboost/ruff-action@v1 26 | - name: Set up 
Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v4 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | cache: 'poetry' 31 | - name: Install package 32 | run: poetry install --with dev 33 | - name: Run tests 34 | run: poetry run pytest tests/* -sx 35 | - name: Upload coverage reports to Codecov 36 | if: ${{ matrix.python-version == '3.10' }} 37 | uses: codecov/codecov-action@v3 38 | env: 39 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .python-version 2 | *__pycache__* 3 | notebooks/* 4 | my.conf 5 | site/ 6 | dist/ 7 | .vscode/ 8 | datasets/hidden/* 9 | test_multilabel/* 10 | .env 11 | api/rsconnect-python/* 12 | .coverage 13 | *_labels.xlsx 14 | current_best_model/final_bert/bert_multilabel 15 | current_best_model/sentiment/bert_sentiment 16 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '(build|datasets|current_best_multilabel|docs)/.*' 2 | 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v2.3.0 6 | hooks: 7 | - id: check-added-large-files 8 | name: Check for files larger than 75 MB 9 | args: [ "--maxkb=75000" ] 10 | - id: end-of-file-fixer 11 | name: Check for a blank line at the end of scripts (auto-fixes) 12 | exclude: 'json' 13 | - id: trailing-whitespace 14 | name: Check for trailing whitespaces (auto-fixes) 15 | - repo: https://github.com/pycqa/isort 16 | rev: 5.12.0 17 | hooks: 18 | - id: isort 19 | name: isort - Sort Python imports (auto-fixes) 20 | args: [ "--profile", "black", "--filter-files" ] 21 | - repo: https://github.com/astral-sh/ruff-pre-commit 22 | rev: v0.0.272 23 | hooks: 24 | - id: ruff 25 | name: Ruff linting 26 | - repo: https://github.com/psf/black 27 | rev: 22.10.0 28 | hooks: 29 | - id: black 30 | name: black - consistent Python code formatting (auto-fixes) 31 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.13-slim 2 | VOLUME /data 3 | 4 | COPY pxtextmining /pxtextmining 5 | COPY pyproject.toml /pyproject.toml 6 | COPY docker_README.md /README.md 7 | RUN pip install --upgrade pip setuptools \ 8 | && pip install . 
\ 9 | && rm -rf /root/.cache 10 | COPY current_best_model/sentiment/bert_sentiment bert_sentiment 11 | COPY current_best_model/final_bert/bert_multilabel bert_multilabel 12 | COPY current_best_model/final_svc/final_svc.sav /final_svc.sav 13 | COPY current_best_model/final_xgb/final_xgb.sav /final_xgb.sav 14 | COPY --chmod=755 docker_run.py docker_run.py 15 | 16 | LABEL org.opencontainers.image.source=https://github.com/the-strategy-unit/pxtextmining 17 | 18 | ENTRYPOINT ["python3", "docker_run.py"] 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 NHS England 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | coverage: 2 | pytest --cov=. tests/ --cov-report xml:coverage.xml --cov-report term 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pxtextmining: Text Classification of Patient Experience feedback 2 | 3 | ## Project description 4 | **pxtextmining** is a Python package for classifying patient feedback comments collected via the [NHS England Friends and Family Test](https://www.england.nhs.uk/fft/) (FFT). It is part of the [Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/), funded by NHS England and hosted by Nottinghamshire Healthcare NHS Foundation Trust. 5 | 6 | __We are working openly by [open-sourcing](https://github.com/The-Strategy-Unit/pxtextmining/blob/main/LICENSE) the analysis code and data where possible to promote replication, reproducibility and further developments. Pull requests are more than welcome.__ 7 | 8 | ## Documentation and installation 9 | 10 | Full documentation, including installation instructions, is available on our [documentation page](https://the-strategy-unit.github.io/pxtextmining/). 
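
## Quick example

For a quick illustration of what the package does, the sketch below mirrors how `api/api.py` loads the pickled SVC model and generates multilabel predictions. This is a minimal sketch, assuming the repository has been cloned so that `current_best_model/final_svc/final_svc.sav` is present locally; the example comments are made up.

```python
# Minimal sketch of multilabel prediction, mirroring api/api.py.
# Assumes a cloned repository with the trained SVC model available locally.
import pickle

import pandas as pd

from pxtextmining.factories.factory_predict_unlabelled_text import (
    predict_multilabel_sklearn,
)
from pxtextmining.params import minor_cats

with open("current_best_model/final_svc/final_svc.sav", "rb") as f:
    model = pickle.load(f)

# A pandas Series of comments, indexed by comment ID.
text_to_predict = pd.Series(
    ["The nurses were friendly.", "Parking was awful."],
    index=["1", "2"],
)
preds_df = predict_multilabel_sklearn(
    text_to_predict, model, labels=minor_cats, additional_features=False
)
print(preds_df["labels"])
```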
11 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/api/__init__.py -------------------------------------------------------------------------------- /api/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import List 4 | 5 | import pandas as pd 6 | import schemas 7 | from fastapi import FastAPI 8 | 9 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 10 | predict_multilabel_sklearn, 11 | ) 12 | from pxtextmining.params import minor_cats 13 | 14 | description = """ 15 | This API is for classifying patient experience qualitative data, 16 | utilising the models trained as part of the pxtextmining project. 17 | """ 18 | 19 | tags_metadata = [ 20 | {"name": "index", "description": "Basic page to test if API is working."}, 21 | { 22 | "name": "multilabel", 23 | "description": "Generate multilabel predictions for given text.", 24 | }, 25 | ] 26 | 27 | 28 | app = FastAPI( 29 | title="pxtextmining API", 30 | description=description, 31 | version="1.0.0", 32 | contact={ 33 | "name": "Patient Experience Qualitative Data Categorisation", 34 | "url": "https://the-strategy-unit.github.io/PatientExperience-QDC/", 35 | "email": "chris.beeley1@nhs.net", 36 | }, 37 | license_info={ 38 | "name": "MIT License", 39 | "url": "https://github.com/the-strategy-unit/pxtextmining/blob/main/LICENSE", 40 | }, 41 | openapi_tags=tags_metadata, 42 | ) 43 | 44 | 45 | @app.get("/", response_model=schemas.Test, tags=["index"]) 46 | def index(): 47 | return {"test": "Hello"} 48 | 49 | 50 | @app.post( 51 | "/predict_multilabel", 52 | response_model=List[schemas.MultilabelOut], 53 | tags=["multilabel"], 54 | ) 55 | async def predict_multilabel(items: List[schemas.ItemIn]): 56 | """Accepts comment ids and comment text as JSON in a POST request. Makes predictions using trained SVC model. 57 | 58 | Args: 59 | items (List[ItemIn]): JSON list of dictionaries with the following compulsory keys: 60 | - `comment_id` (str) 61 | - `comment_text` (str) 62 | 63 | Returns: 64 | (dict): Keys are: `comment_id` and predicted `labels`. 
65 | """ 66 | 67 | # Process received data 68 | df = pd.DataFrame([i.dict() for i in items], dtype=str) 69 | df_for_preds = df.copy().rename( 70 | columns={"comment_id": "Comment ID", "comment_text": "FFT answer"} 71 | ) 72 | df_for_preds = df_for_preds.set_index("Comment ID") 73 | if df_for_preds.index.duplicated().sum() != 0: 74 | raise ValueError("comment_id must all be unique values") 75 | text_to_predict = df_for_preds["FFT answer"] 76 | # Make predictions 77 | model_path = "final_svc.sav" 78 | if not os.path.isfile(model_path): 79 | model_path = os.path.join("api", model_path) 80 | with open(model_path, "rb") as model: 81 | loaded_model = pickle.load(model) 82 | preds_df = predict_multilabel_sklearn( 83 | text_to_predict, loaded_model, labels=minor_cats, additional_features=False 84 | ) 85 | # Join predicted labels with received data 86 | preds_df["comment_id"] = preds_df.index.astype(str) 87 | merged = pd.merge(df, preds_df, how="left", on="comment_id") 88 | merged["labels"] = merged["labels"].fillna("").apply(list) 89 | for i in merged["labels"].index: 90 | label_list = merged.loc[i, "labels"] 91 | if len(label_list) < 1: 92 | merged.loc[i, "labels"].append("Labelling not possible") 93 | return_dict = merged[["comment_id", "labels"]].to_dict(orient="records") 94 | return return_dict 95 | -------------------------------------------------------------------------------- /api/final_svc.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/api/final_svc.sav -------------------------------------------------------------------------------- /api/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.1.0 ; python_version >= "3.8" and python_version < "3.11" 2 | anyio==4.2.0 ; python_version >= "3.8" and python_version < "3.11" 3 | astunparse==1.6.3 ; python_version >= "3.8" and python_version < "3.11" 4 | cachetools==5.3.2 ; python_version >= "3.8" and python_version < "3.11" 5 | certifi==2024.2.2 ; python_version >= "3.8" and python_version < "3.11" 6 | cfgv==3.4.0 ; python_version >= "3.8" and python_version < "3.11" 7 | charset-normalizer==3.3.2 ; python_version >= "3.8" and python_version < "3.11" 8 | click==8.1.7 ; python_version >= "3.8" and python_version < "3.11" 9 | colorama==0.4.6 ; python_version >= "3.8" and python_version < "3.11" and (sys_platform == "win32" or platform_system == "Windows") 10 | contourpy==1.1.1 ; python_version >= "3.8" and python_version < "3.11" 11 | coverage[toml]==7.4.1 ; python_version >= "3.8" and python_version < "3.11" 12 | cycler==0.12.1 ; python_version >= "3.8" and python_version < "3.11" 13 | distlib==0.3.8 ; python_version >= "3.8" and python_version < "3.11" 14 | exceptiongroup==1.2.0 ; python_version >= "3.8" and python_version < "3.11" 15 | fastapi==0.101.1 ; python_version >= "3.8" and python_version < "3.11" 16 | filelock==3.13.1 ; python_version >= "3.8" and python_version < "3.11" 17 | flatbuffers==23.5.26 ; python_version >= "3.8" and python_version < "3.11" 18 | fonttools==4.48.1 ; python_version >= "3.8" and python_version < "3.11" 19 | fsspec==2024.2.0 ; python_version >= "3.8" and python_version < "3.11" 20 | gast==0.4.0 ; python_version >= "3.8" and python_version < "3.11" 21 | google-auth-oauthlib==1.0.0 ; python_version >= "3.8" and python_version < "3.11" 22 | google-auth==2.27.0 ; python_version >= "3.8" and python_version < "3.11" 23 | 
google-pasta==0.2.0 ; python_version >= "3.8" and python_version < "3.11" 24 | grpcio==1.60.1 ; python_version >= "3.8" and python_version < "3.11" 25 | h11==0.14.0 ; python_version >= "3.8" and python_version < "3.11" 26 | h5py==3.10.0 ; python_version >= "3.8" and python_version < "3.11" 27 | httpcore==0.16.3 ; python_version >= "3.8" and python_version < "3.11" 28 | httpx==0.23.3 ; python_version >= "3.8" and python_version < "3.11" 29 | huggingface-hub==0.20.3 ; python_version >= "3.8" and python_version < "3.11" 30 | identify==2.5.34 ; python_version >= "3.8" and python_version < "3.11" 31 | idna==3.6 ; python_version >= "3.8" and python_version < "3.11" 32 | importlib-metadata==7.0.1 ; python_version >= "3.8" and python_version < "3.10" 33 | importlib-resources==6.1.1 ; python_version >= "3.8" and python_version < "3.10" 34 | iniconfig==2.0.0 ; python_version >= "3.8" and python_version < "3.11" 35 | jax==0.4.13 ; python_version >= "3.8" and python_version < "3.11" 36 | joblib==1.3.2 ; python_version >= "3.8" and python_version < "3.11" 37 | keras==2.12.0 ; python_version >= "3.8" and python_version < "3.11" 38 | kiwisolver==1.4.5 ; python_version >= "3.8" and python_version < "3.11" 39 | libclang==16.0.6 ; python_version >= "3.8" and python_version < "3.11" 40 | markdown==3.5.2 ; python_version >= "3.8" and python_version < "3.11" 41 | markupsafe==2.1.5 ; python_version >= "3.8" and python_version < "3.11" 42 | matplotlib==3.7.4 ; python_version >= "3.8" and python_version < "3.11" 43 | ml-dtypes==0.2.0 ; python_version >= "3.8" and python_version < "3.11" 44 | nodeenv==1.8.0 ; python_version >= "3.8" and python_version < "3.11" 45 | numpy==1.23.5 ; python_version >= "3.8" and python_version < "3.11" 46 | oauthlib==3.2.2 ; python_version >= "3.8" and python_version < "3.11" 47 | opt-einsum==3.3.0 ; python_version >= "3.8" and python_version < "3.11" 48 | packaging==23.2 ; python_version >= "3.8" and python_version < "3.11" 49 | pandas==1.5.3 ; python_version >= "3.8" and python_version < "3.11" 50 | pillow==10.2.0 ; python_version >= "3.8" and python_version < "3.11" 51 | platformdirs==4.2.0 ; python_version >= "3.8" and python_version < "3.11" 52 | pluggy==1.4.0 ; python_version >= "3.8" and python_version < "3.11" 53 | pre-commit==3.5.0 ; python_version >= "3.8" and python_version < "3.11" 54 | protobuf==4.25.2 ; python_version >= "3.8" and python_version < "3.11" 55 | pyasn1-modules==0.3.0 ; python_version >= "3.8" and python_version < "3.11" 56 | pyasn1==0.5.1 ; python_version >= "3.8" and python_version < "3.11" 57 | pydantic==1.10.14 ; python_version >= "3.8" and python_version < "3.11" 58 | pyparsing==3.1.1 ; python_version >= "3.8" and python_version < "3.11" 59 | pytest-cov==4.1.0 ; python_version >= "3.8" and python_version < "3.11" 60 | pytest-mock==3.12.0 ; python_version >= "3.8" and python_version < "3.11" 61 | pytest==7.4.4 ; python_version >= "3.8" and python_version < "3.11" 62 | python-dateutil==2.8.2 ; python_version >= "3.8" and python_version < "3.11" 63 | pytz==2024.1 ; python_version >= "3.8" and python_version < "3.11" 64 | pyyaml==6.0.1 ; python_version >= "3.8" and python_version < "3.11" 65 | regex==2023.12.25 ; python_version >= "3.8" and python_version < "3.11" 66 | requests-oauthlib==1.3.1 ; python_version >= "3.8" and python_version < "3.11" 67 | requests==2.31.0 ; python_version >= "3.8" and python_version < "3.11" 68 | rfc3986[idna2008]==1.5.0 ; python_version >= "3.8" and python_version < "3.11" 69 | rsa==4.9 ; python_version >= "3.8" and 
python_version < "3.11" 70 | ruff==0.0.272 ; python_version >= "3.8" and python_version < "3.11" 71 | safetensors==0.4.2 ; python_version >= "3.8" and python_version < "3.11" 72 | scikit-learn==1.0.2 ; python_version >= "3.8" and python_version < "3.11" 73 | scipy==1.10.1 ; python_version >= "3.8" and python_version < "3.11" 74 | setuptools==69.1.0 ; python_version >= "3.8" and python_version < "3.11" 75 | six==1.16.0 ; python_version >= "3.8" and python_version < "3.11" 76 | sniffio==1.3.0 ; python_version >= "3.8" and python_version < "3.11" 77 | starlette==0.27.0 ; python_version >= "3.8" and python_version < "3.11" 78 | tensorboard-data-server==0.7.2 ; python_version >= "3.8" and python_version < "3.11" 79 | tensorboard==2.12.3 ; python_version >= "3.8" and python_version < "3.11" 80 | tensorflow-estimator==2.12.0 ; python_version >= "3.8" and python_version < "3.11" 81 | tensorflow-io-gcs-filesystem==0.36.0 ; python_version >= "3.8" and python_version < "3.11" and platform_machine != "arm64" or python_version >= "3.8" and python_version < "3.11" and platform_system != "Darwin" 82 | tensorflow==2.12.0 ; python_version >= "3.8" and python_version < "3.11" 83 | termcolor==2.4.0 ; python_version >= "3.8" and python_version < "3.11" 84 | threadpoolctl==3.3.0 ; python_version >= "3.8" and python_version < "3.11" 85 | tokenizers==0.15.2 ; python_version >= "3.8" and python_version < "3.11" 86 | tomli==2.0.1 ; python_version >= "3.8" and python_version < "3.11" 87 | tornado==6.4 ; python_version >= "3.8" and python_version < "3.11" 88 | tqdm==4.66.2 ; python_version >= "3.8" and python_version < "3.11" 89 | transformers==4.37.2 ; python_version >= "3.8" and python_version < "3.11" 90 | typing-extensions==4.9.0 ; python_version >= "3.8" and python_version < "3.11" 91 | urllib3==2.2.0 ; python_version >= "3.8" and python_version < "3.11" 92 | uvicorn==0.20.0 ; python_version >= "3.8" and python_version < "3.11" 93 | virtualenv==20.25.0 ; python_version >= "3.8" and python_version < "3.11" 94 | werkzeug==3.0.1 ; python_version >= "3.8" and python_version < "3.11" 95 | wheel==0.42.0 ; python_version >= "3.8" and python_version < "3.11" 96 | wrapt==1.14.1 ; python_version >= "3.8" and python_version < "3.11" 97 | xgboost==1.7.6 ; python_version >= "3.8" and python_version < "3.11" 98 | zipp==3.17.0 ; python_version >= "3.8" and python_version < "3.10" 99 | -------------------------------------------------------------------------------- /api/schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Test(BaseModel): 5 | test: str 6 | 7 | class Config: 8 | schema_extra = {"example": {"test": "Hello"}} 9 | 10 | 11 | class ItemIn(BaseModel): 12 | comment_id: str 13 | comment_text: str 14 | 15 | class Config: 16 | schema_extra = { 17 | "example": { 18 | "comment_id": "01", 19 | "comment_text": "Nurses were friendly. 
Parking was awful.", 20 | } 21 | } 22 | 23 | 24 | class MultilabelOut(BaseModel): 25 | comment_id: str 26 | labels: list 27 | 28 | class Config: 29 | schema_extra = { 30 | "example": { 31 | "comment_id": "01", 32 | "labels": ["Staff manner & personal attributes", "Parking"], 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /api/test_api_locally.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import pandas as pd 4 | import requests 5 | 6 | """ 7 | To test the API, first in terminal, run this command to launch uvicorn server on http://127.0.0.1:8000 8 | uvicorn api.api:app --reload 9 | Then you can run this test_api script to check if the API is behaving as it should locally 10 | """ 11 | 12 | 13 | def test_json_predictions(json): 14 | response = requests.post("http://127.0.0.1:8000/predict_multilabel", json=json) 15 | return response 16 | 17 | 18 | if __name__ == "__main__": 19 | start = time.time() 20 | df = pd.read_csv("datasets/hidden/merged_230612.csv")[["Comment ID", "FFT answer"]][ 21 | :2000 22 | ] 23 | df = df.rename( 24 | columns={"Comment ID": "row_id", "FFT answer": "comment_txt"} 25 | ).dropna() 26 | df = df[["row_id", "comment_txt"]].copy().set_index("row_id")[:1000] 27 | js = [] 28 | for i in df.index: 29 | js.append({"comment_id": str(i), "comment_text": df.loc[i]["comment_txt"]}) 30 | print("The JSON that was sent looks like:") 31 | print(js[:5]) 32 | print("The JSON that is returned is:") 33 | returned_json = test_json_predictions(js).json() 34 | finish = time.time() 35 | total = finish - start 36 | print(f"Time taken: {total} seconds") 37 | print(returned_json) 38 | # json_object = json.dumps(returned_json, indent=4) 39 | # with open("predictions.json", "w") as outfile: 40 | # outfile.write(json_object) 41 | -------------------------------------------------------------------------------- /current_best_model/final_bert/bert_perf.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_bert/bert_perf.xlsx -------------------------------------------------------------------------------- /current_best_model/final_bert/bert_summary.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 42 4 | 5 | 6 | Model: "DistilBERT" 7 | _________________________________________________________________ 8 | Layer (type) Output Shape Param # 9 | ================================================================= 10 | input_ids (InputLayer) [(None, 150)] 0 11 | 12 | distilbert (TFDistilBertMai TFBaseModelOutput(last_h 66362880 13 | nLayer) idden_state=(None, 150, 14 | 768), 15 | hidden_states=None, att 16 | entions=None) 17 | 18 | tf.__operators__.getitem (S (None, 768) 0 19 | licingOpLambda) 20 | 21 | pooled_output (Dropout) (None, 768) 0 22 | 23 | output (Dense) (None, 32) 24608 24 | 25 | ================================================================= 26 | Total params: 66,387,488 27 | Trainable params: 66,387,488 28 | Non-trainable params: 0 29 | _________________________________________________________________ 30 | 31 | 32 | Training time: 8:47:32 33 | 34 | exact_accuracy: 0.5759920139755428 35 | hamming_loss: 0.024114050411779386 36 | macro_jaccard_score: 0.528819440670572 37 | macro_roc_auc: 0.9619264849220406 38 | 
Label ranking average precision: 0.8684599465486575 39 | 40 | Classification report: 41 | precision recall f1-score support 42 | 43 | Organisation & efficiency 0.47 0.82 0.60 102 44 | Funding & use of financial resources 0.71 0.68 0.69 25 45 | Staff manner & personal attributes 0.94 0.86 0.90 1431 46 | Competence & training 0.61 0.52 0.57 164 47 | Unspecified communication 0.65 0.61 0.63 36 48 | Staff listening, understanding & involving patients 0.57 0.76 0.65 361 49 | Information directly from staff during care 0.78 0.76 0.77 390 50 | Information provision & guidance 0.68 0.38 0.49 90 51 | Being kept informed, clarity & consistency of information 0.49 0.59 0.53 183 52 | Contacting services 0.69 0.59 0.63 100 53 | Appointment arrangements 0.76 0.55 0.64 261 54 | Appointment method 0.63 0.61 0.62 31 55 | Timeliness of care 0.73 0.73 0.73 529 56 | Pain management 0.80 0.56 0.66 43 57 | Discharge 0.72 0.28 0.41 46 58 | Cleanliness, tidiness & infection control 0.87 0.73 0.79 107 59 | Service location 0.85 0.52 0.65 86 60 | Transport to/ from services 0.70 0.65 0.68 78 61 | Parking 0.94 0.89 0.91 18 62 | Electronic entertainment 0.94 0.74 0.83 23 63 | Feeling safe 0.75 0.78 0.77 23 64 | Mental Health Act 0.67 0.31 0.42 13 65 | Labelling not possible 1.00 1.00 1.00 238 66 | Supplying & understanding medication 0.75 0.69 0.72 59 67 | Activities & access to fresh air 0.92 0.63 0.75 54 68 | Food & drink provision & facilities 0.88 0.85 0.87 106 69 | Sensory experience 0.78 0.85 0.81 67 70 | Interaction with family/ carers 0.61 0.34 0.44 123 71 | Positive experience & gratitude 0.86 0.88 0.87 938 72 | Continuity of care 0.72 0.30 0.42 290 73 | Environment, facilities & equipment 0.77 0.58 0.66 202 74 | Staffing levels & responsiveness 0.47 0.44 0.45 194 75 | 76 | micro avg 0.78 0.72 0.75 6411 77 | macro avg 0.74 0.64 0.67 6411 78 | weighted avg 0.79 0.72 0.74 6411 79 | samples avg 0.81 0.78 0.78 6411 80 | -------------------------------------------------------------------------------- /current_best_model/final_ensemble/ensemble_perf.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_ensemble/ensemble_perf.xlsx -------------------------------------------------------------------------------- /current_best_model/final_ensemble/ensemble_summary.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 42 4 | 5 | 6 | Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()), 7 | ('multioutputclassifier', 8 | MultiOutputClassifier(estimator=SVC(C=15, cache_size=1000, 9 | class_weight='balanced', 10 | max_iter=1000, 11 | probability=True)))]) 12 | Model: "DistilBERT" 13 | _________________________________________________________________ 14 | Layer (type) Output Shape Param # 15 | ================================================================= 16 | input_ids (InputLayer) [(None, 150)] 0 17 | 18 | distilbert (TFDistilBertMai TFBaseModelOutput(last_h 66362880 19 | nLayer) idden_state=(None, 150, 20 | 768), 21 | hidden_states=None, att 22 | entions=None) 23 | 24 | tf.__operators__.getitem (S (None, 768) 0 25 | licingOpLambda) 26 | 27 | pooled_output (Dropout) (None, 768) 0 28 | 29 | output (Dense) (None, 32) 24608 30 | 31 | ================================================================= 32 | Total params: 66,387,488 33 | Trainable params: 
66,387,488 34 | Non-trainable params: 0 35 | _________________________________________________________________ 36 | 37 | Pipeline(steps=[('tfidfvectorizer', 38 | TfidfVectorizer(max_df=0.99, min_df=6, ngram_range=(1, 2))), 39 | ('xgbclassifier', 40 | XGBClassifier(base_score=None, booster=None, callbacks=None, 41 | colsample_bylevel=None, colsample_bynode=None, 42 | colsample_bytree=None, 43 | early_stopping_rounds=None, 44 | enable_categorical=False, eval_metric=None, 45 | feature_types=None, gamma=0.3, gpu_id=None, 46 | grow_policy=None, importance_type=None, 47 | interaction_constraints=None, learning_rate=None, 48 | max_bin=None, max_cat_threshold=None, 49 | max_cat_to_onehot=None, max_delta_step=None, 50 | max_depth=4, max_leaves=None, 51 | min_child_weight=0.5, missing=nan, 52 | monotone_constraints=None, n_estimators=200, 53 | n_jobs=None, num_parallel_tree=None, 54 | predictor=None, random_state=None, ...))]) 55 | 56 | 57 | Ensembling method: Average of predicted probabilities for each model taken. Threshold set at 0.3 58 | 59 | exact_accuracy: 0.560019965061143 60 | hamming_loss: 0.023146992762665335 61 | macro_jaccard_score: 0.5644485328931894 62 | macro_roc_auc: 0.9709795934716587 63 | Label ranking average precision: 0.8805306557959275 64 | 65 | Classification report: 66 | precision recall f1-score support 67 | 68 | Organisation & efficiency 0.49 0.75 0.59 102 69 | Funding & use of financial resources 0.64 0.64 0.64 25 70 | Staff manner & personal attributes 0.90 0.90 0.90 1431 71 | Competence & training 0.74 0.57 0.64 164 72 | Unspecified communication 0.61 0.64 0.62 36 73 | Staff listening, understanding & involving patients 0.64 0.75 0.69 361 74 | Information directly from staff during care 0.73 0.80 0.76 390 75 | Information provision & guidance 0.64 0.51 0.57 90 76 | Being kept informed, clarity & consistency of information 0.56 0.66 0.60 183 77 | Contacting services 0.72 0.65 0.68 100 78 | Appointment arrangements 0.73 0.69 0.71 261 79 | Appointment method 0.65 0.65 0.65 31 80 | Timeliness of care 0.67 0.82 0.74 529 81 | Pain management 0.86 0.72 0.78 43 82 | Discharge 0.80 0.52 0.63 46 83 | Cleanliness, tidiness & infection control 0.91 0.86 0.88 107 84 | Service location 0.85 0.59 0.70 86 85 | Transport to/ from services 0.74 0.64 0.68 78 86 | Parking 0.94 0.94 0.94 18 87 | Electronic entertainment 0.94 0.65 0.77 23 88 | Feeling safe 0.69 0.87 0.77 23 89 | Mental Health Act 0.75 0.23 0.35 13 90 | Labelling not possible 1.00 1.00 1.00 238 91 | Supplying & understanding medication 0.73 0.69 0.71 59 92 | Activities & access to fresh air 0.76 0.76 0.76 54 93 | Food & drink provision & facilities 0.87 0.85 0.86 106 94 | Sensory experience 0.80 0.79 0.80 67 95 | Interaction with family/ carers 0.59 0.45 0.51 123 96 | Positive experience & gratitude 0.80 0.91 0.85 938 97 | Continuity of care 0.65 0.58 0.61 290 98 | Environment, facilities & equipment 0.71 0.67 0.69 202 99 | Staffing levels & responsiveness 0.55 0.56 0.56 194 100 | 101 | micro avg 0.76 0.78 0.77 6411 102 | macro avg 0.74 0.70 0.71 6411 103 | weighted avg 0.76 0.78 0.77 6411 104 | samples avg 0.79 0.83 0.79 6411 105 | -------------------------------------------------------------------------------- /current_best_model/final_svc/final_svc.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_svc/final_svc.sav 
-------------------------------------------------------------------------------- /current_best_model/final_svc/final_svc_perf.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_svc/final_svc_perf.xlsx -------------------------------------------------------------------------------- /current_best_model/final_svc/final_svc_summary.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 42 4 | 5 | 6 | Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()), 7 | ('multioutputclassifier', 8 | MultiOutputClassifier(estimator=SVC(C=15, cache_size=1000, 9 | class_weight='balanced', 10 | max_iter=1000, 11 | probability=True)))]) 12 | 13 | 14 | Training time: 0:08:43 15 | 16 | exact_accuracy: 0.4634389817818817 17 | hamming_loss: 0.029105315697529322 18 | macro_jaccard_score: 0.48982370495986105 19 | macro_roc_auc: 0.9515989884263054 20 | Label ranking average precision: 0.8440659360100616 21 | 22 | Classification report: 23 | precision recall f1-score support 24 | 25 | Organisation & efficiency 0.65 0.50 0.57 102 26 | Funding & use of financial resources 0.62 0.64 0.63 25 27 | Staff manner & personal attributes 0.85 0.86 0.86 1431 28 | Competence & training 0.78 0.41 0.54 164 29 | Unspecified communication 0.70 0.44 0.54 36 30 | Staff listening, understanding & involving patients 0.65 0.66 0.65 361 31 | Information directly from staff during care 0.77 0.71 0.74 390 32 | Information provision & guidance 0.67 0.36 0.46 90 33 | Being kept informed, clarity & consistency of information 0.59 0.45 0.51 183 34 | Contacting services 0.72 0.59 0.65 100 35 | Appointment arrangements 0.71 0.59 0.65 261 36 | Appointment method 0.78 0.45 0.57 31 37 | Timeliness of care 0.60 0.71 0.65 529 38 | Pain management 0.88 0.67 0.76 43 39 | Discharge 0.77 0.37 0.50 46 40 | Cleanliness, tidiness & infection control 0.94 0.82 0.88 107 41 | Service location 0.84 0.56 0.67 86 42 | Transport to/ from services 0.67 0.46 0.55 78 43 | Parking 1.00 0.83 0.91 18 44 | Electronic entertainment 1.00 0.57 0.72 23 45 | Feeling safe 0.67 0.52 0.59 23 46 | Mental Health Act 0.67 0.15 0.25 13 47 | Labelling not possible 1.00 1.00 1.00 238 48 | Supplying & understanding medication 0.69 0.59 0.64 59 49 | Activities & access to fresh air 0.71 0.72 0.72 54 50 | Food & drink provision & facilities 0.88 0.70 0.78 106 51 | Sensory experience 0.81 0.69 0.74 67 52 | Interaction with family/ carers 0.57 0.32 0.41 123 53 | Positive experience & gratitude 0.60 0.85 0.70 938 54 | Continuity of care 0.52 0.59 0.55 290 55 | Environment, facilities & equipment 0.77 0.55 0.64 202 56 | Staffing levels & responsiveness 0.56 0.42 0.48 194 57 | 58 | micro avg 0.71 0.70 0.71 6411 59 | macro avg 0.74 0.59 0.64 6411 60 | weighted avg 0.72 0.70 0.70 6411 61 | samples avg 0.74 0.76 0.72 6411 62 | -------------------------------------------------------------------------------- /current_best_model/final_xgb/final_xgb.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_xgb/final_xgb.sav -------------------------------------------------------------------------------- /current_best_model/final_xgb/final_xgb_perf.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/final_xgb/final_xgb_perf.xlsx -------------------------------------------------------------------------------- /current_best_model/final_xgb/final_xgb_summary.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 42 4 | 5 | 6 | Pipeline(steps=[('tfidfvectorizer', 7 | TfidfVectorizer(max_df=0.99, min_df=6, ngram_range=(1, 2))), 8 | ('xgbclassifier', 9 | XGBClassifier(base_score=None, booster=None, callbacks=None, 10 | colsample_bylevel=None, colsample_bynode=None, 11 | colsample_bytree=None, 12 | early_stopping_rounds=None, 13 | enable_categorical=False, eval_metric=None, 14 | feature_types=None, gamma=0.3, gpu_id=None, 15 | grow_policy=None, importance_type=None, 16 | interaction_constraints=None, learning_rate=None, 17 | max_bin=None, max_cat_threshold=None, 18 | max_cat_to_onehot=None, max_delta_step=None, 19 | max_depth=4, max_leaves=None, 20 | min_child_weight=0.5, missing=nan, 21 | monotone_constraints=None, n_estimators=200, 22 | n_jobs=None, num_parallel_tree=None, 23 | predictor=None, random_state=None, ...))]) 24 | 25 | 26 | Training time: 4:15:32 27 | 28 | exact_accuracy: 0.5547791365111056 29 | hamming_loss: 0.02501871724482156 30 | macro_jaccard_score: 0.4596213577953742 31 | macro_roc_auc: 0.9340739201717683 32 | Label ranking average precision: 0.8480797054165633 33 | 34 | Classification report: 35 | precision recall f1-score support 36 | 37 | Organisation & efficiency 0.62 0.45 0.52 102 38 | Funding & use of financial resources 0.75 0.24 0.36 25 39 | Staff manner & personal attributes 0.91 0.87 0.89 1431 40 | Competence & training 0.79 0.39 0.52 164 41 | Unspecified communication 0.56 0.42 0.48 36 42 | Staff listening, understanding & involving patients 0.79 0.56 0.65 361 43 | Information directly from staff during care 0.78 0.69 0.73 390 44 | Information provision & guidance 0.63 0.36 0.45 90 45 | Being kept informed, clarity & consistency of information 0.61 0.32 0.42 183 46 | Contacting services 0.76 0.52 0.62 100 47 | Appointment arrangements 0.74 0.56 0.64 261 48 | Appointment method 0.62 0.42 0.50 31 49 | Timeliness of care 0.71 0.67 0.69 529 50 | Pain management 0.77 0.56 0.65 43 51 | Discharge 0.81 0.37 0.51 46 52 | Cleanliness, tidiness & infection control 0.95 0.78 0.86 107 53 | Service location 0.85 0.53 0.66 86 54 | Transport to/ from services 0.69 0.40 0.50 78 55 | Parking 0.94 0.89 0.91 18 56 | Electronic entertainment 0.92 0.52 0.67 23 57 | Feeling safe 0.73 0.70 0.71 23 58 | Mental Health Act 0.50 0.08 0.13 13 59 | Labelling not possible 1.00 1.00 1.00 238 60 | Supplying & understanding medication 0.77 0.58 0.66 59 61 | Activities & access to fresh air 0.88 0.52 0.65 54 62 | Food & drink provision & facilities 0.92 0.67 0.78 106 63 | Sensory experience 0.77 0.36 0.49 67 64 | Interaction with family/ carers 0.58 0.24 0.34 123 65 | Positive experience & gratitude 0.78 0.86 0.81 938 66 | Continuity of care 0.62 0.45 0.52 290 67 | Environment, facilities & equipment 0.75 0.45 0.56 202 68 | Staffing levels & responsiveness 0.68 0.40 0.50 194 69 | 70 | micro avg 0.80 0.67 0.73 6411 71 | macro avg 0.76 0.53 0.61 6411 72 | weighted avg 0.79 0.67 0.71 6411 73 | samples avg 0.81 0.73 0.75 6411 74 | 
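
The headline metrics reported in these summary files (exact_accuracy, hamming_loss, macro_jaccard_score, macro_roc_auc and label ranking average precision) are standard multilabel measures from `scikit-learn`. As a minimal sketch of how they relate, assuming `y_true` and `y_pred` are multi-hot indicator arrays and `y_probs` the corresponding predicted probabilities (all names here are illustrative):

```python
# Sketch: computing the summary-file metrics with scikit-learn.
# y_true/y_pred are (n_samples, n_labels) binary indicator arrays;
# y_probs holds predicted probabilities of the same shape.
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    hamming_loss,
    jaccard_score,
    label_ranking_average_precision_score,
    roc_auc_score,
)

y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 0, 0], [0, 1, 0]])
y_probs = np.array([[0.9, 0.2, 0.4], [0.1, 0.8, 0.3]])

# Exact accuracy: a sample counts as correct only if every label matches.
print("exact_accuracy:", accuracy_score(y_true, y_pred))
# Hamming loss: fraction of individual label assignments that are wrong.
print("hamming_loss:", hamming_loss(y_true, y_pred))
print("macro_jaccard_score:", jaccard_score(y_true, y_pred, average="macro"))
print("macro_roc_auc:", roc_auc_score(y_true, y_probs, average="macro"))
print(
    "Label ranking average precision:",
    label_ranking_average_precision_score(y_true, y_probs),
)
```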
-------------------------------------------------------------------------------- /current_best_model/sentiment/bert_sentiment.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | Random state seed for train test split is: 75 4 | 5 | 6 | Model: "model" 7 | __________________________________________________________________________________________________ 8 | Layer (type) Output Shape Param # Connected to 9 | ================================================================================================== 10 | input_ids (InputLayer) [(None, 150)] 0 [] 11 | 12 | distilbert (TFDistilBertMainLa TFBaseModelOutput(l 66362880 ['input_ids[0][0]'] 13 | yer) ast_hidden_state=(N 14 | one, 150, 768), 15 | hidden_states=None 16 | , attentions=None) 17 | 18 | input_cat (InputLayer) [(None, 1)] 0 [] 19 | 20 | tf.__operators__.getitem (Slic (None, 768) 0 ['distilbert[0][0]'] 21 | ingOpLambda) 22 | 23 | category_encoding (CategoryEnc (None, 3) 0 ['input_cat[0][0]'] 24 | oding) 25 | 26 | pooled_output (Dropout) (None, 768) 0 ['tf.__operators__.getitem[0][0]' 27 | ] 28 | 29 | dense (Dense) (None, 10) 40 ['category_encoding[0][0]'] 30 | 31 | concatenate (Concatenate) (None, 778) 0 ['pooled_output[0][0]', 32 | 'dense[0][0]'] 33 | 34 | output (Dense) (None, 5) 3895 ['concatenate[0][0]'] 35 | 36 | ================================================================================================== 37 | Total params: 66,366,815 38 | Trainable params: 66,366,815 39 | Non-trainable params: 0 40 | __________________________________________________________________________________________________ 41 | 42 | 43 | Training time: 5:20:56 44 | 45 | 46 | Classification report: 47 | precision recall f1-score support 48 | 49 | very positive 0.80 0.79 0.80 1746 50 | positive 0.63 0.52 0.57 841 51 | neutral 0.52 0.71 0.60 551 52 | negative 0.79 0.68 0.73 639 53 | very negative 0.52 0.64 0.57 166 54 | 55 | accuracy 0.70 3943 56 | macro avg 0.65 0.67 0.65 3943 57 | weighted avg 0.71 0.70 0.70 3943 58 | -------------------------------------------------------------------------------- /current_best_model/sentiment/confusion_matrix_3_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/sentiment/confusion_matrix_3_counts.png -------------------------------------------------------------------------------- /current_best_model/sentiment/confusion_matrix_3_percentages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/sentiment/confusion_matrix_3_percentages.png -------------------------------------------------------------------------------- /current_best_model/sentiment/confusion_matrix_5_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/sentiment/confusion_matrix_5_counts.png -------------------------------------------------------------------------------- /current_best_model/sentiment/confusion_matrix_5_percentages.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/current_best_model/sentiment/confusion_matrix_5_percentages.png -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | Please note that the Care Opinion data is being shared under the [CC BY-NC-SA 4.0 licence](https://creativecommons.org/licenses/by-nc-sa/4.0/) and is generated from the [Care Opinion API](https://www.careopinion.org.uk/info/api-v2). 2 | 3 | 4 | Two out of the six participating trusts have agreed to make their data available publicly. 5 | 6 | An explanation of the dataset columns for phase 2 is available below. 7 | 8 | 9 | 10 | Comment ID: ID for the specific comment. 11 | 12 | Trust: NHS Trust where comment originated. 13 | 14 | Respondent ID: ID for the specific respondent. Not linked to any personal identifiable information. 15 | 16 | Date: Date the comment was provided. 17 | 18 | Service type 1: Department relating to the comment. 19 | 20 | Service type 2: Subdepartment relating to the comment. 21 | 22 | FFT categorical answer: Quantitative score attached to the comment. 1 is "very good", 5 is "very poor". 23 | 24 | FFT question: The specific question asked by the NHS trust to elicit the qualitative text response. 25 | 26 | FFT answer: The qualitative text response provided by the respondent to the FFT question. 27 | 28 | Person identifiable info?: Whether or not the FFT answer contains any person identifiable info, as flagged by the labeller. 29 | 30 | Comment sentiment: The sentiment score applied to the FFT answer by the labeller. 1 is "very positive", 5 is "very negative". Mixed comments have been labelled as "3", neutral. 31 | 32 | All other columns are the qualitative framework labels, in one hot encoded format. The version of the framework being used is reflected in the filename. Full details of the framework are available on the [project documentation website](https://the-strategy-unit.github.io/PatientExperience-QDC/framework/framework3.html). 33 | -------------------------------------------------------------------------------- /datasets/phase_1/README.md: -------------------------------------------------------------------------------- 1 | Please note that the Care Opinion data is being shared under the [CC BY-NC-SA 4.0 licence](https://creativecommons.org/licenses/by-nc-sa/4.0/) and is generated from the [Care Opinion API](https://www.careopinion.org.uk/info/api-v2). 2 | 3 | The dataset for phase 1 is stored in this folder. It is no longer used for training the pxtextmining models but is provided for historical interest. 4 | 5 | The `co` and `co_multi_label` files are less useful, with fewer rows. 6 | 7 | The main dataset is the file `text_data`. The following is a description of the columns: 8 | 9 | code: 10 | The shortcode given for the subcategory applied to the comment. There is 1:1 relationship between codes and subcategories, listed below. 
11 | 12 | 'cc': 'Care received', 13 | 'xn': 'Nothing to improve', 14 | 'sa': 'Attitude Of Staff', 15 | 'ss': 'Staff: General', 16 | 'cs': 'Advice and support', 17 | 'mi': 'Amount/clarity of information', 18 | 'sp': 'Professionalism/Competence Of Staff', 19 | 'xe': 'Everything was good/bad', 20 | 'mm': 'Communication', 21 | 'cr': 'Rules/approach to care', 22 | 'ml': 'Listening', 23 | 'ef': 'Food', 24 | 'wa': 'Time spent waiting for first appt/referral/service', 25 | 'ap': 'Provision of services', 26 | 'eq': 'Facilities/equipment', 27 | 'ce': 'Emotional care', 28 | 'ee': 'Environment/ facilities', 29 | 'cp': 'Physical care', 30 | 'aa': 'General', 31 | 'ca': 'Activities', 32 | 'co': '1-2-1 care/Time spent with service user', 33 | 'cm': 'Medication ', 34 | 'tc': 'Consistency/Continuity of care', 35 | 'da': 'Respect For Diversity/ Person-Centeredness', 36 | 'ec': 'Cleanliness', 37 | 'sl': 'Staffing levels', 38 | 'ti': 'Coordination/Integration Of Care', 39 | 'cl': 'Made A Difference To My Life', 40 | 'ds': 'Feeling safe including bullying', 41 | 'tx': 'Transition And Discharge', 42 | 'wb': 'Time spent waiting between appointments', 43 | 'ct': 'Therapies', 44 | 'al': 'Location', 45 | 'dp': 'Involvement: Of Service Users/Patients', 46 | 'dd': 'Dignity: General', 47 | 'cf': 'Carer support', 48 | 'xm': 'Miscellaneous', 49 | 'tt': 'Transition/ coordination: General', 50 | 'xg': 'Nothing was good', 51 | 'ep': 'Parking/transport', 52 | 'xf': 'Funding', 53 | 'xl': 'Leave (under MHA)', 54 | 'dc': 'Involvement: Of Family And Carers', 55 | 'xs': 'Surveying' 56 | 57 | label: 58 | The overarching major category label for the text comment. 59 | 60 | subcategory: 61 | The subcategory label for the text comment. 62 | 63 | feedback: 64 | The actual text of the qualitative feedback comment. 65 | 66 | criticality: 67 | How critical the comment is towards the organisation. Can also be interpreted as a type of sentiment. Ranges from -5 to 5, with -5 being highly critical, or highly negative, and 5 being highly positive. 68 | 69 | organization: 70 | Which NHS Trust the feedback relates to. 71 | 72 | question: 73 | The question that the feedback relates to. 74 | 75 | row_index: 76 | row ID number for the feedback comment. 77 | -------------------------------------------------------------------------------- /docker_README.md: -------------------------------------------------------------------------------- 1 | # pxtextmining: Text Classification of Patient Experience feedback 2 | 3 | This Docker container contains the pxtextmining machine learning models trained as part of the [Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/). 4 | 5 | To use this Docker container to predict your unlabelled text: 6 | 7 | 1. Set up your folders. You will need to set up a folder containing two other folders, data_in and data_out, as below. 8 | ``` 9 | docker_data/ 10 | ├─ data_in/ 11 | ├─ data_out/ 12 | 13 | ``` 14 | 15 | 2. Prepare your data. Save the data you wish to pass through the machine learning models as json, in the data_in folder. The data should be in the following format: 16 | 17 | In Python, a `list` containing as many `dict`s as there are comments to be predicted. Each `dict` has three compulsory keys: 18 | 19 | * `comment_id`: Unique ID associated with the comment, in `str` format. Each Comment ID per API call must be unique. 20 | * `comment_text`: Text to be classified, in `str` format. 
21 | * `question_type`: The type of question asked to elicit the comment text. Questions are different from trust to trust, but they all fall into one of three categories: 22 | * `what_good`: Any variation on the question "What was good about the service?", or "What did we do well?" 23 | * `could_improve`: Any variation on the question "Please tell us about anything that we could have done better", or "How could we improve?" 24 | * `nonspecific`: Any other type of nonspecific question, e.g. "Please can you tell us why you gave your answer?", or "What were you satisfied and/or dissatisfied with?". 25 | 26 | ```python 27 | # In Python 28 | 29 | text_data = [ 30 | { 'comment_id': '1', # The comment_id values in each dict must be unique. 31 | 'comment_text': 'This is the first comment. Nurse was great.', 32 | 'question_type': 'what_good' }, 33 | { 'comment_id': '2', 34 | 'comment_text': 'This is the second comment. The ward was freezing.', 35 | 'question_type': 'could_improve' }, 36 | { 'comment_id': '3', 37 | 'comment_text': '', # This comment is an empty string. 38 | 'question_type': 'nonspecific' } 39 | ] 40 | 41 | ``` 42 | 43 | ```R 44 | # In R 45 | 46 | library(jsonlite) 47 | 48 | comment_id <- c("1", "2", "3") 49 | comment_text <- c( 50 | "This is the first comment. Nurse was great.", 51 | "This is the second comment. The ward was freezing.", 52 | "" 53 | ) 54 | question_type <- c("what_good", "could_improve", "nonspecific") 55 | df <- data.frame(comment_id, comment_text, question_type) 56 | text_data <- toJSON(df) 57 | ``` 58 | 59 | 3. Save the JSON data in the data_in folder, as follows: 60 | 61 | ```python 62 | # In Python 63 | 64 | json_data = json.dumps(text_data) 65 | with open("data_in/file_01.json", "w") as outfile: 66 | outfile.write(json_data) 67 | ``` 68 | 69 | ```R 70 | # In R 71 | 72 | json_data <- toJSON(text_data, pretty = TRUE) 73 | write(json_data, file = "data_in/file_01.json") 74 | ``` 75 | 76 | 4. Your file structure should now look like this: 77 | 78 | ``` 79 | docker_data/ 80 | ├─ data_in/ 81 | │ ├─ file_01.json 82 | ├─ data_out/ 83 | ``` 84 | 85 | 5. Mount the docker_data folder as the `data` volume for the Docker container and run the container. Pass the filename for the input JSON as the first argument. The following arguments are also available: 86 | - `--local-storage` or `-l` flag for local storage (does not delete the files in data_in after completing predictions) 87 | - `--target` or `-t` to select the machine learning models used. Options are `m` for multilabel, `s` for `sentiment`, or `ms` for both. Defaults to `ms` if nothing is selected. 88 | 89 | A sample command would be: 90 | `docker run --rm -it -v /docker_data:/data ghcr.io/the-strategy-unit/pxtextmining:latest file_01.json -l ` 91 | 92 | 6. The predictions will be outputted as a json file in the data_out folder, with the same filename. 
After running successfully, the final folder structure should be: 93 | 94 | ``` 95 | docker_data/ 96 | ├─ data_in/ 97 | │ ├─ file_01.json 98 | ├─ data_out/ 99 | ├─ file_01.json 100 | ``` 101 | -------------------------------------------------------------------------------- /docker_data/data_in/file_01.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "comment_id": "1", 4 | "comment_text": "The nurse was very rude and unhelpful", 5 | "question_type": "what_good" 6 | }, 7 | { 8 | "comment_id": "2", 9 | "comment_text": "The ward was freezing.", 10 | "question_type": "could_improve" 11 | }, 12 | { 13 | "comment_id": "3", 14 | "comment_text": "", 15 | "question_type": "nonspecific" 16 | }, 17 | { 18 | "comment_id": "4", 19 | "comment_text": "Thank you so much", 20 | "question_type": "nonspecific" 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /docker_data/data_out/file_01.json: -------------------------------------------------------------------------------- 1 | [{"comment_id": "1", "sentiment": 5.0, "labels": ["Staff manner & personal attributes"]}, {"comment_id": "2", "sentiment": 4.0, "labels": ["Sensory experience"]}, {"comment_id": "3", "sentiment": "Labelling not possible", "labels": ["Labelling not possible"]}, {"comment_id": "4", "sentiment": 1.0, "labels": ["Positive experience & gratitude"]}] -------------------------------------------------------------------------------- /docker_run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import pickle 5 | 6 | import pandas as pd 7 | from tensorflow.keras.saving import load_model 8 | 9 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 10 | combine_predictions, 11 | predict_multilabel_bert, 12 | predict_multilabel_sklearn, 13 | predict_sentiment_bert, 14 | ) 15 | from pxtextmining.params import minor_cats 16 | 17 | 18 | def load_bert_model(model_path): 19 | if not os.path.exists(f"bert_{model_path}"): 20 | if model_path == "sentiment": 21 | model_path = os.path.join( 22 | "current_best_model", model_path, f"bert_{model_path}" 23 | ) 24 | elif model_path == "multilabel": 25 | model_path = os.path.join( 26 | "current_best_model", "final_bert", f"bert_{model_path}" 27 | ) 28 | loaded_model = load_model(f"bert_{model_path}") 29 | return loaded_model 30 | 31 | 32 | def load_sklearn_model(model_name): 33 | model_path = f"{model_name}.sav" 34 | if not os.path.exists(model_path): 35 | model_path = os.path.join("current_best_model", model_name, model_path) 36 | with open(model_path, "rb") as model: 37 | loaded_model = pickle.load(model) 38 | return loaded_model 39 | 40 | 41 | def process_text(items): 42 | df = pd.DataFrame([i for i in items], dtype=str) 43 | df_newindex = df.set_index("comment_id") 44 | if df_newindex.index.duplicated().sum() != 0: 45 | raise ValueError("comment_id must all be unique values") 46 | df_newindex.index.rename("Comment ID", inplace=True) 47 | text_to_predict = df_newindex[["comment_text", "question_type"]] 48 | text_to_predict = text_to_predict.rename( 49 | columns={"comment_text": "FFT answer", "question_type": "FFT_q_standardised"} 50 | ) 51 | return df, text_to_predict 52 | 53 | 54 | def predict_multilabel_ensemble(items): 55 | # Function which gets preds_dfs for bert, svc, and xgb, and combines them all 56 | # Process the data 57 | df, text_to_predict = process_text(items) 58 | text_to_predict = 
text_to_predict["FFT answer"] 59 | # Load models 60 | bert_model = load_bert_model("multilabel") 61 | svc_model = load_sklearn_model("final_svc") 62 | xgb_model = load_sklearn_model("final_xgb") 63 | # Make preds 64 | bert_preds = predict_multilabel_bert( 65 | text_to_predict, 66 | bert_model, 67 | labels=minor_cats, 68 | additional_features=False, 69 | label_fix=False, 70 | ) 71 | svc_preds = predict_multilabel_sklearn( 72 | text_to_predict, 73 | svc_model, 74 | labels=minor_cats, 75 | additional_features=False, 76 | label_fix=False, 77 | ) 78 | xgb_preds = predict_multilabel_sklearn( 79 | text_to_predict, 80 | xgb_model, 81 | labels=minor_cats, 82 | additional_features=False, 83 | label_fix=False, 84 | ) 85 | # Combine preds 86 | preds_list = [bert_preds, svc_preds, xgb_preds] 87 | combined_preds = combine_predictions(preds_list, labels=minor_cats) 88 | # Join predicted labels with received data 89 | combined_preds["comment_id"] = combined_preds.index.astype(str) 90 | merged = pd.merge(df, combined_preds, how="left", on="comment_id") 91 | # Fill in anything that got cleaned in preprocessing step 92 | nulls = merged[merged.labels.isnull()].index 93 | lnp = pd.Series( 94 | [["Labelling not possible"]] * len(nulls), index=nulls, dtype=object 95 | ) 96 | merged.loc[nulls, "labels"] = lnp 97 | return_df = merged[["comment_id", "labels"]] 98 | return return_df 99 | 100 | 101 | def predict_sentiment(items): 102 | """Accepts comment ids, comment text and question type as JSON in a POST request. Makes predictions using trained Tensorflow Keras model. 103 | 104 | Args: 105 | items (List[ItemIn]): JSON list of dictionaries with the following compulsory keys: 106 | - `comment_id` (str) 107 | - `comment_text` (str) 108 | - `question_type` (str) 109 | The 'question_type' must be one of three values: 'nonspecific', 'what_good', and 'could_improve'. 110 | For example, `[{'comment_id': '1', 'comment_text': 'Thank you', 'question_type': 'what_good'}, 111 | {'comment_id': '2', 'comment_text': 'Food was cold', 'question_type': 'could_improve'}]` 112 | 113 | Returns: 114 | (dict): Keys are: `comment_id`, `comment_text`, and predicted `labels`. 115 | """ 116 | 117 | # Process received data 118 | df, text_to_predict = process_text(items) 119 | # Make predictions 120 | loaded_model = load_bert_model("sentiment") 121 | preds_df = predict_sentiment_bert( 122 | text_to_predict, loaded_model, preprocess_text=False, additional_features=True 123 | ) 124 | # Join predicted labels with received data 125 | preds_df["comment_id"] = preds_df.index.astype(str) 126 | merged = pd.merge(df, preds_df, how="left", on="comment_id") 127 | merged["sentiment"] = merged["sentiment"].fillna("Labelling not possible") 128 | return_df = merged[["comment_id", "sentiment"]] 129 | return return_df 130 | 131 | 132 | def parse_args(): 133 | """Parse command line arguments""" 134 | parser = argparse.ArgumentParser() 135 | parser.add_argument( 136 | "json_file", 137 | nargs=1, 138 | help="Name of the json file", 139 | ) 140 | parser.add_argument( 141 | "--local-storage", 142 | "-l", 143 | action="store_true", 144 | help="Use local storage (instead of Azure)", 145 | ) 146 | parser.add_argument( 147 | "--target", 148 | "-t", 149 | default="ms", 150 | help="Target of the predictions. m for multilabel, s for sentiment. 
151 |     )
152 |     args = parser.parse_args()
153 |     return args
154 | 
155 | 
156 | def main():
157 |     args = parse_args()
158 |     json_file = os.path.join("data", "data_in", args.json_file[0])
159 |     with open(json_file, "r") as jf:
160 |         json_in = json.load(jf)
161 |     preds_list = []
162 |     if "s" in args.target:
163 |         s_preds = predict_sentiment(json_in)
164 |         preds_list.append(s_preds)
165 |     if "m" in args.target:
166 |         m_preds = predict_multilabel_ensemble(json_in)
167 |         preds_list.append(m_preds)
168 |     if len(preds_list) == 2:
169 |         preds = pd.merge(preds_list[0], preds_list[1], on="comment_id")
170 |     else:
171 |         preds = preds_list[0]
172 |     if not args.local_storage:
173 |         os.remove(json_file)
174 |     json_out = preds.to_dict(orient="records")
175 |     out_path = os.path.join("data", "data_out", args.json_file[0])
176 |     with open(out_path, "w+") as jf:
177 |         json.dump(json_out, jf)
178 | 
179 | 
180 | if __name__ == "__main__":
181 |     main()
182 | 
--------------------------------------------------------------------------------
/docs/about.md:
--------------------------------------------------------------------------------
1 | # Project background
2 | 
3 | The `pxtextmining` package is part of [the Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/). This project is hosted by Nottinghamshire Healthcare NHS Foundation Trust's Clinical Development Unit Data Science Team, and funded by NHS England's Insight and Feedback Team.
4 | 
5 | The primary objective of the `pxtextmining` element is to create a machine learning model capable of categorising the free text data obtained through the [NHS England Friends and Family Test](https://www.england.nhs.uk/fft/) (FFT). It is a multilabel classification problem, with one or more categories applied to each patient feedback comment. In this way, we hope to support better use of qualitative patient experience feedback by NHS provider organisations.
6 | 
7 | This package works together with the [experiencesdashboard](https://github.com/the-strategy-unit/experiencesdashboard), a frontend coded in R/Shiny.
8 | 
--------------------------------------------------------------------------------
/docs/create_docs.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | 
4 | """
5 | Python script to automatically generate .md files in docs/reference based on
6 | contents of pxtextmining folders
7 | """
8 | 
9 | 
10 | modules = glob.glob('pxtextmining/*/')
11 | module_names = []
12 | for folder in modules:
13 |     if '__' not in folder:
14 |         module_name = folder.split('/')[-2]
15 |         print(f'MODULE: {module_name}')
16 |         pylist = glob.glob(f"{folder}/*.py")
17 |         for py in pylist:
18 |             if '__' not in py:
19 |                 py_name = os.path.basename(py)[:-3]
20 |                 print(py_name)
21 |                 with open(f'docs/reference/{module_name}/{py_name}.md', 'w') as f:
22 |                     if module_name == 'helpers':
23 |                         f.write(f"""::: pxtextmining.{module_name}.{py_name}
24 |     options:
25 |       show_source: true""")
26 |                     else:
27 |                         f.write(f'::: pxtextmining.{module_name}.{py_name}')
28 | 
--------------------------------------------------------------------------------
/docs/getting started/install.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 | 
3 | You can install `pxtextmining` from either [PyPI](https://pypi.org/project/pxtextmining/) or [GitHub](https://github.com/the-strategy-unit/pxtextmining).
4 | 
5 | The recommended method is to clone the repository from GitHub, as this will also include the models and datasets.
6 | 
7 | ### Option 1: Install from PyPI
8 | This option allows you to use the functions coded in pxtextmining.
9 | 
10 | 1. Install `pxtextmining` and its PyPI dependencies:
11 |     - `pip install pxtextmining`
12 | 
13 | 
14 | ### Option 2 (RECOMMENDED): Install from GitHub
15 | This option is recommended as it gives you access to the full datasets and already trained models.
16 | 
17 | 1. To begin with, [clone the repository from GitHub](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
18 | 
19 | 2. It is also recommended to [create a new virtual environment](https://docs.python.org/3/library/venv.html), using your chosen method of managing Python environments.
20 | 
21 | 3. The package uses `poetry` for dependency management. First, run `pip install poetry`.
22 | 
23 | 4. Then, run `poetry install --with dev`.
24 | 
--------------------------------------------------------------------------------
/docs/getting started/package.md:
--------------------------------------------------------------------------------
1 | # Package structure
2 | 
3 | ## pxtextmining
4 | 
5 | The `pxtextmining` package is constructed using the following elements:
6 | 
7 | - **`pxtextmining.factories`**
8 | This module contains the vast majority of the code in the package. There are five different stages, each corresponding to a different submodule.
9 | 
10 |     - `factory_data_load_and_split`: Loading of multilabel data, preprocessing, and splitting into train/test/validation sets as appropriate.
11 | 
12 |     - `factory_pipeline`: Construction and training of different models/estimators/algorithms using the `sklearn`, `tensorflow.keras` and `transformers` libraries.
13 | 
14 |     - `factory_model_performance`: Evaluation of a trained model, comparing predicted targets with real target values, to produce performance metrics. The decision-making process behind the performance metrics chosen can be seen on the [project documentation website](https://the-strategy-unit.github.io/PatientExperience-QDC/pxtextmining/performance_metrics.html). The performance metrics for the current best models utilised in the API can be found in the `current_best_model` folder in the main repository.
15 | 
16 |     - `factory_predict_unlabelled_text`: Prepares unlabelled text (with or without additional features such as question type) in a format suitable for each model type, and passes this through the selected models, to produce predicted labels.
17 | 
18 |     - `factory_write_results`: Saves trained models to disk, together with their performance metrics and spreadsheets breaking down their predictions.
19 | 
20 | - **`pxtextmining.helpers`**
21 | This module contains some helper functions which are used in `pxtextmining.factories`. Some of this is legacy code, so this may just be moved into the `factories` submodule in future versions of the package.
22 | 
23 | - **`pxtextmining.pipelines`**
24 | All of the processes in `pxtextmining.factories` are pulled together in `multilabel_pipeline`, to create the complete end-to-end process of data processing, model creation, training, evaluation, and saving.
25 | 
26 | There is also a `pxtextmining.params` file which is used to standardise specific variables that are used across the entire package. The aim of this is to reduce repetition across the package, for example when trying different targets or model types.
27 | 
28 | ## API
29 | 
30 | Separate from the `pxtextmining` package is the API, which can be found in the folder `api`. It is constructed using FastAPI and Uvicorn. The aim of the API is to make the trained machine learning models available publicly, so that predictions can be made on any text. The API is not currently publicly available and access is only for participating partner trusts. However, all the code and documentation is available on our GitHub repository.
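In outline, the serving pattern looks something like the sketch below. This is illustrative only, not the actual `api/api.py`: the endpoint name and input keys mirror the Quick API documentation, the model filename comes from the `api` folder, and everything else (schema names, response shape) is assumed.

```python
# Minimal sketch of the FastAPI serving pattern; not the actual api/api.py.
# ItemIn and the response shape are assumptions for illustration only.
import pickle
from typing import List

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class ItemIn(BaseModel):
    comment_id: str
    comment_text: str


# Load the pickled sklearn pipeline once, at startup
with open("final_svc.sav", "rb") as f:
    model = pickle.load(f)


@app.post("/predict_multilabel")
def predict_multilabel(items: List[ItemIn]):
    texts = [item.comment_text for item in items]
    binary_preds = model.predict(texts)  # one row of 0/1s per comment
    return [
        {"comment_id": item.comment_id, "labels": row.tolist()}
        for item, row in zip(items, binary_preds)
    ]
```

The app is then served with Uvicorn, e.g. `uvicorn api:app`.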
31 | 
--------------------------------------------------------------------------------
/docs/getting started/training_new_model.md:
--------------------------------------------------------------------------------
1 | # Training a new model
2 | 
3 | To train a new model to categorise patient feedback text, labelled data is required. Discussions are currently underway to enable the release of the data that the multilabel models in `pxtextmining` are trained on.
4 | 
5 | This page breaks down the steps in the function `pxtextmining.pipelines.multilabel_pipeline.run_sklearn_pipeline`, which outputs trained sklearn models. This is a high-level explanation of the processes; for more detailed technical information please see the relevant code reference pages for each function.
6 | 
7 | 
8 | ```python
9 | 
10 | # Step 1: Generate a random_state which is used for the train_test_split.
11 | # This means that the pipeline and evaluation should be reproducible.
12 | random_state = random.randint(1, 999)
13 | 
14 | # Step 2: Load the data and isolate the target columns from the dataframe.
15 | df = load_multilabel_data(filename = 'datasets/hidden/multilabeldata_2.csv',
16 |                           target = 'major_categories')
17 | 
18 | # Step 3: Conduct preprocessing: remove punctuation and numbers, clean whitespace and drop empty lines.
19 | # Split into train and test using the random_state above.
20 | X_train, X_test, Y_train, Y_test = process_and_split_data(
21 |                                     df, target = target,
22 |                                     random_state = random_state)
23 | 
24 | # Step 4: Instantiate a pipeline and hyperparameter grid for each estimator to be tried.
25 | # Conduct a cross-validated randomized search to identify the hyperparameters
26 | # producing the best results on the validation set.
27 | # For each estimator, returns the pipeline with the best hyperparameters,
28 | # together with the time taken to search the pipeline.
29 | models, training_times = search_sklearn_pipelines(X_train, Y_train,
30 |                                     models_to_try = models_to_try,
31 |                                     additional_features = additional_features)
32 | 
33 | # Step 5: Evaluate each pipeline using the test set, comparing predicted values with real values.
34 | # Performance metrics are recorded together with the time taken to search the pipeline.
35 | model_metrics = []
36 | for i in range(len(models)):
37 |     m = models[i]
38 |     t = training_times[i]
39 |     preds_df = predict_multilabel_sklearn(X_test, m, labels = target)
40 |     model_metrics.append(get_multilabel_metrics(preds_df, Y_test,
41 |                                     labels = target,
42 |                                     random_state = random_state,
43 |                                     model = m, training_time = t))
44 | 
45 | # Step 6: Save the models and performance metrics to the path specified
46 | write_multilabel_models_and_metrics(models, model_metrics, path=path)
47 | ```
48 | 
--------------------------------------------------------------------------------
/docs/getting started/using_trained_model.md:
--------------------------------------------------------------------------------
1 | # Using a trained model
2 | 
3 | The `current_best_model` folder should contain a fully trained `sklearn` model in .sav format, as well as performance metrics for the model.
4 | 
5 | The Transformer-based `tensorflow.keras` model is over 1GB and cannot be shared via GitHub. However, it will be made available via the API, which is forthcoming in a future release of this package.
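The `sklearn` model, by contrast, can be loaded directly from the repository. A minimal sketch, assuming the `current_best_model/final_svc` path from this repository's folder structure:

```python
# Load the trained sklearn model from its pickle file (path assumed from this repo's layout).
import pickle

with open("current_best_model/final_svc/final_svc.sav", "rb") as f:
    model = pickle.load(f)
```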
6 | 
7 | This page breaks down the steps in the function `pxtextmining.factories.factory_predict_unlabelled_text.predict_multilabel_sklearn`, which can make predictions using the `sklearn` model available via GitHub. This is a high-level explanation of the processes; for more detailed technical information please see the relevant code reference page.
8 | 
9 | ```python
10 | 
11 | # Step 1: Conduct preprocessing on text:
12 | # Remove trailing whitespaces, NULL values, NaNs, and punctuation. Convert to lowercase.
13 | text_no_whitespace = text.replace(r"^\s*$", np.nan, regex=True)
14 | text_no_nans = text_no_whitespace.dropna()
15 | text_cleaned = text_no_nans.astype(str).apply(remove_punc_and_nums)
16 | processed_text = text_cleaned.astype(str).apply(clean_empty_features)
17 | 
18 | # Step 2: Make predictions with the trained model
19 | binary_preds = model.predict(processed_text)
20 | 
21 | # Step 3: Get predicted probabilities for each label
22 | pred_probs = np.array(model.predict_proba(processed_text))
23 | 
24 | # Step 4: Some samples do not have any predicted labels.
25 | # For these, take the label with the highest predicted probability.
26 | predictions = fix_no_labels(binary_preds, pred_probs, model_type="sklearn")
27 | 
28 | # Step 5: Convert predictions to a dataframe.
29 | preds_df = pd.DataFrame(predictions, index=processed_text.index, columns=labels)
30 | preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
31 | 
32 | ```
33 | 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Home
2 | 
3 | This site contains the project documentation for the `pxtextmining` python package.
4 | This provides a technical overview of the package; for a non-technical overview and further information, visit the
5 | [Patient Experience Qualitative Data Categorisation website](https://the-strategy-unit.github.io/PatientExperience-QDC/pxtextmining/).
6 | 
7 | ## Table Of Contents
8 | 
9 | The documentation is split into three separate sections:
10 | 
11 | 1. [Project background](about.md)
12 | 2. Getting started, a simple approach to using the package:
13 |     - [Installation](getting%20started/install.md)
14 |     - [How the package works](getting%20started/package.md)
15 |     - [Training a new model](getting%20started/training_new_model.md)
16 |     - [Making predictions with a trained model](getting%20started/using_trained_model.md)
17 | 3. Code reference, a more technical overview of the functions and modules:
18 |     - [Factories](reference/pxtextmining/factories/factory_data_load_and_split.md)
19 |     - [Helpers](reference/pxtextmining/helpers/text_preprocessor.md)
20 |     - [Pipelines](reference/pxtextmining/pipelines/multilabel_pipeline.md)
21 | 
22 | ### Other repos that use `pxtextmining`
23 | - [nhs_fft_sentiment_analysis](https://github.com/yunus-m/nhs_fft_sentiment_analysis/blob/main/README.md)
24 |     - Exploratory analysis and sentiment modelling of FFT feedback using `scikit-learn`, TinyBERT, and hierarchical approaches.
--------------------------------------------------------------------------------
/docs/main.css:
--------------------------------------------------------------------------------
1 | /*CSS to make text in tables wrap rather than scrolling forever*/
2 | 
3 | .wy-table-responsive table td, .wy-table-responsive table th {
4 |     white-space: inherit;
5 | }
6 | 
--------------------------------------------------------------------------------
/docs/reference/API/API.md:
--------------------------------------------------------------------------------
1 | # pxtextmining API overview
2 | 
3 | We have created two different APIs for labelling patient experience feedback. Both APIs are free to use and completely open source. For help and support with using them, please contact [Chris Beeley](mailto:chris.beeley1@nhs.net).
4 | 
5 | The "Quick API" is faster and simpler, as it uses an sklearn model which is quicker to make predictions, though less accurate than the Slow API. The performance of predictions from this API can be seen on our project documentation website. This API is a more 'traditional' style of API.
6 | 
7 | The "Slow API" utilises sklearn models as well as the slower but more powerful transformer-based Distilbert model. Due to the demanding hardware requirements of this model, we have set up a slower and slightly more complex API which combines (ensembles) together these models but has higher performance overall.
8 | 
9 | ## Security
10 | 
11 | The data is submitted via a secure HTTPS connection. All data is encrypted in transit with HTTPS, using the SSL/TLS protocol for encryption and authentication. The data is stored in blob storage on a UK-based Azure container instance for the duration of the model predictions, and is then immediately deleted. Ad hoc support is provided where possible; no uptime or other guarantees exist.
12 | 
--------------------------------------------------------------------------------
/docs/reference/API/quick_API.md:
--------------------------------------------------------------------------------
1 | # Quick API
2 | 
3 | To facilitate the use of the models trained in this project, an API has been created using the FastAPI library. Users will be able to send their patient experience feedback comments to the model via the API, and will receive the predicted labels for those comments.
4 | 
5 | This API utilises the Support Vector Classifier model, which is less performant than the transformer-based Distilbert model. However, it is also much quicker and simpler. Performance metrics for this model can be seen on our [project documentation website](https://the-strategy-unit.github.io/PatientExperience-QDC/).
6 | 
7 | The API has been created using FastAPI and is deployed on Posit Connect. The URL is available on request. Full documentation for the API, automatically generated by FastAPI, is available at [API URL]/docs.
8 | 
9 | ## How to make an API call
10 | 
11 | 1\. Prepare the data in JSON format. In Python, this is a `list` containing as many `dict`s as there are comments to be predicted. Each `dict` has two compulsory keys:
12 | 
13 | * `comment_id`: Unique ID associated with the comment, in `str` format. Each Comment ID per API call must be unique.
14 | * `comment_text`: Text to be classified, in `str` format.
15 | 
16 | ```python
17 | # In Python
18 | 
19 | text_data = [
20 |   { 'comment_id': '1', # The comment_id values in each dict must be unique.
21 |     'comment_text': 'This is the first comment. Nurse was great.',
22 |   },
23 |   { 'comment_id': '2',
24 |     'comment_text': 'This is the second comment. The ward was freezing.',
25 |   },
26 |   { 'comment_id': '3',
27 |     'comment_text': '', # This comment is an empty string.
28 |   },
29 | ]
30 | ```
31 | 
32 | ```R
33 | # In R
34 | 
35 | library(jsonlite)
36 | 
37 | comment_id <- c("1", "2", "3")
38 | comment_text <- c(
39 |   "This is the first comment. Nurse was great.",
40 |   "This is the second comment. The ward was freezing.",
41 |   ""
42 | )
43 | df <- data.frame(comment_id, comment_text)
44 | text_data <- toJSON(df)
45 | ```
46 | 
47 | 
48 | 2\. Send the JSON containing the text data to the `predict_multilabel` endpoint. In Python, this can be done using the `requests` library.
49 | 
50 | ```python
51 | # In Python
52 | 
53 | import requests
54 | 
55 | url = "API_URL_GOES_HERE"
56 | 
57 | response = requests.post(f"{url}/predict_multilabel",
58 |                          json = text_data)
59 | ```
60 | 
61 | ```R
62 | # In R
63 | 
64 | library(httr)
65 | 
66 | r <- POST(
67 |   url = "API_URL_GOES_HERE/predict_multilabel",
68 |   body = text_data,
69 |   encode = "json",
70 |   add_headers(
71 |     "Content-Type" = "application/json"
72 |   )
73 | )
74 | ```
75 | 
76 | 3\. After waiting for the data to be processed and passed through the machine learning model, receive predicted labels at the same endpoint, in the example format below. Note that the comment with blank text, with comment_id 3, was assigned the label 'Labelling not possible' as it would have been stripped out during preprocessing.
77 | 
78 | ```python
79 | # In Python
80 | 
81 | print(response.json())
82 | # Output below
83 | [
84 |   { 'comment_id': '1',
85 |     'labels': ['Non-specific praise for staff']},
86 |   { 'comment_id': '2',
87 |     'labels': ['Sensory experience']},
88 |   { 'comment_id': '3',
89 |     'labels': ['Labelling not possible'] }
90 | ]
91 | ```
92 | 
93 | ```R
94 | # In R
95 | 
96 | r_parsed <- fromJSON(content(r, "text"))
97 | ```
98 | 
--------------------------------------------------------------------------------
/docs/reference/API/slow_API.md:
--------------------------------------------------------------------------------
1 | # Slow API
2 | 
3 | This API is slower but uses the best performing models. The transformer-based Distilbert model consumes a lot of hardware resource, and as such required a different approach.
4 | 
5 | ![Diagram showing Slow API architecture](https://the-strategy-unit.github.io/PatientExperience-QDC/pxtextmining/slow_API.png)
6 | 
7 | For predicting the multilabel categories, the API endpoint ensembles together Support Vector Classifier, Gradient Boosted Decision Trees (XGBoost), and Distilbert models.
8 | 
9 | For predicting text sentiment, the API endpoint utilises a Distilbert model.
10 | 
11 | The API URL endpoint is available on request. You will need an API key; please contact the project team to obtain one. The key should be passed as a `code` param with your API request.
12 | 
13 | ## How to make an API call
14 | 
15 | 1\. Prepare the data in JSON format. In Python, this is a `list` containing as many `dict`s as there are comments to be predicted. Each `dict` has three compulsory keys:
16 | 
17 | * `comment_id`: Unique ID associated with the comment, in `str` format. Each Comment ID per API call must be unique.
18 | * `comment_text`: Text to be classified, in `str` format.
19 | * `question_type`: The type of question asked to elicit the comment text. Questions are different from trust to trust, but they all fall into one of three categories:
20 |     * `what_good`: Any variation on the question "What was good about the service?", or "What did we do well?"
21 |     * `could_improve`: Any variation on the question "Please tell us about anything that we could have done better", or "How could we improve?"
22 |     * `nonspecific`: Any other type of nonspecific question, e.g. "Please can you tell us why you gave your answer?", or "What were you satisfied and/or dissatisfied with?".
23 | 
24 | ```python
25 | # In Python
26 | 
27 | text_data = [
28 |   { 'comment_id': '1', # The comment_id values in each dict must be unique.
29 |     'comment_text': 'This is the first comment. Nurse was great.',
30 |     'question_type': 'what_good' },
31 |   { 'comment_id': '2',
32 |     'comment_text': 'This is the second comment. The ward was freezing.',
33 |     'question_type': 'could_improve' },
34 |   { 'comment_id': '3',
35 |     'comment_text': '', # This comment is an empty string.
36 |     'question_type': 'nonspecific' }
37 | ]
38 | ```
39 | 
40 | ```R
41 | # In R
42 | 
43 | library(jsonlite)
44 | 
45 | comment_id <- c("1", "2", "3")
46 | comment_text <- c(
47 |   "This is the first comment. Nurse was great.",
48 |   "This is the second comment. The ward was freezing.",
49 |   ""
50 | )
51 | question_type <- c("what_good", "could_improve", "nonspecific")
52 | df <- data.frame(comment_id, comment_text, question_type)
53 | text_data <- toJSON(df)
54 | ```
55 | 
56 | 2\. Send the JSON containing the text data in a POST request to the API. Ensure that you include your API key, which should be stored securely.
57 | 
58 | The model(s) used to make predictions can be selected with the `target` param. The options for this param are:
59 | 
60 | - `m`: multilabel
61 | - `s`: sentiment
62 | - `ms`: both multilabel and sentiment.
63 | 
64 | ```python
65 | # In Python
66 | import os, requests
67 | 
68 | api_key = os.getenv('API_KEY')
69 | params_dict = {'code': api_key, 'target': 'ms'}
70 | url = os.getenv('API_URL')
71 | 
72 | response = requests.post(url, params = params_dict, json = text_data)
73 | ```
74 | 
75 | ```R
76 | # In R
77 | library(httr)
78 | 
79 | api_key <- Sys.getenv("API_KEY")
80 | params_dict <- list(code = api_key, target = "ms")
81 | url <- Sys.getenv("API_URL")
82 | 
83 | response <- POST(url, query = params_dict, body = text_data, encode = "json")
84 | ```
85 | 
86 | 3\. If the POST request is successful, you will receive a response with a 202 code, and a URL to retrieve your results, called the `results URL`. For example:
87 | 
88 | ```python
89 | # In Python
90 | 
91 | if response.status_code == 202:
92 |     results_url = response.text
93 | 
94 | print(f"URL for results is {results_url}")
95 | ```
96 | 
97 | ```R
98 | # In R
99 | 
100 | if (status_code(response) == 202) {
101 |   results_url <- content(response, as = "text")
102 | }
103 | print(results_url)
104 | ```
105 | 
106 | 4\. Use a GET request to check the results URL. If your predictions are not yet ready, you will receive a 202 response. If they are ready, you will receive a 200 response.
107 | 
108 | What is happening behind the scenes? The API has received your data and has started up a secure Azure container instance with your data stored in blob storage. The Docker container will install the pxtextmining package and make predictions using your data. Starting up a fresh container instance can take up to 5 minutes, and predictions using the slow transformer models can take some time, up to 5 further minutes per 1000 comments. Once the predictions are complete, it will delete your data and save the predictions in blob storage.
109 | 
110 | Once you receive a 200 response, your results are available in JSON format. Please note that this will only be available once; once you have collected the data, it will be deleted for security reasons and your results URL will no longer be valid.
111 | 
112 | You can set up a loop to check if your results are ready every 5 minutes, as follows.
113 | 
114 | ```python
115 | # In Python
116 | import time
117 | 
118 | while True:
119 |     results_response = requests.get(results_url)
120 |     if results_response.status_code == 200:
121 |         final_labels = results_response.json()
122 |         break
123 |     else:
124 |         print('Not ready! Trying again in 300 seconds...')
125 |         time.sleep(300)
126 | 
127 | print('Predicted labels:')
128 | print(final_labels)
129 | ```
130 | 
131 | ```R
132 | # In R
133 | 
134 | while (TRUE) {
135 |   results_response <- GET(results_url)
136 |   if (results_response$status_code == 200) {
137 |     final_labels <- fromJSON(content(results_response, "text"))
138 |     break
139 |   } else {
140 |     cat("Not ready! Trying again in 300 seconds...\n")
141 |     Sys.sleep(300)
142 |   }
143 | }
144 | 
145 | cat("Predicted labels:\n")
146 | print(final_labels)
147 | ```
148 | 
--------------------------------------------------------------------------------
/docs/reference/Docker/docker_README.md:
--------------------------------------------------------------------------------
1 | # Using our Docker container
2 | 
3 | This Docker container contains the pxtextmining machine learning models trained as part of the [Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/).
4 | 
5 | To use this Docker container to predict your unlabelled text:
6 | 
7 | 1\. Set up your folders. Create a folder containing two other folders, data_in and data_out, as below.
8 | ```
9 | docker_data/
10 | ├─ data_in/
11 | ├─ data_out/
12 | 
13 | ```
14 | 
15 | 2\. Prepare your data. Save the data you wish to pass through the machine learning models as json, in the data_in folder. The data should be in the following format:
16 | 
17 | In Python, a `list` containing as many `dict`s as there are comments to be predicted. Each `dict` has three compulsory keys:
18 | 
19 | * `comment_id`: Unique ID associated with the comment, in `str` format. Each Comment ID per API call must be unique.
20 | * `comment_text`: Text to be classified, in `str` format.
21 | * `question_type`: The type of question asked to elicit the comment text. Questions are different from trust to trust, but they all fall into one of three categories:
22 |     * `what_good`: Any variation on the question "What was good about the service?", or "What did we do well?"
23 |     * `could_improve`: Any variation on the question "Please tell us about anything that we could have done better", or "How could we improve?"
24 |     * `nonspecific`: Any other type of nonspecific question, e.g. "Please can you tell us why you gave your answer?", or "What were you satisfied and/or dissatisfied with?".
25 | 
26 | ```python
27 | # In Python
28 | 
29 | text_data = [
30 |   { 'comment_id': '1', # The comment_id values in each dict must be unique.
31 |     'comment_text': 'This is the first comment. Nurse was great.',
32 |     'question_type': 'what_good' },
33 |   { 'comment_id': '2',
34 |     'comment_text': 'This is the second comment. The ward was freezing.',
35 |     'question_type': 'could_improve' },
36 |   { 'comment_id': '3',
37 |     'comment_text': '', # This comment is an empty string.
38 |     'question_type': 'nonspecific' }
39 | ]
40 | 
41 | ```
42 | 
43 | ```R
44 | # In R
45 | 
46 | library(jsonlite)
47 | 
48 | comment_id <- c("1", "2", "3")
49 | comment_text <- c(
50 |   "This is the first comment. Nurse was great.",
51 |   "This is the second comment. The ward was freezing.",
52 |   ""
53 | )
54 | question_type <- c("what_good", "could_improve", "nonspecific")
55 | df <- data.frame(comment_id, comment_text, question_type)
56 | text_data <- toJSON(df)
57 | ```
58 | 
59 | 3\. Save the JSON data in the data_in folder, as follows:
60 | 
61 | ```python
62 | # In Python
63 | import json
64 | json_data = json.dumps(text_data)
65 | with open("data_in/file_01.json", "w") as outfile:
66 |     outfile.write(json_data)
67 | ```
68 | 
69 | ```R
70 | # In R
71 | 
72 | # text_data is already a JSON string (from toJSON above), so write it directly
73 | write(text_data, file = "data_in/file_01.json")
74 | ```
75 | 
76 | 4\. Your file structure should now look like this:
77 | 
78 | ```
79 | docker_data/
80 | ├─ data_in/
81 | │ ├─ file_01.json
82 | ├─ data_out/
83 | ```
84 | 
85 | 5\. Mount the docker_data folder as the `data` volume for the Docker container and run the container. Pass the filename for the input JSON as the first argument. The following arguments are also available:
86 | 
87 | - `--local-storage` or `-l` flag for local storage (does not delete the files in data_in after completing predictions)
88 | - `--target` or `-t` to select the machine learning models used. Options are `m` for multilabel, `s` for sentiment, or `ms` for both. Defaults to `ms` if nothing is selected.
89 | 
90 | A sample command would be:
91 | `docker run --rm -it -v /docker_data:/data ghcr.io/the-strategy-unit/pxtextmining:latest file_01.json -l`
92 | 
93 | 6\. The predictions will be outputted as a json file in the data_out folder, with the same filename.
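The output can be read back like any other JSON file. An illustrative example, using the keys shown in the sample data_out file elsewhere in this repository:

```python
# In Python — read the predictions back (keys match the sample data_out file).
import json

with open("docker_data/data_out/file_01.json", "r") as f:
    predictions = json.load(f)

for pred in predictions:
    print(pred["comment_id"], pred.get("sentiment"), pred.get("labels"))
```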
After running successfully, the final folder structure should be:
94 | 
95 | ```
96 | docker_data/
97 | ├─ data_in/
98 | │ ├─ file_01.json
99 | ├─ data_out/
100 | │ ├─ file_01.json
101 | ```
102 | 
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_data_load_and_split.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_data_load_and_split
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_model_performance.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_model_performance
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_pipeline.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_pipeline
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_predict_unlabelled_text.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_predict_unlabelled_text
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/factories/factory_write_results.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.factories.factory_write_results
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/helpers/text_preprocessor.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.helpers.text_preprocessor
2 |     options:
3 |       show_source: true
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/pipelines/multilabel_pipeline.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.pipelines.multilabel_pipeline
--------------------------------------------------------------------------------
/docs/reference/pxtextmining/pipelines/sentiment_pipeline.md:
--------------------------------------------------------------------------------
1 | ::: pxtextmining.pipelines.sentiment_pipeline
2 | 
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: pxtextmining
2 | site_url: https://the-strategy-unit.github.io/pxtextmining/
3 | theme: readthedocs
4 | watch:
5 |   - pxtextmining
6 | 
7 | extra_css:
8 |   - main.css
9 | 
10 | plugins:
11 |   - search
12 |   - mkdocstrings:
13 |       handlers:
14 |         python:
15 |           options:
16 |             docstring_style: google
17 |             show_root_heading: true
18 |             show_root_toc_entry: false
19 |             show_root_full_path: false
20 |             show_source: false
21 |             show_if_no_docstring: true
22 |             heading_level: 4
23 |             members_order: source
24 | 
--------------------------------------------------------------------------------
/pxtextmining/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/pxtextmining/__init__.py
--------------------------------------------------------------------------------
/pxtextmining/factories/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/pxtextmining/factories/__init__.py
--------------------------------------------------------------------------------
/pxtextmining/factories/factory_data_load_and_split.py:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | from sklearn.model_selection import train_test_split
7 | from sklearn.preprocessing import OneHotEncoder
8 | from tensorflow.data import Dataset
9 | from transformers import AutoTokenizer
10 | 
11 | from pxtextmining.params import dataset, major_cat_dict, minor_cats, q_map
12 | 
13 | 
14 | def merge_categories(df, new_cat, cats_to_merge):
15 |     """Merges categories together in a dataset. Assumes all categories are in the
16 |     right format, one-hot encoded with int values.
17 | 
18 |     Args:
19 |         df (pd.DataFrame): DataFrame with labelled data.
20 |         new_cat (str): Name for new column of merged data.
21 |         cats_to_merge (list): List containing columns to be merged.
22 | 
23 |     Returns:
24 |         (pd.DataFrame): DataFrame with the new merged column
25 |     """
26 |     df[new_cat] = np.NaN
27 |     for cat in cats_to_merge:
28 |         print(f"Number of {cat} labels: {df[cat].sum()}")
29 |         df[new_cat] = df[new_cat].mask(df[cat] == 1, other=1)
30 |     print(f"Number of new label {new_cat}: {df[new_cat].sum()}")
31 |     df = df.drop(columns=cats_to_merge)
32 |     return df
33 | 
34 | 
35 | def bert_data_to_dataset(
36 |     X,
37 |     Y=None,
38 |     max_length=150,
39 |     model_name="distilbert-base-uncased",
40 |     additional_features=False,
41 | ):
42 |     """This function converts a dataframe into a format that can be utilised by a transformer model.
43 |     If Y is provided then it returns a TensorFlow dataset for training the model.
44 |     If Y is not provided, then it returns a dict which can be used to make predictions by an already trained model.
45 | 
46 |     Args:
47 |         X (pd.DataFrame): Data to be converted to text data. Text should be in column 'FFT answer',
48 |             FFT question should be in column 'FFT_q_standardised'.
49 |         Y (pd.DataFrame, optional): One-hot encoded targets. Defaults to None.
50 |         max_length (int, optional): Maximum length of text to be encoded. Defaults to 150.
51 |         model_name (str, optional): Type of transformer model. Defaults to 'distilbert-base-uncased'.
52 |         additional_features (bool, optional): Whether additional features are to be included, currently this is only question type
53 |             in 'FFT_q_standardised' column. Defaults to False.
54 | 
55 |     Returns:
56 |         (tf.data.Dataset OR dict): `tf.data.Dataset` if Y is provided, `dict` otherwise.
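    Example (illustrative call pattern; the variable names are assumed):

        train_dataset = bert_data_to_dataset(X_train, Y_train, additional_features=True)
        prediction_input = bert_data_to_dataset(X_new)  # dict, ready for a trained model's predict method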
57 |     """
58 |     tokenizer = AutoTokenizer.from_pretrained(model_name)
59 |     if isinstance(X, pd.DataFrame):
60 |         data_encoded = dict(
61 |             tokenizer(
62 |                 list(X["FFT answer"]),
63 |                 truncation=True,
64 |                 padding="max_length",
65 |                 max_length=max_length,
66 |                 return_tensors="tf",
67 |             )
68 |         )
69 |     elif isinstance(X, pd.Series):
70 |         data_encoded = dict(
71 |             tokenizer(
72 |                 list(X),
73 |                 truncation=True,
74 |                 padding="max_length",
75 |                 max_length=max_length,
76 |                 return_tensors="tf",
77 |             )
78 |         )
79 |     data_encoded.pop("attention_mask", None)
80 |     if additional_features is True:
81 |         data_encoded["input_cat"] = X["FFT_q_standardised"].map(
82 |             {"what_good": 0, "could_improve": 1, "nonspecific": 2}
83 |         )
84 |     if Y is not None:
85 |         data_encoded = Dataset.from_tensor_slices((data_encoded, Y))
86 |     return data_encoded
87 | 
88 | 
89 | def load_multilabel_data(filename, target="major_categories"):
90 |     """Function for loading the multilabel dataset, converting it from csv to pd.DataFrame. Conducts some basic preprocessing,
91 |     including standardisation of the question types, calculation of text length, and drops rows with no labels. Depending on
92 |     selected `target`, returned dataframe contains different columns.
93 | 
94 |     Args:
95 |         filename (str): Path to file containing multilabel data, in csv format
96 |         target (str, optional): Options are 'minor_categories', 'major_categories', or 'sentiment'. Defaults to 'major_categories'.
97 | 
98 |     Returns:
99 |         (pd.DataFrame): DataFrame containing the columns 'FFT categorical answer', 'FFT question', and 'FFT answer', along with the standardised question type, text length, and selected target columns.
100 |     """
101 |     print("Loading multilabel dataset...")
102 |     raw_data = pd.read_csv(
103 |         filename,
104 |         na_values=" ",
105 |     )
106 |     print(f"Shape of raw data is {raw_data.shape}")
107 |     raw_data.columns = raw_data.columns.str.strip()
108 |     raw_data = raw_data.set_index("Comment ID").copy()
109 |     features = ["FFT categorical answer", "FFT question", "FFT answer"]
110 |     # For now the labels are hardcoded, these are subject to change as framework is in progress
111 |     if target in ["minor_categories", "major_categories"]:
112 |         cols = minor_cats
113 |     elif target == "sentiment":
114 |         cols = ["Comment sentiment"]
115 |     # Sort out the features first
116 |     features_df = raw_data.loc[:, features].copy()
117 |     # Standardize FFT qs
118 |     features_df["FFT question"] = features_df["FFT question"].fillna("nonspecific")
119 |     features_df.loc[:, "FFT_q_standardised"] = (
120 |         features_df.loc[:, "FFT question"].map(q_map).copy()
121 |     )
122 |     if features_df["FFT_q_standardised"].count() != features_df.shape[0]:
123 |         raise ValueError(
124 |             f'Check q_map is correct. features_df.shape[0] is {features_df.shape[0]}. \n \
125 |             features_df["FFT_q_standardised"].count() is {features_df["FFT_q_standardised"].count()}. \n\n\
126 |             Questions are: {features_df["FFT question"].value_counts()}'
127 |         )
128 |     features_df.loc[:, "text_length"] = features_df.loc[:, "FFT answer"].apply(
129 |         lambda x: len([word for word in str(x).split(" ") if word != ""])
130 |     )
131 |     features_df = clean_empty_features(features_df)
132 |     # Sort out the targets
133 |     targets_df = raw_data.loc[:, cols].copy()
134 |     targets_df = targets_df.replace("1", 1)
135 |     targets_df = targets_df.fillna(value=0)
136 |     if target == "major_categories":
137 |         for maj, min_list in major_cat_dict.items():
138 |             targets_df = merge_categories(targets_df, maj, min_list)
139 |         cols = list(major_cat_dict.keys())
140 |     targets_df.loc[:, "num_labels"] = targets_df.loc[:, cols].sum(axis=1)
141 |     targets_df = targets_df[targets_df["num_labels"] != 0]
142 |     targets_df = targets_df.fillna(value=0)
143 |     # merge two together
144 |     combined_df = pd.merge(features_df, targets_df, left_index=True, right_index=True)
145 |     combined_df = combined_df.reset_index()
146 |     combined_df = combined_df.drop_duplicates()
147 |     combined_df = combined_df.set_index("Comment ID")
148 |     print(f"Shape of cleaned data is {combined_df.shape}")
149 |     return combined_df
150 | 
151 | 
152 | def clean_empty_features(text_dataframe):
153 |     """Replaces all whitespace-only values in a dataframe with np.NaN, then drops any rows containing NaN.
154 | 
155 |     Args:
156 |         text_dataframe (pd.DataFrame): DataFrame containing text data with labels.
157 | 
158 |     Returns:
159 |         (pd.DataFrame): DataFrame with whitespace-only values replaced by np.NaN and the affected rows dropped.
160 |     """
161 |     clean_dataframe = text_dataframe.replace(r"^\s*$", np.nan, regex=True)
162 |     clean_dataframe = clean_dataframe.dropna()
163 |     return clean_dataframe
164 | 
165 | 
166 | def onehot(df, col_to_onehot):
167 |     """Function to one-hot encode a specified column in a dataframe.
168 | 
169 |     Args:
170 |         df (pd.DataFrame): DataFrame containing data to be one-hot encoded
171 |         col_to_onehot (str): Name of the column to be one-hot encoded
172 | 
173 |     Returns:
174 |         (np.ndarray): One-hot encoded data
175 |     """
176 |     encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
177 |     col_encoded = encoder.fit_transform(df[[col_to_onehot]])
178 |     return col_encoded
179 | 
180 | 
181 | def process_data(df, target, preprocess_text=True, additional_features=False):
182 |     """Utilises remove_punc_and_nums and clean_empty_features functions to clean the text data and
183 |     drop any rows that are only whitespace after cleaning. Also fills one-hot encoded columns with
184 |     0s rather than NaNs so that Y target is not sparse.
185 | 
186 |     Args:
187 |         df (pd.DataFrame): DataFrame containing text data, any additional features, and targets
188 |         target (list): List of column names of targets
189 |         preprocess_text (bool, optional): Whether or not text is to be processed with remove_punc_and_nums. If utilising
190 |             an sklearn model then should be True. If utilising transformer-based BERT model then should be set to False.
191 |             Defaults to True.
192 |         additional_features (bool, optional): Whether or not 'question type' feature should be included. Defaults to False.
193 | 
194 |     Returns:
195 |         (tuple): Tuple containing the X features and the Y targets. The first element contains the X features (text, with or without question type depending on additional_features); the second contains the Y targets (one-hot encoded, or an np.array of sentiment classes when target is 'sentiment').
196 |     """
197 | 
198 |     if preprocess_text is True:
199 |         X = df["FFT answer"].astype(str).apply(remove_punc_and_nums)
200 |         X = clean_empty_features(X)
201 |         print(f"After preprocessing, shape of X is {X.shape}")
202 |     if preprocess_text is False:
203 |         X_temp = df["FFT answer"].astype(str).apply(remove_punc_and_nums)
204 |         X_temp = clean_empty_features(X_temp)
205 |         print(f"After preprocessing, shape of X is {X_temp.shape}")
206 |         indices = X_temp.index
207 |         X = df["FFT answer"].astype(str).filter(indices)
208 |     if additional_features is True:
209 |         X = pd.merge(X, df[["FFT_q_standardised"]], left_index=True, right_index=True)
210 |     X = X.reset_index()
211 |     X = X.drop_duplicates()
212 |     X = X.set_index("Comment ID")
213 |     if target == "sentiment":
214 |         Y = df["Comment sentiment"].astype(int) - 1
215 |     else:
216 |         Y = df[target].fillna(value=0)
217 |     Y = Y.loc[X.index]
218 |     Y = Y.reset_index()
219 |     Y = Y.drop_duplicates()
220 |     Y = Y.set_index("Comment ID")
221 |     if target == "sentiment":
222 |         Y = Y["Comment sentiment"]
223 |         Y = np.array(Y).astype(int)
224 |     return X, Y
225 | 
226 | 
227 | def process_and_split_data(
228 |     df,
229 |     target,
230 |     preprocess_text=True,
231 |     additional_features=False,
232 |     random_state=42,
233 | ):
234 |     """Combines the process_data and train_test_split functions into one function
235 | 
236 |     Args:
237 |         df (pd.DataFrame): DataFrame containing text data, any additional features, and targets
238 |         target (list): List of column names of targets
239 |         preprocess_text (bool, optional): Whether or not text is to be processed with remove_punc_and_nums. If utilising
240 |             an sklearn model then should be True. If utilising transformer-based BERT model then should be set to False.
241 |             Defaults to True.
242 |         additional_features (bool, optional): Whether or not 'question type' feature should be included. Defaults to False.
243 |         random_state (int, optional): Controls the shuffling applied to the data before applying the split. Enables reproducible output across multiple function calls. Defaults to 42.
244 | 
245 |     Returns:
246 |         (list): List containing train-test split of preprocessed X features and Y targets.
247 |     """
248 |     X, Y = process_data(
249 |         df,
250 |         target,
251 |         preprocess_text=preprocess_text,
252 |         additional_features=additional_features,
253 |     )
254 |     X_train, X_test, Y_train, Y_test = train_test_split(
255 |         X, Y, test_size=0.2, random_state=random_state
256 |     )
257 |     return X_train, X_test, Y_train, Y_test
258 | 
259 | 
260 | def remove_punc_and_nums(text):
261 |     """Function to conduct basic preprocessing of text, removing punctuation and numbers, converting
262 |     all text to lowercase, removing trailing whitespace.
263 | 
264 |     Args:
265 |         text (str): Str containing the text to be cleaned
266 | 
267 |     Returns:
268 |         (str): Cleaned text, all lowercased with no punctuation, numbers or trailing whitespace.
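    Example (illustrative):

        >>> remove_punc_and_nums("Staff were AMAZING!!! 10/10")
        'staff were amazing'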
269 | """ 270 | text = re.sub("\\n", " ", text) 271 | text = re.sub("\\r", " ", text) 272 | text = re.sub("’", "'", text) 273 | text = "".join(char for char in text if not char.isdigit()) 274 | punc_list = string.punctuation 275 | for punctuation in punc_list: 276 | if punctuation in [",", ".", "-"]: 277 | text = text.replace(punctuation, " ") 278 | else: 279 | text = text.replace(punctuation, "") 280 | text_split = [word for word in text.split(" ") if word != ""] 281 | text_lower = [] 282 | for word in text_split: 283 | text_lower.append(word.lower()) 284 | cleaned_sentence = " ".join(word for word in text_lower) 285 | cleaned_sentence = cleaned_sentence.strip() 286 | return cleaned_sentence 287 | 288 | 289 | if __name__ == "__main__": 290 | df = load_multilabel_data(dataset, target="major_categories") 291 | print(df.shape) 292 | print(df.head()) 293 | for i in df.columns: 294 | print(f"{i}: {df[i].dtype}") 295 | -------------------------------------------------------------------------------- /pxtextmining/factories/factory_model_performance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics 4 | from sklearn.base import is_classifier 5 | from sklearn.dummy import DummyClassifier 6 | from sklearn.metrics import confusion_matrix 7 | from tensorflow.keras.models import Model 8 | 9 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 10 | predict_multiclass_bert, 11 | ) 12 | 13 | 14 | def get_dummy_model(x_train, y_train): 15 | """Creates dummy model that randomly predicts labels, fitted on the training data. 16 | 17 | Args: 18 | x_train (pd.DataFrame): Input features. 19 | y_train (pd.DataFrame): Target values. 20 | 21 | Returns: 22 | (sklearn.dummy.DummyClassifier): Trained dummy classifier. 23 | """ 24 | model = DummyClassifier(strategy="uniform") 25 | model.fit(x_train, y_train) 26 | return model 27 | 28 | 29 | def get_multiclass_metrics( 30 | x_test, y_test, labels, random_state, model, additional_features, training_time=None 31 | ): 32 | """Creates a string detailing various performance metrics for a multiclass model, which can then be written to 33 | a text file. 34 | 35 | Args: 36 | x_test (pd.DataFrame): DataFrame containing test dataset features 37 | y_test (pd.DataFrame): DataFrame containing test dataset true target values 38 | labels (list): List containing the target labels 39 | random_state (int): Seed used to control the shuffling of the data, to enable reproducible results. 40 | model (tf.keras or sklearn model): Trained estimator. 41 | additional_features (bool, optional): Whether or not additional features (e.g. question type) have been included in training the model. Defaults to False. 42 | training_time (str, optional): Amount of time taken for model to train. Defaults to None. 43 | 44 | Raises: 45 | ValueError: Only models built with sklearn or tensorflow are allowed. 46 | 47 | Returns: 48 | (str): String containing the model architecture/hyperparameters, random state used for the train test split, and classification report. 
49 | """ 50 | metrics_string = "\n *****************" 51 | metrics_string += ( 52 | f"\n Random state seed for train test split is: {random_state} \n\n" 53 | ) 54 | # TF Keras models output probabilities with model.predict, whilst sklearn models output binary outcomes 55 | # Get them both to output the same (binary outcomes) and take max prob as label if no labels predicted at all 56 | if isinstance(model, Model) is True: 57 | stringlist = [] 58 | model.summary(print_fn=lambda x: stringlist.append(x)) 59 | model_summary = "\n".join(stringlist) 60 | metrics_string += f"\n{model_summary}\n" 61 | y_pred = predict_multiclass_bert( 62 | x_test, 63 | model, 64 | additional_features=additional_features, 65 | ) 66 | elif is_classifier(model) is True: 67 | metrics_string += f"\n{model}\n" 68 | y_pred = model.predict(x_test) 69 | else: 70 | raise ValueError("Model type not recognised") 71 | # Calculate various metrics 72 | metrics_string += f"\n\nTraining time: {training_time}\n" 73 | # Classification report 74 | metrics_string += "\n\n Classification report:\n" 75 | c_report_str = metrics.classification_report( 76 | y_test, y_pred, target_names=labels, zero_division=0 77 | ) 78 | metrics_string += c_report_str 79 | return metrics_string 80 | 81 | 82 | def get_multilabel_metrics( 83 | preds_df, 84 | y_test, 85 | labels, 86 | random_state, 87 | model, 88 | training_time=None, 89 | ): 90 | """Creates a string detailing various performance metrics for a multilabel model, which can then be written to 91 | a text file. 92 | 93 | Args: 94 | preds_df (pd.DataFrame): DataFrame containing model predictions 95 | y_test (pd.DataFrame): DataFrame containing test dataset true target values 96 | labels (list): List containing the target labels 97 | random_state (int): Seed used to control the shuffling of the data, to enable reproducible results. 98 | model (tf.keras or sklearn model): Trained estimator. 99 | training_time (str, optional): Amount of time taken for model to train. Defaults to None. 100 | 101 | Raises: 102 | ValueError: Only sklearn and tensorflow keras models allowed. 103 | 104 | Returns: 105 | (str): String containing the model architecture/hyperparameters, random state used for the train test split, and performance metrics including: exact accuracy, hamming loss, macro jaccard score, and classification report. 
106 | """ 107 | 108 | metrics_string = "\n *****************" 109 | metrics_string += ( 110 | f"\n Random state seed for train test split is: {random_state} \n\n" 111 | ) 112 | model_metrics = {} 113 | if isinstance(model, Model) is True: 114 | stringlist = [] 115 | model.summary(print_fn=lambda x: stringlist.append(x)) 116 | model_summary = "\n".join(stringlist) 117 | elif is_classifier(model) is True: 118 | model_summary = model 119 | else: 120 | raise ValueError("invalid model type") 121 | y_pred = np.array(preds_df[labels]).astype("int64") 122 | # Calculate various metrics 123 | model_metrics["exact_accuracy"] = metrics.accuracy_score(y_test, y_pred) 124 | model_metrics["hamming_loss"] = metrics.hamming_loss(y_test, y_pred) 125 | model_metrics["macro_jaccard_score"] = metrics.jaccard_score( 126 | y_test, y_pred, average="macro" 127 | ) 128 | y_probs = preds_df.filter(like="Probability", axis=1) 129 | model_metrics["macro_roc_auc"] = metrics.roc_auc_score( 130 | y_test, y_probs, multi_class="ovr" 131 | ) 132 | model_metrics[ 133 | "Label ranking average precision" 134 | ] = metrics.label_ranking_average_precision_score( 135 | y_test, 136 | y_probs, 137 | ) 138 | # Model summary 139 | metrics_string += f"\n{model_summary}\n" 140 | metrics_string += f"\n\nTraining time: {training_time}\n" 141 | for k, v in model_metrics.items(): 142 | metrics_string += f"\n{k}: {v}" 143 | # Classification report 144 | metrics_string += "\n\n Classification report:\n" 145 | c_report_str = metrics.classification_report( 146 | y_test, y_pred, target_names=labels, zero_division=0 147 | ) 148 | metrics_string += c_report_str 149 | return metrics_string 150 | 151 | 152 | def get_accuracy_per_class(y_test, pred): 153 | """Function to produce accuracy per class for the predicted categories, compared against real values. 154 | 155 | Args: 156 | y_test (pd.Series): Test data (real target values). 157 | pred (pd.Series): Predicted target values. 158 | 159 | Returns: 160 | (pd.DataFrame): The computed accuracy per class metrics for the model. 161 | 162 | """ 163 | cm = confusion_matrix(y_test, pred) 164 | accuracy_per_class = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] 165 | accuracy_per_class = pd.DataFrame(accuracy_per_class.diagonal()) 166 | accuracy_per_class.columns = ["accuracy"] 167 | unique, frequency = np.unique(y_test, return_counts=True) 168 | accuracy_per_class["class"], accuracy_per_class["counts"] = unique, frequency 169 | accuracy_per_class = accuracy_per_class[["class", "counts", "accuracy"]] 170 | return accuracy_per_class 171 | 172 | 173 | def parse_metrics_file(metrics_file, labels): 174 | """Reads performance metrics files that are written by `factory_write_results.write_multilabel_models_and_metrics`. 175 | Creates a pd.DataFrame with the precision, recall, f1_score, and support for each label, which can be filtered and sorted more easily. 176 | 177 | Args: 178 | metrics_file (str): Path to the metrics file to be parsed. 179 | labels (list): List of the target labels used in the metrics file. 180 | 181 | Returns: 182 | (pd.DataFrame): DataFrame containing the precision, recall, f1_score, and support for each label, as detailed in the performance metrics file. 
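    Example (illustrative; the file path is an assumption, matching the naming used by
        write_multilabel_models_and_metrics, and minor_cats comes from pxtextmining.params):

        metrics_df = parse_metrics_file("test_multilabel/model_0.txt", labels=minor_cats)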
183 |     """
184 |     with open(metrics_file, "r") as file:
185 |         content = file.readlines()
186 |     for i, line in enumerate(content):
187 |         if line.strip().startswith(labels[0][:10]):
188 |             startline = i
189 |         if line.strip().startswith(labels[-1][:10]):
190 |             endline = i + 1
191 |     lines = [x.strip() for x in content[startline:endline]]
192 |     metrics_dict = {
193 |         "label": [],
194 |         "precision": [],
195 |         "recall": [],
196 |         "f1_score": [],
197 |         "support (label count in test data)": [],
198 |     }
199 |     for each in lines:
200 |         splitted = each.rsplit(None, 4)  # labels can contain spaces; the last 4 fields are the metrics
201 |         metrics_dict["label"].append(splitted[0].strip())
202 |         metrics_dict["precision"].append(splitted[1].strip())
203 |         metrics_dict["recall"].append(splitted[2].strip())
204 |         metrics_dict["f1_score"].append(splitted[3].strip())
205 |         metrics_dict["support (label count in test data)"].append(splitted[4].strip())
206 |     metrics_df = pd.DataFrame.from_dict(metrics_dict)
207 |     return metrics_df
208 | 
209 | 
210 | def get_y_score(probs):
211 |     """Converts probabilities into format (n_samples, n_classes) so they can be passed into sklearn roc_auc_score function
212 | 
213 |     Args:
214 |         probs (np.ndarray): Probability estimates outputted by model
215 | 
216 |     Returns:
217 |         (np.ndarray): Probability estimates in format (n_samples, n_classes)
218 |     """
219 |     if probs.ndim == 3:
220 |         score = np.transpose([pred[:, 1] for pred in probs])
221 |     elif probs.ndim == 2:
222 |         score = probs
223 |     return score
224 | 
225 | 
226 | def additional_analysis(preds_df, y_true, labels, custom_threshold_dict=None):
227 |     """For given predictions, returns a dataframe containing, per label: the average precision score, and the counts of True Positives, True Negatives, False Positives, and False Negatives.
228 | 
229 |     Args:
230 |         preds_df (pd.DataFrame): Dataframe containing predicted labels in one-hot encoded format
231 |         y_true (np.array): One-hot encoded real Y values
232 |         labels (List): List of the target labels
233 |         custom_threshold_dict (dict, optional): Custom classification thresholds per label. Defaults to None.
234 |     Returns:
235 |         (pd.DataFrame): DataFrame containing, per label: the average precision score, the counts of True Positives, True Negatives, False Positives and False Negatives, and any custom thresholds used.
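    Example (illustrative; preds_df and y_true are assumed, minor_cats comes from pxtextmining.params):

        analysis_df = additional_analysis(preds_df, y_true, minor_cats)
        print(analysis_df["average_precision_score"].sort_values(ascending=False))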
236 | """ 237 | y_score = np.array(preds_df.filter(like="Probability", axis=1)) 238 | cm = metrics.multilabel_confusion_matrix(y_true, np.array(preds_df[labels])) 239 | cm_dict = {} 240 | average_precision = {} 241 | for i, label in enumerate(labels): 242 | cm_meaning = {} 243 | tn, fp = cm[i][0] 244 | fn, tp = cm[i][1] 245 | cm_meaning["True Negative"] = tn 246 | cm_meaning["False Negative"] = fn 247 | cm_meaning["True Positive"] = tp 248 | cm_meaning["False Positive"] = fp 249 | cm_dict[label] = cm_meaning 250 | average_precision[label] = metrics.average_precision_score( 251 | y_true[:, i], y_score[:, i] 252 | ) 253 | df = pd.DataFrame.from_dict(cm_dict, orient="index") 254 | average_precision = pd.Series(average_precision) 255 | df["average_precision_score"] = average_precision 256 | if custom_threshold_dict is not None: 257 | df["custom_threshold"] = custom_threshold_dict 258 | return df 259 | -------------------------------------------------------------------------------- /pxtextmining/factories/factory_write_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from tensorflow.keras import Model, Sequential 7 | 8 | from pxtextmining.factories.factory_model_performance import ( 9 | additional_analysis, 10 | parse_metrics_file, 11 | ) 12 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 13 | get_labels, 14 | get_probabilities, 15 | ) 16 | 17 | 18 | def write_multilabel_models_and_metrics(models, model_metrics, path): 19 | """Saves models and their associated performance metrics into a specified folder 20 | 21 | Args: 22 | models (list): List containing the trained tf.keras or sklearn models to be saved. 23 | model_metrics (list): List containing the model metrics in `str` format 24 | path (str): Path where model is to be saved. 25 | """ 26 | for i in range(len(models)): 27 | model_name = f"model_{i}" 28 | if not os.path.exists(path): 29 | os.makedirs(path) 30 | fullpath = os.path.join(path, model_name) 31 | if isinstance(models[i], (Sequential, Model)): 32 | models[i].save(fullpath) 33 | else: 34 | modelpath = os.path.join(path, model_name + ".sav") 35 | pickle.dump(models[i], open(modelpath, "wb")) 36 | # Write performance metrics file 37 | txtpath = os.path.join(path, model_name + ".txt") 38 | with open(txtpath, "w") as file: 39 | file.write(model_metrics[i]) 40 | print(f"{len(models)} models have been written to {path}") 41 | 42 | 43 | def write_model_preds(x, y_true, preds_df, labels, path="labels.xlsx", return_df=False): 44 | """Writes an Excel file to enable easier analysis of model outputs using the test set. Columns of the Excel file are: comment_id, actual_labels, predicted_labels, actual_label_probs, and predicted_label_probs. 45 | 46 | Args: 47 | x (pd.Series OR pd.DataFrame): Text data used for predictions 48 | y_true (np.array): Onehot encoded targets 49 | preds_df (pd.DataFrame): DataFrame containing predictions, predicted probabilities, and labels. Should be produced by predict_multilabel_sklearn or predict_multilabel_bert 50 | labels (list): List containing target labels. 51 | path (str, optional): Filename and path for file to be saved. Defaults to "labels.xlsx". 52 | return_df (bool, optional): Whether or not the processed data should be returned as a DataFrame. Defaults to False. 
53 | 54 | Returns: 55 | (pd.DataFrame): DataFrame containing comment_id, comment text, actual_labels, predicted_labels, actual_label_probs, and predicted_label_probs. 56 | """ 57 | assert len(x) == len(y_true) == len(preds_df) 58 | actual_labels = pd.DataFrame(y_true, columns=labels).apply( 59 | get_labels, args=(labels,), axis=1 60 | ) 61 | actual_labels.name = "actual_labels" 62 | predicted_labels = preds_df["labels"] 63 | predicted_labels.name = "predicted_labels" 64 | df = x.reset_index() 65 | probabilities = np.array(preds_df.filter(like="Probability", axis=1)) 66 | probs_actual = get_probabilities(actual_labels, labels, probabilities) 67 | probs_predicted = get_probabilities(predicted_labels, labels, probabilities) 68 | df = df.merge(actual_labels, left_index=True, right_index=True) 69 | df = df.merge(predicted_labels, left_on="Comment ID", right_index=True) 70 | df = df.merge(probs_actual, left_index=True, right_index=True) 71 | df = df.merge(probs_predicted, left_on="Comment ID", right_index=True) 72 | # Deal with any rogue characters; assign the result, as applymap is not in-place 73 | df = df.applymap( 74 | lambda x: x.encode("unicode_escape").decode("utf-8") 75 | if isinstance(x, str) 76 | else x 77 | ) 78 | df.to_excel(path, index=False) 79 | if return_df is True: 80 | return df 81 | 82 | 83 | def write_model_analysis( 84 | model_name, 85 | labels, 86 | dataset, 87 | path, 88 | preds_df=None, 89 | y_true=None, 90 | custom_threshold_dict=None, 91 | ): 92 | """Writes an Excel file with the performance metrics of each label, as well as the counts of samples for each label. 93 | 94 | Args: 95 | model_name (str): Model name used in the performance metrics file 96 | labels (list): List of labels for the categories to be predicted. 97 | dataset (pd.DataFrame): Original dataset before train test split 98 | path (str): Filepath where model and performance metrics file are saved. 99 | preds_df (pd.DataFrame, optional): DataFrame containing the model's predictions, used for additional analysis. Defaults to None. 100 | y_true (np.array, optional): One-hot encoded real Y values, used for additional analysis. Defaults to None. 101 | custom_threshold_dict (dict, optional): Custom classification threshold for each label, included in the analysis if given. Defaults to None. 102 | """ 103 | metrics_df = parse_metrics_file(f"{path}/{model_name}.txt", labels=labels) 104 | label_counts = pd.DataFrame(dataset[labels].sum()) 105 | label_counts = label_counts.reset_index() 106 | label_counts = label_counts.rename( 107 | columns={"index": "label", 0: "label_count_in_full_dataset"} 108 | ) 109 | metrics_df = metrics_df.merge(label_counts, on="label").set_index("label") 110 | if preds_df is not None and y_true is not None: 111 | more_metrics = additional_analysis( 112 | preds_df, y_true, labels, custom_threshold_dict 113 | ) 114 | metrics_df = pd.concat([metrics_df, more_metrics], axis=1) 115 | metrics_df.to_excel(f"{path}/{model_name}_perf.xlsx", index=True) 116 | -------------------------------------------------------------------------------- /pxtextmining/helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/pxtextmining/helpers/__init__.py -------------------------------------------------------------------------------- /pxtextmining/helpers/text_preprocessor.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.preprocessing.sequence import pad_sequences 2 | from tensorflow.keras.preprocessing.text import Tokenizer 3 | 4 | 5 | def tf_preprocessing(X, max_sentence_length=150): 6 | """Conducts preprocessing with tensorflow tokenizer which vectorizes text and standardizes length. 7 | 8 | Args: 9 | X (pd.Series): Series containing the text to be processed 10 | max_sentence_length (int, optional): Maximum number of words. Defaults to 150. 11 | 12 | Returns: 13 | (tuple): Tuple containing `np.array` of padded, tokenized, vectorized texts, and `int` showing number of unique words in vocabulary. 14 | """ 15 | tk = Tokenizer() 16 | tk.fit_on_texts(X) 17 | vocab_size = len(tk.word_index) 18 | print(f"There are {vocab_size} different words in your corpus") 19 | X_token = tk.texts_to_sequences(X) 20 | ### Pad the inputs 21 | X_pad = pad_sequences(X_token, dtype="float32", padding="post", maxlen=max_sentence_length) 22 | return X_pad, vocab_size 23 |
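# Illustrative usage sketch (comment only, not part of the original module):
#
#     import pandas as pd
#     X = pd.Series(["the staff were great", "parking was difficult"])
#     X_pad, vocab_size = tf_preprocessing(X, max_sentence_length=10)
#     # X_pad has shape (2, 10); vocab_size counts the 7 unique words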
-------------------------------------------------------------------------------- /pxtextmining/params.py: -------------------------------------------------------------------------------- 1 | dataset = "datasets/hidden/v7_230925.csv" 2 | 3 | random_state = 42 4 | 5 | model_name = "distilbert-base-uncased" 6 | 7 | q_map = { 8 | "Please tell us why": "nonspecific", 9 | "Please tells us why you gave this answer?": "nonspecific", 10 | "FFT Why?": "nonspecific", 11 | "What was good?": "what_good", 12 | "Is there anything we could have done better?": "could_improve", 13 | "How could we improve?": "could_improve", 14 | "What could we do better?": "could_improve", 15 | "Please can you tell us why you gave your answer and what we could have done better?": "nonspecific", 16 | "Please describe any things about the 111 service that\r\nyou were particularly satisfied and/or dissatisfied with": "nonspecific", 17 | "Please describe any things about the 111 service that \nyou were particularly satisfied and/or dissatisfied with": "nonspecific", 18 | "Please describe any things about the 111 service that\nyou were particularly satisfied and/or dissatisfied with": "nonspecific", 19 | "Nonspecific": "nonspecific", 20 | "nonspecific": "nonspecific", 21 | } 22 | 23 | # v7 24 | major_cat_dict = { 25 | "General": [ 26 | "Positive experience & gratitude", 27 | "Organisation & efficiency", 28 | "Funding & use of financial resources", 29 | "Feeling safe", 30 | "Labelling not possible", 31 | ], 32 | "Staff": [ 33 | "Staff manner & personal attributes", 34 | "Competence & training", 35 | "Staffing levels & responsiveness", 36 | ], 37 | "Access to medical care & support": [ 38 | "Contacting services", 39 | "Appointment arrangements", 40 | "Appointment method", 41 | "Timeliness of care", 42 | ], 43 | "Communication & involvement": [ 44 | "Unspecified communication", 45 | "Staff listening, understanding & involving patients", 46 | "Information directly from staff during care", 47 | "Information provision & guidance", 48 | "Being kept informed, clarity & consistency of information", 49 | "Interaction with family/ carers", 50 | ], 51 | "Mental Health specifics": ["Mental Health Act"], 52 | "Patient journey & service coordination": ["Continuity of care", "Discharge"], 53 | "Medication & pain": ["Supplying & understanding medication", "Pain management"], 54 | "Activities": ["Activities & access to fresh air", "Electronic entertainment"], 55 | "Environment, equipment & catering": [ 56 | "Cleanliness, tidiness & infection control", 57 | "Sensory experience", 58 | "Environment, facilities & equipment", 59 | "Food & drink provision & facilities", 60 | ], 61 | "Service location, travel & transport": [ 62 | "Service location", 63 | "Transport to/ from services", 64 | "Parking", 65 | ], 66 | } 67 | 68 | major_cats = list(major_cat_dict.keys()) 69 | 70 | # v7 20230925 71 | minor_cats = [ 72 | "Organisation & efficiency", 73 | "Funding & use of financial resources", 74 | "Staff manner & personal attributes", 75
| "Competence & training", 76 | "Unspecified communication", 77 | "Staff listening, understanding & involving patients", 78 | "Information directly from staff during care", 79 | "Information provision & guidance", 80 | "Being kept informed, clarity & consistency of information", 81 | "Contacting services", 82 | "Appointment arrangements", 83 | "Appointment method", 84 | "Timeliness of care", 85 | "Pain management", 86 | "Discharge", 87 | "Cleanliness, tidiness & infection control", 88 | "Service location", 89 | "Transport to/ from services", 90 | "Parking", 91 | "Electronic entertainment", 92 | "Feeling safe", 93 | "Mental Health Act", 94 | "Labelling not possible", 95 | "Supplying & understanding medication", 96 | "Activities & access to fresh air", 97 | "Food & drink provision & facilities", 98 | "Sensory experience", 99 | "Interaction with family/ carers", 100 | "Positive experience & gratitude", 101 | "Continuity of care", 102 | "Environment, facilities & equipment", 103 | "Staffing levels & responsiveness", 104 | ] 105 | 106 | sentiment_dict = { 107 | 1: "very positive", 108 | 2: "positive", 109 | 3: "neutral", 110 | 4: "negative", 111 | 5: "very negative", 112 | } 113 | 114 | 115 | # Note that some of these categories no longer exist since v7 of the framework 116 | rules_dict = { 117 | "Care plans": [ 118 | "plan", 119 | "planning", 120 | "plans", 121 | "treatment", 122 | "care", 123 | "future", 124 | "forward", 125 | "forwards", 126 | "action", 127 | ], 128 | "Patient appearance & grooming": [ 129 | "basin", 130 | "bowl", 131 | "brush", 132 | "clothes", 133 | "comb", 134 | "dressed", 135 | "gown", 136 | "hair", 137 | "hairbrush", 138 | "mirror", 139 | "modesty", 140 | "hygien", 141 | "razor", 142 | "shampoo", 143 | "shower", 144 | "sink", 145 | "wear", 146 | "wash", 147 | ], 148 | "Equality, Diversity & Inclusion": [ 149 | "accessib", 150 | "adjustment", 151 | "adhd", 152 | "age", 153 | "autis", 154 | "cultur", 155 | "deaf", 156 | "disab", 157 | "wheelchair", 158 | "discriminat", 159 | "gender", 160 | "hearing", 161 | "language", 162 | "blind", 163 | "mobility", 164 | "race", 165 | "racis", 166 | "religio", 167 | "sexis", 168 | "trans ", 169 | "misgender", 170 | ], 171 | "Patient records": [ 172 | "accurate", 173 | "computer", 174 | "confidential", 175 | "data", 176 | "identifiable", 177 | "notes", 178 | "paperwork", 179 | "papers", 180 | "details", 181 | "record", 182 | "system", 183 | "updated", 184 | "app", 185 | ], 186 | # "Admission": [ 187 | # "admission", 188 | # "admit", 189 | # ], Model already good at picking this up where the words admission/admit are in the text 190 | "Referals & continuity of care": [ 191 | "refer", 192 | "same", 193 | "different", 194 | "continu", 195 | "transfer", 196 | "pass", 197 | "between", 198 | ], 199 | "Staff continuity": [ 200 | "same", 201 | "different", 202 | "retire", 203 | "handover", 204 | "relationship", 205 | "communication", 206 | "change", 207 | "transition", 208 | "passed", 209 | ], 210 | "Diagnosis & triage": [ 211 | "assess", 212 | "diagnos", 213 | "question", 214 | "scan", 215 | "test", 216 | "triage", 217 | "wrong", 218 | "figure", 219 | "identif", 220 | "call", 221 | ], 222 | "Mental Health Act": [ 223 | "leave", 224 | "leav", 225 | "allowed", 226 | "detain", 227 | "prisoner", 228 | "release", 229 | "restrict", 230 | "seclu", 231 | "section", 232 | ], 233 | "Interaction with family/ carers": [ 234 | "brother", 235 | "carer", 236 | "child", 237 | "dad", 238 | "father", 239 | "husband", 240 | "partner", 241 | "famil", 242 | "mam", 
243 | "mum", 244 | "wife", 245 | "mother", 246 | "parent", 247 | "relative", 248 | "visit", 249 | "sister", 250 | ], 251 | "Service location": [ 252 | "access", 253 | "direct", 254 | "away", 255 | "far", 256 | "distance", 257 | "local", 258 | "locat", 259 | "lost", 260 | "map", 261 | "miles", 262 | "place", 263 | "sign", 264 | "go to", 265 | "travel", 266 | "where", 267 | "get to", 268 | ], 269 | "Negative experience & dissatisfaction": [ 270 | "rubbish", 271 | "awful", 272 | "poor", 273 | "bad", 274 | "terrible", 275 | "unacceptable", 276 | ], 277 | } 278 | 279 | probs_dict = { 280 | "Negative experience & dissatisfaction": 0.4, 281 | "Diagnosis & triage": 0.4, 282 | "Equality, Diversity & Inclusion": 0.4, 283 | "Referals & continuity of care": 0.4, 284 | "Staff continuity": 0.4, 285 | } 286 | -------------------------------------------------------------------------------- /pxtextmining/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/pxtextmining/pipelines/__init__.py -------------------------------------------------------------------------------- /pxtextmining/pipelines/multilabel_pipeline.py: -------------------------------------------------------------------------------- 1 | from warnings import simplefilter 2 | 3 | from sklearn.exceptions import ConvergenceWarning 4 | from sklearn.model_selection import train_test_split 5 | 6 | from pxtextmining.factories.factory_data_load_and_split import ( 7 | bert_data_to_dataset, 8 | load_multilabel_data, 9 | process_and_split_data, 10 | ) 11 | from pxtextmining.factories.factory_model_performance import get_multilabel_metrics 12 | from pxtextmining.factories.factory_pipeline import ( 13 | calculating_class_weights, 14 | create_and_train_svc_model, 15 | create_bert_model, 16 | create_bert_model_additional_features, 17 | search_sklearn_pipelines, 18 | train_bert_model, 19 | ) 20 | from pxtextmining.factories.factory_predict_unlabelled_text import ( 21 | get_thresholds, 22 | predict_multilabel_bert, 23 | predict_multilabel_sklearn, 24 | ) 25 | from pxtextmining.factories.factory_write_results import ( 26 | write_model_analysis, 27 | write_model_preds, 28 | write_multilabel_models_and_metrics, 29 | ) 30 | from pxtextmining.params import dataset, major_cats, minor_cats, random_state 31 | 32 | simplefilter("ignore", category=ConvergenceWarning) 33 | 34 | 35 | def run_sklearn_pipeline( 36 | additional_features=False, 37 | target=major_cats, 38 | models_to_try=("mnb", "knn", "svm", "rfc"), 39 | path="test_multilabel", 40 | include_analysis=False, 41 | custom_threshold=False, 42 | ): 43 | """Runs all the functions required to load multilabel data, preprocess it, and split it into training and test sets. 44 | Creates sklearn pipelines and hyperparameters to search, using specified estimators. 45 | For each estimator type selected, performs a randomized search across the hyperparameters to identify the parameters providing the best 46 | results on the holdout data within the randomized search. 47 | Evaluates the performance of the refitted estimator with the best hyperparameters on the test set, and saves the model 48 | and the performance metrics to a specified folder. 49 | 50 | Args: 51 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to False. 
52 | target (list, optional): The target labels, which should be columns in the dataset DataFrame. Defaults to major_cats. 53 | models_to_try (list, optional): List of the estimators to try. Defaults to ["mnb", "knn", "svm", "rfc"]. Permitted values are "mnb" (Multinomial Naive Bayes), "knn" (K Nearest Neighbours), "svm" (Support Vector Classifier), "rfc" (Random Forest Classifier), or "xgb" (XGBoost). 54 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel'. 55 | include_analysis (bool, optional): Whether or not additional Excel files showing model performance and predicted labels are generated. Defaults to False. 56 | custom_threshold (bool, optional): Whether or not a custom classification threshold maximising the F1 score is to be calculated. Defaults to False. 57 | """ 58 | # random_state = random.randint(1, 999) 59 | if target == major_cats: 60 | target_name = "major_categories" 61 | if target == minor_cats: 62 | target_name = "minor_categories" 63 | df = load_multilabel_data(filename=dataset, target=target_name) 64 | if custom_threshold is True: 65 | X_train_val, X_test, Y_train_val, Y_test = process_and_split_data( 66 | df, 67 | target=target, 68 | preprocess_text=False, 69 | additional_features=additional_features, 70 | random_state=random_state, 71 | ) 72 | X_train, X_val, Y_train, Y_val = train_test_split( 73 | X_train_val, Y_train_val, test_size=0.2, random_state=random_state 74 | ) 75 | else: 76 | X_train, X_test, Y_train, Y_test = process_and_split_data( 77 | df, 78 | target=target, 79 | additional_features=additional_features, 80 | random_state=random_state, 81 | ) 82 | models, training_times = search_sklearn_pipelines( 83 | X_train, 84 | Y_train, 85 | models_to_try=models_to_try, 86 | additional_features=additional_features, 87 | ) 88 | model_metrics = [] 89 | threshold_dicts = [] 90 | preds = [] 91 | for i in range(len(models)): 92 | m = models[i] 93 | t = training_times[i] 94 | if custom_threshold is True: 95 | val_probs = m.predict_proba(X_val) 96 | custom_threshold_dict = get_thresholds(Y_val, val_probs, labels=target) 97 | else: 98 | custom_threshold_dict = None 99 | threshold_dicts.append(custom_threshold_dict) 100 | preds_df = predict_multilabel_sklearn( 101 | X_test, 102 | m, 103 | labels=target, 104 | additional_features=additional_features, 105 | label_fix=True, 106 | custom_threshold_dict=custom_threshold_dict, 107 | ) 108 | preds.append(preds_df) 109 | model_metrics.append( 110 | get_multilabel_metrics( 111 | preds_df, 112 | Y_test, 113 | random_state=random_state, 114 | labels=target, 115 | model=m, 116 | training_time=t, 117 | ) 118 | ) 119 | write_multilabel_models_and_metrics(models, model_metrics, path=path) 120 | if include_analysis is True: 121 | for i in range(len(models)): 122 | model_name = f"model_{i}" 123 | write_model_preds( 124 | X_test, 125 | Y_test, 126 | preds[i], 127 | labels=target, 128 | path=f"{path}/{model_name}_labels.xlsx", 129 | ) 130 | write_model_analysis( 131 | model_name, 132 | labels=target, 133 | dataset=df, 134 | path=path, 135 | preds_df=preds[i], 136 | y_true=Y_test, 137 | custom_threshold_dict=threshold_dicts[i], 138 | ) 139 | print("Pipeline complete") 140 | 141 |
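# Illustrative usage sketch (comment only, not part of the library code):
# a typical invocation of run_sklearn_pipeline on the minor categories,
# assuming the labelled dataset configured in pxtextmining.params is
# available locally. The output path here is hypothetical.
#
#     run_sklearn_pipeline(
#         additional_features=True,
#         target=minor_cats,
#         models_to_try=["svm", "xgb"],
#         path="test_multilabel/example_run",
#         include_analysis=True,
#     )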
142 | def run_svc_pipeline( 143 | additional_features=False, 144 | target=major_cats, 145 | path="test_multilabel", 146 | include_analysis=False, 147 | custom_threshold=False, 148 | ): 149 | """Runs all the functions required to load multilabel data, preprocess it, and split it into training and test sets. 150 | Creates sklearn pipeline using a MultiOutputClassifier and Support Vector Classifier estimator, with specific hyperparameters. 151 | Fits the pipeline on the training data. 152 | Evaluates the performance of the trained pipeline on the test set, and saves the model and the performance metrics to a specified folder, together with optional further analysis in the form of Excel files. 153 | 154 | Args: 155 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to False. 156 | target (list, optional): The target labels, which should be columns in the dataset DataFrame. Defaults to major_cats. 157 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel'. 158 | include_analysis (bool, optional): Whether or not to create Excel files including further analysis of the model's performance. Defaults to False. If True, writes two Excel files to the specified folder, one containing the labels and the performance metrics for each label, and one containing the predicted labels and the actual labels for the test set, with the model's probabilities for both. 159 | custom_threshold (bool, optional): Whether or not a custom classification threshold maximising the F1 score is to be calculated. Defaults to False. 160 | 161 | """ 162 | # random_state = random.randint(1, 999) 163 | if target == major_cats: 164 | target_name = "major_categories" 165 | if target == minor_cats: 166 | target_name = "minor_categories" 167 | df = load_multilabel_data(filename=dataset, target=target_name) 168 | if custom_threshold is True: 169 | X_train_val, X_test, Y_train_val, Y_test = process_and_split_data( 170 | df, 171 | target=target, 172 | preprocess_text=False, 173 | additional_features=additional_features, 174 | random_state=random_state, 175 | ) 176 | X_train, X_val, Y_train, Y_val = train_test_split( 177 | X_train_val, Y_train_val, test_size=0.2, random_state=random_state 178 | ) 179 | else: 180 | X_train, X_test, Y_train, Y_test = process_and_split_data( 181 | df, 182 | target=target, 183 | additional_features=additional_features, 184 | random_state=random_state, 185 | ) 186 | model, training_time = create_and_train_svc_model( 187 | X_train, Y_train, additional_features=additional_features 188 | ) 189 | if custom_threshold is True: 190 | val_probs = model.predict_proba(X_val) 191 | custom_threshold_dict = get_thresholds(Y_val, val_probs, labels=target) 192 | else: 193 | custom_threshold_dict = None 194 | preds_df = predict_multilabel_sklearn( 195 | X_test, 196 | model=model, 197 | labels=target, 198 | additional_features=additional_features, 199 | label_fix=True, 200 | custom_threshold_dict=custom_threshold_dict, 201 | ) 202 | model_metrics = get_multilabel_metrics( 203 | preds_df, 204 | Y_test, 205 | labels=target, 206 | random_state=random_state, 207 | model=model, 208 | training_time=training_time, 209 | ) 210 | write_multilabel_models_and_metrics([model], [model_metrics], path=path) 211 | if include_analysis is True: 212 | write_model_preds( 213 | X_test, 214 | Y_test, 215 | preds_df, 216 | labels=target, 217 | path=f"{path}/labels.xlsx", 218 | ) 219 | write_model_analysis( 220 | model_name="model_0", 221 | labels=target, 222 | dataset=df, 223 | path=path, 224 | preds_df=preds_df, 225 | y_true=Y_test, 226 | custom_threshold_dict=custom_threshold_dict, 227 | ) 228 | print("Pipeline complete!") 229 | 230 | 231 | def
run_bert_pipeline( 232 | additional_features=False, 233 | path="test_multilabel/bert", 234 | target=major_cats, 235 | include_analysis=False, 236 | custom_threshold=False, 237 | ): 238 | """Runs all the functions required to load multilabel data, preprocess it, and split it into training, test and validation sets. 239 | Creates tf.keras Transformer model with additional layers specific to the classification task, and trains it on the train set. 240 | Evaluates the performance of the trained model on the test set, and saves the model 241 | and the performance metrics to a specified folder. 242 | 243 | Args: 244 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to False. 245 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel/bert'. 246 | target (list, optional): The target labels, which should be columns in the dataset DataFrame. Defaults to major_cats. 247 | include_analysis (bool, optional): Whether or not to create Excel files including further analysis of the model's performance. Defaults to False. If True, writes two Excel files to the specified folder, one containing the labels and the performance metrics for each label, and one containing the predicted labels and the actual labels for the test set, with the model's probabilities for both. 248 | custom_threshold (bool, optional): Whether or not a custom classification threshold maximising the F1 score is to be calculated. Defaults to False. 249 | 250 | """ 251 | # random_state = random.randint(1, 999) 252 | print(f"random_state is: {random_state}") 253 | if target == major_cats: 254 | target_name = "major_categories" 255 | if target == minor_cats: 256 | target_name = "minor_categories" 257 | df = load_multilabel_data(filename=dataset, target=target_name) 258 | X_train_val, X_test, Y_train_val, Y_test = process_and_split_data( 259 | df, 260 | target=target, 261 | preprocess_text=False, 262 | additional_features=additional_features, 263 | random_state=random_state, 264 | ) 265 | X_train, X_val, Y_train, Y_val = train_test_split( 266 | X_train_val, Y_train_val, test_size=0.2, random_state=random_state 267 | ) 268 | train_dataset = bert_data_to_dataset( 269 | X_train, Y_train, additional_features=additional_features 270 | ) 271 | val_dataset = bert_data_to_dataset( 272 | X_val, Y_val, additional_features=additional_features 273 | ) 274 | class_weights_dict = calculating_class_weights(Y_train_val) 275 | if additional_features is True: 276 | model = create_bert_model_additional_features(Y_train) 277 | else: 278 | model = create_bert_model(Y_train) 279 | model_trained, training_time = train_bert_model( 280 | train_dataset, 281 | val_dataset, 282 | model, 283 | class_weights_dict=class_weights_dict, 284 | epochs=25, 285 | ) 286 | if custom_threshold is True: 287 | val = bert_data_to_dataset(X_val, additional_features=additional_features) 288 | val_probs = model_trained.predict(val) 289 | custom_threshold_dict = get_thresholds(Y_val, val_probs, labels=target) 290 | else: 291 | custom_threshold_dict = None 292 | preds_df = predict_multilabel_bert( 293 | X_test, 294 | model=model_trained, 295 | labels=target, 296 | additional_features=additional_features, 297 | label_fix=True, 298 | custom_threshold_dict=custom_threshold_dict, 299 | ) 300 | model_metrics = get_multilabel_metrics( 301 | preds_df, 302 | Y_test, 303 | labels=target, 304 | random_state=random_state, 305 |
model=model_trained, 306 | training_time=training_time, 307 | ) 308 | write_multilabel_models_and_metrics([model_trained], [model_metrics], path=path) 309 | if include_analysis is True: 310 | write_model_preds( 311 | X_test, 312 | Y_test, 313 | preds_df, 314 | labels=target, 315 | path=f"{path}/labels.xlsx", 316 | ) 317 | write_model_analysis( 318 | model_name="model_0", 319 | labels=target, 320 | dataset=df, 321 | path=path, 322 | preds_df=preds_df, 323 | y_true=Y_test, 324 | custom_threshold_dict=custom_threshold_dict, 325 | ) 326 | print("Pipeline complete!") 327 | 328 | 329 | if __name__ == "__main__": 330 | # run_svc_pipeline( 331 | # additional_features=False, 332 | # target=minor_cats, 333 | # path="test_multilabel/v7_final/svc_noq", 334 | # include_analysis=True, 335 | # custom_threshold=False, 336 | # ) 337 | # run_svc_pipeline( 338 | # additional_features=True, 339 | # target=minor_cats, 340 | # path="test_multilabel/v7_final/svc", 341 | # include_analysis=True, 342 | # custom_threshold=False, 343 | # ) 344 | # run_sklearn_pipeline( 345 | # additional_features=True, 346 | # target=minor_cats, 347 | # models_to_try=["xgb"], 348 | # path="test_multilabel/v7_final/xgb", 349 | # include_analysis=True, 350 | # custom_threshold=False, 351 | # ) 352 | # run_sklearn_pipeline( 353 | # additional_features=False, 354 | # target=minor_cats, 355 | # models_to_try=["xgb"], 356 | # path="test_multilabel/v7_final/xgb_noq", 357 | # include_analysis=True, 358 | # custom_threshold=False, 359 | # ) 360 | # run_bert_pipeline( 361 | # additional_features=True, 362 | # path="test_multilabel/v7_final/bert", 363 | # target=minor_cats, 364 | # include_analysis=True, 365 | # custom_threshold=False, 366 | # ) 367 | # run_bert_pipeline( 368 | # additional_features=False, 369 | # path="test_multilabel/v7_final/bert_noq", 370 | # target=minor_cats, 371 | # include_analysis=True, 372 | # custom_threshold=False, 373 | # ) 374 | run_sklearn_pipeline( 375 | additional_features=False, 376 | target=minor_cats, 377 | models_to_try=["svm"], 378 | path="test_multilabel/v7_final/svc_gridsearch", 379 | include_analysis=True, 380 | custom_threshold=False, 381 | ) 382 | # run_two_layer_sklearn_pipeline() -------------------------------------------------------------------------------- /pxtextmining/pipelines/sentiment_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.utils.class_weight import compute_class_weight 4 | from tensorflow.keras.utils import to_categorical 5 | 6 | from pxtextmining.factories.factory_data_load_and_split import ( 7 | bert_data_to_dataset, 8 | load_multilabel_data, 9 | process_and_split_data, 10 | ) 11 | from pxtextmining.factories.factory_model_performance import get_multiclass_metrics 12 | from pxtextmining.factories.factory_pipeline import ( 13 | create_bert_model, 14 | create_bert_model_additional_features, 15 | search_sklearn_pipelines, 16 | train_bert_model, 17 | ) 18 | from pxtextmining.factories.factory_write_results import ( 19 | write_multilabel_models_and_metrics, 20 | ) 21 | from pxtextmining.params import dataset 22 | 23 | random_state = 75 24 | 25 | 26 | def run_sentiment_pipeline( 27 | additional_features=False, 28 | models_to_try=("svm", "xgb"), 29 | path="test_multilabel/sentiment", 30 | ): 31 | """Runs all the functions required to load multiclass data, preprocess it, and split it into training and test sets.
32 | Creates sklearn model and hyperparameter grid to search, and trains it on the train set. 33 | Evaluates the performance of the trained model with the best hyperparameters on the test set, and saves the model 34 | and the performance metrics to a specified folder. 35 | 36 | Args: 37 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to False. 38 | models_to_try (list, optional): Which model types to try. Defaults to ["svm", "xgb"]. 39 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel/sentiment'. 40 | """ 41 | target_names = ["very positive", "positive", "neutral", "negative", "very negative"] 42 | df = load_multilabel_data(filename=dataset, target="sentiment") 43 | X_train, X_test, Y_train, Y_test = process_and_split_data( 44 | df, 45 | target="sentiment", 46 | additional_features=additional_features, 47 | random_state=random_state, 48 | ) 49 | models, training_times = search_sklearn_pipelines( 50 | X_train, 51 | Y_train, 52 | target="sentiment", 53 | models_to_try=models_to_try, 54 | additional_features=additional_features, 55 | ) 56 | model_metrics = [] 57 | for i in range(len(models)): 58 | m = models[i] 59 | t = training_times[i] 60 | metrics = get_multiclass_metrics( 61 | X_test, 62 | Y_test, 63 | labels=target_names, 64 | random_state=random_state, 65 | model=m, 66 | training_time=t, 67 | additional_features=additional_features, 68 | ) 69 | model_metrics.append(metrics) 70 | write_multilabel_models_and_metrics(models, model_metrics, path) 71 | 72 | 73 | def run_sentiment_bert_pipeline( 74 | additional_features=True, path="test_multilabel/sentiment_bert" 75 | ): 76 | """Runs all the functions required to load multiclass data, preprocess it, and split it into training, test and validation sets. 77 | Creates tf.keras Transformer model with additional layers specific to the classification task, and trains it on the train set. 78 | Evaluates the performance of the trained model on the test set, and saves the model 79 | and the performance metrics to a specified folder. 80 | 81 | Args: 82 | additional_features (bool, optional): Whether or not additional features (question type and text length) are used. Defaults to True. 83 | path (str, optional): Path where the models are to be saved. If path does not exist, it will be created. Defaults to 'test_multilabel/sentiment_bert'.
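Example (illustrative invocation; assumes the labelled dataset configured
in pxtextmining.params is available locally):

    run_sentiment_bert_pipeline(
        additional_features=True, path="test_multilabel/sentiment_bert"
    )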
84 | """ 85 | print(f"random_state is: {random_state}") 86 | target_names = ["very positive", "positive", "neutral", "negative", "very negative"] 87 | df = load_multilabel_data(filename=dataset, target="sentiment") 88 | X_train_val, X_test, Y_train_val, Y_test = process_and_split_data( 89 | df, 90 | target="sentiment", 91 | additional_features=additional_features, 92 | preprocess_text=True, 93 | random_state=random_state, 94 | ) 95 | Y_train_val_oh = to_categorical(Y_train_val) 96 | X_train, X_val, Y_train, Y_val = train_test_split( 97 | X_train_val, Y_train_val_oh, test_size=0.2, random_state=random_state 98 | ) 99 | train_dataset = bert_data_to_dataset( 100 | X_train, Y_train, additional_features=additional_features 101 | ) 102 | val_dataset = bert_data_to_dataset( 103 | X_val, Y_val, additional_features=additional_features 104 | ) 105 | cw = compute_class_weight("balanced", classes=np.unique(Y_train_val), y=Y_train_val) 106 | class_weights_dict = {} 107 | for k, v in enumerate(list(cw)): 108 | class_weights_dict[k] = v 109 | if additional_features is True: 110 | model = create_bert_model_additional_features(Y_train, multilabel=False) 111 | else: 112 | model = create_bert_model(Y_train, multilabel=False) 113 | model_trained, training_time = train_bert_model( 114 | train_dataset, 115 | val_dataset, 116 | model, 117 | class_weights_dict=class_weights_dict, 118 | epochs=25, 119 | ) 120 | model_metrics = get_multiclass_metrics( 121 | X_test, 122 | Y_test, 123 | random_state=random_state, 124 | labels=target_names, 125 | model=model_trained, 126 | training_time=training_time, 127 | additional_features=additional_features, 128 | ) 129 | write_multilabel_models_and_metrics([model_trained], [model_metrics], path=path) 130 | 131 | 132 | if __name__ == "__main__": 133 | # run_sentiment_pipeline(additional_features=False) 134 | run_sentiment_bert_pipeline( 135 | additional_features=True, path="test_multilabel/230908_sentiment_bert" 136 | ) 137 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pxtextmining" 3 | version = "1.0.1" 4 | description = "Text classification of patient experience feedback." 
5 | authors = ['CDU Data Science ', 6 | 'YiWen Hon '] 7 | readme = "README.md" 8 | license = "MIT" 9 | repository = "https://github.com/the-strategy-unit/pxtextmining" 10 | documentation = "https://the-strategy-unit.github.io/pxtextmining" 11 | 12 | [tool.poetry.dependencies] 13 | python = ">3.8, <3.11" 14 | joblib = "^1.2.0" 15 | matplotlib = "^3.3.2" 16 | numpy = ">=1.22" 17 | pandas = "^1.4.0" 18 | scikit-learn = "1.0.2" 19 | tensorflow = "2.12.0" 20 | transformers = "^4.26.1" 21 | scipy = "^1.10.1" 22 | xgboost = "^1.7.5" 23 | 24 | [tool.poetry.group.dev] 25 | optional = true 26 | 27 | [tool.poetry.group.dev.dependencies] 28 | uvicorn = "^0.20.0" 29 | pydantic = "^1.10.4" 30 | pytest = "^7.2.2" 31 | fastapi = "^0.101.0" 32 | httpx = "^0.23.3" 33 | pytest-cov = "^4.0.0" 34 | pytest-mock = "^3.10.0" 35 | requests = "^2.31.0" 36 | ruff = "^0.0.272" 37 | pre-commit = "^3.3.3" 38 | tornado = "^6.3.3" 39 | 40 | [tool.poetry.group.docs] 41 | optional = true 42 | 43 | [tool.poetry.group.docs.dependencies] 44 | mkdocs = "^1.4.2" 45 | mkdocstrings-python = "^0.8.2" 46 | mkdocstrings = "^0.19.1" 47 | 48 | [tool.pytest.ini_options] 49 | testpaths = ["tests"] 50 | pythonpath = ["api"] 51 | 52 | [tool.ruff] 53 | select = ["E", "F", "B"] 54 | ignore = ["E501"] 55 | 56 | [build-system] 57 | requires = ["poetry-core"] 58 | build-backend = "poetry.core.masonry.api" 59 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import setuptools 4 | 5 | if __name__ == "__main__": 6 | setuptools.setup() 7 | -------------------------------------------------------------------------------- /test_multilabel/dummy_metrics.txt: -------------------------------------------------------------------------------- 1 | 2 | ***************** 3 | DummyClassifier(strategy='uniform') 4 | 5 | 6 | Training time: None 7 | 8 | exact_accuracy: 0.0 9 | hamming_loss: 0.5030487804878049 10 | macro_jaccard_score: 0.0787901813020388 11 | 12 | Classification report: 13 | precision recall f1-score support 14 | 15 | Access to medical care & support 0.21 0.47 0.29 143 16 | Activities 0.01 0.36 0.02 11 17 | Additional 0.01 0.43 0.02 7 18 | Category TBC 0.00 0.00 0.00 1 19 | Communication & involvement 0.21 0.53 0.31 137 20 | Environment & equipment 0.03 0.52 0.06 21 21 | Food & diet 0.03 0.57 0.05 14 22 | General 0.36 0.46 0.40 248 23 | Medication 0.03 0.77 0.06 13 24 | Mental Health specifics 0.01 0.43 0.02 7 25 | Patient journey & service coordination 0.08 0.48 0.14 58 26 | Service location, travel & transport 0.01 0.33 0.02 12 27 | Staff 0.28 0.48 0.35 193 28 | 29 | micro avg 0.10 0.48 0.16 865 30 | macro avg 0.10 0.45 0.13 865 31 | weighted avg 0.24 0.48 0.30 865 32 | samples avg 0.10 0.48 0.15 865 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Strategy-Unit/pxtextmining/ba6c0a2f10ca9bb3b964b72afe6047b463095c75/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | from unittest.mock import Mock 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | 9 | from pxtextmining.params import minor_cats, q_map 10 | 11 | 
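# Shared fixtures for the test suite. These provide small synthetic
# stand-ins for the real survey data: a five-row feature frame, a mocked
# pandas.read_csv, randomly generated raw data in the expected column
# layout, and a dummy predictions DataFrame using the
# "Probability of <label>" column convention.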
12 | @pytest.fixture 13 | def grab_test_X_additional_feats(): 14 | data_dict = { 15 | "FFT answer": { 16 | "Q1": "Nurses were great", 17 | "Q2": "Communication was fantastic", 18 | "Q3": "Impossible to find parking, but pleased to get an appointment close to home", 19 | "Q4": "Food and drink selection very limited", 20 | "Q5": "The ward was boiling hot, although staff were great at explaining details", 21 | }, 22 | "FFT_q_standardised": { 23 | "Q1": "what_good", 24 | "Q2": "what_good", 25 | "Q3": "could_improve", 26 | "Q4": "could_improve", 27 | "Q5": "could_improve", 28 | }, 29 | } 30 | text_X_additional_feats = pd.DataFrame(data_dict) 31 | text_X_additional_feats.index.name = "Comment ID" 32 | return text_X_additional_feats 33 | 34 | 35 | @pytest.fixture 36 | def mock_read_csv(mocker, test_raw_data): 37 | mock = Mock() 38 | mocker.patch("pandas.read_csv", return_value=test_raw_data) 39 | return mock 40 | 41 | 42 | @pytest.fixture 43 | def test_raw_data(): 44 | cols = [ 45 | "Comment ID", 46 | "Trust", 47 | "Respondent ID", 48 | "Date", 49 | "Service Type 1", 50 | "Service type 2", 51 | "FFT categorical answer", 52 | "FFT question", 53 | "FFT answer", 54 | "Comment sentiment", 55 | ] 56 | cols.extend(minor_cats) 57 | data_dict = {} 58 | for col in cols: 59 | row = [] 60 | if col not in minor_cats: 61 | if col in ["FFT categorical answer", "Comment sentiment"]: 62 | for _i in range(5): 63 | row.append(random.randint(1, 5)) 64 | elif col == "FFT question": 65 | for _i in range(5): 66 | row.append(random.choice(list(q_map.keys()))) 67 | else: 68 | for _i in range(5): 69 | row.append( 70 | "".join( 71 | random.choices(string.ascii_uppercase + string.digits, k=5) 72 | ) 73 | ) 74 | else: 75 | for _i in range(5): 76 | row.append(random.choice([np.NaN, 1])) 77 | data_dict[col] = row 78 | data = pd.DataFrame(data_dict) 79 | return data 80 | 81 | 82 | @pytest.fixture 83 | def grab_preds_df(): 84 | labels = ["one", "two", "three", "four", "five"] 85 | probs_labels = ["Probability of " + x for x in labels] 86 | preds_df = pd.DataFrame( 87 | np.array( 88 | [ 89 | [0.0, 1.0, 0.0, 1.0, 0.0, 0.1, 0.6, 0.2, 0.7, 0.05], 90 | [1.0, 0.0, 0.0, 1.0, 0.0, 0.55, 0.2, 0.3, 0.8, 0.4], 91 | [1.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.3, 0.2, 0.3, 0.1], 92 | [1.0, 0.0, 1.0, 1.0, 0.0, 0.7, 0.2, 0.8, 0.9, 0.0], 93 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.2, 0.4, 0.2, 0.1, 0.6], 94 | ] 95 | ), 96 | columns=labels + probs_labels, 97 | ) 98 | preds_df["labels"] = [ 99 | ["two", "four"], 100 | ["one", "four"], 101 | ["one"], 102 | ["one", "three", "four"], 103 | ["five"], 104 | ] 105 | return preds_df 106 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fastapi.testclient import TestClient 3 | 4 | from api.api import app 5 | 6 | client = TestClient(app) 7 | 8 | 9 | def test_main(): 10 | response = client.get("/") 11 | assert response.status_code == 200 12 | assert response.json() == {"test": "Hello"} 13 | 14 | 15 | def test_multilabel_predictions(): 16 | test_json = [ 17 | { 18 | "comment_id": "99999", 19 | "comment_text": "I liked all of it", 20 | }, 21 | {"comment_id": "A55", "comment_text": "", "question_type": "nonspecific"}, 22 | { 23 | "comment_id": "A56", 24 | "comment_text": "Truly awful time finding parking", 25 | }, 26 | { 27 | "comment_id": "4", 28 | "comment_text": "I really enjoyed the session", 29 | }, 30 | {"comment_id": "5", "comment_text": "7482367"}, 31 | ] 32 | 
response = client.post("/predict_multilabel", json=test_json).json() 33 | assert len(test_json) == len(response) 34 | assert isinstance(response[0]["labels"], list) 35 | 36 | 37 | def test_comment_id_error(): 38 | with pytest.raises(ValueError): 39 | test_json = [ 40 | {"comment_id": "1", "comment_text": "I liked all of it"}, 41 | {"comment_id": "1", "comment_text": "I liked all of it"}, 42 | ] 43 | client.post("/predict_multilabel", json=test_json).json() 44 | -------------------------------------------------------------------------------- /tests/test_data_load_and_split.py: -------------------------------------------------------------------------------- 1 | import random 2 | from unittest.mock import patch 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from pxtextmining.factories import factory_data_load_and_split 9 | from pxtextmining.params import minor_cats 10 | 11 | 12 | @pytest.fixture 13 | def grab_test_df(grab_test_X_additional_feats): 14 | df = grab_test_X_additional_feats 15 | df["Comment sentiment"] = 0 16 | df[minor_cats] = 0 17 | for i in range(df.shape[0]): 18 | df.loc[i, "Comment sentiment"] = random.randint(1, 5) 19 | for cat in minor_cats: 20 | df.loc[i, cat] = random.randint(0, 1) 21 | return df 22 | 23 | 24 | @pytest.fixture 25 | def grab_test_Y(): 26 | Y_feats = np.array( 27 | [ 28 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 29 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 30 | [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], 31 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 32 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 33 | ] 34 | ) 35 | return Y_feats 36 | 37 | 38 | @pytest.mark.parametrize( 39 | "target", ["major_categories", "minor_categories", "sentiment"] 40 | ) 41 | def test_load_multilabel_data(mock_read_csv, target): 42 | filename = "None" 43 | df = factory_data_load_and_split.load_multilabel_data(filename, target) 44 | assert type(df) == pd.DataFrame 45 | 46 | 47 | @patch("pandas.read_csv") 48 | def test_load_multilabel_data_error(mock_bad_csv, test_raw_data): 49 | test_raw_data["FFT question"] = "Nonsense question" 50 | mock_bad_csv.return_value = test_raw_data 51 | filename = "None" 52 | with pytest.raises(ValueError): 53 | factory_data_load_and_split.load_multilabel_data( 54 | filename, target="minor_categories" 55 | ) 56 | 57 | 58 | def test_merge_categories(): 59 | test_df = pd.DataFrame( 60 | {"col_1": [0, 0, 0, 0, 1], "col_2": [0, 1, 0, 0, 1], "col_3": [1, 0, 0, 0, 0]} 61 | ) 62 | new_cat = "new_cat" 63 | cats_to_merge = ["col_1", "col_2"] 64 | merged_df = factory_data_load_and_split.merge_categories( 65 | test_df, new_cat, cats_to_merge 66 | ) 67 | assert list(merged_df.columns) == ["col_3", "new_cat"] 68 | assert merged_df["new_cat"].sum() == 2 69 | 70 | 71 | def test_remove_punc_and_nums(): 72 | text = "Here is.some TEXT?!?!?! 
12345 :)" 73 | cleaned_text = factory_data_load_and_split.remove_punc_and_nums(text) 74 | assert cleaned_text == "here is some text" 75 | 76 | 77 | def test_clean_empty_features(): 78 | df_with_empty_lines = pd.DataFrame({"text": ["Some text", "", " ", "More text"]}) 79 | clean_df = factory_data_load_and_split.clean_empty_features(df_with_empty_lines) 80 | assert clean_df.shape == (2, 1) 81 | 82 | 83 | def test_onehot(): 84 | df_to_onehot = pd.DataFrame({"Categories": ["A", "B", "C", "A", "A", "B"]}) 85 | df_onehotted = factory_data_load_and_split.onehot(df_to_onehot, "Categories") 86 | assert df_onehotted.shape == (6, 3) 87 | 88 | 89 | def test_bert_data_to_dataset_with_Y(grab_test_X_additional_feats, grab_test_Y): 90 | train_dataset = factory_data_load_and_split.bert_data_to_dataset( 91 | grab_test_X_additional_feats, grab_test_Y, additional_features=True 92 | ) 93 | assert isinstance(train_dataset._structure, tuple) 94 | 95 | 96 | def test_bert_data_to_dataset_without_Y(grab_test_X_additional_feats): 97 | test_dataset = factory_data_load_and_split.bert_data_to_dataset( 98 | grab_test_X_additional_feats, Y=None, additional_features=True 99 | ) 100 | assert isinstance(test_dataset, dict) 101 | 102 | 103 | @pytest.mark.parametrize("target", [minor_cats, "sentiment"]) 104 | @pytest.mark.parametrize("additional_features", [True, False]) 105 | @pytest.mark.parametrize("preprocess_text", [True, False]) 106 | def test_process_data(grab_test_df, target, preprocess_text, additional_features): 107 | X, Y = factory_data_load_and_split.process_data( 108 | grab_test_df, target, preprocess_text, additional_features 109 | ) 110 | assert X.shape[0] == Y.shape[0] 111 | 112 | 113 | def test_process_and_split_data(grab_test_df): 114 | ( 115 | X_train, 116 | X_test, 117 | Y_train, 118 | Y_test, 119 | ) = factory_data_load_and_split.process_and_split_data( 120 | grab_test_df, 121 | target=minor_cats, 122 | preprocess_text=False, 123 | additional_features=True, 124 | ) 125 | assert len(X_train) == len(Y_train) 126 | assert len(X_test) == len(Y_test) 127 | -------------------------------------------------------------------------------- /tests/test_docker_run.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from unittest.mock import mock_open, patch 4 | 5 | import pandas as pd 6 | import pytest 7 | 8 | import docker_run 9 | from pxtextmining.params import minor_cats 10 | 11 | 12 | @pytest.fixture 13 | def sentiment_output(): 14 | s_preds = pd.DataFrame( 15 | [ 16 | {"comment_id": "1", "sentiment": 1}, 17 | {"comment_id": "2", "sentiment": "Labelling not possible"}, 18 | ] 19 | ) 20 | return s_preds 21 | 22 | 23 | @pytest.fixture 24 | def multilabel_output(): 25 | m_preds = pd.DataFrame( 26 | [ 27 | {"comment_id": "1", "labels": ["Positive experience & gratitude"]}, 28 | {"comment_id": "2", "labels": ["Labelling not possible"]}, 29 | ] 30 | ) 31 | return m_preds 32 | 33 | 34 | @pytest.fixture 35 | def output_df(): 36 | indices = ["1"] 37 | df_list = [] 38 | for _ in range(len(indices)): 39 | data_dict = {} 40 | for cat in minor_cats: 41 | data_dict[cat] = random.randint(0, 1) 42 | key = f"Probability of '{cat}'" 43 | data_dict[key] = random.uniform(0.0, 0.99) 44 | df_list.append(data_dict) 45 | df = pd.DataFrame(df_list) 46 | df.index = indices 47 | assert len(df.columns) == 64 48 | return df 49 | 50 | 51 | @pytest.fixture 52 | def input_data(): 53 | input_text = [ 54 | { 55 | "comment_id": "1", 56 | "comment_text": "Nurse was great.", 57 
| "question_type": "what_good", 58 | }, 59 | {"comment_id": "2", "comment_text": "", "question_type": "could_improve"}, 60 | ] 61 | return input_text 62 | 63 | 64 | @patch("docker_run.load_model") 65 | def test_load_bert_model(mock_load): 66 | docker_run.load_bert_model("bert_sentiment") 67 | mock_load.assert_called_once() 68 | 69 | 70 | def test_process_text(input_data): 71 | df, text_to_predict = docker_run.process_text(input_data) 72 | assert len(df.columns) == 3 73 | assert len(text_to_predict.columns) == 2 74 | assert text_to_predict.index.name == "Comment ID" 75 | assert df.shape[0] == text_to_predict.shape[0] 76 | 77 | 78 | @patch("docker_run.pickle.load") 79 | def test_load_sklearn_model(mock_pickle_load): 80 | docker_run.load_sklearn_model("final_svc") 81 | mock_pickle_load.assert_called_once() 82 | 83 | 84 | @patch("docker_run.predict_sentiment_bert") 85 | @patch("docker_run.load_model") 86 | def test_predict_sentiment(mock_load_model, mock_get_predictions, input_data): 87 | input_text = input_data 88 | output = pd.DataFrame( 89 | [ 90 | { 91 | "Comment ID": "1", 92 | "FFT answer": "Nurse was great.", 93 | "FFT_q_standardised": "what_good", 94 | "sentiment": 1, 95 | } 96 | ] 97 | ).set_index("Comment ID") 98 | mock_get_predictions.return_value = output 99 | return_df = docker_run.predict_sentiment(input_text) 100 | mock_load_model.assert_called_once() 101 | mock_get_predictions.assert_called() 102 | assert len(return_df) == len(input_text) 103 | 104 | 105 | @patch("docker_run.predict_multilabel_sklearn") 106 | @patch("docker_run.predict_multilabel_bert") 107 | @patch("docker_run.pickle.load") 108 | @patch("docker_run.load_model") 109 | def test_predict_multilabel_ensemble( 110 | mock_load_model, 111 | mock_pickle_load, 112 | mock_predict_bert, 113 | mock_predict_sklearn, 114 | output_df, 115 | input_data, 116 | ): 117 | mock_predict_bert.return_value = output_df 118 | mock_predict_sklearn.return_value = output_df 119 | input_text = input_data 120 | return_df = docker_run.predict_multilabel_ensemble(input_text) 121 | mock_load_model.assert_called_once() 122 | mock_pickle_load.assert_called() 123 | mock_predict_bert.assert_called_once() 124 | mock_predict_sklearn.assert_called() 125 | assert len(return_df) == len(input_text) 126 | 127 | 128 | @pytest.mark.parametrize( 129 | "args", 130 | [ 131 | ["file_01.json"], 132 | ["file_01.json", "-l", "--target", "m"], 133 | ["file_01.json", "-t", "s"], 134 | ], 135 | ) 136 | def test_parse_args(mocker, args): 137 | mocker.patch("sys.argv", ["docker_run.py"] + args) 138 | args = docker_run.parse_args() 139 | assert args.json_file[0] == "file_01.json" 140 | if args.local_storage: 141 | assert args.local_storage is True 142 | assert args.target in "ms" 143 | 144 | 145 | def test_comment_id_error(): 146 | with pytest.raises(ValueError): 147 | test_json = [ 148 | { 149 | "comment_id": "1", 150 | "comment_text": "I liked all of it", 151 | "question_type": "nonspecific", 152 | }, 153 | { 154 | "comment_id": "1", 155 | "comment_text": "I liked all of it", 156 | "question_type": "nonspecific", 157 | }, 158 | ] 159 | docker_run.process_text(test_json) 160 | 161 | 162 | @patch("docker_run.predict_multilabel_ensemble") 163 | @patch("docker_run.predict_sentiment") 164 | @patch("docker_run.os.remove") 165 | @patch( 166 | "builtins.open", new_callable=mock_open, read_data=json.dumps([{"data": "Here"}]) 167 | ) 168 | @patch("sys.argv", ["docker_run.py"] + ["file_01.json"]) 169 | def test_main_not_local( 170 | mock_open, 171 | mock_remove, 172 | 
mock_predict_sentiment, 173 | mock_predict_ensemble, 174 | sentiment_output, 175 | multilabel_output, 176 | ): 177 | mock_predict_sentiment.return_value = sentiment_output 178 | mock_predict_ensemble.return_value = multilabel_output 179 | docker_run.main() 180 | mock_open.assert_called() 181 | mock_predict_sentiment.assert_called() 182 | mock_predict_ensemble.assert_called() 183 | mock_remove.assert_called_once() 184 | 185 | 186 | @patch("docker_run.predict_sentiment") 187 | @patch( 188 | "builtins.open", new_callable=mock_open, read_data=json.dumps([{"data": "Here"}]) 189 | ) 190 | @patch("sys.argv", ["docker_run.py"] + ["file_01.json", "-l", "-t", "s"]) 191 | def test_main_local(mock_open, mock_predict_sentiment, sentiment_output): 192 | mock_predict_sentiment.return_value = sentiment_output 193 | docker_run.main() 194 | mock_open.assert_called() 195 | mock_predict_sentiment.assert_called() 196 | -------------------------------------------------------------------------------- /tests/test_factory_pipeline.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, Mock, patch 2 | 3 | import numpy as np 4 | import pytest 5 | from keras.engine.functional import Functional 6 | from sklearn.base import is_classifier 7 | from sklearn.pipeline import Pipeline 8 | 9 | from pxtextmining.factories import factory_pipeline 10 | 11 | 12 | @pytest.mark.parametrize("model_type", ["svm", "xgb"]) 13 | @pytest.mark.parametrize("additional_features", [True, False]) 14 | def test_create_sklearn_pipeline_sentiment(model_type, additional_features): 15 | pipe, params = factory_pipeline.create_sklearn_pipeline_sentiment( 16 | model_type, 3, additional_features=additional_features 17 | ) 18 | assert isinstance(params, dict) is True 19 | assert is_classifier(pipe) is True 20 | 21 | 22 | @pytest.mark.parametrize("multilabel", [True, False]) 23 | def test_create_bert_model(multilabel): 24 | Y_train = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) 25 | model = factory_pipeline.create_bert_model(Y_train, multilabel=multilabel) 26 | assert isinstance(model, Functional) is True 27 | 28 | 29 | @pytest.mark.parametrize("multilabel", [True, False]) 30 | def test_create_bert_model_additional_features(multilabel): 31 | Y_train = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) 32 | model = factory_pipeline.create_bert_model_additional_features( 33 | Y_train, multilabel=multilabel 34 | ) 35 | assert isinstance(model, Functional) is True 36 | 37 | 38 | def test_train_bert_model(): 39 | train_dataset = Mock() 40 | test_dataset = Mock() 41 | model = Mock() 42 | model, training_time = factory_pipeline.train_bert_model( 43 | train_dataset, test_dataset, model 44 | ) 45 | model.fit.assert_called_once() 46 | assert isinstance(training_time, str) is True 47 | 48 | 49 | def test_calculating_class_weights(): 50 | Y_train = np.array( 51 | [[0, 1, 0], [1, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 0, 0]] 52 | ) 53 | class_weights_dict = factory_pipeline.calculating_class_weights(Y_train) 54 | assert isinstance(class_weights_dict, dict) is True 55 | 56 | 57 | @pytest.mark.parametrize("model_type", ["svm", "xgb", "rfc", "mnb", "knn"]) 58 | @pytest.mark.parametrize("additional_features", [True, False]) 59 | def test_create_sklearn_pipeline(model_type, additional_features): 60 | pipe, params = factory_pipeline.create_sklearn_pipeline( 61 | model_type, additional_features 62 | ) 63 | assert is_classifier(pipe) is True 64 | assert isinstance(params, dict) is True 65 | 66 | 67 | 
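# The two search tests below patch RandomizedSearchCV so that no real
# hyperparameter search runs: the mocked search returns a dummy Pipeline,
# keeping the tests fast while still exercising the wiring of
# search_sklearn_pipelines.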
@pytest.mark.parametrize("target", ["sentiment", None]) 68 | @pytest.mark.parametrize("model_type", [["svm"], ["xgb"]]) 69 | @patch("pxtextmining.factories.factory_pipeline.RandomizedSearchCV") 70 | def test_search_sklearn_pipelines( 71 | mock_randomsearch, target, model_type, grab_test_X_additional_feats 72 | ): 73 | mock_instance = MagicMock() 74 | mock_randomsearch.return_value = mock_instance 75 | X_train = grab_test_X_additional_feats 76 | Y_train = np.array( 77 | [ 78 | [0, 1, 0, 1, 0], 79 | [1, 0, 0, 1, 0], 80 | [1, 0, 0, 0, 0], 81 | [1, 0, 1, 1, 0], 82 | [0, 0, 0, 0, 1], 83 | ] 84 | ) 85 | mock_instance.best_estimator_ = Pipeline([("dummy", None)]) 86 | mock_instance.best_params_ = {"param1": 10, "param2": 20} 87 | 88 | models, training_times = factory_pipeline.search_sklearn_pipelines( 89 | X_train, 90 | Y_train, 91 | models_to_try=model_type, 92 | target=target, 93 | additional_features=True, 94 | ) 95 | 96 | mock_instance.fit.assert_called() 97 | assert len(models) == 1 98 | assert isinstance(models[0], Pipeline) is True 99 | assert models[0].steps[0][0] == "dummy" 100 | assert len(training_times) == 1 101 | 102 | with pytest.raises(ValueError): 103 | factory_pipeline.search_sklearn_pipelines( 104 | X_train, 105 | Y_train, 106 | models_to_try=["nonsense"], 107 | target=target, 108 | additional_features=True, 109 | ) 110 | 111 | 112 | @pytest.mark.parametrize("target", ["sentiment", None]) 113 | @patch("pxtextmining.factories.factory_pipeline.RandomizedSearchCV") 114 | def test_search_sklearn_pipelines_no_feats( 115 | mock_randomsearch, target, grab_test_X_additional_feats 116 | ): 117 | mock_instance = MagicMock() 118 | mock_randomsearch.return_value = mock_instance 119 | models_to_try = ["svm"] 120 | X_train = grab_test_X_additional_feats["FFT answer"] 121 | Y_train = np.array( 122 | [ 123 | [0, 1, 0, 1, 0], 124 | [1, 0, 0, 1, 0], 125 | [1, 0, 0, 0, 0], 126 | [1, 0, 1, 1, 0], 127 | [0, 0, 0, 0, 1], 128 | ] 129 | ) 130 | mock_instance.best_estimator_ = Pipeline([("dummy", None)]) 131 | mock_instance.best_params_ = {"param1": 10, "param2": 20} 132 | 133 | models, training_times = factory_pipeline.search_sklearn_pipelines( 134 | X_train, Y_train, models_to_try, target=target, additional_features=False 135 | ) 136 | 137 | mock_instance.fit.assert_called() 138 | assert len(models) == 1 139 | assert isinstance(models[0], Pipeline) is True 140 | assert models[0].steps[0][0] == "dummy" 141 | assert len(training_times) == 1 142 | 143 | 144 | @patch("pxtextmining.factories.factory_pipeline.make_pipeline") 145 | def test_create_and_train_svc_model(mock_pipeline, grab_test_X_additional_feats): 146 | mock_pipe = Mock() 147 | mock_pipeline.return_value = mock_pipe 148 | X_train = grab_test_X_additional_feats 149 | Y_train = np.array( 150 | [ 151 | [0, 1, 0, 1, 0], 152 | [1, 0, 0, 1, 0], 153 | [1, 0, 0, 0, 0], 154 | [1, 0, 1, 1, 0], 155 | [0, 0, 0, 0, 1], 156 | ] 157 | ) 158 | factory_pipeline.create_and_train_svc_model( 159 | X_train, Y_train, additional_features=True 160 | ) 161 | mock_pipe.fit.assert_called_with(X_train, Y_train) 162 | 163 | 164 | @patch("pxtextmining.factories.factory_pipeline.make_pipeline") 165 | def test_create_and_train_svc_model_no_feats( 166 | mock_pipeline, grab_test_X_additional_feats 167 | ): 168 | mock_pipe = Mock() 169 | mock_pipeline.return_value = mock_pipe 170 | X_train = grab_test_X_additional_feats["FFT answer"] 171 | Y_train = np.array( 172 | [ 173 | [0, 1, 0, 1, 0], 174 | [1, 0, 0, 1, 0], 175 | [1, 0, 0, 0, 0], 176 | [1, 0, 1, 1, 0], 177 | [0, 0, 0, 0, 


@patch("pxtextmining.factories.factory_pipeline.make_pipeline")
def test_create_and_train_svc_model_no_feats(
    mock_pipeline, grab_test_X_additional_feats
):
    mock_pipe = Mock()
    mock_pipeline.return_value = mock_pipe
    X_train = grab_test_X_additional_feats["FFT answer"]
    Y_train = np.array(
        [
            [0, 1, 0, 1, 0],
            [1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0],
            [1, 0, 1, 1, 0],
            [0, 0, 0, 0, 1],
        ]
    )
    factory_pipeline.create_and_train_svc_model(
        X_train, Y_train, additional_features=False
    )
    mock_pipe.fit.assert_called_with(X_train, Y_train)
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
import numpy as np

from pxtextmining.helpers.text_preprocessor import tf_preprocessing


def test_text_preprocessor(grab_test_X_additional_feats):
    data = grab_test_X_additional_feats["FFT answer"]
    X_pad, vocab_size = tf_preprocessing(data)
    assert isinstance(X_pad, np.ndarray) is True
    assert len(X_pad) == data.shape[0]
    assert isinstance(vocab_size, int) is True
--------------------------------------------------------------------------------
/tests/test_model_performance.py:
--------------------------------------------------------------------------------
from unittest.mock import Mock

import numpy as np
import pandas as pd
import pytest
from tensorflow.keras import Model

from pxtextmining.factories import factory_model_performance


@pytest.fixture
def grab_test_bert_multiclass():
    predicted_probs = np.array(
        [
            [0.9, 0.01, 0.07, 0.01, 0.01],
            [0.01, 0.07, 0.01, 0.01, 0.9],
            [0.07, 0.9, 0.01, 0.01, 0.01],
            [0.9, 0.01, 0.07, 0.01, 0.01],
            [0.9, 0.01, 0.01, 0.01, 0.07],
        ]
    )
    model = Mock(spec=Model, predict=Mock(return_value=predicted_probs))
    return model


@pytest.fixture
def grab_test_bert_multilabel():
    predicted_probs = np.array(
        [
            [6.2770307e-01, 2.3520987e-02, 1.3149388e-01, 2.7835215e-02, 1.8944685e-01],
            [9.8868138e-01, 1.9990385e-03, 5.4453085e-03, 9.0726715e-04, 2.9669846e-03],
            [4.2310607e-01, 5.6546849e-01, 9.3136989e-03, 1.3205722e-03, 7.9117226e-04],
            [2.0081511e-01, 7.0609129e-04, 1.1107661e-03, 7.9677838e-01, 5.8961433e-04],
            [1.4777037e-03, 5.1493715e-03, 2.8268427e-03, 7.4673461e-04, 9.8979920e-01],
        ]
    )
    model = Mock(spec=Model, predict=Mock(return_value=predicted_probs))
    return model


def test_multiclass_metrics_sklearn(grab_test_X_additional_feats):
    x = grab_test_X_additional_feats
    y = np.array([[0, 1, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [0, 0, 1]])
    labels = ["A", "B", "C"]
    model = factory_model_performance.get_dummy_model(x, y)
    random_state = 42
    additional_features = True
    metrics_string = factory_model_performance.get_multiclass_metrics(
        x, y, labels, random_state, model, additional_features
    )
    assert isinstance(metrics_string, str) is True


def test_multiclass_metrics_bert(
    grab_test_X_additional_feats, grab_test_bert_multiclass
):
    x = grab_test_X_additional_feats
    y = np.array([[0], [4], [1], [3], [3]])
    labels = ["A", "B", "C", "D"]
    model = grab_test_bert_multiclass
    random_state = 42
    additional_features = True
    metrics_string = factory_model_performance.get_multiclass_metrics(
        x, y, labels, random_state, model, additional_features
    )
    assert isinstance(metrics_string, str) is True
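

# Note on the test above: the mocked BERT model predicts five columns of
# probabilities, yet only four labels are passed. That is consistent because
# the classes actually observed (in y plus the argmax of the predictions)
# number four. A self-contained check of that arithmetic:
def test_multiclass_observed_classes_sketch(grab_test_bert_multiclass):
    probs = grab_test_bert_multiclass.predict(None)
    y_true = np.array([0, 4, 1, 3, 3])
    y_pred = probs.argmax(axis=1)  # -> [0, 4, 1, 0, 0]
    observed = set(y_true) | set(y_pred)
    assert len(observed) == 4  # matches the four labels used above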


def test_multilabel_metrics_sklearn(grab_preds_df, grab_test_X_additional_feats):
    preds_df = grab_preds_df
    x = grab_test_X_additional_feats
    y = np.array(
        [
            [0, 1, 0, 1, 0],
            [1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0],
            [1, 0, 1, 1, 0],
            [0, 0, 0, 0, 1],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    random_state = 42
    model = factory_model_performance.get_dummy_model(x, y)
    metrics_string = factory_model_performance.get_multilabel_metrics(
        preds_df,
        y,
        labels,
        random_state,
        model,
    )
    assert isinstance(metrics_string, str) is True


def test_multilabel_metrics_bert(grab_test_bert_multilabel, grab_preds_df):
    preds_df = grab_preds_df
    y = np.array(
        [
            [0, 1, 0, 1, 0],
            [1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0],
            [1, 0, 1, 1, 0],
            [0, 0, 0, 0, 1],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    random_state = 42
    model = grab_test_bert_multilabel
    metrics_string = factory_model_performance.get_multilabel_metrics(
        preds_df,
        y,
        labels,
        random_state,
        model,
    )
    assert isinstance(metrics_string, str) is True


def test_accuracy_per_class():
    y_test = pd.Series([0, 1, 0, 2, 1, 0])
    y_pred = pd.Series([0, 1, 0, 1, 1, 2])
    df = factory_model_performance.get_accuracy_per_class(y_test, y_pred)
    assert df.shape == (3, 3)


def test_parse_metrics_file():
    metrics_file = "current_best_model/sentiment/bert_sentiment.txt"
    labels = ["very positive", "positive", "neutral", "negative", "very negative"]
    metrics_df = factory_model_performance.parse_metrics_file(metrics_file, labels)
    assert metrics_df.shape == (5, 5)


@pytest.mark.parametrize(
    "custom_threshold_dict",
    [None, {"one": 0.6, "two": 0.5, "three": 0.75, "four": 0.6, "five": 0.5}],
)
def test_additional_analysis(custom_threshold_dict, grab_preds_df):
    y_true = np.array(
        [
            [0.0, 1.0, 0.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 1.0, 1.0],
            [1.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    preds_df = grab_preds_df
    analysis_df = factory_model_performance.additional_analysis(
        preds_df, y_true, labels, custom_threshold_dict
    )
    assert list(analysis_df.index) == labels
    if custom_threshold_dict is None:
        assert len(analysis_df.columns) == 5
    else:
        assert len(analysis_df.columns) == 6


def test_multiclass_metrics_valueerror(
    grab_test_X_additional_feats,
):
    x = grab_test_X_additional_feats
    y = np.array([[0], [4], [1], [3], [3]])
    labels = ["A", "B", "C", "D"]
    model = Mock(spec=None)
    random_state = 42
    additional_features = True
    with pytest.raises(ValueError):
        factory_model_performance.get_multiclass_metrics(
            x, y, labels, random_state, model, additional_features
        )
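

# The two ValueError tests in this module hinge on Mock specs: Mock(spec=Model)
# passes isinstance checks against keras' Model, while Mock(spec=None) is a
# bare Mock that matches nothing, which is presumably how the metrics
# functions reject unsupported model objects. A self-contained demonstration:
def test_mock_spec_isinstance_sketch():
    specced = Mock(spec=Model)
    bare = Mock(spec=None)
    assert isinstance(specced, Model)
    assert not isinstance(bare, Model)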


def test_multilabel_metrics_valueerror(
    grab_preds_df,
):
    preds_df = grab_preds_df
    y = np.array(
        [
            [0, 1, 0, 1, 0],
            [1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0],
            [1, 0, 1, 1, 0],
            [0, 0, 0, 0, 1],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    random_state = 42
    model = Mock(spec=None)
    with pytest.raises(ValueError):
        factory_model_performance.get_multilabel_metrics(
            preds_df,
            y,
            labels,
            random_state,
            model,
        )


def test_get_y_score_2d():
    test_probs = np.array(
        [
            [6.2770307e-01, 2.3520987e-02, 1.3149388e-01, 2.7835215e-02, 1.8944685e-01],
            [9.8868138e-01, 1.9990385e-03, 5.4453085e-03, 9.0726715e-04, 2.9669846e-03],
            [4.2310607e-01, 5.6546849e-01, 9.3136989e-03, 1.3205722e-03, 7.9117226e-04],
            [2.0081511e-01, 7.0609129e-04, 1.1107661e-03, 7.9677838e-01, 5.8961433e-04],
            [1.4777037e-03, 5.1493715e-03, 2.8268427e-03, 7.4673461e-04, 9.8979920e-01],
        ]
    )
    probs = factory_model_performance.get_y_score(test_probs)
    assert probs.ndim == 2


def test_get_y_score_3d():
    test_probs = np.array(
        [
            [
                [0.80465788, 0.19534212],
                [0.94292979, 0.05707021],
                [0.33439024, 0.66560976],
            ],
            [
                [0.33439024, 0.66560976],
                [0.9949298, 0.0050702],
                [0.99459238, 0.00540762],
            ],
            [
                [0.97472981, 0.02527019],
                [0.25069129, 0.74930871],
                [0.33439024, 0.66560976],
            ],
        ]
    )
    probs = factory_model_performance.get_y_score(test_probs)
    assert probs.ndim == 2
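

# Sketch of the contract the two tests above pin down: get_y_score reduces the
# (n_labels, n_samples, 2) stacks produced by sklearn multilabel predict_proba
# to a 2d (n_samples, n_labels) score matrix, and passes 2d input through.
# One plausible implementation -- an assumption for illustration, not
# necessarily the library's actual code -- keeps the positive-class column:
def _get_y_score_sketch(probs):
    if probs.ndim == 3:
        # P(label == 1) for each label, with samples moved onto axis 0
        return np.transpose(probs[:, :, 1])
    return probs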
--------------------------------------------------------------------------------
/tests/test_multilabel_pipeline.py:
--------------------------------------------------------------------------------
from unittest.mock import Mock, patch

import pytest

from pxtextmining.params import major_cats, minor_cats
from pxtextmining.pipelines import multilabel_pipeline


@pytest.mark.parametrize("target", [major_cats, minor_cats])
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_analysis")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_preds")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_multilabel_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.predict_multilabel_sklearn")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_thresholds")
@patch("pxtextmining.pipelines.multilabel_pipeline.search_sklearn_pipelines")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_test_split")
@patch("pxtextmining.pipelines.multilabel_pipeline.process_and_split_data", create=True)
@patch("pxtextmining.pipelines.multilabel_pipeline.load_multilabel_data")
@pytest.mark.parametrize("custom_threshold", [True, False])
def test_sklearn_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_traintestsplit,
    mock_skpipeline,
    mock_threshold,
    mock_predict,
    mock_metrics,
    mock_write,
    mock_writepreds,
    mock_writeanalysis,
    custom_threshold,
    target,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_traintestsplit.return_value = (1, 2, 3, 4)
    mock_skpipeline.return_value = ([Mock()], ["training_time"])

    # act
    multilabel_pipeline.run_sklearn_pipeline(
        target=target, include_analysis=True, custom_threshold=custom_threshold
    )

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_skpipeline.assert_called_once()
    mock_metrics.assert_called_once()
    mock_predict.assert_called_once()
    mock_write.assert_called_once()
    mock_writepreds.assert_called_once()
    mock_writeanalysis.assert_called_once()
    if custom_threshold is True:
        mock_traintestsplit.assert_called_once()
        mock_threshold.assert_called_once()


@pytest.mark.parametrize("target", [major_cats, minor_cats])
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_analysis")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_preds")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_multilabel_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.predict_multilabel_sklearn")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_thresholds")
@patch("pxtextmining.pipelines.multilabel_pipeline.create_and_train_svc_model")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_test_split")
@patch("pxtextmining.pipelines.multilabel_pipeline.process_and_split_data", create=True)
@patch("pxtextmining.pipelines.multilabel_pipeline.load_multilabel_data")
@pytest.mark.parametrize("custom_threshold", [True, False])
def test_svc_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_traintestsplit,
    mock_svc_model,  # patches create_and_train_svc_model
    mock_threshold,
    mock_predict,
    mock_metrics,
    mock_write,
    mock_writepreds,
    mock_writeanalysis,
    target,
    custom_threshold,
):
    # arrange mocks
    mock_traintestsplit.return_value = (1, 2, 3, 4)
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_svc_model.return_value = (Mock(), "training_time")

    # act
    multilabel_pipeline.run_svc_pipeline(
        target=target, include_analysis=True, custom_threshold=custom_threshold
    )

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_svc_model.assert_called_once()
    mock_predict.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()
    mock_writepreds.assert_called_once()
    mock_writeanalysis.assert_called_once()
    if custom_threshold is True:
        mock_traintestsplit.assert_called_once()
        mock_threshold.assert_called_once()
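

# Worth noting for the signatures in this module: @patch mocks arrive
# positionally in bottom-up decorator order, while @pytest.mark.parametrize
# values ("target", "custom_threshold") are bound by name, so they can sit at
# the end of the argument list. A self-contained illustration (the os.getcwd
# target is arbitrary, chosen only for this demo):
@pytest.mark.parametrize("flag", [True, False])
@patch("os.getcwd")
def test_patch_with_parametrize_sketch(mock_getcwd, flag):
    assert isinstance(flag, bool)  # parametrized value, matched by name
    mock_getcwd.assert_not_called()  # mock, filled positionally first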


@pytest.mark.parametrize("target", [major_cats, minor_cats])
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_analysis")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_model_preds")
@patch("pxtextmining.pipelines.multilabel_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_multilabel_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.predict_multilabel_bert")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_bert_model")
@patch("pxtextmining.pipelines.multilabel_pipeline.create_bert_model")
@patch("pxtextmining.pipelines.multilabel_pipeline.calculating_class_weights")
@patch("pxtextmining.pipelines.multilabel_pipeline.bert_data_to_dataset")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_test_split")
@patch("pxtextmining.pipelines.multilabel_pipeline.process_and_split_data")
@patch("pxtextmining.pipelines.multilabel_pipeline.load_multilabel_data")
def test_bert_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_traintest,
    mock_bertdata,
    mock_classweights,
    mock_createbert,
    mock_trainbert,
    mock_predict,
    mock_metrics,
    mock_write,
    mock_writepreds,
    mock_writeanalysis,
    target,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_traintest.return_value = ("X_train_val", "X_test", "Y_train_val", "Y_test")
    mock_trainbert.return_value = (1, 2)

    # act
    multilabel_pipeline.run_bert_pipeline(target=target, include_analysis=True)

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_traintest.assert_called_once()
    mock_bertdata.assert_called()
    mock_classweights.assert_called_once()
    mock_createbert.assert_called_once()
    mock_trainbert.assert_called_once()
    mock_predict.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()
    mock_writepreds.assert_called_once()
    mock_writeanalysis.assert_called_once()


@pytest.mark.parametrize("target", [major_cats, minor_cats])
@patch("pxtextmining.pipelines.multilabel_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_multilabel_metrics")
@patch("pxtextmining.pipelines.multilabel_pipeline.predict_multilabel_bert")
@patch("pxtextmining.pipelines.multilabel_pipeline.get_thresholds")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_bert_model")
@patch(
    "pxtextmining.pipelines.multilabel_pipeline.create_bert_model_additional_features"
)
@patch("pxtextmining.pipelines.multilabel_pipeline.calculating_class_weights")
@patch("pxtextmining.pipelines.multilabel_pipeline.bert_data_to_dataset")
@patch("pxtextmining.pipelines.multilabel_pipeline.train_test_split")
@patch("pxtextmining.pipelines.multilabel_pipeline.process_and_split_data")
@patch("pxtextmining.pipelines.multilabel_pipeline.load_multilabel_data")
@pytest.mark.parametrize("custom_threshold", [True, False])
def test_bert_pipeline_additional_features(
    mock_dataload,
    mock_datasplit,
    mock_traintest,
    mock_bertdata,
    mock_classweights,
    mock_createbert,
    mock_trainbert,
    mock_thresholds,
    mock_predict,
    mock_metrics,
    mock_write,
    target,
    custom_threshold,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_traintest.return_value = ("X_train_val", "X_test", "Y_train_val", "Y_test")
    mock_trainbert.return_value = (Mock(), 2)

    # act
    multilabel_pipeline.run_bert_pipeline(
        target=target,
        additional_features=True,
        include_analysis=False,
        custom_threshold=custom_threshold,
    )

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_traintest.assert_called_once()
    mock_bertdata.assert_called()
    mock_classweights.assert_called_once()
    mock_createbert.assert_called_once()
    mock_trainbert.assert_called_once()
    mock_predict.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()
    if custom_threshold is True:
        mock_thresholds.assert_called_once()
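

# The custom_threshold assertions in this module all follow the same implied
# control flow: the extra train_test_split and get_thresholds calls happen
# only when a custom threshold is requested. The helper below is an
# illustrative sketch of that branching -- names and structure are
# assumptions, not pxtextmining's actual implementation:
def _custom_threshold_flow_sketch(custom_threshold, X, Y, split, fit_thresholds):
    if custom_threshold:
        # carve out a validation set and tune per-label thresholds on it
        X_train, X_val, Y_train, Y_val = split(X, Y)
        thresholds = fit_thresholds(X_val, Y_val)
    else:
        X_train, Y_train, thresholds = X, Y, None
    return X_train, Y_train, thresholds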
--------------------------------------------------------------------------------
/tests/test_sentiment_pipeline.py:
--------------------------------------------------------------------------------
from unittest.mock import patch

from pxtextmining.pipelines import sentiment_pipeline


@patch("pxtextmining.pipelines.sentiment_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.sentiment_pipeline.get_multiclass_metrics")
@patch("pxtextmining.pipelines.sentiment_pipeline.search_sklearn_pipelines")
@patch("pxtextmining.pipelines.sentiment_pipeline.process_and_split_data", create=True)
@patch("pxtextmining.pipelines.sentiment_pipeline.load_multilabel_data")
def test_sentiment_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_skpipeline,
    mock_metrics,
    mock_write,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_skpipeline.return_value = (["model"], ["training_time"])

    # act
    sentiment_pipeline.run_sentiment_pipeline()

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_skpipeline.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()


@patch("pxtextmining.pipelines.sentiment_pipeline.write_multilabel_models_and_metrics")
@patch("pxtextmining.pipelines.sentiment_pipeline.get_multiclass_metrics")
@patch("pxtextmining.pipelines.sentiment_pipeline.train_bert_model")
@patch("pxtextmining.pipelines.sentiment_pipeline.create_bert_model")
@patch("pxtextmining.pipelines.sentiment_pipeline.compute_class_weight")
@patch("pxtextmining.pipelines.sentiment_pipeline.bert_data_to_dataset")
@patch("pxtextmining.pipelines.sentiment_pipeline.train_test_split")
@patch("pxtextmining.pipelines.sentiment_pipeline.to_categorical")
@patch("pxtextmining.pipelines.sentiment_pipeline.process_and_split_data")
@patch("pxtextmining.pipelines.sentiment_pipeline.load_multilabel_data")
def test_bert_pipeline(
    mock_dataload,
    mock_datasplit,
    mock_categorical,
    mock_traintest,
    mock_bertdata,
    mock_classweights,
    mock_createbert,
    mock_trainbert,
    mock_metrics,
    mock_write,
):
    # arrange mocks
    mock_datasplit.return_value = (1, 2, 3, 4)
    mock_traintest.return_value = ("X_train_val", "X_test", "Y_train_val", "Y_test")
    mock_trainbert.return_value = (1, 2)

    # act
    sentiment_pipeline.run_sentiment_bert_pipeline(additional_features=False)

    # assert
    mock_dataload.assert_called_once()
    mock_datasplit.assert_called_once()
    mock_categorical.assert_called_once()
    mock_traintest.assert_called_once()
    mock_bertdata.assert_called()
    mock_classweights.assert_called_once()
    mock_createbert.assert_called_once()
    mock_trainbert.assert_called_once()
    mock_metrics.assert_called_once()
    mock_write.assert_called_once()
@patch("pxtextmining.pipelines.sentiment_pipeline.to_categorical") 86 | @patch("pxtextmining.pipelines.sentiment_pipeline.process_and_split_data") 87 | @patch("pxtextmining.pipelines.sentiment_pipeline.load_multilabel_data") 88 | def test_bert_pipeline_additional_features( 89 | mock_dataload, 90 | mock_datasplit, 91 | mock_categorical, 92 | mock_traintest, 93 | mock_bertdata, 94 | mock_classweights, 95 | mock_createbert, 96 | mock_trainbert, 97 | mock_metrics, 98 | mock_write, 99 | ): 100 | # arrange mocks 101 | mock_datasplit.return_value = (1, 2, 3, 4) 102 | mock_traintest.return_value = ("X_train_val", "X_test", "Y_train_val", "Y_test") 103 | mock_trainbert.return_value = (1, 2) 104 | mock_classweights.return_value = [0.5, 0.2] 105 | 106 | # act 107 | sentiment_pipeline.run_sentiment_bert_pipeline(additional_features=True) 108 | 109 | # assert 110 | mock_dataload.assert_called_once() 111 | mock_datasplit.assert_called_once() 112 | mock_categorical.assert_called_once() 113 | mock_traintest.assert_called_once() 114 | mock_bertdata.assert_called() 115 | mock_classweights.assert_called_once() 116 | mock_createbert.assert_called_once() 117 | mock_trainbert.assert_called_once() 118 | mock_metrics.assert_called_once() 119 | mock_write.assert_called_once() 120 | -------------------------------------------------------------------------------- /tests/test_write_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.mock import Mock, mock_open, patch 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from sklearn.dummy import DummyClassifier 8 | from tensorflow.keras import Model 9 | 10 | from pxtextmining.factories import factory_write_results 11 | 12 | 13 | @patch("pxtextmining.factories.factory_write_results.pickle.dump", Mock()) 14 | @patch( 15 | "builtins.open", 16 | new_callable=mock_open, 17 | read_data="somestr", 18 | ) 19 | @pytest.mark.parametrize("models", [[Mock(spec=Model)], [Mock(spec=DummyClassifier)]]) 20 | def test_write_multilabel_models_and_metrics(mock_file, tmp_path_factory, models): 21 | # arrange 22 | models = models 23 | model_metrics = ["somestr"] 24 | path = tmp_path_factory.mktemp("somepath") 25 | # act 26 | factory_write_results.write_multilabel_models_and_metrics( 27 | models, model_metrics, path 28 | ) 29 | # assert 30 | if isinstance(models[0], Model): 31 | models[0].save.assert_called_once() 32 | mock_file.assert_called_with(os.path.join(path, "model_0.txt"), "w") 33 | assert open(os.path.join("somepath", "model_0.txt")).read() == "somestr" 34 | 35 | 36 | @patch( 37 | "builtins.open", 38 | new_callable=mock_open, 39 | read_data="somestr", 40 | ) 41 | @patch("pxtextmining.factories.factory_write_results.os.makedirs") 42 | def test_write_multilabel_models_and_metrics_nopath( 43 | mock_makedirs, mock_file_open, tmp_path 44 | ): 45 | # arrange 46 | models = [Mock(spec=Model)] 47 | model_metrics = ["somestr"] 48 | path = "somepath" 49 | # act 50 | factory_write_results.write_multilabel_models_and_metrics( 51 | models, model_metrics, path 52 | ) 53 | # assert 54 | mock_makedirs.assert_called_once_with(path) 55 | 56 | 57 | @patch("pxtextmining.factories.factory_write_results.pd.DataFrame.to_excel") 58 | def test_write_model_preds_sklearn(mock_toexcel, grab_test_X_additional_feats): 59 | x = grab_test_X_additional_feats["FFT answer"] 60 | # arrange 61 | y_true = np.array( 62 | [ 63 | [0.0, 1.0, 0.0, 0.0, 0.0], 64 | [1.0, 0.0, 0.0, 1.0, 1.0], 65 | [1.0, 0.0, 0.0, 0.0, 0.0], 66 | [0.0, 


@patch("pxtextmining.factories.factory_write_results.pd.DataFrame.to_excel")
def test_write_model_preds_sklearn(mock_toexcel, grab_test_X_additional_feats):
    x = grab_test_X_additional_feats["FFT answer"]
    # arrange
    y_true = np.array(
        [
            [0.0, 1.0, 0.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 1.0, 1.0],
            [1.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0],
        ]
    )
    labels = ["one", "two", "three", "four", "five"]
    probs_labels = ["Probability of " + label for label in labels]
    preds_df = pd.DataFrame(
        np.array(
            [
                [0.0, 1.0, 0.0, 1.0, 0.0, 0.1, 0.6, 0.2, 0.7, 0.05],
                [1.0, 0.0, 0.0, 1.0, 0.0, 0.55, 0.2, 0.3, 0.8, 0.4],
                [1.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.3, 0.2, 0.3, 0.1],
                [1.0, 0.0, 1.0, 1.0, 0.0, 0.7, 0.2, 0.8, 0.9, 0.0],
                [0.0, 0.0, 0.0, 0.0, 1.0, 0.2, 0.4, 0.2, 0.1, 0.6],
            ]
        ),
        columns=labels + probs_labels,
        index=grab_test_X_additional_feats.index,
    )
    preds_df["labels"] = [
        ["two", "four"],
        ["one", "four"],
        ["one"],
        ["one", "three", "four"],
        ["five"],
    ]
    path = "somepath.xlsx"
    # act
    df = factory_write_results.write_model_preds(
        x, y_true, preds_df, labels, path=path, return_df=True
    )
    # assert
    assert df.shape[0] == len(x)
    mock_toexcel.assert_called()


@patch("pxtextmining.factories.factory_write_results.pd.DataFrame.to_excel")
@patch("pxtextmining.factories.factory_write_results.parse_metrics_file")
def test_write_model_analysis(
    mock_parsemetrics,
    mock_toexcel,
    grab_preds_df,
):
    mock_parsemetrics.return_value = pd.DataFrame(
        {
            "label": {0: "one", 1: "two", 2: "three", 3: "four", 4: "five"},
            "precision": {0: 0.46, 1: 0.54, 2: 0.52, 3: 0.54, 4: 0.52},
            "recall": {0: 0.43, 1: 0.82, 2: 0.65, 3: 0.82, 4: 0.65},
            "f1_score": {0: 0.44, 1: 0.65, 2: 0.58, 3: 0.65, 4: 0.58},
            "support (label count in test data)": {
                0: 129,
                1: 115,
                2: 20,
                3: 115,
                4: 20,
            },
        }
    )
    labels = ["one", "two", "three", "four", "five"]
    dataset = grab_preds_df.copy()
    preds_df = grab_preds_df
    y_true = np.array(grab_preds_df[labels])

    factory_write_results.write_model_analysis(
        "model_name",
        labels=labels,
        dataset=dataset,
        path="somepath",
        preds_df=preds_df,
        y_true=y_true,
        custom_threshold_dict=None,
    )
    mock_toexcel.assert_called_once()
--------------------------------------------------------------------------------