├── .flake8 ├── .gcloudignore ├── .github └── workflows │ ├── CodeCov.yml │ └── QuartoDocs.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── README.md ├── app.yaml ├── app ├── Welcome.py └── pages │ ├── 1_SIC_&_SOC_pre-defined_coding_assistant.py │ ├── 2_Setup_custom_coding_assistant.py │ └── 3_Test_custom_coding_assistant.py ├── cloud_deploy.sh ├── codecov.yml ├── docs ├── _quarto.yml ├── _static │ ├── app-ui.png │ └── sic-soc-llm.png ├── index.qmd ├── method.qmd └── tutorials │ ├── 1_sic_data_structure.qmd │ ├── 2_sic_classifier.qmd │ ├── 3_soc_classifier.qmd │ ├── 4_custom_coicop_classifier.qmd │ └── index.qmd ├── pyproject.toml ├── src └── sic_soc_llm │ ├── __init__.py │ ├── _config │ ├── __init__.py │ ├── main.py │ └── sic_soc_llm_config.toml │ ├── data_models │ ├── __init__.py │ ├── response_model.py │ ├── sicDB.py │ ├── sic_data_access.py │ ├── sic_hierarchy.py │ └── sic_meta_model.py │ ├── embedding.py │ ├── example_data │ ├── coicop_5d_condensed.txt │ ├── sic_2d_condensed.txt │ ├── sic_4d_condensed.txt │ ├── soc_4d_condensed.txt │ └── toy_index.txt │ ├── llm.py │ ├── logs.py │ └── prompt.py └── tests ├── test_classification_llm.py ├── test_embedding.py └── test_sic_data_structure.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Rule definitions: http://flake8.pycqa.org/en/latest/user/error-codes.html 3 | # D203: 1 blank line required before class docstring 4 | # W503: line break before binary operator 5 | exclude = venv*,__pycache__,node_modules,bower_components,migrations 6 | ignore = D203,W503 7 | max-complexity = 9 8 | max-line-length = 88 9 | extend-ignore = E203 10 | -------------------------------------------------------------------------------- /.gcloudignore: -------------------------------------------------------------------------------- 1 | # Ignore all files and folders 2 | * 3 | 4 | # Allow selected files and folders 5 | !app.yaml 6 | !Dockerfile 7 | 
!pyproject.toml 8 | !app/ 9 | !src/ 10 | src/* 11 | !src/sic_soc_llm/ 12 | !data/ 13 | data/* 14 | !data/sic-index/ 15 | !data/soc-index/ 16 | !data/coicop-index/ 17 | !data/custom-index/ 18 | data/custom-index/* 19 | !data/custom-index/example-index.txt 20 | 21 | # ignore pycache 22 | **/__pycache__/ 23 | -------------------------------------------------------------------------------- /.github/workflows/CodeCov.yml: -------------------------------------------------------------------------------- 1 | name: CodeCov 2 | on: 3 | pull_request: 4 | branches: [develop, main] 5 | push: 6 | branches: [develop, main] 7 | 8 | jobs: 9 | build: 10 | name: Pytest & Coverage 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout Repo 15 | uses: actions/checkout@v4 16 | 17 | - name: Setup Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: "3.10" 21 | cache: "pip" 22 | 23 | - name: Run Pre-commit 24 | uses: pre-commit/action@v3.0.1 25 | 26 | - name: Install Dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install -e ".[test]" 30 | 31 | - name: Run Pytest and Generate Report 32 | run: | 33 | python -m pip install coverage[toml] 34 | coverage run -m pytest 35 | 36 | - name: Upload Coverage Reports to Codecov 37 | uses: codecov/codecov-action@v4 38 | with: 39 | token: ${{ secrets.CODECOV_TOKEN }} 40 | fail_ci_if_error: false 41 | -------------------------------------------------------------------------------- /.github/workflows/QuartoDocs.yml: -------------------------------------------------------------------------------- 1 | name: QuartoDocs 2 | on: 3 | pull_request: 4 | branches: develop 5 | 6 | jobs: 7 | build-deploy: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | contents: write 11 | pages: write 12 | steps: 13 | - name: Check out repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Set up Quarto 17 | uses: quarto-dev/quarto-actions/setup@v2 18 | 19 | - name: Setup Python 20 | uses: 
actions/setup-python@v5 21 | with: 22 | python-version: "3.10" 23 | cache: "pip" 24 | 25 | - name: Install Dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m pip install -e ".[docs]" 29 | 30 | - name: Build API reference 31 | run: | 32 | cd docs 33 | python -m quartodoc build 34 | 35 | - name: Publish to gh-pages 36 | uses: quarto-dev/quarto-actions/publish@v2 37 | with: 38 | target: gh-pages 39 | render: true 40 | path: docs 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Data 10 | db/ 11 | *.db 12 | *.csv 13 | *.txt 14 | *.json 15 | *.pkl 16 | *.parquet 17 | *.bin 18 | *.faiss 19 | *.xlsx 20 | *.xls 21 | *.png 22 | *.ods 23 | *.pickle 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | *.DS_Store 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | # Project-specific 58 | *.csv 59 | 60 | htmlcov/ 61 | .tox/ 62 | .nox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | *.py,cover 70 | .hypothesis/ 71 | .pytest_cache/ 72 | cover/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | local_settings.py 81 | *.sqlite3 82 | *.sqlite3-journal 83 | 84 | # Flask stuff: 85 | instance/ 86 | .webassets-cache 87 | 88 | # Scrapy stuff: 89 | .scrapy 90 | 91 | # Sphinx documentation 92 | docs/_build/ 93 | 94 | # PyBuilder 95 | .pybuilder/ 96 | target/ 97 | 98 | # Jupyter Notebook 99 | .ipynb_checkpoints 100 | *.ipynb 101 | 102 | # IPython 103 | profile_default/ 104 | ipython_config.py 105 | 106 | # pyenv 107 | # For a library or package, you might want to ignore these files since the code is 108 | # intended to run in multiple environments; otherwise, check them in: 109 | # .python-version 110 | 111 | # pipenv 112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 115 | # install all needed dependencies. 116 | #Pipfile.lock 117 | 118 | # poetry 119 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 120 | # This is especially recommended for binary packages to ensure reproducibility, and is more 121 | # commonly ignored for libraries. 122 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 123 | #poetry.lock 124 | 125 | # pdm 126 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
127 | #pdm.lock 128 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 129 | # in version control. 130 | # https://pdm.fming.dev/#use-with-ide 131 | .pdm.toml 132 | 133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 134 | __pypackages__/ 135 | 136 | # Celery stuff 137 | celerybeat-schedule 138 | celerybeat.pid 139 | 140 | # SageMath parsed files 141 | *.sage.py 142 | 143 | # Environments 144 | .env 145 | .venv 146 | env/ 147 | venv/ 148 | ENV/ 149 | env.bak/ 150 | venv.bak/ 151 | 152 | # Spyder project settings 153 | .spyderproject 154 | .spyproject 155 | 156 | # Rope project settings 157 | .ropeproject 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # quarto 163 | _site/ 164 | .quarto/ 165 | /docs/reference/ 166 | /docs/.gitignore 167 | 168 | # mypy 169 | .mypy_cache/ 170 | .dmypy.json 171 | dmypy.json 172 | 173 | # Pyre type checker 174 | .pyre/ 175 | 176 | # pytype static type analyzer 177 | .pytype/ 178 | 179 | # Cython debug symbols 180 | cython_debug/ 181 | 182 | # PyCharm 183 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 184 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 185 | # and can be added to the global gitignore or merged into this file. For a more nuclear 186 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
187 | #.idea/ 188 | 189 | # VSCode 190 | .vscode/ 191 | 192 | #config 193 | config/ 194 | 195 | #output figures 196 | *.svg 197 | *.pdf 198 | *.png 199 | *.jpg 200 | 201 | #llm model files 202 | *.gguf 203 | *.llamafile 204 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: "src/sic_soc_llm/data_models/sicDB.py" 4 | repos: 5 | - repo: https://github.com/kynan/nbstripout 6 | rev: 0.6.1 7 | hooks: 8 | - id: nbstripout 9 | name: nbstripout - Strip outputs from notebooks (auto-fixes) 10 | args: 11 | - --extra-keys 12 | - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" 13 | - repo: https://github.com/pre-commit/pre-commit-hooks 14 | rev: v4.4.0 15 | hooks: 16 | - id: check-added-large-files 17 | name: Check for files larger than 5 MB 18 | args: [ "--maxkb=5120" ] 19 | - id: end-of-file-fixer 20 | name: Check for a blank line at the end of scripts (auto-fixes) 21 | exclude: '\.Rd' 22 | - id: trailing-whitespace 23 | name: Check for trailing whitespaces (auto-fixes) 24 | - repo: https://github.com/pycqa/isort 25 | rev: 5.12.0 26 | hooks: 27 | - id: isort 28 | name: isort - Sort Python imports (auto-fixes) 29 | types: [ cython, pyi, python ] 30 | args: [ "--profile", "black", "--filter-files" ] 31 | - repo: https://github.com/psf/black 32 | rev: 23.3.0 # Replace by any tag/version: https://github.com/psf/black/tags 33 | hooks: 34 | - id: black 35 | name: black - consistent Python code formatting (auto-fixes) 36 | language_version: python # Should be a command that runs python3.6+ 37 | - repo: https://github.com/PyCQA/flake8 38 | rev: 6.0.0 39 | hooks: 40 | - id: flake8 41 | name: flake8 - Python linting 42 | - repo: 
https://github.com/nbQA-dev/nbQA 43 | rev: 1.6.4 44 | hooks: 45 | - id: nbqa-isort 46 | name: nbqa-isort - Sort Python imports (notebooks; auto-fixes) 47 | args: [ --nbqa-mutate ] 48 | additional_dependencies: [ isort==5.8.0 ] 49 | - id: nbqa-black 50 | name: nbqa-black - consistent Python code formatting (notebooks; auto-fixes) 51 | args: [ --nbqa-mutate ] 52 | additional_dependencies: [ black==22.3.0 ] 53 | # TODO: Disabled for now until it's clear how to add noqa to specific cells of a Jupyter notebook 54 | #- id: nbqa-flake8 55 | # name: nbqa-flake8 - Python linting (notebooks) 56 | # additional_dependencies: [ flake8==3.9.2 ] 57 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.4-buster 2 | ENV PYTHONUNBUFFERED True 3 | 4 | # Copy app code 5 | WORKDIR /sic-soc 6 | COPY . ./ 7 | RUN ls -laRt 8 | 9 | # Upgrade pip and install requirements 10 | RUN python -m pip install --upgrade pip 11 | RUN python -m pip install pysqlite3-binary 12 | RUN python -m pip install -e ".[app]" --no-cache-dir 13 | 14 | # Expose port you want your app on 15 | ENV PORT=8080 16 | ENV HOSTNAME="0.0.0.0" 17 | EXPOSE 8080 18 | HEALTHCHECK CMD curl --fail http://localhost:8080/_stcore/health 19 | 20 | # Run 21 | ENTRYPOINT ["streamlit", "run", "app/Welcome.py", "--server.port=8080", "--server.address=0.0.0.0"] 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Data Science Campus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SIC-SOC-LLM 2 | 3 |

4 | Repository status 5 | Code stability 6 | MacOS 7 | codecov 8 |

9 | 10 | 11 | ## Overview 12 | 13 | This app/package has been created by the [Data Science Campus](https://datasciencecampus.ons.gov.uk/) as a proof of concept to evaluate the potential of Large Language Models (LLMs) to assist 14 | with classification coding. It uses the `LangChain` library to perform Retrieval Augmented Generation (RAG) based on the provided classification index. A special case of Standard Industrial Classification (SIC) coding has been used as the primary test case; see [method explanation](https://datasciencecampus.github.io/sic-soc-llm/method.html#method). An example deployment using `Streamlit` allows for interactive exploration of the model's capabilities. 15 | 16 | ## Data sources 17 | 18 | Examples of simplified SIC, Standard Occupational Classification (SOC) and Classification of Individual Consumption According to Purpose (COICOP) are included in the `example_data` folder. These condensed indices are flattened subsets of more detailed indices officially published online, such as the [UK SIC 2007](https://www.ons.gov.uk/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007), [UK SOC 2020](https://www.ons.gov.uk/methodology/classificationsandstandards/standardoccupationalclassificationsoc/soc2020), and [COICOP 2018 (pdf)](https://unstats.un.org/unsd/classifications/unsdclassifications/COICOP_2018_-_pre-edited_white_cover_version_-_2018-12-26.pdf). 19 | 20 | > ⚠️ **Warning:** The example data is provided for demonstration purposes only. No guarantee is given for its accuracy or up-to-date status. 21 | 22 | In this project, we focused on the SIC. A flexible representation of this hierarchical index (including metadata) has been implemented within the `data_models` submodule, enabling enhanced context for RAG/LLM. This representation can be used independently for other SIC coding tasks or easily extended to accommodate different classification indices. 
23 | 24 | The SIC index hierarchy object is built using three data sources provided by ONS: 25 | 26 | - [Published UK SIC summary of structure worksheet (xlsx)](https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/publisheduksicsummaryofstructureworksheet.xlsx) - location needs to be specified in config 27 | 28 | - [UK SIC2007 indexes with addendum December 2022 (xlsx)](https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/uksic2007indexeswithaddendumdecember2022.xlsx) - location needs to be specified in config 29 | 30 | - [SIC resource file by ONSdigital/dp-classification-tools (js)](https://github.com/ONSdigital/dp-classification-tools/blob/develop/standard-industrial-classification/data/sicDB.js) - included inside the package 31 | 32 | 33 | ## Installation 34 | 35 | ### 1. Virtual environment 36 | 37 | It is recommended that you install the project with its required dependencies in a virtual environment. When the virtual environment is activated, any subsequent Python commands will use the Python interpreter and libraries specific to that isolated environment. This ensures that the project uses the correct versions of the dependencies specified in its requirements. 38 | 39 | Create and activate a new virtual environment on Linux/OS X: 40 | 41 | ```{shell} 42 | python3.10 -m venv .venv 43 | source .venv/bin/activate 44 | ``` 45 | 46 | 47 | ### 2. Requirements 48 | Update pip and install requirements: 49 | ``` 50 | python -m pip install --upgrade pip 51 | python -m pip install -e ".[dev]" 52 | ``` 53 | The -e flag installs the project in "editable" mode, which means that any changes made to the project code will be reflected immediately without the need to reinstall. The ".[dev]" part specifies that both the regular requirements and the development requirements should be installed. 54 | 55 | ### 3. 
LLM authentication: 56 | 57 | The package provides code to use popular LLMs; access to the LLMs is a prerequisite for use. Depending on your choice, keys/credentials may need to be added, for example: 58 | 59 | - Include a personal [OpenAI](https://openai.com/) API key in `.env` as 60 | 61 | ```{shell} 62 | OPENAI_API_KEY="" 63 | ``` 64 | 65 | - Authenticate for [Vertex AI](https://cloud.google.com/model-garden?hl=en): 66 | 67 | ```{shell} 68 | gcloud config set project "" 69 | gcloud auth application-default login 70 | ``` 71 | 72 | 73 | 74 | 75 | ## Usage 76 | 77 | Examples of how to use the `sic-soc-llm` package can be found in [Tutorials](https://datasciencecampus.github.io/sic-soc-llm/tutorials/) and [References](https://datasciencecampus.github.io/sic-soc-llm/reference/). 78 | 79 | ### Configuration 80 | 81 | The `sic-soc-llm` package uses a configuration file in TOML format to specify the paths to the data files and the names of the models to use. An example configuration file is provided in `sic_soc_llm_config.toml` and is read by the [`get_config`](https://datasciencecampus.github.io/sic-soc-llm/reference/get_config.html) function. The following fields are required: 82 | 83 | | Field | Type | Default value | 84 | | --- | --- | --- | 85 | | [lookups]| | | 86 | | sic_structure | str | "data/sic-index/publisheduksicsummaryofstructureworksheet.xlsx" | 87 | | sic_index | str | "data/sic-index/uksic2007indexeswithaddendumdecember2022.xlsx" | 88 | | sic_condensed | str | "sic_2d_condensed.txt" | 89 | | soc_condensed | str | "soc_4d_condensed.txt" | 90 | | coicop_condensed | str | "coicop_5d_condensed.txt" | 91 | | [llm]| | | 92 | | db_dir | str | "data/sic-index/db" | 93 | | embedding_model_name | str | "all-MiniLM-L6-v2" | 94 | | llm_model_name | str | "gemini-pro" | 95 | 96 | 97 | Make sure to update the file paths and model names according to your specific setup. 
While the condensed indexes (`.txt`) are included in the package, the `.xlsx` files need to be downloaded from the ONS website (mentioned above) and placed in the specified locations. 98 | 99 | ### Run and deploy Streamlit app 100 | 101 | To run the Streamlit app, use the following command: 102 | 103 | ```{shell} 104 | streamlit run app/Welcome.py --server.port 8500 105 | ``` 106 | 107 | The app will be available at `http://localhost:8500/`. 108 | 109 | 110 | Example commands used to build and deploy the app as a GCP Cloud Run service are provided in `cloud_deploy.sh` (which references `Dockerfile` and `app.yaml`). The `Dockerfile` contains a set of instructions for building a Docker image. It specifies the base image to use, the files and directories to include, the dependencies and the commands to run. The `app.yaml` file is used to specify the configuration of the Cloud Run service, including the container image to deploy, the service name, and the port to expose. 111 | 112 | 113 | ## Development and testing 114 | 115 | ### 1. Pre-commit actions 116 | 117 | This repository contains a configuration of pre-commit hooks. If approaching this project as a developer, you are encouraged to install and enable `pre-commits` by running the following in your shell: 118 | 119 | ``` 120 | pip install pre-commit 121 | pre-commit install 122 | ``` 123 | 124 | ### 2. Unit tests 125 | 126 | To run the unit tests, use the following command: 127 | 128 | ```{shell} 129 | python -m pytest 130 | ``` 131 | 132 | 133 | ### 3. Building documentation and webpage: 134 | 135 | 136 | 1. Build (Quarto markdown) `reference` files from docstrings: 137 | 138 | ```{shell} 139 | cd docs 140 | python -m quartodoc build 141 | ``` 142 | 143 | 2. Render webpage from Quarto markdowns in `docs` dir (including `reference` files): 144 | 145 | ```{shell} 146 | quarto render 147 | ``` 148 | 149 | ## License 150 | 151 | The code, unless otherwise stated, is released under [the MIT Licence][mit]. 
152 | The documentation for this work is subject to [© 2024 Crown Copyright (Office for National Statistics)][copyright] and is available under the terms of the [Open Government 3.0][ogl] licence. 153 | 154 | [mit]: https://github.com/datasciencecampus/sic-soc-llm?tab=MIT-1-ov-file 155 | [copyright]: http://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/ 156 | [ogl]: http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/ 157 | 158 | ## Data Science Campus 159 | At the [Data Science Campus](https://datasciencecampus.ons.gov.uk/about-us/) we apply data science, and build skills, for public good across the UK and internationally. Get in touch with the Campus at [datasciencecampus@ons.gov.uk](mailto:datasciencecampus@ons.gov.uk). 160 | -------------------------------------------------------------------------------- /app.yaml: -------------------------------------------------------------------------------- 1 | runtime: custom 2 | env: flex 3 | service: default 4 | resources: 5 | disk_size_gb: 25 6 | memory_gb: 4 7 | -------------------------------------------------------------------------------- /app/Welcome.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from sic_soc_llm import setup_logging 3 | 4 | logger = setup_logging("streamlit_app") 5 | 6 | st.set_page_config( 7 | page_title="SIC/SOC LLM assistant", 8 | page_icon="🐥", 9 | ) 10 | 11 | st.subheader("LLM assisted classification coding", divider=True) 12 | 13 | st.markdown( 14 | """This app/package has been created by the Data Science Campus 15 | as a proof of concept to evaluate Large Language Models (LLM) potential to assist 16 | with classification coding. 17 | It provides an example of using pre-trained LLM models to assist with 18 | Standard Industrial Classification (SIC) and Standard Occupational Classification 19 | (SOC) coding. 
It also provides a way to set up and test a custom index. 20 | """ 21 | ) 22 | 23 | st.markdown( 24 | """ 25 | - Source code: [github/datasciencecampus/sic-soc-llm]( 26 | https://github.com/datasciencecampus/sic-soc-llm) 27 | - Documentation & references: 28 | [github.io docs](https://datasciencecampus.github.io/sic-soc-llm/docs) 29 | - Website: [Data Science Campus](https://datasciencecampus.ons.gov.uk/) 30 | - Email: [Data Science Campus](mailto:datasciencecampus@ons.gov.uk) 31 | """ 32 | ) 33 | 34 | st.subheader( 35 | "Use the sidebar on the left to navigate to any of the pages:", divider=True 36 | ) 37 | st.markdown( 38 | """ 39 | 1. **SIC & SOC pre-defined coding assistant** 40 | 41 | Enter respondent data into the input fields and press the validate button. 42 | The response and debugging info will be displayed below. 43 | Preloaded classification indices are used. 44 | 45 | 2. **Setup custom LLM coding assistant** 46 | 47 | You can customise your own classification assistant by uploading your own index and 48 | specifying survey fields. This will be used in a Retrieval Augmented 49 | Generation (RAG) pipeline. 50 | 51 | 3. **Test custom LLM coding assistant** 52 | 53 | After setting up a custom index and survey fields, you can test 54 | your custom LLM coding assistant. 
55 | """ 56 | ) 57 | -------------------------------------------------------------------------------- /app/pages/1_SIC_&_SOC_pre-defined_coding_assistant.py: -------------------------------------------------------------------------------- 1 | import dotenv 2 | import streamlit as st 3 | from sic_soc_llm.llm import ClassificationLLM 4 | from sic_soc_llm.embedding import EmbeddingHandler 5 | 6 | st.set_page_config( 7 | page_title="SIC/SOC LLM assistant", 8 | page_icon="🐥", 9 | ) 10 | 11 | if st.session_state.get("open_ai_key") is None: 12 | try: 13 | openai_api_key = dotenv.dotenv_values(".env")["OPENAI_API_KEY"] 14 | st.session_state["open_ai_key"] = openai_api_key 15 | except Exception as e: 16 | st.session_state["open_ai_key"] = str(e) 17 | 18 | embed = EmbeddingHandler() # Loaded once, used twice 19 | uni_chat = ClassificationLLM("gemini-pro", embedding_handler=embed) # "text-unicorn" 20 | gpt_chat = ClassificationLLM( 21 | "gpt-4", openai_api_key=st.session_state["open_ai_key"], embedding_handler=embed 22 | ) 23 | 24 | 25 | # Ask the user for the OpenAI API key if they want to use it remotely 26 | ai_key_enter = st.sidebar.text_input( 27 | "If you want to use GPT, update OpenAI API key", 28 | value="", 29 | type="password", 30 | on_change=None, 31 | ) 32 | if ai_key_enter: 33 | st.session_state["open_ai_key"] = ai_key_enter 34 | gpt_chat = ClassificationLLM( 35 | "gpt-4", openai_api_key=st.session_state["open_ai_key"], embedding_handler=embed 36 | ) 37 | st.sidebar.success("OpenAI API key updated successfully") 38 | 39 | 40 | # Streamlit app 41 | def main(verbose: bool = True): # noqa: C901 42 | st.subheader("LLM assisted SIC/SOC Coding", divider=True) 43 | st.subheader("Respondent data - survey fields", divider=True) 44 | # Job Title and Description inputs 45 | job_title = st.text_input("Job Title") 46 | job_description = st.text_area("Job Description") 47 | manage_others = st.toggle("Line management responsibility") 48 | 49 | # Level of Education input 50 | 
education_levels = [ 51 | "No formal qualifications", 52 | "Level 1: one to four GCSE passes (grade A* to C or grade 4 and above)" 53 | + " and any other GCSEs at other grades, or equivalent qualifications", 54 | "Level 2: five or more GCSE passes (grade A* to C or grade 4 and above)" 55 | + " or equivalent qualifications", 56 | "Level 3: two or more A Levels or equivalent qualifications", 57 | "Level 4 or above: Higher National Certificate, Higher National Diploma," 58 | + " Bachelor's degree, or post-graduate qualifications", 59 | "Other qualifications, of unknown level", 60 | ] 61 | level_of_education = st.selectbox("Level of Education", education_levels) 62 | industry_descr = st.text_area("What does the organisation mainly make or do?") 63 | 64 | buttons = {} 65 | st.subheader("Validate using LLM (one-shot)", divider=True) 66 | col1, col2 = st.columns(2) 67 | buttons["soc_uni"] = col1.button( 68 | "Validate input for SOC using Gemini ⛋" 69 | ) # Palm2 🦄") 70 | buttons["soc_gpt"] = col2.button("Validate input for SOC using GPT ⚛") 71 | buttons["sic_uni"] = col1.button( 72 | "Validate input for SIC using Gemini ⛋" 73 | ) # Palm2 🦄") 74 | buttons["sic_gpt"] = col2.button("Validate input for SIC using GPT ⚛") 75 | # Add buttons for rag 76 | st.subheader("Validate SIC using LLM (RAG)", divider=True) 77 | col3, col4 = st.columns(2) 78 | buttons["rag_uni"] = col3.button( 79 | "Validate input for SIC using RAG Gemini ⛋" 80 | ) # Palm2 🦄") 81 | buttons["rag_gpt"] = col4.button("Validate input for SIC using RAG GPT ⚛") 82 | 83 | if any(buttons.values()): 84 | if job_title or job_description or industry_descr: 85 | if buttons["soc_uni"]: 86 | response = uni_chat.get_soc_code( 87 | job_title, 88 | job_description, 89 | level_of_education, 90 | manage_others, 91 | industry_descr, 92 | ) 93 | elif buttons["soc_gpt"]: 94 | response = gpt_chat.get_soc_code( 95 | job_title, 96 | job_description, 97 | level_of_education, 98 | manage_others, 99 | industry_descr, 100 | ) 101 | 
elif buttons["sic_uni"]: 102 | response = uni_chat.get_sic_code( 103 | industry_descr, job_title, job_description 104 | ) 105 | elif buttons["sic_gpt"]: 106 | response = gpt_chat.get_sic_code( 107 | industry_descr, job_title, job_description 108 | ) 109 | elif buttons["rag_uni"]: 110 | response, _, _ = uni_chat.rag_sic_code( 111 | industry_descr, job_title, job_description 112 | ) 113 | else: 114 | response, _, _ = gpt_chat.rag_sic_code( 115 | industry_descr, job_title, job_description 116 | ) 117 | 118 | if response.codable & ("soc_code" in response.model_fields): 119 | st.success( 120 | f"👍 Coded as {response.soc_code}: {response.soc_descriptive}" 121 | ) 122 | elif response.codable & ("sic_code" in response.model_fields): 123 | st.success( 124 | f"👍 Coded as {response.sic_code}: {response.sic_descriptive}" 125 | ) 126 | else: 127 | st.warning(f"👆 More details needed: {response.followup}") 128 | 129 | if verbose: 130 | print( 131 | "Input: ", 132 | job_title, 133 | job_description, 134 | manage_others, 135 | level_of_education, 136 | ) 137 | print("Response: ", response) 138 | st.subheader("Debugging info", divider=True) 139 | st.json(response.model_dump()) 140 | else: 141 | st.warning("👆 Please enter something somewhere at least...") 142 | 143 | 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /app/pages/2_Setup_custom_coding_assistant.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import streamlit as st 3 | from io import StringIO 4 | from importlib import resources 5 | from sic_soc_llm.embedding import EmbeddingHandler 6 | 7 | # %% 8 | st.set_page_config( 9 | page_title="Custom LLM coding assistant", 10 | page_icon="🐥", 11 | ) 12 | 13 | 14 | def main(): 15 | # File upload 16 | st.subheader("1. 
Upload Classification Index", divider=True) 17 | index_file = st.file_uploader("Upload a file", type=["txt"]) 18 | st.markdown( 19 | """The index file should be a text file with one classification entry 20 | per line in the format `code: description`. 21 | You can download an example index file for reference:""" 22 | ) 23 | # Put the download buttons on one row with different spacing 24 | col1, col2, col3, col4 = st.columns([1.3, 2.2, 2, 2]) 25 | col1.download_button( 26 | "Toy index", 27 | data=( 28 | resources.files("sic_soc_llm.example_data") / "toy_index.txt" 29 | ).read_bytes(), 30 | file_name="toy_index.txt", 31 | mime="text/plain", 32 | ) 33 | col2.download_button( 34 | "COICOP 5d condensed", 35 | data=( 36 | resources.files("sic_soc_llm.example_data") / "coicop_5d_condensed.txt" 37 | ).read_bytes(), 38 | file_name="coicop_5d_condensed.txt", 39 | mime="text/plain", 40 | ) 41 | col3.download_button( 42 | "SOC 4d condensed", 43 | data=( 44 | resources.files("sic_soc_llm.example_data") / "soc_4d_condensed.txt" 45 | ).read_bytes(), 46 | file_name="soc_4d_condensed.txt", 47 | mime="text/plain", 48 | ) 49 | col4.download_button( 50 | "SIC 4d condensed", 51 | data=( 52 | resources.files("sic_soc_llm.example_data") / "sic_4d_condensed.txt" 53 | ).read_bytes(), 54 | file_name="sic_4d_condensed.txt", 55 | mime="text/plain", 56 | ) 57 | 58 | if index_file is not None: 59 | try: 60 | # Embed the index 61 | embedding_handler = EmbeddingHandler(db_dir=None) 62 | embedding_handler.embed_index( 63 | file_object=StringIO(index_file.getvalue().decode("utf-8")) 64 | ) 65 | # Display size of embedded index 66 | coll_size = embedding_handler._index_size 67 | if coll_size > 0: 68 | st.session_state.custom_embed = embedding_handler 69 | st.success( 70 | f"Index embedding successful. Embedded index of size {coll_size}." 71 | ) 72 | else: 73 | st.warning( 74 | "Index embedding failed. Please check the index file and try again." 
75 | ) 76 | except Exception as e: 77 | st.error( 78 | f"""Index embedding failed. Please check the index file and try again. 79 | Error: {e}""" 80 | ) 81 | else: 82 | if st.session_state.get("custom_embed", None) is not None: 83 | coll_size = st.session_state.custom_embed._index_size 84 | st.info( 85 | f"""A custom index has been embedded previously with 86 | {coll_size} entries. Uploading new index will replace it.""" 87 | ) 88 | 89 | # Specify respondent data fields 90 | st.subheader("2. Specify respondent data fields", divider=True) 91 | # Specify respondent data fields 92 | fields = st.text_input("Enter respondent data fields (separated by commas)") 93 | if fields: 94 | field_list = [s.strip() for s in fields.split(",")] 95 | # Save the custom fields in session state 96 | st.session_state.custom_fields = field_list 97 | st.success(f"Respondent data fields registered: {field_list}") 98 | else: 99 | if st.session_state.get("custom_fields", None) is not None: 100 | st.info( 101 | f"""A custom respondent data fields previously registered: 102 | {st.session_state.custom_fields}.""" 103 | ) 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | 109 | # %% 110 | -------------------------------------------------------------------------------- /app/pages/3_Test_custom_coding_assistant.py: -------------------------------------------------------------------------------- 1 | import dotenv 2 | import streamlit as st 3 | from sic_soc_llm.llm import ClassificationLLM 4 | 5 | st.set_page_config( 6 | page_title="Custom LLM coding assistant", 7 | page_icon="🐥", 8 | ) 9 | 10 | if "custom_embed" not in st.session_state or "custom_fields" not in st.session_state: 11 | # Point user to setup page 12 | st.write( 13 | """Custom index and response fields not provided. 
Please go to the 14 | **Setup custom LLM coding assistant** page to setup your index and fields.""", 15 | unsafe_allow_html=True, 16 | ) 17 | st.stop() 18 | 19 | if st.session_state.get("open_ai_key") is None: 20 | try: 21 | openai_api_key = dotenv.dotenv_values(".env")["OPENAI_API_KEY"] 22 | st.session_state["open_ai_key"] = openai_api_key 23 | except Exception as e: 24 | st.session_state["open_ai_key"] = str(e) 25 | 26 | 27 | uni_chat = ClassificationLLM( 28 | "gemini-pro", embedding_handler=st.session_state.custom_embed 29 | ) # "text-unicorn" 30 | gpt_chat = ClassificationLLM( 31 | "gpt-4", 32 | embedding_handler=st.session_state.custom_embed, 33 | openai_api_key=st.session_state["open_ai_key"], 34 | ) 35 | 36 | 37 | # Ask the user for the OpenAI API key if they want to use it remotely 38 | ai_key_enter = st.sidebar.text_input( 39 | "If you want to use GPT, update OpenAI API key", 40 | value="", 41 | type="password", 42 | on_change=None, 43 | ) 44 | if ai_key_enter: 45 | st.session_state["open_ai_key"] = ai_key_enter 46 | gpt_chat = ClassificationLLM( 47 | "gpt-4", 48 | embedding_handler=st.session_state.custom_embed, 49 | openai_api_key=st.session_state["open_ai_key"], 50 | ) 51 | st.sidebar.success("OpenAI API key updated successfully") 52 | 53 | 54 | fields = st.session_state.custom_fields 55 | 56 | 57 | # Streamlit app 58 | def main(verbose: bool = True): 59 | st.subheader("LLM assisted custom coding", divider=True) 60 | # Ask for specific field inputs from session state 61 | input_field = {} 62 | for field in st.session_state.custom_fields: 63 | input_field[field] = st.text_input(field) 64 | 65 | col1, col2 = st.columns(2) 66 | rag_uni_button = col1.button("Validate input using Gemini ⛋") # Palm2 🦄") 67 | rag_gpt_button = col2.button("Validate input using GPT ⚛") 68 | 69 | if rag_uni_button or rag_gpt_button: 70 | # Check there is some value in input fields 71 | print("Input: ", input_field) 72 | if len(set(input_field.values()).difference({"", None})) > 
0: 73 | if rag_uni_button: 74 | response, _ = uni_chat.rag_general_code(respondent_data=input_field) 75 | elif rag_gpt_button: 76 | response, _ = gpt_chat.rag_general_code(respondent_data=input_field) 77 | 78 | if response.codable: 79 | st.success( 80 | f"👍 Coded as {response.class_code}: {response.class_descriptive}" 81 | ) 82 | 83 | else: 84 | st.warning(f"👆 More details needed: {response.followup}") 85 | 86 | if verbose: 87 | print("Input: ", input_field) 88 | print("Response: ", response) 89 | st.subheader("Debugging info", divider=True) 90 | st.json(response.model_dump()) 91 | else: 92 | st.warning("👆 Please enter something somewhere at least...") 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /cloud_deploy.sh: -------------------------------------------------------------------------------- 1 | # set the default gcloud project 2 | gcloud config set project "" 3 | # set the default compute zone to london 4 | gcloud config set compute/zone europe-west2-c 5 | # set build region to eu west 6 | gcloud config set builds/region europe-west2 7 | 8 | # build remotely (uses Dockerfile) 9 | gcloud builds submit --tag europe-west2-docker.pkg.dev/""/sic-soc-docker/app_test:v1 . 
--region=europe-west2 10 | 11 | # deploy the image as app engine (uses app.yaml) 12 | # gcloud app deploy --image-url=europe-west2-docker.pkg.dev/""/sic-soc-docker/app_test:v1 13 | # works on port 8080 but cannot write to the file system (problem for custom classification index) 14 | 15 | # deploy the image as google run service 16 | gcloud run deploy sic-soc --image europe-west2-docker.pkg.dev/""/sic-soc-docker/app_test:v1 \ 17 | --min-instances=0 --max-instances=3 --region=europe-west2 --allow-unauthenticated --memory=4G --port=8080 18 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: true 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 80% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 20% 14 | informational: true 15 | 16 | ignore: 17 | - "tests" 18 | - "**/__init__.py" 19 | -------------------------------------------------------------------------------- /docs/_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | render: 4 | - /*.qmd 5 | - tutorials/*.qmd 6 | - reference/*.qmd 7 | preview: 8 | port: 1111 9 | browser: true 10 | watch-inputs: true 11 | navigate: true 12 | resources: 13 | - _static/ 14 | 15 | website: 16 | title: sic-soc-llm 17 | navbar: 18 | left: 19 | - href: index.qmd 20 | text: About 21 | - href: method.qmd 22 | text: Method 23 | - href: tutorials/index.qmd 24 | text: Tutorials 25 | - href: reference/index.qmd 26 | text: Reference 27 | right: 28 | - icon: github 29 | url: https://github.com/datasciencecampus/sic-soc-llm 30 | reader-mode: false 31 | page-footer: 32 | left: > 33 | All content is available under the 34 | [Open Government Licence V3.0](http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/), 35 | except where otherwise 
stated. 36 | center: > 37 | Built using [Quarto](https://quarto.org/). 38 | 39 | format: 40 | html: 41 | mainfont: Arial 42 | theme: 43 | dark: cyborg 44 | light: cosmo 45 | lang: en-GB 46 | 47 | metadata-files: 48 | - reference/_sidebar.yml 49 | 50 | quartodoc: 51 | parser: google 52 | title: LLM based classification 53 | package: sic_soc_llm 54 | dir: reference 55 | sidebar: reference/_sidebar.yml 56 | sections: 57 | - title: Classification module 58 | desc: > 59 | Large Language Model based classification main handlers. 60 | package: sic_soc_llm 61 | contents: 62 | - embedding.EmbeddingHandler 63 | - llm.ClassificationLLM 64 | - prompt.PromptTemplates 65 | 66 | - subtitle: Response models 67 | package: sic_soc_llm.data_models.response_model 68 | contents: 69 | - SocCandidate 70 | - SocResponse 71 | - SicCandidate 72 | - SicResponse 73 | - RagCandidate 74 | - RagResponse 75 | 76 | - title: SIC Index Abstraction 77 | desc: > 78 | Data models to represent Standard Industrial Classification 79 | package: sic_soc_llm.data_models.sic_hierarchy 80 | contents: 81 | - SIC 82 | - SicCode 83 | - SicNode 84 | - subtitle: SIC metadata 85 | package: sic_soc_llm.data_models 86 | contents: 87 | - sic_meta_model.ClassificationMeta 88 | - sicDB.SicMeta 89 | - title: Helpers 90 | desc: Config and Log utils 91 | package: sic_soc_llm 92 | contents: 93 | - setup_logging 94 | - get_config 95 | - check_file_exists 96 | -------------------------------------------------------------------------------- /docs/_static/app-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/sic-soc-llm/d3b758970968427483f534faefbca34fff426172/docs/_static/app-ui.png -------------------------------------------------------------------------------- /docs/_static/sic-soc-llm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datasciencecampus/sic-soc-llm/d3b758970968427483f534faefbca34fff426172/docs/_static/sic-soc-llm.png -------------------------------------------------------------------------------- /docs/index.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | format: html 3 | --- 4 | 5 | {{< include ../README.md >}} 6 | 7 |
8 | -------------------------------------------------------------------------------- /docs/method.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Method 3 | format: html 4 | --- 5 | 6 | ## Summary 7 | 8 | A proof-of-concept large language model (LLM) application was created to assess whether an LLM could improve SIC autocoding performance for survey data. This was applied to a sample of anonymised survey data and evaluated by comparing the results to clerical coding and to a logistic regression model. The LLM showed marginal improvement over the logistic regression in the level of agreement with clerical coding at the 5-digit SIC level. It is likely that refinement of the method would improve performance further. Note that the evaluation scripts are out of scope for this repository. The methodology of the main SIC autocoding module is described below. For more information see the Data Science Campus [blog](https://datasciencecampus.ons.gov.uk/classifai-exploring-the-use-of-large-language-models-llms-to-assign-free-text-to-commonly-used-classifications/). 9 | 10 | ## RAG based classification 11 | 12 | The proposed LLM-based method for auto coding of free text survey responses involves two main steps. Our implementation follows the common Retrieval Augmented Generation (RAG) design; for an overview 13 | see @fig-system. 14 | 15 | ![System Design](_static/sic-soc-llm.png){#fig-system} 16 | 17 | The primary **input** for this process consists of three free text fields from survey responses: the company's activity, job title, and job description. 18 | 19 | 1. **Semantic Search of Relevant SIC Index Candidates** 20 | 21 | The first step in the process involves conducting a semantic search for relevant Standard Industrial Classification (SIC) index candidates. This is achieved by embedding a knowledge base using the transformer language model MiniLM. 
The knowledge base includes a list of activities, each with an assigned SIC code. MiniLM is a smaller, more efficient version of the BERT-based transformer model, designed for tasks that require understanding the semantic meaning of text. It is used to convert the text from the survey response into a form that can be compared with the embeddings of the activities in the knowledge base. The result of this step is a list of potential SIC codes that may be relevant to the response. 22 | 23 | 2. **LLM Query** 24 | 25 | The second step involves querying a general purpose pretrained large language model (Gemini-Pro) to evaluate which, if any, of the SIC code candidates is the best fit for the response. This step leverages the ability of LLMs to understand and generate human-like text. The LLM is presented with the response and the list of potential SIC codes and their descriptions, and it is asked to determine which code should be assigned based on the response. If the decision cannot be confidently made, the LLM is instructed to return an uncodable status. 26 | 27 | The **output** from the LLM is required in such a form that specific fields can be identified and easily analysed: 28 | 29 | - Codable (Yes/No): This field indicates whether or not the survey response could be assigned a SIC code. 30 | - SIC code: This field contains the SIC code that was determined to be the best fit for the response. The code may be requested at either the 5-digit or 2-digit level. 31 | - Follow-up question: This field specifies a suitable follow-up question to clarify the response in case an appropriate SIC code cannot be readily determined. 32 | - SIC candidates (+likelihood estimate): This field lists the SIC codes that were considered as potential matches for the response, along with an estimate of the likelihood that each code is the correct match. 
33 | - Reasoning: This field provides an explanation of why the LLM selected the particular SIC code or decided that the correct code cannot be determined. 34 | 35 | Alterations to the pipeline were considered. For example, instead of providing a short-list of candidates one can take advantage of the ever-increasing context window (input length allowance) and include the full index or use the LLM’s own awareness of the SIC index. We found these options yield worse results than the above outlined RAG for this particular task and model used. 36 | 37 | Both steps rely on pretrained transformer-based models. Because the latest LLMs have been trained on large bodies of text and have billions of parameters they are able to identify the semantic meaning of words, nuance in grammar and spelling. In contrast with rule-based or bag-of-words based machine learning methods this improves how it handles previously unseen responses, such as emerging jobs and industries, unusually phrased or misspelled responses. 38 | 39 | The use of pretrained models in our pipeline provides a solid foundation, but there is an option to fine-tune these models on a specific task to potentially improve performance. Fine-tuning involves continuing the training of the pretrained model on a new dataset, in this case, the survey responses and SIC codes. However, it tends to be computationally expensive and time-consuming and requires a large annotated dataset, which was not available. 40 | 41 | An alternative approach to the one-shot prompt used in the second step of the pipeline is to use an agent-based method. In this approach, instead of the LLM making a decision based on a single interaction, the LLM acts as an agent that engages in a dialogue with the text data. The LLM, acting as an agent, can be dynamically assigned different roles in the conversation or specialist tasks. However, it requires more computational resources and time, as it involves multiple interactions with the LLM. 
42 | 43 | At this moment we have not evaluated the quality of provided follow-up question and reasoning but included them in the proof of concept due to their potential to improve the data collection step (whether as a one-off qualitative analysis or in real-time process). 44 | 45 | The codebase includes an example user interface. This allows small-scale testing where users can experiment with different models and test their sensitivity to the input. An example of this working with output is shown in @fig-app 46 | 47 | ![App User Interface](_static/app-ui.png){#fig-app} 48 | -------------------------------------------------------------------------------- /docs/tutorials/1_sic_data_structure.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "1. SIC data structure" 3 | format: 4 | html: 5 | code-fold: show 6 | --- 7 | 8 | Demonstration notebook for the SIC data structure. 9 | 10 | ```{python} 11 | #| code-summary: "Code: Import methods and initialise" 12 | #| output: false 13 | import random 14 | 15 | from sic_soc_llm import setup_logging, get_config 16 | from sic_soc_llm.data_models import sic_hierarchy, sic_data_access 17 | 18 | logger = setup_logging("sic_data_notebook") 19 | config = get_config() 20 | seed = 3847693223 21 | ``` 22 | 23 | There are two additional datasets required for the SIC hierarchy object that are not part of the repository. These are the SIC structure and SIC index datasets. The following code will download these datasets from the ONS website if they are not already available. 
24 | 25 | ```{python} 26 | #| output: false 27 | #| code-summary: "Code: Make sure all required SIC datasets are available" 28 | import requests 29 | from pathlib import Path 30 | 31 | sic_urls = [ 32 | "https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/publisheduksicsummaryofstructureworksheet.xlsx", 33 | "https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/uksic2007indexeswithaddendumdecember2022.xlsx" 34 | ] 35 | 36 | file_paths = [ 37 | Path(config['lookups']['sic_structure']), 38 | Path(config["lookups"]["sic_index"]) 39 | ] 40 | 41 | for url, file_path in zip(sic_urls, file_paths): 42 | if not file_path.exists(): 43 | r = requests.get(url) 44 | file_path.parent.mkdir(exist_ok=True, parents=True) 45 | with open(file_path, 'wb') as outfile: 46 | outfile.write(r.content) 47 | ``` 48 | 49 | ## Load SIC index 50 | 51 | ```{python} 52 | #| code-summary: "Code: Load SIC index" 53 | sic_index_filepath = config["lookups"]["sic_index"] 54 | sic_index_df = sic_data_access.load_sic_index(sic_index_filepath) 55 | 56 | sic_index_df.sample(5, random_state=seed) 57 | 58 | ``` 59 | 60 | ## Load SIC structure 61 | 62 | ```{python} 63 | #| code-summary: "Code: Load SIC structure" 64 | sic_structure_filepath = config["lookups"]["sic_structure"] 65 | sic_df = sic_data_access.load_sic_structure(sic_structure_filepath) 66 | 67 | sic_df.sample(5, random_state=seed) 68 | ``` 69 | 70 | ## Create SIC hierarchy 71 | 72 | ```{python} 73 | #| code-summary: "Code: Create SIC hierarchy" 74 | sic = sic_hierarchy.load_hierarchy(sic_df, sic_index_df) 75 | 76 | print(f"There are {len(sic):,} entries in the hierarcy") 77 | ``` 78 | 79 | ## Example lookup 80 | 81 | Supports a variety of common formatting patterns for SIC. 
82 | Sometimes 4-digit SIC serve as 5-digit SIC 83 | ```{python} 84 | #| code-summary: "Code: Example lookup" 85 | print(sic["A011xx"]) 86 | print(sic["A011"]) 87 | print(sic["011"]) 88 | print(sic["01.1"]) 89 | 90 | print(sic["A0111x"]) 91 | print(sic["0111"]) 92 | print(sic["01110"]) 93 | ``` 94 | 95 | ## Select a random example 96 | 97 | ```{python} 98 | #| code-summary: "Code: Example SIC index entry" 99 | random.seed(seed) 100 | sic_node = random.choice(sic.nodes) 101 | 102 | sic_node.print_all() 103 | ``` 104 | -------------------------------------------------------------------------------- /docs/tutorials/2_sic_classifier.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "2. SIC classifier" 3 | execute: 4 | warning: False 5 | format: 6 | html: 7 | code-fold: show 8 | --- 9 | 10 | Demonstration notebook for the `ClassificationLLM` using Retrieval Augmented Generation (RAG) with Standard Industrial Classification (SIC) codes. 11 | 12 | ```{python} 13 | #| code-summary: "Code: Import methods and initialise" 14 | from sic_soc_llm import setup_logging, get_config 15 | from sic_soc_llm.embedding import EmbeddingHandler 16 | from sic_soc_llm.llm import ClassificationLLM 17 | 18 | logger = setup_logging('sic_classifier') 19 | config = get_config() 20 | ``` 21 | 22 | ```{python} 23 | #| code-summary: "Code: Make sure the SIC datasets are available" 24 | #| echo: false 25 | 26 | import requests 27 | from pathlib import Path 28 | import hashlib 29 | 30 | sic_urls = [ 31 | "https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/publisheduksicsummaryofstructureworksheet.xlsx", 32 | "https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/uksic2007indexeswithaddendumdecember2022.xlsx" 33 | ] 34 | 35 | file_paths = [ 36 | 
Path(config['lookups']['sic_structure']), 37 | Path(config["lookups"]["sic_index"]) 38 | ] 39 | 40 | expected_hashes = [ 41 | 'f5090c89938b1f24f7b1498530bc99f520abf4198a3af3f3655814c094cc0944', 42 | '3d6bf9d0950b8b9836d3590001cb391ac4338a8319a4b519483ad17f0d51f085' 43 | ] 44 | 45 | for url, file_path, expected_hash in zip(sic_urls, file_paths, expected_hashes): 46 | if not file_path.exists(): 47 | r = requests.get(url) 48 | file_path.parent.mkdir(exist_ok=True, parents=True) 49 | with open(file_path, 'wb') as outfile: 50 | outfile.write(r.content) 51 | 52 | # Calculate the SHA256 hash of the downloaded file 53 | hash_object = hashlib.sha256() 54 | hash_object.update(r.content) 55 | file_hash = hash_object.hexdigest() 56 | if file_hash != expected_hash: 57 | raise ValueError(f"Downloaded file {file_path} has incorrect hash {file_hash}, expected {expected_hash}") 58 | ``` 59 | 60 | ```{python} 61 | #| echo: false 62 | #| code-summary: "Code: Create a fake Large Language Model (LLM) for demonstration purposes" 63 | from langchain.llms.fake import FakeListLLM 64 | 65 | sic_demo_llm = FakeListLLM(responses=[ 66 | ''' 67 | { "codable": true, "sic_code": "86101", "sic_descriptive": "Hospital activities", "sic_candidates": [ { "sic_code": "86101", "sic_descriptive": "Hospital activities", "likelihood": 0.9 }, { "sic_code": "86220", "sic_descriptive": "Specialist medical practice activities", "likelihood": 0.1 } ], "reasoning": "The company\'s main activity is providing care to patients, which aligns with the \'Hospital activities\' SIC code. The job title and description also suggest a hospital setting. 
However, there is a small possibility that the company could fall under \'Specialist medical practice activities\' as the job title is a specialist role."} 68 | ''', 69 | ''' 70 | { "codable": true, "sic_code": "03110", "sic_descriptive": "Marine fishing", "sic_candidates": [ { "sic_code": "03110", "sic_descriptive": "Marine fishing", "likelihood": 1 } ], "reasoning": "The company\'s main activity is described as \'catching fish on the north sea from grimsby port\', which aligns with the \'Marine fishing\' category under SIC code 03110."} 71 | ''', 72 | '''{ "codable": true, "sic_code": "66190", "sic_descriptive": "Other activities auxiliary to financial services, except insurance and pension funding", "sic_candidates": [ { "sic_code": "66190", "sic_descriptive": "Other activities auxiliary to financial services, except insurance and pension funding", "likelihood": 0.7 }, { "sic_code": "64191", "sic_descriptive": "Banks", "likelihood": 0.2 }, { "sic_code": "64991", "sic_descriptive": "Security dealing on own account", "likelihood": 0.1 } ], "reasoning": "The company\'s main activity is bitcoin trading, which falls under \'Other activities auxiliary to financial services, except insurance and pension funding\'. However, it could also potentially fall under \'Banks\' or \'Security dealing on own account\', but these are less likely."} 73 | ''', 74 | '''{ "codable": true, "sic_code": "85590", "sic_descriptive": "Other education nec", "sic_candidates": [ { "sic_code": "85590", "sic_descriptive": "Other education nec", "likelihood": 0.9 }, { "sic_code": "85600", "sic_descriptive": "Educational support activities", "likelihood": 0.1 } ], "reasoning": "The company\'s main activity of matching tutors to pupils for extra help outside of school aligns with the \'Other education nec\' category (SIC code 85590). The job description of helping GCSE and A level students achieve the best possible results further supports this classification. 
The \'Educational support activities\' category (SIC code 85600) could also be a possibility, but is less likely given the specific tutoring focus of the company."} 75 | ''' 76 | ]) 77 | 78 | # populate the vector store with tiny index for demo purposes 79 | embed = EmbeddingHandler() 80 | if embed._index_size == 0: 81 | index_filepath = config["lookups"]["sic_condensed"] 82 | with open(index_filepath) as file_object: 83 | embed.embed_index(file_object=file_object) 84 | ``` 85 | 86 | For the retrieval part of the RAG based SIC classification a correctly populated vector store is required. By default the `EmbeddingHandler` would load `SIC` data structure with all its activities using files specified in the `sic_soc_llm_config.toml`. This may take several minutes. 87 | 88 | For more details about the `SIC` data structure and the data files required for it, see the [SIC data structure tutorial](1_sic_data_structure.html). 89 | 90 | ```{python} 91 | #| code-summary: "Code: Populate vector store" 92 | embed = EmbeddingHandler() 93 | if embed._index_size == 0: 94 | embed.embed_index() 95 | ``` 96 | 97 | As we have already initialised the `EmbeddingHandler` we can pass it to the `ClassificationLLM` object; this is not essential as the `ClassificationLLM` will initialise its own `EmbeddingHandler` if one is not provided (based on the same config values). Note that the `sic_demo_llm` should be replaced with the LLM of your choice. 98 | 99 | ```{python} 100 | #| code-summary: "Code: Initialise the SIC classifier" 101 | sic_llm = ClassificationLLM(llm=sic_demo_llm, embedding_handler=embed) 102 | ``` 103 | 104 | 105 | ## Example SIC classification 106 | 107 | Load a few examples of possible survey responses and classify them using the SIC classifier. 
108 | 109 | ```{python} 110 | #| code-summary: "Code: Input and classify examples" 111 | sic_examples = [ 112 | { 113 | "industry_descr": "we provide care to thousands of patients across north east lincolnshire", 114 | "job_title": "anaesthetist", 115 | "job_description": "give anaesthetics for surgical, medical and psychiatric procedures" 116 | }, 117 | { 118 | "industry_descr": "we catch fish on the north sea from grimsby port", 119 | "job_title": None, 120 | "job_description": None 121 | }, 122 | { 123 | "industry_descr": "bitcoin trading", 124 | "job_title": None, 125 | "job_description": None 126 | 127 | }, 128 | { 129 | "industry_descr": "we match tutors to pupils for extra help outside of school", 130 | "job_title": None, 131 | "job_description": "help gcse and a level students achieve the best possible results" 132 | }, 133 | ] 134 | 135 | for item in sic_examples: 136 | # Get response from LLM 137 | response, short_list, call_dict = sic_llm.rag_sic_code( 138 | industry_descr = item["industry_descr"], 139 | job_title = item["job_title"], 140 | job_description = item["job_description"], 141 | ) 142 | 143 | # Print the output 144 | print("Input:") 145 | for v, w in item.items(): 146 | print(f" {v}: {w}") 147 | print('') 148 | 149 | print("Response:") 150 | for x,y in response.__dict__.items(): 151 | print (f" {x}: {y}") 152 | print("") 153 | print('===========================================') 154 | print("") 155 | 156 | ``` 157 | -------------------------------------------------------------------------------- /docs/tutorials/3_soc_classifier.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "3. SOC classifier" 3 | execute: 4 | warning: False 5 | format: 6 | html: 7 | code-fold: show 8 | --- 9 | 10 | Demonstration notebook for the `ClassificationLLM` with Standard Occupational Classification (SOC) codes. 
11 | 12 | ```{python} 13 | #| code-summary: "Code: Import methods and initialise" 14 | from sic_soc_llm import setup_logging 15 | from sic_soc_llm.llm import ClassificationLLM 16 | 17 | logger = setup_logging("soc_classifier") 18 | ``` 19 | 20 | ```{python} 21 | #| echo: false 22 | #| code-summary: "Code: Create a fake Large Language Model (LLM) for demonstration purposes" 23 | from langchain.llms.fake import FakeListLLM 24 | 25 | soc_demo_llm = FakeListLLM(responses=[ 26 | ''' 27 | {"codable": true, "followup": null, "soc_code": "9265", "soc_descriptive": "Bar staff", "soc_candidates": [{"soc_code": "9265", "soc_descriptive": "Bar staff", "likelihood": 1.0}], "soc_code_2digits": "92", "reasoning": "The job title \'barman\' and the job description \'barman at local golf club\' clearly indicate that the respondent\'s job involves serving drinks at a bar, which aligns with the SOC code 9265 for \'Bar staff\'."} 28 | ''', 29 | ''' 30 | { "codable": true, "followup": null, "soc_code": "1221", "soc_descriptive": "Hotel and accommodation managers and proprietors", "soc_candidates": [ { "soc_code": "1221", "soc_descriptive": "Hotel and accommodation managers and proprietors", "likelihood": 1 } ], "soc_code_2digits": "12", "reasoning": "The job title \'hotel night manager\' and the company\'s main activity being a hotel aligns with the SOC code 1221 for \'Hotel and accommodation managers and proprietors\'. 
The job description, although unclear, seems to involve duties that could be associated with this role."}''', 31 | ''' 32 | { "codable": false, "followup": "Could you please provide more details about your daily tasks and responsibilities in this role?", "soc_code": null, "soc_descriptive": null, "soc_candidates": [ { "soc_code": "2139", "soc_descriptive": "Information technology professionals n.e.c.", "likelihood": 0.7 } ], "soc_code_2digits": "21", "reasoning": "The job title \'functional consultant\' and the job description \'provide consultancy on system configuration\' suggest a role in IT consultancy. However, more specific information about the tasks and responsibilities of the role is needed to assign a more accurate SOC code."} 33 | ''', 34 | ''' 35 | {"codable": true, "followup": null, "soc_code": "6213", "soc_descriptive": "Air travel assistants", "soc_candidates": [{"soc_code": "6213", "soc_descriptive": "Air travel assistants", "likelihood": 0.9}], "soc_code_2digits": "62", "reasoning": "The job title \'senior airport services agent\' and the job description \'customer service\' in the context of an airline company suggest that the respondent\'s role involves assisting passengers and providing customer service in an airport setting. This aligns with the SOC code 6213 for \'Air travel assistants\'."} 36 | ''', 37 | ''' 38 | { "codable": false, "followup": "Could you please provide more specific information about your job responsibilities and the nature of the materials you work with?", "soc_code": null, "soc_descriptive": null, "soc_candidates": [ { "soc_code": "2125", "soc_descriptive": "Production and Process Engineers", "likelihood": 0.5 }, { "soc_code": "2122", "soc_descriptive": "Mechanical Engineers", "likelihood": 0.5 } ], "soc_code_2digits": "21", "reasoning": "The job title translates to \'Engineer\' and the company\'s main activity involves \'Processing Materials\'. 
This could correspond to several engineering roles within the \'21\' SOC code category, but without more specific information, it is not possible to determine the exact SOC code."} 39 | ''' 40 | ]) 41 | ``` 42 | 43 | The example SOC classifier uses a one-shot prompt to classify respondent's data. In particular, there is no retrieval step (to reduce the list of candidate codes) and the whole condensed index is included in the prompt. Note that the `soc_demo_llm` should be replaced with the LLM of your choice. 44 | 45 | ```{python} 46 | #| code-summary: "Code: Initialise the SOC classifier" 47 | soc_llm = ClassificationLLM(llm=soc_demo_llm) 48 | ``` 49 | 50 | ## Example SOC classifications 51 | 52 | Load a few examples of possible survey responses and classify them using the SOC classifier. 53 | 54 | ```{python} 55 | #| code-summary: "Code: Input and classify examples" 56 | soc_examples = [ 57 | { 58 | "job_title": "barman", 59 | "job_description": "barman at local golf club", 60 | "employer_activities": "golf club", 61 | }, 62 | { 63 | "job_title": "hotel night manager", 64 | "job_description": """hight potter reception closing documents 65 | breakfast preparation""", 66 | "employer_activities": "hotel", 67 | }, 68 | { 69 | "job_title": "functional consultant", 70 | "job_description": "provide cnsultancy on system configuration", 71 | "employer_activities": "technology provide deliver enterprise software", 72 | }, 73 | { 74 | "job_title": "senior airport services agent", 75 | "job_description": "customer service", 76 | "employer_activities": "airline", 77 | }, 78 | { 79 | "job_title": "PEIRIANYDD", 80 | "job_description": "TRWSHIO", 81 | "employer_activities": "TRIN PERIANAU", 82 | }, 83 | ] 84 | 85 | for item in soc_examples: 86 | # Get response from LLM 87 | response = soc_llm.get_soc_code( 88 | item["job_title"], 89 | item["job_description"], 90 | level_of_education="Unknown", 91 | manage_others="Unknown", 92 | industry_descr=item["employer_activities"], 93 | ) 
94 | 95 | # Print the output 96 | print("Input:") 97 | for v, w in item.items(): 98 | print(f" {v}: {w}") 99 | print('') 100 | 101 | print("Response:") 102 | for x,y in response.__dict__.items(): 103 | print (f" {x}: {y}") 104 | print("") 105 | print('===========================================') 106 | print("") 107 | ``` 108 | -------------------------------------------------------------------------------- /docs/tutorials/4_custom_coicop_classifier.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "4. Custom (COICOP) classifier" 3 | format: 4 | html: 5 | code-fold: show 6 | --- 7 | 8 | Demonstration notebook for the `ClassificationLLM` using RAG with a custom index. In this demo, the Classification of Individual Consumption According to Purpose (COICOP) index is used. 9 | 10 | ```{python} 11 | #| code-summary: "Code: Import methods and initialise" 12 | #| output: false 13 | from sic_soc_llm import setup_logging, get_config 14 | from sic_soc_llm.llm import ClassificationLLM 15 | from sic_soc_llm.embedding import EmbeddingHandler 16 | 17 | logger = setup_logging('coicop_notebook') 18 | config = get_config() 19 | ``` 20 | 21 | 22 | ```{python} 23 | #| echo: false 24 | from langchain.llms.fake import FakeListLLM 25 | 26 | coicop_demo_llm = FakeListLLM(responses=[ 27 | ''' 28 | { 29 | "codable": true, 30 | "class_code": "CP01141", 31 | "class_descriptive": "Whole milk", 32 | "alt_candidates": [ 33 | { 34 | "class_code": "CP01146", 35 | "class_descriptive": "Other milk products", 36 | "likelihood": 0.1 37 | }, 38 | { 39 | "class_code": "CP01199", 40 | "class_descriptive": "Other food products n.e.c.", 41 | "likelihood": 0.05 42 | } 43 | ], 44 | "reasoning": "The respondent's data mentions 'organic whole milk' which directly matches with the 'Whole milk' category in the classification index. Although the milk is organic, there is no separate category for organic milk in the provided subset of classification index. 
Therefore, the most suitable classification code is 'CP01141' for 'Whole milk'. Other possible but less likely categories could be 'Other milk products' or 'Other food products n.e.c.'." 45 | } 46 | ''',''' 47 | { 48 | "codable": false, 49 | "followup": "Is the item intended for men or women?", 50 | "class_code": null, 51 | "class_descriptive": null, 52 | "alt_candidates": [ 53 | { 54 | "class_code": "CP03121", 55 | "class_descriptive": "Garments for men", 56 | "likelihood": 0.5 57 | }, 58 | { 59 | "class_code": "CP03122", 60 | "class_descriptive": "Garments for women", 61 | "likelihood": 0.5 62 | } 63 | ], 64 | "reasoning": "The item 'skinny jeans' can be classified as either 'Garments for men' or 'Garments for women'. Without information on the intended gender for the item, a definitive classification cannot be made." 65 | } 66 | ''',''' 67 | { 68 | "codable": true, 69 | "class_code": "CP06220", 70 | "class_descriptive": "Dental services", 71 | "alt_candidates": [], 72 | "reasoning": "The respondent's data mentions 'tooth filling' which is a service provided by dentists. Therefore, the classification code 'CP06220' for 'Dental services' is the most appropriate." 73 | }''' 74 | ]) 75 | ``` 76 | 77 | ## Load COICOP or other custom index 78 | 79 | The expected format of the custom index is a text file with each line containing one index entry in the format `class_code : class_descriptive`. The following code snippet demonstrates how to load and embed the COICOP index. This embedding is saved in a vector store that is used in the retrieval step of RAG based classification in `ClassificationLLM`. Note that the `coicop_demo_llm` should be replaced with the LLM of your choice. 
80 | 81 | ```{python} 82 | #| code-summary: "Code: Load COICOP index" 83 | #| warning: false 84 | index_filepath = config["lookups"]["coicop_condensed"] 85 | with open(index_filepath) as file_object: 86 | for _ in range(5): 87 | print(next(file_object)) 88 | 89 | embed = EmbeddingHandler(db_dir=None) 90 | with open(index_filepath) as file_object: 91 | embed.embed_index(file_object=file_object) 92 | 93 | coicop_llm = ClassificationLLM(embedding_handler=embed, llm = coicop_demo_llm) 94 | ``` 95 | 96 | ## Example classification using COICOP index 97 | 98 | The following code block demonstrates how to classify a few examples using the COICOP index. Note that the respondent data is passed as a dictionary. For different use cases, any custom survey fields can be used as keys in the dictionary. `ClassificationLLM` uses the values that are present in the dictionary to retrieve the relevant information from the index and includes all the provided fields in the generative query step. 99 | 100 | ```{python} 101 | #| code-summary: "Code: Example lookup" 102 | #| warning: false 103 | for item in ["organic whole milk", "skinny jeans", "tooth filling"]: 104 | # Get response from LLM 105 | response, short_list = coicop_llm.rag_general_code(respondent_data={"item": item}) 106 | 107 | # Print the output 108 | print("Input:") 109 | print(f" item: {item}") 110 | print('') 111 | print("Response:") 112 | for x,y in response.__dict__.items(): 113 | print (f' {x}: {y}') 114 | print(f" shortlist used in RAG: {short_list}") 115 | print("") 116 | print('===========================================') 117 | print("") 118 | ``` 119 | -------------------------------------------------------------------------------- /docs/tutorials/index.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Tutorials 3 | listing: 4 | type: table 5 | contents: 6 | - "*.qmd" 7 | fields: [title, description, reading-time] 8 | sort-ui: false 9 | filter-ui: false 10 | 
--- 11 | 12 | These tutorials walk you through some of the essential workflows for `sic-soc-llm`. 13 | 14 |
15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "sic_soc_llm" 3 | description = "An app for LLM based SIC/SOC Classification" 4 | authors = [{name = "Data Science Campus", email = "datasciencecampus@ons.gov.uk"}] 5 | readme = "README.md" 6 | license = {file = "LICENSE"} 7 | requires-python = ">=3.10" 8 | dynamic = ["version"] 9 | dependencies = [ 10 | "toml==0.10.2", 11 | "numpy==1.26.3", 12 | "pandas==2.1.4", 13 | "langchain==0.1.0", 14 | "langchain-google-vertexai==0.0.1", 15 | "langchain-openai==0.0.2", 16 | "openai==1.7.2", 17 | "google-cloud-aiplatform==1.38.1", 18 | "sentence-transformers==2.3.1", 19 | "chromadb==0.4.22", 20 | "autocorrect==2.6.1", 21 | "pyfarmhash==0.3.2", 22 | "openpyxl==3.1.2", 23 | "pyprojroot==0.3.0", 24 | ] 25 | classifiers = [ 26 | "Programming Language :: Python :: 3", 27 | "License :: OSI Approved :: MIT License", 28 | "Operating System :: OS Independent", 29 | "Do not upload :: Internal project :: !" 
30 | ] 31 | 32 | [build-system] 33 | requires = ["setuptools>=62"] 34 | build-backend = "setuptools.build_meta" 35 | 36 | [tool.setuptools.dynamic] 37 | version = {attr = "sic_soc_llm.__version__"} 38 | 39 | [tool.setuptools.packages.find] 40 | where = ["src"] 41 | namespaces = false 42 | 43 | [tool.setuptools.package-data] 44 | sic_soc_llm = [ 45 | "example_data/*.txt", 46 | "_config/*.toml", 47 | ] 48 | 49 | [project.optional-dependencies] 50 | app = [ 51 | "streamlit==1.30.0", 52 | "python-dotenv==1.0.0", 53 | ] 54 | test = [ 55 | "pytest==6.2.5", 56 | "pytest-pythonpath==0.7.4", 57 | "coverage==7.5.4", 58 | ] 59 | 60 | docs = ["quartodoc>=0.6.6", 61 | "ipykernel==6.23.2", 62 | "nbclient==0.10.0", 63 | "nbformat==5.9.2", 64 | ] 65 | 66 | dev = [ 67 | "pre-commit==3.3.3", 68 | "dill==0.3.8", 69 | "matplotlib_venn==0.11.10", 70 | "sic_soc_llm[app]", 71 | "sic_soc_llm[test]", 72 | "sic_soc_llm[docs]" 73 | ] 74 | 75 | [project.urls] 76 | homepage = "https://github.com/datasciencecampus/sic-soc-llm" 77 | -------------------------------------------------------------------------------- /src/sic_soc_llm/__init__.py: -------------------------------------------------------------------------------- 1 | """sic-soc-llm: LLM assisted SIC/SOC classification.""" 2 | 3 | __version__ = "0.0.1" 4 | 5 | from .logs import setup_logging 6 | from ._config.main import get_config, check_file_exists 7 | 8 | __all__ = ["setup_logging", "get_config", "check_file_exists"] 9 | -------------------------------------------------------------------------------- /src/sic_soc_llm/_config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/sic-soc-llm/d3b758970968427483f534faefbca34fff426172/src/sic_soc_llm/_config/__init__.py -------------------------------------------------------------------------------- /src/sic_soc_llm/_config/main.py: 
def check_file_exists(
    file_name: Optional[Union[Path, str]] = "sic_soc_llm_config.toml"
) -> Optional[Path]:
    """Locate a file and return its path, or ``None`` if it cannot be found.

    An absolute path is only checked for existence. A relative path is looked
    up in the following locations, in this order, and the first hit wins:
        1. relative to the current working directory
        2. relative to the project root directory
        3. relative to the user's home directory
        4. relative to the package resources (``sic_soc_llm._config`` first,
           then ``sic_soc_llm.example_data``)

    Args:
        file_name (Path or str, optional): The name of the file to check.
            Defaults to the config file name. ``None`` is treated the same as
            the default.

    Returns:
        Optional[Path]: The path to the file if it exists, ``None`` otherwise.
            For package resources the value is whatever
            ``importlib.resources.files`` yields for the package.
    """
    if file_name is None:
        # The signature advertises Optional, so fall back to the default name
        # instead of letting Path(None) raise a TypeError.
        file_name = "sic_soc_llm_config.toml"

    file_path = Path(file_name)
    if file_path.is_absolute():
        return file_path if file_path.exists() else None

    # Each base directory is produced lazily so that a later location is only
    # resolved when every earlier one has missed.
    base_locations = (
        Path.cwd,
        lambda: Path(here()),
        Path.home,
        lambda: resources.files("sic_soc_llm._config"),
        lambda: resources.files("sic_soc_llm.example_data"),
    )
    for get_base in base_locations:
        candidate = get_base() / file_path
        if candidate.exists():
            return candidate
    return None


def get_config(
    config_name: Optional[Union[Path, str]] = "sic_soc_llm_config.toml"
) -> dict:
    """Fetch the configuration.

    The config file is located with `check_file_exists`, parsed as TOML and
    every entry of its ``lookups`` table is replaced by the resolved path of
    the lookup file (entries that cannot be found keep their original value).

    The parsed configuration is cached in the module-level ``_config``: only
    the first successful call reads a file, later calls return the cached
    dictionary and ignore ``config_name``.

    Args:
        config_name (Path or str, optional): The name of the config file to load.
            Defaults to relative path "sic_soc_llm_config.toml" - in such case it
            looks for the config file in 1. current dir, 2. project dir, 3. user home
            and 4. package resources.

    Returns:
        dict: Configuration for the system.

    Raises:
        FileNotFoundError: If the config file or required lookups not found.
    """
    global _config

    if _config is not None:
        return _config

    config_filepath = check_file_exists(config_name)
    if config_filepath is None:
        raise FileNotFoundError("Config file not found.")

    # TOML documents are UTF-8 by specification; do not depend on the locale.
    with open(config_filepath, mode="r", encoding="utf-8") as f:
        logger.info(f"Loading config from {config_filepath}")
        in_config = toml.load(f)

    # Only existing keys are reassigned, so mutating while iterating is safe.
    for key, lookup_file in in_config["lookups"].items():
        lookup_file_path = check_file_exists(lookup_file)
        if lookup_file_path is not None:
            in_config["lookups"][key] = lookup_file_path
        elif key in ["sic_condensed", "soc_condensed"]:
            raise FileNotFoundError(
                f"Required lookup file {key}: {lookup_file} not found."
            )
        else:
            logger.warning(f"Optional lookup file {key}: {lookup_file} not found.")

    _config = in_config
    logger.debug(f"Config values: {_config}")

    return _config
class SocCandidate(BaseModel):
    """
    Represents a candidate SOC code based on provided job title and description.

    Attributes:
        soc_code (str): Plausible SOC code based on the provided job title and
            description.
        soc_descriptive (str): Descriptive label of the SOC category associated
            with soc_code.
        likelihood (float): Likelihood of this soc_code with a value between 0 and 1.
    """

    soc_code: str = Field(
        description="Plausible SOC code based on provided job title and description."
    )
    soc_descriptive: str = Field(
        description="Descriptive label of the SOC category associated with soc_code."
    )
    likelihood: float = Field(
        description="Likelihood of this soc_code with value between 0 and 1."
    )


class SocResponse(BaseModel):
    """Represents a response model for SOC code assignment.

    Attributes:
        codable (bool): True if enough information is provided to decide SOC code,
            False otherwise.
        followup (Optional[str]): Question to ask the user in order to collect
            additional information to enable reliable SOC assignment.
            Empty if codable=True.
        soc_code (Optional[str]): Full four-digit SOC code assigned based on provided
            job title, description, etc. Empty if codable=False.
        soc_descriptive (Optional[str]): Descriptive label of the SOC category
            associated with soc_code if provided. Empty if codable=False.
        soc_candidates (List[SocCandidate]): List of possible or alternative SOC
            codes that may be applicable with their descriptive label and estimated
            likelihood.
        soc_code_2digits (Optional[str]): First two digits of the hierarchical SOC code
            assigned. This field should be non-empty if the larger (two-digit) group of
            SOC codes can be determined even in cases where additional information is
            needed to code to four digits (for example when all SOC candidates share
            the same first two digits).
        reasoning (str): Step by step reasoning behind classification selected.
            Specifies the information used to assign the SOC code or any additional
            information required to assign a SOC code.
    """

    # NOTE(review): the Field descriptions below are runtime strings and are
    # presumably surfaced to the LLM through the parser's format instructions -
    # they are kept verbatim; confirm before rewording any of them.
    codable: bool = Field(
        description="""True if enough information is provided to decide
        SOC code, False otherwise."""
    )
    followup: Optional[str] = Field(
        description="""Question to ask user in order to collect additional information
        to enable reliable SOC assignment. Empty if codable=True.""",
        default=None,
    )
    soc_code: Optional[str] = Field(
        description="""Full four digit SOC code assigned based on provided job title,
        description, etc. Empty if codable=False.""",
        default=None,
    )
    soc_descriptive: Optional[str] = Field(
        description="""Descriptive label of the SOC category associated with soc_code
        if provided. Empty if codable=False.""",
        default=None,
    )
    soc_candidates: List[SocCandidate] = Field(
        description="""List of possible or alternative SOC codes that may be applicable
        with their descriptive label and estimated likelihood."""
    )
    soc_code_2digits: Optional[str] = Field(
        description="""First two digits of the hierarchical SOC code assigned.
        This field should be non empty if the larger (two-digit) group of SOC codes
        can be determined even in cases where additional information is needed to
        to code to four digits (for example when all SOC candidates share
        the same first two digits).""",
        default=None,
    )
    reasoning: str = Field(
        description="""Step by step reasoning behind classification selected. Specifies
        the information used to assign the SOC code or any additional information
        required to assign a SOC code."""
    )

    @classmethod
    def soc_code_validator(cls, v):
        """Return ``v`` unchanged unless it is an empty string.

        Raises:
            ValueError: If ``v`` is the empty string. A missing (``None``) code
                is still accepted, as before.
        """
        # TODO: check for valid codes from some list
        if v == "":
            # Explicit raise: `assert` would be stripped under `python -O`.
            raise ValueError(
                "If codable, then valid soc_code needs to be provided"
            )
        return v

    @model_validator(mode="before")
    @classmethod
    def check_valid_fields(cls, values):
        """Cross-check ``codable`` against ``soc_code`` / ``followup``.

        Runs before field validation. Raises ValueError (reported by pydantic
        as a validation error) when a codable response carries an empty
        ``soc_code`` or an uncodable one carries an empty ``followup``.
        """
        if not isinstance(values, dict):
            # "before" validators receive the raw input, which need not be a
            # mapping; leave anything else to pydantic's own validation.
            return values
        if values.get("codable"):
            cls.soc_code_validator(values.get("soc_code"))
        elif values.get("followup") == "":
            raise ValueError(
                "If uncodable, follow up question needs to be provided."
            )
        return values


class SicCandidate(BaseModel):
    """Represents a candidate SIC code with associated information.

    Attributes:
        sic_code (str): Plausible SIC code based on the company activity description.
        sic_descriptive (str): Descriptive label of the SIC category associated with
            sic_code.
        likelihood (float): Likelihood of this sic_code with a value between 0 and 1.

    """

    sic_code: str = Field(
        description="Plausible SIC code based on the company activity description."
    )
    sic_descriptive: str = Field(
        description="Descriptive label of the SIC category associated with sic_code."
    )
    likelihood: float = Field(
        description="Likelihood of this sic_code with value between 0 and 1."
    )


class SicResponse(BaseModel):
    """Represents a response model for SIC code assignment.

    Attributes:
        codable (bool): True if enough information is provided to decide SIC code,
            False otherwise.
        followup (Optional[str]): Question to ask user in order to collect additional
            information to enable reliable SIC assignment. Empty if codable=True.
        sic_code (Optional[str]): Full SIC code (to the required number of digits)
            assigned based on the provided company activity description.
            Empty if codable=False.
        sic_descriptive (Optional[str]): Descriptive label of the SIC category
            associated with sic_code if provided. Empty if codable=False.
        sic_candidates (List[SicCandidate]): Short list of less than ten possible or
            alternative sic codes that may be applicable with their descriptive label
            and estimated likelihood.
        reasoning (str): Specifies the information used to assign the SIC code or any
            additional information required to assign a SIC code.

    Note:
        Unlike `SocResponse`, this model declares no two-digit code field.
    """

    codable: bool = Field(
        description="""True if enough information is provided to decide
        SIC code, False otherwise."""
    )
    followup: Optional[str] = Field(
        description="""Question to ask user in order to collect additional information
        to enable reliable SIC assignment. Empty if codable=True.""",
        default=None,
    )
    sic_code: Optional[str] = Field(
        description="""Full SIC code (to the required number of digits) assigned based
        on provided the company activity description. Empty if codable=False.""",
        default=None,
    )
    sic_descriptive: Optional[str] = Field(
        description="""Descriptive label of the SIC category associated with sic_code
        if provided. Empty if codable=False.""",
        default=None,
    )
    sic_candidates: List[SicCandidate] = Field(
        description="""Short list of less than ten possible or alternative SIC codes
        that may be applicable with their descriptive label and estimated likelihood."""
    )

    reasoning: str = Field(
        description="""Step by step reasoning behind classification selected. Specifies
        the information used to assign the SIC code or any additional information
        required to assign a SIC code."""
    )

    @classmethod
    def sic_code_validator(cls, v):
        """Return ``v`` unchanged unless it is an empty string.

        Raises:
            ValueError: If ``v`` is the empty string. A missing (``None``) code
                is still accepted, as before.
        """
        # TODO: check for valid codes from some list
        if v == "":
            # Explicit raise: `assert` would be stripped under `python -O`.
            raise ValueError(
                "If codable, then valid sic_code needs to be provided"
            )
        return v

    @model_validator(mode="before")
    @classmethod
    def check_valid_fields(cls, values):
        """Cross-check ``codable`` against ``sic_code`` / ``followup``.

        Runs before field validation. Raises ValueError (reported by pydantic
        as a validation error) when a codable response carries an empty
        ``sic_code`` or an uncodable one carries an empty ``followup``.
        """
        if not isinstance(values, dict):
            # "before" validators receive the raw input, which need not be a
            # mapping; leave anything else to pydantic's own validation.
            return values
        if values.get("codable"):
            cls.sic_code_validator(values.get("sic_code"))
        elif values.get("followup") == "":
            raise ValueError(
                "If uncodable, follow up question needs to be provided."
            )
        return values


class RagCandidate(BaseModel):
    """Represents a candidate classification code with associated information.

    Attributes:
        class_code (str): Plausible classification code based on the respondent's data.
        class_descriptive (str): Descriptive label of the classification category
            associated with class_code.
        likelihood (float): Likelihood of this class_code with a value between 0 and 1.

    """

    class_code: str = Field(
        description="Plausible classification code based on the respondent's data."
    )
    class_descriptive: str = Field(
        description="""Descriptive label of the classification category
        associated with class_code."""
    )
    likelihood: float = Field(
        description="Likelihood of this class_code with value between 0 and 1."
    )


class RagResponse(BaseModel):
    """Represents a response model for classification code assignment.

    Attributes:
        codable (bool): True if enough information is provided to decide
            classification code, False otherwise.
        followup (Optional[str]): Question to ask user in order to collect
            additional information to enable reliable classification assignment.
            Empty if codable=True.
        class_code (Optional[str]): Full classification code (to the required
            number of digits) assigned based on provided respondent's data.
            Empty if codable=False.
        class_descriptive (Optional[str]): Descriptive label of the classification
            category associated with class_code if provided.
            Empty if codable=False.
        alt_candidates (List[RagCandidate]): Short list of less than ten possible
            or alternative classification codes that may be applicable with their
            descriptive label and estimated likelihood.
        reasoning (str): Step by step reasoning behind the classification selected.
            Specifies the information used to assign the classification code or
            any additional information required to assign one.
    """

    codable: bool = Field(
        description="""True if enough information is provided to decide
        classification code, False otherwise."""
    )
    followup: Optional[str] = Field(
        description="""Question to ask user in order to collect additional information
        to enable reliable classification assignment. Empty if codable=True.""",
        default=None,
    )
    class_code: Optional[str] = Field(
        description="""Full classification code (to the required number of digits)
        assigned based on provided respondent's data. Empty if codable=False.""",
        default=None,
    )
    class_descriptive: Optional[str] = Field(
        description="""Descriptive label of the classification category associated
        with class_code if provided. Empty if codable=False.""",
        default=None,
    )
    alt_candidates: List[RagCandidate] = Field(
        description="""Short list of less than ten possible or alternative
        classification codes that may be applicable with their descriptive label
        and estimated likelihood."""
    )
    reasoning: str = Field(
        description="""Step by step reasoning behind classification selected. Specifies
        the information used to assign the SIC code or any additional information
        required to assign a SIC code."""
    )
33 | 34 | Loads a worksheet with all the levels/names of the UK SIC 2007 hierarchy. 35 | """ 36 | 37 | sic_df = pd.read_excel( 38 | filepath, 39 | sheet_name="reworked structure", 40 | usecols=[ 41 | "Description", 42 | "SECTION", 43 | "Most disaggregated level", 44 | "Level headings", 45 | ], 46 | dtype=str, 47 | ) 48 | 49 | sic_df.columns = [col.lower().replace(" ", "_") for col in sic_df.columns] 50 | 51 | for col in sic_df.columns: 52 | sic_df[col] = sic_df[col].str.strip() 53 | 54 | return sic_df 55 | -------------------------------------------------------------------------------- /src/sic_soc_llm/data_models/sic_hierarchy.py: -------------------------------------------------------------------------------- 1 | """SIC hierarchy. 2 | 3 | Provides a common interface for SIC lookups and navigation. 4 | 5 | Usage: 6 | 7 | sic = sic_hierarchy.load_hierarchy(sic_df, sic_index_df) 8 | sic["01110"].print_all() 9 | """ 10 | from typing import Iterator 11 | import html 12 | import re 13 | 14 | import pandas as pd 15 | 16 | from sic_soc_llm.data_models import sicDB 17 | 18 | SEE_CODE_REGEX = re.compile( 19 | r"(,?\s?see\s(divisions?\s)?)?##\d+(\.\d+(\/\d)?)?", re.IGNORECASE 20 | ) 21 | 22 | # TODO enum? 23 | _LEVEL_DICT = {1: "section", 2: "division", 3: "group", 4: "class", 5: "subclass"} 24 | 25 | 26 | class SicCode: 27 | """Standard Industrial Classification code. 28 | 29 | The main representation for SIC in this class is the `alpha_code`, 30 | which we define as: 31 | 32 | * The section character e.g. 'A' 33 | * Followed by the numeric SIC code e.g. "0111" 34 | * Padded with 'x' to six characters 35 | 36 | For example: "A0111x" 37 | 38 | The class supports initialisation with section, code and level via 39 | the factory method `from_section_code_level`. 
40 | 41 | For example: 42 | SicCode.from_section_code_level("A", "0111", "class") 43 | 44 | Note: 45 | This class is mainly for internal use, beyond some basic checks 46 | of formatting and consistency it does not validate that a code 47 | is defined in UK SIC 2007. 48 | """ 49 | 50 | def __init__(self, alpha_code: str): 51 | SicCode._validate_alpha_code(alpha_code) 52 | 53 | self.alpha_code = alpha_code 54 | self.n_digits = SicCode._parse_digits(alpha_code) 55 | self.level_name = _LEVEL_DICT[self.n_digits] 56 | self._formatted_code = SicCode._format_code(alpha_code) 57 | self._alpha_code_no_pad = self.alpha_code.replace("x", "") 58 | 59 | @staticmethod 60 | def from_section_code_level(section, code, level) -> "SicCode": 61 | """Factory method for SicCode. 62 | 63 | Note: 64 | Used to produce the definitive list of SIC codes, 65 | only call with data that defines SIC. 66 | """ 67 | level = level.lower().strip().replace(" ", "") 68 | 69 | if len(code) < 5: 70 | n_digits = len(code) 71 | if _LEVEL_DICT[n_digits] != level: 72 | raise ValueError(f"Code/level mismatch: '{code}' -> '{level}'") 73 | 74 | elif len(code) == 5: 75 | if level not in {_LEVEL_DICT[4], _LEVEL_DICT[5]}: 76 | raise ValueError(f"Code/level mismatch: '{code}' -> '{level}'") 77 | 78 | if level == _LEVEL_DICT[1] and section != code: 79 | raise ValueError(f"Section/code mismatch: '{section}' - '{code}'") 80 | 81 | match level: 82 | case "section": 83 | alpha_code = f"{section}" 84 | 85 | case "class": 86 | if len(code) == 5: 87 | if code[4] != "0": 88 | raise ValueError( 89 | f"4-digit SIC code as 5 digit must end in zero: '{code}'" 90 | ) 91 | code = code[:4] 92 | alpha_code = f"{section}{code}" 93 | 94 | case _: 95 | alpha_code = f"{section}{code}" 96 | 97 | pad = 6 - len(alpha_code) 98 | alpha_code += "x" * pad 99 | 100 | return SicCode(alpha_code) 101 | 102 | def __eq__(self, other): 103 | return self.alpha_code == other.alpha_code 104 | 105 | def __hash__(self): 106 | return hash(self.alpha_code) 
class SicNode:
    """Tree data structure where the nodes hold all data associated with a given SIC.

    The SIC hierarchy is represented as several separate trees,
    with each section (e.g. "A", "B", "C") as a root node.
    """

    def __init__(self, sic_code: "SicCode", description: str):
        """Create a node with no parent, children, activities or metadata yet."""
        self.sic_code = sic_code
        self.description = description

        self.activities = []
        self.sic_meta = None
        self.parent = None
        self.children = []

    def __repr__(self):
        return f'SicNode({repr(self.sic_code)}, "{self.description}")'

    def __str__(self):
        return f'{str(self.sic_code)}: "{self.description}"'

    def print_all(self):
        """Print everything held on this node.

        Prints the node itself, its section letter, parent and children,
        the ``detail`` / ``includes`` / ``excludes`` metadata and the list of
        activities. Metadata fields are printed as ``None`` when no metadata
        has been attached to the node yet.
        """
        print(str(self))

        print(f"Section: {self.sic_code.alpha_code[0]}")
        print(f"Parent: {self.parent}")
        print(f"Children: {[str(child) for child in self.children]}")
        print()
        # sic_meta stays None until metadata is attached; do not crash on it.
        meta = self.sic_meta
        print(f"detail={getattr(meta, 'detail', None)}")
        print(f"includes={getattr(meta, 'includes', None)}")
        print(f"excludes={getattr(meta, 'excludes', None)}")
        print()
        print("Activities:")
        for activity in self.activities:
            print(f"\t- {activity}")

    def is_leaf(self):
        """Return True when the node has no children."""
        return not self.children

    def numeric_string_padded(self):
        """Return the digits of the code, without the section letter.

        A 4-digit code with no children gets a trailing "0" appended so that
        it reads as a 5-digit code.
        """
        numeric_string = self.sic_code.alpha_code[1:].replace("x", "")

        if self.sic_code.n_digits == 4 and self.is_leaf():
            numeric_string += "0"

        return numeric_string


class SIC:
    """Main class for SIC lookups.

    Usage:
        | sic = load_hierarchy(sic_df, sic_index_df)
        | sic["01.1"]
        | sic["011"]
        | sic["A011xx"]
    """

    def __init__(self, nodes, code_lookup):
        """Store the nodes (sorted by SIC code) and the key -> node lookup."""
        self.nodes = sorted(nodes, key=lambda node: node.sic_code)
        self._code_lookup = code_lookup

    def __getitem__(self, key):
        return self._code_lookup[key]

    def __iter__(self):
        return iter(self.nodes)

    def __len__(self):
        return len(self.nodes)

    def all_leaf_activities(self) -> Iterator[dict]:
        """All activities for 5-digit SIC.

        Note:
            Does not include 4-digit SIC codes where those codes
            have a 5-digit expansion.

            i.e. Only returns for leaf nodes.
        """
        return (
            {"code": node.sic_code, "text": activity}
            for node in self
            if node.is_leaf()
            for activity in node.activities
        )

    def all_leaf_descriptions(self) -> Iterator[dict]:
        """All descriptions for 5-digit SIC.

        Note:
            Does not include 4-digit SIC codes where those codes
            have a 5-digit expansion.

            i.e. Only returns for leaf nodes.
        """
        return (
            {"code": node.sic_code, "text": node.description}
            for node in self
            if node.is_leaf()
        )

    def all_leaf_text(self) -> pd.DataFrame:
        """Returns all short text descriptions of 5-digit level SIC.

        Includes:
            * Activities from the SIC index
            * Description from the SIC structure

        Returns:
            pd.DataFrame
                Two columns `code`, `text`; empty (but with both columns)
                when the hierarchy has no leaf nodes.
        """
        # Descriptions first, then activities; duplicates are dropped below.
        rows = [*self.all_leaf_descriptions(), *self.all_leaf_activities()]

        # Naming the columns keeps the frame well-formed when there are no
        # rows, so the sort on "code" cannot fail with KeyError.
        df = pd.DataFrame(rows, columns=["code", "text"])
        df = df.drop_duplicates()
        df = df.sort_values("code")
        df = df.reset_index(drop=True)

        # Sort on the code objects first, then expose their string form.
        df["code"] = df["code"].map(str)

        return df
def _populate_sic_meta(nodes, code_node_dict):
    """Populate metadata for SIC.

    Attaches a cleaned copy of every entry in ``sicDB.sic_meta`` to the node
    with the same code.

    Raises:
        ValueError: If the number of metadata entries differs from the number
            of nodes.

    Warning: modifies data in place.
    """
    if len(sicDB.sic_meta) != len(nodes):
        raise ValueError("Mismatch in SIC data sources: sicDB.sic_meta and sic_df")

    for meta in sicDB.sic_meta:
        sic_node = code_node_dict[SicCode(meta.code)]
        sic_node.sic_meta = _clean_meta(meta)


def _populate_activities(nodes, sic_index_df):
    """Populate activities.

    Appends each ``activity`` in ``sic_index_df`` to the node whose 5-digit
    code matches the row's ``uk_sic_2007`` value (surrounding whitespace is
    ignored). 4-digit nodes are matched on their digits plus a trailing "0".

    Raises:
        KeyError: If an index row refers to a code that no 4- or 5-digit node
            provides; the message names the offending code and activity.

    Warning: Modifies nodes in place.
    """
    digits_to_node = {}

    for sic_node in nodes:
        sic_code = sic_node.sic_code
        if sic_code.n_digits == 4:
            # The index writes 4-digit codes as 5 digits ending in zero.
            digits_to_node[sic_code.alpha_code[1:5] + "0"] = sic_node
        elif sic_code.n_digits == 5:
            digits_to_node[sic_code.alpha_code[1:6]] = sic_node

    index_rows = sic_index_df[["uk_sic_2007", "activity"]].itertuples(
        index=False, name=None
    )
    for raw_digits, activity in index_rows:
        sic_digits = raw_digits.strip()
        try:
            sic_node = digits_to_node[sic_digits]
        except KeyError:
            # Same exception type as the plain dict lookup, but say which
            # index row could not be placed.
            raise KeyError(
                f"SIC index code '{sic_digits}' (activity '{activity}') "
                "is not defined in the SIC structure"
            ) from None
        sic_node.activities.append(activity)


def _clean_text(text):
    """Clean text.

    Unescape HTML, remove the ", see ##11.11" entries.
    """
    return re.sub(SEE_CODE_REGEX, "", html.unescape(text))


def _clean_meta(meta):
    """Return a copy of ``meta`` with detail, includes and excludes cleaned."""
    return sicDB.ClassificationMeta(
        code=meta.code,
        title=meta.title,
        detail=_clean_text(meta.detail),
        includes=[_clean_text(text) for text in meta.includes],
        excludes=[_clean_text(text) for text in meta.excludes],
    )
430 | """ 431 | codes, nodes, code_node_dict = _define_codes_and_nodes(sic_df) 432 | 433 | _populate_parent_child_relationships(nodes, code_node_dict) 434 | 435 | _populate_sic_meta(nodes, code_node_dict) 436 | 437 | _populate_activities(nodes, sic_index_df) 438 | 439 | lookup = dict() 440 | 441 | for node in nodes: 442 | lookup[str(node.sic_code)] = node 443 | lookup[node.sic_code.alpha_code] = node 444 | lookup[node.sic_code.alpha_code.replace("x", "")] = node 445 | if node.sic_code.n_digits > 1: 446 | lookup[node.sic_code.alpha_code[1:].replace("x", "")] = node 447 | 448 | if node.sic_code.n_digits == 4 and not node.children: 449 | key = node.sic_code.alpha_code[1:5] + "0" 450 | lookup[key] = node 451 | 452 | return SIC(nodes, lookup) 453 | -------------------------------------------------------------------------------- /src/sic_soc_llm/data_models/sic_meta_model.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import List 3 | 4 | 5 | class ClassificationMeta(BaseModel): 6 | """ 7 | Represents a classification meta model. 8 | 9 | Attributes: 10 | code (str): Category code. Either a full code or a partial code for a 11 | larger hierarchical group. 12 | Partial code has last digits replaced by 'x'. 13 | title (str): Short descriptive title of the code category. 14 | detail (str): Descriptive label of the category associated with the code. 15 | includes (List[str]): Optional list of titles that should be included 16 | in this category. 17 | excludes (List[str]): Optional list of titles that should be excluded 18 | from this category. 19 | """ 20 | 21 | code: str = Field( 22 | description="""Category code. Either a full code or a partial code 23 | for a larger hierarchical group. 
def check_code_match(self, subcode: str) -> bool:
    """Check for partial match of the code.

    Discards the first letter of the SIC code and compares only as many
    digits as both the code (ignoring "x" padding) and ``subcode`` provide.

    Args:
        subcode (str): 2-5 digits code for matching

    Returns:
        bool: True if the compared digits are equal and at least two
            digits were compared, otherwise False.
    """
    n = min(len(self.code.replace("x", "")), len(subcode) + 1)
    # n counts the leading letter, so n > 2 means at least two digits.
    return n > 2 and self.code[1:n] == subcode[: n - 1]


def pretty_print(self, subset_digits=(4, 2)) -> str:
    """Format the populated fields as a single line of text.

    Args:
        subset_digits: Code lengths (number of digits, without the leading
            letter and "x" padding) for which text is produced.

    Returns:
        str: "Code <digits>: <title>. " followed by the detail, includes and
            excludes when present; an empty string when the code length is
            not in ``subset_digits``.
    """
    code = self.code[1:].replace("x", "")
    if len(code) not in subset_digits:
        return ""

    out = "Code " + code + ": " + self.title + ". "
    if self.detail:
        out += self.detail + ". "
    if self.includes:
        out += "Includes " + ", ".join(self.includes) + ". "
    if self.excludes:
        out += "Excludes " + ", ".join(self.excludes) + ". "
    return out
def _create_vector_store(self) -> "Chroma":
    """
    Initialises Chroma VectorDB on known DB dir in data.

    Returns:
        Chroma: The LangChain vector store object for Chroma. It is created
            without a persist directory when ``self.db_dir`` is None.
    """
    if self.db_dir is None:
        return Chroma(embedding_function=self.embeddings)
    return Chroma(embedding_function=self.embeddings, persist_directory=self.db_dir)


def embed_index(
    self,
    from_empty: bool = True,
    sic: "SIC" = None,
    file_object=None,
):
    """
    Embeds the index entries into the vector store.

    Args:
        from_empty (bool, optional): Whether to drop the current vector store
            content and start fresh.
        sic (SIC, optional): The SIC hierarchy object. If None, the hierarchy
            is loaded from files specified in the config.
        file_object (StringIO object): The index file as StringIO object.
            If provided, the file will be read by line and embedded.
            Each line has expected format of **code**: **description**.
            Blank lines are ignored; lines without a code, a colon or a
            description are skipped with a warning.
    """
    if from_empty:
        self.vector_store._client.delete_collection("langchain")
        self.vector_store = self._create_vector_store()

    docs = []
    ids = []
    if file_object is not None:
        for line in file_object:
            if not line.strip():
                # Lines read from a file keep their newline, so a blank line
                # is not falsy; skip it explicitly.
                continue

            code, separator, description = line.partition(":")
            code = code.strip()
            description = description.strip()
            if not (separator and code and description):
                logger.warning("Skipping malformed index line: %r", line)
                continue

            docs.append(
                Document(
                    page_content=description,
                    metadata={
                        "code": code,
                        "four_digit_code": code[0:4],
                        "two_digit_code": code[0:2],
                    },
                )
            )
            # The id is still derived from the raw line so ids stay stable.
            ids.append(str(uuid.uuid3(uuid.NAMESPACE_URL, line)))

    else:
        if sic is None:
            sic_index_df = load_sic_index(config["lookups"]["sic_index"])
            sic_df = load_sic_structure(config["lookups"]["sic_structure"])
            sic = load_hierarchy(sic_df, sic_index_df)

        logger.debug("Loading entries from SIC hierarchy for embedding.")
        for _, row in sic.all_leaf_text().iterrows():
            # "01.11" -> "01110", "01.62/1" -> "01621": always five digits.
            code = (row["code"].replace(".", "").replace("/", "") + "0")[:5]
            docs.append(
                Document(
                    page_content=row["text"],
                    metadata={
                        "code": code,
                        "four_digit_code": code[0:4],
                        "two_digit_code": code[0:2],
                    },
                )
            )
            ids.append(str(uuid.uuid3(uuid.NAMESPACE_URL, row["text"])))

    if docs:
        # Nothing to insert for an empty index; skip the store call.
        self.vector_store.add_documents(docs, ids=ids)
    self._index_size = self.vector_store._client.get_collection("langchain").count()
    logger.debug(f"Inserted {len(docs):,} entries into vector embedding database.")
def search_index_multi(self, query: list[str]) -> list[dict]:
    """
    Returns index entries relevant to progressively longer queries.

    The non-None fields are joined cumulatively ("a", "a b", "a b c"); each
    phrase and its spell-corrected form is searched with ``search_index`` and
    all matches are returned together, sorted by distance. An entry matched
    by several phrases appears once per phrase.

    Args:
        query (list[str]): List of query fields (in priority order) for which
            most relevant index entries will be returned.
            e.g [industry_descr, job_title, job_descr]

    Returns:
        List[dict]: Matches for all phrases, ordered by increasing distance.
            Matches with equal distance keep the order in which their
            phrases were built.
    """
    fields = [field for field in query if field is not None]

    # A dict is used as an ordered set: duplicates are removed but the order
    # of phrases (and so of equal-distance results) is reproducible.
    search_terms = {}
    for end in range(1, len(fields) + 1):
        phrase = " ".join(fields[:end])
        search_terms[phrase] = None
        search_terms[self.spell(phrase)] = None

    matches = [
        match for term in search_terms for match in self.search_index(query=term)
    ]
    return sorted(matches, key=lambda match: match["distance"])
seafood 20 | CP01134: Frozen seafood 21 | CP01135: Dried 22 | CP01136: Other preserved or processed fish and seafood and fish and seafood preparations 23 | CP01141: Whole milk 24 | CP01142: Low fat milk 25 | CP01143: Preserved milk 26 | CP01144: Yoghurt 27 | CP01145: Cheese and curd 28 | CP01146: Other milk products 29 | CP01147: Eggs 30 | CP01151: Butter 31 | CP01152: Margarine and other vegetable fats 32 | CP01153: Olive oil 33 | CP01154: Other edible oils 34 | CP01155: Other edible animal fats 35 | CP01161: Fresh or chilled fruit 36 | CP01162: Frozen fruit 37 | CP01163: Dried fruit and nuts 38 | CP01164: Preserved fruit and fruit-based products 39 | CP01171: Fresh or chilled vegetables other than potatoes and other tubers 40 | CP01172: Frozen vegetables other than potatoes and other tubers 41 | CP01173: Dried vegetables 42 | CP01174: Potatoes 43 | CP01175: Crisps 44 | CP01176: Other tubers and products of tuber vegetables 45 | CP01181: Sugar 46 | CP01182: Jams 47 | CP01183: Chocolate 48 | CP01184: Confectionery products 49 | CP01185: Edible ices and ice cream 50 | CP01186: Artificial sugar substitutes 51 | CP01191: Sauces 52 | CP01192: Salt 53 | CP01193: Baby food 54 | CP01194: Ready-made meals 55 | CP01199: Other food products n.e.c. 
56 | CP01211: Coffee 57 | CP01212: Tea 58 | CP01213: Cocoa and powdered chocolate 59 | CP01221: Mineral or spring waters 60 | CP01222: Soft drinks 61 | CP01223: Fruit and vegetables juices 62 | CP02111: Spirits and liqueurs 63 | CP02112: Alcoholic soft drinks 64 | CP02121: Wine from grapes 65 | CP02122: Wine from other fruits 66 | CP02123: Fortified wines 67 | CP02124: Wine-based drinks 68 | CP02131: Lager beer 69 | CP02132: Other alcoholic beer 70 | CP02133: Low and non-alcoholic beer 71 | CP02134: Beer-based drinks 72 | CP02201: Cigarettes 73 | CP02202: Cigars 74 | CP02203: Other tobacco products 75 | CP02300: Narcotics 76 | CP03110: Clothing materials 77 | CP03121: Garments for men 78 | CP03122: Garments for women 79 | CP03123: Garments for infants (0 to 2 years) and children (3 to 13 years) 80 | CP03131: Other articles of clothing 81 | CP03132: Clothing accessories 82 | CP03141: Cleaning of clothing 83 | CP03142: Repair and hire of clothing 84 | CP03211: Footwear for men 85 | CP03212: Footwear for women 86 | CP03213: Footwear for infants and children 87 | CP03220: Repair and hire of footwear 88 | CP04110: Actual rentals paid by tenants 89 | CP04121: Actual rentals paid by tenants for secondary residences 90 | CP04122: Garage rentals and other rentals paid by tenants 91 | CP04210: Imputed rentals of owner-occupiers 92 | CP04220: Other imputed rentals 93 | CP04310: Materials for the maintenance and repair of the dwelling 94 | CP04321: Services of plumbers 95 | CP04322: Services of electricians 96 | CP04323: Maintenance services for heating systems 97 | CP04324: Services of painters 98 | CP04325: Services of carpenters 99 | CP04329: Other services for maintenance and repair of the dwelling 100 | CP04410: Water supply 101 | CP04420: Refuse collection 102 | CP04430: Sewerage collection 103 | CP04441: Maintenance charges in multi-occupied buildings 104 | CP04442: Security services 105 | CP04449: Other services related to dwelling 106 | CP04510: Electricity 107 | 
CP04521: Natural gas and town gas 108 | CP04522: Liquefied hydrocarbons (butane 109 | CP04530: Liquid fuels 110 | CP04541: Coal 111 | CP04549: Other solid fuels 112 | CP04550: Heat energy 113 | CP05111: Household furniture 114 | CP05112: Garden furniture 115 | CP05113: Lighting equipment 116 | CP05119: Other furniture and furnishings 117 | CP05121: Carpet and rugs 118 | CP05122: Other floor coverings 119 | CP05123: Services of laying of fitted carpets and floor coverings 120 | CP05130: Repair of furniture 121 | CP05201: Furnishings fabrics and curtains 122 | CP05202: Bed linen 123 | CP05203: Table linen and bathroom linen 124 | CP05209: Other household textiles 125 | CP05311: Refrigerators 126 | CP05312: Clothes washing machines 127 | CP05313: Cookers 128 | CP05314: Heaters 129 | CP05315: Cleaning equipment 130 | CP05319: Other major household appliances 131 | CP05321: Food processing appliances 132 | CP05322: Coffee machines 133 | CP05323: Irons 134 | CP05324: Toasters and grills 135 | CP05329: Other small electric household appliances 136 | CP05330: Repair of household appliances 137 | CP05401: Glassware 138 | CP05402: Cutlery 139 | CP05403: Non-electric kitchen utensils and articles 140 | CP05404: Repair of glassware 141 | CP05511: Motorized major tools and equipment 142 | CP05512: Repair 143 | CP05521: Non-motorised small tools 144 | CP05522: Miscellaneous small tool accessories 145 | CP05523: Repair of non-motorised small tools and miscellaneous accessories 146 | CP05611: Cleaning and maintenance products 147 | CP05612: Other non-durable small household articles 148 | CP05621: Domestic services by paid staff 149 | CP05622: Cleaning services 150 | CP05623: Hire of furniture and furnishings 151 | CP05629: Other domestic services and household services 152 | CP06110: Pharmaceutical products 153 | CP06121: Pregnancy tests and mechanical contraceptive devices 154 | CP06129: Other medical products n.e.c. 
155 | CP06131: Corrective eye-glasses and contact lenses 156 | CP06132: Hearing aids 157 | CP06133: Repair of therapeutic appliances and equipment 158 | CP06139: Other therapeutic appliances and equipment 159 | CP06211: General practice 160 | CP06212: Specialist practice 161 | CP06220: Dental services 162 | CP06231: Services of medical analysis laboratories and X-ray centres 163 | CP06232: Thermal-baths 164 | CP06239: Other paramedical services 165 | CP06300: Hospital services 166 | CP07111: New motor cars 167 | CP07112: Second-hand motor cars 168 | CP07120: Motor cycles 169 | CP07130: Bicycles 170 | CP07140: Animal drawn vehicles 171 | CP07211: Tyres 172 | CP07212: Spare parts for personal transport equipment 173 | CP07213: Accessories for personal transport equipment 174 | CP07221: Diesel 175 | CP07222: Petrol 176 | CP07223: Other fuels for personal transport equipment 177 | CP07224: Lubricants 178 | CP07230: Maintenance and repair of personal transport equipment 179 | CP07241: Hire of garages 180 | CP07242: Toll facilities and parking meters 181 | CP07243: Driving lessons 182 | CP07311: Passenger transport by train 183 | CP07312: Passenger transport by underground and tram 184 | CP07321: Passenger transport by bus and coach 185 | CP07322: Passenger transport by taxi and hired car with driver 186 | CP07331: Domestic flights 187 | CP07332: International flights 188 | CP07341: Passenger transport by sea 189 | CP07342: Passenger transport by inland waterway 190 | CP07350: Combined passenger transport 191 | CP07361: Funicular 192 | CP07362: Removal and storage services 193 | CP07369: Other purchased transport services n.e.c. 
194 | CP08101: Letter handling services 195 | CP08109: Other postal services 196 | CP08201: Fixed telephone equipment 197 | CP08202: Mobile telephone equipment 198 | CP08203: Other equipment of telephone and telefax equipment 199 | CP08204: Repair of telephone or telefax equipment 200 | CP08301: Wired telephone services 201 | CP08302: Wireless telephone services 202 | CP08303: Internet access provision services 203 | CP08304: Bundled telecommunication services 204 | CP08305: Other information transmission services 205 | CP09111: Equipment for the reception 206 | CP09112: Equipment for the reception 207 | CP09113: Portable sound and vision devices 208 | CP09119: Other equipment for the reception 209 | CP09121: Cameras 210 | CP09122: Accessories for photographic and cinematographic equipment 211 | CP09123: Optical instruments 212 | CP09131: Personal computers 213 | CP09132: Accessories for information processing equipment 214 | CP09133: Software 215 | CP09134: Calculators and other information processing equipment 216 | CP09141: Pre-recorded recording media 217 | CP09142: Unrecorded recording media 218 | CP09149: Other recording media 219 | CP09150: Repair of audio-visual 220 | CP09211: Camper vans 221 | CP09212: Aeroplanes 222 | CP09213: Boats 223 | CP09214: Horses 224 | CP09215: Major items for games and sport 225 | CP09221: Musical instruments 226 | CP09222: Major durables for indoor recreation 227 | CP09230: Maintenance and repair of other major durables for recreation and culture 228 | CP09311: Games and hobbies 229 | CP09312: Toys and celebration articles 230 | CP09321: Equipment for sport 231 | CP09322: Equipment for camping and open-air recreation 232 | CP09323: Repair of equipment for sport 233 | CP09331: Garden products 234 | CP09332: Plants and flowers 235 | CP09341: Purchase of pets 236 | CP09342: Products for pets 237 | CP09350: Veterinary and other services for pets 238 | CP09411: Recreational and sporting services - Attendance 239 | CP09412: 
Recreational and sporting services - Participation 240 | CP09421: Cinemas 241 | CP09422: Museums 242 | CP09423: Television and radio licence fees 243 | CP09424: Hire of equipment and accessories for culture 244 | CP09425: Photographic services 245 | CP09429: Other cultural services 246 | CP09430: Games of chance 247 | CP09511: Fiction books 248 | CP09512: Educational text books 249 | CP09513: Other non-fiction books 250 | CP09514: Binding services and E-book downloads 251 | CP09521: Newspapers 252 | CP09522: Magazines and periodicals 253 | CP09530: Miscellaneous printed matter 254 | CP09541: Paper products 255 | CP09549: Other stationery and drawing materials 256 | CP09601: Package domestic holidays 257 | CP09602: Package international holidays 258 | CP10101: Pre-primary education 259 | CP10102: Primary education 260 | CP10200: Secondary education 261 | CP10300: Post-secondary non-tertiary education 262 | CP10400: Tertiary education 263 | CP10500: Education not definable by level 264 | CP11111: Restaurants 265 | CP11112: Fast food and take away food services 266 | CP11120: Canteens 267 | CP11201: Hotels 268 | CP11202: Holiday centres 269 | CP11203: Accommodation services of other establishments 270 | CP12111: Hairdressing for men and children 271 | CP12112: Hairdressing for women 272 | CP12113: Personal grooming treatments 273 | CP12121: Electric appliances for personal care 274 | CP12122: Repair of electric appliances for personal care 275 | CP12131: Non-electrical appliances 276 | CP12132: Articles for personal hygiene and wellness 277 | CP12200: Prostitution 278 | CP12311: Jewellery 279 | CP12312: Clocks and watches 280 | CP12313: Repair of jewellery 281 | CP12321: Travel goods 282 | CP12322: Articles for babies 283 | CP12323: Repair of other personal effects 284 | CP12329: Other personal effects n.e.c. 
285 | CP12401: Child care services 286 | CP12402: Retirement homes for elderly persons and residences for disabled persons 287 | CP12403: Services to maintain people in their private homes 288 | CP12404: Counselling 289 | CP12510: Life insurance 290 | CP12520: Insurance connected with the dwelling 291 | CP12531: Public insurance connected with health 292 | CP12532: Private insurance connected with health 293 | CP12541: Motor vehicle insurance 294 | CP12542: Travel insurance 295 | CP12550: Other insurance 296 | CP12610: FISIM 297 | CP12621: Charges by banks and post offices 298 | CP12622: Fees and service charges of brokers 299 | CP12701: Administrative fees 300 | CP12702: Legal services and accountancy 301 | CP12703: Funeral services 302 | CP12704: Other fees and services 303 | -------------------------------------------------------------------------------- /src/sic_soc_llm/example_data/sic_2d_condensed.txt: -------------------------------------------------------------------------------- 1 | 01: Crop and animal production, hunting and related service activities 2 | 02: Forestry and logging 3 | 03: Fishing and aquaculture 4 | 05: Mining of coal and lignite 5 | 06: Extraction of crude petroleum and natural gas 6 | 07: Mining of metal ores 7 | 08: Other mining and quarrying 8 | 09: Mining support service activities 9 | 10: Manufacture of food products 10 | 11: Manufacture of beverages 11 | 12: Manufacture of tobacco products 12 | 13: Manufacture of textiles 13 | 14: Manufacture of wearing apparel 14 | 15: Manufacture of leather and related products 15 | 16: Manufacture of wood and of products of wood and cork, except furniture; manufacture of articles of straw and plaiting materials 16 | 17: Manufacture of paper and paper products 17 | 18: Printing and reproduction of recorded media 18 | 19: Manufacture of coke and refined petroleum products 19 | 20: Manufacture of chemicals and chemical products 20 | 21: Manufacture of basic pharmaceutical products and pharmaceutical 
preparations 21 | 22: Manufacture of rubber and plastic products 22 | 23: Manufacture of other non-metallic mineral products 23 | 24: Manufacture of basic metals 24 | 25: Manufacture of fabricated metal products, except machinery and equipment 25 | 26: Manufacture of computer, electronic and optical products 26 | 27: Manufacture of electrical equipment 27 | 28: Manufacture of machinery and equipment nec 28 | 29: Manufacture of motor vehicles, trailers and semi-trailers 29 | 30: Manufacture of other transport equipment 30 | 31: Manufacture of furniture 31 | 32: Other manufacturing 32 | 33: Repair and installation of machinery and equipment 33 | 35: Electricity, gas, steam and air conditioning supply 34 | 36: Water collection, treatment and supply 35 | 37: Sewerage 36 | 38: Waste collection, treatment and disposal activities; materials recovery 37 | 39: Remediation activities and other waste management services 38 | 41: Construction of buildings 39 | 42: Civil engineering 40 | 43: Specialised construction activities 41 | 45: Wholesale and retail trade and repair of motor vehicles and motorcycles 42 | 46: Wholesale trade, except of motor vehicles and motorcycles 43 | 47: Retail trade, except of motor vehicles and motorcycles 44 | 49: Land transport and transport via pipelines 45 | 50: Water transport 46 | 51: Air transport 47 | 52: Warehousing and support activities for transportation 48 | 53: Postal and courier activities 49 | 55: Accommodation 50 | 56: Food and beverage service activities 51 | 58: Publishing activities 52 | 59: Motion picture, video and television programme production, sound recording and music publishing activities 53 | 60: Programming and broadcasting activities 54 | 61: Telecommunications 55 | 62: Computer programming, consultancy and related activities 56 | 63: Information service activities 57 | 64: Financial service activities, except insurance and pension funding 58 | 65: Insurance, reinsurance and pension funding, except compulsory social 
security 59 | 66: Activities auxiliary to financial services and insurance activities 60 | 68: Real estate activities 61 | 69: Legal and accounting activities 62 | 70: Activities of head offices; management consultancy activities 63 | 71: Architectural and engineering activities; technical testing and analysis 64 | 72: Scientific research and development 65 | 73: Advertising and market research 66 | 74: Other professional, scientific and technical activities 67 | 75: Veterinary activities 68 | 77: Rental and leasing activities 69 | 78: Employment activities 70 | 79: Travel agency, tour operator and other reservation service and related activities 71 | 80: Security and investigation activities 72 | 81: Services to buildings and landscape activities 73 | 82: Office administrative, office support and other business support activities 74 | 84: Public administration and defence; compulsory social security 75 | 85: Education 76 | 86: Human health activities 77 | 87: Residential care activities 78 | 88: Social work activities without accommodation 79 | 90: Creative, arts and entertainment activities 80 | 91: Libraries, archives, museums and other cultural activities 81 | 92: Gambling and betting activities 82 | 93: Sports activities and amusement and recreation activities 83 | 94: Activities of membership organisations 84 | 95: Repair of computers and personal and household goods 85 | 96: Other personal service activities 86 | 97: Activities of households as employers of domestic personnel 87 | 98: Undifferentiated goods- and services-producing activities of private households for own use 88 | 99: Activities of extraterritorial organisations and bodies 89 | -------------------------------------------------------------------------------- /src/sic_soc_llm/example_data/sic_4d_condensed.txt: -------------------------------------------------------------------------------- 1 | 0111: Growing of cereals (except rice), leguminous crops and oil seeds 2 | 0112: Growing of rice 3 | 
0113: Growing of vegetables and melons, roots and tubers 4 | 0114: Growing of sugar cane 5 | 0115: Growing of tobacco 6 | 0116: Growing of fibre crops 7 | 0119: Growing of other non-perennial crops 8 | 0121: Growing of grapes 9 | 0122: Growing of tropical and subtropical fruits 10 | 0123: Growing of citrus fruits 11 | 0124: Growing of pome fruits and stone fruits 12 | 0125: Growing of other tree and bush fruits and nuts 13 | 0126: Growing of oleaginous fruits 14 | 0127: Growing of beverage crops 15 | 0128: Growing of spices, aromatic, drug and pharmaceutical crops 16 | 0129: Growing of other perennial crops 17 | 0130: Plant propagation 18 | 0141: Raising of dairy cattle 19 | 0142: Raising of other cattle and buffaloes 20 | 0143: Raising of horses and other equines 21 | 0144: Raising of camels and camelids 22 | 0145: Raising of sheep and goats 23 | 0146: Raising of swine/pigs 24 | 0147: Raising of poultry 25 | 0149: Raising of other animals 26 | 0150: Mixed farming 27 | 0161: Support activities for crop production 28 | 0162: Support activities for animal production 29 | 0163: Post-harvest crop activities 30 | 0164: Seed processing for propagation 31 | 0170: Hunting, trapping and related service activities 32 | 0210: Silviculture and other forestry activities 33 | 0220: Logging 34 | 0230: Gathering of wild growing non-wood products 35 | 0240: Support services to forestry 36 | 0311: Marine fishing 37 | 0312: Freshwater fishing 38 | 0321: Marine aquaculture 39 | 0322: Freshwater aquaculture 40 | 0510: Mining of hard coal 41 | 0520: Mining of lignite 42 | 0610: Extraction of crude petroleum 43 | 0620: Extraction of natural gas 44 | 0710: Mining of iron ores 45 | 0721: Mining of uranium and thorium ores 46 | 0729: Mining of other non-ferrous metal ores 47 | 0811: Quarrying of ornamental and building stone, limestone, gypsum, chalk and slate 48 | 0812: Operation of gravel and sand pits; mining of clays and kaolin 49 | 0891: Mining of chemical and fertiliser minerals 50 |
0892: Extraction of peat 51 | 0893: Extraction of salt 52 | 0899: Other mining and quarrying nec 53 | 0910: Support activities for petroleum and natural gas extraction 54 | 0990: Support activities for other mining and quarrying 55 | 1011: Processing and preserving of meat 56 | 1012: Processing and preserving of poultry meat 57 | 1013: Production of meat and poultry meat products 58 | 1020: Processing and preserving of fish, crustaceans and molluscs 59 | 1031: Processing and preserving of potatoes 60 | 1032: Manufacture of fruit and vegetable juice 61 | 1039: Other processing and preserving of fruit and vegetables 62 | 1041: Manufacture of oils and fats 63 | 1042: Manufacture of margarine and similar edible fats 64 | 1051: Operation of dairies and cheese making 65 | 1052: Manufacture of ice cream 66 | 1061: Manufacture of grain mill products 67 | 1062: Manufacture of starches and starch products 68 | 1071: Manufacture of bread; manufacture of fresh pastry goods and cakes 69 | 1072: Manufacture of rusks and biscuits; manufacture of preserved pastry goods and cakes 70 | 1073: Manufacture of macaroni, noodles, couscous and similar farinaceous products 71 | 1081: Manufacture of sugar 72 | 1082: Manufacture of cocoa, chocolate and sugar confectionery 73 | 1083: Processing of tea and coffee 74 | 1084: Manufacture of condiments and seasonings 75 | 1085: Manufacture of prepared meals and dishes 76 | 1086: Manufacture of homogenised food preparations and dietetic food 77 | 1089: Manufacture of other food products nec 78 | 1091: Manufacture of prepared feeds for farm animals 79 | 1092: Manufacture of prepared pet foods 80 | 1101: Distilling, rectifying and blending of spirits 81 | 1102: Manufacture of wine from grape 82 | 1103: Manufacture of cider and other fruit wines 83 | 1104: Manufacture of other non-distilled fermented beverages 84 | 1105: Manufacture of beer 85 | 1106: Manufacture of malt 86 | 1107: Manufacture of soft drinks; production of mineral waters and other 
bottled waters 87 | 1200: Manufacture of tobacco products 88 | 1310: Preparation and spinning of textile fibres 89 | 1320: Weaving of textiles 90 | 1330: Finishing of textiles 91 | 1391: Manufacture of knitted and crocheted fabrics 92 | 1392: Manufacture of made-up textile articles, except apparel 93 | 1393: Manufacture of carpets and rugs 94 | 1394: Manufacture of cordage, rope, twine and netting 95 | 1395: Manufacture of non-wovens and articles made from non-wovens, except apparel 96 | 1396: Manufacture of other technical and industrial textiles 97 | 1399: Manufacture of other textiles nec 98 | 1411: Manufacture of leather clothes 99 | 1412: Manufacture of workwear 100 | 1413: Manufacture of other outerwear 101 | 1414: Manufacture of underwear 102 | 1419: Manufacture of other wearing apparel and accessories 103 | 1420: Manufacture of articles of fur 104 | 1431: Manufacture of knitted and crocheted hosiery 105 | 1439: Manufacture of other knitted and crocheted apparel 106 | 1511: Tanning and dressing of leather; dressing and dyeing of fur 107 | 1512: Manufacture of luggage, handbags and the like, saddlery and harness 108 | 1520: Manufacture of footwear 109 | 1610: Sawmilling and planing of wood 110 | 1621: Manufacture of veneer sheets and wood-based panels 111 | 1622: Manufacture of assembled parquet floors 112 | 1623: Manufacture of other builders' carpentry and joinery 113 | 1624: Manufacture of wooden containers 114 | 1629: Manufacture of other products of wood; manufacture of articles of cork, straw and plaiting materials 115 | 1711: Manufacture of pulp 116 | 1712: Manufacture of paper and paperboard 117 | 1721: Manufacture of corrugated paper and paperboard and of containers of paper and paperboard 118 | 1722: Manufacture of household and sanitary goods and of toilet requisites 119 | 1723: Manufacture of paper stationery 120 | 1724: Manufacture of wallpaper 121 | 1729: Manufacture of other articles of paper and paperboard 122 | 1811: Printing of newspapers 
123 | 1812: Other printing 124 | 1813: Pre-press and pre-media services 125 | 1814: Binding and related services 126 | 1820: Reproduction of recorded media 127 | 1910: Manufacture of coke oven products 128 | 1920: Manufacture of refined petroleum products 129 | 2011: Manufacture of industrial gases 130 | 2012: Manufacture of dyes and pigments 131 | 2013: Manufacture of other inorganic basic chemicals 132 | 2014: Manufacture of other organic basic chemicals 133 | 2015: Manufacture of fertilisers and nitrogen compounds 134 | 2016: Manufacture of plastics in primary forms 135 | 2017: Manufacture of synthetic rubber in primary forms 136 | 2020: Manufacture of pesticides and other agrochemical products 137 | 2030: Manufacture of paints, varnishes and similar coatings, printing ink and mastics 138 | 2041: Manufacture of soap and detergents, cleaning and polishing preparations 139 | 2042: Manufacture of perfumes and toilet preparations 140 | 2051: Manufacture of explosives 141 | 2052: Manufacture of glues 142 | 2053: Manufacture of essential oils 143 | 2059: Manufacture of other chemical products nec 144 | 2060: Manufacture of man-made fibres 145 | 2110: Manufacture of basic pharmaceutical products 146 | 2120: Manufacture of pharmaceutical preparations 147 | 2211: Manufacture of rubber tyres and tubes; retreading and rebuilding of rubber tyres 148 | 2219: Manufacture of other rubber products 149 | 2221: Manufacture of plastic plates, sheets, tubes and profiles 150 | 2222: Manufacture of plastic packing goods 151 | 2223: Manufacture of builders’ ware of plastic 152 | 2229: Manufacture of other plastic products 153 | 2311: Manufacture of flat glass 154 | 2312: Shaping and processing of flat glass 155 | 2313: Manufacture of hollow glass 156 | 2314: Manufacture of glass fibres 157 | 2319: Manufacture and processing of other glass, including technical glassware 158 | 2320: Manufacture of refractory products 159 | 2331: Manufacture of ceramic tiles and flags 160 | 2332: 
Manufacture of bricks, tiles and construction products, in baked clay 161 | 2341: Manufacture of ceramic household and ornamental articles 162 | 2342: Manufacture of ceramic sanitary fixtures 163 | 2343: Manufacture of ceramic insulators and insulating fittings 164 | 2344: Manufacture of other technical ceramic products 165 | 2349: Manufacture of other ceramic products 166 | 2351: Manufacture of cement 167 | 2352: Manufacture of lime and plaster 168 | 2361: Manufacture of concrete products for construction purposes 169 | 2362: Manufacture of plaster products for construction purposes 170 | 2363: Manufacture of ready-mixed concrete 171 | 2364: Manufacture of mortars 172 | 2365: Manufacture of fibre cement 173 | 2369: Manufacture of other articles of concrete, plaster and cement 174 | 2370: Cutting, shaping and finishing of stone 175 | 2391: Production of abrasive products 176 | 2399: Manufacture of other non-metallic mineral products nec 177 | 2410: Manufacture of basic iron and steel and of ferro-alloys 178 | 2420: Manufacture of tubes, pipes, hollow profiles and related fittings, of steel 179 | 2431: Cold drawing of bars 180 | 2432: Cold rolling of narrow strip 181 | 2433: Cold forming or folding 182 | 2434: Cold drawing of wire 183 | 2441: Precious metals production 184 | 2442: Aluminium production 185 | 2443: Lead, zinc and tin production 186 | 2444: Copper production 187 | 2445: Other non-ferrous metal production 188 | 2446: Processing of nuclear fuel 189 | 2451: Casting of iron 190 | 2452: Casting of steel 191 | 2453: Casting of light metals 192 | 2454: Casting of other non-ferrous metals 193 | 2511: Manufacture of metal structures and parts of structures 194 | 2512: Manufacture of doors and windows of metal 195 | 2521: Manufacture of central heating radiators and boilers 196 | 2529: Manufacture of other tanks, reservoirs and containers of metal 197 | 2530: Manufacture of steam generators, except central heating hot water boilers 198 | 2540: Manufacture of 
weapons and ammunition 199 | 2550: Forging, pressing, stamping and roll-forming of metal; powder metallurgy 200 | 2561: Treatment and coating of metals 201 | 2562: Machining 202 | 2571: Manufacture of cutlery 203 | 2572: Manufacture of locks and hinges 204 | 2573: Manufacture of tools 205 | 2591: Manufacture of steel drums and similar containers 206 | 2592: Manufacture of light metal packaging 207 | 2593: Manufacture of wire products, chain and springs 208 | 2594: Manufacture of fasteners and screw machine products 209 | 2599: Manufacture of other fabricated metal products nec 210 | 2611: Manufacture of electronic components 211 | 2612: Manufacture of loaded electronic boards 212 | 2620: Manufacture of computers and peripheral equipment 213 | 2630: Manufacture of communication equipment 214 | 2640: Manufacture of consumer electronics 215 | 2651: Manufacture of instruments and appliances for measuring, testing and navigation 216 | 2652: Manufacture of watches and clocks 217 | 2660: Manufacture of irradiation, electromedical and electrotherapeutic equipment 218 | 2670: Manufacture of optical instruments and photographic equipment 219 | 2680: Manufacture of magnetic and optical media 220 | 2711: Manufacture of electric motors, generators and transformers 221 | 2712: Manufacture of electricity distribution and control apparatus 222 | 2720: Manufacture of batteries and accumulators 223 | 2731: Manufacture of fibre optic cables 224 | 2732: Manufacture of other electronic and electric wires and cables 225 | 2733: Manufacture of wiring devices 226 | 2740: Manufacture of electric lighting equipment 227 | 2751: Manufacture of electric domestic appliances 228 | 2752: Manufacture of non-electric domestic appliances 229 | 2790: Manufacture of other electrical equipment 230 | 2811: Manufacture of engines and turbines, except aircraft, vehicle and cycle engines 231 | 2812: Manufacture of fluid power equipment 232 | 2813: Manufacture of other pumps and compressors 233 | 2814: 
Manufacture of other taps and valves 234 | 2815: Manufacture of bearings, gears, gearing and driving elements 235 | 2821: Manufacture of ovens, furnaces and furnace burners 236 | 2822: Manufacture of lifting and handling equipment 237 | 2823: Manufacture of office machinery and equipment (except computers and peripheral equipment) 238 | 2824: Manufacture of power-driven hand tools 239 | 2825: Manufacture of non-domestic cooling and ventilation equipment 240 | 2829: Manufacture of other general-purpose machinery nec 241 | 2830: Manufacture of agricultural and forestry machinery 242 | 2841: Manufacture of metal forming machinery 243 | 2849: Manufacture of other machine tools 244 | 2891: Manufacture of machinery for metallurgy 245 | 2892: Manufacture of machinery for mining, quarrying and construction 246 | 2893: Manufacture of machinery for food, beverage and tobacco processing 247 | 2894: Manufacture of machinery for textile, apparel and leather production 248 | 2895: Manufacture of machinery for paper and paperboard production 249 | 2896: Manufacture of plastics and rubber machinery 250 | 2899: Manufacture of other special-purpose machinery nec 251 | 2910: Manufacture of motor vehicles 252 | 2920: Manufacture of bodies (coachwork) for motor vehicles; manufacture of trailers and semi-trailers 253 | 2931: Manufacture of electrical and electronic equipment for motor vehicles 254 | 2932: Manufacture of other parts and accessories for motor vehicles 255 | 3011: Building of ships and floating structures 256 | 3012: Building of pleasure and sporting boats 257 | 3020: Manufacture of railway locomotives and rolling stock 258 | 3030: Manufacture of air and spacecraft and related machinery 259 | 3040: Manufacture of military fighting vehicles 260 | 3091: Manufacture of motorcycles 261 | 3092: Manufacture of bicycles and invalid carriages 262 | 3099: Manufacture of other transport equipment nec 263 | 3101: Manufacture of office and shop furniture 264 | 3102: Manufacture of 
kitchen furniture 265 | 3103: Manufacture of mattresses 266 | 3109: Manufacture of other furniture 267 | 3211: Striking of coins 268 | 3212: Manufacture of jewellery and related articles 269 | 3213: Manufacture of imitation jewellery and related articles 270 | 3220: Manufacture of musical instruments 271 | 3230: Manufacture of sports goods 272 | 3240: Manufacture of games and toys 273 | 3250: Manufacture of medical and dental instruments and supplies 274 | 3291: Manufacture of brooms and brushes 275 | 3299: Other manufacturing nec 276 | 3311: Repair of fabricated metal products 277 | 3312: Repair of machinery 278 | 3313: Repair of electronic and optical equipment 279 | 3314: Repair of electrical equipment 280 | 3315: Repair and maintenance of ships and boats 281 | 3316: Repair and maintenance of aircraft and spacecraft 282 | 3317: Repair and maintenance of other transport equipment 283 | 3319: Repair of other equipment 284 | 3320: Installation of industrial machinery and equipment 285 | 3511: Production of electricity 286 | 3512: Transmission of electricity 287 | 3513: Distribution of electricity 288 | 3514: Trade of electricity 289 | 3521: Manufacture of gas 290 | 3522: Distribution of gaseous fuels through mains 291 | 3523: Trade of gas through mains 292 | 3530: Steam and air conditioning supply 293 | 3600: Water collection, treatment and supply 294 | 3700: Sewerage 295 | 3811: Collection of non-hazardous waste 296 | 3812: Collection of hazardous waste 297 | 3821: Treatment and disposal of non-hazardous waste 298 | 3822: Treatment and disposal of hazardous waste 299 | 3831: Dismantling of wrecks 300 | 3832: Recovery of sorted materials 301 | 3900: Remediation activities and other waste management services 302 | 4110: Development of building projects 303 | 4120: Construction of residential and non-residential buildings 304 | 4211: Construction of roads and motorways 305 | 4212: Construction of railways and underground railways 306 | 4213: Construction of bridges 
and tunnels 307 | 4221: Construction of utility projects for fluids 308 | 4222: Construction of utility projects for electricity and telecommunications 309 | 4291: Construction of water projects 310 | 4299: Construction of other civil engineering projects nec 311 | 4311: Demolition 312 | 4312: Site preparation 313 | 4313: Test drilling and boring 314 | 4321: Electrical installation 315 | 4322: Plumbing, heat and air-conditioning installation 316 | 4329: Other construction installation 317 | 4331: Plastering 318 | 4332: Joinery installation 319 | 4333: Floor and wall covering 320 | 4334: Painting and glazing 321 | 4339: Other building completion and finishing 322 | 4391: Roofing activities 323 | 4399: Other specialised construction activities nec 324 | 4511: Sale of cars and light motor vehicles 325 | 4519: Sale of other motor vehicles 326 | 4520: Maintenance and repair of motor vehicles 327 | 4531: Wholesale trade of motor vehicle parts and accessories 328 | 4532: Retail trade of motor vehicle parts and accessories 329 | 4540: Sale, maintenance and repair of motorcycles and related parts and accessories 330 | 4611: Agents involved in the sale of agricultural raw materials, live animals, textile raw materials and semi-finished goods 331 | 4612: Agents involved in the sale of fuels, ores, metals and industrial chemicals 332 | 4613: Agents involved in the sale of timber and building materials 333 | 4614: Agents involved in the sale of machinery, industrial equipment, ships and aircraft 334 | 4615: Agents involved in the sale of furniture, household goods, hardware and ironmongery 335 | 4616: Agents involved in the sale of textiles, clothing, fur, footwear and leather goods 336 | 4617: Agents involved in the sale of food, beverages and tobacco 337 | 4618: Agents specialised in the sale of other particular products 338 | 4619: Agents involved in the sale of a variety of goods 339 | 4621: Wholesale of grain, unmanufactured tobacco, seeds and animal feeds 340 | 4622: 
Wholesale of flowers and plants 341 | 4623: Wholesale of live animals 342 | 4624: Wholesale of hides, skins and leather 343 | 4631: Wholesale of fruit and vegetables 344 | 4632: Wholesale of meat and meat products 345 | 4633: Wholesale of dairy products, eggs and edible oils and fats 346 | 4634: Wholesale of beverages 347 | 4635: Wholesale of tobacco products 348 | 4636: Wholesale of sugar and chocolate and sugar confectionery 349 | 4637: Wholesale of coffee, tea, cocoa and spices 350 | 4638: Wholesale of other food, including fish, crustaceans and molluscs 351 | 4639: Non-specialised wholesale of food, beverages and tobacco 352 | 4641: Wholesale of textiles 353 | 4642: Wholesale of clothing and footwear 354 | 4643: Wholesale of electrical household appliances 355 | 4644: Wholesale of china and glassware and cleaning materials 356 | 4645: Wholesale of perfume and cosmetics 357 | 4646: Wholesale of pharmaceutical goods 358 | 4647: Wholesale of furniture, carpets and lighting equipment 359 | 4648: Wholesale of watches and jewellery 360 | 4649: Wholesale of other household goods 361 | 4651: Wholesale of computers, computer peripheral equipment and software 362 | 4652: Wholesale of electronic and telecommunications equipment and parts 363 | 4661: Wholesale of agricultural machinery, equipment and supplies 364 | 4662: Wholesale of machine tools 365 | 4663: Wholesale of mining, construction and civil engineering machinery 366 | 4664: Wholesale of machinery for the textile industry and of sewing and knitting machines 367 | 4665: Wholesale of office furniture 368 | 4666: Wholesale of other office machinery and equipment 369 | 4669: Wholesale of other machinery and equipment 370 | 4671: Wholesale of solid, liquid and gaseous fuels and related products 371 | 4672: Wholesale of metals and metal ores 372 | 4673: Wholesale of wood, construction materials and sanitary equipment 373 | 4674: Wholesale of hardware, plumbing and heating equipment and supplies 374 | 4675: Wholesale 
of chemical products 375 | 4676: Wholesale of other intermediate products 376 | 4677: Wholesale of waste and scrap 377 | 4690: Non-specialised wholesale trade 378 | 4711: Retail sale in non-specialised stores with food, beverages or tobacco predominating 379 | 4719: Other retail sale in non-specialised stores 380 | 4721: Retail sale of fruit and vegetables in specialised stores 381 | 4722: Retail sale of meat and meat products in specialised stores 382 | 4723: Retail sale of fish, crustaceans and molluscs in specialised stores 383 | 4724: Retail sale of bread, cakes, flour confectionery and sugar confectionery in specialised stores 384 | 4725: Retail sale of beverages in specialised stores 385 | 4726: Retail sale of tobacco products in specialised stores 386 | 4729: Other retail sale of food in specialised stores 387 | 4730: Retail sale of automotive fuel in specialised stores 388 | 4741: Retail sale of computers, peripheral units and software in specialised stores 389 | 4742: Retail sale of telecommunications equipment in specialised stores 390 | 4743: Retail sale of audio and video equipment in specialised stores 391 | 4751: Retail sale of textiles in specialised stores 392 | 4752: Retail sale of hardware, paints and glass in specialised stores 393 | 4753: Retail sale of carpets, rugs, wall and floor coverings in specialised stores 394 | 4754: Retail sale of electrical household appliances in specialised stores 395 | 4759: Retail sale of furniture, lighting equipment and other household articles in specialised stores 396 | 4761: Retail sale of books in specialised stores 397 | 4762: Retail sale of newspapers and stationery in specialised stores 398 | 4763: Retail sale of music and video recordings in specialised stores 399 | 4764: Retail sale of sporting equipment in specialised stores 400 | 4765: Retail sale of games and toys in specialised stores 401 | 4771: Retail sale of clothing in specialised stores 402 | 4772: Retail sale of footwear and leather goods in 
specialised stores 403 | 4773: Dispensing chemist in specialised stores 404 | 4774: Retail sale of medical and orthopaedic goods in specialised stores 405 | 4775: Retail sale of cosmetic and toilet articles in specialised stores 406 | 4776: Retail sale of flowers, plants, seeds, fertilisers, pet animals and pet food in specialised stores 407 | 4777: Retail sale of watches and jewellery in specialised stores 408 | 4778: Other retail sale of new goods in specialised stores 409 | 4779: Retail sale of second-hand goods in stores 410 | 4781: Retail sale via stalls and markets of food, beverages and tobacco products 411 | 4782: Retail sale via stalls and markets of textiles, clothing and footwear 412 | 4789: Retail sale via stalls and markets of other goods 413 | 4791: Retail sale via mail order houses or via Internet 414 | 4799: Other retail sale not in stores, stalls or markets 415 | 4910: Passenger rail transport, interurban 416 | 4920: Freight rail transport 417 | 4931: Urban and suburban passenger land transport 418 | 4932: Taxi operation 419 | 4939: Other passenger land transport nec 420 | 4941: Freight transport by road 421 | 4942: Removal services 422 | 4950: Transport via pipeline 423 | 5010: Sea and coastal passenger water transport 424 | 5020: Sea and coastal freight water transport 425 | 5030: Inland passenger water transport 426 | 5040: Inland freight water transport 427 | 5110: Passenger air transport 428 | 5121: Freight air transport 429 | 5122: Space transport 430 | 5210: Warehousing and storage 431 | 5221: Service activities incidental to land transportation 432 | 5222: Service activities incidental to water transportation 433 | 5223: Service activities incidental to air transportation 434 | 5224: Cargo handling 435 | 5229: Other transportation support activities 436 | 5310: Postal activities under universal service obligation 437 | 5320: Other postal and courier activities 438 | 5510: Hotels and similar accommodation 439 | 5520: Holiday and other 
short-stay accommodation 440 | 5530: Camping grounds, recreational vehicle parks and trailer parks 441 | 5590: Other accommodation 442 | 5610: Restaurants and mobile food service activities 443 | 5621: Event catering activities 444 | 5629: Other food service activities 445 | 5630: Beverage serving activities 446 | 5811: Book publishing 447 | 5812: Publishing of directories and mailing lists 448 | 5813: Publishing of newspapers 449 | 5814: Publishing of journals and periodicals 450 | 5819: Other publishing activities 451 | 5821: Publishing of computer games 452 | 5829: Other software publishing 453 | 5911: Motion picture, video and television programme production activities 454 | 5912: Motion picture, video and television programme post-production activities 455 | 5913: Motion picture, video and television programme distribution activities 456 | 5914: Motion picture projection activities 457 | 5920: Sound recording and music publishing activities 458 | 6010: Radio broadcasting 459 | 6020: Television programming and broadcasting activities 460 | 6110: Wired telecommunications activities 461 | 6120: Wireless telecommunications activities 462 | 6130: Satellite telecommunications activities 463 | 6190: Other telecommunications activities 464 | 6201: Computer programming activities 465 | 6202: Computer consultancy activities 466 | 6203: Computer facilities management activities 467 | 6209: Other information technology and computer service activities 468 | 6311: Data processing, hosting and related activities 469 | 6312: Web portals 470 | 6391: News agency activities 471 | 6399: Other information service activities nec 472 | 6411: Central banking 473 | 6419: Other monetary intermediation 474 | 6420: Activities of holding companies 475 | 6430: Trusts, funds and similar financial entities 476 | 6491: Financial leasing 477 | 6492: Other credit granting 478 | 6499: Other financial service activities, except insurance and pension funding, nec 479 | 6511: Life insurance 480 | 
6512: Non-life insurance 481 | 6520: Reinsurance 482 | 6530: Pension funding 483 | 6611: Administration of financial markets 484 | 6612: Security and commodity contracts brokerage 485 | 6619: Other activities auxiliary to financial services, except insurance and pension funding 486 | 6621: Risk and damage evaluation 487 | 6622: Activities of insurance agents and brokers 488 | 6629: Other activities auxiliary to insurance and pension funding 489 | 6630: Fund management activities 490 | 6810: Buying and selling of own real estate 491 | 6820: Renting and operating of own or leased real estate 492 | 6831: Real estate agencies 493 | 6832: Management of real estate on a fee or contract basis 494 | 6910: Legal activities 495 | 6920: Accounting, bookkeeping and auditing activities; tax consultancy 496 | 7010: Activities of head offices 497 | 7021: Public relations and communication activities 498 | 7022: Business and other management consultancy activities 499 | 7111: Architectural activities 500 | 7112: Engineering activities and related technical consultancy 501 | 7120: Technical testing and analysis 502 | 7211: Research and experimental development on biotechnology 503 | 7219: Other research and experimental development on natural sciences and engineering 504 | 7220: Research and experimental development on social sciences and humanities 505 | 7311: Advertising agencies 506 | 7312: Media representation 507 | 7320: Market research and public opinion polling 508 | 7410: Specialised design activities 509 | 7420: Photographic activities 510 | 7430: Translation and interpretation activities 511 | 7490: Other professional, scientific and technical activities nec 512 | 7500: Veterinary activities 513 | 7711: Renting and leasing of cars and light motor vehicles 514 | 7712: Renting and leasing of trucks 515 | 7721: Renting and leasing of recreational and sports goods 516 | 7722: Renting of video tapes and disks 517 | 7729: Renting and leasing of other personal and household 
goods 518 | 7731: Renting and leasing of agricultural machinery and equipment 519 | 7732: Renting and leasing of construction and civil engineering machinery and equipment 520 | 7733: Renting and leasing of office machinery and equipment (including computers) 521 | 7734: Renting and leasing of water transport equipment 522 | 7735: Renting and leasing of air transport equipment 523 | 7739: Renting and leasing of other machinery, equipment and tangible goods nec 524 | 7740: Leasing of intellectual property and similar products, except copyrighted works 525 | 7810: Activities of employment placement agencies 526 | 7820: Temporary employment agency activities 527 | 7830: Other human resources provision 528 | 7911: Travel agency activities 529 | 7912: Tour operator activities 530 | 7990: Other reservation service and related activities 531 | 8010: Private security activities 532 | 8020: Security systems service activities 533 | 8030: Investigation activities 534 | 8110: Combined facilities support activities 535 | 8121: General cleaning of buildings 536 | 8122: Other building and industrial cleaning activities 537 | 8129: Other cleaning activities 538 | 8130: Landscape service activities 539 | 8211: Combined office administrative service activities 540 | 8219: Photocopying, document preparation and other specialised office support activities 541 | 8220: Activities of call centres 542 | 8230: Organisation of conventions and trade shows 543 | 8291: Activities of collection agencies and credit bureaus 544 | 8292: Packaging activities 545 | 8299: Other business support service activities nec 546 | 8411: General public administration activities 547 | 8412: Regulation of the activities of providing health care, education, cultural services and other social services, excluding social security 548 | 8413: Regulation of and contribution to more efficient operation of businesses 549 | 8421: Foreign affairs 550 | 8422: Defence activities 551 | 8423: Justice and judicial activities 
552 | 8424: Public order and safety activities 553 | 8425: Fire service activities 554 | 8430: Compulsory social security activities 555 | 8510: Pre-primary education 556 | 8520: Primary education 557 | 8531: General secondary education 558 | 8532: Technical and vocational secondary education 559 | 8541: Post-secondary non-tertiary education 560 | 8542: Tertiary education 561 | 8551: Sports and recreation education 562 | 8552: Cultural education 563 | 8553: Driving school activities 564 | 8559: Other education nec 565 | 8560: Educational support activities 566 | 8610: Hospital activities 567 | 8621: General medical practice activities 568 | 8622: Specialist medical practice activities 569 | 8623: Dental practice activities 570 | 8690: Other human health activities 571 | 8710: Residential nursing care activities 572 | 8720: Residential care activities for learning disabilities, mental health and substance abuse 573 | 8730: Residential care activities for the elderly and disabled 574 | 8790: Other residential care activities 575 | 8810: Social work activities without accommodation for the elderly and disabled 576 | 8891: Child day-care activities 577 | 8899: Other social work activities without accommodation nec 578 | 9001: Performing arts 579 | 9002: Support activities to performing arts 580 | 9003: Artistic creation 581 | 9004: Operation of arts facilities 582 | 9101: Library and archive activities 583 | 9102: Museum activities 584 | 9103: Operation of historical sites and buildings and similar visitor attractions 585 | 9104: Botanical and zoological gardens and nature reserve activities 586 | 9200: Gambling and betting activities 587 | 9311: Operation of sports facilities 588 | 9312: Activities of sport clubs 589 | 9313: Fitness facilities 590 | 9319: Other sports activities 591 | 9321: Activities of amusement parks and theme parks 592 | 9329: Other amusement and recreation activities 593 | 9411: Activities of business and employers membership organisations 594 | 
9412: Activities of professional membership organisations 595 | 9420: Activities of trade unions 596 | 9491: Activities of religious organisations 597 | 9492: Activities of political organisations 598 | 9499: Activities of other membership organisations nec 599 | 9511: Repair of computers and peripheral equipment 600 | 9512: Repair of communication equipment 601 | 9521: Repair of consumer electronics 602 | 9522: Repair of household appliances and home and garden equipment 603 | 9523: Repair of footwear and leather goods 604 | 9524: Repair of furniture and home furnishings 605 | 9525: Repair of watches, clocks and jewellery 606 | 9529: Repair of other personal and household goods 607 | 9601: Washing and (dry-)cleaning of textile and fur products 608 | 9602: Hairdressing and other beauty treatment 609 | 9603: Funeral and related activities 610 | 9604: Physical well-being activities 611 | 9609: Other personal service activities nec 612 | 9700: Activities of households as employers of domestic personnel 613 | 9810: Undifferentiated goods-producing activities of private households for own use 614 | 9820: Undifferentiated service-producing activities of private households for own use 615 | 9900: Activities of extraterritorial organisations and bodies 616 | -------------------------------------------------------------------------------- /src/sic_soc_llm/example_data/soc_4d_condensed.txt: -------------------------------------------------------------------------------- 1 | 1111: Chief executives and senior officials 2 | 1112: Elected officers and representatives 3 | 1121: Production managers and directors in manufacturing 4 | 1122: Production managers and directors in construction 5 | 1123: Production managers and directors in mining and energy 6 | 1131: Financial managers and directors 7 | 1132: Marketing, sales and advertising directors 8 | 1133: Public relations and communications directors 9 | 1134: Purchasing managers and directors 10 | 1135: Charitable organisation 
managers and directors 11 | 1136: Human resource managers and directors 12 | 1137: Information technology directors 13 | 1139: Functional managers and directors n.e.c. 14 | 1140: Directors in logistics, warehousing and transport 15 | 1150: Managers and directors in retail and wholesale 16 | 1161: Officers in armed forces 17 | 1162: Senior police officers 18 | 1163: Senior officers in fire, ambulance, prison and related services 19 | 1171: Health services and public health managers and directors 20 | 1172: Social services managers and directors 21 | 1211: Managers and proprietors in agriculture and horticulture 22 | 1212: Managers and proprietors in forestry, fishing and related services 23 | 1221: Hotel and accommodation managers and proprietors 24 | 1222: Restaurant and catering establishment managers and proprietors 25 | 1223: Publicans and managers of licensed premises 26 | 1224: Leisure and sports managers and proprietors 27 | 1225: Travel agency managers and proprietors 28 | 1231: Health care practice managers 29 | 1232: Residential, day and domiciliary care managers and proprietors 30 | 1233: Early education and childcare services proprietors 31 | 1241: Managers in transport and distribution 32 | 1242: Managers in storage and warehousing 33 | 1243: Managers in logistics 34 | 1251: Property, housing and estate managers 35 | 1252: Garage managers and proprietors 36 | 1253: Hairdressing and beauty salon managers and proprietors 37 | 1254: Waste disposal and environmental services managers 38 | 1255: Managers and directors in the creative industries 39 | 1256: Betting shop and gambling establishment managers 40 | 1257: Hire services managers and proprietors 41 | 1258: Directors in consultancy services 42 | 1259: Managers and proprietors in other services n.e.c. 
43 | 2111: Chemical scientists 44 | 2112: Biological scientists 45 | 2113: Biochemists and biomedical scientists 46 | 2114: Physical scientists 47 | 2115: Social and humanities scientists 48 | 2119: Natural and social science professionals n.e.c. 49 | 2121: Civil engineers 50 | 2122: Mechanical engineers 51 | 2123: Electrical engineers 52 | 2124: Electronics engineers 53 | 2125: Production and process engineers 54 | 2126: Aerospace engineers 55 | 2127: Engineering project managers and project engineers 56 | 2129: Engineering professionals n.e.c. 57 | 2131: IT project managers 58 | 2132: IT managers 59 | 2133: IT business analysts, architects and systems designers 60 | 2134: Programmers and software development professionals 61 | 2135: Cyber security professionals 62 | 2136: IT quality and testing professionals 63 | 2137: IT network professionals 64 | 2139: Information technology professionals n.e.c. 65 | 2141: Web design professionals 66 | 2142: Graphic and multimedia designers 67 | 2151: Conservation professionals 68 | 2152: Environment professionals 69 | 2161: Research and development (r&d) managers 70 | 2162: Other researchers, unspecified discipline 71 | 2211: Generalist medical practitioners 72 | 2212: Specialist medical practitioners 73 | 2221: Physiotherapists 74 | 2222: Occupational therapists 75 | 2223: Speech and language therapists 76 | 2224: Psychotherapists and cognitive behaviour therapists 77 | 2225: Clinical psychologists 78 | 2226: Other psychologists 79 | 2229: Therapy professionals n.e.c. 
80 | 2231: Midwifery nurses 81 | 2232: Registered community nurses 82 | 2233: Registered specialist nurses 83 | 2234: Registered nurse practitioners 84 | 2235: Registered mental health nurses 85 | 2236: Registered children's nurses 86 | 2237: Other registered nursing professionals 87 | 2240: Veterinarians 88 | 2251: Pharmacists 89 | 2252: Optometrists 90 | 2253: Dental practitioners 91 | 2254: Medical radiographers 92 | 2255: Paramedics 93 | 2256: Podiatrists 94 | 2259: Other health professionals n.e.c. 95 | 2311: Higher education teaching professionals 96 | 2312: Further education teaching professionals 97 | 2313: Secondary education teaching professionals 98 | 2314: Primary education teaching professionals 99 | 2315: Nursery education teaching professionals 100 | 2316: Special and additional needs education teaching professionals 101 | 2317: Teachers of english as a foreign language 102 | 2319: Teaching professionals n.e.c. 103 | 2321: Head teachers and principals 104 | 2322: Education managers 105 | 2323: Education advisers and school inspectors 106 | 2324: Early education and childcare services managers 107 | 2329: Other educational professionals n.e.c 108 | 2411: Barristers and judges 109 | 2412: Solicitors and lawyers 110 | 2419: Legal professionals n.e.c. 111 | 2421: Chartered and certified accountants 112 | 2422: Finance and investment analysts and advisers 113 | 2423: Taxation experts 114 | 2431: Management consultants and business analysts 115 | 2432: Marketing and commercial managers 116 | 2433: Actuaries, economists and statisticians 117 | 2434: Business and related research professionals 118 | 2435: Professional/chartered company secretaries 119 | 2439: Business, research and administrative professionals n.e.c. 
120 | 2440: Business and financial project management professionals 121 | 2451: Architects 122 | 2452: Chartered architectural technologists, planning officers and consultants 123 | 2453: Quantity surveyors 124 | 2454: Chartered surveyors 125 | 2455: Construction project managers and related professionals 126 | 2461: Social workers 127 | 2462: Probation officers 128 | 2463: Clergy 129 | 2464: Youth work professionals 130 | 2469: Welfare professionals n.e.c. 131 | 2471: Librarians 132 | 2472: Archivists, conservators and curators 133 | 2481: Quality control and planning engineers 134 | 2482: Quality assurance and regulatory professionals 135 | 2483: Environmental health professionals 136 | 2491: Newspaper, periodical and broadcast editors 137 | 2492: Newspaper and periodical broadcast journalists and reporters 138 | 2493: Public relations professionals 139 | 2494: Advertising accounts managers and creative directors 140 | 3111: Laboratory technicians 141 | 3112: Electrical and electronics technicians 142 | 3113: Engineering technicians 143 | 3114: Building and civil engineering technicians 144 | 3115: Quality assurance technicians 145 | 3116: Planning, process and production technicians 146 | 3119: Science, engineering and production technicians n.e.c. 147 | 3120: CAD, drawing and architectural technicians 148 | 3131: IT operations technicians 149 | 3132: IT user support technicians 150 | 3133: Database administrators and web content technicians 151 | 3211: Dispensing opticians 152 | 3212: Pharmaceutical technicians 153 | 3213: Medical and dental technicians 154 | 3214: Complementary health associate professionals 155 | 3219: Health associate professionals n.e.c. 156 | 3221: Youth and community workers 157 | 3222: Child and early years officers 158 | 3223: Housing officers 159 | 3224: Counsellors 160 | 3229: Welfare and housing associate professionals n.e.c. 
161 | 3231: Higher level teaching assistants 162 | 3232: Early education and childcare practitioners 163 | 3240: Veterinary nurses 164 | 3311: Non-commissioned officers and other ranks 165 | 3312: Police officers (sergeant and below) 166 | 3313: Fire service officers (watch manager and below) 167 | 3314: Prison service officers (below principal officer) 168 | 3319: Protective service associate professionals n.e.c. 169 | 3411: Artists 170 | 3412: Authors, writers and translators 171 | 3413: Actors, entertainers and presenters 172 | 3414: Dancers and choreographers 173 | 3415: Musicians 174 | 3416: Arts officers, producers and directors 175 | 3417: Photographers, audio-visual and broadcasting equipment operators 176 | 3421: Interior designers 177 | 3422: Clothing, fashion and accessories designers 178 | 3429: Design occupations n.e.c. 179 | 3431: Sports players 180 | 3432: Sports coaches, instructors and officials 181 | 3433: Fitness and wellbeing instructors 182 | 3511: Aircraft pilots and air traffic controllers 183 | 3512: Ship and hovercraft officers 184 | 3520: Legal associate professionals 185 | 3531: Brokers 186 | 3532: Insurance underwriters 187 | 3533: Financial and accounting technicians 188 | 3534: Financial accounts managers 189 | 3541: Estimators, valuers and assessors 190 | 3542: Importers and exporters 191 | 3543: Project support officers 192 | 3544: Data analysts 193 | 3549: Business associate professionals n.e.c. 
194 | 3551: Buyers and procurement officers 195 | 3552: Business sales executives 196 | 3553: Merchandisers 197 | 3554: Advertising and marketing associate professionals 198 | 3555: Estate agents and auctioneers 199 | 3556: Sales accounts and business development managers 200 | 3557: Events managers and organisers 201 | 3560: Public services associate professionals 202 | 3571: Human resources and industrial relations officers 203 | 3572: Careers advisers and vocational guidance specialists 204 | 3573: Information technology trainers 205 | 3574: Other vocational and industrial trainers 206 | 3581: Inspectors of standards and regulations 207 | 3582: Health and safety managers and officers 208 | 4111: National government administrative occupations 209 | 4112: Local government administrative occupations 210 | 4113: Officers of non-governmental organisations 211 | 4121: Credit controllers 212 | 4122: Book-keepers, payroll managers and wages clerks 213 | 4123: Bank and post office clerks 214 | 4124: Finance officers 215 | 4129: Financial administrative occupations n.e.c. 216 | 4131: Records clerks and assistants 217 | 4132: Pensions and insurance clerks and assistants 218 | 4133: Stock control clerks and assistants 219 | 4134: Transport and distribution clerks and assistants 220 | 4135: Library clerks and assistants 221 | 4136: Human resources administrative occupations 222 | 4141: Office managers 223 | 4142: Office supervisors 224 | 4143: Customer service managers 225 | 4151: Sales administrators 226 | 4152: Data entry administrators 227 | 4159: Other administrative occupations n.e.c. 
228 | 4211: Medical secretaries 229 | 4212: Legal secretaries 230 | 4213: School secretaries 231 | 4214: Company secretaries and administrators 232 | 4215: Personal assistants and other secretaries 233 | 4216: Receptionists 234 | 4217: Typists and related keyboard occupations 235 | 5111: Farmers 236 | 5112: Horticultural trades 237 | 5113: Gardeners and landscape gardeners 238 | 5114: Groundsmen and greenkeepers 239 | 5119: Agricultural and fishing trades n.e.c. 240 | 5211: Sheet metal workers 241 | 5212: Metal plate workers, smiths, moulders and related occupations 242 | 5213: Welding trades 243 | 5214: Pipe fitters 244 | 5221: Metal machining setters and setter-operators 245 | 5222: Tool makers, tool fitters and markers-out 246 | 5223: Metal working production and maintenance fitters 247 | 5224: Precision instrument makers and repairers 248 | 5225: Air-conditioning and refrigeration installers and repairers 249 | 5231: Vehicle technicians, mechanics and electricians 250 | 5232: Vehicle body builders and repairers 251 | 5233: Vehicle paint technicians 252 | 5234: Aircraft maintenance and related trades 253 | 5235: Boat and ship builders and repairers 254 | 5236: Rail and rolling stock builders and repairers 255 | 5241: Electricians and electrical fitters 256 | 5242: Telecoms and related network installers and repairers 257 | 5243: Tv, video and audio servicers and repairers 258 | 5244: Computer system and equipment installers and servicers 259 | 5245: Security system installers and repairers 260 | 5246: Electrical service and maintenance mechanics and repairers 261 | 5249: Electrical and electronic trades n.e.c. 
262 | 5250: Skilled metal, electrical and electronic trades supervisors 263 | 5311: Steel erectors 264 | 5312: Stonemasons and related trades 265 | 5313: Bricklayers 266 | 5314: Roofers, roof tilers and slaters 267 | 5315: Plumbers and heating and ventilating installers and repairers 268 | 5316: Carpenters and joiners 269 | 5317: Glaziers, window fabricators and fitters 270 | 5319: Construction and building trades n.e.c. 271 | 5321: Plasterers 272 | 5322: Floorers and wall tilers 273 | 5323: Painters and decorators 274 | 5330: Construction and building trades supervisors 275 | 5411: Upholsterers 276 | 5412: Footwear and leather working trades 277 | 5413: Tailors and dressmakers 278 | 5419: Textiles, garments and related trades n.e.c. 279 | 5421: Pre-press technicians 280 | 5422: Printers 281 | 5423: Print finishing and binding workers 282 | 5431: Butchers 283 | 5432: Bakers and flour confectioners 284 | 5433: Fishmongers and poultry dressers 285 | 5434: Chefs 286 | 5435: Cooks 287 | 5436: Catering and bar managers 288 | 5441: Glass and ceramics makers, decorators and finishers 289 | 5442: Furniture makers and other craft woodworkers 290 | 5443: Florists 291 | 5449: Other skilled trades n.e.c. 
292 | 6111: Early education and childcare assistants 293 | 6112: Teaching assistants 294 | 6113: Educational support assistants 295 | 6114: Childminders 296 | 6116: Nannies and au pairs 297 | 6117: Playworkers 298 | 6121: Pest control officers 299 | 6129: Animal care services occupations n.e.c 300 | 6131: Nursing auxiliaries and assistants 301 | 6132: Ambulance staff (excluding paramedics) 302 | 6133: Dental nurses 303 | 6134: Houseparents and residential wardens 304 | 6135: Care workers and home carers 305 | 6136: Senior care workers 306 | 6137: Care escorts 307 | 6138: Undertakers, mortuary and crematorium assistants 308 | 6211: Sports and leisure assistants 309 | 6212: Travel agents 310 | 6213: Air travel assistants 311 | 6214: Rail travel assistants 312 | 6219: Leisure and travel service occupations n.e.c. 313 | 6221: Hairdressers and barbers 314 | 6222: Beauticians and related occupations 315 | 6231: Housekeepers and related occupations 316 | 6232: Caretakers 317 | 6240: Cleaning and housekeeping managers and supervisors 318 | 6250: Bed and breakfast and guest house owners and proprietors 319 | 6311: Police community support officers 320 | 6312: Parking and civil enforcement occupations 321 | 7111: Sales and retail assistants 322 | 7112: Retail cashiers and check-out operators 323 | 7113: Telephone salespersons 324 | 7114: Pharmacy and optical dispensing assistants 325 | 7115: Vehicle and parts salespersons and advisers 326 | 7121: Collector salespersons and credit agents 327 | 7122: Debt, rent and other cash collectors 328 | 7123: Roundspersons and van salespersons 329 | 7124: Market and street traders and assistants 330 | 7125: Visual merchandisers and related occupations 331 | 7129: Sales related occupations n.e.c. 
332 | 7131: Shopkeepers and owners - retail and wholesale 333 | 7132: Sales supervisors - retail and wholesale 334 | 7211: Call and contact centre occupations 335 | 7212: Telephonists 336 | 7213: Communication operators 337 | 7214: Market research interviewers 338 | 7219: Customer service occupations n.e.c. 339 | 7220: Customer service supervisors 340 | 8111: Food, drink and tobacco process operatives 341 | 8112: Textile process operatives 342 | 8113: Chemical and related process operatives 343 | 8114: Plastics process operatives 344 | 8115: Metal making and treating process operatives 345 | 8119: Process operatives n.e.c. 346 | 8120: Metal working machine operatives 347 | 8131: Paper and wood machine operatives 348 | 8132: Mining and quarry workers and related operatives 349 | 8133: Energy plant operatives 350 | 8134: Water and sewerage plant operatives 351 | 8135: Printing machine assistants 352 | 8139: Plant and machine operatives n.e.c. 353 | 8141: Assemblers (electrical and electronic products) 354 | 8142: Assemblers (vehicles and metal goods) 355 | 8143: Routine inspectors and testers 356 | 8144: Weighers, graders and sorters 357 | 8145: Tyre, exhaust and windscreen fitters 358 | 8146: Sewing machinists 359 | 8149: Assemblers and routine operatives n.e.c. 360 | 8151: Scaffolders, stagers and riggers 361 | 8152: Road construction operatives 362 | 8153: Rail construction and maintenance operatives 363 | 8159: Construction operatives n.e.c. 364 | 8160: Production, factory and assembly supervisors 365 | 8211: Heavy and large goods vehicle drivers 366 | 8212: Bus and coach drivers 367 | 8213: Taxi and cab drivers and chauffeurs 368 | 8214: Delivery drivers and couriers 369 | 8215: Driving instructors 370 | 8219: Road transport drivers n.e.c. 371 | 8221: Crane drivers 372 | 8222: Fork-lift truck drivers 373 | 8229: Mobile machine drivers and operatives n.e.c. 
374 | 8231: Train and tram drivers 375 | 8232: Marine and waterways transport operatives 376 | 8233: Air transport operatives 377 | 8234: Rail transport operatives 378 | 8239: Other drivers and transport operatives n.e.c. 379 | 9111: Farm workers 380 | 9112: Forestry and related workers 381 | 9119: Fishing and other elementary agriculture occupations n.e.c. 382 | 9121: Groundworkers 383 | 9129: Elementary construction occupations n.e.c. 384 | 9131: Industrial cleaning process occupations 385 | 9132: Packers, bottlers, canners and fillers 386 | 9139: Elementary process plant occupations n.e.c. 387 | 9211: Postal workers, mail sorters and messengers 388 | 9219: Elementary administration occupations n.e.c. 389 | 9221: Window cleaners 390 | 9222: Street cleaners 391 | 9223: Cleaners and domestics 392 | 9224: Launderers, dry cleaners and pressers 393 | 9225: Refuse and salvage occupations 394 | 9226: Vehicle valeters and cleaners 395 | 9229: Elementary cleaning occupations n.e.c. 396 | 9231: Security guards and related occupations 397 | 9232: School midday and crossing patrol occupations 398 | 9233: Exam invigilators 399 | 9241: Shelf fillers 400 | 9249: Elementary sales occupations n.e.c. 401 | 9251: Elementary storage supervisors 402 | 9252: Warehouse operatives 403 | 9253: Delivery operatives 404 | 9259: Elementary storage occupations n.e.c. 405 | 9261: Bar and catering supervisors 406 | 9262: Hospital porters 407 | 9263: Kitchen and catering assistants 408 | 9264: Waiters and waitresses 409 | 9265: Bar staff 410 | 9266: Coffee shop workers 411 | 9267: Leisure and theme park attendants 412 | 9269: Other elementary services occupations n.e.c. 
413 | -------------------------------------------------------------------------------- /src/sic_soc_llm/example_data/toy_index.txt: -------------------------------------------------------------------------------- 1 | 01: Cat 2 | 02: Dog 3 | 03: Fish 4 | 05: Mouse 5 | -------------------------------------------------------------------------------- /src/sic_soc_llm/llm.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from functools import lru_cache 4 | from collections import defaultdict 5 | from typing import Optional, Union 6 | from langchain.chains.llm import LLMChain 7 | from langchain.output_parsers import PydanticOutputParser 8 | from langchain_google_vertexai import VertexAI 9 | from langchain_openai import ChatOpenAI 10 | 11 | from sic_soc_llm import get_config 12 | from sic_soc_llm.prompt import ( 13 | SOC_PROMPT_PYDANTIC, 14 | SIC_PROMPT_PYDANTIC, 15 | SIC_PROMPT_RAG, 16 | GENERAL_PROMPT_RAG, 17 | ) 18 | from sic_soc_llm.data_models.response_model import SocResponse, SicResponse, RagResponse 19 | from sic_soc_llm.embedding import EmbeddingHandler 20 | from sic_soc_llm.data_models.sicDB import sic_meta 21 | from sic_soc_llm.data_models.sic_data_access import load_sic_index, load_sic_structure 22 | from sic_soc_llm.data_models.sic_hierarchy import load_hierarchy 23 | 24 | logger = logging.getLogger(__name__) 25 | config = get_config() 26 | 27 | 28 | class ClassificationLLM: 29 | """ 30 | Wraps the logic for using an LLM to classify respondent's data 31 | based on provided index. Includes direct (one-shot) generative llm 32 | method and Retrieval Augmented Generation (RAG). 33 | 34 | Args: 35 | model_name (str): Name of the model. Defaults to the value in the `config` file. 36 | Used if no LLM object is passed. 37 | llm (LLM): LLM to use. Optional. 38 | embedding_handler (EmbeddingHandler): Embedding handler. Optional. 
def __init__(
    self,
    model_name: str = config["llm"]["llm_model_name"],
    llm: Optional[Union[VertexAI, ChatOpenAI]] = None,
    embedding_handler: Optional[EmbeddingHandler] = None,
    max_tokens: int = 1600,
    temperature: float = 0.0,
    verbose: bool = False,
    openai_api_key: Optional[str] = None,
):
    """
    Set up the LLM client, the prompt templates and the lazily loaded lookups.

    A ready-made `llm` always takes precedence. Otherwise the client is
    selected from the prefix of `model_name`: "text-" or "gemini" builds a
    VertexAI client, "gpt" builds a ChatOpenAI client.

    Raises:
        NotImplementedError: If a "gpt" model is requested without an
            `openai_api_key`, or if `model_name` has no supported prefix.
    """
    if llm is not None:
        client = llm
    elif model_name.startswith(("text-", "gemini")):
        client = VertexAI(
            model_name=model_name,
            max_output_tokens=max_tokens,
            temperature=temperature,
            location="europe-west2",
        )
    elif model_name.startswith("gpt"):
        if openai_api_key is None:
            raise NotImplementedError("Need to provide an OpenAI API key")
        client = ChatOpenAI(
            model=model_name,
            openai_api_key=openai_api_key,
            temperature=temperature,
            max_tokens=max_tokens,
        )
    else:
        raise NotImplementedError("Unsupported model family")
    self.llm = client

    # Prompt templates and SIC metadata come from module-level imports.
    self.soc_prompt = SOC_PROMPT_PYDANTIC
    self.sic_prompt = SIC_PROMPT_PYDANTIC
    self.sic_prompt_rag = SIC_PROMPT_RAG
    self.general_prompt_rag = GENERAL_PROMPT_RAG
    self.sic_meta = sic_meta

    # `embed` may stay None here; `sic` always starts as None. Both are
    # filled in on first use by the methods that need them.
    self.embed = embedding_handler
    self.sic = None
    self.verbose = verbose
@lru_cache
def get_soc_code(
    self,
    job_title: str,
    job_description: str,
    level_of_education: str,
    manage_others: bool,
    industry_descr: str,
) -> SocResponse:
    """
    Classify a respondent's occupation (SOC) with a single LLM call whose
    prompt is `self.soc_prompt`.

    Results are memoised by `functools.lru_cache`, keyed on `self` and all
    arguments, so every argument must be hashable.

    Args:
        job_title (str): The title of the job.
        job_description (str): The description of the job.
        level_of_education (str): The level of education required for the job.
        manage_others (bool): Indicates whether the job involves managing others.
        industry_descr (str): The description of the industry.

    Returns:
        SocResponse: The parsed LLM answer. If the answer cannot be parsed,
            a response with `codable=False`, no candidates and the parse
            error plus raw LLM text in `reasoning` is returned instead.
    """
    # NOTE(review): lru_cache on a method keeps `self` alive for as long as
    # the cache entry exists (flake8-bugbear B019).
    prompt_inputs = {
        "job_title": job_title,
        "job_description": job_description,
        "level_of_education": level_of_education,
        "manage_others": manage_others,
        "industry_descr": industry_descr,
    }
    llm_output = LLMChain(llm=self.llm, prompt=self.soc_prompt).invoke(
        prompt_inputs,
        return_only_outputs=True,
    )
    if self.verbose:
        logger.debug(f"response={llm_output!r}")

    # Single parse attempt; any failure is converted into an uncodable answer.
    soc_parser = PydanticOutputParser(pydantic_object=SocResponse)
    try:
        return soc_parser.parse(llm_output["text"])
    except Exception as parse_error:
        logger.error(f"Unable to parse llm response: {str(parse_error)}")
        failure_note = (
            f'ERROR parse_error=<{parse_error}>, response=<{llm_output["text"]}>'
        )
        return SocResponse(
            codable=False, soc_candidates=[], reasoning=failure_note
        )
def _prompt_candidate(
    self, code: str, activities: list[str], include_all: bool = False
) -> str:
    """Render one candidate code as a brace-wrapped text entry for the prompt.

    The SIC hierarchy is loaded from the configured lookup files on first use
    and kept on `self.sic` for later calls.

    Args:
        code (str): The code for the item; used as a key into `self.sic`.
        activities (list[str]): The list of example activities.
        include_all (bool, optional): Whether to also append the non-empty
            detail, includes and excludes metadata of the item.

    Returns:
        str: "{Code: ..., Title: ..., Example activities: ...}" with the
            optional metadata fields appended before the closing brace.
    """
    if self.sic is None:
        index_frame = load_sic_index(config["lookups"]["sic_index"])
        structure_frame = load_sic_structure(config["lookups"]["sic_structure"])
        self.sic = load_hierarchy(structure_frame, index_frame)

    entry = self.sic[code]
    fields = [
        f"Code: {entry.numeric_string_padded()}",
        f"Title: {entry.description}",
        f"Example activities: {', '.join(activities)}",
    ]
    if include_all:
        meta = entry.sic_meta
        # Empty metadata fields are left out rather than shown blank.
        if meta.detail:
            fields.append(f"Details: {meta.detail}")
        if meta.includes:
            fields.append(f"Includes: {', '.join(meta.includes)}")
        if meta.excludes:
            fields.append(f"Excludes: {', '.join(meta.excludes)}")
    return "{" + ", ".join(fields) + "}"
def rag_sic_code(  # noqa: C901
    self,
    industry_descr: str,
    job_title: str = None,
    job_description: str = None,
    expand_search_terms: bool = True,
    code_digits: int = 5,
    candidates_limit: int = 5,
) -> tuple:
    """Generate a SIC classification from respondent data using the RAG approach.

    Candidate codes are retrieved from the embedding index, formatted with
    ``_prompt_candidate_list``, sent to the LLM through ``self.sic_prompt_rag``
    and the reply is parsed into a ``SicResponse``. Failures are logged and
    reported through a non-codable ``SicResponse`` instead of being raised.

    Args:
        industry_descr (str): The description of the industry.
        job_title (str, optional): The job title. Defaults to None.
        job_description (str, optional): The job description. Defaults to None.
        expand_search_terms (bool, optional): Whether the job title and job
            description are used as additional search terms. Defaults to True.
        code_digits (int, optional): The number of digits in the generated
            SIC code. Defaults to 5.
        candidates_limit (int, optional): The maximum number of SIC code
            candidates to consider. Defaults to 5.

    Returns:
        tuple: ``(response, short_list, call_dict)`` where ``response`` is a
            ``SicResponse``, ``short_list`` is the raw search result and
            ``call_dict`` holds the values substituted in the prompt. The last
            two are ``None`` when the embedding handler could not be loaded.
            ``response.codable`` is False and ``response.reasoning`` carries
            the error text when the embedding store is empty, the LLM call
            raises ``ValueError`` or the reply cannot be parsed.
    """

    def is_missing(value) -> bool:
        # None, empty and single-space answers all count as "not provided".
        return value is None or value in {"", " "}

    def prep_call_dict(industry_descr, job_title, job_description, sic_codes):
        # Helper function to prepare the call dictionary
        return {
            "industry_descr": industry_descr,
            "job_title": "Unknown" if is_missing(job_title) else job_title,
            "job_description": (
                "Unknown" if is_missing(job_description) else job_description
            ),
            "sic_index": sic_codes,
        }

    if self.embed is None:
        try:
            self._load_embedding_handler()
        except ValueError as err:
            logger.exception(err)
            logger.warning("Error: Empty embedding vector store, exit early")
            validated_answer = SicResponse(
                codable=False,
                sic_candidates=[],
                reasoning="Error, Empty embedding vector store, exit early",
            )
            return validated_answer, None, None

    # Retrieve relevant SIC codes and format them for prompt
    if expand_search_terms:
        # Answers that were not provided carry no information, so they are
        # not used as search terms.
        search_terms = [industry_descr] + [
            term for term in (job_title, job_description) if not is_missing(term)
        ]
        short_list = self.embed.search_index_multi(query=search_terms)
    else:
        short_list = self.embed.search_index(query=industry_descr)

    sic_codes = self._prompt_candidate_list(
        short_list, code_digits=code_digits, candidates_limit=candidates_limit
    )

    call_dict = prep_call_dict(
        industry_descr=industry_descr,
        job_title=job_title,
        job_description=job_description,
        sic_codes=sic_codes,
    )

    if self.verbose:
        final_prompt = self.sic_prompt_rag.format(**call_dict)
        logger.debug(final_prompt)

    chain = LLMChain(llm=self.llm, prompt=self.sic_prompt_rag)

    try:
        response = chain.invoke(call_dict, return_only_outputs=True)
    except ValueError as err:
        logger.exception(err)
        logger.warning("Error from LLMChain, exit early")
        validated_answer = SicResponse(
            codable=False,
            sic_candidates=[],
            reasoning="Error from LLMChain, exit early",
        )
        return validated_answer, short_list, call_dict

    if self.verbose:
        logger.debug(f"{response=}")

    # Parse the output to the desired format
    parser = PydanticOutputParser(pydantic_object=SicResponse)
    try:
        validated_answer = parser.parse(response["text"])
    except ValueError as parse_error:
        logger.exception(parse_error)
        logger.warning(f"Failed to parse response:\n{response['text']}")

        reasoning = (
            f'ERROR parse_error=<{parse_error}>, response=<{response["text"]}>'
        )
        validated_answer = SicResponse(
            codable=False,
            sic_candidates=[],
            reasoning=reasoning,
        )

    return validated_answer, short_list, call_dict
def rag_general_code(  # noqa: C901
    self,
    respondent_data: dict,
    candidates_limit: int = 7,
) -> tuple:
    """Classify respondent data against a custom index using the RAG approach.

    Every value of ``respondent_data`` is used as a search term, the first
    ``candidates_limit`` hits are listed in the prompt (one per line) and the
    LLM reply is parsed into a ``RagResponse``. Failures are logged and
    reported through a non-codable ``RagResponse`` instead of being raised.

    Args:
        respondent_data (dict): A dictionary containing respondent data.
        candidates_limit (int, optional): The maximum number of candidate
            codes to consider. Defaults to 7.

    Returns:
        tuple: ``(response, short_list)`` where ``response`` is a
            ``RagResponse`` and ``short_list`` is the raw search result
            (``None`` when the embedding handler could not be loaded).
            ``response.codable`` is False and ``response.reasoning`` carries
            the error text when the embedding store is empty, the LLM call
            raises ``ValueError`` or the reply cannot be parsed.
    """

    if self.embed is None:
        try:
            self._load_embedding_handler()
        except ValueError as err:
            logger.exception(err)
            logger.warning("Error: Empty embedding vector store, exit early")
            validated_answer = RagResponse(
                codable=False,
                alt_candidates=[],
                reasoning="Error: Empty embedding vector store, exit early",
            )
            return validated_answer, None

    # Retrieve relevant codes and format them for prompt; a real list is
    # passed, matching how the search is called with a list elsewhere.
    short_list = self.embed.search_index_multi(query=list(respondent_data.values()))

    # One "{Code: ..., Description: ...}" entry per line.
    candidate_codes = (
        "{"
        + "}, \n{".join(
            "Code: " + hit["code"] + ", Description: " + hit["title"]
            for hit in short_list[:candidates_limit]
        )
        + "}"
    )

    call_dict = {
        "respondent_data": str(respondent_data),
        "classification_index": candidate_codes,
    }

    if self.verbose:
        final_prompt = self.general_prompt_rag.format(**call_dict)
        logger.debug(final_prompt)

    chain = LLMChain(llm=self.llm, prompt=self.general_prompt_rag)

    try:
        response = chain.invoke(call_dict, return_only_outputs=True)
    except ValueError as err:
        logger.exception(err)
        logger.warning("Error from LLMChain, exit early")
        validated_answer = RagResponse(
            codable=False,
            alt_candidates=[],
            reasoning="Error from LLMChain, exit early",
        )
        return validated_answer, short_list

    if self.verbose:
        logger.debug(f"llm_response={response}")

    # Parse the output to desired format
    parser = PydanticOutputParser(pydantic_object=RagResponse)
    try:
        validated_answer = parser.parse(response["text"])
    except ValueError as parse_error:
        logger.exception(parse_error)
        logger.warning(f"Failed to parse response:\n{response['text']}")

        reasoning = (
            f'ERROR parse_error=<{parse_error}>, response=<{response["text"]}>'
        )
        validated_answer = RagResponse(
            codable=False,
            alt_candidates=[],
            reasoning=reasoning,
        )

    return validated_answer, short_list
"""Provides logging for the project.

Used to set up file and console based loggers.
Typically called from an entry point or external script.

Typical usage:

```
logger = logs.setup_logging("some_script_name")
```
This will create a separate log file for the `some_script_name`.
"""

import datetime
import logging
from typing import Union
from pathlib import Path

LOG_FORMAT = "%(asctime)s - %(filename)s:%(lineno)d - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
MODULE_NAME = "sic_soc_llm"
EXTRA_MODULE_NAME = "server"
LOG_LEVEL = logging.DEBUG
# Evaluated once at import time: a long-running process keeps its start date.
DATE_STRING = f"{datetime.datetime.now().date()}"

# Logging can be used independently of configuration
LOG_DIR = Path.home() / "logs"


def setup_logging(
    script_name: str = None, log_dir: Union[Path, str] = LOG_DIR
) -> logging.Logger:
    """Set up console and file logging.

    This will create a directory to log to if it doesn't already exist.

    Safe to call in interactive environments without duplicating the logging:
    the console handler is added only when the logger has no handler yet and
    the file handler only when the logger has no file handler yet.

    Logs on the same day will append to the same file for the same script_name.

    If the log directory or the log file cannot be created (missing parent,
    read-only location, path blocked by a file, ...) a warning is logged and
    logging continues on the console only.

    Args:
        script_name (str): Used in the filename for the logs.
        log_dir (Path or str): Directory to store logs in. Defaults to "~/logs".

    Returns:
        Logger object with handlers set up.
    """

    logger = logging.getLogger(MODULE_NAME)
    logger.setLevel(LOG_LEVEL)

    other_logger = logging.getLogger(EXTRA_MODULE_NAME)
    other_logger.setLevel(LOG_LEVEL)

    formatter = logging.Formatter(LOG_FORMAT, DATE_FORMAT)

    # In case this is called twice check whether a handler is already registered
    if not logger.handlers:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(LOG_LEVEL)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
        other_logger.addHandler(console_handler)

    log_dir = Path(log_dir)
    if script_name is None:
        log_file = log_dir / f"{MODULE_NAME}.{DATE_STRING}.log"
    else:
        log_file = log_dir / f"{MODULE_NAME}_{script_name}.{DATE_STRING}.log"

    # FileHandler subclasses StreamHandler, so test for the subclass explicitly.
    has_file_handler = any(
        isinstance(handler, logging.FileHandler) for handler in logger.handlers
    )

    try:
        log_dir.mkdir(parents=True, exist_ok=True)
        if not has_file_handler:
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(formatter)
            file_handler.setLevel(LOG_LEVEL)
            logger.addHandler(file_handler)
            other_logger.addHandler(file_handler)
    except OSError:
        # Covers FileNotFoundError as well as permission and path-type errors.
        logger.warning("Console logging only")

    return logger
from langchain.prompts.prompt import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from sic_soc_llm.data_models.response_model import SocResponse, SicResponse, RagResponse
from sic_soc_llm import get_config

config = get_config()

# Shared preamble; it is joined to every task template below with a blank line.
_core_prompt = """You are a conscientious classification assistant of respondent data
for the use in the UK official statistics. Respondent data may be in English or Welsh,
but you always respond in British English."""

_soc_template = """Given the respondent data (that may include all or some of
job title, job description, level of education, line management responsibilities,
and company's main activity) your task is to determine
the UK SOC (Standard Occupational Classification) code for this job if it can be
determined. If the code cannot be determined, identify the additional information
needed to determine it. Make sure to use the provided 2020 SOC index.

===Respondent Data===
- Job Title: {job_title}
- Job Description: {job_description}
- Level of Education: {level_of_education}
- Line Management Responsibilities: {manage_others}
- Company's main activity: {industry_descr}

===Output Format===
{format_instructions}

===2020 SOC Index===
{soc_index}
"""

with open(config["lookups"]["soc_condensed"], encoding="utf-8") as f:
    soc_index = f.read()


# One parser per response model, so that no prompt depends on which model the
# shared ``parser`` name happens to be bound to at that point in the module.
_soc_parser = PydanticOutputParser(pydantic_object=SocResponse)
_sic_parser = PydanticOutputParser(pydantic_object=SicResponse)
_rag_parser = PydanticOutputParser(pydantic_object=RagResponse)

SOC_PROMPT_PYDANTIC = PromptTemplate.from_template(
    template=_core_prompt + "\n\n" + _soc_template,
    partial_variables={
        "format_instructions": _soc_parser.get_format_instructions(),
        "soc_index": soc_index,
    },
)

# TODO include SIC/SOC definitions, coding guidance, the concept/question phrasing?


_sic_template = """Given the respondent's description of the main activity their
company does, their job title and job description, your task is to determine
the UK SIC (Standard Industry Classification) code for this company if it can be
determined to the division (two-digit) level. If the code cannot be determined,
identify the additional information needed to determine it.
Make sure to use the provided 2007 SIC Index.

===Respondent Data===
- Company's main activity: {industry_descr}
- Job Title: {job_title}
- Job Description: {job_description}

===Output Format===
{format_instructions}

===2007 SIC Index===
{sic_index}
"""

with open(config["lookups"]["sic_condensed"], encoding="utf-8") as f:
    sic_index = f.read()


SIC_PROMPT_PYDANTIC = PromptTemplate.from_template(
    template=_core_prompt + "\n\n" + _sic_template,
    partial_variables={
        "format_instructions": _sic_parser.get_format_instructions(),
        "sic_index": sic_index,
    },
)


_sic_template_rag = """Given the respondent's description of the main activity their
company does, their job title and job description (which may be different than the
main company activity), your task is to determine the UK SIC (Standard Industry
Classification) code for this company if it can be determined.
Make sure to use the provided Relevant subset of UK SIC 2007. If the code cannot be
determined (or is likely not included in the provided subset), identify the additional
information needed to determine it and a list of most likely codes.

===Respondent Data===
- Company's main activity: {industry_descr}
- Job Title: {job_title}
- Job Description: {job_description}

===Relevant subset of UK SIC 2007===
{sic_index}

===Output Format===
{format_instructions}

===Output===
"""

# ``sic_index`` is left unfilled here: the caller supplies the retrieved subset.
SIC_PROMPT_RAG = PromptTemplate.from_template(
    template=_core_prompt + "\n\n" + _sic_template_rag,
    partial_variables={
        "format_instructions": _sic_parser.get_format_instructions(),
    },
)

_general_template_rag = """Given the respondent's data, your task is to determine
the classification code. Make sure to use the provided Relevant subset of
classification index and select codes from this list only.
If the code cannot be determined (or not included in the provided subset),
do not provide final code, instead identify the additional information needed
to determine the correct code and suggest few most likely codes.

===Respondent Data===
{respondent_data}

===Relevant subset of classification index===
{classification_index}

===Output Format===
{format_instructions}

===Output===
"""

GENERAL_PROMPT_RAG = PromptTemplate.from_template(
    template=_core_prompt + "\n\n" + _general_template_rag,
    partial_variables={
        "format_instructions": _rag_parser.get_format_instructions(),
    },
)

# Kept for backward compatibility: the module has always exposed ``parser``,
# last bound to the RagResponse parser.
parser = _rag_parser


class PromptTemplates:
    """Class to store prompt templates for SOC and SIC classification tasks.

    Each attribute is bound in ``__init__`` to the module-level prompt template
    of the same name.

    Attributes:
        SOC_PROMPT_PYDANTIC (PromptTemplate): Prompt template for determining SOC codes
            based on respondent's data.
        SIC_PROMPT_PYDANTIC (PromptTemplate): Prompt template for determining SIC codes
            based on respondent's data.
        SIC_PROMPT_RAG (PromptTemplate): Prompt template for determining SIC codes based
            on respondent's data, with a relevant subset of SIC codes provided.
        GENERAL_PROMPT_RAG (PromptTemplate): Prompt template for determining custom
            classification codes based on respondent data, with a relevant subset of
            codes provided.
    """

    def __init__(self):
        self.SOC_PROMPT_PYDANTIC = SOC_PROMPT_PYDANTIC
        self.SIC_PROMPT_PYDANTIC = SIC_PROMPT_PYDANTIC
        self.SIC_PROMPT_RAG = SIC_PROMPT_RAG
        self.GENERAL_PROMPT_RAG = GENERAL_PROMPT_RAG
class TestClassificationLLM(unittest.TestCase):
    """Tests of ClassificationLLM backed by a fake LLM and no embedding handler.

    Every test runs with a fresh temporary directory as the working directory.
    """

    def setUp(self):
        # Create a temporary testing directory. Clean-ups run in reverse order
        # of registration (and also when setUp itself fails): the original
        # working directory is restored first, then the directory is removed,
        # so it is never deleted while still being the working directory.
        self.cwd = os.getcwd()
        self.temp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.temp_dir.cleanup)
        os.chdir(self.temp_dir.name)
        self.addCleanup(os.chdir, self.cwd)
        self.llm = ClassificationLLM(llm=FAKE_LLM, verbose=True)

    def test_logging_setup(self):
        # check new file exists in tmp log dir
        log_dir = Path(self.temp_dir.name) / "logs"
        logger = setup_logging(log_dir=log_dir)
        logger.info("TestClassificationLLM: Setting up test")
        log_files = list(log_dir.iterdir())
        self.assertEqual(len(log_files), 1)

    def test_classification_llm_initialised(self):
        # Test if the ClassificationLLM instance is initialized correctly
        self.assertIsInstance(self.llm, ClassificationLLM)

    def test_prompt_candidate_list_empty(self):
        # An empty short list yields an empty candidate string
        out = self.llm._prompt_candidate_list([])
        self.assertEqual(out, "")

    def test_get_soc_code(self):
        # Test if the SOC code is returned correctly
        soc_code = self.llm.get_soc_code(
            job_title="science teacher",
            job_description="",
            manage_others=False,
            level_of_education="Other",
            industry_descr="",
        )
        self.assertIsInstance(soc_code, SocResponse)

    def test_get_sic_code(self):
        # Test if the SIC code is returned correctly
        sic_code = self.llm.get_sic_code(
            industry_descr="secondary school",
            job_title="teacher",
            job_description="",
        )
        self.assertIsInstance(sic_code, SicResponse)

    def test_sic_empty_embed_error(self):
        # Without a usable embedding store the response reports an error
        resp = self.llm.rag_sic_code(industry_descr="secondary school")
        self.assertTrue(resp[0].reasoning.startswith("Error"))

    def test_rag_empty_embed_error(self):
        # Without a usable embedding store the response reports an error
        resp = self.llm.rag_general_code(respondent_data={"descr": "school"})
        self.assertTrue(resp[0].reasoning.startswith("Error"))
class TestGeneralRAG(unittest.TestCase):
    """Tests of the general RAG classifier on an index built from toy_index.txt."""

    def setUp(self):
        # Build an embedding index from the toy file (db_dir=None) and hand it
        # to a ClassificationLLM driven by the fake LLM.
        handler = EmbeddingHandler(db_dir=None)
        toy_index_path = check_file_exists("toy_index.txt")
        with open(toy_index_path, "r") as toy_index:
            handler.embed_index(from_empty=True, file_object=toy_index)
        self.llm = ClassificationLLM(
            llm=FAKE_LLM, embedding_handler=handler, verbose=True
        )

    def tearDown(self):
        # Nothing to release.
        return None

    def test_rag_general_code(self):
        # Test if the RAG code is returned correctly
        result = self.llm.rag_general_code(
            respondent_data={"characteristics": "gills"}
        )
        rag_code, shortlist = result
        self.assertIsInstance(rag_code, RagResponse)
        assert len(shortlist) == 4
class TestEmbeddingHandlerToy(unittest.TestCase):
    """Tests of EmbeddingHandler on an index built from toy_index.txt."""

    def setUp(self):
        # Embed the toy index from scratch before every test (db_dir=None).
        handler = EmbeddingHandler(db_dir=None)
        toy_index_path = check_file_exists("toy_index.txt")
        with open(toy_index_path, "r") as toy_index:
            handler.embed_index(from_empty=True, file_object=toy_index)
        self.embedding_handler = handler

    def tearDown(self):
        # Nothing to release.
        return None

    def test_embed_index_with_file_object(self):
        # Count number of entries
        index_size = self.embedding_handler._index_size
        assert index_size == 4

    def test_search_index(self):
        # Test searching index with a query
        hits = self.embedding_handler.search_index("mens best friend")
        best_hit = hits[0]
        assert best_hit["code"] == "02"

    def test_search_index_multi(self):
        # Test searching index with multiple queries
        hits = self.embedding_handler.search_index_multi(["has gills", "has scales"])
        assert len(hits) == 8
@pytest.mark.parametrize(
    "code,expected_digits",
    [
        ("Axxxxx", 1),
        ("A12xxx", 2),
        ("A123xx", 3),
        ("A1234x", 4),
        ("A12345", 5),
    ],
)
def test_sic_code_alpha_code_digits_parsed(code, expected_digits):
    """The number of digits is derived from the padded alpha code."""
    # When
    sic_code = sic_hierarchy.SicCode(code)

    # Then
    assert sic_code.n_digits == expected_digits


@pytest.mark.parametrize(
    "code,expected_level_name",
    [
        ("Axxxxx", "section"),
        ("A12xxx", "division"),
        ("A123xx", "group"),
        ("A1234x", "class"),
        ("A12345", "subclass"),
    ],
)
def test_sic_code_alpha_code_levels_correct(code, expected_level_name):
    """The level name is derived from the padded alpha code."""
    # When
    sic_code = sic_hierarchy.SicCode(code)

    # Then
    assert sic_code.level_name == expected_level_name


@pytest.mark.parametrize(
    "code,expected_formatted_code",
    [
        ("Axxxxx", "A"),
        ("A12xxx", "12"),
        ("A123xx", "12.3"),
        ("A1234x", "12.34"),
        ("A12345", "12.34/5"),
    ],
)
def test_sic_code_alpha_code_readable_code_correct(code, expected_formatted_code):
    """``str()`` of a code gives its readable form."""
    # When
    sic_code = sic_hierarchy.SicCode(code)

    # Then
    assert str(sic_code) == expected_formatted_code


def test_sic_code_alpha_single_digit_raises_error():
    """A single digit after the section letter is rejected."""
    with pytest.raises(ValueError):
        sic_hierarchy.SicCode("A1xxxx")


@pytest.mark.parametrize(
    "section,code,level,expected_formatted_code",
    [
        ("A", "A", "section", "A"),
        ("A", "12", "division", "12"),
        ("A", "123", "group", "12.3"),
        ("A", "1234", "class", "12.34"),
        ("A", "12340", "class", "12.34"),
        ("A", "12345", "subclass", "12.34/5"),
    ],
)
def test_sic_code_from_section_code_level_valid_cases(
    section, code, level, expected_formatted_code
):
    """Valid section/code/level combinations build the expected code."""
    # When
    sic_code = sic_hierarchy.SicCode.from_section_code_level(section, code, level)

    # Then
    assert str(sic_code) == expected_formatted_code


def test_sic_code_from_section_code_level_invalid_class():
    """A five-digit code not ending in 0 is rejected at class level."""
    # Given
    section = "A"
    code = "12341"
    level = "class"

    with pytest.raises(ValueError):
        sic_hierarchy.SicCode.from_section_code_level(section, code, level)


@pytest.mark.parametrize(
    "section,code,level",
    [
        ("A", "A", "division"),
        ("A", "A", "group"),
        ("A", "A", "class"),
        ("A", "A", "subclass"),
        ("A", "12", "section"),
        ("A", "12", "group"),
        ("A", "12", "class"),
        ("A", "12", "subclass"),
        ("A", "123", "section"),
        ("A", "123", "division"),
        ("A", "123", "class"),
        ("A", "123", "subclass"),
        ("A", "1234", "section"),
        ("A", "1234", "division"),
        ("A", "1234", "group"),
        ("A", "1234", "subclass"),
        ("A", "12340", "section"),
        ("A", "12340", "division"),
        ("A", "12340", "group"),
        ("A", "12345", "section"),
        ("A", "12345", "division"),
        ("A", "12345", "group"),
        ("A", "12345", "class"),
    ],
)
def test_sic_code_from_section_code_level_invalid_levels_raise_error(
    section, code, level
):
    """A level that does not match the code length is rejected."""
    with pytest.raises(ValueError):
        sic_hierarchy.SicCode.from_section_code_level(section, code, level)


def test_sic_code_from_section_code_level_invalid_section_code_raises_error():
    """A section-level code that differs from the section letter is rejected."""
    # Given
    section = "A"
    code = "B"
    level = "section"

    with pytest.raises(ValueError):
        sic_hierarchy.SicCode.from_section_code_level(section, code, level)


@pytest.mark.parametrize(
    "text,expected",
    [
        # The inputs carry the HTML entity, the expected values the plain quote.
        ("&quot;", '"'),
        ("some text &quot;here&quot;", 'some text "here"'),
        ('mixed "some" text &quot;here&quot;', 'mixed "some" text "here"'),
    ],
)
def test_clean_text_with_html_unescapes(text, expected):
    """HTML entities are unescaped by ``_clean_text``."""
    # When
    clean_text = sic_hierarchy._clean_text(text)

    # Then
    assert clean_text == expected


@pytest.mark.parametrize(
    "text,expected",
    [
        (", see ##12.12", ""),
        ("Some text, see ##12.12", "Some text"),
        ("Some text, See ##12.12", "Some text"),
        ("Some text, see ##12.12/1", "Some text"),
        ("some text,see ##12.12", "some text"),
        ("see ##12.12", ""),
        ("##12.12", ""),
        ("some text ##12.12 different", "some text different"),
        (", see division ##85", ""),
        ("##85", ""),
        ("some text, see division ##25", "some text"),
        ("see divisions ##12", ""),
        ("some text, see division ##25, see ##12.12, see divisions ##12", "some text"),
    ],
)
def test_clean_text_with_see_gets_trimmed(text, expected):
    """Cross-references of the form ``see ##code`` are removed by ``_clean_text``."""
    # When
    clean_text = sic_hierarchy._clean_text(text)

    # Then
    assert clean_text == expected