├── .flake8
├── .gcloudignore
├── .github
└── workflows
│ ├── CodeCov.yml
│ └── QuartoDocs.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── README.md
├── app.yaml
├── app
├── Welcome.py
└── pages
│ ├── 1_SIC_&_SOC_pre-defined_coding_assistant.py
│ ├── 2_Setup_custom_coding_assistant.py
│ └── 3_Test_custom_coding_assistant.py
├── cloud_deploy.sh
├── codecov.yml
├── docs
├── _quarto.yml
├── _static
│ ├── app-ui.png
│ └── sic-soc-llm.png
├── index.qmd
├── method.qmd
└── tutorials
│ ├── 1_sic_data_structure.qmd
│ ├── 2_sic_classifier.qmd
│ ├── 3_soc_classifier.qmd
│ ├── 4_custom_coicop_classifier.qmd
│ └── index.qmd
├── pyproject.toml
├── src
└── sic_soc_llm
│ ├── __init__.py
│ ├── _config
│ ├── __init__.py
│ ├── main.py
│ └── sic_soc_llm_config.toml
│ ├── data_models
│ ├── __init__.py
│ ├── response_model.py
│ ├── sicDB.py
│ ├── sic_data_access.py
│ ├── sic_hierarchy.py
│ └── sic_meta_model.py
│ ├── embedding.py
│ ├── example_data
│ ├── coicop_5d_condensed.txt
│ ├── sic_2d_condensed.txt
│ ├── sic_4d_condensed.txt
│ ├── soc_4d_condensed.txt
│ └── toy_index.txt
│ ├── llm.py
│ ├── logs.py
│ └── prompt.py
└── tests
├── test_classification_llm.py
├── test_embedding.py
└── test_sic_data_structure.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # Rule definitions: http://flake8.pycqa.org/en/latest/user/error-codes.html
3 | # D203: 1 blank line required before class docstring
4 | # W503: line break before binary operator
5 | exclude = venv*,__pycache__,node_modules,bower_components,migrations
6 | ignore = D203,W503
7 | max-complexity = 9
8 | max-line-length = 88
9 | extend-ignore = E203
10 |
--------------------------------------------------------------------------------
/.gcloudignore:
--------------------------------------------------------------------------------
1 | # Ignore all files and folders
2 | *
3 |
4 | # Allow selected files and folders
5 | !app.yaml
6 | !Dockerfile
7 | !pyproject.toml
8 | !app/
9 | !src/
10 | src/*
11 | !src/sic_soc_llm/
12 | !data/
13 | data/*
14 | !data/sic-index/
15 | !data/soc-index/
16 | !data/coicop-index/
17 | !data/custom-index/
18 | data/custom-index/*
19 | !data/custom-index/example-index.txt
20 |
21 | # ignore pycache
22 | **/__pycache__/
23 |
--------------------------------------------------------------------------------
/.github/workflows/CodeCov.yml:
--------------------------------------------------------------------------------
1 | name: CodeCov
2 | on:
3 | pull_request:
4 | branches: [develop, main]
5 | push:
6 | branches: [develop, main]
7 |
8 | jobs:
9 | build:
10 | name: Pytest & Coverage
11 | runs-on: ubuntu-latest
12 |
13 | steps:
14 | - name: Checkout Repo
15 | uses: actions/checkout@v4
16 |
17 | - name: Setup Python
18 | uses: actions/setup-python@v5
19 | with:
20 | python-version: "3.10"
21 | cache: "pip"
22 |
23 | - name: Run Pre-commit
24 | uses: pre-commit/action@v3.0.1
25 |
26 | - name: Install Dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | python -m pip install -e ".[test]"
30 |
31 | - name: Run Pytest and Generate Report
32 | run: |
33 | python -m pip install coverage[toml]
34 | coverage run -m pytest
35 |
36 | - name: Upload Coverage Reports to Codecov
37 | uses: codecov/codecov-action@v4
38 | with:
39 | token: ${{ secrets.CODECOV_TOKEN }}
40 | fail_ci_if_error: false
41 |
--------------------------------------------------------------------------------
/.github/workflows/QuartoDocs.yml:
--------------------------------------------------------------------------------
1 | name: QuartoDocs
2 | on:
3 | pull_request:
4 | branches: develop
5 |
6 | jobs:
7 | build-deploy:
8 | runs-on: ubuntu-latest
9 | permissions:
10 | contents: write
11 | pages: write
12 | steps:
13 | - name: Check out repository
14 | uses: actions/checkout@v4
15 |
16 | - name: Set up Quarto
17 | uses: quarto-dev/quarto-actions/setup@v2
18 |
19 | - name: Setup Python
20 | uses: actions/setup-python@v5
21 | with:
22 | python-version: "3.10"
23 | cache: "pip"
24 |
25 | - name: Install Dependencies
26 | run: |
27 | python -m pip install --upgrade pip
28 | python -m pip install -e ".[docs]"
29 |
30 | - name: Build API reference
31 | run: |
32 | cd docs
33 | python -m quartodoc build
34 |
35 | - name: Publish to gh-pages
36 | uses: quarto-dev/quarto-actions/publish@v2
37 | with:
38 | target: gh-pages
39 | render: true
40 | path: docs
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Data
10 | db/
11 | *.db
12 | *.csv
13 | *.txt
14 | *.json
15 | *.pkl
16 | *.parquet
17 | *.bin
18 | *.faiss
19 | *.xlsx
20 | *.xls
21 | *.png
22 | *.ods
23 | *.pickle
24 |
25 | # Distribution / packaging
26 | .Python
27 | build/
28 | develop-eggs/
29 | dist/
30 | downloads/
31 | eggs/
32 | .eggs/
33 | lib/
34 | lib64/
35 | parts/
36 | sdist/
37 | var/
38 | wheels/
39 | share/python-wheels/
40 | *.egg-info/
41 | .installed.cfg
42 | *.egg
43 | MANIFEST
44 | *.DS_Store
45 |
46 | # PyInstaller
47 | # Usually these files are written by a python script from a template
48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest
50 | *.spec
51 |
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 |
56 | # Unit test / coverage reports
57 | # Project-specific
58 | *.csv
59 |
60 | htmlcov/
61 | .tox/
62 | .nox/
63 | .coverage
64 | .coverage.*
65 | .cache
66 | nosetests.xml
67 | coverage.xml
68 | *.cover
69 | *.py,cover
70 | .hypothesis/
71 | .pytest_cache/
72 | cover/
73 |
74 | # Translations
75 | *.mo
76 | *.pot
77 |
78 | # Django stuff:
79 | *.log
80 | local_settings.py
81 | *.sqlite3
82 | *.sqlite3-journal
83 |
84 | # Flask stuff:
85 | instance/
86 | .webassets-cache
87 |
88 | # Scrapy stuff:
89 | .scrapy
90 |
91 | # Sphinx documentation
92 | docs/_build/
93 |
94 | # PyBuilder
95 | .pybuilder/
96 | target/
97 |
98 | # Jupyter Notebook
99 | .ipynb_checkpoints
100 | *.ipynb
101 |
102 | # IPython
103 | profile_default/
104 | ipython_config.py
105 |
106 | # pyenv
107 | # For a library or package, you might want to ignore these files since the code is
108 | # intended to run in multiple environments; otherwise, check them in:
109 | # .python-version
110 |
111 | # pipenv
112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
115 | # install all needed dependencies.
116 | #Pipfile.lock
117 |
118 | # poetry
119 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
120 | # This is especially recommended for binary packages to ensure reproducibility, and is more
121 | # commonly ignored for libraries.
122 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
123 | #poetry.lock
124 |
125 | # pdm
126 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
127 | #pdm.lock
128 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
129 | # in version control.
130 | # https://pdm.fming.dev/#use-with-ide
131 | .pdm.toml
132 |
133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
134 | __pypackages__/
135 |
136 | # Celery stuff
137 | celerybeat-schedule
138 | celerybeat.pid
139 |
140 | # SageMath parsed files
141 | *.sage.py
142 |
143 | # Environments
144 | .env
145 | .venv
146 | env/
147 | venv/
148 | ENV/
149 | env.bak/
150 | venv.bak/
151 |
152 | # Spyder project settings
153 | .spyderproject
154 | .spyproject
155 |
156 | # Rope project settings
157 | .ropeproject
158 |
159 | # mkdocs documentation
160 | /site
161 |
162 | # quarto
163 | _site/
164 | .quarto/
165 | /docs/reference/
166 | /docs/.gitignore
167 |
168 | # mypy
169 | .mypy_cache/
170 | .dmypy.json
171 | dmypy.json
172 |
173 | # Pyre type checker
174 | .pyre/
175 |
176 | # pytype static type analyzer
177 | .pytype/
178 |
179 | # Cython debug symbols
180 | cython_debug/
181 |
182 | # PyCharm
183 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
184 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
185 | # and can be added to the global gitignore or merged into this file. For a more nuclear
186 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
187 | #.idea/
188 |
189 | # VSCode
190 | .vscode/
191 |
192 | #config
193 | config/
194 |
195 | #output figures
196 | *.svg
197 | *.pdf
198 | *.png
199 | *.jpg
200 |
201 | #llm model files
202 | *.gguf
203 | *.llamafile
204 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | exclude: "src/sic_soc_llm/data_models/sicDB.py"
4 | repos:
5 | - repo: https://github.com/kynan/nbstripout
6 | rev: 0.6.1
7 | hooks:
8 | - id: nbstripout
9 | name: nbstripout - Strip outputs from notebooks (auto-fixes)
10 | args:
11 | - --extra-keys
12 | - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId"
13 | - repo: https://github.com/pre-commit/pre-commit-hooks
14 | rev: v4.4.0
15 | hooks:
16 | - id: check-added-large-files
17 | name: Check for files larger than 5 MB
18 | args: [ "--maxkb=5120" ]
19 | - id: end-of-file-fixer
20 | name: Check for a blank line at the end of scripts (auto-fixes)
21 | exclude: '\.Rd'
22 | - id: trailing-whitespace
23 | name: Check for trailing whitespaces (auto-fixes)
24 | - repo: https://github.com/pycqa/isort
25 | rev: 5.12.0
26 | hooks:
27 | - id: isort
28 | name: isort - Sort Python imports (auto-fixes)
29 | types: [ cython, pyi, python ]
30 | args: [ "--profile", "black", "--filter-files" ]
31 | - repo: https://github.com/psf/black
32 | rev: 23.3.0 # Replace by any tag/version: https://github.com/psf/black/tags
33 | hooks:
34 | - id: black
35 | name: black - consistent Python code formatting (auto-fixes)
36 | language_version: python # Should be a command that runs python3.6+
37 | - repo: https://github.com/PyCQA/flake8
38 | rev: 6.0.0
39 | hooks:
40 | - id: flake8
41 | name: flake8 - Python linting
42 | - repo: https://github.com/nbQA-dev/nbQA
43 | rev: 1.6.4
44 | hooks:
45 | - id: nbqa-isort
46 | name: nbqa-isort - Sort Python imports (notebooks; auto-fixes)
47 | args: [ --nbqa-mutate ]
48 | additional_dependencies: [ isort==5.8.0 ]
49 | - id: nbqa-black
50 | name: nbqa-black - consistent Python code formatting (notebooks; auto-fixes)
51 | args: [ --nbqa-mutate ]
52 | additional_dependencies: [ black==22.3.0 ]
53 | # TODO: Disabled for now until it's clear how to add noqa to specific cells of a Jupyter notebook
54 | #- id: nbqa-flake8
55 | # name: nbqa-flake8 - Python linting (notebooks)
56 | # additional_dependencies: [ flake8==3.9.2 ]
57 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10.4-buster
2 | ENV PYTHONUNBUFFERED True
3 |
4 | # Copy app code
5 | WORKDIR /sic-soc
6 | COPY . ./
7 | RUN ls -laRt
8 |
9 | # Upgrade pip and install requirements
10 | RUN python -m pip install --upgrade pip
11 | RUN python -m pip install pysqlite3-binary
12 | RUN python -m pip install -e ".[app]" --no-cache-dir
13 |
14 | # Expose port you want your app on
15 | ENV PORT=8080
16 | ENV HOSTNAME="0.0.0.0"
17 | EXPOSE 8080
18 | HEALTHCHECK CMD curl --fail http://localhost:8080/_stcore/health
19 |
20 | # Run
21 | ENTRYPOINT ["streamlit", "run", "app/Welcome.py", "--server.port=8080", "--server.address=0.0.0.0"]
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Data Science Campus
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SIC-SOC-LLM
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | ## Overview
12 |
13 | This app/package has been created by the [Data Science Campus](https://datasciencecampus.ons.gov.uk/) as a proof of concept to evaluate Large Language Models (LLM) potential to assist
14 | with classification coding. It uses the `LangChain` library to perform Retrieval Augmented Generation (RAG) based on the provided classification index. A special case of Standard Industrial Classification (SIC) coding has been used as the primary test case, see [method explanation](https://datasciencecampus.github.io/sic-soc-llm/method.html#method). An example deployment using `Streamlit` allows for interactive exploration of the model's capabilities.
15 |
16 | ## Data sources
17 |
18 | Examples of simplified SIC, Standard Occupational Classification (SOC) and Classification of Individual Consumption According to Purpose (COICOP) are included in the `example_data` folder. These condensed indices are flattened subsets of more detailed indices officially published online, such as the [UK SIC 2007](https://www.ons.gov.uk/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007), [UK SOC 2020](https://www.ons.gov.uk/methodology/classificationsandstandards/standardoccupationalclassificationsoc/soc2020), and [COICOP 2018 (pdf)](https://unstats.un.org/unsd/classifications/unsdclassifications/COICOP_2018_-_pre-edited_white_cover_version_-_2018-12-26.pdf).
19 |
20 | > ⚠️ **Warning:** The example data is provided for demonstration purposes only. No guarantee is given for its accuracy or up-to-date status.
21 |
22 | In this project, we focused on the SIC. A flexible representation of this hierarchical index (including metadata) has been implemented within the `data_models` submodule, enabling enhanced context for RAG/LLM. This representation can be used independently for other SIC coding tasks or easily extended to accommodate different classification indices.
23 |
24 | The SIC index hierarchy object is built using three data sources provided by ONS:
25 |
26 | - [Published UK SIC summary of structure worksheet (xlsx)](https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/publisheduksicsummaryofstructureworksheet.xlsx) - location needs to be specified in config
27 |
28 | - [UK SIC2007 indexes with addendum December 2022 (xlsx)](https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/uksic2007indexeswithaddendumdecember2022.xlsx) - location needs to be specified in config
29 |
30 | - [SIC resource file by ONSdigital/dp-classification-tools (js)](https://github.com/ONSdigital/dp-classification-tools/blob/develop/standard-industrial-classification/data/sicDB.js) - included inside the package
31 |
32 |
33 | ## Installation
34 |
35 | ### 1. Virtual environment
36 |
37 | It is recommended that you install the project with its required dependencies in a virtual environment. When the virtual environment is activated, any subsequent Python commands will use the Python interpreter and libraries specific to that isolated environment. This ensures that the project uses the correct versions of the dependencies specified in its requirements.
38 |
39 | Create and activate a new virtual environment on Linux/OS X:
40 |
41 | ```{shell}
42 | python3.10 -m venv .venv
43 | source .venv/bin/activate
44 | ```
45 |
46 |
47 | ### 2. Requirements
48 | Update pip and install requirements:
49 | ```
50 | python -m pip install --upgrade pip
51 | python -m pip install -e ".[dev]"
52 | ```
53 | The -e flag installs the project in "editable" mode, which means that any changes made to the project code will be reflected immediately without the need to reinstall. The ".[dev]" part specifies that both the regular requirements and the development requirements should be installed.
54 |
55 | ### 3. LLM authentication:
56 |
57 | The package provides code to use popular LLMs; access to these LLMs is a prerequisite for use. Depending on your choice, keys/credentials may need to be added, for example:
58 |
59 | - Include a personal [OpenAI](https://openai.com/) API key in `.env` as
60 |
61 | ```{shell}
62 | OPENAI_API_KEY=""
63 | ```
64 |
65 | - Authenticate for [Vertex AI](https://cloud.google.com/model-garden?hl=en):
66 |
67 | ```{shell}
68 | gcloud config set project ""
69 | gcloud auth application-default login
70 | ```
71 |
72 |
73 |
74 |
75 | ## Usage
76 |
77 | Examples of how to use the `sic-soc-llm` package can be found in [Tutorials](https://datasciencecampus.github.io/sic-soc-llm/tutorials/) and [References](https://datasciencecampus.github.io/sic-soc-llm/reference/).
78 |
79 | ### Configuration
80 |
81 | The `sic-soc-llm` package uses a configuration file in TOML format to specify the paths to the data files and the names of the models to use. An example configuration file is provided in `sic_soc_llm_config.toml` and is read by the [`get_config`](https://datasciencecampus.github.io/sic-soc-llm/reference/get_config.html) function. The following fields are required:
82 |
83 | | Field | Type | Default value |
84 | | --- | --- | --- |
85 | | [lookups] | | |
86 | | sic_structure | str | "data/sic-index/publisheduksicsummaryofstructureworksheet.xlsx" |
87 | | sic_index | str | "data/sic-index/uksic2007indexeswithaddendumdecember2022.xlsx" |
88 | | sic_condensed | str | "sic_2d_condensed.txt" |
89 | | soc_condensed | str | "soc_4d_condensed.txt" |
90 | | coicop_condensed | str | "coicop_5d_condensed.txt" |
91 | | [llm]| | |
92 | | db_dir | str | "data/sic-index/db" |
93 | | embedding_model_name | str | "all-MiniLM-L6-v2" |
94 | | llm_model_name | str | "gemini-pro" |
95 |
96 |
97 | Make sure to update the file paths and model names according to your specific setup. While the condensed indexes (`.txt`) are included in the package, the `.xlsx` files need to be downloaded from the ONS website (mentioned above) and placed in the specified locations.
98 |
99 | ### Run and deploy Streamlit app
100 |
101 | To run the Streamlit app, use the following command:
102 |
103 | ```{shell}
104 | streamlit run app/Welcome.py --server.port 8500
105 | ```
106 |
107 | The app will be available at `http://localhost:8500/`.
108 |
109 |
110 | Example commands used to build and deploy the app as a GCP Cloud Run service are provided in `cloud_deploy.sh` (which references `Dockerfile` and `app.yaml`). The `Dockerfile` contains a set of instructions for building a Docker image. It specifies the base image to use, the files and directories to include, the dependencies and the commands to run. The `app.yaml` file is used to specify the configuration of the Cloud Run service, including the container image to deploy, the service name, and the port to expose.
111 |
112 |
113 | ## Development and testing
114 |
115 | ### 1. Pre-commit actions
116 |
117 | This repository contains a configuration of pre-commit hooks. If approaching this project as a developer, you are encouraged to install and enable `pre-commits` by running the following in your shell:
118 |
119 | ```
120 | pip install pre-commit
121 | pre-commit install
122 | ```
123 |
124 | ### 2. Unit tests
125 |
126 | To run the unit tests, use the following command:
127 |
128 | ```{shell}
129 | python -m pytest
130 | ```
131 |
132 |
133 | ### 3. Building documentation and webpage:
134 |
135 |
136 | 1. Build (Quarto markdown) `reference` files from docstrings:
137 |
138 | ```{shell}
139 | cd docs
140 | python -m quartodoc build
141 | ```
142 |
143 | 2. Render webpage from Quarto markdowns in `docs` dir (including `reference` files):
144 |
145 | ```{shell}
146 | quarto render
147 | ```
148 |
149 | ## License
150 |
151 | The code, unless otherwise stated, is released under [the MIT Licence][mit].
152 | The documentation for this work is subject to [© 2024 Crown Copyright (Office for National Statistics)][copyright] and is available under the terms of the [Open Government Licence v3.0][ogl].
153 |
154 | [mit]: https://github.com/datasciencecampus/sic-soc-llm?tab=MIT-1-ov-file
155 | [copyright]: http://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/
156 | [ogl]: http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
157 |
158 | ## Data Science Campus
159 | At the [Data Science Campus](https://datasciencecampus.ons.gov.uk/about-us/) we apply data science, and build skills, for public good across the UK and internationally. Get in touch with the Campus at [datasciencecampus@ons.gov.uk](mailto:datasciencecampus@ons.gov.uk).
160 |
--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
1 | runtime: custom
2 | env: flex
3 | service: default
4 | resources:
5 | disk_size_gb: 25
6 | memory_gb: 4
7 |
--------------------------------------------------------------------------------
/app/Welcome.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from sic_soc_llm import setup_logging
3 |
4 | logger = setup_logging("streamlit_app")
5 |
6 | st.set_page_config(
7 | page_title="SIC/SOC LLM assistant",
8 | page_icon="🐥",
9 | )
10 |
11 | st.subheader("LLM assisted classification coding", divider=True)
12 |
13 | st.markdown(
14 | """This app/package has been created by the Data Science Campus
15 | as a proof of concept to evaluate Large Language Models (LLM) potential to assist
16 | with classification coding.
17 | It provides an example of using pre-trained LLM models to assist with
18 | Standard Industrial Classification (SIC) and Standard Occupational Classification
19 | (SOC) coding. It also provides a way to set up and test a custom index.
20 | """
21 | )
22 |
23 | st.markdown(
24 | """
25 | - Source code: [github/datasciencecampus/sic-soc-llm](
26 | https://github.com/datasciencecampus/sic-soc-llm)
27 | - Documentation & references:
28 | [github.io docs](https://datasciencecampus.github.io/sic-soc-llm/docs)
29 | - Website: [Data Science Campus](https://datasciencecampus.ons.gov.uk/)
30 | - Email: [Data Science Campus](mailto:datasciencecampus@ons.gov.uk)
31 | """
32 | )
33 |
34 | st.subheader(
35 | "Use the sidebar on the left to navigate to any of the pages:", divider=True
36 | )
37 | st.markdown(
38 | """
39 | 1. **SIC & SOC pre-defined coding assistant**
40 |
41 | Enter respondent data into the input fields and press the validate button.
42 | The response and debugging info will be displayed below.
43 | Preloaded classification indices are used.
44 |
45 | 2. **Setup custom LLM coding assistant**
46 |
47 | You can customise your own classification assistant by uploading your own index and
48 | specifying survey fields. This will be used in a Retrieval Augmented
49 | Generation (RAG) pipeline.
50 |
51 | 3. **Test custom LLM coding assistant**
52 |
53 | After setting up a custom index and survey fields, you can test
54 | your custom LLM coding assistant.
55 | """
56 | )
57 |
--------------------------------------------------------------------------------
/app/pages/1_SIC_&_SOC_pre-defined_coding_assistant.py:
--------------------------------------------------------------------------------
1 | import dotenv
2 | import streamlit as st
3 | from sic_soc_llm.llm import ClassificationLLM
4 | from sic_soc_llm.embedding import EmbeddingHandler
5 |
6 | st.set_page_config(
7 | page_title="SIC/SOC LLM assistant",
8 | page_icon="🐥",
9 | )
10 |
11 | if st.session_state.get("open_ai_key") is None:
12 | try:
13 | openai_api_key = dotenv.dotenv_values(".env")["OPENAI_API_KEY"]
14 | st.session_state["open_ai_key"] = openai_api_key
15 | except Exception as e:
16 | st.session_state["open_ai_key"] = str(e)
17 |
18 | embed = EmbeddingHandler() # Loaded once, used twice
19 | uni_chat = ClassificationLLM("gemini-pro", embedding_handler=embed) # "text-unicorn"
20 | gpt_chat = ClassificationLLM(
21 | "gpt-4", openai_api_key=st.session_state["open_ai_key"], embedding_handler=embed
22 | )
23 |
24 |
25 | # Ask the user for the OpenAI API key if they want to use it remotely
26 | ai_key_enter = st.sidebar.text_input(
27 | "If you want to use GPT, update OpenAI API key",
28 | value="",
29 | type="password",
30 | on_change=None,
31 | )
32 | if ai_key_enter:
33 | st.session_state["open_ai_key"] = ai_key_enter
34 | gpt_chat = ClassificationLLM(
35 | "gpt-4", openai_api_key=st.session_state["open_ai_key"], embedding_handler=embed
36 | )
37 | st.sidebar.success("OpenAI API key updated successfully")
38 |
39 |
40 | # Streamlit app
41 | def main(verbose: bool = True): # noqa: C901
42 | st.subheader("LLM assisted SIC/SOC Coding", divider=True)
43 | st.subheader("Respondent data - survey fields", divider=True)
44 | # Job Title and Description inputs
45 | job_title = st.text_input("Job Title")
46 | job_description = st.text_area("Job Description")
47 | manage_others = st.toggle("Line management responsibility")
48 |
49 | # Level of Education input
50 | education_levels = [
51 | "No formal qualifications",
52 | "Level 1: one to four GCSE passes (grade A* to C or grade 4 and above)"
53 | + " and any other GCSEs at other grades, or equivalent qualifications",
54 | "Level 2: five or more GCSE passes (grade A* to C or grade 4 and above)"
55 | + " or equivalent qualifications",
56 | "Level 3: two or more A Levels or equivalent qualifications",
57 | "Level 4 or above: Higher National Certificate, Higher National Diploma,"
58 | + " Bachelor's degree, or post-graduate qualifications",
59 | "Other qualifications, of unknown level",
60 | ]
61 | level_of_education = st.selectbox("Level of Education", education_levels)
62 | industry_descr = st.text_area("What does the organisation mainly make or do?")
63 |
64 | buttons = {}
65 | st.subheader("Validate using LLM (one-shot)", divider=True)
66 | col1, col2 = st.columns(2)
67 | buttons["soc_uni"] = col1.button(
68 | "Validate input for SOC using Gemini ⛋"
69 | ) # Palm2 🦄")
70 | buttons["soc_gpt"] = col2.button("Validate input for SOC using GPT ⚛")
71 | buttons["sic_uni"] = col1.button(
72 | "Validate input for SIC using Gemini ⛋"
73 | ) # Palm2 🦄")
74 | buttons["sic_gpt"] = col2.button("Validate input for SIC using GPT ⚛")
75 | # Add buttons for rag
76 | st.subheader("Validate SIC using LLM (RAG)", divider=True)
77 | col3, col4 = st.columns(2)
78 | buttons["rag_uni"] = col3.button(
79 | "Validate input for SIC using RAG Gemini ⛋"
80 | ) # Palm2 🦄")
81 | buttons["rag_gpt"] = col4.button("Validate input for SIC using RAG GPT ⚛")
82 |
83 | if any(buttons.values()):
84 | if job_title or job_description or industry_descr:
85 | if buttons["soc_uni"]:
86 | response = uni_chat.get_soc_code(
87 | job_title,
88 | job_description,
89 | level_of_education,
90 | manage_others,
91 | industry_descr,
92 | )
93 | elif buttons["soc_gpt"]:
94 | response = gpt_chat.get_soc_code(
95 | job_title,
96 | job_description,
97 | level_of_education,
98 | manage_others,
99 | industry_descr,
100 | )
101 | elif buttons["sic_uni"]:
102 | response = uni_chat.get_sic_code(
103 | industry_descr, job_title, job_description
104 | )
105 | elif buttons["sic_gpt"]:
106 | response = gpt_chat.get_sic_code(
107 | industry_descr, job_title, job_description
108 | )
109 | elif buttons["rag_uni"]:
110 | response, _, _ = uni_chat.rag_sic_code(
111 | industry_descr, job_title, job_description
112 | )
113 | else:
114 | response, _, _ = gpt_chat.rag_sic_code(
115 | industry_descr, job_title, job_description
116 | )
117 |
118 | if response.codable & ("soc_code" in response.model_fields):
119 | st.success(
120 | f"👍 Coded as {response.soc_code}: {response.soc_descriptive}"
121 | )
122 | elif response.codable & ("sic_code" in response.model_fields):
123 | st.success(
124 | f"👍 Coded as {response.sic_code}: {response.sic_descriptive}"
125 | )
126 | else:
127 | st.warning(f"👆 More details needed: {response.followup}")
128 |
129 | if verbose:
130 | print(
131 | "Input: ",
132 | job_title,
133 | job_description,
134 | manage_others,
135 | level_of_education,
136 | )
137 | print("Response: ", response)
138 | st.subheader("Debugging info", divider=True)
139 | st.json(response.model_dump())
140 | else:
141 | st.warning("👆 Please enter something somewhere at least...")
142 |
143 |
144 | if __name__ == "__main__":
145 | main()
146 |
--------------------------------------------------------------------------------
/app/pages/2_Setup_custom_coding_assistant.py:
--------------------------------------------------------------------------------
1 | # %%
2 | import streamlit as st
3 | from io import StringIO
4 | from importlib import resources
5 | from sic_soc_llm.embedding import EmbeddingHandler
6 |
7 | # %%
8 | st.set_page_config(
9 | page_title="Custom LLM coding assistant",
10 | page_icon="🐥",
11 | )
12 |
13 |
def main():
    """Render the setup page for a custom coding assistant.

    Step 1 lets the user upload a classification index (or download one of
    the bundled examples) and embeds it; the resulting handler is kept in
    ``st.session_state.custom_embed``. Step 2 registers a comma-separated
    list of respondent data fields in ``st.session_state.custom_fields``.
    """
    # File upload
    st.subheader("1. Upload Classification Index", divider=True)
    index_file = st.file_uploader("Upload a file", type=["txt"])
    st.markdown(
        """The index file should be a text file with one classification entry
        per line in the format `code: description`.
        You can download an example index file for reference:"""
    )

    # Put the download buttons on one row with different spacing
    example_files = (
        ("Toy index", "toy_index.txt"),
        ("COICOP 5d condensed", "coicop_5d_condensed.txt"),
        ("SOC 4d condensed", "soc_4d_condensed.txt"),
        ("SIC 4d condensed", "sic_4d_condensed.txt"),
    )
    columns = st.columns([1.3, 2.2, 2, 2])
    example_dir = resources.files("sic_soc_llm.example_data")
    for column, (label, file_name) in zip(columns, example_files):
        column.download_button(
            label,
            data=(example_dir / file_name).read_bytes(),
            file_name=file_name,
            mime="text/plain",
        )

    if index_file is not None:
        try:
            # Embed the index
            embedding_handler = EmbeddingHandler(db_dir=None)
            embedding_handler.embed_index(
                file_object=StringIO(index_file.getvalue().decode("utf-8"))
            )
            # Display size of embedded index
            coll_size = embedding_handler._index_size
            if coll_size > 0:
                st.session_state.custom_embed = embedding_handler
                st.success(
                    f"Index embedding successful. Embedded index of size {coll_size}."
                )
            else:
                st.warning(
                    "Index embedding failed. Please check the index file and try again."
                )
        except Exception as e:
            # UI boundary: any failure while decoding or embedding the upload
            # is reported to the user instead of crashing the page.
            st.error(
                f"""Index embedding failed. Please check the index file and try again.
                Error: {e}"""
            )
    elif st.session_state.get("custom_embed") is not None:
        coll_size = st.session_state.custom_embed._index_size
        st.info(
            f"""A custom index has been embedded previously with
            {coll_size} entries. Uploading new index will replace it."""
        )

    # Specify respondent data fields
    st.subheader("2. Specify respondent data fields", divider=True)
    fields = st.text_input("Enter respondent data fields (separated by commas)")
    # Drop blank entries so stray or trailing commas ("a, b,") do not register
    # empty field names.
    stripped_names = (name.strip() for name in (fields or "").split(","))
    field_list = [name for name in stripped_names if name]
    if field_list:
        # Save the custom fields in session state
        st.session_state.custom_fields = field_list
        st.success(f"Respondent data fields registered: {field_list}")
    elif fields:
        # Something was typed, but it contained only commas/whitespace.
        st.warning("No valid respondent data fields found. Please enter field names.")
    elif st.session_state.get("custom_fields") is not None:
        st.info(
            f"""A custom respondent data fields previously registered:
            {st.session_state.custom_fields}."""
        )
105 |
106 | if __name__ == "__main__":
107 | main()
108 |
109 | # %%
110 |
--------------------------------------------------------------------------------
/app/pages/3_Test_custom_coding_assistant.py:
--------------------------------------------------------------------------------
1 | import dotenv
2 | import streamlit as st
3 | from sic_soc_llm.llm import ClassificationLLM
4 |
st.set_page_config(
    page_title="Custom LLM coding assistant",
    page_icon="🐥",
)

# This page needs both the embedded index and the field names that the setup
# page stores in the session state; without them there is nothing to test.
if "custom_embed" not in st.session_state or "custom_fields" not in st.session_state:
    # Point user to setup page
    st.write(
        """Custom index and response fields not provided. Please go to the
        **Setup custom LLM coding assistant** page to setup your index and fields."""
    )
    st.stop()

# Load the OpenAI key from a local .env file once per session.
if st.session_state.get("open_ai_key") is None:
    try:
        openai_api_key = dotenv.dotenv_values(".env")["OPENAI_API_KEY"]
        st.session_state["open_ai_key"] = openai_api_key
    except Exception as e:
        # NOTE(review): the error text is stored as a placeholder key so the
        # lookup is not retried on every rerun; presumably the GPT client
        # needs a non-empty string to be constructed - confirm.
        st.session_state["open_ai_key"] = str(e)

# Ask the user for the OpenAI API key if they want to use it remotely.
# This is handled before the clients are created so that the GPT client is
# built only once per run, with the most recent key.
ai_key_enter = st.sidebar.text_input(
    "If you want to use GPT, update OpenAI API key",
    value="",
    type="password",
)
if ai_key_enter:
    st.session_state["open_ai_key"] = ai_key_enter

uni_chat = ClassificationLLM(
    "gemini-pro", embedding_handler=st.session_state.custom_embed
)  # "text-unicorn"
gpt_chat = ClassificationLLM(
    "gpt-4",
    embedding_handler=st.session_state.custom_embed,
    openai_api_key=st.session_state["open_ai_key"],
)

if ai_key_enter:
    st.sidebar.success("OpenAI API key updated successfully")


fields = st.session_state.custom_fields
55 |
56 |
57 | # Streamlit app
def main(verbose: bool = True):
    """Render the custom coding form and classify the entered respondent data.

    One text input is shown per field name stored in
    ``st.session_state.custom_fields``. Pressing either button sends the
    inputs to the matching module-level classifier (``uni_chat`` or
    ``gpt_chat``) and displays the outcome.

    Args:
        verbose: When true, also print the response to stdout and show it as
            JSON under a "Debugging info" heading.
    """
    st.subheader("LLM assisted custom coding", divider=True)
    # Ask for specific field inputs from session state
    input_field = {}
    for field in st.session_state.custom_fields:
        input_field[field] = st.text_input(field)

    col1, col2 = st.columns(2)
    rag_uni_button = col1.button("Validate input using Gemini ⛋")  # Palm2 🦄")
    rag_gpt_button = col2.button("Validate input using GPT ⚛")

    if not (rag_uni_button or rag_gpt_button):
        return

    print("Input: ", input_field)
    # Whitespace-only entries carry no information, so they are treated the
    # same as empty ones and no LLM call is made for them.
    if not any(value and value.strip() for value in input_field.values()):
        st.warning("👆 Please enter something somewhere at least...")
        return

    chat = uni_chat if rag_uni_button else gpt_chat
    response, _ = chat.rag_general_code(respondent_data=input_field)

    if response.codable:
        st.success(f"👍 Coded as {response.class_code}: {response.class_descriptive}")
    else:
        st.warning(f"👆 More details needed: {response.followup}")

    if verbose:
        print("Response: ", response)
        st.subheader("Debugging info", divider=True)
        st.json(response.model_dump())
93 |
94 |
95 | if __name__ == "__main__":
96 | main()
97 |
--------------------------------------------------------------------------------
/cloud_deploy.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Build the app image remotely with Cloud Build and deploy it to Cloud Run.
#
# Usage: ./cloud_deploy.sh [PROJECT_ID]
# The project id can also be supplied via the PROJECT_ID environment variable.

# Stop at the first failing command so a failed build is never deployed.
set -euo pipefail

PROJECT_ID="${1:-${PROJECT_ID:-}}"
if [[ -z "${PROJECT_ID}" ]]; then
    echo "Usage: $0 PROJECT_ID (or set the PROJECT_ID environment variable)" >&2
    exit 1
fi

REGION="europe-west2"
IMAGE="${REGION}-docker.pkg.dev/${PROJECT_ID}/sic-soc-docker/app_test:v1"

# set the default gcloud project
gcloud config set project "${PROJECT_ID}"
# set the default compute zone to london
gcloud config set compute/zone "${REGION}-c"
# set build region to eu west
gcloud config set builds/region "${REGION}"

# build remotely (uses Dockerfile)
gcloud builds submit --tag "${IMAGE}" . --region="${REGION}"

# deploy the image as app engine (uses app.yaml)
# gcloud app deploy --image-url="${IMAGE}"
# works on port 8080 but cannot write to the file system (problem for custom classification index)

# deploy the image as google run service
gcloud run deploy sic-soc --image "${IMAGE}" \
    --min-instances=0 --max-instances=3 --region="${REGION}" --allow-unauthenticated --memory=4G --port=8080
18 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: true
2 |
3 | coverage:
4 | status:
5 | project:
6 | default:
7 | target: auto
8 | threshold: 80%
9 | informational: true
10 | patch:
11 | default:
12 | target: auto
13 | threshold: 20%
14 | informational: true
15 |
16 | ignore:
17 | - "tests"
18 | - "**/__init__.py"
19 |
--------------------------------------------------------------------------------
/docs/_quarto.yml:
--------------------------------------------------------------------------------
1 | project:
2 | type: website
3 | render:
4 | - /*.qmd
5 | - tutorials/*.qmd
6 | - reference/*.qmd
7 | preview:
8 | port: 1111
9 | browser: true
10 | watch-inputs: true
11 | navigate: true
12 | resources:
13 | - _static/
14 |
15 | website:
16 | title: sic-soc-llm
17 | navbar:
18 | left:
19 | - href: index.qmd
20 | text: About
21 | - href: method.qmd
22 | text: Method
23 | - href: tutorials/index.qmd
24 | text: Tutorials
25 | - href: reference/index.qmd
26 | text: Reference
27 | right:
28 | - icon: github
29 | url: https://github.com/datasciencecampus/sic-soc-llm
30 | reader-mode: false
31 | page-footer:
32 | left: >
33 | All content is available under the
34 | [Open Government Licence V3.0](http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/),
35 | except where otherwise stated.
36 | center: >
37 | Built using [Quarto](https://quarto.org/).
38 |
39 | format:
40 | html:
41 | mainfont: Arial
42 | theme:
43 | dark: cyborg
44 | light: cosmo
45 | lang: en-GB
46 |
47 | metadata-files:
48 | - reference/_sidebar.yml
49 |
50 | quartodoc:
51 | parser: google
52 | title: LLM based classification
53 | package: sic_soc_llm
54 | dir: reference
55 | sidebar: reference/_sidebar.yml
56 | sections:
57 | - title: Classification module
58 | desc: >
59 | Large Language Model based classification main handlers.
60 | package: sic_soc_llm
61 | contents:
62 | - embedding.EmbeddingHandler
63 | - llm.ClassificationLLM
64 | - prompt.PromptTemplates
65 |
66 | - subtitle: Response models
67 | package: sic_soc_llm.data_models.response_model
68 | contents:
69 | - SocCandidate
70 | - SocResponse
71 | - SicCandidate
72 | - SicResponse
73 | - RagCandidate
74 | - RagResponse
75 |
76 | - title: SIC Index Abstraction
77 | desc: >
78 | Data models to represent Standard Industrial Classification
79 | package: sic_soc_llm.data_models.sic_hierarchy
80 | contents:
81 | - SIC
82 | - SicCode
83 | - SicNode
84 | - subtitle: SIC metadata
85 | package: sic_soc_llm.data_models
86 | contents:
87 | - sic_meta_model.ClassificationMeta
88 | - sicDB.SicMeta
89 | - title: Helpers
90 | desc: Config and Log utils
91 | package: sic_soc_llm
92 | contents:
93 | - setup_logging
94 | - get_config
95 | - check_file_exists
96 |
--------------------------------------------------------------------------------
/docs/_static/app-ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencecampus/sic-soc-llm/d3b758970968427483f534faefbca34fff426172/docs/_static/app-ui.png
--------------------------------------------------------------------------------
/docs/_static/sic-soc-llm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencecampus/sic-soc-llm/d3b758970968427483f534faefbca34fff426172/docs/_static/sic-soc-llm.png
--------------------------------------------------------------------------------
/docs/index.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | format: html
3 | ---
4 |
5 | {{< include ../README.md >}}
6 |
7 |
8 |
--------------------------------------------------------------------------------
/docs/method.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: Method
3 | format: html
4 | ---
5 |
6 | ## Summary
7 |
8 | A proof-of-concept large language model (LLM) application was created to assess whether an LLM could improve SIC autocoding performance for survey data. This was applied to a sample of anonymised survey data and evaluated by comparing the results to clerical coding and to a logistic regression model. The LLM showed marginal improvement over the logistic regression in the level of agreement with clerical coding at the 5-digit SIC level. It is likely that refinement of the method would improve performance further. Note that the evaluation scripts are out of scope for this repository. The methodology of the main SIC autocoding module is described below. For more information see the Data Science Campus [blog](https://datasciencecampus.ons.gov.uk/classifai-exploring-the-use-of-large-language-models-llms-to-assign-free-text-to-commonly-used-classifications/).
9 |
10 | ## RAG based classification
11 |
12 | The proposed LLM-based method for auto coding of free text survey responses involves two main steps. Our implementation follows the common Retrieval Augmented Generation (RAG) design; for an overview,
13 | see @fig-system.
14 |
15 | ![Overview of the RAG-based classification pipeline](_static/sic-soc-llm.png){#fig-system}
16 |
17 | The primary **input** for this process consists of three free text fields from survey responses: the company's activity, job title, and job description.
18 |
19 | 1. **Semantic Search of Relevant SIC Index Candidates**
20 |
21 | The first step in the process involves conducting a semantic search for relevant Standard Industrial Classification (SIC) index candidates. This is achieved by embedding a knowledge base using the transformer language model MiniLM. The knowledge base includes a list of activities, each with an assigned SIC code. MiniLM is a smaller, more efficient version of the BERT-based transformer model, designed for tasks that require understanding the semantic meaning of text. It is used to convert the text from the survey response into a form that can be compared with the embeddings of the activities in the knowledge base. The result of this step is a list of potential SIC codes that may be relevant to the response.
22 |
23 | 2. **LLM Query**
24 |
25 | The second step involves querying a general purpose pretrained large language model (Gemini-Pro) to evaluate which, if any, of the SIC code candidates is the best fit for the response. This step leverages the ability of LLMs to understand and generate human-like text. The LLM is presented with the response and the list of potential SIC codes and their descriptions, and it is asked to determine which code should be assigned based on the response. If the decision cannot be confidently made, the LLM is instructed to return an uncodable status.
26 |
27 | The **output** from the LLM is required in such a form that specific fields can be identified and easily analysed:
28 |
29 | - Codable (Yes/No): This field indicates whether or not the survey response could be assigned a SIC code.
30 | - SIC code: This field contains the SIC code that was determined to be the best fit for the response. The code may be requested at either the 5-digit or 2-digit levels.
31 | - Follow-up question: This field specifies a suitable follow-up question to clarify the response in case that an appropriate SIC code cannot be readily determined.
32 | - SIC candidates (+likelihood estimate): This field lists the SIC codes that were considered as potential matches for the response, along with an estimate of the likelihood that each code is the correct match.
33 | - Reasoning: This field provides an explanation of why the LLM selected the particular SIC code or decided that the correct code cannot be determined.
34 |
35 | Alterations to the pipeline were considered. For example, instead of providing a short-list of candidates one can take advantage of the ever-increasing context window (input length allowance) and include the full index or use the LLM’s own awareness of the SIC index. We found that these options yield worse results than the RAG approach outlined above for this particular task and model.
36 |
37 | Both steps rely on pretrained transformer-based models. Because the latest LLMs have been trained on large bodies of text and have billions of parameters they are able to identify the semantic meaning of words, nuance in grammar and spelling. In contrast with rule-based or bag-of-words based machine learning methods this improves how it handles previously unseen responses, such as emerging jobs and industries, unusually phrased or misspelled responses.
38 |
39 | The use of pretrained models in our pipeline provides a solid foundation, but there is an option to fine-tune these models on a specific task to potentially improve performance. Fine-tuning involves continuing the training of the pretrained model on a new dataset, in this case, the survey responses and SIC codes. However, it tends to be computationally expensive and time-consuming and requires a large, annotated dataset, which was not available.
40 |
41 | An alternative approach to the one-shot prompt used in the second step of the pipeline is to use an agent-based method. In this approach, instead of the LLM making a decision based on a single interaction, the LLM acts as an agent that engages in a dialogue with the text data. The LLM, acting as an agent, can be dynamically assigned different roles in the conversation or specialist tasks. However, this approach requires more computational resources and time, as it involves multiple interactions with the LLM.
42 |
43 | At this moment we have not evaluated the quality of the provided follow-up questions and reasoning, but we included them in the proof of concept due to their potential to improve the data collection step (whether as a one-off qualitative analysis or in a real-time process).
44 |
45 | The codebase includes an example user interface. This allows small-scale testing where users can experiment with different models and test their sensitivity to the input. A working example with output is shown in @fig-app.
46 |
47 | ![Example user interface of the coding assistant app](_static/app-ui.png){#fig-app}
48 |
--------------------------------------------------------------------------------
/docs/tutorials/1_sic_data_structure.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "1. SIC data structure"
3 | format:
4 | html:
5 | code-fold: show
6 | ---
7 |
8 | Demonstration notebook for the SIC data structure.
9 |
10 | ```{python}
11 | #| code-summary: "Code: Import methods and initialise"
12 | #| output: false
13 | import random
14 |
15 | from sic_soc_llm import setup_logging, get_config
16 | from sic_soc_llm.data_models import sic_hierarchy, sic_data_access
17 |
18 | logger = setup_logging("sic_data_notebook")
19 | config = get_config()
20 | seed = 3847693223
21 | ```
22 |
23 | There are two additional datasets required for the SIC hierarchy object that are not part of the repository. These are the SIC structure and SIC index datasets. The following code will download these datasets from the ONS website if they are not already available.
24 |
25 | ```{python}
26 | #| output: false
27 | #| code-summary: "Code: Make sure all required SIC datasets are available"
import requests
from pathlib import Path

# Source workbooks on the ONS website, in the same order as `file_paths`.
sic_urls = [
    "https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/publisheduksicsummaryofstructureworksheet.xlsx",
    "https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/uksic2007indexeswithaddendumdecember2022.xlsx"
]

# Local destinations taken from the package config.
file_paths = [
    Path(config['lookups']['sic_structure']),
    Path(config["lookups"]["sic_index"])
]

for url, file_path in zip(sic_urls, file_paths):
    if not file_path.exists():
        r = requests.get(url, timeout=60)
        # Fail loudly on an HTTP error instead of saving the error page as
        # the dataset, which would then never be re-downloaded.
        r.raise_for_status()
        file_path.parent.mkdir(exist_ok=True, parents=True)
        file_path.write_bytes(r.content)
47 | ```
48 |
49 | ## Load SIC index
50 |
51 | ```{python}
52 | #| code-summary: "Code: Load SIC index"
53 | sic_index_filepath = config["lookups"]["sic_index"]
54 | sic_index_df = sic_data_access.load_sic_index(sic_index_filepath)
55 |
56 | sic_index_df.sample(5, random_state=seed)
57 |
58 | ```
59 |
60 | ## Load SIC structure
61 |
62 | ```{python}
63 | #| code-summary: "Code: Load SIC structure"
64 | sic_structure_filepath = config["lookups"]["sic_structure"]
65 | sic_df = sic_data_access.load_sic_structure(sic_structure_filepath)
66 |
67 | sic_df.sample(5, random_state=seed)
68 | ```
69 |
70 | ## Create SIC hierarchy
71 |
72 | ```{python}
73 | #| code-summary: "Code: Create SIC hierarchy"
# Combine the SIC structure and the SIC index into one hierarchy object.
sic = sic_hierarchy.load_hierarchy(sic_df, sic_index_df)

print(f"There are {len(sic):,} entries in the hierarchy")
77 | ```
78 |
79 | ## Example lookup
80 |
81 | The hierarchy supports a variety of common formatting patterns for SIC codes.
82 | Sometimes a 4-digit SIC code serves as a 5-digit SIC code.
83 | ```{python}
84 | #| code-summary: "Code: Example lookup"
85 | print(sic["A011xx"])
86 | print(sic["A011"])
87 | print(sic["011"])
88 | print(sic["01.1"])
89 |
90 | print(sic["A0111x"])
91 | print(sic["0111"])
92 | print(sic["01110"])
93 | ```
94 |
95 | ## Select a random example
96 |
97 | ```{python}
98 | #| code-summary: "Code: Example SIC index entry"
99 | random.seed(seed)
100 | sic_node = random.choice(sic.nodes)
101 |
102 | sic_node.print_all()
103 | ```
104 |
--------------------------------------------------------------------------------
/docs/tutorials/2_sic_classifier.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "2. SIC classifier"
3 | execute:
4 | warning: False
5 | format:
6 | html:
7 | code-fold: show
8 | ---
9 |
10 | Demonstration notebook for the `ClassificationLLM` using Retrieval Augmented Generation (RAG) with Standard Industrial Classification (SIC) codes.
11 |
12 | ```{python}
13 | #| code-summary: "Code: Import methods and initialise"
14 | from sic_soc_llm import setup_logging, get_config
15 | from sic_soc_llm.embedding import EmbeddingHandler
16 | from sic_soc_llm.llm import ClassificationLLM
17 |
18 | logger = setup_logging('sic_classifier')
19 | config = get_config()
20 | ```
21 |
22 | ```{python}
23 | #| code-summary: "Code: Make sure the SIC datasets are available"
24 | #| echo: false
25 |
import requests
from pathlib import Path
import hashlib

# Source workbooks on the ONS website, in the same order as `file_paths`
# and `expected_hashes`.
sic_urls = [
    "https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/publisheduksicsummaryofstructureworksheet.xlsx",
    "https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/uksic2007indexeswithaddendumdecember2022.xlsx"
]

# Local destinations taken from the package config.
file_paths = [
    Path(config['lookups']['sic_structure']),
    Path(config["lookups"]["sic_index"])
]

# SHA256 digests the downloaded files must match.
expected_hashes = [
    'f5090c89938b1f24f7b1498530bc99f520abf4198a3af3f3655814c094cc0944',
    '3d6bf9d0950b8b9836d3590001cb391ac4338a8319a4b519483ad17f0d51f085'
]

for url, file_path, expected_hash in zip(sic_urls, file_paths, expected_hashes):
    if not file_path.exists():
        r = requests.get(url, timeout=60)
        r.raise_for_status()

        # Verify the SHA256 hash BEFORE writing anything: a file left on disk
        # after a failed check would be picked up by the next run, which
        # skips the download (and therefore the check) for existing files.
        file_hash = hashlib.sha256(r.content).hexdigest()
        if file_hash != expected_hash:
            raise ValueError(f"Downloaded file {file_path} has incorrect hash {file_hash}, expected {expected_hash}")

        file_path.parent.mkdir(exist_ok=True, parents=True)
        file_path.write_bytes(r.content)
58 | ```
59 |
60 | ```{python}
61 | #| echo: false
62 | #| code-summary: "Code: Create a fake Large Language Model (LLM) for demonstration purposes"
63 | from langchain.llms.fake import FakeListLLM
64 |
65 | sic_demo_llm = FakeListLLM(responses=[
66 | '''
67 | { "codable": true, "sic_code": "86101", "sic_descriptive": "Hospital activities", "sic_candidates": [ { "sic_code": "86101", "sic_descriptive": "Hospital activities", "likelihood": 0.9 }, { "sic_code": "86220", "sic_descriptive": "Specialist medical practice activities", "likelihood": 0.1 } ], "reasoning": "The company\'s main activity is providing care to patients, which aligns with the \'Hospital activities\' SIC code. The job title and description also suggest a hospital setting. However, there is a small possibility that the company could fall under \'Specialist medical practice activities\' as the job title is a specialist role."}
68 | ''',
69 | '''
70 | { "codable": true, "sic_code": "03110", "sic_descriptive": "Marine fishing", "sic_candidates": [ { "sic_code": "03110", "sic_descriptive": "Marine fishing", "likelihood": 1 } ], "reasoning": "The company\'s main activity is described as \'catching fish on the north sea from grimsby port\', which aligns with the \'Marine fishing\' category under SIC code 03110."}
71 | ''',
72 | '''{ "codable": true, "sic_code": "66190", "sic_descriptive": "Other activities auxiliary to financial services, except insurance and pension funding", "sic_candidates": [ { "sic_code": "66190", "sic_descriptive": "Other activities auxiliary to financial services, except insurance and pension funding", "likelihood": 0.7 }, { "sic_code": "64191", "sic_descriptive": "Banks", "likelihood": 0.2 }, { "sic_code": "64991", "sic_descriptive": "Security dealing on own account", "likelihood": 0.1 } ], "reasoning": "The company\'s main activity is bitcoin trading, which falls under \'Other activities auxiliary to financial services, except insurance and pension funding\'. However, it could also potentially fall under \'Banks\' or \'Security dealing on own account\', but these are less likely."}
73 | ''',
74 | '''{ "codable": true, "sic_code": "85590", "sic_descriptive": "Other education nec", "sic_candidates": [ { "sic_code": "85590", "sic_descriptive": "Other education nec", "likelihood": 0.9 }, { "sic_code": "85600", "sic_descriptive": "Educational support activities", "likelihood": 0.1 } ], "reasoning": "The company\'s main activity of matching tutors to pupils for extra help outside of school aligns with the \'Other education nec\' category (SIC code 85590). The job description of helping GCSE and A level students achieve the best possible results further supports this classification. The \'Educational support activities\' category (SIC code 85600) could also be a possibility, but is less likely given the specific tutoring focus of the company."}
75 | '''
76 | ])
77 |
78 | # populate the vector store with tiny index for demo purposes
79 | embed = EmbeddingHandler()
80 | if embed._index_size == 0:
81 | index_filepath = config["lookups"]["sic_condensed"]
82 | with open(index_filepath) as file_object:
83 | embed.embed_index(file_object=file_object)
84 | ```
85 |
86 | For the retrieval part of the RAG based SIC classification a correctly populated vector store is required. By default, the `EmbeddingHandler` loads the `SIC` data structure with all its activities using the files specified in `sic_soc_llm_config.toml`. This may take several minutes.
87 |
88 | For more details about the `SIC` data structure and the data files required for it, see the [SIC data structure tutorial](1_sic_data_structure.html).
89 |
90 | ```{python}
91 | #| code-summary: "Code: Populate vector store"
92 | embed = EmbeddingHandler()
93 | if embed._index_size == 0:
94 | embed.embed_index()
95 | ```
96 |
97 | As we have already initialised the `EmbeddingHandler` we can pass it to the `ClassificationLLM` object; this is not essential as the `ClassificationLLM` will initialise its own `EmbeddingHandler` if one is not provided (based on the same config values). Note that the `sic_demo_llm` should be replaced with the LLM of your choice.
98 |
99 | ```{python}
100 | #| code-summary: "Code: Initialise the SIC classifier"
101 | sic_llm = ClassificationLLM(llm=sic_demo_llm, embedding_handler=embed)
102 | ```
103 |
104 |
105 | ## Example SIC classification
106 |
107 | Load a few examples of possible survey responses and classify them using the SIC classifier.
108 |
109 | ```{python}
110 | #| code-summary: "Code: Input and classify examples"
111 | sic_examples = [
112 | {
113 | "industry_descr": "we provide care to thousands of patients across north east lincolnshire",
114 | "job_title": "anaesthetist",
115 | "job_description": "give anaesthetics for surgical, medical and psychiatric procedures"
116 | },
117 | {
118 | "industry_descr": "we catch fish on the north sea from grimsby port",
119 | "job_title": None,
120 | "job_description": None
121 | },
122 | {
123 | "industry_descr": "bitcoin trading",
124 | "job_title": None,
125 | "job_description": None
126 |
127 | },
128 | {
129 | "industry_descr": "we match tutors to pupils for extra help outside of school",
130 | "job_title": None,
131 | "job_description": "help gcse and a level students achieve the best possible results"
132 | },
133 | ]
134 |
135 | for item in sic_examples:
136 | # Get response from LLM
137 | response, short_list, call_dict = sic_llm.rag_sic_code(
138 | industry_descr = item["industry_descr"],
139 | job_title = item["job_title"],
140 | job_description = item["job_description"],
141 | )
142 |
143 | # Print the output
144 | print("Input:")
145 | for v, w in item.items():
146 | print(f" {v}: {w}")
147 | print('')
148 |
149 | print("Response:")
150 | for x,y in response.__dict__.items():
151 | print (f" {x}: {y}")
152 | print("")
153 | print('===========================================')
154 | print("")
155 |
156 | ```
157 |
--------------------------------------------------------------------------------
/docs/tutorials/3_soc_classifier.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "3. SOC classifier"
3 | execute:
4 | warning: False
5 | format:
6 | html:
7 | code-fold: show
8 | ---
9 |
10 | Demonstration notebook for the `ClassificationLLM` with Standard Occupational Classification (SOC) codes.
11 |
12 | ```{python}
13 | #| code-summary: "Code: Import methods and initialise"
14 | from sic_soc_llm import setup_logging
15 | from sic_soc_llm.llm import ClassificationLLM
16 |
17 | logger = setup_logging("soc_classifier")
18 | ```
19 |
20 | ```{python}
21 | #| echo: false
22 | #| code-summary: "Code: Create a fake Large Language Model (LLM) for demonstration purposes"
23 | from langchain.llms.fake import FakeListLLM
24 |
25 | soc_demo_llm = FakeListLLM(responses=[
26 | '''
27 | {"codable": true, "followup": null, "soc_code": "9265", "soc_descriptive": "Bar staff", "soc_candidates": [{"soc_code": "9265", "soc_descriptive": "Bar staff", "likelihood": 1.0}], "soc_code_2digits": "92", "reasoning": "The job title \'barman\' and the job description \'barman at local golf club\' clearly indicate that the respondent\'s job involves serving drinks at a bar, which aligns with the SOC code 9265 for \'Bar staff\'."}
28 | ''',
29 | '''
30 | { "codable": true, "followup": null, "soc_code": "1221", "soc_descriptive": "Hotel and accommodation managers and proprietors", "soc_candidates": [ { "soc_code": "1221", "soc_descriptive": "Hotel and accommodation managers and proprietors", "likelihood": 1 } ], "soc_code_2digits": "12", "reasoning": "The job title \'hotel night manager\' and the company\'s main activity being a hotel aligns with the SOC code 1221 for \'Hotel and accommodation managers and proprietors\'. The job description, although unclear, seems to involve duties that could be associated with this role."}''',
31 | '''
32 | { "codable": false, "followup": "Could you please provide more details about your daily tasks and responsibilities in this role?", "soc_code": null, "soc_descriptive": null, "soc_candidates": [ { "soc_code": "2139", "soc_descriptive": "Information technology professionals n.e.c.", "likelihood": 0.7 } ], "soc_code_2digits": "21", "reasoning": "The job title \'functional consultant\' and the job description \'provide consultancy on system configuration\' suggest a role in IT consultancy. However, more specific information about the tasks and responsibilities of the role is needed to assign a more accurate SOC code."}
33 | ''',
34 | '''
35 | {"codable": true, "followup": null, "soc_code": "6213", "soc_descriptive": "Air travel assistants", "soc_candidates": [{"soc_code": "6213", "soc_descriptive": "Air travel assistants", "likelihood": 0.9}], "soc_code_2digits": "62", "reasoning": "The job title \'senior airport services agent\' and the job description \'customer service\' in the context of an airline company suggest that the respondent\'s role involves assisting passengers and providing customer service in an airport setting. This aligns with the SOC code 6213 for \'Air travel assistants\'."}
36 | ''',
37 | '''
38 | { "codable": false, "followup": "Could you please provide more specific information about your job responsibilities and the nature of the materials you work with?", "soc_code": null, "soc_descriptive": null, "soc_candidates": [ { "soc_code": "2125", "soc_descriptive": "Production and Process Engineers", "likelihood": 0.5 }, { "soc_code": "2122", "soc_descriptive": "Mechanical Engineers", "likelihood": 0.5 } ], "soc_code_2digits": "21", "reasoning": "The job title translates to \'Engineer\' and the company\'s main activity involves \'Processing Materials\'. This could correspond to several engineering roles within the \'21\' SOC code category, but without more specific information, it is not possible to determine the exact SOC code."}
39 | '''
40 | ])
41 | ```
42 |
43 | The example SOC classifier uses a one-shot prompt to classify respondent's data. In particular, there is no retrieval step (to reduce the list of candidate codes) and the whole condensed index is included in the prompt. Note that the `soc_demo_llm` should be replaced with the LLM of your choice.
44 |
45 | ```{python}
46 | #| code-summary: "Code: Initialise the SOC classifier"
47 | soc_llm = ClassificationLLM(llm=soc_demo_llm)
48 | ```
49 |
50 | ## Example SOC classifications
51 |
52 | Load a few examples of possible survey responses and classify them using the SOC classifier.
53 |
54 | ```{python}
55 | #| code-summary: "Code: Input and classify examples"
56 | soc_examples = [
57 | {
58 | "job_title": "barman",
59 | "job_description": "barman at local golf club",
60 | "employer_activities": "golf club",
61 | },
62 | {
63 | "job_title": "hotel night manager",
64 | "job_description": """hight potter reception closing documents
65 | breakfast preparation""",
66 | "employer_activities": "hotel",
67 | },
68 | {
69 | "job_title": "functional consultant",
70 | "job_description": "provide cnsultancy on system configuration",
71 | "employer_activities": "technology provide deliver enterprise software",
72 | },
73 | {
74 | "job_title": "senior airport services agent",
75 | "job_description": "customer service",
76 | "employer_activities": "airline",
77 | },
78 | {
79 | "job_title": "PEIRIANYDD",
80 | "job_description": "TRWSHIO",
81 | "employer_activities": "TRIN PERIANAU",
82 | },
83 | ]
84 |
85 | for item in soc_examples:
86 | # Get response from LLM
87 | response = soc_llm.get_soc_code(
88 | item["job_title"],
89 | item["job_description"],
90 | level_of_education="Unknown",
91 | manage_others="Unknown",
92 | industry_descr=item["employer_activities"],
93 | )
94 |
95 | # Print the output
96 | print("Input:")
97 | for v, w in item.items():
98 | print(f" {v}: {w}")
99 | print('')
100 |
101 | print("Response:")
102 | for x,y in response.__dict__.items():
103 | print (f" {x}: {y}")
104 | print("")
105 | print('===========================================')
106 | print("")
107 | ```
108 |
--------------------------------------------------------------------------------
/docs/tutorials/4_custom_coicop_classifier.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "4. Custom (COICOP) classifier"
3 | format:
4 | html:
5 | code-fold: show
6 | ---
7 |
8 | Demonstration notebook for the `ClassificationLLM` using RAG with a custom index. In this demo, the Classification of Individual Consumption According to Purpose (COICOP) index is used.
9 |
10 | ```{python}
11 | #| code-summary: "Code: Import methods and initialise"
12 | #| output: false
13 | from sic_soc_llm import setup_logging, get_config
14 | from sic_soc_llm.llm import ClassificationLLM
15 | from sic_soc_llm.embedding import EmbeddingHandler
16 |
17 | logger = setup_logging('coicop_notebook')
18 | config = get_config()
19 | ```
20 |
21 |
22 | ```{python}
23 | #| echo: false
24 | from langchain.llms.fake import FakeListLLM
25 |
26 | coicop_demo_llm = FakeListLLM(responses=[
27 | '''
28 | {
29 | "codable": true,
30 | "class_code": "CP01141",
31 | "class_descriptive": "Whole milk",
32 | "alt_candidates": [
33 | {
34 | "class_code": "CP01146",
35 | "class_descriptive": "Other milk products",
36 | "likelihood": 0.1
37 | },
38 | {
39 | "class_code": "CP01199",
40 | "class_descriptive": "Other food products n.e.c.",
41 | "likelihood": 0.05
42 | }
43 | ],
44 | "reasoning": "The respondent's data mentions 'organic whole milk' which directly matches with the 'Whole milk' category in the classification index. Although the milk is organic, there is no separate category for organic milk in the provided subset of classification index. Therefore, the most suitable classification code is 'CP01141' for 'Whole milk'. Other possible but less likely categories could be 'Other milk products' or 'Other food products n.e.c.'."
45 | }
46 | ''','''
47 | {
48 | "codable": false,
49 | "followup": "Is the item intended for men or women?",
50 | "class_code": null,
51 | "class_descriptive": null,
52 | "alt_candidates": [
53 | {
54 | "class_code": "CP03121",
55 | "class_descriptive": "Garments for men",
56 | "likelihood": 0.5
57 | },
58 | {
59 | "class_code": "CP03122",
60 | "class_descriptive": "Garments for women",
61 | "likelihood": 0.5
62 | }
63 | ],
64 | "reasoning": "The item 'skinny jeans' can be classified as either 'Garments for men' or 'Garments for women'. Without information on the intended gender for the item, a definitive classification cannot be made."
65 | }
66 | ''','''
67 | {
68 | "codable": true,
69 | "class_code": "CP06220",
70 | "class_descriptive": "Dental services",
71 | "alt_candidates": [],
72 | "reasoning": "The respondent's data mentions 'tooth filling' which is a service provided by dentists. Therefore, the classification code 'CP06220' for 'Dental services' is the most appropriate."
73 | }'''
74 | ])
75 | ```
76 |
77 | ## Load COICOP or other custom index
78 |
79 | The expected format of the custom index is a text file with each line containing one index entry in the format `class_code : class_descriptive`. The following code snippet demonstrates how to load and embed the COICOP index. This embedding is saved in a vector store that is used in the retrieval step of RAG based classification in `ClassificationLLM`. Note that the `coicop_demo_llm` should be replaced with the LLM of your choice.
80 |
81 | ```{python}
82 | #| code-summary: "Code: Load COICOP index"
83 | #| warning: false
84 | index_filepath = config["lookups"]["coicop_condensed"]
85 | with open(index_filepath) as file_object:
86 | for _ in range(5):
87 | print(next(file_object))
88 |
89 | embed = EmbeddingHandler(db_dir=None)
90 | with open(index_filepath) as file_object:
91 | embed.embed_index(file_object=file_object)
92 |
93 | coicop_llm = ClassificationLLM(embedding_handler=embed, llm = coicop_demo_llm)
94 | ```
95 |
96 | ## Example classification using COICOP index
97 |
98 | The following code block demonstrates how to classify a few examples using the COICOP index. Note that the respondent data is passed as a dictionary. For different use cases, any custom survey fields can be used as keys in the dictionary. `ClassificationLLM` uses the values that are present in the dictionary to retrieve the relevant information from the index and includes all the provided fields in the generative query step.
99 |
100 | ```{python}
101 | #| code-summary: "Code: Example lookup"
102 | #| warning: false
103 | for item in ["organic whole milk", "skinny jeans", "tooth filling"]:
104 | # Get response from LLM
105 | response, short_list = coicop_llm.rag_general_code(respondent_data={"item": item})
106 |
107 | # Print the output
108 | print("Input:")
109 | print(f" item: {item}")
110 | print('')
111 | print("Response:")
112 | for x,y in response.__dict__.items():
113 | print (f' {x}: {y}')
114 | print(f" shortlist used in RAG: {short_list}")
115 | print("")
116 | print('===========================================')
117 | print("")
118 | ```
119 |
--------------------------------------------------------------------------------
/docs/tutorials/index.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: Tutorials
3 | listing:
4 | type: table
5 | contents:
6 | - "*.qmd"
7 | fields: [title, description, reading-time]
8 | sort-ui: false
9 | filter-ui: false
10 | ---
11 |
12 | These tutorials walk you through some of the essential workflows for `sic-soc-llm`.
13 |
14 |
15 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "sic_soc_llm"
3 | description = "An app for LLM based SIC/SOC Classification"
4 | authors = [{name = "Data Science Campus", email = "datasciencecampus@ons.gov.uk"}]
5 | readme = "README.md"
6 | license = {file = "LICENSE"}
7 | requires-python = ">=3.10"
8 | dynamic = ["version"]
9 | dependencies = [
10 | "toml==0.10.2",
11 | "numpy==1.26.3",
12 | "pandas==2.1.4",
13 | "langchain==0.1.0",
14 | "langchain-google-vertexai==0.0.1",
15 | "langchain-openai==0.0.2",
16 | "openai==1.7.2",
17 | "google-cloud-aiplatform==1.38.1",
18 | "sentence-transformers==2.3.1",
19 | "chromadb==0.4.22",
20 | "autocorrect==2.6.1",
21 | "pyfarmhash==0.3.2",
22 | "openpyxl==3.1.2",
23 | "pyprojroot==0.3.0",
24 | ]
25 | classifiers = [
26 | "Programming Language :: Python :: 3",
27 | "License :: OSI Approved :: MIT License",
28 | "Operating System :: OS Independent",
29 | "Do not upload :: Internal project :: !"
30 | ]
31 |
32 | [build-system]
33 | requires = ["setuptools>=62"]
34 | build-backend = "setuptools.build_meta"
35 |
36 | [tool.setuptools.dynamic]
37 | version = {attr = "sic_soc_llm.__version__"}
38 |
39 | [tool.setuptools.packages.find]
40 | where = ["src"]
41 | namespaces = false
42 |
43 | [tool.setuptools.package-data]
44 | sic_soc_llm = [
45 | "example_data/*.txt",
46 | "_config/*.toml",
47 | ]
48 |
49 | [project.optional-dependencies]
50 | app = [
51 | "streamlit==1.30.0",
52 | "python-dotenv==1.0.0",
53 | ]
54 | test = [
55 | "pytest==6.2.5",
56 | "pytest-pythonpath==0.7.4",
57 | "coverage==7.5.4",
58 | ]
59 |
60 | docs = ["quartodoc>=0.6.6",
61 | "ipykernel==6.23.2",
62 | "nbclient==0.10.0",
63 | "nbformat==5.9.2",
64 | ]
65 |
66 | dev = [
67 | "pre-commit==3.3.3",
68 | "dill==0.3.8",
69 | "matplotlib_venn==0.11.10",
70 | "sic_soc_llm[app]",
71 | "sic_soc_llm[test]",
72 | "sic_soc_llm[docs]"
73 | ]
74 |
75 | [project.urls]
76 | homepage = "https://github.com/datasciencecampus/sic-soc-llm"
77 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/__init__.py:
--------------------------------------------------------------------------------
1 | """sic-soc-llm: LLM assisted SIC/SOC classification."""
2 |
3 | __version__ = "0.0.1"
4 |
5 | from .logs import setup_logging
6 | from ._config.main import get_config, check_file_exists
7 |
8 | __all__ = ["setup_logging", "get_config", "check_file_exists"]
9 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/_config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencecampus/sic-soc-llm/d3b758970968427483f534faefbca34fff426172/src/sic_soc_llm/_config/__init__.py
--------------------------------------------------------------------------------
/src/sic_soc_llm/_config/main.py:
--------------------------------------------------------------------------------
1 | """Provides configuration for the project.
2 |
3 | Usage:
4 | ```
5 | from sic_soc_llm import get_config
6 | config = get_config()
7 | config["llm"]["llm_model_name"]
8 | ```
9 | """
10 |
11 | from pathlib import Path
12 | from typing import Optional, Union
13 | import toml
14 | import logging
15 | from importlib import resources
16 | from pyprojroot import here
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 | _config = None
21 |
22 |
def check_file_exists(
    file_name: Optional[Union[Path, str]] = "sic_soc_llm_config.toml"
) -> Optional[Path]:
    """Locate a file and return its path, or None if it cannot be found.

    An absolute path is only checked for existence. A relative path is
    looked up in these locations, in order, and the first hit wins:
        1. relative to the current working directory
        2. relative to the project root directory (as reported by `here()`)
        3. relative to the user's home directory
        4. relative to the `sic_soc_llm._config` package resources
        5. relative to the `sic_soc_llm.example_data` package resources

    Args:
        file_name (Path or str, optional): The name of the file to check.
            Defaults to config file name.

    Returns:
        Optional[Path]: The path to the file if it exists, None otherwise.
    """
    file_path = Path(file_name)

    # An absolute path is taken at face value; no search is performed.
    if file_path.is_absolute():
        return file_path if file_path.exists() else None

    # Zero-argument callables keep the lookup lazy: a later location (project
    # root, package resources) is only resolved when all earlier ones missed.
    base_dir_getters = (
        Path.cwd,
        lambda: Path(here()),
        Path.home,
        lambda: resources.files("sic_soc_llm._config"),
        lambda: resources.files("sic_soc_llm.example_data"),
    )
    for get_base_dir in base_dir_getters:
        candidate = get_base_dir() / file_path
        if candidate.exists():
            return candidate

    return None
63 |
64 |
def get_config(
    config_name: Optional[Union[Path, str]] = "sic_soc_llm_config.toml"
) -> dict:
    """Fetch the configuration, loading it on first use and caching it.

    The first call resolves `config_name` with `check_file_exists`, parses the
    TOML file and stores the result in the module-level `_config`. Every later
    call returns that cached dict, so `config_name` is only honoured by the
    call that performs the initial load.

    Each entry of the `[lookups]` table is resolved with `check_file_exists`;
    when the file is found, the entry is replaced by the resolved path. Entries
    that cannot be resolved keep their original value.

    Args:
        config_name (Path or str, optional): The name of the config file to load.
            Defaults to relative path "sic_soc_llm_config.toml" - in such case it
            looks for the config file in 1. current dir, 2. project dir, 3. user home
            and 4. package resources.

    Returns:
        dict: Configuration for the system.

    Raises:
        FileNotFoundError: If the config file is not found, or if the file
            listed under `sic_condensed` or `soc_condensed` is not found.
    """
    global _config

    if _config is not None:
        return _config

    config_filepath = check_file_exists(config_name)
    if config_filepath is None:
        raise FileNotFoundError("Config file not found.")

    # TOML documents are UTF-8 by specification; do not depend on the locale's
    # default encoding when decoding the file.
    with open(config_filepath, mode="r", encoding="utf-8") as f:
        logger.info("Loading config from %s", config_filepath)
        in_config = toml.load(f)

    required_lookups = ("sic_condensed", "soc_condensed")
    # Only values of existing keys are reassigned, so mutating the table while
    # iterating over its items is safe.
    for key, lookup_file in in_config["lookups"].items():
        lookup_file_path = check_file_exists(lookup_file)
        if lookup_file_path is not None:
            in_config["lookups"][key] = lookup_file_path
        elif key in required_lookups:
            raise FileNotFoundError(
                f"Required lookup file {key}: {lookup_file} not found."
            )
        else:
            logger.warning("Optional lookup file %s: %s not found.", key, lookup_file)

    # The cache is only populated once every required lookup has resolved.
    _config = in_config
    logger.debug("Config values: %s", _config)

    return _config
112 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/_config/sic_soc_llm_config.toml:
--------------------------------------------------------------------------------
1 | [sic_soc_data]
2 |
3 | [lookups]
4 | sic_structure = "data/sic-index/publisheduksicsummaryofstructureworksheet.xlsx"
5 | # https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/publisheduksicsummaryofstructureworksheet.xlsx
6 | sic_index = "data/sic-index/uksic2007indexeswithaddendumdecember2022.xlsx"
7 | # https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/ukstandardindustrialclassificationofeconomicactivities/uksic2007/uksic2007indexeswithaddendumdecember2022.xlsx
8 | sic_condensed = "sic_2d_condensed.txt"
9 | soc_condensed = "soc_4d_condensed.txt"
10 | coicop_condensed = "coicop_5d_condensed.txt"
11 |
12 | [llm]
13 | db_dir = "data/sic-index/db"
14 | embedding_model_name = "all-MiniLM-L6-v2" # all-mpnet-base-v2
15 | llm_model_name = "gemini-pro" # "gpt-4"
16 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/data_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencecampus/sic-soc-llm/d3b758970968427483f534faefbca34fff426172/src/sic_soc_llm/data_models/__init__.py
--------------------------------------------------------------------------------
/src/sic_soc_llm/data_models/response_model.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel, Field, model_validator
2 | from typing import List, Optional
3 |
4 |
# NOTE: pydantic places the class docstring and every ``Field`` description in
# the model's generated JSON schema, so editing that text changes the schema
# output and not only the documentation.
# NOTE(review): presumably that schema is what the LLM is shown - confirm in
# llm.py before rewording any of it.
class SocCandidate(BaseModel):
    """
    Represents a candidate SOC code based on provided job title and description.

    Attributes:
        soc_code (str): Plausible SOC code based on the provided job title and
            description.
        soc_descriptive (str): Descriptive label of the SOC category associated
            with soc_code.
        likelihood (float): Likelihood of this soc_code with a value between 0 and 1.
    """

    # All three fields are required: none of them declares a default.
    soc_code: str = Field(
        description="Plausible SOC code based on provided job title and description."
    )
    soc_descriptive: str = Field(
        description="Descriptive label of the SOC category associated with soc_code."
    )
    # The 0-1 range is stated in the description only; no bounds are enforced.
    likelihood: float = Field(
        description="Likelihood of this soc_code with value between 0 and 1."
    )
26 |
27 |
class SocResponse(BaseModel):
    """Represents a response model for SOC code assignment.

    Attributes:
        codable (bool): True if enough information is provided to decide SOC code,
            False otherwise.
        followup (Optional[str]): Question to ask the user in order to collect
            additional information to enable reliable SOC assignment.
            Empty if codable=True.
        soc_code (Optional[str]): Full four-digit SOC code assigned based on provided
            job title, description, etc. Empty if codable=False.
        soc_descriptive (Optional[str]): Descriptive label of the SOC category
            associated with soc_code if provided. Empty if codable=False.
        soc_candidates (List[SocCandidate]): List of possible or alternative SOC
            codes that may be applicable with their descriptive label and estimated
            likelihood.
        soc_code_2digits (Optional[str]): First two digits of the hierarchical SOC code
            assigned. This field should be non-empty if the larger (two-digit) group of
            SOC codes can be determined even in cases where additional information is
            needed to code to four digits (for example when all SOC candidates share
            the same first two digits).
        reasoning (str): Step by step reasoning behind classification selected.
            Specifies the information used to assign the SOC code or any additional
            information required to assign a SOC code.
    """

    # NOTE: the class docstring and the Field descriptions below are exported by
    # pydantic into the model's JSON schema, so they are kept verbatim here.
    codable: bool = Field(
        description="""True if enough information is provided to decide
        SOC code, False otherwise."""
    )
    followup: Optional[str] = Field(
        description="""Question to ask user in order to collect additional information
        to enable reliable SOC assignment. Empty if codable=True.""",
        default=None,
    )
    soc_code: Optional[str] = Field(
        description="""Full four digit SOC code assigned based on provided job title,
        description, etc. Empty if codable=False.""",
        default=None,
    )
    soc_descriptive: Optional[str] = Field(
        description="""Descriptive label of the SOC category associated with soc_code
        if provided. Empty if codable=False.""",
        default=None,
    )
    soc_candidates: List[SocCandidate] = Field(
        description="""List of possible or alternative SOC codes that may be applicable
        with their descriptive label and estimated likelihood."""
    )
    soc_code_2digits: Optional[str] = Field(
        description="""First two digits of the hierarchical SOC code assigned.
        This field should be non empty if the larger (two-digit) group of SOC codes
        can be determined even in cases where additional information is needed to
        to code to four digits (for example when all SOC candidates share
        the same first two digits).""",
        default=None,
    )
    reasoning: str = Field(
        description="""Step by step reasoning behind classification selected. Specifies
        the information used to assign the SOC code or any additional information
        required to assign a SOC code."""
    )

    @classmethod
    def soc_code_validator(cls, v):
        """Reject an empty-string SOC code and return any other value unchanged.

        Raises:
            ValueError: If `v` is the empty string. An explicit raise is used
                instead of `assert` so the check survives `python -O`.
        """
        # TODO: check for valid codes from some list
        if v == "":
            raise ValueError(
                "If codable, then valid soc_code needs to be provided"
            )
        return v

    @model_validator(mode="before")
    @classmethod
    def check_valid_fields(cls, values):
        """Cross-check `codable` against `soc_code` / `followup` on raw input.

        A codable response must not carry an empty-string `soc_code`; an
        uncodable one must not carry an empty-string `followup`. Missing or
        None values are not rejected by this check.

        Raises:
            ValueError: If one of the two rules above is violated.
        """
        # A "before" validator receives the raw input, which is not guaranteed
        # to be a mapping; leave anything else to pydantic's own validation.
        if not isinstance(values, dict):
            return values

        if values.get("codable"):
            cls.soc_code_validator(values.get("soc_code"))
        elif values.get("followup") == "":
            raise ValueError(
                """If uncodable,
            follow up question needs to be provided."""
            )
        return values
108 |
109 |
# NOTE: pydantic places the class docstring and every ``Field`` description in
# the model's generated JSON schema, so editing that text changes the schema
# output and not only the documentation.
class SicCandidate(BaseModel):
    """Represents a candidate SIC code with associated information.

    Attributes:
        sic_code (str): Plausible SIC code based on the company activity description.
        sic_descriptive (str): Descriptive label of the SIC category associated with
            sic_code.
        likelihood (float): Likelihood of this sic_code with a value between 0 and 1.

    """

    # All three fields are required: none of them declares a default.
    sic_code: str = Field(
        description="Plausible SIC code based on the company activity description."
    )
    sic_descriptive: str = Field(
        description="Descriptive label of the SIC category associated with sic_code."
    )
    # The 0-1 range is stated in the description only; no bounds are enforced.
    likelihood: float = Field(
        description="Likelihood of this sic_code with value between 0 and 1."
    )
130 |
131 |
class SicResponse(BaseModel):
    """Represents a response model for SIC code assignment.

    Attributes:
        codable (bool): True if enough information is provided to decide SIC code,
            False otherwise.
        followup (Optional[str]): Question to ask user in order to collect additional
            information to enable reliable SIC assignment. Empty if codable=True.
        sic_code (Optional[str]): Full SIC code (to the required number of digits)
            assigned based on the provided company activity description.
            Empty if codable=False.
        sic_descriptive (Optional[str]): Descriptive label of the SIC category
            associated with sic_code if provided. Empty if codable=False.
        sic_candidates (List[SicCandidate]): Short list of less than ten possible or
            alternative sic codes that may be applicable with their descriptive label
            and estimated likelihood.
        sic_code_2digits (Optional[str]): First two digits of the hierarchical SIC
            code assigned. This field should be non empty if the larger (two-digit)
            group of SIC codes can be determined even in cases where additional
            information is needed to code to four digits (for example when all
            SIC candidates share the same first two digits).
        reasoning (str): Specifies the information used to assign the SIC code or any
            additional information required to assign a SIC code.
    """

    # NOTE: the class docstring and the Field descriptions below are exported by
    # pydantic into the model's JSON schema, so they are kept verbatim here.
    codable: bool = Field(
        description="""True if enough information is provided to decide
        SIC code, False otherwise."""
    )
    followup: Optional[str] = Field(
        description="""Question to ask user in order to collect additional information
        to enable reliable SIC assignment. Empty if codable=True.""",
        default=None,
    )
    sic_code: Optional[str] = Field(
        description="""Full SIC code (to the required number of digits) assigned based
        on provided the company activity description. Empty if codable=False.""",
        default=None,
    )
    sic_descriptive: Optional[str] = Field(
        description="""Descriptive label of the SIC category associated with sic_code
        if provided. Empty if codable=False.""",
        default=None,
    )
    sic_candidates: List[SicCandidate] = Field(
        description="""Short list of less than ten possible or alternative SIC codes
        that may be applicable with their descriptive label and estimated likelihood."""
    )

    # NOTE(review): the class docstring documents `sic_code_2digits`, but no such
    # field is declared on this model - confirm whether the field should be
    # added or the docstring entry removed.

    reasoning: str = Field(
        description="""Step by step reasoning behind classification selected. Specifies
        the information used to assign the SIC code or any additional information
        required to assign a SIC code."""
    )

    @classmethod
    def sic_code_validator(cls, v):
        """Reject an empty-string SIC code and return any other value unchanged.

        Raises:
            ValueError: If `v` is the empty string. An explicit raise is used
                instead of `assert` so the check survives `python -O`.
        """
        # TODO: check for valid codes from some list
        if v == "":
            raise ValueError(
                "If codable, then valid sic_code needs to be provided"
            )
        return v

    @model_validator(mode="before")
    @classmethod
    def check_valid_fields(cls, values):
        """Cross-check `codable` against `sic_code` / `followup` on raw input.

        A codable response must not carry an empty-string `sic_code`; an
        uncodable one must not carry an empty-string `followup`. Missing or
        None values are not rejected by this check.

        Raises:
            ValueError: If one of the two rules above is violated.
        """
        # A "before" validator receives the raw input, which is not guaranteed
        # to be a mapping; leave anything else to pydantic's own validation.
        if not isinstance(values, dict):
            return values

        if values.get("codable"):
            cls.sic_code_validator(values.get("sic_code"))
        elif values.get("followup") == "":
            raise ValueError(
                """If uncodable,
            follow up question needs to be provided."""
            )
        return values
204 |
205 |
# NOTE: pydantic places the class docstring and every ``Field`` description in
# the model's generated JSON schema, so editing that text changes the schema
# output and not only the documentation.
class RagCandidate(BaseModel):
    """Represents a candidate classification code with associated information.

    Attributes:
        class_code (str): Plausible classification code based on the respondent's data.
        class_descriptive (str): Descriptive label of the classification category
            associated with class_code.
        likelihood (float): Likelihood of this class_code with a value between 0 and 1.

    """

    # All three fields are required: none of them declares a default.
    class_code: str = Field(
        description="Plausible classification code based on the respondent's data."
    )
    class_descriptive: str = Field(
        description="""Descriptive label of the classification category
        associated with class_code."""
    )
    # The 0-1 range is stated in the description only; no bounds are enforced.
    likelihood: float = Field(
        description="Likelihood of this class_code with value between 0 and 1."
    )
227 |
228 |
class RagResponse(BaseModel):
    """Represents a response model for classification code assignment.

    Attributes:
        codable (bool): True if enough information is provided to decide
            classification code, False otherwise.
        followup (Optional[str]): Question to ask user in order to collect
            additional information to enable reliable classification assignment.
            Empty if codable=True.
        class_code (Optional[str]): Full classification code (to the required
            number of digits) assigned based on provided respondent's data.
            Empty if codable=False.
        class_descriptive (Optional[str]): Descriptive label of the classification
            category associated with class_code if provided.
            Empty if codable=False.
        alt_candidates (List[RagCandidate]): Short list of less than ten possible
            or alternative classification codes that may be applicable with their
            descriptive label and estimated likelihood.
        reasoning (str): Step by step reasoning behind the classification selected.
            Specifies the information used to assign the classification code or
            any additional information required to assign a classification code.
    """

    # NOTE: unlike the SOC/SIC response models in this module, this model
    # declares no validator, so `codable` is not cross-checked against
    # `class_code` or `followup`.
    codable: bool = Field(
        description="""True if enough information is provided to decide
        classification code, False otherwise."""
    )
    followup: Optional[str] = Field(
        description="""Question to ask user in order to collect additional information
        to enable reliable classification assignment. Empty if codable=True.""",
        default=None,
    )
    class_code: Optional[str] = Field(
        description="""Full classification code (to the required number of digits)
        assigned based on provided respondent's data. Empty if codable=False.""",
        default=None,
    )
    class_descriptive: Optional[str] = Field(
        description="""Descriptive label of the classification category associated
        with class_code if provided. Empty if codable=False.""",
        default=None,
    )
    alt_candidates: List[RagCandidate] = Field(
        description="""Short list of less than ten possible or alternative
        classification codes that may be applicable with their descriptive label
        and estimated likelihood."""
    )
    # This model is index-agnostic, so the text refers to a generic
    # "classification code" rather than to SIC specifically.
    reasoning: str = Field(
        description="""Step by step reasoning behind classification selected. Specifies
        the information used to assign the classification code or any additional
        information required to assign a classification code."""
    )
281 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/data_models/sic_data_access.py:
--------------------------------------------------------------------------------
1 | """Provides data access for key files.
2 |
3 | Filepaths are defined in config, see: `sic_soc_llm._config`.
4 | """
5 |
6 | import pandas as pd
7 |
8 |
def load_sic_index(filepath: str) -> pd.DataFrame:
    """Read the alphabetical SIC index worksheet into a DataFrame.

    The SIC index lists around 15,000 activities together with their
    associated 5-digit SIC. Only the "UK SIC 2007" and "Activity" columns of
    the "Alphabetical Index" sheet are read, all values as strings, and the
    column names are lower-cased with spaces replaced by underscores.
    """

    def _snake_case(column_name: str) -> str:
        return column_name.lower().replace(" ", "_")

    wanted_columns = ["UK SIC 2007", "Activity"]
    raw_index = pd.read_excel(
        filepath,
        sheet_name="Alphabetical Index",
        skiprows=1,
        usecols=wanted_columns,
        dtype=str,
    )

    return raw_index.rename(columns=_snake_case)
29 |
30 |
def load_sic_structure(filepath: str) -> pd.DataFrame:
    """Read the UK SIC 2007 hierarchy worksheet into a DataFrame.

    Loads the "reworked structure" sheet holding all the levels/names of the
    hierarchy. Four columns are read as strings; their names are lower-cased
    with spaces replaced by underscores, and surrounding whitespace is
    stripped from every value.
    """
    wanted_columns = [
        "Description",
        "SECTION",
        "Most disaggregated level",
        "Level headings",
    ]
    raw_structure = pd.read_excel(
        filepath,
        sheet_name="reworked structure",
        usecols=wanted_columns,
        dtype=str,
    )

    renamed = raw_structure.rename(
        columns=lambda column_name: column_name.lower().replace(" ", "_")
    )

    # Overwrite each column with its whitespace-stripped counterpart.
    stripped_columns = {name: renamed[name].str.strip() for name in renamed.columns}
    return renamed.assign(**stripped_columns)
55 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/data_models/sic_hierarchy.py:
--------------------------------------------------------------------------------
1 | """SIC hierarchy.
2 |
3 | Provides a common interface for SIC lookups and navigation.
4 |
5 | Usage:
6 |
7 | sic = sic_hierarchy.load_hierarchy(sic_df, sic_index_df)
8 | sic["01110"].print_all()
9 | """
10 | from typing import Iterator
11 | import html
12 | import re
13 |
14 | import pandas as pd
15 |
16 | from sic_soc_llm.data_models import sicDB
17 |
18 | SEE_CODE_REGEX = re.compile(
19 | r"(,?\s?see\s(divisions?\s)?)?##\d+(\.\d+(\/\d)?)?", re.IGNORECASE
20 | )
21 |
22 | # TODO enum?
23 | _LEVEL_DICT = {1: "section", 2: "division", 3: "group", 4: "class", 5: "subclass"}
24 |
25 |
26 | class SicCode:
27 | """Standard Industrial Classification code.
28 |
29 | The main representation for SIC in this class is the `alpha_code`,
30 | which we define as:
31 |
32 | * The section character e.g. 'A'
33 | * Followed by the numeric SIC code e.g. "0111"
34 | * Padded with 'x' to six characters
35 |
36 | For example: "A0111x"
37 |
38 | The class supports initialisation with section, code and level via
39 | the factory method `from_section_code_level`.
40 |
41 | For example:
42 | SicCode.from_section_code_level("A", "0111", "class")
43 |
44 | Note:
45 | This class is mainly for internal use, beyond some basic checks
46 | of formatting and consistency it does not validate that a code
47 | is defined in UK SIC 2007.
48 | """
49 |
50 | def __init__(self, alpha_code: str):
51 | SicCode._validate_alpha_code(alpha_code)
52 |
53 | self.alpha_code = alpha_code
54 | self.n_digits = SicCode._parse_digits(alpha_code)
55 | self.level_name = _LEVEL_DICT[self.n_digits]
56 | self._formatted_code = SicCode._format_code(alpha_code)
57 | self._alpha_code_no_pad = self.alpha_code.replace("x", "")
58 |
59 | @staticmethod
60 | def from_section_code_level(section, code, level) -> "SicCode":
61 | """Factory method for SicCode.
62 |
63 | Note:
64 | Used to produce the definitive list of SIC codes,
65 | only call with data that defines SIC.
66 | """
67 | level = level.lower().strip().replace(" ", "")
68 |
69 | if len(code) < 5:
70 | n_digits = len(code)
71 | if _LEVEL_DICT[n_digits] != level:
72 | raise ValueError(f"Code/level mismatch: '{code}' -> '{level}'")
73 |
74 | elif len(code) == 5:
75 | if level not in {_LEVEL_DICT[4], _LEVEL_DICT[5]}:
76 | raise ValueError(f"Code/level mismatch: '{code}' -> '{level}'")
77 |
78 | if level == _LEVEL_DICT[1] and section != code:
79 | raise ValueError(f"Section/code mismatch: '{section}' - '{code}'")
80 |
81 | match level:
82 | case "section":
83 | alpha_code = f"{section}"
84 |
85 | case "class":
86 | if len(code) == 5:
87 | if code[4] != "0":
88 | raise ValueError(
89 | f"4-digit SIC code as 5 digit must end in zero: '{code}'"
90 | )
91 | code = code[:4]
92 | alpha_code = f"{section}{code}"
93 |
94 | case _:
95 | alpha_code = f"{section}{code}"
96 |
97 | pad = 6 - len(alpha_code)
98 | alpha_code += "x" * pad
99 |
100 | return SicCode(alpha_code)
101 |
102 | def __eq__(self, other):
103 | return self.alpha_code == other.alpha_code
104 |
105 | def __hash__(self):
106 | return hash(self.alpha_code)
107 |
108 | def __lt__(self, other):
109 | return self._alpha_code_no_pad < other._alpha_code_no_pad
110 |
111 | @staticmethod
112 | def _validate_alpha_code(alpha_code: str):
113 | if not isinstance(alpha_code, str):
114 | raise TypeError("SIC code must be a string")
115 |
116 | first_char = alpha_code[0]
117 | if not (first_char.isalpha() and first_char.isupper()):
118 | raise ValueError("Alpha SIC code must start with an upper case letter A-Z")
119 |
120 | if len(alpha_code) != 6:
121 | raise ValueError("Alpha SIC must be padded to 6 characters")
122 |
123 | @staticmethod
124 | def _parse_digits(alpha_code: str):
125 | alpha_code = alpha_code.replace("x", "")
126 |
127 | if len(alpha_code) == 1:
128 | n_digits = 1
129 |
130 | else:
131 | n_digits = len(alpha_code[1:])
132 | if n_digits == 1:
133 | raise ValueError(f'Invalid SIC code: "{alpha_code}"')
134 |
135 | return n_digits
136 |
137 | @staticmethod
138 | def _format_code(alpha_code: str):
139 | alpha_code = alpha_code.replace("x", "")
140 |
141 | formatted_code = None
142 |
143 | match len(alpha_code):
144 | case 1:
145 | formatted_code = alpha_code
146 | case 3:
147 | formatted_code = alpha_code[1:3]
148 | case 4 | 5:
149 | formatted_code = f"{alpha_code[1:3]}.{alpha_code[3:]}"
150 | case 6:
151 | formatted_code = f"{alpha_code[1:3]}.{alpha_code[3:5]}/{alpha_code[5]}"
152 |
153 | if formatted_code is None:
154 | raise ValueError(f'Unable to format code: "{alpha_code}"')
155 |
156 | return formatted_code
157 |
158 | def __str__(self):
159 | return self._formatted_code
160 |
161 | def __repr__(self):
162 | repr_str = f'SicCode("{self.alpha_code}")'
163 | return repr_str
164 |
165 |
class SicNode:
    """Tree data structure where the nodes hold all data associated with a given SIC.

    The SIC hierarchy is represented as several separate trees,
    with each section (e.g. "A", "B", "C") as a root node.

    A freshly constructed node has no parent, no children, no activities
    and no meta data (`sic_meta` is `None`).
    """

    def __init__(self, sic_code: "SicCode", description: str):
        self.sic_code = sic_code
        self.description = description

        self.activities = []
        self.sic_meta = None
        self.parent = None
        self.children = []

    def __repr__(self):
        return f'SicNode({repr(self.sic_code)}, "{self.description}")'

    def __str__(self):
        return f'{str(self.sic_code)}: "{self.description}"'

    def print_all(self):
        """Prints all information held by this node.

        This method prints the following information:
        - The string representation of the node.
        - The section of the SIC code.
        - The parent of the node.
        - The children of the node.
        - The detail, includes, and excludes attributes of the SIC meta,
          or a placeholder line when no meta data has been attached.
        - The activities associated with the node.
        """
        print(str(self))

        print(f"Section: {self.sic_code.alpha_code[0]}")
        print(f"Parent: {self.parent}")
        print(f"Children: {[str(child) for child in self.children]}")
        print()
        # sic_meta stays None until meta data is attached to the node.
        if self.sic_meta is None:
            print("No SIC meta data available")
        else:
            print(f"detail={self.sic_meta.detail}")
            print(f"includes={self.sic_meta.includes}")
            print(f"excludes={self.sic_meta.excludes}")
        print()
        print("Activities:")
        for activity in self.activities:
            print(f"\t- {activity}")

    def is_leaf(self):
        """Return True when the node has no children."""
        return not self.children

    def numeric_string_padded(self):
        """Return the numeric part of the code without 'x' padding.

        A 4-digit code with no children gets a trailing "0" appended,
        giving its 5-digit form.
        """
        numeric_string = self.sic_code.alpha_code[1:].replace("x", "")

        if self.sic_code.n_digits == 4 and self.is_leaf():
            numeric_string += "0"

        return numeric_string
223 |
224 |
class SIC:
    """Main class for SIC lookups.

    Holds the nodes sorted by their `sic_code` together with a lookup
    from the supported key spellings to nodes.

    Usage:
        | sic = load_hierarchy(sic_df, sic_index_df)
        | sic["01.1"]
        | sic["011"]
        | sic["A011xx"]
    """

    def __init__(self, nodes, code_lookup):
        self.nodes = sorted(nodes, key=lambda node: node.sic_code)
        self._code_lookup = code_lookup

    def __getitem__(self, key):
        return self._code_lookup[key]

    def __iter__(self):
        return iter(self.nodes)

    def __len__(self):
        return len(self.nodes)

    def all_leaf_activities(self) -> Iterator[dict]:
        """All activities for leaf SIC nodes.

        Note:
            Does not include 4-digit SIC codes where those codes
            have a 5-digit expansion.

            i.e. Only returns for leaf nodes.

        Returns:
            Lazy iterator of `{"code": sic_code, "text": activity}` dicts.
        """
        return (
            {"code": node.sic_code, "text": activity}
            for node in self
            if node.is_leaf()
            for activity in node.activities
        )

    def all_leaf_descriptions(self) -> Iterator[dict]:
        """All descriptions for leaf SIC nodes.

        Note:
            Does not include 4-digit SIC codes where those codes
            have a 5-digit expansion.

            i.e. Only returns for leaf nodes.

        Returns:
            Lazy iterator of `{"code": sic_code, "text": description}` dicts.
        """
        return (
            {"code": node.sic_code, "text": node.description}
            for node in self
            if node.is_leaf()
        )

    def all_leaf_text(self) -> pd.DataFrame:
        """Returns all short text descriptions of leaf level SIC.

        Includes:
            * Activities from the SIC index
            * Description from the SIC structure

        Returns:
            pd.DataFrame
                Two columns `code`, `text`, de-duplicated and sorted by code,
                with `code` converted to its string form. The frame is empty
                (but still has both columns) when there are no leaf nodes.
        """
        # Descriptions first, then activities, in node order.
        records = [*self.all_leaf_descriptions(), *self.all_leaf_activities()]

        # Explicit columns keep "code"/"text" present even with no records, so
        # the sort below cannot fail on a missing column.
        df = pd.DataFrame(records, columns=["code", "text"])
        df = df.drop_duplicates()
        df = df.sort_values("code")
        df = df.reset_index(drop=True)

        df["code"] = df["code"].apply(str)

        return df
301 |
302 |
def _define_codes_and_nodes(
    sic_df,
) -> "tuple[list[SicCode], list[SicNode], dict[SicCode, SicNode]]":
    """Create a SicCode and SicNode for every row of the SIC structure.

    Args:
        sic_df: Data frame with the columns `description`, `section`,
            `most_disaggregated_level` (the code) and `level_headings`
            (the level name).

    Returns:
        Three parallel views of the same data: the list of codes, the list
        of nodes (both in row order) and a dict mapping each code to its node.
    """
    codes = []
    nodes = []

    code_node_dict = {}

    for description, section, code, level in sic_df[
        ["description", "section", "most_disaggregated_level", "level_headings"]
    ].itertuples(index=False, name=None):
        sic_code = SicCode.from_section_code_level(section, code, level)

        sic_node = SicNode(sic_code, description=description)

        codes.append(sic_code)
        nodes.append(sic_node)
        code_node_dict[sic_code] = sic_node

    return codes, nodes, code_node_dict
321 |
322 |
def _populate_parent_child_relationships(nodes, code_node_dict):
    """Link every non-section node to its parent node.

    The parent code is a prefix of the child's alpha code, re-padded
    with 'x' to six characters and looked up in `code_node_dict`.

    Warning: Modifies nodes in place
    """
    # Number of leading alpha-code characters that identify the parent,
    # keyed by the child's digit count.
    parent_prefix_length = {2: 1, 3: 3, 4: 4, 5: 5}

    for child in nodes:
        digits = child.sic_code.n_digits
        if not digits > 1:
            # Sections are roots and have no parent.
            continue

        if digits not in parent_prefix_length:
            raise ValueError(f"No parent found for '{child.sic_code}'")

        prefix = child.sic_code.alpha_code[: parent_prefix_length[digits]]
        parent_sic_code = SicCode(prefix.ljust(6, "x"))

        parent = code_node_dict[parent_sic_code]
        parent.children.append(child)
        child.parent = parent
350 |
351 |
def _populate_sic_meta(nodes, code_node_dict):
    """Attach cleaned meta data from `sicDB.sic_meta` to the matching nodes.

    Raises:
        ValueError: If the number of meta entries differs from the
            number of nodes.

    Warning: modifies data in place.
    """
    if len(sicDB.sic_meta) != len(nodes):
        raise ValueError("Mismatch in SIC data sources: sicDB.sic_meta and sic_df")

    for entry in sicDB.sic_meta:
        target_node = code_node_dict[SicCode(entry.code)]
        target_node.sic_meta = _clean_meta(entry)
366 |
367 |
368 | def _populate_activities(nodes, sic_index_df):
369 | """Populate activities.
370 |
371 | Warning: Modifies nodes in place.
372 | """
373 |
374 | padded_digits_to_sic_codes = dict()
375 |
376 | for sic_node in nodes:
377 | sic_digits = None
378 | if sic_node.sic_code.n_digits == 4:
379 | sic_digits = sic_node.sic_code.alpha_code[1:5] + "0"
380 | if sic_node.sic_code.n_digits == 5:
381 | sic_digits = sic_node.sic_code.alpha_code[1:6]
382 |
383 | if sic_digits:
384 | padded_digits_to_sic_codes[sic_digits] = sic_node
385 |
386 | for sic_digits, activity in sic_index_df[["uk_sic_2007", "activity"]].itertuples(
387 | index=False, name=None
388 | ):
389 | padded_digits_to_sic_codes[sic_digits.strip()].activities.append(activity)
390 |
391 |
def _clean_text(text):
    """Return `text` with HTML entities unescaped and code references removed.

    Code references are whatever `SEE_CODE_REGEX` matches,
    e.g. the ", see ##11.11" entries.
    """
    unescaped = html.unescape(text)
    return SEE_CODE_REGEX.sub("", unescaped)
402 |
403 |
def _clean_meta(meta):
    """Return a new ClassificationMeta with its free-text fields cleaned.

    `code` and `title` are copied unchanged; `detail` and every entry of
    `includes` and `excludes` go through `_clean_text`.
    """
    return sicDB.ClassificationMeta(
        code=meta.code,
        title=meta.title,
        detail=_clean_text(meta.detail),
        includes=[_clean_text(entry) for entry in meta.includes],
        excludes=[_clean_text(entry) for entry in meta.excludes],
    )
418 |
419 |
def load_hierarchy(sic_df, sic_index_df):
    """Create the SIC lookups from all supporting data.

    Uses:
        * SIC structure
        * SIC index
        * SIC meta data (from the SIC interactive tool)

    Every node is registered under each spelling of its code: the
    formatted string, the padded alpha code, the un-padded alpha code,
    the bare digits (not for sections) and, for 4-digit codes without
    children, the digits with a trailing "0".

    Once created this provides a single point of access for all
    data associated with a SIC definition.
    """
    _codes, nodes, code_node_dict = _define_codes_and_nodes(sic_df)

    _populate_parent_child_relationships(nodes, code_node_dict)
    _populate_sic_meta(nodes, code_node_dict)
    _populate_activities(nodes, sic_index_df)

    lookup = {}

    for node in nodes:
        sic_code = node.sic_code
        alpha = sic_code.alpha_code

        keys = [str(sic_code), alpha, alpha.replace("x", "")]
        if sic_code.n_digits > 1:
            keys.append(alpha[1:].replace("x", ""))
        if sic_code.n_digits == 4 and not node.children:
            keys.append(alpha[1:5] + "0")

        for key in keys:
            lookup[key] = node

    return SIC(nodes, lookup)
453 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/data_models/sic_meta_model.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel, Field
2 | from typing import List
3 |
4 |
class ClassificationMeta(BaseModel):
    """
    Represents a classification meta model.

    Attributes:
        code (str): Category code. Either a full code or a partial code for a
            larger hierarchical group.
            Partial code has last digits replaced by 'x'.
        title (str): Short descriptive title of the code category.
        detail (str): Descriptive label of the category associated with the code.
        includes (List[str]): Optional list of titles that should be included
            in this category.
        excludes (List[str]): Optional list of titles that should be excluded
            from this category.
    """

    code: str = Field(
        description="""Category code. Either a full code or a partial code
        for a larger hierarchical group.
        Partial code has last digits replaced by 'x'."""
    )
    title: str = Field(description="Short descriptive title of the code category.")
    detail: str = Field(
        default="",
        description="Descriptive label of the category associated with code.",
    )
    includes: List[str] = Field(
        default=[],
        description="Optional list of titles that should be included in this category",
    )
    excludes: List[str] = Field(
        default=[],
        description="""Optional list of titles that should be excluded from
        this category""",
    )

    def check_code_match(self, subcode: str) -> bool:
        """Check for partial match of the code.
        Discards 1st letter on SIC and then check only valid numbers.

        The comparison covers the digits both codes have in common, so
        either code may be a prefix of the other. At least two digits
        must be available for comparison, otherwise there is no match.

        Args:
            subcode (str): 2-5 digits code for matching

        Returns:
            bool: if partial match found
        """
        # n counts the leading letter plus the digits available in both codes.
        n = min(len(self.code.replace("x", "")), len(subcode) + 1)
        return n > 2 and self.code[1:n] == subcode[: n - 1]

    def pretty_print(self, subset_digits=(4, 2)) -> str:
        """Format the populated fields as a single line of text.

        Args:
            subset_digits: Digit counts to format. Codes with any other
                number of digits produce an empty string.

        Returns:
            str: "Code <digits>: <title>. " followed by the detail,
                includes and excludes when present, or "" when the code's
                digit count is not in `subset_digits`.
        """
        code = self.code[1:].replace("x", "")
        if len(code) not in subset_digits:
            return ""

        out = "Code " + code + ": " + self.title + ". "
        if self.detail:
            out += self.detail + ". "
        if self.includes:
            out += "Includes " + ", ".join(self.includes) + ". "
        if self.excludes:
            out += "Excludes " + ", ".join(self.excludes) + ". "
        return out
72 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/embedding.py:
--------------------------------------------------------------------------------
1 | # Docker Image may have old sqlite3 version for ChromaDB
2 | import sys
3 | import sqlite3 # noqa:F401
4 |
# When the bundled SQLite is older than 3.35.0, import pysqlite3 and register
# it in sys.modules under the name "sqlite3", so that later imports of
# sqlite3 resolve to pysqlite3 instead.
if sys.modules["sqlite3"].sqlite_version_info < (3, 35, 0):
    __import__("pysqlite3")
    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
8 |
9 | import uuid
10 | import logging
11 | from langchain_community.embeddings import HuggingFaceEmbeddings, VertexAIEmbeddings
12 | from langchain_community.vectorstores import Chroma
13 | from langchain.docstore.document import Document
14 | from autocorrect import Speller
15 |
16 | from sic_soc_llm import get_config
17 | from sic_soc_llm.data_models.sic_data_access import load_sic_index, load_sic_structure
18 | from sic_soc_llm.data_models.sic_hierarchy import load_hierarchy, SIC
19 |
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Loaded once at import time; supplies the EmbeddingHandler defaults
# (config["llm"]) and the lookup file locations (config["lookups"]).
config = get_config()
22 |
23 |
class EmbeddingHandler:
    """
    Handles embedding operations for the Chroma vector store.

    Args:
        embedding_model_name (str, optional): The name of the embedding model to use.
            Defaults to the value specified in the configuration file.
        db_dir (str, optional): The directory where the vector store database
            is located. Defaults to the value specified in the configuration file.
            If None then the embedding db will be non-persistent.
        k_matches (int, optional): The number of nearest matches to retrieve.
            Defaults to 20.
    """

    def __init__(
        self,
        embedding_model_name: str = config["llm"]["embedding_model_name"],
        db_dir: str = config["llm"]["db_dir"],
        k_matches: int = 20,
    ):
        """
        Initialises the EmbeddingHandler.
        """
        # Model names starting with "textembedding-" use Vertex AI,
        # anything else is loaded through HuggingFace.
        if embedding_model_name.startswith("textembedding-"):
            self.embeddings = VertexAIEmbeddings(model_name=embedding_model_name)
        else:
            self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
        self.db_dir = db_dir
        self.vector_store = self._create_vector_store()
        self.k_matches = k_matches
        self.spell = Speller()
        self._index_size = self.vector_store._client.get_collection("langchain").count()

    def _create_vector_store(self) -> Chroma:
        """
        Initialises Chroma VectorDB on known DB dir in data.

        Returns:
            Chroma: The LangChain vector store object for Chroma.
        """
        if self.db_dir is None:
            return Chroma(embedding_function=self.embeddings)
        else:
            return Chroma(
                embedding_function=self.embeddings, persist_directory=self.db_dir
            )

    def embed_index(
        self,
        from_empty: bool = True,
        sic: SIC = None,
        file_object=None,
    ):
        """
        Embeds the index entries into the vector store.

        Args:
            from_empty (bool, optional): Whether to drop the current vector store
                content and start fresh.
            sic (SIC, optional): The SIC hierarchy object. If None, the hierarchy
                is loaded from files specified in the config.
            file_object (StringIO object): The index file as StringIO object.
                If provided, the file will be read by line and embedded.
                Each line has expected format of **code**: **description**.
                Blank lines are ignored; lines without a ":" are skipped
                with a warning. Code and description are stored without
                surrounding whitespace.
        """
        if from_empty:
            self.vector_store._client.delete_collection("langchain")
            self.vector_store = self._create_vector_store()

        docs = []
        ids = []
        if file_object is not None:
            for line in file_object:
                # Lines keep their newline when read from a file, so a blank
                # line is "\n" and truthy; test the stripped text instead.
                if not line.strip():
                    continue

                code, separator, description = line.partition(":")
                if not separator:
                    logger.warning(
                        "Skipping index line without 'code: description': %r", line
                    )
                    continue

                code = code.strip()
                docs.append(
                    Document(
                        page_content=description.strip(),
                        metadata={
                            "code": code,
                            "four_digit_code": code[0:4],
                            "two_digit_code": code[0:2],
                        },
                    )
                )
                # The id is derived from the raw line, as before.
                ids.append(str(uuid.uuid3(uuid.NAMESPACE_URL, line)))

        else:
            if sic is None:
                sic_index_df = load_sic_index(config["lookups"]["sic_index"])
                sic_df = load_sic_structure(config["lookups"]["sic_structure"])
                sic = load_hierarchy(sic_df, sic_index_df)

            logger.debug("Loading entries from SIC hierarchy for embedding.")
            for _, row in sic.all_leaf_text().iterrows():
                # Drop the "." and "/" punctuation and right-pad with "0"
                # to five characters.
                code = (row["code"].replace(".", "").replace("/", "") + "0")[:5]
                docs.append(
                    Document(
                        page_content=row["text"],
                        metadata={
                            "code": code,
                            "four_digit_code": code[0:4],
                            "two_digit_code": code[0:2],
                        },
                    )
                )
                # NOTE(review): the id depends on the text only, so the same
                # text under two different codes shares one id - confirm this
                # is intended.
                ids.append(str(uuid.uuid3(uuid.NAMESPACE_URL, row["text"])))

        # Nothing to insert for an empty index; skip the store call.
        if docs:
            self.vector_store.add_documents(docs, ids=ids)
        self._index_size = self.vector_store._client.get_collection("langchain").count()
        logger.debug(f"Inserted {len(docs):,} entries into vector embedding database.")

    def search_index(self, query: str, return_dicts: bool = True) -> list[dict]:
        """
        Returns k document chunks with the highest relevance to the query.

        Args:
            query (str): Question for which most relevant index entries
                will be returned.
            return_dicts (bool, optional): If True, data returned as list of
                dictionaries, otherwise as document tuples. Defaults to True.

        Returns:
            List[dict]: List of top k index entries by relevance. Each dict
                holds `distance`, `title` and the document metadata. With
                `return_dicts=False` the raw (document, score) tuples from
                the vector store are returned instead.
        """
        top_matches = self.vector_store.similarity_search_with_score(
            query=query, k=self.k_matches
        )

        if return_dicts:
            return [
                {"distance": float(score)}
                | {"title": document.page_content}
                | document.metadata
                for document, score in top_matches
            ]
        return top_matches

    def search_index_multi(self, query: list[str]) -> list[dict]:
        """
        Returns document chunks with the highest relevance to the query fields.

        A search is run for each cumulative combination of the fields
        (first field, first two fields, ...) and for its spell-corrected
        variant. The results of all searches are merged and sorted by
        distance; entries found by several searches appear several times.

        Args:
            query (list[str]): List of query fields (in priority order) for which
                most relevant index entries will be returned.
                e.g [industry_descr, job_title, job_descr]

        Returns:
            List[dict]: Index entries from all searches, closest first.
        """
        fields = [field for field in query if field is not None]

        # dict keys de-duplicate the terms while keeping insertion order, so
        # the order of equal-distance results does not depend on set ordering.
        search_terms = {}
        for end in range(1, len(fields) + 1):
            term = " ".join(fields[:end])
            search_terms[term] = None
            search_terms[self.spell(term)] = None

        short_list = [
            match for term in search_terms for match in self.search_index(query=term)
        ]
        return sorted(short_list, key=lambda match: match["distance"])
182 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/example_data/coicop_5d_condensed.txt:
--------------------------------------------------------------------------------
1 | CP01111: Rice
2 | CP01112: Flours and other cereals
3 | CP01113: Bread
4 | CP01114: Other bakery products
5 | CP01115: Pizza and quiche
6 | CP01116: Pasta products and couscous
7 | CP01117: Breakfast cereals
8 | CP01118: Other cereal products
9 | CP01121: Beef and veal
10 | CP01122: Pork
11 | CP01123: Lamb and goat
12 | CP01124: Poultry
13 | CP01125: Other meats
14 | CP01126: Edible offal
15 | CP01127: Dried
16 | CP01128: Other meat preparations
17 | CP01131: Fresh or chilled fish
18 | CP01132: Frozen fish
19 | CP01133: Fresh or chilled seafood
20 | CP01134: Frozen seafood
21 | CP01135: Dried
22 | CP01136: Other preserved or processed fish and seafood and fish and seafood preparations
23 | CP01141: Whole milk
24 | CP01142: Low fat milk
25 | CP01143: Preserved milk
26 | CP01144: Yoghurt
27 | CP01145: Cheese and curd
28 | CP01146: Other milk products
29 | CP01147: Eggs
30 | CP01151: Butter
31 | CP01152: Margarine and other vegetable fats
32 | CP01153: Olive oil
33 | CP01154: Other edible oils
34 | CP01155: Other edible animal fats
35 | CP01161: Fresh or chilled fruit
36 | CP01162: Frozen fruit
37 | CP01163: Dried fruit and nuts
38 | CP01164: Preserved fruit and fruit-based products
39 | CP01171: Fresh or chilled vegetables other than potatoes and other tubers
40 | CP01172: Frozen vegetables other than potatoes and other tubers
41 | CP01173: Dried vegetables
42 | CP01174: Potatoes
43 | CP01175: Crisps
44 | CP01176: Other tubers and products of tuber vegetables
45 | CP01181: Sugar
46 | CP01182: Jams
47 | CP01183: Chocolate
48 | CP01184: Confectionery products
49 | CP01185: Edible ices and ice cream
50 | CP01186: Artificial sugar substitutes
51 | CP01191: Sauces
52 | CP01192: Salt
53 | CP01193: Baby food
54 | CP01194: Ready-made meals
55 | CP01199: Other food products n.e.c.
56 | CP01211: Coffee
57 | CP01212: Tea
58 | CP01213: Cocoa and powdered chocolate
59 | CP01221: Mineral or spring waters
60 | CP01222: Soft drinks
61 | CP01223: Fruit and vegetables juices
62 | CP02111: Spirits and liqueurs
63 | CP02112: Alcoholic soft drinks
64 | CP02121: Wine from grapes
65 | CP02122: Wine from other fruits
66 | CP02123: Fortified wines
67 | CP02124: Wine-based drinks
68 | CP02131: Lager beer
69 | CP02132: Other alcoholic beer
70 | CP02133: Low and non-alcoholic beer
71 | CP02134: Beer-based drinks
72 | CP02201: Cigarettes
73 | CP02202: Cigars
74 | CP02203: Other tobacco products
75 | CP02300: Narcotics
76 | CP03110: Clothing materials
77 | CP03121: Garments for men
78 | CP03122: Garments for women
79 | CP03123: Garments for infants (0 to 2 years) and children (3 to 13 years)
80 | CP03131: Other articles of clothing
81 | CP03132: Clothing accessories
82 | CP03141: Cleaning of clothing
83 | CP03142: Repair and hire of clothing
84 | CP03211: Footwear for men
85 | CP03212: Footwear for women
86 | CP03213: Footwear for infants and children
87 | CP03220: Repair and hire of footwear
88 | CP04110: Actual rentals paid by tenants
89 | CP04121: Actual rentals paid by tenants for secondary residences
90 | CP04122: Garage rentals and other rentals paid by tenants
91 | CP04210: Imputed rentals of owner-occupiers
92 | CP04220: Other imputed rentals
93 | CP04310: Materials for the maintenance and repair of the dwelling
94 | CP04321: Services of plumbers
95 | CP04322: Services of electricians
96 | CP04323: Maintenance services for heating systems
97 | CP04324: Services of painters
98 | CP04325: Services of carpenters
99 | CP04329: Other services for maintenance and repair of the dwelling
100 | CP04410: Water supply
101 | CP04420: Refuse collection
102 | CP04430: Sewerage collection
103 | CP04441: Maintenance charges in multi-occupied buildings
104 | CP04442: Security services
105 | CP04449: Other services related to dwelling
106 | CP04510: Electricity
107 | CP04521: Natural gas and town gas
108 | CP04522: Liquefied hydrocarbons (butane
109 | CP04530: Liquid fuels
110 | CP04541: Coal
111 | CP04549: Other solid fuels
112 | CP04550: Heat energy
113 | CP05111: Household furniture
114 | CP05112: Garden furniture
115 | CP05113: Lighting equipment
116 | CP05119: Other furniture and furnishings
117 | CP05121: Carpet and rugs
118 | CP05122: Other floor coverings
119 | CP05123: Services of laying of fitted carpets and floor coverings
120 | CP05130: Repair of furniture
121 | CP05201: Furnishings fabrics and curtains
122 | CP05202: Bed linen
123 | CP05203: Table linen and bathroom linen
124 | CP05209: Other household textiles
125 | CP05311: Refrigerators
126 | CP05312: Clothes washing machines
127 | CP05313: Cookers
128 | CP05314: Heaters
129 | CP05315: Cleaning equipment
130 | CP05319: Other major household appliances
131 | CP05321: Food processing appliances
132 | CP05322: Coffee machines
133 | CP05323: Irons
134 | CP05324: Toasters and grills
135 | CP05329: Other small electric household appliances
136 | CP05330: Repair of household appliances
137 | CP05401: Glassware
138 | CP05402: Cutlery
139 | CP05403: Non-electric kitchen utensils and articles
140 | CP05404: Repair of glassware
141 | CP05511: Motorized major tools and equipment
142 | CP05512: Repair
143 | CP05521: Non-motorised small tools
144 | CP05522: Miscellaneous small tool accessories
145 | CP05523: Repair of non-motorised small tools and miscellaneous accessories
146 | CP05611: Cleaning and maintenance products
147 | CP05612: Other non-durable small household articles
148 | CP05621: Domestic services by paid staff
149 | CP05622: Cleaning services
150 | CP05623: Hire of furniture and furnishings
151 | CP05629: Other domestic services and household services
152 | CP06110: Pharmaceutical products
153 | CP06121: Pregnancy tests and mechanical contraceptive devices
154 | CP06129: Other medical products n.e.c.
155 | CP06131: Corrective eye-glasses and contact lenses
156 | CP06132: Hearing aids
157 | CP06133: Repair of therapeutic appliances and equipment
158 | CP06139: Other therapeutic appliances and equipment
159 | CP06211: General practice
160 | CP06212: Specialist practice
161 | CP06220: Dental services
162 | CP06231: Services of medical analysis laboratories and X-ray centres
163 | CP06232: Thermal-baths
164 | CP06239: Other paramedical services
165 | CP06300: Hospital services
166 | CP07111: New motor cars
167 | CP07112: Second-hand motor cars
168 | CP07120: Motor cycles
169 | CP07130: Bicycles
170 | CP07140: Animal drawn vehicles
171 | CP07211: Tyres
172 | CP07212: Spare parts for personal transport equipment
173 | CP07213: Accessories for personal transport equipment
174 | CP07221: Diesel
175 | CP07222: Petrol
176 | CP07223: Other fuels for personal transport equipment
177 | CP07224: Lubricants
178 | CP07230: Maintenance and repair of personal transport equipment
179 | CP07241: Hire of garages
180 | CP07242: Toll facilities and parking meters
181 | CP07243: Driving lessons
182 | CP07311: Passenger transport by train
183 | CP07312: Passenger transport by underground and tram
184 | CP07321: Passenger transport by bus and coach
185 | CP07322: Passenger transport by taxi and hired car with driver
186 | CP07331: Domestic flights
187 | CP07332: International flights
188 | CP07341: Passenger transport by sea
189 | CP07342: Passenger transport by inland waterway
190 | CP07350: Combined passenger transport
191 | CP07361: Funicular
192 | CP07362: Removal and storage services
193 | CP07369: Other purchased transport services n.e.c.
194 | CP08101: Letter handling services
195 | CP08109: Other postal services
196 | CP08201: Fixed telephone equipment
197 | CP08202: Mobile telephone equipment
198 | CP08203: Other equipment of telephone and telefax equipment
199 | CP08204: Repair of telephone or telefax equipment
200 | CP08301: Wired telephone services
201 | CP08302: Wireless telephone services
202 | CP08303: Internet access provision services
203 | CP08304: Bundled telecommunication services
204 | CP08305: Other information transmission services
205 | CP09111: Equipment for the reception
206 | CP09112: Equipment for the reception
207 | CP09113: Portable sound and vision devices
208 | CP09119: Other equipment for the reception
209 | CP09121: Cameras
210 | CP09122: Accessories for photographic and cinematographic equipment
211 | CP09123: Optical instruments
212 | CP09131: Personal computers
213 | CP09132: Accessories for information processing equipment
214 | CP09133: Software
215 | CP09134: Calculators and other information processing equipment
216 | CP09141: Pre-recorded recording media
217 | CP09142: Unrecorded recording media
218 | CP09149: Other recording media
219 | CP09150: Repair of audio-visual
220 | CP09211: Camper vans
221 | CP09212: Aeroplanes
222 | CP09213: Boats
223 | CP09214: Horses
224 | CP09215: Major items for games and sport
225 | CP09221: Musical instruments
226 | CP09222: Major durables for indoor recreation
227 | CP09230: Maintenance and repair of other major durables for recreation and culture
228 | CP09311: Games and hobbies
229 | CP09312: Toys and celebration articles
230 | CP09321: Equipment for sport
231 | CP09322: Equipment for camping and open-air recreation
232 | CP09323: Repair of equipment for sport
233 | CP09331: Garden products
234 | CP09332: Plants and flowers
235 | CP09341: Purchase of pets
236 | CP09342: Products for pets
237 | CP09350: Veterinary and other services for pets
238 | CP09411: Recreational and sporting services - Attendance
239 | CP09412: Recreational and sporting services - Participation
240 | CP09421: Cinemas
241 | CP09422: Museums
242 | CP09423: Television and radio licence fees
243 | CP09424: Hire of equipment and accessories for culture
244 | CP09425: Photographic services
245 | CP09429: Other cultural services
246 | CP09430: Games of chance
247 | CP09511: Fiction books
248 | CP09512: Educational text books
249 | CP09513: Other non-fiction books
250 | CP09514: Binding services and E-book downloads
251 | CP09521: Newspapers
252 | CP09522: Magazines and periodicals
253 | CP09530: Miscellaneous printed matter
254 | CP09541: Paper products
255 | CP09549: Other stationery and drawing materials
256 | CP09601: Package domestic holidays
257 | CP09602: Package international holidays
258 | CP10101: Pre-primary education
259 | CP10102: Primary education
260 | CP10200: Secondary education
261 | CP10300: Post-secondary non-tertiary education
262 | CP10400: Tertiary education
263 | CP10500: Education not definable by level
264 | CP11111: Restaurants
265 | CP11112: Fast food and take away food services
266 | CP11120: Canteens
267 | CP11201: Hotels
268 | CP11202: Holiday centres
269 | CP11203: Accommodation services of other establishments
270 | CP12111: Hairdressing for men and children
271 | CP12112: Hairdressing for women
272 | CP12113: Personal grooming treatments
273 | CP12121: Electric appliances for personal care
274 | CP12122: Repair of electric appliances for personal care
275 | CP12131: Non-electrical appliances
276 | CP12132: Articles for personal hygiene and wellness
277 | CP12200: Prostitution
278 | CP12311: Jewellery
279 | CP12312: Clocks and watches
280 | CP12313: Repair of jewellery
281 | CP12321: Travel goods
282 | CP12322: Articles for babies
283 | CP12323: Repair of other personal effects
284 | CP12329: Other personal effects n.e.c.
285 | CP12401: Child care services
286 | CP12402: Retirement homes for elderly persons and residences for disabled persons
287 | CP12403: Services to maintain people in their private homes
288 | CP12404: Counselling
289 | CP12510: Life insurance
290 | CP12520: Insurance connected with the dwelling
291 | CP12531: Public insurance connected with health
292 | CP12532: Private insurance connected with health
293 | CP12541: Motor vehicle insurance
294 | CP12542: Travel insurance
295 | CP12550: Other insurance
296 | CP12610: FISIM
297 | CP12621: Charges by banks and post offices
298 | CP12622: Fees and service charges of brokers
299 | CP12701: Administrative fees
300 | CP12702: Legal services and accountancy
301 | CP12703: Funeral services
302 | CP12704: Other fees and services
303 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/example_data/sic_2d_condensed.txt:
--------------------------------------------------------------------------------
1 | 01: Crop and animal production, hunting and related service activities
2 | 02: Forestry and logging
3 | 03: Fishing and aquaculture
4 | 05: Mining of coal and lignite
5 | 06: Extraction of crude petroleum and natural gas
6 | 07: Mining of metal ores
7 | 08: Other mining and quarrying
8 | 09: Mining support service activities
9 | 10: Manufacture of food products
10 | 11: Manufacture of beverages
11 | 12: Manufacture of tobacco products
12 | 13: Manufacture of textiles
13 | 14: Manufacture of wearing apparel
14 | 15: Manufacture of leather and related products
15 | 16: Manufacture of wood and of products of wood and cork, except furniture; manufacture of articles of straw and plaiting materials
16 | 17: Manufacture of paper and paper products
17 | 18: Printing and reproduction of recorded media
18 | 19: Manufacture of coke and refined petroleum products
19 | 20: Manufacture of chemicals and chemical products
20 | 21: Manufacture of basic pharmaceutical products and pharmaceutical preparations
21 | 22: Manufacture of rubber and plastic products
22 | 23: Manufacture of other non-metallic mineral products
23 | 24: Manufacture of basic metals
24 | 25: Manufacture of fabricated metal products, except machinery and equipment
25 | 26: Manufacture of computer, electronic and optical products
26 | 27: Manufacture of electrical equipment
27 | 28: Manufacture of machinery and equipment nec
28 | 29: Manufacture of motor vehicles, trailers and semi-trailers
29 | 30: Manufacture of other transport equipment
30 | 31: Manufacture of furniture
31 | 32: Other manufacturing
32 | 33: Repair and installation of machinery and equipment
33 | 35: Electricity, gas, steam and air conditioning supply
34 | 36: Water collection, treatment and supply
35 | 37: Sewerage
36 | 38: Waste collection, treatment and disposal activities; materials recovery
37 | 39: Remediation activities and other waste management services
38 | 41: Construction of buildings
39 | 42: Civil engineering
40 | 43: Specialised construction activities
41 | 45: Wholesale and retail trade and repair of motor vehicles and motorcycles
42 | 46: Wholesale trade, except of motor vehicles and motorcycles
43 | 47: Retail trade, except of motor vehicles and motorcycles
44 | 49: Land transport and transport via pipelines
45 | 50: Water transport
46 | 51: Air transport
47 | 52: Warehousing and support activities for transportation
48 | 53: Postal and courier activities
49 | 55: Accommodation
50 | 56: Food and beverage service activities
51 | 58: Publishing activities
52 | 59: Motion picture, video and television programme production, sound recording and music publishing activities
53 | 60: Programming and broadcasting activities
54 | 61: Telecommunications
55 | 62: Computer programming, consultancy and related activities
56 | 63: Information service activities
57 | 64: Financial service activities, except insurance and pension funding
58 | 65: Insurance, reinsurance and pension funding, except compulsory social security
59 | 66: Activities auxiliary to financial services and insurance activities
60 | 68: Real estate activities
61 | 69: Legal and accounting activities
62 | 70: Activities of head offices; management consultancy activities
63 | 71: Architectural and engineering activities; technical testing and analysis
64 | 72: Scientific research and development
65 | 73: Advertising and market research
66 | 74: Other professional, scientific and technical activities
67 | 75: Veterinary activities
68 | 77: Rental and leasing activities
69 | 78: Employment activities
70 | 79: Travel agency, tour operator and other reservation service and related activities
71 | 80: Security and investigation activities
72 | 81: Services to buildings and landscape activities
73 | 82: Office administrative, office support and other business support activities
74 | 84: Public administration and defence; compulsory social security
75 | 85: Education
76 | 86: Human health activities
77 | 87: Residential care activities
78 | 88: Social work activities without accommodation
79 | 90: Creative, arts and entertainment activities
80 | 91: Libraries, archives, museums and other cultural activities
81 | 92: Gambling and betting activities
82 | 93: Sports activities and amusement and recreation activities
83 | 94: Activities of membership organisations
84 | 95: Repair of computers and personal and household goods
85 | 96: Other personal service activities
86 | 97: Activities of households as employers of domestic personnel
87 | 98: Undifferentiated goods- and services-producing activities of private households for own use
88 | 99: Activities of extraterritorial organisations and bodies
89 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/example_data/sic_4d_condensed.txt:
--------------------------------------------------------------------------------
1 | 0111: Growing of cereals (except rice), leguminous crops and oil seeds
2 | 0112: Growing of rice
3 | 0113: Growing of vegetables and melons, roots and tubers
4 | 0114: Growing of sugar cane
5 | 0115: Growing of tobacco
6 | 0116: Growing of fibre crops
7 | 0119: Growing of other non-perennial crops
8 | 0121: Growing of grapes
9 | 0122: Growing of tropical and subtropical fruits
10 | 0123: Growing of citrus fruits
11 | 0124: Growing of pome fruits and stone fruits
12 | 0125: Growing of other tree and bush fruits and nuts
13 | 0126: Growing of oleaginous fruits
14 | 0127: Growing of beverage crops
15 | 0128: Growing of spices, aromatic, drug and pharmaceutical crops
16 | 0129: Growing of other perennial crops
17 | 0130: Plant propagation
18 | 0141: Raising of dairy cattle
19 | 0142: Raising of other cattle and buffaloes
20 | 0143: Raising of horses and other equines
21 | 0144: Raising of camels and camelids
22 | 0145: Raising of sheep and goats
23 | 0146: Raising of swine/pigs
24 | 0147: Raising of poultry
25 | 0149: Raising of other animals
26 | 0150: Mixed farming
27 | 0161: Support activities for crop production
28 | 0162: Support activities for animal production
29 | 0163: Post-harvest crop activities
30 | 0164: Seed processing for propagation
31 | 0170: Hunting, trapping and related service activities
32 | 0210: Silviculture and other forestry activities
33 | 0220: Logging
34 | 0230: Gathering of wild growing non-wood products
35 | 0240: Support services to forestry
36 | 0311: Marine fishing
37 | 0312: Freshwater fishing
38 | 0321: Marine aquaculture
39 | 0322: Freshwater aquaculture
40 | 0510: Mining of hard coal
41 | 0520: Mining of lignite
42 | 0610: Extraction of crude petroleum
43 | 0620: Extraction of natural gas
44 | 0710: Mining of iron ores
45 | 0721: Mining of uranium and thorium ores
46 | 0729: Mining of other non-ferrous metal ores
47 | 0811: Quarrying of ornamental and building stone, limestone, gypsum, chalk and slate
48 | 0812: Operation of gravel and sand pits; mining of clays and kaolin
49 | 0891: Mining of chemical and fertiliser minerals
50 | 0892: Extraction of peat
51 | 0893: Extraction of salt
52 | 0899: Other mining and quarrying nec
53 | 0910: Support activities for petroleum and natural gas extraction
54 | 0990: Support activities for other mining and quarrying
55 | 1011: Processing and preserving of meat
56 | 1012: Processing and preserving of poultry meat
57 | 1013: Production of meat and poultry meat products
58 | 1020: Processing and preserving of fish, crustaceans and molluscs
59 | 1031: Processing and preserving of potatoes
60 | 1032: Manufacture of fruit and vegetable juice
61 | 1039: Other processing and preserving of fruit and vegetables
62 | 1041: Manufacture of oils and fats
63 | 1042: Manufacture of margarine and similar edible fats
64 | 1051: Operation of dairies and cheese making
65 | 1052: Manufacture of ice cream
66 | 1061: Manufacture of grain mill products
67 | 1062: Manufacture of starches and starch products
68 | 1071: Manufacture of bread; manufacture of fresh pastry goods and cakes
69 | 1072: Manufacture of rusks and biscuits; manufacture of preserved pastry goods and cakes
70 | 1073: Manufacture of macaroni, noodles, couscous and similar farinaceous products
71 | 1081: Manufacture of sugar
72 | 1082: Manufacture of cocoa, chocolate and sugar confectionery
73 | 1083: Processing of tea and coffee
74 | 1084: Manufacture of condiments and seasonings
75 | 1085: Manufacture of prepared meals and dishes
76 | 1086: Manufacture of homogenised food preparations and dietetic food
77 | 1089: Manufacture of other food products nec
78 | 1091: Manufacture of prepared feeds for farm animals
79 | 1092: Manufacture of prepared pet foods
80 | 1101: Distilling, rectifying and blending of spirits
81 | 1102: Manufacture of wine from grape
82 | 1103: Manufacture of cider and other fruit wines
83 | 1104: Manufacture of other non-distilled fermented beverages
84 | 1105: Manufacture of beer
85 | 1106: Manufacture of malt
86 | 1107: Manufacture of soft drinks; production of mineral waters and other bottled waters
87 | 1200: Manufacture of tobacco products
88 | 1310: Preparation and spinning of textile fibres
89 | 1320: Weaving of textiles
90 | 1330: Finishing of textiles
91 | 1391: Manufacture of knitted and crocheted fabrics
92 | 1392: Manufacture of made-up textile articles, except apparel
93 | 1393: Manufacture of carpets and rugs
94 | 1394: Manufacture of cordage, rope, twine and netting
95 | 1395: Manufacture of non-wovens and articles made from non-wovens, except apparel
96 | 1396: Manufacture of other technical and industrial textiles
97 | 1399: Manufacture of other textiles nec
98 | 1411: Manufacture of leather clothes
99 | 1412: Manufacture of workwear
100 | 1413: Manufacture of other outerwear
101 | 1414: Manufacture of underwear
102 | 1419: Manufacture of other wearing apparel and accessories
103 | 1420: Manufacture of articles of fur
104 | 1431: Manufacture of knitted and crocheted hosiery
105 | 1439: Manufacture of other knitted and crocheted apparel
106 | 1511: Tanning and dressing of leather; dressing and dyeing of fur
107 | 1512: Manufacture of luggage, handbags and the like, saddlery and harness
108 | 1520: Manufacture of footwear
109 | 1610: Sawmilling and planing of wood
110 | 1621: Manufacture of veneer sheets and wood-based panels
111 | 1622: Manufacture of assembled parquet floors
112 | 1623: Manufacture of other builders' carpentry and joinery
113 | 1624: Manufacture of wooden containers
114 | 1629: Manufacture of other products of wood; manufacture of articles of cork, straw and plaiting materials
115 | 1711: Manufacture of pulp
116 | 1712: Manufacture of paper and paperboard
117 | 1721: Manufacture of corrugated paper and paperboard and of containers of paper and paperboard
118 | 1722: Manufacture of household and sanitary goods and of toilet requisites
119 | 1723: Manufacture of paper stationery
120 | 1724: Manufacture of wallpaper
121 | 1729: Manufacture of other articles of paper and paperboard
122 | 1811: Printing of newspapers
123 | 1812: Other printing
124 | 1813: Pre-press and pre-media services
125 | 1814: Binding and related services
126 | 1820: Reproduction of recorded media
127 | 1910: Manufacture of coke oven products
128 | 1920: Manufacture of refined petroleum products
129 | 2011: Manufacture of industrial gases
130 | 2012: Manufacture of dyes and pigments
131 | 2013: Manufacture of other inorganic basic chemicals
132 | 2014: Manufacture of other organic basic chemicals
133 | 2015: Manufacture of fertilisers and nitrogen compounds
134 | 2016: Manufacture of plastics in primary forms
135 | 2017: Manufacture of synthetic rubber in primary forms
136 | 2020: Manufacture of pesticides and other agrochemical products
137 | 2030: Manufacture of paints, varnishes and similar coatings, printing ink and mastics
138 | 2041: Manufacture of soap and detergents, cleaning and polishing preparations
139 | 2042: Manufacture of perfumes and toilet preparations
140 | 2051: Manufacture of explosives
141 | 2052: Manufacture of glues
142 | 2053: Manufacture of essential oils
143 | 2059: Manufacture of other chemical products nec
144 | 2060: Manufacture of man-made fibres
145 | 2110: Manufacture of basic pharmaceutical products
146 | 2120: Manufacture of pharmaceutical preparations
147 | 2211: Manufacture of rubber tyres and tubes; retreading and rebuilding of rubber tyres
148 | 2219: Manufacture of other rubber products
149 | 2221: Manufacture of plastic plates, sheets, tubes and profiles
150 | 2222: Manufacture of plastic packing goods
151 | 2223: Manufacture of builders’ ware of plastic
152 | 2229: Manufacture of other plastic products
153 | 2311: Manufacture of flat glass
154 | 2312: Shaping and processing of flat glass
155 | 2313: Manufacture of hollow glass
156 | 2314: Manufacture of glass fibres
157 | 2319: Manufacture and processing of other glass, including technical glassware
158 | 2320: Manufacture of refractory products
159 | 2331: Manufacture of ceramic tiles and flags
160 | 2332: Manufacture of bricks, tiles and construction products, in baked clay
161 | 2341: Manufacture of ceramic household and ornamental articles
162 | 2342: Manufacture of ceramic sanitary fixtures
163 | 2343: Manufacture of ceramic insulators and insulating fittings
164 | 2344: Manufacture of other technical ceramic products
165 | 2349: Manufacture of other ceramic products
166 | 2351: Manufacture of cement
167 | 2352: Manufacture of lime and plaster
168 | 2361: Manufacture of concrete products for construction purposes
169 | 2362: Manufacture of plaster products for construction purposes
170 | 2363: Manufacture of ready-mixed concrete
171 | 2364: Manufacture of mortars
172 | 2365: Manufacture of fibre cement
173 | 2369: Manufacture of other articles of concrete, plaster and cement
174 | 2370: Cutting, shaping and finishing of stone
175 | 2391: Production of abrasive products
176 | 2399: Manufacture of other non-metallic mineral products nec
177 | 2410: Manufacture of basic iron and steel and of ferro-alloys
178 | 2420: Manufacture of tubes, pipes, hollow profiles and related fittings, of steel
179 | 2431: Cold drawing of bars
180 | 2432: Cold rolling of narrow strip
181 | 2433: Cold forming or folding
182 | 2434: Cold drawing of wire
183 | 2441: Precious metals production
184 | 2442: Aluminium production
185 | 2443: Lead, zinc and tin production
186 | 2444: Copper production
187 | 2445: Other non-ferrous metal production
188 | 2446: Processing of nuclear fuel
189 | 2451: Casting of iron
190 | 2452: Casting of steel
191 | 2453: Casting of light metals
192 | 2454: Casting of other non-ferrous metals
193 | 2511: Manufacture of metal structures and parts of structures
194 | 2512: Manufacture of doors and windows of metal
195 | 2521: Manufacture of central heating radiators and boilers
196 | 2529: Manufacture of other tanks, reservoirs and containers of metal
197 | 2530: Manufacture of steam generators, except central heating hot water boilers
198 | 2540: Manufacture of weapons and ammunition
199 | 2550: Forging, pressing, stamping and roll-forming of metal; powder metallurgy
200 | 2561: Treatment and coating of metals
201 | 2562: Machining
202 | 2571: Manufacture of cutlery
203 | 2572: Manufacture of locks and hinges
204 | 2573: Manufacture of tools
205 | 2591: Manufacture of steel drums and similar containers
206 | 2592: Manufacture of light metal packaging
207 | 2593: Manufacture of wire products, chain and springs
208 | 2594: Manufacture of fasteners and screw machine products
209 | 2599: Manufacture of other fabricated metal products nec
210 | 2611: Manufacture of electronic components
211 | 2612: Manufacture of loaded electronic boards
212 | 2620: Manufacture of computers and peripheral equipment
213 | 2630: Manufacture of communication equipment
214 | 2640: Manufacture of consumer electronics
215 | 2651: Manufacture of instruments and appliances for measuring, testing and navigation
216 | 2652: Manufacture of watches and clocks
217 | 2660: Manufacture of irradiation, electromedical and electrotherapeutic equipment
218 | 2670: Manufacture of optical instruments and photographic equipment
219 | 2680: Manufacture of magnetic and optical media
220 | 2711: Manufacture of electric motors, generators and transformers
221 | 2712: Manufacture of electricity distribution and control apparatus
222 | 2720: Manufacture of batteries and accumulators
223 | 2731: Manufacture of fibre optic cables
224 | 2732: Manufacture of other electronic and electric wires and cables
225 | 2733: Manufacture of wiring devices
226 | 2740: Manufacture of electric lighting equipment
227 | 2751: Manufacture of electric domestic appliances
228 | 2752: Manufacture of non-electric domestic appliances
229 | 2790: Manufacture of other electrical equipment
230 | 2811: Manufacture of engines and turbines, except aircraft, vehicle and cycle engines
231 | 2812: Manufacture of fluid power equipment
232 | 2813: Manufacture of other pumps and compressors
233 | 2814: Manufacture of other taps and valves
234 | 2815: Manufacture of bearings, gears, gearing and driving elements
235 | 2821: Manufacture of ovens, furnaces and furnace burners
236 | 2822: Manufacture of lifting and handling equipment
237 | 2823: Manufacture of office machinery and equipment (except computers and peripheral equipment)
238 | 2824: Manufacture of power-driven hand tools
239 | 2825: Manufacture of non-domestic cooling and ventilation equipment
240 | 2829: Manufacture of other general-purpose machinery nec
241 | 2830: Manufacture of agricultural and forestry machinery
242 | 2841: Manufacture of metal forming machinery
243 | 2849: Manufacture of other machine tools
244 | 2891: Manufacture of machinery for metallurgy
245 | 2892: Manufacture of machinery for mining, quarrying and construction
246 | 2893: Manufacture of machinery for food, beverage and tobacco processing
247 | 2894: Manufacture of machinery for textile, apparel and leather production
248 | 2895: Manufacture of machinery for paper and paperboard production
249 | 2896: Manufacture of plastics and rubber machinery
250 | 2899: Manufacture of other special-purpose machinery nec
251 | 2910: Manufacture of motor vehicles
252 | 2920: Manufacture of bodies (coachwork) for motor vehicles; manufacture of trailers and semi-trailers
253 | 2931: Manufacture of electrical and electronic equipment for motor vehicles
254 | 2932: Manufacture of other parts and accessories for motor vehicles
255 | 3011: Building of ships and floating structures
256 | 3012: Building of pleasure and sporting boats
257 | 3020: Manufacture of railway locomotives and rolling stock
258 | 3030: Manufacture of air and spacecraft and related machinery
259 | 3040: Manufacture of military fighting vehicles
260 | 3091: Manufacture of motorcycles
261 | 3092: Manufacture of bicycles and invalid carriages
262 | 3099: Manufacture of other transport equipment nec
263 | 3101: Manufacture of office and shop furniture
264 | 3102: Manufacture of kitchen furniture
265 | 3103: Manufacture of mattresses
266 | 3109: Manufacture of other furniture
267 | 3211: Striking of coins
268 | 3212: Manufacture of jewellery and related articles
269 | 3213: Manufacture of imitation jewellery and related articles
270 | 3220: Manufacture of musical instruments
271 | 3230: Manufacture of sports goods
272 | 3240: Manufacture of games and toys
273 | 3250: Manufacture of medical and dental instruments and supplies
274 | 3291: Manufacture of brooms and brushes
275 | 3299: Other manufacturing nec
276 | 3311: Repair of fabricated metal products
277 | 3312: Repair of machinery
278 | 3313: Repair of electronic and optical equipment
279 | 3314: Repair of electrical equipment
280 | 3315: Repair and maintenance of ships and boats
281 | 3316: Repair and maintenance of aircraft and spacecraft
282 | 3317: Repair and maintenance of other transport equipment
283 | 3319: Repair of other equipment
284 | 3320: Installation of industrial machinery and equipment
285 | 3511: Production of electricity
286 | 3512: Transmission of electricity
287 | 3513: Distribution of electricity
288 | 3514: Trade of electricity
289 | 3521: Manufacture of gas
290 | 3522: Distribution of gaseous fuels through mains
291 | 3523: Trade of gas through mains
292 | 3530: Steam and air conditioning supply
293 | 3600: Water collection, treatment and supply
294 | 3700: Sewerage
295 | 3811: Collection of non-hazardous waste
296 | 3812: Collection of hazardous waste
297 | 3821: Treatment and disposal of non-hazardous waste
298 | 3822: Treatment and disposal of hazardous waste
299 | 3831: Dismantling of wrecks
300 | 3832: Recovery of sorted materials
301 | 3900: Remediation activities and other waste management services
302 | 4110: Development of building projects
303 | 4120: Construction of residential and non-residential buildings
304 | 4211: Construction of roads and motorways
305 | 4212: Construction of railways and underground railways
306 | 4213: Construction of bridges and tunnels
307 | 4221: Construction of utility projects for fluids
308 | 4222: Construction of utility projects for electricity and telecommunications
309 | 4291: Construction of water projects
310 | 4299: Construction of other civil engineering projects nec
311 | 4311: Demolition
312 | 4312: Site preparation
313 | 4313: Test drilling and boring
314 | 4321: Electrical installation
315 | 4322: Plumbing, heat and air-conditioning installation
316 | 4329: Other construction installation
317 | 4331: Plastering
318 | 4332: Joinery installation
319 | 4333: Floor and wall covering
320 | 4334: Painting and glazing
321 | 4339: Other building completion and finishing
322 | 4391: Roofing activities
323 | 4399: Other specialised construction activities nec
324 | 4511: Sale of cars and light motor vehicles
325 | 4519: Sale of other motor vehicles
326 | 4520: Maintenance and repair of motor vehicles
327 | 4531: Wholesale trade of motor vehicle parts and accessories
328 | 4532: Retail trade of motor vehicle parts and accessories
329 | 4540: Sale, maintenance and repair of motorcycles and related parts and accessories
330 | 4611: Agents involved in the sale of agricultural raw materials, live animals, textile raw materials and semi-finished goods
331 | 4612: Agents involved in the sale of fuels, ores, metals and industrial chemicals
332 | 4613: Agents involved in the sale of timber and building materials
333 | 4614: Agents involved in the sale of machinery, industrial equipment, ships and aircraft
334 | 4615: Agents involved in the sale of furniture, household goods, hardware and ironmongery
335 | 4616: Agents involved in the sale of textiles, clothing, fur, footwear and leather goods
336 | 4617: Agents involved in the sale of food, beverages and tobacco
337 | 4618: Agents specialised in the sale of other particular products
338 | 4619: Agents involved in the sale of a variety of goods
339 | 4621: Wholesale of grain, unmanufactured tobacco, seeds and animal feeds
340 | 4622: Wholesale of flowers and plants
341 | 4623: Wholesale of live animals
342 | 4624: Wholesale of hides, skins and leather
343 | 4631: Wholesale of fruit and vegetables
344 | 4632: Wholesale of meat and meat products
345 | 4633: Wholesale of dairy products, eggs and edible oils and fats
346 | 4634: Wholesale of beverages
347 | 4635: Wholesale of tobacco products
348 | 4636: Wholesale of sugar and chocolate and sugar confectionery
349 | 4637: Wholesale of coffee, tea, cocoa and spices
350 | 4638: Wholesale of other food, including fish, crustaceans and molluscs
351 | 4639: Non-specialised wholesale of food, beverages and tobacco
352 | 4641: Wholesale of textiles
353 | 4642: Wholesale of clothing and footwear
354 | 4643: Wholesale of electrical household appliances
355 | 4644: Wholesale of china and glassware and cleaning materials
356 | 4645: Wholesale of perfume and cosmetics
357 | 4646: Wholesale of pharmaceutical goods
358 | 4647: Wholesale of furniture, carpets and lighting equipment
359 | 4648: Wholesale of watches and jewellery
360 | 4649: Wholesale of other household goods
361 | 4651: Wholesale of computers, computer peripheral equipment and software
362 | 4652: Wholesale of electronic and telecommunications equipment and parts
363 | 4661: Wholesale of agricultural machinery, equipment and supplies
364 | 4662: Wholesale of machine tools
365 | 4663: Wholesale of mining, construction and civil engineering machinery
366 | 4664: Wholesale of machinery for the textile industry and of sewing and knitting machines
367 | 4665: Wholesale of office furniture
368 | 4666: Wholesale of other office machinery and equipment
369 | 4669: Wholesale of other machinery and equipment
370 | 4671: Wholesale of solid, liquid and gaseous fuels and related products
371 | 4672: Wholesale of metals and metal ores
372 | 4673: Wholesale of wood, construction materials and sanitary equipment
373 | 4674: Wholesale of hardware, plumbing and heating equipment and supplies
374 | 4675: Wholesale of chemical products
375 | 4676: Wholesale of other intermediate products
376 | 4677: Wholesale of waste and scrap
377 | 4690: Non-specialised wholesale trade
378 | 4711: Retail sale in non-specialised stores with food, beverages or tobacco predominating
379 | 4719: Other retail sale in non-specialised stores
380 | 4721: Retail sale of fruit and vegetables in specialised stores
381 | 4722: Retail sale of meat and meat products in specialised stores
382 | 4723: Retail sale of fish, crustaceans and molluscs in specialised stores
383 | 4724: Retail sale of bread, cakes, flour confectionery and sugar confectionery in specialised stores
384 | 4725: Retail sale of beverages in specialised stores
385 | 4726: Retail sale of tobacco products in specialised stores
386 | 4729: Other retail sale of food in specialised stores
387 | 4730: Retail sale of automotive fuel in specialised stores
388 | 4741: Retail sale of computers, peripheral units and software in specialised stores
389 | 4742: Retail sale of telecommunications equipment in specialised stores
390 | 4743: Retail sale of audio and video equipment in specialised stores
391 | 4751: Retail sale of textiles in specialised stores
392 | 4752: Retail sale of hardware, paints and glass in specialised stores
393 | 4753: Retail sale of carpets, rugs, wall and floor coverings in specialised stores
394 | 4754: Retail sale of electrical household appliances in specialised stores
395 | 4759: Retail sale of furniture, lighting equipment and other household articles in specialised stores
396 | 4761: Retail sale of books in specialised stores
397 | 4762: Retail sale of newspapers and stationery in specialised stores
398 | 4763: Retail sale of music and video recordings in specialised stores
399 | 4764: Retail sale of sporting equipment in specialised stores
400 | 4765: Retail sale of games and toys in specialised stores
401 | 4771: Retail sale of clothing in specialised stores
402 | 4772: Retail sale of footwear and leather goods in specialised stores
403 | 4773: Dispensing chemist in specialised stores
404 | 4774: Retail sale of medical and orthopaedic goods in specialised stores
405 | 4775: Retail sale of cosmetic and toilet articles in specialised stores
406 | 4776: Retail sale of flowers, plants, seeds, fertilisers, pet animals and pet food in specialised stores
407 | 4777: Retail sale of watches and jewellery in specialised stores
408 | 4778: Other retail sale of new goods in specialised stores
409 | 4779: Retail sale of second-hand goods in stores
410 | 4781: Retail sale via stalls and markets of food, beverages and tobacco products
411 | 4782: Retail sale via stalls and markets of textiles, clothing and footwear
412 | 4789: Retail sale via stalls and markets of other goods
413 | 4791: Retail sale via mail order houses or via Internet
414 | 4799: Other retail sale not in stores, stalls or markets
415 | 4910: Passenger rail transport, interurban
416 | 4920: Freight rail transport
417 | 4931: Urban and suburban passenger land transport
418 | 4932: Taxi operation
419 | 4939: Other passenger land transport nec
420 | 4941: Freight transport by road
421 | 4942: Removal services
422 | 4950: Transport via pipeline
423 | 5010: Sea and coastal passenger water transport
424 | 5020: Sea and coastal freight water transport
425 | 5030: Inland passenger water transport
426 | 5040: Inland freight water transport
427 | 5110: Passenger air transport
428 | 5121: Freight air transport
429 | 5122: Space transport
430 | 5210: Warehousing and storage
431 | 5221: Service activities incidental to land transportation
432 | 5222: Service activities incidental to water transportation
433 | 5223: Service activities incidental to air transportation
434 | 5224: Cargo handling
435 | 5229: Other transportation support activities
436 | 5310: Postal activities under universal service obligation
437 | 5320: Other postal and courier activities
438 | 5510: Hotels and similar accommodation
439 | 5520: Holiday and other short-stay accommodation
440 | 5530: Camping grounds, recreational vehicle parks and trailer parks
441 | 5590: Other accommodation
442 | 5610: Restaurants and mobile food service activities
443 | 5621: Event catering activities
444 | 5629: Other food service activities
445 | 5630: Beverage serving activities
446 | 5811: Book publishing
447 | 5812: Publishing of directories and mailing lists
448 | 5813: Publishing of newspapers
449 | 5814: Publishing of journals and periodicals
450 | 5819: Other publishing activities
451 | 5821: Publishing of computer games
452 | 5829: Other software publishing
453 | 5911: Motion picture, video and television programme production activities
454 | 5912: Motion picture, video and television programme post-production activities
455 | 5913: Motion picture, video and television programme distribution activities
456 | 5914: Motion picture projection activities
457 | 5920: Sound recording and music publishing activities
458 | 6010: Radio broadcasting
459 | 6020: Television programming and broadcasting activities
460 | 6110: Wired telecommunications activities
461 | 6120: Wireless telecommunications activities
462 | 6130: Satellite telecommunications activities
463 | 6190: Other telecommunications activities
464 | 6201: Computer programming activities
465 | 6202: Computer consultancy activities
466 | 6203: Computer facilities management activities
467 | 6209: Other information technology and computer service activities
468 | 6311: Data processing, hosting and related activities
469 | 6312: Web portals
470 | 6391: News agency activities
471 | 6399: Other information service activities nec
472 | 6411: Central banking
473 | 6419: Other monetary intermediation
474 | 6420: Activities of holding companies
475 | 6430: Trusts, funds and similar financial entities
476 | 6491: Financial leasing
477 | 6492: Other credit granting
478 | 6499: Other financial service activities, except insurance and pension funding, nec
479 | 6511: Life insurance
480 | 6512: Non-life insurance
481 | 6520: Reinsurance
482 | 6530: Pension funding
483 | 6611: Administration of financial markets
484 | 6612: Security and commodity contracts brokerage
485 | 6619: Other activities auxiliary to financial services, except insurance and pension funding
486 | 6621: Risk and damage evaluation
487 | 6622: Activities of insurance agents and brokers
488 | 6629: Other activities auxiliary to insurance and pension funding
489 | 6630: Fund management activities
490 | 6810: Buying and selling of own real estate
491 | 6820: Renting and operating of own or leased real estate
492 | 6831: Real estate agencies
493 | 6832: Management of real estate on a fee or contract basis
494 | 6910: Legal activities
495 | 6920: Accounting, bookkeeping and auditing activities; tax consultancy
496 | 7010: Activities of head offices
497 | 7021: Public relations and communication activities
498 | 7022: Business and other management consultancy activities
499 | 7111: Architectural activities
500 | 7112: Engineering activities and related technical consultancy
501 | 7120: Technical testing and analysis
502 | 7211: Research and experimental development on biotechnology
503 | 7219: Other research and experimental development on natural sciences and engineering
504 | 7220: Research and experimental development on social sciences and humanities
505 | 7311: Advertising agencies
506 | 7312: Media representation
507 | 7320: Market research and public opinion polling
508 | 7410: Specialised design activities
509 | 7420: Photographic activities
510 | 7430: Translation and interpretation activities
511 | 7490: Other professional, scientific and technical activities nec
512 | 7500: Veterinary activities
513 | 7711: Renting and leasing of cars and light motor vehicles
514 | 7712: Renting and leasing of trucks
515 | 7721: Renting and leasing of recreational and sports goods
516 | 7722: Renting of video tapes and disks
517 | 7729: Renting and leasing of other personal and household goods
518 | 7731: Renting and leasing of agricultural machinery and equipment
519 | 7732: Renting and leasing of construction and civil engineering machinery and equipment
520 | 7733: Renting and leasing of office machinery and equipment (including computers)
521 | 7734: Renting and leasing of water transport equipment
522 | 7735: Renting and leasing of air transport equipment
523 | 7739: Renting and leasing of other machinery, equipment and tangible goods nec
524 | 7740: Leasing of intellectual property and similar products, except copyrighted works
525 | 7810: Activities of employment placement agencies
526 | 7820: Temporary employment agency activities
527 | 7830: Other human resources provision
528 | 7911: Travel agency activities
529 | 7912: Tour operator activities
530 | 7990: Other reservation service and related activities
531 | 8010: Private security activities
532 | 8020: Security systems service activities
533 | 8030: Investigation activities
534 | 8110: Combined facilities support activities
535 | 8121: General cleaning of buildings
536 | 8122: Other building and industrial cleaning activities
537 | 8129: Other cleaning activities
538 | 8130: Landscape service activities
539 | 8211: Combined office administrative service activities
540 | 8219: Photocopying, document preparation and other specialised office support activities
541 | 8220: Activities of call centres
542 | 8230: Organisation of conventions and trade shows
543 | 8291: Activities of collection agencies and credit bureaus
544 | 8292: Packaging activities
545 | 8299: Other business support service activities nec
546 | 8411: General public administration activities
547 | 8412: Regulation of the activities of providing health care, education, cultural services and other social services, excluding social security
548 | 8413: Regulation of and contribution to more efficient operation of businesses
549 | 8421: Foreign affairs
550 | 8422: Defence activities
551 | 8423: Justice and judicial activities
552 | 8424: Public order and safety activities
553 | 8425: Fire service activities
554 | 8430: Compulsory social security activities
555 | 8510: Pre-primary education
556 | 8520: Primary education
557 | 8531: General secondary education
558 | 8532: Technical and vocational secondary education
559 | 8541: Post-secondary non-tertiary education
560 | 8542: Tertiary education
561 | 8551: Sports and recreation education
562 | 8552: Cultural education
563 | 8553: Driving school activities
564 | 8559: Other education nec
565 | 8560: Educational support activities
566 | 8610: Hospital activities
567 | 8621: General medical practice activities
568 | 8622: Specialist medical practice activities
569 | 8623: Dental practice activities
570 | 8690: Other human health activities
571 | 8710: Residential nursing care activities
572 | 8720: Residential care activities for learning disabilities, mental health and substance abuse
573 | 8730: Residential care activities for the elderly and disabled
574 | 8790: Other residential care activities
575 | 8810: Social work activities without accommodation for the elderly and disabled
576 | 8891: Child day-care activities
577 | 8899: Other social work activities without accommodation nec
578 | 9001: Performing arts
579 | 9002: Support activities to performing arts
580 | 9003: Artistic creation
581 | 9004: Operation of arts facilities
582 | 9101: Library and archive activities
583 | 9102: Museum activities
584 | 9103: Operation of historical sites and buildings and similar visitor attractions
585 | 9104: Botanical and zoological gardens and nature reserve activities
586 | 9200: Gambling and betting activities
587 | 9311: Operation of sports facilities
588 | 9312: Activities of sport clubs
589 | 9313: Fitness facilities
590 | 9319: Other sports activities
591 | 9321: Activities of amusement parks and theme parks
592 | 9329: Other amusement and recreation activities
593 | 9411: Activities of business and employers membership organisations
594 | 9412: Activities of professional membership organisations
595 | 9420: Activities of trade unions
596 | 9491: Activities of religious organisations
597 | 9492: Activities of political organisations
598 | 9499: Activities of other membership organisations nec
599 | 9511: Repair of computers and peripheral equipment
600 | 9512: Repair of communication equipment
601 | 9521: Repair of consumer electronics
602 | 9522: Repair of household appliances and home and garden equipment
603 | 9523: Repair of footwear and leather goods
604 | 9524: Repair of furniture and home furnishings
605 | 9525: Repair of watches, clocks and jewellery
606 | 9529: Repair of other personal and household goods
607 | 9601: Washing and (dry-)cleaning of textile and fur products
608 | 9602: Hairdressing and other beauty treatment
609 | 9603: Funeral and related activities
610 | 9604: Physical well-being activities
611 | 9609: Other personal service activities nec
612 | 9700: Activities of households as employers of domestic personnel
613 | 9810: Undifferentiated goods-producing activities of private households for own use
614 | 9820: Undifferentiated service-producing activities of private households for own use
615 | 9900: Activities of extraterritorial organisations and bodies
616 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/example_data/soc_4d_condensed.txt:
--------------------------------------------------------------------------------
1 | 1111: Chief executives and senior officials
2 | 1112: Elected officers and representatives
3 | 1121: Production managers and directors in manufacturing
4 | 1122: Production managers and directors in construction
5 | 1123: Production managers and directors in mining and energy
6 | 1131: Financial managers and directors
7 | 1132: Marketing, sales and advertising directors
8 | 1133: Public relations and communications directors
9 | 1134: Purchasing managers and directors
10 | 1135: Charitable organisation managers and directors
11 | 1136: Human resource managers and directors
12 | 1137: Information technology directors
13 | 1139: Functional managers and directors n.e.c.
14 | 1140: Directors in logistics, warehousing and transport
15 | 1150: Managers and directors in retail and wholesale
16 | 1161: Officers in armed forces
17 | 1162: Senior police officers
18 | 1163: Senior officers in fire, ambulance, prison and related services
19 | 1171: Health services and public health managers and directors
20 | 1172: Social services managers and directors
21 | 1211: Managers and proprietors in agriculture and horticulture
22 | 1212: Managers and proprietors in forestry, fishing and related services
23 | 1221: Hotel and accommodation managers and proprietors
24 | 1222: Restaurant and catering establishment managers and proprietors
25 | 1223: Publicans and managers of licensed premises
26 | 1224: Leisure and sports managers and proprietors
27 | 1225: Travel agency managers and proprietors
28 | 1231: Health care practice managers
29 | 1232: Residential, day and domiciliary care managers and proprietors
30 | 1233: Early education and childcare services proprietors
31 | 1241: Managers in transport and distribution
32 | 1242: Managers in storage and warehousing
33 | 1243: Managers in logistics
34 | 1251: Property, housing and estate managers
35 | 1252: Garage managers and proprietors
36 | 1253: Hairdressing and beauty salon managers and proprietors
37 | 1254: Waste disposal and environmental services managers
38 | 1255: Managers and directors in the creative industries
39 | 1256: Betting shop and gambling establishment managers
40 | 1257: Hire services managers and proprietors
41 | 1258: Directors in consultancy services
42 | 1259: Managers and proprietors in other services n.e.c.
43 | 2111: Chemical scientists
44 | 2112: Biological scientists
45 | 2113: Biochemists and biomedical scientists
46 | 2114: Physical scientists
47 | 2115: Social and humanities scientists
48 | 2119: Natural and social science professionals n.e.c.
49 | 2121: Civil engineers
50 | 2122: Mechanical engineers
51 | 2123: Electrical engineers
52 | 2124: Electronics engineers
53 | 2125: Production and process engineers
54 | 2126: Aerospace engineers
55 | 2127: Engineering project managers and project engineers
56 | 2129: Engineering professionals n.e.c.
57 | 2131: IT project managers
58 | 2132: IT managers
59 | 2133: IT business analysts, architects and systems designers
60 | 2134: Programmers and software development professionals
61 | 2135: Cyber security professionals
62 | 2136: IT quality and testing professionals
63 | 2137: IT network professionals
64 | 2139: Information technology professionals n.e.c.
65 | 2141: Web design professionals
66 | 2142: Graphic and multimedia designers
67 | 2151: Conservation professionals
68 | 2152: Environment professionals
69 | 2161: Research and development (R&D) managers
70 | 2162: Other researchers, unspecified discipline
71 | 2211: Generalist medical practitioners
72 | 2212: Specialist medical practitioners
73 | 2221: Physiotherapists
74 | 2222: Occupational therapists
75 | 2223: Speech and language therapists
76 | 2224: Psychotherapists and cognitive behaviour therapists
77 | 2225: Clinical psychologists
78 | 2226: Other psychologists
79 | 2229: Therapy professionals n.e.c.
80 | 2231: Midwifery nurses
81 | 2232: Registered community nurses
82 | 2233: Registered specialist nurses
83 | 2234: Registered nurse practitioners
84 | 2235: Registered mental health nurses
85 | 2236: Registered children's nurses
86 | 2237: Other registered nursing professionals
87 | 2240: Veterinarians
88 | 2251: Pharmacists
89 | 2252: Optometrists
90 | 2253: Dental practitioners
91 | 2254: Medical radiographers
92 | 2255: Paramedics
93 | 2256: Podiatrists
94 | 2259: Other health professionals n.e.c.
95 | 2311: Higher education teaching professionals
96 | 2312: Further education teaching professionals
97 | 2313: Secondary education teaching professionals
98 | 2314: Primary education teaching professionals
99 | 2315: Nursery education teaching professionals
100 | 2316: Special and additional needs education teaching professionals
101 | 2317: Teachers of English as a foreign language
102 | 2319: Teaching professionals n.e.c.
103 | 2321: Head teachers and principals
104 | 2322: Education managers
105 | 2323: Education advisers and school inspectors
106 | 2324: Early education and childcare services managers
107 | 2329: Other educational professionals n.e.c.
108 | 2411: Barristers and judges
109 | 2412: Solicitors and lawyers
110 | 2419: Legal professionals n.e.c.
111 | 2421: Chartered and certified accountants
112 | 2422: Finance and investment analysts and advisers
113 | 2423: Taxation experts
114 | 2431: Management consultants and business analysts
115 | 2432: Marketing and commercial managers
116 | 2433: Actuaries, economists and statisticians
117 | 2434: Business and related research professionals
118 | 2435: Professional/chartered company secretaries
119 | 2439: Business, research and administrative professionals n.e.c.
120 | 2440: Business and financial project management professionals
121 | 2451: Architects
122 | 2452: Chartered architectural technologists, planning officers and consultants
123 | 2453: Quantity surveyors
124 | 2454: Chartered surveyors
125 | 2455: Construction project managers and related professionals
126 | 2461: Social workers
127 | 2462: Probation officers
128 | 2463: Clergy
129 | 2464: Youth work professionals
130 | 2469: Welfare professionals n.e.c.
131 | 2471: Librarians
132 | 2472: Archivists, conservators and curators
133 | 2481: Quality control and planning engineers
134 | 2482: Quality assurance and regulatory professionals
135 | 2483: Environmental health professionals
136 | 2491: Newspaper, periodical and broadcast editors
137 | 2492: Newspaper and periodical broadcast journalists and reporters
138 | 2493: Public relations professionals
139 | 2494: Advertising accounts managers and creative directors
140 | 3111: Laboratory technicians
141 | 3112: Electrical and electronics technicians
142 | 3113: Engineering technicians
143 | 3114: Building and civil engineering technicians
144 | 3115: Quality assurance technicians
145 | 3116: Planning, process and production technicians
146 | 3119: Science, engineering and production technicians n.e.c.
147 | 3120: CAD, drawing and architectural technicians
148 | 3131: IT operations technicians
149 | 3132: IT user support technicians
150 | 3133: Database administrators and web content technicians
151 | 3211: Dispensing opticians
152 | 3212: Pharmaceutical technicians
153 | 3213: Medical and dental technicians
154 | 3214: Complementary health associate professionals
155 | 3219: Health associate professionals n.e.c.
156 | 3221: Youth and community workers
157 | 3222: Child and early years officers
158 | 3223: Housing officers
159 | 3224: Counsellors
160 | 3229: Welfare and housing associate professionals n.e.c.
161 | 3231: Higher level teaching assistants
162 | 3232: Early education and childcare practitioners
163 | 3240: Veterinary nurses
164 | 3311: Non-commissioned officers and other ranks
165 | 3312: Police officers (sergeant and below)
166 | 3313: Fire service officers (watch manager and below)
167 | 3314: Prison service officers (below principal officer)
168 | 3319: Protective service associate professionals n.e.c.
169 | 3411: Artists
170 | 3412: Authors, writers and translators
171 | 3413: Actors, entertainers and presenters
172 | 3414: Dancers and choreographers
173 | 3415: Musicians
174 | 3416: Arts officers, producers and directors
175 | 3417: Photographers, audio-visual and broadcasting equipment operators
176 | 3421: Interior designers
177 | 3422: Clothing, fashion and accessories designers
178 | 3429: Design occupations n.e.c.
179 | 3431: Sports players
180 | 3432: Sports coaches, instructors and officials
181 | 3433: Fitness and wellbeing instructors
182 | 3511: Aircraft pilots and air traffic controllers
183 | 3512: Ship and hovercraft officers
184 | 3520: Legal associate professionals
185 | 3531: Brokers
186 | 3532: Insurance underwriters
187 | 3533: Financial and accounting technicians
188 | 3534: Financial accounts managers
189 | 3541: Estimators, valuers and assessors
190 | 3542: Importers and exporters
191 | 3543: Project support officers
192 | 3544: Data analysts
193 | 3549: Business associate professionals n.e.c.
194 | 3551: Buyers and procurement officers
195 | 3552: Business sales executives
196 | 3553: Merchandisers
197 | 3554: Advertising and marketing associate professionals
198 | 3555: Estate agents and auctioneers
199 | 3556: Sales accounts and business development managers
200 | 3557: Events managers and organisers
201 | 3560: Public services associate professionals
202 | 3571: Human resources and industrial relations officers
203 | 3572: Careers advisers and vocational guidance specialists
204 | 3573: Information technology trainers
205 | 3574: Other vocational and industrial trainers
206 | 3581: Inspectors of standards and regulations
207 | 3582: Health and safety managers and officers
208 | 4111: National government administrative occupations
209 | 4112: Local government administrative occupations
210 | 4113: Officers of non-governmental organisations
211 | 4121: Credit controllers
212 | 4122: Book-keepers, payroll managers and wages clerks
213 | 4123: Bank and post office clerks
214 | 4124: Finance officers
215 | 4129: Financial administrative occupations n.e.c.
216 | 4131: Records clerks and assistants
217 | 4132: Pensions and insurance clerks and assistants
218 | 4133: Stock control clerks and assistants
219 | 4134: Transport and distribution clerks and assistants
220 | 4135: Library clerks and assistants
221 | 4136: Human resources administrative occupations
222 | 4141: Office managers
223 | 4142: Office supervisors
224 | 4143: Customer service managers
225 | 4151: Sales administrators
226 | 4152: Data entry administrators
227 | 4159: Other administrative occupations n.e.c.
228 | 4211: Medical secretaries
229 | 4212: Legal secretaries
230 | 4213: School secretaries
231 | 4214: Company secretaries and administrators
232 | 4215: Personal assistants and other secretaries
233 | 4216: Receptionists
234 | 4217: Typists and related keyboard occupations
235 | 5111: Farmers
236 | 5112: Horticultural trades
237 | 5113: Gardeners and landscape gardeners
238 | 5114: Groundsmen and greenkeepers
239 | 5119: Agricultural and fishing trades n.e.c.
240 | 5211: Sheet metal workers
241 | 5212: Metal plate workers, smiths, moulders and related occupations
242 | 5213: Welding trades
243 | 5214: Pipe fitters
244 | 5221: Metal machining setters and setter-operators
245 | 5222: Tool makers, tool fitters and markers-out
246 | 5223: Metal working production and maintenance fitters
247 | 5224: Precision instrument makers and repairers
248 | 5225: Air-conditioning and refrigeration installers and repairers
249 | 5231: Vehicle technicians, mechanics and electricians
250 | 5232: Vehicle body builders and repairers
251 | 5233: Vehicle paint technicians
252 | 5234: Aircraft maintenance and related trades
253 | 5235: Boat and ship builders and repairers
254 | 5236: Rail and rolling stock builders and repairers
255 | 5241: Electricians and electrical fitters
256 | 5242: Telecoms and related network installers and repairers
257 | 5243: TV, video and audio servicers and repairers
258 | 5244: Computer system and equipment installers and servicers
259 | 5245: Security system installers and repairers
260 | 5246: Electrical service and maintenance mechanics and repairers
261 | 5249: Electrical and electronic trades n.e.c.
262 | 5250: Skilled metal, electrical and electronic trades supervisors
263 | 5311: Steel erectors
264 | 5312: Stonemasons and related trades
265 | 5313: Bricklayers
266 | 5314: Roofers, roof tilers and slaters
267 | 5315: Plumbers and heating and ventilating installers and repairers
268 | 5316: Carpenters and joiners
269 | 5317: Glaziers, window fabricators and fitters
270 | 5319: Construction and building trades n.e.c.
271 | 5321: Plasterers
272 | 5322: Floorers and wall tilers
273 | 5323: Painters and decorators
274 | 5330: Construction and building trades supervisors
275 | 5411: Upholsterers
276 | 5412: Footwear and leather working trades
277 | 5413: Tailors and dressmakers
278 | 5419: Textiles, garments and related trades n.e.c.
279 | 5421: Pre-press technicians
280 | 5422: Printers
281 | 5423: Print finishing and binding workers
282 | 5431: Butchers
283 | 5432: Bakers and flour confectioners
284 | 5433: Fishmongers and poultry dressers
285 | 5434: Chefs
286 | 5435: Cooks
287 | 5436: Catering and bar managers
288 | 5441: Glass and ceramics makers, decorators and finishers
289 | 5442: Furniture makers and other craft woodworkers
290 | 5443: Florists
291 | 5449: Other skilled trades n.e.c.
292 | 6111: Early education and childcare assistants
293 | 6112: Teaching assistants
294 | 6113: Educational support assistants
295 | 6114: Childminders
296 | 6116: Nannies and au pairs
297 | 6117: Playworkers
298 | 6121: Pest control officers
299 | 6129: Animal care services occupations n.e.c.
300 | 6131: Nursing auxiliaries and assistants
301 | 6132: Ambulance staff (excluding paramedics)
302 | 6133: Dental nurses
303 | 6134: Houseparents and residential wardens
304 | 6135: Care workers and home carers
305 | 6136: Senior care workers
306 | 6137: Care escorts
307 | 6138: Undertakers, mortuary and crematorium assistants
308 | 6211: Sports and leisure assistants
309 | 6212: Travel agents
310 | 6213: Air travel assistants
311 | 6214: Rail travel assistants
312 | 6219: Leisure and travel service occupations n.e.c.
313 | 6221: Hairdressers and barbers
314 | 6222: Beauticians and related occupations
315 | 6231: Housekeepers and related occupations
316 | 6232: Caretakers
317 | 6240: Cleaning and housekeeping managers and supervisors
318 | 6250: Bed and breakfast and guest house owners and proprietors
319 | 6311: Police community support officers
320 | 6312: Parking and civil enforcement occupations
321 | 7111: Sales and retail assistants
322 | 7112: Retail cashiers and check-out operators
323 | 7113: Telephone salespersons
324 | 7114: Pharmacy and optical dispensing assistants
325 | 7115: Vehicle and parts salespersons and advisers
326 | 7121: Collector salespersons and credit agents
327 | 7122: Debt, rent and other cash collectors
328 | 7123: Roundspersons and van salespersons
329 | 7124: Market and street traders and assistants
330 | 7125: Visual merchandisers and related occupations
331 | 7129: Sales related occupations n.e.c.
332 | 7131: Shopkeepers and owners - retail and wholesale
333 | 7132: Sales supervisors - retail and wholesale
334 | 7211: Call and contact centre occupations
335 | 7212: Telephonists
336 | 7213: Communication operators
337 | 7214: Market research interviewers
338 | 7219: Customer service occupations n.e.c.
339 | 7220: Customer service supervisors
340 | 8111: Food, drink and tobacco process operatives
341 | 8112: Textile process operatives
342 | 8113: Chemical and related process operatives
343 | 8114: Plastics process operatives
344 | 8115: Metal making and treating process operatives
345 | 8119: Process operatives n.e.c.
346 | 8120: Metal working machine operatives
347 | 8131: Paper and wood machine operatives
348 | 8132: Mining and quarry workers and related operatives
349 | 8133: Energy plant operatives
350 | 8134: Water and sewerage plant operatives
351 | 8135: Printing machine assistants
352 | 8139: Plant and machine operatives n.e.c.
353 | 8141: Assemblers (electrical and electronic products)
354 | 8142: Assemblers (vehicles and metal goods)
355 | 8143: Routine inspectors and testers
356 | 8144: Weighers, graders and sorters
357 | 8145: Tyre, exhaust and windscreen fitters
358 | 8146: Sewing machinists
359 | 8149: Assemblers and routine operatives n.e.c.
360 | 8151: Scaffolders, stagers and riggers
361 | 8152: Road construction operatives
362 | 8153: Rail construction and maintenance operatives
363 | 8159: Construction operatives n.e.c.
364 | 8160: Production, factory and assembly supervisors
365 | 8211: Heavy and large goods vehicle drivers
366 | 8212: Bus and coach drivers
367 | 8213: Taxi and cab drivers and chauffeurs
368 | 8214: Delivery drivers and couriers
369 | 8215: Driving instructors
370 | 8219: Road transport drivers n.e.c.
371 | 8221: Crane drivers
372 | 8222: Fork-lift truck drivers
373 | 8229: Mobile machine drivers and operatives n.e.c.
374 | 8231: Train and tram drivers
375 | 8232: Marine and waterways transport operatives
376 | 8233: Air transport operatives
377 | 8234: Rail transport operatives
378 | 8239: Other drivers and transport operatives n.e.c.
379 | 9111: Farm workers
380 | 9112: Forestry and related workers
381 | 9119: Fishing and other elementary agriculture occupations n.e.c.
382 | 9121: Groundworkers
383 | 9129: Elementary construction occupations n.e.c.
384 | 9131: Industrial cleaning process occupations
385 | 9132: Packers, bottlers, canners and fillers
386 | 9139: Elementary process plant occupations n.e.c.
387 | 9211: Postal workers, mail sorters and messengers
388 | 9219: Elementary administration occupations n.e.c.
389 | 9221: Window cleaners
390 | 9222: Street cleaners
391 | 9223: Cleaners and domestics
392 | 9224: Launderers, dry cleaners and pressers
393 | 9225: Refuse and salvage occupations
394 | 9226: Vehicle valeters and cleaners
395 | 9229: Elementary cleaning occupations n.e.c.
396 | 9231: Security guards and related occupations
397 | 9232: School midday and crossing patrol occupations
398 | 9233: Exam invigilators
399 | 9241: Shelf fillers
400 | 9249: Elementary sales occupations n.e.c.
401 | 9251: Elementary storage supervisors
402 | 9252: Warehouse operatives
403 | 9253: Delivery operatives
404 | 9259: Elementary storage occupations n.e.c.
405 | 9261: Bar and catering supervisors
406 | 9262: Hospital porters
407 | 9263: Kitchen and catering assistants
408 | 9264: Waiters and waitresses
409 | 9265: Bar staff
410 | 9266: Coffee shop workers
411 | 9267: Leisure and theme park attendants
412 | 9269: Other elementary services occupations n.e.c.
413 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/example_data/toy_index.txt:
--------------------------------------------------------------------------------
1 | 01: Cat
2 | 02: Dog
3 | 03: Fish
4 | 05: Mouse
5 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/llm.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import numpy as np
3 | from functools import lru_cache
4 | from collections import defaultdict
5 | from typing import Optional, Union
6 | from langchain.chains.llm import LLMChain
7 | from langchain.output_parsers import PydanticOutputParser
8 | from langchain_google_vertexai import VertexAI
9 | from langchain_openai import ChatOpenAI
10 |
11 | from sic_soc_llm import get_config
12 | from sic_soc_llm.prompt import (
13 | SOC_PROMPT_PYDANTIC,
14 | SIC_PROMPT_PYDANTIC,
15 | SIC_PROMPT_RAG,
16 | GENERAL_PROMPT_RAG,
17 | )
18 | from sic_soc_llm.data_models.response_model import SocResponse, SicResponse, RagResponse
19 | from sic_soc_llm.embedding import EmbeddingHandler
20 | from sic_soc_llm.data_models.sicDB import sic_meta
21 | from sic_soc_llm.data_models.sic_data_access import load_sic_index, load_sic_structure
22 | from sic_soc_llm.data_models.sic_hierarchy import load_hierarchy
23 |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Package configuration, loaded once at import time. It supplies the default
# LLM model name (`config["llm"]`) and the SIC lookup entries (`config["lookups"]`).
config = get_config()
26 |
27 |
28 | class ClassificationLLM:
29 | """
30 | Wraps the logic for using an LLM to classify respondent's data
31 | based on provided index. Includes direct (one-shot) generative llm
32 | method and Retrieval Augmented Generation (RAG).
33 |
34 | Args:
35 | model_name (str): Name of the model. Defaults to the value in the `config` file.
36 | Used if no LLM object is passed.
37 | llm (LLM): LLM to use. Optional.
38 | embedding_handler (EmbeddingHandler): Embedding handler. Optional.
39 | If None a default embedding handler is retrieved based on config file.
40 | max_tokens (int): Maximum number of tokens to generate. Defaults to 1600.
41 | temperature (float): Temperature of the LLM model. Defaults to 0.0.
42 | verbose (bool): Whether to print verbose output. Defaults to False.
43 | openai_api_key (str): OpenAI API key. Optional, but needed for OpenAI models.
44 | """
45 |
def __init__(
    self,
    model_name: str = config["llm"]["llm_model_name"],
    llm: Optional[Union[VertexAI, ChatOpenAI]] = None,
    embedding_handler: Optional[EmbeddingHandler] = None,
    max_tokens: int = 1600,
    temperature: float = 0.0,
    verbose: bool = False,
    openai_api_key: Optional[str] = None,
):
    """
    Set up the LLM client, prompts and state used for classification.

    A caller-supplied `llm` is used as-is. Otherwise a client is built from
    the `model_name` prefix: `text-` or `gemini` selects VertexAI, `gpt`
    selects ChatOpenAI.

    Raises:
        NotImplementedError: If a `gpt` model is requested without an
            `openai_api_key`, or if `model_name` matches no supported prefix.
    """
    if llm is None:
        # No ready-made client was passed in: build one for the model family.
        if model_name.startswith(("text-", "gemini")):
            llm = VertexAI(
                model_name=model_name,
                max_output_tokens=max_tokens,
                temperature=temperature,
                location="europe-west2",
            )
        elif model_name.startswith("gpt"):
            if openai_api_key is None:
                raise NotImplementedError("Need to provide an OpenAI API key")
            llm = ChatOpenAI(
                model=model_name,
                openai_api_key=openai_api_key,
                temperature=temperature,
                max_tokens=max_tokens,
            )
        else:
            raise NotImplementedError("Unsupported model family")
    self.llm = llm

    # Behaviour flag and optional / lazily filled state.
    self.verbose = verbose
    self.embed = embedding_handler  # may be None at this point
    self.sic = None  # populated on first use by `_prompt_candidate`

    # Prompt templates and SIC metadata shared by the classification methods.
    self.sic_meta = sic_meta
    self.soc_prompt = SOC_PROMPT_PYDANTIC
    self.sic_prompt = SIC_PROMPT_PYDANTIC
    self.sic_prompt_rag = SIC_PROMPT_RAG
    self.general_prompt_rag = GENERAL_PROMPT_RAG
89 |
def _load_embedding_handler(self):
    """
    Create a default `EmbeddingHandler` and store it on `self.embed`.

    The handler is built with no arguments and is expected to already hold
    a populated vector store.

    Raises:
        ValueError: If the new handler reports an index size of zero.
            An index has to be embedded before the handler can be used.
    """
    # The message literals below are runtime text and are kept exactly as
    # they appear in the source, including the embedded line break.
    logger.info(
        """Loading default embedding handler according to 'config' file.
Expecting existing & populated persistent vector store."""
    )
    handler = EmbeddingHandler()
    # Stored before validation, so `self.embed` is set even if the check fails.
    self.embed = handler
    if handler._index_size != 0:
        return
    raise ValueError(
        """The retrieved embedding handler has an empty vector store.
Please embed an index before using in the ClassificationLLM."""
    )
109 |
@lru_cache
def get_soc_code(
    self,
    job_title: str,
    job_description: str,
    level_of_education: str,
    manage_others: bool,
    industry_descr: str,
) -> SocResponse:
    """
    Classify a respondent's job against SOC with a single LLM call.

    The respondent's answers are substituted into `self.soc_prompt`, the
    chain is invoked once and its text output is parsed into a
    `SocResponse`. Results are memoised by `lru_cache`, keyed on `self`
    together with all arguments.

    Args:
        job_title (str): The title of the job.
        job_description (str): The description of the job.
        level_of_education (str): The level of education required for the job.
        manage_others (bool): Indicates whether the job involves managing others.
        industry_descr (str): The description of the industry.

    Returns:
        SocResponse: The parsed model answer. If parsing fails, a response
            with `codable=False`, no candidates, and the parse error plus
            the raw model text recorded in `reasoning`.
    """
    prompt_inputs = {
        "job_title": job_title,
        "job_description": job_description,
        "level_of_education": level_of_education,
        "manage_others": manage_others,
        "industry_descr": industry_descr,
    }
    response = LLMChain(llm=self.llm, prompt=self.soc_prompt).invoke(
        prompt_inputs,
        return_only_outputs=True,
    )
    if self.verbose:
        logger.debug(f"{response=}")

    # Single parse attempt: any failure is logged and converted into a
    # non-codable response instead of being raised to the caller.
    soc_parser = PydanticOutputParser(pydantic_object=SocResponse)
    try:
        return soc_parser.parse(response["text"])
    except Exception as parse_error:
        logger.error(f"Unable to parse llm response: {str(parse_error)}")
        reasoning = (
            f'ERROR parse_error=<{parse_error}>, response=<{response["text"]}>'
        )
        return SocResponse(codable=False, soc_candidates=[], reasoning=reasoning)
164 |
@lru_cache
def get_sic_code(
    self,
    industry_descr: str,
    job_title: str,
    job_description: str,
) -> SicResponse:
    """
    Classify a respondent's industry against SIC with a single LLM call.

    The respondent's answers are substituted into `self.sic_prompt`, the
    chain is invoked once and its text output is parsed into a
    `SicResponse`. Results are memoised by `lru_cache`, keyed on `self`
    together with all arguments.

    Args:
        industry_descr (str): Description of the industry.
        job_title (str): Title of the job.
        job_description (str): Description of the job.

    Returns:
        SicResponse: The parsed model answer. If parsing raises a
            `ValueError`, a response with `codable=False`, no candidates,
            and the parse error plus the raw model text in `reasoning`.
    """
    chain = LLMChain(llm=self.llm, prompt=self.sic_prompt)
    response = chain.invoke(
        {
            "industry_descr": industry_descr,
            "job_title": job_title,
            "job_description": job_description,
        },
        return_only_outputs=True,
    )
    if self.verbose:
        logger.debug(f"{response=}")

    # Parse the output into the desired format. There is exactly one
    # attempt: the previous "Retrying ..." debug message was removed because
    # no retry is performed and it misrepresented the control flow in logs.
    # NOTE(review): only ValueError is handled here, whereas get_soc_code
    # catches Exception; other error types still propagate to the caller.
    parser = PydanticOutputParser(pydantic_object=SicResponse)
    try:
        validated_answer = parser.parse(response["text"])
    except ValueError as parse_error:
        logger.error(f"Unable to parse llm response: {str(parse_error)}")
        reasoning = (
            f'ERROR parse_error=<{parse_error}>, response=<{response["text"]}>'
        )
        # Fall back to an explicit "not codable" answer that carries the
        # diagnostics, so callers always receive a SicResponse.
        validated_answer = SicResponse(
            codable=False,
            sic_candidates=[],
            reasoning=reasoning,
        )

    return validated_answer
216 |
217 | def _prompt_candidate(
218 | self, code: str, activities: list[str], include_all: bool = False
219 | ) -> str:
220 | """Reformat the candidate activities for the prompt.
221 |
222 | Args:
223 | code (str): The code for the item.
224 | activities (list[str]): The list of example activities.
225 | include_all (bool, optional): Whether to include all the sic metadata.
226 |
227 | Returns:
228 | str: A formatted string containing the code, title, and example activities.
229 | """
230 | if self.sic is None:
231 | sic_index_df = load_sic_index(config["lookups"]["sic_index"])
232 | sic_df = load_sic_structure(config["lookups"]["sic_structure"])
233 | self.sic = load_hierarchy(sic_df, sic_index_df)
234 |
235 | item = self.sic[code]
236 | txt = "{" + f"Code: {item.numeric_string_padded()}, Title: {item.description}"
237 | txt += f", Example activities: {', '.join(activities)}"
238 | if include_all:
239 | if item.sic_meta.detail:
240 | txt += f", Details: {item.sic_meta.detail}"
241 | if item.sic_meta.includes:
242 | txt += f", Includes: {', '.join(item.sic_meta.includes)}"
243 | if item.sic_meta.excludes:
244 | txt += f", Excludes: {', '.join(item.sic_meta.excludes)}"
245 | return txt + "}"
246 |
247 | def _prompt_candidate_list(
248 | self,
249 | short_list: list[dict],
250 | chars_limit: int = 14000,
251 | candidates_limit: int = 5,
252 | activities_limit: int = 3,
253 | code_digits: int = 5,
254 | ) -> str:
255 | """Create candidate list for the prompt based on the given parameters.
256 |
257 | This method takes a structured list of candidates and generates a short
258 | string list based on the provided parameters. It filters the candidates
259 | based on the code digits and activities limit, and shortens the list to
260 | fit the character limit.
261 |
262 | Args:
263 | short_list (list[dict]): A list of candidate dictionaries.
264 | chars_limit (int, optional): The character limit for the generated
265 | prompt. Defaults to 14000.
266 | candidates_limit (int, optional): The maximum number of candidates
267 | to include in the prompt. Defaults to 5.
268 | activities_limit (int, optional): The maximum number of activities
269 | to include for each code. Defaults to 3.
270 | code_digits (int, optional): The number of digits to consider from
271 | the code for filtering candidates. Defaults to 5.
272 |
273 | Returns:
274 | str: The generated candidate list for the prompt.
275 | """
276 | a = defaultdict(list)
277 | for item in short_list:
278 | if item["title"] not in a[item["code"][:code_digits]]:
279 | a[item["code"][:code_digits]].append(item["title"])
280 |
281 | sic_candidates = [
282 | self._prompt_candidate(code, activities[:activities_limit])
283 | for code, activities in a.items()
284 | ][:candidates_limit]
285 |
286 | if chars_limit:
287 | chars_count = np.cumsum([len(x) for x in sic_candidates])
288 | nn = sum([x <= chars_limit for x in chars_count])
289 | if nn < len(sic_candidates):
290 | logger.warning(
291 | "Shortening list of candidates to fit token limit "
292 | + f"from {len(sic_candidates)} to {nn}"
293 | )
294 | sic_candidates = sic_candidates[:nn]
295 |
296 | return "\n".join(sic_candidates)
297 |
def rag_sic_code(  # noqa: C901
    self,
    industry_descr: str,
    job_title: str = None,
    job_description: str = None,
    expand_search_terms: bool = True,
    code_digits: int = 5,
    candidates_limit: int = 5,
) -> tuple:
    """
    Generates a SIC classification based on respondent's data using RAG approach.

    Args:
        industry_descr (str): The description of the industry.
        job_title (str, optional): The job title. Defaults to None.
        job_description (str, optional): The job description. Defaults to None.
        expand_search_terms (bool, optional): Whether to expand the search terms
            to include job title and description. Defaults to True.
        code_digits (int, optional): The number of digits in the generated
            SIC code. Defaults to 5.
        candidates_limit (int, optional): The maximum number of SIC code candidates
            to consider. Defaults to 5.

    Returns:
        tuple: ``(response, short_list, call_dict)`` where ``response`` is a
            ``SicResponse``, ``short_list`` is the search result used to build
            the candidate list and ``call_dict`` holds the values passed to
            the prompt. ``short_list`` and ``call_dict`` are ``None`` when the
            embedding handler could not be loaded.

    Note:
        ``ValueError`` raised while loading the embedding handler, invoking
        the chain or parsing the reply is caught and logged; the returned
        ``SicResponse`` then has ``codable=False`` and explains the failure
        in ``reasoning``.
    """

    def or_unknown(value):
        # Missing or blank optional answers are presented to the LLM as "Unknown"
        is_missing = value is None or (
            isinstance(value, str) and not value.strip()
        )
        return "Unknown" if is_missing else value

    def prep_call_dict(industry_descr, job_title, job_description, sic_codes):
        # Helper function to prepare the call dictionary
        return {
            "industry_descr": industry_descr,
            "job_title": or_unknown(job_title),
            "job_description": or_unknown(job_description),
            "sic_index": sic_codes,
        }

    if self.embed is None:
        try:
            self._load_embedding_handler()
        except ValueError as err:
            logger.exception(err)
            logger.warning("Error: Empty embedding vector store, exit early")
            validated_answer = SicResponse(
                codable=False,
                sic_candidates=[],
                reasoning="Error, Empty embedding vector store, exit early",
            )
            return validated_answer, None, None

    # Retrieve relevant SIC codes and format them for prompt
    if expand_search_terms:
        short_list = self.embed.search_index_multi(
            query=[industry_descr, job_title, job_description]
        )
    else:
        short_list = self.embed.search_index(query=industry_descr)

    sic_codes = self._prompt_candidate_list(
        short_list, code_digits=code_digits, candidates_limit=candidates_limit
    )

    call_dict = prep_call_dict(
        industry_descr=industry_descr,
        job_title=job_title,
        job_description=job_description,
        sic_codes=sic_codes,
    )

    if self.verbose:
        final_prompt = self.sic_prompt_rag.format(**call_dict)
        logger.debug(final_prompt)

    chain = LLMChain(llm=self.llm, prompt=self.sic_prompt_rag)

    try:
        response = chain.invoke(call_dict, return_only_outputs=True)
    except ValueError as err:
        logger.exception(err)
        logger.warning("Error from LLMChain, exit early")
        validated_answer = SicResponse(
            codable=False,
            sic_candidates=[],
            reasoning="Error from LLMChain, exit early",
        )
        return validated_answer, short_list, call_dict

    if self.verbose:
        logger.debug(f"{response=}")

    # Parse the output to the desired format
    parser = PydanticOutputParser(pydantic_object=SicResponse)
    try:
        validated_answer = parser.parse(response["text"])
    except ValueError as parse_error:
        logger.exception(parse_error)
        logger.warning(f"Failed to parse response:\n{response['text']}")

        # Keep the error and the raw reply available to the caller
        reasoning = (
            f'ERROR parse_error=<{parse_error}>, response=<{response["text"]}>'
        )
        validated_answer = SicResponse(
            codable=False,
            sic_candidates=[],
            reasoning=reasoning,
        )

    return validated_answer, short_list, call_dict
423 |
def rag_general_code(  # noqa: C901
    self,
    respondent_data: dict,
    candidates_limit: int = 7,
) -> tuple:
    """
    Generates a classification answer based on respondent's data
    using RAG and custom index.

    Args:
        respondent_data (dict): A dictionary containing respondent data.
        candidates_limit (int, optional): The maximum number of candidate
            codes to consider. Defaults to 7.

    Returns:
        tuple: ``(response, short_list)`` where ``response`` is a
            ``RagResponse`` and ``short_list`` is the full search result the
            candidates were taken from (``None`` when the embedding handler
            could not be loaded).

    Note:
        ``ValueError`` raised while loading the embedding handler, invoking
        the chain or parsing the reply is caught and logged; the returned
        ``RagResponse`` then has ``codable=False`` and explains the failure
        in ``reasoning``.
    """

    if self.embed is None:
        try:
            self._load_embedding_handler()
        except ValueError as err:
            logger.exception(err)
            logger.warning("Error: Empty embedding vector store, exit early")
            validated_answer = RagResponse(
                codable=False,
                alt_candidates=[],
                reasoning="Error: Empty embedding vector store, exit early",
            )
            return validated_answer, None

    # Retrieve relevant candidate codes, using every respondent answer as a query
    short_list = self.embed.search_index_multi(
        query=list(respondent_data.values())
    )

    # One "{Code: ..., Description: ...}" entry per line
    candidate_codes = (
        "{"
        + "},\n{".join(
            "Code: " + x["code"] + ", Description: " + x["title"]
            for x in short_list[:candidates_limit]
        )
        + "}"
    )

    call_dict = {
        "respondent_data": str(respondent_data),
        "classification_index": candidate_codes,
    }

    if self.verbose:
        final_prompt = self.general_prompt_rag.format(**call_dict)
        logger.debug(final_prompt)

    chain = LLMChain(llm=self.llm, prompt=self.general_prompt_rag)

    try:
        response = chain.invoke(call_dict, return_only_outputs=True)
    except ValueError as err:
        logger.exception(err)
        logger.warning("Error from LLMChain, exit early")
        validated_answer = RagResponse(
            codable=False,
            alt_candidates=[],
            reasoning="Error from LLMChain, exit early",
        )
        return validated_answer, short_list

    if self.verbose:
        logger.debug(f"llm_response={response}")

    # Parse the output to desired format
    parser = PydanticOutputParser(pydantic_object=RagResponse)
    try:
        validated_answer = parser.parse(response["text"])
    except ValueError as parse_error:
        logger.exception(parse_error)
        logger.warning(f"Failed to parse response:\n{response['text']}")

        # Keep the error and the raw reply available to the caller
        reasoning = (
            f'ERROR parse_error=<{parse_error}>, response=<{response["text"]}>'
        )
        validated_answer = RagResponse(
            codable=False,
            alt_candidates=[],
            reasoning=reasoning,
        )

    return validated_answer, short_list
522 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/logs.py:
--------------------------------------------------------------------------------
1 | """Provides logging for the project.
2 |
3 | Used to set up file and console based loggers.
4 | Typically called from an entry point or external script.
5 |
6 | Typical usage:
7 |
8 | ```
9 | logger = logs.setup_logging("some_script_name")
10 | ```
11 | This will create a separate log file for the `some_script_name`.
12 | """
13 |
14 | import datetime
15 | import logging
16 | from typing import Union
17 | from pathlib import Path
18 |
LOG_FORMAT = "%(asctime)s - %(filename)s:%(lineno)d - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
MODULE_NAME = "sic_soc_llm"
EXTRA_MODULE_NAME = "server"
LOG_LEVEL = logging.DEBUG
# Evaluated once at import time: a process running past midnight keeps using
# the file named after the day it started.
DATE_STRING = f"{datetime.datetime.now().date()}"

# Logging can be used independently of configuration
LOG_DIR = Path.home() / "logs"


def setup_logging(
    script_name: Union[str, None] = None, log_dir: Union[Path, str] = LOG_DIR
) -> logging.Logger:
    """Set up console and file logging.

    This will create a directory to log to if it doesn't already exist.
    If the directory or the log file cannot be created, a warning is logged
    and only console logging is configured.

    Safe to call in interactive environments without duplicating the logging.

    Logs on the same day will append to the same file for the same script_name.

    Args:
        script_name (str): Used in the filename for the logs.
        log_dir (Path or str): Directory to store logs in. Defaults to "~/logs".

    Returns:
        Logger object with handlers set up.
    """

    logger = logging.getLogger(MODULE_NAME)
    logger.setLevel(logging.DEBUG)

    other_logger = logging.getLogger(EXTRA_MODULE_NAME)
    other_logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter(LOG_FORMAT, DATE_FORMAT)

    # In case this is called twice check whether a handler is already registered
    if not logger.handlers:
        ch = logging.StreamHandler()
        ch.setLevel(LOG_LEVEL)
        ch.setFormatter(formatter)
        logger.addHandler(ch)
        other_logger.addHandler(ch)

    log_dir = Path(log_dir)
    if script_name is None:
        log_file = log_dir / f"{MODULE_NAME}.{DATE_STRING}.log"
    else:
        log_file = log_dir / f"{MODULE_NAME}_{script_name}.{DATE_STRING}.log"

    try:
        # Directory creation sits inside the try so that any filesystem
        # failure (missing parent, permissions, path is a file) degrades to
        # console-only logging instead of raising.
        log_dir.mkdir(parents=True, exist_ok=True)

        # Exactly one handler means only the console handler is registered
        if len(logger.handlers) == 1:
            fh = logging.FileHandler(log_file)

            fh.setFormatter(formatter)
            fh.setLevel(LOG_LEVEL)
            logger.addHandler(fh)
            other_logger.addHandler(fh)

    except OSError:
        logger.warning("Console logging only")

    return logger
87 |
--------------------------------------------------------------------------------
/src/sic_soc_llm/prompt.py:
--------------------------------------------------------------------------------
1 | from langchain.prompts.prompt import PromptTemplate
2 | from langchain.output_parsers import PydanticOutputParser
3 | from sic_soc_llm.data_models.response_model import SocResponse, SicResponse, RagResponse
4 | from sic_soc_llm import get_config
5 |
config = get_config()

# Shared preamble that every task-specific template is appended to
_core_prompt = """You are a conscientious classification assistant of respondent data
for the use in the UK official statistics. Respondent data may be in English or Welsh,
but you always respond in British English."""

# SOC task: respondent fields are filled per call; the output format
# instructions and the SOC index are bound once below as partial variables.
_soc_template = """Given the respondent data (that may include all or some of
job title, job description, level of education, line management responsibilities,
and company's main activity) your task is to determine
the UK SOC (Standard Occupational Classification) code for this job if it can be
determined. If the code cannot be determined, identify the additional information
needed to determine it. Make sure to use the provided 2020 SOC index.

===Respondent Data===
- Job Title: {job_title}
- Job Description: {job_description}
- Level of Education: {level_of_education}
- Line Management Responsibilities: {manage_others}
- Company's main activity: {industry_descr}

===Output Format===
{format_instructions}

===2020 SOC Index===
{soc_index}
"""

# Explicit encoding so the index reads the same regardless of platform default
with open(config["lookups"]["soc_condensed"], encoding="utf-8") as f:
    soc_index = f.read()


parser = PydanticOutputParser(pydantic_object=SocResponse)

SOC_PROMPT_PYDANTIC = PromptTemplate.from_template(
    # Blank line keeps the preamble and the task description apart
    template=_core_prompt + "\n\n" + _soc_template,
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
        "soc_index": soc_index,
    },
)
46 |
47 | # TODO include SIC/SOC definitions, coding guidance, the concept/question phrasing?
48 |
49 |
# SIC task using the whole condensed index: respondent fields are filled per
# call; the output format instructions and the SIC index are bound once below.
_sic_template = """Given the respondent's description of the main activity their
company does, their job title and job description, your task is to determine
the UK SIC (Standard Industrial Classification) code for this company if it can be
determined to the division (two-digit) level. If the code cannot be determined,
identify the additional information needed to determine it.
Make sure to use the provided 2007 SIC Index.

===Respondent Data===
- Company's main activity: {industry_descr}
- Job Title: {job_title}
- Job Description: {job_description}

===Output Format===
{format_instructions}

===2007 SIC Index===
{sic_index}
"""

# Explicit encoding so the index reads the same regardless of platform default
with open(config["lookups"]["sic_condensed"], encoding="utf-8") as f:
    sic_index = f.read()


parser = PydanticOutputParser(pydantic_object=SicResponse)

SIC_PROMPT_PYDANTIC = PromptTemplate.from_template(
    # Blank line keeps the preamble and the task description apart
    template=_core_prompt + "\n\n" + _sic_template,
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
        "sic_index": sic_index,
    },
)
82 |
83 |
# SIC task using retrieval: the relevant subset of the index ({sic_index}) is
# supplied per call together with the respondent fields.
_sic_template_rag = """Given the respondent's description of the main activity their
company does, their job title and job description (which may be different than the
main company activity), your task is to determine the UK SIC (Standard Industrial
Classification) code for this company if it can be determined.
Make sure to use the provided Relevant subset of UK SIC 2007. If the code cannot be
determined (or is likely not included in the provided subset), identify the additional
information needed to determine it and a list of most likely codes.

===Respondent Data===
- Company's main activity: {industry_descr}
- Job Title: {job_title}
- Job Description: {job_description}

===Relevant subset of UK SIC 2007===
{sic_index}

===Output Format===
{format_instructions}

===Output===
"""

# Bind the SicResponse parser explicitly rather than relying on whichever
# parser happens to be assigned to ``parser`` earlier in the module.
parser = PydanticOutputParser(pydantic_object=SicResponse)

SIC_PROMPT_RAG = PromptTemplate.from_template(
    # Blank line keeps the preamble and the task description apart
    template=_core_prompt + "\n\n" + _sic_template_rag,
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
    },
)
112 |
# Generic classification task using retrieval: the respondent data and the
# relevant subset of the index are both supplied per call.
_general_template_rag = """Given the respondent's data, your task is to determine
the classification code. Make sure to use the provided Relevant subset of
classification index and select codes from this list only.
If the code cannot be determined (or not included in the provided subset),
do not provide final code, instead identify the additional information needed
to determine the correct code and suggest a few most likely codes.

===Respondent Data===
{respondent_data}

===Relevant subset of classification index===
{classification_index}

===Output Format===
{format_instructions}

===Output===
"""
parser = PydanticOutputParser(pydantic_object=RagResponse)

GENERAL_PROMPT_RAG = PromptTemplate.from_template(
    # Blank line keeps the preamble and the task description apart
    template=_core_prompt + "\n\n" + _general_template_rag,
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
    },
)
139 |
140 |
class PromptTemplates:
    """Container exposing the module's prompt templates as instance attributes.

    Every attribute refers to the module-level template of the same name, so
    all instances share the same template objects.

    Attributes:
        SOC_PROMPT_PYDANTIC (PromptTemplate): SOC classification prompt with
            the condensed SOC index embedded.
        SIC_PROMPT_PYDANTIC (PromptTemplate): SIC classification prompt with
            the condensed SIC index embedded.
        SIC_PROMPT_RAG (PromptTemplate): SIC classification prompt that
            expects a relevant subset of SIC codes to be supplied per call.
        GENERAL_PROMPT_RAG (PromptTemplate): Prompt for custom classification
            codes that expects a relevant subset of codes to be supplied
            per call.
    """

    def __init__(self):
        # Full-index prompts
        self.SOC_PROMPT_PYDANTIC = SOC_PROMPT_PYDANTIC
        self.SIC_PROMPT_PYDANTIC = SIC_PROMPT_PYDANTIC

        # Retrieval-augmented prompts
        self.SIC_PROMPT_RAG = SIC_PROMPT_RAG
        self.GENERAL_PROMPT_RAG = GENERAL_PROMPT_RAG
165 |
--------------------------------------------------------------------------------
/tests/test_classification_llm.py:
--------------------------------------------------------------------------------
1 | # %%
2 | import unittest
3 | import tempfile
4 | import os
5 | from pathlib import Path
6 | from sic_soc_llm import setup_logging, check_file_exists
7 | from sic_soc_llm.llm import ClassificationLLM
8 | from sic_soc_llm.embedding import EmbeddingHandler
9 | from langchain.llms.fake import FakeListLLM
10 | from sic_soc_llm.data_models.response_model import SocResponse, SicResponse, RagResponse
11 |
12 | # %%
# Fake LLM shared by the tests below; it is configured with a single canned
# free-text reply, so no real model is called.
# NOTE(review): presumably this reply fails Pydantic parsing so the fallback
# responses are exercised - confirm against the parser behaviour.
FAKE_LLM = FakeListLLM(responses=["something not structured"])
14 |
15 |
16 | # %%
class TestClassificationLLM(unittest.TestCase):
    """Exercise ``ClassificationLLM`` with a fake LLM in a temporary working dir."""

    def setUp(self):
        # Create a temporary testing directory and make it the working directory
        self.cwd = os.getcwd()
        self.temp_dir = tempfile.TemporaryDirectory()
        os.chdir(self.temp_dir.name)
        self.llm = ClassificationLLM(llm=FAKE_LLM, verbose=True)

    def tearDown(self):
        # Leave the temporary directory before deleting it: removing the
        # current working directory is not possible on every platform.
        os.chdir(self.cwd)
        self.temp_dir.cleanup()

    def test_logging_setup(self):
        # check new file exists in tmp log dir
        logger = setup_logging(log_dir=Path(self.temp_dir.name) / "logs")
        logger.info("TestClassificationLLM: Setting up test")
        log_files = list((Path(self.temp_dir.name) / "logs").iterdir())
        self.assertEqual(len(log_files), 1)

    def test_classification_llm_initialised(self):
        # Test if the ClassificationLLM instance is initialized correctly
        self.assertIsInstance(self.llm, ClassificationLLM)

    def test_prompt_candidate_list_empty(self):
        # An empty short list must produce an empty candidate string
        out = self.llm._prompt_candidate_list([])
        self.assertEqual(out, "")

    def test_get_soc_code(self):
        # Test if the SOC code is returned correctly
        soc_code = self.llm.get_soc_code(
            job_title="science teacher",
            job_description="",
            manage_others=False,
            level_of_education="Other",
            industry_descr="",
        )
        self.assertIsInstance(soc_code, SocResponse)

    def test_get_sic_code(self):
        # Test if the SIC code is returned correctly
        sic_code = self.llm.get_sic_code(
            industry_descr="secondary school",
            job_title="teacher",
            job_description="",
        )
        self.assertIsInstance(sic_code, SicResponse)

    def test_sic_empty_embed_error(self):
        # The first element of the returned tuple carries the error reasoning
        resp = self.llm.rag_sic_code(industry_descr="secondary school")
        self.assertTrue(resp[0].reasoning.startswith("Error"))

    def test_rag_empty_embed_error(self):
        # The first element of the returned tuple carries the error reasoning
        resp = self.llm.rag_general_code(respondent_data={"descr": "school"})
        self.assertTrue(resp[0].reasoning.startswith("Error"))
74 |
75 |
76 | # %%
class TestGeneralRAG(unittest.TestCase):
    """Run the general RAG classification against the embedded toy index."""

    def setUp(self):
        # Build a handler with no db directory and embed the toy index into it
        handler = EmbeddingHandler(db_dir=None)
        index_path = check_file_exists("toy_index.txt")
        with open(index_path, "r") as index_file:
            handler.embed_index(from_empty=True, file_object=index_file)

        self.llm = ClassificationLLM(
            llm=FAKE_LLM, embedding_handler=handler, verbose=True
        )

    def tearDown(self):
        # Nothing to clean up
        pass

    def test_rag_general_code(self):
        # The call returns the response together with the search short list
        result = self.llm.rag_general_code(
            respondent_data={"characteristics": "gills"}
        )
        rag_code, shortlist = result
        self.assertIsInstance(rag_code, RagResponse)
        assert len(shortlist) == 4
97 |
--------------------------------------------------------------------------------
/tests/test_embedding.py:
--------------------------------------------------------------------------------
1 | # %%
2 | import unittest
3 | from sic_soc_llm import check_file_exists
4 | from sic_soc_llm.embedding import EmbeddingHandler
5 |
6 |
7 | # %%
class TestEmbeddingHandlerToy(unittest.TestCase):
    """Check ``EmbeddingHandler`` indexing and search on the toy index."""

    def setUp(self):
        # Embed the toy index from scratch into a handler with no db directory
        self.embedding_handler = EmbeddingHandler(db_dir=None)
        index_path = check_file_exists("toy_index.txt")
        with open(index_path, "r") as index_file:
            self.embedding_handler.embed_index(
                from_empty=True, file_object=index_file
            )

    def tearDown(self):
        # Nothing to clean up
        pass

    def test_embed_index_with_file_object(self):
        # Count number of entries
        index_size = self.embedding_handler._index_size
        assert index_size == 4

    def test_search_index(self):
        # Test searching index with a query
        hits = self.embedding_handler.search_index("mens best friend")
        top_hit = hits[0]
        assert top_hit["code"] == "02"

    def test_search_index_multi(self):
        # Test searching index with multiple queries
        hits = self.embedding_handler.search_index_multi(
            ["has gills", "has scales"]
        )
        assert len(hits) == 8
33 |
--------------------------------------------------------------------------------
/tests/test_sic_data_structure.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from sic_soc_llm.data_models import sic_hierarchy
4 |
5 |
def test_sic_code_alpha_code_is_string_else_error():
    """A non-string alpha code is rejected with ``TypeError``."""
    non_string_code = 123123

    with pytest.raises(TypeError):
        sic_hierarchy.SicCode(non_string_code)
9 |
10 |
def test_sic_code_alpha_code_starts_with_letter_else_error():
    """An alpha code made of digits only is rejected with ``ValueError``."""
    digits_only_code = "123123"

    with pytest.raises(ValueError):
        sic_hierarchy.SicCode(digits_only_code)
14 |
15 |
def test_sic_code_alpha_code_starts_with_uppercase_else_error():
    """An alpha code with a lowercase leading letter is rejected."""
    lowercase_code = "a12312"

    with pytest.raises(ValueError):
        sic_hierarchy.SicCode(lowercase_code)
19 |
20 |
def test_sic_code_alpha_code_short_raises_error():
    """An alpha code that is too short is rejected with ``ValueError``."""
    # Uppercase section letter so that the length is the only thing under
    # test; a lowercase letter is already covered by the uppercase test.
    with pytest.raises(ValueError):
        sic_hierarchy.SicCode("A123")
24 |
25 |
def test_sic_code_alpha_code_long_raises_error():
    """An alpha code that is too long is rejected with ``ValueError``."""
    # Uppercase section letter so that the length is the only thing under
    # test; a lowercase letter is already covered by the uppercase test.
    with pytest.raises(ValueError):
        sic_hierarchy.SicCode("A123123")
29 |
30 |
@pytest.mark.parametrize(
    "code,expected_digits",
    [
        ("Axxxxx", 1),
        ("A12xxx", 2),
        ("A123xx", 3),
        ("A1234x", 4),
        ("A12345", 5),
    ],
)
def test_sic_code_alpha_code_digits_parsed(code, expected_digits):
    """``n_digits`` matches the expected value for each alpha code."""
    parsed = sic_hierarchy.SicCode(code)

    assert parsed.n_digits == expected_digits
50 |
51 |
@pytest.mark.parametrize(
    "code,expected_level_name",
    [
        ("Axxxxx", "section"),
        ("A12xxx", "division"),
        ("A123xx", "group"),
        ("A1234x", "class"),
        ("A12345", "subclass"),
    ],
)
def test_sic_code_alpha_code_levels_correct(code, expected_level_name):
    """``level_name`` matches the expected hierarchy level for each alpha code."""
    parsed = sic_hierarchy.SicCode(code)

    assert parsed.level_name == expected_level_name
71 |
72 |
@pytest.mark.parametrize(
    "code,expected_formatted_code",
    [
        ("Axxxxx", "A"),
        ("A12xxx", "12"),
        ("A123xx", "12.3"),
        ("A1234x", "12.34"),
        ("A12345", "12.34/5"),
    ],
)
def test_sic_code_alpha_code_readable_code_correct(code, expected_formatted_code):
    """``str()`` of a code gives the expected human-readable form."""
    parsed = sic_hierarchy.SicCode(code)

    assert str(parsed) == expected_formatted_code
92 |
93 |
def test_sic_code_alpha_single_digit_raises_error():
    """An alpha code carrying a single digit is rejected with ``ValueError``."""
    single_digit_code = "A1xxxx"

    with pytest.raises(ValueError):
        sic_hierarchy.SicCode(single_digit_code)
97 |
98 |
@pytest.mark.parametrize(
    "section,code,level,expected_formatted_code",
    [
        ("A", "A", "section", "A"),
        ("A", "12", "division", "12"),
        ("A", "123", "group", "12.3"),
        ("A", "1234", "class", "12.34"),
        ("A", "12340", "class", "12.34"),
        ("A", "12345", "subclass", "12.34/5"),
    ],
)
def test_sic_code_from_section_code_level_valid_cases(
    section, code, level, expected_formatted_code
):
    """Valid section/code/level triples build a code with the expected string form."""
    built = sic_hierarchy.SicCode.from_section_code_level(section, code, level)

    assert str(built) == expected_formatted_code
118 |
119 |
def test_sic_code_from_section_code_level_invalid_class():
    """A five-digit code ending in a non-zero digit is rejected at class level."""
    with pytest.raises(ValueError):
        sic_hierarchy.SicCode.from_section_code_level("A", "12341", "class")
128 |
129 |
@pytest.mark.parametrize(
    "section,code,level",
    [
        # section-style code with a non-section level
        ("A", "A", "division"),
        ("A", "A", "group"),
        ("A", "A", "class"),
        ("A", "A", "subclass"),
        # two-character code with a non-division level
        ("A", "12", "section"),
        ("A", "12", "group"),
        ("A", "12", "class"),
        ("A", "12", "subclass"),
        # three-character code with a non-group level
        ("A", "123", "section"),
        ("A", "123", "division"),
        ("A", "123", "class"),
        ("A", "123", "subclass"),
        # four-character code with a non-class level
        ("A", "1234", "section"),
        ("A", "1234", "division"),
        ("A", "1234", "group"),
        ("A", "1234", "subclass"),
        # five-character code ending in zero with a level above class
        ("A", "12340", "section"),
        ("A", "12340", "division"),
        ("A", "12340", "group"),
        # five-character code with a non-subclass level
        ("A", "12345", "section"),
        ("A", "12345", "division"),
        ("A", "12345", "group"),
        ("A", "12345", "class"),
    ],
)
def test_sic_code_from_section_code_level_invalid_levels_raise_error(
    section, code, level
):
    """Each listed code/level combination is rejected with ``ValueError``."""
    build = sic_hierarchy.SicCode.from_section_code_level

    with pytest.raises(ValueError):
        build(section, code, level)
163 |
164 |
def test_sic_code_from_section_code_level_invalid_section_code_raises_error():
    """A section-level code that differs from the section letter is rejected."""
    with pytest.raises(ValueError):
        sic_hierarchy.SicCode.from_section_code_level("A", "B", "section")
173 |
174 |
# Inputs carry HTML entities that ``_clean_text`` is expected to unescape.
# NOTE(review): the entity placement in the "mixed" case is reconstructed from
# the expected output - confirm against the upstream test.
@pytest.mark.parametrize(
    "text,expected",
    [
        ("&quot;", '"'),
        ("some text &quot;here&quot;", 'some text "here"'),
        ('mixed &quot;some&quot; text "here"', 'mixed "some" text "here"'),
    ],
)
def test_clean_text_with_html_unescapes(text, expected):
    """HTML-escaped quotes come back from ``_clean_text`` as plain quotes."""
    # When
    clean_text = sic_hierarchy._clean_text(text)

    # Then
    assert clean_text == expected
189 |
190 |
@pytest.mark.parametrize(
    "text,expected",
    [
        (", see ##12.12", ""),
        ("Some text, see ##12.12", "Some text"),
        ("Some text, See ##12.12", "Some text"),
        ("Some text, see ##12.12/1", "Some text"),
        ("some text,see ##12.12", "some text"),
        ("see ##12.12", ""),
        ("##12.12", ""),
        ("some text ##12.12 different", "some text different"),
        (", see division ##85", ""),
        ("##85", ""),
        ("some text, see division ##25", "some text"),
        ("see divisions ##12", ""),
        ("some text, see division ##25, see ##12.12, see divisions ##12", "some text"),
    ],
)
def test_clean_text_with_see_gets_trimmed(text, expected):
    """``_clean_text`` returns the expected text for inputs with "see ##code" references."""
    trimmed = sic_hierarchy._clean_text(text)

    assert trimmed == expected
215 |
--------------------------------------------------------------------------------