├── __init__.py
├── data
├── .gitkeep
└── interim
│ └── subj_mapping.json
├── models
└── .gitkeep
├── tests
├── __init__.py
├── .gitignore
├── context.py
├── test_conceptExtractor.py
├── test_extract_from_doc.py
├── test_conceptTrainer.py
├── test_featureExtractor.py
└── test_hierarchicalClassifier.py
├── src
├── pipeline
│ ├── __init__.py
│ ├── .gitignore
│ ├── start.sh
│ ├── config.yml
│ ├── docker_pipeline.sh
│ └── pipeline.py
├── dsconcept
│ ├── .gitignore
│ ├── __init__.py
│ ├── README.md
│ ├── train.py
│ ├── model.py
│ └── get_metrics.py
├── features.py
├── concepts.py
├── make_vec_and_matrix.py
├── process.py
├── make_cat_models.py
├── make_records_for_cat_bert.py
├── synthesize_predictions.py
├── make_kwd_models.py
├── get_bert_cat_models_preds.py
└── make_plots.py
├── .coveragerc
├── docs
├── .gitignore
├── reset.sh
├── research_access.png
├── push_pages.sh
├── code.rst
├── index.rst
├── Makefile
├── docker-versions.txt
└── conf.py
├── .dockerignore
├── version.py
├── .github
└── workflows
│ └── greetings.yml
├── config
└── test_config.yml
├── Dockerfile
├── LICENSE
├── setup.py
├── .gitignore
├── requirements.txt
├── CHANGELOG.md
├── README.md
└── Makefile
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
--------------------------------------------------------------------------------
/src/pipeline/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/*
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | html/*
2 | _build/*
3 |
--------------------------------------------------------------------------------
/src/dsconcept/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/*
2 |
--------------------------------------------------------------------------------
/src/dsconcept/__init__.py:
--------------------------------------------------------------------------------
1 | import dsconcept.model
2 | import dsconcept.train
3 |
--------------------------------------------------------------------------------
/tests/.gitignore:
--------------------------------------------------------------------------------
1 | .hypothesis/*
2 | .pytest_cache/*
3 | .coverage
4 | __pycache__/*
--------------------------------------------------------------------------------
/docs/reset.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | make clean && make html && open _build/html/index.html
3 |
--------------------------------------------------------------------------------
/docs/research_access.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa/concept-tagging-training/master/docs/research_access.png
--------------------------------------------------------------------------------
/docs/push_pages.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | make html
3 | cd _build/html
4 | git add .
5 | git commit -m 'rebuilt docs'
6 | git push origin gh-pages
7 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | data/*
2 | __pycache__/*
3 | notebook/*
4 | reports/*
5 | env/*
6 | venv/*
7 | docs/*
8 | models/*
9 | scratch/*
10 | .hypothesis/*
11 | .pytest_cache/*
12 | *.tar
13 | *.tar.gz
14 |
--------------------------------------------------------------------------------
/version.py:
--------------------------------------------------------------------------------
import os
from setuptools_scm import get_version

# Derive the package version from SCM metadata rooted at this file's
# directory, then truncate to major.minor.patch before printing.
_root = os.path.dirname(os.path.abspath(__file__))
full_version = get_version(root=_root)
version = ".".join(full_version.split(".")[:3])
print(version)
7 |
--------------------------------------------------------------------------------
/docs/code.rst:
--------------------------------------------------------------------------------
1 | dsconcept
2 | ==========
3 |
4 | .. automodule:: dsconcept.train
5 | :members:
6 | :undoc-members:
7 | .. autofunction::
8 |
9 | .. automodule:: dsconcept.model
10 | :members:
11 | .. autofunction::
12 |
13 |
--------------------------------------------------------------------------------
/tests/context.py:
--------------------------------------------------------------------------------
# Test helper module: makes the in-repo src/ package importable by the tests.
import os
import sys
import logging

# Prepend ../src (relative to this file) so tests import the in-repo
# dsconcept package rather than any installed copy.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../src")))

logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.DEBUG)

# Deliberately imported after the sys.path tweak above — do not reorder.
import dsconcept

LOG.info(f"Loaded Module {dsconcept}")
14 |
--------------------------------------------------------------------------------
/src/pipeline/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export MYDIR="$(dirname "$(realpath "$0")")"
3 |
4 | python ${MYDIR}/pipeline.py \
5 | ${MYDIR}/volumes/in_data/records.json \
6 | ${MYDIR}/volumes/in_data/config.yml \
7 | ${MYDIR}/volumes/out_data/processed_data \
8 | ${MYDIR}/volumes/out_data/topic_models \
9 | ${MYDIR}/volumes/out_data/models \
10 | -loglevel ${LOGLEVEL}
11 |
--------------------------------------------------------------------------------
/.github/workflows/greetings.yml:
--------------------------------------------------------------------------------
1 | name: Greetings
2 |
3 | on: [pull_request, issues]
4 |
5 | jobs:
6 | greeting:
7 | runs-on: ubuntu-latest
8 | permissions:
9 | issues: write
10 | pull-requests: write
11 | steps:
12 | - uses: actions/first-interaction@v1
13 | with:
14 | repo-token: ${{ secrets.GITHUB_TOKEN }}
15 | issue-message: 'Message that will be displayed on users first issue'
16 | pr-message: 'Thank you for contributing to this NASA repository!'
17 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. Research Access documentation master file, created by
2 | sphinx-quickstart on Fri Sep 14 16:48:31 2018.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to Research Access's documentation!
7 | ===========================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 | code
14 |
15 |
16 | Indices and tables
17 | ==================
18 |
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 |
--------------------------------------------------------------------------------
/config/test_config.yml:
--------------------------------------------------------------------------------
1 | weights: # assign weights for term types specified in process section
2 | NOUN: 1
3 | PROPN: 1
4 | NOUN_CHUNK: 1
5 | ENT: 1
6 | ACRONYM: 1
7 | min_feature_occurrence: 10
8 | # features from corpus which occur fewer than
9 | # this many times are not used for training
10 | max_feature_occurrence: 0.9
11 | # features which occur in more than this percentage
12 | # of the corpus are not used for training
13 | min_concept_occurrence: 5
14 | # only concepts which occur greater than or equal to this many times
15 | # in the corpus will have associated classifiers created.
16 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = .
8 | BUILDDIR = _build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/src/dsconcept/README.md:
--------------------------------------------------------------------------------
1 | # dsconcept
2 | Python library with supporting classes for running processing,
3 | and training of classifiers. Originally designed for
4 | tagging the [NASA STI database](https://www.sti.nasa.gov/).
5 |
6 | ## installation
7 | You can install the dsconcept library from this repository.
8 | It also requires the [spacy 'en' language models](https://spacy.io/usage/models).
9 | ```bash
10 | pip install git+https://developer.nasa.gov/DataSquad/classifier_scripts.git
11 | python -m spacy download en
12 | ```
13 |
14 | ## Usage
15 | Docs are available [here](../docs).
16 | You can go through a full interactive tutorial using the Dockerfile available in
17 | [notebook](#../notebook).
18 |
19 |
--------------------------------------------------------------------------------
/docs/docker-versions.txt:
--------------------------------------------------------------------------------
1 | Client: Docker Engine - Community
2 | Version: 19.03.5
3 | API version: 1.40
4 | Go version: go1.12.12
5 | Git commit: 633a0ea
6 | Built: Wed Nov 13 07:22:34 2019
7 | OS/Arch: darwin/amd64
8 | Experimental: false
9 |
10 | Server: Docker Engine - Community
11 | Engine:
12 | Version: 19.03.5
13 | API version: 1.40 (minimum version 1.12)
14 | Go version: go1.12.12
15 | Git commit: 633a0ea
16 | Built: Wed Nov 13 07:29:19 2019
17 | OS/Arch: linux/amd64
18 | Experimental: true
19 | containerd:
20 | Version: v1.2.10
21 | GitCommit: b34a5c8af56e510852c35414db4c1f4fa6172339
22 | runc:
23 | Version: 1.0.0-rc8+dev
24 | GitCommit: 3e425f80a8c931f88e6d94a8c831b9d5aa481657
25 | docker-init:
26 | Version: 0.18.0
27 | GitCommit: fec3683
28 |
--------------------------------------------------------------------------------
/src/pipeline/config.yml:
--------------------------------------------------------------------------------
1 | # Configuration for research access training pipeline
2 |
3 | #image: storage.analytics.nasa.gov/rat_trainer:0.12.0
4 |
5 | process:
6 | term_types:
7 | - "NOUN"
8 | - "PROPN"
9 | - "ENT"
10 | - "NOUN_CHUNK"
11 | - "ACRONYM"
12 | abstract_field: "description"
13 | concept_field: "subject.NASATerms"
14 |
15 | topic_model:
16 | weights: # assign weights for term types specified in process section
17 | NOUN: 1
18 | PROPN: 1
19 | NOUN_CHUNK: 1
20 | ENT: 1
21 | ACRONYM: 1
22 | min_feature_occurrence: 5
23 | max_feature_occurrence: 0.9
24 | number_of_topics: 10
25 |
26 | train_classifiers:
27 | weights: # assign weights for term types specified in process section
28 | NOUN: 1
29 | PROPN: 1
30 | NOUN_CHUNK: 1
31 | ENT: 1
32 | ACRONYM: 1
33 | min_feature_occurrence: 5
34 | max_feature_occurrence: 0.9
35 | min_concept_occurrence: 10
36 |
37 |
38 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Create essential base image
2 | FROM python:3.7 as base
3 | COPY requirements.txt /home/
4 | WORKDIR /home/
5 | RUN pip install -U pip setuptools wheel && \
6 | pip install -r requirements.txt && \
7 | python -m spacy download en
8 | ADD src/ /home/src/
9 | ENV PYTHONPATH=/home/src
10 | ENV PYTHONUNBUFFERED=0
11 |
12 | # Label image with git commit url
13 | ARG GIT_URL=unspecified
14 | ARG VERSION=unspecified
15 | LABEL org.label-schema.schema-version=1.0
16 | LABEL org.label-schema.url=$GIT_URL
17 | LABEL org.label-schema.version=$VERSION
18 | ENV VERSION=$VERSION
19 |
20 | # Run unittests
21 | FROM base as tests
22 | RUN pip install nose && \
23 | pip install pytest && \
24 | pip install coverage && \
25 | pip install hypothesis && \
26 | pip install testfixtures
27 | COPY tests /home/tests
28 | ARG cachebust=0
29 | # ^ Change this to avoid using cached results. These are tests, so we may want to run them.
30 | RUN nosetests --with-coverage --cover-package dsconcept
31 |
32 | # Deployment ready image
33 | FROM base as pipeline
34 | COPY Makefile /home/
35 | ENTRYPOINT ["make"]
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | The MIT License (MIT)
3 | Copyright (c) 2020, United States Government as represented by the Administrator of the National Aeronautics and Space Administration.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from glob import glob
from os.path import basename
from os.path import splitext

import setuptools

# Package metadata for dsconcept; version is derived from SCM tags.
setuptools.setup(
    name="dsconcept",
    use_scm_version=True,
    setup_requires=["setuptools_scm"],
    url="https://developer.nasa.gov/DataSquad/classifier_scripts",
    author="Anthony Buonomo",
    author_email="anthony.r.buonomo@nasa.gov",
    description="Scripts for processing, topic modeling, and creating classifiers for STI concepts.",
    long_description=open("README.md").read(),
    # Declare the README's markup so PyPI renders it correctly.
    long_description_content_type="text/markdown",
    license="MIT",
    packages=setuptools.find_packages("src"),
    package_dir={"": "src"},
    py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")],
    install_requires=[
        "scikit-learn>=0.21.3",
        "spacy>=2.2.3",
        "numpy>=1.17.4",
        "pandas>=0.25.3",
        "pyLDAvis>=2.1.2",
        "textacy==0.9.1",
        "boto3>=1.7.46",
        "dask>=2.8.1",
        "PyYAML>=5.1.2",
        "h5py>=2.10.0",
        "tqdm>=4.39.0",
    ],
    classifiers=[
        # Bug fix: "Development Status :: 2 - Beta" is not a valid trove
        # classifier; the valid Beta entry is "Development Status :: 4 - Beta".
        "Development Status :: 4 - Beta",
        "Programming Language :: Python :: 3.6",
    ],
)
38 |
--------------------------------------------------------------------------------
/tests/test_conceptExtractor.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from .context import dsconcept
3 | from testfixtures import TempDirectory
4 | from pathlib import Path
5 | import logging
6 |
7 | logging.basicConfig(level=logging.INFO)
8 | LOG = logging.getLogger(__name__)
9 | LOG.setLevel(logging.INFO)
10 |
11 |
class TestConceptExtractor(TestCase):
    """Tests for dsconcept.model.ConceptExtractor."""

    def setUp(self):
        # Fresh extractor and scratch directory for each test case.
        self.ce = dsconcept.model.ConceptExtractor()
        self.d = TempDirectory()

    def test_concept_sets(self):
        # Smoke test: the concept_sets property accepts a list of
        # per-document concept lists without raising.
        self.ce.concept_sets = [
            ["MARS", "NASA"],
            ["NASA"],
            ["MARS"],
            ["HIT", "JUPITER"],
        ]

    def test_from_corpus(self):
        # Two jsonlines records with a "concept" field, written to disk;
        # from_corpus should load concepts straight from the file.
        data = b'{"abstract":"Astronauts are very cool.", "concept": ["ASTRONAUTS", "COOL THINGS"]} \n {"abstract":"NASA is going to Mars.", "concept":["NASA", "MARS"]}'
        self.d.write("test.json", data)
        self.ce.from_corpus(Path(f"{self.d.path}/test.json"), "concept")

    def test_get_top_concepts(self):
        # Concepts occurring >= 2 times are returned, keyed by their
        # lowercased/lemmatized form (per the expected dict below) and
        # mapped to indices of the documents that contain them.
        self.ce.concept_sets = [
            ["MARS", "NASA"],
            ["NASA"],
            ["MARS"],
            ["HIT", "JUPITER"],
        ]
        self.assertDictEqual(
            self.ce.get_top_concepts(2), {"mars": [0, 2], "nasa": [0, 1]}
        )

    def tearDown(self):
        # Always remove the scratch directory created in setUp.
        self.d.cleanup()
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # remove credentials
2 | .eggs/*
3 | bandit_analysis.txt
4 | *.h5
5 | /commands/.env
6 | venv/*
7 | versions_and*
8 | my_env/*
9 | /volumes/*
10 | notebook/*
11 | cover/*
12 | .coverage
13 | !/volumes/in_data
14 | env/*
15 | .idea/*
16 | scratch/*
17 | models/*
18 | !models/.gitkeep
19 | config/*
20 | !config/test_config.yml
21 |
22 | *.env
23 | *.pkl
24 | *.npy
25 | *.tgz
26 | *.gz
27 | *.tar
28 | *.npz
29 | *.swp
30 |
31 |
32 | */.ipynb_checkpoints/*
33 | kubernetes-manifests/*
34 | sample_outdata/*
35 | __pycache__/*
36 |
37 | reports/
38 | !reports/.gitkeep
39 | data/interim/*
40 | data/raw/*
41 | !data/interim/subj_mapping.json
42 | !data/raw/STI_public_metadata_records_sample100.jsonl
43 | volumes/big_data/*
44 | !volumes/big_data/.gitkeep
45 | tests/test_data/*
46 | !tests/test_data/.gitkeep
47 | !tests/test_data/results_small.json
48 |
49 | misc-ignore/*
50 | .ipynb_checkpoints/*
51 |
52 | notebook/src/*
53 | notebook/data/*
54 |
55 | # Byte-compiled / optimized / DLL files
56 | __pycache__/
57 | *.py[cod]
58 |
59 | # C extensions
60 | *.so
61 |
62 | # Distribution / packaging
63 | bin/
64 | build/
65 | develop-eggs/
66 | dist/
67 | eggs/
68 | lib/
69 | lib64/
70 | parts/
71 | sdist/
72 | var/
73 | *.egg-info/
74 | .installed.cfg
75 | *.egg
76 |
77 | # Installer logs
78 | pip-log.txt
79 | pip-delete-this-directory.txt
80 |
81 | # Unit test / coverage reports
82 | .tox/
83 | .coverage
84 | .cache
85 | .hypothesis/*
86 | .pytest_cache/*
87 | nosetests.xml
88 | coverage.xml
89 |
90 | # Translations
91 | *.mo
92 |
93 |
--------------------------------------------------------------------------------
/src/features.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 |
4 | import dsconcept.model as ml
5 | from multiprocessing import cpu_count
6 |
7 | logging.basicConfig(level=logging.INFO)
8 | LOG = logging.getLogger(__name__)
9 | LOG.setLevel(logging.INFO)
10 |
11 | N_CPUS = cpu_count()
12 | BATCH_SIZE = 1000
13 |
14 |
def main(in_corpus, abstract_field, out_features, batch_size, n_threads):
    """Extract features from a processed corpus into a jsonlines file.

    Thin wrapper around FeatureExtractor.from_corpus_to_jsonlines that
    logs the chosen inputs and processing parameters.
    """
    LOG.info(f"Extracting features from corpus at {in_corpus}.")
    LOG.info(f"Using field: {abstract_field}.")
    extractor = ml.FeatureExtractor()
    LOG.info(f"Using batch_size {batch_size} with {n_threads} threads.")
    LOG.info(f"Outputting processed features to {out_features}.")
    extractor.from_corpus_to_jsonlines(
        in_corpus, out_features, abstract_field, batch_size, n_threads
    )
24 |
25 |
if __name__ == "__main__":
    # CLI entry point: parse arguments and delegate to main().
    parser = argparse.ArgumentParser(
        description="""Create features for each document in the processed corpus.
        Each line in output file is a json formatted string
        with features and their types."""
    )
    parser.add_argument("i", help="input jsonlines corpus")
    parser.add_argument("f", help="abstract field")
    parser.add_argument("o", help="output jsonlines features")
    # Bug fix: without type=int these options arrive as strings from the
    # command line while their defaults are ints, giving downstream code
    # inconsistent types. (Also fixed "ouput" typo above.)
    parser.add_argument(
        "-b", help="batch size for feature processing", type=int, default=BATCH_SIZE
    )
    parser.add_argument(
        "-n", help="number of threads for features processing", type=int, default=N_CPUS
    )
    args = parser.parse_args()
    main(args.i, args.f, args.o, args.b, args.n)
43 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | appnope==0.1.0
2 | attrs==19.3.0
3 | backcall==0.1.0
4 | bleach==3.3.0
5 | blis==0.4.1
6 | cachetools==3.1.1
7 | catalogue==0.0.8
8 | certifi==2019.9.11
9 | chardet==3.0.4
10 | coverage==4.5.4
11 | cycler==0.10.0
12 | cymem==2.0.3
13 | cytoolz==0.10.1
14 | dask==2.8.1
15 | decorator==4.4.1
16 | defusedxml==0.6.0
17 | entrypoints==0.3
18 | h5py==2.10.0
19 | hypothesis==4.47.1
20 | idna==2.8
21 | importlib-metadata==0.23
22 | jedi==0.15.1
23 | jellyfish==0.7.2
24 | Jinja2==2.11.3
25 | joblib==0.14.0
26 | jsonschema==3.2.0
27 | kiwisolver==1.1.0
28 | MarkupSafe==1.1.1
29 | matplotlib==3.1.2
30 | mistune==0.8.4
31 | more-itertools==7.2.0
32 | murmurhash==1.0.2
33 | nbformat==4.4.0
34 | networkx==2.4
35 | nose==1.3.7
36 | numpy==1.17.4
37 | packaging==19.2
38 | pandas==0.25.3
39 | pandocfilters==1.4.2
40 | parso==0.5.1
41 | pexpect==4.7.0
42 | pickleshare==0.7.5
43 | plac==1.1.3
44 | pluggy==0.13.1
45 | preshed==3.0.2
46 | prometheus-client==0.7.1
47 | prompt-toolkit==2.0.10
48 | ptyprocess==0.6.0
49 | py==1.10.0
50 | pyemd==0.5.1
51 | Pygments==2.7.4
52 | pyparsing==2.4.5
53 | Pyphen==0.9.5
54 | pyrsistent==0.15.6
55 | python-dateutil==2.8.1
56 | pytz==2019.3
57 | PyYAML==5.4
58 | pyzmq==18.1.1
59 | requests==2.22.0
60 | scikit-learn==0.21.3
61 | scipy==1.3.3
62 | Send2Trash==1.5.0
63 | six==1.13.0
64 | spacy==2.2.3
65 | srsly==0.2.0
66 | terminado==0.8.3
67 | testpath==0.4.4
68 | textacy==0.9.1
69 | thinc==7.3.1
70 | toolz==0.10.0
71 | tornado==6.0.3
72 | tqdm==4.39.0
73 | traitlets==4.3.3
74 | urllib3==1.26.5
75 | wasabi==0.4.0
76 | wcwidth==0.1.7
77 | webencodings==0.5.1
78 | widgetsnbextension==3.5.1
79 | zipp==0.6.0
80 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | All notable changes to this project will be documented in this file.
3 |
4 | Releases page is here.
5 |
6 | ## [Unreleased]
7 |
8 |
9 | ## [v1.0.3-open_source_release] - 2020-06-10
10 | #### Added:
11 | Original open-source release of this repository on github.com/nasa after having received SRA (software release authority) approval.
12 |
13 |
14 |
15 | # Guidelines for ChangeLog Entries
16 |
17 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
18 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
19 |
20 | ### Guiding Principles
21 | - Changelogs are for humans, not machines.
22 | - There should be an entry for every single version.
23 | - The same types of changes should be grouped.
24 | - Versions and sections should be linkable.
25 | - The latest version comes first.
26 | - The release date of each version is displayed.
27 |
28 | ### All Entries Should be Under One of These Types of changes
29 | - Added for new features.
30 | - Changed for changes in existing functionality.
31 | - Deprecated for soon-to-be removed features.
32 | - Removed for now removed features.
33 | - Fixed for any bug fixes.
34 | - Security in case of vulnerabilities.
35 |
36 | Google technical writer Sarah Maddox gave the following advice about release notes:
37 | `“The most important function of release notes is to let customers know that something has changed in the product, particularly when that something may affect the way the customer uses the product.”`
38 |
--------------------------------------------------------------------------------
/tests/test_extract_from_doc.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from dsconcept.model import *
3 |
4 | import logging
5 |
6 | logging.basicConfig(level=logging.WARNING)
7 | logging.disable(level=logging.INFO)
8 | LOG = logging.getLogger(__name__)
9 | LOG.setLevel(logging.WARNING)
10 |
11 |
class TestExtractFromDoc(unittest.TestCase):
    """Tests for extract_from_doc over a real spaCy-parsed sample document."""

    def setUp(self):
        self.nlp = spacy.load("en_core_web_sm")
        # Bug fix: the pipeline must be invoked via self.nlp — the bare name
        # `nlp` is undefined in this scope and raised a NameError.
        self.doc = self.nlp(
            """The NASA Scientific and Technical Information (STI) Program was established to support the
objectives of NASA’s missions and research. The Mission of the STI Program is to support the
advancement of aerospace knowledge and contribute to U.S. competitiveness in aerospace research and
development. This program is essential to help NASA avoid duplication of research by sharing
information and to ensure that the U.S. maintains its preeminence in aerospace-related industries
and education. The NASA STI Program acquires, processes, archives, announces, and disseminates
NASA STI and acquires worldwide STI of critical importance to the
National Aeronautics and Space Administation (NASA) and the Nation."""
        )
        self.terms_tagged = extract_from_doc(self.doc)

    def test_is_set(self):
        # extract_from_doc returns a mapping (term -> term type).
        self.assertEqual(dict, type(self.terms_tagged))

    def test_has_terms(self):
        self.assertGreater(len(self.terms_tagged), 0)

    def test_has_all_feature_types(self):
        # The sample text should exercise every supported term type.
        self.term_types = {term_type for term, term_type in self.terms_tagged.items()}
        LOG.info(self.term_types)
        LOG.info(self.terms_tagged)
        self.assertEqual(
            {"NOUN", "PROPN", "NOUN_CHUNK", "ENT", "ACRONYM"}, self.term_types
        )
40 |
41 |
42 | if __name__ == "__main__":
43 | unittest.main()
44 |
--------------------------------------------------------------------------------
/src/concepts.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 |
4 | import dsconcept.model as ml
5 |
6 | logging.basicConfig(level=logging.INFO)
7 | LOG = logging.getLogger(__name__)
8 | LOG.setLevel(logging.INFO)
9 |
10 |
def main(
    in_corpus,
    concept_field,
    cat_field,
    out_indices,
    out_cat_indices,
    out_raw2lemma,
    out_cat_raw2lemma,
):
    """Extract concept and category indices plus raw->lemma maps.

    Runs a ConceptExtractor twice over the corpus — once for the keyword
    field and once for the higher-level category field — writing each
    result out as json files.
    """
    LOG.info(f"Corpus: {in_corpus}")
    LOG.info(f"Keyword Field: {concept_field}")
    LOG.info(f"Category Field: {cat_field}")

    kwd_extractor = ml.ConceptExtractor()
    kwd_extractor.from_corpus(in_corpus, concept_field)
    LOG.info(f"Output keyword indices: {out_indices}")
    LOG.info(f"Output keyword raw2lemma: {out_raw2lemma}")
    kwd_extractor.to_jsons(out_indices, out_raw2lemma)

    LOG.info(f"Extracting categories.")
    cat_extractor = ml.ConceptExtractor()
    cat_extractor.from_corpus(in_corpus, cat_field)
    LOG.info(f"Output category indices: {out_cat_indices}")
    LOG.info(f"Output category raw2lemma: {out_cat_raw2lemma}")
    cat_extractor.to_jsons(out_cat_indices, out_cat_raw2lemma)
36 |
37 |
if __name__ == "__main__":
    # CLI entry point: parse fields/paths and delegate to main().
    parser = argparse.ArgumentParser(
        description="""Get indices of processed corpus for all of concept and category
        tags. Also get lemmas for these concepts and categories. Output all of this
        information to json files."""
    )
    parser.add_argument("i", help="input processed jsonlines corpus")
    parser.add_argument("k", help="concept field")
    # Bug fix: this positional is the category field; its help text was a
    # copy-paste duplicate of the concept field's.
    parser.add_argument("c", help="category field")
    parser.add_argument("ok", help="output indices for concepts")
    parser.add_argument("oc", help="output indices for categories")
    parser.add_argument("rk", help="out keyword raw to lemma mapping")
    parser.add_argument("rc", help="out category raw to lemma mapping")
    args = parser.parse_args()
    main(args.i, args.k, args.c, args.ok, args.oc, args.rk, args.rc)
53 |
--------------------------------------------------------------------------------
/src/make_vec_and_matrix.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | import joblib
6 | import numpy as np
7 | import yaml
8 | from sklearn.feature_extraction import DictVectorizer
9 | from sklearn.model_selection import train_test_split
10 |
11 | import dsconcept.model as ml
12 |
13 | logging.basicConfig(level=logging.INFO)
14 | LOG = logging.getLogger(__name__)
15 | LOG.setLevel(logging.INFO)
16 |
17 | VECTORIZER = "vectorizer.jbl"
18 | FEATURE_MATRIX = "feature_matrix.jbl"
19 |
20 |
def main(in_features, in_config, out_feature_dir, out_vectorizer):
    """Build a DictVectorizer and sparse feature matrix from extracted features.

    Loads per-document features, applies configured term-type weights and
    document-frequency limits, vectorizes, then saves the vectorizer, the
    feature matrix, and a fixed 90/10 train/test split of row indices.
    """
    with open(in_config, "r") as cfg_file:
        config = yaml.safe_load(cfg_file)

    LOG.info(f"Loading features from {in_features}.")
    extractor = ml.FeatureExtractor()
    extractor.from_jsonlines(in_features)
    weighted = extractor.weight_terms(config["weights"])
    limited = extractor.limit_features(
        weighted,
        config["min_feature_occurrence"],
        config["max_feature_occurrence"],
    )
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform(limited)

    out_feature_matrix = out_feature_dir / FEATURE_MATRIX
    LOG.info(f"Outputting vectorizer to {out_vectorizer}.")
    joblib.dump(vectorizer, out_vectorizer)
    LOG.info(f"Outputting feature matrix to {out_feature_matrix}.")
    joblib.dump(feature_matrix, out_feature_matrix)

    # Only the row indices of the split are persisted; the matrix halves
    # themselves are discarded.
    _, _, ind_train, ind_test = train_test_split(
        feature_matrix,
        np.array(range(feature_matrix.shape[0])),
        test_size=0.10,
        random_state=42,
    )
    np.save(out_feature_dir / "train_inds.npy", ind_train)
    np.save(out_feature_dir / "test_inds.npy", ind_test)
48 |
49 |
if __name__ == "__main__":
    # CLI entry point: parse paths/config and delegate to main().
    parser = argparse.ArgumentParser(
        description="""From features file, create a feature matrix and vectorizer
        which translates between columns of the matrix and feature strings. Limit
        which features are included in these files with configuration."""
    )
    parser.add_argument("in_features", help="input features jsonlines file")
    parser.add_argument("in_config", help="configuration for creating models")
    parser.add_argument(
        "out_feature_dir",
        help="output directory for feature matrix and indices",
        type=Path,
    )
    # Bug fix: help string had a duplicated word ("for for").
    parser.add_argument(
        "out_vectorizer", help="output path for vectorizer", type=Path,
    )
    # TODO: split outputs
    args = parser.parse_args()

    main(args.in_features, args.in_config, args.out_feature_dir, args.out_vectorizer)
70 |
--------------------------------------------------------------------------------
/src/process.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import json
4 |
5 | import pandas as pd
6 |
7 | logging.basicConfig(level=logging.INFO)
8 | LOG = logging.getLogger(__name__)
9 | LOG.setLevel(logging.INFO)
10 |
11 |
def main(infile, in_subj_mapping, outfile):
    """Merge title/abstract text and keyword fields, map raw subjects to
    categories, and drop rows lacking abstracts, keywords, subjects, or
    categories.

    Args:
        infile: input corpus path (jsonlines, one record per line)
        in_subj_mapping: JSON file mapping raw subject -> category
        outfile: destination path for the processed jsonlines corpus
    """
    LOG.info(f"Reading corpus from {infile}.")
    df = pd.read_json(infile, orient="records", lines=True)
    LOG.info(f"Shape of input: {df.shape}")

    with open(in_subj_mapping, "r") as f0:
        subj_mapping = json.load(f0)

    def get_subjs(x):
        # Map each raw subject to its category; unmapped subjects are
        # silently dropped. Non-list cells (e.g. NaN) yield None.
        if isinstance(x, list):
            cats = {
                subj_mapping[s.strip().lower()]
                for s in x
                if s.strip().lower() in subj_mapping
            }
            return list(cats)
        return None

    categories = df["D072B (Subject Category)"].apply(get_subjs)

    # Join title and abstract into a single text field.
    # BUG FIX: the literal previously contained a raw line break inside the
    # quotes (a syntax error); use an explicit escaped newline.
    text_col = "\n" + df["D245A (Title)"] + " " + df["D520B (Abstract)"]
    keywords = (
        df["D650A (NASA Major Indexing Terms)"]
        + df["D659A (NASA Minor Indexing Terms)"]
    )

    pdf = pd.DataFrame()
    pdf["text"] = text_col
    pdf["keywords"] = keywords
    pdf["subjects"] = df["D072B (Subject Category)"]
    pdf["categories"] = categories

    def remove_no_abstracts(x):
        # Keep only real abstract strings; placeholder abstracts and
        # non-string cells (e.g. NaN) are filtered out.
        return isinstance(x, str) and "no abstract available" not in x.lower()

    has_abs = pdf["text"].apply(remove_no_abstracts)
    has_kwds = pdf["keywords"].apply(lambda x: type(x) is list)
    # BUG FIX: this previously re-tested pdf["keywords"]; check subjects.
    has_subj = pdf["subjects"].apply(lambda x: type(x) is list)
    has_cats = pdf["categories"].apply(lambda x: type(x) is list)
    tf = has_kwds & has_subj & has_cats & has_abs
    LOG.info(f"Removed {sum(~tf)} rows.")

    LOG.info(f"Outputting processed corpus to {outfile}.")
    pdf[tf].to_json(outfile, orient="records", lines=True)
67 |
68 |
if __name__ == "__main__":
    # Command-line entry point: wire the three file paths through to main().
    arg_parser = argparse.ArgumentParser(
        description="""Merge text and keyword fields from input corpus.
        Remove documents without abstracts, keywords, or categories."""
    )
    for flag, desc in (
        ("i", "input corpus"),
        ("m", "subject to category mapping json"),
        ("o", "output processed data"),
    ):
        arg_parser.add_argument(flag, help=desc)
    cli = arg_parser.parse_args()
    main(cli.i, cli.m, cli.o)
79 |
--------------------------------------------------------------------------------
/tests/test_conceptTrainer.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from unittest import TestCase
4 |
5 | import joblib
6 | import numpy as np
7 | from scipy.sparse import csc_matrix
8 | from hypothesis import given
9 | from hypothesis.extra.numpy import arrays
10 | from sklearn.linear_model import SGDClassifier
11 | from sklearn.model_selection import GridSearchCV, train_test_split
12 | from testfixtures import TempDirectory
13 |
14 | from .context import dsconcept
15 |
16 | logging.basicConfig(level=logging.INFO)
17 | LOG = logging.getLogger(__name__)
18 | LOG.setLevel(logging.INFO)
19 |
20 |
class TestConceptTrainer(TestCase):
    def setUp(self):
        # Minimal two-document corpus written into a temp directory, plus a
        # random binary feature matrix / target vector for training.
        concept_ext = dsconcept.model.ConceptExtractor()
        feature_ext = dsconcept.model.FeatureExtractor()
        self.d = TempDirectory()
        data = b'{"abstract":["Astronauts are very cool."], "concept": ["ASTRONAUTS", "COOL THINGS"]}\n {"abstract":["NASA is going to Mars."], "concept":["NASA", "MARS"]}'
        self.d.write("test.json", data)
        self.corpus_path = f"{self.d.path}/test.json"
        n_docs = 100
        self.X = csc_matrix(
            np.random.randint(2, size=n_docs * 2).reshape(int(n_docs), 2)
        )
        self.y = np.random.randint(2, size=n_docs)
        param_grid = {
            "alpha": [0.01, 0.001, 0.0001],
            "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}],
            "max_iter": [1],
            "loss": ["log"],
        }  # requires loss function with predict_proba
        search = GridSearchCV(
            SGDClassifier(), param_grid, scoring="f1"
        )  # requires GridSearchCV
        self.ct = dsconcept.train.ConceptTrainer(concept_ext, search)

    def test_create_concept_classifier(self):
        # Train a single concept classifier and confirm it round-trips
        # through joblib.
        model_dir = Path(f"{self.d.path}/models")
        model_dir.mkdir()
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.5, random_state=42
        )
        self.ct.create_concept_classifier(
            "test_concept", X_train, X_test, y_train, y_test, model_dir
        )
        fitted = joblib.load(model_dir / "test_concept.pkl")
        LOG.info(fitted)

    def test_train_all(self):
        # Naive smoke test: only verifies that index files get written.
        self.ct.train_all(self.X, Path(f"{self.d.path}/models"), 5)
        test_inds = np.load(Path(f"{self.d.path}") / "test_inds.npy")
        train_inds = np.load(Path(f"{self.d.path}") / "train_inds.npy")
        LOG.info(f"test_inds: {test_inds}")
        LOG.info(f"train_inds: {train_inds}")

    @given(arrays(dtype=np.float_, shape=1))
    def test_get_dispersed_subset(self, array):
        subset = dsconcept.train.get_dispersed_subset(array, 5)
        self.assertLessEqual(len(subset), 5)
        LOG.info(subset)

    def tearDown(self):
        self.d.cleanup()
70 |
--------------------------------------------------------------------------------
/src/pipeline/docker_pipeline.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Run the full concept-training pipeline inside a Docker container,
# mounting the input/output data directories and forwarding the log
# level and cpu limit into the container.

usage="$(basename "$0") [-h] [-i path] [-o path] [-d docker-image] [-l loglevel] [-c cpus]
Concept training pipeline

where:
    -h  show this help text
    -i  (absolute path) input data directory
    -o  (absolute path) output data directory
    -d  the docker image to use
    -l  the log level to use ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
    -c  number of cpus to allow container to use"

# Get command line arguments
input=""
output=""
while getopts ':hi:o:d:l:c:' option; do
  case "$option" in
    h) echo "$usage"
       exit
       ;;
    i) input=$OPTARG
       ;;
    o) output=$OPTARG
       ;;
    d) image=$OPTARG
       ;;
    l) LOGLEVEL=$OPTARG
       ;;
    c) cpus=$OPTARG
       ;;
    # BUG FIX: a single ':' case handles every option with a missing
    # argument; the original repeated this pattern five times, leaving
    # all but the first occurrence unreachable.
    :) printf "missing argument for -%s\n" "$OPTARG" >&2
       echo "$usage" >&2
       exit 1
       ;;
    \?) printf "illegal option: -%s\n" "$OPTARG" >&2
       echo "$usage" >&2
       exit 1
       ;;
  esac
done
shift $((OPTIND - 1))

# Check for errors
if [ ! -d "${input}" ]; then
    echo "${input} directory does not exist. Choose a directory name which does exist and contains requisite data."
    exit 1
fi
if [ -d "${output}" ]; then
    echo "${output} directory already exists. Choose a new directory name which does not exist."
    exit 1
fi
if [ "${LOGLEVEL}" = "" ]; then
    echo "Setting empty LOGLEVEL to INFO."
    export LOGLEVEL="INFO"
fi
if [ "${cpus}" = "" ]; then
    # BUG FIX: message previously (incorrectly) mentioned LOGLEVEL.
    echo "Setting empty cpus to 0.000."
    export cpus=0.000
fi


mkdir "${output}"

echo "Running full pipeline."
# Quote all expansions so paths with spaces don't word-split.
docker run -it \
    -v "${input}":/home/pipeline/volumes/in_data \
    -v "${output}":/home/pipeline/volumes/out_data \
    -e LOGLEVEL="${LOGLEVEL}" \
    --cpus="${cpus}" \
    "${image}" bash -c 'bash pipeline/start.sh'
echo "Completed Pipeline."
96 |
--------------------------------------------------------------------------------
/tests/test_featureExtractor.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from dsconcept.model import FeatureExtractor
3 |
4 | # from .context import dsconcept
5 | from testfixtures import TempDirectory
6 | from hypothesis import given
7 | import hypothesis.strategies as st
8 | import pytest
9 |
10 |
@st.composite
def features(draw):
    """Strategy yielding a short list of {term: POS-tag} feature dicts."""
    tag_strategy = st.one_of(
        *(st.just(t) for t in ["NOUN", "PROPN", "NOUN_CHUNK", "ENT"])
    )
    doc_strategy = st.dictionaries(
        keys=st.text(max_size=5), values=tag_strategy, min_size=4, max_size=5
    )
    return draw(st.lists(doc_strategy, max_size=5))
19 |
20 |
@st.composite
def weights(draw):
    """Strategy yielding a {POS-tag: non-negative int weight} dict."""
    tag_strategy = st.one_of(
        *(st.just(t) for t in ["NOUN", "PROPN", "NOUN_CHUNK", "ENT"])
    )
    return draw(
        st.dictionaries(keys=tag_strategy, values=st.integers(min_value=0))
    )
29 |
30 |
class TestFeatureExtractor(TestCase):
    def setUp(self):
        # Fresh extractor plus a two-document corpus in a temp directory.
        self.fe = FeatureExtractor()
        self.d = TempDirectory()
        corpus_bytes = b'{"abstract":"Astronauts are very cool.", "concept": ["ASTRONAUTS", "COOL THINGS"]} \n {"abstract":"NASA is going to Mars.", "concept":["NASA", "MARS"]}'
        self.d.write("test.json", corpus_bytes)
        self.corpus_path = f"{self.d.path}/test.json"

    @given(features())
    def test_features(self, d):
        # Assigning the features property must preserve document count.
        self.fe.features = d
        self.assertEqual(len(self.fe.features), len(d))

    def test_from_corpus_to_jsonlines(self):
        self.fe.from_corpus_to_jsonlines(
            self.corpus_path, f"{self.d.path}/features.jsonl", "abstract",
        )

    def test_from_jsonlines(self):
        feature_bytes = b'{"astronaut":"NOUN", "space": "NOUN", "NASA": "ENT"}\n{"Mars": "PROPN", "dog": "NOUN"}'
        features_out = "features.jsonl"
        self.d.write(features_out, feature_bytes)
        self.fe.from_jsonlines(f"{self.d.path}/{features_out}")
        self.assertSetEqual(self.fe.term_types, {"NOUN", "PROPN", "ENT"})

    def test_to_jsonlines(self):
        self.fe.features = [
            {"space": "NOUN", "Mars": "PROPN"},
            {"Anita": "PROPN", "Adams": "PROPN"},
        ]
        self.fe.to_jsonlines(f"{self.d.path}/features.jsonl")

    @given(features(), weights())
    def test_weight_terms(self, d, w):
        self.fe.features = d
        self.fe.weight_terms(w)

    @given(features(), weights())
    def test_limit_features(self, d, w):
        # Contingent on weight_terms working; see test_weight_terms.
        self.fe.features = d
        weighted_features = self.fe.weight_terms(w)
        self.fe.limit_features(weighted_features, feature_min=1, feature_max=0.90)

    def tearDown(self):
        self.d.cleanup()
79 |
--------------------------------------------------------------------------------
/src/make_cat_models.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import joblib
4 | from sklearn.feature_extraction import DictVectorizer
5 | from sklearn.linear_model import SGDClassifier
6 | from sklearn.model_selection import GridSearchCV, train_test_split
7 | import yaml
8 | from pathlib import Path
9 | import numpy as np
10 |
11 | import dsconcept.model as ml
12 | from dsconcept.train import ConceptTrainer
13 |
14 | logging.basicConfig(level=logging.INFO)
15 | LOG = logging.getLogger(__name__)
16 | LOG.setLevel(logging.INFO)
17 |
VECTORIZER = "vectorizer.jbl"  # filename for the saved vectorizer
FEATURE_MATRIX = "feature_matrix.jbl"  # filename for the saved feature matrix
OUT_MODELS_DIR = "models"  # subdirectory (under out_dir) for trained models
21 |
22 |
def main(
    in_feature_matrix,
    in_ind_train,
    in_ind_test,
    in_cat_indices,
    in_cat_raw2lemma,
    in_config,
    out_dir,
):
    """Train one classifier per category over a precomputed feature matrix.

    Args:
        in_feature_matrix: joblib-serialized sparse feature matrix
        in_ind_train: .npy file of training row indices
        in_ind_test: .npy file of test row indices
        in_cat_indices: category indices json
        in_cat_raw2lemma: category raw-to-lemma mapping json
        in_config: yaml config supplying min_concept_occurrence
        out_dir: directory under which the models folder is created
    """
    with open(in_config, "r") as f0:
        config = yaml.safe_load(f0)

    X = joblib.load(in_feature_matrix)
    ind_train = np.load(in_ind_train)
    ind_test = np.load(in_ind_test)

    LOG.info(
        f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}."
    )
    cat_ext = ml.ConceptExtractor()
    cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma)

    # SGD needs a loss that supports predict_proba, hence loss="log".
    param_grid = {
        "alpha": [0.01, 0.001, 0.0001],
        "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}],
        "max_iter": [1],
        "loss": ["log"],
    }
    grid_search = GridSearchCV(SGDClassifier(), param_grid, scoring="f1")
    trainer = ConceptTrainer(cat_ext, grid_search)

    trainer.train_concepts(
        X,
        ind_train,
        ind_test,
        out_dir / OUT_MODELS_DIR,
        config["min_concept_occurrence"],
    )
    LOG.info("Complete.")
61 |
62 |
if __name__ == "__main__":
    # CLI entry point: collect file paths and hand off to main().
    arg_parser = argparse.ArgumentParser(
        description="""Use feature matrix and location of indices to create classifiers
    for the categories in the corpus."""
    )
    arg_parser.add_argument(
        "in_feature_matrix", help="input scipy sparse matrix of features"
    )
    for pos_name, pos_help in (
        ("in_ind_train", "train set index"),
        ("in_ind_test", "test set index"),
        ("in_cat_indices", "category indices"),
        ("in_cat_raw2lemma", "category raw to lemma mapping"),
        ("in_config", "configuration for creating models"),
    ):
        arg_parser.add_argument(pos_name, help=pos_help)
    arg_parser.add_argument(
        "out_dir",
        help="output directory for vectorizer, feature matrix, and models",
        type=Path,
    )
    cli = arg_parser.parse_args()

    main(
        cli.in_feature_matrix,
        cli.in_ind_train,
        cli.in_ind_test,
        cli.in_cat_indices,
        cli.in_cat_raw2lemma,
        cli.in_config,
        cli.out_dir,
    )
92 |
--------------------------------------------------------------------------------
/src/make_records_for_cat_bert.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 | from pathlib import Path
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from sklearn.model_selection import train_test_split
9 | from sklearn.preprocessing import MultiLabelBinarizer
10 |
11 | logging.basicConfig(level=logging.INFO)
12 | LOG = logging.getLogger(__name__)
13 | LOG.setLevel(logging.INFO)
14 |
15 |
def main(in_records, inds_loc, out_records_dir):
    """Build train/dev/test CSVs plus label-mapping files for BERT
    category training.

    Args:
        in_records: jsonlines file of cleaned records ("categories", "text")
        inds_loc: directory holding train/test index .npy files; the bert
            train/dev split indices are written back here
        out_records_dir: destination directory for the csv/json/txt outputs
    """
    train_inds_loc = inds_loc / "train_inds.npy"
    test_inds_loc = inds_loc / "test_inds.npy"
    train_bert_inds_loc = inds_loc / "train_bert_inds.npy"
    dev_bert_inds_loc = inds_loc / "dev_bert_inds.npy"

    LOG.info(f"Loading cleaned records from {in_records}.")
    records = pd.read_json(in_records, orient="records", lines=True)
    train_inds = np.load(train_inds_loc)
    test_inds = np.load(test_inds_loc)

    LOG.info(f"Creating bert cat df format.")
    # One-hot encode the category lists; columns become category names,
    # with the raw text prepended as the first column.
    mlb = MultiLabelBinarizer()
    cat_bin_array = mlb.fit_transform(records["categories"])
    cat_df = pd.DataFrame(cat_bin_array)
    cat_df.columns = mlb.classes_
    cat_df["text"] = records["text"]
    cat_df = cat_df[["text"] + mlb.classes_.tolist()]

    # Carve a dev split out of the existing train indices and persist it.
    train_bert_inds, dev_bert_inds = train_test_split(train_inds, test_size=0.25)
    np.save(train_bert_inds_loc, train_bert_inds)
    np.save(dev_bert_inds_loc, dev_bert_inds)

    ml_sets = {
        "train": cat_df.iloc[train_bert_inds],
        "test": cat_df.iloc[test_inds],
        "dev": cat_df.iloc[dev_bert_inds],
    }

    out_records_dir.mkdir(exist_ok=True)
    for set_type, ml_set in ml_sets.items():
        outfile = out_records_dir / f"{set_type}.csv"
        LOG.info("Writing to {}".format(outfile))
        ml_set.to_csv(outfile, index=True)

    out_id_to_label = str(out_records_dir / "id_to_label.json")
    out_label_to_id = str(out_records_dir / "label_to_id.json")
    out_classes = str(out_records_dir / "classes.txt")

    id_to_label = {i: c for i, c in enumerate(mlb.classes_)}
    label_to_id = {c: i for i, c in enumerate(mlb.classes_)}

    LOG.info(f"Writing classes to {out_classes}")
    classes = mlb.classes_.tolist()
    # Newline-separated class names with no trailing newline.
    with open(out_classes, "w") as f0:
        f0.write("\n".join(c.strip() for c in classes))

    LOG.info(f"Writing to {out_id_to_label}.")
    with open(out_id_to_label, "w") as f0:
        json.dump(id_to_label, f0)

    LOG.info(f"Writing to {out_label_to_id}.")
    with open(out_label_to_id, "w") as f0:
        json.dump(label_to_id, f0)
75 |
76 |
if __name__ == "__main__":
    # BUG FIX: description was a copy-paste placeholder ("Say hello").
    parser = argparse.ArgumentParser(
        description="Create train/dev/test records and label mappings for BERT category models"
    )
    parser.add_argument("i", help="input records", type=Path)
    parser.add_argument(
        "inds_loc", help="directory for train, test, and dev indices", type=Path
    )
    parser.add_argument(
        "o", help="output files for bert category classifying.", type=Path
    )
    args = parser.parse_args()
    main(args.i, args.inds_loc, args.o)
88 |
--------------------------------------------------------------------------------
/src/synthesize_predictions.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | import dask
6 | import h5py
7 | import joblib
8 | import numpy as np
9 | import pandas as pd
10 | from dask.diagnostics import ProgressBar
11 | from tqdm import tqdm
12 |
13 | from dsconcept.get_metrics import (
14 | get_cat_inds,
15 | get_synth_preds,
16 | load_category_models,
17 | load_concept_models,
18 | HierarchicalClassifier,
19 | get_mets,
20 | )
21 |
22 | logging.basicConfig(level=logging.INFO)
23 | LOG = logging.getLogger(__name__)
24 | LOG.setLevel(logging.INFO)
25 |
26 |
def main(
    experiment_name,
    synth_strat,
    in_cat_preds,
    out_store,
    synth_batch_size,
    t,
    out_synth_scores,
    limit=None,
    con_limit=None,
):
    """Synthesize hierarchical keyword predictions for the test set and
    compute per-concept metrics.

    Args:
        experiment_name: experiment subdirectory under data/ and models/
        synth_strat: synthesis strategy name passed to get_synth_preds
        in_cat_preds: path to the .npy category prediction matrix
        out_store: hdf5 store holding synthesis results and ground truth
        synth_batch_size: batch size used during synthesis
        t: category probability threshold
        out_synth_scores: csv path for the resulting metric records
        limit: optional cap on the number of test records
        con_limit: optional cap on the number of concepts
    """
    test_inds = np.load(f"data/interim/{experiment_name}/test_inds.npy")
    feature_matrix = joblib.load(f"data/interim/{experiment_name}/feature_matrix.jbl")
    in_cat_models = Path(f"models/{experiment_name}/categories/models/")
    in_kwd_models = Path(f"models/{experiment_name}/keywords/models/")
    cat_preds = np.load(in_cat_preds)  # based on experiment or explicit path?
    cat_clfs = load_category_models(in_cat_models)
    cd = load_concept_models(in_kwd_models)
    clf = HierarchicalClassifier(cat_clfs, cd)

    if limit is not None:
        LOG.info(f"Limiting to {limit} test records.")
        feature_matrix_test = feature_matrix.tocsc()[test_inds[0:limit], :]
        cat_preds = cat_preds[0:limit, :]
        # TODO: How does this affect indices?
    else:
        feature_matrix_test = feature_matrix.tocsc()[test_inds, :]

    LOG.info(f'Synthesizing predictions with strategy "{synth_strat}".')
    all_cat_inds = get_cat_inds(clf.categories, cat_preds, t=t)
    if con_limit is not None:
        conwc = clf.concepts_with_classifiers[0:con_limit]
    else:
        conwc = clf.concepts_with_classifiers
    shape = (feature_matrix_test.shape[0], len(conwc))
    with tqdm(total=shape[0]) as pbar:
        get_synth_preds(
            out_store,
            shape,
            all_cat_inds,
            clf.categories,
            synth_batch_size,
            only_cat=False,
            synth_strat=synth_strat,
            con_limit=con_limit,
            limit=limit,
            pbar=pbar,
        )

    LOG.info("Obtaining metrics.")
    # BUG FIX: Dataset.value was deprecated and removed in h5py 3.0; index
    # with [()] instead. Also read both datasets from a single open rather
    # than opening the store twice.
    with h5py.File(out_store, "r") as f0:
        if limit is not None:
            target_values = f0["ground_truth"][0:limit, :]
        else:
            target_values = f0["ground_truth"][()]
        synth_preds = f0["synthesis"][()]

    # Fan the per-concept metric computation out through dask.
    jobs = []
    mets_pbar = tqdm(range(len(conwc)), total=len(conwc))
    for i in mets_pbar:
        jobs.append(
            dask.delayed(get_mets)(i, synth_preds, target_values, conwc, mets_pbar)
        )
    records = dask.compute(jobs)
    new_recs_df = pd.DataFrame(records[0])
    LOG.info(f"Saving results to {out_synth_scores}.")
    new_recs_df.to_csv(out_synth_scores)
99 |
100 |
if __name__ == "__main__":
    # BUG FIX: description and every help string were copy-paste
    # placeholders ("Say hello" / "input txt file").
    parser = argparse.ArgumentParser(
        description="Synthesize keyword predictions and compute concept metrics"
    )
    parser.add_argument("--experiment_name", help="experiment subdirectory name")
    parser.add_argument("--synth_strat", help="synthesis strategy")
    parser.add_argument("--in_cat_preds", help="input category predictions .npy file")
    parser.add_argument("--store", help="hdf5 store for synthesized predictions")
    parser.add_argument("--synth_batch_size", help="batch size for synthesis", type=int)
    parser.add_argument("--threshold", help="category probability threshold", type=float)
    parser.add_argument("--out_synth_scores", help="output csv of metric scores")
    parser.add_argument(
        "--limit", help="size for sample to test synthesis", type=int, default=None
    )
    parser.add_argument(
        "--con_limit", help="size for concept sample", type=int, default=None
    )
    args = parser.parse_args()
    main(
        args.experiment_name,
        args.synth_strat,
        args.in_cat_preds,
        args.store,
        args.synth_batch_size,
        args.threshold,
        args.out_synth_scores,
        args.limit,
        args.con_limit,
    )
128 |
--------------------------------------------------------------------------------
/data/interim/subj_mapping.json:
--------------------------------------------------------------------------------
1 | {"environment pollution": "geosciences", "energy production and conversion": "geosciences", "oceanography": "geosciences", "geophysics": "geosciences", "earth resources and remote sensing": "geosciences", "geosciences (general)": "geosciences", "meteorology and climatology": "geosciences", "spacecraft design, testing and performance": "astronautics", "astrodynamics": "astronautics", "astronautics (general)": "astronautics", "ground support systems and facilities (space)": "astronautics", "launch vehicles and launch operations": "astronautics", "space transportation and safety": "astronautics", "spacecraft instrumentation and astrionics": "astronautics", "spacecraft propulsion and power": "astronautics", "space communications, spacecraft communications, command and tracking": "astronautics", "space transportation": "astronautics", "spacecraft instrumentation": "astronautics", "launch vehicles and space vehicles": "astronautics", "physics (general)": "physics", "plasma physics": "physics", "optics": "physics", "nuclear physics": "physics", "acoustics": "physics", "solid-state physics": "physics", "atomic and molecular physics": "physics", "physics of elementary particles and fields": "physics", "nuclear and high-energy physics": "physics", "thermodynamics and statistical physics": "physics", "astronomy": "space sciences", "solar physics": "space sciences", "lunar and planetary science and exploration": "space sciences", "space radiation": "space sciences", "astrophysics": "space sciences", "space sciences (general)": "space sciences", "lunar and planetary exploration": "space sciences", "space biology": "space sciences", "inorganic, organic and physical chemistry": "chemistry and materials", "space processing": "chemistry and materials", "chemistry and materials (general)": "chemistry and materials", "propellants and fuels": "chemistry and materials", "nonmetallic materials": "chemistry and materials", "metals and metallic materials": "chemistry and materials", 
"composite materials": "chemistry and materials", "materials processing": "chemistry and materials", "metallic materials": "chemistry and materials", "inorganic and physical chemistry": "chemistry and materials", "materials": "chemistry and materials", "research and support facilities (air)": "aeronautics", "avionics and aircraft instrumentation": "aeronautics", "aircraft communications and navigation": "aeronautics", "aircraft propulsion and power": "aeronautics", "aerodynamics": "aeronautics", "aeronautics (general)": "aeronautics", "air transportation and safety": "aeronautics", "aircraft design, testing and performance": "aeronautics", "aircraft stability and control": "aeronautics", "aircraft instrumentation": "aeronautics", "economics and cost analysis": "social and information sciences", "documentation and information science": "social and information sciences", "technology utilization and surface transportation": "social and information sciences", "administration and management": "social and information sciences", "law, political science and space policy": "social and information sciences", "social and information sciences (general)": "social and information sciences", "social sciences (general)": "social and information sciences", "statistics and probability": "mathematical and computer sciences", "computer operations and hardware": "mathematical and computer sciences", "computer programming and software": "mathematical and computer sciences", "computer systems": "mathematical and computer sciences", "cybernetics, artificial intelligence and robotics": "mathematical and computer sciences", "mathematical and computer sciences (general)": "mathematical and computer sciences", "numerical analysis": "mathematical and computer sciences", "systems analysis and operations research": "mathematical and computer sciences", "theoretical mathematics": "mathematical and computer sciences", "cybernetics": "mathematical and computer sciences", "systems analysis": 
"mathematical and computer sciences", "mathematics and information sciences": "mathematical and computer sciences", "general.": "general", "general": "general", "communications and radar": "engineering", "engineering (general)": "engineering", "electronics and electrical engineering": "engineering", "fluid mechanics and thermodynamics": "engineering", "instrumentation and photography": "engineering", "lasers and masers": "engineering", "mechanical engineering": "engineering", "quality assurance and reliability": "engineering", "structural mechanics": "engineering", "fluid mechanics and heat transfer": "engineering", "behavioral sciences": "life sciences", "aerospace medicine": "life sciences", "man/system technology and life support": "life sciences", "exobiology": "life sciences", "life sciences (general)": "life sciences", "life sciences": "life sciences"}
--------------------------------------------------------------------------------
/src/make_kwd_models.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | import joblib
6 | import numpy as np
7 | import yaml
8 | from sklearn.linear_model import SGDClassifier
9 | from sklearn.model_selection import GridSearchCV
10 |
11 | import dsconcept.model as ml
12 | import dsconcept.train as tr
13 |
14 | logging.basicConfig(level=logging.INFO)
15 | LOG = logging.getLogger(__name__)
16 | LOG.setLevel(logging.DEBUG)
17 |
VECTORIZER = "vectorizer.jbl"  # filename for the saved vectorizer
FEATURE_MATRIX = "feature_matrix.jbl"  # filename for the saved feature matrix
OUT_MODELS_DIR = "models/topic_"  # path prefix (under out_dir) for model sets
21 |
22 |
def main(
    in_feature_matrix,
    in_ind_train,
    in_ind_test,
    in_kwd_indices,
    in_cat_indices,
    in_kwd_raw2lemma,
    in_cat_raw2lemma,
    in_config,
    out_dir,
    topics=True,
):
    """Train keyword classifiers, optionally one set per category topic
    plus one general set.

    Args:
        in_feature_matrix: joblib-serialized sparse feature matrix
        in_ind_train: .npy file of training row indices
        in_ind_test: .npy file of test row indices
        in_kwd_indices: keyword indices json
        in_cat_indices: category indices json
        in_kwd_raw2lemma: keyword raw-to-lemma mapping json
        in_cat_raw2lemma: category raw-to-lemma mapping json
        in_config: yaml config supplying min_concept_occurrence
        out_dir: base output directory for the trained model sets
        topics: when True, train one classifier set per category topic
            before the general set
    """
    with open(in_config, "r") as f0:
        config = yaml.safe_load(f0)

    X = joblib.load(in_feature_matrix)
    ind_train = np.load(in_ind_train)
    ind_test = np.load(in_ind_test)

    LOG.info(f"Loading keyword extractor from {in_kwd_indices} and {in_kwd_raw2lemma}.")
    ce = ml.ConceptExtractor()
    ce.from_jsons(in_kwd_indices, in_kwd_raw2lemma)

    LOG.info(
        f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}."
    )
    cat_ext = ml.ConceptExtractor()
    cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma)

    # SGD needs a loss that supports predict_proba, hence loss="log".
    param_grid = {
        "alpha": [0.01, 0.001, 0.0001],
        "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}],
        "max_iter": [5],
        "loss": ["log"],
    }
    grid_search = GridSearchCV(
        SGDClassifier(), param_grid, scoring="f1", n_jobs=-1,
    )
    out_models = out_dir / OUT_MODELS_DIR
    trainer = tr.ConceptTrainer(ce, grid_search)
    doc_topic_indices = cat_ext.concept_index_mapping

    min_occ = config["min_concept_occurrence"]
    if topics:
        LOG.info(
            f"Training one set for each of {len(doc_topic_indices)} topics divisions."
        )
        for topic, doc_topic_index in doc_topic_indices.items():
            trainer.train_concepts(
                X, ind_train, ind_test, out_models, min_occ, topic, doc_topic_index
            )
    LOG.info("Training one general set")
    trainer.train_concepts(X, ind_train, ind_test, out_models, min_occ)
    LOG.info("Complete.")
84 |
85 |
if __name__ == "__main__":
    # CLI entry point: collect paths/flags, optionally raise verbosity,
    # then delegate to main().
    arg_parser = argparse.ArgumentParser(
        description="""Use feature matrix and location of indices to create classifiers
    for the concepts in the corpus."""
    )
    arg_parser.add_argument(
        "in_feature_matrix", help="input scipy sparse matrix of features"
    )
    for pos_name, pos_help in (
        ("in_ind_train", "train set index"),
        ("in_ind_test", "test set index"),
        ("in_kwd_indices", "keyword indicies"),
        ("in_cat_indices", "category indices"),
        ("in_kwd_raw2lemma", "keyword raw to lemma mapping"),
        ("in_cat_raw2lemma", "category raw to lemma mapping"),
        ("in_config", "configuration for creating models"),
    ):
        arg_parser.add_argument(pos_name, help=pos_help)
    arg_parser.add_argument(
        "out_dir",
        help="output directory for vectorizer, feature matrix, and models",
        type=Path,
    )
    arg_parser.add_argument("--topics", dest="topics", action="store_true")
    arg_parser.add_argument("--no-topics", dest="topics", action="store_false")
    arg_parser.set_defaults(topics=True)
    arg_parser.add_argument(
        "-v", "--verbose", help="increase output verbosity", action="store_true"
    )
    cli = arg_parser.parse_args()
    if cli.verbose:
        LOG.info("Changing log level to DEBUG.")
        LOG.setLevel(logging.DEBUG)
        tr.LOG.setLevel(logging.DEBUG)
        LOG.debug("Changed log level to DEBUG.")

    main(
        cli.in_feature_matrix,
        cli.in_ind_train,
        cli.in_ind_test,
        cli.in_kwd_indices,
        cli.in_cat_indices,
        cli.in_kwd_raw2lemma,
        cli.in_cat_raw2lemma,
        cli.in_config,
        cli.out_dir,
        cli.topics,
    )
131 |
--------------------------------------------------------------------------------
/src/get_bert_cat_models_preds.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import linecache
4 | import logging
5 | from pathlib import Path
6 |
7 | import dsbert.dsbert.multilabel as mll
8 | import numpy as np
9 | import pandas as pd
10 | from tqdm import tqdm
11 |
12 | import dsconcept.get_metrics as gm
13 |
14 | logging.basicConfig(level=logging.INFO)
15 | LOG = logging.getLogger(__name__)
16 | LOG.setLevel(logging.INFO)
17 |
18 |
def init_domain_bert(base_model_dir, finetuned_model_dir, map_loc="cpu"):
    """Load the finetuned multilabel BERT classifier and its text processor.

    Args:
        base_model_dir: directory of the base (pretrained) BERT model
        finetuned_model_dir: directory holding the processor state and
            finetuned_pytorch_model.bin
        map_loc: torch map_location for loading the weights (default "cpu")

    Returns:
        (classifier, processor) tuple.
    """
    LOG.info("Loading BERT models")
    text_processor = mll.MultiLabelTextProcessor(finetuned_model_dir)
    bert_clf = mll.BERTMultilabelClassifier(
        text_processor, bert_model=base_model_dir, do_lower_case=False,
    )
    bert_clf.initialize_devices()
    bert_clf.load_model(
        f"{finetuned_model_dir}/finetuned_pytorch_model.bin", map_location=map_loc,
    )
    return bert_clf, text_processor
30 |
31 |
def load_lines_to_df(data_loc, line_inds):
    """Load the given (0-based) line indices of a jsonlines file as a DataFrame."""
    # linecache.getline is 1-based, hence the i + 1.
    rows = [
        json.loads(linecache.getline(str(data_loc), i + 1))
        for i in tqdm(line_inds)
    ]
    return pd.DataFrame(rows)
40 |
41 |
def main(
    data_dir, models_dir, reports_dir, base_model_dir, finetuned_model_dir, sample,
):
    """Run the finetuned BERT category model over the test split and save a
    prediction matrix whose columns align with the hierarchical
    classifier's category order.

    Args:
        data_dir: interim data dir holding test_inds.npy and abs_kwds.jsonl
        models_dir: dir with category/keyword models and cat_raw2lemma.json
        reports_dir: output dir; predictions go to bert_cat_preds.npy
        base_model_dir: base (pretrained) BERT model directory
        finetuned_model_dir: finetuned BERT model/processor directory
        sample: optional int; if not None, only the first `sample` test docs
    """
    test_inds = np.load(data_dir / "test_inds.npy")
    clean_data_loc = data_dir / "abs_kwds.jsonl"

    in_cat_models = models_dir / "categories/models/"
    in_kwd_models = models_dir / "keywords/models/"
    cat_raw2lemma_loc = models_dir / "cat_raw2lemma.json"

    out_preds_loc = reports_dir / "bert_cat_preds.npy"

    LOG.info("Loading models.")
    # The hierarchical classifier is loaded only to obtain clf.categories,
    # which fixes the output column order below.
    cat_clfs = gm.load_category_models(in_cat_models)
    cd = gm.load_concept_models(in_kwd_models)
    clf = gm.HierarchicalClassifier(cat_clfs, cd)
    with open(cat_raw2lemma_loc) as f0:
        cat_raw2lemma = json.load(f0)
    # base_model_dir = str(bert_models_dir / "cased_L-12_H-768_A-12")
    # processor_dir = str(bert_models_dir / "processor_dir")
    # finetuned_model_loc = str(
    #     bert_models_dir / "cased_L-12_H-768_A-12/cache/finetuned_pytorch_model.bin"
    # )
    bert_clf, processor = init_domain_bert(base_model_dir, finetuned_model_dir,)

    LOG.info(f'Loading records from "{clean_data_loc}".')
    if sample is not None:
        lines_to_load = test_inds[0:sample]
    else:
        lines_to_load = test_inds
    records_df = load_lines_to_df(clean_data_loc, lines_to_load)

    LOG.info(f"Processing {len(records_df)} records.")
    df_example = pd.DataFrame()
    # NOTE(review): this column is named "test" yet holds the record text;
    # _create_examples is an opaque private API, so confirm the expected
    # column name is not "text".
    df_example["test"] = records_df["text"]
    df_example["label"] = 0  # dummy label; predictions only
    df_example = df_example.reset_index()
    sample_examples = processor._create_examples(df_example, "test")

    LOG.info("Making BERT category predictions.")
    topic_predictions_df = bert_clf.predict(sample_examples)

    LOG.info("Transforming predictions into matrix which aligns with categories.")
    # Columns 0-1 of the prediction frame are skipped; the remaining
    # columns are per-category scores keyed by raw category names.
    cols = topic_predictions_df.iloc[:, 2:].columns
    only_preds = topic_predictions_df.iloc[:, 2:]
    # Rename raw category columns to lemmatized form where a mapping exists.
    tcols = [cat_raw2lemma[c] if c in cat_raw2lemma else c for c in cols]
    only_preds.columns = tcols
    only_preds = only_preds[clf.categories[0:-1]]  # don't include '' cat
    only_pred_vals = only_preds.values

    LOG.info(f'Saving results to "{out_preds_loc}".')
    np.save(out_preds_loc, only_pred_vals)
94 |
95 |
if __name__ == "__main__":
    # Build the CLI: all flags share the same add_argument shape, so a
    # spec table keeps the definitions compact.
    parser = argparse.ArgumentParser(
        description="Use BERT cat models to get predictions for all test documents"
    )
    arg_specs = [
        ("--data_dir", "interim data dir for given experiment", Path),
        ("--models_dir", "model_dir for experiment", Path),
        ("--reports_dir", "reports dir for experiment", Path),
        ("--base_model_dir", "base bert model dir", str),
        ("--finetuned_model_dir", "dir with classes.txt file and finetuned pytorch model", str),
        ("--sample", "how many to sample from test inds", int),
    ]
    for flag, help_text, arg_type in arg_specs:
        parser.add_argument(flag, help=help_text, type=arg_type)
    args = parser.parse_args()
    main(
        args.data_dir,
        args.models_dir,
        args.reports_dir,
        args.base_model_dir,
        args.finetuned_model_dir,
        args.sample,
    )
123 |
--------------------------------------------------------------------------------
/tests/test_hierarchicalClassifier.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import unittest
3 | from pathlib import Path
4 | from unittest import TestCase
5 |
6 | import h5py
7 | import joblib
8 | import numpy as np
9 | from sklearn.feature_extraction import DictVectorizer
10 | from testfixtures import TempDirectory
11 |
12 | from dsconcept.get_metrics import HierarchicalClassifier, StubBestEstimator
13 |
14 | logging.basicConfig(level=logging.INFO)
15 | LOG = logging.getLogger(__name__)
16 | LOG.setLevel(logging.INFO)
17 |
18 |
class TestHierarchicalClassifier(TestCase):
    """Exercise HierarchicalClassifier with stub estimators, both held in
    memory and loaded from joblib files on disk."""

    def setUp(self) -> None:
        # Temp dir holds a dumped stub classifier and the HDF5 stores.
        self.d = TempDirectory()
        self.clf_loc = Path(self.d.path) / "stub.jbl"
        out_info = {'concept': 'something', 'best_estimator_': StubBestEstimator()}
        joblib.dump(out_info, self.clf_loc)
        cat_clfs = [
            {"best_estimator_": StubBestEstimator(), "concept": "physics"},
            {"best_estimator_": StubBestEstimator(), "concept": "video games"},
        ]
        kwd_clfs = {
            ("physics", "gamma ray"): StubBestEstimator(),
            ("video games", "minecraft"): StubBestEstimator(),
            ("video games", "kerbal space program"): StubBestEstimator(),
            ("", "minecraft"): StubBestEstimator(),
            ("", "gamma ray"): StubBestEstimator(),
            ("", "penguins"): StubBestEstimator(),
        }
        kwd_clfs_locs = {
            ("physics", "gamma ray"): self.clf_loc,
            ("video games", "minecraft"): self.clf_loc,
            ("video games", "kerbal space program"): self.clf_loc,
            ("", "minecraft"): self.clf_loc,
            ("", "gamma ray"): self.clf_loc,
            ("", "penguins"): self.clf_loc,
        }
        self.hclf = HierarchicalClassifier(cat_clfs, kwd_clfs)
        self.hclf_locs = HierarchicalClassifier(cat_clfs, kwd_clfs_locs)
        self.feature_matrix = np.array([[0, 0, 1, 1], [0, 1, 0, 1], [1, 1, 0, 0]])
        v = DictVectorizer()
        d = [{"astronauts": 1, "astronomy": 1}, {"space": 1, "basalt": 1}]
        v.fit(d)
        self.v = v

    def tearDown(self) -> None:
        # FIX: the TempDirectory was never cleaned up, leaking one
        # directory per test method.
        self.d.cleanup()

    def test_cat_clfs(self):
        cats = ["physics", "video games", ""]
        self.assertListEqual(self.hclf.categories, cats)

    def test_kwd_clfs(self):
        kwds = ["gamma ray", "kerbal space program", "minecraft", "penguins"]
        self.assertListEqual(self.hclf.concepts_with_classifiers.tolist(), kwds)

    def test_predict_categories(self):
        cat_preds = self.hclf.predict_categories(self.feature_matrix)
        self.assertEqual(cat_preds.shape, (3, 2))
        print(cat_preds)

    def test__predict_one_clf(self):
        pred = self.hclf._predict_one_clf(self.feature_matrix, 1, "video games")
        self.assertEqual(pred.shape[0], 3)

    def test__predict_one_clf_locs(self):
        pred = self.hclf_locs._predict_one_clf(self.feature_matrix, 1, "video games")
        self.assertEqual(pred.shape[0], 3)

    def test__predict_keywords(self):
        cat_indices = {"physics": [0], "video games": [1, 2]}
        store = self.hclf._predict_keywords(
            self.feature_matrix,
            f"{self.d.path}/store.h5",
            cat_indices,
            only_no_topic=False,
            use_dask=False,
        )
        with h5py.File(store, 'r') as f0:
            pred_array = f0["predictions"][()]
        LOG.info(pred_array)
        self.assertEqual(pred_array.shape, (3, 3, 4))

    def test__predict_keywords_locs(self):
        cat_indices = {"physics": [0], "video games": [1, 2]}
        store = self.hclf_locs._predict_keywords(
            self.feature_matrix,
            f"{self.d.path}/store.h5",
            cat_indices,
            only_no_topic=False,
            use_dask=False,
        )
        with h5py.File(store, 'r') as f0:
            pred_array = f0["predictions"][()]
        LOG.info(pred_array)
        self.assertEqual(pred_array.shape, (3, 3, 4))

    def test_get_synth_preds(self):
        cat_indices = {"physics": [0], "video games": [1, 2]}
        store = self.hclf._predict_keywords(
            self.feature_matrix,
            f"{self.d.path}/store.h5",
            cat_indices,
            only_no_topic=False,
            use_dask=False,
        )
        all_cat_inds = {
            "physics": np.array([0]),
            "video games": np.array([0, 1]),
            "": np.array([0, 1, 2]),
        }
        self.hclf.get_synth_preds(
            store,
            all_cat_inds,
            batch_size=10000,
            only_cat=False,
            synth_strat="mean",
            use_dask=False,
        )
        # FIX: open read-only and index with [()] instead of the
        # Dataset.value attribute (removed in h5py 3.0), matching the
        # other tests in this class.
        with h5py.File(store, 'r') as f0:
            synth_array = f0["synthesis"][()]
        LOG.info(synth_array)
        self.assertEqual(synth_array.shape, (3, 4))

    def test__to_strings(self):
        synth_array = np.array(
            [[0, 0.51, 0.9, 0.2], [0.8, 0.1, 0.4, 0.7], [0.4, 0.2, 0.1, 0.9]]
        )
        kwd_strs = self.hclf._to_strings(
            self.hclf.concepts_with_classifiers, synth_array, t=0.5
        )
        results = [
            [("minecraft", 0.9), ("kerbal space program", 0.51)],
            [("gamma ray", 0.8), ("penguins", 0.7)],
            [("penguins", 0.9)],
        ]
        self.assertEqual(results, kwd_strs)
        LOG.info(kwd_strs)

    def test_predict(self):
        examples = [
            "Olympus Mons is the largest volcano in the solar system",
            "Database management is critical for information retrieval",
            "We used a logistic regression with batched stochastic gradient descent.",
        ]
        weights = {"NOUN": 1, "PROPN": 1, "ENT": 1, "NOUN_CHUNK": 1, "ACRONYM": 1}
        self.hclf.vectorizer = self.v
        features, feature_matrix = self.hclf.vectorize(examples, weights)
        self.hclf.predict(feature_matrix)
154 |
155 |
# Allow running this test module directly, outside a test runner.
if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

# Expose both the package root and the dsconcept package so autodoc can
# import the documented modules.
sys.path.insert(0, os.path.abspath("../src"))
sys.path.insert(0, os.path.abspath("../src/dsconcept"))

# -- Project information -----------------------------------------------------

project = "Research Access"
copyright = "2018, Anthony Buonomo"
author = "Anthony Buonomo"

# The short X.Y version
version = ""
# The full version, including alpha/beta/rc tags
# NOTE(review): release appears to be maintained by hand — confirm it is
# kept in sync with the package version.
release = "3.0.0"


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.todo",
    "sphinx.ext.napoleon",
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"

# The master toctree document.
master_doc = "index"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
# NOTE(review): Sphinx >= 5 warns on language=None and falls back to "en";
# confirm against the pinned Sphinx version.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "alabaster"

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = "ResearchAccessdoc"


# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        master_doc,
        "ResearchAccess.tex",
        "Research Access Documentation",
        "Anthony Buonomo",
        "manual",
    ),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, "researchaccess", "Research Access Documentation", [author], 1)
]


# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (
        master_doc,
        "ResearchAccess",
        "Research Access Documentation",
        author,
        "ResearchAccess",
        "One line description of project.",
        "Miscellaneous",
    ),
]


# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project

# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ["search.html"]


# -- Extension configuration -------------------------------------------------

# -- Options for todo extension ----------------------------------------------

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True


# -- User config
# Document members in source order rather than alphabetically.
autodoc_member_order = "bysource"
197 |
--------------------------------------------------------------------------------
/src/make_plots.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from math import ceil
4 | from pathlib import Path
5 | from time import time
6 |
7 | import joblib
8 | import matplotlib.pyplot as plt
9 | import numpy as np
10 | import pandas as pd
11 | import yaml
12 | from scipy.stats import linregress
13 | from tqdm import tqdm
14 |
15 | import dsconcept.get_metrics as gm
16 | from dsconcept.get_metrics import get_keyword_results
17 |
18 | logging.basicConfig(level=logging.INFO)
19 | LOG = logging.getLogger(__name__)
20 | LOG.setLevel(logging.INFO)
21 |
22 |
def lim_concepts_and_plot(mean_df, tmp_df, fig_dir):
    """Plot metric histograms comparing one-layer scores against mean scores.

    Restricts both frames to their shared concepts, then for each metric
    saves an overlaid histogram png into ``fig_dir``.

    :param mean_df: dataframe of mean-synthesis keyword results
    :param tmp_df: dataframe of one-layer keyword results for one category
    :param fig_dir: directory where the per-metric png files are written
    :raises ValueError: if the concept-limited frames differ in size
    """
    LOG.info(f"tmp_df.shape={tmp_df.shape}")
    cat = tmp_df["category"].iloc[0]
    lim_mean_df = mean_df[np.in1d(mean_df["concept"], tmp_df["concept"])]
    lim_tmp_df = tmp_df[np.in1d(tmp_df["concept"], mean_df["concept"])]
    if lim_mean_df.shape[0] != lim_tmp_df.shape[0]:
        # BUG FIX: the exception was previously constructed but never raised,
        # silently ignoring the size mismatch.
        raise ValueError("Different df sizes")
    metrics = ["recall", "precision", "f1", "roc_auc"]
    for m in metrics:
        a = 0.3
        lim_tmp_df[m].hist(alpha=a, label=f"one_layer | cat={cat}")
        lim_mean_df[m].hist(alpha=a, label="mean")
        plt.legend()
        plt.title(m)
        fig_loc = fig_dir / f"{m}.png"
        LOG.info(f"Saving plot to {fig_loc}")
        plt.savefig(fig_loc)
        plt.clf()
41 |
42 |
def load_classifier(in_cat_models, in_kwd_models, in_vectorizer):
    """Assemble a HierarchicalClassifier from serialized models on disk.

    :param in_cat_models: dir of joblib-dumped category classifiers
    :param in_kwd_models: dir of per-topic subdirectories of keyword
        classifiers; subdir names are split on '_' to recover the topic
    :param in_vectorizer: location of the fitted vectorizer to attach
    :return: a gm.HierarchicalClassifier with its vectorizer loaded
    """
    LOG.info(f"Loading category classifiers from {in_cat_models}.")
    cat_clfs = [joblib.load(loc) for loc in tqdm(list(in_cat_models.iterdir()))]

    LOG.info(f"Loading keyword classifiers from {in_kwd_models}.")
    cd = {}  # expects no_topics with suffix ''
    for topic_dir in tqdm(in_kwd_models.iterdir()):
        # depends on opinionated path format
        topic_name = topic_dir.stem.split("_")[1]
        for clf_loc in topic_dir.iterdir():
            record = joblib.load(clf_loc)
            cd[topic_name, record["concept"]] = record["best_estimator_"]

    hclf = gm.HierarchicalClassifier(cat_clfs, cd)
    hclf.load_vectorizer(in_vectorizer)
    return hclf
61 |
62 |
def get_clf_times(hclf, small_res, weights, sizes):
    """Time keyword prediction over increasing sample sizes.

    Runs both hierarchical strategies ("topics" and "only_no_topic") and
    records wall-clock tagging time per sample size.

    :param hclf: a HierarchicalClassifier with a loaded vectorizer
    :param small_res: dataframe with a "text" column to sample from
    :param weights: feature-weighting dict passed to hclf.vectorize
    :param sizes: iterable of sample sizes to time; sizes larger than the
        data are skipped with a warning
    :return: list of dicts with keys "strat", "times", "sizes"
    :raises ValueError: if an unknown strategy name is encountered
    """
    hl_strats = ["topics", "only_no_topic"]
    batch_size = 10_000_000  # TODO: remove batching
    hl_dicts = []

    for hls in hl_strats:
        # BUG FIX: map the strategy to its flag once, up front. Previously
        # an unknown strategy was only logged inside the batch loop, after
        # which the undefined `no_categories` raised a NameError.
        if hls == "only_no_topic":
            no_categories = True
        elif hls == "topics":
            no_categories = False
        else:
            raise ValueError(f"Invalid strategy selection: {hls}")
        times = []
        out_sizes = []
        for s_size in sizes:
            if s_size > small_res.shape[0]:
                LOG.warning(f"Skipping {s_size} because it is greater than data size.")
                continue
            examples = small_res["text"].sample(s_size)
            n_splits = ceil(examples.shape[0] / batch_size)
            t1 = time()
            for n in tqdm(range(n_splits)):
                start = n * batch_size
                end = (n + 1) * batch_size
                example_batch = examples[start:end]
                _, feature_matrix = hclf.vectorize(example_batch, weights)
                LOG.info("Predicting keywords")
                _, _ = hclf.predict(feature_matrix, 0.5, 0.5, no_categories)
            t2 = time()
            times.append(t2 - t1)
            out_sizes.append(s_size)
        hld = {
            "strat": hls,
            "times": times,
            "sizes": out_sizes,
        }
        hl_dicts.append(hld)
    return hl_dicts
102 |
103 |
def make_time_plots(hl_dicts, out_plot_file):
    """Plot tagging time and tagging rate against batch size.

    :param hl_dicts: list of dicts with "strat", "times", and "sizes" keys
    :param out_plot_file: path where the two-panel png is written
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    time_ax, rate_ax = axes[0], axes[1]
    for entry in hl_dicts:
        sizes = entry["sizes"]
        times_ = entry["times"]
        fit = linregress(sizes, times_)
        docs_per_sec = [s / t for s, t in zip(sizes, times_)]
        size_arr = np.array(sizes)
        time_ax.plot(sizes, times_, marker="o", label=entry["strat"])
        # Overlay the linear fit for the time-vs-size curve.
        time_ax.plot(sizes, fit.slope * size_arr + fit.intercept, "r", alpha=0.5)
        time_ax.set_xlabel("number of docs")
        time_ax.set_ylabel("time to tag (seconds)")
        time_ax.set_title("Time to tag depending on batch size")
        rate_ax.plot(sizes, docs_per_sec, marker="o", label=entry["strat"])
        rate_ax.set_xlabel("number of docs")
        rate_ax.set_ylabel("tagging rate (docs/seconds)")
        rate_ax.set_title("Tagging rate depending on batch size")
    time_ax.legend()
    rate_ax.legend()
    plt.savefig(out_plot_file)
    plt.clf()
123 |
124 |
def main(
    in_mean,
    in_cats_dir,
    in_kwds_dir,
    in_vectorizer,
    in_clean_data,
    in_config,
    out_plots_dir,
):
    """Create metric histograms and tagging-time plots for an experiment.

    :param in_mean: csv of mean-synthesis keyword results
    :param in_cats_dir: dir of category classifiers
    :param in_kwds_dir: dir of keyword classifier subdirectories
    :param in_vectorizer: fitted vectorizer location
    :param in_clean_data: jsonl corpus of cleaned records
    :param in_config: yaml config providing the feature "weights"
    :param out_plots_dir: output dir for the png plots
    :raises ValueError: if no keyword results are found under in_kwds_dir
    """
    LOG.info("Loading dataframes.")
    mean_df = pd.read_csv(in_mean, index_col=0)
    no_synth_df = get_keyword_results(in_kwds_dir)
    if no_synth_df.shape[0] == 0:
        raise ValueError(
            f"No keyword results. Are the subdirectories of {in_kwds_dir} empty?"
        )
    no_cat_df = no_synth_df[no_synth_df["category"] == ""]
    LOG.info("Making plots.")
    lim_concepts_and_plot(mean_df, no_cat_df, out_plots_dir)

    with open(in_config) as f0:
        config = yaml.safe_load(f0)
    hclf = load_classifier(in_cats_dir, in_kwds_dir, in_vectorizer)
    # PERF FIX: the corpus was previously parsed from jsonl twice (once for
    # its size, once for sampling); read it once and sample from that frame.
    full_corpus = pd.read_json(in_clean_data, orient="records", lines=True)
    sizes = [1, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10_000]
    sample_size = min(max(sizes), full_corpus.shape[0])
    small_res = full_corpus.sample(sample_size)

    hl_dicts = get_clf_times(hclf, small_res, config["weights"], sizes)

    out_plots_time = out_plots_dir / "time_v_batch_size.png"
    make_time_plots(hl_dicts, out_plot_file=out_plots_time)
160 |
161 |
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""From output metrics, create plots for ROC-AUC, F1, precision,
        and recall"""
    )
    # --mean is the only argument left as a plain string; all the rest
    # are filesystem paths and share the same add_argument shape.
    parser.add_argument("--mean", help="results from synthesis with max strategy")
    for flag, help_text in [
        ("--in_cats_dir", "category classifiers dir"),
        ("--in_kwds_dir", "kwds classifier models dir"),
        ("--in_vectorizer", "vectorizer location"),
        ("--in_clean_data", "clean code location"),
        ("--in_config", "config location"),
        ("--out_plots_dir", "output dir for plots pngs"),
    ]:
        parser.add_argument(flag, help=help_text, type=Path)
    args = parser.parse_args()
    main(
        args.mean,
        args.in_cats_dir,
        args.in_kwds_dir,
        args.in_vectorizer,
        args.in_clean_data,
        args.in_config,
        args.out_plots_dir,
    )
184 |
--------------------------------------------------------------------------------
/src/pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | """
2 | Pipeline
3 | --------
4 | Program to make classifiers from input corpus and selected keyword field.
5 |
6 | Author: Anthony Buonomo
7 | Contact: anthony.r.buonomo@nasa.gov
8 |
9 | Full opinionated pipeline from processing, to topic_modeling, to training classifiers.
10 | """
11 |
12 | import logging
13 | import warnings
14 | from pathlib import Path
15 |
16 | import plac
17 | import yaml
18 | from sklearn.decomposition import LatentDirichletAllocation
19 | import joblib
20 | from sklearn.feature_extraction import DictVectorizer
21 | from sklearn.linear_model import SGDClassifier
22 | from sklearn.model_selection import GridSearchCV
23 |
24 | import dsconcept.model as ml
25 | from dsconcept.train import ConceptTrainer
26 |
27 | warnings.filterwarnings("ignore", category=FutureWarning)
28 | logging.basicConfig(level=logging.INFO)
29 | LOG = logging.getLogger(__name__)
30 | LOG.setLevel(logging.INFO)
31 |
# Artifact file names shared across pipeline stages (each is joined onto
# the relevant stage's output directory).
FEATURES = Path("features.jsonl")
INDICES = Path("indices.json")
RAW2LEMMA = Path("raw2lemma.json")

# Topic-modeling stage artifacts.
TOPIC_VECTORIZER = Path("vectorizer.pkl")
TOPIC_FEATURE_MATRIX = Path("doc_feature_matrix.pkl")
TOPIC_MODEL = Path("model.pkl")
DOC_TOPIC_DISTR = Path("doc_topic_distr.pkl")

# Classifier-training stage artifacts.
VECTORIZER = Path("vectorizer.pkl")
FEATURE_MATRIX = Path("doc_feature_matrix.pkl")
OUT_MODELS_DIR = Path("classifiers")
44 |
45 |
def process(
    in_corpus, out_dir, abstract_field, concept_field, term_types, batch_size, n_threads
):
    """Extract features and concepts from the corpus into ``out_dir``.

    Writes the FEATURES jsonlines file plus the concept INDICES and
    RAW2LEMMA jsons.

    :return: the fitted (FeatureExtractor, ConceptExtractor) pair
    """
    out_dir.mkdir(exist_ok=True, parents=True)

    feature_extractor = ml.FeatureExtractor()
    feature_extractor.from_corpus_to_jsonlines(
        in_corpus, out_dir / FEATURES, abstract_field, term_types, batch_size, n_threads
    )

    concept_extractor = ml.ConceptExtractor()
    concept_extractor.from_corpus(in_corpus, concept_field)
    concept_extractor.to_jsons(out_dir / INDICES, out_dir / RAW2LEMMA)

    return feature_extractor, concept_extractor
64 |
65 |
def topic_model(
    topic_model_dir, processed_dir, topic_weights, min_feature, max_feature, num_topics=3
):
    """Fit an LDA topic model over weighted, limited corpus features.

    Saves the vectorizer, feature matrix, model, and doc-topic distribution
    into ``topic_model_dir``.

    :param topic_model_dir: output dir for topic-model artifacts
    :param processed_dir: dir containing the FEATURES jsonlines file
    :param topic_weights: term-type weighting dict
    :param min_feature: minimum feature occurrence to keep
    :param max_feature: maximum feature occurrence to keep
    :param num_topics: number of LDA components. Previously hard-coded to 3
        even though the config parses ``number_of_topics``; parameterized
        (default 3) so callers can honor the config value.
    :return: the document-topic distribution matrix
    """
    topic_model_dir.mkdir(exist_ok=True)
    tfe = ml.FeatureExtractor()
    tfe.from_jsonlines(processed_dir / FEATURES)

    topic_weighted_features = tfe.weight_terms(topic_weights)
    topic_limited_features = tfe.limit_features(
        topic_weighted_features, min_feature, max_feature
    )

    topic_v = DictVectorizer()
    topic_X = topic_v.fit_transform(topic_limited_features)

    model = LatentDirichletAllocation(
        n_components=num_topics,
        max_iter=5,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    )
    doc_topic_distr = model.fit_transform(topic_X)

    joblib.dump(topic_v, topic_model_dir / TOPIC_VECTORIZER)
    joblib.dump(topic_X, topic_model_dir / TOPIC_FEATURE_MATRIX)
    joblib.dump(model, topic_model_dir / TOPIC_MODEL)
    joblib.dump(doc_topic_distr, topic_model_dir / DOC_TOPIC_DISTR)

    return doc_topic_distr
101 |
102 |
def train(
    out_dir,
    process_dir,
    fe,
    ce,
    weights,
    min_feature,
    max_feature,
    min_concept_occurrence,
    doc_topic_distr,
):
    """Vectorize the processed features and train one classifier per concept.

    Persists the vectorizer and feature matrix into ``out_dir`` and writes
    trained classifiers under the OUT_MODELS_DIR subdirectory.

    :return: path to the directory containing the trained models
    """
    out_dir.mkdir(exist_ok=True)
    fe.from_jsonlines(process_dir / FEATURES)
    weighted = fe.weight_terms(weights)
    limited = fe.limit_features(weighted, min_feature, max_feature)
    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(limited)

    joblib.dump(vectorizer, out_dir / VECTORIZER)
    joblib.dump(X, out_dir / FEATURE_MATRIX)

    paramgrid = {
        "alpha": [0.01, 0.001, 0.0001],
        "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}],
        "max_iter": [1],
        "loss": ["log"],
    }  # requires loss function with predict_proba
    grid_clf = GridSearchCV(
        SGDClassifier(), paramgrid, scoring="f1"
    )  # requires GridSearchCV
    out_models = out_dir / OUT_MODELS_DIR
    trainer = ConceptTrainer(fe, ce, grid_clf, out_models)
    trainer.train_all(
        X, out_models, min_concept_occurrence, doc_topic_distr=doc_topic_distr
    )
    return out_models
142 |
143 |
def parse_config(in_config):
    """Parse the pipeline yaml config into a flat tuple of settings.

    :param in_config: path to the yaml configuration file
    :return: tuple of (abstract_field, concept_field, term_types,
        topic_weights, topic_min_feature, topic_max_feature, num_topics,
        weights, min_feature, max_feature, min_concept)
    """
    with open(in_config, "r") as f0:
        cfg = yaml.safe_load(f0)

    term_types = cfg["process"]["term_types"]
    abstract_field = cfg["process"]["abstract_field"]
    concept_field = cfg["process"]["concept_field"]

    topic_weights = cfg["topic_model"]["weights"]
    topic_min_feature = cfg["topic_model"]["min_feature_occurrence"]
    topic_max_feature = cfg["topic_model"]["max_feature_occurrence"]
    num_topics = cfg["topic_model"]["number_of_topics"]

    weights = cfg["train_classifiers"]["weights"]
    # BUG FIX: min_feature previously read "max_feature_occurrence"
    # (copy-paste), silently making the min and max limits identical.
    min_feature = cfg["train_classifiers"]["min_feature_occurrence"]
    max_feature = cfg["train_classifiers"]["max_feature_occurrence"]
    min_concept = cfg["train_classifiers"]["min_concept_occurrence"]

    return (
        abstract_field,
        concept_field,
        term_types,
        topic_weights,
        topic_min_feature,
        topic_max_feature,
        num_topics,
        weights,
        min_feature,
        max_feature,
        min_concept,
    )
175 |
176 |
@plac.annotations(
    in_corpus=plac.Annotation("path to json-formatted corpus", "positional", type=Path),
    config=plac.Annotation("path to configuration yaml file", "positional", type=Path),
    process_dir=plac.Annotation(
        "path to dir where you want to store processed corpus data",
        "positional",
        type=Path,
    ),
    topic_model_dir=plac.Annotation(
        "path to dir where you want to store topic_modeling data",
        "positional",
        type=Path,
    ),
    classify_dir=plac.Annotation(
        "path to dir where you want to store classifying data", "positional", type=Path
    ),
    batch_size=plac.Annotation(
        "size of batches to process in processing phase of pipeline", "option", type=int
    ),
    n_threads=plac.Annotation(
        "number of threads to use in processing phase of pipeline", "option", type=int
    ),
)
def main(
    in_corpus,
    config,
    process_dir,
    topic_model_dir,
    classify_dir,
    batch_size=10,
    n_threads=1,
):
    """Run the full pipeline: process the corpus, fit the topic model,
    then train per-concept classifiers, writing artifacts into the three
    given directories."""

    (
        abstract_field,
        concept_field,
        term_types,
        topic_weights,
        topic_min_feature,
        topic_max_feature,
        num_topics,
        weights,
        min_feature,
        max_feature,
        min_concept,
    ) = parse_config(config)
    # NOTE(review): num_topics is parsed above but never forwarded to
    # topic_model (which uses its own component count) — confirm intent.

    fe, ce = process(
        in_corpus,
        process_dir,
        abstract_field,
        concept_field,
        term_types,
        batch_size,
        n_threads,
    )
    doc_topic_distr = topic_model(
        topic_model_dir,
        process_dir,
        topic_weights,
        topic_min_feature,
        topic_max_feature,
    )
    train(
        classify_dir,
        process_dir,
        fe,
        ce,
        weights,
        min_feature,
        max_feature,
        min_concept,
        doc_topic_distr,
    )
    LOG.info("SUCCESS!")
252 |
253 |
if __name__ == "__main__":
    # plac builds the CLI from main's annotations and invokes it.
    plac.call(main)
256 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Concept Tagging Training
2 |
3 | This software enables the creation of concept classifiers, to be utilized by an
4 | accompanying [service](https://github.com/nasa/concept-tagging-api). If you don't have your own data to train, you can use the pretrained models described here. This project was written about [here](https://strategy.data.gov/proof-points/2019/05/28/improving-data-access-and-data-management-artificial-intelligence-generated-metadata-tags-at-nasa/) for the Federal Data Strategy Incubator Project.
5 |
6 | ### What is Concept Tagging
By concept tagging, we mean you can supply text, for example: `Volcanic activity, or volcanism, has played a significant role in the geologic evolution of Mars. Scientists have known since the Mariner 9 mission in 1972 that volcanic features cover large portions of the Martian surface.` and get back predicted keywords, like `volcanology, mars surface, and structural properties`, as well as topics like `space sciences, geosciences`, from a standardized list of several thousand NASA concepts with a probability score for each prediction.
8 |
9 | ## Requirements
10 |
11 | You can see a list of options for this project by navigating to the root of the project and executing `make` or `make help`.
12 |
13 | This project requires:
14 | * [docker](https://docs.docker.com/install/) -- [tested with this version](docker-versions.txt)
15 | * [GNU Make](https://www.gnu.org/software/make/) -- tested with 3.81 built for i386-apple-darwin11.3.0
16 |
17 | ## Index:
18 | 1. [installation](#installation)
19 | 2. [how to run](#how-to-run)
20 | 3. [managing experiments](#managing-experiments)
21 | 4. [advanced usage](#advanced-usage)
22 |
23 | ## installation
24 | You have several options for installing and using the pipeline.
25 | 1) [pull existing docker image](#pull-existing-docker-image)
26 | 2) [build docker image from source](#build-docker-image-from-source)
27 | 3) [install in python virtual environment](#install-in-python-virtual-environment)
28 |
29 | ### pull existing docker image
30 | You can just pull a stable docker image which has already been made:
31 | ```bash
32 | docker pull storage.analytics.nasa.gov/abuonomo/concept_trainer:stable
33 | ```
34 | In order to do this, you must be on the NASA network and able to connect to the docker registry.
35 | \* There are several versions of the images. You can see them [here](https://storage.analytics.nasa.gov/repository/abuonomo/rat_trainer).
36 | If you don't use "stable", some or all of this guide may not work properly.
37 |
38 |
39 | ### build docker image from source
40 | To build from source, first clone this repository and go to its root.
41 |
42 | Then build the docker image using:
43 | ```bash
44 | docker build -t concept_trainer:example .
45 | ```
46 | Substitute `concept_trainer:example` for whatever name you would like. Keep this image name in mind. It will be used elsewhere.
47 |
48 | \* If you are actively developing this project, you should look at the `make build` in [Makefile](Makefile). This command automatically tags the image with the current commit url and most recent git tag. The command requires that [setuptools-scm](https://pypi.org/project/setuptools-scm/) is installed.
49 |
50 | ### install in python virtual environment
51 | \* tested with python3.7
52 | First, clone this repository.
53 | Then create and activate a virtual environment. For example, using [venv](https://docs.python.org/3/library/venv.html):
54 | ```bash
55 | python -m venv my_env
56 | source my_env/bin/activate
57 | ```
58 | Next, while in the root of this project, run `make requirements`.
59 |
60 |
61 | ## how to run
62 | The pipeline takes input document metadata structured like [this](data/raw/STI_public_metadata_records_sample100.jsonl) and a config file like [this](config/test_config.yml). The pipeline produces interim data, models, and reports.
63 |
64 | 1. [using docker](#using-docker) -- if you pulled or built the image
65 | 2. [using python in virtual environment](#using-python-in-virtual-environment) -- if you are running in a local virtual environment
66 |
67 | ### using docker
68 | First, make sure the `config`, `data`, `data/raw`, `data/interim`, `models`, and `reports` directories exist. If they do not, create them (`mkdir -p config data/raw data/interim models reports`). These directories will be used as docker mounted volumes. If you don't make these directories beforehand, they will be created by docker later on, but their permissions will be unnecessarily restrictive.
69 |
70 | Next, make sure you have your input data in the `data/raw/` directory. [Here](data/raw/STI_public_metadata_records_sample100.jsonl) is an example file with the proper structure. You also need to make sure the `subj_mapping.json` file [here](data/interim/subj_mapping.json) is in `data/interim/` directory.
71 |
72 | Now, make sure you have a config file in the `config` directory. [Here](config/test_config.yml) is an example config which will work with the above example file.
73 |
74 | With these files in place, you can run the full pipeline with this command:
75 | ```bash
76 | docker run -it \
77 | -v $(pwd)/data:/home/data \
78 | -v $(pwd)/models:/home/models \
79 | -v $(pwd)/config:/home/config \
80 | -v $(pwd)/reports:/home/reports \
81 | concept_trainer:example pipeline \
82 | EXPERIMENT_NAME=my_test_experiment \
83 | IN_CORPUS=data/raw/STI_public_metadata_records_sample100.jsonl \
84 | IN_CONFIG=config/test_config.yml
85 | ```
86 | Substitute `concept_trainer:example` with the name of your docker image.
87 | You can set the `EXPERIMENT_NAME` to whatever you prefer.
88 | `IN_CORPUS` and `IN_CONFIG` should be set to the paths to the corpus and to the configuration file, respectively.
89 |
90 | \* Developers can also use the `container` command in the [Makefile](Makefile). Note that this command requires [setuptools-scm](https://pypi.org/project/setuptools-scm/). Note that this command will use the image defined by the `IMAGE_NAME` variable and version number equivalent to the most recent git tag.
91 |
92 |
93 | ### using python in virtual environment
94 |
95 | Assuming you have cloned this repository, files for testing the pipeline should be in place. In particular, `data/raw/STI_public_metadata_records_sample100.jsonl` and `config/test_config.yml` should both exist. Additionally, you should add the `src` directory to your `PYTHONPATH`:
96 | ```
97 | export PYTHONPATH=$PYTHONPATH:$(pwd)/src/
98 | ```
99 |
100 | Then, you can run a test of the pipeline with:
101 | ```
102 | make pipeline \
103 | EXPERIMENT_NAME=test \
104 | IN_CORPUS=data/raw/STI_public_metadata_records_sample100.jsonl \
105 | IN_CONFIG=config/test_config.yml
106 | ```
107 | If you are not using the default values, simply substitute the proper paths for `IN_CORPUS` and `IN_CONFIG`. Choose whatever name you prefer for `EXPERIMENT_NAME`.
108 |
109 | ## managing experiments
110 |
111 | If you have access to the `hq-ocio-ci-bigdata` moderate s3 bucket, you can sync local experiments with those in the s3 bucket.
112 |
113 | For example, if you created a local experiment with `EXPERIMENT_NAME=my_cool_experiment`, you can upload your local results to the appropriate place on the s3 bucket with:
114 | ```bash
115 | make sync_experiment_to_s3 EXPERIMENT_NAME=my_cool_experiment PROFILE=my_aws_profile
116 | ```
117 | where `my_aws_profile` is the name of your awscli profile which has access to the given bucket.
118 |
119 | Afterwards, you can download the experiment interim files and results with:
120 | ```bash
121 | make sync_experiment_from_s3 EXPERIMENT_NAME=my_cool_experiment PROFILE=my_aws_profile
122 | ```
123 | ## use full sti metadata records
124 | If you have access to the moderate bucket and you want to work with the full STI metadata records, you can download them to the `data/raw` folder with:
125 | ```bash
126 | make sync_raw_data_from_s3 PROFILE=my_aws_profile
127 | ```
128 | When using these data, you will want to use a config file which is different from the test config file. You can browse previous experiments at `s3://hq-ocio-ci-bigdata/home/DataSquad/classifier_scripts/` to see example config files. You might try:
129 | ```yaml
130 | weights: # assign weights for term types specified in process section
131 | NOUN: 1
132 | PROPN: 1
133 | NOUN_CHUNK: 1
134 | ENT: 1
135 | ACRONYM: 1
136 | min_feature_occurrence: 100
137 | max_feature_occurrence: 0.6
138 | min_concept_occurrence: 500
139 | ```
140 | See [config/test_config.yml](config/test_config.yml) for details on these parameters.
141 |
142 | ## advanced usage
143 | For more advanced usage of the project, look at the [Makefile](Makefile) commands and their associated scripts. You can learn more about these python scripts by running them with their help flags. For example, you can run `python src/make_cat_models.py -h`.
144 |
145 |
--------------------------------------------------------------------------------
/src/dsconcept/train.py:
--------------------------------------------------------------------------------
1 | """
2 | Train
3 | -----
4 | Program to make classifiers from input corpus and selected keyword field.
5 |
6 | author: Anthony Buonomo
7 | contact: anthony.r.buonomo@nasa.gov
8 |
9 | """
10 | import logging
11 | from pathlib import Path
12 | import time
13 | from math import ceil
14 |
15 | from sklearn.model_selection import train_test_split
16 | from sklearn.exceptions import UndefinedMetricWarning
17 | from sklearn.metrics import (
18 | accuracy_score,
19 | roc_auc_score,
20 | f1_score,
21 | precision_score,
22 | recall_score,
23 | )
24 | import joblib
25 | import numpy as np
26 | import warnings
27 | from tqdm import tqdm
28 |
# Silence known-noisy warning categories emitted during repeated model fits.
# UndefinedMetricWarning fires when a metric is computed on a split with only
# one class present.
# NOTE(review): the final filter on `Warning` suppresses *all* warnings —
# confirm this blanket suppression is intentional.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=Warning)
logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)
36 |
37 |
def get_dispersed_subset(array, subset_size):
    """
    Select a subset of elements spread evenly across the input array.

    The picks are the first element of each of ``subset_size - 1`` roughly
    equal chunks of the array, plus the final element, giving ``subset_size``
    values distributed across the whole range. Note that the result is cast
    to float16, as shown in the example.

    Args:
        array (numpy.ndarray): array from which to extract the subset
        subset_size (int): the number of elements to extract from array

    Returns:
        numpy.ndarray: the dispersed subset (float16), or the original input
            unchanged when it already has at most ``subset_size`` elements

    Examples:
        >>> from dsconcept.train import get_dispersed_subset
        >>> l1 = list(range(100))
        >>> l2 = get_dispersed_subset(l1, 10)
        >>> l2
        array([ 0., 12., 23., 34., 45., 56., 67., 78., 89., 99.], dtype=float16)
    """
    if len(array) <= subset_size:
        return array
    # First element of each chunk covers the range evenly; appending the
    # final element guarantees the endpoint is always included.
    chunks = np.array_split(array, subset_size - 1)
    picks = [chunk[0] for chunk in chunks]
    picks.append(array[-1])
    return np.array(picks, dtype=np.float16)
66 |
67 |
# TODO: refactor to remove need for this function
def path_append(in_path, addition):
    """Return *in_path* with *addition* inserted between its stem and suffix."""
    return in_path.parent / f"{in_path.stem}{addition}{in_path.suffix}"
72 |
73 |
def topic_path_format(out_classifier_dir, topic):
    """Append *topic* to the directory's stem (via path_append) and create
    the resulting directory if it does not already exist.

    A ``topic`` of None contributes nothing to the name, so the general
    (non-topic) set keeps the original directory name.
    """
    stem_suffix = "" if topic is None else topic
    topic_dir = path_append(out_classifier_dir, stem_suffix)  # appends to stem
    if not topic_dir.exists():
        topic_dir.mkdir(parents=True)
    return topic_dir
83 |
84 |
class ConceptTrainer:
    """Trains one binary classifier per concept, optionally one set per topic division."""

    def __init__(self, concept_extractor, classifier):
        """
        Initialize object for training of classifiers based on given corpus extractors.

        Args:
            concept_extractor (dsconcept.model.ConceptExtractor): ConceptExtractor (with concepts already loaded)
                for which to create classifiers
            classifier (sklearn.GridSearchCV): the classifier algorithm to use (wrapped in sklearn GridSearchCV)

        """
        self.concept_extractor = concept_extractor
        self.classifier = classifier

    def train_all(
        self,
        doc_feature_matrix,
        out_classifier_dir,
        min_concept_freq,
        doc_topic_distr=None,
    ):
        """
        Train classifiers for each concept for each topic (if topic distributions are provided).

        Args:
            doc_feature_matrix (scipy.sparse.csr.csr_matrix): document feature matrix
            out_classifier_dir (pathlib.Path): output path for classifiers
            min_concept_freq (int): minimum frequency for concepts in corpus in order
                for their corresponding classifiers to be made
            doc_topic_distr (numpy.ndarray): topic distributions for each doc in training set

        Returns:
            out_classifier_dir (pathlib.Path): output path for classifiers

        """
        # Assign each document to its single most probable topic (row argmax);
        # the documents sharing a topic form one "topic division".
        doc_topic_indices = {}
        if doc_topic_distr is not None:
            for topic in range(
                doc_topic_distr.shape[1]
            ):  # cols of distr matrix ~ topics
                doc_topic_indices[topic] = [
                    i
                    for i, distr in enumerate(doc_topic_distr)
                    if distr.argmax() == topic
                ]
        # Fixed 90/10 split with a pinned seed so every concept classifier sees
        # the same partition. Only the row indices are kept; matrices are
        # re-sliced per topic inside train_concepts.
        _, _, ind_train, ind_test = train_test_split(
            doc_feature_matrix,
            np.array(range(doc_feature_matrix.shape[0])),
            test_size=0.10,
            random_state=42,
        )
        # Saved one level above the classifier dir so downstream steps can
        # reuse the exact same train/test partition.
        np.save(out_classifier_dir.parent / f"train_inds.npy", ind_train)
        np.save(out_classifier_dir.parent / f"test_inds.npy", ind_test)

        LOG.info(
            f"Training one general set, and one set for each of {len(doc_topic_indices)} topics divisions."
        )
        # One classifier set per topic division...
        for topic, doc_topic_index in doc_topic_indices.items():
            self.train_concepts(
                doc_feature_matrix,
                ind_train,
                ind_test,
                out_classifier_dir,
                min_concept_freq,
                topic,
                doc_topic_index,
            )
        # ...plus one "general" set trained on all documents (topic=None).
        self.train_concepts(
            doc_feature_matrix,
            ind_train,
            ind_test,
            out_classifier_dir,
            min_concept_freq,
        )
        return out_classifier_dir

    def train_concepts(
        self,
        doc_feature_matrix,
        ind_train,
        ind_test,
        out_classifier_dir,
        min_concept_freq,
        topic=None,
        doc_topic_index=None,
        scale_threshold=False,
    ):
        """
        Create classifiers for group of concepts.

        Args:
            doc_feature_matrix (scipy.sparse.csr.csr_matrix): document feature matrix
            ind_train (list of int): indices for training partition
            ind_test (list of int): indices for testing partition
            out_classifier_dir (pathlib.Path): path to directory where classifiers will be dumped.
            min_concept_freq (int): minimum frequency for concepts in corpus in order
                for their corresponding classifiers to be made
            topic (int | None): the topic (if any) from which to select training data for classifiers
            doc_topic_index (lists): mapping from given topic to document indices
                for which that topic has the highest probability
            scale_threshold (bool): if True, scale min_concept_freq by the fraction
                of the corpus belonging to this topic division (defaults to False)

        Returns:
            out_classifier_dir (pathlib.Path): directory where classifiers have been placed

        """

        LOG.info(f"Queuing classifier job for topic {topic}.")
        t1 = time.time()
        # Appends the topic id to the directory stem, creating it if needed.
        out_classifier_dir = topic_path_format(out_classifier_dir, topic)

        LOG.info("Getting indices for training and testing.")
        if doc_topic_index is not None:
            # Restrict the global train/test split to docs in this topic division.
            train_inds = list(set(ind_train).intersection(doc_topic_index))
            test_inds = list(set(ind_test).intersection(doc_topic_index))
        else:
            train_inds = ind_train
            test_inds = ind_test

        # NOTE(review): rows are sliced out of a CSC-converted matrix here;
        # CSR is normally the efficient format for row slicing — confirm the
        # tocsc() conversion is intentional.
        X_train = doc_feature_matrix.tocsc()[train_inds, :]
        X_test = doc_feature_matrix.tocsc()[test_inds, :]

        if scale_threshold:
            total_size = X_train.shape[0] + X_test.shape[0]
            # scale threshold based on size of topic division
            r = total_size / doc_feature_matrix.shape[0]
            topic_min_concept_threshold = ceil(min_concept_freq * r)
        else:
            topic_min_concept_threshold = min_concept_freq
        LOG.info(f"Topic threshold set to {topic_min_concept_threshold}.")

        # concept -> indices of documents tagged with that concept, restricted
        # to concepts occurring at least threshold times.
        concept_index_mapping = self.concept_extractor.get_top_concepts(
            topic_min_concept_threshold
        )
        no_concepts = len(concept_index_mapping)
        LOG.info(f"Training {no_concepts} concepts.")

        nu_passed = 0  # how many concepts were skipped for falling under threshold
        for concept, index in tqdm(concept_index_mapping.items()):
            LOG.debug(f"TOPIC={topic}:Loading indices for {concept}")
            # Binary target over the full corpus: 1 where the doc has this concept.
            y = np.zeros(doc_feature_matrix.shape[0])
            np.put(y, index, 1)

            y_train = y[train_inds]
            y_test = y[test_inds]
            total_yes = sum(y_train) + sum(y_test)

            # Re-check frequency within this (possibly topic-restricted) split;
            # a corpus-frequent concept may still be rare inside one division.
            if total_yes < topic_min_concept_threshold:
                nu_passed += 1
                LOG.debug(
                    f"Passing {concept} because it is under topic_min_concept_threshold of {topic_min_concept_threshold}."
                )
                continue
            # TODO: move around y0 train and test inds to keep aligned
            self.create_concept_classifier(
                concept, X_train, X_test, y_train, y_test, out_classifier_dir
            )
        t2 = time.time()
        LOG.warning(f"Passed {nu_passed} in topic {topic} due to freq under threshold.")
        LOG.debug(f"{t2-t1} seconds for topic {topic}.")
        return out_classifier_dir

    def create_concept_classifier(
        self, concept, X_train, X_test, y_train, y_test, out_classifier_dir
    ):
        """
        Create an individual classifier.

        Args:
            concept (str): the concept for which to create a classifier
            X_train (scipy.sparse matrix): feature rows for the training documents
            X_test (scipy.sparse matrix): feature rows for the testing documents
            y_train (numpy.ndarray): binary labels marking which training docs
                are tagged with this concept
            y_test (numpy.ndarray): binary labels for the testing docs
            out_classifier_dir (pathlib.Path): output directory for classifiers

        Returns:
            out_model_path (pathlib.Path): the path to the concept classifier just
                produced; implicitly None if fitting raised ValueError.

        """
        LOG.debug(f"Making classifier for concept {concept}.")
        try:
            LOG.debug(f"fitting {concept}...")
            self.classifier.fit(X_train, y_train)
            LOG.debug(f"testing {concept}...")
            # Probability of the positive class (column 1 of predict_proba).
            y_score = self.classifier.predict_proba(X_test)[:, 1]
            LOG.debug(f"Binarizing score for {concept}...")
            # Hard prediction at a fixed 0.5 probability threshold.
            y_pred = np.where(y_score > 0.5, 1, 0)

            LOG.debug(f"Getting metric scores for {concept}...")
            accuracy = accuracy_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_score)
            f1 = f1_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)

            # Bundle the refit best estimator, the full CV results, and the
            # holdout metrics into a single pickle per concept.
            out_model = {
                "concept": concept,
                "best_estimator_": self.classifier.best_estimator_,
                "cv_results_": self.classifier.cv_results_,
                "scores": {
                    "accuracy": accuracy,
                    "roc_auc": roc_auc,
                    "f1": f1,
                    "precision": precision,
                    "recall": recall,
                },
            }
            LOG.debug(f"Accuracy: {accuracy} | ROC-AUC: {roc_auc} | F1: {f1}")
            # Path(concept).name strips path separators from the concept text
            # so it is safe to use as a filename component.
            out_concept = str(Path(concept).name)
            out_model_path = out_classifier_dir / f"{out_concept}.pkl"
            LOG.debug(f"Writing model to {out_model_path}.")
            joblib.dump(out_model, out_model_path)
            return out_model_path

        except ValueError:
            # NOTE(review): assumes a ValueError from fit/predict means the data
            # for this concept was insufficient (e.g. only one class present);
            # the concept is skipped and no model file is written. Confirm other
            # ValueError causes should not be surfaced.
            LOG.debug(f"Insufficient data for concept {concept}.")
300 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: process features concepts keywords categories structure requirements \
sync_data_to_s3 sync_data_from_s3 sync_raw_data_from_s3 pipeline plots \
tests docs check_clean clean_experiment clean

#.SHELLFLAGS := -o nounset -c
SHELL := /bin/bash

#################################################################################
# GLOBALS                                                                       #
#################################################################################

PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = hq-ocio-ci-bigdata/home/DataSquad/classifier_scripts/
PROFILE = moderate
PROJECT_NAME = classifier_scripts
PYTHON_INTERPRETER = python3

ifeq (,$(shell which conda))
HAS_CONDA=False
else
HAS_CONDA=True
endif

#################################################################################
# COMMANDS                                                                      #
#################################################################################

# These three variables should be tailored for your use case.
EXPERIMENT_NAME=test
IN_CORPUS=data/raw/STI_public_metadata_records_sample100.jsonl
IN_CONFIG=config/test_config.yml

# Derived per-experiment locations for interim data.
INTERIM_DATA=data/interim/$(EXPERIMENT_NAME)
INTERIM_CORPUS=data/interim/$(EXPERIMENT_NAME)/abs_kwds.jsonl

FIELD=text
SUBJ_MAPPING=data/interim/subj_mapping.json
FEATURES=data/interim/$(EXPERIMENT_NAME)/features.jsonl

CONCEPT_FIELD='keywords'
CAT_FIELD='categories'
OUT_KWD_INDICES=data/interim/$(EXPERIMENT_NAME)/kwd_indices.json
OUT_CAT_INDICES=data/interim/$(EXPERIMENT_NAME)/cat_indices.json
OUT_KWD_RAW_TO_LEMMA=models/$(EXPERIMENT_NAME)/kwd_raw2lemma.json
OUT_CAT_RAW_TO_LEMMA=models/$(EXPERIMENT_NAME)/cat_raw2lemma.json

# Model output directories, split by concept type (keywords vs categories).
OUT_OUTER_MODEL_DIR=models/$(EXPERIMENT_NAME)
OUT_KWD_MODEL_DIR=$(OUT_OUTER_MODEL_DIR)/keywords
OUT_CAT_MODEL_DIR=$(OUT_OUTER_MODEL_DIR)/categories

METRICS_LOC=reports/$(EXPERIMENT_NAME)
BERT_MODELS_DIR=models/bert_models

GIT_REMOTE='origin'
IMAGE_NAME=concept_trainer
57 |
## Test underlying dsconcept library
tests:
	nosetests --with-coverage --cover-package dsconcept --cover-html; \
	open cover/index.html

## Run through all steps to create all classifiers
pipeline: structure process features concepts vectorizer_and_matrix \
	categories keywords metrics plots

## create directory structure if necessary
structure:
	mkdir -p data
	mkdir -p data/raw
	mkdir -p data/interim
	mkdir -p data/interim/$(EXPERIMENT_NAME)
	mkdir -p models/$(EXPERIMENT_NAME)
	mkdir -p config
	mkdir -p reports
	mkdir -p reports/$(EXPERIMENT_NAME)

## install newest version of dependencies. Untested.
approximate-install:
	pip install scikit-learn spacy tqdm textacy pyyaml pandas h5py \
		testfixtures hypothesis dask pytest matplotlib
	$(PYTHON_INTERPRETER) -m spacy download en_core_web_sm

## install precise python dependencies
requirements:
	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
	$(PYTHON_INTERPRETER) -m spacy download en_core_web_sm

## processing by merging text and keyword fields
process: $(INTERIM_CORPUS)
$(INTERIM_CORPUS): $(IN_CORPUS) src/process.py
	mkdir -p data/interim/$(EXPERIMENT_NAME)
	mkdir -p models/$(EXPERIMENT_NAME)
	$(PYTHON_INTERPRETER) src/process.py $(IN_CORPUS) $(SUBJ_MAPPING) $(INTERIM_CORPUS)

## create feature sets for processed data
features: $(FEATURES)
$(FEATURES): $(INTERIM_CORPUS) src/features.py
	$(PYTHON_INTERPRETER) src/features.py $(INTERIM_CORPUS) $(FIELD) $(FEATURES)

## create concepts indices json and mappings from raw to lemmas
concepts: $(OUT_KWD_INDICES) $(OUT_CAT_INDICES)
$(OUT_KWD_INDICES) $(OUT_CAT_INDICES): $(INTERIM_CORPUS) src/concepts.py
	$(PYTHON_INTERPRETER) src/concepts.py \
		$(INTERIM_CORPUS) \
		$(CONCEPT_FIELD) $(CAT_FIELD) \
		$(OUT_KWD_INDICES) $(OUT_CAT_INDICES) \
		$(OUT_KWD_RAW_TO_LEMMA) $(OUT_CAT_RAW_TO_LEMMA)

## create vectorizer and feature matrix from feature records
vectorizer_and_matrix: $(INTERIM_DATA)/feature_matrix.jbl
$(INTERIM_DATA)/feature_matrix.jbl: src/make_vec_and_matrix.py $(FEATURES) $(IN_CONFIG)
	mkdir -p $(OUT_OUTER_MODEL_DIR) && \
	cp $(IN_CONFIG) $(OUT_OUTER_MODEL_DIR)/config.yml && \
	$(PYTHON_INTERPRETER) src/make_vec_and_matrix.py \
		$(FEATURES) $(IN_CONFIG) $(INTERIM_DATA) $(OUT_OUTER_MODEL_DIR)/vectorizer.jbl
# TODO: separate outputs for vec and matrix, send matrix to INTERIM_DATA

## train category models
categories: src/make_cat_models.py $(OUT_CAT_INDICES) $(INTERIM_DATA)/feature_matrix.jbl $(IN_CONFIG)
	mkdir -p $(OUT_CAT_MODEL_DIR) && \
	$(PYTHON_INTERPRETER) src/make_cat_models.py \
		$(INTERIM_DATA)/feature_matrix.jbl \
		$(INTERIM_DATA)/train_inds.npy \
		$(INTERIM_DATA)/test_inds.npy \
		$(OUT_CAT_INDICES) \
		$(OUT_CAT_RAW_TO_LEMMA) \
		$(IN_CONFIG) $(OUT_CAT_MODEL_DIR)

## train keyword models
keywords: src/make_kwd_models.py $(OUT_KWD_INDICES) $(INTERIM_DATA)/feature_matrix.jbl $(IN_CONFIG) $(INTERIM_DATA)/test_inds.npy
	mkdir -p $(OUT_KWD_MODEL_DIR) && \
	$(PYTHON_INTERPRETER) src/make_kwd_models.py \
		$(INTERIM_DATA)/feature_matrix.jbl \
		$(INTERIM_DATA)/train_inds.npy \
		$(INTERIM_DATA)/test_inds.npy \
		$(OUT_KWD_INDICES) $(OUT_CAT_INDICES) \
		$(OUT_KWD_RAW_TO_LEMMA) $(OUT_CAT_RAW_TO_LEMMA) \
		$(IN_CONFIG) $(OUT_KWD_MODEL_DIR)

## Only train keywords on full training set. No topic splitting.
keywords-no-topics:
	mkdir -p $(OUT_KWD_MODEL_DIR) && \
	$(PYTHON_INTERPRETER) src/make_kwd_models.py \
		$(INTERIM_DATA)/feature_matrix.jbl \
		$(INTERIM_DATA)/train_inds.npy \
		$(INTERIM_DATA)/test_inds.npy \
		$(OUT_KWD_INDICES) $(OUT_CAT_INDICES) \
		$(OUT_KWD_RAW_TO_LEMMA) $(OUT_CAT_RAW_TO_LEMMA) \
		$(IN_CONFIG) $(OUT_KWD_MODEL_DIR) --no-topics ${VERBOSE}

# NOTE(review): the BERT model paths below point outside this repository
# (../nlp-working-with-bert) — confirm before running on a fresh checkout.
## Get predictions from category models made with BERT classification
bert_cat_model_scores:
	mkdir -p $(METRICS_LOC) && \
	$(PYTHON_INTERPRETER) src/get_bert_cat_models_preds.py \
		--data_dir $(INTERIM_DATA) \
		--models_dir $(OUT_OUTER_MODEL_DIR) \
		--reports_dir $(METRICS_LOC) \
		--base_model_dir ../nlp-working-with-bert/models/base/cased_L-12_H-768_A-12 \
		--finetuned_model_dir ../nlp-working-with-bert/models/01_02_2020/ \
		--sample 1000
# --base_model_dir models/bert_models/cased_L-12_H-768_A-12 \
# --finetuned_model_dir models/bert_models/cased_L-12_H-768_A-12/cache

## Create cleaned dataset for training transformer category models
bert_cat_clean_dataset:
	$(PYTHON_INTERPRETER) src/make_records_for_cat_bert.py \
		$(INTERIM_CORPUS) \
		$(INTERIM_DATA) \
		$(OUT_OUTER_MODEL_DIR)/bert

## Get metrics for test data
metrics:
	mkdir -p $(METRICS_LOC) && \
	$(PYTHON_INTERPRETER) src/dsconcept/get_metrics.py \
		--experiment_name $(EXPERIMENT_NAME) \
		--out_store $(METRICS_LOC)/store.h5 \
		--out_cat_preds $(METRICS_LOC)/cat_preds.npy \
		--batch_size 500

## Synthesize predictions for keywords and classifiers to create full classification
synthesize:
	mkdir -p $(METRICS_LOC) && \
	$(PYTHON_INTERPRETER) src/synthesize_predictions.py \
		--experiment_name $(EXPERIMENT_NAME) \
		--synth_strat mean \
		--in_cat_preds $(METRICS_LOC)/cat_preds.npy \
		--store $(METRICS_LOC)/store.h5 \
		--synth_batch_size 3000 \
		--threshold 0.5 \
		--out_synth_scores $(METRICS_LOC)/synth_mean_results.csv

## Synthesize predictions for keywords and classifiers to create full classification
synthesize-bert:
	mkdir -p $(METRICS_LOC) && \
	$(PYTHON_INTERPRETER) src/synthesize_predictions.py \
		--experiment_name $(EXPERIMENT_NAME) \
		--synth_strat mean \
		--in_cat_preds $(METRICS_LOC)/bert_cat_preds.npy \
		--store $(METRICS_LOC)/store.h5 \
		--synth_batch_size 3000 \
		--threshold 0.5 \
		--out_synth_scores $(METRICS_LOC)/synth_bert_mean_results.csv
205 |
# Fixed: --in_cats_dir was accidentally passed twice with the same value.
## create plots from performance metrics
plots:
	mkdir -p $(METRICS_LOC)/figures && \
	$(PYTHON_INTERPRETER) src/make_plots.py \
		--mean $(METRICS_LOC)/synth_mean_results.csv \
		--in_cats_dir $(OUT_CAT_MODEL_DIR)/models \
		--in_kwds_dir $(OUT_KWD_MODEL_DIR)/models \
		--in_vectorizer $(OUT_OUTER_MODEL_DIR)/vectorizer.jbl \
		--in_clean_data $(INTERIM_CORPUS) \
		--in_config $(OUT_OUTER_MODEL_DIR)/config.yml \
		--out_plots_dir $(METRICS_LOC)/figures
218 |
# Fixed: --in_cats_dir was accidentally passed twice with the same value.
## create plots from performance metrics
plots-bert:
	mkdir -p $(METRICS_LOC)/figures_bert && \
	$(PYTHON_INTERPRETER) src/make_plots.py \
		--mean $(METRICS_LOC)/synth_bert_mean_results.csv \
		--in_cats_dir $(OUT_CAT_MODEL_DIR)/models \
		--in_kwds_dir $(OUT_KWD_MODEL_DIR)/models \
		--in_vectorizer $(OUT_OUTER_MODEL_DIR)/vectorizer.jbl \
		--in_clean_data $(INTERIM_CORPUS) \
		--in_config $(OUT_OUTER_MODEL_DIR)/config.yml \
		--out_plots_dir $(METRICS_LOC)/figures_bert
231 |
# Tags the image with the current version (from version.py, via setuptools-scm)
# and records the commit URL through the GIT_URL build arg.
## Build docker image for training
build:
	export COMMIT=$$(git log -1 --format=%H); \
	export REPO_URL=$$(git remote get-url $(GIT_REMOTE)); \
	export REPO_DIR=$$(dirname $$REPO_URL); \
	export BASE_NAME=$$(basename $$REPO_URL .git); \
	export GIT_LOC=$$REPO_DIR/$$BASE_NAME/tree/$$COMMIT; \
	export VERSION=$$(python version.py); \
	echo $$GIT_LOC; \
	echo $$VERSION; \
	docker build -t $(IMAGE_NAME):$$VERSION \
		--build-arg GIT_URL=$$GIT_LOC \
		--build-arg VERSION=$$VERSION .

# Mounts data/models/config/reports as volumes and runs the `pipeline` target
# inside the image produced by `make build`.
## Start docker container for running full pipeline
container:
	export VERSION=$$(python version.py); \
	docker run -it \
		-v $$(pwd)/data:/home/data \
		-v $$(pwd)/models:/home/models \
		-v $$(pwd)/config:/home/config \
		-v $$(pwd)/reports:/home/reports \
		$(IMAGE_NAME):$$VERSION pipeline \
		EXPERIMENT_NAME=$(EXPERIMENT_NAME) \
		IN_CORPUS=$(IN_CORPUS) \
		IN_CONFIG=$(IN_CONFIG)
258 |
## Delete all compiled Python files
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete

# Interactive guard used by clean_experiment: aborts unless the user answers "y".
check_clean:
	@echo $(OUT_OUTER_MODEL_DIR)
	@echo data/interim/$(EXPERIMENT_NAME)
	@echo $(METRICS_LOC)
	@echo -n "Are you sure you want to remove the above folders? [y/N] " && read ans && [ $${ans:-N} = y ]

## delete all interim data, models, and reports for the given experiment
clean_experiment: check_clean
	rm -r $(OUT_OUTER_MODEL_DIR)
	rm -r data/interim/$(EXPERIMENT_NAME)
	rm -r $(METRICS_LOC)

## sync this experiment to s3
sync_experiment_to_s3:
ifeq (default,$(PROFILE))
	aws s3 sync models/$(EXPERIMENT_NAME) s3://$(BUCKET)models/$(EXPERIMENT_NAME)
	aws s3 sync data/interim/$(EXPERIMENT_NAME) s3://$(BUCKET)data/interim/$(EXPERIMENT_NAME)
	aws s3 sync reports/$(EXPERIMENT_NAME) s3://$(BUCKET)reports/$(EXPERIMENT_NAME)
else
	aws s3 sync models/$(EXPERIMENT_NAME) s3://$(BUCKET)models/$(EXPERIMENT_NAME) --profile $(PROFILE)
	aws s3 sync data/interim/$(EXPERIMENT_NAME) s3://$(BUCKET)data/interim/$(EXPERIMENT_NAME) --profile $(PROFILE)
	aws s3 sync reports/$(EXPERIMENT_NAME) s3://$(BUCKET)reports/$(EXPERIMENT_NAME) --profile $(PROFILE)
endif

# NOTE(review): the upload target syncs data/interim while this download target
# syncs data/processed — confirm the asymmetry is intentional.
## sync this experiment from s3
sync_experiment_from_s3:
ifeq (default,$(PROFILE))
	aws s3 sync s3://$(BUCKET)models/$(EXPERIMENT_NAME) models/$(EXPERIMENT_NAME)
	aws s3 sync s3://$(BUCKET)reports/$(EXPERIMENT_NAME) reports/$(EXPERIMENT_NAME)
	aws s3 sync s3://$(BUCKET)data/processed/$(EXPERIMENT_NAME) data/processed/$(EXPERIMENT_NAME)
else
	aws s3 sync s3://$(BUCKET)models/$(EXPERIMENT_NAME) models/$(EXPERIMENT_NAME) --profile $(PROFILE)
	aws s3 sync s3://$(BUCKET)reports/$(EXPERIMENT_NAME) reports/$(EXPERIMENT_NAME) --profile $(PROFILE)
	aws s3 sync s3://$(BUCKET)data/processed/$(EXPERIMENT_NAME) data/processed/$(EXPERIMENT_NAME) --profile $(PROFILE)
endif

## sync raw starting data from s3
sync_raw_data_from_s3:
ifeq (default,$(PROFILE))
	aws s3 cp s3://hq-ocio-ci-bigdata/data/STI/STI_records_metadata.jsonl data/raw/STI_records_metadata.jsonl
else
	aws s3 cp s3://hq-ocio-ci-bigdata/data/STI/STI_records_metadata.jsonl data/raw/STI_records_metadata.jsonl --profile $(PROFILE)
endif
	echo "These records should be handled as moderate data assets. Handle these records with care."

## zip models necessary for running the app
zip-experiment-for-app:
	cd models/; \
	zip -r $(EXPERIMENT_NAME).zip \
		$(EXPERIMENT_NAME)/categories/models \
		$(EXPERIMENT_NAME)/keywords/models \
		$(EXPERIMENT_NAME)/kwd_raw2lemma.json \
		$(EXPERIMENT_NAME)/cat_raw2lemma.json \
		$(EXPERIMENT_NAME)/vectorizer.jbl \
		$(EXPERIMENT_NAME)/config.yml \

## Upload zipped experiment app files to s3
upload-experiment-zip-to-s3:
	aws s3 cp models/$(EXPERIMENT_NAME).zip s3://$(BUCKET)models/$(EXPERIMENT_NAME).zip --profile $(PROFILE)
#################################################################################
# PROJECT RULES                                                                 #
#################################################################################



#################################################################################
# Self Documenting Commands                                                     #
#################################################################################

# Running plain `make` prints the help listing below.
.DEFAULT_GOAL := help

# Inspired by
# sed script explained:
# /^##/:
# * save line in hold space
# * purge line
# * Loop:
# * append newline + line to hold space
# * go to next line
# * if line starts with doc comment, strip comment character off and loop
# * remove target prerequisites
# * append hold space (+ newline) to line
# * replace newline plus comments by `---`
# * print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see
.PHONY: help
help:
	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
	@echo
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| LC_ALL='C' sort --ignore-case \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
--------------------------------------------------------------------------------
/src/dsconcept/model.py:
--------------------------------------------------------------------------------
1 | """
2 | model
3 | -----
4 | Program to make classifiers from input corpus and selected keyword field.
5 |
6 | Author: Anthony Buonomo
7 | Contact: anthony.r.buonomo@nasa.gov
8 |
9 | Classes to support document classification.
10 | """
11 |
12 | from collections import Counter
13 | import logging
14 | from multiprocessing import cpu_count
15 | import json
16 | from typing import Dict
17 | from tqdm import tqdm
18 |
19 | import spacy
20 | from spacy.lemmatizer import Lemmatizer
21 | from spacy.lookups import Lookups
22 | from textacy.extract import acronyms_and_definitions
23 |
# Load the small English spacy model once at import time; shared by
# spacy_tokenizer, extract_from_doc, and extract_features_from_abstracts.
nlp = spacy.load("en_core_web_sm")

logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.DEBUG)  # module logger more verbose than root config
29 |
30 |
def file_len(fname):
    """
    Count the number of lines in a file.

    Args:
        fname (pathlib.Path | str): path to the file to count

    Returns:
        int: number of lines in the file; 0 for an empty file
            (the previous implementation raised UnboundLocalError
            on an empty file because the loop variable was never bound)
    """
    count = 0
    with open(fname) as f:
        for _ in f:
            count += 1
    return count
36 |
37 |
def spacy_tokenizer(txt):
    """
    Tokenize ``txt`` using spacy; suitable as the ``tokenizer`` argument
    of sklearn's CountVectorizer.

    Args:
        txt (str): text to be tokenized

    Returns:
        list of str: tokens of the form ``"<term> :: <tag>"``

    Examples:
        >>> from dsconcept.model import spacy_tokenizer
        >>> from sklearn.feature_extraction.text import CountVectorizer
        >>> txt = "The ship hung in the sky much the same way bricks don't."
        >>> spacy_tokenizer(txt)
        ['ship :: NOUN',
         'sky :: NOUN',
         'way :: NOUN',
         'brick :: NOUN',
         'the ship :: NOUN_CHUNK']
        >>> v = CountVectorizer(txt, tokenizer=spacy_tokenizer)
        >>> v.fit_transform([txt])
        >>> v.vocabulary_
        {'ship :: NOUN': 1, 'sky :: NOUN': 2, 'way :: NOUN': 3, 'brick :: NOUN': 0}
    """
    tagged = extract_from_doc(nlp(txt))
    return ["{} :: {}".format(term, tag) for term, tag in tagged.items()]
68 |
69 |
def should_keep(w, desired_parts_of_speech):
    """Return True when token ``w`` has a desired part of speech, is not a
    stop word, and its lemma is not a percent sign."""
    if w.is_stop:
        return False
    if w.lemma_ in ["%"]:
        return False
    return w.pos_ in desired_parts_of_speech
75 |
76 |
def extract_from_doc(doc):
    """
    Extract features from a spacy doc.

    Four feature families are collected in order — single-token terms,
    noun chunks, named entities, acronyms-with-definitions — each later
    family skipping strings already captured by an earlier one.

    Args:
        doc (spacy.doc): a doc processed by the spacy 'en' model

    Returns:
        terms_tagged (dict): features with their respective tags; tag is a
            POS tag ("NOUN"/"PROPN"), "NOUN_CHUNK", "ENT", or "ACRONYM"

    Examples:
        >>> from dsconcept.model import extract_from_doc
        >>> import spacy
        >>> nlp = spacy.load('en_core_web_sm')
        >>> txt = "The ship hung in the sky much the same way bricks don't."
        >>> doc = nlp(txt)
        >>> features = extract_from_doc(doc)
        >>> features
        {'ship': 'NOUN',
         'sky': 'NOUN',
         'way': 'NOUN',
         'brick': 'NOUN',
         'the ship': 'NOUN_CHUNK'}
    """
    # TODO: change this function such that it processes better but maintains the same interface.
    terms_tagged = dict()

    desired_parts_of_speech = ["NOUN", "PROPN"]
    # Get any 1-gram terms which are not % signs, or stop words.
    terms = {w.lemma_: w.pos_ for w in doc if should_keep(w, desired_parts_of_speech)}
    terms_tagged.update(terms)

    # Lemmatize each gram and join with a space.
    # NOTE(review): nc.label_ is computed here but discarded below — all
    # surviving chunks are tagged "NOUN_CHUNK".
    noun_chunks = {
        " ".join([w.lemma_ for w in nc if not w.is_stop]): nc.label_
        for nc in doc.noun_chunks
    }
    # filter our noun chunks that are already in terms set and not in excluded_list.
    # "-PRON-" is the spacy v2 lemma for pronouns; "" arises from all-stopword chunks.
    excluded_list = ["-PRON-", ""]
    noun_chunks_filtered = {
        w.strip(): "NOUN_CHUNK"
        for w, lab in noun_chunks.items()
        if (w not in terms.keys()) and (w not in excluded_list)
    }
    terms_tagged.update(noun_chunks_filtered)

    # TODO: entities take precedence over noun chunks
    # Get entities from text and remove collisions with terms and noun chunks.
    ent_excluded_set = ["ORDINAL", "CARDINAL", "QUANTITY", "DATE", "PERCENT"]
    ents = {e.lemma_: e.label_ for e in doc.ents if e.label_ not in ent_excluded_set}
    ents_filtered = {
        ent: "ENT"
        for ent, lab in ents.items()
        if ent not in terms.keys() and ent not in noun_chunks_filtered.keys()
    }
    terms_tagged.update(ents_filtered)

    # Add acronyms which have definitions.
    # These acronyms could create Noise if they are not good. Maybe better to use their definitions.
    # This schema will only pull out identifical definitions. No lemmatizing, no fuzzy matching.
    # TODO: add lemmatizing and fuzzy matching for acrnoyms. This code exists in acronyms project.
    # Keys are "<ACRONYM> - <definition>" so they never collide with the
    # families above.
    acronyms_with_defs = acronyms_and_definitions(doc)
    acronyms_filtered = {
        "{} - {}".format(ac, definition): "ACRONYM"
        for ac, definition in acronyms_with_defs.items()
        if definition != ""
    }
    terms_tagged.update(acronyms_filtered)

    return terms_tagged
147 |
148 |
def extract_features_from_abstracts(
    descriptions, feature_outfile, batch_size=1000, n_threads=cpu_count(), total=None
):
    """
    Generate features from input batch of abstracts.

    Args:
        descriptions (list of str): list of descriptions
        feature_outfile (str): output file for features jsonlines
        batch_size (int): how many docs to process in a batch
        n_threads (int): number of threads to process with
        total (int): total number of descriptions, passed to tqdm for a
            better loading bar

    Returns:
        int: how many descriptions were processed

    Examples:
        >>> from dsconcept.model import extract_features_from_abstracts
        >>> abstracts = [
        ...     "A common mistake that people make when trying to design something"
        ...     " completely foolproof is to underestimate the ingenuity of complete fools.",
        ...     "Since we decided a few weeks ago to adopt the leaf as legal tender,"
        ...     " we have, of course, all become immensely rich.",
        ... ]
        >>> extract_features_from_abstracts(abstracts, 'data/tmp_features.txt')
        2
    """
    LOG.info("Extracting features to {}".format(feature_outfile))
    processed = 0
    with open(feature_outfile, "w") as out:
        doc_stream = nlp.pipe(descriptions, batch_size=batch_size, n_threads=n_threads)
        for doc in tqdm(doc_stream, total=total):
            # each output line is valid json; the whole file is jsonlines
            out.write(json.dumps(extract_from_doc(doc)))
            out.write("\n")
            processed += 1

    LOG.info("Extracted feature sets to {}".format(feature_outfile))
    return processed
207 |
208 |
class FeatureExtractor:
    """Extracts, weights, and culls term features for a corpus of documents."""

    def __init__(self):
        """
        A term extractor.

        Examples:
            >>> from dsconcept.model import FeatureExtractor
            >>> extractor = FeatureExtractor()
        """
        self._features = list()  # one {feature: term_type} dict per document
        # FIX: initialized as an empty set (was dict()); the features setter
        # assigns a set, and weight_terms compares it with dict key views,
        # which raises TypeError for an empty dict.
        self.term_types = set()
        self.feature_counts = Counter()  # corpus-wide occurrence count per feature

    @property
    def features(self):
        return self._features

    @features.setter
    def features(self, value):
        """Assign feature sets and derive term_types and feature_counts."""
        self._features = value
        self.term_types = {
            term_type
            for feature_set in self._features
            for term_type in feature_set.values()
        }
        all_features = [
            feature for feature_set in self._features for feature in feature_set
        ]
        self.feature_counts = Counter(all_features)

    @staticmethod
    def from_corpus_to_jsonlines(
        in_corpus, out_features, abstract_field, batch_size=1000, n_threads=cpu_count()
    ):
        """
        Extract features for every abstract in a jsonlines corpus.

        Args:
            in_corpus (pathlib.Path | str): input path to json file containing corpus
            out_features (pathlib.Path | str): output path for features json lines file.
            abstract_field (str): name of abstract field for corpus
            batch_size (int): size of batch to use when multithreading using spacy's nlp.pipe
            n_threads (int): number of threads to use when multithreading using spacy's nlp.pipe

        Returns:
            n_descriptions (int): the number of abstracts in the corpus
        """
        n_lines = file_len(in_corpus)
        with open(in_corpus, "r") as f0:
            # Stream the file instead of materializing it with readlines();
            # the generators are fully consumed inside this block.
            record_generator = (json.loads(line) for line in f0)
            text_generator = (r[abstract_field] for r in record_generator)
            n_descriptions = extract_features_from_abstracts(
                text_generator, out_features, batch_size, n_threads, total=n_lines
            )
        return n_descriptions

    def from_jsonlines(self, in_features):
        """
        Load features from jsonlines.

        Args:
            in_features (pathlib.Path | str): path to input jsonlines features file

        Returns:
            in_features (pathlib.Path | str): path to input jsonlines features file
        """
        with open(in_features, "r") as f0:
            # each line is json formatted, but the whole file is not
            self.features = [json.loads(line) for line in f0]
        return in_features

    def to_jsonlines(self, out_features):
        """
        Output features to jsonlines.

        Args:
            out_features (pathlib.Path | str): output path to features jsonlines file

        Returns:
            out_features (pathlib.Path | str): output path to features jsonlines file
        """
        with open(out_features, "w") as f0:
            for feature_set in self.features:
                json.dump(feature_set, f0)  # each line is valid json
                f0.write("\n")
        return out_features

    def weight_terms(self, weights: Dict[str, int]):
        """
        Weights features according to tag type.

        Args:
            weights (dict of str): mappings from term types to their weights

        Returns:
            weighted_features (list of dict): features with mappings to weights
                instead of term types

        Examples
        --------
        >>> weights = {'NOUN': 1, 'NOUN_CHUNK': 2}
        >>> weighted_features = extractor.weight_terms(weights)
        >>> weighted_features
        [{'mistake': 1,
          'people': 1,
          'ingenuity': 1,
          'fool': 1,
          'a common mistake': 2,
          'complete fool': 2},
         {'week': 1, 'leaf': 1, 'tender': 1, 'course': 1, 'legal tender': 2}]
        """
        assert isinstance(weights, dict), "Weights must be dict: {}".format(weights)
        # Proper-subset/superset checks between the corpus term types and
        # the caller-supplied weights; mismatches are warned, not fatal.
        if self.term_types > weights.keys():
            LOG.warning(
                "Term types without a specified weight will be omitted from returned feature sets."
            )
        elif self.term_types < weights.keys():
            LOG.warning(
                "More term types specified then those which exist in corpus. Ignoring excess."
            )
        weighted_features = [
            weight_terms_inner(doc_features, weights) for doc_features in self.features
        ]
        return weighted_features

    def limit_features(
        self,
        weighted_features,
        feature_min,
        feature_max,
        topic=None,
        doc_topic_matrix=None,
    ):
        """
        Cull features.

        Args:
            weighted_features (list of dict): features with assigned weights
            feature_min (int): features which have in-corpus frequencies under feature_min are excluded.
            feature_max (float): features which occur in greater than this percentage of documents are excluded.
            topic (int | None): if specified, only return feature sets with maximum probability to be in this topic.
            doc_topic_matrix (numpy.ndarray): topic probability distributions for each document in corpus.

        Returns:
            weighted_limited (list): limited features with assigned weights

        Examples:
            >>> limited_features = extractor.limit_features(weighted_features, feature_min=1, feature_max=0.99)
        """
        assert (feature_max > 0.0) and (
            feature_max <= 1.0
        ), "feature_max should be float in (0,1]"
        # Keep features whose corpus frequency lies within
        # [feature_min, feature_max * corpus_size).
        feature_ex = {
            feature: occurrence
            for feature, occurrence in self.feature_counts.items()
            if (occurrence >= feature_min)
            and (occurrence / len(self.features) < feature_max)
        }

        weighted_limited = [
            {
                feature: val
                for feature, val in feature_set.items()
                if feature in feature_ex
            }
            for feature_set in weighted_features
        ]

        if topic is not None:
            assert doc_topic_matrix is not None, LOG.error(
                "Must supply doc_topic_matrix when using topic model segmentation."
            )
            LOG.info(f"Segmenting vectorizer and matrix for topic {topic}.")
            # FIX: removed stray debugging print("here")
            in_topic_index = [
                i for i, distr in enumerate(doc_topic_matrix) if distr.argmax() == topic
            ]
            weighted_limited = [weighted_limited[i] for i in in_topic_index]

        return weighted_limited
395 |
396 |
def weight_terms_inner(doc_features, weights):
    """
    Replace each feature's term-type tag with its numeric weight.

    Args:
        doc_features (dict): features with assigned tags
        weights (dict): tag to weight mappings

    Returns:
        dict: features mapped to weights; features whose tag has no
            entry in ``weights`` are dropped

    Examples
    >>> from dsconcept.model import weight_terms_inner
    >>> features = {'ship': 'NOUN', 'sky': 'NOUN', 'way': 'NOUN', 'brick': 'NOUN', 'the ship': 'NOUN_CHUNK'}
    >>> weights = {'NOUN': 1, 'NOUN_CHUNK': 3}
    >>> weight_terms_inner(features, weights)
    {'ship': 1, 'sky': 1, 'way': 1, 'brick': 1, 'the ship': 3}
    """
    weighted_terms = dict()
    for tag, weight in weights.items():
        matching = (term for term, pos in doc_features.items() if pos == tag)
        weighted_terms.update(dict.fromkeys(matching, weight))
    return weighted_terms
421 |
422 |
class ConceptExtractor:
    """Relationship between concepts/keywords and the documents of a corpus."""

    def __init__(self):
        """
        Information about relationship between concepts/keywords and corpus.

        Examples:
            >>> from dsconcept.model import ConceptExtractor
            >>> ext = ConceptExtractor()
            >>> ext.concept_sets = [['Zaphod', 'Arthur'], ['Arthur'], ['Zaphod'], ['Heart of Gold']]
            >>> ext.concepts
            {'arthur', 'heart of gold', 'zaphod'}
        """
        self._concept_sets = []  # one list of raw concept strings per document
        self.raw2lemma = {}  # raw concept -> lemmatized (lowercased) form
        self.lemma2raw = {}  # inverse of raw2lemma
        self.lemmatizer = None  # spacy lookup Lemmatizer built from raw2lemma
        self.concepts_frequencies = Counter()  # lemma -> corpus frequency
        self.concepts = set()  # set of lemmatized concepts
        self.concept_index_mapping = {}  # lemma -> indices of docs tagged with it

    @property
    def concept_sets(self):
        return self._concept_sets

    @concept_sets.setter
    def concept_sets(self, value):
        """
        Sets concept_sets and the attributes derived from it.

        Args:
            value (list of list of str): A list of lists of strings; each string being a concept,
                each set in the larger list corresponding to a document which has the tags seen in the set.
        """
        self._concept_sets = value
        LOG.debug("Extracting raw keywords as concepts.")
        all_concepts = [
            concept
            for concept_set in tqdm(self._concept_sets)
            for concept in concept_set
            if concept.strip() != ""
        ]
        raw_concepts = set(all_concepts)

        LOG.debug("Lemmatizing {} raw concepts.".format(len(raw_concepts)))
        # "Lemmatization" here is lowercasing; the mapping is stored in a
        # spacy lookup table so the same Lemmatizer can be reused elsewhere.
        self.raw2lemma = {rc: rc.lower() for rc in raw_concepts}
        lookups = Lookups()
        lookups.add_table("lemma_lookup", self.raw2lemma)
        self.lemmatizer = Lemmatizer(lookups)
        self.lemma2raw = {v: k for k, v in self.raw2lemma.items()}
        lemma_concepts = [
            self.lemmatizer(concept, "NOUN")[0] for concept in all_concepts
        ]
        self.concepts_frequencies = Counter(lemma_concepts)
        self.concepts = set(lemma_concepts)
        self._fit_concept_indices()

    def _fit_concept_indices(self):
        """Map each lemmatized concept to the indices of documents tagged with it."""
        kwd_sets_lemmas = [
            [self.lemmatizer(kwd, "NOUN")[0] for kwd in kwd_set]
            for kwd_set in self.concept_sets
        ]
        concepts_with_inds = dict()
        for i, kwd_set in enumerate(kwd_sets_lemmas):
            for kwd in kwd_set:
                concepts_with_inds.setdefault(kwd, []).append(i)
        self.concept_index_mapping = concepts_with_inds

    def from_corpus(self, in_corpus, concept_field):
        """
        Extract concepts from input json corpus.

        Args:
            in_corpus (pathlike): path to input json-formatted corpus from which to extract concepts
            concept_field (str): the name of the concept field

        Raises:
            AssertionError: if no record carries a non-empty concept_field
            KeyError: if a record lacks concept_field entirely
        """
        with open(in_corpus, "r") as f0:
            record_generator = (json.loads(line) for line in f0)
            concept_sets = [r[concept_field] for r in record_generator]
        # FIX: previous check was `cs is not []`, which is always True
        # (identity, not equality); use truthiness to find documents that
        # actually carry concepts.
        with_concepts = [i for i, cs in enumerate(concept_sets) if cs]
        assert len(with_concepts) > 0, LOG.error(
            f'"{concept_field}" not present in corpus.'
        )
        LOG.debug(f"{len(with_concepts)} docs in corpus with {concept_field}.")
        self.concept_sets = concept_sets

    def to_jsons(self, out_indices, out_raw2lemma):
        """
        Output indices and raw2lemma dicts to json files.

        Args:
            out_indices (pathlib.Path): path to output file containing indices for concepts
            out_raw2lemma (pathlib.Path): path to output file containing mappings from concepts to their lemmas

        Returns:
            out_indices (pathlib.Path): path to output file containing indices for concepts
            out_raw2lemma (pathlib.Path): path to output file containing mappings from concepts to their lemmas
        """
        with open(out_indices, "w") as f0:
            json.dump(self.concept_index_mapping, f0)
        with open(out_raw2lemma, "w") as f0:
            json.dump(self.raw2lemma, f0)
        return out_indices, out_raw2lemma

    def from_jsons(self, in_indices, in_raw2lemma):
        """
        Load index and raw2lemma dictionaries into empty ConceptExtractor.

        NOTE: does not restore concept_sets; only the attributes derivable
        from the two json files are filled in.

        Args:
            in_indices (pathlib.Path | str): json file mapping concepts to doc indices
            in_raw2lemma (pathlib.Path | str): json file mapping raw concepts to lemmas
        """
        with open(in_indices, "r") as f0:
            self.concept_index_mapping = json.load(f0)
        with open(in_raw2lemma, "r") as f0:
            self.raw2lemma = json.load(f0)
        lookups = Lookups()
        lookups.add_table("lemma_lookup", self.raw2lemma)
        self.lemmatizer = Lemmatizer(lookups)
        self.lemma2raw = {v: k for k, v in self.raw2lemma.items()}
        # FIX: materialize as a set for consistency with the concept_sets
        # setter (previously left as a dict keys view).
        self.concepts = set(self.concept_index_mapping.keys())
        tmp_frequencies = {
            concept: len(index) for concept, index in self.concept_index_mapping.items()
        }
        self.concepts_frequencies = Counter(tmp_frequencies)

    def get_top_concepts(self, min_freq=500):
        """
        Select concepts which occur at least ``min_freq`` times.

        Args:
            min_freq (int): occurrence threshold for concepts

        Returns:
            top_concepts (dict): subset of concept_index_mapping whose
                document index lists have length >= min_freq

        Examples:
            >>> ext.get_top_concepts(2)
            {'zaphod': [0, 2], 'arthur': [0, 1]}
        """
        LOG.info(f"Getting indices for concepts with frequency >= {min_freq}.")
        top_concepts = {
            concept: index
            for concept, index in self.concept_index_mapping.items()
            if len(index) >= min_freq
        }
        return top_concepts
577 |
--------------------------------------------------------------------------------
/src/dsconcept/get_metrics.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import logging
4 | from math import ceil
5 | from multiprocessing import cpu_count
6 | from pathlib import Path
7 | from typing import List, Dict, Tuple
8 |
9 | import dask
10 | import h5py
11 | import joblib
12 | import numpy as np
13 | import pandas as pd
14 | from dask.diagnostics import ProgressBar
15 | from sklearn.feature_extraction import DictVectorizer
16 | from sklearn.metrics import (
17 | accuracy_score,
18 | roc_auc_score,
19 | recall_score,
20 | precision_score,
21 | )
22 | from sklearn.model_selection import GridSearchCV
23 | from tqdm import tqdm as tqdm
24 | from tempfile import NamedTemporaryFile, TemporaryDirectory
25 |
26 | import dsconcept.model as ml
27 |
logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)

# Type alias: per-document lists of (concept, score) prediction pairs.
PRED_LIST_TYPE = List[List[Tuple[str, float]]]
33 |
34 |
def get_cat_inds(
    categories: List[str], cat_preds: np.array, t: float = 0.5
) -> Dict[str, np.array]:
    """
    Apply a threshold to get documents indices corresponding to each category.

    Args:
        categories: list of categories which are columns of the cat_preds array
        cat_preds: array of scores for each category for each document
            ([documents, categories])
        t: threshold over which a category is determined to be relevant
            to a given document

    Returns:
        all_cat_inds: dictionary with keys which are categories (the blank
            category "" is skipped). Values are index of documents which
            apply to each category.

    Examples:
        >>> from get_metrics import get_cat_inds
        >>> import numpy as np
        >>> cats = ['physics', 'geology']
        >>> cat_preds = np.array([[0.4, 0.8], [0.5, 0.6], [0.9, 0.3]])
        >>> get_cat_inds(cats, cat_preds, t=0.5)
        {'physics': array([2]), 'geology': array([0, 1])}
    """
    all_cat_inds = {}
    for i, cat in enumerate(categories):
        if cat == "":
            continue
        # np.where gives the above-threshold row indices directly; this
        # replaces the original double np.argwhere pass plus stack/squeeze.
        cat_inds = np.where(cat_preds[:, i] > t)[0]
        if cat_inds.shape[0] == 0:
            cat_inds = np.array([])  # preserve the original empty-result dtype
        all_cat_inds[cat] = cat_inds
    return all_cat_inds
72 |
73 |
def f_score(r: float, p: float, b: int = 1):
    """
    Calculate f-measure from recall and precision.

    Args:
        r: recall score
        p: precision score
        b: weight of precision in harmonic mean

    Returns:
        val: value of f-measure; 0 when the denominator is zero
            (ZeroDivisionError path)
    """
    numerator = (1 + b ** 2) * (p * r)
    denominator = b ** 2 * p + r
    try:
        return numerator / denominator
    except ZeroDivisionError:
        return 0
91 |
92 |
def get_mets(
    i: int,
    synth_preds: np.array,
    target_vals: np.array,
    con_with_clf: np.array,
    pbar=None,
) -> dict:
    """
    Compute evaluation metrics for the concept at column ``i``.
    #
    TODO: just pass in the already sliced synth_preds, Y, and con_with_clf?

    Args:
        i: index for the given concept
        synth_preds: arrays of predictions for each document and each concept
        target_vals: true values for each document and concept
        con_with_clf: arrays of concepts corresponding
            to columns synth_preds and target_vals
        pbar: optional progress bar, advanced by one when provided

    Returns:
        metrics: metric records (accuracy, f1, precision, recall, roc_auc)
            for the given concept
    """
    y_score = synth_preds[:, i]
    # binarize at the fixed 0.5 decision threshold
    y_pred = [1 if v > 0.5 else 0 for v in y_score]
    y_true = target_vals[:, i]
    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    try:
        roc_auc = roc_auc_score(y_true, y_score)
    except ValueError:  # why does this happen?
        # NOTE(review): presumably raised when y_true has a single class — confirm
        roc_auc = np.nan
    metrics = {
        "concept": con_with_clf[i],
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f_score(r, p),
        "precision": p,
        "recall": r,
        "roc_auc": roc_auc,
    }
    if pbar is not None:
        pbar.update(1)
    return metrics
137 |
138 |
def synth_mean(
    kwd_preds_tmp: np.array, doc_index: int, concept_index: int, non_zero_cats: list,
) -> float:
    """
    Get the mean of nonzero predictions for given concept and given document.
    # TODO: get the precise matrix outside of function? Then pass in?

    Args:
        kwd_preds_tmp: 3D array of predictions
            [categories, documents, concepts]
        doc_index: index of test document
        concept_index: index of concept
        non_zero_cats: categories for which this concept has nonzero prediction

    Returns:
        mean: mean of nonzero predictions for this concept for this document;
            NaN when no category qualifies
    """
    if not non_zero_cats:
        return np.nan
    return np.mean(kwd_preds_tmp[non_zero_cats, doc_index, concept_index])
161 |
162 |
def synth_max(
    kwd_preds_tmp: np.array, doc_index: int, concept_index: int, non_zero_cats: list,
) -> float:
    """
    Get the max of nonzero predictions for given concept and given document.
    # TODO: nearly same as synth_mean. Just pass in the np.nanmax or mean as args and collapse into one function?

    Returns NaN when ``non_zero_cats`` is empty.
    """
    if not non_zero_cats:
        return np.nan
    return np.nanmax(kwd_preds_tmp[non_zero_cats, doc_index, concept_index])
175 |
176 |
def get_means_for_one_doc(
    doc_index: int,
    all_cat_inds: Dict[str, np.array],
    kwd_preds_tmp: np.array,
    categories: List[str],
    no_cat_ind: int,
    only_cat: bool = False,
    synth_strat: str = "mean",
    pbar=None,
) -> np.array:
    """
    Synthesize concept predictions for one document from the predictions of
    its relevant category classifiers.

    Args:
        doc_index: index of given document
        all_cat_inds: dictionary with keys which are categories.
            Values are index of documents which apply to each category.
        kwd_preds_tmp: array of all predictions
            [categories, documents, concepts]
        categories: list of categories
        no_cat_ind: index in categories list of the blank category ""
        only_cat: Only use category classifier or mixin the no category classifiers
        synth_strat: either "mean" or "max"
        # TODO: just pass a function instead of string?

    Returns:
        kwd_vals: array of synthesized keyword prediction values
            for given document
    """
    relevant_cats = [
        cat for cat, inds in all_cat_inds.items() if doc_index in inds
    ]  # get category by index instead? means all_cat index should be by index
    relevant_cat_inds = [categories.index(cat) for cat in relevant_cats]
    if only_cat is False:
        # also average with the no-topic set, make this a decision?
        relevant_cat_inds.append(no_cat_ind)
    synthesized = []
    for concept_index in range(kwd_preds_tmp.shape[2]):
        nz = np.where(kwd_preds_tmp[:, doc_index, concept_index] != 0)[0]
        nz = list(set(nz).intersection(set(relevant_cat_inds)))
        assert synth_strat in ["mean", "max"], LOG.exception(
            f'Synthesis strategy "{synth_strat}" is invalid.'
        )
        if synth_strat == "mean":
            synthesized.append(synth_mean(kwd_preds_tmp, doc_index, concept_index, nz))
        else:
            synthesized.append(synth_max(kwd_preds_tmp, doc_index, concept_index, nz))
    if pbar is not None:
        pbar.update(1)
    return np.array(synthesized)
228 |
229 |
def create_ground_truth(
    store: str,
    dataset: str,
    test_inds: np.array,
    train_inds: np.array,
    concepts_with_classifiers: np.array,
    kwd_ext: ml.ConceptExtractor,
    batch_size: int,
):
    """
    Make an array of ground truth binary labels.

    Args:
        store: location of h5 database
        dataset: name of dataset in h5 database
            at which to store ground_truth array
        test_inds: test indices in the training data
        train_inds: training indices in the training data
        concepts_with_classifiers: all concepts which have models
        kwd_ext: ml.ConceptExtractor with ground_truth indices for concepts
        batch_size: batch_size for creating ground truth for each concept

    Returns:
        store, dataset: h5 store location and dataset name
    """
    n_concepts = len(concepts_with_classifiers)
    n_docs = len(test_inds) + len(train_inds)
    with h5py.File(store, "a") as f0:
        ground_truth = f0.create_dataset(
            dataset,
            shape=(len(test_inds), n_concepts),
            compression="gzip",
        )
        # FIX: np.int was deprecated and removed in numpy >= 1.24
        n_batches = int(np.ceil(n_concepts / batch_size))
        for n in tqdm(range(n_batches)):
            start_batch = n * batch_size
            # FIX: clip to n_concepts, not n_concepts - 1 — slice ends are
            # exclusive, so the old code silently dropped the ground truth
            # column for the final concept.
            end_batch = min((n + 1) * batch_size, n_concepts)
            batch_matrix = np.zeros((len(test_inds), end_batch - start_batch))
            con_batch = concepts_with_classifiers[start_batch:end_batch]
            for i, con in enumerate(con_batch):
                index = kwd_ext.concept_index_mapping[con]
                y_full = np.zeros(n_docs)
                y_full[index] = 1
                batch_matrix[:, i] = y_full[test_inds]
            ground_truth[:, start_batch:end_batch] = batch_matrix

    return store, dataset
278 |
279 |
# TODO: maybe make this a part of the hierarchical class
def get_synth_preds(
    store,
    shape,
    all_cat_inds,
    categories,
    batch_size,
    only_cat,
    synth_strat,
    use_dask=True,
    con_limit=None,
    limit=None,
    pbar=None,
):
    """
    Synthesize per-document keyword predictions from the per-category
    prediction cube and write them to the "synthesis" dataset in the store.

    Args:
        store: h5 file holding the "predictions" dataset
            ([categories, documents, concepts]); "synthesis" is (re)created
        shape: shape for the output "synthesis" dataset
        all_cat_inds: category -> indices of documents relevant to it
        categories: list of categories (must contain the blank category "")
        batch_size: number of documents to synthesize per batch
        only_cat: only use category classifiers, or mix in the
            no-category classifier
        synth_strat: "mean" or "max" (see get_means_for_one_doc)
        use_dask: parallelize per-document synthesis with dask.delayed
        con_limit: optional cap on the number of category rows read
        limit: optional cap on the number of documents read
        pbar: optional progress bar passed through to get_means_for_one_doc
    """
    with h5py.File(store, "a") as f_synth, h5py.File(store, "r") as f_preds:
        if "synthesis" in f_synth.keys():
            del f_synth["synthesis"]
        f_synth.create_dataset("synthesis", shape)
        synth_preds = f_synth["synthesis"]
        if limit is not None:
            kwd_preds = f_preds["predictions"][:, 0:limit, :]
        else:
            kwd_preds = f_preds["predictions"]
        n_batches = int(np.ceil(kwd_preds.shape[1] / batch_size))
        LOG.debug(f"{n_batches} batches")
        no_cat_ind = categories.index("")
        if use_dask:
            # FIX: wrap once, outside the batch loop — previously
            # all_cat_inds was re-wrapped in dask.delayed on every batch,
            # nesting delayed objects.
            delayed_cat_inds = dask.delayed(all_cat_inds)
        for n in range(n_batches):
            start_batch = n * batch_size
            end_batch = (n + 1) * batch_size
            if con_limit is not None:
                kwd_preds_tmp = kwd_preds[0:con_limit, start_batch:end_batch, :]
            else:
                kwd_preds_tmp = kwd_preds[:, start_batch:end_batch, :]
            n_docs = kwd_preds_tmp.shape[1]
            # FIX: restored the use_dask switch (was hard-coded `if True:`,
            # dead-coding the serial branch).
            if use_dask:
                delayed_preds = dask.delayed(kwd_preds_tmp)
                jobs = []
                for doc_index in range(n_docs):
                    # should be everything now, since '' category is included
                    # FIX: only_cat/synth_strat passed as keywords —
                    # previously synth_strat was passed positionally into
                    # the only_cat parameter slot, so the chosen synthesis
                    # strategy was silently ignored.
                    job = dask.delayed(get_means_for_one_doc)(
                        doc_index,
                        delayed_cat_inds,
                        delayed_preds,
                        categories,
                        no_cat_ind,
                        only_cat=only_cat,
                        synth_strat=synth_strat,
                        pbar=pbar,
                    )
                    jobs.append(job)
                hybrid_preds = dask.compute(jobs)[0]
            else:
                hybrid_preds = []
                for doc_index in range(n_docs):
                    # should be everything now, since '' category is included
                    v = get_means_for_one_doc(
                        doc_index,
                        all_cat_inds,
                        kwd_preds_tmp,
                        categories,
                        no_cat_ind,
                        only_cat,
                        synth_strat,
                        pbar=pbar,
                    )
                    hybrid_preds.append(v)
            hybrid_pred_array = np.stack(hybrid_preds)
            if limit is not None and limit <= end_batch:
                synth_preds[start_batch:limit, :] = hybrid_pred_array
            else:
                synth_preds[start_batch:end_batch, :] = hybrid_pred_array
354 |
355 |
def load_category_models(in_cat_models: str) -> List[dict]:
    """
    Load every category model found in the given directory.

    Args:
        in_cat_models: directory where category models reside

    Returns:
        A list of dictionaries, one per saved category model.
    """
    LOG.info(f"Loading category classifiers from {in_cat_models}.")
    model_paths = list(Path(in_cat_models).iterdir())
    return [joblib.load(path) for path in tqdm(model_paths)]
370 |
371 |
def load_concept_models(in_kwd_models: str, load: bool = True) -> Dict[Tuple[str, str], GridSearchCV]:
    """
    Load keyword models from given directory.

    Args:
        in_kwd_models: directory with subdirs, the suffixes of which are the
            names of the categories (ex. topic_physics). Each of these
            subfolders contains binary files for concepts in that category.
            The classifiers trained on all documents are in a subfolder which
            has no suffix (ex. topic_).
        load: whether to load the models into memory, or just get their paths

    Returns:
        cd: Dictionary keyed by (category, concept). Values are the fitted
            estimators when ``load`` is True, otherwise paths to the files.
    """
    LOG.info(f"Loading keyword classifiers from {in_kwd_models}.")
    cd = {}  # expects no_topics with suffix ''
    topic_dirs = list(Path(in_kwd_models).iterdir())
    # Total number of classifier files, so the bar counts classifiers.
    total = sum(len(list(td.iterdir())) for td in topic_dirs)
    # Fix: previously the bar both iterated over topic_dirs *and* was
    # update(1)-ed per classifier, so it overshot its total by
    # len(topic_dirs). Now only the per-classifier updates advance it.
    with tqdm(total=total) as pbar:
        for topic_dir in topic_dirs:
            topic_name = topic_dir.stem.split("_")[1]  # depends on opinionated path format
            pbar.set_description(topic_name)
            in_clfs = list(topic_dir.iterdir())
            clfs = (joblib.load(c) for c in in_clfs)  # generator for loading classifiers
            for c, c_loc in zip(clfs, in_clfs):
                if load is True:
                    cd[topic_name, c["concept"]] = c["best_estimator_"]
                else:
                    # The file is still loaded once here to read its concept name.
                    cd[topic_name, c["concept"]] = c_loc
                pbar.update(1)
    return cd
407 |
408 |
def make_predictions(
    in_cat_models,
    in_kwd_models,
    feature_matrix,
    out_store="test_results/store.h5",
    t=None,
):
    """
    Run category and keyword classifiers over a feature matrix.

    Args:
        in_cat_models: directory holding category models
        in_kwd_models: directory holding keyword models (per-category subdirs)
        feature_matrix: array of features for each document
        out_store: h5 file where keyword predictions are written
        t: category-probability threshold; when given, keywords are only
            predicted for documents in their predicted categories

    Returns:
        Tuple of (categories, concepts_with_classifiers, cat_preds).
    """
    category_models = load_category_models(in_cat_models)
    keyword_models = load_concept_models(in_kwd_models)
    clf = HierarchicalClassifier(category_models, keyword_models)
    LOG.info("Predicting categories.")
    cat_preds = clf.predict_categories(feature_matrix)
    if t is None:
        LOG.info("Predicting for all keywords on all documents.")
        # TODO: this should call a public function
        all_kwd_preds_loc = clf._predict_keywords(feature_matrix, out_store)
    else:
        LOG.info("Only making predictions for keywords in predicted categories.")
        cat_indices = get_cat_inds(clf.categories, cat_preds, t)
        # TODO: add rule for when cat_indices has nothing in it!
        all_kwd_preds_loc = clf._predict_keywords(
            feature_matrix, out_store, cat_indices
        )
    LOG.info(f"all_kwd_preds_loc={all_kwd_preds_loc}")

    return clf.categories, clf.concepts_with_classifiers, cat_preds
435 |
436 |
class StubBestEstimator:
    """
    Stub standing in for a classifier's best_estimator_ during testing.

    Mimics the scikit-learn ``predict_proba`` interface with random scores,
    so prediction plumbing can be exercised without trained models.
    """

    def __init__(self):
        # Fix: this was mistakenly named ``init`` (never invoked by
        # Python); renamed to the real constructor. No state is needed.
        pass

    def predict_proba(self, feature_matrix):
        """Return random [0, 1) scores shaped (n_docs, 2), like a binary clf."""
        val = np.random.rand(feature_matrix.shape[0], 2)
        return val
448 |
449 |
def main(
    experiment_name, out_store, out_cat_preds, gt_batch_size, limit=None,
):
    """
    Evaluate trained category/concept models on an experiment's test split.

    Loads the split indices, feature matrix, and model directories for the
    experiment, writes keyword predictions and category predictions, then
    builds the ground-truth dataset in the same h5 store.

    Args:
        experiment_name: name of the experiment under data/interim and models
        out_store: h5 store in which to store results
        out_cat_preds: output npy file for category predictions
        gt_batch_size: batch size for creating ground truth data
        limit: optional cap on the number of test records
    """
    LOG.info("Loading test data and models.")
    # TODO: paths should be put into main function
    test_inds = np.load(f"data/interim/{experiment_name}/test_inds.npy")
    train_inds = np.load(f"data/interim/{experiment_name}/train_inds.npy")
    feature_matrix = joblib.load(f"data/interim/{experiment_name}/feature_matrix.jbl")
    in_cat_models = Path(f"models/{experiment_name}/categories/models/")
    in_kwd_models = Path(f"models/{experiment_name}/keywords/models/")

    # Column-sliceable format, restricted to the (possibly limited) test rows.
    if limit is None:
        feature_matrix_test = feature_matrix.tocsc()[test_inds, :]
    else:
        LOG.info(f"Limiting to {limit} test records.")
        # TODO: How does this affect indices?
        feature_matrix_test = feature_matrix.tocsc()[test_inds[0:limit], :]

    LOG.info("Making predictions.")
    categories, concepts_with_classifiers, cat_preds = make_predictions(
        in_cat_models, in_kwd_models, feature_matrix_test, out_store,
    )  # need t if limiting
    np.save(out_cat_preds, cat_preds)

    LOG.info("Creating ground truth data.")
    kwd_ext = ml.ConceptExtractor()  # TODO: these paths should be provided as args
    kwd_ext.from_jsons(
        f"data/interim/{experiment_name}/kwd_indices.json",
        f"models/{experiment_name}/kwd_raw2lemma.json",
    )
    create_ground_truth(
        store=out_store,
        dataset="ground_truth",
        kwd_ext=kwd_ext,
        concepts_with_classifiers=concepts_with_classifiers,
        batch_size=gt_batch_size,
        train_inds=train_inds,
        test_inds=test_inds,
    )
488 |
489 |
def get_category_results(cat_models_dir: Path) -> pd.DataFrame:
    """Collect the scores of every saved category classifier into a DataFrame."""
    rows = []
    for clf_path in cat_models_dir.iterdir():
        clf = joblib.load(clf_path)  # load the saved classifier bundle
        rows.append({**clf["scores"], "concept": clf["concept"]})
    return pd.DataFrame(rows)
497 |
498 |
def get_keyword_results(kwd_models_dir: Path) -> pd.DataFrame:
    """Collect the scores of every saved keyword classifier into a DataFrame."""
    per_topic = {}
    for topic_dir in kwd_models_dir.iterdir():
        model_paths = list(topic_dir.iterdir())
        # Lazy generator: files are only loaded in the second loop below.
        loader = (joblib.load(p) for p in model_paths)
        topic = topic_dir.stem.split("_")[1]  # depends on opinionated path format
        per_topic[topic] = loader

    all_records = []
    for topic, loader in tqdm(per_topic.items()):
        for clf in loader:
            record = {"concept": clf["concept"], "category": topic, **clf["scores"]}
            all_records.append(record)
    return pd.DataFrame(all_records)
514 |
515 |
class HierarchicalClassifier:
    """
    Hierarchical Classifier object which allows for streamlined predictions
    on suites of concept models associated with different categories.

    Attributes:
        categories: list of categories (always ends with "", the
            no-category bucket)
        concepts_with_classifiers: sorted array of concepts with classifiers
        cat_concept_indices: list where each element maps onto a category.
            Each element consists of a selection of indices
            in concepts_with_classifier which occur in the given category.
        vectorizer: DictVectorizer for transforming features
    """

    def __init__(
        self, cat_clfs: List[dict], kwd_clfs: Dict[Tuple[str, str], GridSearchCV],
    ):
        """
        Set the models for categories and concepts_with_classifiers

        Args:
            cat_clfs: category classifier models
            kwd_clfs: Dictionary with keys which are tuples
                of categories and concepts, values are the classifier models
        """
        # Order matters: the kwd_clfs setter reads self.categories, which is
        # created by the cat_clfs setter.
        self.cat_clfs = cat_clfs
        self.kwd_clfs = kwd_clfs
        self.vectorizer = None

    @property
    def cat_clfs(self):
        """
        The category classifiers.

        Setter also creates categories attribute.
        """
        return self._cat_clfs

    @property
    def kwd_clfs(self):
        """
        Dictionary with keys which are tuples of categories and concepts,
        values are the classifier models

        Setter method creates cat_concept_indices
        and concepts_with_classifiers attributes.
        """
        return self._kwd_clfs

    @cat_clfs.setter
    def cat_clfs(self, cat_clfs: List[dict]):
        self._cat_clfs = cat_clfs
        # "" is the catch-all "no category" model suite.
        self.categories = [c["concept"] for c in self.cat_clfs] + [""]

    @kwd_clfs.setter
    def kwd_clfs(self, kwd_clfs: Dict[Tuple[str, str], dict]):
        self._kwd_clfs = kwd_clfs
        # Map each category to the concepts that have a classifier in it.
        category_concepts = {}
        for cat in self.categories:
            concepts = [k[1] for k, v in kwd_clfs.items() if k[0] == cat]
            category_concepts[cat] = concepts

        all_cat_concepts = set(
            c for ts, cons in category_concepts.items() for c in cons
        )
        concepts_with_classifiers = np.sort(list(all_cat_concepts))
        LOG.info(f"concepts_with_classifiers: {concepts_with_classifiers.shape[0]}")

        # For each category, the column indices (into the sorted concept
        # list) of the concepts that category has classifiers for.
        cat_concept_indices = []
        for cat in self.categories:
            full_in_cats = np.isin(concepts_with_classifiers, category_concepts[cat])
            cat_concept_cols = np.where(full_in_cats)[0]
            cat_concept_indices.append(cat_concept_cols)

        self.cat_concept_indices: List[np.array] = cat_concept_indices
        # shape is [categories, keywords]
        self.concepts_with_classifiers: np.array = concepts_with_classifiers

    def load_vectorizer(self, v_loc: str):
        """
        Loads the DictVectorizer

        Args:
            v_loc: location of vectorizer
        """
        self.vectorizer: DictVectorizer = joblib.load(v_loc)

    def vectorize(
        self,
        texts: List[str],
        weights: Dict[str, int],
        batch_size: int = 1000,
        n_threads: int = cpu_count(),
    ) -> Tuple[List[Dict[str, str]], np.array]:
        """
        Transform texts into a matrix of features.

        Args:
            texts: texts to transform
            weights: how to weight different types of features
            batch_size: what batch size to pass to nlp.pipe
            n_threads: number of threads to use

        Returns:
            features: extracted features for each document
            feature_matrix: matrix representation of features for each document

        Raises:
            AssertionError: if load_vectorizer has not been called first
        """
        # Fix: the assert message used to be LOG.exception(...), which logged
        # a spurious "NoneType: None" traceback on failure; a plain message
        # keeps the AssertionError callers may expect.
        assert self.vectorizer is not None, "Must initialize vectorizer."
        fe = ml.FeatureExtractor()
        with NamedTemporaryFile() as tmp_features_loc:
            tmp_features = tmp_features_loc.name
            ml.extract_features_from_abstracts(
                texts, tmp_features, batch_size, n_threads
            )
            fe.from_jsonlines(tmp_features)
        weighted_features = fe.weight_terms(weights)
        feature_matrix = self.vectorizer.transform(weighted_features)
        return fe.features, feature_matrix

    def predict_categories(self, feature_matrix: np.array) -> np.array:
        """
        Make predictions with category classifiers

        Args:
            feature_matrix: array of features for each document

        Returns:
            cat_preds: prediction belief values for each document,
                shape [docs, categories] (excludes the "" category)
        """
        cat_preds_list = [
            clf["best_estimator_"].predict_proba(feature_matrix)[:, 1]
            for clf in tqdm(self.cat_clfs)
        ]
        cat_preds = np.stack(cat_preds_list, axis=1)
        return cat_preds

    def _predict_one_clf(
        self, feature_matrix: np.array, concept_index: int, cat: str, pbar=None,
    ) -> np.array:
        """
        Make a prediction for a particular concept.

        Args:
            feature_matrix: array of features for each document
            concept_index: index for the given concept
                in concepts_with_classifiers attribute
            cat: name of the given category
            pbar: optional tqdm progress bar, advanced by one

        Returns:
            v: predictions for all documents for the given concept
        """
        con = self.concepts_with_classifiers[concept_index]
        clf = self.kwd_clfs[cat, con]
        # The dict may hold either a fitted estimator or a path to one
        # (see load_concept_models(load=False)); os.fspath raises TypeError
        # for non-path objects, which is how we detect an estimator.
        try:  # TODO: explicit option for this rather than interpreting?
            os.fspath(clf)
            clf = joblib.load(clf)["best_estimator_"]
        except TypeError:
            pass
        v = clf.predict_proba(feature_matrix)[:, 1]
        if pbar is not None:
            pbar.update(1)
        return v

    def _predict_kwds_for_cat(
        self,
        feature_matrix: np.array,
        cat_index: int,
        predictions: np.array,
        cat_indices: Dict[str, List[int]] = None,
        use_dask: bool = True,
        pbar: tqdm = None,
    ):
        """
        Make predictions for all documents for all concepts
        in the given category

        Args:
            feature_matrix: array of features for each document
            cat_index: index in categories attribute of the given category
            predictions: the h5 dataset where predictions are stored
            cat_indices: Predicted indices where categories occur
                for each category
            use_dask: unused; the dask fan-out was deliberately disabled
                (parameter kept for interface compatibility)
            pbar: tqdm progress bar
        """
        cat = self.categories[cat_index]
        pbar.set_postfix(category=cat, refresh=False)
        if (cat_indices is not None) and (cat != ""):
            feature_matrix_test = feature_matrix[cat_indices[cat], :]
            # this could be a problem if I want everything to perfectly align.
        else:
            feature_matrix_test = feature_matrix
        if feature_matrix_test.shape[0] == 0:
            # No documents predicted for this category: advance the bar past
            # all of its concepts and skip the work.
            pbar.update(len(self.cat_concept_indices[cat_index]))
            return 0
        # TODO: for good bar, should walk tasks to compute total
        cat_concept_cols = self.cat_concept_indices[cat_index]
        # NOTE(review): a dask.delayed fan-out used to live here behind
        # `if False:`; removed as dead code. Restore from history if the
        # parallel path is ever re-enabled.
        vals = []
        for concept_index in cat_concept_cols:
            val = self._predict_one_clf(
                feature_matrix_test, concept_index, cat, pbar
            )
            vals.append(val)
        # Fix: was `cat is not ""` — identity comparison with a literal is
        # unreliable (SyntaxWarning on modern CPython) and inconsistent with
        # the `cat != ""` check above; use equality.
        if (cat_indices is not None) and (cat != ""):
            # need to correct indices, zeros in places with no predictions
            # TODO: determine if this patching activity
            # takes longer than just predicting on more
            new_vals = []
            for v in vals:
                new_v = np.zeros(feature_matrix.shape[0])
                new_v[cat_indices[cat]] = v
                new_vals.append(new_v)
            vals = new_vals
        # TODO: below will not work with cat_inds
        if len(vals) > 0:
            topic_preds_sub = np.stack(vals, axis=1)
            predictions[cat_index, :, cat_concept_cols] = topic_preds_sub

    def _predict_keywords(
        self,
        feature_matrix: np.array,
        store: str,
        cat_indices: Dict[str, list] = None,
        only_no_topic: bool = False,
        use_dask: bool = True,
    ):
        """
        Make keyword predictions

        Args:
            feature_matrix: array of features for each document
            store: location of h5 store for predictions
            cat_indices: Predicted indices where categories
                occur for each category
            only_no_topic: only use the models which are
                not associated with a category
            use_dask: use dask for multiprocessing

        Returns:
            store: the location of the h5 store
        """
        all_con_checks = np.sum(
            np.array([a.shape[0] for a in self.cat_concept_indices])
        )
        if Path(store).exists():
            # Fix: a ValueError used to be constructed here but never raised
            # (a no-op). Raising would break predict(), which reuses one
            # store path across batches and relies on mode "w" truncating
            # it below — so warn instead.
            LOG.warning(f"{store} already exists; it will be overwritten.")
        with h5py.File(store, "w") as f0, tqdm(total=all_con_checks) as pbar:
            predictions = f0.create_dataset(
                "predictions",
                (
                    len(self.categories),
                    feature_matrix.shape[0],
                    len(self.concepts_with_classifiers),
                ),
                compression="gzip",
            )  # [categories, docs, concepts]
            if only_no_topic is True:
                # Only the "" (no-category) model suite.
                cat_index = self.categories.index("")
                self._predict_kwds_for_cat(
                    feature_matrix, cat_index, predictions, cat_indices, use_dask, pbar,
                )
            else:
                for cat_index in range(len(self.categories)):
                    self._predict_kwds_for_cat(
                        feature_matrix,
                        cat_index,
                        predictions,
                        cat_indices,
                        use_dask,
                        pbar,
                    )
        return store

    def get_synth_preds(
        self,
        store: str,
        all_cat_inds: Dict[str, np.array],
        batch_size: int,
        only_cat: bool,
        synth_strat: str,
        use_dask: bool = True,
    ) -> np.array:
        """
        Synthesize all keyword models into a single prediction score.

        Args:
            store: location of h5 database
            all_cat_inds: dictionary with keys which are categories.
                Values are index of documents which apply to each category.
            batch_size: batch size for synthesizing predictions
            only_cat: only use category classifiers in synthesis
            synth_strat: strategy for synthesizing category predictions
            use_dask: use dask for multiprocessing

        Returns:
            results: synthesized predictions, shape [docs, concepts]
        """
        # TODO: do this without all of the intermediaries
        with h5py.File(store, "r") as f0:
            tdocs = f0["predictions"].shape[1]
            shape = f0["predictions"].shape[1:]
        with tqdm(total=tdocs) as pbar:
            # Delegates to the module-level get_synth_preds; this method
            # intentionally shadows that name.
            get_synth_preds(
                store,
                shape,
                all_cat_inds,
                self.categories,
                batch_size,
                only_cat,
                synth_strat,
                use_dask,
                pbar=pbar,
            )
        with h5py.File(store, "r") as f0:
            # Fix: Dataset.value was removed in h5py 3.0; [()] reads the
            # whole dataset into memory.
            results = f0["synthesis"][()]  # TODO: optional return?
        return results

    @staticmethod
    def _to_strings(tags, preds, t):
        """For each row of preds, return (tag, score) pairs above threshold t."""
        all_tag_vals = [
            get_tag_vals(preds[i], tags, t) for i in tqdm(range(preds.shape[0]))
        ]
        return all_tag_vals

    def predict(
        self,
        feature_matrix: np.array,
        cat_threshold: float = 0.5,
        concept_threshold: float = 0.5,
        no_categories: bool = False,
        only_cat: bool = False,
        synth_strat: str = "mean",
        batch_size: int = 10_000,
    ) -> Tuple[PRED_LIST_TYPE, PRED_LIST_TYPE]:
        """
        Make predictions for all input documents.

        Args:
            feature_matrix: matrix of features for the input documents
                (see vectorize)
            cat_threshold: threshold over which to mix in category subset
                model predictions
            concept_threshold: threshold over which to return
                a concept prediction
            no_categories: whether or not to use category-specific models
            only_cat: only use category classifiers in synthesis
            synth_strat: strategy for synthesizing category concept models
                to produce single result.
            batch_size: size of batches for making predictions

        Returns:
            all_cat_pred_strings: category (name, score) pairs per document
            concept_preds: concept (name, score) pairs per document

        Examples:
            >>> examples = ["Olympus Mons is the largest volcano in the solar system",
            ... "Database management is critical for information retrieval",
            ... "We used a logistic regression with batched stochastic gradient descent."]
            >>> weights = {'NOUN': 1, 'PROPN': 1, 'ENT': 1, 'NOUN_CHUNK':1, 'ACRONYM': 1}
            >>> features, feature_matrix = hclf.vectorize(examples, weights)
            >>> hclf.predict(feature_matrix)
        """
        # Local import: the file only imports NamedTemporaryFile from tempfile.
        from tempfile import TemporaryDirectory

        n_splits = ceil(feature_matrix.shape[0] / batch_size)
        r1s = []
        # Fix: a NamedTemporaryFile was previously (mis)used as a directory,
        # which made the store silently land at ./store.h5 in the cwd and
        # never be cleaned up. A real temporary directory holds the store.
        with TemporaryDirectory() as tmp_dir:
            tmp_store = Path(tmp_dir) / "store.h5"
            cat_pred_strings = []
            for n in tqdm(range(n_splits)):
                # TODO: Leave batching to lower methods?
                start = n * batch_size
                end = (n + 1) * batch_size
                matrix_slice = feature_matrix[start:end, :]
                cat_preds = self.predict_categories(matrix_slice)
                cat_inds = get_cat_inds(self.categories, cat_preds, t=cat_threshold)
                LOG.info(f"Predicting keywords")
                store_loc = self._predict_keywords(
                    matrix_slice,
                    # Fix: was tmp_store.name (bare "store.h5" in the cwd).
                    str(tmp_store),
                    cat_indices=cat_inds,
                    use_dask=False,
                    only_no_topic=no_categories,
                )
                if no_categories is True:
                    # Last category axis entry is the "" (no-category) suite.
                    with h5py.File(store_loc, "r") as f0:
                        sp = f0["predictions"][-1, :, :]
                else:
                    LOG.info(f"Synthesizing for each doc.")
                    sp = self.get_synth_preds(
                        store_loc,
                        cat_inds,
                        1000000000,  # TODO: more explanation here
                        only_cat,
                        synth_strat,
                        use_dask=False,
                    )
                LOG.info(f"Converting to strings.")
                r1 = self._to_strings(
                    self.concepts_with_classifiers, sp, concept_threshold
                )
                cp = self._to_strings(self.categories, cat_preds, t=0.0)
                r1s.append(r1)
                cat_pred_strings.append(cp)
        concept_preds = [doc_preds for r1 in r1s for doc_preds in r1]
        all_cat_pred_strings = [
            doc_preds for cp in cat_pred_strings for doc_preds in cp
        ]
        return all_cat_pred_strings, concept_preds
933 |
934 |
def get_tag_vals(pred_vals: List[float], tags: List[str], t: float):
    """Pair each tag with its score, keep scores above ``t``, highest first."""
    kept = [(tags[i], val) for i, val in enumerate(pred_vals) if val > t]
    return sorted(kept, key=lambda pair: pair[1], reverse=True)
939 |
940 |
if __name__ == "__main__":
    # CLI entry point: evaluate trained category/concept models on test data.
    parser = argparse.ArgumentParser(
        description="Use category and concept models to get metrics on the test data."
    )
    parser.add_argument("--experiment_name", help="experiment to generate metrics for")
    parser.add_argument("--out_store", help="h5 store in which to store results")
    parser.add_argument(
        "--out_cat_preds", help="output npy file for category predictions"
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        help="size of batches for creating ground truth data",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="size limit for test data (for testing on smaller subset)",
    )
    cli_args = parser.parse_args()
    main(
        cli_args.experiment_name,
        cli_args.out_store,
        cli_args.out_cat_preds,
        cli_args.batch_size,
        cli_args.limit,
    )
967 |
--------------------------------------------------------------------------------