├── __init__.py ├── data ├── .gitkeep └── interim │ └── subj_mapping.json ├── models └── .gitkeep ├── tests ├── __init__.py ├── .gitignore ├── context.py ├── test_conceptExtractor.py ├── test_extract_from_doc.py ├── test_conceptTrainer.py ├── test_featureExtractor.py └── test_hierarchicalClassifier.py ├── src ├── pipeline │ ├── __init__.py │ ├── .gitignore │ ├── start.sh │ ├── config.yml │ ├── docker_pipeline.sh │ └── pipeline.py ├── dsconcept │ ├── .gitignore │ ├── __init__.py │ ├── README.md │ ├── train.py │ ├── model.py │ └── get_metrics.py ├── features.py ├── concepts.py ├── make_vec_and_matrix.py ├── process.py ├── make_cat_models.py ├── make_records_for_cat_bert.py ├── synthesize_predictions.py ├── make_kwd_models.py ├── get_bert_cat_models_preds.py └── make_plots.py ├── .coveragerc ├── docs ├── .gitignore ├── reset.sh ├── research_access.png ├── push_pages.sh ├── code.rst ├── index.rst ├── Makefile ├── docker-versions.txt └── conf.py ├── .dockerignore ├── version.py ├── .github └── workflows │ └── greetings.yml ├── config └── test_config.yml ├── Dockerfile ├── LICENSE ├── setup.py ├── .gitignore ├── requirements.txt ├── CHANGELOG.md ├── README.md └── Makefile /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pipeline/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True -------------------------------------------------------------------------------- /src/pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | html/* 2 | _build/* 3 | -------------------------------------------------------------------------------- /src/dsconcept/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | -------------------------------------------------------------------------------- /src/dsconcept/__init__.py: -------------------------------------------------------------------------------- 1 | import dsconcept.model 2 | import dsconcept.train 3 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | .hypothesis/* 2 | .pytest_cache/* 3 | .coverage 4 | __pycache__/* -------------------------------------------------------------------------------- /docs/reset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | make clean && make html && open _build/html/index.html 3 | -------------------------------------------------------------------------------- /docs/research_access.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/concept-tagging-training/master/docs/research_access.png 
-------------------------------------------------------------------------------- /docs/push_pages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | make html 3 | cd _build/html 4 | git add . 5 | git commit -m 'rebuilt docs' 6 | git push origin gh-pages 7 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data/* 2 | __pycache__/* 3 | notebook/* 4 | reports/* 5 | env/* 6 | venv/* 7 | docs/* 8 | models/* 9 | scratch/* 10 | .hypothesis/* 11 | .pytest_cache/* 12 | *.tar 13 | *.tar.gz 14 | -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools_scm import get_version 3 | 4 | version = get_version(root=os.path.dirname(os.path.abspath(__file__))) 5 | version = ".".join(version.split(".")[:3]) 6 | print(version) 7 | -------------------------------------------------------------------------------- /docs/code.rst: -------------------------------------------------------------------------------- 1 | dsconcept 2 | ========== 3 | 4 | .. automodule:: dsconcept.train 5 | :members: 6 | :undoc-members: 7 | .. autofunction:: 8 | 9 | .. automodule:: dsconcept.model 10 | :members: 11 | .. 
autofunction:: 12 | 13 | -------------------------------------------------------------------------------- /tests/context.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.DEBUG) 10 | 11 | import dsconcept 12 | 13 | LOG.info(f"Loaded Module {dsconcept}") 14 | -------------------------------------------------------------------------------- /src/pipeline/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export MYDIR="$(dirname "$(realpath "$0")")" 3 | 4 | python ${MYDIR}/pipeline.py \ 5 | ${MYDIR}/volumes/in_data/records.json \ 6 | ${MYDIR}/volumes/in_data/config.yml \ 7 | ${MYDIR}/volumes/out_data/processed_data \ 8 | ${MYDIR}/volumes/out_data/topic_models \ 9 | ${MYDIR}/volumes/out_data/models \ 10 | -loglevel ${LOGLEVEL} 11 | -------------------------------------------------------------------------------- /.github/workflows/greetings.yml: -------------------------------------------------------------------------------- 1 | name: Greetings 2 | 3 | on: [pull_request, issues] 4 | 5 | jobs: 6 | greeting: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | issues: write 10 | pull-requests: write 11 | steps: 12 | - uses: actions/first-interaction@v1 13 | with: 14 | repo-token: ${{ secrets.GITHUB_TOKEN }} 15 | issue-message: 'Message that will be displayed on users first issue' 16 | pr-message: 'Thank you for contributing to this NASA repository!' 17 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
Research Access documentation master file, created by 2 | sphinx-quickstart on Fri Sep 14 16:48:31 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Research Access's documentation! 7 | =========================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | code 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /config/test_config.yml: -------------------------------------------------------------------------------- 1 | weights: # assign weights for term types specified in process section 2 | NOUN: 1 3 | PROPN: 1 4 | NOUN_CHUNK: 1 5 | ENT: 1 6 | ACRONYM: 1 7 | min_feature_occurrence: 10 8 | # features from corpus which occur fewer than 9 | # this many times are not used for training 10 | max_feature_occurrence: 0.9 11 | # features which occur in more than this percentage 12 | # of the corpus are not used for training 13 | min_concept_occurrence: 5 14 | # only concepts which occur greater than or equal to this many times 15 | # in the corpus will have associated classifiers created. 16 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. 
$(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /src/dsconcept/README.md: -------------------------------------------------------------------------------- 1 | # dsconcept 2 | Python library with supporting classes for runnning processing, 3 | and training of classifiers. Originally desgined for 4 | tagging the [NASA STI database](https://www.sti.nasa.gov/). 5 | 6 | ## installation 7 | You can install the dsconcept library from this repository. 8 | It also requires the [spacy 'en' language models](https://spacy.io/usage/models). 9 | ```bash 10 | pip install git+https://developer.nasa.gov/DataSquad/classifier_scripts.git 11 | python -m spacy download en 12 | ``` 13 | 14 | ## Usage 15 | Docs are available [here](../docs). 16 | You can go through a full interactive tutorial using the Dockerfile available in 17 | [notebook](#../notebook). 
18 | 19 | -------------------------------------------------------------------------------- /docs/docker-versions.txt: -------------------------------------------------------------------------------- 1 | Client: Docker Engine - Community 2 | Version: 19.03.5 3 | API version: 1.40 4 | Go version: go1.12.12 5 | Git commit: 633a0ea 6 | Built: Wed Nov 13 07:22:34 2019 7 | OS/Arch: darwin/amd64 8 | Experimental: false 9 | 10 | Server: Docker Engine - Community 11 | Engine: 12 | Version: 19.03.5 13 | API version: 1.40 (minimum version 1.12) 14 | Go version: go1.12.12 15 | Git commit: 633a0ea 16 | Built: Wed Nov 13 07:29:19 2019 17 | OS/Arch: linux/amd64 18 | Experimental: true 19 | containerd: 20 | Version: v1.2.10 21 | GitCommit: b34a5c8af56e510852c35414db4c1f4fa6172339 22 | runc: 23 | Version: 1.0.0-rc8+dev 24 | GitCommit: 3e425f80a8c931f88e6d94a8c831b9d5aa481657 25 | docker-init: 26 | Version: 0.18.0 27 | GitCommit: fec3683 28 | -------------------------------------------------------------------------------- /src/pipeline/config.yml: -------------------------------------------------------------------------------- 1 | # Configuration for research access training pipeline 2 | 3 | #image: storage.analytics.nasa.gov/rat_trainer:0.12.0 4 | 5 | process: 6 | term_types: 7 | - "NOUN" 8 | - "PROPN" 9 | - "ENT" 10 | - "NOUN_CHUNK" 11 | - "ACRONYM" 12 | abstract_field: "description" 13 | concept_field: "subject.NASATerms" 14 | 15 | topic_model: 16 | weights: # assign weights for term types specified in process section 17 | NOUN: 1 18 | PROPN: 1 19 | NOUN_CHUNK: 1 20 | ENT: 1 21 | ACRONYM: 1 22 | min_feature_occurrence: 5 23 | max_feature_occurrence: 0.9 24 | number_of_topics: 10 25 | 26 | train_classifiers: 27 | weights: # assign weights for term types specified in process section 28 | NOUN: 1 29 | PROPN: 1 30 | NOUN_CHUNK: 1 31 | ENT: 1 32 | ACRONYM: 1 33 | min_feature_occurrence: 5 34 | max_feature_occurrence: 0.9 35 | min_concept_occurrence: 10 36 | 37 | 38 | 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Create essential base image 2 | FROM python:3.7 as base 3 | COPY requirements.txt /home/ 4 | WORKDIR /home/ 5 | RUN pip install -U pip setuptools wheel && \ 6 | pip install -r requirements.txt && \ 7 | python -m spacy download en 8 | ADD src/ /home/src/ 9 | ENV PYTHONPATH=/home/src 10 | ENV PYTHONUNBUFFERED=0 11 | 12 | # Label image with git commit url 13 | ARG GIT_URL=unspecified 14 | ARG VERSION=unspecified 15 | LABEL org.label-schema.schema-version=1.0 16 | LABEL org.label-schema.url=$GIT_URL 17 | LABEL org.label-schema.version=$VERSION 18 | ENV VERSION=$VERSION 19 | 20 | # Run unittests 21 | FROM base as tests 22 | RUN pip install nose && \ 23 | pip install pytest && \ 24 | pip install coverage && \ 25 | pip install hypothesis && \ 26 | pip install testfixtures 27 | COPY tests /home/tests 28 | ARG cachebust=0 29 | # ^ Change this to avoid using cached results. These are tests, so we may want to run them. 30 | RUN nosetests --with-coverage --cover-package dsconcept 31 | 32 | # Deployment ready image 33 | FROM base as pipeline 34 | COPY Makefile /home/ 35 | ENTRYPOINT ["make"] 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | The MIT License (MIT) 3 | Copyright (c) 2020, United States Government as represented by the Administrator of the National Aeronautics and Space Administration. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from os.path import basename 3 | from os.path import splitext 4 | 5 | import setuptools 6 | 7 | setuptools.setup( 8 | name="dsconcept", 9 | use_scm_version=True, 10 | setup_requires=["setuptools_scm"], 11 | url="https://developer.nasa.gov/DataSquad/classifier_scripts", 12 | author="Anthony Buonomo", 13 | author_email="anthony.r.buonomo@nasa.gov", 14 | description="Scripts for processing, topic modeling, and creating classifiers for STI concepts.", 15 | long_description=open("README.md").read(), 16 | license="MIT", 17 | packages=setuptools.find_packages("src"), 18 | package_dir={"": "src"}, 19 | py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], 20 | install_requires=[ 21 | "scikit-learn>=0.21.3", 22 | "spacy>=2.2.3", 23 | "numpy>=1.17.4", 24 | "pandas>=0.25.3", 25 | "pyLDAvis>=2.1.2", 26 | "textacy==0.9.1", 27 | "boto3>=1.7.46", 28 | "dask>=2.8.1", 29 | "PyYAML>=5.1.2", 30 | "h5py>=2.10.0", 31 | "tqdm>=4.39.0", 32 | ], 33 | classifiers=[ 34 | "Development Status :: 2 - Beta", 35 | "Programming Language :: Python :: 3.6", 36 | ], 37 | ) 38 | -------------------------------------------------------------------------------- /tests/test_conceptExtractor.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from .context import dsconcept 3 | from testfixtures import TempDirectory 4 | from pathlib import Path 5 | import logging 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.INFO) 10 | 11 | 12 | class TestConceptExtractor(TestCase): 13 | def setUp(self): 14 | self.ce = dsconcept.model.ConceptExtractor() 15 | self.d = TempDirectory() 16 | 17 | def test_concept_sets(self): 18 | self.ce.concept_sets = [ 19 | ["MARS", 
"NASA"], 20 | ["NASA"], 21 | ["MARS"], 22 | ["HIT", "JUPITER"], 23 | ] 24 | 25 | def test_from_corpus(self): 26 | data = b'{"abstract":"Astronauts are very cool.", "concept": ["ASTRONAUTS", "COOL THINGS"]} \n {"abstract":"NASA is going to Mars.", "concept":["NASA", "MARS"]}' 27 | self.d.write("test.json", data) 28 | self.ce.from_corpus(Path(f"{self.d.path}/test.json"), "concept") 29 | 30 | def test_get_top_concepts(self): 31 | self.ce.concept_sets = [ 32 | ["MARS", "NASA"], 33 | ["NASA"], 34 | ["MARS"], 35 | ["HIT", "JUPITER"], 36 | ] 37 | self.assertDictEqual( 38 | self.ce.get_top_concepts(2), {"mars": [0, 2], "nasa": [0, 1]} 39 | ) 40 | 41 | def tearDown(self): 42 | self.d.cleanup() 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # remove credentials 2 | .eggs/* 3 | bandit_analysis.txt 4 | *.h5 5 | /commands/.env 6 | venv/* 7 | versions_and* 8 | my_env/* 9 | /volumes/* 10 | notebook/* 11 | cover/* 12 | .coverage 13 | !/volumes/in_data 14 | env/* 15 | .idea/* 16 | scratch/* 17 | models/* 18 | !models/.gitkeep 19 | config/* 20 | !config/test_config.yml 21 | 22 | *.env 23 | *.pkl 24 | *.npy 25 | *.tgz 26 | *.gz 27 | *.tar 28 | *.npz 29 | *.swp 30 | 31 | 32 | */.ipynb_checkpoints/* 33 | kubernetes-manifests/* 34 | sample_outdata/* 35 | __pycache__/* 36 | 37 | reports/ 38 | !reports/.gitkeep 39 | data/interim/* 40 | data/raw/* 41 | !data/interim/subj_mapping.json 42 | !data/raw/STI_public_metadata_records_sample100.jsonl 43 | volumes/big_data/* 44 | !volumes/big_data/.gitkeep 45 | tests/test_data/* 46 | !tests/test_data/.gitkeep 47 | !tests/test_data/results_small.json 48 | 49 | misc-ignore/* 50 | .ipynb_checkpoints/* 51 | 52 | notebook/src/* 53 | notebook/data/* 54 | 55 | # Byte-compiled / optimized / DLL files 56 | __pycache__/ 57 | *.py[cod] 58 | 59 | # C extensions 60 | *.so 61 | 62 | # Distribution / packaging 63 | bin/ 64 | 
build/ 65 | develop-eggs/ 66 | dist/ 67 | eggs/ 68 | lib/ 69 | lib64/ 70 | parts/ 71 | sdist/ 72 | var/ 73 | *.egg-info/ 74 | .installed.cfg 75 | *.egg 76 | 77 | # Installer logs 78 | pip-log.txt 79 | pip-delete-this-directory.txt 80 | 81 | # Unit test / coverage reports 82 | .tox/ 83 | .coverage 84 | .cache 85 | .hypothesis/* 86 | .pytest_cache/* 87 | nosetests.xml 88 | coverage.xml 89 | 90 | # Translations 91 | *.mo 92 | 93 | -------------------------------------------------------------------------------- /src/features.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import dsconcept.model as ml 5 | from multiprocessing import cpu_count 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.INFO) 10 | 11 | N_CPUS = cpu_count() 12 | BATCH_SIZE = 1000 13 | 14 | 15 | def main(in_corpus, abstract_field, out_features, batch_size, n_threads): 16 | LOG.info(f"Extracting features from corpus at {in_corpus}.") 17 | LOG.info(f"Using field: {abstract_field}.") 18 | fe = ml.FeatureExtractor() 19 | LOG.info(f"Using batch_size {batch_size} with {n_threads} threads.") 20 | LOG.info(f"Outputting processed features to {out_features}.") 21 | fe.from_corpus_to_jsonlines( 22 | in_corpus, out_features, abstract_field, batch_size, n_threads 23 | ) 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser( 28 | description="""Create features for each document in the processed corpus. 
29 | Each line in output file is a json formatted string 30 | with features and their types.""" 31 | ) 32 | parser.add_argument("i", help="input jsonlines corpus") 33 | parser.add_argument("f", help="abstract field") 34 | parser.add_argument("o", help="ouput jsonlines features") 35 | parser.add_argument( 36 | "-b", help="batch size for feature processing", default=BATCH_SIZE 37 | ) 38 | parser.add_argument( 39 | "-n", help="number of threads for features processing", default=N_CPUS 40 | ) 41 | args = parser.parse_args() 42 | main(args.i, args.f, args.o, args.b, args.n) 43 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | attrs==19.3.0 3 | backcall==0.1.0 4 | bleach==3.3.0 5 | blis==0.4.1 6 | cachetools==3.1.1 7 | catalogue==0.0.8 8 | certifi==2019.9.11 9 | chardet==3.0.4 10 | coverage==4.5.4 11 | cycler==0.10.0 12 | cymem==2.0.3 13 | cytoolz==0.10.1 14 | dask==2.8.1 15 | decorator==4.4.1 16 | defusedxml==0.6.0 17 | entrypoints==0.3 18 | h5py==2.10.0 19 | hypothesis==4.47.1 20 | idna==2.8 21 | importlib-metadata==0.23 22 | jedi==0.15.1 23 | jellyfish==0.7.2 24 | Jinja2==2.11.3 25 | joblib==0.14.0 26 | jsonschema==3.2.0 27 | kiwisolver==1.1.0 28 | MarkupSafe==1.1.1 29 | matplotlib==3.1.2 30 | mistune==0.8.4 31 | more-itertools==7.2.0 32 | murmurhash==1.0.2 33 | nbformat==4.4.0 34 | networkx==2.4 35 | nose==1.3.7 36 | numpy==1.17.4 37 | packaging==19.2 38 | pandas==0.25.3 39 | pandocfilters==1.4.2 40 | parso==0.5.1 41 | pexpect==4.7.0 42 | pickleshare==0.7.5 43 | plac==1.1.3 44 | pluggy==0.13.1 45 | preshed==3.0.2 46 | prometheus-client==0.7.1 47 | prompt-toolkit==2.0.10 48 | ptyprocess==0.6.0 49 | py==1.10.0 50 | pyemd==0.5.1 51 | Pygments==2.7.4 52 | pyparsing==2.4.5 53 | Pyphen==0.9.5 54 | pyrsistent==0.15.6 55 | python-dateutil==2.8.1 56 | pytz==2019.3 57 | PyYAML==5.4 58 | pyzmq==18.1.1 59 | requests==2.22.0 
60 | scikit-learn==0.21.3 61 | scipy==1.3.3 62 | Send2Trash==1.5.0 63 | six==1.13.0 64 | spacy==2.2.3 65 | srsly==0.2.0 66 | terminado==0.8.3 67 | testpath==0.4.4 68 | textacy==0.9.1 69 | thinc==7.3.1 70 | toolz==0.10.0 71 | tornado==6.0.3 72 | tqdm==4.39.0 73 | traitlets==4.3.3 74 | urllib3==1.26.5 75 | wasabi==0.4.0 76 | wcwidth==0.1.7 77 | webencodings==0.5.1 78 | widgetsnbextension==3.5.1 79 | zipp==0.6.0 80 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | Releases page is here. 5 | 6 | ## [Unreleased] 7 | 8 | 9 | ## [v1.0.3-open_source_release] - 2020-06-10 10 | #### Added: 11 | Original open-source release of this repository on github.com/nasa after having received SRA (software release authority) approval. 12 | 13 | 14 | 15 | # Guidelines for ChangeLog Entries 16 | 17 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 18 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 19 | 20 | ### Guiding Principles 21 | - Changelogs are for humans, not machines. 22 | - There should be an entry for every single version. 23 | - The same types of changes should be grouped. 24 | - Versions and sections should be linkable. 25 | - The latest version comes first. 26 | - The release date of each version is displayed. 27 | 28 | ### All Entries Sould be Under One of These Types of changes 29 | - Added for new features. 30 | - Changed for changes in existing functionality. 31 | - Deprecated for soon-to-be removed features. 32 | - Removed for now removed features. 33 | - Fixed for any bug fixes. 34 | - Security in case of vulnerabilities. 
35 | 36 | Google technical writer Sarah Maddox gave the following advice about release notes: 37 | `“The most important function of release notes is to let customers know that something has changed in the product, particularly when that something may affect the way the customer uses the product.”` 38 | -------------------------------------------------------------------------------- /tests/test_extract_from_doc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from dsconcept.model import * 3 | 4 | import logging 5 | 6 | logging.basicConfig(level=logging.WARNING) 7 | logging.disable(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.WARNING) 10 | 11 | 12 | class TestExtractFromDoc(unittest.TestCase): 13 | def setUp(self): 14 | self.nlp = spacy.load("en_core_web_sm") 15 | self.doc = nlp( 16 | """The NASA Scientific and Technical Information (STI) Program was established to support the 17 | objectives of NASA’s missions and research. The Mission of the STI Program is to support the 18 | advancement of aerospace knowledge and contribute to U.S. competitiveness in aerospace research and 19 | development. This program is essential to help NASA avoid duplication of research by sharing 20 | information and to ensure that the U.S. maintains its preeminence in aerospace-related industries 21 | and education. 
The NASA STI Program acquires, processes, archives, announces, and disseminates 22 | NASA STI and acquires worldwide STI of critical importance to the 23 | National Aeronautics and Space Administation (NASA) and the Nation.""" 24 | ) 25 | self.terms_tagged = extract_from_doc(self.doc) 26 | 27 | def test_is_set(self): 28 | self.assertEqual(dict, type(self.terms_tagged)) 29 | 30 | def test_has_terms(self): 31 | self.assertGreater(len(self.terms_tagged), 0) 32 | 33 | def test_has_all_feature_types(self): 34 | self.term_types = {term_type for term, term_type in self.terms_tagged.items()} 35 | LOG.info(self.term_types) 36 | LOG.info(self.terms_tagged) 37 | self.assertEqual( 38 | {"NOUN", "PROPN", "NOUN_CHUNK", "ENT", "ACRONYM"}, self.term_types 39 | ) 40 | 41 | 42 | if __name__ == "__main__": 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /src/concepts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import dsconcept.model as ml 5 | 6 | logging.basicConfig(level=logging.INFO) 7 | LOG = logging.getLogger(__name__) 8 | LOG.setLevel(logging.INFO) 9 | 10 | 11 | def main( 12 | in_corpus, 13 | concept_field, 14 | cat_field, 15 | out_indices, 16 | out_cat_indices, 17 | out_raw2lemma, 18 | out_cat_raw2lemma, 19 | ): 20 | LOG.info(f"Corpus: {in_corpus}") 21 | LOG.info(f"Keyword Field: {concept_field}") 22 | LOG.info(f"Category Field: {cat_field}") 23 | 24 | ce = ml.ConceptExtractor() 25 | ce.from_corpus(in_corpus, concept_field) 26 | LOG.info(f"Output keyword indices: {out_indices}") 27 | LOG.info(f"Output keyword raw2lemma: {out_raw2lemma}") 28 | ce.to_jsons(out_indices, out_raw2lemma) 29 | 30 | LOG.info(f"Extracting categories.") 31 | ce_higher = ml.ConceptExtractor() 32 | ce_higher.from_corpus(in_corpus, cat_field) 33 | LOG.info(f"Output category indices: {out_cat_indices}") 34 | LOG.info(f"Output category raw2lemma: 
{out_cat_raw2lemma}") 35 | ce_higher.to_jsons(out_cat_indices, out_cat_raw2lemma) 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser( 40 | description="""Get indices of processed corpus for all of concept and category 41 | tags. Also get lemmas for these concepts and categories. Output all of this 42 | information to json files.""" 43 | ) 44 | parser.add_argument("i", help="input processed jsonlines corpus") 45 | parser.add_argument("k", help="concept field") 46 | parser.add_argument("c", help="concept field") 47 | parser.add_argument("ok", help="output indices for concepts") 48 | parser.add_argument("oc", help="output indices for categories") 49 | parser.add_argument("rk", help="out keyword raw to lemma mapping") 50 | parser.add_argument("rc", help="out category raw to lemma mapping") 51 | args = parser.parse_args() 52 | main(args.i, args.k, args.c, args.ok, args.oc, args.rk, args.rc) 53 | -------------------------------------------------------------------------------- /src/make_vec_and_matrix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | import joblib 6 | import numpy as np 7 | import yaml 8 | from sklearn.feature_extraction import DictVectorizer 9 | from sklearn.model_selection import train_test_split 10 | 11 | import dsconcept.model as ml 12 | 13 | logging.basicConfig(level=logging.INFO) 14 | LOG = logging.getLogger(__name__) 15 | LOG.setLevel(logging.INFO) 16 | 17 | VECTORIZER = "vectorizer.jbl" 18 | FEATURE_MATRIX = "feature_matrix.jbl" 19 | 20 | 21 | def main(in_features, in_config, out_feature_dir, out_vectorizer): 22 | with open(in_config, "r") as f0: 23 | config = yaml.safe_load(f0) 24 | 25 | LOG.info(f"Loading features from {in_features}.") 26 | fe = ml.FeatureExtractor() 27 | fe.from_jsonlines(in_features) 28 | weighted_features = fe.weight_terms(config["weights"]) 29 | limited_features = fe.limit_features( 30 | 
weighted_features, 31 | config["min_feature_occurrence"], 32 | config["max_feature_occurrence"], 33 | ) 34 | v = DictVectorizer() 35 | X = v.fit_transform(limited_features) 36 | 37 | out_feature_matrix = out_feature_dir / FEATURE_MATRIX 38 | LOG.info(f"Outputting vectorizer to {out_vectorizer}.") 39 | joblib.dump(v, out_vectorizer) 40 | LOG.info(f"Outputting feature matrix to {out_feature_matrix}.") 41 | joblib.dump(X, out_feature_matrix) 42 | 43 | _, _, ind_train, ind_test = train_test_split( 44 | X, np.array(range(X.shape[0])), test_size=0.10, random_state=42 45 | ) 46 | np.save(out_feature_dir / f"train_inds.npy", ind_train) 47 | np.save(out_feature_dir / f"test_inds.npy", ind_test) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser( 52 | description="""From features file, create a feature matrix and vectorizer 53 | which translates between columns of the matrix and feature strings. Limit 54 | which features are included in these files with configuration.""" 55 | ) 56 | parser.add_argument("in_features", help="input features jsonlines file") 57 | parser.add_argument("in_config", help="configuration for creating models") 58 | parser.add_argument( 59 | "out_feature_dir", 60 | help="output directory for feature matrix and indices", 61 | type=Path, 62 | ) 63 | parser.add_argument( 64 | "out_vectorizer", help="output path for for vectorizer", type=Path, 65 | ) 66 | # TODO: split outputs 67 | args = parser.parse_args() 68 | 69 | main(args.in_features, args.in_config, args.out_feature_dir, args.out_vectorizer) 70 | -------------------------------------------------------------------------------- /src/process.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import json 4 | 5 | import pandas as pd 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.INFO) 10 | 11 | 12 | def main(infile, in_subj_mapping, 
outfile): 13 | LOG.info(f"Reading corpus from {infile}.") 14 | df = pd.read_json(infile, orient="records", lines=True) 15 | LOG.info(f"Shape of input: {df.shape}") 16 | 17 | with open(in_subj_mapping, "r") as f0: 18 | subj_mapping = json.load(f0) 19 | 20 | def get_subjs(x): 21 | if type(x) == list: 22 | s = set( 23 | subj_mapping[s.strip().lower()] 24 | for s in x 25 | if s.strip().lower() in subj_mapping 26 | ) 27 | l = list(s) 28 | else: 29 | l = None 30 | return l 31 | 32 | categories = df["D072B (Subject Category)"].apply(get_subjs) 33 | 34 | text_col = ( 35 | " " + df["D245A (Title)"] + " <ABSTRACT> " + df["D520B (Abstract)"] 36 | ) 37 | keywords = ( 38 | df["D650A (NASA Major Indexing Terms)"] 39 | + df["D659A (NASA Minor Indexing Terms)"] 40 | ) 41 | 42 | pdf = pd.DataFrame() 43 | pdf["text"] = text_col 44 | pdf["keywords"] = keywords 45 | pdf["subjects"] = df["D072B (Subject Category)"] 46 | pdf["categories"] = categories 47 | 48 | def remove_no_abstracts(x): 49 | if type(x) == str: 50 | if "no abstract available" not in x.lower(): 51 | val = True 52 | else: 53 | val = False 54 | else: 55 | val = False 56 | return val 57 | 58 | has_abs = pdf["text"].apply(remove_no_abstracts) 59 | has_kwds = pdf["keywords"].apply(lambda x: type(x) is list) 60 | has_subj = pdf["keywords"].apply(lambda x: type(x) is list) # Should be subjects? 61 | has_cats = pdf["categories"].apply(lambda x: type(x) is list) 62 | tf = has_kwds & has_subj & has_cats & has_abs 63 | LOG.info(f"Removed {sum(~tf)} rows.") 64 | 65 | LOG.info(f"Outputting processed corpus to {outfile}.") 66 | pdf[tf].to_json(outfile, orient="records", lines=True) 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser( 71 | description="""Merge text and keyword fields from input corpus. 
class TestConceptTrainer(TestCase):
    """Smoke tests for dsconcept.train.ConceptTrainer on random data."""

    def setUp(self):
        # The trainer needs a ConceptExtractor plus a grid-searched estimator.
        # (An unused FeatureExtractor was previously constructed here; removed.)
        ce = dsconcept.model.ConceptExtractor()
        self.d = TempDirectory()
        data = b'{"abstract":["Astronauts are very cool."], "concept": ["ASTRONAUTS", "COOL THINGS"]}\n {"abstract":["NASA is going to Mars."], "concept":["NASA", "MARS"]}'
        self.d.write("test.json", data)
        self.corpus_path = f"{self.d.path}/test.json"
        s = 100
        # Random binary feature matrix (s docs x 2 features) and random labels.
        self.X = csc_matrix(np.random.randint(2, size=s * 2).reshape(s, 2))
        self.y = np.random.randint(2, size=s)
        paramgrid = {
            "alpha": [0.01, 0.001, 0.0001],
            "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}],
            "max_iter": [1],
            "loss": ["log"],
        }  # requires loss function with predict_proba
        clf = GridSearchCV(
            SGDClassifier(), paramgrid, scoring="f1"
        )  # requires GridSearchCV
        self.ct = dsconcept.train.ConceptTrainer(ce, clf)

    def test_create_concept_classifier(self):
        """Training a single concept should write a loadable model pickle."""
        out_dir = Path(f"{self.d.path}/models")
        out_dir.mkdir()
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.5, random_state=42
        )
        self.ct.create_concept_classifier(
            "test_concept", X_train, X_test, y_train, y_test, out_dir
        )
        clf = joblib.load(out_dir / "test_concept.pkl")
        LOG.info(clf)

    def test_train_all(self):  # This test is super naive. Does not check behaviour.
        # train_all is expected to leave train/test index arrays on disk.
        self.ct.train_all(self.X, Path(f"{self.d.path}/models"), 5)
        test_inds = np.load(Path(self.d.path) / "test_inds.npy")
        train_inds = np.load(Path(self.d.path) / "train_inds.npy")
        LOG.info(f"test_inds: {test_inds}")
        LOG.info(f"train_inds: {train_inds}")

    @given(arrays(dtype=np.float_, shape=1))
    def test_get_dispersed_subset(self, array):
        """The dispersed subset never exceeds the requested size."""
        subset = dsconcept.train.get_dispersed_subset(array, 5)
        self.assertLessEqual(len(subset), 5)
        LOG.info(subset)

    def tearDown(self):
        self.d.cleanup()
usage="$(basename "$0") [-h] [-i path] [-o path] [-d docker-image] [-l loglevel] [-c cpus]
Concept training pipeline

where:
    -h  show this help text
    -i  (absolute path) input data directory
    -o  (absolute path) output data directory
    -d  the docker image to use
    -l  the log level to use ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
    -c  number of cpus to allow container to use"

# Parse command line arguments.
# BUG FIX: the original case statement repeated a ':)' branch after every
# option; only the first pattern occurrence can ever match in a case, so
# the duplicates were dead code. A single ':' branch handles all missing
# arguments (getopts sets option=':' whenever an OPTARG is absent).
input=""
output=""
while getopts ':hi:o:d:l:c:' option; do
  case "$option" in
    h) echo "$usage"
       exit
       ;;
    i) input=$OPTARG
       ;;
    o) output=$OPTARG
       ;;
    d) image=$OPTARG
       ;;
    l) LOGLEVEL=$OPTARG
       ;;
    c) cpus=$OPTARG
       ;;
    :) printf "missing argument for -%s\n" "$OPTARG" >&2
       echo "$usage" >&2
       exit 1
       ;;
    \?) printf "illegal option: -%s\n" "$OPTARG" >&2
        echo "$usage" >&2
        exit 1
        ;;
  esac
done
shift $((OPTIND - 1))

# Validate arguments before touching docker.
if [ ! -d "${input}" ]; then
    echo "${input} directory does not exist. Choose a directory which exists and contains the requisite data."
    exit 1
fi
if [ -d "${output}" ]; then
    echo "${output} directory already exists. Choose a new directory name which does not exist."
    exit 1
fi
if [ -z "${image}" ]; then
    echo "No docker image given. Supply one with -d."
    exit 1
fi
if [ "${LOGLEVEL}" = "" ]; then
    echo "Setting empty LOGLEVEL to INFO."
    export LOGLEVEL="INFO"
fi
if [ "${cpus}" = "" ]; then
    # BUG FIX: this message previously said "Setting empty LOGLEVEL to INFO."
    echo "Setting empty cpus to 0.000 (no limit)."
    export cpus=0.000
fi

mkdir "${output}"

echo "Running full pipeline."
docker run -it \
    -v "${input}":/home/pipeline/volumes/in_data \
    -v "${output}":/home/pipeline/volumes/out_data \
    -e LOGLEVEL="${LOGLEVEL}" \
    --cpus="${cpus}" \
    "${image}" bash -c 'bash pipeline/start.sh'
echo "Completed Pipeline."
self.assertEqual(len(self.fe.features), len(d)) 43 | 44 | def test_from_corpus_to_jsonlines(self): 45 | self.fe.from_corpus_to_jsonlines( 46 | self.corpus_path, f"{self.d.path}/features.jsonl", "abstract", 47 | ) 48 | 49 | def test_from_jsonlines(self): 50 | data = b'{"astronaut":"NOUN", "space": "NOUN", "NASA": "ENT"}\n{"Mars": "PROPN", "dog": "NOUN"}' 51 | features_out = "features.jsonl" 52 | self.d.write(features_out, data) 53 | self.fe.from_jsonlines(f"{self.d.path}/{features_out}") 54 | self.assertSetEqual(self.fe.term_types, {"NOUN", "PROPN", "ENT"}) 55 | 56 | def test_to_jsonlines(self): 57 | self.fe.features = [ 58 | {"space": "NOUN", "Mars": "PROPN"}, 59 | {"Anita": "PROPN", "Adams": "PROPN"}, 60 | ] 61 | out_features = "features.jsonl" 62 | self.fe.to_jsonlines(f"{self.d.path}/{out_features}") 63 | 64 | @given(features(), weights()) 65 | def test_weight_terms(self, d, w): 66 | self.fe.features = d 67 | self.fe.weight_terms(w) 68 | 69 | @given(features(), weights()) 70 | def test_limit_features(self, d, w): 71 | self.fe.features = d 72 | weighted_features = self.fe.weight_terms( 73 | w 74 | ) # Test method contingent upon another test. Bad? 
def main(
    in_feature_matrix,
    in_ind_train,
    in_ind_test,
    in_cat_indices,
    in_cat_raw2lemma,
    in_config,
    out_dir,
):
    """Train one grid-searched SGD classifier per category.

    Loads the sparse feature matrix and the saved train/test index arrays,
    rebuilds the category ConceptExtractor from its json artifacts, then
    delegates per-category training to ConceptTrainer.
    """
    with open(in_config, "r") as f0:
        config = yaml.safe_load(f0)

    feature_matrix = joblib.load(in_feature_matrix)
    train_index = np.load(in_ind_train)
    test_index = np.load(in_ind_test)

    LOG.info(
        f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}."
    )
    cat_ext = ml.ConceptExtractor()
    cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma)

    # SGD with log loss so the fitted models expose predict_proba.
    param_grid = {
        "alpha": [0.01, 0.001, 0.0001],
        "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}],
        "max_iter": [1],
        "loss": ["log"],
    }
    grid_search = GridSearchCV(SGDClassifier(), param_grid, scoring="f1")
    trainer = ConceptTrainer(cat_ext, grid_search)

    trainer.train_concepts(
        feature_matrix,
        train_index,
        test_index,
        out_dir / OUT_MODELS_DIR,
        config["min_concept_occurrence"],
    )
    LOG.info("Complete.")
def main(in_records, inds_loc, out_records_dir):
    """Write train/dev/test CSVs plus label maps for BERT category training.

    Splits the existing train indices into bert train/dev subsets (saving the
    new index arrays alongside the originals), binarizes the ``categories``
    column, and writes one CSV per split with a leading ``text`` column
    followed by one 0/1 column per category. Also writes ``classes.txt``,
    ``id_to_label.json``, and ``label_to_id.json``.

    Args:
        in_records: path to the cleaned jsonlines records.
        inds_loc: directory holding train_inds.npy / test_inds.npy.
        out_records_dir: output directory (created if missing).
    """
    train_inds_loc = inds_loc / "train_inds.npy"
    test_inds_loc = inds_loc / "test_inds.npy"
    train_bert_inds_loc = inds_loc / "train_bert_inds.npy"
    dev_bert_inds_loc = inds_loc / "dev_bert_inds.npy"

    LOG.info(f"Loading cleaned records from {in_records}.")
    records = pd.read_json(in_records, orient="records", lines=True)
    train_inds = np.load(train_inds_loc)
    test_inds = np.load(test_inds_loc)

    LOG.info("Creating bert cat df format.")
    mlb = MultiLabelBinarizer()
    cat_bin_array = mlb.fit_transform(records["categories"])
    cat_df = pd.DataFrame(cat_bin_array)
    cat_df.columns = mlb.classes_
    cat_df["text"] = records["text"]
    # Put the text first, then one binary column per category.
    cat_df = cat_df[["text"] + mlb.classes_.tolist()]

    # Carve a dev set out of the existing train split; persist the index
    # arrays so downstream evaluation can reproduce the exact split.
    train_bert_inds, dev_bert_inds = train_test_split(train_inds, test_size=0.25)
    np.save(train_bert_inds_loc, train_bert_inds)
    np.save(dev_bert_inds_loc, dev_bert_inds)

    ml_sets = {
        "train": cat_df.iloc[train_bert_inds],
        "test": cat_df.iloc[test_inds],
        "dev": cat_df.iloc[dev_bert_inds],
    }

    # FIX: also create missing parent directories instead of failing.
    out_records_dir.mkdir(parents=True, exist_ok=True)
    for set_type, ml_set in ml_sets.items():
        outfile = out_records_dir / f"{set_type}.csv"
        LOG.info(f"Writing to {outfile}")
        ml_set.to_csv(outfile, index=True)

    out_id_to_label = str(out_records_dir / "id_to_label.json")
    out_label_to_id = str(out_records_dir / "label_to_id.json")
    out_classes = str(out_records_dir / "classes.txt")

    id_to_label = {i: c for i, c in enumerate(mlb.classes_)}
    label_to_id = {c: i for i, c in enumerate(mlb.classes_)}

    LOG.info(f"Writing classes to {out_classes}")
    # One class name per line, no trailing newline.
    classes = mlb.classes_.tolist()
    with open(out_classes, "w") as f0:
        f0.write("\n".join(c.strip() for c in classes))

    LOG.info(f"Writing to {out_id_to_label}.")
    with open(out_id_to_label, "w") as f0:
        json.dump(id_to_label, f0)  # NOTE: json converts int keys to strings

    LOG.info(f"Writing to {out_label_to_id}.")
    with open(out_label_to_id, "w") as f0:
        json.dump(label_to_id, f0)
np.load(f"data/interim/{experiment_name}/test_inds.npy") 39 | feature_matrix = joblib.load(f"data/interim/{experiment_name}/feature_matrix.jbl") 40 | in_cat_models = Path(f"models/{experiment_name}/categories/models/") 41 | in_kwd_models = Path(f"models/{experiment_name}/keywords/models/") 42 | cat_preds = np.load(in_cat_preds) # based on experiment or explicit path? 43 | cat_clfs = load_category_models(in_cat_models) 44 | cd = load_concept_models(in_kwd_models) 45 | clf = HierarchicalClassifier(cat_clfs, cd) 46 | 47 | if limit is not None: 48 | LOG.info(f"Limiting to {limit} test records.") 49 | feature_matrix_test = feature_matrix.tocsc()[test_inds[0:limit], :] 50 | cat_preds = cat_preds[0:limit, :] 51 | # TODO: How does this affect indices? 52 | else: 53 | feature_matrix_test = feature_matrix.tocsc()[test_inds, :] 54 | 55 | LOG.info(f'Synthesizing predictions with strategy "{synth_strat}".') 56 | all_cat_inds = get_cat_inds(clf.categories, cat_preds, t=t) 57 | if con_limit is not None: 58 | conwc = clf.concepts_with_classifiers[0:con_limit] 59 | else: 60 | conwc = clf.concepts_with_classifiers 61 | shape = (feature_matrix_test.shape[0], len(conwc)) 62 | with tqdm(total=shape[0]) as pbar: 63 | get_synth_preds( 64 | out_store, 65 | shape, 66 | all_cat_inds, 67 | clf.categories, 68 | synth_batch_size, 69 | only_cat=False, 70 | synth_strat=synth_strat, 71 | con_limit=con_limit, 72 | limit=limit, 73 | pbar=pbar, 74 | ) 75 | 76 | LOG.info("Obtaining metrics.") 77 | with h5py.File(out_store, "r") as f0: 78 | if limit is not None: 79 | target_values = f0["ground_truth"][0:limit, :] 80 | else: 81 | target_values = f0["ground_truth"].value 82 | with h5py.File(out_store, "r") as f0: 83 | synth_preds = f0["synthesis"].value 84 | 85 | jobs = [] 86 | mets_pbar = tqdm( 87 | range(len(conwc)), 88 | total=len(conwc), 89 | ) 90 | for i in mets_pbar: 91 | job = dask.delayed(get_mets)( 92 | i, synth_preds, target_values, conwc, mets_pbar 93 | ) 94 | jobs.append(job) 95 | records = 
dask.compute(jobs) 96 | new_recs_df = pd.DataFrame(records[0]) 97 | LOG.info(f"Saving results to {out_synth_scores}.") 98 | new_recs_df.to_csv(out_synth_scores) 99 | 100 | 101 | if __name__ == "__main__": 102 | parser = argparse.ArgumentParser(description="Say hello") 103 | parser.add_argument("--experiment_name", help="input txt file") 104 | parser.add_argument("--synth_strat", help="input txt file") 105 | parser.add_argument("--in_cat_preds", help="input txt file") 106 | parser.add_argument("--store", help="input txt file") 107 | parser.add_argument("--synth_batch_size", help="input txt file", type=int) 108 | parser.add_argument("--threshold", help="input txt file", type=float) 109 | parser.add_argument("--out_synth_scores", help="input txt file") 110 | parser.add_argument( 111 | "--limit", help="size for sample to test synthesis", type=int, default=None 112 | ) 113 | parser.add_argument( 114 | "--con_limit", help="size for concept sample", type=int, default=None 115 | ) 116 | args = parser.parse_args() 117 | main( 118 | args.experiment_name, 119 | args.synth_strat, 120 | args.in_cat_preds, 121 | args.store, 122 | args.synth_batch_size, 123 | args.threshold, 124 | args.out_synth_scores, 125 | args.limit, 126 | args.con_limit, 127 | ) 128 | -------------------------------------------------------------------------------- /data/interim/subj_mapping.json: -------------------------------------------------------------------------------- 1 | {"environment pollution": "geosciences", "energy production and conversion": "geosciences", "oceanography": "geosciences", "geophysics": "geosciences", "earth resources and remote sensing": "geosciences", "geosciences (general)": "geosciences", "meteorology and climatology": "geosciences", "spacecraft design, testing and performance": "astronautics", "astrodynamics": "astronautics", "astronautics (general)": "astronautics", "ground support systems and facilities (space)": "astronautics", "launch vehicles and launch operations": 
"astronautics", "space transportation and safety": "astronautics", "spacecraft instrumentation and astrionics": "astronautics", "spacecraft propulsion and power": "astronautics", "space communications, spacecraft communications, command and tracking": "astronautics", "space transportation": "astronautics", "spacecraft instrumentation": "astronautics", "launch vehicles and space vehicles": "astronautics", "physics (general)": "physics", "plasma physics": "physics", "optics": "physics", "nuclear physics": "physics", "acoustics": "physics", "solid-state physics": "physics", "atomic and molecular physics": "physics", "physics of elementary particles and fields": "physics", "nuclear and high-energy physics": "physics", "thermodynamics and statistical physics": "physics", "astronomy": "space sciences", "solar physics": "space sciences", "lunar and planetary science and exploration": "space sciences", "space radiation": "space sciences", "astrophysics": "space sciences", "space sciences (general)": "space sciences", "lunar and planetary exploration": "space sciences", "space biology": "space sciences", "inorganic, organic and physical chemistry": "chemistry and materials", "space processing": "chemistry and materials", "chemistry and materials (general)": "chemistry and materials", "propellants and fuels": "chemistry and materials", "nonmetallic materials": "chemistry and materials", "metals and metallic materials": "chemistry and materials", "composite materials": "chemistry and materials", "materials processing": "chemistry and materials", "metallic materials": "chemistry and materials", "inorganic and physical chemistry": "chemistry and materials", "materials": "chemistry and materials", "research and support facilities (air)": "aeronautics", "avionics and aircraft instrumentation": "aeronautics", "aircraft communications and navigation": "aeronautics", "aircraft propulsion and power": "aeronautics", "aerodynamics": "aeronautics", "aeronautics (general)": 
"aeronautics", "air transportation and safety": "aeronautics", "aircraft design, testing and performance": "aeronautics", "aircraft stability and control": "aeronautics", "aircraft instrumentation": "aeronautics", "economics and cost analysis": "social and information sciences", "documentation and information science": "social and information sciences", "technology utilization and surface transportation": "social and information sciences", "administration and management": "social and information sciences", "law, political science and space policy": "social and information sciences", "social and information sciences (general)": "social and information sciences", "social sciences (general)": "social and information sciences", "statistics and probability": "mathematical and computer sciences", "computer operations and hardware": "mathematical and computer sciences", "computer programming and software": "mathematical and computer sciences", "computer systems": "mathematical and computer sciences", "cybernetics, artificial intelligence and robotics": "mathematical and computer sciences", "mathematical and computer sciences (general)": "mathematical and computer sciences", "numerical analysis": "mathematical and computer sciences", "systems analysis and operations research": "mathematical and computer sciences", "theoretical mathematics": "mathematical and computer sciences", "cybernetics": "mathematical and computer sciences", "systems analysis": "mathematical and computer sciences", "mathematics and information sciences": "mathematical and computer sciences", "general.": "general", "general": "general", "communications and radar": "engineering", "engineering (general)": "engineering", "electronics and electrical engineering": "engineering", "fluid mechanics and thermodynamics": "engineering", "instrumentation and photography": "engineering", "lasers and masers": "engineering", "mechanical engineering": "engineering", "quality assurance and reliability": "engineering", 
"structural mechanics": "engineering", "fluid mechanics and heat transfer": "engineering", "behavioral sciences": "life sciences", "aerospace medicine": "life sciences", "man/system technology and life support": "life sciences", "exobiology": "life sciences", "life sciences (general)": "life sciences", "life sciences": "life sciences"} -------------------------------------------------------------------------------- /src/make_kwd_models.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | import joblib 6 | import numpy as np 7 | import yaml 8 | from sklearn.linear_model import SGDClassifier 9 | from sklearn.model_selection import GridSearchCV 10 | 11 | import dsconcept.model as ml 12 | import dsconcept.train as tr 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | LOG = logging.getLogger(__name__) 16 | LOG.setLevel(logging.DEBUG) 17 | 18 | VECTORIZER = "vectorizer.jbl" 19 | FEATURE_MATRIX = "feature_matrix.jbl" 20 | OUT_MODELS_DIR = "models/topic_" 21 | 22 | 23 | def main( 24 | in_feature_matrix, 25 | in_ind_train, 26 | in_ind_test, 27 | in_kwd_indices, 28 | in_cat_indices, 29 | in_kwd_raw2lemma, 30 | in_cat_raw2lemma, 31 | in_config, 32 | out_dir, 33 | topics=True, 34 | ): 35 | with open(in_config, "r") as f0: 36 | config = yaml.safe_load(f0) 37 | 38 | X = joblib.load(in_feature_matrix) 39 | ind_train = np.load(in_ind_train) 40 | ind_test = np.load(in_ind_test) 41 | 42 | LOG.info(f"Loading keyword extractor from {in_kwd_indices} and {in_kwd_raw2lemma}.") 43 | ce = ml.ConceptExtractor() 44 | ce.from_jsons(in_kwd_indices, in_kwd_raw2lemma) 45 | 46 | LOG.info( 47 | f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}." 
48 | ) 49 | cat_ext = ml.ConceptExtractor() 50 | cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma) 51 | 52 | paramgrid = { 53 | "alpha": [0.01, 0.001, 0.0001], 54 | "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}], 55 | "max_iter": [5], 56 | "loss": ["log"], 57 | } # requires loss function with predict_proba 58 | clf = GridSearchCV( 59 | SGDClassifier(), paramgrid, scoring="f1", n_jobs=-1, 60 | ) # requires GridSearchCV 61 | out_models = out_dir / OUT_MODELS_DIR 62 | trainer = tr.ConceptTrainer(ce, clf) 63 | doc_topic_indices = cat_ext.concept_index_mapping 64 | 65 | if topics: 66 | LOG.info( 67 | f"Training one set for each of {len(doc_topic_indices)} topics divisions." 68 | ) 69 | for topic, doc_topic_index in doc_topic_indices.items(): 70 | trainer.train_concepts( 71 | X, 72 | ind_train, 73 | ind_test, 74 | out_models, 75 | config["min_concept_occurrence"], 76 | topic, 77 | doc_topic_index, 78 | ) 79 | LOG.info("Training one general set") 80 | trainer.train_concepts( 81 | X, ind_train, ind_test, out_models, config["min_concept_occurrence"] 82 | ) 83 | LOG.info("Complete.") 84 | 85 | 86 | if __name__ == "__main__": 87 | parser = argparse.ArgumentParser( 88 | description="""Use feature matrix and location of indices to create classifiers 89 | for the concepts in the corpus.""" 90 | ) 91 | parser.add_argument( 92 | "in_feature_matrix", help="input scipy sparse matrix of features" 93 | ) 94 | parser.add_argument("in_ind_train", help="train set index") 95 | parser.add_argument("in_ind_test", help="test set index") 96 | parser.add_argument("in_kwd_indices", help="keyword indicies") 97 | parser.add_argument("in_cat_indices", help="category indices") 98 | parser.add_argument("in_kwd_raw2lemma", help="keyword raw to lemma mapping") 99 | parser.add_argument("in_cat_raw2lemma", help="category raw to lemma mapping") 100 | parser.add_argument("in_config", help="configuration for creating models") 101 | parser.add_argument( 102 | "out_dir", 103 | help="output 
def init_domain_bert(base_model_dir, finetuned_model_dir, map_loc="cpu"):
    """Build a multilabel BERT classifier and load its finetuned weights.

    Args:
        base_model_dir: directory of the base (pretrained) BERT model.
        finetuned_model_dir: directory holding the label processor files and
            ``finetuned_pytorch_model.bin``.
        map_loc: torch map_location for loading the weights ("cpu" default).

    Returns:
        (classifier, processor) pair ready for prediction.
    """
    LOG.info("Loading BERT models")
    processor = mll.MultiLabelTextProcessor(finetuned_model_dir)
    classifier = mll.BERTMultilabelClassifier(
        processor, bert_model=base_model_dir, do_lower_case=False,
    )
    classifier.initialize_devices()
    weights_path = f"{finetuned_model_dir}/finetuned_pytorch_model.bin"
    classifier.load_model(weights_path, map_location=map_loc)
    return classifier, processor
def main(
    data_dir, models_dir, reports_dir, base_model_dir, finetuned_model_dir, sample,
):
    """Run the finetuned BERT category model over the test split and save
    its prediction matrix, column-aligned with the hierarchical classifier's
    category order, to ``reports_dir / "bert_cat_preds.npy"``.
    """
    test_inds = np.load(data_dir / "test_inds.npy")
    clean_data_loc = data_dir / "abs_kwds.jsonl"

    in_cat_models = models_dir / "categories/models/"
    in_kwd_models = models_dir / "keywords/models/"
    cat_raw2lemma_loc = models_dir / "cat_raw2lemma.json"

    out_preds_loc = reports_dir / "bert_cat_preds.npy"

    LOG.info("Loading models.")
    # The hierarchical classifier is loaded only to obtain the canonical
    # category ordering (clf.categories) used to align columns below.
    cat_clfs = gm.load_category_models(in_cat_models)
    cd = gm.load_concept_models(in_kwd_models)
    clf = gm.HierarchicalClassifier(cat_clfs, cd)
    with open(cat_raw2lemma_loc) as f0:
        cat_raw2lemma = json.load(f0)
    # base_model_dir = str(bert_models_dir / "cased_L-12_H-768_A-12")
    # processor_dir = str(bert_models_dir / "processor_dir")
    # finetuned_model_loc = str(
    #     bert_models_dir / "cased_L-12_H-768_A-12/cache/finetuned_pytorch_model.bin"
    # )
    bert_clf, processor = init_domain_bert(base_model_dir, finetuned_model_dir,)

    LOG.info(f'Loading records from "{clean_data_loc}".')
    # Only the test-split lines are read from the jsonlines corpus.
    if sample is not None:
        lines_to_load = test_inds[0:sample]
    else:
        lines_to_load = test_inds
    records_df = load_lines_to_df(clean_data_loc, lines_to_load)

    LOG.info(f"Processing {len(records_df)} records.")
    # NOTE(review): the text column is named "test" and the second argument
    # to _create_examples is also "test" — presumably the dsbert processor's
    # expected contract for inference examples; confirm against dsbert.
    df_example = pd.DataFrame()
    df_example["test"] = records_df["text"]
    df_example["label"] = 0  # dummy label; inference only
    df_example = df_example.reset_index()
    sample_examples = processor._create_examples(df_example, "test")

    LOG.info("Making BERT category predictions.")
    topic_predictions_df = bert_clf.predict(sample_examples)

    LOG.info("Transforming predictions into matrix which aligns with categories.")
    # Columns after the first two hold per-category scores; map raw category
    # names to their lemmatized form, then reorder to match clf.categories.
    cols = topic_predictions_df.iloc[:, 2:].columns
    only_preds = topic_predictions_df.iloc[:, 2:]
    tcols = [cat_raw2lemma[c] if c in cat_raw2lemma else c for c in cols]
    only_preds.columns = tcols
    only_preds = only_preds[clf.categories[0:-1]]  # don't include '' cat
    only_pred_vals = only_preds.values

    LOG.info(f'Saving results to "{out_preds_loc}".')
    np.save(out_preds_loc, only_pred_vals)
HierarchicalClassifier, StubBestEstimator 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | LOG = logging.getLogger(__name__) 16 | LOG.setLevel(logging.INFO) 17 | 18 | 19 | class TestHierarchicalClassifier(TestCase): 20 | def setUp(self) -> None: 21 | self.d = TempDirectory() 22 | self.clf_loc = Path(self.d.path) / "stub.jbl" 23 | out_info = {'concept': 'something', 'best_estimator_': StubBestEstimator()} 24 | joblib.dump(out_info, self.clf_loc) 25 | cat_clfs = [ 26 | {"best_estimator_": StubBestEstimator(), "concept": "physics"}, 27 | {"best_estimator_": StubBestEstimator(), "concept": "video games"}, 28 | ] 29 | kwd_clfs = { 30 | ("physics", "gamma ray"): StubBestEstimator(), 31 | ("video games", "minecraft"): StubBestEstimator(), 32 | ("video games", "kerbal space program"): StubBestEstimator(), 33 | ("", "minecraft"): StubBestEstimator(), 34 | ("", "gamma ray"): StubBestEstimator(), 35 | ("", "penguins"): StubBestEstimator(), 36 | } 37 | kwd_clfs_locs = { 38 | ("physics", "gamma ray"): self.clf_loc, 39 | ("video games", "minecraft"): self.clf_loc, 40 | ("video games", "kerbal space program"): self.clf_loc, 41 | ("", "minecraft"): self.clf_loc, 42 | ("", "gamma ray"): self.clf_loc, 43 | ("", "penguins"): self.clf_loc, 44 | } 45 | self.hclf = HierarchicalClassifier(cat_clfs, kwd_clfs) 46 | self.hclf_locs = HierarchicalClassifier(cat_clfs, kwd_clfs_locs) 47 | self.feature_matrix = np.array([[0, 0, 1, 1], [0, 1, 0, 1], [1, 1, 0, 0]]) 48 | v = DictVectorizer() 49 | d = [{"astronauts": 1, "astronomy": 1}, {"space": 1, "basalt": 1}] 50 | v.fit(d) 51 | self.v = v 52 | 53 | def test_cat_clfs(self): 54 | cats = ["physics", "video games", ""] 55 | self.assertListEqual(self.hclf.categories, cats) 56 | 57 | def test_kwd_clfs(self): 58 | kwds = ["gamma ray", "kerbal space program", "minecraft", "penguins"] 59 | self.assertListEqual(self.hclf.concepts_with_classifiers.tolist(), kwds) 60 | 61 | def test_predict_categories(self): 62 | cat_preds = 
self.hclf.predict_categories(self.feature_matrix) 63 | self.assertEqual(cat_preds.shape, (3, 2)) 64 | print(cat_preds) 65 | 66 | def test__predict_one_clf(self): 67 | pred = self.hclf._predict_one_clf(self.feature_matrix, 1, "video games") 68 | self.assertEqual(pred.shape[0], 3) 69 | 70 | def test__predict_one_clf_locs(self): 71 | pred = self.hclf_locs._predict_one_clf(self.feature_matrix, 1, "video games") 72 | self.assertEqual(pred.shape[0], 3) 73 | 74 | def test__predict_keywords(self): 75 | cat_indices = {"physics": [0], "video games": [1, 2]} 76 | store = self.hclf._predict_keywords( 77 | self.feature_matrix, 78 | f"{self.d.path}/store.h5", 79 | cat_indices, 80 | only_no_topic=False, 81 | use_dask=False, 82 | ) 83 | with h5py.File(store, 'r') as f0: 84 | pred_array = f0["predictions"][()] 85 | LOG.info(pred_array) 86 | self.assertEqual(pred_array.shape, (3, 3, 4)) 87 | 88 | def test__predict_keywords_locs(self): 89 | cat_indices = {"physics": [0], "video games": [1, 2]} 90 | store = self.hclf_locs._predict_keywords( 91 | self.feature_matrix, 92 | f"{self.d.path}/store.h5", 93 | cat_indices, 94 | only_no_topic=False, 95 | use_dask=False, 96 | ) 97 | with h5py.File(store, 'r') as f0: 98 | pred_array = f0["predictions"][()] 99 | LOG.info(pred_array) 100 | self.assertEqual(pred_array.shape, (3, 3, 4)) 101 | 102 | def test_get_synth_preds(self): 103 | cat_indices = {"physics": [0], "video games": [1, 2]} 104 | store = self.hclf._predict_keywords( 105 | self.feature_matrix, 106 | f"{self.d.path}/store.h5", 107 | cat_indices, 108 | only_no_topic=False, 109 | use_dask=False, 110 | ) 111 | all_cat_inds = { 112 | "physics": np.array([0]), 113 | "video games": np.array([0, 1]), 114 | "": np.array([0, 1, 2]), 115 | } 116 | self.hclf.get_synth_preds( 117 | store, 118 | all_cat_inds, 119 | batch_size=10000, 120 | only_cat=False, 121 | synth_strat="mean", 122 | use_dask=False, 123 | ) 124 | with h5py.File(store) as f0: 125 | synth_array = f0["synthesis"].value 126 | 
LOG.info(synth_array) 127 | self.assertEqual(synth_array.shape, (3, 4)) 128 | 129 | def test__to_strings(self): 130 | synth_array = np.array( 131 | [[0, 0.51, 0.9, 0.2], [0.8, 0.1, 0.4, 0.7], [0.4, 0.2, 0.1, 0.9]] 132 | ) 133 | kwd_strs = self.hclf._to_strings( 134 | self.hclf.concepts_with_classifiers, synth_array, t=0.5 135 | ) 136 | results = [ 137 | [("minecraft", 0.9), ("kerbal space program", 0.51)], 138 | [("gamma ray", 0.8), ("penguins", 0.7)], 139 | [("penguins", 0.9)], 140 | ] 141 | self.assertEqual(results, kwd_strs) 142 | LOG.info(kwd_strs) 143 | 144 | def test_predict(self): 145 | examples = [ 146 | "Olympus Mons is the largest volcano in the solar system", 147 | "Database management is critical for information retrieval", 148 | "We used a logistic regression with batched stochastic gradient descent.", 149 | ] 150 | weights = {"NOUN": 1, "PROPN": 1, "ENT": 1, "NOUN_CHUNK": 1, "ACRONYM": 1} 151 | self.hclf.vectorizer = self.v 152 | features, feature_matrix = self.hclf.vectorize(examples, weights) 153 | self.hclf.predict(feature_matrix) 154 | 155 | 156 | if __name__ == "__main__": 157 | unittest.main() -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | import os 16 | import sys 17 | 18 | sys.path.insert(0, os.path.abspath("../src")) 19 | sys.path.insert(0, os.path.abspath("../src/dsconcept")) 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = "Research Access" 24 | copyright = "2018, Anthony Buonomo" 25 | author = "Anthony Buonomo" 26 | 27 | # The short X.Y version 28 | version = "" 29 | # The full version, including alpha/beta/rc tags 30 | release = "3.0.0" 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | "sphinx.ext.autodoc", 44 | "sphinx.ext.todo", 45 | "sphinx.ext.napoleon", 46 | ] 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ["_templates"] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = ".rst" 56 | 57 | # The master toctree document. 58 | master_doc = "index" 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = None 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 
73 | pygments_style = None 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = "alabaster" 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 86 | # 87 | # html_theme_options = {} 88 | 89 | # Add any paths that contain custom static files (such as style sheets) here, 90 | # relative to this directory. They are copied after the builtin static files, 91 | # so a file named "default.css" will overwrite the builtin "default.css". 92 | html_static_path = ["_static"] 93 | 94 | # Custom sidebar templates, must be a dictionary that maps document names 95 | # to template names. 96 | # 97 | # The default sidebars (for documents that don't match any pattern) are 98 | # defined by theme itself. Builtin themes are using these templates by 99 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 100 | # 'searchbox.html']``. 101 | # 102 | # html_sidebars = {} 103 | 104 | 105 | # -- Options for HTMLHelp output --------------------------------------------- 106 | 107 | # Output file base name for HTML help builder. 108 | htmlhelp_basename = "ResearchAccessdoc" 109 | 110 | 111 | # -- Options for LaTeX output ------------------------------------------------ 112 | 113 | latex_elements = { 114 | # The paper size ('letterpaper' or 'a4paper'). 115 | # 116 | # 'papersize': 'letterpaper', 117 | # The font size ('10pt', '11pt' or '12pt'). 118 | # 119 | # 'pointsize': '10pt', 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | # Latex figure (float) alignment 124 | # 125 | # 'figure_align': 'htbp', 126 | } 127 | 128 | # Grouping the document tree into LaTeX files. 
List of tuples 129 | # (source start file, target name, title, 130 | # author, documentclass [howto, manual, or own class]). 131 | latex_documents = [ 132 | ( 133 | master_doc, 134 | "ResearchAccess.tex", 135 | "Research Access Documentation", 136 | "Anthony Buonomo", 137 | "manual", 138 | ), 139 | ] 140 | 141 | 142 | # -- Options for manual page output ------------------------------------------ 143 | 144 | # One entry per manual page. List of tuples 145 | # (source start file, name, description, authors, manual section). 146 | man_pages = [ 147 | (master_doc, "researchaccess", "Research Access Documentation", [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ---------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | ( 158 | master_doc, 159 | "ResearchAccess", 160 | "Research Access Documentation", 161 | author, 162 | "ResearchAccess", 163 | "One line description of project.", 164 | "Miscellaneous", 165 | ), 166 | ] 167 | 168 | 169 | # -- Options for Epub output ------------------------------------------------- 170 | 171 | # Bibliographic Dublin Core info. 172 | epub_title = project 173 | 174 | # The unique identifier of the text. This can be a ISBN number 175 | # or the project homepage. 176 | # 177 | # epub_identifier = '' 178 | 179 | # A unique identification for the text. 180 | # 181 | # epub_uid = '' 182 | 183 | # A list of files that should not be packed into the epub file. 184 | epub_exclude_files = ["search.html"] 185 | 186 | 187 | # -- Extension configuration ------------------------------------------------- 188 | 189 | # -- Options for todo extension ---------------------------------------------- 190 | 191 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
192 | todo_include_todos = True 193 | 194 | 195 | # -- User config 196 | autodoc_member_order = "bysource" 197 | -------------------------------------------------------------------------------- /src/make_plots.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from math import ceil 4 | from pathlib import Path 5 | from time import time 6 | 7 | import joblib 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import pandas as pd 11 | import yaml 12 | from scipy.stats import linregress 13 | from tqdm import tqdm 14 | 15 | import dsconcept.get_metrics as gm 16 | from dsconcept.get_metrics import get_keyword_results 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | LOG = logging.getLogger(__name__) 20 | LOG.setLevel(logging.INFO) 21 | 22 | 23 | def lim_concepts_and_plot(mean_df, tmp_df, fig_dir): 24 | LOG.info(f"tmp_df.shape={tmp_df.shape}") 25 | cat = tmp_df["category"].iloc[0] 26 | lim_mean_df = mean_df[np.in1d(mean_df["concept"], tmp_df["concept"])] 27 | lim_tmp_df = tmp_df[np.in1d(tmp_df["concept"], mean_df["concept"])] 28 | if lim_mean_df.shape[0] != lim_tmp_df.shape[0]: 29 | ValueError("Different df sizes") 30 | metrics = ["recall", "precision", "f1", "roc_auc"] 31 | for m in metrics: 32 | a = 0.3 33 | lim_tmp_df[m].hist(alpha=a, label=f"one_layer | cat={cat}") 34 | lim_mean_df[m].hist(alpha=a, label="mean") 35 | plt.legend() 36 | plt.title(m) 37 | fig_loc = fig_dir / f"{m}.png" 38 | LOG.info(f"Saving plot to {fig_loc}") 39 | plt.savefig(fig_loc) 40 | plt.clf() 41 | 42 | 43 | def load_classifier(in_cat_models, in_kwd_models, in_vectorizer): 44 | LOG.info(f"Loading category classifiers from {in_cat_models}.") 45 | in_clfs = list(in_cat_models.iterdir()) 46 | cat_clfs = [joblib.load(c) for c in tqdm(in_clfs)] 47 | 48 | LOG.info(f"Loading keyword classifiers from {in_kwd_models}.") 49 | cd = {} # expects no_topics with suffix '' 50 | for topic_dir in 
tqdm(in_kwd_models.iterdir()): 51 | in_clfs = list(topic_dir.iterdir()) 52 | clfs = [joblib.load(c) for c in in_clfs] # loads the classifiers 53 | topic_name = topic_dir.stem.split("_")[1] 54 | # depends on opinionated path format 55 | for c in clfs: 56 | cd[topic_name, c["concept"]] = c["best_estimator_"] 57 | 58 | hclf = gm.HierarchicalClassifier(cat_clfs, cd) 59 | hclf.load_vectorizer(in_vectorizer) 60 | return hclf 61 | 62 | 63 | def get_clf_times(hclf, small_res, weights, sizes): 64 | hl_strats = ["topics", "only_no_topic"] 65 | batch_size = 10_000_000 # TODO: remove batching 66 | hl_dicts = [] 67 | 68 | for hls in hl_strats: 69 | times = [] 70 | out_sizes = [] 71 | for s_size in sizes: 72 | if s_size > small_res.shape[0]: 73 | LOG.warning(f"Skipping {s_size} because it is greater than data size.") 74 | continue 75 | examples = small_res["text"].sample(s_size) 76 | n_splits = ceil(examples.shape[0] / batch_size) 77 | t1 = time() 78 | for n in tqdm(range(n_splits)): 79 | start = n * batch_size 80 | end = (n + 1) * batch_size 81 | example_batch = examples[start:end] 82 | _, feature_matrix = hclf.vectorize(example_batch, weights) 83 | LOG.info(f"Predicting keywords") 84 | if hls == "only_no_topic": 85 | no_categories = True 86 | elif hls == "topics": 87 | no_categories = False 88 | else: 89 | LOG.exception(f"Invalid strategy selection: {hls}") 90 | _, _ = hclf.predict(feature_matrix, 0.5, 0.5, no_categories) 91 | t2 = time() 92 | tt = t2 - t1 93 | out_sizes.append(s_size) 94 | times.append(tt) 95 | hld = { 96 | "strat": hls, 97 | "times": times, 98 | "sizes": out_sizes, 99 | } 100 | hl_dicts.append(hld) 101 | return hl_dicts 102 | 103 | 104 | def make_time_plots(hl_dicts, out_plot_file): 105 | fig, axes = plt.subplots(1, 2, figsize=(15, 5)) 106 | for hd in hl_dicts: 107 | lg = linregress(hd["sizes"], hd["times"]) 108 | docs_per_sec = [s / t for s, t in zip(hd["sizes"], hd["times"])] 109 | a = np.array(hd["sizes"]) 110 | axes[0].plot(hd["sizes"], hd["times"], 
marker="o", label=hd["strat"]) 111 | axes[0].plot(hd["sizes"], lg.slope * a + lg.intercept, "r", alpha=0.5) 112 | axes[0].set_xlabel("number of docs") 113 | axes[0].set_ylabel("time to tag (seconds)") 114 | axes[0].set_title("Time to tag depending on batch size") 115 | axes[1].plot(hd["sizes"], docs_per_sec, marker="o", label=hd["strat"]) 116 | axes[1].set_xlabel("number of docs") 117 | axes[1].set_ylabel("tagging rate (docs/seconds)") 118 | axes[1].set_title("Tagging rate depending on batch size") 119 | axes[0].legend() 120 | axes[1].legend() 121 | plt.savefig(out_plot_file) 122 | plt.clf() 123 | 124 | 125 | def main( 126 | in_mean, 127 | in_cats_dir, 128 | in_kwds_dir, 129 | in_vectorizer, 130 | in_clean_data, 131 | in_config, 132 | out_plots_dir, 133 | ): 134 | LOG.info("Loading dataframes.") 135 | mean_df = pd.read_csv(in_mean, index_col=0) 136 | no_synth_df = get_keyword_results(in_kwds_dir) 137 | if no_synth_df.shape[0] == 0: 138 | raise ValueError( 139 | f"No keyword results. Are the subdirectories of {in_kwds_dir} empty?" 
140 | ) 141 | no_cat_df = no_synth_df[no_synth_df["category"] == ""] 142 | LOG.info("Making plots.") 143 | lim_concepts_and_plot(mean_df, no_cat_df, out_plots_dir) 144 | 145 | with open(in_config) as f0: 146 | config = yaml.safe_load(f0) 147 | hclf = load_classifier(in_cats_dir, in_kwds_dir, in_vectorizer) 148 | full_corpus = pd.read_json(in_clean_data, orient="records", lines=True) 149 | sizes = [1, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10_000] 150 | sample_size = min(max(sizes), full_corpus.shape[0]) 151 | 152 | small_res = pd.read_json(in_clean_data, orient="records", lines=True).sample( 153 | sample_size 154 | ) 155 | 156 | hl_dicts = get_clf_times(hclf, small_res, config["weights"], sizes) 157 | 158 | out_plots_time = out_plots_dir / "time_v_batch_size.png" 159 | make_time_plots(hl_dicts, out_plot_file=out_plots_time) 160 | 161 | 162 | if __name__ == "__main__": 163 | parser = argparse.ArgumentParser( 164 | description="""From output metrics, create plots for ROC-AUC, F1, precision, 165 | and recall""" 166 | ) 167 | parser.add_argument("--mean", help="results from synthesis with max strategy") 168 | parser.add_argument("--in_cats_dir", help="category classifiers dir", type=Path) 169 | parser.add_argument("--in_kwds_dir", help="kwds classifier models dir", type=Path) 170 | parser.add_argument("--in_vectorizer", help="vectorizer location", type=Path) 171 | parser.add_argument("--in_clean_data", help="clean code location", type=Path) 172 | parser.add_argument("--in_config", help="config location", type=Path) 173 | parser.add_argument("--out_plots_dir", help="output dir for plots pngs", type=Path) 174 | args = parser.parse_args() 175 | main( 176 | args.mean, 177 | args.in_cats_dir, 178 | args.in_kwds_dir, 179 | args.in_vectorizer, 180 | args.in_clean_data, 181 | args.in_config, 182 | args.out_plots_dir, 183 | ) 184 | -------------------------------------------------------------------------------- /src/pipeline/pipeline.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Pipeline 3 | -------- 4 | Program to make classifiers from input corpus and selected keyword field. 5 | 6 | Author: Anthony Buonomo 7 | Contact: anthony.r.buonomo@nasa.gov 8 | 9 | Full opinionated pipeline from processing, to topic_modeling, to training classifiers. 10 | """ 11 | 12 | import logging 13 | import warnings 14 | from pathlib import Path 15 | 16 | import plac 17 | import yaml 18 | from sklearn.decomposition import LatentDirichletAllocation 19 | import joblib 20 | from sklearn.feature_extraction import DictVectorizer 21 | from sklearn.linear_model import SGDClassifier 22 | from sklearn.model_selection import GridSearchCV 23 | 24 | import dsconcept.model as ml 25 | from dsconcept.train import ConceptTrainer 26 | 27 | warnings.filterwarnings("ignore", category=FutureWarning) 28 | logging.basicConfig(level=logging.INFO) 29 | LOG = logging.getLogger(__name__) 30 | LOG.setLevel(logging.INFO) 31 | 32 | FEATURES = Path("features.jsonl") 33 | INDICES = Path("indices.json") 34 | RAW2LEMMA = Path("raw2lemma.json") 35 | 36 | TOPIC_VECTORIZER = Path("vectorizer.pkl") 37 | TOPIC_FEATURE_MATRIX = Path("doc_feature_matrix.pkl") 38 | TOPIC_MODEL = Path("model.pkl") 39 | DOC_TOPIC_DISTR = Path("doc_topic_distr.pkl") 40 | 41 | VECTORIZER = Path("vectorizer.pkl") 42 | FEATURE_MATRIX = Path("doc_feature_matrix.pkl") 43 | OUT_MODELS_DIR = Path("classifiers") 44 | 45 | 46 | def process( 47 | in_corpus, out_dir, abstract_field, concept_field, term_types, batch_size, n_threads 48 | ): 49 | out_dir.mkdir(exist_ok=True, parents=True) 50 | out_features = out_dir / FEATURES 51 | out_indices = out_dir / INDICES 52 | out_raw2lemma = out_dir / RAW2LEMMA 53 | 54 | fe = ml.FeatureExtractor() 55 | fe.from_corpus_to_jsonlines( 56 | in_corpus, out_features, abstract_field, term_types, batch_size, n_threads 57 | ) 58 | 59 | ce = ml.ConceptExtractor() 60 | ce.from_corpus(in_corpus, concept_field) 61 
| ce.to_jsons(out_indices, out_raw2lemma) 62 | 63 | return fe, ce 64 | 65 | 66 | def topic_model( 67 | topic_model_dir, processed_dir, topic_weights, min_feature, max_feature 68 | ): 69 | topic_model_dir.mkdir(exist_ok=True) 70 | tfe = ml.FeatureExtractor() 71 | tfe.from_jsonlines(processed_dir / FEATURES) 72 | 73 | topic_weighted_features = tfe.weight_terms(topic_weights) 74 | topic_limited_features = tfe.limit_features( 75 | topic_weighted_features, min_feature, max_feature 76 | ) 77 | 78 | topic_v = DictVectorizer() 79 | topic_X = topic_v.fit_transform(topic_limited_features) 80 | 81 | model = LatentDirichletAllocation( 82 | n_components=3, 83 | max_iter=5, 84 | learning_method="online", 85 | learning_offset=50.0, 86 | random_state=0, 87 | ) 88 | doc_topic_distr = model.fit_transform(topic_X) 89 | 90 | out_vectorizer = topic_model_dir / TOPIC_VECTORIZER 91 | out_feature_matrix = topic_model_dir / TOPIC_FEATURE_MATRIX 92 | out_model = topic_model_dir / TOPIC_MODEL 93 | out_doc_topic_distr = topic_model_dir / DOC_TOPIC_DISTR 94 | 95 | joblib.dump(topic_v, out_vectorizer) 96 | joblib.dump(topic_X, out_feature_matrix) 97 | joblib.dump(model, out_model) 98 | joblib.dump(doc_topic_distr, out_doc_topic_distr) 99 | 100 | return doc_topic_distr 101 | 102 | 103 | def train( 104 | out_dir, 105 | process_dir, 106 | fe, 107 | ce, 108 | weights, 109 | min_feature, 110 | max_feature, 111 | min_concept_occurrence, 112 | doc_topic_distr, 113 | ): 114 | out_dir.mkdir(exist_ok=True) 115 | out_features = process_dir / FEATURES 116 | fe.from_jsonlines(out_features) 117 | weighted_features = fe.weight_terms(weights) 118 | limited_features = fe.limit_features(weighted_features, min_feature, max_feature) 119 | v = DictVectorizer() 120 | X = v.fit_transform(limited_features) 121 | 122 | out_vectorizer = out_dir / VECTORIZER 123 | out_feature_matrix = out_dir / FEATURE_MATRIX 124 | joblib.dump(v, out_vectorizer) 125 | joblib.dump(X, out_feature_matrix) 126 | 127 | paramgrid = { 128 | 
"alpha": [0.01, 0.001, 0.0001], 129 | "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}], 130 | "max_iter": [1], 131 | "loss": ["log"], 132 | } # requires loss function with predict_proba 133 | clf = GridSearchCV( 134 | SGDClassifier(), paramgrid, scoring="f1" 135 | ) # requires GridSearchCV 136 | out_models = out_dir / OUT_MODELS_DIR 137 | trainer = ConceptTrainer(fe, ce, clf, out_models) 138 | trainer.train_all( 139 | X, out_models, min_concept_occurrence, doc_topic_distr=doc_topic_distr 140 | ) 141 | return out_models 142 | 143 | 144 | def parse_config(in_config): 145 | with open(in_config, "r") as f0: 146 | cfg = yaml.safe_load(f0) 147 | 148 | term_types = cfg["process"]["term_types"] 149 | abstract_field = cfg["process"]["abstract_field"] 150 | concept_field = cfg["process"]["concept_field"] 151 | 152 | topic_weights = cfg["topic_model"]["weights"] 153 | topic_min_feature = cfg["topic_model"]["min_feature_occurrence"] 154 | topic_max_feature = cfg["topic_model"]["max_feature_occurrence"] 155 | num_topics = cfg["topic_model"]["number_of_topics"] 156 | 157 | weights = cfg["train_classifiers"]["weights"] 158 | min_feature = cfg["train_classifiers"]["max_feature_occurrence"] 159 | max_feature = cfg["train_classifiers"]["max_feature_occurrence"] 160 | min_concept = cfg["train_classifiers"]["min_concept_occurrence"] 161 | 162 | return ( 163 | abstract_field, 164 | concept_field, 165 | term_types, 166 | topic_weights, 167 | topic_min_feature, 168 | topic_max_feature, 169 | num_topics, 170 | weights, 171 | min_feature, 172 | max_feature, 173 | min_concept, 174 | ) 175 | 176 | 177 | @plac.annotations( 178 | in_corpus=plac.Annotation("path to json-formatted corpus", "positional", type=Path), 179 | config=plac.Annotation("path to configuration yaml file", "positional", type=Path), 180 | process_dir=plac.Annotation( 181 | "path to dir where you want to store processed corpus data", 182 | "positional", 183 | type=Path, 184 | ), 185 | 
topic_model_dir=plac.Annotation( 186 | "path to dir where you want to store topic_modeling data", 187 | "positional", 188 | type=Path, 189 | ), 190 | classify_dir=plac.Annotation( 191 | "path to dir where you want to store classifying data", "positional", type=Path 192 | ), 193 | batch_size=plac.Annotation( 194 | "size of batches to process in processing phase of pipeline", "option", type=int 195 | ), 196 | n_threads=plac.Annotation( 197 | "number of threads to use in processing phase of pipeline", "option", type=int 198 | ), 199 | ) 200 | def main( 201 | in_corpus, 202 | config, 203 | process_dir, 204 | topic_model_dir, 205 | classify_dir, 206 | batch_size=10, 207 | n_threads=1, 208 | ): 209 | 210 | ( 211 | abstract_field, 212 | concept_field, 213 | term_types, 214 | topic_weights, 215 | topic_min_feature, 216 | topic_max_feature, 217 | num_topics, 218 | weights, 219 | min_feature, 220 | max_feature, 221 | min_concept, 222 | ) = parse_config(config) 223 | 224 | fe, ce = process( 225 | in_corpus, 226 | process_dir, 227 | abstract_field, 228 | concept_field, 229 | term_types, 230 | batch_size, 231 | n_threads, 232 | ) 233 | doc_topic_distr = topic_model( 234 | topic_model_dir, 235 | process_dir, 236 | topic_weights, 237 | topic_min_feature, 238 | topic_max_feature, 239 | ) 240 | train( 241 | classify_dir, 242 | process_dir, 243 | fe, 244 | ce, 245 | weights, 246 | min_feature, 247 | max_feature, 248 | min_concept, 249 | doc_topic_distr, 250 | ) 251 | LOG.info("SUCCESS!") 252 | 253 | 254 | if __name__ == "__main__": 255 | plac.call(main) 256 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Concept Tagging Training 2 | 3 | This software enables the creation of concept classifiers, to be utilized by an 4 | accompanying [service](https://github.com/nasa/concept-tagging-api). 
If you don't have your own data to train, you can use the pretrained models <a href="https://data.nasa.gov/Software/STI-Tagging-Models/jd6d-mr3p">described here</a>. This project was written about [here](https://strategy.data.gov/proof-points/2019/05/28/improving-data-access-and-data-management-artificial-intelligence-generated-metadata-tags-at-nasa/) for the Federal Data Strategy Incubator Project. 5 | 6 | ### What is Concept Tagging 7 | By concept tagging, we mean you can supply text, for example:` Volcanic activity, or volcanism, has played a significant role in the geologic evolution of Mars.[2] Scientists have known since the Mariner 9 mission in 1972 that volcanic features cover large portions of the Martian surface.` and get back predicted keywords, like `volcanology, mars surface, and structural properties`, as well as topics like `space sciences, geosciences`, from a standardized list of several thousand NASA concepts with a probability score for each prediction. 8 | 9 | ## Requirements 10 | 11 | You can see a list of options for this project by navigating to the root of the project and executing `make` or `make help`. 12 | 13 | This project requires: 14 | * [docker](https://docs.docker.com/install/) -- [tested with this version](docker-versions.txt) 15 | * [GNU Make](https://www.gnu.org/software/make/) -- tested with 3.81 built for i386-apple-darwin11.3.0 16 | 17 | ## Index: 18 | 1. [installation](#installation) 19 | 2. [how to run](#how-to-run) 20 | 3. [managing experiments](#managing-experiments) 21 | 4. [advanced usage](#advanced-usage) 22 | 23 | ## installation 24 | You have several options for installing and using the pipeline. 
25 | 1) [pull existing docker image](#pull-existing-docker-image) 26 | 2) [build docker image from source](#build-docker-image-from-source) 27 | 3) [install in python virtual environment](#install-in-python-virtual-environment) 28 | 29 | ### pull existing docker image 30 | You can just pull a stable docker image which has already been made: 31 | ```bash 32 | docker pull storage.analytics.nasa.gov/abuonomo/concept_trainer:stable 33 | ``` 34 | In order to do this, you must be on the NASA network and able to connect to the <https://storage.analytics.nasa.gov> docker registry. 35 | \* <sub> There are several versions of the images. You can see them [here](https://storage.analytics.nasa.gov/repository/abuonomo/rat_trainer). 36 | If you don't use "stable", some or all of this guide may not work properly. </sub> 37 | 38 | 39 | ### build docker image from source 40 | To build from source, first clone this repository and go to its root. 41 | 42 | Then build the docker image using: 43 | ```bash 44 | docker build -t concept_trainer:example . 45 | ``` 46 | Substitute `concept_trainer:example` for whatever name you would like. Keep this image name in mind. It will be used elsewhere. 47 | 48 | \* If you are actively developing this project, you should look at the `make build` in [Makefile](Makefile). This command automatically tags the image with the current commit url and most recent git tag. The command requires that [setuptools-scm](https://pypi.org/project/setuptools-scm/) is installed. 49 | 50 | ### install in python virtual environment 51 | \* tested with python3.7 52 | First, clone this repository. 53 | Then create and activate a virtual environment. For example, using [venv](https://docs.python.org/3/library/venv.html): 54 | ```bash 55 | python -m venv my_env 56 | source my_env/bin/activate 57 | ``` 58 | Next, while in the root of this project, run `make requirements`. 
59 | 60 | 61 | ## how to run 62 | The pipeline takes input document metadata structured like [this](data/raw/STI_public_metadata_records_sample100.jsonl) and a config file like [this](config/test_config.yml). The pipeline produces interim data, models, and reports. 63 | 64 | 1. [using docker](#using-docker) -- if you pulled or built the image 65 | 2. [using python in virtual environment](#using-python-in-virtual-environment) -- if you are running in a local virtual environment 66 | 67 | ### using docker 68 | First, make sure `config`, `data`, `data/raw`, `data/interim`, `models`, and `reports` directories. If they do not exist, make them (`mkdir config data models reports data/raw`). These directories will be used as docker mounted volumes. If you don't make these directories beforehand, they will be created by docker later on, but their permissions will be unnecessarily restrictive. 69 | 70 | Next, make sure you have your input data in the `data/raw/` directory. [Here](data/raw/STI_public_metadata_records_sample100.jsonl) is an example file with the proper structure. You also need to make sure the `subj_mapping.json` file [here](data/interim/subj_mapping.json) is in `data/interim/` directory. 71 | 72 | Now, make sure you have a config file in the `config` directory. [Here](config/test_config.yml) is an example config which will work with the above example file. 73 | 74 | With these files in place, you can run the full pipeline with this command: 75 | ```bash 76 | docker run -it \ 77 | -v $(pwd)/data:/home/data \ 78 | -v $(pwd)/models:/home/models \ 79 | -v $(pwd)/config:/home/config \ 80 | -v $(pwd)/reports:/home/reports \ 81 | concept_trainer:example pipeline \ 82 | EXPERIMENT_NAME=my_test_experiment \ 83 | IN_CORPUS=data/raw/STI_public_metadata_records_sample100.jsonl \ 84 | IN_CONFIG=config/test_config.yml 85 | ``` 86 | Substitute `concept_trainer:example` with the name of your docker image. 87 | You can set the `EXPERIMENT_NAME` to whatever you prefer. 
88 | `IN_CORPUS` and `IN_CONFIG` should be set to the paths to the corpus and to the configuration file, respectively. 89 | 90 | \* Developers can also use the `container` command in the [Makefile](Makefile). Note that this command requires [setuptools-scm](https://pypi.org/project/setuptools-scm/). Note that this command will use the image defined by the `IMAGE_NAME` variable and version number equivalent to the most recent git tag. 91 | 92 | 93 | ### using python in virtual environment 94 | 95 | Assuming you have cloned this repository, files for testing the pipeline should be in place. In particular, `data/raw/STI_public_metadata_records_sample100.jsonl` and `config/test_config.yml` should both exist. Additionally, you should add the `src` directory to your `PYTHONPATH`: 96 | ``` 97 | export PYTHONPATH=$PYTHONPATH:$(pwd)/src/ 98 | ``` 99 | 100 | Then, you can run a test of the pipeline with: 101 | ``` 102 | make pipeline \ 103 | EXPERIMENT_NAME=test \ 104 | IN_CORPUS=data/raw/STI_public_metadata_records_sample100.jsonl \ 105 | IN_CONFIG=config/test_config.yml 106 | ``` 107 | If you are not using the default values, simply substitute the proper paths for `IN_CORPUS` and `IN_CONFIG`. Choose whatever name you prefer for `EXPERIMENT_NAME`. 108 | 109 | ## managing experiments 110 | 111 | If you have access to the `hq-ocio-ci-bigdata` moderate s3 bucket, you can sync local experiments with those in the s3 bucket. 112 | 113 | For example, if you created a local experiment with `EXPERIMENT_NAME=my_cool_experiment`, you can upload your local results to the appropriate place on the s3 bucket with: 114 | ```bash 115 | make sync_experiment_to_s3 EXPERIMENT_NAME=my_cool_experiment PROFILE=my_aws_profile 116 | ``` 117 | where `my_aws_profile` is the name of your awscli profile which has access to the given bucket. 
118 | 119 | Afterwards, you can download the experiment interim files and results with: 120 | ```bash 121 | make sync_experiment_from_s3 EXPERIMENT_NAME=my_cool_experiment PROFILE=my_aws_profile 122 | ``` 123 | ## use full sti metadata records 124 | If you have access to the moderate bucket and you want to work with the full STI metadata records, you can download them to the `data/raw` folder with: 125 | ```bash 126 | make sync_raw_data_from_s3 PROFILE=my_aws_profile 127 | ``` 128 | When using these data, you will want to use a config file which is different from the test config file. You can browse previous experiments at `s3://hq-ocio-ci-bigdata/home/DataSquad/classifier_scripts/` to see example config files. You might try: 129 | ```yaml 130 | weights: # assign weights for term types specified in process section 131 | NOUN: 1 132 | PROPN: 1 133 | NOUN_CHUNK: 1 134 | ENT: 1 135 | ACRONYM: 1 136 | min_feature_occurrence: 100 137 | max_feature_occurrence: 0.6 138 | min_concept_occurrence: 500 139 | ``` 140 | See [config/test_config.yml](config/test_config.yml) for details on these parameters. 141 | 142 | ## advanced usage 143 | For more advanced usage of the project, look at the [Makefile](Makefile) commands and their associated scripts. You can learn more about these python scripts by them with help flags. For example, you can run `python src/make_cat_models.py -h`. 144 | 145 | -------------------------------------------------------------------------------- /src/dsconcept/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train 3 | ----- 4 | Program to make classifiers from input corpus and selected keyword field. 
5 | 6 | author: Anthony Buonomo 7 | contact: anthony.r.buonomo@nasa.gov 8 | 9 | """ 10 | import logging 11 | from pathlib import Path 12 | import time 13 | from math import ceil 14 | 15 | from sklearn.model_selection import train_test_split 16 | from sklearn.exceptions import UndefinedMetricWarning 17 | from sklearn.metrics import ( 18 | accuracy_score, 19 | roc_auc_score, 20 | f1_score, 21 | precision_score, 22 | recall_score, 23 | ) 24 | import joblib 25 | import numpy as np 26 | import warnings 27 | from tqdm import tqdm 28 | 29 | warnings.filterwarnings("ignore", category=FutureWarning) 30 | warnings.filterwarnings("ignore", category=DeprecationWarning) 31 | warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 32 | warnings.filterwarnings("ignore", category=Warning) 33 | logging.basicConfig(level=logging.INFO) 34 | LOG = logging.getLogger(__name__) 35 | LOG.setLevel(logging.INFO) 36 | 37 | 38 | def get_dispersed_subset(array, subset_size): 39 | """ 40 | Get dispersed subset of an array. By dispersed, I mean that we values extract values 41 | from an evenly distributed by location in the array. 
42 | 43 | Args: 44 | array (numpy.ndarray): array from which to extract subset 45 | subset_size (int): the number of elements to extract from array 46 | 47 | Returns: 48 | subset (numpy.ndarray): the dispersed subset 49 | array (numpy.ndarray): if subset_size too large, return the input array 50 | 51 | Examples: 52 | >>> from dsconcept.train import get_dispersed_subset 53 | >>> l1 = list(range(100)) 54 | >>> l2 = get_dispersed_subset(l1, 10) 55 | >>> l2 56 | array([ 0., 12., 23., 34., 45., 56., 67., 78., 89., 99.], dtype=float16) 57 | """ 58 | if len(array) <= subset_size: 59 | return array 60 | else: 61 | last = array[-1] 62 | subset = [sub[0] for sub in np.array_split(array, (subset_size - 1))] 63 | subset.append(last) 64 | subset = np.array(subset, dtype=np.float16) 65 | return subset 66 | 67 | 68 | # TODO: refactor to remove need for this function 69 | def path_append(in_path, addition): 70 | out_path = f"{in_path.parent}/{in_path.stem}{addition}{in_path.suffix}" 71 | return Path(out_path) 72 | 73 | 74 | def topic_path_format(out_classifier_dir, topic): 75 | if topic is None: 76 | tmp_topic = "" 77 | else: 78 | tmp_topic = topic 79 | out_classifier_dir = path_append(out_classifier_dir, tmp_topic) # appends to stem 80 | if not out_classifier_dir.exists(): 81 | out_classifier_dir.mkdir(parents=True) 82 | return out_classifier_dir 83 | 84 | 85 | class ConceptTrainer: 86 | def __init__(self, concept_extractor, classifier): 87 | """ 88 | Initialize object for training of classifiers based on given corpus extractors. 
89 | 90 | Args: 91 | concept_extractor (dsconcept.model.ConceptExtractor): ConceptExtractor (with concepts already loaded) 92 | for which to create classifiers 93 | classifier (sklearn.GridSearchCV): the classifier algorithm to use (wrapped in sklearn GridSearchCV) 94 | 95 | """ 96 | self.concept_extractor = concept_extractor 97 | self.classifier = classifier 98 | 99 | def train_all( 100 | self, 101 | doc_feature_matrix, 102 | out_classifier_dir, 103 | min_concept_freq, 104 | doc_topic_distr=None, 105 | ): 106 | """ 107 | Train classifiers for each concept for each topic (if topic distributions are provided). 108 | 109 | Args: 110 | doc_feature_matrix (scipy.sparse.csr.csr_matrix): document feature matrix 111 | out_classifier_dir (pathlib.Path): output path for classifiers 112 | min_concept_freq (int): minimum frequency for concepts in corpus in order 113 | for their corresponding classifiers to be made 114 | doc_topic_distr (numpy.ndarray): topic distributions for each doc in training set 115 | 116 | Returns: 117 | out_classifier_dir (pathlib.Path): output path for classifiers 118 | 119 | """ 120 | doc_topic_indices = {} 121 | if doc_topic_distr is not None: 122 | for topic in range( 123 | doc_topic_distr.shape[1] 124 | ): # cols of distr matrix ~ topics 125 | doc_topic_indices[topic] = [ 126 | i 127 | for i, distr in enumerate(doc_topic_distr) 128 | if distr.argmax() == topic 129 | ] 130 | _, _, ind_train, ind_test = train_test_split( 131 | doc_feature_matrix, 132 | np.array(range(doc_feature_matrix.shape[0])), 133 | test_size=0.10, 134 | random_state=42, 135 | ) 136 | np.save(out_classifier_dir.parent / f"train_inds.npy", ind_train) 137 | np.save(out_classifier_dir.parent / f"test_inds.npy", ind_test) 138 | 139 | LOG.info( 140 | f"Training one general set, and one set for each of {len(doc_topic_indices)} topics divisions." 
141 | ) 142 | for topic, doc_topic_index in doc_topic_indices.items(): 143 | self.train_concepts( 144 | doc_feature_matrix, 145 | ind_train, 146 | ind_test, 147 | out_classifier_dir, 148 | min_concept_freq, 149 | topic, 150 | doc_topic_index, 151 | ) 152 | self.train_concepts( 153 | doc_feature_matrix, 154 | ind_train, 155 | ind_test, 156 | out_classifier_dir, 157 | min_concept_freq, 158 | ) 159 | return out_classifier_dir 160 | 161 | def train_concepts( 162 | self, 163 | doc_feature_matrix, 164 | ind_train, 165 | ind_test, 166 | out_classifier_dir, 167 | min_concept_freq, 168 | topic=None, 169 | doc_topic_index=None, 170 | scale_threshold=False, 171 | ): 172 | """ 173 | Create classifiers for group of concepts. 174 | 175 | Args: 176 | doc_feature_matrix (scipy.sparse.csr.csr_matrix): document feature matrix 177 | ind_train (list of int): indices for training partition 178 | ind_test (list of int): indices for testing partition 179 | out_classifier_dir (pathlib.Path): path to directory where classifiers will be dumped. 180 | min_concept_freq (int): minimum frequency for concepts in corpus in order 181 | for their corresponding classifiers to be made 182 | topic (int | None): the topic (if any) from which to select training data for classifiers 183 | doc_topic_index (lists): mapping from given topic to document indices 184 | for which that topic has the highest probability 185 | scale_threshold (bool | False): If true, scale the minimum_concept_freq by the size of the topic division. 
186 | 187 | Returns: 188 | out_classifier_dir (pathlib.Path): directory where classifiers have been placed 189 | 190 | """ 191 | 192 | LOG.info(f"Queuing classifier job for topic {topic}.") 193 | t1 = time.time() 194 | out_classifier_dir = topic_path_format(out_classifier_dir, topic) 195 | 196 | LOG.info("Getting indices for training and testing.") 197 | if doc_topic_index is not None: 198 | train_inds = list(set(ind_train).intersection(doc_topic_index)) 199 | test_inds = list(set(ind_test).intersection(doc_topic_index)) 200 | else: 201 | train_inds = ind_train 202 | test_inds = ind_test 203 | 204 | X_train = doc_feature_matrix.tocsc()[train_inds, :] 205 | X_test = doc_feature_matrix.tocsc()[test_inds, :] 206 | 207 | if scale_threshold: 208 | total_size = X_train.shape[0] + X_test.shape[0] 209 | # scale threshold based on size of topic division 210 | r = total_size / doc_feature_matrix.shape[0] 211 | topic_min_concept_threshold = ceil(min_concept_freq * r) 212 | else: 213 | topic_min_concept_threshold = min_concept_freq 214 | LOG.info(f"Topic threshold set to {topic_min_concept_threshold}.") 215 | 216 | concept_index_mapping = self.concept_extractor.get_top_concepts( 217 | topic_min_concept_threshold 218 | ) 219 | no_concepts = len(concept_index_mapping) 220 | LOG.info(f"Training {no_concepts} concepts.") 221 | 222 | nu_passed = 0 223 | for concept, index in tqdm(concept_index_mapping.items()): 224 | LOG.debug(f"TOPIC={topic}:Loading indices for {concept}") 225 | y = np.zeros(doc_feature_matrix.shape[0]) 226 | np.put(y, index, 1) 227 | 228 | y_train = y[train_inds] 229 | y_test = y[test_inds] 230 | total_yes = sum(y_train) + sum(y_test) 231 | 232 | if total_yes < topic_min_concept_threshold: 233 | nu_passed += 1 234 | LOG.debug( 235 | f"Passing {concept} because it is under topic_min_concept_threshold of {topic_min_concept_threshold}." 
236 | ) 237 | continue 238 | # TODO: move around y0 train and test inds to keep aligned 239 | self.create_concept_classifier( 240 | concept, X_train, X_test, y_train, y_test, out_classifier_dir 241 | ) 242 | t2 = time.time() 243 | LOG.warning(f"Passed {nu_passed} in topic {topic} due to freq under threshold.") 244 | LOG.debug(f"{t2-t1} seconds for topic {topic}.") 245 | return out_classifier_dir 246 | 247 | def create_concept_classifier( 248 | self, concept, X_train, X_test, y_train, y_test, out_classifier_dir 249 | ): 250 | """ 251 | Create an individual classifier. 252 | 253 | Args: 254 | concept (str): the concept for which to create a classifier 255 | doc_feature_matrix (scipy.sparse.csr.csr_matrix): documents with their features 256 | y (numpy.ndarray): array which indicates whether or not given concept occurs for a given topic 257 | out_classifier_dir (pathlib.Path): output directory for classifiers 258 | 259 | Returns: 260 | out_model_path (pathlib.Path): the path to the concept classifier just produced. 
261 | 262 | """ 263 | LOG.debug(f"Making classifier for concept {concept}.") 264 | try: 265 | LOG.debug(f"fitting {concept}...") 266 | self.classifier.fit(X_train, y_train) 267 | LOG.debug(f"testing {concept}...") 268 | y_score = self.classifier.predict_proba(X_test)[:, 1] 269 | LOG.debug(f"Binarizing score for {concept}...") 270 | y_pred = np.where(y_score > 0.5, 1, 0) 271 | 272 | LOG.debug(f"Getting metric scores for {concept}...") 273 | accuracy = accuracy_score(y_test, y_pred) 274 | roc_auc = roc_auc_score(y_test, y_score) 275 | f1 = f1_score(y_test, y_pred) 276 | precision = precision_score(y_test, y_pred) 277 | recall = recall_score(y_test, y_pred) 278 | 279 | out_model = { 280 | "concept": concept, 281 | "best_estimator_": self.classifier.best_estimator_, 282 | "cv_results_": self.classifier.cv_results_, 283 | "scores": { 284 | "accuracy": accuracy, 285 | "roc_auc": roc_auc, 286 | "f1": f1, 287 | "precision": precision, 288 | "recall": recall, 289 | }, 290 | } 291 | LOG.debug(f"Accuracy: {accuracy} | ROC-AUC: {roc_auc} | F1: {f1}") 292 | out_concept = str(Path(concept).name) 293 | out_model_path = out_classifier_dir / f"{out_concept}.pkl" 294 | LOG.debug(f"Writing model to {out_model_path}.") 295 | joblib.dump(out_model, out_model_path) 296 | return out_model_path 297 | 298 | except ValueError: 299 | LOG.debug(f"Insufficient data for concept {concept}.") 300 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: process features concepts keywords categories structure requirements \ 2 | sync_data_to_s3 sync_data_from_s3 sync_raw_data_from_s3 pipeline plots \ 3 | tests docs check_clean clean_experiment clean 4 | 5 | #.SHELLFLAGS := -o nounset -c 6 | SHELL := /bin/bash 7 | 8 | ################################################################################# 9 | # GLOBALS # 10 | 
################################################################################# 11 | 12 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 13 | BUCKET = hq-ocio-ci-bigdata/home/DataSquad/classifier_scripts/ 14 | PROFILE = moderate 15 | PROJECT_NAME = classifier_scripts 16 | PYTHON_INTERPRETER = python3 17 | 18 | ifeq (,$(shell which conda)) 19 | HAS_CONDA=False 20 | else 21 | HAS_CONDA=True 22 | endif 23 | 24 | ################################################################################# 25 | # COMMANDS # 26 | ################################################################################# 27 | 28 | # These three variables should be tailored for you use case. 29 | EXPERIMENT_NAME=test 30 | IN_CORPUS=data/raw/STI_public_metadata_records_sample100.jsonl 31 | IN_CONFIG=config/test_config.yml 32 | 33 | INTERIM_DATA=data/interim/$(EXPERIMENT_NAME) 34 | INTERIM_CORPUS=data/interim/$(EXPERIMENT_NAME)/abs_kwds.jsonl 35 | 36 | FIELD=text 37 | SUBJ_MAPPING=data/interim/subj_mapping.json 38 | FEATURES=data/interim/$(EXPERIMENT_NAME)/features.jsonl 39 | 40 | CONCEPT_FIELD='keywords' 41 | CAT_FIELD='categories' 42 | OUT_KWD_INDICES=data/interim/$(EXPERIMENT_NAME)/kwd_indices.json 43 | OUT_CAT_INDICES=data/interim/$(EXPERIMENT_NAME)/cat_indices.json 44 | OUT_KWD_RAW_TO_LEMMA=models/$(EXPERIMENT_NAME)/kwd_raw2lemma.json 45 | OUT_CAT_RAW_TO_LEMMA=models/$(EXPERIMENT_NAME)/cat_raw2lemma.json 46 | 47 | OUT_OUTER_MODEL_DIR=models/$(EXPERIMENT_NAME) 48 | OUT_KWD_MODEL_DIR=$(OUT_OUTER_MODEL_DIR)/keywords 49 | OUT_CAT_MODEL_DIR=$(OUT_OUTER_MODEL_DIR)/categories 50 | 51 | METRICS_LOC=reports/$(EXPERIMENT_NAME) 52 | BERT_MODELS_DIR=models/bert_models 53 | 54 | GIT_REMOTE='origin' 55 | IMAGE_NAME=concept_trainer 56 | 57 | 58 | ## Test underlying dsconcept library 59 | tests: 60 | nosetests --with-coverage --cover-package dsconcept --cover-html; \ 61 | open cover/index.html 62 | 63 | ## Run through all steps to create all classifiers 64 | pipeline: structure 
process features concepts vectorizer_and_matrix \ 65 | categories keywords metrics plots 66 | 67 | ## create directory structure if necessary 68 | structure: 69 | mkdir -p data 70 | mkdir -p data/raw 71 | mkdir -p data/interim 72 | mkdir -p data/interim/$(EXPERIMENT_NAME) 73 | mkdir -p models/$(EXPERIMENT_NAME) 74 | mkdir -p config 75 | mkdir -p reports 76 | mkdir -p reports/$(EXPERIMENT_NAME) 77 | 78 | ## install newest version of dependencies. Untested. 79 | approximate-install: 80 | pip install scikit-learn spacy tqdm textacy pyyaml pandas h5py \ 81 | testfixtures hypothesis dask pytest matplotlib 82 | $(PYTHON_INTERPRETER) -m spacy download en_core_web_sm 83 | 84 | ## install precise python dependencies 85 | requirements: 86 | $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel 87 | $(PYTHON_INTERPRETER) -m pip install -r requirements.txt 88 | $(PYTHON_INTERPRETER) -m spacy download en_core_web_sm 89 | 90 | ## processing by merging text and keyword fields 91 | process: $(INTERIM_CORPUS) 92 | $(INTERIM_CORPUS): $(IN_CORPUS) src/process.py 93 | mkdir -p data/interim/$(EXPERIMENT_NAME) 94 | mkdir -p models/$(EXPERIMENT_NAME) 95 | $(PYTHON_INTERPRETER) src/process.py $(IN_CORPUS) $(SUBJ_MAPPING) $(INTERIM_CORPUS) 96 | 97 | ## creature feature sets for processed data 98 | features: $(FEATURES) 99 | $(FEATURES): $(INTERIM_CORPUS) src/features.py 100 | $(PYTHON_INTERPRETER) src/features.py $(INTERIM_CORPUS) $(FIELD) $(FEATURES) 101 | 102 | ## create concepts indices json and mappings from raw to lemmas 103 | concepts: $(OUT_KWD_INDICES) $(OUT_CAT_INDICES) 104 | $(OUT_KWD_INDICES) $(OUT_CAT_INDICES): $(INTERIM_CORPUS) src/concepts.py 105 | $(PYTHON_INTERPRETER) src/concepts.py \ 106 | $(INTERIM_CORPUS) \ 107 | $(CONCEPT_FIELD) $(CAT_FIELD) \ 108 | $(OUT_KWD_INDICES) $(OUT_CAT_INDICES) \ 109 | $(OUT_KWD_RAW_TO_LEMMA) $(OUT_CAT_RAW_TO_LEMMA) 110 | 111 | ## create vectorizer and feature matrix from feature records 112 | vectorizer_and_matrix: 
$(INTERIM_DATA)/feature_matrix.jbl 113 | $(INTERIM_DATA)/feature_matrix.jbl: src/make_vec_and_matrix.py $(FEATURES) $(IN_CONFIG) 114 | mkdir -p $(OUT_OUTER_MODEL_DIR) && \ 115 | cp $(IN_CONFIG) $(OUT_OUTER_MODEL_DIR)/config.yml && \ 116 | $(PYTHON_INTERPRETER) src/make_vec_and_matrix.py \ 117 | $(FEATURES) $(IN_CONFIG) $(INTERIM_DATA) $(OUT_OUTER_MODEL_DIR)/vectorizer.jbl 118 | # TODO: separate outputs for vec and matrix, send matrix to INTERIM_DATA 119 | 120 | ## train category models 121 | categories: src/make_cat_models.py $(OUT_CAT_INDICES) $(INTERIM_DATA)/feature_matrix.jbl $(IN_CONFIG) 122 | mkdir -p $(OUT_CAT_MODEL_DIR) && \ 123 | $(PYTHON_INTERPRETER) src/make_cat_models.py \ 124 | $(INTERIM_DATA)/feature_matrix.jbl \ 125 | $(INTERIM_DATA)/train_inds.npy \ 126 | $(INTERIM_DATA)/test_inds.npy \ 127 | $(OUT_CAT_INDICES) \ 128 | $(OUT_CAT_RAW_TO_LEMMA) \ 129 | $(IN_CONFIG) $(OUT_CAT_MODEL_DIR) 130 | 131 | ## train keyword models 132 | keywords: src/make_kwd_models.py $(OUT_KWD_INDICES) $(INTERIM_DATA)/feature_matrix.jbl $(IN_CONFIG) $(INTERIM_DATA)/test_inds.npy 133 | mkdir -p $(OUT_KWD_MODEL_DIR) && \ 134 | $(PYTHON_INTERPRETER) src/make_kwd_models.py \ 135 | $(INTERIM_DATA)/feature_matrix.jbl \ 136 | $(INTERIM_DATA)/train_inds.npy \ 137 | $(INTERIM_DATA)/test_inds.npy \ 138 | $(OUT_KWD_INDICES) $(OUT_CAT_INDICES) \ 139 | $(OUT_KWD_RAW_TO_LEMMA) $(OUT_CAT_RAW_TO_LEMMA) \ 140 | $(IN_CONFIG) $(OUT_KWD_MODEL_DIR) 141 | 142 | ## Only train keywords on full training set. No topic splitting. 
143 | keywords-no-topics: 144 | mkdir -p $(OUT_KWD_MODEL_DIR) && \ 145 | $(PYTHON_INTERPRETER) src/make_kwd_models.py \ 146 | $(INTERIM_DATA)/feature_matrix.jbl \ 147 | $(INTERIM_DATA)/train_inds.npy \ 148 | $(INTERIM_DATA)/test_inds.npy \ 149 | $(OUT_KWD_INDICES) $(OUT_CAT_INDICES) \ 150 | $(OUT_KWD_RAW_TO_LEMMA) $(OUT_CAT_RAW_TO_LEMMA) \ 151 | $(IN_CONFIG) $(OUT_KWD_MODEL_DIR) --no-topics ${VERBOSE} 152 | 153 | ## Get predictions from category models made with BERT classification 154 | bert_cat_model_scores: 155 | mkdir -p $(METRICS_LOC) && \ 156 | $(PYTHON_INTERPRETER) src/get_bert_cat_models_preds.py \ 157 | --data_dir $(INTERIM_DATA) \ 158 | --models_dir $(OUT_OUTER_MODEL_DIR) \ 159 | --reports_dir $(METRICS_LOC) \ 160 | --base_model_dir ../nlp-working-with-bert/models/base/cased_L-12_H-768_A-12 \ 161 | --finetuned_model_dir ../nlp-working-with-bert/models/01_02_2020/ \ 162 | --sample 1000 163 | # --base_model_dir models/bert_models/cased_L-12_H-768_A-12 \ 164 | # --finetuned_model_dir models/bert_models/cased_L-12_H-768_A-12/cache 165 | 166 | ## Create cleaned dataset for training transformer category models 167 | bert_cat_clean_dataset: 168 | $(PYTHON_INTERPRETER) src/make_records_for_cat_bert.py \ 169 | $(INTERIM_CORPUS) \ 170 | $(INTERIM_DATA) \ 171 | $(OUT_OUTER_MODEL_DIR)/bert 172 | 173 | ## Get metrics for test data 174 | metrics: 175 | mkdir -p $(METRICS_LOC) && \ 176 | $(PYTHON_INTERPRETER) src/dsconcept/get_metrics.py \ 177 | --experiment_name $(EXPERIMENT_NAME) \ 178 | --out_store $(METRICS_LOC)/store.h5 \ 179 | --out_cat_preds $(METRICS_LOC)/cat_preds.npy \ 180 | --batch_size 500 181 | 182 | ## Synthesize predictions for keywords and classifiers to create full classification 183 | synthesize: 184 | mkdir -p $(METRICS_LOC) && \ 185 | $(PYTHON_INTERPRETER) src/synthesize_predictions.py \ 186 | --experiment_name $(EXPERIMENT_NAME) \ 187 | --synth_strat mean \ 188 | --in_cat_preds $(METRICS_LOC)/cat_preds.npy \ 189 | --store $(METRICS_LOC)/store.h5 \ 
190 | --synth_batch_size 3000 \ 191 | --threshold 0.5 \ 192 | --out_synth_scores $(METRICS_LOC)/synth_mean_results.csv 193 | 194 | ## Synthesize predictions for keywords and classifiers to create full classification 195 | synthesize-bert: 196 | mkdir -p $(METRICS_LOC) && \ 197 | $(PYTHON_INTERPRETER) src/synthesize_predictions.py \ 198 | --experiment_name $(EXPERIMENT_NAME) \ 199 | --synth_strat mean \ 200 | --in_cat_preds $(METRICS_LOC)/bert_cat_preds.npy \ 201 | --store $(METRICS_LOC)/store.h5 \ 202 | --synth_batch_size 3000 \ 203 | --threshold 0.5 \ 204 | --out_synth_scores $(METRICS_LOC)/synth_bert_mean_results.csv 205 | 206 | ## create plots from performance metrics 207 | plots: 208 | mkdir -p $(METRICS_LOC)/figures && \ 209 | $(PYTHON_INTERPRETER) src/make_plots.py \ 210 | --mean $(METRICS_LOC)/synth_mean_results.csv \ 211 | --in_cats_dir $(OUT_CAT_MODEL_DIR)/models \ 212 | --in_kwds_dir $(OUT_KWD_MODEL_DIR)/models \ 213 | --in_cats_dir $(OUT_CAT_MODEL_DIR)/models \ 214 | --in_vectorizer $(OUT_OUTER_MODEL_DIR)/vectorizer.jbl \ 215 | --in_clean_data $(INTERIM_CORPUS) \ 216 | --in_config $(OUT_OUTER_MODEL_DIR)/config.yml \ 217 | --out_plots_dir $(METRICS_LOC)/figures 218 | 219 | ## create plots from performance metrics 220 | plots-bert: 221 | mkdir -p $(METRICS_LOC)/figures_bert && \ 222 | $(PYTHON_INTERPRETER) src/make_plots.py \ 223 | --mean $(METRICS_LOC)/synth_bert_mean_results.csv \ 224 | --in_cats_dir $(OUT_CAT_MODEL_DIR)/models \ 225 | --in_kwds_dir $(OUT_KWD_MODEL_DIR)/models \ 226 | --in_cats_dir $(OUT_CAT_MODEL_DIR)/models \ 227 | --in_vectorizer $(OUT_OUTER_MODEL_DIR)/vectorizer.jbl \ 228 | --in_clean_data $(INTERIM_CORPUS) \ 229 | --in_config $(OUT_OUTER_MODEL_DIR)/config.yml \ 230 | --out_plots_dir $(METRICS_LOC)/figures_bert 231 | 232 | ## Build docker image for training 233 | build: 234 | export COMMIT=$$(git log -1 --format=%H); \ 235 | export REPO_URL=$$(git remote get-url $(GIT_REMOTE)); \ 236 | export REPO_DIR=$$(dirname $$REPO_URL); \ 237 | 
export BASE_NAME=$$(basename $$REPO_URL .git); \ 238 | export GIT_LOC=$$REPO_DIR/$$BASE_NAME/tree/$$COMMIT; \ 239 | export VERSION=$$(python version.py); \ 240 | echo $$GIT_LOC; \ 241 | echo $$VERSION; \ 242 | docker build -t $(IMAGE_NAME):$$VERSION \ 243 | --build-arg GIT_URL=$$GIT_LOC \ 244 | --build-arg VERSION=$$VERSION . 245 | 246 | ## Start docker container for running full pipeline 247 | container: 248 | export VERSION=$$(python version.py); \ 249 | docker run -it \ 250 | -v $$(pwd)/data:/home/data \ 251 | -v $$(pwd)/models:/home/models \ 252 | -v $$(pwd)/config:/home/config \ 253 | -v $$(pwd)/reports:/home/reports \ 254 | $(IMAGE_NAME):$$VERSION pipeline \ 255 | EXPERIMENT_NAME=$(EXPERIMENT_NAME) \ 256 | IN_CORPUS=$(IN_CORPUS) \ 257 | IN_CONFIG=$(IN_CONFIG) 258 | 259 | ## Delete all compiled Python files 260 | clean: 261 | find . -type f -name "*.py[co]" -delete 262 | find . -type d -name "__pycache__" -delete 263 | 264 | check_clean: 265 | @echo $(OUT_OUTER_MODEL_DIR) 266 | @echo data/interim/$(EXPERIMENT_NAME) 267 | @echo $(METRICS_LOC) 268 | @echo -n "Are you sure you want to remove the above folders? 
[y/N] " && read ans && [ $${ans:-N} = y ] 269 | 270 | ## delete all interim data, models, and reports for the given experiment 271 | clean_experiment: check_clean 272 | rm -r $(OUT_OUTER_MODEL_DIR) 273 | rm -r data/interim/$(EXPERIMENT_NAME) 274 | rm -r $(METRICS_LOC) 275 | 276 | ## sync this experiment to s3 277 | sync_experiment_to_s3: 278 | ifeq (default,$(PROFILE)) 279 | aws s3 sync models/$(EXPERIMENT_NAME) s3://$(BUCKET)models/$(EXPERIMENT_NAME) 280 | aws s3 sync data/interim/$(EXPERIMENT_NAME) s3://$(BUCKET)data/interim/$(EXPERIMENT_NAME) 281 | aws s3 sync reports/$(EXPERIMENT_NAME) s3://$(BUCKET)reports/$(EXPERIMENT_NAME) 282 | else 283 | aws s3 sync models/$(EXPERIMENT_NAME) s3://$(BUCKET)models/$(EXPERIMENT_NAME) --profile $(PROFILE) 284 | aws s3 sync data/interim/$(EXPERIMENT_NAME) s3://$(BUCKET)data/interim/$(EXPERIMENT_NAME) --profile $(PROFILE) 285 | aws s3 sync reports/$(EXPERIMENT_NAME) s3://$(BUCKET)reports/$(EXPERIMENT_NAME) --profile $(PROFILE) 286 | endif 287 | 288 | ## sync this experiment from s3 289 | sync_experiment_from_s3: 290 | ifeq (default,$(PROFILE)) 291 | aws s3 sync s3://$(BUCKET)models/$(EXPERIMENT_NAME) models/$(EXPERIMENT_NAME) 292 | aws s3 sync s3://$(BUCKET)reports/$(EXPERIMENT_NAME) reports/$(EXPERIMENT_NAME) 293 | aws s3 sync s3://$(BUCKET)data/processed/$(EXPERIMENT_NAME) data/processed/$(EXPERIMENT_NAME) 294 | else 295 | aws s3 sync s3://$(BUCKET)models/$(EXPERIMENT_NAME) models/$(EXPERIMENT_NAME) --profile $(PROFILE) 296 | aws s3 sync s3://$(BUCKET)reports/$(EXPERIMENT_NAME) reports/$(EXPERIMENT_NAME) --profile $(PROFILE) 297 | aws s3 sync s3://$(BUCKET)data/processed/$(EXPERIMENT_NAME) data/processed/$(EXPERIMENT_NAME) --profile $(PROFILE) 298 | endif 299 | 300 | ## sync raw starting data from s3 301 | sync_raw_data_from_s3: 302 | ifeq (default,$(PROFILE)) 303 | aws s3 cp s3://hq-ocio-ci-bigdata/data/STI/STI_records_metadata.jsonl data/raw/STI_records_metadata.jsonl 304 | else 305 | aws s3 cp 
s3://hq-ocio-ci-bigdata/data/STI/STI_records_metadata.jsonl data/raw/STI_records_metadata.jsonl --profile $(PROFILE) 306 | endif 307 | echo "These records should be handled as moderate data assets. Handle these records with care." 308 | 309 | ## zip models necessary for running the app 310 | zip-experiment-for-app: 311 | cd models/; \ 312 | zip -r $(EXPERIMENT_NAME).zip \ 313 | $(EXPERIMENT_NAME)/categories/models \ 314 | $(EXPERIMENT_NAME)/keywords/models \ 315 | $(EXPERIMENT_NAME)/kwd_raw2lemma.json \ 316 | $(EXPERIMENT_NAME)/cat_raw2lemma.json \ 317 | $(EXPERIMENT_NAME)/vectorizer.jbl \ 318 | $(EXPERIMENT_NAME)/config.yml \ 319 | 320 | ## Upload zipped experiment app files to s3 321 | upload-experiment-zip-to-s3: 322 | aws s3 cp models/$(EXPERIMENT_NAME).zip s3://$(BUCKET)models/$(EXPERIMENT_NAME).zip --profile $(PROFILE) 323 | ################################################################################# 324 | # PROJECT RULES # 325 | ################################################################################# 326 | 327 | 328 | 329 | ################################################################################# 330 | # Self Documenting Commands # 331 | ################################################################################# 332 | 333 | .DEFAULT_GOAL := help 334 | 335 | # Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html> 336 | # sed script explained: 337 | # /^##/: 338 | # * save line in hold space 339 | # * purge line 340 | # * Loop: 341 | # * append newline + line to hold space 342 | # * go to next line 343 | # * if line starts with doc comment, strip comment character off and loop 344 | # * remove target prerequisites 345 | # * append hold space (+ newline) to line 346 | # * replace newline plus comments by `---` 347 | # * print line 348 | # Separate expressions are necessary because labels cannot be delimited by 349 | # semicolon; see <http://stackoverflow.com/a/11799865/1968> 350 | .PHONY: help 351 | help: 
352 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 353 | @echo 354 | @sed -n -e "/^## / { \ 355 | h; \ 356 | s/.*//; \ 357 | :doc" \ 358 | -e "H; \ 359 | n; \ 360 | s/^## //; \ 361 | t doc" \ 362 | -e "s/:.*//; \ 363 | G; \ 364 | s/\\n## /---/; \ 365 | s/\\n/ /g; \ 366 | p; \ 367 | }" ${MAKEFILE_LIST} \ 368 | | LC_ALL='C' sort --ignore-case \ 369 | | awk -F '---' \ 370 | -v ncol=$$(tput cols) \ 371 | -v indent=19 \ 372 | -v col_on="$$(tput setaf 6)" \ 373 | -v col_off="$$(tput sgr0)" \ 374 | '{ \ 375 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 376 | n = split($$2, words, " "); \ 377 | line_length = ncol - indent; \ 378 | for (i = 1; i <= n; i++) { \ 379 | line_length -= length(words[i]) + 1; \ 380 | if (line_length <= 0) { \ 381 | line_length = ncol - indent - length(words[i]) - 1; \ 382 | printf "\n%*s ", -indent, " "; \ 383 | } \ 384 | printf "%s ", words[i]; \ 385 | } \ 386 | printf "\n"; \ 387 | }' \ 388 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') -------------------------------------------------------------------------------- /src/dsconcept/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | model 3 | ----- 4 | Program to make classifiers from input corpus and selected keyword field. 5 | 6 | Author: Anthony Buonomo 7 | Contact: anthony.r.buonomo@nasa.gov 8 | 9 | Classes to support document classification. 
10 | """ 11 | 12 | from collections import Counter 13 | import logging 14 | from multiprocessing import cpu_count 15 | import json 16 | from typing import Dict 17 | from tqdm import tqdm 18 | 19 | import spacy 20 | from spacy.lemmatizer import Lemmatizer 21 | from spacy.lookups import Lookups 22 | from textacy.extract import acronyms_and_definitions 23 | 24 | nlp = spacy.load("en_core_web_sm") 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | LOG = logging.getLogger(__name__) 28 | LOG.setLevel(logging.DEBUG) 29 | 30 | 31 | def file_len(fname): 32 | with open(fname) as f: 33 | for i, l in enumerate(f): 34 | pass 35 | return i + 1 36 | 37 | 38 | def spacy_tokenizer(txt): 39 | """ 40 | Tokenize txt using spacy. Fit for use with sklearn CountVectorizer. 41 | 42 | Args: 43 | txt (str): text to be tokenized 44 | 45 | Returns: 46 | terms_tagged_list (list of str): tokens extracted from text 47 | 48 | Examples: 49 | >>> from dsconcept.model import spacy_tokenizer 50 | >>> from sklearn.feature_extraction.text import CountVectorizer 51 | >>> txt = "The ship hung in the sky much the same way bricks don't." 52 | >>> doc_tokens = spacy_tokenizer(txt) 53 | >>> doc_tokens 54 | ['ship :: NOUN', 55 | 'sky :: NOUN', 56 | 'way :: NOUN', 57 | 'brick :: NOUN', 58 | 'the ship :: NOUN_CHUNK'] 59 | >>> v = CountVectorizer(txt, tokenizer=spacy_tokenizer) 60 | >>> v.fit_transform([txt]) 61 | >>> v.vocabulary_ 62 | {'ship :: NOUN': 1, 'sky :: NOUN': 2, 'way :: NOUN': 3, 'brick :: NOUN': 0} 63 | """ 64 | doc = nlp(txt) 65 | terms_tagged = extract_from_doc(doc) 66 | terms_tagged_list = [f"{term} :: {tag}" for term, tag in terms_tagged.items()] 67 | return terms_tagged_list 68 | 69 | 70 | def should_keep(w, desired_parts_of_speech): 71 | desiredPOS = w.pos_ in desired_parts_of_speech 72 | notStop = not w.is_stop 73 | notPerc = w.lemma_ not in ["%"] 74 | return desiredPOS and notStop and notPerc 75 | 76 | 77 | def extract_from_doc(doc): 78 | """ 79 | Extract features from a spacy doc. 
80 | 81 | Args: 82 | doc (spacy.doc): a doc processed by the spacy 'en' model 83 | 84 | Returns: 85 | terms_tagged (dict): features with their respective tags 86 | 87 | Examples: 88 | >>> from dsconcept.model import extract_from_doc 89 | >>> import spacy 90 | >>> nlp = spacy.load('en_core_web_sm') 91 | >>> txt = "The ship hung in the sky much the same way bricks don't." 92 | >>> doc = nlp(txt) 93 | >>> features = extract_from_doc(doc) 94 | >>> features 95 | {'ship': 'NOUN', 96 | 'sky': 'NOUN', 97 | 'way': 'NOUN', 98 | 'brick': 'NOUN', 99 | 'the ship': 'NOUN_CHUNK'} 100 | """ 101 | # TODO: change this function such that it processes better but maintains the same interface. 102 | terms_tagged = dict() 103 | 104 | desired_parts_of_speech = ["NOUN", "PROPN"] 105 | # Get any 1-gram terms which are not % signs, or stop words. 106 | terms = {w.lemma_: w.pos_ for w in doc if should_keep(w, desired_parts_of_speech)} 107 | terms_tagged.update(terms) 108 | 109 | # Lemmatize each gram and join with a space. 110 | noun_chunks = { 111 | " ".join([w.lemma_ for w in nc if not w.is_stop]): nc.label_ 112 | for nc in doc.noun_chunks 113 | } 114 | # filter our noun chunks that are already in terms set and not in excluded_list. 115 | excluded_list = ["-PRON-", ""] 116 | noun_chunks_filtered = { 117 | w.strip(): "NOUN_CHUNK" 118 | for w, lab in noun_chunks.items() 119 | if (w not in terms.keys()) and (w not in excluded_list) 120 | } 121 | terms_tagged.update(noun_chunks_filtered) 122 | 123 | # TODO: entities take precedence over noun chunks 124 | # Get entities from text and remove collisions with terms and noun chunks. 
125 | ent_excluded_set = ["ORDINAL", "CARDINAL", "QUANTITY", "DATE", "PERCENT"] 126 | ents = {e.lemma_: e.label_ for e in doc.ents if e.label_ not in ent_excluded_set} 127 | ents_filtered = { 128 | ent: "ENT" 129 | for ent, lab in ents.items() 130 | if ent not in terms.keys() and ent not in noun_chunks_filtered.keys() 131 | } 132 | terms_tagged.update(ents_filtered) 133 | 134 | # Add acronyms which have definitions. 135 | # These acronyms could create Noise if they are not good. Maybe better to use their definitions. 136 | # This schema will only pull out identifical definitions. No lemmatizing, no fuzzy matching. 137 | # TODO: add lemmatizing and fuzzy matching for acrnoyms. This code exists in acronyms project. 138 | acronyms_with_defs = acronyms_and_definitions(doc) 139 | acronyms_filtered = { 140 | "{} - {}".format(ac, definition): "ACRONYM" 141 | for ac, definition in acronyms_with_defs.items() 142 | if definition != "" 143 | } 144 | terms_tagged.update(acronyms_filtered) 145 | 146 | return terms_tagged 147 | 148 | 149 | def extract_features_from_abstracts( 150 | descriptions, feature_outfile, batch_size=1000, n_threads=cpu_count(), total=None 151 | ): 152 | """ 153 | Generate features from input batch of abstracts. 154 | 155 | Args: 156 | descriptions (list of str): list of descriptions 157 | feature_outfile (str): output file for features jsonlines 158 | batch_size (int): how many docs to process in a batch 159 | n_threads (int): number of threads to process with 160 | total (int): total number of description to optionally pass to tqdm for a better loading bar 161 | 162 | Returns: 163 | no_descriptions (int): hown many descriptions were processed 164 | 165 | Examples: 166 | >>> from dsconcept.model import extract_features_from_abstracts 167 | >>> import json 168 | >>> 169 | >>> abstract1 = " A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools." 
170 | >>> abstract2 = "Since we decided a few weeks ago to adopt the leaf as legal tender, we have, of course, all become immensely rich." 171 | >>> abstracts = [abstract1, abstract2] 172 | >>> 173 | >>> feature_outfile = 'data/tmp_features.txt' 174 | >>> 175 | >>> extract_features_from_abstracts(abstracts, feature_outfile, batch_size=1, n_threads=1) 176 | >>> 177 | >>> with open(feature_outfile, 'r') as f0: 178 | >>> content = f0.readlines() 179 | >>> features = [json.loads(line) for line in content] 180 | >>> features 181 | [{'mistake': 'NOUN', 182 | 'people': 'NOUN', 183 | 'ingenuity': 'NOUN', 184 | 'fool': 'NOUN', 185 | 'a common mistake': 'NOUN_CHUNK', 186 | 'complete fool': 'NOUN_CHUNK'}, 187 | {'week': 'NOUN', 188 | 'leaf': 'NOUN', 189 | 'tender': 'NOUN', 190 | 'course': 'NOUN', 191 | 'legal tender': 'NOUN_CHUNK'}] 192 | """ 193 | 194 | LOG.info("Extracting features to {}".format(feature_outfile)) 195 | no_descriptions = 0 196 | with open(feature_outfile, "w") as f0: 197 | for doc in tqdm( 198 | nlp.pipe(descriptions, batch_size=batch_size, n_threads=n_threads,), 199 | total=total, 200 | ): 201 | json.dump(extract_from_doc(doc), f0) # each line is valid json 202 | f0.write("\n") 203 | no_descriptions += 1 204 | 205 | LOG.info("Extracted feature sets to {}".format(feature_outfile)) 206 | return no_descriptions 207 | 208 | 209 | class FeatureExtractor: 210 | def __init__(self): 211 | """ 212 | A term extractor. 
213 | 214 | Examples: 215 | >>> from dsconcept.model import FeatureExtractor 216 | >>> extractor = FeatureExtractor() 217 | """ 218 | self._features = list() 219 | self.term_types = dict() 220 | self.feature_counts = Counter() 221 | 222 | @property 223 | def features(self): 224 | return self._features 225 | 226 | @features.setter 227 | def features(self, value): 228 | self._features = value 229 | self.term_types = { 230 | term_type 231 | for feature_set in self._features 232 | for term_type in feature_set.values() 233 | } 234 | all_features = [ 235 | feature 236 | for feature_set in self._features 237 | for feature, val in feature_set.items() 238 | ] 239 | self.feature_counts = Counter(all_features) 240 | 241 | @staticmethod 242 | def from_corpus_to_jsonlines( 243 | in_corpus, out_features, abstract_field, batch_size=1000, n_threads=cpu_count() 244 | ): 245 | """ 246 | 247 | Args: 248 | in_corpus (pathlib.Path | str): input path to json file containing corpus 249 | out_features (pathlib.Path | str): output path for features json lines file. 250 | abstract_field (str): name of abstract field for corpus 251 | batch_size (int): size of batch to use when multithreading using spacy's nlp.pipe 252 | n_threads (int): number of threads to use when multithreading using spacy's nlp.pipe 253 | 254 | Returns: 255 | n_descriptions (int): the number of abstracts in the corpus 256 | 257 | """ 258 | 259 | n_lines = file_len(in_corpus) 260 | with open(in_corpus, "r") as f0: 261 | record_generator = (json.loads(l) for l in f0.readlines()) 262 | text_generator = (r[abstract_field] for r in record_generator) 263 | n_descriptions = extract_features_from_abstracts( 264 | text_generator, out_features, batch_size, n_threads, total=n_lines 265 | ) 266 | return n_descriptions 267 | 268 | def from_jsonlines(self, in_features): 269 | """ 270 | Load features from jsonlines. 
271 | 272 | Args: 273 | in_features (pathlib.Path | str): path to input jsonlines features file 274 | 275 | Returns: 276 | in_features (pathlib.Path | str): path to input jsonlines features file 277 | 278 | """ 279 | with open(in_features, "r") as f0: 280 | content = ( 281 | f0.readlines() 282 | ) # each line is json formatted, but whole file is not. 283 | self.features = [json.loads(line) for line in content] 284 | return in_features 285 | 286 | def to_jsonlines(self, out_features): 287 | """ 288 | Output features to jsonlines. 289 | 290 | Args: 291 | out_features (pathlib.Path | str): output path to features jsonlines file 292 | 293 | Returns: 294 | out_features (pathlib.Path | str): output path to features jsonlines file 295 | 296 | """ 297 | with open(out_features, "w") as f0: 298 | for feature_set in self.features: 299 | json.dump(feature_set, f0) # each line is valid json 300 | f0.write("\n") 301 | return out_features 302 | 303 | def weight_terms(self, weights: Dict[str, int]): 304 | """ 305 | Weights features according to tag type. 306 | 307 | Args: 308 | weights (dict of str): mappings from term types to their weights 309 | 310 | Returns: 311 | weighted_features (list of dict): features with mappings to weights instead of term types 312 | 313 | Examples 314 | -------- 315 | >>> weights = {'NOUN': 1, 'NOUN_CHUNK': 2} 316 | >>> weighted_features = tm.weight_terms(weights) 317 | >>> weighted_features 318 | [{'mistake': 1, 319 | 'people': 1, 320 | 'ingenuity': 1, 321 | 'fool': 1, 322 | 'a common mistake': 2, 323 | 'complete fool': 2}, 324 | {'week': 1, 'leaf': 1, 'tender': 1, 'course': 1, 'legal tender': 2}] 325 | """ 326 | assert type(weights) is dict, "Weights must be dict: {}".format(weights) 327 | if self.term_types > weights.keys(): 328 | LOG.warning( 329 | "Term types without a specified weight will be omitted from returned feature sets." 
330 | ) 331 | elif self.term_types < weights.keys(): 332 | LOG.warning( 333 | "More term types specified then those which exist in corpus. Ignoring excess." 334 | ) 335 | weighted_features = [ 336 | weight_terms_inner(doc_features, weights) for doc_features in self.features 337 | ] 338 | return weighted_features 339 | 340 | def limit_features( 341 | self, 342 | weighted_features, 343 | feature_min, 344 | feature_max, 345 | topic=None, 346 | doc_topic_matrix=None, 347 | ): 348 | """ 349 | Cull features. 350 | 351 | Args: 352 | weighted_features (list of dict): features with assigned weights 353 | feature_min (int): features which have in-corpus frequencies under feature_min are excluded. 354 | feature_max (float): features which occur in greater than this percentage of documents are excluded. 355 | topic (int | None): if specified, only return feature sets with maximum probability to be in this topic. 356 | doc_topic_matrix (numpy.ndarray): topic probability distributions for each document in corpus. 357 | 358 | Returns: 359 | weighted_limited (list): limited features with assigned weights 360 | 361 | Examples: 362 | >>> limited_features = tm.limit_features_for_X(weighted_features, feature_min=1, feature_max=0.99) 363 | """ 364 | assert (feature_max > 0.0) and ( 365 | feature_max <= (1.0) 366 | ), "feature_max should be float in (0,1]" 367 | feature_ex = { 368 | feature: occurrence 369 | for feature, occurrence in self.feature_counts.items() 370 | if (occurrence >= feature_min) 371 | and (occurrence / len(self.features) < feature_max) 372 | } 373 | 374 | weighted_limited = [ 375 | { 376 | feature: val 377 | for feature, val in feature_set.items() 378 | if feature in feature_ex 379 | } 380 | for feature_set in weighted_features 381 | ] 382 | 383 | if topic is not None: 384 | assert doc_topic_matrix is not None, LOG.error( 385 | "Must supply doc_topic_matrix when using topic model segmentation." 
386 | ) 387 | LOG.info(f"Segmenting vectorizer and matrix for topic {topic}.") 388 | print("here") 389 | in_topic_index = [ 390 | i for i, distr in enumerate(doc_topic_matrix) if distr.argmax() == topic 391 | ] 392 | weighted_limited = [weighted_limited[i] for i in in_topic_index] 393 | 394 | return weighted_limited 395 | 396 | 397 | def weight_terms_inner(doc_features, weights): 398 | """ 399 | 400 | Args: 401 | doc_features (dict): features with assigned tags 402 | weights (dict): tag to weight mappings 403 | 404 | Returns: 405 | weighted_terms (dict): features with assigned weights 406 | 407 | Examples 408 | >>> from dsconcept.model import weight_terms_inner 409 | >>> features = {'ship': 'NOUN', 'sky': 'NOUN', 'way': 'NOUN', 'brick': 'NOUN', 'the ship': 'NOUN_CHUNK'} 410 | >>> weights = {'NOUN': 1, 'NOUN_CHUNK': 3} 411 | >>> weighted_terms = weight_terms_inner(features, weights) 412 | >>> weighted_terms 413 | {'ship': 1, 'sky': 1, 'way': 1, 'brick': 1, 'the ship': 3} 414 | """ 415 | weighted_terms = {} 416 | for pos0, weight in weights.items(): 417 | updated_dict = {w: weight for w, pos in doc_features.items() if pos == pos0} 418 | weighted_terms.update(updated_dict) 419 | 420 | return weighted_terms 421 | 422 | 423 | class ConceptExtractor: 424 | def __init__(self): 425 | """ 426 | Information about relationship between concepts/keywords and corpus. 
427 | 428 | Examples: 429 | >>> from dsconcept.model import ConceptExtractor 430 | >>> kwd_sets = [['Zaphod', 'Arthur'], ['Arthur'], ['Zaphod'], ['Heart of Gold']] 431 | >>> info = ConceptExtractor.concept_sets = kwd_sets 432 | >>> info.concepts 433 | {'arthur', 'heart of gold', 'zaphod'} 434 | """ 435 | self._concept_sets = [] 436 | self.raw2lemma = {} 437 | self.lemma2raw = {} 438 | self.lemmatizer = None 439 | self.concepts_frequencies = Counter() 440 | self.concepts = set() 441 | self.concept_index_mapping = {} 442 | 443 | @property 444 | def concept_sets(self): 445 | return self._concept_sets 446 | 447 | @concept_sets.setter 448 | def concept_sets(self, value): 449 | """ 450 | Sets concepts_sets and the attributes derived from it. 451 | 452 | Args: 453 | value (list of list of str): A list of lists of strings; each string being a concept, 454 | each set in the larger list corresponding to a document which has the tags seen in the set. 455 | """ 456 | self._concept_sets = value 457 | LOG.debug("Extracting raw keywords as concepts.") 458 | all_concepts = [ 459 | concept 460 | for concept_set in tqdm(self._concept_sets) 461 | for concept in concept_set 462 | if concept.strip() != "" 463 | ] 464 | raw_concepts = set(all_concepts) 465 | 466 | LOG.debug("Lemmatizing {} raw concepts.".format(len(raw_concepts))) 467 | concepts = [c.lower() for c in raw_concepts] 468 | 469 | self.raw2lemma = {rc: c for rc, c in zip(raw_concepts, concepts)} 470 | lookups = Lookups() 471 | lookups.add_table("lemma_lookup", self.raw2lemma) 472 | self.lemmatizer = Lemmatizer(lookups) 473 | self.lemma2raw = {v: k for k, v in self.raw2lemma.items()} 474 | lemma_concepts = [ 475 | self.lemmatizer(concept, "NOUN")[0] for concept in all_concepts 476 | ] 477 | self.concepts_frequencies = Counter(lemma_concepts) 478 | self.concepts = set(lemma_concepts) 479 | self._fit_concept_indices() 480 | 481 | def _fit_concept_indices(self): 482 | kwd_sets_lemmas = [ 483 | [self.lemmatizer(kwd, "NOUN")[0] 
for kwd in kwd_set] 484 | for kwd_set in self.concept_sets 485 | ] 486 | concepts_with_inds = dict() 487 | for i, kwd_set in enumerate(kwd_sets_lemmas): 488 | for kwd in kwd_set: 489 | if kwd not in concepts_with_inds: 490 | concepts_with_inds[kwd] = [i] 491 | else: 492 | concepts_with_inds[kwd].append(i) 493 | self.concept_index_mapping = concepts_with_inds 494 | 495 | def from_corpus(self, in_corpus, concept_field): 496 | """ 497 | Extract concepts from input json corpus. 498 | 499 | Args: 500 | in_corpus (pathlike): path to input json-formatted corpus from which to extract concepts 501 | concept_field (str): the name of the concept field 502 | """ 503 | with open(in_corpus, "r") as f0: 504 | record_generator = (json.loads(l) for l in f0.readlines()) 505 | concept_sets = [r[concept_field] for r in record_generator] 506 | with_concepts = [i for i, cs in enumerate(concept_sets) if cs is not []] 507 | assert len(with_concepts) > 0, LOG.error( 508 | f'"{concept_field}" not present in corpus.' 509 | ) 510 | LOG.debug(f"{len(with_concepts)} docs in corpus with {concept_field}.") 511 | self.concept_sets = concept_sets 512 | 513 | def to_jsons(self, out_indices, out_raw2lemma): 514 | """ 515 | Output indices and raw2lemma dicts to json files. 
516 | 517 | Args: 518 | out_indices (pathlib.Path): path to output file containing indices for concepts 519 | out_raw2lemma (pathlib.Path): path to output file containing mappings from concepts to their lemmas 520 | 521 | Returns: 522 | out_indices (pathlib.Path): path to output file containing indices for concepts 523 | out_raw2lemma (pathlib.Path): path to output file containing mappings from concepts to their lemmas 524 | 525 | """ 526 | with open(out_indices, "w") as f0: 527 | json.dump(self.concept_index_mapping, f0) 528 | with open(out_raw2lemma, "w") as f0: 529 | json.dump(self.raw2lemma, f0) 530 | return out_indices, out_raw2lemma 531 | 532 | def from_jsons( 533 | self, in_indices, in_raw2lemma 534 | ): # a little strange because it does not fill in all attributes 535 | """ 536 | Load index and raw2lemma dictionaries into empty ConceptExtractor 537 | 538 | Args: 539 | in_indices (): 540 | in_raw2lemma (): 541 | """ 542 | with open(in_indices, "r") as f0: 543 | self.concept_index_mapping = json.load(f0) 544 | with open(in_raw2lemma, "r") as f0: 545 | self.raw2lemma = json.load(f0) 546 | lookups = Lookups() 547 | lookups.add_table("lemma_lookup", self.raw2lemma) 548 | self.lemmatizer = Lemmatizer(lookups) 549 | self.lemma2raw = {v: k for k, v in self.raw2lemma.items()} 550 | self.concepts = self.concept_index_mapping.keys() 551 | tmp_frequencies = { 552 | concept: len(index) for concept, index in self.concept_index_mapping.items() 553 | } 554 | self.concepts_frequencies = Counter(tmp_frequencies) 555 | 556 | def get_top_concepts(self, min_freq=500): 557 | """ 558 | 559 | Args: 560 | min_freq (int): occurrence threshold for concepts 561 | 562 | Returns: 563 | top_concepts(dict): a subset of the 564 | 565 | Examples: 566 | >>> info.get_top_concepts(2) 567 | >>> info.top_concepts 568 | ['zaphod', 'arthur'] 569 | """ 570 | LOG.info(f"Getting indices for concepts with frequency >= {min_freq}.") 571 | top_concepts = { 572 | concept: index 573 | for concept, index 
in self.concept_index_mapping.items() 574 | if len(index) >= min_freq 575 | } 576 | return top_concepts 577 | -------------------------------------------------------------------------------- /src/dsconcept/get_metrics.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import logging 4 | from math import ceil 5 | from multiprocessing import cpu_count 6 | from pathlib import Path 7 | from typing import List, Dict, Tuple 8 | 9 | import dask 10 | import h5py 11 | import joblib 12 | import numpy as np 13 | import pandas as pd 14 | from dask.diagnostics import ProgressBar 15 | from sklearn.feature_extraction import DictVectorizer 16 | from sklearn.metrics import ( 17 | accuracy_score, 18 | roc_auc_score, 19 | recall_score, 20 | precision_score, 21 | ) 22 | from sklearn.model_selection import GridSearchCV 23 | from tqdm import tqdm as tqdm 24 | from tempfile import NamedTemporaryFile, TemporaryDirectory 25 | 26 | import dsconcept.model as ml 27 | 28 | logging.basicConfig(level=logging.INFO) 29 | LOG = logging.getLogger(__name__) 30 | LOG.setLevel(logging.INFO) 31 | 32 | PRED_LIST_TYPE = List[List[Tuple[str, float]]] 33 | 34 | 35 | def get_cat_inds( 36 | categories: List[str], cat_preds: np.array, t: float = 0.5 37 | ) -> Dict[str, np.array]: 38 | """ 39 | Apply a threshold to get documents indices corresponding to each category. 40 | 41 | Args: 42 | categories: list of categories which are columns of the cat_preds array 43 | cat_preds: array of scores for each category for each document 44 | ([documents, categories]) 45 | t: threshold over which a category is determined to be relevant 46 | to a given document 47 | 48 | Returns: 49 | all_cat_inds: dictionary with keys which are categories. 50 | Values are index of documents which apply to each category. 
51 | 52 | Examples: 53 | >>> from get_metrics import get_cat_inds 54 | >>> import numpy as np 55 | >>> cats = ['physics', 'geology'] 56 | >>> cat_preds = np.array([[0.4, 0.8], [0.5, 0.6], [0.9, 0.3]]) 57 | >>> get_cat_inds(cats, cat_preds, t=0.5) 58 | {'physics': array([2]), 'geology': array([0, 1])} 59 | """ 60 | all_cat_inds = {} 61 | for i, cat in enumerate(categories): 62 | if cat == "": 63 | continue 64 | x = cat_preds[:, i] 65 | g_args = np.argwhere(x > t) 66 | if g_args.shape[0] == 0: 67 | cat_inds = np.array([]) 68 | else: 69 | cat_inds = np.stack(np.argwhere(x > t), axis=1)[0] 70 | all_cat_inds[cat] = cat_inds 71 | return all_cat_inds 72 | 73 | 74 | def f_score(r: float, p: float, b: int = 1): 75 | """ 76 | Calculate f-measure from recall and precision. 77 | 78 | Args: 79 | r: recall score 80 | p: precision score 81 | b: weight of precision in harmonic mean 82 | 83 | Returns: 84 | val: value of f-measure 85 | """ 86 | try: 87 | val = (1 + b ** 2) * (p * r) / (b ** 2 * p + r) 88 | except ZeroDivisionError: 89 | val = 0 90 | return val 91 | 92 | 93 | def get_mets( 94 | i: int, 95 | synth_preds: np.array, 96 | target_vals: np.array, 97 | con_with_clf: np.array, 98 | pbar=None, 99 | ) -> dict: 100 | """ 101 | Get various metrics for the given arrays. 102 | # 103 | TODO: just pass in the already sliced synth_preds, Y, and con_with_clf? 
104 | 105 | Args: 106 | i: index for the given concept 107 | synth_preds: arrays of predictions for each document and each concept 108 | target_vals: true values for each document and concept 109 | con_with_clf: arrays of concepts corresponding 110 | to columns synth_preds and target_vals 111 | 112 | Returns: 113 | metrics: metric records for the given concept 114 | """ 115 | tmp_y_pred = synth_preds[:, i] 116 | tmp_y_pred_bool = [1 if v > 0.5 else 0 for v in tmp_y_pred] 117 | tmp_y_test = target_vals[:, i] 118 | p = precision_score(tmp_y_test, tmp_y_pred_bool) 119 | r = recall_score(tmp_y_test, tmp_y_pred_bool) 120 | f = f_score(r, p) 121 | accuracy = accuracy_score(tmp_y_test, tmp_y_pred_bool) 122 | try: 123 | roc_auc = roc_auc_score(tmp_y_test, tmp_y_pred) 124 | except ValueError: # why does this happen? 125 | roc_auc = np.nan 126 | metrics = { 127 | "concept": con_with_clf[i], 128 | "accuracy": accuracy, 129 | "f1": f, 130 | "precision": p, 131 | "recall": r, 132 | "roc_auc": roc_auc, 133 | } 134 | if pbar is not None: 135 | pbar.update(1) 136 | return metrics 137 | 138 | 139 | def synth_mean( 140 | kwd_preds_tmp: np.array, doc_index: int, concept_index: int, non_zero_cats: list, 141 | ) -> float: 142 | """ 143 | Get the mean of nonzero predictions for given concept and given document. 144 | # TODO: get the precise matrix outside of function? Then pass in? 
145 | 146 | Args: 147 | kwd_preds_tmp: 3D array of predictions 148 | [categories, documents, concepts] 149 | doc_index: index of test document 150 | concept_index: index of concept 151 | non_zero_cats: categories for which this concept has nonzero prediction 152 | 153 | Returns: 154 | mean: mean of nonzero predictions for this concept for this document 155 | """ 156 | if len(non_zero_cats) != 0: 157 | mean = np.mean(kwd_preds_tmp[non_zero_cats, doc_index, concept_index]) 158 | else: 159 | mean = np.nan 160 | return mean 161 | 162 | 163 | def synth_max( 164 | kwd_preds_tmp: np.array, doc_index: int, concept_index: int, non_zero_cats: list, 165 | ) -> float: 166 | """ 167 | Get the max of nonzero predictions for given concept and given document. 168 | # TODO: nearly same as above function. Just pass in the np.nanmax or mean as args and collapse into one function? 169 | """ 170 | if len(non_zero_cats) != 0: 171 | val = np.nanmax(kwd_preds_tmp[non_zero_cats, doc_index, concept_index]) 172 | else: 173 | val = np.nan 174 | return val 175 | 176 | 177 | def get_means_for_one_doc( 178 | doc_index: int, 179 | all_cat_inds: Dict[str, np.array], 180 | kwd_preds_tmp: np.array, 181 | categories: List[str], 182 | no_cat_ind: int, 183 | only_cat: bool = False, 184 | synth_strat: str = "mean", 185 | pbar=None, 186 | ) -> np.array: 187 | """ 188 | Get mean of nonzero concept predictions for each concepts 189 | in relevant categories for given doc. 190 | 191 | Args: 192 | doc_index: index of given document 193 | all_cat_inds: dictionary with keys which are categories. 194 | Values are index of documents which apply to each category. 
195 | kwd_preds_tmp: array of all predictions 196 | [categories, documents, concepts] 197 | categories: list of categories 198 | no_cat_ind: index in categories list of the blank category "" 199 | only_cat: Only use category classifier or mixin the no category classifiers 200 | synth_strat: either "mean" or "max" 201 | # TODO: just pass a function instead of string? 202 | 203 | Returns: 204 | kwd_vals: array of synthesizes keyword prediction values 205 | for given document 206 | """ 207 | cats = [ 208 | cat for cat, inds in all_cat_inds.items() if doc_index in inds 209 | ] # get category by index instead? means all_cat index should be by index 210 | cat_inds = [categories.index(cat) for cat in cats] 211 | if only_cat is False: 212 | cat_inds.append(no_cat_ind) 213 | # ^ also average with the no-topic set, make this a decision? 214 | kwd_vals = [] 215 | for concept_index in range(kwd_preds_tmp.shape[2]): 216 | non_zero_cats = np.where(kwd_preds_tmp[:, doc_index, concept_index] != 0)[0] 217 | non_zero_cats = list(set(non_zero_cats).intersection(set(cat_inds))) 218 | assert synth_strat in ["mean", "max"], LOG.exception( 219 | f'Synthesis strategy "{synth_strat}" is invalid.' 220 | ) 221 | strat = synth_mean if synth_strat == "mean" else synth_max 222 | v = strat(kwd_preds_tmp, doc_index, concept_index, non_zero_cats) 223 | kwd_vals.append(v) 224 | kwd_vals = np.array(kwd_vals) 225 | if pbar is not None: 226 | pbar.update(1) 227 | return kwd_vals 228 | 229 | 230 | def create_ground_truth( 231 | store: str, 232 | dataset: str, 233 | test_inds: np.array, 234 | train_inds: np.array, 235 | concepts_with_classifiers: np.array, 236 | kwd_ext: ml.ConceptExtractor, 237 | batch_size: int, 238 | ): 239 | """ 240 | Make an array of ground truth binary labels. 
241 | 242 | Args: 243 | store: location of h5 database 244 | dataset: name of dataset in h5 database 245 | at which to store ground_truth array 246 | test_inds: test indices in the training data 247 | train_inds: training indices in the training data 248 | concepts_with_classifiers: all concepts which have models 249 | kwd_ext: ml.ConceptExtractor with ground_truth indices for concepts 250 | batch_size: batch_size for creating ground truth for each concept 251 | 252 | Returns: 253 | store, dataset: h5 store location and dataset name 254 | """ 255 | with h5py.File(store, "a") as f0: 256 | ground_truth = f0.create_dataset( 257 | dataset, 258 | shape=(len(test_inds), len(concepts_with_classifiers)), 259 | compression="gzip", 260 | ) 261 | n_batches = np.int(np.ceil(len(concepts_with_classifiers) / batch_size)) 262 | for n in tqdm(range(n_batches)): 263 | start_batch = n * batch_size 264 | end_batch = (n + 1) * batch_size 265 | if end_batch >= len(concepts_with_classifiers): 266 | end_batch = len(concepts_with_classifiers) - 1 267 | batch_matrix = np.zeros((len(test_inds), end_batch - start_batch)) 268 | con_batch = concepts_with_classifiers[start_batch:end_batch] 269 | for i, con in enumerate(con_batch): 270 | index = kwd_ext.concept_index_mapping[con] 271 | y_full = np.zeros((len(test_inds) + len(train_inds))) 272 | y_full[index] = 1 273 | y = y_full[test_inds] 274 | batch_matrix[:, i] = y 275 | ground_truth[:, start_batch:end_batch] = batch_matrix 276 | 277 | return store, dataset 278 | 279 | 280 | # TODO: maybe make this a part of the hierarchical class 281 | def get_synth_preds( 282 | store, 283 | shape, 284 | all_cat_inds, 285 | categories, 286 | batch_size, 287 | only_cat, 288 | synth_strat, 289 | use_dask=True, 290 | con_limit=None, 291 | limit=None, 292 | pbar=None, 293 | ): 294 | with h5py.File(store, "a") as f_synth, h5py.File(store, "r") as f_preds: 295 | if "synthesis" in f_synth.keys(): 296 | del f_synth['synthesis'] 297 | 
f_synth.create_dataset("synthesis", shape) 298 | synth_preds = f_synth["synthesis"] 299 | if (limit is not None): 300 | kwd_preds = f_preds["predictions"][:, 0:limit, :] 301 | else: 302 | kwd_preds = f_preds["predictions"] 303 | n_batches = np.ceil(kwd_preds.shape[1] / batch_size) 304 | LOG.debug(f"{n_batches} batches") 305 | no_cat_ind = categories.index("") 306 | for n in range(int(n_batches)): 307 | start_batch = n * batch_size 308 | end_batch = (n + 1) * batch_size 309 | if con_limit is not None: 310 | kwd_preds_tmp = kwd_preds[0:con_limit, start_batch:end_batch, :] 311 | else: 312 | kwd_preds_tmp = kwd_preds[:, start_batch:end_batch, :] 313 | n_docs = kwd_preds_tmp.shape[1] 314 | if True: # use_dask is True: 315 | kwd_preds_tmp = dask.delayed(kwd_preds_tmp) 316 | all_cat_inds = dask.delayed(all_cat_inds) 317 | jobs = [] 318 | for doc_index in range(n_docs): 319 | # should be everything now, since '' category is included 320 | job = dask.delayed(get_means_for_one_doc)( 321 | doc_index, 322 | all_cat_inds, 323 | kwd_preds_tmp, 324 | categories, 325 | no_cat_ind, 326 | synth_strat, 327 | pbar=pbar, 328 | ) 329 | jobs.append(job) 330 | hybrid_preds = dask.compute(jobs)[0] 331 | else: 332 | hybrid_preds = [] 333 | for doc_index in range(n_docs): 334 | # should be everything now, since '' category is included 335 | v = get_means_for_one_doc( 336 | doc_index, 337 | all_cat_inds, 338 | kwd_preds_tmp, 339 | categories, 340 | no_cat_ind, 341 | only_cat, 342 | synth_strat, 343 | pbar=pbar, 344 | ) 345 | hybrid_preds.append(v) 346 | hybrid_pred_array = np.stack(hybrid_preds) 347 | if limit is not None: 348 | if limit <= end_batch: 349 | synth_preds[start_batch:limit, :] = hybrid_pred_array 350 | else: 351 | synth_preds[start_batch:end_batch, :] = hybrid_pred_array 352 | else: 353 | synth_preds[start_batch:end_batch, :] = hybrid_pred_array 354 | 355 | 356 | def load_category_models(in_cat_models: str) -> List[dict]: 357 | """ 358 | Load all category models from given 
directory 359 | 360 | Args: 361 | in_cat_models: directory where category models reside 362 | 363 | Returns: 364 | cat_clfs: A list of dictionaries, each with a category model 365 | """ 366 | LOG.info(f"Loading category classifiers from {in_cat_models}.") 367 | in_clfs = list(Path(in_cat_models).iterdir()) 368 | cat_clfs = [joblib.load(c) for c in tqdm(in_clfs)] 369 | return cat_clfs 370 | 371 | 372 | def load_concept_models(in_kwd_models: str, load: bool = True) -> Dict[Tuple[str, str], GridSearchCV]: 373 | """ 374 | Load keyword models from given directory. 375 | 376 | Args: 377 | in_kwd_models: directory with subdirs, the suffixes of which are the 378 | names of the categories (ex. topic_physics). Each of these 379 | subfolders contains binary files for concepts in that category. 380 | The classifiers trained on all documents are in a subfolder which 381 | has not suffix (ex. topic_). 382 | load: whether to load the models into memory, or just get their paths 383 | 384 | Returns: 385 | cd: Dictionary with all classifiers for each category. 
386 | """ 387 | LOG.info(f"Loading keyword classifiers from {in_kwd_models}.") 388 | cd = {} # expects no_topics with suffix '' 389 | topic_dirs = list(Path(in_kwd_models).iterdir()) 390 | total = 0 391 | for td in topic_dirs: 392 | in_clfs = list(td.iterdir()) 393 | total += len(in_clfs) 394 | pbar = tqdm(topic_dirs, total=total) 395 | for topic_dir in pbar: 396 | topic_name = topic_dir.stem.split("_")[1] # depends on opinionated path format 397 | pbar.set_description(topic_name) 398 | in_clfs = list(topic_dir.iterdir()) 399 | clfs = (joblib.load(c) for c in in_clfs) # generator for loading classifiers 400 | for c, c_loc in zip(clfs, in_clfs): 401 | if load is True: 402 | cd[topic_name, c["concept"]] = c["best_estimator_"] 403 | else: 404 | cd[topic_name, c['concept']] = c_loc 405 | pbar.update(1) 406 | return cd 407 | 408 | 409 | def make_predictions( 410 | in_cat_models, 411 | in_kwd_models, 412 | feature_matrix, 413 | out_store="test_results/store.h5", 414 | t=None, 415 | ): 416 | cat_clfs = load_category_models(in_cat_models) 417 | cd = load_concept_models(in_kwd_models) 418 | clf = HierarchicalClassifier(cat_clfs, cd) 419 | LOG.info("Predicting categories.") 420 | cat_preds = clf.predict_categories(feature_matrix) 421 | if t is not None: 422 | LOG.info("Only making predictions for keywords in predicted categories.") 423 | cat_indices = get_cat_inds(clf.categories, cat_preds, t) 424 | # TODO: add rule for when cat_indices has nothing in it! 
425 | all_kwd_preds_loc = clf._predict_keywords( 426 | feature_matrix, out_store, cat_indices 427 | ) 428 | else: 429 | LOG.info("Predicting for all keywords on all documents.") 430 | # TODO: this should call a public function 431 | all_kwd_preds_loc = clf._predict_keywords(feature_matrix, out_store) 432 | LOG.info(f"all_kwd_preds_loc={all_kwd_preds_loc}") 433 | 434 | return clf.categories, clf.concepts_with_classifiers, cat_preds 435 | 436 | 437 | class StubBestEstimator: 438 | """ 439 | Stub class for classifier's best_estimator to be used for testing. 440 | """ 441 | 442 | def init(self): 443 | pass 444 | 445 | def predict_proba(self, feature_matrix): 446 | val = np.random.rand(feature_matrix.shape[0], 2) 447 | return val 448 | 449 | 450 | def main( 451 | experiment_name, out_store, out_cat_preds, gt_batch_size, limit=None, 452 | ): 453 | LOG.info("Loading test data and models.") 454 | # TODO: paths should be put into main function 455 | test_inds = np.load(f"data/interim/{experiment_name}/test_inds.npy") 456 | train_inds = np.load(f"data/interim/{experiment_name}/train_inds.npy") 457 | feature_matrix = joblib.load(f"data/interim/{experiment_name}/feature_matrix.jbl") 458 | in_cat_models = Path(f"models/{experiment_name}/categories/models/") 459 | in_kwd_models = Path(f"models/{experiment_name}/keywords/models/") 460 | 461 | if limit is not None: 462 | LOG.info(f"Limiting to {limit} test records.") 463 | feature_matrix_test = feature_matrix.tocsc()[test_inds[0:limit], :] 464 | # TODO: How does this affect indices? 
    else:
        feature_matrix_test = feature_matrix.tocsc()[test_inds, :]

    LOG.info("Making predictions.")
    categories, concepts_with_classifiers, cat_preds, = make_predictions(
        in_cat_models, in_kwd_models, feature_matrix_test, out_store,
    )  # need t if limiting
    np.save(out_cat_preds, cat_preds)
    LOG.info("Creating ground truth data.")
    kwd_ext = ml.ConceptExtractor()  # TODO: these paths should be provided as args
    kwd_ext.from_jsons(
        f"data/interim/{experiment_name}/kwd_indices.json",
        f"models/{experiment_name}/kwd_raw2lemma.json",
    )
    # Write the ground-truth dataset next to the predictions in the same store.
    create_ground_truth(
        store=out_store,
        dataset="ground_truth",
        kwd_ext=kwd_ext,
        concepts_with_classifiers=concepts_with_classifiers,
        batch_size=gt_batch_size,
        train_inds=train_inds,
        test_inds=test_inds,
    )


def get_category_results(cat_models_dir: Path) -> pd.DataFrame:
    """
    Collect cross-validation scores for every category classifier.

    Args:
        cat_models_dir: directory containing one joblib file per category model

    Returns:
        DataFrame with one row per category: score columns plus "concept"
    """
    in_clfs = list(cat_models_dir.iterdir())
    cat_clfs = [joblib.load(c) for c in in_clfs]  # loads the classifiers
    cat_results_df = pd.DataFrame(
        [{**c["scores"], **{"concept": c["concept"]}} for c in cat_clfs]
    )
    return cat_results_df


def get_keyword_results(kwd_models_dir: Path) -> pd.DataFrame:
    """
    Collect cross-validation scores for every keyword classifier,
    grouped under their categories.

    Args:
        kwd_models_dir: directory containing one subdirectory of
            classifier joblib files per category

    Returns:
        DataFrame with one row per keyword classifier: score columns
        plus "concept" and "category"
    """
    cd = {}
    for topic_dir in kwd_models_dir.iterdir():
        in_clfs = list(topic_dir.iterdir())
        clfs = (joblib.load(c) for c in in_clfs)  # loads the classifiers
        topic_name = topic_dir.stem.split("_")[1]  # depends on opinionated path format
        cd[topic_name] = clfs

    all_records = []
    for t, clfs in tqdm(cd.items()):
        for clf in clfs:
            r = {**{"concept": clf["concept"], "category": t}, **clf["scores"]}
            all_records.append(r)
    results_df = pd.DataFrame(all_records)
    return results_df


class HierarchicalClassifier:
    """
    Hierarchical Classifier object which allows for streamlined predictions
    on suites of concept models associated with different categories.

    Attributes:
        categories: list of categories
        concepts_with_classifiers: sorted array of concepts with classifiers
        cat_concept_indices: list where each element maps onto a category.
            Each element consists of a selection of indices
            in concepts_with_classifier which occur in the given category.
        vectorizer: DictVectorizer for transforming features
    """

    def __init__(
        self, cat_clfs: List[dict], kwd_clfs: Dict[Tuple[str, str], GridSearchCV],
    ):
        """
        Set the models for categories and concepts_with_classifiers

        Args:
            cat_clfs: category classifier models
            kwd_clfs: Dictionary with keys which are tuples
                of categories and concepts, values are the classifier models
        """
        # NOTE: assignment order matters — the kwd_clfs setter reads
        # self.categories, which the cat_clfs setter creates.
        self.cat_clfs = cat_clfs
        self.kwd_clfs = kwd_clfs
        self.vectorizer = None

    @property
    def cat_clfs(self):
        """
        The category classifiers.

        Setter also creates categories attribute.
        """
        return self._cat_clfs

    @property
    def kwd_clfs(self):
        """
        Dictionary with keys which are tuples of categories and concepts,
        values are the classifier models

        Setter method creates concept_indices,
        and concepts_with_classifiers attributes.
562 | """ 563 | return self._kwd_clfs 564 | 565 | @cat_clfs.setter 566 | def cat_clfs(self, cat_clfs: List[dict]): 567 | self._cat_clfs = cat_clfs 568 | self.categories = [c["concept"] for c in self.cat_clfs] + [""] 569 | 570 | @kwd_clfs.setter 571 | def kwd_clfs(self, kwd_clfs: Dict[Tuple[str, str], dict]): 572 | self._kwd_clfs = kwd_clfs 573 | category_concepts = {} 574 | 575 | for cat in self.categories: 576 | concepts = [k[1] for k, v in kwd_clfs.items() if k[0] == cat] 577 | # concepts = [clf["concept"] for clf in kwd_clfs[cat]] 578 | category_concepts[cat] = concepts 579 | 580 | all_cat_concepts = set( 581 | c for ts, cons in category_concepts.items() for c in cons 582 | ) 583 | concepts_with_classifiers = np.sort(list(all_cat_concepts)) 584 | LOG.info(f"concepts_with_classifiers: {concepts_with_classifiers.shape[0]}") 585 | 586 | cat_concept_indices = [] 587 | for cat in self.categories: 588 | full_in_cats = np.isin(concepts_with_classifiers, category_concepts[cat]) 589 | cat_concept_cols = np.where(full_in_cats)[0] 590 | cat_concept_indices.append(cat_concept_cols) 591 | 592 | self.cat_concept_indices: List[np.array] = cat_concept_indices 593 | # shape is [categories, keywords] 594 | self.concepts_with_classifiers: np.array = concepts_with_classifiers 595 | 596 | def load_vectorizer(self, v_loc: str): 597 | """ 598 | Loads the DictVectorizer 599 | 600 | Args: 601 | v_loc: location of vectorizer 602 | """ 603 | self.vectorizer: DictVectorizer = joblib.load(v_loc) 604 | 605 | def vectorize( 606 | self, 607 | texts: List[str], 608 | weights: Dict[str, int], 609 | batch_size: int = 1000, 610 | n_threads: int = cpu_count(), 611 | ) -> Tuple[List[Dict[str, str]], np.array]: 612 | """ 613 | Transform texts into a matrix of features. 
614 | 615 | Args: 616 | texts: texts to transform 617 | weights: how to weight different types of features 618 | batch_size: what batch size to pass to nlp.pipe 619 | n_threads: number of threads to use 620 | 621 | Returns: 622 | feature_matrix: matrix representation of features for each document 623 | """ 624 | assert self.vectorizer is not None, LOG.exception("Must initialize vectorizer.") 625 | fe = ml.FeatureExtractor() 626 | with NamedTemporaryFile() as tmp_features_loc: 627 | tmp_features = tmp_features_loc.name 628 | ml.extract_features_from_abstracts( 629 | texts, tmp_features, batch_size, n_threads 630 | ) 631 | fe.from_jsonlines(tmp_features) 632 | weighted_features = fe.weight_terms(weights) 633 | feature_matrix = self.vectorizer.transform(weighted_features) 634 | return fe.features, feature_matrix 635 | 636 | def predict_categories(self, feature_matrix: np.array) -> np.array: 637 | """ 638 | Make predictions with category classifiers 639 | 640 | Args: 641 | feature_matrix: array of features for each document 642 | 643 | Returns: 644 | cat_preds: prediction belief values for each document 645 | """ 646 | cat_preds_list = [ 647 | clf["best_estimator_"].predict_proba(feature_matrix)[:, 1] 648 | for clf in tqdm(self.cat_clfs) 649 | ] 650 | cat_preds = np.stack(cat_preds_list, axis=1) 651 | return cat_preds 652 | 653 | def _predict_one_clf( 654 | self, feature_matrix: np.array, concept_index: int, cat: str, pbar=None, 655 | ) -> np.array: 656 | """ 657 | Make a prediction for a particular concept. 658 | 659 | Args: 660 | feature_matrix: array of features for each document 661 | concept_index: index for the given concept 662 | in concepts_with_classifiers attribute 663 | cat: name of the given category 664 | 665 | Returns: 666 | v: predictions for all documents for the given concept 667 | """ 668 | con = self.concepts_with_classifiers[concept_index] 669 | clf = self.kwd_clfs[cat, con] 670 | try: # TODO: explicit option for this rather than interpreting? 
671 | os.fspath(clf) 672 | clf = joblib.load(clf)["best_estimator_"] 673 | except TypeError: 674 | pass 675 | v = clf.predict_proba(feature_matrix)[:, 1] 676 | if pbar is not None: 677 | pbar.update(1) 678 | return v 679 | 680 | def _predict_kwds_for_cat( 681 | self, 682 | feature_matrix: np.array, 683 | cat_index: int, 684 | predictions: np.array, 685 | cat_indices: Dict[str, List[int]] = None, 686 | use_dask: bool = True, 687 | pbar: tqdm = None, 688 | ): 689 | """ 690 | Make predictions for all documents for all concepts 691 | in the given category 692 | 693 | Args: 694 | feature_matrix: array of features for each document 695 | cat_index: index in categories attribute of the given category 696 | predictions: the h5 dataset where predictions are stored 697 | cat_indices: Predicted indices where categories occur 698 | for each category 699 | use_dask: Use dask for multiprocessing 700 | pbar: tqdm progress bar 701 | """ 702 | cat = self.categories[cat_index] 703 | pbar.set_postfix(category=cat, refresh=False) 704 | if (cat_indices is not None) and (cat != ""): 705 | feature_matrix_test = feature_matrix[cat_indices[cat], :] 706 | # this could be a problem if I want everything to perfectly align. 707 | else: 708 | feature_matrix_test = feature_matrix 709 | if feature_matrix_test.shape[0] == 0: 710 | pbar.update(len(self.cat_concept_indices[cat_index])) 711 | return 0 712 | # TODO: for good bar, should walk tasks to compute total 713 | cat_concept_cols = self.cat_concept_indices[cat_index] 714 | # use the np.where here, bool index for initial setting? 
715 | if False: # use_dask is True: 716 | feature_matrix_test = dask.delayed(feature_matrix_test) 717 | jobs = [] 718 | ProgressBar().register() 719 | for concept_index in cat_concept_cols: 720 | j = dask.delayed(self._predict_one_clf)( 721 | feature_matrix_test, concept_index, cat, pbar 722 | ) 723 | jobs.append(j) 724 | vals = dask.compute(jobs)[0] 725 | else: 726 | vals = [] 727 | for concept_index in cat_concept_cols: 728 | val = self._predict_one_clf( 729 | feature_matrix_test, concept_index, cat, pbar 730 | ) 731 | vals.append(val) 732 | if (cat_indices is not None) and (cat is not ""): 733 | # need to correct indices, zeros in places with no predictions 734 | # TODO: determine if this patching activity 735 | # takes longer than just predicting on more 736 | new_vals = [] 737 | for v in vals: 738 | new_v = np.zeros(feature_matrix.shape[0]) 739 | new_v[cat_indices[cat]] = v 740 | new_vals.append(new_v) 741 | vals = new_vals 742 | # TODO: below will not work with cat_inds 743 | if len(vals) > 0: 744 | topic_preds_sub = np.stack(vals, axis=1) 745 | predictions[cat_index, :, cat_concept_cols] = topic_preds_sub 746 | 747 | def _predict_keywords( 748 | self, 749 | feature_matrix: np.array, 750 | store: str, 751 | cat_indices: Dict[str, list] = None, 752 | only_no_topic: bool = False, 753 | use_dask: bool = True, 754 | ): 755 | """ 756 | Make keyword predictions 757 | 758 | Args: 759 | feature_matrix: array of features for each document 760 | store: location of h5 store for predictions 761 | cat_indices: Predicted indices where categories 762 | occur for each category 763 | only_no_topic: only use the models which are 764 | not associated with a category 765 | use_dask: use dask for multiprocessing 766 | 767 | Returns: 768 | store: the location of the h5 store 769 | """ 770 | all_con_checks = np.sum( 771 | np.array([a.shape[0] for a in self.cat_concept_indices]) 772 | ) 773 | if Path(store).exists(): 774 | ValueError(f"{store} already exists.") 775 | with 
h5py.File(store, "w") as f0, tqdm(total=all_con_checks) as pbar: 776 | predictions = f0.create_dataset( 777 | "predictions", 778 | ( 779 | len(self.categories), 780 | feature_matrix.shape[0], 781 | len(self.concepts_with_classifiers), 782 | ), 783 | compression="gzip", 784 | ) # [categories, docs, concepts] 785 | if only_no_topic is True: 786 | cat_index = self.categories.index("") 787 | self._predict_kwds_for_cat( 788 | feature_matrix, cat_index, predictions, cat_indices, use_dask, pbar, 789 | ) 790 | else: 791 | for cat_index in range(len(self.categories)): 792 | self._predict_kwds_for_cat( 793 | feature_matrix, 794 | cat_index, 795 | predictions, 796 | cat_indices, 797 | use_dask, 798 | pbar, 799 | ) 800 | return store 801 | 802 | def get_synth_preds( 803 | self, 804 | store: str, 805 | all_cat_inds: Dict[str, np.array], 806 | batch_size: int, 807 | only_cat: bool, 808 | synth_strat: str, 809 | use_dask: bool = True, 810 | ) -> np.array: 811 | """ 812 | Synthesize all keyword models into a single prediction score. 813 | 814 | Args: 815 | store: location of h5 database 816 | all_cat_inds: dictionary with keys which are categories. 817 | Values are index of documents which apply to each category. 818 | batch_size: batch size for synthesizing predictions 819 | only_cat: only use category classifiers in synthesis 820 | synth_strat: strategy for synthesizing category predictions 821 | use_dask: use dask for multiprocessing 822 | 823 | """ 824 | # TODO: do this without all of the intermediaries 825 | with h5py.File(store, "r") as f0: 826 | tdocs = f0["predictions"].shape[1] 827 | shape = f0["predictions"].shape[1:] 828 | with tqdm(total=tdocs) as pbar: 829 | get_synth_preds( 830 | store, 831 | shape, 832 | all_cat_inds, 833 | self.categories, 834 | batch_size, 835 | only_cat, 836 | synth_strat, 837 | use_dask, 838 | pbar=pbar, 839 | ) 840 | with h5py.File(store, "r") as f0: 841 | results = f0["synthesis"].value # TODO: optional return? 
842 | return results 843 | 844 | @staticmethod 845 | def _to_strings(tags, preds, t): 846 | all_tag_vals = [ 847 | get_tag_vals(preds[i], tags, t) for i in tqdm(range(preds.shape[0])) 848 | ] 849 | return all_tag_vals 850 | 851 | def predict( 852 | self, 853 | feature_matrix: np.array, 854 | cat_threshold: float = 0.5, 855 | concept_threshold: float = 0.5, 856 | no_categories: bool = False, 857 | only_cat: bool = False, 858 | synth_strat: str = "mean", 859 | batch_size: int = 10_000, 860 | ) -> Tuple[PRED_LIST_TYPE, PRED_LIST_TYPE]: 861 | """ 862 | Make predictions for all input texts. 863 | 864 | Args: 865 | texts: input texts for which to produce predictions 866 | cat_threhold: threshold over which to mix in category subset 867 | model predictions 868 | concept_threhold: threshold over which to return 869 | a concept prediction 870 | no_categories: whether or not to use category-specific models 871 | only_cat: only use category classifiers in synthesis 872 | synth_strat: strategy for synthesizing category concept models 873 | to produce single result. 874 | batch_size: size of batches for making predictions 875 | 876 | Returns: 877 | concept_preds: concepts and their belief scores 878 | 879 | Examples: 880 | >>> examples = ["Olympus Mons is the largest volcano in the solar system", 881 | ... "Database management is critical for information retrieval", 882 | ... 
"We used a logistic regression with batched stochastic gradient descent."] 883 | >>> weights = {'NOUN': 1, 'PROPN': 1, 'ENT': 1, 'NOUN_CHUNK':1, 'ACRONYM': 1} 884 | >>> features, feature_matrix = hclf.vectorize(examples, weights) 885 | >>> hclf.predict(feature_matrix) 886 | """ 887 | n_splits = ceil(feature_matrix.shape[0] / batch_size) 888 | r1s = [] 889 | # TODO: make temp folder and then write the file 890 | with NamedTemporaryFile() as tmp_dir: 891 | tmp_store = Path(f"{tmp_dir.name}/store.h5") 892 | cat_pred_strings = [] 893 | for n in tqdm(range(n_splits)): 894 | # TODO: Leave batching to lower methods? 895 | start = n * batch_size 896 | end = (n + 1) * batch_size 897 | matrix_slice = feature_matrix[start:end, :] 898 | cat_preds = self.predict_categories(matrix_slice) 899 | cat_inds = get_cat_inds(self.categories, cat_preds, t=cat_threshold) 900 | LOG.info(f"Predicting keywords") 901 | store_loc = self._predict_keywords( 902 | matrix_slice, 903 | tmp_store.name, 904 | cat_indices=cat_inds, 905 | use_dask=False, 906 | only_no_topic=no_categories, 907 | ) 908 | if no_categories is True: 909 | with h5py.File(store_loc) as f0: 910 | sp = f0["predictions"][-1, :, :] 911 | else: 912 | LOG.info(f"Synthesizing for each doc.") 913 | sp = self.get_synth_preds( 914 | store_loc, 915 | cat_inds, 916 | 1000000000, # TODO: more explanation here 917 | only_cat, 918 | synth_strat, 919 | use_dask=False, 920 | ) 921 | LOG.info(f"Converting to strings.") 922 | r1 = self._to_strings( 923 | self.concepts_with_classifiers, sp, concept_threshold 924 | ) 925 | cp = self._to_strings(self.categories, cat_preds, t=0.0) 926 | r1s.append(r1) 927 | cat_pred_strings.append(cp) 928 | concept_preds = [doc_preds for r1 in r1s for doc_preds in r1] 929 | all_cat_pred_strings = [ 930 | doc_preds for cp in cat_pred_strings for doc_preds in cp 931 | ] 932 | return all_cat_pred_strings, concept_preds 933 | 934 | 935 | def get_tag_vals(pred_vals: List[float], tags: List[str], t: float): 936 | 
tag_vals = [(tags[i], v) for i, v in enumerate(pred_vals) if v > t] 937 | tag_vals.sort(key=lambda x: -x[1]) 938 | return tag_vals 939 | 940 | 941 | if __name__ == "__main__": 942 | parser = argparse.ArgumentParser( 943 | description="Use category and concept models to get metrics on the test data." 944 | ) 945 | parser.add_argument("--experiment_name", help="experiment to generate metrics for") 946 | parser.add_argument("--out_store", help="h5 store in which to store results") 947 | parser.add_argument( 948 | "--out_cat_preds", help="output npy file for category predictions" 949 | ) 950 | parser.add_argument( 951 | "--batch_size", help="size of batches for creating ground truth data", type=int, 952 | ) 953 | parser.add_argument( 954 | "--limit", 955 | help="size limit for test data (for testing on smaller subset)", 956 | type=int, 957 | default=None, 958 | ) 959 | args = parser.parse_args() 960 | main( 961 | args.experiment_name, 962 | args.out_store, 963 | args.out_cat_preds, 964 | args.batch_size, 965 | args.limit, 966 | ) 967 | --------------------------------------------------------------------------------