├── __init__.py ├── data ├── .gitkeep └── interim │ └── subj_mapping.json ├── models └── .gitkeep ├── tests ├── __init__.py ├── .gitignore ├── context.py ├── test_conceptExtractor.py ├── test_extract_from_doc.py ├── test_conceptTrainer.py ├── test_featureExtractor.py └── test_hierarchicalClassifier.py ├── src ├── pipeline │ ├── __init__.py │ ├── .gitignore │ ├── start.sh │ ├── config.yml │ ├── docker_pipeline.sh │ └── pipeline.py ├── dsconcept │ ├── .gitignore │ ├── __init__.py │ ├── README.md │ ├── train.py │ ├── model.py │ └── get_metrics.py ├── features.py ├── concepts.py ├── make_vec_and_matrix.py ├── process.py ├── make_cat_models.py ├── make_records_for_cat_bert.py ├── synthesize_predictions.py ├── make_kwd_models.py ├── get_bert_cat_models_preds.py └── make_plots.py ├── .coveragerc ├── docs ├── .gitignore ├── reset.sh ├── research_access.png ├── push_pages.sh ├── code.rst ├── index.rst ├── Makefile ├── docker-versions.txt └── conf.py ├── .dockerignore ├── version.py ├── .github └── workflows │ └── greetings.yml ├── config └── test_config.yml ├── Dockerfile ├── LICENSE ├── setup.py ├── .gitignore ├── requirements.txt ├── CHANGELOG.md ├── README.md └── Makefile /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pipeline/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True -------------------------------------------------------------------------------- /src/pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | html/* 2 | _build/* 3 | -------------------------------------------------------------------------------- /src/dsconcept/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | -------------------------------------------------------------------------------- /src/dsconcept/__init__.py: -------------------------------------------------------------------------------- 1 | import dsconcept.model 2 | import dsconcept.train 3 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | .hypothesis/* 2 | .pytest_cache/* 3 | .coverage 4 | __pycache__/* -------------------------------------------------------------------------------- /docs/reset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | make clean && make html && open _build/html/index.html 3 | -------------------------------------------------------------------------------- /docs/research_access.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/concept-tagging-training/master/docs/research_access.png 
-------------------------------------------------------------------------------- /docs/push_pages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | make html 3 | cd _build/html 4 | git add . 5 | git commit -m 'rebuilt docs' 6 | git push origin gh-pages 7 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data/* 2 | __pycache__/* 3 | notebook/* 4 | reports/* 5 | env/* 6 | venv/* 7 | docs/* 8 | models/* 9 | scratch/* 10 | .hypothesis/* 11 | .pytest_cache/* 12 | *.tar 13 | *.tar.gz 14 | -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools_scm import get_version 3 | 4 | version = get_version(root=os.path.dirname(os.path.abspath(__file__))) 5 | version = ".".join(version.split(".")[:3]) 6 | print(version) 7 | -------------------------------------------------------------------------------- /docs/code.rst: -------------------------------------------------------------------------------- 1 | dsconcept 2 | ========== 3 | 4 | .. automodule:: dsconcept.train 5 | :members: 6 | :undoc-members: 7 | .. autofunction:: 8 | 9 | .. automodule:: dsconcept.model 10 | :members: 11 | .. 
autofunction:: 12 | 13 | -------------------------------------------------------------------------------- /tests/context.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.DEBUG) 10 | 11 | import dsconcept 12 | 13 | LOG.info(f"Loaded Module {dsconcept}") 14 | -------------------------------------------------------------------------------- /src/pipeline/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export MYDIR="$(dirname "$(realpath "$0")")" 3 | 4 | python ${MYDIR}/pipeline.py \ 5 | ${MYDIR}/volumes/in_data/records.json \ 6 | ${MYDIR}/volumes/in_data/config.yml \ 7 | ${MYDIR}/volumes/out_data/processed_data \ 8 | ${MYDIR}/volumes/out_data/topic_models \ 9 | ${MYDIR}/volumes/out_data/models \ 10 | -loglevel ${LOGLEVEL} 11 | -------------------------------------------------------------------------------- /.github/workflows/greetings.yml: -------------------------------------------------------------------------------- 1 | name: Greetings 2 | 3 | on: [pull_request, issues] 4 | 5 | jobs: 6 | greeting: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | issues: write 10 | pull-requests: write 11 | steps: 12 | - uses: actions/first-interaction@v1 13 | with: 14 | repo-token: ${{ secrets.GITHUB_TOKEN }} 15 | issue-message: 'Message that will be displayed on users first issue' 16 | pr-message: 'Thank you for contributing to this NASA repository!' 17 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
Research Access documentation master file, created by 2 | sphinx-quickstart on Fri Sep 14 16:48:31 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Research Access's documentation! 7 | =========================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | code 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /config/test_config.yml: -------------------------------------------------------------------------------- 1 | weights: # assign weights for term types specified in process section 2 | NOUN: 1 3 | PROPN: 1 4 | NOUN_CHUNK: 1 5 | ENT: 1 6 | ACRONYM: 1 7 | min_feature_occurrence: 10 8 | # features from corpus which occur fewer than 9 | # this many times are not used for training 10 | max_feature_occurrence: 0.9 11 | # features which occur in more than this percentage 12 | # of the corpus are not used for training 13 | min_concept_occurrence: 5 14 | # only concepts which occur greater than or equal to this many times 15 | # in the corpus will have associated classifiers created. 16 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. 
$(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /src/dsconcept/README.md: -------------------------------------------------------------------------------- 1 | # dsconcept 2 | Python library with supporting classes for runnning processing, 3 | and training of classifiers. Originally desgined for 4 | tagging the [NASA STI database](https://www.sti.nasa.gov/). 5 | 6 | ## installation 7 | You can install the dsconcept library from this repository. 8 | It also requires the [spacy 'en' language models](https://spacy.io/usage/models). 9 | ```bash 10 | pip install git+https://developer.nasa.gov/DataSquad/classifier_scripts.git 11 | python -m spacy download en 12 | ``` 13 | 14 | ## Usage 15 | Docs are available [here](../docs). 16 | You can go through a full interactive tutorial using the Dockerfile available in 17 | [notebook](#../notebook). 
18 | 19 | -------------------------------------------------------------------------------- /docs/docker-versions.txt: -------------------------------------------------------------------------------- 1 | Client: Docker Engine - Community 2 | Version: 19.03.5 3 | API version: 1.40 4 | Go version: go1.12.12 5 | Git commit: 633a0ea 6 | Built: Wed Nov 13 07:22:34 2019 7 | OS/Arch: darwin/amd64 8 | Experimental: false 9 | 10 | Server: Docker Engine - Community 11 | Engine: 12 | Version: 19.03.5 13 | API version: 1.40 (minimum version 1.12) 14 | Go version: go1.12.12 15 | Git commit: 633a0ea 16 | Built: Wed Nov 13 07:29:19 2019 17 | OS/Arch: linux/amd64 18 | Experimental: true 19 | containerd: 20 | Version: v1.2.10 21 | GitCommit: b34a5c8af56e510852c35414db4c1f4fa6172339 22 | runc: 23 | Version: 1.0.0-rc8+dev 24 | GitCommit: 3e425f80a8c931f88e6d94a8c831b9d5aa481657 25 | docker-init: 26 | Version: 0.18.0 27 | GitCommit: fec3683 28 | -------------------------------------------------------------------------------- /src/pipeline/config.yml: -------------------------------------------------------------------------------- 1 | # Configuration for research access training pipeline 2 | 3 | #image: storage.analytics.nasa.gov/rat_trainer:0.12.0 4 | 5 | process: 6 | term_types: 7 | - "NOUN" 8 | - "PROPN" 9 | - "ENT" 10 | - "NOUN_CHUNK" 11 | - "ACRONYM" 12 | abstract_field: "description" 13 | concept_field: "subject.NASATerms" 14 | 15 | topic_model: 16 | weights: # assign weights for term types specified in process section 17 | NOUN: 1 18 | PROPN: 1 19 | NOUN_CHUNK: 1 20 | ENT: 1 21 | ACRONYM: 1 22 | min_feature_occurrence: 5 23 | max_feature_occurrence: 0.9 24 | number_of_topics: 10 25 | 26 | train_classifiers: 27 | weights: # assign weights for term types specified in process section 28 | NOUN: 1 29 | PROPN: 1 30 | NOUN_CHUNK: 1 31 | ENT: 1 32 | ACRONYM: 1 33 | min_feature_occurrence: 5 34 | max_feature_occurrence: 0.9 35 | min_concept_occurrence: 10 36 | 37 | 38 | 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Create essential base image 2 | FROM python:3.7 as base 3 | COPY requirements.txt /home/ 4 | WORKDIR /home/ 5 | RUN pip install -U pip setuptools wheel && \ 6 | pip install -r requirements.txt && \ 7 | python -m spacy download en 8 | ADD src/ /home/src/ 9 | ENV PYTHONPATH=/home/src 10 | ENV PYTHONUNBUFFERED=0 11 | 12 | # Label image with git commit url 13 | ARG GIT_URL=unspecified 14 | ARG VERSION=unspecified 15 | LABEL org.label-schema.schema-version=1.0 16 | LABEL org.label-schema.url=$GIT_URL 17 | LABEL org.label-schema.version=$VERSION 18 | ENV VERSION=$VERSION 19 | 20 | # Run unittests 21 | FROM base as tests 22 | RUN pip install nose && \ 23 | pip install pytest && \ 24 | pip install coverage && \ 25 | pip install hypothesis && \ 26 | pip install testfixtures 27 | COPY tests /home/tests 28 | ARG cachebust=0 29 | # ^ Change this to avoid using cached results. These are tests, so we may want to run them. 30 | RUN nosetests --with-coverage --cover-package dsconcept 31 | 32 | # Deployment ready image 33 | FROM base as pipeline 34 | COPY Makefile /home/ 35 | ENTRYPOINT ["make"] 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | The MIT License (MIT) 3 | Copyright (c) 2020, United States Government as represented by the Administrator of the National Aeronautics and Space Administration. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from os.path import basename 3 | from os.path import splitext 4 | 5 | import setuptools 6 | 7 | setuptools.setup( 8 | name="dsconcept", 9 | use_scm_version=True, 10 | setup_requires=["setuptools_scm"], 11 | url="https://developer.nasa.gov/DataSquad/classifier_scripts", 12 | author="Anthony Buonomo", 13 | author_email="anthony.r.buonomo@nasa.gov", 14 | description="Scripts for processing, topic modeling, and creating classifiers for STI concepts.", 15 | long_description=open("README.md").read(), 16 | license="MIT", 17 | packages=setuptools.find_packages("src"), 18 | package_dir={"": "src"}, 19 | py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], 20 | install_requires=[ 21 | "scikit-learn>=0.21.3", 22 | "spacy>=2.2.3", 23 | "numpy>=1.17.4", 24 | "pandas>=0.25.3", 25 | "pyLDAvis>=2.1.2", 26 | "textacy==0.9.1", 27 | "boto3>=1.7.46", 28 | "dask>=2.8.1", 29 | "PyYAML>=5.1.2", 30 | "h5py>=2.10.0", 31 | "tqdm>=4.39.0", 32 | ], 33 | classifiers=[ 34 | "Development Status :: 2 - Beta", 35 | "Programming Language :: Python :: 3.6", 36 | ], 37 | ) 38 | -------------------------------------------------------------------------------- /tests/test_conceptExtractor.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from .context import dsconcept 3 | from testfixtures import TempDirectory 4 | from pathlib import Path 5 | import logging 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.INFO) 10 | 11 | 12 | class TestConceptExtractor(TestCase): 13 | def setUp(self): 14 | self.ce = dsconcept.model.ConceptExtractor() 15 | self.d = TempDirectory() 16 | 17 | def test_concept_sets(self): 18 | self.ce.concept_sets = [ 19 | ["MARS", 
"NASA"], 20 | ["NASA"], 21 | ["MARS"], 22 | ["HIT", "JUPITER"], 23 | ] 24 | 25 | def test_from_corpus(self): 26 | data = b'{"abstract":"Astronauts are very cool.", "concept": ["ASTRONAUTS", "COOL THINGS"]} \n {"abstract":"NASA is going to Mars.", "concept":["NASA", "MARS"]}' 27 | self.d.write("test.json", data) 28 | self.ce.from_corpus(Path(f"{self.d.path}/test.json"), "concept") 29 | 30 | def test_get_top_concepts(self): 31 | self.ce.concept_sets = [ 32 | ["MARS", "NASA"], 33 | ["NASA"], 34 | ["MARS"], 35 | ["HIT", "JUPITER"], 36 | ] 37 | self.assertDictEqual( 38 | self.ce.get_top_concepts(2), {"mars": [0, 2], "nasa": [0, 1]} 39 | ) 40 | 41 | def tearDown(self): 42 | self.d.cleanup() 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # remove credentials 2 | .eggs/* 3 | bandit_analysis.txt 4 | *.h5 5 | /commands/.env 6 | venv/* 7 | versions_and* 8 | my_env/* 9 | /volumes/* 10 | notebook/* 11 | cover/* 12 | .coverage 13 | !/volumes/in_data 14 | env/* 15 | .idea/* 16 | scratch/* 17 | models/* 18 | !models/.gitkeep 19 | config/* 20 | !config/test_config.yml 21 | 22 | *.env 23 | *.pkl 24 | *.npy 25 | *.tgz 26 | *.gz 27 | *.tar 28 | *.npz 29 | *.swp 30 | 31 | 32 | */.ipynb_checkpoints/* 33 | kubernetes-manifests/* 34 | sample_outdata/* 35 | __pycache__/* 36 | 37 | reports/ 38 | !reports/.gitkeep 39 | data/interim/* 40 | data/raw/* 41 | !data/interim/subj_mapping.json 42 | !data/raw/STI_public_metadata_records_sample100.jsonl 43 | volumes/big_data/* 44 | !volumes/big_data/.gitkeep 45 | tests/test_data/* 46 | !tests/test_data/.gitkeep 47 | !tests/test_data/results_small.json 48 | 49 | misc-ignore/* 50 | .ipynb_checkpoints/* 51 | 52 | notebook/src/* 53 | notebook/data/* 54 | 55 | # Byte-compiled / optimized / DLL files 56 | __pycache__/ 57 | *.py[cod] 58 | 59 | # C extensions 60 | *.so 61 | 62 | # Distribution / packaging 63 | bin/ 64 | 
build/ 65 | develop-eggs/ 66 | dist/ 67 | eggs/ 68 | lib/ 69 | lib64/ 70 | parts/ 71 | sdist/ 72 | var/ 73 | *.egg-info/ 74 | .installed.cfg 75 | *.egg 76 | 77 | # Installer logs 78 | pip-log.txt 79 | pip-delete-this-directory.txt 80 | 81 | # Unit test / coverage reports 82 | .tox/ 83 | .coverage 84 | .cache 85 | .hypothesis/* 86 | .pytest_cache/* 87 | nosetests.xml 88 | coverage.xml 89 | 90 | # Translations 91 | *.mo 92 | 93 | -------------------------------------------------------------------------------- /src/features.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import dsconcept.model as ml 5 | from multiprocessing import cpu_count 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.INFO) 10 | 11 | N_CPUS = cpu_count() 12 | BATCH_SIZE = 1000 13 | 14 | 15 | def main(in_corpus, abstract_field, out_features, batch_size, n_threads): 16 | LOG.info(f"Extracting features from corpus at {in_corpus}.") 17 | LOG.info(f"Using field: {abstract_field}.") 18 | fe = ml.FeatureExtractor() 19 | LOG.info(f"Using batch_size {batch_size} with {n_threads} threads.") 20 | LOG.info(f"Outputting processed features to {out_features}.") 21 | fe.from_corpus_to_jsonlines( 22 | in_corpus, out_features, abstract_field, batch_size, n_threads 23 | ) 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser( 28 | description="""Create features for each document in the processed corpus. 
29 | Each line in output file is a json formatted string 30 | with features and their types.""" 31 | ) 32 | parser.add_argument("i", help="input jsonlines corpus") 33 | parser.add_argument("f", help="abstract field") 34 | parser.add_argument("o", help="ouput jsonlines features") 35 | parser.add_argument( 36 | "-b", help="batch size for feature processing", default=BATCH_SIZE 37 | ) 38 | parser.add_argument( 39 | "-n", help="number of threads for features processing", default=N_CPUS 40 | ) 41 | args = parser.parse_args() 42 | main(args.i, args.f, args.o, args.b, args.n) 43 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | attrs==19.3.0 3 | backcall==0.1.0 4 | bleach==3.3.0 5 | blis==0.4.1 6 | cachetools==3.1.1 7 | catalogue==0.0.8 8 | certifi==2019.9.11 9 | chardet==3.0.4 10 | coverage==4.5.4 11 | cycler==0.10.0 12 | cymem==2.0.3 13 | cytoolz==0.10.1 14 | dask==2.8.1 15 | decorator==4.4.1 16 | defusedxml==0.6.0 17 | entrypoints==0.3 18 | h5py==2.10.0 19 | hypothesis==4.47.1 20 | idna==2.8 21 | importlib-metadata==0.23 22 | jedi==0.15.1 23 | jellyfish==0.7.2 24 | Jinja2==2.11.3 25 | joblib==0.14.0 26 | jsonschema==3.2.0 27 | kiwisolver==1.1.0 28 | MarkupSafe==1.1.1 29 | matplotlib==3.1.2 30 | mistune==0.8.4 31 | more-itertools==7.2.0 32 | murmurhash==1.0.2 33 | nbformat==4.4.0 34 | networkx==2.4 35 | nose==1.3.7 36 | numpy==1.17.4 37 | packaging==19.2 38 | pandas==0.25.3 39 | pandocfilters==1.4.2 40 | parso==0.5.1 41 | pexpect==4.7.0 42 | pickleshare==0.7.5 43 | plac==1.1.3 44 | pluggy==0.13.1 45 | preshed==3.0.2 46 | prometheus-client==0.7.1 47 | prompt-toolkit==2.0.10 48 | ptyprocess==0.6.0 49 | py==1.10.0 50 | pyemd==0.5.1 51 | Pygments==2.7.4 52 | pyparsing==2.4.5 53 | Pyphen==0.9.5 54 | pyrsistent==0.15.6 55 | python-dateutil==2.8.1 56 | pytz==2019.3 57 | PyYAML==5.4 58 | pyzmq==18.1.1 59 | requests==2.22.0 
60 | scikit-learn==0.21.3 61 | scipy==1.3.3 62 | Send2Trash==1.5.0 63 | six==1.13.0 64 | spacy==2.2.3 65 | srsly==0.2.0 66 | terminado==0.8.3 67 | testpath==0.4.4 68 | textacy==0.9.1 69 | thinc==7.3.1 70 | toolz==0.10.0 71 | tornado==6.0.3 72 | tqdm==4.39.0 73 | traitlets==4.3.3 74 | urllib3==1.26.5 75 | wasabi==0.4.0 76 | wcwidth==0.1.7 77 | webencodings==0.5.1 78 | widgetsnbextension==3.5.1 79 | zipp==0.6.0 80 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | Releases page is here. 5 | 6 | ## [Unreleased] 7 | 8 | 9 | ## [v1.0.3-open_source_release] - 2020-06-10 10 | #### Added: 11 | Original open-source release of this repository on github.com/nasa after having received SRA (software release authority) approval. 12 | 13 | 14 | 15 | # Guidelines for ChangeLog Entries 16 | 17 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 18 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 19 | 20 | ### Guiding Principles 21 | - Changelogs are for humans, not machines. 22 | - There should be an entry for every single version. 23 | - The same types of changes should be grouped. 24 | - Versions and sections should be linkable. 25 | - The latest version comes first. 26 | - The release date of each version is displayed. 27 | 28 | ### All Entries Sould be Under One of These Types of changes 29 | - Added for new features. 30 | - Changed for changes in existing functionality. 31 | - Deprecated for soon-to-be removed features. 32 | - Removed for now removed features. 33 | - Fixed for any bug fixes. 34 | - Security in case of vulnerabilities. 
35 | 36 | Google technical writer Sarah Maddox gave the following advice about release notes: 37 | `“The most important function of release notes is to let customers know that something has changed in the product, particularly when that something may affect the way the customer uses the product.”` 38 | -------------------------------------------------------------------------------- /tests/test_extract_from_doc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from dsconcept.model import * 3 | 4 | import logging 5 | 6 | logging.basicConfig(level=logging.WARNING) 7 | logging.disable(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.WARNING) 10 | 11 | 12 | class TestExtractFromDoc(unittest.TestCase): 13 | def setUp(self): 14 | self.nlp = spacy.load("en_core_web_sm") 15 | self.doc = nlp( 16 | """The NASA Scientific and Technical Information (STI) Program was established to support the 17 | objectives of NASA’s missions and research. The Mission of the STI Program is to support the 18 | advancement of aerospace knowledge and contribute to U.S. competitiveness in aerospace research and 19 | development. This program is essential to help NASA avoid duplication of research by sharing 20 | information and to ensure that the U.S. maintains its preeminence in aerospace-related industries 21 | and education. 
The NASA STI Program acquires, processes, archives, announces, and disseminates 22 | NASA STI and acquires worldwide STI of critical importance to the 23 | National Aeronautics and Space Administation (NASA) and the Nation.""" 24 | ) 25 | self.terms_tagged = extract_from_doc(self.doc) 26 | 27 | def test_is_set(self): 28 | self.assertEqual(dict, type(self.terms_tagged)) 29 | 30 | def test_has_terms(self): 31 | self.assertGreater(len(self.terms_tagged), 0) 32 | 33 | def test_has_all_feature_types(self): 34 | self.term_types = {term_type for term, term_type in self.terms_tagged.items()} 35 | LOG.info(self.term_types) 36 | LOG.info(self.terms_tagged) 37 | self.assertEqual( 38 | {"NOUN", "PROPN", "NOUN_CHUNK", "ENT", "ACRONYM"}, self.term_types 39 | ) 40 | 41 | 42 | if __name__ == "__main__": 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /src/concepts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import dsconcept.model as ml 5 | 6 | logging.basicConfig(level=logging.INFO) 7 | LOG = logging.getLogger(__name__) 8 | LOG.setLevel(logging.INFO) 9 | 10 | 11 | def main( 12 | in_corpus, 13 | concept_field, 14 | cat_field, 15 | out_indices, 16 | out_cat_indices, 17 | out_raw2lemma, 18 | out_cat_raw2lemma, 19 | ): 20 | LOG.info(f"Corpus: {in_corpus}") 21 | LOG.info(f"Keyword Field: {concept_field}") 22 | LOG.info(f"Category Field: {cat_field}") 23 | 24 | ce = ml.ConceptExtractor() 25 | ce.from_corpus(in_corpus, concept_field) 26 | LOG.info(f"Output keyword indices: {out_indices}") 27 | LOG.info(f"Output keyword raw2lemma: {out_raw2lemma}") 28 | ce.to_jsons(out_indices, out_raw2lemma) 29 | 30 | LOG.info(f"Extracting categories.") 31 | ce_higher = ml.ConceptExtractor() 32 | ce_higher.from_corpus(in_corpus, cat_field) 33 | LOG.info(f"Output category indices: {out_cat_indices}") 34 | LOG.info(f"Output category raw2lemma: 
{out_cat_raw2lemma}") 35 | ce_higher.to_jsons(out_cat_indices, out_cat_raw2lemma) 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser( 40 | description="""Get indices of processed corpus for all of concept and category 41 | tags. Also get lemmas for these concepts and categories. Output all of this 42 | information to json files.""" 43 | ) 44 | parser.add_argument("i", help="input processed jsonlines corpus") 45 | parser.add_argument("k", help="concept field") 46 | parser.add_argument("c", help="concept field") 47 | parser.add_argument("ok", help="output indices for concepts") 48 | parser.add_argument("oc", help="output indices for categories") 49 | parser.add_argument("rk", help="out keyword raw to lemma mapping") 50 | parser.add_argument("rc", help="out category raw to lemma mapping") 51 | args = parser.parse_args() 52 | main(args.i, args.k, args.c, args.ok, args.oc, args.rk, args.rc) 53 | -------------------------------------------------------------------------------- /src/make_vec_and_matrix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | import joblib 6 | import numpy as np 7 | import yaml 8 | from sklearn.feature_extraction import DictVectorizer 9 | from sklearn.model_selection import train_test_split 10 | 11 | import dsconcept.model as ml 12 | 13 | logging.basicConfig(level=logging.INFO) 14 | LOG = logging.getLogger(__name__) 15 | LOG.setLevel(logging.INFO) 16 | 17 | VECTORIZER = "vectorizer.jbl" 18 | FEATURE_MATRIX = "feature_matrix.jbl" 19 | 20 | 21 | def main(in_features, in_config, out_feature_dir, out_vectorizer): 22 | with open(in_config, "r") as f0: 23 | config = yaml.safe_load(f0) 24 | 25 | LOG.info(f"Loading features from {in_features}.") 26 | fe = ml.FeatureExtractor() 27 | fe.from_jsonlines(in_features) 28 | weighted_features = fe.weight_terms(config["weights"]) 29 | limited_features = fe.limit_features( 30 | 
weighted_features, 31 | config["min_feature_occurrence"], 32 | config["max_feature_occurrence"], 33 | ) 34 | v = DictVectorizer() 35 | X = v.fit_transform(limited_features) 36 | 37 | out_feature_matrix = out_feature_dir / FEATURE_MATRIX 38 | LOG.info(f"Outputting vectorizer to {out_vectorizer}.") 39 | joblib.dump(v, out_vectorizer) 40 | LOG.info(f"Outputting feature matrix to {out_feature_matrix}.") 41 | joblib.dump(X, out_feature_matrix) 42 | 43 | _, _, ind_train, ind_test = train_test_split( 44 | X, np.array(range(X.shape[0])), test_size=0.10, random_state=42 45 | ) 46 | np.save(out_feature_dir / f"train_inds.npy", ind_train) 47 | np.save(out_feature_dir / f"test_inds.npy", ind_test) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser( 52 | description="""From features file, create a feature matrix and vectorizer 53 | which translates between columns of the matrix and feature strings. Limit 54 | which features are included in these files with configuration.""" 55 | ) 56 | parser.add_argument("in_features", help="input features jsonlines file") 57 | parser.add_argument("in_config", help="configuration for creating models") 58 | parser.add_argument( 59 | "out_feature_dir", 60 | help="output directory for feature matrix and indices", 61 | type=Path, 62 | ) 63 | parser.add_argument( 64 | "out_vectorizer", help="output path for for vectorizer", type=Path, 65 | ) 66 | # TODO: split outputs 67 | args = parser.parse_args() 68 | 69 | main(args.in_features, args.in_config, args.out_feature_dir, args.out_vectorizer) 70 | -------------------------------------------------------------------------------- /src/process.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import json 4 | 5 | import pandas as pd 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | LOG = logging.getLogger(__name__) 9 | LOG.setLevel(logging.INFO) 10 | 11 | 12 | def main(infile, in_subj_mapping, 
outfile): 13 | LOG.info(f"Reading corpus from {infile}.") 14 | df = pd.read_json(infile, orient="records", lines=True) 15 | LOG.info(f"Shape of input: {df.shape}") 16 | 17 | with open(in_subj_mapping, "r") as f0: 18 | subj_mapping = json.load(f0) 19 | 20 | def get_subjs(x): 21 | if type(x) == list: 22 | s = set( 23 | subj_mapping[s.strip().lower()] 24 | for s in x 25 | if s.strip().lower() in subj_mapping 26 | ) 27 | l = list(s) 28 | else: 29 | l = None 30 | return l 31 | 32 | categories = df["D072B (Subject Category)"].apply(get_subjs) 33 | 34 | text_col = ( 35 | " " + df["D245A (Title)"] + " <ABSTRACT> " + df["D520B (Abstract)"] 36 | ) 37 | keywords = ( 38 | df["D650A (NASA Major Indexing Terms)"] 39 | + df["D659A (NASA Minor Indexing Terms)"] 40 | ) 41 | 42 | pdf = pd.DataFrame() 43 | pdf["text"] = text_col 44 | pdf["keywords"] = keywords 45 | pdf["subjects"] = df["D072B (Subject Category)"] 46 | pdf["categories"] = categories 47 | 48 | def remove_no_abstracts(x): 49 | if type(x) == str: 50 | if "no abstract available" not in x.lower(): 51 | val = True 52 | else: 53 | val = False 54 | else: 55 | val = False 56 | return val 57 | 58 | has_abs = pdf["text"].apply(remove_no_abstracts) 59 | has_kwds = pdf["keywords"].apply(lambda x: type(x) is list) 60 | has_subj = pdf["keywords"].apply(lambda x: type(x) is list) # Should be subjects? 61 | has_cats = pdf["categories"].apply(lambda x: type(x) is list) 62 | tf = has_kwds & has_subj & has_cats & has_abs 63 | LOG.info(f"Removed {sum(~tf)} rows.") 64 | 65 | LOG.info(f"Outputting processed corpus to {outfile}.") 66 | pdf[tf].to_json(outfile, orient="records", lines=True) 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser( 71 | description="""Merge text and keyword fields from input corpus. 
class TestConceptTrainer(TestCase):
    """Smoke tests for dsconcept.train.ConceptTrainer on random data."""

    def setUp(self):
        # The trainer needs a ConceptExtractor plus a grid-searched estimator.
        # (An unused FeatureExtractor was previously constructed here; removed.)
        ce = dsconcept.model.ConceptExtractor()
        self.d = TempDirectory()
        data = b'{"abstract":["Astronauts are very cool."], "concept": ["ASTRONAUTS", "COOL THINGS"]}\n {"abstract":["NASA is going to Mars."], "concept":["NASA", "MARS"]}'
        self.d.write("test.json", data)
        self.corpus_path = f"{self.d.path}/test.json"
        s = 100
        # Random binary feature matrix (s docs x 2 features) and random labels.
        self.X = csc_matrix(np.random.randint(2, size=s * 2).reshape(s, 2))
        self.y = np.random.randint(2, size=s)
        paramgrid = {
            "alpha": [0.01, 0.001, 0.0001],
            "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}],
            "max_iter": [1],
            "loss": ["log"],
        }  # requires loss function with predict_proba
        clf = GridSearchCV(
            SGDClassifier(), paramgrid, scoring="f1"
        )  # requires GridSearchCV
        self.ct = dsconcept.train.ConceptTrainer(ce, clf)

    def test_create_concept_classifier(self):
        """Training a single concept should write a loadable model pickle."""
        out_dir = Path(f"{self.d.path}/models")
        out_dir.mkdir()
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.5, random_state=42
        )
        self.ct.create_concept_classifier(
            "test_concept", X_train, X_test, y_train, y_test, out_dir
        )
        clf = joblib.load(out_dir / "test_concept.pkl")
        LOG.info(clf)

    def test_train_all(self):  # This test is super naive. Does not check behaviour.
        # train_all is expected to leave train/test index arrays on disk.
        self.ct.train_all(self.X, Path(f"{self.d.path}/models"), 5)
        test_inds = np.load(Path(self.d.path) / "test_inds.npy")
        train_inds = np.load(Path(self.d.path) / "train_inds.npy")
        LOG.info(f"test_inds: {test_inds}")
        LOG.info(f"train_inds: {train_inds}")

    @given(arrays(dtype=np.float_, shape=1))
    def test_get_dispersed_subset(self, array):
        """The dispersed subset never exceeds the requested size."""
        subset = dsconcept.train.get_dispersed_subset(array, 5)
        self.assertLessEqual(len(subset), 5)
        LOG.info(subset)

    def tearDown(self):
        self.d.cleanup()
usage="$(basename "$0") [-h] [-i path] [-o path] [-d docker-image] [-l loglevel] [-c cpus]
Concept training pipeline

where:
    -h  show this help text
    -i  (absolute path) input data directory
    -o  (absolute path) output data directory
    -d  the docker image to use
    -l  the log level to use ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
    -c  number of cpus to allow container to use"

# Parse command line arguments.
# BUG FIX: the original case statement repeated a ':)' branch after every
# option; only the first pattern occurrence can ever match in a case, so
# the duplicates were dead code. A single ':' branch handles all missing
# arguments (getopts sets option=':' whenever an OPTARG is absent).
input=""
output=""
while getopts ':hi:o:d:l:c:' option; do
  case "$option" in
    h) echo "$usage"
       exit
       ;;
    i) input=$OPTARG
       ;;
    o) output=$OPTARG
       ;;
    d) image=$OPTARG
       ;;
    l) LOGLEVEL=$OPTARG
       ;;
    c) cpus=$OPTARG
       ;;
    :) printf "missing argument for -%s\n" "$OPTARG" >&2
       echo "$usage" >&2
       exit 1
       ;;
    \?) printf "illegal option: -%s\n" "$OPTARG" >&2
        echo "$usage" >&2
        exit 1
        ;;
  esac
done
shift $((OPTIND - 1))

# Validate arguments before touching docker.
if [ ! -d "${input}" ]; then
    echo "${input} directory does not exist. Choose a directory which exists and contains the requisite data."
    exit 1
fi
if [ -d "${output}" ]; then
    echo "${output} directory already exists. Choose a new directory name which does not exist."
    exit 1
fi
if [ -z "${image}" ]; then
    echo "No docker image given. Supply one with -d."
    exit 1
fi
if [ "${LOGLEVEL}" = "" ]; then
    echo "Setting empty LOGLEVEL to INFO."
    export LOGLEVEL="INFO"
fi
if [ "${cpus}" = "" ]; then
    # BUG FIX: this message previously said "Setting empty LOGLEVEL to INFO."
    echo "Setting empty cpus to 0.000 (no limit)."
    export cpus=0.000
fi

mkdir "${output}"

echo "Running full pipeline."
docker run -it \
    -v "${input}":/home/pipeline/volumes/in_data \
    -v "${output}":/home/pipeline/volumes/out_data \
    -e LOGLEVEL="${LOGLEVEL}" \
    --cpus="${cpus}" \
    "${image}" bash -c 'bash pipeline/start.sh'
echo "Completed Pipeline."
self.assertEqual(len(self.fe.features), len(d)) 43 | 44 | def test_from_corpus_to_jsonlines(self): 45 | self.fe.from_corpus_to_jsonlines( 46 | self.corpus_path, f"{self.d.path}/features.jsonl", "abstract", 47 | ) 48 | 49 | def test_from_jsonlines(self): 50 | data = b'{"astronaut":"NOUN", "space": "NOUN", "NASA": "ENT"}\n{"Mars": "PROPN", "dog": "NOUN"}' 51 | features_out = "features.jsonl" 52 | self.d.write(features_out, data) 53 | self.fe.from_jsonlines(f"{self.d.path}/{features_out}") 54 | self.assertSetEqual(self.fe.term_types, {"NOUN", "PROPN", "ENT"}) 55 | 56 | def test_to_jsonlines(self): 57 | self.fe.features = [ 58 | {"space": "NOUN", "Mars": "PROPN"}, 59 | {"Anita": "PROPN", "Adams": "PROPN"}, 60 | ] 61 | out_features = "features.jsonl" 62 | self.fe.to_jsonlines(f"{self.d.path}/{out_features}") 63 | 64 | @given(features(), weights()) 65 | def test_weight_terms(self, d, w): 66 | self.fe.features = d 67 | self.fe.weight_terms(w) 68 | 69 | @given(features(), weights()) 70 | def test_limit_features(self, d, w): 71 | self.fe.features = d 72 | weighted_features = self.fe.weight_terms( 73 | w 74 | ) # Test method contingent upon another test. Bad? 
def main(
    in_feature_matrix,
    in_ind_train,
    in_ind_test,
    in_cat_indices,
    in_cat_raw2lemma,
    in_config,
    out_dir,
):
    """Train one grid-searched SGD classifier per category.

    Loads the sparse feature matrix and the saved train/test index arrays,
    rebuilds the category ConceptExtractor from its json artifacts, then
    delegates per-category training to ConceptTrainer.
    """
    with open(in_config, "r") as f0:
        config = yaml.safe_load(f0)

    feature_matrix = joblib.load(in_feature_matrix)
    train_index = np.load(in_ind_train)
    test_index = np.load(in_ind_test)

    LOG.info(
        f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}."
    )
    cat_ext = ml.ConceptExtractor()
    cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma)

    # SGD with log loss so the fitted models expose predict_proba.
    param_grid = {
        "alpha": [0.01, 0.001, 0.0001],
        "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}],
        "max_iter": [1],
        "loss": ["log"],
    }
    grid_search = GridSearchCV(SGDClassifier(), param_grid, scoring="f1")
    trainer = ConceptTrainer(cat_ext, grid_search)

    trainer.train_concepts(
        feature_matrix,
        train_index,
        test_index,
        out_dir / OUT_MODELS_DIR,
        config["min_concept_occurrence"],
    )
    LOG.info("Complete.")
def main(in_records, inds_loc, out_records_dir):
    """Write train/dev/test CSVs plus label maps for BERT category training.

    Splits the existing train indices into bert train/dev subsets (saving the
    new index arrays alongside the originals), binarizes the ``categories``
    column, and writes one CSV per split with a leading ``text`` column
    followed by one 0/1 column per category. Also writes ``classes.txt``,
    ``id_to_label.json``, and ``label_to_id.json``.

    Args:
        in_records: path to the cleaned jsonlines records.
        inds_loc: directory holding train_inds.npy / test_inds.npy.
        out_records_dir: output directory (created if missing).
    """
    train_inds_loc = inds_loc / "train_inds.npy"
    test_inds_loc = inds_loc / "test_inds.npy"
    train_bert_inds_loc = inds_loc / "train_bert_inds.npy"
    dev_bert_inds_loc = inds_loc / "dev_bert_inds.npy"

    LOG.info(f"Loading cleaned records from {in_records}.")
    records = pd.read_json(in_records, orient="records", lines=True)
    train_inds = np.load(train_inds_loc)
    test_inds = np.load(test_inds_loc)

    LOG.info("Creating bert cat df format.")
    mlb = MultiLabelBinarizer()
    cat_bin_array = mlb.fit_transform(records["categories"])
    cat_df = pd.DataFrame(cat_bin_array)
    cat_df.columns = mlb.classes_
    cat_df["text"] = records["text"]
    # Put the text first, then one binary column per category.
    cat_df = cat_df[["text"] + mlb.classes_.tolist()]

    # Carve a dev set out of the existing train split; persist the index
    # arrays so downstream evaluation can reproduce the exact split.
    train_bert_inds, dev_bert_inds = train_test_split(train_inds, test_size=0.25)
    np.save(train_bert_inds_loc, train_bert_inds)
    np.save(dev_bert_inds_loc, dev_bert_inds)

    ml_sets = {
        "train": cat_df.iloc[train_bert_inds],
        "test": cat_df.iloc[test_inds],
        "dev": cat_df.iloc[dev_bert_inds],
    }

    # FIX: also create missing parent directories instead of failing.
    out_records_dir.mkdir(parents=True, exist_ok=True)
    for set_type, ml_set in ml_sets.items():
        outfile = out_records_dir / f"{set_type}.csv"
        LOG.info(f"Writing to {outfile}")
        ml_set.to_csv(outfile, index=True)

    out_id_to_label = str(out_records_dir / "id_to_label.json")
    out_label_to_id = str(out_records_dir / "label_to_id.json")
    out_classes = str(out_records_dir / "classes.txt")

    id_to_label = {i: c for i, c in enumerate(mlb.classes_)}
    label_to_id = {c: i for i, c in enumerate(mlb.classes_)}

    LOG.info(f"Writing classes to {out_classes}")
    # One class name per line, no trailing newline.
    classes = mlb.classes_.tolist()
    with open(out_classes, "w") as f0:
        f0.write("\n".join(c.strip() for c in classes))

    LOG.info(f"Writing to {out_id_to_label}.")
    with open(out_id_to_label, "w") as f0:
        json.dump(id_to_label, f0)  # NOTE: json converts int keys to strings

    LOG.info(f"Writing to {out_label_to_id}.")
    with open(out_label_to_id, "w") as f0:
        json.dump(label_to_id, f0)
np.load(f"data/interim/{experiment_name}/test_inds.npy") 39 | feature_matrix = joblib.load(f"data/interim/{experiment_name}/feature_matrix.jbl") 40 | in_cat_models = Path(f"models/{experiment_name}/categories/models/") 41 | in_kwd_models = Path(f"models/{experiment_name}/keywords/models/") 42 | cat_preds = np.load(in_cat_preds) # based on experiment or explicit path? 43 | cat_clfs = load_category_models(in_cat_models) 44 | cd = load_concept_models(in_kwd_models) 45 | clf = HierarchicalClassifier(cat_clfs, cd) 46 | 47 | if limit is not None: 48 | LOG.info(f"Limiting to {limit} test records.") 49 | feature_matrix_test = feature_matrix.tocsc()[test_inds[0:limit], :] 50 | cat_preds = cat_preds[0:limit, :] 51 | # TODO: How does this affect indices? 52 | else: 53 | feature_matrix_test = feature_matrix.tocsc()[test_inds, :] 54 | 55 | LOG.info(f'Synthesizing predictions with strategy "{synth_strat}".') 56 | all_cat_inds = get_cat_inds(clf.categories, cat_preds, t=t) 57 | if con_limit is not None: 58 | conwc = clf.concepts_with_classifiers[0:con_limit] 59 | else: 60 | conwc = clf.concepts_with_classifiers 61 | shape = (feature_matrix_test.shape[0], len(conwc)) 62 | with tqdm(total=shape[0]) as pbar: 63 | get_synth_preds( 64 | out_store, 65 | shape, 66 | all_cat_inds, 67 | clf.categories, 68 | synth_batch_size, 69 | only_cat=False, 70 | synth_strat=synth_strat, 71 | con_limit=con_limit, 72 | limit=limit, 73 | pbar=pbar, 74 | ) 75 | 76 | LOG.info("Obtaining metrics.") 77 | with h5py.File(out_store, "r") as f0: 78 | if limit is not None: 79 | target_values = f0["ground_truth"][0:limit, :] 80 | else: 81 | target_values = f0["ground_truth"].value 82 | with h5py.File(out_store, "r") as f0: 83 | synth_preds = f0["synthesis"].value 84 | 85 | jobs = [] 86 | mets_pbar = tqdm( 87 | range(len(conwc)), 88 | total=len(conwc), 89 | ) 90 | for i in mets_pbar: 91 | job = dask.delayed(get_mets)( 92 | i, synth_preds, target_values, conwc, mets_pbar 93 | ) 94 | jobs.append(job) 95 | records = 
dask.compute(jobs) 96 | new_recs_df = pd.DataFrame(records[0]) 97 | LOG.info(f"Saving results to {out_synth_scores}.") 98 | new_recs_df.to_csv(out_synth_scores) 99 | 100 | 101 | if __name__ == "__main__": 102 | parser = argparse.ArgumentParser(description="Say hello") 103 | parser.add_argument("--experiment_name", help="input txt file") 104 | parser.add_argument("--synth_strat", help="input txt file") 105 | parser.add_argument("--in_cat_preds", help="input txt file") 106 | parser.add_argument("--store", help="input txt file") 107 | parser.add_argument("--synth_batch_size", help="input txt file", type=int) 108 | parser.add_argument("--threshold", help="input txt file", type=float) 109 | parser.add_argument("--out_synth_scores", help="input txt file") 110 | parser.add_argument( 111 | "--limit", help="size for sample to test synthesis", type=int, default=None 112 | ) 113 | parser.add_argument( 114 | "--con_limit", help="size for concept sample", type=int, default=None 115 | ) 116 | args = parser.parse_args() 117 | main( 118 | args.experiment_name, 119 | args.synth_strat, 120 | args.in_cat_preds, 121 | args.store, 122 | args.synth_batch_size, 123 | args.threshold, 124 | args.out_synth_scores, 125 | args.limit, 126 | args.con_limit, 127 | ) 128 | -------------------------------------------------------------------------------- /data/interim/subj_mapping.json: -------------------------------------------------------------------------------- 1 | {"environment pollution": "geosciences", "energy production and conversion": "geosciences", "oceanography": "geosciences", "geophysics": "geosciences", "earth resources and remote sensing": "geosciences", "geosciences (general)": "geosciences", "meteorology and climatology": "geosciences", "spacecraft design, testing and performance": "astronautics", "astrodynamics": "astronautics", "astronautics (general)": "astronautics", "ground support systems and facilities (space)": "astronautics", "launch vehicles and launch operations": 
"astronautics", "space transportation and safety": "astronautics", "spacecraft instrumentation and astrionics": "astronautics", "spacecraft propulsion and power": "astronautics", "space communications, spacecraft communications, command and tracking": "astronautics", "space transportation": "astronautics", "spacecraft instrumentation": "astronautics", "launch vehicles and space vehicles": "astronautics", "physics (general)": "physics", "plasma physics": "physics", "optics": "physics", "nuclear physics": "physics", "acoustics": "physics", "solid-state physics": "physics", "atomic and molecular physics": "physics", "physics of elementary particles and fields": "physics", "nuclear and high-energy physics": "physics", "thermodynamics and statistical physics": "physics", "astronomy": "space sciences", "solar physics": "space sciences", "lunar and planetary science and exploration": "space sciences", "space radiation": "space sciences", "astrophysics": "space sciences", "space sciences (general)": "space sciences", "lunar and planetary exploration": "space sciences", "space biology": "space sciences", "inorganic, organic and physical chemistry": "chemistry and materials", "space processing": "chemistry and materials", "chemistry and materials (general)": "chemistry and materials", "propellants and fuels": "chemistry and materials", "nonmetallic materials": "chemistry and materials", "metals and metallic materials": "chemistry and materials", "composite materials": "chemistry and materials", "materials processing": "chemistry and materials", "metallic materials": "chemistry and materials", "inorganic and physical chemistry": "chemistry and materials", "materials": "chemistry and materials", "research and support facilities (air)": "aeronautics", "avionics and aircraft instrumentation": "aeronautics", "aircraft communications and navigation": "aeronautics", "aircraft propulsion and power": "aeronautics", "aerodynamics": "aeronautics", "aeronautics (general)": 
"aeronautics", "air transportation and safety": "aeronautics", "aircraft design, testing and performance": "aeronautics", "aircraft stability and control": "aeronautics", "aircraft instrumentation": "aeronautics", "economics and cost analysis": "social and information sciences", "documentation and information science": "social and information sciences", "technology utilization and surface transportation": "social and information sciences", "administration and management": "social and information sciences", "law, political science and space policy": "social and information sciences", "social and information sciences (general)": "social and information sciences", "social sciences (general)": "social and information sciences", "statistics and probability": "mathematical and computer sciences", "computer operations and hardware": "mathematical and computer sciences", "computer programming and software": "mathematical and computer sciences", "computer systems": "mathematical and computer sciences", "cybernetics, artificial intelligence and robotics": "mathematical and computer sciences", "mathematical and computer sciences (general)": "mathematical and computer sciences", "numerical analysis": "mathematical and computer sciences", "systems analysis and operations research": "mathematical and computer sciences", "theoretical mathematics": "mathematical and computer sciences", "cybernetics": "mathematical and computer sciences", "systems analysis": "mathematical and computer sciences", "mathematics and information sciences": "mathematical and computer sciences", "general.": "general", "general": "general", "communications and radar": "engineering", "engineering (general)": "engineering", "electronics and electrical engineering": "engineering", "fluid mechanics and thermodynamics": "engineering", "instrumentation and photography": "engineering", "lasers and masers": "engineering", "mechanical engineering": "engineering", "quality assurance and reliability": "engineering", 
"structural mechanics": "engineering", "fluid mechanics and heat transfer": "engineering", "behavioral sciences": "life sciences", "aerospace medicine": "life sciences", "man/system technology and life support": "life sciences", "exobiology": "life sciences", "life sciences (general)": "life sciences", "life sciences": "life sciences"} -------------------------------------------------------------------------------- /src/make_kwd_models.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | import joblib 6 | import numpy as np 7 | import yaml 8 | from sklearn.linear_model import SGDClassifier 9 | from sklearn.model_selection import GridSearchCV 10 | 11 | import dsconcept.model as ml 12 | import dsconcept.train as tr 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | LOG = logging.getLogger(__name__) 16 | LOG.setLevel(logging.DEBUG) 17 | 18 | VECTORIZER = "vectorizer.jbl" 19 | FEATURE_MATRIX = "feature_matrix.jbl" 20 | OUT_MODELS_DIR = "models/topic_" 21 | 22 | 23 | def main( 24 | in_feature_matrix, 25 | in_ind_train, 26 | in_ind_test, 27 | in_kwd_indices, 28 | in_cat_indices, 29 | in_kwd_raw2lemma, 30 | in_cat_raw2lemma, 31 | in_config, 32 | out_dir, 33 | topics=True, 34 | ): 35 | with open(in_config, "r") as f0: 36 | config = yaml.safe_load(f0) 37 | 38 | X = joblib.load(in_feature_matrix) 39 | ind_train = np.load(in_ind_train) 40 | ind_test = np.load(in_ind_test) 41 | 42 | LOG.info(f"Loading keyword extractor from {in_kwd_indices} and {in_kwd_raw2lemma}.") 43 | ce = ml.ConceptExtractor() 44 | ce.from_jsons(in_kwd_indices, in_kwd_raw2lemma) 45 | 46 | LOG.info( 47 | f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}." 
48 | ) 49 | cat_ext = ml.ConceptExtractor() 50 | cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma) 51 | 52 | paramgrid = { 53 | "alpha": [0.01, 0.001, 0.0001], 54 | "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}], 55 | "max_iter": [5], 56 | "loss": ["log"], 57 | } # requires loss function with predict_proba 58 | clf = GridSearchCV( 59 | SGDClassifier(), paramgrid, scoring="f1", n_jobs=-1, 60 | ) # requires GridSearchCV 61 | out_models = out_dir / OUT_MODELS_DIR 62 | trainer = tr.ConceptTrainer(ce, clf) 63 | doc_topic_indices = cat_ext.concept_index_mapping 64 | 65 | if topics: 66 | LOG.info( 67 | f"Training one set for each of {len(doc_topic_indices)} topics divisions." 68 | ) 69 | for topic, doc_topic_index in doc_topic_indices.items(): 70 | trainer.train_concepts( 71 | X, 72 | ind_train, 73 | ind_test, 74 | out_models, 75 | config["min_concept_occurrence"], 76 | topic, 77 | doc_topic_index, 78 | ) 79 | LOG.info("Training one general set") 80 | trainer.train_concepts( 81 | X, ind_train, ind_test, out_models, config["min_concept_occurrence"] 82 | ) 83 | LOG.info("Complete.") 84 | 85 | 86 | if __name__ == "__main__": 87 | parser = argparse.ArgumentParser( 88 | description="""Use feature matrix and location of indices to create classifiers 89 | for the concepts in the corpus.""" 90 | ) 91 | parser.add_argument( 92 | "in_feature_matrix", help="input scipy sparse matrix of features" 93 | ) 94 | parser.add_argument("in_ind_train", help="train set index") 95 | parser.add_argument("in_ind_test", help="test set index") 96 | parser.add_argument("in_kwd_indices", help="keyword indicies") 97 | parser.add_argument("in_cat_indices", help="category indices") 98 | parser.add_argument("in_kwd_raw2lemma", help="keyword raw to lemma mapping") 99 | parser.add_argument("in_cat_raw2lemma", help="category raw to lemma mapping") 100 | parser.add_argument("in_config", help="configuration for creating models") 101 | parser.add_argument( 102 | "out_dir", 103 | help="output 
def init_domain_bert(base_model_dir, finetuned_model_dir, map_loc="cpu"):
    """Build a multilabel BERT classifier and load its finetuned weights.

    Args:
        base_model_dir: directory of the base (pretrained) BERT model.
        finetuned_model_dir: directory holding the label processor files and
            ``finetuned_pytorch_model.bin``.
        map_loc: torch map_location for loading the weights ("cpu" default).

    Returns:
        (classifier, processor) pair ready for prediction.
    """
    LOG.info("Loading BERT models")
    processor = mll.MultiLabelTextProcessor(finetuned_model_dir)
    classifier = mll.BERTMultilabelClassifier(
        processor, bert_model=base_model_dir, do_lower_case=False,
    )
    classifier.initialize_devices()
    weights_path = f"{finetuned_model_dir}/finetuned_pytorch_model.bin"
    classifier.load_model(weights_path, map_location=map_loc)
    return classifier, processor
def main(
    data_dir, models_dir, reports_dir, base_model_dir, finetuned_model_dir, sample,
):
    """Run the finetuned BERT category model over the test split and save
    its prediction matrix, column-aligned with the hierarchical classifier's
    category order, to ``reports_dir / "bert_cat_preds.npy"``.
    """
    test_inds = np.load(data_dir / "test_inds.npy")
    clean_data_loc = data_dir / "abs_kwds.jsonl"

    in_cat_models = models_dir / "categories/models/"
    in_kwd_models = models_dir / "keywords/models/"
    cat_raw2lemma_loc = models_dir / "cat_raw2lemma.json"

    out_preds_loc = reports_dir / "bert_cat_preds.npy"

    LOG.info("Loading models.")
    # The hierarchical classifier is loaded only to obtain the canonical
    # category ordering (clf.categories) used to align columns below.
    cat_clfs = gm.load_category_models(in_cat_models)
    cd = gm.load_concept_models(in_kwd_models)
    clf = gm.HierarchicalClassifier(cat_clfs, cd)
    with open(cat_raw2lemma_loc) as f0:
        cat_raw2lemma = json.load(f0)
    # base_model_dir = str(bert_models_dir / "cased_L-12_H-768_A-12")
    # processor_dir = str(bert_models_dir / "processor_dir")
    # finetuned_model_loc = str(
    #     bert_models_dir / "cased_L-12_H-768_A-12/cache/finetuned_pytorch_model.bin"
    # )
    bert_clf, processor = init_domain_bert(base_model_dir, finetuned_model_dir,)

    LOG.info(f'Loading records from "{clean_data_loc}".')
    # Only the test-split lines are read from the jsonlines corpus.
    if sample is not None:
        lines_to_load = test_inds[0:sample]
    else:
        lines_to_load = test_inds
    records_df = load_lines_to_df(clean_data_loc, lines_to_load)

    LOG.info(f"Processing {len(records_df)} records.")
    # NOTE(review): the text column is named "test" and the second argument
    # to _create_examples is also "test" — presumably the dsbert processor's
    # expected contract for inference examples; confirm against dsbert.
    df_example = pd.DataFrame()
    df_example["test"] = records_df["text"]
    df_example["label"] = 0  # dummy label; inference only
    df_example = df_example.reset_index()
    sample_examples = processor._create_examples(df_example, "test")

    LOG.info("Making BERT category predictions.")
    topic_predictions_df = bert_clf.predict(sample_examples)

    LOG.info("Transforming predictions into matrix which aligns with categories.")
    # Columns after the first two hold per-category scores; map raw category
    # names to their lemmatized form, then reorder to match clf.categories.
    cols = topic_predictions_df.iloc[:, 2:].columns
    only_preds = topic_predictions_df.iloc[:, 2:]
    tcols = [cat_raw2lemma[c] if c in cat_raw2lemma else c for c in cols]
    only_preds.columns = tcols
    only_preds = only_preds[clf.categories[0:-1]]  # don't include '' cat
    only_pred_vals = only_preds.values

    LOG.info(f'Saving results to "{out_preds_loc}".')
    np.save(out_preds_loc, only_pred_vals)
HierarchicalClassifier, StubBestEstimator 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | LOG = logging.getLogger(__name__) 16 | LOG.setLevel(logging.INFO) 17 | 18 | 19 | class TestHierarchicalClassifier(TestCase): 20 | def setUp(self) -> None: 21 | self.d = TempDirectory() 22 | self.clf_loc = Path(self.d.path) / "stub.jbl" 23 | out_info = {'concept': 'something', 'best_estimator_': StubBestEstimator()} 24 | joblib.dump(out_info, self.clf_loc) 25 | cat_clfs = [ 26 | {"best_estimator_": StubBestEstimator(), "concept": "physics"}, 27 | {"best_estimator_": StubBestEstimator(), "concept": "video games"}, 28 | ] 29 | kwd_clfs = { 30 | ("physics", "gamma ray"): StubBestEstimator(), 31 | ("video games", "minecraft"): StubBestEstimator(), 32 | ("video games", "kerbal space program"): StubBestEstimator(), 33 | ("", "minecraft"): StubBestEstimator(), 34 | ("", "gamma ray"): StubBestEstimator(), 35 | ("", "penguins"): StubBestEstimator(), 36 | } 37 | kwd_clfs_locs = { 38 | ("physics", "gamma ray"): self.clf_loc, 39 | ("video games", "minecraft"): self.clf_loc, 40 | ("video games", "kerbal space program"): self.clf_loc, 41 | ("", "minecraft"): self.clf_loc, 42 | ("", "gamma ray"): self.clf_loc, 43 | ("", "penguins"): self.clf_loc, 44 | } 45 | self.hclf = HierarchicalClassifier(cat_clfs, kwd_clfs) 46 | self.hclf_locs = HierarchicalClassifier(cat_clfs, kwd_clfs_locs) 47 | self.feature_matrix = np.array([[0, 0, 1, 1], [0, 1, 0, 1], [1, 1, 0, 0]]) 48 | v = DictVectorizer() 49 | d = [{"astronauts": 1, "astronomy": 1}, {"space": 1, "basalt": 1}] 50 | v.fit(d) 51 | self.v = v 52 | 53 | def test_cat_clfs(self): 54 | cats = ["physics", "video games", ""] 55 | self.assertListEqual(self.hclf.categories, cats) 56 | 57 | def test_kwd_clfs(self): 58 | kwds = ["gamma ray", "kerbal space program", "minecraft", "penguins"] 59 | self.assertListEqual(self.hclf.concepts_with_classifiers.tolist(), kwds) 60 | 61 | def test_predict_categories(self): 62 | cat_preds = 
self.hclf.predict_categories(self.feature_matrix) 63 | self.assertEqual(cat_preds.shape, (3, 2)) 64 | print(cat_preds) 65 | 66 | def test__predict_one_clf(self): 67 | pred = self.hclf._predict_one_clf(self.feature_matrix, 1, "video games") 68 | self.assertEqual(pred.shape[0], 3) 69 | 70 | def test__predict_one_clf_locs(self): 71 | pred = self.hclf_locs._predict_one_clf(self.feature_matrix, 1, "video games") 72 | self.assertEqual(pred.shape[0], 3) 73 | 74 | def test__predict_keywords(self): 75 | cat_indices = {"physics": [0], "video games": [1, 2]} 76 | store = self.hclf._predict_keywords( 77 | self.feature_matrix, 78 | f"{self.d.path}/store.h5", 79 | cat_indices, 80 | only_no_topic=False, 81 | use_dask=False, 82 | ) 83 | with h5py.File(store, 'r') as f0: 84 | pred_array = f0["predictions"][()] 85 | LOG.info(pred_array) 86 | self.assertEqual(pred_array.shape, (3, 3, 4)) 87 | 88 | def test__predict_keywords_locs(self): 89 | cat_indices = {"physics": [0], "video games": [1, 2]} 90 | store = self.hclf_locs._predict_keywords( 91 | self.feature_matrix, 92 | f"{self.d.path}/store.h5", 93 | cat_indices, 94 | only_no_topic=False, 95 | use_dask=False, 96 | ) 97 | with h5py.File(store, 'r') as f0: 98 | pred_array = f0["predictions"][()] 99 | LOG.info(pred_array) 100 | self.assertEqual(pred_array.shape, (3, 3, 4)) 101 | 102 | def test_get_synth_preds(self): 103 | cat_indices = {"physics": [0], "video games": [1, 2]} 104 | store = self.hclf._predict_keywords( 105 | self.feature_matrix, 106 | f"{self.d.path}/store.h5", 107 | cat_indices, 108 | only_no_topic=False, 109 | use_dask=False, 110 | ) 111 | all_cat_inds = { 112 | "physics": np.array([0]), 113 | "video games": np.array([0, 1]), 114 | "": np.array([0, 1, 2]), 115 | } 116 | self.hclf.get_synth_preds( 117 | store, 118 | all_cat_inds, 119 | batch_size=10000, 120 | only_cat=False, 121 | synth_strat="mean", 122 | use_dask=False, 123 | ) 124 | with h5py.File(store) as f0: 125 | synth_array = f0["synthesis"].value 126 | 
LOG.info(synth_array) 127 | self.assertEqual(synth_array.shape, (3, 4)) 128 | 129 | def test__to_strings(self): 130 | synth_array = np.array( 131 | [[0, 0.51, 0.9, 0.2], [0.8, 0.1, 0.4, 0.7], [0.4, 0.2, 0.1, 0.9]] 132 | ) 133 | kwd_strs = self.hclf._to_strings( 134 | self.hclf.concepts_with_classifiers, synth_array, t=0.5 135 | ) 136 | results = [ 137 | [("minecraft", 0.9), ("kerbal space program", 0.51)], 138 | [("gamma ray", 0.8), ("penguins", 0.7)], 139 | [("penguins", 0.9)], 140 | ] 141 | self.assertEqual(results, kwd_strs) 142 | LOG.info(kwd_strs) 143 | 144 | def test_predict(self): 145 | examples = [ 146 | "Olympus Mons is the largest volcano in the solar system", 147 | "Database management is critical for information retrieval", 148 | "We used a logistic regression with batched stochastic gradient descent.", 149 | ] 150 | weights = {"NOUN": 1, "PROPN": 1, "ENT": 1, "NOUN_CHUNK": 1, "ACRONYM": 1} 151 | self.hclf.vectorizer = self.v 152 | features, feature_matrix = self.hclf.vectorize(examples, weights) 153 | self.hclf.predict(feature_matrix) 154 | 155 | 156 | if __name__ == "__main__": 157 | unittest.main() -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | import os 16 | import sys 17 | 18 | sys.path.insert(0, os.path.abspath("../src")) 19 | sys.path.insert(0, os.path.abspath("../src/dsconcept")) 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = "Research Access" 24 | copyright = "2018, Anthony Buonomo" 25 | author = "Anthony Buonomo" 26 | 27 | # The short X.Y version 28 | version = "" 29 | # The full version, including alpha/beta/rc tags 30 | release = "3.0.0" 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | "sphinx.ext.autodoc", 44 | "sphinx.ext.todo", 45 | "sphinx.ext.napoleon", 46 | ] 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ["_templates"] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = ".rst" 56 | 57 | # The master toctree document. 58 | master_doc = "index" 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = None 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 
73 | pygments_style = None 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = "alabaster" 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 86 | # 87 | # html_theme_options = {} 88 | 89 | # Add any paths that contain custom static files (such as style sheets) here, 90 | # relative to this directory. They are copied after the builtin static files, 91 | # so a file named "default.css" will overwrite the builtin "default.css". 92 | html_static_path = ["_static"] 93 | 94 | # Custom sidebar templates, must be a dictionary that maps document names 95 | # to template names. 96 | # 97 | # The default sidebars (for documents that don't match any pattern) are 98 | # defined by theme itself. Builtin themes are using these templates by 99 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 100 | # 'searchbox.html']``. 101 | # 102 | # html_sidebars = {} 103 | 104 | 105 | # -- Options for HTMLHelp output --------------------------------------------- 106 | 107 | # Output file base name for HTML help builder. 108 | htmlhelp_basename = "ResearchAccessdoc" 109 | 110 | 111 | # -- Options for LaTeX output ------------------------------------------------ 112 | 113 | latex_elements = { 114 | # The paper size ('letterpaper' or 'a4paper'). 115 | # 116 | # 'papersize': 'letterpaper', 117 | # The font size ('10pt', '11pt' or '12pt'). 118 | # 119 | # 'pointsize': '10pt', 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | # Latex figure (float) alignment 124 | # 125 | # 'figure_align': 'htbp', 126 | } 127 | 128 | # Grouping the document tree into LaTeX files. 
List of tuples 129 | # (source start file, target name, title, 130 | # author, documentclass [howto, manual, or own class]). 131 | latex_documents = [ 132 | ( 133 | master_doc, 134 | "ResearchAccess.tex", 135 | "Research Access Documentation", 136 | "Anthony Buonomo", 137 | "manual", 138 | ), 139 | ] 140 | 141 | 142 | # -- Options for manual page output ------------------------------------------ 143 | 144 | # One entry per manual page. List of tuples 145 | # (source start file, name, description, authors, manual section). 146 | man_pages = [ 147 | (master_doc, "researchaccess", "Research Access Documentation", [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ---------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | ( 158 | master_doc, 159 | "ResearchAccess", 160 | "Research Access Documentation", 161 | author, 162 | "ResearchAccess", 163 | "One line description of project.", 164 | "Miscellaneous", 165 | ), 166 | ] 167 | 168 | 169 | # -- Options for Epub output ------------------------------------------------- 170 | 171 | # Bibliographic Dublin Core info. 172 | epub_title = project 173 | 174 | # The unique identifier of the text. This can be a ISBN number 175 | # or the project homepage. 176 | # 177 | # epub_identifier = '' 178 | 179 | # A unique identification for the text. 180 | # 181 | # epub_uid = '' 182 | 183 | # A list of files that should not be packed into the epub file. 184 | epub_exclude_files = ["search.html"] 185 | 186 | 187 | # -- Extension configuration ------------------------------------------------- 188 | 189 | # -- Options for todo extension ---------------------------------------------- 190 | 191 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
192 | todo_include_todos = True 193 | 194 | 195 | # -- User config 196 | autodoc_member_order = "bysource" 197 | -------------------------------------------------------------------------------- /src/make_plots.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from math import ceil 4 | from pathlib import Path 5 | from time import time 6 | 7 | import joblib 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import pandas as pd 11 | import yaml 12 | from scipy.stats import linregress 13 | from tqdm import tqdm 14 | 15 | import dsconcept.get_metrics as gm 16 | from dsconcept.get_metrics import get_keyword_results 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | LOG = logging.getLogger(__name__) 20 | LOG.setLevel(logging.INFO) 21 | 22 | 23 | def lim_concepts_and_plot(mean_df, tmp_df, fig_dir): 24 | LOG.info(f"tmp_df.shape={tmp_df.shape}") 25 | cat = tmp_df["category"].iloc[0] 26 | lim_mean_df = mean_df[np.in1d(mean_df["concept"], tmp_df["concept"])] 27 | lim_tmp_df = tmp_df[np.in1d(tmp_df["concept"], mean_df["concept"])] 28 | if lim_mean_df.shape[0] != lim_tmp_df.shape[0]: 29 | ValueError("Different df sizes") 30 | metrics = ["recall", "precision", "f1", "roc_auc"] 31 | for m in metrics: 32 | a = 0.3 33 | lim_tmp_df[m].hist(alpha=a, label=f"one_layer | cat={cat}") 34 | lim_mean_df[m].hist(alpha=a, label="mean") 35 | plt.legend() 36 | plt.title(m) 37 | fig_loc = fig_dir / f"{m}.png" 38 | LOG.info(f"Saving plot to {fig_loc}") 39 | plt.savefig(fig_loc) 40 | plt.clf() 41 | 42 | 43 | def load_classifier(in_cat_models, in_kwd_models, in_vectorizer): 44 | LOG.info(f"Loading category classifiers from {in_cat_models}.") 45 | in_clfs = list(in_cat_models.iterdir()) 46 | cat_clfs = [joblib.load(c) for c in tqdm(in_clfs)] 47 | 48 | LOG.info(f"Loading keyword classifiers from {in_kwd_models}.") 49 | cd = {} # expects no_topics with suffix '' 50 | for topic_dir in 
tqdm(in_kwd_models.iterdir()): 51 | in_clfs = list(topic_dir.iterdir()) 52 | clfs = [joblib.load(c) for c in in_clfs] # loads the classifiers 53 | topic_name = topic_dir.stem.split("_")[1] 54 | # depends on opinionated path format 55 | for c in clfs: 56 | cd[topic_name, c["concept"]] = c["best_estimator_"] 57 | 58 | hclf = gm.HierarchicalClassifier(cat_clfs, cd) 59 | hclf.load_vectorizer(in_vectorizer) 60 | return hclf 61 | 62 | 63 | def get_clf_times(hclf, small_res, weights, sizes): 64 | hl_strats = ["topics", "only_no_topic"] 65 | batch_size = 10_000_000 # TODO: remove batching 66 | hl_dicts = [] 67 | 68 | for hls in hl_strats: 69 | times = [] 70 | out_sizes = [] 71 | for s_size in sizes: 72 | if s_size > small_res.shape[0]: 73 | LOG.warning(f"Skipping {s_size} because it is greater than data size.") 74 | continue 75 | examples = small_res["text"].sample(s_size) 76 | n_splits = ceil(examples.shape[0] / batch_size) 77 | t1 = time() 78 | for n in tqdm(range(n_splits)): 79 | start = n * batch_size 80 | end = (n + 1) * batch_size 81 | example_batch = examples[start:end] 82 | _, feature_matrix = hclf.vectorize(example_batch, weights) 83 | LOG.info(f"Predicting keywords") 84 | if hls == "only_no_topic": 85 | no_categories = True 86 | elif hls == "topics": 87 | no_categories = False 88 | else: 89 | LOG.exception(f"Invalid strategy selection: {hls}") 90 | _, _ = hclf.predict(feature_matrix, 0.5, 0.5, no_categories) 91 | t2 = time() 92 | tt = t2 - t1 93 | out_sizes.append(s_size) 94 | times.append(tt) 95 | hld = { 96 | "strat": hls, 97 | "times": times, 98 | "sizes": out_sizes, 99 | } 100 | hl_dicts.append(hld) 101 | return hl_dicts 102 | 103 | 104 | def make_time_plots(hl_dicts, out_plot_file): 105 | fig, axes = plt.subplots(1, 2, figsize=(15, 5)) 106 | for hd in hl_dicts: 107 | lg = linregress(hd["sizes"], hd["times"]) 108 | docs_per_sec = [s / t for s, t in zip(hd["sizes"], hd["times"])] 109 | a = np.array(hd["sizes"]) 110 | axes[0].plot(hd["sizes"], hd["times"], 
marker="o", label=hd["strat"]) 111 | axes[0].plot(hd["sizes"], lg.slope * a + lg.intercept, "r", alpha=0.5) 112 | axes[0].set_xlabel("number of docs") 113 | axes[0].set_ylabel("time to tag (seconds)") 114 | axes[0].set_title("Time to tag depending on batch size") 115 | axes[1].plot(hd["sizes"], docs_per_sec, marker="o", label=hd["strat"]) 116 | axes[1].set_xlabel("number of docs") 117 | axes[1].set_ylabel("tagging rate (docs/seconds)") 118 | axes[1].set_title("Tagging rate depending on batch size") 119 | axes[0].legend() 120 | axes[1].legend() 121 | plt.savefig(out_plot_file) 122 | plt.clf() 123 | 124 | 125 | def main( 126 | in_mean, 127 | in_cats_dir, 128 | in_kwds_dir, 129 | in_vectorizer, 130 | in_clean_data, 131 | in_config, 132 | out_plots_dir, 133 | ): 134 | LOG.info("Loading dataframes.") 135 | mean_df = pd.read_csv(in_mean, index_col=0) 136 | no_synth_df = get_keyword_results(in_kwds_dir) 137 | if no_synth_df.shape[0] == 0: 138 | raise ValueError( 139 | f"No keyword results. Are the subdirectories of {in_kwds_dir} empty?" 
140 | ) 141 | no_cat_df = no_synth_df[no_synth_df["category"] == ""] 142 | LOG.info("Making plots.") 143 | lim_concepts_and_plot(mean_df, no_cat_df, out_plots_dir) 144 | 145 | with open(in_config) as f0: 146 | config = yaml.safe_load(f0) 147 | hclf = load_classifier(in_cats_dir, in_kwds_dir, in_vectorizer) 148 | full_corpus = pd.read_json(in_clean_data, orient="records", lines=True) 149 | sizes = [1, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10_000] 150 | sample_size = min(max(sizes), full_corpus.shape[0]) 151 | 152 | small_res = pd.read_json(in_clean_data, orient="records", lines=True).sample( 153 | sample_size 154 | ) 155 | 156 | hl_dicts = get_clf_times(hclf, small_res, config["weights"], sizes) 157 | 158 | out_plots_time = out_plots_dir / "time_v_batch_size.png" 159 | make_time_plots(hl_dicts, out_plot_file=out_plots_time) 160 | 161 | 162 | if __name__ == "__main__": 163 | parser = argparse.ArgumentParser( 164 | description="""From output metrics, create plots for ROC-AUC, F1, precision, 165 | and recall""" 166 | ) 167 | parser.add_argument("--mean", help="results from synthesis with max strategy") 168 | parser.add_argument("--in_cats_dir", help="category classifiers dir", type=Path) 169 | parser.add_argument("--in_kwds_dir", help="kwds classifier models dir", type=Path) 170 | parser.add_argument("--in_vectorizer", help="vectorizer location", type=Path) 171 | parser.add_argument("--in_clean_data", help="clean code location", type=Path) 172 | parser.add_argument("--in_config", help="config location", type=Path) 173 | parser.add_argument("--out_plots_dir", help="output dir for plots pngs", type=Path) 174 | args = parser.parse_args() 175 | main( 176 | args.mean, 177 | args.in_cats_dir, 178 | args.in_kwds_dir, 179 | args.in_vectorizer, 180 | args.in_clean_data, 181 | args.in_config, 182 | args.out_plots_dir, 183 | ) 184 | -------------------------------------------------------------------------------- /src/pipeline/pipeline.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Pipeline 3 | -------- 4 | Program to make classifiers from input corpus and selected keyword field. 5 | 6 | Author: Anthony Buonomo 7 | Contact: anthony.r.buonomo@nasa.gov 8 | 9 | Full opinionated pipeline from processing, to topic_modeling, to training classifiers. 10 | """ 11 | 12 | import logging 13 | import warnings 14 | from pathlib import Path 15 | 16 | import plac 17 | import yaml 18 | from sklearn.decomposition import LatentDirichletAllocation 19 | import joblib 20 | from sklearn.feature_extraction import DictVectorizer 21 | from sklearn.linear_model import SGDClassifier 22 | from sklearn.model_selection import GridSearchCV 23 | 24 | import dsconcept.model as ml 25 | from dsconcept.train import ConceptTrainer 26 | 27 | warnings.filterwarnings("ignore", category=FutureWarning) 28 | logging.basicConfig(level=logging.INFO) 29 | LOG = logging.getLogger(__name__) 30 | LOG.setLevel(logging.INFO) 31 | 32 | FEATURES = Path("features.jsonl") 33 | INDICES = Path("indices.json") 34 | RAW2LEMMA = Path("raw2lemma.json") 35 | 36 | TOPIC_VECTORIZER = Path("vectorizer.pkl") 37 | TOPIC_FEATURE_MATRIX = Path("doc_feature_matrix.pkl") 38 | TOPIC_MODEL = Path("model.pkl") 39 | DOC_TOPIC_DISTR = Path("doc_topic_distr.pkl") 40 | 41 | VECTORIZER = Path("vectorizer.pkl") 42 | FEATURE_MATRIX = Path("doc_feature_matrix.pkl") 43 | OUT_MODELS_DIR = Path("classifiers") 44 | 45 | 46 | def process( 47 | in_corpus, out_dir, abstract_field, concept_field, term_types, batch_size, n_threads 48 | ): 49 | out_dir.mkdir(exist_ok=True, parents=True) 50 | out_features = out_dir / FEATURES 51 | out_indices = out_dir / INDICES 52 | out_raw2lemma = out_dir / RAW2LEMMA 53 | 54 | fe = ml.FeatureExtractor() 55 | fe.from_corpus_to_jsonlines( 56 | in_corpus, out_features, abstract_field, term_types, batch_size, n_threads 57 | ) 58 | 59 | ce = ml.ConceptExtractor() 60 | ce.from_corpus(in_corpus, concept_field) 61 
| ce.to_jsons(out_indices, out_raw2lemma) 62 | 63 | return fe, ce 64 | 65 | 66 | def topic_model( 67 | topic_model_dir, processed_dir, topic_weights, min_feature, max_feature 68 | ): 69 | topic_model_dir.mkdir(exist_ok=True) 70 | tfe = ml.FeatureExtractor() 71 | tfe.from_jsonlines(processed_dir / FEATURES) 72 | 73 | topic_weighted_features = tfe.weight_terms(topic_weights) 74 | topic_limited_features = tfe.limit_features( 75 | topic_weighted_features, min_feature, max_feature 76 | ) 77 | 78 | topic_v = DictVectorizer() 79 | topic_X = topic_v.fit_transform(topic_limited_features) 80 | 81 | model = LatentDirichletAllocation( 82 | n_components=3, 83 | max_iter=5, 84 | learning_method="online", 85 | learning_offset=50.0, 86 | random_state=0, 87 | ) 88 | doc_topic_distr = model.fit_transform(topic_X) 89 | 90 | out_vectorizer = topic_model_dir / TOPIC_VECTORIZER 91 | out_feature_matrix = topic_model_dir / TOPIC_FEATURE_MATRIX 92 | out_model = topic_model_dir / TOPIC_MODEL 93 | out_doc_topic_distr = topic_model_dir / DOC_TOPIC_DISTR 94 | 95 | joblib.dump(topic_v, out_vectorizer) 96 | joblib.dump(topic_X, out_feature_matrix) 97 | joblib.dump(model, out_model) 98 | joblib.dump(doc_topic_distr, out_doc_topic_distr) 99 | 100 | return doc_topic_distr 101 | 102 | 103 | def train( 104 | out_dir, 105 | process_dir, 106 | fe, 107 | ce, 108 | weights, 109 | min_feature, 110 | max_feature, 111 | min_concept_occurrence, 112 | doc_topic_distr, 113 | ): 114 | out_dir.mkdir(exist_ok=True) 115 | out_features = process_dir / FEATURES 116 | fe.from_jsonlines(out_features) 117 | weighted_features = fe.weight_terms(weights) 118 | limited_features = fe.limit_features(weighted_features, min_feature, max_feature) 119 | v = DictVectorizer() 120 | X = v.fit_transform(limited_features) 121 | 122 | out_vectorizer = out_dir / VECTORIZER 123 | out_feature_matrix = out_dir / FEATURE_MATRIX 124 | joblib.dump(v, out_vectorizer) 125 | joblib.dump(X, out_feature_matrix) 126 | 127 | paramgrid = { 128 | 
"alpha": [0.01, 0.001, 0.0001], 129 | "class_weight": [{1: 10, 0: 1}, {1: 5, 0: 1}, {1: 20, 0: 1}], 130 | "max_iter": [1], 131 | "loss": ["log"], 132 | } # requires loss function with predict_proba 133 | clf = GridSearchCV( 134 | SGDClassifier(), paramgrid, scoring="f1" 135 | ) # requires GridSearchCV 136 | out_models = out_dir / OUT_MODELS_DIR 137 | trainer = ConceptTrainer(fe, ce, clf, out_models) 138 | trainer.train_all( 139 | X, out_models, min_concept_occurrence, doc_topic_distr=doc_topic_distr 140 | ) 141 | return out_models 142 | 143 | 144 | def parse_config(in_config): 145 | with open(in_config, "r") as f0: 146 | cfg = yaml.safe_load(f0) 147 | 148 | term_types = cfg["process"]["term_types"] 149 | abstract_field = cfg["process"]["abstract_field"] 150 | concept_field = cfg["process"]["concept_field"] 151 | 152 | topic_weights = cfg["topic_model"]["weights"] 153 | topic_min_feature = cfg["topic_model"]["min_feature_occurrence"] 154 | topic_max_feature = cfg["topic_model"]["max_feature_occurrence"] 155 | num_topics = cfg["topic_model"]["number_of_topics"] 156 | 157 | weights = cfg["train_classifiers"]["weights"] 158 | min_feature = cfg["train_classifiers"]["max_feature_occurrence"] 159 | max_feature = cfg["train_classifiers"]["max_feature_occurrence"] 160 | min_concept = cfg["train_classifiers"]["min_concept_occurrence"] 161 | 162 | return ( 163 | abstract_field, 164 | concept_field, 165 | term_types, 166 | topic_weights, 167 | topic_min_feature, 168 | topic_max_feature, 169 | num_topics, 170 | weights, 171 | min_feature, 172 | max_feature, 173 | min_concept, 174 | ) 175 | 176 | 177 | @plac.annotations( 178 | in_corpus=plac.Annotation("path to json-formatted corpus", "positional", type=Path), 179 | config=plac.Annotation("path to configuration yaml file", "positional", type=Path), 180 | process_dir=plac.Annotation( 181 | "path to dir where you want to store processed corpus data", 182 | "positional", 183 | type=Path, 184 | ), 185 | 
topic_model_dir=plac.Annotation( 186 | "path to dir where you want to store topic_modeling data", 187 | "positional", 188 | type=Path, 189 | ), 190 | classify_dir=plac.Annotation( 191 | "path to dir where you want to store classifying data", "positional", type=Path 192 | ), 193 | batch_size=plac.Annotation( 194 | "size of batches to process in processing phase of pipeline", "option", type=int 195 | ), 196 | n_threads=plac.Annotation( 197 | "number of threads to use in processing phase of pipeline", "option", type=int 198 | ), 199 | ) 200 | def main( 201 | in_corpus, 202 | config, 203 | process_dir, 204 | topic_model_dir, 205 | classify_dir, 206 | batch_size=10, 207 | n_threads=1, 208 | ): 209 | 210 | ( 211 | abstract_field, 212 | concept_field, 213 | term_types, 214 | topic_weights, 215 | topic_min_feature, 216 | topic_max_feature, 217 | num_topics, 218 | weights, 219 | min_feature, 220 | max_feature, 221 | min_concept, 222 | ) = parse_config(config) 223 | 224 | fe, ce = process( 225 | in_corpus, 226 | process_dir, 227 | abstract_field, 228 | concept_field, 229 | term_types, 230 | batch_size, 231 | n_threads, 232 | ) 233 | doc_topic_distr = topic_model( 234 | topic_model_dir, 235 | process_dir, 236 | topic_weights, 237 | topic_min_feature, 238 | topic_max_feature, 239 | ) 240 | train( 241 | classify_dir, 242 | process_dir, 243 | fe, 244 | ce, 245 | weights, 246 | min_feature, 247 | max_feature, 248 | min_concept, 249 | doc_topic_distr, 250 | ) 251 | LOG.info("SUCCESS!") 252 | 253 | 254 | if __name__ == "__main__": 255 | plac.call(main) 256 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Concept Tagging Training 2 | 3 | This software enables the creation of concept classifiers, to be utilized by an 4 | accompanying [service](https://github.com/nasa/concept-tagging-api). 
If you don't have your own data to train, you can use the pretrained models <a href="https://data.nasa.gov/Software/STI-Tagging-Models/jd6d-mr3p">described here</a>. This project was written about [here](https://strategy.data.gov/proof-points/2019/05/28/improving-data-access-and-data-management-artificial-intelligence-generated-metadata-tags-at-nasa/) for the Federal Data Strategy Incubator Project. 5 | 6 | ### What is Concept Tagging 7 | By concept tagging, we mean you can supply text, for example:` Volcanic activity, or volcanism, has played a significant role in the geologic evolution of Mars.[2] Scientists have known since the Mariner 9 mission in 1972 that volcanic features cover large portions of the Martian surface.` and get back predicted keywords, like `volcanology, mars surface, and structural properties`, as well as topics like `space sciences, geosciences`, from a standardized list of several thousand NASA concepts with a probability score for each prediction. 8 | 9 | ## Requirements 10 | 11 | You can see a list of options for this project by navigating to the root of the project and executing `make` or `make help`. 12 | 13 | This project requires: 14 | * [docker](https://docs.docker.com/install/) -- [tested with this version](docker-versions.txt) 15 | * [GNU Make](https://www.gnu.org/software/make/) -- tested with 3.81 built for i386-apple-darwin11.3.0 16 | 17 | ## Index: 18 | 1. [installation](#installation) 19 | 2. [how to run](#how-to-run) 20 | 3. [managing experiments](#managing-experiments) 21 | 4. [advanced usage](#advanced-usage) 22 | 23 | ## installation 24 | You have several options for installing and using the pipeline. 
25 | 1) [pull existing docker image](#pull-existing-docker-image) 26 | 2) [build docker image from source](#build-docker-image-from-source) 27 | 3) [install in python virtual environment](#install-in-python-virtual-environment) 28 | 29 | ### pull existing docker image 30 | You can just pull a stable docker image which has already been made: 31 | ```bash 32 | docker pull storage.analytics.nasa.gov/abuonomo/concept_trainer:stable 33 | ``` 34 | In order to do this, you must be on the NASA network and able to connect to the <https://storage.analytics.nasa.gov> docker registry. 35 | \* <sub> There are several versions of the images. You can see them [here](https://storage.analytics.nasa.gov/repository/abuonomo/rat_trainer). 36 | If you don't use "stable", some or all of this guide may not work properly. </sub> 37 | 38 | 39 | ### build docker image from source 40 | To build from source, first clone this repository and go to its root. 41 | 42 | Then build the docker image using: 43 | ```bash 44 | docker build -t concept_trainer:example . 45 | ``` 46 | Substitute `concept_trainer:example` for whatever name you would like. Keep this image name in mind. It will be used elsewhere. 47 | 48 | \* If you are actively developing this project, you should look at the `make build` in [Makefile](Makefile). This command automatically tags the image with the current commit url and most recent git tag. The command requires that [setuptools-scm](https://pypi.org/project/setuptools-scm/) is installed. 49 | 50 | ### install in python virtual environment 51 | \* tested with python3.7 52 | First, clone this repository. 53 | Then create and activate a virtual environment. For example, using [venv](https://docs.python.org/3/library/venv.html): 54 | ```bash 55 | python -m venv my_env 56 | source my_env/bin/activate 57 | ``` 58 | Next, while in the root of this project, run `make requirements`. 
59 | 60 | 61 | ## how to run 62 | The pipeline takes input document metadata structured like [this](data/raw/STI_public_metadata_records_sample100.jsonl) and a config file like [this](config/test_config.yml). The pipeline produces interim data, models, and reports. 63 | 64 | 1. [using docker](#using-docker) -- if you pulled or built the image 65 | 2. [using python in virtual environment](#using-python-in-virtual-environment) -- if you are running in a local virtual environment 66 | 67 | ### using docker 68 | First, make sure `config`, `data`, `data/raw`, `data/interim`, `models`, and `reports` directories. If they do not exist, make them (`mkdir config data models reports data/raw`). These directories will be used as docker mounted volumes. If you don't make these directories beforehand, they will be created by docker later on, but their permissions will be unnecessarily restrictive. 69 | 70 | Next, make sure you have your input data in the `data/raw/` directory. [Here](data/raw/STI_public_metadata_records_sample100.jsonl) is an example file with the proper structure. You also need to make sure the `subj_mapping.json` file [here](data/interim/subj_mapping.json) is in `data/interim/` directory. 71 | 72 | Now, make sure you have a config file in the `config` directory. [Here](config/test_config.yml) is an example config which will work with the above example file. 73 | 74 | With these files in place, you can run the full pipeline with this command: 75 | ```bash 76 | docker run -it \ 77 | -v $(pwd)/data:/home/data \ 78 | -v $(pwd)/models:/home/models \ 79 | -v $(pwd)/config:/home/config \ 80 | -v $(pwd)/reports:/home/reports \ 81 | concept_trainer:example pipeline \ 82 | EXPERIMENT_NAME=my_test_experiment \ 83 | IN_CORPUS=data/raw/STI_public_metadata_records_sample100.jsonl \ 84 | IN_CONFIG=config/test_config.yml 85 | ``` 86 | Substitute `concept_trainer:example` with the name of your docker image. 87 | You can set the `EXPERIMENT_NAME` to whatever you prefer. 
88 | `IN_CORPUS` and `IN_CONFIG` should be set to the paths to the corpus and to the configuration file, respectively. 89 | 90 | \* Developers can also use the `container` command in the [Makefile](Makefile). Note that this command requires [setuptools-scm](https://pypi.org/project/setuptools-scm/). Note that this command will use the image defined by the `IMAGE_NAME` variable and version number equivalent to the most recent git tag. 91 | 92 | 93 | ### using python in virtual environment 94 | 95 | Assuming you have cloned this repository, files for testing the pipeline should be in place. In particular, `data/raw/STI_public_metadata_records_sample100.jsonl` and `config/test_config.yml` should both exist. Additionally, you should add the `src` directory to your `PYTHONPATH`: 96 | ``` 97 | export PYTHONPATH=$PYTHONPATH:$(pwd)/src/ 98 | ``` 99 | 100 | Then, you can run a test of the pipeline with: 101 | ``` 102 | make pipeline \ 103 | EXPERIMENT_NAME=test \ 104 | IN_CORPUS=data/raw/STI_public_metadata_records_sample100.jsonl \ 105 | IN_CONFIG=config/test_config.yml 106 | ``` 107 | If you are not using the default values, simply substitute the proper paths for `IN_CORPUS` and `IN_CONFIG`. Choose whatever name you prefer for `EXPERIMENT_NAME`. 108 | 109 | ## managing experiments 110 | 111 | If you have access to the `hq-ocio-ci-bigdata` moderate s3 bucket, you can sync local experiments with those in the s3 bucket. 112 | 113 | For example, if you created a local experiment with `EXPERIMENT_NAME=my_cool_experiment`, you can upload your local results to the appropriate place on the s3 bucket with: 114 | ```bash 115 | make sync_experiment_to_s3 EXPERIMENT_NAME=my_cool_experiment PROFILE=my_aws_profile 116 | ``` 117 | where `my_aws_profile` is the name of your awscli profile which has access to the given bucket. 
118 | 119 | Afterwards, you can download the experiment interim files and results with: 120 | ```bash 121 | make sync_experiment_from_s3 EXPERIMENT_NAME=my_cool_experiment PROFILE=my_aws_profile 122 | ``` 123 | ## use full sti metadata records 124 | If you have access to the moderate bucket and you want to work with the full STI metadata records, you can download them to the `data/raw` folder with: 125 | ```bash 126 | make sync_raw_data_from_s3 PROFILE=my_aws_profile 127 | ``` 128 | When using these data, you will want to use a config file which is different from the test config file. You can browse previous experiments at `s3://hq-ocio-ci-bigdata/home/DataSquad/classifier_scripts/` to see example config files. You might try: 129 | ```yaml 130 | weights: # assign weights for term types specified in process section 131 | NOUN: 1 132 | PROPN: 1 133 | NOUN_CHUNK: 1 134 | ENT: 1 135 | ACRONYM: 1 136 | min_feature_occurrence: 100 137 | max_feature_occurrence: 0.6 138 | min_concept_occurrence: 500 139 | ``` 140 | See [config/test_config.yml](config/test_config.yml) for details on these parameters. 141 | 142 | ## advanced usage 143 | For more advanced usage of the project, look at the [Makefile](Makefile) commands and their associated scripts. You can learn more about these python scripts by them with help flags. For example, you can run `python src/make_cat_models.py -h`. 144 | 145 | -------------------------------------------------------------------------------- /src/dsconcept/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train 3 | ----- 4 | Program to make classifiers from input corpus and selected keyword field. 
5 | 6 | author: Anthony Buonomo 7 | contact: anthony.r.buonomo@nasa.gov 8 | 9 | """ 10 | import logging 11 | from pathlib import Path 12 | import time 13 | from math import ceil 14 | 15 | from sklearn.model_selection import train_test_split 16 | from sklearn.exceptions import UndefinedMetricWarning 17 | from sklearn.metrics import ( 18 | accuracy_score, 19 | roc_auc_score, 20 | f1_score, 21 | precision_score, 22 | recall_score, 23 | ) 24 | import joblib 25 | import numpy as np 26 | import warnings 27 | from tqdm import tqdm 28 | 29 | warnings.filterwarnings("ignore", category=FutureWarning) 30 | warnings.filterwarnings("ignore", category=DeprecationWarning) 31 | warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 32 | warnings.filterwarnings("ignore", category=Warning) 33 | logging.basicConfig(level=logging.INFO) 34 | LOG = logging.getLogger(__name__) 35 | LOG.setLevel(logging.INFO) 36 | 37 | 38 | def get_dispersed_subset(array, subset_size): 39 | """ 40 | Get dispersed subset of an array. By dispersed, I mean that we values extract values 41 | from an evenly distributed by location in the array. 
42 | 43 | Args: 44 | array (numpy.ndarray): array from which to extract subset 45 | subset_size (int): the number of elements to extract from array 46 | 47 | Returns: 48 | subset (numpy.ndarray): the dispersed subset 49 | array (numpy.ndarray): if subset_size too large, return the input array 50 | 51 | Examples: 52 | >>> from dsconcept.train import get_dispersed_subset 53 | >>> l1 = list(range(100)) 54 | >>> l2 = get_dispersed_subset(l1, 10) 55 | >>> l2 56 | array([ 0., 12., 23., 34., 45., 56., 67., 78., 89., 99.], dtype=float16) 57 | """ 58 | if len(array) <= subset_size: 59 | return array 60 | else: 61 | last = array[-1] 62 | subset = [sub[0] for sub in np.array_split(array, (subset_size - 1))] 63 | subset.append(last) 64 | subset = np.array(subset, dtype=np.float16) 65 | return subset 66 | 67 | 68 | # TODO: refactor to remove need for this function 69 | def path_append(in_path, addition): 70 | out_path = f"{in_path.parent}/{in_path.stem}{addition}{in_path.suffix}" 71 | return Path(out_path) 72 | 73 | 74 | def topic_path_format(out_classifier_dir, topic): 75 | if topic is None: 76 | tmp_topic = "" 77 | else: 78 | tmp_topic = topic 79 | out_classifier_dir = path_append(out_classifier_dir, tmp_topic) # appends to stem 80 | if not out_classifier_dir.exists(): 81 | out_classifier_dir.mkdir(parents=True) 82 | return out_classifier_dir 83 | 84 | 85 | class ConceptTrainer: 86 | def __init__(self, concept_extractor, classifier): 87 | """ 88 | Initialize object for training of classifiers based on given corpus extractors. 
89 | 90 | Args: 91 | concept_extractor (dsconcept.model.ConceptExtractor): ConceptExtractor (with concepts already loaded) 92 | for which to create classifiers 93 | classifier (sklearn.GridSearchCV): the classifier algorithm to use (wrapped in sklearn GridSearchCV) 94 | 95 | """ 96 | self.concept_extractor = concept_extractor 97 | self.classifier = classifier 98 | 99 | def train_all( 100 | self, 101 | doc_feature_matrix, 102 | out_classifier_dir, 103 | min_concept_freq, 104 | doc_topic_distr=None, 105 | ): 106 | """ 107 | Train classifiers for each concept for each topic (if topic distributions are provided). 108 | 109 | Args: 110 | doc_feature_matrix (scipy.sparse.csr.csr_matrix): document feature matrix 111 | out_classifier_dir (pathlib.Path): output path for classifiers 112 | min_concept_freq (int): minimum frequency for concepts in corpus in order 113 | for their corresponding classifiers to be made 114 | doc_topic_distr (numpy.ndarray): topic distributions for each doc in training set 115 | 116 | Returns: 117 | out_classifier_dir (pathlib.Path): output path for classifiers 118 | 119 | """ 120 | doc_topic_indices = {} 121 | if doc_topic_distr is not None: 122 | for topic in range( 123 | doc_topic_distr.shape[1] 124 | ): # cols of distr matrix ~ topics 125 | doc_topic_indices[topic] = [ 126 | i 127 | for i, distr in enumerate(doc_topic_distr) 128 | if distr.argmax() == topic 129 | ] 130 | _, _, ind_train, ind_test = train_test_split( 131 | doc_feature_matrix, 132 | np.array(range(doc_feature_matrix.shape[0])), 133 | test_size=0.10, 134 | random_state=42, 135 | ) 136 | np.save(out_classifier_dir.parent / f"train_inds.npy", ind_train) 137 | np.save(out_classifier_dir.parent / f"test_inds.npy", ind_test) 138 | 139 | LOG.info( 140 | f"Training one general set, and one set for each of {len(doc_topic_indices)} topics divisions." 
141 | ) 142 | for topic, doc_topic_index in doc_topic_indices.items(): 143 | self.train_concepts( 144 | doc_feature_matrix, 145 | ind_train, 146 | ind_test, 147 | out_classifier_dir, 148 | min_concept_freq, 149 | topic, 150 | doc_topic_index, 151 | ) 152 | self.train_concepts( 153 | doc_feature_matrix, 154 | ind_train, 155 | ind_test, 156 | out_classifier_dir, 157 | min_concept_freq, 158 | ) 159 | return out_classifier_dir 160 | 161 | def train_concepts( 162 | self, 163 | doc_feature_matrix, 164 | ind_train, 165 | ind_test, 166 | out_classifier_dir, 167 | min_concept_freq, 168 | topic=None, 169 | doc_topic_index=None, 170 | scale_threshold=False, 171 | ): 172 | """ 173 | Create classifiers for group of concepts. 174 | 175 | Args: 176 | doc_feature_matrix (scipy.sparse.csr.csr_matrix): document feature matrix 177 | ind_train (list of int): indices for training partition 178 | ind_test (list of int): indices for testing partition 179 | out_classifier_dir (pathlib.Path): path to directory where classifiers will be dumped. 180 | min_concept_freq (int): minimum frequency for concepts in corpus in order 181 | for their corresponding classifiers to be made 182 | topic (int | None): the topic (if any) from which to select training data for classifiers 183 | doc_topic_index (lists): mapping from given topic to document indices 184 | for which that topic has the highest probability 185 | scale_threshold (bool | False): If true, scale the minimum_concept_freq by the size of the topic division. 
186 | 187 | Returns: 188 | out_classifier_dir (pathlib.Path): directory where classifiers have been placed 189 | 190 | """ 191 | 192 | LOG.info(f"Queuing classifier job for topic {topic}.") 193 | t1 = time.time() 194 | out_classifier_dir = topic_path_format(out_classifier_dir, topic) 195 | 196 | LOG.info("Getting indices for training and testing.") 197 | if doc_topic_index is not None: 198 | train_inds = list(set(ind_train).intersection(doc_topic_index)) 199 | test_inds = list(set(ind_test).intersection(doc_topic_index)) 200 | else: 201 | train_inds = ind_train 202 | test_inds = ind_test 203 | 204 | X_train = doc_feature_matrix.tocsc()[train_inds, :] 205 | X_test = doc_feature_matrix.tocsc()[test_inds, :] 206 | 207 | if scale_threshold: 208 | total_size = X_train.shape[0] + X_test.shape[0] 209 | # scale threshold based on size of topic division 210 | r = total_size / doc_feature_matrix.shape[0] 211 | topic_min_concept_threshold = ceil(min_concept_freq * r) 212 | else: 213 | topic_min_concept_threshold = min_concept_freq 214 | LOG.info(f"Topic threshold set to {topic_min_concept_threshold}.") 215 | 216 | concept_index_mapping = self.concept_extractor.get_top_concepts( 217 | topic_min_concept_threshold 218 | ) 219 | no_concepts = len(concept_index_mapping) 220 | LOG.info(f"Training {no_concepts} concepts.") 221 | 222 | nu_passed = 0 223 | for concept, index in tqdm(concept_index_mapping.items()): 224 | LOG.debug(f"TOPIC={topic}:Loading indices for {concept}") 225 | y = np.zeros(doc_feature_matrix.shape[0]) 226 | np.put(y, index, 1) 227 | 228 | y_train = y[train_inds] 229 | y_test = y[test_inds] 230 | total_yes = sum(y_train) + sum(y_test) 231 | 232 | if total_yes < topic_min_concept_threshold: 233 | nu_passed += 1 234 | LOG.debug( 235 | f"Passing {concept} because it is under topic_min_concept_threshold of {topic_min_concept_threshold}." 
236 | ) 237 | continue 238 | # TODO: move around y0 train and test inds to keep aligned 239 | self.create_concept_classifier( 240 | concept, X_train, X_test, y_train, y_test, out_classifier_dir 241 | ) 242 | t2 = time.time() 243 | LOG.warning(f"Passed {nu_passed} in topic {topic} due to freq under threshold.") 244 | LOG.debug(f"{t2-t1} seconds for topic {topic}.") 245 | return out_classifier_dir 246 | 247 | def create_concept_classifier( 248 | self, concept, X_train, X_test, y_train, y_test, out_classifier_dir 249 | ): 250 | """ 251 | Create an individual classifier. 252 | 253 | Args: 254 | concept (str): the concept for which to create a classifier 255 | doc_feature_matrix (scipy.sparse.csr.csr_matrix): documents with their features 256 | y (numpy.ndarray): array which indicates whether or not given concept occurs for a given topic 257 | out_classifier_dir (pathlib.Path): output directory for classifiers 258 | 259 | Returns: 260 | out_model_path (pathlib.Path): the path to the concept classifier just produced. 
261 | 262 | """ 263 | LOG.debug(f"Making classifier for concept {concept}.") 264 | try: 265 | LOG.debug(f"fitting {concept}...") 266 | self.classifier.fit(X_train, y_train) 267 | LOG.debug(f"testing {concept}...") 268 | y_score = self.classifier.predict_proba(X_test)[:, 1] 269 | LOG.debug(f"Binarizing score for {concept}...") 270 | y_pred = np.where(y_score > 0.5, 1, 0) 271 | 272 | LOG.debug(f"Getting metric scores for {concept}...") 273 | accuracy = accuracy_score(y_test, y_pred) 274 | roc_auc = roc_auc_score(y_test, y_score) 275 | f1 = f1_score(y_test, y_pred) 276 | precision = precision_score(y_test, y_pred) 277 | recall = recall_score(y_test, y_pred) 278 | 279 | out_model = { 280 | "concept": concept, 281 | "best_estimator_": self.classifier.best_estimator_, 282 | "cv_results_": self.classifier.cv_results_, 283 | "scores": { 284 | "accuracy": accuracy, 285 | "roc_auc": roc_auc, 286 | "f1": f1, 287 | "precision": precision, 288 | "recall": recall, 289 | }, 290 | } 291 | LOG.debug(f"Accuracy: {accuracy} | ROC-AUC: {roc_auc} | F1: {f1}") 292 | out_concept = str(Path(concept).name) 293 | out_model_path = out_classifier_dir / f"{out_concept}.pkl" 294 | LOG.debug(f"Writing model to {out_model_path}.") 295 | joblib.dump(out_model, out_model_path) 296 | return out_model_path 297 | 298 | except ValueError: 299 | LOG.debug(f"Insufficient data for concept {concept}.") 300 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: process features concepts keywords categories structure requirements \ 2 | sync_data_to_s3 sync_data_from_s3 sync_raw_data_from_s3 pipeline plots \ 3 | tests docs check_clean clean_experiment clean 4 | 5 | #.SHELLFLAGS := -o nounset -c 6 | SHELL := /bin/bash 7 | 8 | ################################################################################# 9 | # GLOBALS # 10 | 
################################################################################# 11 | 12 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 13 | BUCKET = hq-ocio-ci-bigdata/home/DataSquad/classifier_scripts/ 14 | PROFILE = moderate 15 | PROJECT_NAME = classifier_scripts 16 | PYTHON_INTERPRETER = python3 17 | 18 | ifeq (,$(shell which conda)) 19 | HAS_CONDA=False 20 | else 21 | HAS_CONDA=True 22 | endif 23 | 24 | ################################################################################# 25 | # COMMANDS # 26 | ################################################################################# 27 | 28 | # These three variables should be tailored for you use case. 29 | EXPERIMENT_NAME=test 30 | IN_CORPUS=data/raw/STI_public_metadata_records_sample100.jsonl 31 | IN_CONFIG=config/test_config.yml 32 | 33 | INTERIM_DATA=data/interim/$(EXPERIMENT_NAME) 34 | INTERIM_CORPUS=data/interim/$(EXPERIMENT_NAME)/abs_kwds.jsonl 35 | 36 | FIELD=text 37 | SUBJ_MAPPING=data/interim/subj_mapping.json 38 | FEATURES=data/interim/$(EXPERIMENT_NAME)/features.jsonl 39 | 40 | CONCEPT_FIELD='keywords' 41 | CAT_FIELD='categories' 42 | OUT_KWD_INDICES=data/interim/$(EXPERIMENT_NAME)/kwd_indices.json 43 | OUT_CAT_INDICES=data/interim/$(EXPERIMENT_NAME)/cat_indices.json 44 | OUT_KWD_RAW_TO_LEMMA=models/$(EXPERIMENT_NAME)/kwd_raw2lemma.json 45 | OUT_CAT_RAW_TO_LEMMA=models/$(EXPERIMENT_NAME)/cat_raw2lemma.json 46 | 47 | OUT_OUTER_MODEL_DIR=models/$(EXPERIMENT_NAME) 48 | OUT_KWD_MODEL_DIR=$(OUT_OUTER_MODEL_DIR)/keywords 49 | OUT_CAT_MODEL_DIR=$(OUT_OUTER_MODEL_DIR)/categories 50 | 51 | METRICS_LOC=reports/$(EXPERIMENT_NAME) 52 | BERT_MODELS_DIR=models/bert_models 53 | 54 | GIT_REMOTE='origin' 55 | IMAGE_NAME=concept_trainer 56 | 57 | 58 | ## Test underlying dsconcept library 59 | tests: 60 | nosetests --with-coverage --cover-package dsconcept --cover-html; \ 61 | open cover/index.html 62 | 63 | ## Run through all steps to create all classifiers 64 | pipeline: structure 
process features concepts vectorizer_and_matrix \ 65 | categories keywords metrics plots 66 | 67 | ## create directory structure if necessary 68 | structure: 69 | mkdir -p data 70 | mkdir -p data/raw 71 | mkdir -p data/interim 72 | mkdir -p data/interim/$(EXPERIMENT_NAME) 73 | mkdir -p models/$(EXPERIMENT_NAME) 74 | mkdir -p config 75 | mkdir -p reports 76 | mkdir -p reports/$(EXPERIMENT_NAME) 77 | 78 | ## install newest version of dependencies. Untested. 79 | approximate-install: 80 | pip install scikit-learn spacy tqdm textacy pyyaml pandas h5py \ 81 | testfixtures hypothesis dask pytest matplotlib 82 | $(PYTHON_INTERPRETER) -m spacy download en_core_web_sm 83 | 84 | ## install precise python dependencies 85 | requirements: 86 | $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel 87 | $(PYTHON_INTERPRETER) -m pip install -r requirements.txt 88 | $(PYTHON_INTERPRETER) -m spacy download en_core_web_sm 89 | 90 | ## processing by merging text and keyword fields 91 | process: $(INTERIM_CORPUS) 92 | $(INTERIM_CORPUS): $(IN_CORPUS) src/process.py 93 | mkdir -p data/interim/$(EXPERIMENT_NAME) 94 | mkdir -p models/$(EXPERIMENT_NAME) 95 | $(PYTHON_INTERPRETER) src/process.py $(IN_CORPUS) $(SUBJ_MAPPING) $(INTERIM_CORPUS) 96 | 97 | ## creature feature sets for processed data 98 | features: $(FEATURES) 99 | $(FEATURES): $(INTERIM_CORPUS) src/features.py 100 | $(PYTHON_INTERPRETER) src/features.py $(INTERIM_CORPUS) $(FIELD) $(FEATURES) 101 | 102 | ## create concepts indices json and mappings from raw to lemmas 103 | concepts: $(OUT_KWD_INDICES) $(OUT_CAT_INDICES) 104 | $(OUT_KWD_INDICES) $(OUT_CAT_INDICES): $(INTERIM_CORPUS) src/concepts.py 105 | $(PYTHON_INTERPRETER) src/concepts.py \ 106 | $(INTERIM_CORPUS) \ 107 | $(CONCEPT_FIELD) $(CAT_FIELD) \ 108 | $(OUT_KWD_INDICES) $(OUT_CAT_INDICES) \ 109 | $(OUT_KWD_RAW_TO_LEMMA) $(OUT_CAT_RAW_TO_LEMMA) 110 | 111 | ## create vectorizer and feature matrix from feature records 112 | vectorizer_and_matrix: 
$(INTERIM_DATA)/feature_matrix.jbl 113 | $(INTERIM_DATA)/feature_matrix.jbl: src/make_vec_and_matrix.py $(FEATURES) $(IN_CONFIG) 114 | mkdir -p $(OUT_OUTER_MODEL_DIR) && \ 115 | cp $(IN_CONFIG) $(OUT_OUTER_MODEL_DIR)/config.yml && \ 116 | $(PYTHON_INTERPRETER) src/make_vec_and_matrix.py \ 117 | $(FEATURES) $(IN_CONFIG) $(INTERIM_DATA) $(OUT_OUTER_MODEL_DIR)/vectorizer.jbl 118 | # TODO: separate outputs for vec and matrix, send matrix to INTERIM_DATA 119 | 120 | ## train category models 121 | categories: src/make_cat_models.py $(OUT_CAT_INDICES) $(INTERIM_DATA)/feature_matrix.jbl $(IN_CONFIG) 122 | mkdir -p $(OUT_CAT_MODEL_DIR) && \ 123 | $(PYTHON_INTERPRETER) src/make_cat_models.py \ 124 | $(INTERIM_DATA)/feature_matrix.jbl \ 125 | $(INTERIM_DATA)/train_inds.npy \ 126 | $(INTERIM_DATA)/test_inds.npy \ 127 | $(OUT_CAT_INDICES) \ 128 | $(OUT_CAT_RAW_TO_LEMMA) \ 129 | $(IN_CONFIG) $(OUT_CAT_MODEL_DIR) 130 | 131 | ## train keyword models 132 | keywords: src/make_kwd_models.py $(OUT_KWD_INDICES) $(INTERIM_DATA)/feature_matrix.jbl $(IN_CONFIG) $(INTERIM_DATA)/test_inds.npy 133 | mkdir -p $(OUT_KWD_MODEL_DIR) && \ 134 | $(PYTHON_INTERPRETER) src/make_kwd_models.py \ 135 | $(INTERIM_DATA)/feature_matrix.jbl \ 136 | $(INTERIM_DATA)/train_inds.npy \ 137 | $(INTERIM_DATA)/test_inds.npy \ 138 | $(OUT_KWD_INDICES) $(OUT_CAT_INDICES) \ 139 | $(OUT_KWD_RAW_TO_LEMMA) $(OUT_CAT_RAW_TO_LEMMA) \ 140 | $(IN_CONFIG) $(OUT_KWD_MODEL_DIR) 141 | 142 | ## Only train keywords on full training set. No topic splitting. 
143 | keywords-no-topics: 144 | mkdir -p $(OUT_KWD_MODEL_DIR) && \ 145 | $(PYTHON_INTERPRETER) src/make_kwd_models.py \ 146 | $(INTERIM_DATA)/feature_matrix.jbl \ 147 | $(INTERIM_DATA)/train_inds.npy \ 148 | $(INTERIM_DATA)/test_inds.npy \ 149 | $(OUT_KWD_INDICES) $(OUT_CAT_INDICES) \ 150 | $(OUT_KWD_RAW_TO_LEMMA) $(OUT_CAT_RAW_TO_LEMMA) \ 151 | $(IN_CONFIG) $(OUT_KWD_MODEL_DIR) --no-topics ${VERBOSE} 152 | 153 | ## Get predictions from category models made with BERT classification 154 | bert_cat_model_scores: 155 | mkdir -p $(METRICS_LOC) && \ 156 | $(PYTHON_INTERPRETER) src/get_bert_cat_models_preds.py \ 157 | --data_dir $(INTERIM_DATA) \ 158 | --models_dir $(OUT_OUTER_MODEL_DIR) \ 159 | --reports_dir $(METRICS_LOC) \ 160 | --base_model_dir ../nlp-working-with-bert/models/base/cased_L-12_H-768_A-12 \ 161 | --finetuned_model_dir ../nlp-working-with-bert/models/01_02_2020/ \ 162 | --sample 1000 163 | # --base_model_dir models/bert_models/cased_L-12_H-768_A-12 \ 164 | # --finetuned_model_dir models/bert_models/cased_L-12_H-768_A-12/cache 165 | 166 | ## Create cleaned dataset for training transformer category models 167 | bert_cat_clean_dataset: 168 | $(PYTHON_INTERPRETER) src/make_records_for_cat_bert.py \ 169 | $(INTERIM_CORPUS) \ 170 | $(INTERIM_DATA) \ 171 | $(OUT_OUTER_MODEL_DIR)/bert 172 | 173 | ## Get metrics for test data 174 | metrics: 175 | mkdir -p $(METRICS_LOC) && \ 176 | $(PYTHON_INTERPRETER) src/dsconcept/get_metrics.py \ 177 | --experiment_name $(EXPERIMENT_NAME) \ 178 | --out_store $(METRICS_LOC)/store.h5 \ 179 | --out_cat_preds $(METRICS_LOC)/cat_preds.npy \ 180 | --batch_size 500 181 | 182 | ## Synthesize predictions for keywords and classifiers to create full classification 183 | synthesize: 184 | mkdir -p $(METRICS_LOC) && \ 185 | $(PYTHON_INTERPRETER) src/synthesize_predictions.py \ 186 | --experiment_name $(EXPERIMENT_NAME) \ 187 | --synth_strat mean \ 188 | --in_cat_preds $(METRICS_LOC)/cat_preds.npy \ 189 | --store $(METRICS_LOC)/store.h5 \ 
190 | --synth_batch_size 3000 \ 191 | --threshold 0.5 \ 192 | --out_synth_scores $(METRICS_LOC)/synth_mean_results.csv 193 | 194 | ## Synthesize predictions for keywords and classifiers to create full classification 195 | synthesize-bert: 196 | mkdir -p $(METRICS_LOC) && \ 197 | $(PYTHON_INTERPRETER) src/synthesize_predictions.py \ 198 | --experiment_name $(EXPERIMENT_NAME) \ 199 | --synth_strat mean \ 200 | --in_cat_preds $(METRICS_LOC)/bert_cat_preds.npy \ 201 | --store $(METRICS_LOC)/store.h5 \ 202 | --synth_batch_size 3000 \ 203 | --threshold 0.5 \ 204 | --out_synth_scores $(METRICS_LOC)/synth_bert_mean_results.csv 205 | 206 | ## create plots from performance metrics 207 | plots: 208 | mkdir -p $(METRICS_LOC)/figures && \ 209 | $(PYTHON_INTERPRETER) src/make_plots.py \ 210 | --mean $(METRICS_LOC)/synth_mean_results.csv \ 211 | --in_cats_dir $(OUT_CAT_MODEL_DIR)/models \ 212 | --in_kwds_dir $(OUT_KWD_MODEL_DIR)/models \ 213 | --in_cats_dir $(OUT_CAT_MODEL_DIR)/models \ 214 | --in_vectorizer $(OUT_OUTER_MODEL_DIR)/vectorizer.jbl \ 215 | --in_clean_data $(INTERIM_CORPUS) \ 216 | --in_config $(OUT_OUTER_MODEL_DIR)/config.yml \ 217 | --out_plots_dir $(METRICS_LOC)/figures 218 | 219 | ## create plots from performance metrics 220 | plots-bert: 221 | mkdir -p $(METRICS_LOC)/figures_bert && \ 222 | $(PYTHON_INTERPRETER) src/make_plots.py \ 223 | --mean $(METRICS_LOC)/synth_bert_mean_results.csv \ 224 | --in_cats_dir $(OUT_CAT_MODEL_DIR)/models \ 225 | --in_kwds_dir $(OUT_KWD_MODEL_DIR)/models \ 226 | --in_cats_dir $(OUT_CAT_MODEL_DIR)/models \ 227 | --in_vectorizer $(OUT_OUTER_MODEL_DIR)/vectorizer.jbl \ 228 | --in_clean_data $(INTERIM_CORPUS) \ 229 | --in_config $(OUT_OUTER_MODEL_DIR)/config.yml \ 230 | --out_plots_dir $(METRICS_LOC)/figures_bert 231 | 232 | ## Build docker image for training 233 | build: 234 | export COMMIT=$$(git log -1 --format=%H); \ 235 | export REPO_URL=$$(git remote get-url $(GIT_REMOTE)); \ 236 | export REPO_DIR=$$(dirname $$REPO_URL); \ 237 | 
export BASE_NAME=$$(basename $$REPO_URL .git); \ 238 | export GIT_LOC=$$REPO_DIR/$$BASE_NAME/tree/$$COMMIT; \ 239 | export VERSION=$$(python version.py); \ 240 | echo $$GIT_LOC; \ 241 | echo $$VERSION; \ 242 | docker build -t $(IMAGE_NAME):$$VERSION \ 243 | --build-arg GIT_URL=$$GIT_LOC \ 244 | --build-arg VERSION=$$VERSION . 245 | 246 | ## Start docker container for running full pipeline 247 | container: 248 | export VERSION=$$(python version.py); \ 249 | docker run -it \ 250 | -v $$(pwd)/data:/home/data \ 251 | -v $$(pwd)/models:/home/models \ 252 | -v $$(pwd)/config:/home/config \ 253 | -v $$(pwd)/reports:/home/reports \ 254 | $(IMAGE_NAME):$$VERSION pipeline \ 255 | EXPERIMENT_NAME=$(EXPERIMENT_NAME) \ 256 | IN_CORPUS=$(IN_CORPUS) \ 257 | IN_CONFIG=$(IN_CONFIG) 258 | 259 | ## Delete all compiled Python files 260 | clean: 261 | find . -type f -name "*.py[co]" -delete 262 | find . -type d -name "__pycache__" -delete 263 | 264 | check_clean: 265 | @echo $(OUT_OUTER_MODEL_DIR) 266 | @echo data/interim/$(EXPERIMENT_NAME) 267 | @echo $(METRICS_LOC) 268 | @echo -n "Are you sure you want to remove the above folders? 
[y/N] " && read ans && [ $${ans:-N} = y ] 269 | 270 | ## delete all interim data, models, and reports for the given experiment 271 | clean_experiment: check_clean 272 | rm -r $(OUT_OUTER_MODEL_DIR) 273 | rm -r data/interim/$(EXPERIMENT_NAME) 274 | rm -r $(METRICS_LOC) 275 | 276 | ## sync this experiment to s3 277 | sync_experiment_to_s3: 278 | ifeq (default,$(PROFILE)) 279 | aws s3 sync models/$(EXPERIMENT_NAME) s3://$(BUCKET)models/$(EXPERIMENT_NAME) 280 | aws s3 sync data/interim/$(EXPERIMENT_NAME) s3://$(BUCKET)data/interim/$(EXPERIMENT_NAME) 281 | aws s3 sync reports/$(EXPERIMENT_NAME) s3://$(BUCKET)reports/$(EXPERIMENT_NAME) 282 | else 283 | aws s3 sync models/$(EXPERIMENT_NAME) s3://$(BUCKET)models/$(EXPERIMENT_NAME) --profile $(PROFILE) 284 | aws s3 sync data/interim/$(EXPERIMENT_NAME) s3://$(BUCKET)data/interim/$(EXPERIMENT_NAME) --profile $(PROFILE) 285 | aws s3 sync reports/$(EXPERIMENT_NAME) s3://$(BUCKET)reports/$(EXPERIMENT_NAME) --profile $(PROFILE) 286 | endif 287 | 288 | ## sync this experiment from s3 289 | sync_experiment_from_s3: 290 | ifeq (default,$(PROFILE)) 291 | aws s3 sync s3://$(BUCKET)models/$(EXPERIMENT_NAME) models/$(EXPERIMENT_NAME) 292 | aws s3 sync s3://$(BUCKET)reports/$(EXPERIMENT_NAME) reports/$(EXPERIMENT_NAME) 293 | aws s3 sync s3://$(BUCKET)data/processed/$(EXPERIMENT_NAME) data/processed/$(EXPERIMENT_NAME) 294 | else 295 | aws s3 sync s3://$(BUCKET)models/$(EXPERIMENT_NAME) models/$(EXPERIMENT_NAME) --profile $(PROFILE) 296 | aws s3 sync s3://$(BUCKET)reports/$(EXPERIMENT_NAME) reports/$(EXPERIMENT_NAME) --profile $(PROFILE) 297 | aws s3 sync s3://$(BUCKET)data/processed/$(EXPERIMENT_NAME) data/processed/$(EXPERIMENT_NAME) --profile $(PROFILE) 298 | endif 299 | 300 | ## sync raw starting data from s3 301 | sync_raw_data_from_s3: 302 | ifeq (default,$(PROFILE)) 303 | aws s3 cp s3://hq-ocio-ci-bigdata/data/STI/STI_records_metadata.jsonl data/raw/STI_records_metadata.jsonl 304 | else 305 | aws s3 cp 
s3://hq-ocio-ci-bigdata/data/STI/STI_records_metadata.jsonl data/raw/STI_records_metadata.jsonl --profile $(PROFILE) 306 | endif 307 | echo "These records should be handled as moderate data assets. Handle these records with care." 308 | 309 | ## zip models necessary for running the app 310 | zip-experiment-for-app: 311 | cd models/; \ 312 | zip -r $(EXPERIMENT_NAME).zip \ 313 | $(EXPERIMENT_NAME)/categories/models \ 314 | $(EXPERIMENT_NAME)/keywords/models \ 315 | $(EXPERIMENT_NAME)/kwd_raw2lemma.json \ 316 | $(EXPERIMENT_NAME)/cat_raw2lemma.json \ 317 | $(EXPERIMENT_NAME)/vectorizer.jbl \ 318 | $(EXPERIMENT_NAME)/config.yml \ 319 | 320 | ## Upload zipped experiment app files to s3 321 | upload-experiment-zip-to-s3: 322 | aws s3 cp models/$(EXPERIMENT_NAME).zip s3://$(BUCKET)models/$(EXPERIMENT_NAME).zip --profile $(PROFILE) 323 | ################################################################################# 324 | # PROJECT RULES # 325 | ################################################################################# 326 | 327 | 328 | 329 | ################################################################################# 330 | # Self Documenting Commands # 331 | ################################################################################# 332 | 333 | .DEFAULT_GOAL := help 334 | 335 | # Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html> 336 | # sed script explained: 337 | # /^##/: 338 | # * save line in hold space 339 | # * purge line 340 | # * Loop: 341 | # * append newline + line to hold space 342 | # * go to next line 343 | # * if line starts with doc comment, strip comment character off and loop 344 | # * remove target prerequisites 345 | # * append hold space (+ newline) to line 346 | # * replace newline plus comments by `---` 347 | # * print line 348 | # Separate expressions are necessary because labels cannot be delimited by 349 | # semicolon; see <http://stackoverflow.com/a/11799865/1968> 350 | .PHONY: help 351 | help: 
352 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 353 | @echo 354 | @sed -n -e "/^## / { \ 355 | h; \ 356 | s/.*//; \ 357 | :doc" \ 358 | -e "H; \ 359 | n; \ 360 | s/^## //; \ 361 | t doc" \ 362 | -e "s/:.*//; \ 363 | G; \ 364 | s/\\n## /---/; \ 365 | s/\\n/ /g; \ 366 | p; \ 367 | }" ${MAKEFILE_LIST} \ 368 | | LC_ALL='C' sort --ignore-case \ 369 | | awk -F '---' \ 370 | -v ncol=$$(tput cols) \ 371 | -v indent=19 \ 372 | -v col_on="$$(tput setaf 6)" \ 373 | -v col_off="$$(tput sgr0)" \ 374 | '{ \ 375 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 376 | n = split($$2, words, " "); \ 377 | line_length = ncol - indent; \ 378 | for (i = 1; i <= n; i++) { \ 379 | line_length -= length(words[i]) + 1; \ 380 | if (line_length <= 0) { \ 381 | line_length = ncol - indent - length(words[i]) - 1; \ 382 | printf "\n%*s ", -indent, " "; \ 383 | } \ 384 | printf "%s ", words[i]; \ 385 | } \ 386 | printf "\n"; \ 387 | }' \ 388 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') -------------------------------------------------------------------------------- /src/dsconcept/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | model 3 | ----- 4 | Program to make classifiers from input corpus and selected keyword field. 5 | 6 | Author: Anthony Buonomo 7 | Contact: anthony.r.buonomo@nasa.gov 8 | 9 | Classes to support document classification. 
10 | """ 11 | 12 | from collections import Counter 13 | import logging 14 | from multiprocessing import cpu_count 15 | import json 16 | from typing import Dict 17 | from tqdm import tqdm 18 | 19 | import spacy 20 | from spacy.lemmatizer import Lemmatizer 21 | from spacy.lookups import Lookups 22 | from textacy.extract import acronyms_and_definitions 23 | 24 | nlp = spacy.load("en_core_web_sm") 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | LOG = logging.getLogger(__name__) 28 | LOG.setLevel(logging.DEBUG) 29 | 30 | 31 | def file_len(fname): 32 | with open(fname) as f: 33 | for i, l in enumerate(f): 34 | pass 35 | return i + 1 36 | 37 | 38 | def spacy_tokenizer(txt): 39 | """ 40 | Tokenize txt using spacy. Fit for use with sklearn CountVectorizer. 41 | 42 | Args: 43 | txt (str): text to be tokenized 44 | 45 | Returns: 46 | terms_tagged_list (list of str): tokens extracted from text 47 | 48 | Examples: 49 | >>> from dsconcept.model import spacy_tokenizer 50 | >>> from sklearn.feature_extraction.text import CountVectorizer 51 | >>> txt = "The ship hung in the sky much the same way bricks don't." 52 | >>> doc_tokens = spacy_tokenizer(txt) 53 | >>> doc_tokens 54 | ['ship :: NOUN', 55 | 'sky :: NOUN', 56 | 'way :: NOUN', 57 | 'brick :: NOUN', 58 | 'the ship :: NOUN_CHUNK'] 59 | >>> v = CountVectorizer(txt, tokenizer=spacy_tokenizer) 60 | >>> v.fit_transform([txt]) 61 | >>> v.vocabulary_ 62 | {'ship :: NOUN': 1, 'sky :: NOUN': 2, 'way :: NOUN': 3, 'brick :: NOUN': 0} 63 | """ 64 | doc = nlp(txt) 65 | terms_tagged = extract_from_doc(doc) 66 | terms_tagged_list = [f"{term} :: {tag}" for term, tag in terms_tagged.items()] 67 | return terms_tagged_list 68 | 69 | 70 | def should_keep(w, desired_parts_of_speech): 71 | desiredPOS = w.pos_ in desired_parts_of_speech 72 | notStop = not w.is_stop 73 | notPerc = w.lemma_ not in ["%"] 74 | return desiredPOS and notStop and notPerc 75 | 76 | 77 | def extract_from_doc(doc): 78 | """ 79 | Extract features from a spacy doc. 
80 | 81 | Args: 82 | doc (spacy.doc): a doc processed by the spacy 'en' model 83 | 84 | Returns: 85 | terms_tagged (dict): features with their respective tags 86 | 87 | Examples: 88 | >>> from dsconcept.model import extract_from_doc 89 | >>> import spacy 90 | >>> nlp = spacy.load('en_core_web_sm') 91 | >>> txt = "The ship hung in the sky much the same way bricks don't." 92 | >>> doc = nlp(txt) 93 | >>> features = extract_from_doc(doc) 94 | >>> features 95 | {'ship': 'NOUN', 96 | 'sky': 'NOUN', 97 | 'way': 'NOUN', 98 | 'brick': 'NOUN', 99 | 'the ship': 'NOUN_CHUNK'} 100 | """ 101 | # TODO: change this function such that it processes better but maintains the same interface. 102 | terms_tagged = dict() 103 | 104 | desired_parts_of_speech = ["NOUN", "PROPN"] 105 | # Get any 1-gram terms which are not % signs, or stop words. 106 | terms = {w.lemma_: w.pos_ for w in doc if should_keep(w, desired_parts_of_speech)} 107 | terms_tagged.update(terms) 108 | 109 | # Lemmatize each gram and join with a space. 110 | noun_chunks = { 111 | " ".join([w.lemma_ for w in nc if not w.is_stop]): nc.label_ 112 | for nc in doc.noun_chunks 113 | } 114 | # filter our noun chunks that are already in terms set and not in excluded_list. 115 | excluded_list = ["-PRON-", ""] 116 | noun_chunks_filtered = { 117 | w.strip(): "NOUN_CHUNK" 118 | for w, lab in noun_chunks.items() 119 | if (w not in terms.keys()) and (w not in excluded_list) 120 | } 121 | terms_tagged.update(noun_chunks_filtered) 122 | 123 | # TODO: entities take precedence over noun chunks 124 | # Get entities from text and remove collisions with terms and noun chunks. 
125 | ent_excluded_set = ["ORDINAL", "CARDINAL", "QUANTITY", "DATE", "PERCENT"] 126 | ents = {e.lemma_: e.label_ for e in doc.ents if e.label_ not in ent_excluded_set} 127 | ents_filtered = { 128 | ent: "ENT" 129 | for ent, lab in ents.items() 130 | if ent not in terms.keys() and ent not in noun_chunks_filtered.keys() 131 | } 132 | terms_tagged.update(ents_filtered) 133 | 134 | # Add acronyms which have definitions. 135 | # These acronyms could create Noise if they are not good. Maybe better to use their definitions. 136 | # This schema will only pull out identifical definitions. No lemmatizing, no fuzzy matching. 137 | # TODO: add lemmatizing and fuzzy matching for acrnoyms. This code exists in acronyms project. 138 | acronyms_with_defs = acronyms_and_definitions(doc) 139 | acronyms_filtered = { 140 | "{} - {}".format(ac, definition): "ACRONYM" 141 | for ac, definition in acronyms_with_defs.items() 142 | if definition != "" 143 | } 144 | terms_tagged.update(acronyms_filtered) 145 | 146 | return terms_tagged 147 | 148 | 149 | def extract_features_from_abstracts( 150 | descriptions, feature_outfile, batch_size=1000, n_threads=cpu_count(), total=None 151 | ): 152 | """ 153 | Generate features from input batch of abstracts. 154 | 155 | Args: 156 | descriptions (list of str): list of descriptions 157 | feature_outfile (str): output file for features jsonlines 158 | batch_size (int): how many docs to process in a batch 159 | n_threads (int): number of threads to process with 160 | total (int): total number of description to optionally pass to tqdm for a better loading bar 161 | 162 | Returns: 163 | no_descriptions (int): hown many descriptions were processed 164 | 165 | Examples: 166 | >>> from dsconcept.model import extract_features_from_abstracts 167 | >>> import json 168 | >>> 169 | >>> abstract1 = " A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools." 
170 | >>> abstract2 = "Since we decided a few weeks ago to adopt the leaf as legal tender, we have, of course, all become immensely rich." 171 | >>> abstracts = [abstract1, abstract2] 172 | >>> 173 | >>> feature_outfile = 'data/tmp_features.txt' 174 | >>> 175 | >>> extract_features_from_abstracts(abstracts, feature_outfile, batch_size=1, n_threads=1) 176 | >>> 177 | >>> with open(feature_outfile, 'r') as f0: 178 | >>> content = f0.readlines() 179 | >>> features = [json.loads(line) for line in content] 180 | >>> features 181 | [{'mistake': 'NOUN', 182 | 'people': 'NOUN', 183 | 'ingenuity': 'NOUN', 184 | 'fool': 'NOUN', 185 | 'a common mistake': 'NOUN_CHUNK', 186 | 'complete fool': 'NOUN_CHUNK'}, 187 | {'week': 'NOUN', 188 | 'leaf': 'NOUN', 189 | 'tender': 'NOUN', 190 | 'course': 'NOUN', 191 | 'legal tender': 'NOUN_CHUNK'}] 192 | """ 193 | 194 | LOG.info("Extracting features to {}".format(feature_outfile)) 195 | no_descriptions = 0 196 | with open(feature_outfile, "w") as f0: 197 | for doc in tqdm( 198 | nlp.pipe(descriptions, batch_size=batch_size, n_threads=n_threads,), 199 | total=total, 200 | ): 201 | json.dump(extract_from_doc(doc), f0) # each line is valid json 202 | f0.write("\n") 203 | no_descriptions += 1 204 | 205 | LOG.info("Extracted feature sets to {}".format(feature_outfile)) 206 | return no_descriptions 207 | 208 | 209 | class FeatureExtractor: 210 | def __init__(self): 211 | """ 212 | A term extractor. 
213 | 214 | Examples: 215 | >>> from dsconcept.model import FeatureExtractor 216 | >>> extractor = FeatureExtractor() 217 | """ 218 | self._features = list() 219 | self.term_types = dict() 220 | self.feature_counts = Counter() 221 | 222 | @property 223 | def features(self): 224 | return self._features 225 | 226 | @features.setter 227 | def features(self, value): 228 | self._features = value 229 | self.term_types = { 230 | term_type 231 | for feature_set in self._features 232 | for term_type in feature_set.values() 233 | } 234 | all_features = [ 235 | feature 236 | for feature_set in self._features 237 | for feature, val in feature_set.items() 238 | ] 239 | self.feature_counts = Counter(all_features) 240 | 241 | @staticmethod 242 | def from_corpus_to_jsonlines( 243 | in_corpus, out_features, abstract_field, batch_size=1000, n_threads=cpu_count() 244 | ): 245 | """ 246 | 247 | Args: 248 | in_corpus (pathlib.Path | str): input path to json file containing corpus 249 | out_features (pathlib.Path | str): output path for features json lines file. 250 | abstract_field (str): name of abstract field for corpus 251 | batch_size (int): size of batch to use when multithreading using spacy's nlp.pipe 252 | n_threads (int): number of threads to use when multithreading using spacy's nlp.pipe 253 | 254 | Returns: 255 | n_descriptions (int): the number of abstracts in the corpus 256 | 257 | """ 258 | 259 | n_lines = file_len(in_corpus) 260 | with open(in_corpus, "r") as f0: 261 | record_generator = (json.loads(l) for l in f0.readlines()) 262 | text_generator = (r[abstract_field] for r in record_generator) 263 | n_descriptions = extract_features_from_abstracts( 264 | text_generator, out_features, batch_size, n_threads, total=n_lines 265 | ) 266 | return n_descriptions 267 | 268 | def from_jsonlines(self, in_features): 269 | """ 270 | Load features from jsonlines. 
271 | 272 | Args: 273 | in_features (pathlib.Path | str): path to input jsonlines features file 274 | 275 | Returns: 276 | in_features (pathlib.Path | str): path to input jsonlines features file 277 | 278 | """ 279 | with open(in_features, "r") as f0: 280 | content = ( 281 | f0.readlines() 282 | ) # each line is json formatted, but whole file is not. 283 | self.features = [json.loads(line) for line in content] 284 | return in_features 285 | 286 | def to_jsonlines(self, out_features): 287 | """ 288 | Output features to jsonlines. 289 | 290 | Args: 291 | out_features (pathlib.Path | str): output path to features jsonlines file 292 | 293 | Returns: 294 | out_features (pathlib.Path | str): output path to features jsonlines file 295 | 296 | """ 297 | with open(out_features, "w") as f0: 298 | for feature_set in self.features: 299 | json.dump(feature_set, f0) # each line is valid json 300 | f0.write("\n") 301 | return out_features 302 | 303 | def weight_terms(self, weights: Dict[str, int]): 304 | """ 305 | Weights features according to tag type. 306 | 307 | Args: 308 | weights (dict of str): mappings from term types to their weights 309 | 310 | Returns: 311 | weighted_features (list of dict): features with mappings to weights instead of term types 312 | 313 | Examples 314 | -------- 315 | >>> weights = {'NOUN': 1, 'NOUN_CHUNK': 2} 316 | >>> weighted_features = tm.weight_terms(weights) 317 | >>> weighted_features 318 | [{'mistake': 1, 319 | 'people': 1, 320 | 'ingenuity': 1, 321 | 'fool': 1, 322 | 'a common mistake': 2, 323 | 'complete fool': 2}, 324 | {'week': 1, 'leaf': 1, 'tender': 1, 'course': 1, 'legal tender': 2}] 325 | """ 326 | assert type(weights) is dict, "Weights must be dict: {}".format(weights) 327 | if self.term_types > weights.keys(): 328 | LOG.warning( 329 | "Term types without a specified weight will be omitted from returned feature sets." 
330 | ) 331 | elif self.term_types < weights.keys(): 332 | LOG.warning( 333 | "More term types specified then those which exist in corpus. Ignoring excess." 334 | ) 335 | weighted_features = [ 336 | weight_terms_inner(doc_features, weights) for doc_features in self.features 337 | ] 338 | return weighted_features 339 | 340 | def limit_features( 341 | self, 342 | weighted_features, 343 | feature_min, 344 | feature_max, 345 | topic=None, 346 | doc_topic_matrix=None, 347 | ): 348 | """ 349 | Cull features. 350 | 351 | Args: 352 | weighted_features (list of dict): features with assigned weights 353 | feature_min (int): features which have in-corpus frequencies under feature_min are excluded. 354 | feature_max (float): features which occur in greater than this percentage of documents are excluded. 355 | topic (int | None): if specified, only return feature sets with maximum probability to be in this topic. 356 | doc_topic_matrix (numpy.ndarray): topic probability distributions for each document in corpus. 357 | 358 | Returns: 359 | weighted_limited (list): limited features with assigned weights 360 | 361 | Examples: 362 | >>> limited_features = tm.limit_features_for_X(weighted_features, feature_min=1, feature_max=0.99) 363 | """ 364 | assert (feature_max > 0.0) and ( 365 | feature_max <= (1.0) 366 | ), "feature_max should be float in (0,1]" 367 | feature_ex = { 368 | feature: occurrence 369 | for feature, occurrence in self.feature_counts.items() 370 | if (occurrence >= feature_min) 371 | and (occurrence / len(self.features) < feature_max) 372 | } 373 | 374 | weighted_limited = [ 375 | { 376 | feature: val 377 | for feature, val in feature_set.items() 378 | if feature in feature_ex 379 | } 380 | for feature_set in weighted_features 381 | ] 382 | 383 | if topic is not None: 384 | assert doc_topic_matrix is not None, LOG.error( 385 | "Must supply doc_topic_matrix when using topic model segmentation." 
386 | ) 387 | LOG.info(f"Segmenting vectorizer and matrix for topic {topic}.") 388 | print("here") 389 | in_topic_index = [ 390 | i for i, distr in enumerate(doc_topic_matrix) if distr.argmax() == topic 391 | ] 392 | weighted_limited = [weighted_limited[i] for i in in_topic_index] 393 | 394 | return weighted_limited 395 | 396 | 397 | def weight_terms_inner(doc_features, weights): 398 | """ 399 | 400 | Args: 401 | doc_features (dict): features with assigned tags 402 | weights (dict): tag to weight mappings 403 | 404 | Returns: 405 | weighted_terms (dict): features with assigned weights 406 | 407 | Examples 408 | >>> from dsconcept.model import weight_terms_inner 409 | >>> features = {'ship': 'NOUN', 'sky': 'NOUN', 'way': 'NOUN', 'brick': 'NOUN', 'the ship': 'NOUN_CHUNK'} 410 | >>> weights = {'NOUN': 1, 'NOUN_CHUNK': 3} 411 | >>> weighted_terms = weight_terms_inner(features, weights) 412 | >>> weighted_terms 413 | {'ship': 1, 'sky': 1, 'way': 1, 'brick': 1, 'the ship': 3} 414 | """ 415 | weighted_terms = {} 416 | for pos0, weight in weights.items(): 417 | updated_dict = {w: weight for w, pos in doc_features.items() if pos == pos0} 418 | weighted_terms.update(updated_dict) 419 | 420 | return weighted_terms 421 | 422 | 423 | class ConceptExtractor: 424 | def __init__(self): 425 | """ 426 | Information about relationship between concepts/keywords and corpus. 
427 | 428 | Examples: 429 | >>> from dsconcept.model import ConceptExtractor 430 | >>> kwd_sets = [['Zaphod', 'Arthur'], ['Arthur'], ['Zaphod'], ['Heart of Gold']] 431 | >>> info = ConceptExtractor.concept_sets = kwd_sets 432 | >>> info.concepts 433 | {'arthur', 'heart of gold', 'zaphod'} 434 | """ 435 | self._concept_sets = [] 436 | self.raw2lemma = {} 437 | self.lemma2raw = {} 438 | self.lemmatizer = None 439 | self.concepts_frequencies = Counter() 440 | self.concepts = set() 441 | self.concept_index_mapping = {} 442 | 443 | @property 444 | def concept_sets(self): 445 | return self._concept_sets 446 | 447 | @concept_sets.setter 448 | def concept_sets(self, value): 449 | """ 450 | Sets concepts_sets and the attributes derived from it. 451 | 452 | Args: 453 | value (list of list of str): A list of lists of strings; each string being a concept, 454 | each set in the larger list corresponding to a document which has the tags seen in the set. 455 | """ 456 | self._concept_sets = value 457 | LOG.debug("Extracting raw keywords as concepts.") 458 | all_concepts = [ 459 | concept 460 | for concept_set in tqdm(self._concept_sets) 461 | for concept in concept_set 462 | if concept.strip() != "" 463 | ] 464 | raw_concepts = set(all_concepts) 465 | 466 | LOG.debug("Lemmatizing {} raw concepts.".format(len(raw_concepts))) 467 | concepts = [c.lower() for c in raw_concepts] 468 | 469 | self.raw2lemma = {rc: c for rc, c in zip(raw_concepts, concepts)} 470 | lookups = Lookups() 471 | lookups.add_table("lemma_lookup", self.raw2lemma) 472 | self.lemmatizer = Lemmatizer(lookups) 473 | self.lemma2raw = {v: k for k, v in self.raw2lemma.items()} 474 | lemma_concepts = [ 475 | self.lemmatizer(concept, "NOUN")[0] for concept in all_concepts 476 | ] 477 | self.concepts_frequencies = Counter(lemma_concepts) 478 | self.concepts = set(lemma_concepts) 479 | self._fit_concept_indices() 480 | 481 | def _fit_concept_indices(self): 482 | kwd_sets_lemmas = [ 483 | [self.lemmatizer(kwd, "NOUN")[0] 
for kwd in kwd_set] 484 | for kwd_set in self.concept_sets 485 | ] 486 | concepts_with_inds = dict() 487 | for i, kwd_set in enumerate(kwd_sets_lemmas): 488 | for kwd in kwd_set: 489 | if kwd not in concepts_with_inds: 490 | concepts_with_inds[kwd] = [i] 491 | else: 492 | concepts_with_inds[kwd].append(i) 493 | self.concept_index_mapping = concepts_with_inds 494 | 495 | def from_corpus(self, in_corpus, concept_field): 496 | """ 497 | Extract concepts from input json corpus. 498 | 499 | Args: 500 | in_corpus (pathlike): path to input json-formatted corpus from which to extract concepts 501 | concept_field (str): the name of the concept field 502 | """ 503 | with open(in_corpus, "r") as f0: 504 | record_generator = (json.loads(l) for l in f0.readlines()) 505 | concept_sets = [r[concept_field] for r in record_generator] 506 | with_concepts = [i for i, cs in enumerate(concept_sets) if cs is not []] 507 | assert len(with_concepts) > 0, LOG.error( 508 | f'"{concept_field}" not present in corpus.' 509 | ) 510 | LOG.debug(f"{len(with_concepts)} docs in corpus with {concept_field}.") 511 | self.concept_sets = concept_sets 512 | 513 | def to_jsons(self, out_indices, out_raw2lemma): 514 | """ 515 | Output indices and raw2lemma dicts to json files. 
516 | 517 | Args: 518 | out_indices (pathlib.Path): path to output file containing indices for concepts 519 | out_raw2lemma (pathlib.Path): path to output file containing mappings from concepts to their lemmas 520 | 521 | Returns: 522 | out_indices (pathlib.Path): path to output file containing indices for concepts 523 | out_raw2lemma (pathlib.Path): path to output file containing mappings from concepts to their lemmas 524 | 525 | """ 526 | with open(out_indices, "w") as f0: 527 | json.dump(self.concept_index_mapping, f0) 528 | with open(out_raw2lemma, "w") as f0: 529 | json.dump(self.raw2lemma, f0) 530 | return out_indices, out_raw2lemma 531 | 532 | def from_jsons( 533 | self, in_indices, in_raw2lemma 534 | ): # a little strange because it does not fill in all attributes 535 | """ 536 | Load index and raw2lemma dictionaries into empty ConceptExtractor 537 | 538 | Args: 539 | in_indices (): 540 | in_raw2lemma (): 541 | """ 542 | with open(in_indices, "r") as f0: 543 | self.concept_index_mapping = json.load(f0) 544 | with open(in_raw2lemma, "r") as f0: 545 | self.raw2lemma = json.load(f0) 546 | lookups = Lookups() 547 | lookups.add_table("lemma_lookup", self.raw2lemma) 548 | self.lemmatizer = Lemmatizer(lookups) 549 | self.lemma2raw = {v: k for k, v in self.raw2lemma.items()} 550 | self.concepts = self.concept_index_mapping.keys() 551 | tmp_frequencies = { 552 | concept: len(index) for concept, index in self.concept_index_mapping.items() 553 | } 554 | self.concepts_frequencies = Counter(tmp_frequencies) 555 | 556 | def get_top_concepts(self, min_freq=500): 557 | """ 558 | 559 | Args: 560 | min_freq (int): occurrence threshold for concepts 561 | 562 | Returns: 563 | top_concepts(dict): a subset of the 564 | 565 | Examples: 566 | >>> info.get_top_concepts(2) 567 | >>> info.top_concepts 568 | ['zaphod', 'arthur'] 569 | """ 570 | LOG.info(f"Getting indices for concepts with frequency >= {min_freq}.") 571 | top_concepts = { 572 | concept: index 573 | for concept, index 
in self.concept_index_mapping.items() 574 | if len(index) >= min_freq 575 | } 576 | return top_concepts 577 | -------------------------------------------------------------------------------- /src/dsconcept/get_metrics.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import logging 4 | from math import ceil 5 | from multiprocessing import cpu_count 6 | from pathlib import Path 7 | from typing import List, Dict, Tuple 8 | 9 | import dask 10 | import h5py 11 | import joblib 12 | import numpy as np 13 | import pandas as pd 14 | from dask.diagnostics import ProgressBar 15 | from sklearn.feature_extraction import DictVectorizer 16 | from sklearn.metrics import ( 17 | accuracy_score, 18 | roc_auc_score, 19 | recall_score, 20 | precision_score, 21 | ) 22 | from sklearn.model_selection import GridSearchCV 23 | from tqdm import tqdm as tqdm 24 | from tempfile import NamedTemporaryFile, TemporaryDirectory 25 | 26 | import dsconcept.model as ml 27 | 28 | logging.basicConfig(level=logging.INFO) 29 | LOG = logging.getLogger(__name__) 30 | LOG.setLevel(logging.INFO) 31 | 32 | PRED_LIST_TYPE = List[List[Tuple[str, float]]] 33 | 34 | 35 | def get_cat_inds( 36 | categories: List[str], cat_preds: np.array, t: float = 0.5 37 | ) -> Dict[str, np.array]: 38 | """ 39 | Apply a threshold to get documents indices corresponding to each category. 40 | 41 | Args: 42 | categories: list of categories which are columns of the cat_preds array 43 | cat_preds: array of scores for each category for each document 44 | ([documents, categories]) 45 | t: threshold over which a category is determined to be relevant 46 | to a given document 47 | 48 | Returns: 49 | all_cat_inds: dictionary with keys which are categories. 50 | Values are index of documents which apply to each category. 
51 | 52 | Examples: 53 | >>> from get_metrics import get_cat_inds 54 | >>> import numpy as np 55 | >>> cats = ['physics', 'geology'] 56 | >>> cat_preds = np.array([[0.4, 0.8], [0.5, 0.6], [0.9, 0.3]]) 57 | >>> get_cat_inds(cats, cat_preds, t=0.5) 58 | {'physics': array([2]), 'geology': array([0, 1])} 59 | """ 60 | all_cat_inds = {} 61 | for i, cat in enumerate(categories): 62 | if cat == "": 63 | continue 64 | x = cat_preds[:, i] 65 | g_args = np.argwhere(x > t) 66 | if g_args.shape[0] == 0: 67 | cat_inds = np.array([]) 68 | else: 69 | cat_inds = np.stack(np.argwhere(x > t), axis=1)[0] 70 | all_cat_inds[cat] = cat_inds 71 | return all_cat_inds 72 | 73 | 74 | def f_score(r: float, p: float, b: int = 1): 75 | """ 76 | Calculate f-measure from recall and precision. 77 | 78 | Args: 79 | r: recall score 80 | p: precision score 81 | b: weight of precision in harmonic mean 82 | 83 | Returns: 84 | val: value of f-measure 85 | """ 86 | try: 87 | val = (1 + b ** 2) * (p * r) / (b ** 2 * p + r) 88 | except ZeroDivisionError: 89 | val = 0 90 | return val 91 | 92 | 93 | def get_mets( 94 | i: int, 95 | synth_preds: np.array, 96 | target_vals: np.array, 97 | con_with_clf: np.array, 98 | pbar=None, 99 | ) -> dict: 100 | """ 101 | Get various metrics for the given arrays. 102 | # 103 | TODO: just pass in the already sliced synth_preds, Y, and con_with_clf? 
104 | 105 | Args: 106 | i: index for the given concept 107 | synth_preds: arrays of predictions for each document and each concept 108 | target_vals: true values for each document and concept 109 | con_with_clf: arrays of concepts corresponding 110 | to columns synth_preds and target_vals 111 | 112 | Returns: 113 | metrics: metric records for the given concept 114 | """ 115 | tmp_y_pred = synth_preds[:, i] 116 | tmp_y_pred_bool = [1 if v > 0.5 else 0 for v in tmp_y_pred] 117 | tmp_y_test = target_vals[:, i] 118 | p = precision_score(tmp_y_test, tmp_y_pred_bool) 119 | r = recall_score(tmp_y_test, tmp_y_pred_bool) 120 | f = f_score(r, p) 121 | accuracy = accuracy_score(tmp_y_test, tmp_y_pred_bool) 122 | try: 123 | roc_auc = roc_auc_score(tmp_y_test, tmp_y_pred) 124 | except ValueError: # why does this happen? 125 | roc_auc = np.nan 126 | metrics = { 127 | "concept": con_with_clf[i], 128 | "accuracy": accuracy, 129 | "f1": f, 130 | "precision": p, 131 | "recall": r, 132 | "roc_auc": roc_auc, 133 | } 134 | if pbar is not None: 135 | pbar.update(1) 136 | return metrics 137 | 138 | 139 | def synth_mean( 140 | kwd_preds_tmp: np.array, doc_index: int, concept_index: int, non_zero_cats: list, 141 | ) -> float: 142 | """ 143 | Get the mean of nonzero predictions for given concept and given document. 144 | # TODO: get the precise matrix outside of function? Then pass in? 
145 | 146 | Args: 147 | kwd_preds_tmp: 3D array of predictions 148 | [categories, documents, concepts] 149 | doc_index: index of test document 150 | concept_index: index of concept 151 | non_zero_cats: categories for which this concept has nonzero prediction 152 | 153 | Returns: 154 | mean: mean of nonzero predictions for this concept for this document 155 | """ 156 | if len(non_zero_cats) != 0: 157 | mean = np.mean(kwd_preds_tmp[non_zero_cats, doc_index, concept_index]) 158 | else: 159 | mean = np.nan 160 | return mean 161 | 162 | 163 | def synth_max( 164 | kwd_preds_tmp: np.array, doc_index: int, concept_index: int, non_zero_cats: list, 165 | ) -> float: 166 | """ 167 | Get the max of nonzero predictions for given concept and given document. 168 | # TODO: nearly same as above function. Just pass in the np.nanmax or mean as args and collapse into one function? 169 | """ 170 | if len(non_zero_cats) != 0: 171 | val = np.nanmax(kwd_preds_tmp[non_zero_cats, doc_index, concept_index]) 172 | else: 173 | val = np.nan 174 | return val 175 | 176 | 177 | def get_means_for_one_doc( 178 | doc_index: int, 179 | all_cat_inds: Dict[str, np.array], 180 | kwd_preds_tmp: np.array, 181 | categories: List[str], 182 | no_cat_ind: int, 183 | only_cat: bool = False, 184 | synth_strat: str = "mean", 185 | pbar=None, 186 | ) -> np.array: 187 | """ 188 | Get mean of nonzero concept predictions for each concepts 189 | in relevant categories for given doc. 190 | 191 | Args: 192 | doc_index: index of given document 193 | all_cat_inds: dictionary with keys which are categories. 194 | Values are index of documents which apply to each category. 
195 | kwd_preds_tmp: array of all predictions 196 | [categories, documents, concepts] 197 | categories: list of categories 198 | no_cat_ind: index in categories list of the blank category "" 199 | only_cat: Only use category classifier or mixin the no category classifiers 200 | synth_strat: either "mean" or "max" 201 | # TODO: just pass a function instead of string? 202 | 203 | Returns: 204 | kwd_vals: array of synthesizes keyword prediction values 205 | for given document 206 | """ 207 | cats = [ 208 | cat for cat, inds in all_cat_inds.items() if doc_index in inds 209 | ] # get category by index instead? means all_cat index should be by index 210 | cat_inds = [categories.index(cat) for cat in cats] 211 | if only_cat is False: 212 | cat_inds.append(no_cat_ind) 213 | # ^ also average with the no-topic set, make this a decision? 214 | kwd_vals = [] 215 | for concept_index in range(kwd_preds_tmp.shape[2]): 216 | non_zero_cats = np.where(kwd_preds_tmp[:, doc_index, concept_index] != 0)[0] 217 | non_zero_cats = list(set(non_zero_cats).intersection(set(cat_inds))) 218 | assert synth_strat in ["mean", "max"], LOG.exception( 219 | f'Synthesis strategy "{synth_strat}" is invalid.' 220 | ) 221 | strat = synth_mean if synth_strat == "mean" else synth_max 222 | v = strat(kwd_preds_tmp, doc_index, concept_index, non_zero_cats) 223 | kwd_vals.append(v) 224 | kwd_vals = np.array(kwd_vals) 225 | if pbar is not None: 226 | pbar.update(1) 227 | return kwd_vals 228 | 229 | 230 | def create_ground_truth( 231 | store: str, 232 | dataset: str, 233 | test_inds: np.array, 234 | train_inds: np.array, 235 | concepts_with_classifiers: np.array, 236 | kwd_ext: ml.ConceptExtractor, 237 | batch_size: int, 238 | ): 239 | """ 240 | Make an array of ground truth binary labels. 
241 | 242 | Args: 243 | store: location of h5 database 244 | dataset: name of dataset in h5 database 245 | at which to store ground_truth array 246 | test_inds: test indices in the training data 247 | train_inds: training indices in the training data 248 | concepts_with_classifiers: all concepts which have models 249 | kwd_ext: ml.ConceptExtractor with ground_truth indices for concepts 250 | batch_size: batch_size for creating ground truth for each concept 251 | 252 | Returns: 253 | store, dataset: h5 store location and dataset name 254 | """ 255 | with h5py.File(store, "a") as f0: 256 | ground_truth = f0.create_dataset( 257 | dataset, 258 | shape=(len(test_inds), len(concepts_with_classifiers)), 259 | compression="gzip", 260 | ) 261 | n_batches = np.int(np.ceil(len(concepts_with_classifiers) / batch_size)) 262 | for n in tqdm(range(n_batches)): 263 | start_batch = n * batch_size 264 | end_batch = (n + 1) * batch_size 265 | if end_batch >= len(concepts_with_classifiers): 266 | end_batch = len(concepts_with_classifiers) - 1 267 | batch_matrix = np.zeros((len(test_inds), end_batch - start_batch)) 268 | con_batch = concepts_with_classifiers[start_batch:end_batch] 269 | for i, con in enumerate(con_batch): 270 | index = kwd_ext.concept_index_mapping[con] 271 | y_full = np.zeros((len(test_inds) + len(train_inds))) 272 | y_full[index] = 1 273 | y = y_full[test_inds] 274 | batch_matrix[:, i] = y 275 | ground_truth[:, start_batch:end_batch] = batch_matrix 276 | 277 | return store, dataset 278 | 279 | 280 | # TODO: maybe make this a part of the hierarchical class 281 | def get_synth_preds( 282 | store, 283 | shape, 284 | all_cat_inds, 285 | categories, 286 | batch_size, 287 | only_cat, 288 | synth_strat, 289 | use_dask=True, 290 | con_limit=None, 291 | limit=None, 292 | pbar=None, 293 | ): 294 | with h5py.File(store, "a") as f_synth, h5py.File(store, "r") as f_preds: 295 | if "synthesis" in f_synth.keys(): 296 | del f_synth['synthesis'] 297 | 
f_synth.create_dataset("synthesis", shape) 298 | synth_preds = f_synth["synthesis"] 299 | if (limit is not None): 300 | kwd_preds = f_preds["predictions"][:, 0:limit, :] 301 | else: 302 | kwd_preds = f_preds["predictions"] 303 | n_batches = np.ceil(kwd_preds.shape[1] / batch_size) 304 | LOG.debug(f"{n_batches} batches") 305 | no_cat_ind = categories.index("") 306 | for n in range(int(n_batches)): 307 | start_batch = n * batch_size 308 | end_batch = (n + 1) * batch_size 309 | if con_limit is not None: 310 | kwd_preds_tmp = kwd_preds[0:con_limit, start_batch:end_batch, :] 311 | else: 312 | kwd_preds_tmp = kwd_preds[:, start_batch:end_batch, :] 313 | n_docs = kwd_preds_tmp.shape[1] 314 | if True: # use_dask is True: 315 | kwd_preds_tmp = dask.delayed(kwd_preds_tmp) 316 | all_cat_inds = dask.delayed(all_cat_inds) 317 | jobs = [] 318 | for doc_index in range(n_docs): 319 | # should be everything now, since '' category is included 320 | job = dask.delayed(get_means_for_one_doc)( 321 | doc_index, 322 | all_cat_inds, 323 | kwd_preds_tmp, 324 | categories, 325 | no_cat_ind, 326 | synth_strat, 327 | pbar=pbar, 328 | ) 329 | jobs.append(job) 330 | hybrid_preds = dask.compute(jobs)[0] 331 | else: 332 | hybrid_preds = [] 333 | for doc_index in range(n_docs): 334 | # should be everything now, since '' category is included 335 | v = get_means_for_one_doc( 336 | doc_index, 337 | all_cat_inds, 338 | kwd_preds_tmp, 339 | categories, 340 | no_cat_ind, 341 | only_cat, 342 | synth_strat, 343 | pbar=pbar, 344 | ) 345 | hybrid_preds.append(v) 346 | hybrid_pred_array = np.stack(hybrid_preds) 347 | if limit is not None: 348 | if limit <= end_batch: 349 | synth_preds[start_batch:limit, :] = hybrid_pred_array 350 | else: 351 | synth_preds[start_batch:end_batch, :] = hybrid_pred_array 352 | else: 353 | synth_preds[start_batch:end_batch, :] = hybrid_pred_array 354 | 355 | 356 | def load_category_models(in_cat_models: str) -> List[dict]: 357 | """ 358 | Load all category models from given 
directory 359 | 360 | Args: 361 | in_cat_models: directory where category models reside 362 | 363 | Returns: 364 | cat_clfs: A list of dictionaries, each with a category model 365 | """ 366 | LOG.info(f"Loading category classifiers from {in_cat_models}.") 367 | in_clfs = list(Path(in_cat_models).iterdir()) 368 | cat_clfs = [joblib.load(c) for c in tqdm(in_clfs)] 369 | return cat_clfs 370 | 371 | 372 | def load_concept_models(in_kwd_models: str, load: bool = True) -> Dict[Tuple[str, str], GridSearchCV]: 373 | """ 374 | Load keyword models from given directory. 375 | 376 | Args: 377 | in_kwd_models: directory with subdirs, the suffixes of which are the 378 | names of the categories (ex. topic_physics). Each of these 379 | subfolders contains binary files for concepts in that category. 380 | The classifiers trained on all documents are in a subfolder which 381 | has not suffix (ex. topic_). 382 | load: whether to load the models into memory, or just get their paths 383 | 384 | Returns: 385 | cd: Dictionary with all classifiers for each category. 
386 | """ 387 | LOG.info(f"Loading keyword classifiers from {in_kwd_models}.") 388 | cd = {} # expects no_topics with suffix '' 389 | topic_dirs = list(Path(in_kwd_models).iterdir()) 390 | total = 0 391 | for td in topic_dirs: 392 | in_clfs = list(td.iterdir()) 393 | total += len(in_clfs) 394 | pbar = tqdm(topic_dirs, total=total) 395 | for topic_dir in pbar: 396 | topic_name = topic_dir.stem.split("_")[1] # depends on opinionated path format 397 | pbar.set_description(topic_name) 398 | in_clfs = list(topic_dir.iterdir()) 399 | clfs = (joblib.load(c) for c in in_clfs) # generator for loading classifiers 400 | for c, c_loc in zip(clfs, in_clfs): 401 | if load is True: 402 | cd[topic_name, c["concept"]] = c["best_estimator_"] 403 | else: 404 | cd[topic_name, c['concept']] = c_loc 405 | pbar.update(1) 406 | return cd 407 | 408 | 409 | def make_predictions( 410 | in_cat_models, 411 | in_kwd_models, 412 | feature_matrix, 413 | out_store="test_results/store.h5", 414 | t=None, 415 | ): 416 | cat_clfs = load_category_models(in_cat_models) 417 | cd = load_concept_models(in_kwd_models) 418 | clf = HierarchicalClassifier(cat_clfs, cd) 419 | LOG.info("Predicting categories.") 420 | cat_preds = clf.predict_categories(feature_matrix) 421 | if t is not None: 422 | LOG.info("Only making predictions for keywords in predicted categories.") 423 | cat_indices = get_cat_inds(clf.categories, cat_preds, t) 424 | # TODO: add rule for when cat_indices has nothing in it! 
425 | all_kwd_preds_loc = clf._predict_keywords( 426 | feature_matrix, out_store, cat_indices 427 | ) 428 | else: 429 | LOG.info("Predicting for all keywords on all documents.") 430 | # TODO: this should call a public function 431 | all_kwd_preds_loc = clf._predict_keywords(feature_matrix, out_store) 432 | LOG.info(f"all_kwd_preds_loc={all_kwd_preds_loc}") 433 | 434 | return clf.categories, clf.concepts_with_classifiers, cat_preds 435 | 436 | 437 | class StubBestEstimator: 438 | """ 439 | Stub class for classifier's best_estimator to be used for testing. 440 | """ 441 | 442 | def init(self): 443 | pass 444 | 445 | def predict_proba(self, feature_matrix): 446 | val = np.random.rand(feature_matrix.shape[0], 2) 447 | return val 448 | 449 | 450 | def main( 451 | experiment_name, out_store, out_cat_preds, gt_batch_size, limit=None, 452 | ): 453 | LOG.info("Loading test data and models.") 454 | # TODO: paths should be put into main function 455 | test_inds = np.load(f"data/interim/{experiment_name}/test_inds.npy") 456 | train_inds = np.load(f"data/interim/{experiment_name}/train_inds.npy") 457 | feature_matrix = joblib.load(f"data/interim/{experiment_name}/feature_matrix.jbl") 458 | in_cat_models = Path(f"models/{experiment_name}/categories/models/") 459 | in_kwd_models = Path(f"models/{experiment_name}/keywords/models/") 460 | 461 | if limit is not None: 462 | LOG.info(f"Limiting to {limit} test records.") 463 | feature_matrix_test = feature_matrix.tocsc()[test_inds[0:limit], :] 464 | # TODO: How does this affect indices? 
    else:
        feature_matrix_test = feature_matrix.tocsc()[test_inds, :]

    LOG.info("Making predictions.")
    categories, concepts_with_classifiers, cat_preds, = make_predictions(
        in_cat_models, in_kwd_models, feature_matrix_test, out_store,
    )  # need t if limiting
    np.save(out_cat_preds, cat_preds)
    LOG.info("Creating ground truth data.")
    kwd_ext = ml.ConceptExtractor()  # TODO: these paths should be provided as args
    kwd_ext.from_jsons(
        f"data/interim/{experiment_name}/kwd_indices.json",
        f"models/{experiment_name}/kwd_raw2lemma.json",
    )
    # Write the ground-truth dataset next to the predictions in the same store.
    create_ground_truth(
        store=out_store,
        dataset="ground_truth",
        kwd_ext=kwd_ext,
        concepts_with_classifiers=concepts_with_classifiers,
        batch_size=gt_batch_size,
        train_inds=train_inds,
        test_inds=test_inds,
    )


def get_category_results(cat_models_dir: Path) -> pd.DataFrame:
    """
    Collect cross-validation scores for every category classifier.

    Args:
        cat_models_dir: directory containing one joblib file per category model

    Returns:
        DataFrame with one row per category: score columns plus "concept"
    """
    in_clfs = list(cat_models_dir.iterdir())
    cat_clfs = [joblib.load(c) for c in in_clfs]  # loads the classifiers
    cat_results_df = pd.DataFrame(
        [{**c["scores"], **{"concept": c["concept"]}} for c in cat_clfs]
    )
    return cat_results_df


def get_keyword_results(kwd_models_dir: Path) -> pd.DataFrame:
    """
    Collect cross-validation scores for every keyword classifier,
    grouped under their categories.

    Args:
        kwd_models_dir: directory containing one subdirectory of
            classifier joblib files per category

    Returns:
        DataFrame with one row per keyword classifier: score columns
        plus "concept" and "category"
    """
    cd = {}
    for topic_dir in kwd_models_dir.iterdir():
        in_clfs = list(topic_dir.iterdir())
        clfs = (joblib.load(c) for c in in_clfs)  # loads the classifiers
        topic_name = topic_dir.stem.split("_")[1]  # depends on opinionated path format
        cd[topic_name] = clfs

    all_records = []
    for t, clfs in tqdm(cd.items()):
        for clf in clfs:
            r = {**{"concept": clf["concept"], "category": t}, **clf["scores"]}
            all_records.append(r)
    results_df = pd.DataFrame(all_records)
    return results_df


class HierarchicalClassifier:
    """
    Hierarchical Classifier object which allows for streamlined predictions
    on suites of concept models associated with different categories.

    Attributes:
        categories: list of categories
        concepts_with_classifiers: sorted array of concepts with classifiers
        cat_concept_indices: list where each element maps onto a category.
            Each element consists of a selection of indices
            in concepts_with_classifier which occur in the given category.
        vectorizer: DictVectorizer for transforming features
    """

    def __init__(
        self, cat_clfs: List[dict], kwd_clfs: Dict[Tuple[str, str], GridSearchCV],
    ):
        """
        Set the models for categories and concepts_with_classifiers

        Args:
            cat_clfs: category classifier models
            kwd_clfs: Dictionary with keys which are tuples
                of categories and concepts, values are the classifier models
        """
        # NOTE: assignment order matters — the kwd_clfs setter reads
        # self.categories, which the cat_clfs setter creates.
        self.cat_clfs = cat_clfs
        self.kwd_clfs = kwd_clfs
        self.vectorizer = None

    @property
    def cat_clfs(self):
        """
        The category classifiers.

        Setter also creates categories attribute.
        """
        return self._cat_clfs

    @property
    def kwd_clfs(self):
        """
        Dictionary with keys which are tuples of categories and concepts,
        values are the classifier models

        Setter method creates concept_indices,
        and concepts_with_classifiers attributes.
562 | """ 563 | return self._kwd_clfs 564 | 565 | @cat_clfs.setter 566 | def cat_clfs(self, cat_clfs: List[dict]): 567 | self._cat_clfs = cat_clfs 568 | self.categories = [c["concept"] for c in self.cat_clfs] + [""] 569 | 570 | @kwd_clfs.setter 571 | def kwd_clfs(self, kwd_clfs: Dict[Tuple[str, str], dict]): 572 | self._kwd_clfs = kwd_clfs 573 | category_concepts = {} 574 | 575 | for cat in self.categories: 576 | concepts = [k[1] for k, v in kwd_clfs.items() if k[0] == cat] 577 | # concepts = [clf["concept"] for clf in kwd_clfs[cat]] 578 | category_concepts[cat] = concepts 579 | 580 | all_cat_concepts = set( 581 | c for ts, cons in category_concepts.items() for c in cons 582 | ) 583 | concepts_with_classifiers = np.sort(list(all_cat_concepts)) 584 | LOG.info(f"concepts_with_classifiers: {concepts_with_classifiers.shape[0]}") 585 | 586 | cat_concept_indices = [] 587 | for cat in self.categories: 588 | full_in_cats = np.isin(concepts_with_classifiers, category_concepts[cat]) 589 | cat_concept_cols = np.where(full_in_cats)[0] 590 | cat_concept_indices.append(cat_concept_cols) 591 | 592 | self.cat_concept_indices: List[np.array] = cat_concept_indices 593 | # shape is [categories, keywords] 594 | self.concepts_with_classifiers: np.array = concepts_with_classifiers 595 | 596 | def load_vectorizer(self, v_loc: str): 597 | """ 598 | Loads the DictVectorizer 599 | 600 | Args: 601 | v_loc: location of vectorizer 602 | """ 603 | self.vectorizer: DictVectorizer = joblib.load(v_loc) 604 | 605 | def vectorize( 606 | self, 607 | texts: List[str], 608 | weights: Dict[str, int], 609 | batch_size: int = 1000, 610 | n_threads: int = cpu_count(), 611 | ) -> Tuple[List[Dict[str, str]], np.array]: 612 | """ 613 | Transform texts into a matrix of features. 
614 | 615 | Args: 616 | texts: texts to transform 617 | weights: how to weight different types of features 618 | batch_size: what batch size to pass to nlp.pipe 619 | n_threads: number of threads to use 620 | 621 | Returns: 622 | feature_matrix: matrix representation of features for each document 623 | """ 624 | assert self.vectorizer is not None, LOG.exception("Must initialize vectorizer.") 625 | fe = ml.FeatureExtractor() 626 | with NamedTemporaryFile() as tmp_features_loc: 627 | tmp_features = tmp_features_loc.name 628 | ml.extract_features_from_abstracts( 629 | texts, tmp_features, batch_size, n_threads 630 | ) 631 | fe.from_jsonlines(tmp_features) 632 | weighted_features = fe.weight_terms(weights) 633 | feature_matrix = self.vectorizer.transform(weighted_features) 634 | return fe.features, feature_matrix 635 | 636 | def predict_categories(self, feature_matrix: np.array) -> np.array: 637 | """ 638 | Make predictions with category classifiers 639 | 640 | Args: 641 | feature_matrix: array of features for each document 642 | 643 | Returns: 644 | cat_preds: prediction belief values for each document 645 | """ 646 | cat_preds_list = [ 647 | clf["best_estimator_"].predict_proba(feature_matrix)[:, 1] 648 | for clf in tqdm(self.cat_clfs) 649 | ] 650 | cat_preds = np.stack(cat_preds_list, axis=1) 651 | return cat_preds 652 | 653 | def _predict_one_clf( 654 | self, feature_matrix: np.array, concept_index: int, cat: str, pbar=None, 655 | ) -> np.array: 656 | """ 657 | Make a prediction for a particular concept. 658 | 659 | Args: 660 | feature_matrix: array of features for each document 661 | concept_index: index for the given concept 662 | in concepts_with_classifiers attribute 663 | cat: name of the given category 664 | 665 | Returns: 666 | v: predictions for all documents for the given concept 667 | """ 668 | con = self.concepts_with_classifiers[concept_index] 669 | clf = self.kwd_clfs[cat, con] 670 | try: # TODO: explicit option for this rather than interpreting? 
671 | os.fspath(clf) 672 | clf = joblib.load(clf)["best_estimator_"] 673 | except TypeError: 674 | pass 675 | v = clf.predict_proba(feature_matrix)[:, 1] 676 | if pbar is not None: 677 | pbar.update(1) 678 | return v 679 | 680 | def _predict_kwds_for_cat( 681 | self, 682 | feature_matrix: np.array, 683 | cat_index: int, 684 | predictions: np.array, 685 | cat_indices: Dict[str, List[int]] = None, 686 | use_dask: bool = True, 687 | pbar: tqdm = None, 688 | ): 689 | """ 690 | Make predictions for all documents for all concepts 691 | in the given category 692 | 693 | Args: 694 | feature_matrix: array of features for each document 695 | cat_index: index in categories attribute of the given category 696 | predictions: the h5 dataset where predictions are stored 697 | cat_indices: Predicted indices where categories occur 698 | for each category 699 | use_dask: Use dask for multiprocessing 700 | pbar: tqdm progress bar 701 | """ 702 | cat = self.categories[cat_index] 703 | pbar.set_postfix(category=cat, refresh=False) 704 | if (cat_indices is not None) and (cat != ""): 705 | feature_matrix_test = feature_matrix[cat_indices[cat], :] 706 | # this could be a problem if I want everything to perfectly align. 707 | else: 708 | feature_matrix_test = feature_matrix 709 | if feature_matrix_test.shape[0] == 0: 710 | pbar.update(len(self.cat_concept_indices[cat_index])) 711 | return 0 712 | # TODO: for good bar, should walk tasks to compute total 713 | cat_concept_cols = self.cat_concept_indices[cat_index] 714 | # use the np.where here, bool index for initial setting? 
715 | if False: # use_dask is True: 716 | feature_matrix_test = dask.delayed(feature_matrix_test) 717 | jobs = [] 718 | ProgressBar().register() 719 | for concept_index in cat_concept_cols: 720 | j = dask.delayed(self._predict_one_clf)( 721 | feature_matrix_test, concept_index, cat, pbar 722 | ) 723 | jobs.append(j) 724 | vals = dask.compute(jobs)[0] 725 | else: 726 | vals = [] 727 | for concept_index in cat_concept_cols: 728 | val = self._predict_one_clf( 729 | feature_matrix_test, concept_index, cat, pbar 730 | ) 731 | vals.append(val) 732 | if (cat_indices is not None) and (cat is not ""): 733 | # need to correct indices, zeros in places with no predictions 734 | # TODO: determine if this patching activity 735 | # takes longer than just predicting on more 736 | new_vals = [] 737 | for v in vals: 738 | new_v = np.zeros(feature_matrix.shape[0]) 739 | new_v[cat_indices[cat]] = v 740 | new_vals.append(new_v) 741 | vals = new_vals 742 | # TODO: below will not work with cat_inds 743 | if len(vals) > 0: 744 | topic_preds_sub = np.stack(vals, axis=1) 745 | predictions[cat_index, :, cat_concept_cols] = topic_preds_sub 746 | 747 | def _predict_keywords( 748 | self, 749 | feature_matrix: np.array, 750 | store: str, 751 | cat_indices: Dict[str, list] = None, 752 | only_no_topic: bool = False, 753 | use_dask: bool = True, 754 | ): 755 | """ 756 | Make keyword predictions 757 | 758 | Args: 759 | feature_matrix: array of features for each document 760 | store: location of h5 store for predictions 761 | cat_indices: Predicted indices where categories 762 | occur for each category 763 | only_no_topic: only use the models which are 764 | not associated with a category 765 | use_dask: use dask for multiprocessing 766 | 767 | Returns: 768 | store: the location of the h5 store 769 | """ 770 | all_con_checks = np.sum( 771 | np.array([a.shape[0] for a in self.cat_concept_indices]) 772 | ) 773 | if Path(store).exists(): 774 | ValueError(f"{store} already exists.") 775 | with 
h5py.File(store, "w") as f0, tqdm(total=all_con_checks) as pbar: 776 | predictions = f0.create_dataset( 777 | "predictions", 778 | ( 779 | len(self.categories), 780 | feature_matrix.shape[0], 781 | len(self.concepts_with_classifiers), 782 | ), 783 | compression="gzip", 784 | ) # [categories, docs, concepts] 785 | if only_no_topic is True: 786 | cat_index = self.categories.index("") 787 | self._predict_kwds_for_cat( 788 | feature_matrix, cat_index, predictions, cat_indices, use_dask, pbar, 789 | ) 790 | else: 791 | for cat_index in range(len(self.categories)): 792 | self._predict_kwds_for_cat( 793 | feature_matrix, 794 | cat_index, 795 | predictions, 796 | cat_indices, 797 | use_dask, 798 | pbar, 799 | ) 800 | return store 801 | 802 | def get_synth_preds( 803 | self, 804 | store: str, 805 | all_cat_inds: Dict[str, np.array], 806 | batch_size: int, 807 | only_cat: bool, 808 | synth_strat: str, 809 | use_dask: bool = True, 810 | ) -> np.array: 811 | """ 812 | Synthesize all keyword models into a single prediction score. 813 | 814 | Args: 815 | store: location of h5 database 816 | all_cat_inds: dictionary with keys which are categories. 817 | Values are index of documents which apply to each category. 818 | batch_size: batch size for synthesizing predictions 819 | only_cat: only use category classifiers in synthesis 820 | synth_strat: strategy for synthesizing category predictions 821 | use_dask: use dask for multiprocessing 822 | 823 | """ 824 | # TODO: do this without all of the intermediaries 825 | with h5py.File(store, "r") as f0: 826 | tdocs = f0["predictions"].shape[1] 827 | shape = f0["predictions"].shape[1:] 828 | with tqdm(total=tdocs) as pbar: 829 | get_synth_preds( 830 | store, 831 | shape, 832 | all_cat_inds, 833 | self.categories, 834 | batch_size, 835 | only_cat, 836 | synth_strat, 837 | use_dask, 838 | pbar=pbar, 839 | ) 840 | with h5py.File(store, "r") as f0: 841 | results = f0["synthesis"].value # TODO: optional return? 
842 | return results 843 | 844 | @staticmethod 845 | def _to_strings(tags, preds, t): 846 | all_tag_vals = [ 847 | get_tag_vals(preds[i], tags, t) for i in tqdm(range(preds.shape[0])) 848 | ] 849 | return all_tag_vals 850 | 851 | def predict( 852 | self, 853 | feature_matrix: np.array, 854 | cat_threshold: float = 0.5, 855 | concept_threshold: float = 0.5, 856 | no_categories: bool = False, 857 | only_cat: bool = False, 858 | synth_strat: str = "mean", 859 | batch_size: int = 10_000, 860 | ) -> Tuple[PRED_LIST_TYPE, PRED_LIST_TYPE]: 861 | """ 862 | Make predictions for all input texts. 863 | 864 | Args: 865 | texts: input texts for which to produce predictions 866 | cat_threhold: threshold over which to mix in category subset 867 | model predictions 868 | concept_threhold: threshold over which to return 869 | a concept prediction 870 | no_categories: whether or not to use category-specific models 871 | only_cat: only use category classifiers in synthesis 872 | synth_strat: strategy for synthesizing category concept models 873 | to produce single result. 874 | batch_size: size of batches for making predictions 875 | 876 | Returns: 877 | concept_preds: concepts and their belief scores 878 | 879 | Examples: 880 | >>> examples = ["Olympus Mons is the largest volcano in the solar system", 881 | ... "Database management is critical for information retrieval", 882 | ... 
"We used a logistic regression with batched stochastic gradient descent."] 883 | >>> weights = {'NOUN': 1, 'PROPN': 1, 'ENT': 1, 'NOUN_CHUNK':1, 'ACRONYM': 1} 884 | >>> features, feature_matrix = hclf.vectorize(examples, weights) 885 | >>> hclf.predict(feature_matrix) 886 | """ 887 | n_splits = ceil(feature_matrix.shape[0] / batch_size) 888 | r1s = [] 889 | # TODO: make temp folder and then write the file 890 | with NamedTemporaryFile() as tmp_dir: 891 | tmp_store = Path(f"{tmp_dir.name}/store.h5") 892 | cat_pred_strings = [] 893 | for n in tqdm(range(n_splits)): 894 | # TODO: Leave batching to lower methods? 895 | start = n * batch_size 896 | end = (n + 1) * batch_size 897 | matrix_slice = feature_matrix[start:end, :] 898 | cat_preds = self.predict_categories(matrix_slice) 899 | cat_inds = get_cat_inds(self.categories, cat_preds, t=cat_threshold) 900 | LOG.info(f"Predicting keywords") 901 | store_loc = self._predict_keywords( 902 | matrix_slice, 903 | tmp_store.name, 904 | cat_indices=cat_inds, 905 | use_dask=False, 906 | only_no_topic=no_categories, 907 | ) 908 | if no_categories is True: 909 | with h5py.File(store_loc) as f0: 910 | sp = f0["predictions"][-1, :, :] 911 | else: 912 | LOG.info(f"Synthesizing for each doc.") 913 | sp = self.get_synth_preds( 914 | store_loc, 915 | cat_inds, 916 | 1000000000, # TODO: more explanation here 917 | only_cat, 918 | synth_strat, 919 | use_dask=False, 920 | ) 921 | LOG.info(f"Converting to strings.") 922 | r1 = self._to_strings( 923 | self.concepts_with_classifiers, sp, concept_threshold 924 | ) 925 | cp = self._to_strings(self.categories, cat_preds, t=0.0) 926 | r1s.append(r1) 927 | cat_pred_strings.append(cp) 928 | concept_preds = [doc_preds for r1 in r1s for doc_preds in r1] 929 | all_cat_pred_strings = [ 930 | doc_preds for cp in cat_pred_strings for doc_preds in cp 931 | ] 932 | return all_cat_pred_strings, concept_preds 933 | 934 | 935 | def get_tag_vals(pred_vals: List[float], tags: List[str], t: float): 936 | 
tag_vals = [(tags[i], v) for i, v in enumerate(pred_vals) if v > t] 937 | tag_vals.sort(key=lambda x: -x[1]) 938 | return tag_vals 939 | 940 | 941 | if __name__ == "__main__": 942 | parser = argparse.ArgumentParser( 943 | description="Use category and concept models to get metrics on the test data." 944 | ) 945 | parser.add_argument("--experiment_name", help="experiment to generate metrics for") 946 | parser.add_argument("--out_store", help="h5 store in which to store results") 947 | parser.add_argument( 948 | "--out_cat_preds", help="output npy file for category predictions" 949 | ) 950 | parser.add_argument( 951 | "--batch_size", help="size of batches for creating ground truth data", type=int, 952 | ) 953 | parser.add_argument( 954 | "--limit", 955 | help="size limit for test data (for testing on smaller subset)", 956 | type=int, 957 | default=None, 958 | ) 959 | args = parser.parse_args() 960 | main( 961 | args.experiment_name, 962 | args.out_store, 963 | args.out_cat_preds, 964 | args.batch_size, 965 | args.limit, 966 | ) 967 | --------------------------------------------------------------------------------