├── .gitignore ├── .travis.yml ├── HAEP.md ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── make.bat └── source │ ├── cli.rst │ ├── conf.py │ ├── index.rst │ ├── sdk.rst │ └── tips.rst ├── htrc ├── .htrc.default ├── __init__.py ├── __main__.py ├── auth.py ├── config.py ├── hf_utils │ └── __init__.py ├── lib │ ├── __init__.py │ └── cli.py ├── metadata │ ├── __init__.py │ └── marc.py ├── mock │ ├── __init__.py │ └── volumes │ │ ├── __init__.py │ │ └── example.zip ├── models │ └── __init__.py ├── runningheaders │ └── __init__.py ├── tools │ ├── __init__.py │ ├── mallet.py │ └── topicexplorer.py ├── util │ ├── __init__.py │ └── resolve.py ├── volumes │ └── __init__.py └── workset │ ├── __init__.py │ └── __main__.py ├── setup.py ├── tests ├── __init__.py ├── data │ ├── example.csv │ ├── example.jsonld │ └── example.zip ├── test_download_cli.py ├── test_htrc_lib_cli.py ├── test_htrc_mock_volumes.py ├── test_htrc_util_resolve.py ├── test_htrc_volumes.py └── test_htrc_workset.py └── utils └── generate_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | sandbox 3 | 4 | *.ini 5 | .pypirc 6 | .idea 7 | 8 | logs/* 9 | 10 | build/ 11 | dist/ 12 | docs/build/ 13 | htrc.egg-info 14 | 15 | *~ 16 | 17 | .coverage 18 | htmlcov/ 19 | .eggs 20 | ssl-cert-trust 21 | venv/ 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.6" 5 | # command to install dependencies 6 | install: 7 | - python --version 8 | - pip --version 9 | # command to install dependencies 10 | - pip install cython wget 11 | - pip install coveralls coverage codeclimate-test-reporter 12 | - pip install . 
13 | # command to run tests 14 | script: 15 | - coverage run --include=htrc/* setup.py test 16 | - coverage report 17 | after_success: 18 | - coveralls 19 | - codeclimate-test-reporter 20 | addons: 21 | code_climate: 22 | repo_token: 0299da27c6ac280129992725e48ee5ff71ea668b755a0301ebd8374c6900b80e 23 | -------------------------------------------------------------------------------- /HAEP.md: -------------------------------------------------------------------------------- 1 | # HAEP-1: HTRC User Toolkit 2 | 3 | HTRC Analytics Enhancement Proposal 1: HTRC Workset Toolkit 4 | 5 | ## Introduction 6 | The HTRC Workset Toolkit provides a command line interface for interacting with 7 | and analyzing volumes in the HathiTrust Digital Library. It operates on the 8 | concept of a "workset". A workset is a research collection intended for 9 | consumption by an automated process for non-consumptive analysis. 10 | 11 | The tools also assist with the HTRC Data Capsule, enabling you to download volumes 12 | to the secure mode of the capsule for analysis. 13 | 14 | ## Motivation 15 | Currently, we do not have an end-user tool built around the workset paradigm. 16 | This tool allows for a workset to be downloaded and analyzed using the Data 17 | Capsule, and enables testing outside of the Data Capsule. 
18 | 19 | ## Related Work 20 | 21 | ## Proposed Change 22 | The proposed changes are stored in the GitHub repository 23 | [htrc/HTRC-WorksetToolkit](http://github.com/htrc/HTRC-WorksetToolkit), with 24 | [documentation at GitHub.io](http://htrc.github.io/HTRC-WorksetToolkit) 25 | 26 | ## User Interface 27 | The HTRC Workset Toolkit provides a command line interface for interacting with 28 | and analyzing volumes in the HathiTrust Digital Library: 29 | 30 | - Volume Download (``htrc download``) 31 | - Metadata Download (``htrc metadata``) 32 | - Pre-built Analysis Workflows (``htrc run``) 33 | - Export of volume lists (``htrc export``) 34 | 35 | Each tool operates on a *workset*, which is a collection of volumes, pages, 36 | or catalog records. 37 | 38 | A workset is referenced by a :ref:`workset path`, which is one of 7 types of 39 | identifiers. Almost any web page on http://hathitrust.org is a valid identifier, 40 | including the PageTurner view, Catalog record view, and Collection Builder 41 | collections. 42 | 43 | 44 | ## Migration and Maintenance Plan 45 | The project has complete Travis-CI unit test integration at [http://travis-ci.org/htrc/HTRC-PythonSDK]. 46 | 47 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 2012 The Obvious Corporation and contributors. 2 | 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | 16 | ``` 17 | ------------------------------------------------------------------------- 18 | Apache License 19 | Version 2.0, January 2004 20 | http://www.apache.org/licenses/ 21 | 22 | 23 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 24 | 25 | 1. Definitions. 26 | 27 | "License" shall mean the terms and conditions for use, reproduction, 28 | and distribution as defined by Sections 1 through 9 of this document. 29 | 30 | "Licensor" shall mean the copyright owner or entity authorized by 31 | the copyright owner that is granting the License. 32 | 33 | "Legal Entity" shall mean the union of the acting entity and all 34 | other entities that control, are controlled by, or are under common 35 | control with that entity. For the purposes of this definition, 36 | "control" means (i) the power, direct or indirect, to cause the 37 | direction or management of such entity, whether by contract or 38 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 39 | outstanding shares, or (iii) beneficial ownership of such entity. 40 | 41 | "You" (or "Your") shall mean an individual or Legal Entity 42 | exercising permissions granted by this License. 43 | 44 | "Source" form shall mean the preferred form for making modifications, 45 | including but not limited to software source code, documentation 46 | source, and configuration files. 47 | 48 | "Object" form shall mean any form resulting from mechanical 49 | transformation or translation of a Source form, including but 50 | not limited to compiled object code, generated documentation, 51 | and conversions to other media types. 52 | 53 | "Work" shall mean the work of authorship, whether in Source or 54 | Object form, made available under the License, as indicated by a 55 | copyright notice that is included in or attached to the work 56 | (an example is provided in the Appendix below). 
57 | 58 | "Derivative Works" shall mean any work, whether in Source or Object 59 | form, that is based on (or derived from) the Work and for which the 60 | editorial revisions, annotations, elaborations, or other modifications 61 | represent, as a whole, an original work of authorship. For the purposes 62 | of this License, Derivative Works shall not include works that remain 63 | separable from, or merely link (or bind by name) to the interfaces of, 64 | the Work and Derivative Works thereof. 65 | 66 | "Contribution" shall mean any work of authorship, including 67 | the original version of the Work and any modifications or additions 68 | to that Work or Derivative Works thereof, that is intentionally 69 | submitted to Licensor for inclusion in the Work by the copyright owner 70 | or by an individual or Legal Entity authorized to submit on behalf of 71 | the copyright owner. For the purposes of this definition, "submitted" 72 | means any form of electronic, verbal, or written communication sent 73 | to the Licensor or its representatives, including but not limited to 74 | communication on electronic mailing lists, source code control systems, 75 | and issue tracking systems that are managed by, or on behalf of, the 76 | Licensor for the purpose of discussing and improving the Work, but 77 | excluding communication that is conspicuously marked or otherwise 78 | designated in writing by the copyright owner as "Not a Contribution." 79 | 80 | "Contributor" shall mean Licensor and any individual or Legal Entity 81 | on behalf of whom a Contribution has been received by Licensor and 82 | subsequently incorporated within the Work. 83 | 84 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 85 | this License, each Contributor hereby grants to You a perpetual, 86 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 87 | copyright license to reproduce, prepare Derivative Works of, 88 | publicly display, publicly perform, sublicense, and distribute the 89 | Work and such Derivative Works in Source or Object form. 90 | 91 | 3. Grant of Patent License. Subject to the terms and conditions of 92 | this License, each Contributor hereby grants to You a perpetual, 93 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 94 | (except as stated in this section) patent license to make, have made, 95 | use, offer to sell, sell, import, and otherwise transfer the Work, 96 | where such license applies only to those patent claims licensable 97 | by such Contributor that are necessarily infringed by their 98 | Contribution(s) alone or by combination of their Contribution(s) 99 | with the Work to which such Contribution(s) was submitted. If You 100 | institute patent litigation against any entity (including a 101 | cross-claim or counterclaim in a lawsuit) alleging that the Work 102 | or a Contribution incorporated within the Work constitutes direct 103 | or contributory patent infringement, then any patent licenses 104 | granted to You under this License for that Work shall terminate 105 | as of the date such litigation is filed. 106 | 107 | 4. Redistribution. 
You may reproduce and distribute copies of the 108 | Work or Derivative Works thereof in any medium, with or without 109 | modifications, and in Source or Object form, provided that You 110 | meet the following conditions: 111 | 112 | (a) You must give any other recipients of the Work or 113 | Derivative Works a copy of this License; and 114 | 115 | (b) You must cause any modified files to carry prominent notices 116 | stating that You changed the files; and 117 | 118 | (c) You must retain, in the Source form of any Derivative Works 119 | that You distribute, all copyright, patent, trademark, and 120 | attribution notices from the Source form of the Work, 121 | excluding those notices that do not pertain to any part of 122 | the Derivative Works; and 123 | 124 | (d) If the Work includes a "NOTICE" text file as part of its 125 | distribution, then any Derivative Works that You distribute must 126 | include a readable copy of the attribution notices contained 127 | within such NOTICE file, excluding those notices that do not 128 | pertain to any part of the Derivative Works, in at least one 129 | of the following places: within a NOTICE text file distributed 130 | as part of the Derivative Works; within the Source form or 131 | documentation, if provided along with the Derivative Works; or, 132 | within a display generated by the Derivative Works, if and 133 | wherever such third-party notices normally appear. The contents 134 | of the NOTICE file are for informational purposes only and 135 | do not modify the License. You may add Your own attribution 136 | notices within Derivative Works that You distribute, alongside 137 | or as an addendum to the NOTICE text from the Work, provided 138 | that such additional attribution notices cannot be construed 139 | as modifying the License. 
140 | 141 | You may add Your own copyright statement to Your modifications and 142 | may provide additional or different license terms and conditions 143 | for use, reproduction, or distribution of Your modifications, or 144 | for any such Derivative Works as a whole, provided Your use, 145 | reproduction, and distribution of the Work otherwise complies with 146 | the conditions stated in this License. 147 | 148 | 5. Submission of Contributions. Unless You explicitly state otherwise, 149 | any Contribution intentionally submitted for inclusion in the Work 150 | by You to the Licensor shall be under the terms and conditions of 151 | this License, without any additional terms or conditions. 152 | Notwithstanding the above, nothing herein shall supersede or modify 153 | the terms of any separate license agreement you may have executed 154 | with Licensor regarding such Contributions. 155 | 156 | 6. Trademarks. This License does not grant permission to use the trade 157 | names, trademarks, service marks, or product names of the Licensor, 158 | except as required for reasonable and customary use in describing the 159 | origin of the Work and reproducing the content of the NOTICE file. 160 | 161 | 7. Disclaimer of Warranty. Unless required by applicable law or 162 | agreed to in writing, Licensor provides the Work (and each 163 | Contributor provides its Contributions) on an "AS IS" BASIS, 164 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 165 | implied, including, without limitation, any warranties or conditions 166 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 167 | PARTICULAR PURPOSE. You are solely responsible for determining the 168 | appropriateness of using or redistributing the Work and assume any 169 | risks associated with Your exercise of permissions under this License. 170 | 171 | 8. Limitation of Liability. 
In no event and under no legal theory, 172 | whether in tort (including negligence), contract, or otherwise, 173 | unless required by applicable law (such as deliberate and grossly 174 | negligent acts) or agreed to in writing, shall any Contributor be 175 | liable to You for damages, including any direct, indirect, special, 176 | incidental, or consequential damages of any character arising as a 177 | result of this License or out of the use or inability to use the 178 | Work (including but not limited to damages for loss of goodwill, 179 | work stoppage, computer failure or malfunction, or any and all 180 | other commercial damages or losses), even if such Contributor 181 | has been advised of the possibility of such damages. 182 | 183 | 9. Accepting Warranty or Additional Liability. While redistributing 184 | the Work or Derivative Works thereof, You may choose to offer, 185 | and charge a fee for, acceptance of support, warranty, indemnity, 186 | or other liability obligations and/or rights consistent with this 187 | License. However, in accepting such obligations, You may act only 188 | on Your own behalf and on Your sole responsibility, not on behalf 189 | of any other Contributor, and only if You agree to indemnify, 190 | defend, and hold each Contributor harmless for any liability 191 | incurred by, or claims asserted against, such Contributor by reason 192 | of your accepting any such warranty or additional liability. 
193 | 194 | END OF TERMS AND CONDITIONS 195 | ``` 196 | 197 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include htrc/mock/volumes/example.zip 2 | include htrc/.htrc.default 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTRC Workset Toolkit 2 | [![Supported Python Versions](https://img.shields.io/pypi/pyversions/htrc.svg)](https://pypi.python.org/pypi/htrc) 3 | [![PyPI Version](https://img.shields.io/pypi/v/htrc.svg)](https://pypi.python.org/pypi/htrc) 4 | [![Build Status](https://travis-ci.org/htrc/HTRC-WorksetToolkit.svg?branch=master)](https://travis-ci.org/htrc/HTRC-WorksetToolkit) 5 | [![Coverage Status](https://coveralls.io/repos/github/htrc/HTRC-WorksetToolkit/badge.svg?branch=master)](https://coveralls.io/github/htrc/HTRC-WorksetToolkit?branch=master) 6 | 7 | HTRC Workset Toolkit provides tools for interacting with and analyzing volumes in the HathiTrust Digital Library: 8 | 9 | - Volume Download (`htrc download`) 10 | - Metadata Download (`htrc metadata`) 11 | - Pre-built Analysis Workflows (`htrc run`) 12 | - Export of volume lists (`htrc export`) 13 | 14 | Each tool operates on a *workset*, which is a collection of volumes, pages, or catalog records. 15 | 16 | The tools also assist with the HTRC Data Capsule, enabling you to download volumes to the secure mode of the capsule for analysis. 17 | 18 | For usage instructions and documentation see [https://htrc.github.io/HTRC-WorksetToolkit/cli.html]. 19 | 20 | For developers, the Workset Toolkit provides ways to test algorithms that will be run in the secure mode of the Data Capsule. It also provides methods for accessing the bibliographic records for HathiTrust volumes and ways to resolve catalog records for multivolume collections. 
It has the following components: 21 | 22 | - An access layer for the Bibliographic API (`htrc.metadata`) 23 | - An access layer for the Data API (`htrc.volumes`) 24 | - Pre-built analysis workflows (`htrc.tools`) 25 | - Provenance tracking for verification of non-consumptive exports (`htrc.prov`) 26 | - Mock testing interface for user-machine or maintenance-mode testing of 27 | secure-mode commands (`htrc.mock`) 28 | - Utilities for record and volume resolution (`htrc.util`) 29 | 30 | For documentation of the development libraries see [https://htrc.github.io/HTRC-WorksetToolkit/sdk.html]. 31 | 32 | ## Data Capsule usage 33 | The HTRC Data Capsule allows for analysis of HathiTrust volumes. It is the only way to perform analysis on the raw OCR text of in-copyright works. 34 | 35 | New users can register and configure a data capsule by following the [HTRC Data Capsule Tutorial](https://wiki.htrc.illinois.edu/display/COM/HTRC+Data+Capsule+Tutorial). 36 | 37 | The HTRC Workset Toolkit will be pre-installed on Data Capsule images in the near future. Current data capsules will need to follow the [installation instructions](#installation-instructions). 38 | 39 | 40 | ## Installation instructions 41 | 42 | 1. Download and install [Anaconda Python](https://www.continuum.io/downloads). The HTRC Workset Toolkit is compatible with both Python 2.7 and 3.6, but we recommend using the 3.6 version for future compatibility. 43 | 44 | 2. After installing Anaconda, open a new terminal and type `pip install htrc` to install the SDK. 45 | 46 | ## Testing 47 | 48 | 1. `git clone https://github.com/htrc/HTRC-WorksetToolkit.git` 49 | 2. `cd HTRC-WorksetToolkit` 50 | 3. `python setup.py develop` 51 | 4. The `htrc` command will now refer to the code in this local repository. 52 | 5. Run the unit tests with the command: `python setup.py test` 53 | 6. 
To revert to the PyPI version: 54 | ``` 55 | pip uninstall htrc 56 | pip install htrc 57 | ``` 58 | 59 | ## Updating PyPI 60 | In order to update PyPI, you will need owner permissions, which are currently held by Samitha Liyanage and Jaimie Murdock. 61 | 62 | 1. Create a `.pypirc` containing your username and password: 63 | ``` 64 | [distutils] 65 | index-servers = 66 | pypi 67 | 68 | [pypi] 69 | repository=https://upload.pypi.org/legacy/ 70 | username:USERNAME 71 | password:PASSWORD 72 | ``` 73 | 2. Run `python setup.py sdist upload` to upload the tarball. 74 | 3. Run `python setup.py bdist_egg upload` to upload the egg file. 75 | (If `upload` command doesn't work use `twine upload dist/*`) 76 | 77 | ## Documentation 78 | For usage instructions and documentation please see: [https://htrc.github.io/HTRC-WorksetToolkit/] 79 | 80 | For a more detailed guide please see: [https://wiki.htrc.illinois.edu/x/NQBTAw.] 81 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = HTRCPythonSDK 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | latex: 18 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 19 | cd build/latex && pdflatex $(SPHINXPROJ) 20 | 21 | 22 | # Catch-all target: route all unknown targets to Sphinx using the new 23 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
24 | %: Makefile 25 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=HTRCPythonSDK 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/cli.rst: -------------------------------------------------------------------------------- 1 | HTRC Workset Toolkit 2 | ====================== 3 | The HTRC Workset Toolkit provides a command line interface for interacting with 4 | and analyzing volumes in the HathiTrust Digital Library: 5 | 6 | - Volume Download (``htrc download``) 7 | - Metadata Download (``htrc metadata``) 8 | - Pre-built Analysis Workflows (``htrc run``) 9 | - Export of volume lists (``htrc export``) 10 | 11 | Workset Path 12 | -------------- 13 | 14 | Each of these commands takes a *workset path*. 
Valid types of workset paths 15 | and examples of each are: 16 | 17 | ================================== ============================================================================== 18 | Identifier Type Example 19 | ================================== ============================================================================== 20 | HathiTrust ID mdp.39015078560078 21 | HathiTrust Catalog ID 001423370 22 | HathiTrust URL https://babel.hathitrust.org/cgi/pt?id=mdp.39015078560078;view=1up;seq=13 23 | Handle.org Volume URL https://hdl.handle.net/2027/mdp.39015078560078 24 | HathiTrust Catalog URL https://catalog.hathitrust.org/Record/001423370 25 | HathiTrust Collection Builder URL https://babel.hathitrust.org/shcgi/mb?a=listis;c=696632727 26 | Local volumes file ``/home/dcuser/Downloads/collections.txt`` 27 | ================================== ============================================================================== 28 | 29 | 30 | 31 | Volume Download 32 | -------------------- 33 | The ``htrc download`` command retrieves volumes from the HTRC Data API 34 | to the secure mode of the :ref:`HTRC Data Capsule Service`. 35 | 36 | .. note:: 37 | 38 | This command will return an error when run on a non-HTRC computer or on a 39 | Capsule running in maintenance mode. 40 | 41 | .. _HTRC Data API: https://wiki.htrc.illinois.edu/display/COM/HTRC+Data+API+Users+Guide 42 | 43 | Arguments 44 | ''''''''''' 45 | .. argparse:: 46 | :module: htrc.__main__ 47 | :func: download_parser 48 | :prog: htrc download 49 | 50 | 51 | Bibliographic API Access 52 | -------------------------- 53 | ``htrc metadata`` retrieves metadata from the `HathiTrust Bibliographic API`_. 54 | This command has no limitations on which computer or network executes it. 55 | 56 | .. _HathiTrust Bibliographic API: https://www.hathitrust.org/bib_api 57 | 58 | 59 | Arguments 60 | ''''''''''' 61 | .. 
argparse:: 62 | :module: htrc.__main__ 63 | :func: add_workset_path 64 | :prog: htrc metadata 65 | 66 | 67 | Analysis Workflows 68 | -------------------- 69 | The HTRC Workset Toolkit also provides the command line tool ``htrc run``. Like `volume 70 | download`_, the 71 | 72 | Topic Modeling 73 | '''''''''''''''' 74 | There are two implementations of LDA topic modeling supported by the 75 | 76 | 77 | Arguments 78 | ''''''''''' 79 | .. argparse:: 80 | :module: htrc.tools.mallet 81 | :func: populate_parser 82 | :prog: htrc run mallet 83 | 84 | Use Cases and Examples 85 | -------------------------------------------- 86 | 87 | Following are the use cases and examples of ``htrc`` commands inside the HTRC Data Capsule. 88 | 89 | +---------------------------------+---------------------------+ 90 | | command: ``htrc download`` | capsule mode: **secure** | 91 | +---------------------------------+---------------------------+ 92 | 93 | * Download volumes of volume id list to default path : 94 | (/media/secure_volume/workset) 95 | 96 | ``htrc download /home/dcuser/HTRC/htrc-id`` 97 | 98 | * Download volumes of hathi collection url to default path : 99 | (/media/secure_volume/workset) 100 | 101 | ``htrc download “https://babel.hathitrust.org/cgi/mb?a=listis&c=1337751722”`` 102 | 103 | * Download volumes to specific location : 104 | 105 | ``htrc download /home/dcuser/HTRC/htrc-id -o /media/secure_volume/my-workset`` 106 | 107 | * Download volumes to specific location with concatenation option - (This will concatenate all the pages of the volume into one txt file.) 
: 108 | 109 | ``htrc download /home/dcuser/HTRC/htrc-id -o /media/secure_volume/my-workset -c`` 110 | 111 | * Download specific pages from a single volume : 112 | 113 | ``htrc download -pg coo.31924089593846[5,10,15,20,25,30]`` 114 | 115 | * Download volumes and then extract headers/footers from the volumes : 116 | 117 | ``htrc download -hf /home/dcuser/HTRC/htrc-id`` 118 | 119 | * Download volumes, extract headers/footers from the volume pages then concatenate the pages - (This will concatenate all the pages of the volume into one txt file.) : 120 | 121 | ``htrc download -hfc /home/dcuser/HTRC/htrc-id`` 122 | 123 | * Download volumes, extract headers/footers from the volumes, skip downloading the .csv files containing removed headers and footers : 124 | 125 | ``htrc download -hf -s /home/dcuser/HTRC/htrc-id`` 126 | 127 | * Download volumes, extract headers/footers from volumes, change window of pages in extractor algorithm (The default is 6, lower numbers increase speed, but are less accurate) : 128 | 129 | ``htrc download -hf -w 3 /home/dcuser/HTRC/htrc-id`` 130 | 131 | * Download volumes, extract headers/footers from volumes, change minimum similarity rate for lines on pages to be considered a header or footer (Default is .7 or 70%, so if a line is 70% the same as other lines on other pages within the window of pages it is labeled a header or footer and removed) : 132 | 133 | ``htrc download -hf -msr .9 /home/dcuser/HTRC/htrc-id`` 134 | 135 | * Download volumes, extract headers/footers from volumes, change the max number of concurrent tasks (note that the only options are 1 or 2): 136 | 137 | ``htrc download -hf --parallelism 2 /home/dcuser/HTRC/htrc-id`` 138 | 139 | 140 | | 141 | +---------------------------------+-----------------------------------------------+ 142 | | command: ``htrc metadata`` | capsule mode: **secure** and **maintenance** | 143 | +---------------------------------+-----------------------------------------------+ 144 | 145 | * Download the 
metadata of volumes by giving hathi collection url : 146 | 147 | ``htrc metadata "https://babel.hathitrust.org/cgi/mb?a=listis&c=1853042514"`` 148 | 149 | * Download the metadata of volumes by giving volume id list : 150 | 151 | ``htrc metadata /home/dcuser/HTRC/htrc-id`` 152 | 153 | * Download the metadata associated with volume id : 154 | volume 1 of `The Works of Jonathan Swift`_ 155 | 156 | ``htrc metadata mdp.39015078560078`` 157 | 158 | Note that this would only retrieve the first volume. If you want to download metadata for all 8 volumes, the catalog identifier would be used: 159 | 160 | ``htrc metadata 001423370`` 161 | 162 | Each command can be used with the URL as well (*note the quote marks around each URL*): 163 | 164 | ``htrc metadata "https://babel.hathitrust.org/cgi/pt?id=mdp.39015078560078;view=1up;seq=13"`` 165 | 166 | ``htrc metadata "https://catalog.hathitrust.org/Record/001423370"`` 167 | 168 | This URL support makes it easy to browse `hathitrust.org`_ and copy links for computational analysis using the SDK. 169 | 170 | .. _The Works of Jonathan Swift: https://hdl.handle.net/2027/mdp.39015078560078 171 | .. 
_hathitrust.org: https://www.hathitrust.org/ 172 | 173 | 174 | 175 | | 176 | +---------------------------------+-----------------------------------------------+ 177 | | command: ``htrc metadata`` | capsule mode: **secure** | 178 | +---------------------------------+-----------------------------------------------+ 179 | 180 | * Download the metadata of volumes by giving already downloaded volumes path : 181 | 182 | ``htrc metadata /media/secure_volume/workset`` 183 | 184 | | 185 | +---------------------------------+-----------------------------------------------+ 186 | | command: ``htrc metadata`` | capsule mode: **maintenance** | 187 | +---------------------------------+-----------------------------------------------+ 188 | 189 | * Download the metadata of volumes by giving already downloaded volumes path - (Sample volumes are available in capsules created with ubuntu-16-04-with-sample-volumes image. Those sample volumes are available as zip files. Please unzip them before use because the metadata function gets volume ids from volume directory names.) 
: 190 | 191 | ``mkdir /home/dcuser/unzipped_volumes`` 192 | 193 | ``cp -r /home/dcuser/HTRC/data/sample_volumes/fiction/ /home/dcuser/unzipped_volumes`` 194 | 195 | ``unzip /home/dcuser/unzipped_volumes/'*.zip' && rm /home/dcuser/unzipped_volumes/*.zip`` 196 | 197 | ``htrc metadata /home/dcuser/unzipped_volumes`` 198 | 199 | | 200 | +---------------------------------+-----------------------------------------------+ 201 | | command: ``htrc export`` | capsule mode: **secure** and **maintenance** | 202 | +---------------------------------+-----------------------------------------------+ 203 | 204 | * Export volume ids from downloaded hathi collection and create workset with only volume ids : 205 | 206 | Go to the following link in the browser 207 | 208 | https://babel.hathitrust.org/cgi/mb?a=listis&c=1853042514 209 | 210 | Download metadata in tab separated format (Download Item Metadata: Tab-Delimited Text (TSV)), then - 211 | 212 | 213 | ``htrc export mb-9.txt > volumes.txt`` 214 | 215 | * Export volume ids from hathi collection url and create workset with only volume ids (works in both secure and maintenance modes) : 216 | 217 | ``htrc export "https://babel.hathitrust.org/cgi/mb?a=listis&c=1853042514" > volumes.txt`` 218 | 219 | 220 | | 221 | +---------------------------------+-----------------------------------------------+ 222 | | command: ``htrc run mallet`` | capsule mode: **secure** | 223 | +---------------------------------+-----------------------------------------------+ 224 | 225 | * Run mallet on already downloaded volumes : 226 | 227 | ``htrc run mallet /media/secure_volume/workset -k 20`` 228 | 229 | * Run mallet on volume id list : 230 | 231 | ``htrc run mallet /home/dcuser/HTRC/htrc-id -k 20`` 232 | 233 | * Run mallet on hathi collection : 234 | 235 | ``htrc run mallet "https://babel.hathitrust.org/cgi/mb?a=listis&c=1853042514" -k 20`` 236 | 237 | | 238 | +-----------------------------------+-----------------------------------------------+ 239 | | command: 
``htrc run mallet`` | capsule mode: **maintenance** | 240 | +-----------------------------------+-----------------------------------------------+ 241 | 242 | * Run mallet on already downloaded volume - (Sample volumes are available in capsules created with ubuntu-16-04-with-sample-volumes image. Those sample volumes are available as zip files. Please unzip them before use because the metadata function gets volume ids from volume directory names). 243 | 244 | ``htrc run mallet /home/dcuser/unzipped_volumes -k 20`` 245 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # HTRC Python SDK documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Jun 18 16:15:31 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 
34 | extensions = ['sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.autosectionlabel', 37 | 'sphinx.ext.todo', 38 | 'sphinx.ext.coverage', 39 | 'sphinx.ext.viewcode', 40 | 'sphinx.ext.githubpages', 41 | 'sphinxarg.ext', 42 | 'sphinxcontrib.fulltoc'] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | # source_suffix = ['.rst', '.md'] 51 | source_suffix = '.rst' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # General information about the project. 57 | project = 'HTRC Workset Toolkit' 58 | copyright = '2017, HathiTrust Research Center (Jaimie Murdock)' 59 | author = 'HathiTrust Research Center (Jaimie Murdock)' 60 | 61 | # The version info for the project you're documenting, acts as replacement for 62 | # |version| and |release|, also used in various other places throughout the 63 | # built documents. 64 | # 65 | # The short X.Y version. 66 | version = '1.0' 67 | # The full version, including alpha/beta/rc tags. 68 | release = '1.0.0b1' 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | # 73 | # This is also used if you do content translation via gettext catalogs. 74 | # Usually you set "language" from the command line for these cases. 75 | language = None 76 | 77 | # List of patterns, relative to source directory, that match files and 78 | # directories to ignore when looking for source files. 79 | # This patterns also effect to html_static_path and html_extra_path 80 | exclude_patterns = [] 81 | 82 | # The name of the Pygments (syntax highlighting) style to use. 83 | pygments_style = 'sphinx' 84 | 85 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
86 | todo_include_todos = True 87 | 88 | 89 | # -- Options for HTML output ---------------------------------------------- 90 | 91 | # The theme to use for HTML and HTML Help pages. See the documentation for 92 | # a list of builtin themes. 93 | # 94 | html_theme = 'alabaster' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # 100 | # html_theme_options = {} 101 | 102 | # Add any paths that contain custom static files (such as style sheets) here, 103 | # relative to this directory. They are copied after the builtin static files, 104 | # so a file named "default.css" will overwrite the builtin "default.css". 105 | html_static_path = ['_static'] 106 | 107 | 108 | # -- Options for HTMLHelp output ------------------------------------------ 109 | 110 | # Output file base name for HTML help builder. 111 | htmlhelp_basename = 'HTRCWorksetToolkigdoc' 112 | 113 | 114 | # -- Options for LaTeX output --------------------------------------------- 115 | 116 | latex_elements = { 117 | # The paper size ('letterpaper' or 'a4paper'). 118 | # 119 | # 'papersize': 'letterpaper', 120 | 121 | # The font size ('10pt', '11pt' or '12pt'). 122 | # 123 | # 'pointsize': '10pt', 124 | 125 | # Additional stuff for the LaTeX preamble. 126 | # 127 | # 'preamble': '', 128 | 129 | # Latex figure (float) alignment 130 | # 131 | # 'figure_align': 'htbp', 132 | } 133 | 134 | # Grouping the document tree into LaTeX files. List of tuples 135 | # (source start file, target name, title, 136 | # author, documentclass [howto, manual, or own class]). 137 | latex_documents = [ 138 | (master_doc, 'HTRCWorksetToolkit.tex', 'HTRC Workset Toolkit Documentation', 139 | 'HathiTrust Research Center (Jaimie Murdock)', 'manual'), 140 | ] 141 | 142 | 143 | # -- Options for manual page output --------------------------------------- 144 | 145 | # One entry per manual page. 
List of tuples 146 | # (source start file, name, description, authors, manual section). 147 | man_pages = [ 148 | (master_doc, 'htrc', 'HTRC Workset Toolkit Documentation', 149 | [author], 1) 150 | ] 151 | 152 | 153 | # -- Options for Texinfo output ------------------------------------------- 154 | 155 | # Grouping the document tree into Texinfo files. List of tuples 156 | # (source start file, target name, title, author, 157 | # dir menu entry, description, category) 158 | texinfo_documents = [ 159 | (master_doc, 'HTRCWorksetToolkit', 'HTRC Workset Toolkit Documentation', 160 | author, 'HTRCWorksetToolkit', '''Tools for interacting with collections of 161 | HathiTrust volumes and records.''', 162 | 'Documentation'), 163 | ] 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. HTRC Workset Toolkit documentation master file, created by 2 | sphinx-quickstart on Sun Jun 18 16:15:31 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to the HTRC Workset Toolkit's documentation! 7 | ======================================================= 8 | The HTRC Workset Toolkit provides a command line interface for interacting with 9 | and analyzing volumes in the HathiTrust Digital Library: 10 | 11 | - Volume Download (``htrc download``) 12 | - Metadata Download (``htrc metadata``) 13 | - Pre-built Analysis Workflows (``htrc run``) 14 | - Export of volume lists (``htrc export``) 15 | 16 | Each tool operates on a *workset*, which is a collection of volumes, pages, 17 | or catalog records. 18 | 19 | A workset is referenced by a :ref:`workset path`, which is one of 7 types of 20 | identifiers. 
Almost any web page on http://hathitrust.org is a valid identifier, 21 | including the PageTurner view, Catalog record view, and Collection Builder 22 | collections. 23 | 24 | The tools also assist with the HTRC Data Capsule, enabling you to download volumes 25 | to the secure mode of the capsule for analysis. 26 | 27 | More details on each command can be found on the :ref:`HTRC Workset Toolkit` page. 28 | 29 | For developers, the Workset Toolkit provides ways to test algorithms that will 30 | be run in the secure mode of the Data Capsule. It also provides methods for 31 | accessing the bibliographic records for HathiTrust volumes and ways to resolve 32 | catalog records for multivolume collections. It has the following components: 33 | 34 | - An access layer for the Bibliographic API (``htrc.metadata``) 35 | - An access layer for the Data API (``htrc.volumes``) 36 | - Pre-built analysis workflows (``htrc.tools``) 37 | - Provenance tracking for verification of non-consumptive exports (``htrc.prov``) 38 | - Mock testing interface for user-machine or maintenance-mode testing of 39 | secure-mode commands (``htrc.mock``) 40 | - Utilities for record and volume resolution (``htrc.util``) 41 | 42 | More details on each module can be found on the :ref:`HTRC Workset Toolkit 43 | Development Library` page. 44 | 45 | All source code for the HTRC Workset Toolkit is available on `GitHub`_ under an 46 | `Apache 2.0 License`_. 47 | 48 | .. _GitHub: https://github.com/htrc/HTRC-PythonSDK/ 49 | .. _Apache 2.0 License: https://github.com/htrc/HTRC-PythonSDK/blob/master/LICENSE.md 50 | 51 | 52 | Data Capsule usage 53 | ---------------------------- 54 | The HTRC Data Capsule allows for analysis of HathiTrust volumes. It is the only 55 | way to perform analysis on the raw OCR text of in-copyright works. 56 | 57 | New users can register and configure a data capsule by following the `HTRC Data 58 | Capsule Tutorial`_. 
59 | 60 | The HTRC Workset Toolkit will be pre-installed on Data Capsule images in the 61 | near future. Current data capsules will need to follow the :ref:`installation 62 | instructions`. 63 | 64 | .. _HTRC Data Capsule Tutorial: https://wiki.htrc.illinois.edu/display/COM/HTRC+Data+Capsule+Tutorial 65 | 66 | 67 | Installation instructions 68 | --------------------------- 69 | 70 | 1. Download and install `Anaconda Python`_. The HTRC Workset Toolkit is 71 | compatible with both Python 2.7 and 3.6, but we recommend using the 3.6 version 72 | for future compatibility. 73 | 74 | 2. After installing Anaconda, open a new terminal and type ``pip install htrc`` 75 | to install the SDK. 76 | 77 | .. _Anaconda Python: https://www.continuum.io/downloads 78 | 79 | 80 | Table of Contents 81 | ================================ 82 | .. toctree:: 83 | :maxdepth: 2 84 | 85 | cli 86 | sdk 87 | tips 88 | 89 | 90 | Indices and tables 91 | ==================== 92 | 93 | * :ref:`genindex` 94 | * :ref:`modindex` 95 | * :ref:`search` 96 | -------------------------------------------------------------------------------- /docs/source/sdk.rst: -------------------------------------------------------------------------------- 1 | HTRC Workset Toolkit Development Library 2 | ========================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | HTRC Data Capsule Service 9 | ------------------------------ 10 | The *HTRC Data Capsule Service* provisions virtual machines (VMs) to researchers 11 | within the HTRC secure environment. The VM and software environment (including 12 | the SDK) together form a Capsule. Each researcher has exclusive use of the 13 | Capsule for a period of weeks or months during which they can configure their 14 | own environment for performing research on HathiTrust Digital Library texts, 15 | including both in-copyright and public domain volumes. 16 | 17 | Each Capsule has both a maintenance mode and a secure mode. 
In secure 18 | mode, network access is restricted to the HTRC Data API and some HTDL 19 | resources, allowing text and image data to be downloaded to the Capsule. 20 | 21 | Any changes made on the non-secure volumes are reverted when leaving secure 22 | mode, so persistent code changes must occur in maintenance mode. The SDK 23 | addresses these connectivity issues with the ``htrc.mock`` library. 24 | 25 | 26 | 27 | Mock Testing 28 | '''''''''''''' 29 | `Mock testing`_ uses simulated objects or functions to mimic the behavior 30 | of real code in controlled ways. 31 | 32 | The HTRC Workset Toolkit implements a mock of the Data API access layer in 33 | ``htrc.mock.volumes``. The Data API server is only accessible via a Capsule 34 | in secure mode. By implementing a function with the same call signature 35 | that returns the same data types, workflows that rely on the Data API can be 36 | tested either in Capsule maintenance mode or on a user's own computer. 37 | 38 | An easy way to use this pattern is shown below. 39 | 40 | Example 41 | ''''''''' 42 | 43 | :: 44 | 45 | if __debug__: 46 | # This code will execute when running `python script.py` 47 | import htrc.mock.volumes as volumes 48 | else: 49 | # This code will execute when running `python -O script.py` 50 | # The -O argument turns on optimizations, setting __debug__ = False. 51 | import htrc.volumes as volumes 52 | 53 | # The following is just to make a running script 54 | volume_ids = ['htrc.testid'] # any list will do 55 | output_dir = 'htrc_data' # any path will do 56 | 57 | # download volumes 58 | volumes.download(volume_ids, output_dir) 59 | 60 | This script leverages use of the ``python -O`` switch, which controls the 61 | ``__debug__`` global variable: 62 | 63 | - When run in the development environment, which does not have secure 64 | access to the Data API, the program is run with ``python script.py``, 65 | setting ``__debug__ = True``. 
This means that ``volumes.download(volume_ids, 66 | output_dir)`` utilizes the function ``htrc.mock.volumes.download(volume_ids, 67 | output_dir)``. 68 | - When run in secure mode of the data capsule, the program is executed with 69 | ``python -O script.py``, setting ``__debug__ = False``. The statement 70 | ``volumes.download(volume_ids, output_dir)`` utilizes the function 71 | ``htrc.volumes.download(volume_ids, output_dir)``. 72 | 73 | 74 | .. _Mock testing: https://en.wikipedia.org/wiki/Mock_object 75 | 76 | 77 | Modules 78 | --------- 79 | 80 | `htrc.metadata` 81 | ----------------- 82 | .. automodule:: htrc.metadata 83 | :members: 84 | 85 | `htrc.mock` 86 | ----------------- 87 | .. automodule:: htrc.mock 88 | :members: 89 | 90 | `htrc.mock.volumes` 91 | ''''''''''''''''''''' 92 | .. automodule:: htrc.mock.volumes 93 | :members: 94 | 95 | `htrc.volumes` 96 | ---------------- 97 | .. automodule:: htrc.volumes 98 | :members: 99 | 100 | `htrc.util` 101 | ---------------- 102 | .. automodule:: htrc.util 103 | :members: 104 | -------------------------------------------------------------------------------- /docs/source/tips.rst: -------------------------------------------------------------------------------- 1 | HTRC Python SDK Tips 2 | ====================== 3 | 4 | This document contains a number of tips for using the CLI and SDK in conjunction 5 | with other tools. 
6 | 7 | Pretty-print of JSON data using ``jq`` 8 | -------------------------------------- 9 | The command line tool ``jq`` is very powerful when combined with the ``htrc 10 | metadata`` command, as it can be used to quickly query documents:: 11 | 12 | htrc metadata mdp.39015078560078 | jq 13 | 14 | -------------------------------------------------------------------------------- /htrc/.htrc.default: -------------------------------------------------------------------------------- 1 | [main] 2 | username = 3 | password = 4 | 5 | [data] 6 | host = dataapi-dc.htrc.indiana.edu 7 | port = 443 8 | url = / 9 | cert = 10 | key = 11 | pd_only = 12 | 13 | [oauth] 14 | host = silvermaple.pti.indiana.edu 15 | port = 9443 16 | url = /oauth2/token 17 | 18 | [jwt] 19 | token = 20 | -------------------------------------------------------------------------------- /htrc/__init__.py: -------------------------------------------------------------------------------- 1 | import htrc.metadata 2 | import htrc.volumes 3 | -------------------------------------------------------------------------------- /htrc/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Master script for HTRC Workset Toolkit. 
4 | """ 5 | from __future__ import absolute_import, division, print_function 6 | from future import standard_library 7 | standard_library.install_aliases() 8 | 9 | import os 10 | import os.path 11 | import shutil 12 | import sys 13 | from tempfile import NamedTemporaryFile 14 | 15 | from htrc.metadata import get_metadata, get_volume_metadata 16 | import htrc.volumes 17 | import htrc.workset 18 | import htrc.tools.mallet 19 | 20 | from argparse import ArgumentParser 21 | # import htrc.tools.topicexplorer 22 | from htrc.lib.cli import bool_prompt 23 | from htrc.util.resolve import * 24 | 25 | 26 | def download_parser(parser=None): 27 | if parser is None: 28 | parser = ArgumentParser() 29 | #parser.add_argument("-u", "--username", help="HTRC username") 30 | #parser.add_argument("-p", "--password", help="HTRC password") 31 | parser.add_argument("file", nargs='?', default=sys.stdin, 32 | help="Workset path[s]") 33 | parser.add_argument("-f", "--force", action='store_true', 34 | help="Remove folder if exists") 35 | parser.add_argument("-o", "--output", help="Output directory", 36 | default='/media/secure_volume/workset/') 37 | parser.add_argument("-hf", "--remove-headers-footers", action='store_true', 38 | help="Remove headers and footers from individual pages and save in a separate csv file for inspection") 39 | parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true', 40 | help="Remove headers and footers from individual pages and save in a separate csv file for inspection then concatenate pages") 41 | parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6, 42 | help="How many pages ahead does the header/footer extractor algorithm look to find potential " 43 | "matching headers/footers (higher value gives potentially more accurate results on lower " 44 | "quality OCR volumes at the expense of runtime)") 45 | parser.add_argument("-msr", "--min-similarity-ratio", required=False, type=float, metavar="N", 
default=0.7, 46 | help="The minimum string similarity ratio required for the Levenshtein distance fuzzy-matching " 47 | "algorithm to declare that two headers are considered 'the same' (the higher the value, up " 48 | "to a max of 1.0, the more strict the matching has to be; lower values allow for more " 49 | "fuzziness to account for OCR errors)") 50 | parser.add_argument("-s", "--skip-removed-hf", action='store_true', 51 | help="Skip creating a saved report of the removed headers and footers for each page for inspection") 52 | parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(), 53 | help="The max number of concurrent tasks to start when downloading or removing headers/footers") 54 | parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250, 55 | help="The max number of volumes to download at a time from DataAPI") 56 | parser.add_argument("-c", "--concat", action='store_true', 57 | help="Concatenate a volume's pages in to a single file") 58 | parser.add_argument("-m", "--mets", action='store_true', 59 | help="Add volume's METS file") 60 | parser.add_argument("-pg", "--pages",action='store_true', 61 | help="Download given page numbers of a volumes.") 62 | parser.add_argument("-t", "--token", help="JWT for volumes download.") 63 | parser.add_argument("-dh", "--datahost", help="Data API host.") 64 | parser.add_argument("-dp", "--dataport", help="Data API port.") 65 | parser.add_argument("-de", "--dataepr", help="Data API EPR.") 66 | parser.add_argument("-dc", "--datacert", help="Client certificate file for mutual TLS with Data API.") 67 | parser.add_argument("-dk", "--datakey", help="Client key file for mutual TLS with Data API.") 68 | return parser 69 | 70 | 71 | def add_workset_path(parser=None): 72 | if parser is None: 73 | parser = ArgumentParser() 74 | parser.add_argument("path", nargs='+', help="Workset path[s]") 75 | return parser 76 | 77 | 78 | def main(): 79 | parser = 
ArgumentParser() 80 | parser.add_argument('-d', '--debug', help="Print long debug messages", 81 | action='store_true') 82 | parsers = parser.add_subparsers(help="select a command") 83 | 84 | # Metadata Helpers 85 | parser_getmd = parsers.add_parser('metadata', 86 | help="Get metadata for a folder of HathiTrust volumes") 87 | add_workset_path(parser_getmd) 88 | parser_getmd.set_defaults(func='metadata') 89 | 90 | # Export Helpers 91 | parser_export = parsers.add_parser('export', 92 | help="Export the list of HathiTrust volumes") 93 | add_workset_path(parser_export) 94 | parser_export.set_defaults(func='export') 95 | 96 | # Download Helper 97 | parser_download = parsers.add_parser('download', 98 | help="Download HathiTrust volumes to disk [requires auth]") 99 | download_parser(parser_download) 100 | parser_download.set_defaults(func='download') 101 | 102 | 103 | # Run helper 104 | parser_run = parsers.add_parser('run', help="Run a built-in algorithm.") 105 | run_parsers = parser_run.add_subparsers(help="Select a command") 106 | 107 | parser_mallet = run_parsers.add_parser('mallet') 108 | htrc.tools.mallet.populate_parser(parser_mallet) 109 | parser_mallet.set_defaults(run='mallet') 110 | 111 | # parser_topicexplorer = run_parsers.add_parser('topicexplorer') 112 | # htrc.tools.topicexplorer.populate_parser(parser_topicexplorer) 113 | # parser_topicexplorer.set_defaults(run='topicexplorer') 114 | 115 | parser_run.set_defaults(func='run') 116 | 117 | args = parser.parse_args() 118 | if 'func' not in args: 119 | parser.print_help() 120 | sys.exit(1) 121 | 122 | if args.func in ['metadata', 'export']: 123 | volumes = [] 124 | if not args.path: 125 | for line in sys.stdin: 126 | volumes.append(line) 127 | else: 128 | for path in args.path: 129 | try: 130 | volumes.extend(htrc.workset.path_to_volumes(path)) 131 | except ValueError: 132 | volumes.append(path) 133 | if args.func == 'export': 134 | for volume in volumes: 135 | print(volume) 136 | if args.func == 'metadata': 
137 | metadata = get_metadata(volumes) 138 | print(json.dumps(metadata)) 139 | elif args.func == 'run': 140 | if 'run' not in args: 141 | parser_run.print_help() 142 | sys.exit(1) 143 | if args.run == 'mallet': 144 | htrc.tools.mallet.main(args.path, args.k, args.iter) 145 | # if args.run == 'topicexplorer': 146 | # htrc.tools.topicexplorer.main(args.path, args.k, args.iter) 147 | elif args.func == 'download': 148 | if os.path.exists(args.output): 149 | if args.force or bool_prompt('Folder {} exists. Delete?'.format(args.output), default=False): 150 | shutil.rmtree(args.output) 151 | os.makedirs(args.output) 152 | else: 153 | print("Please choose another output folder and try again.") 154 | sys.exit(1) 155 | 156 | if args.concat and args.remove_headers_footers: 157 | print("Cannot set both concat and remove-headers-footers") 158 | sys.exit(1) 159 | if args.concat and args.remove_headers_footers_and_concat: 160 | print("Cannot set both concat and remove-headers-footers-and-concat") 161 | sys.exit(1) 162 | if args.remove_headers_footers and args.remove_headers_footers_and_concat: 163 | print("Cannot set both remove_headers_footers and remove_headers_footers_and_concat") 164 | sys.exit(1) 165 | if args.mets and args.remove_headers_footers_and_concat: 166 | print("Cannot set both mets and remove_headers_footers_and_concat") 167 | sys.exit(1) 168 | if args.pages: 169 | if args.mets and args.concat: 170 | print("Cannot set both concat and mets with pages") 171 | sys.exit(1) 172 | if args.mets and args.remove_headers_footers_and_concat: 173 | print("Cannot set both mets and remove_headers_footers_and_concat with pages") 174 | sys.exit(1) 175 | 176 | try: 177 | resolve_and_download(args) 178 | except ValueError: 179 | print("Invalid identifier:", args.file) 180 | sys.exit(1) 181 | 182 | 183 | def resolve_and_download(args): 184 | if args.file == sys.stdin: 185 | # For use with UNIX pipes 186 | download_with_tempfile(args, sys.stdin) 187 | return 188 | 189 | elif 
os.path.exists(args.file): 190 | # For use with downloaded workset files - either in JSON or 191 | download(args) 192 | return 193 | 194 | elif (args.file.endswith('json') 195 | or args.file.endswith('jsonld') 196 | or args.file.startswith('http://') 197 | or args.file.startswith('https://')): 198 | # For use with HTRC Worksets and HT Collection Builder 199 | try: 200 | volumes = htrc.workset.load(args.file) 201 | download_with_tempfile(args, volumes) 202 | return 203 | except ValueError: 204 | # Invalid workset, continue to last block 205 | pass 206 | 207 | # Check for valid volume_id 208 | try: 209 | if parse_volume_id(args.file): 210 | volumes = [parse_volume_id(args.file)] 211 | download_with_tempfile(args, volumes) 212 | return 213 | else: 214 | raise ValueError("No Volume ID found") 215 | except ValueError: 216 | pass 217 | 218 | # Check for valid record id 219 | if parse_record_id(args.file): 220 | record_id = parse_record_id(args.file) 221 | volumes = record_id_to_volume_ids(record_id) 222 | download_with_tempfile(args, volumes) 223 | return 224 | else: 225 | # invalid 226 | raise ValueError("Not a valid ID file or workset identifier: {}".format( 227 | args.file)) 228 | 229 | 230 | def download(args): 231 | try: 232 | htrc.volumes.download(args) 233 | except OSError as e: 234 | if not os.path.exists('/media/secure_volume/'): 235 | print('Secure volume not mounted. Could not download volumes') 236 | sys.exit(1) 237 | else: 238 | print("Could not download volumes. {} {}".format(e.strerror, e.filename)) 239 | sys.exit(1) 240 | except RuntimeError as e: 241 | if not args.debug: 242 | print("Could not download volumes. 
{}".format(str(e))) 243 | sys.exit(1) 244 | else: 245 | raise e 246 | 247 | 248 | def download_with_tempfile(args, volumes): 249 | f = NamedTemporaryFile() 250 | for volume in volumes: 251 | f.write((volume + '\n').encode('utf-8')) 252 | f.flush() 253 | args.file = f.name 254 | 255 | try: 256 | download(args) 257 | finally: 258 | print("Closing temporary file: " + f.name) 259 | f.close() 260 | 261 | 262 | if __name__ == '__main__': 263 | main() 264 | -------------------------------------------------------------------------------- /htrc/auth.py: -------------------------------------------------------------------------------- 1 | #from base64 import b64encode 2 | from getpass import getpass 3 | #import http.client 4 | #import ssl 5 | #import time 6 | import subprocess 7 | 8 | import requests 9 | import requests.auth 10 | #import configparser 11 | 12 | import htrc.config 13 | 14 | 15 | def get_jwt_token(): 16 | # Currently we just store one common jwt token locally at .htrc file for simplicity 17 | # Expect to add POST method to query unique jwt token with the combo of username and password 18 | #username, password = credential_prompt() 19 | 20 | #client_id, client_secret = htrc.config.get_credentials() 21 | 22 | #auth = requests.auth.HTTPBasicAuth(client_id, client_secret) 23 | #data = { "grant_type": "password", 24 | #"username": username, 25 | #"password": password, 26 | #"scope" : "openid"} 27 | 28 | url1 = htrc.config.get_idp_url() 29 | capsule_id = htrc.config._get_value("jwt", "capsule_id") 30 | result = subprocess.check_output("hostname -s -I | awk '{print $1}'", shell=True) 31 | result = result.decode('utf-8') 32 | result = result[:-1] 33 | capsule_ip = result.strip() 34 | url = url1 + "/" + capsule_id + "/" + capsule_ip 35 | r = requests.get(url) 36 | 37 | data = r.json() 38 | if 'error' not in data: 39 | #expiration = int(time.time()) + data['expires_in'] 40 | return data['token'] 41 | elif data['error'] == 'invalid_grant': 42 | print("Invalid username or 
password. Please try again.\n") 43 | return get_jwt_token() 44 | else: 45 | raise RuntimeError("JWT token retrieval failed: {}".format(data['error'])) 46 | 47 | 48 | def credential_prompt(): 49 | """ 50 | A prompt for entering HathiTrust Research Center credentials. 51 | """ 52 | print("Please enter your HathiTrust Research Center credentials.") 53 | username = input("HTRC Username: ") 54 | password = getpass("HTRC Password: ") 55 | 56 | if not username or not password: 57 | print("Invalid username or password. Please try again.\n") 58 | return credential_prompt() 59 | else: 60 | return (username, password) 61 | 62 | 63 | if __name__ == '__main__': 64 | token = get_jwt_token() 65 | htrc.config.save_jwt_token(token) 66 | -------------------------------------------------------------------------------- /htrc/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | `htrc.volumes` 4 | 5 | Contains the configuration parser object. 
# NOTE(review): the module docstring opens before this chunk; only its tail was
# visible here. The py2 compatibility shim (`from future import
# standard_library`) was removed: the module already uses py3-only syntax
# (function annotations), so the shim was dead weight and a third-party import.
from typing import Optional
from configparser import RawConfigParser as ConfigParser, NoSectionError
from codecs import open
import logging
import os.path
import shutil
import time

# Config lives at ~/.htrc; seed it from the packaged template on first use.
DEFAULT_PATH = os.path.join(os.path.expanduser('~'), '.htrc')
if not os.path.exists(DEFAULT_PATH):
    DEFAULT_FILE = os.path.join(os.path.dirname(__file__), '.htrc.default')
    logging.info("Copying default config file to home directory.")
    try:
        shutil.copyfile(DEFAULT_FILE, DEFAULT_PATH)
    except OSError as err:
        # Robustness: a broken install (missing template) should not make
        # `import htrc.config` fatal; reads will fail later with a clear error.
        logging.warning("Could not create %s: %s", DEFAULT_PATH, err)


class HtrcDataApiConfig:
    """Bundle of Data API connection settings.

    Any argument left as None is looked up in the config file through the
    module-level getter functions.
    """

    def __init__(self,
                 token: Optional[str] = None,
                 host: Optional[str] = None,
                 port: Optional[int] = None,
                 epr: Optional[str] = None,
                 cert: Optional[str] = None,
                 key: Optional[str] = None) -> None:
        super().__init__()

        self.token = token or get_jwt_token()
        self.host = host or get_dataapi_host()
        self.port = port or get_dataapi_port()
        self.epr = epr or get_dataapi_epr()
        self.cert = cert or get_dataapi_cert()
        self.key = key or get_dataapi_key()


def _get_value(section, key, path=None):
    """Read `key` from `section` of the config file at `path` (default ~/.htrc).

    Raises EnvironmentError when the section is missing.
    """
    if path is None:
        path = DEFAULT_PATH

    config = ConfigParser(allow_no_value=True)
    with open(path, encoding='utf8') as configfile:
        # Bug fix: `readfp` was deprecated since Python 3.2 and removed in
        # 3.12; `read_file` is the supported equivalent.
        config.read_file(configfile)
    try:
        return config.get(section, key)
    except NoSectionError:
        raise EnvironmentError("Config not set for {} {} in {}".format(
            section, key, path))


def get_dataapi_port(path=None):
    return int(_get_value('data', 'port', path))


def get_dataapi_host(path=None):
    return _get_value('data', 'host', path)


def get_dataapi_epr(path=None):
    return _get_value('data', 'url', path)


def get_dataapi_cert(path=None):
    return _get_value('data', 'cert', path)


def get_dataapi_key(path=None):
    return _get_value('data', 'key', path)


def get_dataapi_access(path=None):
    return _get_value('data', 'pd_only', path)


def get_idp_host_port(path=None):
    host = _get_value('idp', 'host', path)
    port = _get_value('idp', 'port', path)

    return (host, port)


def get_idp_path(path=None):
    # Bug fix: `path` was previously ignored (`_get_value('idp', 'url')`),
    # so a caller-supplied config file was never consulted here.
    return _get_value('idp', 'url', path)


def get_idp_url(path=None):
    host, port = get_idp_host_port(path)
    idp_path = get_idp_path(path)
    # Bug fix: config values are strings, so the old `port == 443` comparison
    # was always False and the port was always embedded in the URL.
    if int(port) == 443:
        # On HTTPS default port, omit the port from the URL.
        return "https://{}{}".format(host, idp_path)
    else:
        return "https://{}:{}{}".format(host, port, idp_path)


# Add jwt credential access methods
def get_jwt_token(path=None):
    """Obtain a JWT token via the auth flow.

    htrc.auth is imported lazily to avoid an import cycle with this module.
    """
    import htrc.auth
    token = htrc.auth.get_jwt_token()

    return token


def save_jwt_token(token, path=None):
    """Save `token` under [jwt] in the config file and return it."""
    # Default to ~/.htrc
    if path is None:
        path = DEFAULT_PATH

    # Open and modify existing config file, if it exists.
    config = ConfigParser(allow_no_value=True)
    if os.path.exists(path):
        config.read(path)
    if not config.has_section('jwt'):
        config.add_section('jwt')

    config.set('jwt', 'token', token)

    with open(path, 'w') as credential_file:
        config.write(credential_file)

    return token


def remove_jwt_token(path=None):
    """Blank out the JWT token stored in the config file."""
    # Default to ~/.htrc
    if path is None:
        path = DEFAULT_PATH

    # Open and modify existing config file, if it exists.
    config = ConfigParser(allow_no_value=True)
    if os.path.exists(path):
        config.read(path)
    if not config.has_section('jwt'):
        config.add_section('jwt')
    # Overwrite rather than delete so the section layout stays stable.
    config.set('jwt', 'token', " ")

    with open(path, 'w') as credential_file:
        config.write(credential_file)


def get_credentials(path=None):
    """
    Retrieves the client id and client secret from a config file for the
    Data API. Raises an EnvironmentError if not specified.
    See also: credential_prompt
    """
    client_id = _get_value('idp', 'client_id', path)
    client_secret = _get_value('idp', 'client_secret', path)

    if not client_id and not client_secret:
        logging.error("Config path: {}".format(path))
        raise EnvironmentError("No client_id and client_secret stored in config file.")

    return (client_id, client_secret)


def populate_parser(parser):
    return parser


if __name__ == '__main__':
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser = populate_parser(parser)
    parser.parse_args()


# --- htrc/hf_utils/__init__.py ---
import re
from typing import TypeVar, List, Iterator, Tuple, Callable

T = TypeVar('T')


def clean_text(s: str) -> str:
    """Normalize text: strip non-letters, collapse whitespace, lowercase."""
    # replace all characters which aren't letters with whitespaces
    # ([\W\d_] is the equivalent of \P{L}, which re does not support)
    s = re.sub(r'[\W\d_]+', " ", s, flags=re.UNICODE)
    # replace multiple sequential whitespaces with single whitespace
    s = re.sub(r'\s{2,}', " ", s, flags=re.UNICODE)
    # trim whitespaces at the beginning and end
    s = s.strip()
    # lowercase
    s = s.lower()

    return s
def levenshtein(s: str, t: str, insert_cost: int = 1, delete_cost: int = 1, replace_cost: int = 1) -> int:
    """Weighted Levenshtein distance (from the Wikipedia article; iterative
    with two matrix rows)."""
    # degenerate cases
    if s == t:
        return 0

    len0 = len(s)
    len1 = len(t)

    if not len0:
        return len1

    if not len1:
        return len0

    # rolling rows of the distance matrix; v0[i] starts as the cost of
    # skipping a prefix of s of length i
    v0 = list(range(len0 + 1))
    v1 = [0] * (len0 + 1)

    # transformation cost for each letter in t
    for j in range(len1):
        # initial cost of skipping prefix in t
        v1[0] = j + 1

        # transformation cost for each letter in s
        for i in range(len0):
            # matching current letters in both strings
            match = 0 if s[i] == t[j] else 1

            # computing cost for each transformation
            cost_insert = v0[i + 1] + insert_cost
            cost_delete = v1[i] + delete_cost
            cost_replace = v0[i] + match * replace_cost

            # keep minimum cost
            v1[i + 1] = min(cost_insert, cost_delete, cost_replace)

        # swap cost arrays
        v0, v1 = v1, v0

    # the distance is the cost for transforming all letters in both strings
    return v0[len0]


def pairwise_combine_within_distance(xs: List[T], n: int) -> List[Tuple[T, T]]:
    """Pair every element with each of the (up to) n-1 elements following it."""
    if not xs:
        return []

    result = []
    x, rest = xs[0], xs[1:]

    while rest:
        # Perf fix: extend in place; the original rebuilt `result` with list
        # concatenation on every pass (accidentally quadratic).
        result.extend((x, v) for v in rest[:n - 1])
        x, rest = rest[0], rest[1:]

    return result


def group_consecutive_when(xs: List[T], pred: Callable[[T, T], bool]) -> Iterator[List[T]]:
    """Split xs into runs of consecutive elements where pred(prev, next) holds,
    yielding each run as a list.

    The control flow below is intentionally kept identical to the original
    implementation (including its trailing-element handling).
    """
    result = []
    _prev, _next = None, None

    while len(xs) > 1:
        _prev, _next = xs[0], xs[1]
        result.append(_prev)
        if not pred(_prev, _next):
            # run ends between _prev and _next
            yield result
            result = []
        xs = xs[1:]

    if len(xs) == 1:
        _prev, _next = _next, xs[0]

    # flush the trailing element(s)
    if _prev is not None and _next is not None and pred(_prev, _next):
        result.extend([_prev, _next])
    elif _next is not None:
        result.append(_next)

    yield result


def flatten(xss: List[tuple]) -> Iterator[T]:
    """Lazily yield every item of every inner collection, left to right."""
    for xs in xss:
        for x in xs:
            yield x


# --- htrc/lib/cli.py ---
from builtins import input


def bool_prompt(prompt_str, default=None):
    """Yes/no wrapper around prompt(); returns True for 'y', False for 'n'."""
    if default is True:
        default = 'y'
    elif default is False:
        default = 'n'

    result = prompt(prompt_str, options=['y', 'n'], default=default)

    if result == 'y':
        return True
    elif result == 'n':
        return False


def prompt(prompt, options=None, default=None):
    """Prompt on stdin until the reply is one of `options` (if given);
    an empty reply selects `default`."""
    # Construct prompt
    prompt = "\n" + prompt

    if options:
        choices = options[:]
        if default and default in choices:
            # Highlight the default choice in uppercase.
            default_idx = choices.index(default)
            choices[default_idx] = choices[default_idx].upper()
        prompt += " [{0}]".format('/'.join(choices))
    elif default:
        # Bug fix: the py2-era `default.encode('utf-8')` rendered as b'...'
        # on Python 3; format the value directly instead.
        prompt += " [Default: {0}]".format(default)
    prompt += " "

    # Wait for valid response
    result = None
    while result is None or (options and result not in options):
        result = input(prompt)
        result = result.lower().strip()
        if default and result == '':
            result = default

    return result
# --- htrc/metadata/__init__.py ---
# NOTE(review): the dead py2 shim (`from future import standard_library`) was
# removed; everything below is plain Python 3.
from __future__ import print_function
from builtins import str

import codecs
import json
import logging
import os, os.path
import re
from time import sleep
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import quote_plus, urlencode

import requests

from htrc.util import split_items


def get_volume_metadata(id, marc=False):
    """
    Retrieve item metadata `from the HathiTrust Bibliographic API`_.

    Params:
    :param id: HTID for the volume to be retrieved
    :param marc: Retrieve MARC-XML within JSON return value.
    :raises ValueError: when no (unique) record is found for `id`.

    .. _from the HathiTrust Bibliographic API: https://www.hathitrust.org/bib_api
    """
    biblio_api = "https://catalog.hathitrust.org/api/volumes"

    if marc:
        biblio_api += '/full'
    else:
        biblio_api += '/brief'

    url = biblio_api + '/htid/{}.json'.format(id)

    try:
        reader = codecs.getreader('utf-8')
        data = json.load(reader(urlopen(url)))
        if len(data['records']) == 1:
            for item in data['items']:
                if item['htid'] == id:
                    # merge the record-level and item-level metadata
                    md = data['records'][item['fromRecord']]
                    md.update(item)
                    return md
        else:
            raise ValueError
    except (ValueError, IndexError, HTTPError):
        raise ValueError("No result found for " + id)


def safe_volume_metadata(id, marc=False, sleep_time=1):
    """
    Retrieve item metadata `from the HathiTrust Bibliographic API`_.

    Unlike :method volume_metadata:, this function returns an empty dictionary,
    rather than an error when metadata is missing.

    Params:
    :param id: HTID for the volume to be retrieved
    :param marc: Retrieve MARC-XML within JSON return value.
    :param sleep_time: throttle delay (seconds) after a successful request.

    _ https://www.hathitrust.org/bib_api
    """
    try:
        metadata = get_volume_metadata(id, marc)
        if sleep_time:
            sleep(sleep_time)
        return metadata
    except ValueError as err:
        logging.error(err)
        return dict()


def get_bulk_metadata(ids, marc=False):
    """
    Retrieve item metadata `from the HathiTrust Bibliographic API`_ for many
    HTIDs in a single request.

    Params:
    :param ids: HTIDs for the volumes to be retrieved
    :param marc: Retrieve MARC-XML within JSON return value.
    :raises RuntimeError: when the Bibliography API cannot be reached.

    .. _from the HathiTrust Bibliographic API: https://www.hathitrust.org/bib_api
    """
    biblio_api = "https://catalog.hathitrust.org/api/volumes"

    if marc:
        biblio_api += '/full'
    else:
        biblio_api += '/brief'

    query = '|'.join(['htid:' + id for id in ids])
    url = biblio_api + '/json/' + query

    metadata = dict()
    try:
        reader = codecs.getreader('utf-8')
        raw = json.load(reader(urlopen(url)))

        for id, data in raw.items():
            id = id.replace('htid:', '')
            if len(data['records']) == 1:
                for item in data['items']:
                    if item['htid'] == id:
                        item_md = data['records'][item['fromRecord']]
                        item_md.update(item)
                        metadata[id] = item_md
            else:
                # ambiguous or missing record: map the id to an empty dict
                metadata[id] = dict()
    except HTTPError:
        raise RuntimeError("Could not access HT Bibliography API.")

    return metadata


def safe_bulk_metadata(ids, marc=False, sleep_time=1):
    """
    Retrieve bulk item metadata `from the HathiTrust Bibliographic API`_.

    Unlike :method get_bulk_metadata:, this function returns an
    empty dictionary, rather than an error when metadata is missing.

    Params:
    :param ids: HTIDs for the volumes to be retrieved
    :param marc: Retrieve MARC-XML within JSON return value.
    :param sleep_time: throttle delay (seconds) after a successful request.

    _ https://www.hathitrust.org/bib_api
    """
    try:
        metadata = get_bulk_metadata(ids, marc)
        if sleep_time:
            sleep(sleep_time)
        return metadata
    except (ValueError, RuntimeError) as err:
        # Bug fix: get_bulk_metadata raises RuntimeError on API failure, but
        # only ValueError was caught, so this function's documented
        # "return empty dict" contract was broken.
        logging.error(err)
        return dict()


def get_metadata(ids, output_file=None):
    """
    Retrieves metadata for a folder of folders, where each subfolder is named
    for a HathiTrust ID. This structure is the default structure extracted from
    a Data API request (:method htrc.volumes.get_volumes:).
    """
    # data cleanup: filesystem-safe id forms back to canonical HTIDs
    ids = [str.strip(id).replace('+', ':').replace('=', '/') for id in ids]

    metadata = dict()
    # the bulk endpoint is chunked to 50 ids per request
    for segment in split_items(ids, 50):
        items = safe_bulk_metadata(segment)
        metadata.update(items)

    if output_file:
        with open(output_file, 'w') as outfile:
            json.dump(metadata, outfile)

    return metadata


def record_metadata(id, sleep_time=1):
    """
    Retrieve metadata for a HathiTrust Record.

    Returns a list of (enumcron, htid) tuples, one per item on the record.
    """
    # raw string fix: '\W' is an invalid escape warning on modern Python
    regex = re.compile(r'\W')
    url = "http://catalog.hathitrust.org/api/volumes/brief/recordnumber/{0}.json"

    url = url.format(id)
    r = requests.get(url)
    data = r.json()

    items = []
    if data:
        for item in data['items']:
            enum = regex.sub('', str(item.get('enumcron', '')).lower())
            htid = item.get('htid', '')
            items.append((enum, htid))
    else:
        items = []

    sleep(sleep_time)
    return items


def volume_solr_metadata(id, sleep_time=0.1):
    """
    Retrieve metadata from HTRC Solr API.

    The HTRC Solr instance is used only for certain extracted features
    unavailable in the main HathiTrust Bibliographic API. If you are a
    recipient of a HTRC Advanced Collaborative Support (ACS) grant,
    then you may have to use this function.
    """
    solr = "http://chinkapin.pti.indiana.edu:9994/solr/meta/select/?q=id:%s" % id
    solr += "&wt=json"  # retrieve JSON results
    if sleep_time:
        sleep(sleep_time)  # JUST TO MAKE SURE WE ARE THROTTLED
    try:
        reader = codecs.getreader('utf-8')
        data = json.load(reader(urlopen(solr)))
        return data['response']['docs'][0]
    except (ValueError, IndexError, HTTPError):
        logging.error("No result found for " + id)
        return dict()
# --- htrc/metadata/marc.py ---
"""
MARC CODE HANDLING
"""
from __future__ import print_function
from builtins import str

import xml.etree.ElementTree as ET


def parse_marc(raw):
    """Parse raw MARC-XML into an ElementTree element.

    lazy workaround: rename the namespace declaration so element lookups in
    get_marc_value need no namespace prefix.
    """
    raw = raw.replace(' xmlns', ' xmlnamespace')
    ET.register_namespace('', 'http://www.loc.gov/MARC21/slim')
    return ET.fromstring(raw)


def get_marc_value(xml, tag, code):
    """Return the text of datafield[@tag]/subfield[@code], or None if absent."""
    xpath = "{marc}datafield[@tag='{tag}']/{marc}subfield[@code='{code}']".format(
        tag=tag, code=code, marc='')  # marc="{http://www.loc.gov/MARC21/slim}")
    results = xml.findall(xpath)
    return results[0].text if results else None


def get_lccn_from_marc(xml):
    return get_marc_value(xml, '010', 'a')


def get_title_from_marc(xml):
    return get_marc_value(xml, '245', 'a')


def get_volume_from_marc(xml):
    return get_marc_value(xml, '974', 'c')


def get_lcc_from_marc(xml):
    """Collect LC call numbers from MARC tags 050a/b and 991h/i,
    joined with ';'."""
    lcc = list()
    val = get_marc_value(xml, '050', 'a')
    if val:
        lcc.append(val)

    val = get_marc_value(xml, '050', 'b')
    if val:
        # Bug fix: a $b subfield with no preceding $a left `lcc` empty, and
        # `lcc[-1]` raised IndexError; fall back to appending instead.
        if lcc:
            lcc[-1] += val
        else:
            lcc.append(val)

    val = get_marc_value(xml, '991', 'h')
    if val:
        lcc.append(val)

    val = get_marc_value(xml, '991', 'i')
    if val:
        # Same guard as above for $i without a preceding entry.
        if lcc:
            lcc[-1] += val
        else:
            lcc.append(val)

    return ";".join(lcc)


# --- htrc/mock/volumes/__init__.py ---
#!/usr/bin/env python
"""
`htrc.mock.volumes`

Contains functions to test the volume retrieval from the HTRC Data API.
The download functions will return a sample zip file.

See the core documentation for an example of how to use this library.
"""
from builtins import input

from configparser import RawConfigParser as ConfigParser
from io import BytesIO
import os, os.path
from zipfile import ZipFile  # used to decompress requested zip archives.

from htrc.lib.cli import bool_prompt
from htrc.auth import credential_prompt
from htrc.config import save_jwt_token

# The canned archive returned by every mock download.
EXAMPLE_FILE = os.path.join(os.path.dirname(__file__), 'example.zip')


def get_volumes(token, volume_ids, concat=False):
    """
    Returns volumes from the Data API as a raw zip stream.

    Parameters:
    :token: An OAuth2 token for the app.
    :volume_ids: A list of volume_ids
    :concat: If True, return a single file per volume. If False, return a single
    file per page (default).
    """
    if not volume_ids:
        raise ValueError("volume_ids is empty.")

    with open(EXAMPLE_FILE, 'rb') as infile:
        data = infile.read()

    return data
def get_pages(token, page_ids, concat=False):
    """
    Returns a ZIP file containing specific pages.

    Parameters:
    :token: An OAuth2 token for the app.
    :page_ids: A list of page ids
    :concat: If True, return a single file per volume. If False, return a single
    file per page (default).
    """
    if not page_ids:
        raise ValueError("page_ids is empty.")

    with open(EXAMPLE_FILE, 'rb') as sample:
        payload = sample.read()

    return payload


def get_oauth2_token(username, password):
    """Return a canned OAuth2 token (mock implementation)."""
    return 'a1b2c3d4e5f6'


def credentials_from_config(path):
    """
    Mock credential lookup for the Data API: always returns (None, None) and,
    unlike the real implementation, never raises for an invalid path.
    See also: credential_prompt
    """
    return (None, None)


def download_volumes(volume_ids, output_dir, username=None, password=None):
    """Extract the bundled example archive into output_dir (created if absent),
    exercising the same token/request call sequence as the real client."""
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Retrieve token and "download" the canned volumes.
    token = get_oauth2_token(username, password)
    data = get_volumes(token, volume_ids, False)

    with open(EXAMPLE_FILE, 'rb') as archive:
        bundle = ZipFile(archive)
        bundle.extractall(output_dir)
        bundle.close()


def download(args):
    """CLI entry point: read one volume id per line from args.file and
    delegate to download_volumes."""
    with open(args.file) as id_file:
        volume_ids = [line.strip() for line in id_file]

    return download_volumes(volume_ids, args.output, args.username, args.password)


# --- htrc/models/__init__.py ---
import os
from abc import ABC, abstractmethod
from typing import List


class Page(ABC):
    """Abstract page: concrete subclasses supply the raw text lines."""

    @property
    @abstractmethod
    def text_lines(self) -> List[str]:
        """The lines of text on the page."""
        pass

    @property
    def text(self) -> str:
        """The full page text, joined with the platform line separator."""
        return os.linesep.join(self.text_lines)


class PageStructure(Page, ABC):
    """A Page whose lines are partitioned into header, body and footer by the
    num_header_lines / num_footer_lines counters."""

    def __init__(self) -> None:
        self.num_header_lines = 0
        self.num_footer_lines = 0

    @property
    def has_header(self) -> bool:
        return self.num_header_lines > 0

    @property
    def has_body(self) -> bool:
        body_count = len(self.text_lines) - self.num_header_lines - self.num_footer_lines
        return body_count > 0

    @property
    def has_footer(self) -> bool:
        return self.num_footer_lines > 0

    @property
    def header_lines(self) -> List[str]:
        return self.text_lines[:self.num_header_lines]

    @property
    def body_lines(self) -> List[str]:
        end = len(self.text_lines) - self.num_footer_lines
        return self.text_lines[self.num_header_lines:end]

    @property
    def footer_lines(self) -> List[str]:
        # Guard: without it, a [-0:] slice would return *all* lines.
        if not self.has_footer:
            return []
        return self.text_lines[-self.num_footer_lines:]

    @property
    def header(self) -> str:
        return os.linesep.join(self.header_lines)

    @property
    def body(self) -> str:
        return os.linesep.join(self.body_lines)

    @property
    def footer(self) -> str:
        return os.linesep.join(self.footer_lines)
class HtrcPage(Page):
    """Simplest concrete Page: wraps a pre-split list of text lines."""

    def __init__(self, lines: List[str]) -> None:
        self._lines = lines

    @property
    def text_lines(self) -> List[str]:
        return self._lines


# --- htrc/runningheaders/__init__.py ---
import re
from collections import defaultdict
from typing import List, TypeVar, Set, Iterator, Optional, Tuple, Dict

from htrc.models import Page, PageStructure
from htrc.hf_utils import clean_text, levenshtein, pairwise_combine_within_distance, flatten, group_consecutive_when

T = TypeVar('T', bound=Page)
U = TypeVar('U', bound=PageStructure)


class _Line:
    """One line of one page, carrying a cleaned form used for fuzzy matching.

    Identity (equality/hash) is (page identity, line number), not text.
    """

    def __init__(self, text: str, line_number: int, page: Page) -> None:
        self.text = text
        self.line_number = line_number
        self.page = page
        self.cleaned_text = clean_text(text)

    def __eq__(self, o: object) -> bool:
        if not isinstance(o, _Line):
            # Bug fix: the original `raise NotImplemented` raised
            # "TypeError: exceptions must derive from BaseException";
            # returning the NotImplemented singleton is the protocol for
            # letting Python try the reflected comparison.
            return NotImplemented

        are_equal = self.page is o.page and self.line_number == o.line_number

        return are_equal

    def __ne__(self, o: object) -> bool:
        return not self == o

    def __hash__(self) -> int:
        # Mirrors __eq__: combine line number and page identity.
        line_hash = hash(self.line_number)
        page_hash = hash(self.page)
        hash_value = 31 * line_hash + page_hash

        return hash_value

    def __str__(self) -> str:
        return str((self.line_number, self.cleaned_text))

    def similarity_ratio(self, line: '_Line') -> float:
        """1 - normalized Levenshtein distance between the cleaned texts."""
        ratio = 1 - float(levenshtein(self.cleaned_text, line.cleaned_text)) / max(len(self.cleaned_text),
                                                                                  len(line.cleaned_text))

        return ratio


def parse_page_structure(pages: List[T],
                         window_size: int = 6,
                         min_similarity_ratio: float = 0.7,
                         min_cluster_size: int = 3,
                         max_header_lines: int = 3,
                         max_footer_lines: int = 3) -> List[U]:
    """Detect running headers/footers across `pages` by clustering similar
    lines from nearby pages, then tag each page (in place) with
    num_header_lines / num_footer_lines via a dynamic PageStructure subclass.
    """
    def _get_page_lines(p: T) -> List[_Line]:
        return [_Line(text, line_num, p) for line_num, text in enumerate(p.text_lines)]

    def _cluster_lines(lines: List[Tuple[_Line, _Line]]) -> Set[tuple]:
        # Union-find-style agglomeration: each similar pair joins/merges clusters.
        cluster_map = {}

        for l1, l2 in lines:
            c1 = cluster_map.get(l1)
            c2 = cluster_map.get(l2)

            if c1 is not None and c2 is not None and c1 is not c2:
                # merge the smaller cluster into the larger one
                smaller, larger = (c1, c2) if len(c1) < len(c2) else (c2, c1)
                larger.extend(smaller)
                for x in smaller:
                    cluster_map[x] = larger
            elif c1 is not None and c2 is None:
                c1.append(l2)
                cluster_map[l2] = c1
            elif c1 is None and c2 is not None:
                c2.append(l1)
                cluster_map[l1] = c2
            elif c1 is None and c2 is None:
                c = [l1, l2]
                cluster_map[l1] = c
                cluster_map[l2] = c

        return set(map(tuple, cluster_map.values()))

    def _group_lines_by_page(lines: Iterator[_Line]) -> Dict[Page, List[_Line]]:
        lines_grouped_by_page = defaultdict(list)
        for line in lines:
            lines_grouped_by_page[line.page].append(line)

        return lines_grouped_by_page

    def _get_last_header_line(lines: List[_Line]) -> Optional[int]:
        if not lines:
            return None

        return max(l.line_number for l in lines)

    def _get_first_footer_line(lines: List[_Line]) -> Optional[int]:
        if not lines:
            return None

        return min(l.line_number for l in lines)

    def _extract_line_numbers(line: _Line) -> Tuple[_Line, List[int]]:
        # whitespace-delimited runs of 1-4 digits (candidate page numbers)
        numbers = [int(match.group(0)) for match in
                   re.finditer(r"(?:(?<=^)|(?<=\s))\d{1,4}(?=\s|$)", line.text, flags=re.UNICODE)]

        return line, numbers

    def _extract_potential_page_numbers(lines: List[_Line]) -> Tuple[_Line, List[int]]:
        assert len(lines) > 0
        line, numbers = _extract_line_numbers(lines[-1])
        # if the last line is blank and numberless, look one line up
        if not numbers and not str.strip(line.text) and len(lines) > 1:
            line, numbers = _extract_line_numbers(lines[-2])

        return line, numbers

    candidate_header_lines = []
    candidate_footer_lines = []

    pages_lines = [_get_page_lines(p) for p in pages]

    for lines in pages_lines:
        # ignore lines that are <4 characters long and/or have no alphabetic characters
        candidate_header_lines.append([l for l in lines[:max_header_lines] if not len(l.cleaned_text) < 4])
        candidate_footer_lines.append([l for l in lines[-max_footer_lines:] if not len(l.cleaned_text) < 4])

    # compare candidate lines only between pages at most `window_size` apart
    headers_for_comparison = pairwise_combine_within_distance(candidate_header_lines, window_size)
    footers_for_comparison = pairwise_combine_within_distance(candidate_footer_lines, window_size)

    header_line_similarities = []
    for (lines1, lines2) in headers_for_comparison:
        header_line_similarities.extend(
            (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio)

    footer_line_similarities = []
    for (lines1, lines2) in footers_for_comparison:
        footer_line_similarities.extend(
            (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio)

    header_clusters = [cluster for cluster in _cluster_lines(header_line_similarities) if
                       len(cluster) >= min_cluster_size]
    footer_clusters = [cluster for cluster in _cluster_lines(footer_line_similarities) if
                       len(cluster) >= min_cluster_size]

    if not footer_clusters:
        # Fallback: look for consecutive page numbers at the bottom of pages.
        potential_page_numbers = [_extract_potential_page_numbers(lines) for lines in pages_lines if lines]
        potential_page_numbers = [(line, numbers[0]) for line, numbers in potential_page_numbers if len(numbers) == 1]
        potential_clusters = map(lambda group: tuple(map(lambda t: t[0], group)),
                                 group_consecutive_when(potential_page_numbers, lambda x, y: y[1] - x[1] == 1))
        footer_clusters = [cluster for cluster in potential_clusters if len(cluster) >= min_cluster_size]

    header_lines_grouped_by_page = _group_lines_by_page(flatten(header_clusters))
    footer_lines_grouped_by_page = _group_lines_by_page(flatten(footer_clusters))

    last_header_line_pages_map = {p: _get_last_header_line(lines) for p, lines in header_lines_grouped_by_page.items()}
    first_footer_line_pages_map = {p: _get_first_footer_line(lines) for p, lines in
                                   footer_lines_grouped_by_page.items()}

    for page in pages:
        last_header_line = last_header_line_pages_map.get(page)
        first_footer_line = first_footer_line_pages_map.get(page)
        # Retarget the instance's class to a dynamic subclass mixing in
        # PageStructure so the header/footer counters can be attached.
        page.__class__ = type('StructuredPage', (page.__class__, PageStructure), {})
        page.num_header_lines = last_header_line + 1 if last_header_line is not None else 0
        page.num_footer_lines = len(page.text_lines) - first_footer_line if first_footer_line is not None else 0

    return pages
# --- htrc/tools/mallet.py ---
from builtins import str
import os, os.path
import subprocess
import sys
import tarfile
import urllib.request

from htrc.volumes import download_volumes
from htrc.workset import path_to_volumes

MALLET_DIR = os.path.expanduser('~/mallet')


# Mallet is downloaded and installed in the user's home directory
def install_mallet():
    """Download and unpack Mallet 2.0.8RC3 into ~/mallet."""
    if not os.path.exists(MALLET_DIR):
        os.makedirs(MALLET_DIR)
        mallet_zip = urllib.request.urlopen('http://mallet.cs.umass.edu/dist/mallet-2.0.8RC3.tar.gz')
        # Bug fix: tarfile.open's first positional argument is a file *name*;
        # a urlopen response object must be passed via `fileobj=`.
        # NOTE(review): extractall on a remote archive trusts its member
        # paths; consider a path-sanitizing filter.
        mallet_tar = tarfile.open(fileobj=mallet_zip, mode="r:gz")
        mallet_tar.extractall(path=MALLET_DIR)
        mallet_tar.close()


def main(path, topics, iterations, output_dir='/media/secure_volume/workset/', debug=False):
    """Download a workset if `path` is not a directory, then import it into
    MALLET and train a topic model.

    :param path: workset path/URL, or a directory of pre-downloaded volumes
    :param topics: number of topics to train
    :param iterations: number of training iterations
    :param output_dir: where downloaded volumes are stored
    :param debug: when True, re-raise download errors instead of exiting
        (new keyword, default False — backward compatible)
    """
    if not os.path.exists(MALLET_DIR):
        if not os.path.exists('/media/secure_volume/'):
            print('Installing Mallet ...')
            install_mallet()
            print('\n')
        else:
            print('Mallet not installed, but capsule is in secure mode.')
            print('Switch to maintenance mode and run this command again')
            print('to install Mallet. Then, switch to secure mode to train')
            print('topic models.')
            sys.exit(1)

    if not os.path.isdir(path):
        try:
            volumes = path_to_volumes(path)
        except ValueError as e:
            print("Could not process workset. {}".format(str(e)))
            sys.exit(1)

        try:
            download_volumes(volumes, output_dir)
        except OSError as e:
            if not os.path.exists('/media/secure_volume/'):
                print('Secure volume not mounted. Could not download volumes')
                sys.exit(1)
            else:
                print("Could not download volumes. {} {}".format(e.strerror, e.filename))
                sys.exit(1)
        except RuntimeError as e:
            # Bug fix: this branch referenced an undefined name `args`
            # (NameError at runtime); `debug` is now an explicit parameter.
            if not debug:
                print("Could not download volumes. {}".format(str(e)))
                sys.exit(1)
            else:
                raise e
        path = output_dir

    # import the workset to MALLET format.
    subprocess.check_call([
        '{}/mallet-2.0.8RC3/bin/mallet'.format(MALLET_DIR),
        'import-dir',
        '--input', path,
        '--output', os.path.join(path, '../corpus.mallet'),
        '--keep-sequence',
        '--remove-stopwords'
    ])

    subprocess.check_call([
        '{}/mallet-2.0.8RC3/bin/mallet'.format(MALLET_DIR),
        'train-topics',
        '--input', os.path.join(path, '../corpus.mallet'),
        '--num-topics', str(topics),
        '--output-state', os.path.join(path, '../mallet_state.gz'),
        '--output-topic-keys', os.path.join(path, '../mallet_topic-keys.txt'),
        '--output-doc-topics', os.path.join(path, '../mallet_doc-topics.txt'),
        '--num-iterations', str(iterations)
    ])


def populate_parser(parser=None):
    """Attach the MALLET CLI arguments to `parser` (created if None)."""
    if parser is None:
        from argparse import ArgumentParser
        parser = ArgumentParser()
    parser.add_argument('-k', help="number of topics", required=True)
    parser.add_argument('--iter', help="number of iterations", default=200)
    parser.add_argument('--workset-path', help="Location to store workset download.",
                        default='/media/secure_volume/workset/')
    parser.add_argument('path', default='/media/secure_volume/workset/',
                        nargs='?')
    return parser


if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser(description="MALLET tools for the HTRC")
    populate_parser(parser)
    args = parser.parse_args()

    main(args.path, args.k, args.iter, args.workset_path)
-------------------------------------------------------------------------------- 1 | from builtins import map 2 | import os.path 3 | import subprocess 4 | from tempfile import NamedTemporaryFile 5 | 6 | from htrc.volumes import download_volumes 7 | from htrc.workset import path_to_volumes 8 | import sys 9 | 10 | 11 | def main(path, topics, iterations, output_dir='/media/secure_volume/workset'): 12 | if os.path.exists("/media/secure_volume"): 13 | # If in secure mode, downlaod the volumes from data api 14 | try: 15 | volumes = path_to_volumes(path) 16 | except ValueError as e: 17 | print("Could not process workset. {}".format(str(e))) 18 | sys.exit(1) 19 | 20 | try: 21 | download_volumes(volumes, output_dir) 22 | except OSError as e: 23 | if not os.path.exists('/media/secure_volume/'): 24 | print('Secure volume not mounted. Could not download volumes') 25 | sys.exit(1) 26 | else: 27 | print("Could not download volumes. {} {}".format(e.strerror, e.filename)) 28 | sys.exit(1) 29 | except RuntimeError as e: 30 | if not args.debug: 31 | print("Could not download volumes. {}".format(str(e))) 32 | sys.exit(1) 33 | else: 34 | raise e 35 | path = output_dir 36 | 37 | elif not os.path.exists(path): 38 | # If in maintenance mode, use extracted features. 39 | # Assume that if an existing path is given, it is a pre-downloaded set 40 | # or a file containing hathitrust ids and continue. 41 | # If the path does not exist, assume it is a url to a hathitrust 42 | # collection and write volumes list into a temporary file for 43 | # proper handling by extracted features downloader 44 | try: 45 | volumes = path_to_volumes(path) 46 | 47 | volfile = NamedTemporaryFile(prefix='htrc-workset', delete=False) 48 | volfile.write(bytes('\n'.join(volumes), "ascii")) 49 | 50 | path = volfile.name 51 | 52 | volfile.close() 53 | 54 | except ValueError as e: 55 | print("Could not process workset. {}".format(str(e))) 56 | sys.exit(1) 57 | 58 | # strip trailing slash for topic support. 
59 | if path.endswith('/'): 60 | path = path[:-1] 61 | 62 | # training the topics on the data from above. 63 | subprocess.check_call([ 64 | 'topicexplorer', 'init', path, 65 | '--name', '"HathiTrust Workset"', 66 | '--rebuild', '--htrc', '-q' 67 | ]) 68 | subprocess.check_call([ 69 | 'topicexplorer', 'prep', path, 70 | '-q', '--min-word-len', '3', '--lang', 'en', 71 | '--high', '30', '--low', '10' 72 | ]) 73 | subprocess.check_call([ 74 | 'topicexplorer', 'train', path, 75 | '-k'] + list(map(str,topics)) + [ 76 | '--iter', str(iterations), 77 | '--context-type', 'book', 78 | '-q' 79 | ]) 80 | 81 | subprocess.check_call([ 82 | 'topicexplorer', 'launch', path 83 | ]) 84 | 85 | def populate_parser(parser=None): 86 | if parser is None: 87 | from argparse import ArgumentParser 88 | parser = ArgumentParser() 89 | 90 | parser.add_argument('-k', type=int, nargs='+', required=True, 91 | help="number of topics") 92 | parser.add_argument('--iter', help="number of iterations", default=200) 93 | parser.add_argument('path', default='/media/secure_volume/workset', 94 | nargs='?') 95 | parser.add_argument('--workset-path', help="Location to store workset download.", 96 | default='/media/secure_volume/workset') 97 | return parser 98 | 99 | if __name__ == '__main__': 100 | from argparse import ArgumentParser 101 | parser = ArgumentParser(description="Topic Explorer tools for the HTRC") 102 | populate_parser(parser) 103 | args = parser.parse_args() 104 | 105 | main(args.path, args.k, args.iter, args.workset_path) 106 | -------------------------------------------------------------------------------- /htrc/util/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import math 4 | 5 | from .resolve import ORG_CODES 6 | 7 | 8 | def split_items(seq, split_size): 9 | """ 10 | Returns a generator that returns portions of `seq` up to `split_size`. 11 | Useful when chunking requests to bulk endpoints. 
def split_items(seq, split_size):
    """
    Returns a generator that yields portions of `seq` up to `split_size`.
    Useful when chunking requests to bulk endpoints.

    :param seq: A sequence to split.
    :param split_size: The maximum size of each split.
    """
    # Idiom fix: a single stepped range replaces the original two-phase
    # floor-count loop plus remainder check; behavior is identical for any
    # positive split_size (an empty seq yields nothing).
    for start in range(0, len(seq), split_size):
        yield seq[start:start + split_size]
# List of organization codes in HathiTrust Digital Library
# Derived from https://github.com/Bookworm-project/Bookworm-MARC/issues/1
ORG_CODES = {
    "mdp": "University of Michigan",
    "miua": "University of Michigan",
    "miun": "University of Michigan",
    "wu": "University of Wisconsin",
    "inu": "Indiana University",
    "uc1": "University of California",
    "uc2": "University of California",
    "pst": "Penn State University",
    "umn": "University of Minnesota",
    "nnc1": "Columbia University",
    "nnc2": "Columbia University",
    "nyp": "New York Public Library",
    "uiuo": "University of Illinois",  # NOTE(review): possibly a typo for "uiuc" -- confirm upstream
    "njp": "Princeton University",
    "yale": "Yale University",
    "chi": "University of Chicago",
    "coo": "Cornell University",
    "ucm": "Universidad Complutense de Madrid",
    "loc": "Library of Congress",
    "ien": "Northwestern University",
    "hvd": "Harvard University",
    "uva": "University of Virginia",
    "dul1": "Duke University",
    "ncs1": "North Carolina State University",
    "nc01": "University of North Carolina",
    "pur1": "Purdue University",
    "pur2": "Purdue University",
    "mdl": "Minnesota Digital Library",
    "usu": "Utah State University Press",
    "gri": "Getty Research Institute",
    "uiug": "University of Illinois",
    "psia": "Penn State University",
    "bc": "Boston College",
    "ufl1": "University of Florida",
    "ufl2": "University of Florida",
    "txa": "Texas A&M University",
    "keio": "Keio University",
    "osu": "The Ohio State University",
    "uma": "University of Massachusetts",  # typo fix: was "Massachusets"
    "udel": "University of Delaware",
    "caia": "Clark Art Institute Library"
}


def parse_record_id(string, fix_truncated_id=False):
    # type: (str, bool) -> str
    '''
    Takes either a record ID or a HT URL for a record.
    Returns a string containing the record ID.

    Raises ValueError when no numeric ID is found, or when the ID is not
    9 digits long and `fix_truncated_id` is False.

    >>> parse_record_id('https://catalog.hathitrust.org/Record/000234911')
    '000234911'
    >>> parse_record_id('001022499')
    '001022499'
    >>> parse_record_id('1022499', fix_truncated_id=True)
    '001022499'
    '''
    # Fix: dots in the host are now escaped so they match literally.
    REGEX = r'(?:http[s]?://catalog\.hathitrust\.org/Record/)?(\d+)'

    try:
        record = re.search(REGEX, string).group(1)
    except AttributeError:
        raise ValueError("No record ID found in string: {}".format(string))

    # Correct truncated IDs or raise error.
    if len(record) != 9:
        if fix_truncated_id:
            record = record.zfill(9)  # idiomatic left-pad with zeros
        else:
            raise ValueError("Invalid record ID. Valid record IDs are 9 digits. " +
                             "Call parse_record_id(string, fix_truncated_id=True) to correct.")

    return record


def parse_volume_id(string):
    # type: (str) -> str
    '''
    Takes either a volume ID, HT URL, or Handle URL for a volume.
    Returns a string containing the HTID.

    Raises ValueError when the resolved ID does not carry a known
    organization code (see ORG_CODES).
    '''
    # BUG FIX: `htid` was previously unbound when a babel.hathitrust.org URL
    # had no query string, raising NameError below; initialize it up front so
    # such URLs fail with the intended ValueError instead.
    htid = None

    # First extract the volume ID from a URL, fallback to assume string.
    parsed_url = urlparse(string)
    if parsed_url.netloc == 'hdl.handle.net':
        # Parse the Handle ID, ex:
        # https://hdl.handle.net/2027/uc2.ark:/13960/fk92805m1s'
        # Note that if the Handle URL contains page info, this is discarded.
        htid = parsed_url.path.replace('/2027/', '')

    elif parsed_url.netloc == 'babel.hathitrust.org':
        # Parse the HT Digital Library URL, ex:
        # https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7
        if parsed_url.query:
            htid = parse_qs(parsed_url.query).get('id', None)
            if htid is not None:
                htid = htid[0]
                # On Python < 3.9.2 parse_qs also splits on ';', so the
                # split below is a no-op there; on newer versions it strips
                # the ';view=...;seq=...' suffix.
                if ';' in htid:
                    htid = htid.split(';')[0]

    else:
        htid = string

    # Validate ID against ORG_CODES.
    # Won't guarantee volume existence, but it is a sanity check.
    if htid and any(htid.startswith(org) for org in ORG_CODES):
        return htid
    else:
        raise ValueError("Invalid Organization Code in HathiTrust ID")


def volume_id_to_record_id(volume_id):
    # type: (str) -> str
    """
    Takes a volume id and returns a record id.

    Performs a network round-trip: the catalog redirects the HTID URL to the
    canonical record URL, which is then parsed.

    See also: `parse_record_id`
    """
    URL = 'https://catalog.hathitrust.org/Record/HTID/{}'.format(volume_id)
    record_url = urlopen(URL).geturl()
    return parse_record_id(record_url)
def record_id_to_volume_ids(record_id):
    """
    Takes a record id and returns a list of corresponding volume ids.

    HathiTrust is a Digital Library, but is composed of scans of physical
    artifacts. A single catalog record may correspond to multiple volumes
    in print, especially among pre-20th century texts. Additionally, a single
    catalog record may correspond to multiple scans from multiple libraries.

    This function resolves these ambiguities by selecting only a single copy
    per unique volume label. For example, if a book was printed as three
    volumes labeled 'v. 1', 'v. 2', and 'v. 3', each scanned by four
    libraries, this function returns a list of 3 volume ids.

    Raises KeyError when the record has no items. Performs a network request
    against the HathiTrust Bibliographic API.

    Future iterations of this function may take a list of preferred sources
    based on ORG_CODE and attempt to use same-source volumes for consistency.
    """
    # Get record from BibAPI
    URL = "http://catalog.hathitrust.org/api/volumes/brief/recordnumber/{0}.json"
    URL = URL.format(record_id)
    data = urlopen(URL)
    data = json.load(data)
    data = data['items']

    if not data:
        raise KeyError("No items found for record ID: {}".format(record_id))

    # Normalize volume labels and collapse duplicates: a dict keeps one htid
    # per normalized label. (Fix: the regex is now a raw string -- '\W' was a
    # DeprecationWarning-producing escape.)
    REGEX = re.compile(r'\W')
    items = {('DEFAULT' if not item['enumcron']
              else REGEX.sub('', item['enumcron'])): item['htid']
             for item in data}

    # Return the list of volume ids. (The original re-checked emptiness here,
    # but a non-empty `data` always yields a non-empty dict.)
    return list(items.values())
def get_volumes(data_api_config: 'htrc.config.HtrcDataApiConfig', volume_ids, concat=False, mets=False, buffer_size=128):
    """
    Returns volumes from the Data API as a raw zip stream.

    Parameters:
    :data_api_config: Data API endpoint configuration (token, host, port,
        epr, cert, key). Annotation is a string literal so the module can be
        loaded without evaluating htrc.config at definition time.
    :volume_ids: A list of volume_ids
    :concat: If True, return a single file per volume. If False, return a single
        file per page (default).
    :mets: If True, also request the METS metadata for each volume.
    :buffer_size: Bytes read from the HTTP response per iteration.

    Raises ValueError on an empty id list and EnvironmentError when the Data
    API responds with a non-200 status.
    """
    if not volume_ids:
        raise ValueError("volume_ids is empty.")

    url = data_api_config.epr + "volumes"

    # Sanity check: ids must contain a '.' and no spaces. Invalid ids are
    # only reported; the request still proceeds (TODO: consider raising).
    for id in volume_ids:
        if ("." not in id
                or " " in id):
            print("Invalid volume id " + id + ". Please correct this volume id and try again.")

    data = {'volumeIDs': '|'.join(
        [id.replace('+', ':').replace('=', '/') for id in volume_ids])}

    if concat:
        data['concat'] = 'true'

    if mets:
        data['mets'] = 'true'

    # Authorization
    headers = {"Authorization": "Bearer " + data_api_config.token,
               "Content-type": "application/x-www-form-urlencoded"}

    # Create SSL context
    # TODO: Fix SSL cert verification
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    #ctx.verify_mode = ssl.CERT_NONE

    # Retrieve the volumes (key_file/cert_file are deprecated in newer
    # Pythons; migrating to ctx.load_cert_chain is a future cleanup).
    httpsConnection = http.client.HTTPSConnection(
        data_api_config.host,
        data_api_config.port,
        context=ctx,
        key_file=data_api_config.key,
        cert_file=data_api_config.cert)

    httpsConnection.request("POST", url, urlencode(data), headers)

    response = httpsConnection.getresponse()

    # BUG FIX: was `response.status is 200` -- identity comparison with an
    # int literal is an implementation detail (SyntaxWarning on CPython 3.8+).
    if response.status == 200:
        body = True
        data = BytesIO()
        bytes_downloaded = 0
        bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength,
                                      widgets=[progressbar.AnimatedMarker(), ' ',
                                               progressbar.DataSize(),
                                               ' (', progressbar.FileTransferSpeed(), ')'])

        while body:
            body = response.read(buffer_size)
            data.write(body)
            bytes_downloaded += len(body)
            bar.update(bytes_downloaded)

        data = data.getvalue()
    else:
        logging.debug("Unable to get volumes")
        logging.debug("Response Code: {}".format(response.status))
        logging.debug("Response: {}".format(response.reason))
        raise EnvironmentError("Unable to get volumes.")

    if httpsConnection is not None:
        httpsConnection.close()

    return data
def get_pages(data_api_config: 'htrc.config.HtrcDataApiConfig', page_ids, concat=False, mets=False, buffer_size=128):
    """
    Returns a ZIP file containing specific pages.

    Parameters:
    :data_api_config: The configuration data of the DataAPI endpoint.
        Annotation is a string literal so the module can be loaded without
        evaluating htrc.config at definition time.
    :page_ids: A list of page ids.
    :concat: If True, return a single file per volume. If False, return a single
        file per page (default). Mutually exclusive with `mets`.
    :mets: If True, also request METS metadata.
    :buffer_size: Bytes read from the HTTP response per iteration.

    Raises ValueError on an empty id list and EnvironmentError when the Data
    API responds with a non-200 status.
    """
    if not page_ids:
        raise ValueError("page_ids is empty.")

    url = data_api_config.epr + "pages"

    # Sanity check; invalid ids are only reported, the request still proceeds.
    for id in page_ids:
        if ("." not in id
                or " " in id):
            print("Invalid volume id " + id + ". Please correct this volume id and try again.")

    data = {'pageIDs': '|'.join(
        [id.replace('+', ':').replace('=', '/') for id in page_ids])}

    # concat+mets is rejected earlier by download_volumes; here it is only
    # reported (kept for backward compatibility -- callers may rely on it).
    if concat and mets:
        print("Cannot set both concat and mets with pages.")
    elif concat:
        data['concat'] = 'true'
    elif mets:
        data['mets'] = 'true'

    # Authorization
    headers = {"Authorization": "Bearer " + data_api_config.token,
               "Content-type": "application/x-www-form-urlencoded"}

    # Create SSL context
    # TODO: Fix SSL cert verification
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    #ctx.verify_mode = ssl.CERT_NONE

    # Retrieve the pages
    httpsConnection = http.client.HTTPSConnection(
        data_api_config.host,
        data_api_config.port,
        context=ctx,
        key_file=data_api_config.key,
        cert_file=data_api_config.cert
    )

    httpsConnection.request("POST", url, urlencode(data), headers)

    response = httpsConnection.getresponse()

    # BUG FIX: was `response.status is 200` -- use equality, not identity.
    if response.status == 200:
        body = True
        data = BytesIO()
        bytes_downloaded = 0
        bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength,
                                      widgets=[progressbar.AnimatedMarker(), ' ',
                                               progressbar.DataSize(),
                                               ' (', progressbar.FileTransferSpeed(), ')'])

        while body:
            body = response.read(buffer_size)
            data.write(body)
            bytes_downloaded += len(body)
            bar.update(bytes_downloaded)

        data = data.getvalue()
    else:
        logging.debug("Unable to get pages")
        # BUG FIX: the original called "...: ".format(x) with no {} placeholder,
        # so the status/reason were silently dropped from the log output.
        logging.debug("Response Code: %s", response.status)
        logging.debug("Response: %s", response.reason)
        raise EnvironmentError("Unable to get pages.")

    if httpsConnection is not None:
        httpsConnection.close()

    return data
def grep_error(file_name, output_dir, pattern, txt_index):
    """
    Scan a report file for lines containing `pattern` and collect the
    whitespace-separated token at `txt_index` from each matching line.

    Returns an empty list when the file does not exist.
    (Fix: the original had a duplicated, unreachable `return` statement.)
    """
    na_volume = []
    file_path = os.path.join(output_dir, file_name)

    if os.path.isfile(file_path):
        for line in open(file_path):
            if pattern in line:
                na_volume.append(line.split()[txt_index])

    return na_volume


def _to_htrc_page(page_file, zip):
    """Read one page file out of an open ZipFile and wrap it as an HtrcPage."""
    with TextIOWrapper(BytesIO(zip.read(page_file)), encoding='utf-8') as page:
        return HtrcPage([line.rstrip() for line in page.readlines()])


def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=False,
                     remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, skip_removed_hf=False,
                     parallelism=multiprocessing.cpu_count(), batch_size=250, data_api_config=None):
    """
    Download volumes (or pages) from the HTRC Data API into `output_dir`.

    Volumes are fetched in batches of `batch_size`; header/footer removal, if
    requested, is parallelized over a process pool of size `parallelism`.
    Error and rights reports returned by the API are accumulated and written
    alongside the downloaded texts; unavailable volumes are listed in
    'volumes_not_available.txt'.

    Raises ValueError for a bad parallelism level or concat+mets with pages,
    and RuntimeError on API timeouts or missing credentials.
    """
    if not 0 < parallelism <= multiprocessing.cpu_count():
        raise ValueError("Invalid parallelism level specified")

    remove_hf_fun = partial(
        _remove_headers_footers_and_save,
        concat=concat,
        hf_min_similarity=hf_min_similarity,
        hf_window_size=hf_window_size,
        skip_removed_hf=skip_removed_hf,
        output_dir=output_dir
    )

    volume_ids = list(set(volume_ids))  # ensure unique volume ids
    num_vols = len(volume_ids)

    data_api_config = data_api_config or htrc.config.HtrcDataApiConfig()

    os.makedirs(output_dir, exist_ok=True)

    # BUG FIX: the original tested `any((token, host, port)) is not None`,
    # which is always True because any() returns a bool -- the credential
    # check below never fired. Require all three to be present.
    if all(x is not None for x in (data_api_config.token,
                                   data_api_config.host,
                                   data_api_config.port)):
        logging.info("obtained token: %s\n" % data_api_config.token)

        try:
            errors = []
            rights = []

            with tqdm(total=num_vols) as progress, multiprocessing.Pool(processes=parallelism) as pool:
                for ids in split_items(volume_ids, batch_size):
                    if pages:
                        if concat and mets:
                            raise ValueError("Cannot set both concat and mets with pages.")
                        else:
                            data = get_pages(data_api_config, ids, concat and not remove_headers_footers, mets)
                    else:
                        data = get_volumes(data_api_config, ids, concat and not remove_headers_footers, mets)

                    volumes = []

                    with ZipFile(BytesIO(data)) as vols_zip:
                        zip_list = vols_zip.namelist()
                        if 'ERROR.err' in zip_list:
                            errors.append(vols_zip.read('ERROR.err').decode('utf-8'))
                            zip_list.remove('ERROR.err')
                        if 'volume-rights.txt' in zip_list:
                            rights_data = vols_zip.read('volume-rights.txt').decode('utf-8')
                            zip_list.remove('volume-rights.txt')
                            if not rights:
                                rights.append(rights_data)
                            else:
                                # due to the format in which 'volume-rights.txt' is created, we have to skip
                                # the first 4 lines which make up the header of the file, to extract only the
                                # actual volume rights data for accumulation
                                rights.append(''.join(rights_data.splitlines(keepends=True)[4:]))

                        zip_volume_paths = [zip_vol_path for zip_vol_path in zip_list if zip_vol_path.endswith('/')]
                        num_vols_in_zip = len(zip_volume_paths)

                        if not remove_headers_footers:
                            vols_zip.extractall(output_dir, members=zip_list)
                            progress.update(num_vols_in_zip)
                        else:
                            for zip_vol_path in zip_volume_paths:
                                sorted_vol_zip_page_paths = sorted(
                                    zip_page_path for zip_page_path in zip_list
                                    if zip_page_path.startswith(zip_vol_path) and not zip_page_path.endswith('/'))
                                vol_pages = [_to_htrc_page(page_path, vols_zip) for page_path in sorted_vol_zip_page_paths]
                                volumes.append((zip_vol_path, sorted_vol_zip_page_paths, vol_pages))

                    del data, vols_zip

                    num_missing = batch_size - num_vols_in_zip if num_vols >= batch_size else num_vols - num_vols_in_zip
                    progress.update(num_missing)  # include the missing volumes in the progress state

                    # `volumes` is empty when `remove_headers_footers=False` since the ZIP
                    # was extracted without further processing
                    if volumes:
                        for _ in pool.imap_unordered(remove_hf_fun, volumes):
                            progress.update()

            na_volumes_all = []

            if errors:
                with open(os.path.join(output_dir, 'ERROR.err'), 'w') as err_file:
                    err_file.write(''.join(errors))

                na_volumes_error = grep_error('ERROR.err', output_dir, 'KeyNotFoundException', -1)
                na_volumes_all.extend(na_volumes_error)

            if rights:
                with open(os.path.join(output_dir, 'volume-rights.txt'), 'w') as rights_file:
                    rights_file.write(''.join(rights))

                if htrc.config.get_dataapi_access() == "true":
                    na_volumes_rights = grep_error('volume-rights.txt', output_dir, ' 3', 0)
                    na_volumes_all.extend(na_volumes_rights)

            num_na = len(na_volumes_all)

            if num_na > 0:
                with open(os.path.join(output_dir, 'volumes_not_available.txt'), 'w') as volumes_na:
                    volumes_na.write("\n".join(str(item) for item in na_volumes_all))

                if num_na < 100:
                    print("\nThe following volume ids are not available. \n Please check volumes_not_available.txt "
                          "for the complete list. ")
                    print('\n'.join(str(item) for item in na_volumes_all))
                else:
                    print("\nThere are {:,} unavailable volumes.\n Please check volumes_not_available.txt "
                          "for the "
                          "complete list. \nTo check the validity of volumes in your workset or volume id file go "
                          "to:\n "
                          "https://analytics.hathitrust.org/validateworkset \n or email us at "
                          "htrc-help@hathitrust.org "
                          "for assistance.".format(num_na))

        except socket.error:
            raise RuntimeError("HTRC Data API time out. Check your inode usage if downloading a large workset. "
                               "Contact HTRC for further help.")

    else:
        raise RuntimeError("Failed to obtain the JWT token.")
def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, skip_removed_hf, output_dir):
    """
    Strip running headers/footers from one volume's pages and save the result.

    `vol_data` is a (zip_vol_path, sorted_vol_zip_page_paths, vol_pages)
    triple as produced by `download_volumes`. When `concat` is True the page
    bodies are joined into a single <volid>.txt; otherwise each page is
    written under its original path. Unless `skip_removed_hf` is set, the
    identified headers/footers are additionally written to a CSV for user
    inspection.
    """
    zip_vol_path, sorted_vol_zip_page_paths, vol_pages = vol_data
    clean_volid = zip_vol_path[:-1]  # drop the trailing '/'

    vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size, min_similarity_ratio=hf_min_similarity)
    pages_body = (page.body for page in vol_pages)

    # Save the cleaned pages. (Refactor: the original duplicated this whole
    # if/else verbatim in both branches of `skip_removed_hf`.)
    if concat:
        with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file:
            vol_file.write('\n'.join(pages_body))
    else:
        vol_path = os.path.join(output_dir, zip_vol_path)
        os.mkdir(vol_path)
        for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body):
            with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file:
                page_file.write(page_body)

    # NOTE(review): reconstructed nesting -- the removed-header report is
    # taken to run only when skip_removed_hf is False, matching the flag name
    # and the original "save the removed headers/footers" comment; confirm.
    if skip_removed_hf:
        return

    # save the removed headers/footers for user inspection
    removed_hf = []
    for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages):
        if not (vol_page.has_header or vol_page.has_footer):
            # skip reporting pages that don't have an identified header or footer
            continue
        _, page_name = os.path.split(vol_page_path)
        page_name, _ = os.path.splitext(page_name)
        removed_hf.append({'page': page_name, 'header': vol_page.header, 'footer': vol_page.footer})

    if concat:
        removed_hf_filename = os.path.join(output_dir, clean_volid + '_removed_hf.csv')
    else:
        removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv')

    pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']).to_csv(removed_hf_filename, index=False)


def download(args):
    """
    CLI entry point: read volume ids from `args.file` (one per line) and
    delegate to `download_volumes` with a Data API config assembled from the
    parsed command-line arguments.
    """
    # extract the volume ids from the input file
    with open(args.file) as IDfile:
        volumeIDs = [line.strip() for line in IDfile]

    data_api_config = htrc.config.HtrcDataApiConfig(
        token=args.token,
        host=args.datahost,
        port=args.dataport,
        epr=args.dataepr,
        cert=args.datacert,
        key=args.datakey
    )

    return download_volumes(volumeIDs, args.output,
                            remove_headers_footers=args.remove_headers_footers or args.remove_headers_footers_and_concat,
                            concat=args.concat or args.remove_headers_footers_and_concat,
                            mets=args.mets,
                            pages=args.pages,
                            hf_window_size=args.window_size,
                            hf_min_similarity=args.min_similarity_ratio,
                            parallelism=args.parallelism,
                            batch_size=args.batch_size,
                            skip_removed_hf=args.skip_removed_hf,
                            data_api_config=data_api_config)
def get_volumes(data):
    """
    Takes a data structure in the canonical HathiTrust JSON-LD format
    and expands the dataset. Traverses the edm:gathers relation to find
    all HT volume IDs.

    Returns a list of volume IDs for use with the `htrc.metadata` and
    `htrc.volume` modules.
    """
    # Expand the document so every property is fully namespaced.
    expanded = jsonld.expand(data)

    GATHERS = 'http://www.europeana.eu/schemas/edm/gathers'
    HANDLE_PREFIX = 'http://hdl.handle.net/2027/'

    # A set de-duplicates ids when the `@graph` holds multiple worksets.
    volume_ids = set()
    for node in expanded:
        for gathered in node.get(GATHERS, []):
            volume_ids.add(gathered['@id'].replace(HANDLE_PREFIX, ''))

    # return the list representation, maintains a more consistent interface
    return list(volume_ids)


def create_jsonld(volumes, title=None, curator=None):
    """
    Takes a list of volumes and exports a JSON-LD formated workset.

    When `curator` is None the current OS user name is used.
    """
    if curator is None:
        import getpass
        curator = getpass.getuser()

    context = "http://emblematica.library.illinois.edu/test/worksetcontext.jsonld"
    GATHERS = "http://www.europeana.eu/schemas/edm/gathers"

    graph = {
        '@type': 'http://wcsa.htrc.illinois.edu/Workset',
        GATHERS: [{'@id': "http://hdl.handle.net/2027/" + vol} for vol in volumes],
        'numItems': len(volumes),
    }
    if curator:
        graph['curator'] = curator
    if title:
        graph['title'] = title

    return jsonld.compact(graph, context)


def load(filename):
    """
    Takes a filename and retrieves a list of volumes from the workset
    description. If a URL is passed, automatically uses `load_url` to resolve.
    """
    if filename.startswith('http://') or filename.startswith('https://'):
        return load_url(filename)

    with open(filename) as infile:
        workset_data = json.load(infile)

    # Retrieve and return the volumes
    return get_volumes(workset_data)
def load_url(url):
    """
    Takes a workset URL, parses it, and uses the workset retrieval API to
    fetch the data and return the volumes.
    """
    url_components = urlparse(url)
    if url_components.netloc.startswith('babel.hathitrust.org'):
        # HathiTrust Collection Builder URL: handled separately because it
        # returns CSV, not JSON-LD.
        return load_hathitrust_collection(url)
    elif (url_components.netloc.startswith('htrc.hathitrust.org')
            and url_components.path.startswith('/wsid/')):
        # HTRC workset id URL: wrap it in a call to the WS fetch service.
        base_url = 'http://acbres224.ischool.illinois.edu:8080'
        base_url += '/dcWSfetch/getDescription?id='
        base_url += url
        url = base_url
    elif (url_components.netloc.startswith('acbres224.ischool.illinois.edu')
            and url_components.path.startswith('/dcWSfetch/')):
        # copied from direct call to WS fetch, a-ok.
        pass
    else:
        raise ValueError("Invalid workset URL: {}".format(url))

    # Fetch the (possibly rewritten) URL and parse the JSON-LD payload.
    response = urlopen(url)
    data = json.loads(response.read().decode('utf-8'))

    return get_volumes(data)


def get_volumes_from_csv(data):
    """
    Retrieves the volume list from a HathiTrust collection CSV export.

    :param data: Raw bytes of a tab-separated export with an `htitem_id`
        column.
    """
    csvfile = BytesIO(data)
    reader = csv.DictReader(csvfile, delimiter='\t')
    volumes = [row['htitem_id'] for row in reader]
    csvfile.close()

    return volumes
def load_hathitrust_collection(url):
    """
    Retrieves the volume list for a given HathiTrust Collection URL.

    In contrast to `get_volumes_from_csv`, which makes the request and handles
    data, this function parses out the collection ID from a variety of
    canonical URL schemes for collections:
    - https://babel.hathitrust.org/shcgi/mb?a=listis;c=548413090
    - https://babel.hathitrust.org/cgi/mb?a=listis&c=548413090

    Raises ValueError for a non-babel URL or one without a `c=` parameter.
    """
    if not url.startswith('https://babel.hathitrust.org/'):
        raise ValueError('Invalid HathiTrust Collection URL: {}'.format(url))
    try:
        # Fix: raw string -- '\d' in a plain string is an invalid escape
        # (DeprecationWarning on modern Python).
        collection_id = re.search(r'c=(\d+)', url).group(1)
    except AttributeError:
        raise ValueError('Invalid HathiTrust Collection URL: {}'.format(url))

    # POST the download form to get the collection as tab-separated text.
    url = "https://babel.hathitrust.org/shcgi/mb"
    data = "a=download&c={}&format=text".format(collection_id)

    response = urlopen(url, bytes(data.encode('utf-8')))
    data = response.read()

    return get_volumes_from_csv(data)
def path_to_volumes(path):
    """
    Takes a path and resolves to a list of volumes.

    Accepts:

    - Plaintext file, each line is an ID
    - Directory with subfolders that are volume pages
    - JSON or JSONLD workset representation
    - HT CB or HTRC WCSA URL.

    Raises ValueError when the path matches none of the above.
    """
    if os.path.isdir(path):
        # Each directory entry is one volume; skip '.log' download artifacts.
        # ('id' renamed to avoid shadowing the builtin.)
        volumes = [vol_id for vol_id in os.listdir(path)
                   if not vol_id.endswith('.log')]
    elif (path.endswith(('json', 'jsonld'))
            or path.startswith('http://')
            or path.startswith('https://')):
        # Workset representations and URLs are handled by load().
        volumes = load(path)
    elif os.path.isfile(path):
        # Plaintext list: one volume ID per line.
        with open(path) as infile:
            volumes = [line.strip() for line in infile]
    else:
        raise ValueError("Invalid workset path.")

    return volumes
def _download_config():
    """Fetch the default .htrc config file into the user's home directory."""
    print("Downloading .htrc file...")

    _config_file_url = 'https://analytics.hathitrust.org/files/.htrc'
    _path = os.path.expanduser('~/.htrc')
    try:
        from urllib.request import urlretrieve   # Python 3
    except ImportError:
        from urllib import urlretrieve           # Python 2 fallback
    urlretrieve(_config_file_url, _path)

    print("\n")


def _install_mallet():
    """Download and unpack Mallet into ~/mallet unless it already exists."""
    mallet_path = os.path.expanduser('~/mallet')
    if not os.path.exists(mallet_path):
        print('Installing Mallet ...')
        os.makedirs(mallet_path)
        try:
            from urllib.request import urlretrieve   # Python 3
        except ImportError:
            from urllib import urlretrieve           # Python 2 fallback
        # urlretrieve with no filename downloads to a temporary file.
        mallet_zip, _ = urlretrieve(
            'http://mallet.cs.umass.edu/dist/mallet-2.0.8RC3.tar.gz')
        # Context manager closes the archive even if extraction fails
        # (the original leaked the handle on error). Supported since 2.7.
        # NOTE(review): this extracts a nested mallet-2.0.8RC3/ directory
        # inside ~/mallet — confirm downstream tools expect that layout.
        with tarfile.open(mallet_zip, "r:gz") as mallet_dir:
            mallet_dir.extractall(path=mallet_path)
        print('\n')


class PostInstallCommand(install, object):
    """`install` command that fetches the config file and Mallet post-install.

    The downloads are registered with atexit so they run only after
    setuptools has finished writing the package files.
    """
    def __init__(self, *args, **kwargs):
        super(PostInstallCommand, self).__init__(*args, **kwargs)
        atexit.register(_download_config)
        atexit.register(_install_mallet)
Language :: Python :: 3.5", 73 | "Development Status :: 4 - Beta", 74 | "Intended Audience :: Science/Research", 75 | "License :: OSI Approved :: MIT License", 76 | "Operating System :: Unix", 77 | "Topic :: Scientific/Engineering :: Information Analysis", 78 | "Topic :: Software Development :: Libraries :: Python Modules", 79 | "Topic :: Text Processing :: Linguistic", 80 | ], 81 | packages=find_packages(), 82 | install_requires=install_requires, 83 | include_package_data=True, 84 | data_files=[('htrc/mock/volumes', ['htrc/mock/volumes/example.zip']), 85 | ('htrc', ['htrc/.htrc.default'])], 86 | zip_safe=False, 87 | entry_points={ 88 | 'console_scripts': ['htrc = htrc.__main__:main'] 89 | }, 90 | test_suite="unittest2.collector", 91 | tests_require=['unittest2'], 92 | cmdclass={'install': PostInstallCommand} 93 | ) 94 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/htrc/HTRC-WorksetToolkit/3c3428d80a72a644925dc6ab2827470c8467af30/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/example.csv: -------------------------------------------------------------------------------- 1 | htitem_id title author date rights OCLC LCCN ISBN catalog_url handle_url 2 | mdp.39015050817181 Archaeology, art, and religion : new perspectives on Vijayanagar / Anila Verghese. Verghese, Anila. 2000-00-00 ic 41960671 00371223 9780195648904,0195648900 http://catalog.hathitrust.org/Record/004126407 http://hdl.handle.net/2027/mdp.39015050817181 3 | mdp.39015055436151 Hampi / Anila Verghese. Verghese, Anila. 2002-00-00 ic 47940144 2002285547 9780195654332,0195654331 http://catalog.hathitrust.org/Record/004227991 http://hdl.handle.net/2027/mdp.39015055436151 4 | mdp.39015056169157 Hampi / Anila Verghese. Verghese, Anila. 
2002-00-00 ic 47940144 2002285547 9780195654332,0195654331 http://catalog.hathitrust.org/Record/004227991 http://hdl.handle.net/2027/mdp.39015056169157 5 | mdp.39015050161697 Religious traditions at Vijayanagara, as revealed through its monuments / Anila Verghese. Verghese, Anila. 1995-00-00 ic 32893147 95903106 9788173040863,8173040869 http://catalog.hathitrust.org/Record/004054378 http://hdl.handle.net/2027/mdp.39015050161697 6 | mdp.39015042791874 Sculpture at Vijayanagara : iconography and style / Anna L. Dallapiccola, Anila Verghese. Dallapiccola, Anna L. 1944- 1998-00-00 ic 40480543 98909496 9788173042324,8173042322 http://catalog.hathitrust.org/Record/003333435 http://hdl.handle.net/2027/mdp.39015042791874 -------------------------------------------------------------------------------- /tests/data/example.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "http://emblematica.library.illinois.edu/test/worksetcontext.jsonld", 3 | "@graph" : [ 4 | { 5 | "@id": "https://babel.hathitrust.org/shcgi/mb?a=listis;c=548413090", 6 | "@type": "Workset", 7 | "title": "Sample", 8 | "curator": "jammurdo", 9 | "numItems": 5, 10 | "gathers": [ 11 | { 12 | "@id": "http://hdl.handle.net/2027/mdp.39015050817181" 13 | }, 14 | { 15 | "@id": "http://hdl.handle.net/2027/mdp.39015055436151" 16 | }, 17 | { 18 | "@id": "http://hdl.handle.net/2027/mdp.39015056169157" 19 | }, 20 | { 21 | "@id": "http://hdl.handle.net/2027/mdp.39015050161697" 22 | }, 23 | { 24 | "@id": "http://hdl.handle.net/2027/mdp.39015042791874" 25 | } 26 | ] 27 | }] 28 | } 29 | -------------------------------------------------------------------------------- /tests/data/example.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/htrc/HTRC-WorksetToolkit/3c3428d80a72a644925dc6ab2827470c8467af30/tests/data/example.zip 
# --- tests/test_download_cli.py -------------------------------------------
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()

import sys
# 'mock' is a separate package on Python 2; bundled as unittest.mock on 3.
if sys.version_info.major == 2:
    from mock import Mock, patch, PropertyMock
elif sys.version_info.major == 3:
    from unittest.mock import Mock, patch, PropertyMock

import unittest2 as unittest

import htrc.__main__
import argparse

class TestDownload(unittest.TestCase):
    # Each test patches the CLI argv (argparse reads argparse._sys.argv) with
    # a different identifier form and stubs the network-bound download() to
    # check dispatch only — no real requests are made.

    @patch('argparse._sys.argv', ['htrc', 'download', 'mdp.1234567'])
    @patch('htrc.__main__.download')
    def test_raw_volume_id(self, download_mock):
        # Bare HathiTrust volume ID.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', '001423370'])
    @patch('htrc.__main__.download')
    def test_raw_record_id(self, download_mock):
        # Bare catalog record ID.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', 'https://babel.hathitrust.org/cgi/pt?id=mdp.39015078560078;view=1up;seq=13'])
    @patch('htrc.__main__.download')
    def test_babel_url(self, download_mock):
        # Babel page-turner URL.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', 'https://hdl.handle.net/2027/mdp.39015078560078'])
    @patch('htrc.__main__.download')
    def test_handle_url(self, download_mock):
        # Handle.net persistent URL.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', 'https://catalog.hathitrust.org/Record/001423370'])
    @patch('htrc.__main__.download')
    def test_catalog_url(self, download_mock):
        # Catalog record URL.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', 'https://babel.hathitrust.org/shcgi/mb?a=listis;c=696632727'])
    @patch('htrc.__main__.download')
    def test_collection_builder_url(self, download_mock):
        # Collection Builder list URL.
        htrc.__main__.main()
        download_mock.assert_called_once()

# --- tests/test_htrc_lib_cli.py -------------------------------------------
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()

import sys
if sys.version_info.major == 2:
    from mock import Mock, patch, PropertyMock
elif sys.version_info.major == 3:
    from unittest.mock import Mock, patch, PropertyMock

import unittest2 as unittest

from htrc.lib.cli import *

class TestVolumes(unittest.TestCase):
    @patch('htrc.lib.cli.input')
    def test_bool_prompt(self, input_mock):
        # 'y'/'n' answers map to True/False.
        # test True
        input_mock.return_value = 'y'
        return_value = bool_prompt("Enter yes")
        self.assertEqual(return_value, True)

        input_mock.return_value = 'n'
        return_value = bool_prompt("Enter no")
        self.assertEqual(return_value, False)

        # An empty answer falls back to the supplied default.
        input_mock.return_value = ''
        return_value = bool_prompt("Enter nothing for false", default=False)
        self.assertEqual(return_value, False)

        return_value = bool_prompt("Enter nothing for true", default=True)
        self.assertEqual(return_value, True)

    @patch('htrc.lib.cli.input')
    def test_prompt_default(self, input_mock):
        # Empty input returns the default string.
        input_mock.return_value = ''
        return_value = prompt("Enter nothing for 3", default='3')
        self.assertEqual(return_value, '3')
# (file header: from __future__ import print_function / from future import
# standard_library — precedes this chunk.)
standard_library.install_aliases()

import sys
# 'mock' is a separate package on Python 2; bundled as unittest.mock on 3.
if sys.version_info.major == 2:
    from mock import Mock, patch, PropertyMock
elif sys.version_info.major == 3:
    from unittest.mock import Mock, patch, PropertyMock

from io import BytesIO # used to stream http response into zipfile.
from tempfile import NamedTemporaryFile, mkdtemp
import unittest2 as unittest

import htrc.mock.volumes

class MockResponse(BytesIO):
    # Minimal stand-in for an HTTP response: a byte stream plus status code.
    def __init__(self, data, status=200, *args, **kwargs):
        BytesIO.__init__(self, data, *args, **kwargs)
        self.status = status

class TestVolumes(unittest.TestCase):
    def setUp(self):
        # Known-good volume IDs matching the bundled example workset.
        self.test_vols = ['mdp.39015050817181', 'mdp.39015055436151',
            'mdp.39015056169157', 'mdp.39015050161697', 'mdp.39015042791874']

        self.config_path = NamedTemporaryFile(delete=False).name
        self.empty_config_path = NamedTemporaryFile(delete=False).name

        self.output_path = mkdtemp()

    def tearDown(self):
        import os, shutil
        os.remove(self.config_path)
        shutil.rmtree(self.output_path)
        # NOTE(review): self.empty_config_path is never removed — confirm
        # whether leaking the temp file is intentional.

    @patch('htrc.mock.volumes.credential_prompt')
    def test_credential_prompt(self, credential_prompt_mock):
        # configure mocks
        credential_prompt_mock.return_value = ('1234', '1234')

        # test prompts
        username, password = htrc.mock.volumes.credential_prompt(self.config_path)
        self.assertEqual(username, '1234')
        self.assertEqual(password, '1234')

        # test read — the mock backend stores nothing, so both come back None.
        username, password = htrc.mock.volumes.credentials_from_config(
            self.config_path)
        self.assertEqual(username, None)
        self.assertEqual(password, None)

    def test_get_oauth2_token(self):
        # The mock backend returns a fixed token for any credentials.
        token = htrc.mock.volumes.get_oauth2_token('1234','1234')
        self.assertEqual(token, 'a1b2c3d4e5f6')

    def test_get_volumes_and_pages(self):
        # Happy path: calls should not raise.
        htrc.mock.volumes.get_volumes('1234', self.test_vols)
        htrc.mock.volumes.get_pages('1234', self.test_vols)

    def test_get_volumes_and_pages_empty(self):
        # Requesting zero volumes is a usage error.
        with self.assertRaises(ValueError):
            htrc.mock.volumes.get_volumes('1234', [])

        with self.assertRaises(ValueError):
            htrc.mock.volumes.get_pages('1234', [])

    @patch('htrc.mock.volumes.ZipFile')
    @patch('htrc.mock.volumes.get_volumes')
    @patch('htrc.mock.volumes.get_oauth2_token')
    def test_download_volumes(self, oauth2_mock, volumes_mock, zip_mock):
        oauth2_mock.return_value = 'a1b2c3d4e5'
        volumes_mock.return_value = b''

        htrc.mock.volumes.download_volumes(self.test_vols, self.output_path,
            username='1234', password='1234')

        # test directory creation
        import shutil
        shutil.rmtree(self.output_path)
        htrc.mock.volumes.download_volumes(self.test_vols, self.output_path,
            username='1234', password='1234')

    @patch('htrc.mock.volumes.ZipFile')
    @patch('htrc.mock.volumes.get_volumes')
    @patch('htrc.mock.volumes.get_oauth2_token')
    def test_download_volumes_saved_creds(self, oauth2_mock, volumes_mock, zip_mock):
        oauth2_mock.return_value = 'a1b2c3d4e5'
        volumes_mock.return_value = b''

        # test config-based auth; only create (and clean up) ~/.htrc when the
        # machine running the tests does not already have one.
        import os, os.path
        config_path = os.path.expanduser('~')
        config_path = os.path.join(config_path, '.htrc')
        preexisting_config = os.path.exists(config_path)
        if not preexisting_config:
            htrc.mock.volumes.save_credentials('1234', '1234', config_path)

        htrc.mock.volumes.download_volumes(self.test_vols, self.output_path)

        if not preexisting_config:
            os.remove(config_path)

    def test_download(self):
        pass

# NOTE(review): running the suite at import time duplicates runs under
# `setup.py test` — consider guarding with `if __name__ == '__main__':`.
suite = unittest.TestLoader().loadTestsFromTestCase(TestVolumes)
unittest.TextTestRunner(verbosity=2).run(suite)
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()

import sys
# 'mock' is a separate package on Python 2; bundled as unittest.mock on 3.
if sys.version_info.major == 2:
    from mock import Mock, patch
elif sys.version_info.major == 3:
    from unittest.mock import Mock, patch

import unittest2 as unittest

import htrc.util.resolve as resolve

class TestResolve(unittest.TestCase):
    def test_parse_record_id(self):
        # Record IDs parse from catalog URLs and bare ID strings alike.
        id = resolve.parse_record_id('https://catalog.hathitrust.org/Record/000234911')
        self.assertEqual(id, '000234911')

        id = resolve.parse_record_id('000234911')
        self.assertEqual(id, '000234911')


        # Volume handles and arbitrary text are rejected.
        with self.assertRaises(ValueError):
            resolve.parse_record_id('https://hdl.handle.net/2027/hvd.hn3t2m')

        with self.assertRaises(ValueError):
            resolve.parse_record_id('this is not a valid URL or volume ID')

    def test_parse_truncated_record_id(self):
        # test truncated IDs
        with self.assertRaises(ValueError):
            resolve.parse_record_id('234911')

        # fix_truncated_id pads the ID with leading zeros to nine digits.
        id = resolve.parse_record_id('234911', fix_truncated_id=True)
        self.assertEqual(id, '000234911')

    def test_parse_volume_id(self):
        # Handle URLs, Babel URLs (';' or '&' separators), and bare IDs.
        id = resolve.parse_volume_id('https://hdl.handle.net/2027/uc2.ark:/13960/fk92805m1s')
        self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s')

        id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7')
        self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s')

        id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s&view=1up&seq=7')
        self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s')

        id = resolve.parse_volume_id('uc2.ark:/13960/fk92805m1s')
        self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s')

        with self.assertRaises(ValueError):
            # check if incorrect institution ID raises error
            resolve.parse_volume_id('uc42.ark:/13960/fk92805m1s')

    @patch('htrc.util.resolve.urlopen')
    def test_volume_id_to_record_id(self, urlopen_mock):
        # The resolver follows a redirect; geturl() supplies the final URL.
        urlopen_mock.return_value.geturl.return_value =\
            'https://catalog.hathitrust.org/Record/000850926'
        record_id = resolve.volume_id_to_record_id('uc2.ark:/13960/fk92805m1s')

        self.assertEqual(record_id, '000850926')


    @patch('htrc.util.resolve.urlopen')
    def test_record_id_to_volume_ids(self, urlopen_mock):
        # Canned brief-bib API JSON response containing a single item.
        urlopen_mock.return_value.read.return_value =\
            b'{"items":[{"orig":"Harvard University","fromRecord":"000850926","htid":"hvd.hn3t2m","itemURL":"https:\/\/hdl.handle.net\/2027\/hvd.hn3t2m","rightsCode":"pd","lastUpdate":"20130803","enumcron":false,"usRightsString":"Full view"}]}'.decode('utf-8')
        ids = resolve.record_id_to_volume_ids('000234911')
        self.assertEqual(ids, ['hvd.hn3t2m'])
# (file header imports — __future__, future.standard_library,
# mock/unittest.mock, io.BytesIO — precede this chunk.)
from tempfile import NamedTemporaryFile, mkdtemp
import unittest2 as unittest

import htrc.volumes
import htrc.config

class MockResponse(BytesIO):
    # Minimal stand-in for an http.client response: byte stream + status code.
    def __init__(self, data, status=200, *args, **kwargs):
        BytesIO.__init__(self, data, *args, **kwargs)
        self.status = status

class TestVolumes(unittest.TestCase):
    def setUp(self):
        # Known-good volume IDs matching the bundled example workset.
        self.test_vols = ['mdp.39015050817181', 'mdp.39015055436151',
            'mdp.39015056169157', 'mdp.39015050161697', 'mdp.39015042791874']

        self.config_path = NamedTemporaryFile(delete=False).name
        self.empty_config_path = NamedTemporaryFile(delete=False).name

        self.output_path = mkdtemp()

    def tearDown(self):
        import os, shutil
        os.remove(self.config_path)
        shutil.rmtree(self.output_path)


    # @patch('htrc.volumes.http.client.HTTPSConnection')
    # def test_get_oauth2_token(self, https_mock):
    #     response_mock = Mock(status=200, return_value=b'')
    #     response_mock.read.return_value =\
    #         '{"access_token": "a1b2c3d4e5f6"}'.encode('utf8')
    #     https_mock.return_value.getresponse.return_value = response_mock
    #
    #     token = htrc.volumes.get_oauth2_token('1234','1234')
    #     self.assertEqual(token, 'a1b2c3d4e5f6')
    #
    # @patch('htrc.volumes.http.client.HTTPSConnection')
    # def test_get_oauth2_token_error(self, https_mock):
    #     response_mock = Mock(status=500)
    #     https_mock.return_value.getresponse.return_value = response_mock
    #
    #     with self.assertRaises(EnvironmentError):
    #         token = htrc.volumes.get_oauth2_token('1234','1234')

    @patch('htrc.volumes.http.client.HTTPSConnection')
    def test_get_volumes_and_pages(self, https_mock):
        # A 200 response with an empty body exercises the happy path.
        response_mock = Mock(status=200)
        response_mock.read.return_value =\
            ''.encode('utf8')
        https_mock.return_value.getresponse.return_value = response_mock
        data_api_config = htrc.config.HtrcDataApiConfig(
            token='1234',
            host='data-host',
            port=443,
            epr='/',
            cert='/home/client-certs/client.pem',
            key='/home/client-certs/client.pem'
        )

        htrc.volumes.get_volumes(data_api_config, self.test_vols)
        htrc.volumes.get_pages(data_api_config, self.test_vols)

    @patch('htrc.volumes.http.client.HTTPSConnection')
    def test_get_volumes_and_pages_error(self, https_mock):
        # A 500 response must surface as EnvironmentError.
        response_mock = Mock(status=500)
        https_mock.return_value.getresponse.return_value = response_mock

        data_api_config = htrc.config.HtrcDataApiConfig(
            token='1234',
            host='data-host',
            port=443,
            epr='/',
            cert='/home/client-certs/client.pem',
            key='/home/client-certs/client.pem'
        )

        with self.assertRaises(EnvironmentError):
            htrc.volumes.get_volumes(data_api_config, self.test_vols)

        with self.assertRaises(EnvironmentError):
            htrc.volumes.get_pages(data_api_config, self.test_vols)

    def test_get_volumes_and_pages_empty(self):
        # Requesting zero volumes is a usage error, not an HTTP error.
        data_api_config = htrc.config.HtrcDataApiConfig(
            token='1234',
            host='data-host',
            port=443,
            epr='/',
            cert='/home/client-certs/client.pem',
            key='/home/client-certs/client.pem'
        )

        with self.assertRaises(ValueError):
            htrc.volumes.get_volumes(data_api_config, [])

        with self.assertRaises(ValueError):
            htrc.volumes.get_pages(data_api_config, [])

    @patch('htrc.volumes.ZipFile')
    @patch('htrc.volumes.get_volumes')
    # test is looking for oauth2 tokens. looks like we made a jump to jwt but not seeing tests for those.
    # revised code to point towards mock.volumes.get_oauth2_token as a hot fix - 5/22 dan
    @patch('htrc.mock.volumes.get_oauth2_token')
    @patch('htrc.volumes.http.client.HTTPSConnection')
    def test_download_volumes(self, https_mock, oauth2_mock, volumes_mock,
        zip_mock):
        response_mock = Mock(status=200)
        https_mock.return_value.getresponse.return_value = response_mock
        oauth2_mock.return_value = 'a1b2c3d4e5'
        volumes_mock.return_value = b''

        data_api_config = htrc.config.HtrcDataApiConfig(
            token='1234',
            host='data-host',
            port=443,
            epr='/',
            cert='/home/client-certs/client.pem',
            key='/home/client-certs/client.pem'
        )

        htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config)

        # test directory creation
        import shutil
        shutil.rmtree(self.output_path)
        htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config)

    # TODO: Fix this test for case where config file exists, but creds not set
    """
    @patch('htrc.volumes.ZipFile')
    @patch('htrc.volumes.get_volumes')
    @patch('htrc.volumes.get_oauth2_token')
    @patch('htrc.volumes.http.client.HTTPSConnection')
    def test_download_volumes_saved_creds(self, https_mock, oauth2_mock, volumes_mock,
        zip_mock):
        response_mock = Mock(status=200)
        https_mock.return_value.getresponse.return_value = response_mock
        oauth2_mock.return_value = 'a1b2c3d4e5'
        volumes_mock.return_value = b''

        # test config-based auth
        import os, os.path
        config_path = os.path.expanduser('~')
        config_path = os.path.join(config_path, '.htrc')
        preexisting_config = os.path.exists(config_path)
        if not preexisting_config:
            htrc.config.save_credentials('1234', '1234', config_path)

        htrc.volumes.download_volumes(self.test_vols, self.output_path)

        if not preexisting_config:
            os.remove(config_path)
    """

    def test_download(self):
        pass


# NOTE(review): running the suite at import time duplicates runs under
# `setup.py test` — consider guarding with `if __name__ == '__main__':`.
suite = unittest.TestLoader().loadTestsFromTestCase(TestVolumes)
unittest.TextTestRunner(verbosity=2).run(suite)
    def test_load_file(self):
        # Loading the example JSON-LD workset from disk yields the volumes.
        vols = htrc.workset.load(self.example_file)

        # check that each volume correctly parsed out of JSONLD data
        for vol in self.test_vols:
            self.assertIn(vol, vols)

    def test_get_volumes_from_csv(self):
        # The tab-separated collection export parses to the same volumes.
        vols = htrc.workset.get_volumes_from_csv(self.csv)

        # check that each volume correctly parsed out of JSONLD data
        for vol in self.test_vols:
            self.assertIn(vol, vols)

    @patch('htrc.workset.urlopen')
    def test_load_url_hathitrust(self, urlopen_mock):
        ht_url = "https://babel.hathitrust.org/cgi/mb?a=listis&c=548413090"

        # test the default URL with a mock
        response_mock = Mock()
        urlopen_mock.return_value = response_mock
        response_mock.read.return_value = self.csv

        vols = htrc.workset.load_url(ht_url)

        # check that each volume correctly parsed out of CSV data
        for vol in self.test_vols:
            self.assertIn(vol, vols)


    @patch('htrc.workset.urlopen')
    def test_load_url_htrc(self, urlopen_mock):
        htrc_url = 'https://htrc.hathitrust.org/wsid/123456'

        # test the default URL with a mock
        response_mock = Mock()
        urlopen_mock.return_value = response_mock
        response_mock.read.return_value = json.dumps(self.json).encode('utf-8')

        vols = htrc.workset.load_url(htrc_url)

        # check that each volume correctly parsed out of JSONLD data
        for vol in self.test_vols:
            self.assertIn(vol, vols)

        # Also test a direct URL from the triple store service.
        # Since this is using a mock, it doesn't matter if the service
        # is up or down
        htrc_url2 = ('http://acbres224.ischool.illinois.edu:8080/' +
            'dcWSfetch/getItems?id=http://htrc.hathitrust.org/wsid/189324102')
        vols = htrc.workset.load_url(htrc_url2)

        # check that each volume correctly parsed out of JSONLD data
        for vol in self.test_vols:
            self.assertIn(vol, vols)

    @patch('htrc.workset.urlopen')
    def test_load_url_error(self, urlopen_mock):
        # Non-URL strings must be rejected before any network access.
        invalid_url = 'blahblahblah'

        with self.assertRaises(ValueError):
            htrc.workset.load_url(invalid_url)
# Template for fake volume directory names: htrc.test0, htrc.test1, ...
VOL_NAME = 'htrc.test{}'
# Page files use HathiTrust-style zero-padded sequence numbers.
PAGE_FILENAME = '{0:08d}.txt'

def generate_file(filename, N=4, separator='\n\n'):
    """Write N paragraphs of lorem-ipsum text to *filename*."""
    with open(filename, 'w') as outfile:
        # generate_paragraphs yields (sentences, words, text) triples;
        # only the text is used.
        for _, _, text in loremipsum.generate_paragraphs(N):
            outfile.write(text + separator)

def generate_volumes(num_volumes, num_pages=5):
    """Create *num_volumes* fake volume directories of lorem-ipsum pages.

    *num_pages* may be a single int (same page count for every volume) or
    a sequence supplying one count per volume.

    Raises ValueError when a num_pages sequence does not match num_volumes.
    """
    if isinstance(num_pages, int):
        num_pages = itertools.repeat(num_pages, num_volumes)
    elif len(num_pages) != num_volumes:
        raise ValueError("len(num_pages) != num_volumes")

    for i, pages in enumerate(num_pages):
        vol_name = VOL_NAME.format(i)
        if not os.path.exists(vol_name):
            os.makedirs(vol_name)

        # Page numbering is 1-based: 00000001.txt, 00000002.txt, ...
        # (replaces the old `for page in range(pages): page = page + 1`).
        for page in range(1, pages + 1):
            filename = os.path.join(vol_name, PAGE_FILENAME.format(page))
            generate_file(filename)

if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument('vols', type=int)
    parser.add_argument('pages', type=int)
    args = parser.parse_args()

    generate_volumes(args.vols, args.pages)