├── .gitignore ├── .travis.yml ├── HAEP.md ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── make.bat └── source │ ├── cli.rst │ ├── conf.py │ ├── index.rst │ ├── sdk.rst │ └── tips.rst ├── htrc ├── .htrc.default ├── __init__.py ├── __main__.py ├── auth.py ├── config.py ├── hf_utils │ └── __init__.py ├── lib │ ├── __init__.py │ └── cli.py ├── metadata │ ├── __init__.py │ └── marc.py ├── mock │ ├── __init__.py │ └── volumes │ │ ├── __init__.py │ │ └── example.zip ├── models │ └── __init__.py ├── runningheaders │ └── __init__.py ├── tools │ ├── __init__.py │ ├── mallet.py │ └── topicexplorer.py ├── util │ ├── __init__.py │ └── resolve.py ├── volumes │ └── __init__.py └── workset │ ├── __init__.py │ └── __main__.py ├── setup.py ├── tests ├── __init__.py ├── data │ ├── example.csv │ ├── example.jsonld │ └── example.zip ├── test_download_cli.py ├── test_htrc_lib_cli.py ├── test_htrc_mock_volumes.py ├── test_htrc_util_resolve.py ├── test_htrc_volumes.py └── test_htrc_workset.py └── utils └── generate_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | sandbox 3 | 4 | *.ini 5 | .pypirc 6 | .idea 7 | 8 | logs/* 9 | 10 | build/ 11 | dist/ 12 | docs/build/ 13 | htrc.egg-info 14 | 15 | *~ 16 | 17 | .coverage 18 | htmlcov/ 19 | .eggs 20 | ssl-cert-trust 21 | venv/ 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.6" 5 | # command to install dependencies 6 | install: 7 | - python --version 8 | - pip --version 9 | # command to install dependencies 10 | - pip install cython wget 11 | - pip install coveralls coverage codeclimate-test-reporter 12 | - pip install . 
13 | # command to run tests 14 | script: 15 | - coverage run --include=htrc/* setup.py test 16 | - coverage report 17 | after_success: 18 | - coveralls 19 | - codeclimate-test-reporter 20 | addons: 21 | code_climate: 22 | repo_token: 0299da27c6ac280129992725e48ee5ff71ea668b755a0301ebd8374c6900b80e 23 | -------------------------------------------------------------------------------- /HAEP.md: -------------------------------------------------------------------------------- 1 | # HAEP-1: HTRC User Toolkit 2 | 3 | HTRC Analytics Enhancement Proposal 1: HTRC Workset Toolkit 4 | 5 | ## Introduction 6 | The HTRC Workset Toolkit provides a command line interface for interacting with 7 | and analyzing volumes in the HathiTrust Digital Library. It operates on the 8 | concept of a "workset". A workset is a research collection intended for 9 | consumption by an automated process for non-consumptive analysis. 10 | 11 | The tools also assist with the HTRC Data Capsule, enabling you to download volumes 12 | to the secure mode of the capsule for analysis. 13 | 14 | ## Motivation 15 | Currently, we do not have an end-user tool built around the workset paradigm. 16 | This tool allows for a workset to be downloaded and analyzed using the Data 17 | Capsule, and enables testing outside of the Data Capsule. 
18 | 19 | ## Related Work 20 | 21 | ## Proposed Change 22 | The proposed changes are stored in the GitHub repository 23 | [htrc/HTRC-WorksetToolkit](http://github.com/htrc/HTRC-WorksetToolkit), with 24 | [documentation at GitHub.io](http://htrc.github.io/HTRC-WorksetToolkit) 25 | 26 | ## User Interface 27 | The HTRC Workset Toolkit provides a command line interface for interacting with 28 | and analyzing volumes in the HathiTrust Digital Library: 29 | 30 | - Volume Download (``htrc download``) 31 | - Metadata Download (``htrc metadata``) 32 | - Pre-built Analysis Workflows (``htrc run``) 33 | - Export of volume lists (``htrc export``) 34 | 35 | Each tool operates on a *workset*, which is a collection of volumes, pages, 36 | or catalog records. 37 | 38 | A workset is referenced by a :ref:`workset path`, which is one of 7 types of 39 | identifiers. Almost any web page on http://hathitrust.org is a valid identifier, 40 | including the PageTurner view, Catalog record view, and Collection Builder 41 | collections. 42 | 43 | 44 | ## Migration and Maintenance Plan 45 | The project has complete Travis-CI unit test integration at [http://travis-ci.org/htrc/HTRC-PythonSDK]. 46 | 47 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 2012 The Obvious Corporation and contributors. 2 | 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | 16 | ``` 17 | ------------------------------------------------------------------------- 18 | Apache License 19 | Version 2.0, January 2004 20 | http://www.apache.org/licenses/ 21 | 22 | 23 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 24 | 25 | 1. Definitions. 26 | 27 | "License" shall mean the terms and conditions for use, reproduction, 28 | and distribution as defined by Sections 1 through 9 of this document. 29 | 30 | "Licensor" shall mean the copyright owner or entity authorized by 31 | the copyright owner that is granting the License. 32 | 33 | "Legal Entity" shall mean the union of the acting entity and all 34 | other entities that control, are controlled by, or are under common 35 | control with that entity. For the purposes of this definition, 36 | "control" means (i) the power, direct or indirect, to cause the 37 | direction or management of such entity, whether by contract or 38 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 39 | outstanding shares, or (iii) beneficial ownership of such entity. 40 | 41 | "You" (or "Your") shall mean an individual or Legal Entity 42 | exercising permissions granted by this License. 43 | 44 | "Source" form shall mean the preferred form for making modifications, 45 | including but not limited to software source code, documentation 46 | source, and configuration files. 47 | 48 | "Object" form shall mean any form resulting from mechanical 49 | transformation or translation of a Source form, including but 50 | not limited to compiled object code, generated documentation, 51 | and conversions to other media types. 52 | 53 | "Work" shall mean the work of authorship, whether in Source or 54 | Object form, made available under the License, as indicated by a 55 | copyright notice that is included in or attached to the work 56 | (an example is provided in the Appendix below). 
57 | 58 | "Derivative Works" shall mean any work, whether in Source or Object 59 | form, that is based on (or derived from) the Work and for which the 60 | editorial revisions, annotations, elaborations, or other modifications 61 | represent, as a whole, an original work of authorship. For the purposes 62 | of this License, Derivative Works shall not include works that remain 63 | separable from, or merely link (or bind by name) to the interfaces of, 64 | the Work and Derivative Works thereof. 65 | 66 | "Contribution" shall mean any work of authorship, including 67 | the original version of the Work and any modifications or additions 68 | to that Work or Derivative Works thereof, that is intentionally 69 | submitted to Licensor for inclusion in the Work by the copyright owner 70 | or by an individual or Legal Entity authorized to submit on behalf of 71 | the copyright owner. For the purposes of this definition, "submitted" 72 | means any form of electronic, verbal, or written communication sent 73 | to the Licensor or its representatives, including but not limited to 74 | communication on electronic mailing lists, source code control systems, 75 | and issue tracking systems that are managed by, or on behalf of, the 76 | Licensor for the purpose of discussing and improving the Work, but 77 | excluding communication that is conspicuously marked or otherwise 78 | designated in writing by the copyright owner as "Not a Contribution." 79 | 80 | "Contributor" shall mean Licensor and any individual or Legal Entity 81 | on behalf of whom a Contribution has been received by Licensor and 82 | subsequently incorporated within the Work. 83 | 84 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 85 | this License, each Contributor hereby grants to You a perpetual, 86 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 87 | copyright license to reproduce, prepare Derivative Works of, 88 | publicly display, publicly perform, sublicense, and distribute the 89 | Work and such Derivative Works in Source or Object form. 90 | 91 | 3. Grant of Patent License. Subject to the terms and conditions of 92 | this License, each Contributor hereby grants to You a perpetual, 93 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 94 | (except as stated in this section) patent license to make, have made, 95 | use, offer to sell, sell, import, and otherwise transfer the Work, 96 | where such license applies only to those patent claims licensable 97 | by such Contributor that are necessarily infringed by their 98 | Contribution(s) alone or by combination of their Contribution(s) 99 | with the Work to which such Contribution(s) was submitted. If You 100 | institute patent litigation against any entity (including a 101 | cross-claim or counterclaim in a lawsuit) alleging that the Work 102 | or a Contribution incorporated within the Work constitutes direct 103 | or contributory patent infringement, then any patent licenses 104 | granted to You under this License for that Work shall terminate 105 | as of the date such litigation is filed. 106 | 107 | 4. Redistribution. 
You may reproduce and distribute copies of the 108 | Work or Derivative Works thereof in any medium, with or without 109 | modifications, and in Source or Object form, provided that You 110 | meet the following conditions: 111 | 112 | (a) You must give any other recipients of the Work or 113 | Derivative Works a copy of this License; and 114 | 115 | (b) You must cause any modified files to carry prominent notices 116 | stating that You changed the files; and 117 | 118 | (c) You must retain, in the Source form of any Derivative Works 119 | that You distribute, all copyright, patent, trademark, and 120 | attribution notices from the Source form of the Work, 121 | excluding those notices that do not pertain to any part of 122 | the Derivative Works; and 123 | 124 | (d) If the Work includes a "NOTICE" text file as part of its 125 | distribution, then any Derivative Works that You distribute must 126 | include a readable copy of the attribution notices contained 127 | within such NOTICE file, excluding those notices that do not 128 | pertain to any part of the Derivative Works, in at least one 129 | of the following places: within a NOTICE text file distributed 130 | as part of the Derivative Works; within the Source form or 131 | documentation, if provided along with the Derivative Works; or, 132 | within a display generated by the Derivative Works, if and 133 | wherever such third-party notices normally appear. The contents 134 | of the NOTICE file are for informational purposes only and 135 | do not modify the License. You may add Your own attribution 136 | notices within Derivative Works that You distribute, alongside 137 | or as an addendum to the NOTICE text from the Work, provided 138 | that such additional attribution notices cannot be construed 139 | as modifying the License. 
140 | 141 | You may add Your own copyright statement to Your modifications and 142 | may provide additional or different license terms and conditions 143 | for use, reproduction, or distribution of Your modifications, or 144 | for any such Derivative Works as a whole, provided Your use, 145 | reproduction, and distribution of the Work otherwise complies with 146 | the conditions stated in this License. 147 | 148 | 5. Submission of Contributions. Unless You explicitly state otherwise, 149 | any Contribution intentionally submitted for inclusion in the Work 150 | by You to the Licensor shall be under the terms and conditions of 151 | this License, without any additional terms or conditions. 152 | Notwithstanding the above, nothing herein shall supersede or modify 153 | the terms of any separate license agreement you may have executed 154 | with Licensor regarding such Contributions. 155 | 156 | 6. Trademarks. This License does not grant permission to use the trade 157 | names, trademarks, service marks, or product names of the Licensor, 158 | except as required for reasonable and customary use in describing the 159 | origin of the Work and reproducing the content of the NOTICE file. 160 | 161 | 7. Disclaimer of Warranty. Unless required by applicable law or 162 | agreed to in writing, Licensor provides the Work (and each 163 | Contributor provides its Contributions) on an "AS IS" BASIS, 164 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 165 | implied, including, without limitation, any warranties or conditions 166 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 167 | PARTICULAR PURPOSE. You are solely responsible for determining the 168 | appropriateness of using or redistributing the Work and assume any 169 | risks associated with Your exercise of permissions under this License. 170 | 171 | 8. Limitation of Liability. 
In no event and under no legal theory, 172 | whether in tort (including negligence), contract, or otherwise, 173 | unless required by applicable law (such as deliberate and grossly 174 | negligent acts) or agreed to in writing, shall any Contributor be 175 | liable to You for damages, including any direct, indirect, special, 176 | incidental, or consequential damages of any character arising as a 177 | result of this License or out of the use or inability to use the 178 | Work (including but not limited to damages for loss of goodwill, 179 | work stoppage, computer failure or malfunction, or any and all 180 | other commercial damages or losses), even if such Contributor 181 | has been advised of the possibility of such damages. 182 | 183 | 9. Accepting Warranty or Additional Liability. While redistributing 184 | the Work or Derivative Works thereof, You may choose to offer, 185 | and charge a fee for, acceptance of support, warranty, indemnity, 186 | or other liability obligations and/or rights consistent with this 187 | License. However, in accepting such obligations, You may act only 188 | on Your own behalf and on Your sole responsibility, not on behalf 189 | of any other Contributor, and only if You agree to indemnify, 190 | defend, and hold each Contributor harmless for any liability 191 | incurred by, or claims asserted against, such Contributor by reason 192 | of your accepting any such warranty or additional liability. 
193 | 194 | END OF TERMS AND CONDITIONS 195 | ``` 196 | 197 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include htrc/mock/volumes/example.zip 2 | include htrc/.htrc.default 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTRC Workset Toolkit 2 | [![Supported Python Versions](https://img.shields.io/pypi/pyversions/htrc.svg)](https://pypi.python.org/pypi/htrc) 3 | [![PyPI Version](https://img.shields.io/pypi/v/htrc.svg)](https://pypi.python.org/pypi/htrc) 4 | [![Build Status](https://travis-ci.org/htrc/HTRC-WorksetToolkit.svg?branch=master)](https://travis-ci.org/htrc/HTRC-WorksetToolkit) 5 | [![Coverage Status](https://coveralls.io/repos/github/htrc/HTRC-WorksetToolkit/badge.svg?branch=master)](https://coveralls.io/github/htrc/HTRC-WorksetToolkit?branch=master) 6 | 7 | HTRC Workset Toolkit provides tools for interacting with and analyzing volumes in the HathiTrust Digital Library: 8 | 9 | - Volume Download (`htrc download`) 10 | - Metadata Download (`htrc metadata`) 11 | - Pre-built Analysis Workflows (`htrc run`) 12 | - Export of volume lists (`htrc export`) 13 | 14 | Each tool operates on a *workset*, which is a collection of volumes, pages, or catalog records. 15 | 16 | The tools also assist with the HTRC Data Capsule, enabling you to download volumes to the secure mode of the capsule for analysis. 17 | 18 | For usage instructions and documentation see [https://htrc.github.io/HTRC-WorksetToolkit/cli.html]. 19 | 20 | For developers, the Workset Toolkit provides ways to test algorithms that will be run in the secure mode of the Data Capsule. It also provides methods for accessing the bibliographic records for HathiTrust volumes and ways to resolve catalog records for multivolume collections. 
It has the following components: 21 | 22 | - An access layer for the Bibliographic API (`htrc.metadata`) 23 | - An access layer for the Data API (`htrc.volumes`) 24 | - Pre-built analysis workflows (`htrc.tools`) 25 | - Provenance tracking for verification of non-consumptive exports (`htrc.prov`) 26 | - Mock testing interface for user-machine or maintenance-mode testing of 27 | secure-mode commands (`htrc.mock`) 28 | - Utilities for record and volume resolution (`htrc.util`) 29 | 30 | For documentation of the development libraries see [https://htrc.github.io/HTRC-WorksetToolkit/sdk.html]. 31 | 32 | ## Data Capsule usage 33 | The HTRC Data Capsule allows for analysis of HathiTrust volumes. It is the only way to perform analysis on the raw OCR text of in-copyright works. 34 | 35 | New users can register and configure a data capsule by following the [HTRC Data Capsule Tutorial](https://wiki.htrc.illinois.edu/display/COM/HTRC+Data+Capsule+Tutorial). 36 | 37 | The HTRC Workset Toolkit will be pre-installed on Data Capsule images in the near future. Current data capsules will need to follow the [installation instructions](#installation-instructions). 38 | 39 | 40 | ## Installation instructions 41 | 42 | 1. Download and install [Anaconda Python](https://www.continuum.io/downloads). The HTRC Workset Toolkit is compatible with both Python 2.7 and 3.6, but we recommend using the 3.6 version for future compatibility. 43 | 44 | 2. After installing Anaconda, open a new terminal and type `pip install htrc` to install the SDK. 45 | 46 | ## Testing 47 | 48 | 1. `git clone https://github.com/htrc/HTRC-WorksetToolkit.git` 49 | 2. `cd HTRC-WorksetToolkit` 50 | 3. `python setup.py develop` 51 | 4. The `htrc` command will now refer to the code in this local repository. 52 | 5. Run the unit tests with the command: `python setup.py test` 53 | 6. 
To revert to the PyPI version: 54 | ``` 55 | pip uninstall htrc 56 | pip install htrc 57 | ``` 58 | 59 | ## Updating PyPI 60 | In order to update PyPI, you will need owner permissions, which are currently held by Samitha Liyanage and Jaimie Murdock. 61 | 62 | 1. Create a `.pypirc` containing your username and password: 63 | ``` 64 | [distutils] 65 | index-servers = 66 | pypi 67 | 68 | [pypi] 69 | repository=https://upload.pypi.org/legacy/ 70 | username:USERNAME 71 | password:PASSWORD 72 | ``` 73 | 2. Run `python setup.py sdist upload` to upload the tarball. 74 | 3. Run `python setup.py bdist_egg upload` to upload the egg file. 75 | (If `upload` command doesn't work use `twine upload dist/*`) 76 | 77 | ## Documentation 78 | For usage instructions and documentation please see: [https://htrc.github.io/HTRC-WorksetToolkit/] 79 | 80 | For a more detailed guide please see: [https://wiki.htrc.illinois.edu/x/NQBTAw.] 81 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = HTRCPythonSDK 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | latex: 18 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 19 | cd build/latex && pdflatex $(SPHINXPROJ) 20 | 21 | 22 | # Catch-all target: route all unknown targets to Sphinx using the new 23 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
24 | %: Makefile 25 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=HTRCPythonSDK 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/cli.rst: -------------------------------------------------------------------------------- 1 | HTRC Workset Toolkit 2 | ====================== 3 | The HTRC Workset Toolkit provides a command line interface for interacting with 4 | and analyzing volumes in the HathiTrust Digital Library: 5 | 6 | - Volume Download (``htrc download``) 7 | - Metadata Download (``htrc metadata``) 8 | - Pre-built Analysis Workflows (``htrc run``) 9 | - Export of volume lists (``htrc export``) 10 | 11 | Workset Path 12 | -------------- 13 | 14 | Each of these commands takes a *workset path*. 
Valid types of workset paths 15 | and examples of each are: 16 | 17 | ================================== ============================================================================== 18 | Identifier Type Example 19 | ================================== ============================================================================== 20 | HathiTrust ID mdp.39015078560078 21 | HathiTrust Catalog ID 001423370 22 | HathiTrust URL https://babel.hathitrust.org/cgi/pt?id=mdp.39015078560078;view=1up;seq=13 23 | Handle.org Volume URL https://hdl.handle.net/2027/mdp.39015078560078 24 | HathiTrust Catalog URL https://catalog.hathitrust.org/Record/001423370 25 | HathiTrust Collection Builder URL https://babel.hathitrust.org/shcgi/mb?a=listis;c=696632727 26 | Local volumes file ``/home/dcuser/Downloads/collections.txt`` 27 | ================================== ============================================================================== 28 | 29 | 30 | 31 | Volume Download 32 | -------------------- 33 | The ``htrc download`` command retrieves volumes from the HTRC Data API 34 | to the secure mode of the :ref:`HTRC Data Capsule Service`. 35 | 36 | .. note:: 37 | 38 | This command will return an error when run on a non-HTRC computer or on a 39 | Capsule running in maintenance mode. 40 | 41 | .. _HTRC Data API: https://wiki.htrc.illinois.edu/display/COM/HTRC+Data+API+Users+Guide 42 | 43 | Arguments 44 | ''''''''''' 45 | .. argparse:: 46 | :module: htrc.__main__ 47 | :func: download_parser 48 | :prog: htrc download 49 | 50 | 51 | Bibliographic API Access 52 | -------------------------- 53 | ``htrc metadata`` retrieves metadata from the `HathiTrust Bibliographic API`_. 54 | This command has no limitations on which computer or network executes it. 55 | 56 | .. _HathiTrust Bibliographic API: https://www.hathitrust.org/bib_api 57 | 58 | 59 | Arguments 60 | ''''''''''' 61 | .. 
argparse:: 62 | :module: htrc.__main__ 63 | :func: add_workset_path 64 | :prog: htrc metadata 65 | 66 | 67 | Analysis Workflows 68 | -------------------- 69 | The HTRC Workset Toolkit also provides the command line tool ``htrc run``. Like `volume 70 | download`_, the 71 | 72 | Topic Modeling 73 | '''''''''''''''' 74 | There are two implementations of LDA topic modeling supported by the 75 | 76 | 77 | Arguments 78 | ''''''''''' 79 | .. argparse:: 80 | :module: htrc.tools.mallet 81 | :func: populate_parser 82 | :prog: htrc run mallet 83 | 84 | Use Cases and Examples 85 | -------------------------------------------- 86 | 87 | Following are the use cases and examples of ``htrc`` commands inside the HTRC Data Capsule. 88 | 89 | +---------------------------------+---------------------------+ 90 | | command: ``htrc download`` | capsule mode: **secure** | 91 | +---------------------------------+---------------------------+ 92 | 93 | * Download volumes of volume id list to default path : 94 | (/media/secure_volume/workset) 95 | 96 | ``htrc download /home/dcuser/HTRC/htrc-id`` 97 | 98 | * Download volumes of hathi collection url to default path : 99 | (/media/secure_volume/workset) 100 | 101 | ``htrc download “https://babel.hathitrust.org/cgi/mb?a=listis&c=1337751722”`` 102 | 103 | * Download volumes to specific location : 104 | 105 | ``htrc download /home/dcuser/HTRC/htrc-id -o /media/secure_volume/my-workset`` 106 | 107 | * Download volumes to specific location with concatenation option - (This will concatenate all the pages of the volume into one txt file.) 
: 108 | 109 | ``htrc download /home/dcuser/HTRC/htrc-id -o /media/secure_volume/my-workset -c`` 110 | 111 | * Download specific pages from a single volume : 112 | 113 | ``htrc download -pg coo.31924089593846[5,10,15,20,25,30]`` 114 | 115 | * Download volumes and then extract headers/footers from the volumes : 116 | 117 | ``htrc download -hf /home/dcuser/HTRC/htrc-id`` 118 | 119 | * Download volumes, extract headers/footers from the volume pages then concatenate the pages - (This will concatenate all the pages of the volume into one txt file.) : 120 | 121 | ``htrc download -hfc /home/dcuser/HTRC/htrc-id`` 122 | 123 | * Download volumes, extract headers/footers from the volumes, skip downloading the .csv files containing removed headers and footers : 124 | 125 | ``htrc download -hf -s /home/dcuser/HTRC/htrc-id`` 126 | 127 | * Download volumes, extract headers/footers from volumes, change window of pages in extractor algorithm (The default is 6, lower numbers increase speed, but are less accurate) : 128 | 129 | ``htrc download -hf -w 3 /home/dcuser/HTRC/htrc-id`` 130 | 131 | * Download volumes, extract headers/footers from volumes, change minimum similarity rate for lines on pages to be considered a header or footer (Default is .7 or 70%, so if a line is 70% the same as other lines on other pages within the window of pages it is labeled a header or footer and removed) : 132 | 133 | ``htrc download -hf -msr .9 /home/dcuser/HTRC/htrc-id`` 134 | 135 | * Download volumes, extract headers/footers from volumes, change the max number of concurrent tasks (note that the only options are 1 or 2): 136 | 137 | ``htrc download -hf --parallelism 2 /home/dcuser/HTRC/htrc-id`` 138 | 139 | 140 | | 141 | +---------------------------------+-----------------------------------------------+ 142 | | command: ``htrc metadata`` | capsule mode: **secure** and **maintenance** | 143 | +---------------------------------+-----------------------------------------------+ 144 | 145 | * Download the 
metadata of volumes by giving hathi collection url : 146 | 147 | ``htrc metadata "https://babel.hathitrust.org/cgi/mb?a=listis&c=1853042514"`` 148 | 149 | * Download the metadata of volumes by giving volume id list : 150 | 151 | ``htrc metadata /home/dcuser/HTRC/htrc-id`` 152 | 153 | * Download the metadata associated with volume id : 154 | volume 1 of `The Works of Jonathan Swift`_ 155 | 156 | ``htrc metadata mdp.39015078560078`` 157 | 158 | Note that this would only retrieve the first volume. If you want to download metadata for all 8 volumes, the catalog identifier would be used: 159 | 160 | ``htrc metadata 001423370`` 161 | 162 | Each command can be used with the URL as well (*note the quote marks around each URL*): 163 | 164 | ``htrc metadata "https://babel.hathitrust.org/cgi/pt?id=mdp.39015078560078;view=1up;seq=13"`` 165 | 166 | ``htrc metadata "https://catalog.hathitrust.org/Record/001423370"`` 167 | 168 | This URL support makes it easy to browse `hathitrust.org`_ and copy links for computational analysis using the SDK. 169 | 170 | .. _The Works of Jonathan Swift: https://hdl.handle.net/2027/mdp.39015078560078 171 | .. 
_hathitrust.org: https://www.hathitrust.org/ 172 | 173 | 174 | 175 | | 176 | +---------------------------------+-----------------------------------------------+ 177 | | command: ``htrc metadata`` | capsule mode: **secure** | 178 | +---------------------------------+-----------------------------------------------+ 179 | 180 | * Download the metadata of volumes by giving already downloaded volumes path : 181 | 182 | ``htrc metadata /media/secure_volume/workset`` 183 | 184 | | 185 | +---------------------------------+-----------------------------------------------+ 186 | | command: ``htrc metadata`` | capsule mode: **maintenance** | 187 | +---------------------------------+-----------------------------------------------+ 188 | 189 | * Download the metadata of volumes by giving already downloaded volumes path - (Sample volumes are available in capsules created with ubuntu-16-04-with-sample-volumes image. Those sample volumes are available as zip files. Please unzip them before use because the metadata function gets volume ids from volume directory names.) 
: 190 | 191 | ``mkdir /home/dcuser/unzipped_volumes`` 192 | 193 | ``cp -r /home/dcuser/HTRC/data/sample_volumes/fiction/ /home/dcuser/unzipped_volumes`` 194 | 195 | ``unzip /home/dcuser/unzipped_volumes/'*.zip' && rm /home/dcuser/unzipped_volumes/*.zip`` 196 | 197 | ``htrc metadata /home/dcuser/unzipped_volumes`` 198 | 199 | | 200 | +---------------------------------+-----------------------------------------------+ 201 | | command: ``htrc export`` | capsule mode: **secure** and **maintenance** | 202 | +---------------------------------+-----------------------------------------------+ 203 | 204 | * Export volume ids from downloaded hathi collection and create workset with only volume ids : 205 | 206 | Go to the following link in the browser 207 | 208 | https://babel.hathitrust.org/cgi/mb?a=listis&c=1853042514 209 | 210 | Download metadata in tab separated format (Download Item Metadata: Tab-Delimited Text (TSV)), then - 211 | 212 | 213 | ``htrc export mb-9.txt > volumes.txt`` 214 | 215 | * Export volume ids from hathi collection url and create workset with only volume ids (works in both secure and maintenance modes) : 216 | 217 | ``htrc export "https://babel.hathitrust.org/cgi/mb?a=listis&c=1853042514" > volumes.txt`` 218 | 219 | 220 | | 221 | +---------------------------------+-----------------------------------------------+ 222 | | command: ``htrc run mallet`` | capsule mode: **secure** | 223 | +---------------------------------+-----------------------------------------------+ 224 | 225 | * Run mallet on already downloaded volumes : 226 | 227 | ``htrc run mallet /media/secure_volume/workset -k 20`` 228 | 229 | * Run mallet on volume id list : 230 | 231 | ``htrc run mallet /home/dcuser/HTRC/htrc-id -k 20`` 232 | 233 | * Run mallet on hathi collection : 234 | 235 | ``htrc run mallet "https://babel.hathitrust.org/cgi/mb?a=listis&c=1853042514" -k 20`` 236 | 237 | | 238 | +-----------------------------------+-----------------------------------------------+ 239 | | command: 
``htrc run mallet`` | capsule mode: **maintenance** | 240 | +-----------------------------------+-----------------------------------------------+ 241 | 242 | * Run mallet on already downloaded volume - (Sample volumes are available in capsules created with ubuntu-16-04-with-sample-volumes image. Those sample volumes are available as zip files. Please unzip them before use because the metadata function gets volume ids from volume directory names). 243 | 244 | ``htrc run mallet /home/dcuser/unzipped_volumes -k 20`` 245 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # HTRC Python SDK documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Jun 18 16:15:31 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 
34 | extensions = ['sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.autosectionlabel', 37 | 'sphinx.ext.todo', 38 | 'sphinx.ext.coverage', 39 | 'sphinx.ext.viewcode', 40 | 'sphinx.ext.githubpages', 41 | 'sphinxarg.ext', 42 | 'sphinxcontrib.fulltoc'] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | # source_suffix = ['.rst', '.md'] 51 | source_suffix = '.rst' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # General information about the project. 57 | project = 'HTRC Workset Toolkit' 58 | copyright = '2017, HathiTrust Research Center (Jaimie Murdock)' 59 | author = 'HathiTrust Research Center (Jaimie Murdock)' 60 | 61 | # The version info for the project you're documenting, acts as replacement for 62 | # |version| and |release|, also used in various other places throughout the 63 | # built documents. 64 | # 65 | # The short X.Y version. 66 | version = '1.0' 67 | # The full version, including alpha/beta/rc tags. 68 | release = '1.0.0b1' 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | # 73 | # This is also used if you do content translation via gettext catalogs. 74 | # Usually you set "language" from the command line for these cases. 75 | language = None 76 | 77 | # List of patterns, relative to source directory, that match files and 78 | # directories to ignore when looking for source files. 79 | # This patterns also effect to html_static_path and html_extra_path 80 | exclude_patterns = [] 81 | 82 | # The name of the Pygments (syntax highlighting) style to use. 83 | pygments_style = 'sphinx' 84 | 85 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
86 | todo_include_todos = True 87 | 88 | 89 | # -- Options for HTML output ---------------------------------------------- 90 | 91 | # The theme to use for HTML and HTML Help pages. See the documentation for 92 | # a list of builtin themes. 93 | # 94 | html_theme = 'alabaster' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # 100 | # html_theme_options = {} 101 | 102 | # Add any paths that contain custom static files (such as style sheets) here, 103 | # relative to this directory. They are copied after the builtin static files, 104 | # so a file named "default.css" will overwrite the builtin "default.css". 105 | html_static_path = ['_static'] 106 | 107 | 108 | # -- Options for HTMLHelp output ------------------------------------------ 109 | 110 | # Output file base name for HTML help builder. 111 | htmlhelp_basename = 'HTRCWorksetToolkigdoc' 112 | 113 | 114 | # -- Options for LaTeX output --------------------------------------------- 115 | 116 | latex_elements = { 117 | # The paper size ('letterpaper' or 'a4paper'). 118 | # 119 | # 'papersize': 'letterpaper', 120 | 121 | # The font size ('10pt', '11pt' or '12pt'). 122 | # 123 | # 'pointsize': '10pt', 124 | 125 | # Additional stuff for the LaTeX preamble. 126 | # 127 | # 'preamble': '', 128 | 129 | # Latex figure (float) alignment 130 | # 131 | # 'figure_align': 'htbp', 132 | } 133 | 134 | # Grouping the document tree into LaTeX files. List of tuples 135 | # (source start file, target name, title, 136 | # author, documentclass [howto, manual, or own class]). 137 | latex_documents = [ 138 | (master_doc, 'HTRCWorksetToolkit.tex', 'HTRC Workset Toolkit Documentation', 139 | 'HathiTrust Research Center (Jaimie Murdock)', 'manual'), 140 | ] 141 | 142 | 143 | # -- Options for manual page output --------------------------------------- 144 | 145 | # One entry per manual page. 
List of tuples 146 | # (source start file, name, description, authors, manual section). 147 | man_pages = [ 148 | (master_doc, 'htrc', 'HTRC Workset Toolkit Documentation', 149 | [author], 1) 150 | ] 151 | 152 | 153 | # -- Options for Texinfo output ------------------------------------------- 154 | 155 | # Grouping the document tree into Texinfo files. List of tuples 156 | # (source start file, target name, title, author, 157 | # dir menu entry, description, category) 158 | texinfo_documents = [ 159 | (master_doc, 'HTRCWorksetToolkit', 'HTRC Workset Toolkit Documentation', 160 | author, 'HTRCWorksetToolkit', '''Tools for interacting with collections of 161 | HathiTrust volumes and records.''', 162 | 'Documentation'), 163 | ] 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. HTRC Workset Toolkit documentation master file, created by 2 | sphinx-quickstart on Sun Jun 18 16:15:31 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to the HTRC Workset Toolkit's documentation! 7 | ======================================================= 8 | The HTRC Workset Toolkit provides a command line interface for interacting with 9 | and analyzing volumes in the HathiTrust Digital Library: 10 | 11 | - Volume Download (``htrc download``) 12 | - Metadata Download (``htrc metadata``) 13 | - Pre-built Analysis Workflows (``htrc run``) 14 | - Export of volume lists (``htrc export``) 15 | 16 | Each tool operates on a *workset*, which is a collection of volumes, pages, 17 | or catalog records. 18 | 19 | A workset is referenced by a :ref:`workset path`, which is one of 7 types of 20 | identifiers. 
Almost any web page on http://hathitrust.org is a valid identifier, 21 | including the PageTurner view, Catalog record view, and Collection Builder 22 | collections. 23 | 24 | The tools also assist with the HTRC Data Capsule, enabling you to download volumes 25 | to the secure mode of the capsule for analysis. 26 | 27 | More details on each command can be found on the :ref:`HTRC Workset Toolkit` page. 28 | 29 | For developers, the Workset Toolkit provides ways to test algorithms that will 30 | be run in the secure mode of the Data Capsule. It also provides methods for 31 | accessing the bibliographic records for HathiTrust volumes and ways to resolve 32 | catalog records for multivolume collections. It has the following components: 33 | 34 | - An access layer for the Bibliographic API (``htrc.metadata``) 35 | - An access layer for the Data API (``htrc.volumes``) 36 | - Pre-built analysis workflows (``htrc.tools``) 37 | - Provenance tracking for verification of non-consumptive exports (``htrc.prov``) 38 | - Mock testing interface for user-machine or maintenance-mode testing of 39 | secure-mode commands (``htrc.mock``) 40 | - Utilities for record and volume resolution (``htrc.util``) 41 | 42 | More details on each module can be found on the :ref:`HTRC Workset Toolkit 43 | Development Library` page. 44 | 45 | All source code for the HTRC Workset Toolkit is available on `GitHub`_ under an 46 | `Apache 2.0 License`_. 47 | 48 | .. _GitHub: https://github.com/htrc/HTRC-PythonSDK/ 49 | .. _Apache 2.0 License: https://github.com/htrc/HTRC-PythonSDK/blob/master/LICENSE.md 50 | 51 | 52 | Data Capsule usage 53 | ---------------------------- 54 | The HTRC Data Capsule allows for analysis of HathiTrust volumes. It is the only 55 | way to perform analysis on the raw OCR text of in-copyright works. 56 | 57 | New users can register and configure a data capsule by following the `HTRC Data 58 | Capsule Tutorial`_. 
59 | 60 | The HTRC Workset Toolkit will be pre-installed on Data Capsule images in the 61 | near future. Current data capsules will need to follow the :ref:`installation 62 | instructions`. 63 | 64 | .. _HTRC Data Capsule Tutorial: https://wiki.htrc.illinois.edu/display/COM/HTRC+Data+Capsule+Tutorial 65 | 66 | 67 | Installation instructions 68 | --------------------------- 69 | 70 | 1. Download and install `Anaconda Python`_. The HTRC Workset Toolkit is 71 | compatible with both Python 2.7 and 3.6, but we recommend using the 3.6 version 72 | for future compatibility. 73 | 74 | 2. After installing Anaconda, open a new terminal and type ``pip install htrc`` 75 | to install the SDK. 76 | 77 | .. _Anaconda Python: https://www.continuum.io/downloads 78 | 79 | 80 | Table of Contents 81 | ================================ 82 | .. toctree:: 83 | :maxdepth: 2 84 | 85 | cli 86 | sdk 87 | tips 88 | 89 | 90 | Indices and tables 91 | ==================== 92 | 93 | * :ref:`genindex` 94 | * :ref:`modindex` 95 | * :ref:`search` 96 | -------------------------------------------------------------------------------- /docs/source/sdk.rst: -------------------------------------------------------------------------------- 1 | HTRC Workset Toolkit Development Library 2 | ========================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | HTRC Data Capsule Service 9 | ------------------------------ 10 | The *HTRC Data Capsule Service* provisions virtual machines (VMs) to researchers 11 | within the HTRC secure environment. The VM and software environment (including 12 | the SDK) together form a Capsule. Each researcher has exclusive use of the 13 | Capsule for a period of weeks or months during which they can configure their 14 | own environment for performing research on HathiTrust Digital Library texts, 15 | including both in-copyright and public domain volumes. 16 | 17 | Each Capsule has both a maintenance mode and a secure mode. 
In secure 18 | mode, network access is restricted to the HTRC Data API and some HTDL 19 | resources, allowing text and image data to be downloaded to the Capsule. 20 | 21 | Any changes made on the non-secure volumes are reverted when leaving secure 22 | mode, so persistent code changes must occur in maintenance mode. The SDK 23 | addresses these connectivity issues with the ``htrc.mock`` library. 24 | 25 | 26 | 27 | Mock Testing 28 | '''''''''''''' 29 | `Mock testing`_ uses simulated objects or functions to mimic the behavior 30 | of real code in controlled ways. 31 | 32 | The HTRC Workset Toolkit implements a mock of the Data API access layer in 33 | ``htrc.mock.volumes``. The Data API server is only accessible via a Capsule 34 | in secure mode. By implementing a function with the same call signature 35 | that returns the same data types, workflows that rely on the Data API can be 36 | tested either in Capsule maintenance mode or on a user's own computer. 37 | 38 | An easy way to use this pattern is shown below. 39 | 40 | Example 41 | ''''''''' 42 | 43 | :: 44 | 45 | if __debug__: 46 | # This code will execute when running `python script.py` 47 | import htrc.mock.volumes as volumes 48 | else: 49 | # This code will execute when running `python -O script.py` 50 | # The -O argument turns on optimizations, setting __debug__ = False. 51 | import htrc.volumes as volumes 52 | 53 | # The following is just to make a running script 54 | volume_ids = ['htrc.testid'] # any list will do 55 | output_dir = 'htrc_data' # any path will do 56 | 57 | # download volumes 58 | volumes.download(volume_ids, output_dir) 59 | 60 | This script leverages use of the ``python -O`` switch, which controls the 61 | ``__debug__`` global variable: 62 | 63 | - When run in the development environment, which does not have secure 64 | access to the Data API, the program is run with ``python script.py``, 65 | setting ``__debug__ = True``. 
This means that ``volumes.download(volume_ids, 66 | output_dir)`` utilizes the function ``htrc.mock.volumes.download(volume_ids, 67 | output_dir)``. 68 | - When run in secure mode of the data capsule, the program is executed with 69 | ``python -O script.py``, setting ``__debug__ = False``. The statement 70 | ``volumes.download(volume_ids, output_dir)`` utilizes the function 71 | ``htrc.volumes.download(volume_ids, output_dir)``. 72 | 73 | 74 | .. _Mock testing: https://en.wikipedia.org/wiki/Mock_object 75 | 76 | 77 | Modules 78 | --------- 79 | 80 | `htrc.metadata` 81 | ----------------- 82 | .. automodule:: htrc.metadata 83 | :members: 84 | 85 | `htrc.mock` 86 | ----------------- 87 | .. automodule:: htrc.mock 88 | :members: 89 | 90 | `htrc.mock.volumes` 91 | ''''''''''''''''''''' 92 | .. automodule:: htrc.mock.volumes 93 | :members: 94 | 95 | `htrc.volumes` 96 | ---------------- 97 | .. automodule:: htrc.volumes 98 | :members: 99 | 100 | `htrc.util` 101 | ---------------- 102 | .. automodule:: htrc.util 103 | :members: 104 | -------------------------------------------------------------------------------- /docs/source/tips.rst: -------------------------------------------------------------------------------- 1 | HTRC Python SDK Tips 2 | ====================== 3 | 4 | This document contains a number of tips for using the CLI and SDK in conjunction 5 | with other tools. 
6 | 7 | Pretty-print of JSON data using ``jq`` 8 | -------------------------------------- 9 | The command line tool ``jq`` is very powerful when combined with the ``htrc 10 | metadata`` command, as it can be used to quickly query documents:: 11 | 12 | htrc metadata mdp.39015078560078 | jq 13 | 14 | -------------------------------------------------------------------------------- /htrc/.htrc.default: -------------------------------------------------------------------------------- 1 | [main] 2 | username = 3 | password = 4 | 5 | [data] 6 | host = dataapi-dc.htrc.indiana.edu 7 | port = 443 8 | url = / 9 | cert = 10 | key = 11 | pd_only = 12 | 13 | [oauth] 14 | host = silvermaple.pti.indiana.edu 15 | port = 9443 16 | url = /oauth2/token 17 | 18 | [jwt] 19 | token = 20 | -------------------------------------------------------------------------------- /htrc/__init__.py: -------------------------------------------------------------------------------- 1 | import htrc.metadata 2 | import htrc.volumes 3 | -------------------------------------------------------------------------------- /htrc/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Master script for HTRC Workset Toolkit. 
4 | """ 5 | from __future__ import absolute_import, division, print_function 6 | from future import standard_library 7 | standard_library.install_aliases() 8 | 9 | import os 10 | import os.path 11 | import shutil 12 | import sys 13 | from tempfile import NamedTemporaryFile 14 | 15 | from htrc.metadata import get_metadata, get_volume_metadata 16 | import htrc.volumes 17 | import htrc.workset 18 | import htrc.tools.mallet 19 | 20 | from argparse import ArgumentParser 21 | # import htrc.tools.topicexplorer 22 | from htrc.lib.cli import bool_prompt 23 | from htrc.util.resolve import * 24 | 25 | 26 | def download_parser(parser=None): 27 | if parser is None: 28 | parser = ArgumentParser() 29 | #parser.add_argument("-u", "--username", help="HTRC username") 30 | #parser.add_argument("-p", "--password", help="HTRC password") 31 | parser.add_argument("file", nargs='?', default=sys.stdin, 32 | help="Workset path[s]") 33 | parser.add_argument("-f", "--force", action='store_true', 34 | help="Remove folder if exists") 35 | parser.add_argument("-o", "--output", help="Output directory", 36 | default='/media/secure_volume/workset/') 37 | parser.add_argument("-hf", "--remove-headers-footers", action='store_true', 38 | help="Remove headers and footers from individual pages and save in a separate csv file for inspection") 39 | parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true', 40 | help="Remove headers and footers from individual pages and save in a separate csv file for inspection then concatenate pages") 41 | parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6, 42 | help="How many pages ahead does the header/footer extractor algorithm look to find potential " 43 | "matching headers/footers (higher value gives potentially more accurate results on lower " 44 | "quality OCR volumes at the expense of runtime)") 45 | parser.add_argument("-msr", "--min-similarity-ratio", required=False, type=float, metavar="N", 
default=0.7, 46 | help="The minimum string similarity ratio required for the Levenshtein distance fuzzy-matching " 47 | "algorithm to declare that two headers are considered 'the same' (the higher the value, up " 48 | "to a max of 1.0, the more strict the matching has to be; lower values allow for more " 49 | "fuzziness to account for OCR errors)") 50 | parser.add_argument("-s", "--skip-removed-hf", action='store_true', 51 | help="Skip creating a saved report of the removed headers and footers for each page for inspection") 52 | parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(), 53 | help="The max number of concurrent tasks to start when downloading or removing headers/footers") 54 | parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250, 55 | help="The max number of volumes to download at a time from DataAPI") 56 | parser.add_argument("-c", "--concat", action='store_true', 57 | help="Concatenate a volume's pages in to a single file") 58 | parser.add_argument("-m", "--mets", action='store_true', 59 | help="Add volume's METS file") 60 | parser.add_argument("-pg", "--pages",action='store_true', 61 | help="Download given page numbers of a volumes.") 62 | parser.add_argument("-t", "--token", help="JWT for volumes download.") 63 | parser.add_argument("-dh", "--datahost", help="Data API host.") 64 | parser.add_argument("-dp", "--dataport", help="Data API port.") 65 | parser.add_argument("-de", "--dataepr", help="Data API EPR.") 66 | parser.add_argument("-dc", "--datacert", help="Client certificate file for mutual TLS with Data API.") 67 | parser.add_argument("-dk", "--datakey", help="Client key file for mutual TLS with Data API.") 68 | return parser 69 | 70 | 71 | def add_workset_path(parser=None): 72 | if parser is None: 73 | parser = ArgumentParser() 74 | parser.add_argument("path", nargs='+', help="Workset path[s]") 75 | return parser 76 | 77 | 78 | def main(): 79 | parser = 
ArgumentParser() 80 | parser.add_argument('-d', '--debug', help="Print long debug messages", 81 | action='store_true') 82 | parsers = parser.add_subparsers(help="select a command") 83 | 84 | # Metadata Helpers 85 | parser_getmd = parsers.add_parser('metadata', 86 | help="Get metadata for a folder of HathiTrust volumes") 87 | add_workset_path(parser_getmd) 88 | parser_getmd.set_defaults(func='metadata') 89 | 90 | # Export Helpers 91 | parser_export = parsers.add_parser('export', 92 | help="Export the list of HathiTrust volumes") 93 | add_workset_path(parser_export) 94 | parser_export.set_defaults(func='export') 95 | 96 | # Download Helper 97 | parser_download = parsers.add_parser('download', 98 | help="Download HathiTrust volumes to disk [requires auth]") 99 | download_parser(parser_download) 100 | parser_download.set_defaults(func='download') 101 | 102 | 103 | # Run helper 104 | parser_run = parsers.add_parser('run', help="Run a built-in algorithm.") 105 | run_parsers = parser_run.add_subparsers(help="Select a command") 106 | 107 | parser_mallet = run_parsers.add_parser('mallet') 108 | htrc.tools.mallet.populate_parser(parser_mallet) 109 | parser_mallet.set_defaults(run='mallet') 110 | 111 | # parser_topicexplorer = run_parsers.add_parser('topicexplorer') 112 | # htrc.tools.topicexplorer.populate_parser(parser_topicexplorer) 113 | # parser_topicexplorer.set_defaults(run='topicexplorer') 114 | 115 | parser_run.set_defaults(func='run') 116 | 117 | args = parser.parse_args() 118 | if 'func' not in args: 119 | parser.print_help() 120 | sys.exit(1) 121 | 122 | if args.func in ['metadata', 'export']: 123 | volumes = [] 124 | if not args.path: 125 | for line in sys.stdin: 126 | volumes.append(line) 127 | else: 128 | for path in args.path: 129 | try: 130 | volumes.extend(htrc.workset.path_to_volumes(path)) 131 | except ValueError: 132 | volumes.append(path) 133 | if args.func == 'export': 134 | for volume in volumes: 135 | print(volume) 136 | if args.func == 'metadata': 
137 | metadata = get_metadata(volumes) 138 | print(json.dumps(metadata)) 139 | elif args.func == 'run': 140 | if 'run' not in args: 141 | parser_run.print_help() 142 | sys.exit(1) 143 | if args.run == 'mallet': 144 | htrc.tools.mallet.main(args.path, args.k, args.iter) 145 | # if args.run == 'topicexplorer': 146 | # htrc.tools.topicexplorer.main(args.path, args.k, args.iter) 147 | elif args.func == 'download': 148 | if os.path.exists(args.output): 149 | if args.force or bool_prompt('Folder {} exists. Delete?'.format(args.output), default=False): 150 | shutil.rmtree(args.output) 151 | os.makedirs(args.output) 152 | else: 153 | print("Please choose another output folder and try again.") 154 | sys.exit(1) 155 | 156 | if args.concat and args.remove_headers_footers: 157 | print("Cannot set both concat and remove-headers-footers") 158 | sys.exit(1) 159 | if args.concat and args.remove_headers_footers_and_concat: 160 | print("Cannot set both concat and remove-headers-footers-and-concat") 161 | sys.exit(1) 162 | if args.remove_headers_footers and args.remove_headers_footers_and_concat: 163 | print("Cannot set both remove_headers_footers and remove_headers_footers_and_concat") 164 | sys.exit(1) 165 | if args.mets and args.remove_headers_footers_and_concat: 166 | print("Cannot set both mets and remove_headers_footers_and_concat") 167 | sys.exit(1) 168 | if args.pages: 169 | if args.mets and args.concat: 170 | print("Cannot set both concat and mets with pages") 171 | sys.exit(1) 172 | if args.mets and args.remove_headers_footers_and_concat: 173 | print("Cannot set both mets and remove_headers_footers_and_concat with pages") 174 | sys.exit(1) 175 | 176 | try: 177 | resolve_and_download(args) 178 | except ValueError: 179 | print("Invalid identifier:", args.file) 180 | sys.exit(1) 181 | 182 | 183 | def resolve_and_download(args): 184 | if args.file == sys.stdin: 185 | # For use with UNIX pipes 186 | download_with_tempfile(args, sys.stdin) 187 | return 188 | 189 | elif 
os.path.exists(args.file): 190 | # For use with downloaded workset files - either in JSON or 191 | download(args) 192 | return 193 | 194 | elif (args.file.endswith('json') 195 | or args.file.endswith('jsonld') 196 | or args.file.startswith('http://') 197 | or args.file.startswith('https://')): 198 | # For use with HTRC Worksets and HT Collection Builder 199 | try: 200 | volumes = htrc.workset.load(args.file) 201 | download_with_tempfile(args, volumes) 202 | return 203 | except ValueError: 204 | # Invalid workset, continue to last block 205 | pass 206 | 207 | # Check for valid volume_id 208 | try: 209 | if parse_volume_id(args.file): 210 | volumes = [parse_volume_id(args.file)] 211 | download_with_tempfile(args, volumes) 212 | return 213 | else: 214 | raise ValueError("No Volume ID found") 215 | except ValueError: 216 | pass 217 | 218 | # Check for valid record id 219 | if parse_record_id(args.file): 220 | record_id = parse_record_id(args.file) 221 | volumes = record_id_to_volume_ids(record_id) 222 | download_with_tempfile(args, volumes) 223 | return 224 | else: 225 | # invalid 226 | raise ValueError("Not a valid ID file or workset identifier: {}".format( 227 | args.file)) 228 | 229 | 230 | def download(args): 231 | try: 232 | htrc.volumes.download(args) 233 | except OSError as e: 234 | if not os.path.exists('/media/secure_volume/'): 235 | print('Secure volume not mounted. Could not download volumes') 236 | sys.exit(1) 237 | else: 238 | print("Could not download volumes. {} {}".format(e.strerror, e.filename)) 239 | sys.exit(1) 240 | except RuntimeError as e: 241 | if not args.debug: 242 | print("Could not download volumes. 
{}".format(str(e))) 243 | sys.exit(1) 244 | else: 245 | raise e 246 | 247 | 248 | def download_with_tempfile(args, volumes): 249 | f = NamedTemporaryFile() 250 | for volume in volumes: 251 | f.write((volume + '\n').encode('utf-8')) 252 | f.flush() 253 | args.file = f.name 254 | 255 | try: 256 | download(args) 257 | finally: 258 | print("Closing temporary file: " + f.name) 259 | f.close() 260 | 261 | 262 | if __name__ == '__main__': 263 | main() 264 | -------------------------------------------------------------------------------- /htrc/auth.py: -------------------------------------------------------------------------------- 1 | #from base64 import b64encode 2 | from getpass import getpass 3 | #import http.client 4 | #import ssl 5 | #import time 6 | import subprocess 7 | 8 | import requests 9 | import requests.auth 10 | #import configparser 11 | 12 | import htrc.config 13 | 14 | 15 | def get_jwt_token(): 16 | # Currently we just store one common jwt token locally at .htrc file for simplicity 17 | # Expect to add POST method to query unique jwt token with the combo of username and password 18 | #username, password = credential_prompt() 19 | 20 | #client_id, client_secret = htrc.config.get_credentials() 21 | 22 | #auth = requests.auth.HTTPBasicAuth(client_id, client_secret) 23 | #data = { "grant_type": "password", 24 | #"username": username, 25 | #"password": password, 26 | #"scope" : "openid"} 27 | 28 | url1 = htrc.config.get_idp_url() 29 | capsule_id = htrc.config._get_value("jwt", "capsule_id") 30 | result = subprocess.check_output("hostname -s -I | awk '{print $1}'", shell=True) 31 | result = result.decode('utf-8') 32 | result = result[:-1] 33 | capsule_ip = result.strip() 34 | url = url1 + "/" + capsule_id + "/" + capsule_ip 35 | r = requests.get(url) 36 | 37 | data = r.json() 38 | if 'error' not in data: 39 | #expiration = int(time.time()) + data['expires_in'] 40 | return data['token'] 41 | elif data['error'] == 'invalid_grant': 42 | print("Invalid username or 
password. Please try again.\n") 43 | return get_jwt_token() 44 | else: 45 | raise RuntimeError("JWT token retrieval failed: {}".format(data['error'])) 46 | 47 | 48 | def credential_prompt(): 49 | """ 50 | A prompt for entering HathiTrust Research Center credentials. 51 | """ 52 | print("Please enter your HathiTrust Research Center credentials.") 53 | username = input("HTRC Username: ") 54 | password = getpass("HTRC Password: ") 55 | 56 | if not username or not password: 57 | print("Invalid username or password. Please try again.\n") 58 | return credential_prompt() 59 | else: 60 | return (username, password) 61 | 62 | 63 | if __name__ == '__main__': 64 | token = get_jwt_token() 65 | htrc.config.save_jwt_token(token) 66 | -------------------------------------------------------------------------------- /htrc/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | `htrc.volumes` 4 | 5 | Contains the configuration parser object. 
# NOTE(review): the module docstring opens before this chunk; only its tail was
# visible here. The py2 compatibility shim (`from future import
# standard_library`) was removed: the module already uses py3-only syntax
# (function annotations), so the shim was dead weight and a third-party import.
from typing import Optional
from configparser import RawConfigParser as ConfigParser, NoSectionError
from codecs import open
import logging
import os.path
import shutil
import time

# Config lives at ~/.htrc; seed it from the packaged template on first use.
DEFAULT_PATH = os.path.join(os.path.expanduser('~'), '.htrc')
if not os.path.exists(DEFAULT_PATH):
    DEFAULT_FILE = os.path.join(os.path.dirname(__file__), '.htrc.default')
    logging.info("Copying default config file to home directory.")
    try:
        shutil.copyfile(DEFAULT_FILE, DEFAULT_PATH)
    except OSError as err:
        # Robustness: a broken install (missing template) should not make
        # `import htrc.config` fatal; reads will fail later with a clear error.
        logging.warning("Could not create %s: %s", DEFAULT_PATH, err)


class HtrcDataApiConfig:
    """Bundle of Data API connection settings.

    Any argument left as None is looked up in the config file through the
    module-level getter functions.
    """

    def __init__(self,
                 token: Optional[str] = None,
                 host: Optional[str] = None,
                 port: Optional[int] = None,
                 epr: Optional[str] = None,
                 cert: Optional[str] = None,
                 key: Optional[str] = None) -> None:
        super().__init__()

        self.token = token or get_jwt_token()
        self.host = host or get_dataapi_host()
        self.port = port or get_dataapi_port()
        self.epr = epr or get_dataapi_epr()
        self.cert = cert or get_dataapi_cert()
        self.key = key or get_dataapi_key()


def _get_value(section, key, path=None):
    """Read `key` from `section` of the config file at `path` (default ~/.htrc).

    Raises EnvironmentError when the section is missing.
    """
    if path is None:
        path = DEFAULT_PATH

    config = ConfigParser(allow_no_value=True)
    with open(path, encoding='utf8') as configfile:
        # Bug fix: `readfp` was deprecated since Python 3.2 and removed in
        # 3.12; `read_file` is the supported equivalent.
        config.read_file(configfile)
    try:
        return config.get(section, key)
    except NoSectionError:
        raise EnvironmentError("Config not set for {} {} in {}".format(
            section, key, path))


def get_dataapi_port(path=None):
    return int(_get_value('data', 'port', path))


def get_dataapi_host(path=None):
    return _get_value('data', 'host', path)


def get_dataapi_epr(path=None):
    return _get_value('data', 'url', path)


def get_dataapi_cert(path=None):
    return _get_value('data', 'cert', path)


def get_dataapi_key(path=None):
    return _get_value('data', 'key', path)


def get_dataapi_access(path=None):
    return _get_value('data', 'pd_only', path)


def get_idp_host_port(path=None):
    host = _get_value('idp', 'host', path)
    port = _get_value('idp', 'port', path)

    return (host, port)


def get_idp_path(path=None):
    # Bug fix: `path` was previously ignored (`_get_value('idp', 'url')`),
    # so a caller-supplied config file was never consulted here.
    return _get_value('idp', 'url', path)


def get_idp_url(path=None):
    host, port = get_idp_host_port(path)
    idp_path = get_idp_path(path)
    # Bug fix: config values are strings, so the old `port == 443` comparison
    # was always False and the port was always embedded in the URL.
    if int(port) == 443:
        # On HTTPS default port, omit the port from the URL.
        return "https://{}{}".format(host, idp_path)
    else:
        return "https://{}:{}{}".format(host, port, idp_path)


# Add jwt credential access methods
def get_jwt_token(path=None):
    """Obtain a JWT token via the auth flow.

    htrc.auth is imported lazily to avoid an import cycle with this module.
    """
    import htrc.auth
    token = htrc.auth.get_jwt_token()

    return token


def save_jwt_token(token, path=None):
    """Save `token` under [jwt] in the config file and return it."""
    # Default to ~/.htrc
    if path is None:
        path = DEFAULT_PATH

    # Open and modify existing config file, if it exists.
    config = ConfigParser(allow_no_value=True)
    if os.path.exists(path):
        config.read(path)
    if not config.has_section('jwt'):
        config.add_section('jwt')

    config.set('jwt', 'token', token)

    with open(path, 'w') as credential_file:
        config.write(credential_file)

    return token


def remove_jwt_token(path=None):
    """Blank out the JWT token stored in the config file."""
    # Default to ~/.htrc
    if path is None:
        path = DEFAULT_PATH

    # Open and modify existing config file, if it exists.
    config = ConfigParser(allow_no_value=True)
    if os.path.exists(path):
        config.read(path)
    if not config.has_section('jwt'):
        config.add_section('jwt')
    # Overwrite rather than delete so the section layout stays stable.
    config.set('jwt', 'token', " ")

    with open(path, 'w') as credential_file:
        config.write(credential_file)


def get_credentials(path=None):
    """
    Retrieves the client id and client secret from a config file for the
    Data API. Raises an EnvironmentError if not specified.
    See also: credential_prompt
    """
    client_id = _get_value('idp', 'client_id', path)
    client_secret = _get_value('idp', 'client_secret', path)

    if not client_id and not client_secret:
        logging.error("Config path: {}".format(path))
        raise EnvironmentError("No client_id and client_secret stored in config file.")

    return (client_id, client_secret)


def populate_parser(parser):
    return parser


if __name__ == '__main__':
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser = populate_parser(parser)
    parser.parse_args()


# --- htrc/hf_utils/__init__.py ---
import re
from typing import TypeVar, List, Iterator, Tuple, Callable

T = TypeVar('T')


def clean_text(s: str) -> str:
    """Normalize text: strip non-letters, collapse whitespace, lowercase."""
    # replace all characters which aren't letters with whitespaces
    # ([\W\d_] is the equivalent of \P{L}, which re does not support)
    s = re.sub(r'[\W\d_]+', " ", s, flags=re.UNICODE)
    # replace multiple sequential whitespaces with single whitespace
    s = re.sub(r'\s{2,}', " ", s, flags=re.UNICODE)
    # trim whitespaces at the beginning and end
    s = s.strip()
    # lowercase
    s = s.lower()

    return s
def levenshtein(s: str, t: str, insert_cost: int = 1, delete_cost: int = 1, replace_cost: int = 1) -> int:
    """Weighted Levenshtein distance (from the Wikipedia article; iterative
    with two matrix rows)."""
    # degenerate cases
    if s == t:
        return 0

    len0 = len(s)
    len1 = len(t)

    if not len0:
        return len1

    if not len1:
        return len0

    # rolling rows of the distance matrix; v0[i] starts as the cost of
    # skipping a prefix of s of length i
    v0 = list(range(len0 + 1))
    v1 = [0] * (len0 + 1)

    # transformation cost for each letter in t
    for j in range(len1):
        # initial cost of skipping prefix in t
        v1[0] = j + 1

        # transformation cost for each letter in s
        for i in range(len0):
            # matching current letters in both strings
            match = 0 if s[i] == t[j] else 1

            # computing cost for each transformation
            cost_insert = v0[i + 1] + insert_cost
            cost_delete = v1[i] + delete_cost
            cost_replace = v0[i] + match * replace_cost

            # keep minimum cost
            v1[i + 1] = min(cost_insert, cost_delete, cost_replace)

        # swap cost arrays
        v0, v1 = v1, v0

    # the distance is the cost for transforming all letters in both strings
    return v0[len0]


def pairwise_combine_within_distance(xs: List[T], n: int) -> List[Tuple[T, T]]:
    """Pair every element with each of the (up to) n-1 elements following it."""
    if not xs:
        return []

    result = []
    x, rest = xs[0], xs[1:]

    while rest:
        # Perf fix: extend in place; the original rebuilt `result` with list
        # concatenation on every pass (accidentally quadratic).
        result.extend((x, v) for v in rest[:n - 1])
        x, rest = rest[0], rest[1:]

    return result


def group_consecutive_when(xs: List[T], pred: Callable[[T, T], bool]) -> Iterator[List[T]]:
    """Split xs into runs of consecutive elements where pred(prev, next) holds,
    yielding each run as a list.

    The control flow below is intentionally kept identical to the original
    implementation (including its trailing-element handling).
    """
    result = []
    _prev, _next = None, None

    while len(xs) > 1:
        _prev, _next = xs[0], xs[1]
        result.append(_prev)
        if not pred(_prev, _next):
            # run ends between _prev and _next
            yield result
            result = []
        xs = xs[1:]

    if len(xs) == 1:
        _prev, _next = _next, xs[0]

    # flush the trailing element(s)
    if _prev is not None and _next is not None and pred(_prev, _next):
        result.extend([_prev, _next])
    elif _next is not None:
        result.append(_next)

    yield result


def flatten(xss: List[tuple]) -> Iterator[T]:
    """Lazily yield every item of every inner collection, left to right."""
    for xs in xss:
        for x in xs:
            yield x


# --- htrc/lib/cli.py ---
from builtins import input


def bool_prompt(prompt_str, default=None):
    """Yes/no wrapper around prompt(); returns True for 'y', False for 'n'."""
    if default is True:
        default = 'y'
    elif default is False:
        default = 'n'

    result = prompt(prompt_str, options=['y', 'n'], default=default)

    if result == 'y':
        return True
    elif result == 'n':
        return False


def prompt(prompt, options=None, default=None):
    """Prompt on stdin until the reply is one of `options` (if given);
    an empty reply selects `default`."""
    # Construct prompt
    prompt = "\n" + prompt

    if options:
        choices = options[:]
        if default and default in choices:
            # Highlight the default choice in uppercase.
            default_idx = choices.index(default)
            choices[default_idx] = choices[default_idx].upper()
        prompt += " [{0}]".format('/'.join(choices))
    elif default:
        # Bug fix: the py2-era `default.encode('utf-8')` rendered as b'...'
        # on Python 3; format the value directly instead.
        prompt += " [Default: {0}]".format(default)
    prompt += " "

    # Wait for valid response
    result = None
    while result is None or (options and result not in options):
        result = input(prompt)
        result = result.lower().strip()
        if default and result == '':
            result = default

    return result
# --- htrc/metadata/__init__.py ---
# NOTE(review): the dead py2 shim (`from future import standard_library`) was
# removed; everything below is plain Python 3.
from __future__ import print_function
from builtins import str

import codecs
import json
import logging
import os, os.path
import re
from time import sleep
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import quote_plus, urlencode

import requests

from htrc.util import split_items


def get_volume_metadata(id, marc=False):
    """
    Retrieve item metadata `from the HathiTrust Bibliographic API`_.

    Params:
    :param id: HTID for the volume to be retrieved
    :param marc: Retrieve MARC-XML within JSON return value.
    :raises ValueError: when no (unique) record is found for `id`.

    .. _from the HathiTrust Bibliographic API: https://www.hathitrust.org/bib_api
    """
    biblio_api = "https://catalog.hathitrust.org/api/volumes"

    if marc:
        biblio_api += '/full'
    else:
        biblio_api += '/brief'

    url = biblio_api + '/htid/{}.json'.format(id)

    try:
        reader = codecs.getreader('utf-8')
        data = json.load(reader(urlopen(url)))
        if len(data['records']) == 1:
            for item in data['items']:
                if item['htid'] == id:
                    # merge the record-level and item-level metadata
                    md = data['records'][item['fromRecord']]
                    md.update(item)
                    return md
        else:
            raise ValueError
    except (ValueError, IndexError, HTTPError):
        raise ValueError("No result found for " + id)


def safe_volume_metadata(id, marc=False, sleep_time=1):
    """
    Retrieve item metadata `from the HathiTrust Bibliographic API`_.

    Unlike :method volume_metadata:, this function returns an empty dictionary,
    rather than an error when metadata is missing.

    Params:
    :param id: HTID for the volume to be retrieved
    :param marc: Retrieve MARC-XML within JSON return value.
    :param sleep_time: throttle delay (seconds) after a successful request.

    _ https://www.hathitrust.org/bib_api
    """
    try:
        metadata = get_volume_metadata(id, marc)
        if sleep_time:
            sleep(sleep_time)
        return metadata
    except ValueError as err:
        logging.error(err)
        return dict()


def get_bulk_metadata(ids, marc=False):
    """
    Retrieve item metadata `from the HathiTrust Bibliographic API`_ for many
    HTIDs in a single request.

    Params:
    :param ids: HTIDs for the volumes to be retrieved
    :param marc: Retrieve MARC-XML within JSON return value.
    :raises RuntimeError: when the Bibliography API cannot be reached.

    .. _from the HathiTrust Bibliographic API: https://www.hathitrust.org/bib_api
    """
    biblio_api = "https://catalog.hathitrust.org/api/volumes"

    if marc:
        biblio_api += '/full'
    else:
        biblio_api += '/brief'

    query = '|'.join(['htid:' + id for id in ids])
    url = biblio_api + '/json/' + query

    metadata = dict()
    try:
        reader = codecs.getreader('utf-8')
        raw = json.load(reader(urlopen(url)))

        for id, data in raw.items():
            id = id.replace('htid:', '')
            if len(data['records']) == 1:
                for item in data['items']:
                    if item['htid'] == id:
                        item_md = data['records'][item['fromRecord']]
                        item_md.update(item)
                        metadata[id] = item_md
            else:
                # ambiguous or missing record: map the id to an empty dict
                metadata[id] = dict()
    except HTTPError:
        raise RuntimeError("Could not access HT Bibliography API.")

    return metadata


def safe_bulk_metadata(ids, marc=False, sleep_time=1):
    """
    Retrieve bulk item metadata `from the HathiTrust Bibliographic API`_.

    Unlike :method get_bulk_metadata:, this function returns an
    empty dictionary, rather than an error when metadata is missing.

    Params:
    :param ids: HTIDs for the volumes to be retrieved
    :param marc: Retrieve MARC-XML within JSON return value.
    :param sleep_time: throttle delay (seconds) after a successful request.

    _ https://www.hathitrust.org/bib_api
    """
    try:
        metadata = get_bulk_metadata(ids, marc)
        if sleep_time:
            sleep(sleep_time)
        return metadata
    except (ValueError, RuntimeError) as err:
        # Bug fix: get_bulk_metadata raises RuntimeError on API failure, but
        # only ValueError was caught, so this function's documented
        # "return empty dict" contract was broken.
        logging.error(err)
        return dict()


def get_metadata(ids, output_file=None):
    """
    Retrieves metadata for a folder of folders, where each subfolder is named
    for a HathiTrust ID. This structure is the default structure extracted from
    a Data API request (:method htrc.volumes.get_volumes:).
    """
    # data cleanup: filesystem-safe id forms back to canonical HTIDs
    ids = [str.strip(id).replace('+', ':').replace('=', '/') for id in ids]

    metadata = dict()
    # the bulk endpoint is chunked to 50 ids per request
    for segment in split_items(ids, 50):
        items = safe_bulk_metadata(segment)
        metadata.update(items)

    if output_file:
        with open(output_file, 'w') as outfile:
            json.dump(metadata, outfile)

    return metadata


def record_metadata(id, sleep_time=1):
    """
    Retrieve metadata for a HathiTrust Record.

    Returns a list of (enumcron, htid) tuples, one per item on the record.
    """
    # raw string fix: '\W' is an invalid escape warning on modern Python
    regex = re.compile(r'\W')
    url = "http://catalog.hathitrust.org/api/volumes/brief/recordnumber/{0}.json"

    url = url.format(id)
    r = requests.get(url)
    data = r.json()

    items = []
    if data:
        for item in data['items']:
            enum = regex.sub('', str(item.get('enumcron', '')).lower())
            htid = item.get('htid', '')
            items.append((enum, htid))
    else:
        items = []

    sleep(sleep_time)
    return items


def volume_solr_metadata(id, sleep_time=0.1):
    """
    Retrieve metadata from HTRC Solr API.

    The HTRC Solr instance is used only for certain extracted features
    unavailable in the main HathiTrust Bibliographic API. If you are a
    recipient of a HTRC Advanced Collaborative Support (ACS) grant,
    then you may have to use this function.
    """
    solr = "http://chinkapin.pti.indiana.edu:9994/solr/meta/select/?q=id:%s" % id
    solr += "&wt=json"  # retrieve JSON results
    if sleep_time:
        sleep(sleep_time)  # JUST TO MAKE SURE WE ARE THROTTLED
    try:
        reader = codecs.getreader('utf-8')
        data = json.load(reader(urlopen(solr)))
        return data['response']['docs'][0]
    except (ValueError, IndexError, HTTPError):
        logging.error("No result found for " + id)
        return dict()
# --- htrc/metadata/marc.py ---
"""
MARC CODE HANDLING
"""
from __future__ import print_function
from builtins import str

import xml.etree.ElementTree as ET


def parse_marc(raw):
    """Parse raw MARC-XML into an ElementTree element.

    lazy workaround: rename the namespace declaration so element lookups in
    get_marc_value need no namespace prefix.
    """
    raw = raw.replace(' xmlns', ' xmlnamespace')
    ET.register_namespace('', 'http://www.loc.gov/MARC21/slim')
    return ET.fromstring(raw)


def get_marc_value(xml, tag, code):
    """Return the text of datafield[@tag]/subfield[@code], or None if absent."""
    xpath = "{marc}datafield[@tag='{tag}']/{marc}subfield[@code='{code}']".format(
        tag=tag, code=code, marc='')  # marc="{http://www.loc.gov/MARC21/slim}")
    results = xml.findall(xpath)
    return results[0].text if results else None


def get_lccn_from_marc(xml):
    return get_marc_value(xml, '010', 'a')


def get_title_from_marc(xml):
    return get_marc_value(xml, '245', 'a')


def get_volume_from_marc(xml):
    return get_marc_value(xml, '974', 'c')


def get_lcc_from_marc(xml):
    """Collect LC call numbers from MARC tags 050a/b and 991h/i,
    joined with ';'."""
    lcc = list()
    val = get_marc_value(xml, '050', 'a')
    if val:
        lcc.append(val)

    val = get_marc_value(xml, '050', 'b')
    if val:
        # Bug fix: a $b subfield with no preceding $a left `lcc` empty, and
        # `lcc[-1]` raised IndexError; fall back to appending instead.
        if lcc:
            lcc[-1] += val
        else:
            lcc.append(val)

    val = get_marc_value(xml, '991', 'h')
    if val:
        lcc.append(val)

    val = get_marc_value(xml, '991', 'i')
    if val:
        # Same guard as above for $i without a preceding entry.
        if lcc:
            lcc[-1] += val
        else:
            lcc.append(val)

    return ";".join(lcc)


# --- htrc/mock/volumes/__init__.py ---
#!/usr/bin/env python
"""
`htrc.mock.volumes`

Contains functions to test the volume retrieval from the HTRC Data API.
The download functions will return a sample zip file.

See the core documentation for an example of how to use this library.
"""
from builtins import input

from configparser import RawConfigParser as ConfigParser
from io import BytesIO
import os, os.path
from zipfile import ZipFile  # used to decompress requested zip archives.

from htrc.lib.cli import bool_prompt
from htrc.auth import credential_prompt
from htrc.config import save_jwt_token

# The canned archive returned by every mock download.
EXAMPLE_FILE = os.path.join(os.path.dirname(__file__), 'example.zip')


def get_volumes(token, volume_ids, concat=False):
    """
    Returns volumes from the Data API as a raw zip stream.

    Parameters:
    :token: An OAuth2 token for the app.
    :volume_ids: A list of volume_ids
    :concat: If True, return a single file per volume. If False, return a single
    file per page (default).
    """
    if not volume_ids:
        raise ValueError("volume_ids is empty.")

    with open(EXAMPLE_FILE, 'rb') as infile:
        data = infile.read()

    return data
def get_pages(token, page_ids, concat=False):
    """
    Returns a ZIP file containing specific pages.

    Parameters:
    :token: An OAuth2 token for the app.
    :page_ids: A list of page ids
    :concat: If True, return a single file per volume. If False, return a single
    file per page (default).
    """
    if not page_ids:
        raise ValueError("page_ids is empty.")

    with open(EXAMPLE_FILE, 'rb') as sample:
        payload = sample.read()

    return payload


def get_oauth2_token(username, password):
    """Return a canned OAuth2 token (mock implementation)."""
    return 'a1b2c3d4e5f6'


def credentials_from_config(path):
    """
    Mock credential lookup for the Data API: always returns (None, None) and,
    unlike the real implementation, never raises for an invalid path.
    See also: credential_prompt
    """
    return (None, None)


def download_volumes(volume_ids, output_dir, username=None, password=None):
    """Extract the bundled example archive into output_dir (created if absent),
    exercising the same token/request call sequence as the real client."""
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Retrieve token and "download" the canned volumes.
    token = get_oauth2_token(username, password)
    data = get_volumes(token, volume_ids, False)

    with open(EXAMPLE_FILE, 'rb') as archive:
        bundle = ZipFile(archive)
        bundle.extractall(output_dir)
        bundle.close()


def download(args):
    """CLI entry point: read one volume id per line from args.file and
    delegate to download_volumes."""
    with open(args.file) as id_file:
        volume_ids = [line.strip() for line in id_file]

    return download_volumes(volume_ids, args.output, args.username, args.password)


# --- htrc/models/__init__.py ---
import os
from abc import ABC, abstractmethod
from typing import List


class Page(ABC):
    """Abstract page: concrete subclasses supply the raw text lines."""

    @property
    @abstractmethod
    def text_lines(self) -> List[str]:
        """The lines of text on the page."""
        pass

    @property
    def text(self) -> str:
        """The full page text, joined with the platform line separator."""
        return os.linesep.join(self.text_lines)


class PageStructure(Page, ABC):
    """A Page whose lines are partitioned into header, body and footer by the
    num_header_lines / num_footer_lines counters."""

    def __init__(self) -> None:
        self.num_header_lines = 0
        self.num_footer_lines = 0

    @property
    def has_header(self) -> bool:
        return self.num_header_lines > 0

    @property
    def has_body(self) -> bool:
        body_count = len(self.text_lines) - self.num_header_lines - self.num_footer_lines
        return body_count > 0

    @property
    def has_footer(self) -> bool:
        return self.num_footer_lines > 0

    @property
    def header_lines(self) -> List[str]:
        return self.text_lines[:self.num_header_lines]

    @property
    def body_lines(self) -> List[str]:
        end = len(self.text_lines) - self.num_footer_lines
        return self.text_lines[self.num_header_lines:end]

    @property
    def footer_lines(self) -> List[str]:
        # Guard: without it, a [-0:] slice would return *all* lines.
        if not self.has_footer:
            return []
        return self.text_lines[-self.num_footer_lines:]

    @property
    def header(self) -> str:
        return os.linesep.join(self.header_lines)

    @property
    def body(self) -> str:
        return os.linesep.join(self.body_lines)

    @property
    def footer(self) -> str:
        return os.linesep.join(self.footer_lines)
class HtrcPage(Page):
    """Simplest concrete Page: wraps a pre-split list of text lines."""

    def __init__(self, lines: List[str]) -> None:
        self._lines = lines

    @property
    def text_lines(self) -> List[str]:
        return self._lines


# --- htrc/runningheaders/__init__.py ---
import re
from collections import defaultdict
from typing import List, TypeVar, Set, Iterator, Optional, Tuple, Dict

from htrc.models import Page, PageStructure
from htrc.hf_utils import clean_text, levenshtein, pairwise_combine_within_distance, flatten, group_consecutive_when

T = TypeVar('T', bound=Page)
U = TypeVar('U', bound=PageStructure)


class _Line:
    """One line of one page, carrying a cleaned form used for fuzzy matching.

    Identity (equality/hash) is (page identity, line number), not text.
    """

    def __init__(self, text: str, line_number: int, page: Page) -> None:
        self.text = text
        self.line_number = line_number
        self.page = page
        self.cleaned_text = clean_text(text)

    def __eq__(self, o: object) -> bool:
        if not isinstance(o, _Line):
            # Bug fix: the original `raise NotImplemented` raised
            # "TypeError: exceptions must derive from BaseException";
            # returning the NotImplemented singleton is the protocol for
            # letting Python try the reflected comparison.
            return NotImplemented

        are_equal = self.page is o.page and self.line_number == o.line_number

        return are_equal

    def __ne__(self, o: object) -> bool:
        return not self == o

    def __hash__(self) -> int:
        # Mirrors __eq__: combine line number and page identity.
        line_hash = hash(self.line_number)
        page_hash = hash(self.page)
        hash_value = 31 * line_hash + page_hash

        return hash_value

    def __str__(self) -> str:
        return str((self.line_number, self.cleaned_text))

    def similarity_ratio(self, line: '_Line') -> float:
        """1 - normalized Levenshtein distance between the cleaned texts."""
        ratio = 1 - float(levenshtein(self.cleaned_text, line.cleaned_text)) / max(len(self.cleaned_text),
                                                                                  len(line.cleaned_text))

        return ratio


def parse_page_structure(pages: List[T],
                         window_size: int = 6,
                         min_similarity_ratio: float = 0.7,
                         min_cluster_size: int = 3,
                         max_header_lines: int = 3,
                         max_footer_lines: int = 3) -> List[U]:
    """Detect running headers/footers across `pages` by clustering similar
    lines from nearby pages, then tag each page (in place) with
    num_header_lines / num_footer_lines via a dynamic PageStructure subclass.
    """
    def _get_page_lines(p: T) -> List[_Line]:
        return [_Line(text, line_num, p) for line_num, text in enumerate(p.text_lines)]

    def _cluster_lines(lines: List[Tuple[_Line, _Line]]) -> Set[tuple]:
        # Union-find-style agglomeration: each similar pair joins/merges clusters.
        cluster_map = {}

        for l1, l2 in lines:
            c1 = cluster_map.get(l1)
            c2 = cluster_map.get(l2)

            if c1 is not None and c2 is not None and c1 is not c2:
                # merge the smaller cluster into the larger one
                smaller, larger = (c1, c2) if len(c1) < len(c2) else (c2, c1)
                larger.extend(smaller)
                for x in smaller:
                    cluster_map[x] = larger
            elif c1 is not None and c2 is None:
                c1.append(l2)
                cluster_map[l2] = c1
            elif c1 is None and c2 is not None:
                c2.append(l1)
                cluster_map[l1] = c2
            elif c1 is None and c2 is None:
                c = [l1, l2]
                cluster_map[l1] = c
                cluster_map[l2] = c

        return set(map(tuple, cluster_map.values()))

    def _group_lines_by_page(lines: Iterator[_Line]) -> Dict[Page, List[_Line]]:
        lines_grouped_by_page = defaultdict(list)
        for line in lines:
            lines_grouped_by_page[line.page].append(line)

        return lines_grouped_by_page

    def _get_last_header_line(lines: List[_Line]) -> Optional[int]:
        if not lines:
            return None

        return max(l.line_number for l in lines)

    def _get_first_footer_line(lines: List[_Line]) -> Optional[int]:
        if not lines:
            return None

        return min(l.line_number for l in lines)

    def _extract_line_numbers(line: _Line) -> Tuple[_Line, List[int]]:
        # whitespace-delimited runs of 1-4 digits (candidate page numbers)
        numbers = [int(match.group(0)) for match in
                   re.finditer(r"(?:(?<=^)|(?<=\s))\d{1,4}(?=\s|$)", line.text, flags=re.UNICODE)]

        return line, numbers

    def _extract_potential_page_numbers(lines: List[_Line]) -> Tuple[_Line, List[int]]:
        assert len(lines) > 0
        line, numbers = _extract_line_numbers(lines[-1])
        # if the last line is blank and numberless, look one line up
        if not numbers and not str.strip(line.text) and len(lines) > 1:
            line, numbers = _extract_line_numbers(lines[-2])

        return line, numbers

    candidate_header_lines = []
    candidate_footer_lines = []

    pages_lines = [_get_page_lines(p) for p in pages]

    for lines in pages_lines:
        # ignore lines that are <4 characters long and/or have no alphabetic characters
        candidate_header_lines.append([l for l in lines[:max_header_lines] if not len(l.cleaned_text) < 4])
        candidate_footer_lines.append([l for l in lines[-max_footer_lines:] if not len(l.cleaned_text) < 4])

    # compare candidate lines only between pages at most `window_size` apart
    headers_for_comparison = pairwise_combine_within_distance(candidate_header_lines, window_size)
    footers_for_comparison = pairwise_combine_within_distance(candidate_footer_lines, window_size)

    header_line_similarities = []
    for (lines1, lines2) in headers_for_comparison:
        header_line_similarities.extend(
            (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio)

    footer_line_similarities = []
    for (lines1, lines2) in footers_for_comparison:
        footer_line_similarities.extend(
            (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio)

    header_clusters = [cluster for cluster in _cluster_lines(header_line_similarities) if
                       len(cluster) >= min_cluster_size]
    footer_clusters = [cluster for cluster in _cluster_lines(footer_line_similarities) if
                       len(cluster) >= min_cluster_size]

    if not footer_clusters:
        # Fallback: look for consecutive page numbers at the bottom of pages.
        potential_page_numbers = [_extract_potential_page_numbers(lines) for lines in pages_lines if lines]
        potential_page_numbers = [(line, numbers[0]) for line, numbers in potential_page_numbers if len(numbers) == 1]
        potential_clusters = map(lambda group: tuple(map(lambda t: t[0], group)),
                                 group_consecutive_when(potential_page_numbers, lambda x, y: y[1] - x[1] == 1))
        footer_clusters = [cluster for cluster in potential_clusters if len(cluster) >= min_cluster_size]

    header_lines_grouped_by_page = _group_lines_by_page(flatten(header_clusters))
    footer_lines_grouped_by_page = _group_lines_by_page(flatten(footer_clusters))

    last_header_line_pages_map = {p: _get_last_header_line(lines) for p, lines in header_lines_grouped_by_page.items()}
    first_footer_line_pages_map = {p: _get_first_footer_line(lines) for p, lines in
                                   footer_lines_grouped_by_page.items()}

    for page in pages:
        last_header_line = last_header_line_pages_map.get(page)
        first_footer_line = first_footer_line_pages_map.get(page)
        # Retarget the instance's class to a dynamic subclass mixing in
        # PageStructure so the header/footer counters can be attached.
        page.__class__ = type('StructuredPage', (page.__class__, PageStructure), {})
        page.num_header_lines = last_header_line + 1 if last_header_line is not None else 0
        page.num_footer_lines = len(page.text_lines) - first_footer_line if first_footer_line is not None else 0

    return pages
# --- htrc/tools/mallet.py ---
from builtins import str
import os, os.path
import subprocess
import sys
import tarfile
import urllib.request

from htrc.volumes import download_volumes
from htrc.workset import path_to_volumes

MALLET_DIR = os.path.expanduser('~/mallet')


# Mallet is downloaded and installed in the user's home directory
def install_mallet():
    """Download and unpack Mallet 2.0.8RC3 into ~/mallet."""
    if not os.path.exists(MALLET_DIR):
        os.makedirs(MALLET_DIR)
        mallet_zip = urllib.request.urlopen('http://mallet.cs.umass.edu/dist/mallet-2.0.8RC3.tar.gz')
        # Bug fix: tarfile.open's first positional argument is a file *name*;
        # a urlopen response object must be passed via `fileobj=`.
        # NOTE(review): extractall on a remote archive trusts its member
        # paths; consider a path-sanitizing filter.
        mallet_tar = tarfile.open(fileobj=mallet_zip, mode="r:gz")
        mallet_tar.extractall(path=MALLET_DIR)
        mallet_tar.close()


def main(path, topics, iterations, output_dir='/media/secure_volume/workset/', debug=False):
    """Download a workset if `path` is not a directory, then import it into
    MALLET and train a topic model.

    :param path: workset path/URL, or a directory of pre-downloaded volumes
    :param topics: number of topics to train
    :param iterations: number of training iterations
    :param output_dir: where downloaded volumes are stored
    :param debug: when True, re-raise download errors instead of exiting
        (new keyword, default False — backward compatible)
    """
    if not os.path.exists(MALLET_DIR):
        if not os.path.exists('/media/secure_volume/'):
            print('Installing Mallet ...')
            install_mallet()
            print('\n')
        else:
            print('Mallet not installed, but capsule is in secure mode.')
            print('Switch to maintenance mode and run this command again')
            print('to install Mallet. Then, switch to secure mode to train')
            print('topic models.')
            sys.exit(1)

    if not os.path.isdir(path):
        try:
            volumes = path_to_volumes(path)
        except ValueError as e:
            print("Could not process workset. {}".format(str(e)))
            sys.exit(1)

        try:
            download_volumes(volumes, output_dir)
        except OSError as e:
            if not os.path.exists('/media/secure_volume/'):
                print('Secure volume not mounted. Could not download volumes')
                sys.exit(1)
            else:
                print("Could not download volumes. {} {}".format(e.strerror, e.filename))
                sys.exit(1)
        except RuntimeError as e:
            # Bug fix: this branch referenced an undefined name `args`
            # (NameError at runtime); `debug` is now an explicit parameter.
            if not debug:
                print("Could not download volumes. {}".format(str(e)))
                sys.exit(1)
            else:
                raise e
        path = output_dir

    # import the workset to MALLET format.
    subprocess.check_call([
        '{}/mallet-2.0.8RC3/bin/mallet'.format(MALLET_DIR),
        'import-dir',
        '--input', path,
        '--output', os.path.join(path, '../corpus.mallet'),
        '--keep-sequence',
        '--remove-stopwords'
    ])

    subprocess.check_call([
        '{}/mallet-2.0.8RC3/bin/mallet'.format(MALLET_DIR),
        'train-topics',
        '--input', os.path.join(path, '../corpus.mallet'),
        '--num-topics', str(topics),
        '--output-state', os.path.join(path, '../mallet_state.gz'),
        '--output-topic-keys', os.path.join(path, '../mallet_topic-keys.txt'),
        '--output-doc-topics', os.path.join(path, '../mallet_doc-topics.txt'),
        '--num-iterations', str(iterations)
    ])


def populate_parser(parser=None):
    """Attach the MALLET CLI arguments to `parser` (created if None)."""
    if parser is None:
        from argparse import ArgumentParser
        parser = ArgumentParser()
    parser.add_argument('-k', help="number of topics", required=True)
    parser.add_argument('--iter', help="number of iterations", default=200)
    parser.add_argument('--workset-path', help="Location to store workset download.",
                        default='/media/secure_volume/workset/')
    parser.add_argument('path', default='/media/secure_volume/workset/',
                        nargs='?')
    return parser


if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser(description="MALLET tools for the HTRC")
    populate_parser(parser)
    args = parser.parse_args()

    main(args.path, args.k, args.iter, args.workset_path)
-------------------------------------------------------------------------------- 1 | from builtins import map 2 | import os.path 3 | import subprocess 4 | from tempfile import NamedTemporaryFile 5 | 6 | from htrc.volumes import download_volumes 7 | from htrc.workset import path_to_volumes 8 | import sys 9 | 10 | 11 | def main(path, topics, iterations, output_dir='/media/secure_volume/workset'): 12 | if os.path.exists("/media/secure_volume"): 13 | # If in secure mode, downlaod the volumes from data api 14 | try: 15 | volumes = path_to_volumes(path) 16 | except ValueError as e: 17 | print("Could not process workset. {}".format(str(e))) 18 | sys.exit(1) 19 | 20 | try: 21 | download_volumes(volumes, output_dir) 22 | except OSError as e: 23 | if not os.path.exists('/media/secure_volume/'): 24 | print('Secure volume not mounted. Could not download volumes') 25 | sys.exit(1) 26 | else: 27 | print("Could not download volumes. {} {}".format(e.strerror, e.filename)) 28 | sys.exit(1) 29 | except RuntimeError as e: 30 | if not args.debug: 31 | print("Could not download volumes. {}".format(str(e))) 32 | sys.exit(1) 33 | else: 34 | raise e 35 | path = output_dir 36 | 37 | elif not os.path.exists(path): 38 | # If in maintenance mode, use extracted features. 39 | # Assume that if an existing path is given, it is a pre-downloaded set 40 | # or a file containing hathitrust ids and continue. 41 | # If the path does not exist, assume it is a url to a hathitrust 42 | # collection and write volumes list into a temporary file for 43 | # proper handling by extracted features downloader 44 | try: 45 | volumes = path_to_volumes(path) 46 | 47 | volfile = NamedTemporaryFile(prefix='htrc-workset', delete=False) 48 | volfile.write(bytes('\n'.join(volumes), "ascii")) 49 | 50 | path = volfile.name 51 | 52 | volfile.close() 53 | 54 | except ValueError as e: 55 | print("Could not process workset. {}".format(str(e))) 56 | sys.exit(1) 57 | 58 | # strip trailing slash for topic support. 
59 | if path.endswith('/'): 60 | path = path[:-1] 61 | 62 | # training the topics on the data from above. 63 | subprocess.check_call([ 64 | 'topicexplorer', 'init', path, 65 | '--name', '"HathiTrust Workset"', 66 | '--rebuild', '--htrc', '-q' 67 | ]) 68 | subprocess.check_call([ 69 | 'topicexplorer', 'prep', path, 70 | '-q', '--min-word-len', '3', '--lang', 'en', 71 | '--high', '30', '--low', '10' 72 | ]) 73 | subprocess.check_call([ 74 | 'topicexplorer', 'train', path, 75 | '-k'] + list(map(str,topics)) + [ 76 | '--iter', str(iterations), 77 | '--context-type', 'book', 78 | '-q' 79 | ]) 80 | 81 | subprocess.check_call([ 82 | 'topicexplorer', 'launch', path 83 | ]) 84 | 85 | def populate_parser(parser=None): 86 | if parser is None: 87 | from argparse import ArgumentParser 88 | parser = ArgumentParser() 89 | 90 | parser.add_argument('-k', type=int, nargs='+', required=True, 91 | help="number of topics") 92 | parser.add_argument('--iter', help="number of iterations", default=200) 93 | parser.add_argument('path', default='/media/secure_volume/workset', 94 | nargs='?') 95 | parser.add_argument('--workset-path', help="Location to store workset download.", 96 | default='/media/secure_volume/workset') 97 | return parser 98 | 99 | if __name__ == '__main__': 100 | from argparse import ArgumentParser 101 | parser = ArgumentParser(description="Topic Explorer tools for the HTRC") 102 | populate_parser(parser) 103 | args = parser.parse_args() 104 | 105 | main(args.path, args.k, args.iter, args.workset_path) 106 | -------------------------------------------------------------------------------- /htrc/util/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import math 4 | 5 | from .resolve import ORG_CODES 6 | 7 | 8 | def split_items(seq, split_size): 9 | """ 10 | Returns a generator that returns portions of `seq` up to `split_size`. 11 | Useful when chunking requests to bulk endpoints. 
def split_items(seq, split_size):
    """
    Returns a generator that yields portions of `seq` up to `split_size`.
    Useful when chunking requests to bulk endpoints.

    :param seq: A sequence to split.
    :param split_size: The maximum size of each split.
    """
    # Idiom fix: a single stepped range replaces the original two-phase
    # floor-count loop plus remainder check; behavior is identical for any
    # positive split_size (an empty seq yields nothing).
    for start in range(0, len(seq), split_size):
        yield seq[start:start + split_size]
# List of organization codes in HathiTrust Digital Library
# Derived from https://github.com/Bookworm-project/Bookworm-MARC/issues/1
ORG_CODES = {
    "mdp": "University of Michigan",
    "miua": "University of Michigan",
    "miun": "University of Michigan",
    "wu": "University of Wisconsin",
    "inu": "Indiana University",
    "uc1": "University of California",
    "uc2": "University of California",
    "pst": "Penn State University",
    "umn": "University of Minnesota",
    "nnc1": "Columbia University",
    "nnc2": "Columbia University",
    "nyp": "New York Public Library",
    "uiuo": "University of Illinois",  # NOTE(review): possibly a typo for "uiuc" -- confirm upstream
    "njp": "Princeton University",
    "yale": "Yale University",
    "chi": "University of Chicago",
    "coo": "Cornell University",
    "ucm": "Universidad Complutense de Madrid",
    "loc": "Library of Congress",
    "ien": "Northwestern University",
    "hvd": "Harvard University",
    "uva": "University of Virginia",
    "dul1": "Duke University",
    "ncs1": "North Carolina State University",
    "nc01": "University of North Carolina",
    "pur1": "Purdue University",
    "pur2": "Purdue University",
    "mdl": "Minnesota Digital Library",
    "usu": "Utah State University Press",
    "gri": "Getty Research Institute",
    "uiug": "University of Illinois",
    "psia": "Penn State University",
    "bc": "Boston College",
    "ufl1": "University of Florida",
    "ufl2": "University of Florida",
    "txa": "Texas A&M University",
    "keio": "Keio University",
    "osu": "The Ohio State University",
    "uma": "University of Massachusetts",  # typo fix: was "Massachusets"
    "udel": "University of Delaware",
    "caia": "Clark Art Institute Library"
}


def parse_record_id(string, fix_truncated_id=False):
    # type: (str, bool) -> str
    '''
    Takes either a record ID or a HT URL for a record.
    Returns a string containing the record ID.

    Raises ValueError when no numeric ID is found, or when the ID is not
    9 digits long and `fix_truncated_id` is False.

    >>> parse_record_id('https://catalog.hathitrust.org/Record/000234911')
    '000234911'
    >>> parse_record_id('001022499')
    '001022499'
    >>> parse_record_id('1022499', fix_truncated_id=True)
    '001022499'
    '''
    # Fix: dots in the host are now escaped so they match literally.
    REGEX = r'(?:http[s]?://catalog\.hathitrust\.org/Record/)?(\d+)'

    try:
        record = re.search(REGEX, string).group(1)
    except AttributeError:
        raise ValueError("No record ID found in string: {}".format(string))

    # Correct truncated IDs or raise error.
    if len(record) != 9:
        if fix_truncated_id:
            record = record.zfill(9)  # idiomatic left-pad with zeros
        else:
            raise ValueError("Invalid record ID. Valid record IDs are 9 digits. " +
                             "Call parse_record_id(string, fix_truncated_id=True) to correct.")

    return record


def parse_volume_id(string):
    # type: (str) -> str
    '''
    Takes either a volume ID, HT URL, or Handle URL for a volume.
    Returns a string containing the HTID.

    Raises ValueError when the resolved ID does not carry a known
    organization code (see ORG_CODES).
    '''
    # BUG FIX: `htid` was previously unbound when a babel.hathitrust.org URL
    # had no query string, raising NameError below; initialize it up front so
    # such URLs fail with the intended ValueError instead.
    htid = None

    # First extract the volume ID from a URL, fallback to assume string.
    parsed_url = urlparse(string)
    if parsed_url.netloc == 'hdl.handle.net':
        # Parse the Handle ID, ex:
        # https://hdl.handle.net/2027/uc2.ark:/13960/fk92805m1s'
        # Note that if the Handle URL contains page info, this is discarded.
        htid = parsed_url.path.replace('/2027/', '')

    elif parsed_url.netloc == 'babel.hathitrust.org':
        # Parse the HT Digital Library URL, ex:
        # https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7
        if parsed_url.query:
            htid = parse_qs(parsed_url.query).get('id', None)
            if htid is not None:
                htid = htid[0]
                # On Python < 3.9.2 parse_qs also splits on ';', so the
                # split below is a no-op there; on newer versions it strips
                # the ';view=...;seq=...' suffix.
                if ';' in htid:
                    htid = htid.split(';')[0]

    else:
        htid = string

    # Validate ID against ORG_CODES.
    # Won't guarantee volume existence, but it is a sanity check.
    if htid and any(htid.startswith(org) for org in ORG_CODES):
        return htid
    else:
        raise ValueError("Invalid Organization Code in HathiTrust ID")


def volume_id_to_record_id(volume_id):
    # type: (str) -> str
    """
    Takes a volume id and returns a record id.

    Performs a network round-trip: the catalog redirects the HTID URL to the
    canonical record URL, which is then parsed.

    See also: `parse_record_id`
    """
    URL = 'https://catalog.hathitrust.org/Record/HTID/{}'.format(volume_id)
    record_url = urlopen(URL).geturl()
    return parse_record_id(record_url)
def record_id_to_volume_ids(record_id):
    """
    Takes a record id and returns a list of corresponding volume ids.

    HathiTrust is a Digital Library, but is composed of scans of physical
    artifacts. A single catalog record may correspond to multiple volumes
    in print, especially among pre-20th century texts. Additionally, a single
    catalog record may correspond to multiple scans from multiple libraries.

    This function resolves these ambiguities by selecting only a single copy
    per unique volume label. For example, if a book was printed as three
    volumes labeled 'v. 1', 'v. 2', and 'v. 3', each scanned by four
    libraries, this function returns a list of 3 volume ids.

    Raises KeyError when the record has no items. Performs a network request
    against the HathiTrust Bibliographic API.

    Future iterations of this function may take a list of preferred sources
    based on ORG_CODE and attempt to use same-source volumes for consistency.
    """
    # Get record from BibAPI
    URL = "http://catalog.hathitrust.org/api/volumes/brief/recordnumber/{0}.json"
    URL = URL.format(record_id)
    data = urlopen(URL)
    data = json.load(data)
    data = data['items']

    if not data:
        raise KeyError("No items found for record ID: {}".format(record_id))

    # Normalize volume labels and collapse duplicates: a dict keeps one htid
    # per normalized label. (Fix: the regex is now a raw string -- '\W' was a
    # DeprecationWarning-producing escape.)
    REGEX = re.compile(r'\W')
    items = {('DEFAULT' if not item['enumcron']
              else REGEX.sub('', item['enumcron'])): item['htid']
             for item in data}

    # Return the list of volume ids. (The original re-checked emptiness here,
    # but a non-empty `data` always yields a non-empty dict.)
    return list(items.values())
def get_volumes(data_api_config: 'htrc.config.HtrcDataApiConfig', volume_ids, concat=False, mets=False, buffer_size=128):
    """
    Returns volumes from the Data API as a raw zip stream.

    Parameters:
    :data_api_config: Data API endpoint configuration (token, host, port,
        epr, cert, key). Annotation is a string literal so the module can be
        loaded without evaluating htrc.config at definition time.
    :volume_ids: A list of volume_ids
    :concat: If True, return a single file per volume. If False, return a single
        file per page (default).
    :mets: If True, also request the METS metadata for each volume.
    :buffer_size: Bytes read from the HTTP response per iteration.

    Raises ValueError on an empty id list and EnvironmentError when the Data
    API responds with a non-200 status.
    """
    if not volume_ids:
        raise ValueError("volume_ids is empty.")

    url = data_api_config.epr + "volumes"

    # Sanity check: ids must contain a '.' and no spaces. Invalid ids are
    # only reported; the request still proceeds (TODO: consider raising).
    for id in volume_ids:
        if ("." not in id
                or " " in id):
            print("Invalid volume id " + id + ". Please correct this volume id and try again.")

    data = {'volumeIDs': '|'.join(
        [id.replace('+', ':').replace('=', '/') for id in volume_ids])}

    if concat:
        data['concat'] = 'true'

    if mets:
        data['mets'] = 'true'

    # Authorization
    headers = {"Authorization": "Bearer " + data_api_config.token,
               "Content-type": "application/x-www-form-urlencoded"}

    # Create SSL context
    # TODO: Fix SSL cert verification
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    #ctx.verify_mode = ssl.CERT_NONE

    # Retrieve the volumes (key_file/cert_file are deprecated in newer
    # Pythons; migrating to ctx.load_cert_chain is a future cleanup).
    httpsConnection = http.client.HTTPSConnection(
        data_api_config.host,
        data_api_config.port,
        context=ctx,
        key_file=data_api_config.key,
        cert_file=data_api_config.cert)

    httpsConnection.request("POST", url, urlencode(data), headers)

    response = httpsConnection.getresponse()

    # BUG FIX: was `response.status is 200` -- identity comparison with an
    # int literal is an implementation detail (SyntaxWarning on CPython 3.8+).
    if response.status == 200:
        body = True
        data = BytesIO()
        bytes_downloaded = 0
        bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength,
                                      widgets=[progressbar.AnimatedMarker(), ' ',
                                               progressbar.DataSize(),
                                               ' (', progressbar.FileTransferSpeed(), ')'])

        while body:
            body = response.read(buffer_size)
            data.write(body)
            bytes_downloaded += len(body)
            bar.update(bytes_downloaded)

        data = data.getvalue()
    else:
        logging.debug("Unable to get volumes")
        logging.debug("Response Code: {}".format(response.status))
        logging.debug("Response: {}".format(response.reason))
        raise EnvironmentError("Unable to get volumes.")

    if httpsConnection is not None:
        httpsConnection.close()

    return data
def get_pages(data_api_config: 'htrc.config.HtrcDataApiConfig', page_ids, concat=False, mets=False, buffer_size=128):
    """
    Returns a ZIP file containing specific pages.

    Parameters:
    :data_api_config: The configuration data of the DataAPI endpoint.
        Annotation is a string literal so the module can be loaded without
        evaluating htrc.config at definition time.
    :page_ids: A list of page ids.
    :concat: If True, return a single file per volume. If False, return a single
        file per page (default). Mutually exclusive with `mets`.
    :mets: If True, also request METS metadata.
    :buffer_size: Bytes read from the HTTP response per iteration.

    Raises ValueError on an empty id list and EnvironmentError when the Data
    API responds with a non-200 status.
    """
    if not page_ids:
        raise ValueError("page_ids is empty.")

    url = data_api_config.epr + "pages"

    # Sanity check; invalid ids are only reported, the request still proceeds.
    for id in page_ids:
        if ("." not in id
                or " " in id):
            print("Invalid volume id " + id + ". Please correct this volume id and try again.")

    data = {'pageIDs': '|'.join(
        [id.replace('+', ':').replace('=', '/') for id in page_ids])}

    # concat+mets is rejected earlier by download_volumes; here it is only
    # reported (kept for backward compatibility -- callers may rely on it).
    if concat and mets:
        print("Cannot set both concat and mets with pages.")
    elif concat:
        data['concat'] = 'true'
    elif mets:
        data['mets'] = 'true'

    # Authorization
    headers = {"Authorization": "Bearer " + data_api_config.token,
               "Content-type": "application/x-www-form-urlencoded"}

    # Create SSL context
    # TODO: Fix SSL cert verification
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    #ctx.verify_mode = ssl.CERT_NONE

    # Retrieve the pages
    httpsConnection = http.client.HTTPSConnection(
        data_api_config.host,
        data_api_config.port,
        context=ctx,
        key_file=data_api_config.key,
        cert_file=data_api_config.cert
    )

    httpsConnection.request("POST", url, urlencode(data), headers)

    response = httpsConnection.getresponse()

    # BUG FIX: was `response.status is 200` -- use equality, not identity.
    if response.status == 200:
        body = True
        data = BytesIO()
        bytes_downloaded = 0
        bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength,
                                      widgets=[progressbar.AnimatedMarker(), ' ',
                                               progressbar.DataSize(),
                                               ' (', progressbar.FileTransferSpeed(), ')'])

        while body:
            body = response.read(buffer_size)
            data.write(body)
            bytes_downloaded += len(body)
            bar.update(bytes_downloaded)

        data = data.getvalue()
    else:
        logging.debug("Unable to get pages")
        # BUG FIX: the original called "...: ".format(x) with no {} placeholder,
        # so the status/reason were silently dropped from the log output.
        logging.debug("Response Code: %s", response.status)
        logging.debug("Response: %s", response.reason)
        raise EnvironmentError("Unable to get pages.")

    if httpsConnection is not None:
        httpsConnection.close()

    return data
def grep_error(file_name, output_dir, pattern, txt_index):
    """
    Scan a report file for lines containing `pattern` and collect the
    whitespace-separated token at `txt_index` from each matching line.

    Returns an empty list when the file does not exist.
    (Fix: the original had a duplicated, unreachable `return` statement.)
    """
    na_volume = []
    file_path = os.path.join(output_dir, file_name)

    if os.path.isfile(file_path):
        for line in open(file_path):
            if pattern in line:
                na_volume.append(line.split()[txt_index])

    return na_volume


def _to_htrc_page(page_file, zip):
    """Read one page file out of an open ZipFile and wrap it as an HtrcPage."""
    with TextIOWrapper(BytesIO(zip.read(page_file)), encoding='utf-8') as page:
        return HtrcPage([line.rstrip() for line in page.readlines()])


def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=False,
                     remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, skip_removed_hf=False,
                     parallelism=multiprocessing.cpu_count(), batch_size=250, data_api_config=None):
    """
    Download volumes (or pages) from the HTRC Data API into `output_dir`.

    Volumes are fetched in batches of `batch_size`; header/footer removal, if
    requested, is parallelized over a process pool of size `parallelism`.
    Error and rights reports returned by the API are accumulated and written
    alongside the downloaded texts; unavailable volumes are listed in
    'volumes_not_available.txt'.

    Raises ValueError for a bad parallelism level or concat+mets with pages,
    and RuntimeError on API timeouts or missing credentials.
    """
    if not 0 < parallelism <= multiprocessing.cpu_count():
        raise ValueError("Invalid parallelism level specified")

    remove_hf_fun = partial(
        _remove_headers_footers_and_save,
        concat=concat,
        hf_min_similarity=hf_min_similarity,
        hf_window_size=hf_window_size,
        skip_removed_hf=skip_removed_hf,
        output_dir=output_dir
    )

    volume_ids = list(set(volume_ids))  # ensure unique volume ids
    num_vols = len(volume_ids)

    data_api_config = data_api_config or htrc.config.HtrcDataApiConfig()

    os.makedirs(output_dir, exist_ok=True)

    # BUG FIX: the original tested `any((token, host, port)) is not None`,
    # which is always True because any() returns a bool -- the credential
    # check below never fired. Require all three to be present.
    if all(x is not None for x in (data_api_config.token,
                                   data_api_config.host,
                                   data_api_config.port)):
        logging.info("obtained token: %s\n" % data_api_config.token)

        try:
            errors = []
            rights = []

            with tqdm(total=num_vols) as progress, multiprocessing.Pool(processes=parallelism) as pool:
                for ids in split_items(volume_ids, batch_size):
                    if pages:
                        if concat and mets:
                            raise ValueError("Cannot set both concat and mets with pages.")
                        else:
                            data = get_pages(data_api_config, ids, concat and not remove_headers_footers, mets)
                    else:
                        data = get_volumes(data_api_config, ids, concat and not remove_headers_footers, mets)

                    volumes = []

                    with ZipFile(BytesIO(data)) as vols_zip:
                        zip_list = vols_zip.namelist()
                        if 'ERROR.err' in zip_list:
                            errors.append(vols_zip.read('ERROR.err').decode('utf-8'))
                            zip_list.remove('ERROR.err')
                        if 'volume-rights.txt' in zip_list:
                            rights_data = vols_zip.read('volume-rights.txt').decode('utf-8')
                            zip_list.remove('volume-rights.txt')
                            if not rights:
                                rights.append(rights_data)
                            else:
                                # due to the format in which 'volume-rights.txt' is created, we have to skip
                                # the first 4 lines which make up the header of the file, to extract only the
                                # actual volume rights data for accumulation
                                rights.append(''.join(rights_data.splitlines(keepends=True)[4:]))

                        zip_volume_paths = [zip_vol_path for zip_vol_path in zip_list if zip_vol_path.endswith('/')]
                        num_vols_in_zip = len(zip_volume_paths)

                        if not remove_headers_footers:
                            vols_zip.extractall(output_dir, members=zip_list)
                            progress.update(num_vols_in_zip)
                        else:
                            for zip_vol_path in zip_volume_paths:
                                sorted_vol_zip_page_paths = sorted(
                                    zip_page_path for zip_page_path in zip_list
                                    if zip_page_path.startswith(zip_vol_path) and not zip_page_path.endswith('/'))
                                vol_pages = [_to_htrc_page(page_path, vols_zip) for page_path in sorted_vol_zip_page_paths]
                                volumes.append((zip_vol_path, sorted_vol_zip_page_paths, vol_pages))

                    del data, vols_zip

                    num_missing = batch_size - num_vols_in_zip if num_vols >= batch_size else num_vols - num_vols_in_zip
                    progress.update(num_missing)  # include the missing volumes in the progress state

                    # `volumes` is empty when `remove_headers_footers=False` since the ZIP
                    # was extracted without further processing
                    if volumes:
                        for _ in pool.imap_unordered(remove_hf_fun, volumes):
                            progress.update()

            na_volumes_all = []

            if errors:
                with open(os.path.join(output_dir, 'ERROR.err'), 'w') as err_file:
                    err_file.write(''.join(errors))

                na_volumes_error = grep_error('ERROR.err', output_dir, 'KeyNotFoundException', -1)
                na_volumes_all.extend(na_volumes_error)

            if rights:
                with open(os.path.join(output_dir, 'volume-rights.txt'), 'w') as rights_file:
                    rights_file.write(''.join(rights))

                if htrc.config.get_dataapi_access() == "true":
                    na_volumes_rights = grep_error('volume-rights.txt', output_dir, ' 3', 0)
                    na_volumes_all.extend(na_volumes_rights)

            num_na = len(na_volumes_all)

            if num_na > 0:
                with open(os.path.join(output_dir, 'volumes_not_available.txt'), 'w') as volumes_na:
                    volumes_na.write("\n".join(str(item) for item in na_volumes_all))

                if num_na < 100:
                    print("\nThe following volume ids are not available. \n Please check volumes_not_available.txt "
                          "for the complete list. ")
                    print('\n'.join(str(item) for item in na_volumes_all))
                else:
                    print("\nThere are {:,} unavailable volumes.\n Please check volumes_not_available.txt "
                          "for the "
                          "complete list. \nTo check the validity of volumes in your workset or volume id file go "
                          "to:\n "
                          "https://analytics.hathitrust.org/validateworkset \n or email us at "
                          "htrc-help@hathitrust.org "
                          "for assistance.".format(num_na))

        except socket.error:
            raise RuntimeError("HTRC Data API time out. Check your inode usage if downloading a large workset. "
                               "Contact HTRC for further help.")

    else:
        raise RuntimeError("Failed to obtain the JWT token.")
def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, skip_removed_hf, output_dir):
    """
    Strip running headers/footers from one volume's pages and save the result.

    `vol_data` is a (zip_vol_path, sorted_vol_zip_page_paths, vol_pages)
    triple as produced by `download_volumes`. When `concat` is True the page
    bodies are joined into a single <volid>.txt; otherwise each page is
    written under its original path. Unless `skip_removed_hf` is set, the
    identified headers/footers are additionally written to a CSV for user
    inspection.
    """
    zip_vol_path, sorted_vol_zip_page_paths, vol_pages = vol_data
    clean_volid = zip_vol_path[:-1]  # drop the trailing '/'

    vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size, min_similarity_ratio=hf_min_similarity)
    pages_body = (page.body for page in vol_pages)

    # Save the cleaned pages. (Refactor: the original duplicated this whole
    # if/else verbatim in both branches of `skip_removed_hf`.)
    if concat:
        with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file:
            vol_file.write('\n'.join(pages_body))
    else:
        vol_path = os.path.join(output_dir, zip_vol_path)
        os.mkdir(vol_path)
        for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body):
            with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file:
                page_file.write(page_body)

    # NOTE(review): reconstructed nesting -- the removed-header report is
    # taken to run only when skip_removed_hf is False, matching the flag name
    # and the original "save the removed headers/footers" comment; confirm.
    if skip_removed_hf:
        return

    # save the removed headers/footers for user inspection
    removed_hf = []
    for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages):
        if not (vol_page.has_header or vol_page.has_footer):
            # skip reporting pages that don't have an identified header or footer
            continue
        _, page_name = os.path.split(vol_page_path)
        page_name, _ = os.path.splitext(page_name)
        removed_hf.append({'page': page_name, 'header': vol_page.header, 'footer': vol_page.footer})

    if concat:
        removed_hf_filename = os.path.join(output_dir, clean_volid + '_removed_hf.csv')
    else:
        removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv')

    pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']).to_csv(removed_hf_filename, index=False)


def download(args):
    """
    CLI entry point: read volume ids from `args.file` (one per line) and
    delegate to `download_volumes` with a Data API config assembled from the
    parsed command-line arguments.
    """
    # extract the volume ids from the input file
    with open(args.file) as IDfile:
        volumeIDs = [line.strip() for line in IDfile]

    data_api_config = htrc.config.HtrcDataApiConfig(
        token=args.token,
        host=args.datahost,
        port=args.dataport,
        epr=args.dataepr,
        cert=args.datacert,
        key=args.datakey
    )

    return download_volumes(volumeIDs, args.output,
                            remove_headers_footers=args.remove_headers_footers or args.remove_headers_footers_and_concat,
                            concat=args.concat or args.remove_headers_footers_and_concat,
                            mets=args.mets,
                            pages=args.pages,
                            hf_window_size=args.window_size,
                            hf_min_similarity=args.min_similarity_ratio,
                            parallelism=args.parallelism,
                            batch_size=args.batch_size,
                            skip_removed_hf=args.skip_removed_hf,
                            data_api_config=data_api_config)
def get_volumes(data):
    """
    Takes a data structure in the canonical HathiTrust JSON-LD format
    and expands the dataset. Traverses the edm:gathers relation to find
    all HT volume IDs.

    Returns a list of volume IDs for use with the `htrc.metadata` and
    `htrc.volume` modules.
    """
    # Expand the document so every property is fully namespaced.
    expanded = jsonld.expand(data)

    GATHERS = 'http://www.europeana.eu/schemas/edm/gathers'
    HANDLE_PREFIX = 'http://hdl.handle.net/2027/'

    # A set de-duplicates ids when the `@graph` holds multiple worksets.
    volume_ids = set()
    for node in expanded:
        for gathered in node.get(GATHERS, []):
            volume_ids.add(gathered['@id'].replace(HANDLE_PREFIX, ''))

    # return the list representation, maintains a more consistent interface
    return list(volume_ids)


def create_jsonld(volumes, title=None, curator=None):
    """
    Takes a list of volumes and exports a JSON-LD formated workset.

    When `curator` is None the current OS user name is used.
    """
    if curator is None:
        import getpass
        curator = getpass.getuser()

    context = "http://emblematica.library.illinois.edu/test/worksetcontext.jsonld"
    GATHERS = "http://www.europeana.eu/schemas/edm/gathers"

    graph = {
        '@type': 'http://wcsa.htrc.illinois.edu/Workset',
        GATHERS: [{'@id': "http://hdl.handle.net/2027/" + vol} for vol in volumes],
        'numItems': len(volumes),
    }
    if curator:
        graph['curator'] = curator
    if title:
        graph['title'] = title

    return jsonld.compact(graph, context)


def load(filename):
    """
    Takes a filename and retrieves a list of volumes from the workset
    description. If a URL is passed, automatically uses `load_url` to resolve.
    """
    if filename.startswith('http://') or filename.startswith('https://'):
        return load_url(filename)

    with open(filename) as infile:
        workset_data = json.load(infile)

    # Retrieve and return the volumes
    return get_volumes(workset_data)
def load_url(url):
    """
    Takes a workset URL, parses it, and uses the workset retrieval API to
    fetch the data and return the volumes.
    """
    url_components = urlparse(url)
    if url_components.netloc.startswith('babel.hathitrust.org'):
        # HathiTrust Collection Builder URL: handled separately because it
        # returns CSV, not JSON-LD.
        return load_hathitrust_collection(url)
    elif (url_components.netloc.startswith('htrc.hathitrust.org')
            and url_components.path.startswith('/wsid/')):
        # HTRC workset id URL: wrap it in a call to the WS fetch service.
        base_url = 'http://acbres224.ischool.illinois.edu:8080'
        base_url += '/dcWSfetch/getDescription?id='
        base_url += url
        url = base_url
    elif (url_components.netloc.startswith('acbres224.ischool.illinois.edu')
            and url_components.path.startswith('/dcWSfetch/')):
        # copied from direct call to WS fetch, a-ok.
        pass
    else:
        raise ValueError("Invalid workset URL: {}".format(url))

    # Fetch the (possibly rewritten) URL and parse the JSON-LD payload.
    response = urlopen(url)
    data = json.loads(response.read().decode('utf-8'))

    return get_volumes(data)


def get_volumes_from_csv(data):
    """
    Retrieves the volume list from a HathiTrust collection CSV export.

    :param data: Raw bytes of a tab-separated export with an `htitem_id`
        column.
    """
    csvfile = BytesIO(data)
    reader = csv.DictReader(csvfile, delimiter='\t')
    volumes = [row['htitem_id'] for row in reader]
    csvfile.close()

    return volumes
def load_hathitrust_collection(url):
    """
    Retrieves the volume list for a given HathiTrust Collection URL.

    In contrast to `get_volumes_from_csv`, which makes the request and handles
    data, this function parses out the collection ID from a variety of
    canonical URL schemes for collections:
    - https://babel.hathitrust.org/shcgi/mb?a=listis;c=548413090
    - https://babel.hathitrust.org/cgi/mb?a=listis&c=548413090

    Raises ValueError for a non-babel URL or one without a `c=` parameter.
    """
    if not url.startswith('https://babel.hathitrust.org/'):
        raise ValueError('Invalid HathiTrust Collection URL: {}'.format(url))
    try:
        # Fix: raw string -- '\d' in a plain string is an invalid escape
        # (DeprecationWarning on modern Python).
        collection_id = re.search(r'c=(\d+)', url).group(1)
    except AttributeError:
        raise ValueError('Invalid HathiTrust Collection URL: {}'.format(url))

    # POST the download form to get the collection as tab-separated text.
    url = "https://babel.hathitrust.org/shcgi/mb"
    data = "a=download&c={}&format=text".format(collection_id)

    response = urlopen(url, bytes(data.encode('utf-8')))
    data = response.read()

    return get_volumes_from_csv(data)
def path_to_volumes(path):
    """
    Takes a path and resolves to a list of volumes.

    Accepts:

    - Plaintext file, each line is an ID
    - Directory with subfolders that are volume pages
    - JSON or JSONLD workset representation
    - HT CB or HTRC WCSA URL.

    Raises ValueError when the path matches none of the above.
    """
    if os.path.isdir(path):
        # Each directory entry is one volume; skip '.log' download artifacts.
        # ('id' renamed to avoid shadowing the builtin.)
        volumes = [vol_id for vol_id in os.listdir(path)
                   if not vol_id.endswith('.log')]
    elif (path.endswith(('json', 'jsonld'))
            or path.startswith('http://')
            or path.startswith('https://')):
        # Workset representations and URLs are handled by load().
        volumes = load(path)
    elif os.path.isfile(path):
        # Plaintext list: one volume ID per line.
        with open(path) as infile:
            volumes = [line.strip() for line in infile]
    else:
        raise ValueError("Invalid workset path.")

    return volumes
def _download_config():
    """Fetch the default .htrc config file into the user's home directory."""
    print("Downloading .htrc file...")

    _config_file_url = 'https://analytics.hathitrust.org/files/.htrc'
    _path = os.path.expanduser('~/.htrc')
    try:
        from urllib.request import urlretrieve   # Python 3
    except ImportError:
        from urllib import urlretrieve           # Python 2 fallback
    urlretrieve(_config_file_url, _path)

    print("\n")


def _install_mallet():
    """Download and unpack Mallet into ~/mallet unless it already exists."""
    mallet_path = os.path.expanduser('~/mallet')
    if not os.path.exists(mallet_path):
        print('Installing Mallet ...')
        os.makedirs(mallet_path)
        try:
            from urllib.request import urlretrieve   # Python 3
        except ImportError:
            from urllib import urlretrieve           # Python 2 fallback
        # urlretrieve with no filename downloads to a temporary file.
        mallet_zip, _ = urlretrieve(
            'http://mallet.cs.umass.edu/dist/mallet-2.0.8RC3.tar.gz')
        # Context manager closes the archive even if extraction fails
        # (the original leaked the handle on error). Supported since 2.7.
        # NOTE(review): this extracts a nested mallet-2.0.8RC3/ directory
        # inside ~/mallet — confirm downstream tools expect that layout.
        with tarfile.open(mallet_zip, "r:gz") as mallet_dir:
            mallet_dir.extractall(path=mallet_path)
        print('\n')


class PostInstallCommand(install, object):
    """`install` command that fetches the config file and Mallet post-install.

    The downloads are registered with atexit so they run only after
    setuptools has finished writing the package files.
    """
    def __init__(self, *args, **kwargs):
        super(PostInstallCommand, self).__init__(*args, **kwargs)
        atexit.register(_download_config)
        atexit.register(_install_mallet)
Language :: Python :: 3.5", 73 | "Development Status :: 4 - Beta", 74 | "Intended Audience :: Science/Research", 75 | "License :: OSI Approved :: MIT License", 76 | "Operating System :: Unix", 77 | "Topic :: Scientific/Engineering :: Information Analysis", 78 | "Topic :: Software Development :: Libraries :: Python Modules", 79 | "Topic :: Text Processing :: Linguistic", 80 | ], 81 | packages=find_packages(), 82 | install_requires=install_requires, 83 | include_package_data=True, 84 | data_files=[('htrc/mock/volumes', ['htrc/mock/volumes/example.zip']), 85 | ('htrc', ['htrc/.htrc.default'])], 86 | zip_safe=False, 87 | entry_points={ 88 | 'console_scripts': ['htrc = htrc.__main__:main'] 89 | }, 90 | test_suite="unittest2.collector", 91 | tests_require=['unittest2'], 92 | cmdclass={'install': PostInstallCommand} 93 | ) 94 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/htrc/HTRC-WorksetToolkit/3c3428d80a72a644925dc6ab2827470c8467af30/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/example.csv: -------------------------------------------------------------------------------- 1 | htitem_id title author date rights OCLC LCCN ISBN catalog_url handle_url 2 | mdp.39015050817181 Archaeology, art, and religion : new perspectives on Vijayanagar / Anila Verghese. Verghese, Anila. 2000-00-00 ic 41960671 00371223 9780195648904,0195648900 http://catalog.hathitrust.org/Record/004126407 http://hdl.handle.net/2027/mdp.39015050817181 3 | mdp.39015055436151 Hampi / Anila Verghese. Verghese, Anila. 2002-00-00 ic 47940144 2002285547 9780195654332,0195654331 http://catalog.hathitrust.org/Record/004227991 http://hdl.handle.net/2027/mdp.39015055436151 4 | mdp.39015056169157 Hampi / Anila Verghese. Verghese, Anila. 
2002-00-00 ic 47940144 2002285547 9780195654332,0195654331 http://catalog.hathitrust.org/Record/004227991 http://hdl.handle.net/2027/mdp.39015056169157 5 | mdp.39015050161697 Religious traditions at Vijayanagara, as revealed through its monuments / Anila Verghese. Verghese, Anila. 1995-00-00 ic 32893147 95903106 9788173040863,8173040869 http://catalog.hathitrust.org/Record/004054378 http://hdl.handle.net/2027/mdp.39015050161697 6 | mdp.39015042791874 Sculpture at Vijayanagara : iconography and style / Anna L. Dallapiccola, Anila Verghese. Dallapiccola, Anna L. 1944- 1998-00-00 ic 40480543 98909496 9788173042324,8173042322 http://catalog.hathitrust.org/Record/003333435 http://hdl.handle.net/2027/mdp.39015042791874 -------------------------------------------------------------------------------- /tests/data/example.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "http://emblematica.library.illinois.edu/test/worksetcontext.jsonld", 3 | "@graph" : [ 4 | { 5 | "@id": "https://babel.hathitrust.org/shcgi/mb?a=listis;c=548413090", 6 | "@type": "Workset", 7 | "title": "Sample", 8 | "curator": "jammurdo", 9 | "numItems": 5, 10 | "gathers": [ 11 | { 12 | "@id": "http://hdl.handle.net/2027/mdp.39015050817181" 13 | }, 14 | { 15 | "@id": "http://hdl.handle.net/2027/mdp.39015055436151" 16 | }, 17 | { 18 | "@id": "http://hdl.handle.net/2027/mdp.39015056169157" 19 | }, 20 | { 21 | "@id": "http://hdl.handle.net/2027/mdp.39015050161697" 22 | }, 23 | { 24 | "@id": "http://hdl.handle.net/2027/mdp.39015042791874" 25 | } 26 | ] 27 | }] 28 | } 29 | -------------------------------------------------------------------------------- /tests/data/example.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/htrc/HTRC-WorksetToolkit/3c3428d80a72a644925dc6ab2827470c8467af30/tests/data/example.zip 
# --- tests/test_download_cli.py -------------------------------------------
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()

import sys
# 'mock' is a separate package on Python 2; bundled as unittest.mock on 3.
if sys.version_info.major == 2:
    from mock import Mock, patch, PropertyMock
elif sys.version_info.major == 3:
    from unittest.mock import Mock, patch, PropertyMock

import unittest2 as unittest

import htrc.__main__
import argparse

class TestDownload(unittest.TestCase):
    # Each test patches the CLI argv (argparse reads argparse._sys.argv) with
    # a different identifier form and stubs the network-bound download() to
    # check dispatch only — no real requests are made.

    @patch('argparse._sys.argv', ['htrc', 'download', 'mdp.1234567'])
    @patch('htrc.__main__.download')
    def test_raw_volume_id(self, download_mock):
        # Bare HathiTrust volume ID.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', '001423370'])
    @patch('htrc.__main__.download')
    def test_raw_record_id(self, download_mock):
        # Bare catalog record ID.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', 'https://babel.hathitrust.org/cgi/pt?id=mdp.39015078560078;view=1up;seq=13'])
    @patch('htrc.__main__.download')
    def test_babel_url(self, download_mock):
        # Babel page-turner URL.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', 'https://hdl.handle.net/2027/mdp.39015078560078'])
    @patch('htrc.__main__.download')
    def test_handle_url(self, download_mock):
        # Handle.net persistent URL.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', 'https://catalog.hathitrust.org/Record/001423370'])
    @patch('htrc.__main__.download')
    def test_catalog_url(self, download_mock):
        # Catalog record URL.
        htrc.__main__.main()
        download_mock.assert_called_once()

    @patch('argparse._sys.argv', ['htrc', 'download', 'https://babel.hathitrust.org/shcgi/mb?a=listis;c=696632727'])
    @patch('htrc.__main__.download')
    def test_collection_builder_url(self, download_mock):
        # Collection Builder list URL.
        htrc.__main__.main()
        download_mock.assert_called_once()

# --- tests/test_htrc_lib_cli.py -------------------------------------------
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()

import sys
if sys.version_info.major == 2:
    from mock import Mock, patch, PropertyMock
elif sys.version_info.major == 3:
    from unittest.mock import Mock, patch, PropertyMock

import unittest2 as unittest

from htrc.lib.cli import *

class TestVolumes(unittest.TestCase):
    @patch('htrc.lib.cli.input')
    def test_bool_prompt(self, input_mock):
        # 'y'/'n' answers map to True/False.
        # test True
        input_mock.return_value = 'y'
        return_value = bool_prompt("Enter yes")
        self.assertEqual(return_value, True)

        input_mock.return_value = 'n'
        return_value = bool_prompt("Enter no")
        self.assertEqual(return_value, False)

        # An empty answer falls back to the supplied default.
        input_mock.return_value = ''
        return_value = bool_prompt("Enter nothing for false", default=False)
        self.assertEqual(return_value, False)

        return_value = bool_prompt("Enter nothing for true", default=True)
        self.assertEqual(return_value, True)

    @patch('htrc.lib.cli.input')
    def test_prompt_default(self, input_mock):
        # Empty input returns the default string.
        input_mock.return_value = ''
        return_value = prompt("Enter nothing for 3", default='3')
        self.assertEqual(return_value, '3')
# (file header: from __future__ import print_function / from future import
# standard_library — precedes this chunk.)
standard_library.install_aliases()

import sys
# 'mock' is a separate package on Python 2; bundled as unittest.mock on 3.
if sys.version_info.major == 2:
    from mock import Mock, patch, PropertyMock
elif sys.version_info.major == 3:
    from unittest.mock import Mock, patch, PropertyMock

from io import BytesIO # used to stream http response into zipfile.
from tempfile import NamedTemporaryFile, mkdtemp
import unittest2 as unittest

import htrc.mock.volumes

class MockResponse(BytesIO):
    # Minimal stand-in for an HTTP response: a byte stream plus status code.
    def __init__(self, data, status=200, *args, **kwargs):
        BytesIO.__init__(self, data, *args, **kwargs)
        self.status = status

class TestVolumes(unittest.TestCase):
    def setUp(self):
        # Known-good volume IDs matching the bundled example workset.
        self.test_vols = ['mdp.39015050817181', 'mdp.39015055436151',
            'mdp.39015056169157', 'mdp.39015050161697', 'mdp.39015042791874']

        self.config_path = NamedTemporaryFile(delete=False).name
        self.empty_config_path = NamedTemporaryFile(delete=False).name

        self.output_path = mkdtemp()

    def tearDown(self):
        import os, shutil
        os.remove(self.config_path)
        shutil.rmtree(self.output_path)
        # NOTE(review): self.empty_config_path is never removed — confirm
        # whether leaking the temp file is intentional.

    @patch('htrc.mock.volumes.credential_prompt')
    def test_credential_prompt(self, credential_prompt_mock):
        # configure mocks
        credential_prompt_mock.return_value = ('1234', '1234')

        # test prompts
        username, password = htrc.mock.volumes.credential_prompt(self.config_path)
        self.assertEqual(username, '1234')
        self.assertEqual(password, '1234')

        # test read — the mock backend stores nothing, so both come back None.
        username, password = htrc.mock.volumes.credentials_from_config(
            self.config_path)
        self.assertEqual(username, None)
        self.assertEqual(password, None)

    def test_get_oauth2_token(self):
        # The mock backend returns a fixed token for any credentials.
        token = htrc.mock.volumes.get_oauth2_token('1234','1234')
        self.assertEqual(token, 'a1b2c3d4e5f6')

    def test_get_volumes_and_pages(self):
        # Happy path: calls should not raise.
        htrc.mock.volumes.get_volumes('1234', self.test_vols)
        htrc.mock.volumes.get_pages('1234', self.test_vols)

    def test_get_volumes_and_pages_empty(self):
        # Requesting zero volumes is a usage error.
        with self.assertRaises(ValueError):
            htrc.mock.volumes.get_volumes('1234', [])

        with self.assertRaises(ValueError):
            htrc.mock.volumes.get_pages('1234', [])

    @patch('htrc.mock.volumes.ZipFile')
    @patch('htrc.mock.volumes.get_volumes')
    @patch('htrc.mock.volumes.get_oauth2_token')
    def test_download_volumes(self, oauth2_mock, volumes_mock, zip_mock):
        oauth2_mock.return_value = 'a1b2c3d4e5'
        volumes_mock.return_value = b''

        htrc.mock.volumes.download_volumes(self.test_vols, self.output_path,
            username='1234', password='1234')

        # test directory creation
        import shutil
        shutil.rmtree(self.output_path)
        htrc.mock.volumes.download_volumes(self.test_vols, self.output_path,
            username='1234', password='1234')

    @patch('htrc.mock.volumes.ZipFile')
    @patch('htrc.mock.volumes.get_volumes')
    @patch('htrc.mock.volumes.get_oauth2_token')
    def test_download_volumes_saved_creds(self, oauth2_mock, volumes_mock, zip_mock):
        oauth2_mock.return_value = 'a1b2c3d4e5'
        volumes_mock.return_value = b''

        # test config-based auth; only create (and clean up) ~/.htrc when the
        # machine running the tests does not already have one.
        import os, os.path
        config_path = os.path.expanduser('~')
        config_path = os.path.join(config_path, '.htrc')
        preexisting_config = os.path.exists(config_path)
        if not preexisting_config:
            htrc.mock.volumes.save_credentials('1234', '1234', config_path)

        htrc.mock.volumes.download_volumes(self.test_vols, self.output_path)

        if not preexisting_config:
            os.remove(config_path)

    def test_download(self):
        pass

# NOTE(review): running the suite at import time duplicates runs under
# `setup.py test` — consider guarding with `if __name__ == '__main__':`.
suite = unittest.TestLoader().loadTestsFromTestCase(TestVolumes)
unittest.TextTestRunner(verbosity=2).run(suite)
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()

import sys
# 'mock' is a separate package on Python 2; bundled as unittest.mock on 3.
if sys.version_info.major == 2:
    from mock import Mock, patch
elif sys.version_info.major == 3:
    from unittest.mock import Mock, patch

import unittest2 as unittest

import htrc.util.resolve as resolve

class TestResolve(unittest.TestCase):
    def test_parse_record_id(self):
        # Record IDs parse from catalog URLs and bare ID strings alike.
        id = resolve.parse_record_id('https://catalog.hathitrust.org/Record/000234911')
        self.assertEqual(id, '000234911')

        id = resolve.parse_record_id('000234911')
        self.assertEqual(id, '000234911')


        # Volume handles and arbitrary text are rejected.
        with self.assertRaises(ValueError):
            resolve.parse_record_id('https://hdl.handle.net/2027/hvd.hn3t2m')

        with self.assertRaises(ValueError):
            resolve.parse_record_id('this is not a valid URL or volume ID')

    def test_parse_truncated_record_id(self):
        # test truncated IDs
        with self.assertRaises(ValueError):
            resolve.parse_record_id('234911')

        # fix_truncated_id pads the ID with leading zeros to nine digits.
        id = resolve.parse_record_id('234911', fix_truncated_id=True)
        self.assertEqual(id, '000234911')

    def test_parse_volume_id(self):
        # Handle URLs, Babel URLs (';' or '&' separators), and bare IDs.
        id = resolve.parse_volume_id('https://hdl.handle.net/2027/uc2.ark:/13960/fk92805m1s')
        self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s')

        id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7')
        self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s')

        id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s&view=1up&seq=7')
        self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s')

        id = resolve.parse_volume_id('uc2.ark:/13960/fk92805m1s')
        self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s')

        with self.assertRaises(ValueError):
            # check if incorrect institution ID raises error
            resolve.parse_volume_id('uc42.ark:/13960/fk92805m1s')

    @patch('htrc.util.resolve.urlopen')
    def test_volume_id_to_record_id(self, urlopen_mock):
        # The resolver follows a redirect; geturl() supplies the final URL.
        urlopen_mock.return_value.geturl.return_value =\
            'https://catalog.hathitrust.org/Record/000850926'
        record_id = resolve.volume_id_to_record_id('uc2.ark:/13960/fk92805m1s')

        self.assertEqual(record_id, '000850926')


    @patch('htrc.util.resolve.urlopen')
    def test_record_id_to_volume_ids(self, urlopen_mock):
        # Canned brief-bib API JSON response containing a single item.
        urlopen_mock.return_value.read.return_value =\
            b'{"items":[{"orig":"Harvard University","fromRecord":"000850926","htid":"hvd.hn3t2m","itemURL":"https:\/\/hdl.handle.net\/2027\/hvd.hn3t2m","rightsCode":"pd","lastUpdate":"20130803","enumcron":false,"usRightsString":"Full view"}]}'.decode('utf-8')
        ids = resolve.record_id_to_volume_ids('000234911')
        self.assertEqual(ids, ['hvd.hn3t2m'])
# (file header imports — __future__, future.standard_library,
# mock/unittest.mock, io.BytesIO — precede this chunk.)
from tempfile import NamedTemporaryFile, mkdtemp
import unittest2 as unittest

import htrc.volumes
import htrc.config

class MockResponse(BytesIO):
    # Minimal stand-in for an http.client response: byte stream + status code.
    def __init__(self, data, status=200, *args, **kwargs):
        BytesIO.__init__(self, data, *args, **kwargs)
        self.status = status

class TestVolumes(unittest.TestCase):
    def setUp(self):
        # Known-good volume IDs matching the bundled example workset.
        self.test_vols = ['mdp.39015050817181', 'mdp.39015055436151',
            'mdp.39015056169157', 'mdp.39015050161697', 'mdp.39015042791874']

        self.config_path = NamedTemporaryFile(delete=False).name
        self.empty_config_path = NamedTemporaryFile(delete=False).name

        self.output_path = mkdtemp()

    def tearDown(self):
        import os, shutil
        os.remove(self.config_path)
        shutil.rmtree(self.output_path)


    # @patch('htrc.volumes.http.client.HTTPSConnection')
    # def test_get_oauth2_token(self, https_mock):
    #     response_mock = Mock(status=200, return_value=b'')
    #     response_mock.read.return_value =\
    #         '{"access_token": "a1b2c3d4e5f6"}'.encode('utf8')
    #     https_mock.return_value.getresponse.return_value = response_mock
    #
    #     token = htrc.volumes.get_oauth2_token('1234','1234')
    #     self.assertEqual(token, 'a1b2c3d4e5f6')
    #
    # @patch('htrc.volumes.http.client.HTTPSConnection')
    # def test_get_oauth2_token_error(self, https_mock):
    #     response_mock = Mock(status=500)
    #     https_mock.return_value.getresponse.return_value = response_mock
    #
    #     with self.assertRaises(EnvironmentError):
    #         token = htrc.volumes.get_oauth2_token('1234','1234')

    @patch('htrc.volumes.http.client.HTTPSConnection')
    def test_get_volumes_and_pages(self, https_mock):
        # A 200 response with an empty body exercises the happy path.
        response_mock = Mock(status=200)
        response_mock.read.return_value =\
            ''.encode('utf8')
        https_mock.return_value.getresponse.return_value = response_mock
        data_api_config = htrc.config.HtrcDataApiConfig(
            token='1234',
            host='data-host',
            port=443,
            epr='/',
            cert='/home/client-certs/client.pem',
            key='/home/client-certs/client.pem'
        )

        htrc.volumes.get_volumes(data_api_config, self.test_vols)
        htrc.volumes.get_pages(data_api_config, self.test_vols)

    @patch('htrc.volumes.http.client.HTTPSConnection')
    def test_get_volumes_and_pages_error(self, https_mock):
        # A 500 response must surface as EnvironmentError.
        response_mock = Mock(status=500)
        https_mock.return_value.getresponse.return_value = response_mock

        data_api_config = htrc.config.HtrcDataApiConfig(
            token='1234',
            host='data-host',
            port=443,
            epr='/',
            cert='/home/client-certs/client.pem',
            key='/home/client-certs/client.pem'
        )

        with self.assertRaises(EnvironmentError):
            htrc.volumes.get_volumes(data_api_config, self.test_vols)

        with self.assertRaises(EnvironmentError):
            htrc.volumes.get_pages(data_api_config, self.test_vols)

    def test_get_volumes_and_pages_empty(self):
        # Requesting zero volumes is a usage error, not an HTTP error.
        data_api_config = htrc.config.HtrcDataApiConfig(
            token='1234',
            host='data-host',
            port=443,
            epr='/',
            cert='/home/client-certs/client.pem',
            key='/home/client-certs/client.pem'
        )

        with self.assertRaises(ValueError):
            htrc.volumes.get_volumes(data_api_config, [])

        with self.assertRaises(ValueError):
            htrc.volumes.get_pages(data_api_config, [])

    @patch('htrc.volumes.ZipFile')
    @patch('htrc.volumes.get_volumes')
    # test is looking for oauth2 tokens. looks like we made a jump to jwt but not seeing tests for those.
    # revised code to point towards mock.volumes.get_oauth2_token as a hot fix - 5/22 dan
    @patch('htrc.mock.volumes.get_oauth2_token')
    @patch('htrc.volumes.http.client.HTTPSConnection')
    def test_download_volumes(self, https_mock, oauth2_mock, volumes_mock,
        zip_mock):
        response_mock = Mock(status=200)
        https_mock.return_value.getresponse.return_value = response_mock
        oauth2_mock.return_value = 'a1b2c3d4e5'
        volumes_mock.return_value = b''

        data_api_config = htrc.config.HtrcDataApiConfig(
            token='1234',
            host='data-host',
            port=443,
            epr='/',
            cert='/home/client-certs/client.pem',
            key='/home/client-certs/client.pem'
        )

        htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config)

        # test directory creation
        import shutil
        shutil.rmtree(self.output_path)
        htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config)

    # TODO: Fix this test for case where config file exists, but creds not set
    """
    @patch('htrc.volumes.ZipFile')
    @patch('htrc.volumes.get_volumes')
    @patch('htrc.volumes.get_oauth2_token')
    @patch('htrc.volumes.http.client.HTTPSConnection')
    def test_download_volumes_saved_creds(self, https_mock, oauth2_mock, volumes_mock,
        zip_mock):
        response_mock = Mock(status=200)
        https_mock.return_value.getresponse.return_value = response_mock
        oauth2_mock.return_value = 'a1b2c3d4e5'
        volumes_mock.return_value = b''

        # test config-based auth
        import os, os.path
        config_path = os.path.expanduser('~')
        config_path = os.path.join(config_path, '.htrc')
        preexisting_config = os.path.exists(config_path)
        if not preexisting_config:
            htrc.config.save_credentials('1234', '1234', config_path)

        htrc.volumes.download_volumes(self.test_vols, self.output_path)

        if not preexisting_config:
            os.remove(config_path)
    """

    def test_download(self):
        pass


# NOTE(review): running the suite at import time duplicates runs under
# `setup.py test` — consider guarding with `if __name__ == '__main__':`.
suite = unittest.TestLoader().loadTestsFromTestCase(TestVolumes)
unittest.TextTestRunner(verbosity=2).run(suite)
    def test_load_file(self):
        # Loading the example JSON-LD workset from disk yields the volumes.
        vols = htrc.workset.load(self.example_file)

        # check that each volume correctly parsed out of JSONLD data
        for vol in self.test_vols:
            self.assertIn(vol, vols)

    def test_get_volumes_from_csv(self):
        # The tab-separated collection export parses to the same volumes.
        vols = htrc.workset.get_volumes_from_csv(self.csv)

        # check that each volume correctly parsed out of JSONLD data
        for vol in self.test_vols:
            self.assertIn(vol, vols)

    @patch('htrc.workset.urlopen')
    def test_load_url_hathitrust(self, urlopen_mock):
        ht_url = "https://babel.hathitrust.org/cgi/mb?a=listis&c=548413090"

        # test the default URL with a mock
        response_mock = Mock()
        urlopen_mock.return_value = response_mock
        response_mock.read.return_value = self.csv

        vols = htrc.workset.load_url(ht_url)

        # check that each volume correctly parsed out of CSV data
        for vol in self.test_vols:
            self.assertIn(vol, vols)


    @patch('htrc.workset.urlopen')
    def test_load_url_htrc(self, urlopen_mock):
        htrc_url = 'https://htrc.hathitrust.org/wsid/123456'

        # test the default URL with a mock
        response_mock = Mock()
        urlopen_mock.return_value = response_mock
        response_mock.read.return_value = json.dumps(self.json).encode('utf-8')

        vols = htrc.workset.load_url(htrc_url)

        # check that each volume correctly parsed out of JSONLD data
        for vol in self.test_vols:
            self.assertIn(vol, vols)

        # Also test a direct URL from the triple store service.
        # Since this is using a mock, it doesn't matter if the service
        # is up or down
        htrc_url2 = ('http://acbres224.ischool.illinois.edu:8080/' +
            'dcWSfetch/getItems?id=http://htrc.hathitrust.org/wsid/189324102')
        vols = htrc.workset.load_url(htrc_url2)

        # check that each volume correctly parsed out of JSONLD data
        for vol in self.test_vols:
            self.assertIn(vol, vols)

    @patch('htrc.workset.urlopen')
    def test_load_url_error(self, urlopen_mock):
        # Non-URL strings must be rejected before any network access.
        invalid_url = 'blahblahblah'

        with self.assertRaises(ValueError):
            htrc.workset.load_url(invalid_url)
# Template for fake volume directory names: htrc.test0, htrc.test1, ...
VOL_NAME = 'htrc.test{}'
# Page files use HathiTrust-style zero-padded sequence numbers.
PAGE_FILENAME = '{0:08d}.txt'

def generate_file(filename, N=4, separator='\n\n'):
    """Write N paragraphs of lorem-ipsum text to *filename*."""
    with open(filename, 'w') as outfile:
        # generate_paragraphs yields (sentences, words, text) triples;
        # only the text is used.
        for _, _, text in loremipsum.generate_paragraphs(N):
            outfile.write(text + separator)

def generate_volumes(num_volumes, num_pages=5):
    """Create *num_volumes* fake volume directories of lorem-ipsum pages.

    *num_pages* may be a single int (same page count for every volume) or
    a sequence supplying one count per volume.

    Raises ValueError when a num_pages sequence does not match num_volumes.
    """
    if isinstance(num_pages, int):
        num_pages = itertools.repeat(num_pages, num_volumes)
    elif len(num_pages) != num_volumes:
        raise ValueError("len(num_pages) != num_volumes")

    for i, pages in enumerate(num_pages):
        vol_name = VOL_NAME.format(i)
        if not os.path.exists(vol_name):
            os.makedirs(vol_name)

        # Page numbering is 1-based: 00000001.txt, 00000002.txt, ...
        # (replaces the old `for page in range(pages): page = page + 1`).
        for page in range(1, pages + 1):
            filename = os.path.join(vol_name, PAGE_FILENAME.format(page))
            generate_file(filename)

if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument('vols', type=int)
    parser.add_argument('pages', type=int)
    args = parser.parse_args()

    generate_volumes(args.vols, args.pages)