├── .appveyor.yml
├── .editorconfig
├── .github
    ├── FUNDING.yml
    └── workflows
    │   └── test.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── iscc_cli
    ├── __init__.py
    ├── audio_id.py
    ├── cli.py
    ├── commands
    │   ├── __init__.py
    │   ├── batch.py
    │   ├── dump.py
    │   ├── gen.py
    │   ├── info.py
    │   ├── init.py
    │   ├── sim.py
    │   ├── test.py
    │   └── web.py
    ├── const.py
    ├── datatypes.py
    ├── ffmpeg.py
    ├── fpcalc.py
    ├── lib.py
    ├── mediatype.py
    ├── tika
    │   ├── __init__.py
    │   ├── config.py
    │   ├── detector.py
    │   ├── language.py
    │   ├── parser.py
    │   ├── tika.py
    │   ├── translate.py
    │   └── unpack.py
    ├── uread.py
    ├── utils.py
    └── video_id.py
├── poetry.lock
├── pyproject.toml
├── tests
    ├── __init__.py
    ├── audio
    │   ├── demo.aif
    │   ├── demo.mp3
    │   ├── demo.ogg
    │   └── demo.wav
    ├── batch
    │   ├── demo.doc
    │   ├── demo.pdf
    │   ├── empty.txt
    │   └── subdir
    │   │   └── demo.png
    ├── conftest.py
    ├── image
    │   ├── demo.bmp
    │   ├── demo.gif
    │   ├── demo.jpg
    │   ├── demo.png
    │   ├── demo.psd
    │   └── demo.tif
    ├── test_0_pre_init.py
    ├── test_1_init.py
    ├── test_audio_id.py
    ├── test_batch.py
    ├── test_cli.py
    ├── test_dump.py
    ├── test_ffmpeg.py
    ├── test_formats.py
    ├── test_fpcalc.py
    ├── test_gen.py
    ├── test_info.py
    ├── test_lib.py
    ├── test_sim.py
    ├── test_test.py
    ├── test_utils.py
    ├── test_video_id.py
    ├── test_web.py
    ├── text
    │   ├── demo.doc
    │   ├── demo.docx
    │   ├── demo.epub
    │   ├── demo.html
    │   ├── demo.json
    │   ├── demo.md
    │   ├── demo.mobi
    │   ├── demo.odt
    │   ├── demo.pdf
    │   ├── demo.rtf
    │   ├── demo.sqlite
    │   ├── demo.txt
    │   ├── demo.xhtml
    │   ├── demo.xls
    │   ├── demo.xlsx
    │   └── demo.xml
    └── video
    │   ├── build_videos.py
    │   ├── demo.gif
    │   └── master.3gp
└── winbuild.bat


/.appveyor.yml:
--------------------------------------------------------------------------------
 1 | build: false
 2 | 
 3 | environment:
 4 |   PYTHONIOENCODING: "UTF-8"
 5 | 
 6 |   matrix:
 7 |     - PYTHON: "C:/Python36-x64"
 8 |     - PYTHON: "C:/Python37-x64"
 9 |     - PYTHON: "C:/Python38-x64"
10 | 
11 | 
12 | install:
13 |   - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
14 | 
15 |   # Installing Poetry
16 |   - "curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py -o get-poetry.py"
17 |   - "python get-poetry.py --yes"
18 |   - "SET PATH=%USERPROFILE%\\.poetry\\bin;%PATH%"
19 | 
20 |   # Install dependencies
21 |   - "poetry install -v"
22 | 
23 | 
24 | test_script:
25 |   - "poetry run pytest -v --terminate tests"
26 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # https://editorconfig.org/
 2 | 
 3 | root = true
 4 | 
 5 | [*]
 6 | indent_style = space
 7 | indent_size = 4
 8 | insert_final_newline = true
 9 | trim_trailing_whitespace = true
10 | end_of_line = lf
11 | charset = utf-8
12 | 
13 | # Docstrings and comments use max_line_length = 79
14 | [*.py]
15 | max_line_length = 88
16 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: titusz
2 | custom: "https://iscc.foundation/support/"
3 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   Tests:
 7 | 
 8 |     name: ${{ matrix.os }} / ${{ matrix.python-version }}
 9 |     runs-on: ${{ matrix.os }}-latest
10 |     strategy:
11 |       matrix:
12 |         os: [Ubuntu, MacOS, Windows]
13 |         python-version: [3.6, 3.7, 3.8]
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 | 
18 |     - name: Set up Python ${{ matrix.python-version }}
19 |       uses: actions/setup-python@v1
20 |       with:
21 |         python-version: ${{ matrix.python-version }}
22 | 
23 |     - name: Install ffmpeg
24 |       if: runner.os == 'Linux' &&  matrix.python-version == '3.6'
25 |       run: |
26 |         sudo apt-get update
27 |         sudo apt-get install ffmpeg
28 | 
29 |     - name: Update pip
30 |       if: runner.os == 'Windows'  # was never true before: the OS name was misspelled
31 |       run: python -m pip install -U pip
32 | 
33 |     - name: Install poetry
34 |       run: pip install poetry
35 | 
36 |     - name: Install dependencies
37 |       run: poetry install
38 | 
39 |     - name: Run pytest
40 |       run: poetry run pytest -q tests
41 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # IntelliJ
107 | .idea
108 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | cache:
 4 |   pip: true
 5 |   directories:
 6 |     - "$HOME/.cache/pypoetry"
 7 | 
 8 | install:
 9 |   - curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py -o get-poetry.py
10 |   - python get-poetry.py --yes
11 |   - source $HOME/.poetry/env
12 |   - poetry install
13 | 
14 | script: pytest -q tests/
15 | 
16 | matrix:
17 |   include:
18 |     - python: "3.6"
19 |     - python: "3.7"
20 |       dist: xenial
21 |     - python: "3.8"
22 |       dist: bionic
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019-2020 Titusz Pan
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # iscc-cli  - Command Line Tool
  2 | 
  3 | [![Version](https://img.shields.io/pypi/v/iscc-cli.svg)](https://pypi.python.org/pypi/iscc-cli/)
  4 | [![Downloads](https://pepy.tech/badge/iscc-cli)](https://pepy.tech/project/iscc-cli)
  5 | 
  6 | > [!CAUTION]
  7 | > This implementation is currently not up to date and does **NOT** generate valid ISCCs.
  8 | 
  9 | > A command line tool that creates **ISCC Codes** for digital media files based on the [reference implementation](<https://github.com/iscc/iscc-specs>).
 10 | 
 11 | ## Table of Contents
 12 | 
 13 | - [Background](#background)
 14 | - [Install](#install)
 15 | - [Usage](#usage)
 16 | - [Maintainers](#maintainers)
 17 | - [Contributing](#contributing)
 18 | - [License](#license)
 19 | 
 20 | ## Background
 21 | 
 22 | The **International Standard Content Code** is a proposal for an [open standard](https://en.wikipedia.org/wiki/Open_standard) for decentralized content identification. **ISCC Codes** are generated algorithmically **from the content itself** and offer many powerful features like content similarity clustering and partial integrity checks. If you want to learn more about the **ISCC** please check out https://iscc.codes.
 23 | 
 24 | This tool offers an easy way to generate ISCC codes from the command line. It supports content extraction via [Apache Tika](https://tika.apache.org/) and uses the [ISCC reference implementation](https://github.com/iscc/iscc-specs).
 25 | 
 26 | 
 27 | ### Supported Media File Types
 28 | 
 29 | #### Text
 30 | 
 31 | doc, docx, epub, html, odt, pdf, rtf, txt, xml, ibooks, md, xls, mobi ...
 32 | 
 33 | 
 34 | #### Image
 35 | 
 36 | gif, jpg, png, tif, bmp, psd, eps ...
 37 | 
 38 | **Note**: EPS (postscript) support requires [Ghostscript](https://www.ghostscript.com/download.html) to be installed on your system and available on your PATH. (Make sure you can run `gs` from your command line.)
 39 | 
 40 | 
 41 | #### Audio
 42 | 
 43 | aif, mp3, ogg, wav ...
 44 | 
 45 | 
 46 | **Note**: Support for the Audio-ID is experimental and not yet part of the [specification](https://iscc.codes/specification/)
 47 | 
 48 | 
 49 | #### Video
 50 | 
 51 | 3gp, 3g2, asf, avi, flv, gif, mpg, mp4, mkv, mov, ogv, webm, wmv ...
 52 | 
 53 | 
 54 | **Note**: Support for the Video-ID is experimental and not yet part of the [specification](https://iscc.codes/specification/)
 55 | 
 56 | ## Requirements
 57 | 
 58 | | NOTE: Requires JAVA to be installed and on your path! |
 59 | | --- |
 60 | 
 61 | **iscc-cli** is tested on Linux, Windows, and macOS with Python 3.6/3.7/3.8.
 62 | 
 63 | This tool depends on [tika-python](https://github.com/chrismattmann/tika-python).  [Tika](https://tika.apache.org/) is used for extracting metadata and content from media files before generating ISCC Codes. On first execution of the `iscc` command line tool it will automatically download and launch the Java Tika Server in the background (this may take some time). Consecutive runs will access the existing Tika instance. You may explicitly pre-launch the Tika server with `$ iscc init`
 64 | 
 65 | ## Install
 66 | 
 67 | The ISCC command line tool is published with the package name `iscc-cli` on the [Python Package Index](https://pypi.python.org/pypi/iscc-cli) and can be installed with pip:
 68 | 
 69 | ```console
 70 | $ pip3 install iscc-cli
 71 | ```
 72 | 
 73 | Self-contained Windows binary executables are available for download at:
 74 | <https://github.com/iscc/iscc-cli/releases/>
 75 | 
 76 | ## Usage
 77 | 
 78 | ### Getting Help
 79 | 
 80 | Show help overview by calling `iscc` without any arguments:
 81 | 
 82 | ```console
 83 | $ iscc
 84 | Usage: iscc [OPTIONS] COMMAND [ARGS]...
 85 | 
 86 | Options:
 87 |   --version  Show the version and exit.
 88 |   --help     Show this message and exit.
 89 | 
 90 | Commands:
 91 |   gen*   Generate ISCC Code for FILE.
 92 |   batch  Create ISCC Codes for all files in PATH.
 93 |   dump   Dump Tika extraction results for PATH (file or url path).
 94 |   info   Show information about environment.
 95 |   init   Inititalize and check environment.
 96 |   sim    Estimate Similarity of ISCC Codes A & B.
 97 |   test   Test conformance with latest reference data.
 98 |   web    Generate ISCC Code from URL.
 99 | ```
100 | 
101 | Get help for a specific command by entering `iscc <command>`:
102 | 
103 | ```console
104 | $ iscc gen
105 | Usage: iscc gen [OPTIONS] FILE
106 | 
107 |   Generate ISCC Code for FILE.
108 | 
109 | Options:
110 |   -g, --guess       Guess title (first line of text).
111 |   -t, --title TEXT  Title for Meta-ID creation.
112 |   -e, --extra TEXT  Extra text for Meta-ID creation.
113 |   -v, --verbose     Enables verbose mode.
114 |   -h, --help        Show this message and exit.
115 | ```
116 | 
117 | ### Generating ISCC Codes
118 | 
119 | #### For local files
120 | 
121 | The `gen` command generates an ISCC Code for a single file:
122 | 
123 | ```console
124 | $ iscc gen tests/image/demo.jpg
125 | ISCC:CC1GG3hSxtbWU-CYDfTq7Qc7Fre-CDYkLqqmQJaQk-CRAPu5NwQgAhv
126 | ```
127 | 
128 | The `gen` command is the default, so you can skip it and simply run `$ iscc tests/image/demo.jpg`
129 | 
130 | To get a more detailed result use the `-v` (`--verbose`) option:
131 | 
132 | ```console
133 | $ iscc -v tests/image/demo.jpg
134 | ISCC:CC1GG3hSxtbWU-CYDfTq7Qc7Fre-CDYkLqqmQJaQk-CRAPu5NwQgAhv
135 | Norm Title: concentrated cat
136 | Tophash:    7a8d0c513142c45f417e761355bf71f11ad61d783cd8958ffc0712d00224a4d0
137 | Filepath:   tests/image/demo.jpg
138 | GMT:        image
139 | ```
140 | 
141 | See `iscc batch` for help on how to generate ISCC codes for multiple files at once.
142 | 
143 | #### For web urls
144 | 
145 | The `web` command allows you to create ISCC codes from URLs:
146 | 
147 | ```console
148 | $ iscc web https://iscc.foundation/news/images/lib-arch-ottawa.jpg
149 | ISCC:CCbUCUSqQpyJo-CYaHPGcucqwe3-CDt4nQptEGP6M-CRestDoG7xZFy
150 | ```
151 | 
152 | ### Similarity of ISCC Codes
153 | 
154 | The `sim` command computes estimated similarity of two ISCC Codes:
155 | 
156 | ```console
157 | $ iscc sim CCUcKwdQc1jUM CCjMmrCsKWu1D
158 | Estimated Similarity of Meta-ID: 78.00 % (56 of 64 bits match)
159 | ```
160 | 
161 | You may also compare full four-component ISCC Codes.
162 | 
163 | ### Using from your python code
164 | 
165 | While this package is not built to be used as a library, some of the high level commands to generate ISCC Codes are exposed as vanilla python functions:
166 | 
167 | ```python
168 | from iscc_cli import lib
169 | from pprint import pprint
170 | 
171 | pprint(lib.iscc_from_url("https://iscc.foundation/news/images/lib-arch-ottawa.jpg"))
172 | 
173 | {'gmt': 'image',
174 |  'iscc': 'CCbUCUSqQpyJo-CYaHPGcucqwe3-CDt4nQptEGP6M-CRestDoG7xZFy',
175 |  'norm_title': 'library and archives canada ottawa',
176 |  'tophash': 'e264cc07209bfaecc291f97c7f8765229ce4c1d36ac6901c477e05b2422eea3e'}
177 | ```
178 | 
179 | ## Maintainers
180 | 
181 | [@titusz](https://github.com/titusz)
182 | 
183 | ## Contributing
184 | 
185 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
186 | 
187 | Please make sure to update tests as appropriate.
188 | 
189 | You may also want to join our developer chat on Telegram at <https://t.me/iscc_dev>.
190 | 
191 | ## Change Log
192 | 
193 | ### [0.9.12] - 2021-07-16
194 | - Update to custom mediatype detection (without Tika requirement)
195 | - Update dependencies
196 | 
197 | ### [0.9.11] - 2020-06-12
198 | - Update dependencies
199 | - Remove support for creating ISCC codes from youtube urls
200 | 
201 | ### [0.9.10] - 2020-05-19
202 | - Fixed issue with mime-type detection
203 | - Changed wording of similarity output
204 | - Added CSV-compatible output for batch command
205 | - Added debug option for batch command
206 | - Updated dependencies
207 | 
208 | ### [0.9.9] - 2020-05-18
209 | - Fixed issue with tika & macOS
210 | - Added macOS ci testing
211 | - Updated dependencies
212 | 
213 | ### [0.9.8] - 2020-05-13
214 | - Updated Content-ID-Audio for robustness against transcoding (breaking change)
215 | - Changed similarity calculation to match with web demo
216 | - Fixed bug in mime-type detection
217 | - Updated dependencies
218 | 
219 | ### [0.9.7] - 2020-05-01
220 | - Add support for flac and opus audio formats
221 | - Update dependencies
222 | 
223 | ### [0.9.6] - 2020-04-24
224 | - Support urls with dump command
225 | - Updated tika 1.24 and fpcalc 1.50
226 | - Use filename for meta-id as last resort
227 | - Switch to signed audio fingerprint (breaking change)
228 | - Bugfixes and stability improvements
229 | 
230 | ### [0.9.5] - 2020-03-02
231 | - Support mobi7
232 | - Support mobi print replica
233 | - Support mobi with web command
234 | 
235 | ### [0.9.4] - 2020-03-02
236 | - Add experimental support for mobi files
237 | 
238 | ### [0.9.3] - 2020-02-18
239 | - Add support for XHTML
240 | - Fix error on unsupported media types
241 | 
242 | ### [0.9.2] - 2020-01-30
243 | - Add support for bmp, psd, xls, xlsx
244 | - Add tika server live testing
245 | - Fix error with title guess on image files
246 | 
247 | ### [0.9.1] - 2020-01-05
248 | - Fix issue with APP_DIR creation
249 | 
250 | ### [0.9.0] - 2020-01-05
251 | - Add experimental support for Video-ID
252 | - Add special handling of YouTube URLs
253 | - Add support for more Media Types (try & error)
254 | - Add support for Python 3.8
255 | - Remove support for Python 3.5
256 | 
257 | ### [0.8.2] - 2019-12-22
258 | - Add new `test` command for conformance testing
259 | - Add support for .md (Markdown) files
260 | - Update to ISCC v1.0.5
261 | - Update to Apache Tika 1.23
262 | - Fix issue with non-conformant Meta-ID
263 | 
264 | ### [0.8.1] - 2019-12-13
265 | - Add support for tif files
266 | - Add support for eps files
267 | - Set application directory to non-roaming path
268 | 
269 | ### [0.8.0] - 2019-11-23
270 | - Add new `dump` command (dumps extraction results)
271 | - Add support for iBooks files
272 | - Fix error with tika 1.22 dependency
273 | - Store tika server in non-volatile storage
274 | 
275 | ### [0.7.0] - 2019-09-12
276 | - Expose commands as python API
277 | - Fix title guessing bug
278 | 
279 | ### [0.6.0] - 2019-06-11
280 | 
281 | - Added new `web` command (creates ISCC Codes for URLs)
282 | 
283 | ### [0.5.0] - 2019-06-06
284 | 
285 | - Added experimental support for aif, mp3, ogg, wav
286 | - More verbose batch output
287 | - Fix batch output default Meta-ID
288 | 
289 | ### [0.4.0] - 2019-06-03
290 | 
291 | - Added support for html, odt, txt, xml, gif
292 | - Added optional guessing of title (first line of text)
293 | - Added new `info` command
294 | - Fixed wrong detection of identical Instance-ID
295 | 
296 | ### [0.3.0] - 2019-06-01
297 | 
298 | - Add `sim` command similarity comparison of ISCC Codes
299 | 
300 | ### [0.2.0] - 2019-05-31
301 | 
302 | - Add support for doc, docx and rtf documents
303 | - Update to ISCC 1.0.4 (fixes whitespace bug)
304 | 
305 | ### [0.1.0] - 2019-05-31
306 | 
307 | - Basic ISCC Code creation
308 | - Supported file types: jpg, png, pdf, epub
309 | 
310 | ## License
311 | 
312 | MIT © 2019-2021 Titusz Pan
313 | 
314 | 


--------------------------------------------------------------------------------
/iscc_cli/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import click
 3 | import iscc_cli
 4 | 
 5 | 
 6 | __version__ = "0.9.12"
 7 | APP_NAME = "iscc-cli"
 8 | APP_DIR = click.get_app_dir(APP_NAME, roaming=False)
 9 | os.makedirs(iscc_cli.APP_DIR, exist_ok=True)
10 | os.environ["TIKA_PATH"] = APP_DIR
11 | os.environ["TIKA_LOG_PATH"] = APP_DIR
12 | os.environ["TIKA_STARTUP_MAX_RETRY"] = "8"
13 | os.environ["LOGURU_AUTOINIT"] = "False"
14 | 
15 | 
16 | from iscc_cli.tika import tika
17 | 
18 | tika.log.disabled = True
19 | 


--------------------------------------------------------------------------------
/iscc_cli/audio_id.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """Experimental support for Audio-ID."""
 3 | import json
 4 | import subprocess
 5 | import iscc
 6 | from iscc_cli import fpcalc
 7 | 
 8 | 
 9 | def content_id_audio(features, partial=False):
10 |     digests = []
11 |     for a, b in iscc.sliding_window(features, 2):
12 |         digest = a.to_bytes(4, "big", signed=True) + b.to_bytes(4, "big", signed=True)
13 |         digests.append(digest)
14 |     shash_digest = iscc.similarity_hash(digests)
15 |     if partial:
16 |         content_id_audio_digest = iscc.HEAD_CID_A_PCF + shash_digest
17 |     else:
18 |         content_id_audio_digest = iscc.HEAD_CID_A + shash_digest
19 |     return iscc.encode(content_id_audio_digest)
20 | 
21 | 
22 | def get_chroma_vector(file):
23 |     """Returns 32-bit (4 byte) integers as features"""
24 | 
25 |     if hasattr(file, "read"):
26 |         file.seek(0)
27 |         cmd = [fpcalc.exe_path(), "-raw", "-json", "-signed", "-"]
28 |         res = subprocess.run(cmd, stdout=subprocess.PIPE, input=file.read())
29 |     else:
30 |         cmd = [fpcalc.exe_path(), "-raw", "-json", "-signed", file]
31 |         res = subprocess.run(cmd, stdout=subprocess.PIPE)
32 | 
33 |     vec = json.loads(res.stdout.decode("utf-8"))["fingerprint"]
34 |     return vec
35 | 


--------------------------------------------------------------------------------
/iscc_cli/cli.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import click
 3 | from iscc_cli import __version__
 4 | from iscc_cli.commands import init, gen, batch, sim, info, web, dump, test
 5 | from click_default_group import DefaultGroup
 6 | 
 7 | 
 8 | @click.group(cls=DefaultGroup, default="gen", default_if_no_args=False)
 9 | @click.version_option(version=__version__, message="ISCC CLI - %(version)s")
10 | def cli():
11 |     pass  # the group body is empty; the work is done by the subcommands added below
12 | 
13 | # Attach every subcommand to the group ("gen" is the group's configured default).
14 | cli.add_command(init.init)
15 | cli.add_command(gen.gen)
16 | cli.add_command(batch.batch)
17 | cli.add_command(web.web)
18 | cli.add_command(sim.sim)
19 | cli.add_command(info.info)
20 | cli.add_command(dump.dump)
21 | cli.add_command(test.test)
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     cli()
26 | 


--------------------------------------------------------------------------------
/iscc_cli/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/iscc_cli/commands/__init__.py


--------------------------------------------------------------------------------
/iscc_cli/commands/batch.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os
  3 | import shutil
  4 | import sys
  5 | from os.path import basename, abspath
  6 | import click
  7 | import mobi
  8 | from iscc_cli.tika import parser
  9 | import iscc
 10 | from iscc_cli import video_id
 11 | from iscc_cli.const import SUPPORTED_MIME_TYPES, GMT
 12 | from iscc_cli.utils import get_files, mime_to_gmt, get_title, DefaultHelp
 13 | from iscc_cli import audio_id, fpcalc
 14 | from loguru import logger as log
 15 | from iscc_cli.mediatype import mime_guess, mime_clean
 16 | 
 17 | 
 18 | @click.command(cls=DefaultHelp)
 19 | @click.argument("path", type=click.Path(exists=True))
 20 | @click.option("-r", "--recursive", is_flag=True, help="Recurse into subdirectories.")
 21 | @click.option(
 22 |     "-g",
 23 |     "--guess",
 24 |     is_flag=True,
 25 |     default=False,
 26 |     help="Guess title (first line of text).",
 27 |     show_default=True,
 28 | )
 29 | @click.option(
 30 |     "-d",
 31 |     "--debug",
 32 |     is_flag=True,
 33 |     default=False,
 34 |     help="Show debug output",
 35 |     show_default=True,
 36 | )
 37 | def batch(path, recursive, guess, debug):
 38 |     """Create ISCC Codes for all files in PATH.
 39 | 
 40 |     Example:
 41 | 
 42 |       $ iscc batch ~/Documents
 43 | 
 44 |     """
 45 |     if debug:
 46 |         log.add(sys.stdout)
 47 | 
 48 |     results = []
 49 |     for f in get_files(path, recursive=recursive):
 50 |         filesize = os.path.getsize(f)
 51 |         if not filesize:
 52 |             msg = "Cannot proccess empty file: {}".format(f)
 53 |             log.warning(msg)
 54 |             continue
 55 | 
 56 |         media_type = mime_clean(mime_guess(f))
 57 |         if media_type not in SUPPORTED_MIME_TYPES:
 58 |             fname = basename(f)
 59 |             msg = "Unsupported file {} with mime type: {},,,,".format(fname, media_type)
 60 |             log.warning(msg)
 61 |             continue
 62 | 
 63 |         if media_type == "application/x-mobipocket-ebook":
 64 |             try:
 65 |                 tempdir, epub_filepath = mobi.extract(f)
 66 |                 tika_result = parser.from_file(epub_filepath)
 67 |                 shutil.rmtree(tempdir)
 68 |             except Exception as e:
 69 |                 msg = "Error with mobi extraction %s"
 70 |                 log.error(msg)
 71 |                 continue
 72 |         else:
 73 |             tika_result = parser.from_file(f)
 74 | 
 75 |         title = get_title(tika_result, guess=guess, uri=f)
 76 | 
 77 |         mid, norm_title, _ = iscc.meta_id(title)
 78 |         gmt = mime_to_gmt(media_type, file_path=f)
 79 |         if gmt == GMT.IMAGE:
 80 |             try:
 81 |                 cid = iscc.content_id_image(f)
 82 |             except Exception as e:
 83 |                 msg = "Clould not proccess image: {} ({})".format(f, e)
 84 |                 log.error(msg)
 85 |                 continue
 86 | 
 87 |         elif gmt == GMT.TEXT:
 88 |             text = tika_result["content"]
 89 |             if not text:
 90 |                 msg = "Could not extract text from {}".format(basename(f))
 91 |                 log.warning(msg)
 92 |                 continue
 93 |             cid = iscc.content_id_text(tika_result["content"])
 94 |         elif gmt == GMT.AUDIO:
 95 |             if not fpcalc.is_installed():
 96 |                 fpcalc.install()
 97 |             features = audio_id.get_chroma_vector(f)
 98 |             cid = audio_id.content_id_audio(features)
 99 |         elif gmt == GMT.VIDEO:
100 |             features = video_id.get_frame_vectors(abspath(f))
101 |             cid = video_id.content_id_video(features)
102 |         else:
103 |             log.error("Could not generate ISCC")
104 |             continue
105 | 
106 |         did = iscc.data_id(f)
107 |         iid, tophash = iscc.instance_id(f)
108 | 
109 |         iscc_code_cs = ",".join((mid, cid, did, iid))
110 | 
111 |         click.echo(
112 |             "{iscc_code},{tophash},{fname},{gmt},{title}".format(
113 |                 iscc_code=iscc_code_cs,
114 |                 tophash=tophash,
115 |                 fname=basename(f),
116 |                 gmt=gmt,
117 |                 title=norm_title,
118 |             )
119 |         )
120 |         iscc_code = "-".join((mid, cid, did, iid))
121 |         results.append(
122 |             dict(
123 |                 iscc=iscc_code,
124 |                 norm_title=norm_title,
125 |                 tophash=tophash,
126 |                 gmt=gmt,
127 |                 file_name=basename(f),
128 |             )
129 |         )
130 | 
131 |     return results
132 | 


--------------------------------------------------------------------------------
/iscc_cli/commands/dump.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import shutil
 3 | 
 4 | import click
 5 | import mobi
 6 | from click import UsageError
 7 | from iscc_cli.tika import parser
 8 | from iscc_cli.utils import DefaultHelp
 9 | from iscc_cli.const import SUPPORTED_MIME_TYPES
10 | import json
11 | from iscc_cli.mediatype import mime_guess, mime_clean
12 | 
13 | 
14 | @click.command(cls=DefaultHelp)
15 | @click.argument("path", type=click.STRING)
16 | @click.option(
17 |     "-s", "--strip", type=click.INT, default=0, help="Strip content to first X chars."
18 | )
19 | @click.option("-m", "--meta", is_flag=True, default=False, help="Dump metadata only.")
20 | @click.option("-c", "--content", is_flag=True, default=False, help="Dump content only.")
21 | def dump(path, strip, meta, content):
22 |     """Dump Tika extraction results for PATH (file or url path)."""
23 | 
24 |     media_type = mime_clean(mime_guess(path))
25 | 
26 |     if media_type not in SUPPORTED_MIME_TYPES:
27 |         click.echo("Unsupported media type {}.".format(media_type))
28 |         click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")
29 | 
30 |     if media_type == "application/x-mobipocket-ebook":
31 |         tempdir, epub_filepath = mobi.extract(path)
32 |         tika_result = parser.from_file(epub_filepath)
33 |         shutil.rmtree(tempdir)
34 |     else:
35 |         tika_result = parser.from_file(path)
36 | 
37 |     if all([meta, content]):
38 |         raise UsageError("Use either --meta or --content for selective output.")
39 | 
40 |     if strip:
41 |         tika_result["content"] = tika_result.get("content", "")[:strip]
42 | 
43 |     if meta:
44 |         click.echo(json.dumps(tika_result.get("metadata", ""), indent=2))
45 |     elif content:
46 |         click.echo(json.dumps(tika_result.get("content", ""), indent=2))
47 |     else:
48 |         click.echo(json.dumps(tika_result, indent=2))
49 | 


--------------------------------------------------------------------------------
/iscc_cli/commands/gen.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | import shutil
 4 | from os.path import abspath
 5 | 
 6 | import click
 7 | import iscc
 8 | import mobi
 9 | from iscc_cli.tika import parser
10 | from iscc_cli import audio_id, video_id, fpcalc
11 | from iscc_cli.const import SUPPORTED_MIME_TYPES, GMT
12 | from iscc_cli.utils import get_title, mime_to_gmt, DefaultHelp
13 | from iscc_cli.mediatype import mime_guess, mime_clean
14 | 
15 | 
16 | @click.command(cls=DefaultHelp)
17 | @click.argument("file", type=click.File("rb"))
18 | @click.option(
19 |     "-g",
20 |     "--guess",
21 |     is_flag=True,
22 |     default=False,
23 |     help="Guess title (first line of text).",
24 | )
25 | @click.option("-t", "--title", type=click.STRING, help="Title for Meta-ID creation.")
26 | @click.option(
27 |     "-e", "--extra", type=click.STRING, help="Extra text for Meta-ID creation."
28 | )
29 | @click.option("-v", "--verbose", is_flag=True, help="Enables verbose mode.")
30 | def gen(file, guess, title, extra, verbose):
31 |     """Generate ISCC Code for FILE."""
32 |     filesize = os.path.getsize(file.name)
33 |     if not filesize:
34 |         raise click.BadParameter("Cannot proccess empty file: {}".format(file.name))
35 | 
36 |     media_type = mime_clean(mime_guess(file.name))
37 |     if media_type not in SUPPORTED_MIME_TYPES:
38 |         click.echo("Unsupported media type {}.".format(media_type))
39 |         click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")
40 | 
41 |     if media_type == "application/x-mobipocket-ebook":
42 |         tempdir, epub_filepath = mobi.extract(file.name)
43 |         tika_result = parser.from_file(epub_filepath)
44 |         shutil.rmtree(tempdir)
45 |     else:
46 |         tika_result = parser.from_file(file.name)
47 | 
48 |     if not title:
49 |         title = get_title(tika_result, guess=guess, uri=file.name)
50 | 
51 |     if not extra:
52 |         extra = ""
53 | 
54 |     mid, norm_title, _ = iscc.meta_id(title, extra)
55 |     gmt = mime_to_gmt(media_type, file_path=file.name)
56 |     if gmt == GMT.IMAGE:
57 |         cid = iscc.content_id_image(file.name)
58 |     elif gmt == GMT.TEXT:
59 |         text = tika_result["content"]
60 |         if not text:
61 |             click.echo("Could not extract text from {}".format(file.name))
62 |             return
63 |         cid = iscc.content_id_text(tika_result["content"])
64 |     elif gmt == GMT.AUDIO:
65 |         if not fpcalc.is_installed():
66 |             fpcalc.install()
67 |         features = audio_id.get_chroma_vector(file.name)
68 |         cid = audio_id.content_id_audio(features)
69 |     elif gmt == GMT.VIDEO:
70 |         features = video_id.get_frame_vectors(abspath(file.name))
71 |         cid = video_id.content_id_video(features)
72 |     else:
73 |         click.echo("Could not generate ISCC")
74 |         return
75 | 
76 |     did = iscc.data_id(file.name)
77 |     iid, tophash = iscc.instance_id(file.name)
78 | 
79 |     if not norm_title:
80 |         iscc_code = "-".join((cid, did, iid))
81 |     else:
82 |         iscc_code = "-".join((mid, cid, did, iid))
83 | 
84 |     click.echo("ISCC:{}".format(iscc_code))
85 | 
86 |     if verbose:
87 |         if norm_title:
88 |             click.echo("Norm Title: %s" % norm_title)
89 |         click.echo("Tophash:    %s" % tophash)
90 |         click.echo("Filepath:   %s" % file.name)
91 |         click.echo("GMT:        %s" % gmt)
92 | 
93 |     return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
94 | 


--------------------------------------------------------------------------------
/iscc_cli/commands/info.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import click
 3 | import iscc
 4 | import iscc_cli
 5 | from iscc_cli import fpcalc, ffmpeg
 6 | from iscc_cli.const import SUPPORTED_EXTENSIONS
 7 | from iscc_cli.tika import tika
 8 | import requests
 9 | 
10 | 
11 | def tika_version():
12 |     url = tika.ServerEndpoint + "/version"
13 |     try:
14 |         return requests.get(url).text
15 |     except Exception:
16 |         return 'WARNING: Not Installed - run "iscc init" to install!'
17 | 
18 | 
19 | @click.command()
20 | def info():
21 |     """Show information about environment."""
22 |     click.echo("ISCC Cli Version: %s" % iscc_cli.__version__)
23 |     click.echo("ISCC Version: %s" % iscc.__version__)
24 |     click.echo("FFMPEG Version: %s" % ffmpeg.get_version_info())
25 |     click.echo("FPCALC Version: %s" % fpcalc.get_version_info())  # hint if missing
26 |     click.echo("Tika Version: %s" % tika_version())  # asks the Tika server via HTTP
27 |     click.echo("Tika Jar Path: %s" % tika.TikaJarPath)
28 |     click.echo("Supported File Types: %s" % ", ".join(sorted(SUPPORTED_EXTENSIONS)))
29 | 
30 | 
31 | if __name__ == "__main__":  # allow running this module directly as a script
32 |     info()
33 | 


--------------------------------------------------------------------------------
/iscc_cli/commands/init.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import io
 3 | import click
 4 | import requests
 5 | from iscc_cli.tika import detector
 6 | from iscc_cli import fpcalc
 7 | 
 8 | 
 9 | @click.command()
10 | def init():
11 |     """Inititalize and check environment."""
12 |     click.echo("Inititalizing Tika ...")
13 |     detector.from_buffer(io.BytesIO(b"Wakeup Tika"))
14 |     url = detector.ServerEndpoint + "/version"
15 |     resp = requests.get(url)
16 |     click.echo("Tika initialized: {}".format(resp.text))
17 |     click.echo("Testing fpcalc ...")
18 |     fpc_ok = fpcalc.is_installed()
19 |     if not fpc_ok:
20 |         fpcalc.install()
21 |     fpc_version = fpcalc.get_version_info()
22 |     click.echo("fpcalc installed: {}".format(fpc_version))
23 | 


--------------------------------------------------------------------------------
/iscc_cli/commands/sim.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import sys
 3 | import click
 4 | import iscc
 5 | 
 6 | from iscc_cli.const import ISCC_COMPONENT_CODES
 7 | from iscc_cli.utils import DefaultHelp, iscc_verify, iscc_split, iscc_clean
 8 | 
 9 | 
10 | @click.command(cls=DefaultHelp)
11 | @click.argument("a", nargs=1)
12 | @click.argument("b", nargs=1)
13 | def sim(a, b):
14 |     """Estimate Similarity of ISCC Codes A & B.
15 | 
16 |     Example:
17 | 
18 |         $ iscc sim CCUcKwdQc1jUM CCjMmrCsKWu1D
19 | 
20 |     You may also compare fully qualified ISCC Codes with each other.
21 |     """
22 |     try:
23 |         iscc_verify(a)
24 |         iscc_verify(b)
25 |     except ValueError as e:
26 |         click.echo(str(e))
27 |         sys.exit(1)
28 | 
29 |     # Fully Qualified ISCC Code Similarity
30 |     avg_msg = None
31 |     if len(iscc_clean(a)) == 52 and len(iscc_clean(b)) == 52:
32 |         digest_a = b"".join(iscc.decode(code)[1:] for code in iscc_split(a))
33 |         digest_b = b"".join(iscc.decode(code)[1:] for code in iscc_split(b))
34 |         int_a = int.from_bytes(digest_a, "big", signed=False)
35 |         int_b = int.from_bytes(digest_b, "big", signed=False)
36 |         dist = bin(int_a ^ int_b).count("1")
37 |         similarity = ((192 - dist) / 192) * 100
38 |         avg_msg = "Average Estimated Similarity: {:.2f} % ({} of 192 bits differnt)".format(
39 |             similarity, dist
40 |         )
41 | 
42 |     # Per Component Similarity
43 |     a = iscc_split(a)
44 |     b = iscc_split(b)
45 | 
46 |     if len(a) == 1 and len(b) == 1:
47 |         type_a = ISCC_COMPONENT_CODES.get(a[0][:2])["name"]
48 |         type_b = ISCC_COMPONENT_CODES.get(b[0][:2])["name"]
49 |         if type_a != type_b:
50 |             click.echo("Incompatible component types ({} & {}).".format(type_a, type_b))
51 | 
52 |     for ca in a:
53 |         for cb in b:
54 |             type_a = ISCC_COMPONENT_CODES.get(ca[:2])["name"]
55 |             type_b = ISCC_COMPONENT_CODES.get(cb[:2])["name"]
56 |             if type_a == type_b and type_a != "Instance-ID":
57 |                 hamming_dist = iscc.distance(ca, cb)
58 |                 hamming_sim = 64 - hamming_dist
59 |                 similarity = round(hamming_sim / (2 * 64 - hamming_sim) * 100)
60 |                 click.echo(
61 |                     "Estimated Similarity of {}: {:.2f} % ({} of 64 bits match)".format(
62 |                         type_a, similarity, hamming_sim
63 |                     )
64 |                 )
65 |             if type_a == "Instance-ID" and type_b == "Instance-ID":
66 |                 if ca == cb:
67 |                     click.echo("Identical Instance-ID")
68 |     if avg_msg:
69 |         click.echo(avg_msg)
70 | 


--------------------------------------------------------------------------------
/iscc_cli/commands/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import io
 3 | import click
 4 | import requests
 5 | import iscc
 6 | from iscc_cli import const
 7 | 
 8 | 
 9 | @click.command()
10 | def test():
11 |     """Test conformance with latest reference data."""
12 |     click.echo("Running confromance tests.\n")
13 |     test_data = requests.get(const.TEST_DATA_URL + "test_data.json").json()
14 |     for funcname, tests in test_data.items():
15 |         if not tests["required"]:
16 |             continue
17 |         for testname, testdata in tests.items():
18 |             if not testname.startswith("test_"):
19 |                 continue
20 |             func = getattr(iscc, funcname)
21 |             args = testdata["inputs"]
22 |             if isinstance(args[0], str) and args[0].startswith("file"):
23 |                 r = requests.get(const.TEST_DATA_URL + args[0])
24 |                 args[0] = io.BytesIO(r.content)
25 | 
26 |             if funcname in ["data_chunks"]:
27 |                 testdata["outputs"] = [
28 |                     bytes.fromhex(i.split(":")[1]) for i in testdata["outputs"]
29 |                 ]
30 |                 result = list(func(*args))
31 |             else:
32 |                 result = func(*args)
33 |             expected = testdata["outputs"]
34 |             try:
35 |                 assert result == expected, "%s %s " % (funcname, args)
36 |             except AssertionError:
37 |                 click.echo("FAILED %s" % testname)
38 |                 click.echo("Result %s != Expected %s" % (result, expected))
39 |             else:
40 |                 click.echo("PASSED %s" % testname)
41 | 


--------------------------------------------------------------------------------
/iscc_cli/commands/web.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os
  3 | import shutil
  4 | from io import BytesIO
  5 | import click
  6 | import iscc
  7 | import mobi
  8 | import requests
  9 | from iscc_cli.tika import parser
 10 | import iscc_cli
 11 | from iscc_cli import fpcalc, audio_id, video_id
 12 | from iscc_cli.const import SUPPORTED_MIME_TYPES, GMT
 13 | from iscc_cli.utils import (
 14 |     get_title,
 15 |     mime_to_gmt,
 16 |     DefaultHelp,
 17 |     download_file,
 18 | )
 19 | from iscc_cli.mediatype import mime_guess, mime_clean
 20 | 
 21 | HEADERS = {"User-Agent": "ISCC {}".format(iscc_cli.__version__)}
 22 | 
 23 | 
 24 | @click.command(cls=DefaultHelp)
 25 | @click.argument("url", type=click.STRING)
 26 | @click.option(
 27 |     "-g",
 28 |     "--guess",
 29 |     is_flag=True,
 30 |     default=False,
 31 |     help="Guess title (first line of text).",
 32 | )
 33 | @click.option("-t", "--title", type=click.STRING, help="Title for Meta-ID creation.")
 34 | @click.option(
 35 |     "-e", "--extra", type=click.STRING, help="Extra text for Meta-ID creation."
 36 | )
 37 | @click.option("-v", "--verbose", is_flag=True, help="Enables verbose mode.")
 38 | def web(url, guess, title, extra, verbose):
 39 |     """Generate ISCC Code from URL."""
 40 | 
 41 |     extra = extra or ""
 42 | 
 43 |     try:
 44 |         resp = requests.get(url, headers=HEADERS, stream=True)
 45 |     except Exception as e:
 46 |         raise click.BadArgumentUsage(e)
 47 | 
 48 |     data = BytesIO(resp.content)
 49 |     media_type = mime_clean(mime_guess(data))
 50 |     if media_type not in SUPPORTED_MIME_TYPES:
 51 |         click.echo("Unsupported media type {}".format(media_type))
 52 |         click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")
 53 |         return
 54 | 
 55 |     if media_type == "application/x-mobipocket-ebook":
 56 |         data.seek(0)
 57 |         tempdir, filepath = mobi.extract(data)
 58 |         tika_result = parser.from_file(filepath)
 59 |         shutil.rmtree(tempdir)
 60 |     else:
 61 |         data.seek(0)
 62 |         tika_result = parser.from_buffer(data)
 63 | 
 64 |     if not title:
 65 |         title = get_title(tika_result, guess=guess, uri=url)
 66 | 
 67 |     mid, norm_title, _ = iscc.meta_id(title, extra)
 68 |     gmt = mime_to_gmt(media_type)
 69 |     if gmt == GMT.IMAGE:
 70 |         data.seek(0)
 71 |         cid = iscc.content_id_image(data)
 72 |     elif gmt == GMT.TEXT:
 73 |         text = tika_result["content"]
 74 |         if not text:
 75 |             click.echo("Could not extract text")
 76 |             return
 77 |         cid = iscc.content_id_text(tika_result["content"])
 78 |     elif gmt == GMT.AUDIO:
 79 |         if not fpcalc.is_installed():
 80 |             fpcalc.install()
 81 |         data.seek(0)
 82 |         features = audio_id.get_chroma_vector(data)
 83 |         cid = audio_id.content_id_audio(features)
 84 |     elif gmt == GMT.VIDEO:
 85 |         local_path = download_file(url, sanitize=True)
 86 |         features = video_id.get_frame_vectors(local_path)
 87 |         cid = video_id.content_id_video(features)
 88 |         os.remove(local_path)
 89 | 
 90 |     data.seek(0)
 91 |     did = iscc.data_id(data)
 92 |     data.seek(0)
 93 |     iid, tophash = iscc.instance_id(data)
 94 | 
 95 |     if not norm_title:
 96 |         iscc_code = "-".join((cid, did, iid))
 97 |     else:
 98 |         iscc_code = "-".join((mid, cid, did, iid))
 99 | 
100 |     click.echo("ISCC:{}".format(iscc_code))
101 | 
102 |     if verbose:
103 |         if norm_title:
104 |             click.echo("Norm Title: %s" % norm_title)
105 |         click.echo("Tophash:    %s" % tophash)
106 |         click.echo("Filepath:   %s" % url)
107 |         click.echo("GMT:        %s" % gmt)
108 | 
109 |     return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
110 | 


--------------------------------------------------------------------------------
/iscc_cli/const.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from iscc import const
  3 | 
  4 | 
  5 | class GMT:
  6 |     """Generic Media Type: string constants used as "gmt" values in this module."""
  7 | 
  8 |     IMAGE = "image"
  9 |     TEXT = "text"
 10 |     AUDIO = "audio"
 11 |     VIDEO = "video"
 12 | 
 13 | 
 14 | SUPPORTED_MIME_TYPES = {
 15 |     # Text formats ("ext" holds one extension string or a list of them)
 16 |     "application/rtf": {"gmt": GMT.TEXT, "ext": "rtf"},
 17 |     "application/msword": {"gmt": GMT.TEXT, "ext": "doc"},
 18 |     "application/pdf": {"gmt": GMT.TEXT, "ext": "pdf"},
 19 |     "application/epub+zip": {"gmt": GMT.TEXT, "ext": "epub"},
 20 |     "application/xml": {"gmt": GMT.TEXT, "ext": "xml"},
 21 |     "application/xhtml+xml": {"gmt": GMT.TEXT, "ext": "xhtml"},
 22 |     "application/vnd.oasis.opendocument.text": {"gmt": GMT.TEXT, "ext": "odt"},
 23 |     "text/html": {"gmt": GMT.TEXT, "ext": "html"},
 24 |     "text/plain": {"gmt": GMT.TEXT, "ext": "txt"},
 25 |     "application/x-ibooks+zip": {"gmt": GMT.TEXT, "ext": "ibooks"},
 26 |     "text/x-web-markdown": {"gmt": GMT.TEXT, "ext": "md"},
 27 |     "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
 28 |         "gmt": GMT.TEXT,
 29 |         "ext": "docx",
 30 |     },
 31 |     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
 32 |         "gmt": GMT.TEXT,
 33 |         "ext": "xlsx",
 34 |     },
 35 |     "application/vnd.ms-excel": {"gmt": GMT.TEXT, "ext": "xls"},
 36 |     "application/x-mobipocket-ebook": {
 37 |         "gmt": GMT.TEXT,
 38 |         "ext": ["mobi", "prc", "azw", "azw3", "azw4"],
 39 |     },
 40 |     # Image formats
 41 |     "image/bmp": {"gmt": GMT.IMAGE, "ext": "bmp"},
 42 |     "image/gif": {"gmt": GMT.IMAGE, "ext": "gif"},
 43 |     "image/jpeg": {"gmt": GMT.IMAGE, "ext": ["jpg", "jpeg"]},
 44 |     "image/png": {"gmt": GMT.IMAGE, "ext": "png"},
 45 |     "image/tiff": {"gmt": GMT.IMAGE, "ext": "tif"},
 46 |     "image/vnd.adobe.photoshop": {"gmt": GMT.IMAGE, "ext": "psd"},
 47 |     "application/postscript": {"gmt": GMT.IMAGE, "ext": "eps"},
 48 |     # Audio formats
 49 |     "audio/mpeg": {"gmt": GMT.AUDIO, "ext": "mp3"},
 50 |     "audio/vnd.wave": {"gmt": GMT.AUDIO, "ext": "wav"},
 51 |     "audio/vorbis": {"gmt": GMT.AUDIO, "ext": "ogg"},
 52 |     "audio/x-aiff": {"gmt": GMT.AUDIO, "ext": "aif"},
 53 |     "audio/x-flac": {"gmt": GMT.AUDIO, "ext": "flac"},
 54 |     "audio/opus": {"gmt": GMT.AUDIO, "ext": "opus"},
 55 |     # Video formats (NOTE(review): "ogg" below is also listed for audio/vorbis)
 56 |     "application/vnd.rn-realmedia": {"gmt": GMT.VIDEO, "ext": "rm"},
 57 |     "video/x-dirac": {"gmt": GMT.VIDEO, "ext": "drc"},
 58 |     "video/3gpp": {"gmt": GMT.VIDEO, "ext": "3gp"},
 59 |     "video/3gpp2": {"gmt": GMT.VIDEO, "ext": "3g2"},
 60 |     "video/x-ms-asf": {"gmt": GMT.VIDEO, "ext": "asf"},
 61 |     "video/x-msvideo": {"gmt": GMT.VIDEO, "ext": "avi"},
 62 |     "video/webm": {"gmt": GMT.VIDEO, "ext": "webm"},
 63 |     "video/mpeg": {"gmt": GMT.VIDEO, "ext": ["mpeg", "mpg", "m1v", "vob"]},
 64 |     "video/mp4": {"gmt": GMT.VIDEO, "ext": "mp4"},
 65 |     "video/x-m4v": {"gmt": GMT.VIDEO, "ext": "m4v"},
 66 |     "video/x-matroska": {"gmt": GMT.VIDEO, "ext": "mkv"},
 67 |     "video/theora": {"gmt": GMT.VIDEO, "ext": ["ogg", "ogv"]},
 68 |     "video/quicktime": {"gmt": GMT.VIDEO, "ext": ["mov", "f4v"]},
 69 |     "video/x-flv": {"gmt": GMT.VIDEO, "ext": "flv"},
 70 |     "application/x-shockwave-flash": {"gmt": GMT.VIDEO, "ext": "swf"},
 71 |     "video/h264": {"gmt": GMT.VIDEO, "ext": "h264"},
 72 |     "video/x-ms-wmv": {"gmt": GMT.VIDEO, "ext": "wmv"},
 73 | }
 74 | 
 75 | 
 76 | SUPPORTED_EXTENSIONS = []
 77 | for v in SUPPORTED_MIME_TYPES.values():
 78 |     ext = v["ext"]
 79 |     if isinstance(ext, str):
 80 |         SUPPORTED_EXTENSIONS.append(ext)
 81 |     else:
 82 |         for e in ext:
 83 |             SUPPORTED_EXTENSIONS.append(e)
 84 | 
 85 | 
 86 | ISCC_COMPONENT_TYPES = {  # iscc header constant -> display name and 2-char code
 87 |     const.HEAD_MID: {"name": "Meta-ID", "code": "CC"},
 88 |     const.HEAD_CID_T: {"name": "Content-ID Text", "code": "CT"},
 89 |     const.HEAD_CID_T_PCF: {"name": "Content-ID Text", "code": "Ct"},
 90 |     const.HEAD_CID_I: {"name": "Content-ID Image", "code": "CY"},
 91 |     const.HEAD_CID_I_PCF: {"name": "Content-ID Image", "code": "Ci"},
 92 |     const.HEAD_CID_A: {"name": "Content-ID Audio", "code": "CA"},
 93 |     const.HEAD_CID_A_PCF: {"name": "Content-ID Audio", "code": "Ca"},
 94 |     const.HEAD_CID_V: {"name": "Content-ID Video", "code": "CV"},
 95 |     const.HEAD_CID_V_PCF: {"name": "Content-ID Video", "code": "Cv"},
 96 |     const.HEAD_CID_M: {"name": "Content-ID Mixed", "code": "CM"},
 97 |     const.HEAD_CID_M_PCF: {"name": "Content-ID Mixed", "code": "Cm"},
 98 |     const.HEAD_DID: {"name": "Data-ID", "code": "CD"},
 99 |     const.HEAD_IID: {"name": "Instance-ID", "code": "CR"},
100 | }
101 | 
102 | ISCC_COMPONENT_CODES = {  # reverse lookup: 2-char code -> name and header marker
103 |     value["code"]: {"name": value["name"], "marker": key}
104 |     for key, value in ISCC_COMPONENT_TYPES.items()
105 | }
106 | 
107 | TEST_DATA_URL = "https://raw.githubusercontent.com/iscc/iscc-specs/master/tests/"  # base URL; file names are appended
108 | 
109 | WTA_PERMUTATIONS = (
110 |     (292, 16),
111 |     (219, 247),
112 |     (295, 7),
113 |     (105, 236),
114 |     (251, 142),
115 |     (334, 82),
116 |     (17, 266),
117 |     (250, 167),
118 |     (38, 127),
119 |     (184, 22),
120 |     (215, 71),
121 |     (308, 181),
122 |     (195, 215),
123 |     (145, 345),
124 |     (134, 233),
125 |     (89, 351),
126 |     (155, 338),
127 |     (185, 68),
128 |     (233, 122),
129 |     (225, 314),
130 |     (192, 22),
131 |     (298, 2),
132 |     (120, 68),
133 |     (99, 155),
134 |     (274, 187),
135 |     (122, 160),
136 |     (341, 281),
137 |     (230, 223),
138 |     (240, 33),
139 |     (334, 299),
140 |     (166, 256),
141 |     (80, 114),
142 |     (211, 122),
143 |     (18, 16),
144 |     (254, 154),
145 |     (310, 336),
146 |     (36, 273),
147 |     (41, 76),
148 |     (196, 290),
149 |     (191, 307),
150 |     (76, 57),
151 |     (49, 226),
152 |     (85, 97),
153 |     (178, 221),
154 |     (212, 228),
155 |     (125, 348),
156 |     (140, 73),
157 |     (316, 267),
158 |     (91, 61),
159 |     (136, 233),
160 |     (154, 84),
161 |     (338, 332),
162 |     (89, 90),
163 |     (245, 177),
164 |     (167, 222),
165 |     (114, 2),
166 |     (278, 364),
167 |     (22, 169),
168 |     (163, 124),
169 |     (40, 134),
170 |     (229, 207),
171 |     (298, 81),
172 |     (199, 253),
173 |     (344, 123),
174 |     (376, 268),
175 |     (139, 266),
176 |     (247, 308),
177 |     (255, 32),
178 |     (85, 250),
179 |     (345, 236),
180 |     (205, 69),
181 |     (215, 277),
182 |     (299, 178),
183 |     (275, 198),
184 |     (250, 359),
185 |     (84, 286),
186 |     (225, 50),
187 |     (212, 18),
188 |     (1, 224),
189 |     (274, 33),
190 |     (25, 179),
191 |     (47, 77),
192 |     (55, 311),
193 |     (232, 248),
194 |     (71, 234),
195 |     (223, 256),
196 |     (228, 175),
197 |     (371, 132),
198 |     (357, 234),
199 |     (216, 168),
200 |     (332, 266),
201 |     (267, 78),
202 |     (378, 121),
203 |     (165, 316),
204 |     (16, 351),
205 |     (100, 329),
206 |     (301, 294),
207 |     (321, 245),
208 |     (12, 59),
209 |     (151, 222),
210 |     (126, 367),
211 |     (148, 45),
212 |     (23, 305),
213 |     (281, 54),
214 |     (146, 83),
215 |     (343, 244),
216 |     (72, 184),
217 |     (304, 205),
218 |     (98, 179),
219 |     (93, 40),
220 |     (302, 99),
221 |     (218, 106),
222 |     (49, 350),
223 |     (157, 237),
224 |     (355, 267),
225 |     (369, 216),
226 |     (229, 340),
227 |     (284, 106),
228 |     (136, 305),
229 |     (186, 59),
230 |     (3, 107),
231 |     (217, 312),
232 |     (209, 195),
233 |     (333, 102),
234 |     (35, 216),
235 |     (45, 28),
236 |     (178, 130),
237 |     (184, 233),
238 |     (217, 99),
239 |     (321, 144),
240 |     (238, 355),
241 |     (150, 259),
242 |     (255, 259),
243 |     (134, 207),
244 |     (226, 327),
245 |     (174, 178),
246 |     (371, 141),
247 |     (247, 228),
248 |     (244, 300),
249 |     (245, 42),
250 |     (353, 276),
251 |     (368, 187),
252 |     (369, 207),
253 |     (86, 308),
254 |     (212, 368),
255 |     (288, 33),
256 |     (304, 375),
257 |     (156, 8),
258 |     (302, 167),
259 |     (333, 164),
260 |     (37, 379),
261 |     (203, 312),
262 |     (191, 144),
263 |     (310, 95),
264 |     (123, 86),
265 |     (157, 48),
266 |     (284, 27),
267 |     (112, 291),
268 |     (37, 215),
269 |     (98, 291),
270 |     (292, 224),
271 |     (303, 8),
272 |     (200, 103),
273 |     (173, 294),
274 |     (97, 267),
275 |     (288, 167),
276 |     (24, 336),
277 |     (354, 296),
278 |     (25, 18),
279 |     (289, 187),
280 |     (203, 166),
281 |     (307, 326),
282 |     (87, 80),
283 |     (60, 310),
284 |     (176, 84),
285 |     (15, 370),
286 |     (274, 261),
287 |     (178, 45),
288 |     (203, 224),
289 |     (295, 178),
290 |     (30, 74),
291 |     (227, 361),
292 |     (241, 312),
293 |     (231, 369),
294 |     (226, 309),
295 |     (89, 181),
296 |     (216, 175),
297 |     (286, 262),
298 |     (234, 198),
299 |     (99, 49),
300 |     (221, 328),
301 |     (78, 21),
302 |     (95, 327),
303 |     (324, 97),
304 |     (291, 219),
305 |     (184, 286),
306 |     (192, 25),
307 |     (309, 26),
308 |     (84, 159),
309 |     (114, 25),
310 |     (296, 90),
311 |     (51, 325),
312 |     (289, 184),
313 |     (95, 154),
314 |     (21, 202),
315 |     (306, 219),
316 |     (39, 176),
317 |     (99, 251),
318 |     (83, 86),
319 |     (207, 239),
320 |     (168, 19),
321 |     (88, 90),
322 |     (297, 361),
323 |     (215, 78),
324 |     (262, 328),
325 |     (356, 200),
326 |     (48, 203),
327 |     (60, 120),
328 |     (54, 216),
329 |     (369, 327),
330 |     (159, 370),
331 |     (148, 273),
332 |     (332, 50),
333 |     (176, 267),
334 |     (317, 243),
335 |     (311, 125),
336 |     (272, 148),
337 |     (6, 340),
338 |     (80, 346),
339 |     (197, 355),
340 |     (117, 49),
341 |     (261, 326),
342 |     (242, 51),
343 |     (295, 204),
344 |     (298, 111),
345 |     (147, 181),
346 |     (35, 96),
347 |     (318, 285),
348 |     (271, 13),
349 |     (38, 204),
350 |     (16, 8),
351 |     (334, 220),
352 |     (173, 91),
353 |     (372, 24),
354 |     (183, 166),
355 |     (320, 243),
356 |     (87, 9),
357 |     (105, 65),
358 |     (148, 103),
359 |     (197, 314),
360 |     (279, 299),
361 |     (304, 214),
362 |     (282, 15),
363 |     (64, 2),
364 |     (63, 14),
365 |     (28, 351),
366 | )
367 | 


--------------------------------------------------------------------------------
/iscc_cli/datatypes.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import mmap
 3 | from enum import Enum
 4 | from io import BytesIO, BufferedReader
 5 | from pathlib import Path
 6 | from typing import Union, BinaryIO
 7 | 
 8 | Data = Union[bytes, bytearray, memoryview]  # in-memory binary data
 9 | Uri = Union[str, Path]  # location given as a string or a Path
10 | File = Union[BinaryIO, mmap.mmap, BytesIO, BufferedReader]  # binary file-like objects
11 | Readable = Union[Uri, Data, File]  # any of the three kinds above
12 | 
13 | 
14 | class GMT(str, Enum):
15 |     """Generic Media Type (members are also plain strings via the str mixin)."""
16 | 
17 |     text = "text"
18 |     image = "image"
19 |     audio = "audio"
20 |     video = "video"
21 |     unknown = "unknown"
22 | 


--------------------------------------------------------------------------------
/iscc_cli/ffmpeg.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """A thin cross plattform installer and wrapper around ffmpeg."""
 3 | import imageio_ffmpeg
 4 | 
 5 | 
 6 | def exe_path():
 7 |     """Return the path to the ffmpeg executable as reported by imageio_ffmpeg."""
 8 |     return imageio_ffmpeg.get_ffmpeg_exe()
 9 | 
10 | 
11 | def get_version_info():
12 |     """Return the ffmpeg version as reported by imageio_ffmpeg."""
13 |     return imageio_ffmpeg.get_ffmpeg_version()
14 | 
15 | 
16 | if __name__ == "__main__":  # manual check: print executable path and version
17 |     print(exe_path())
18 |     print(get_version_info())
19 | 


--------------------------------------------------------------------------------
/iscc_cli/fpcalc.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """A thin cross plattform installer and wrapper around chromaprint fpcalc."""
 3 | import os
 4 | import platform
 5 | import shutil
 6 | import tarfile
 7 | import zipfile
 8 | import subprocess
 9 | import stat
10 | import click
11 | import iscc_cli
12 | from iscc_cli.utils import download_file
13 | 
14 | 
15 | FPCALC_VERSION = "1.5.0"  # chromaprint release used for download and file naming
16 | FPCALC_URL_BASE = "https://github.com/acoustid/chromaprint/releases/download/v{}/".format(
17 |     FPCALC_VERSION
18 | )
19 | FPCALC_OS_MAP = {  # platform.system() value -> release archive file name
20 |     "Linux": "chromaprint-fpcalc-{}-linux-x86_64.tar.gz".format(FPCALC_VERSION),
21 |     "Darwin": "chromaprint-fpcalc-{}-macos-x86_64.tar.gz".format(FPCALC_VERSION),
22 |     "Windows": "chromaprint-fpcalc-{}-windows-x86_64.zip".format(FPCALC_VERSION),
23 | }
24 | 
25 | 
26 | def exe_path():
27 |     """Returns path to fpcalc executable."""
28 |     if platform.system() == "Windows":
29 |         return os.path.join(iscc_cli.APP_DIR, "fpcalc-{}.exe".format(FPCALC_VERSION))
30 |     return os.path.join(iscc_cli.APP_DIR, "fpcalc-{}".format(FPCALC_VERSION))
31 | 
32 | 
33 | def is_installed():
34 |     """"Check if fpcalc is installed."""
35 |     fp = exe_path()
36 |     return os.path.isfile(fp) and os.access(fp, os.X_OK)
37 | 
38 | 
39 | def download_url():
40 |     """Return system and version dependant download url"""
41 |     return os.path.join(FPCALC_URL_BASE, FPCALC_OS_MAP[platform.system()])
42 | 
43 | 
44 | def download():
45 |     """Download the fpcalc archive from download_url() and return the archive path."""
46 |     return download_file(download_url())
47 | 
48 | 
49 | def extract(archive):
50 |     """Extract archive with fpcalc executable."""
51 |     if archive.endswith(".zip"):
52 |         with zipfile.ZipFile(archive, "r") as zip_file:
53 |             for member in zip_file.namelist():
54 |                 filename = os.path.basename(member)
55 |                 if filename == "fpcalc.exe":
56 |                     source = zip_file.open(member)
57 |                     target = open(exe_path(), "wb")
58 |                     with source, target:
59 |                         shutil.copyfileobj(source, target)
60 |     elif archive.endswith("tar.gz"):
61 |         with tarfile.open(archive, "r:gz") as tar_file:
62 |             for member in tar_file.getmembers():
63 |                 if member.isfile() and member.name.endswith("fpcalc"):
64 |                     source = tar_file.extractfile(member)
65 |                     target = open(exe_path(), "wb")
66 |                     with source, target:
67 |                         shutil.copyfileobj(source, target)
68 | 
69 | 
70 | def install():
71 |     """Install fpcalc command line tool and retur path to executable."""
72 |     if is_installed():
73 |         click.echo("Fpcalc is already installed.")
74 |         return exe_path()
75 |     archive_path = download()
76 |     extract(archive_path)
77 |     st = os.stat(exe_path())
78 |     os.chmod(exe_path(), st.st_mode | stat.S_IEXEC)
79 |     assert is_installed()
80 |     return exe_path()
81 | 
82 | 
83 | def get_version_info():
84 |     """Get fpcalc version"""
85 |     try:
86 |         r = subprocess.run([exe_path(), "-v"], stdout=subprocess.PIPE)
87 |         return r.stdout.decode("utf-8").strip().split()[2]
88 |     except FileNotFoundError:
89 |         return 'WARNING: Not Installed - run "iscc init" to install!'
90 | 


--------------------------------------------------------------------------------
/iscc_cli/lib.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """Expose cli commands with standard python api."""
 3 | from typing import List, Dict
 4 | from iscc_cli.commands.gen import gen
 5 | from iscc_cli.commands.batch import batch
 6 | from iscc_cli.commands.web import web
 7 | 
 8 | 
 9 | def iscc_from_file(file, guess=False, title="", extra="") -> Dict:
10 |     if isinstance(file, str):
11 |         file = open(file)
12 |     return gen.callback(file, guess, title, extra, False)
13 | 
14 | 
15 | def isccs_from_dir(path, recursive=False, guess=False, debug=False) -> List[Dict]:
16 |     return batch.callback(path, recursive, guess, debug)
17 | 
18 | 
19 | def iscc_from_url(url, guess=False, title="", extra="") -> Dict:
20 |     return web.callback(url, guess, title, extra, False)
21 | 


--------------------------------------------------------------------------------
/iscc_cli/mediatype.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from loguru import logger
  3 | from typing import List, Optional, Union
  4 | import mimetypes
  5 | import magic
  6 | from PIL import Image
  7 | from iscc_cli import uread
  8 | 
  9 | 
 10 | __all__ = [
 11 |     "mime_guess",
 12 |     "mime_normalize",
 13 |     "mime_supported",
 14 |     "mime_clean",
 15 |     "mime_to_gmt",
 16 |     "mime_from_name",
 17 |     "mime_from_data",
 18 | ]
 19 | 
 20 | 
 21 | def mime_guess(data, file_name=None):
 22 |     # type: (Readable, str) -> str
 23 |     """Heuristic guessing of mediatype for different kinds of inputs.
 24 |     We try matching by file extension. If that fails we match by content sniffing.
 25 |     """
 26 | 
 27 |     guess_name, guess_data = None, None
 28 |     file = uread.open_data(data)
 29 | 
 30 |     if file_name is None:
 31 |         if hasattr(file, "name"):
 32 |             file_name = file.name
 33 |         elif hasattr(file, "filename"):
 34 |             file_name = file.filename
 35 | 
 36 |     if file_name:
 37 |         guess_name = mime_from_name(file_name)
 38 | 
 39 |     guess_data = mime_from_data(file.read(4096))
 40 | 
 41 |     # Normalize
 42 |     guess_data = mime_normalize(guess_data)
 43 |     guess_name = mime_normalize(guess_name)
 44 | 
 45 |     return guess_name or guess_data
 46 | 
 47 | 
 48 | def mime_normalize(mime: str) -> str:
 49 |     """Return normalized version of a mediatype."""
 50 |     return MEDIATYPE_NORM.get(mime, mime)
 51 | 
 52 | 
 53 | def mime_supported(mime: str) -> bool:
 54 |     """Check if mediatype is supported"""
 55 |     return mime_normalize(mime) in SUPPORTED_MEDIATYPES
 56 | 
 57 | 
 58 | def mime_from_name(name: str) -> Optional[str]:
 59 |     """Guess mediatype from filename or url."""
 60 |     return mimetypes.guess_type(name)[0]
 61 | 
 62 | 
 63 | def mime_from_data(data: bytes) -> Optional[str]:
 64 |     """Guess mediatype by sniffing raw header data."""
 65 |     return magic.from_buffer(data, mime=True)
 66 | 
 67 | 
 68 | def mime_clean(mime: Union[str, List]):
 69 |     """
 70 |     Clean mimetype/content-type string or first entry of a list of mimetype strings.
 71 |     Also removes semicolon separated encoding information.
 72 |     """
 73 |     if mime and isinstance(mime, List):
 74 |         mime = mime[0]
 75 |     if mime:
 76 |         mime = mime.split(";")[0]
 77 |     return mime.strip()
 78 | 
 79 | 
 80 | def mime_to_gmt(mime_type: str, file_path=None):
 81 |     """Get generic mediatype from mimetype."""
 82 |     mime_type = mime_clean(mime_type)
 83 |     if mime_type == "image/gif" and file_path:
 84 |         img = Image.open(file_path)
 85 |         if img.is_animated:
 86 |             return "video"
 87 |         else:
 88 |             return "image"
 89 |     entry = SUPPORTED_MEDIATYPES.get(mime_type)
 90 |     if entry:
 91 |         return entry["gmt"]
 92 |     gmt = mime_type.split("/")[0]
 93 |     if gmt in list(GMT):
 94 |         logger.warning(f"Guessing GMT from {mime_type}")
 95 |         return gmt
 96 | 
 97 | 
# Register extra extension -> mediatype mappings with the stdlib ``mimetypes``
# module, which ``mime_from_name`` uses for its lookups.
# NOTE(review): ".f4v" is registered as "video/mp4" here while
# SUPPORTED_MEDIATYPES lists "f4v" under "video/quicktime" - confirm intent.
mimetypes.add_type("text/markdown", ".md")
mimetypes.add_type("text/markdown", ".markdown")
mimetypes.add_type("application/x-mobipocket-ebook", ".mobi")
mimetypes.add_type("application/x-sqlite3", ".sqlite")
mimetypes.add_type("video/mp4", ".f4v")
103 | 
104 | 
# Registry of supported mediatypes, keyed by mediatype string. Each value has
#   "gmt": the generic mediatype ("text", "image", "audio" or "video")
#   "ext": one file extension (str) or several (list of str), without a dot
SUPPORTED_MEDIATYPES = {
    # Text Formats
    "application/rtf": {"gmt": "text", "ext": "rtf"},
    "application/msword": {"gmt": "text", "ext": "doc"},
    "application/pdf": {"gmt": "text", "ext": "pdf"},
    "application/epub+zip": {"gmt": "text", "ext": "epub"},
    "text/xml": {"gmt": "text", "ext": "xml"},
    "application/json": {"gmt": "text", "ext": "json"},
    "application/xhtml+xml": {"gmt": "text", "ext": "xhtml"},
    "application/vnd.oasis.opendocument.text": {"gmt": "text", "ext": "odt"},
    "text/html": {"gmt": "text", "ext": "html"},
    "text/plain": {"gmt": "text", "ext": "txt"},
    "application/x-ibooks+zip": {"gmt": "text", "ext": "ibooks"},
    "text/markdown": {"gmt": "text", "ext": ["md", "markdown"]},
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
        "gmt": "text",
        "ext": "docx",
    },
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
        "gmt": "text",
        "ext": "xlsx",
    },
    # Note: pptx only detected by file extension. Sniffing gives 'application/zip'
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": {
        "gmt": "text",
        "ext": "pptx",
    },
    "application/vnd.ms-excel": {"gmt": "text", "ext": "xls"},
    "application/x-mobipocket-ebook": {
        "gmt": "text",
        "ext": ["mobi", "prc", "azw", "azw3", "azw4"],
    },
    # Image Formats
    "image/bmp": {"gmt": "image", "ext": "bmp"},
    "image/gif": {"gmt": "image", "ext": "gif"},
    "image/jpeg": {"gmt": "image", "ext": ["jpg", "jpeg"]},
    "image/png": {"gmt": "image", "ext": "png"},
    "image/tiff": {"gmt": "image", "ext": "tif"},
    "image/vnd.adobe.photoshop": {"gmt": "image", "ext": "psd"},
    "application/postscript": {"gmt": "image", "ext": "eps"},
    # Audio Formats
    "audio/mpeg": {"gmt": "audio", "ext": "mp3"},
    "audio/wav": {"gmt": "audio", "ext": "wav"},
    "audio/x-wav": {"gmt": "audio", "ext": "wav"},
    "audio/ogg": {"gmt": "audio", "ext": "ogg"},
    "audio/aiff": {"gmt": "audio", "ext": "aif"},
    "audio/x-aiff": {"gmt": "audio", "ext": "aif"},
    "audio/x-flac": {"gmt": "audio", "ext": "flac"},
    "audio/opus": {"gmt": "audio", "ext": "opus"},
    # Video Formats
    "application/vnd.rn-realmedia": {"gmt": "video", "ext": "rm"},
    "video/x-dirac": {"gmt": "video", "ext": "drc"},
    "video/3gpp": {"gmt": "video", "ext": "3gp"},
    "video/3gpp2": {"gmt": "video", "ext": "3g2"},
    "video/x-ms-asf": {"gmt": "video", "ext": "asf"},
    "video/avi": {"gmt": "video", "ext": "avi"},
    "video/webm": {"gmt": "video", "ext": "webm"},
    "video/mpeg": {"gmt": "video", "ext": ["mpeg", "mpg", "m1v", "vob"]},
    "video/mp4": {"gmt": "video", "ext": "mp4"},
    "video/x-m4v": {"gmt": "video", "ext": "m4v"},
    "video/x-matroska": {"gmt": "video", "ext": "mkv"},
    "video/ogg": {"gmt": "video", "ext": ["ogg", "ogv"]},
    "video/quicktime": {"gmt": "video", "ext": ["mov", "f4v"]},
    "video/x-flv": {"gmt": "video", "ext": "flv"},
    "application/x-shockwave-flash": {"gmt": "video", "ext": "swf"},
    "video/h264": {"gmt": "video", "ext": "h264"},
    "video/x-ms-wmv": {"gmt": "video", "ext": "wmv"},
}
173 | 
# Alias mediatype -> canonical mediatype. Consulted by ``mime_normalize``;
# mediatypes without an entry here are left as they are.
MEDIATYPE_NORM = {
    "audio/x-aiff": "audio/aiff",
    "audio/x-wav": "audio/wav",
    "image/x-ms-bmp": "image/bmp",
    "video/x-msvideo": "video/avi",
}
180 | 
# Flat list of every file extension found in SUPPORTED_MEDIATYPES, in registry
# order. Several mediatypes share an extension (e.g. "wav", "aif", "ogg"), so
# each extension is added only once instead of being listed repeatedly.
SUPPORTED_EXTENSIONS = []
for v in SUPPORTED_MEDIATYPES.values():
    ext = v["ext"]
    # "ext" is either a single extension string or a list of extension strings.
    for e in [ext] if isinstance(ext, str) else ext:
        if e not in SUPPORTED_EXTENSIONS:
            SUPPORTED_EXTENSIONS.append(e)
189 | 


--------------------------------------------------------------------------------
/iscc_cli/tika/__init__.py:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | # Licensed to the Apache Software Foundation (ASF) under one or more
 3 | # contributor license agreements.  See the NOTICE file distributed with
 4 | # this work for additional information regarding copyright ownership.
 5 | # The ASF licenses this file to You under the Apache License, Version 2.0
 6 | # (the "License"); you may not use this file except in compliance with
 7 | # the License.  You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
# Version string of this bundled tika package.
__version__ = "1.24"

# Declare this package as a namespace package: use pkg_resources when it can
# be imported, otherwise fall back to the stdlib pkgutil mechanism.
try:
    __import__("pkg_resources").declare_namespace(__name__)
except ImportError:
    from pkgutil import extend_path

    __path__ = extend_path(__path__, __name__)
25 | 
26 | 
def initVM():
    """No-op kept for backwards compatibility with the JCC based Tika API."""
    return None
30 | 


--------------------------------------------------------------------------------
/iscc_cli/tika/config.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | # Licensed to the Apache Software Foundation (ASF) under one or more
 4 | # contributor license agreements.  See the NOTICE file distributed with
 5 | # this work for additional information regarding copyright ownership.
 6 | # The ASF licenses this file to You under the Apache License, Version 2.0
 7 | # (the "License"); you may not use this file except in compliance with
 8 | # the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | from .tika import getConfig
20 | 
21 | 
def getParsers():
    """Return the second element of ``getConfig("parsers")``."""
    config_result = getConfig("parsers")
    return config_result[1]
24 | 
25 | 
def getMimeTypes():
    """Return the second element of ``getConfig("mime-types")``."""
    config_result = getConfig("mime-types")
    return config_result[1]
28 | 
29 | 
def getDetectors():
    """Return the second element of ``getConfig("detectors")``."""
    config_result = getConfig("detectors")
    return config_result[1]
32 | 


--------------------------------------------------------------------------------
/iscc_cli/tika/detector.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | # Licensed to the Apache Software Foundation (ASF) under one or more
 4 | # contributor license agreements.  See the NOTICE file distributed with
 5 | # this work for additional information regarding copyright ownership.
 6 | # The ASF licenses this file to You under the Apache License, Version 2.0
 7 | # (the "License"); you may not use this file except in compliance with
 8 | # the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | from .tika import detectType1, callServer, ServerEndpoint
20 | 
21 | 
def from_file(filename, config_path=None, requestOptions={}):
    """
    Detect the MIME type of a file.
    :param filename: file whose type needs to be detected
    :param config_path: forwarded to ``detectType1`` as ``config_path``
    :param requestOptions: forwarded to ``detectType1`` as ``requestOptions``
    :return: second element of the ``detectType1`` result (the MIME type)
    """
    return detectType1(
        "type", filename, config_path=config_path, requestOptions=requestOptions
    )[1]
32 | 
33 | 
def from_buffer(string, config_path=None, requestOptions={}):
    """
    Detect the MIME type of buffered content.
    :param string: buffered content whose type needs to be detected
    :param config_path: forwarded to ``callServer`` as ``config_path``
    :param requestOptions: forwarded to ``callServer`` as ``requestOptions``
    :return: the response part of the ``callServer`` result
    """
    accept_plain_text = {"Accept": "text/plain"}
    _status, detected = callServer(
        "put",
        ServerEndpoint,
        "/detect/stream",
        string,
        accept_plain_text,
        False,
        config_path=config_path,
        requestOptions=requestOptions,
    )
    return detected
51 | 


--------------------------------------------------------------------------------
/iscc_cli/tika/language.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | # Licensed to the Apache Software Foundation (ASF) under one or more
 4 | # contributor license agreements.  See the NOTICE file distributed with
 5 | # this work for additional information regarding copyright ownership.
 6 | # The ASF licenses this file to You under the Apache License, Version 2.0
 7 | # (the "License"); you may not use this file except in compliance with
 8 | # the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | from .tika import detectLang1, callServer, ServerEndpoint
20 | 
21 | 
def from_file(filename, requestOptions={}):
    """
    Detect the language of a file.
    :param filename: path to file whose language needs to be detected
    :param requestOptions: forwarded to ``detectLang1`` as ``requestOptions``
    :return: second element of the ``detectLang1`` result
    """
    return detectLang1("file", filename, requestOptions=requestOptions)[1]
30 | 
31 | 
def from_buffer(string, requestOptions={}):
    """
    Detect the language of buffered content.
    :param string: buffered data
    :param requestOptions: forwarded to ``callServer`` as ``requestOptions``
    :return: the response part of the ``callServer`` result
    """
    accept_plain_text = {"Accept": "text/plain"}
    _status, language = callServer(
        "put",
        ServerEndpoint,
        "/language/string",
        string,
        accept_plain_text,
        False,
        requestOptions=requestOptions,
    )
    return language
48 | 


--------------------------------------------------------------------------------
/iscc_cli/tika/parser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | # Licensed to the Apache Software Foundation (ASF) under one or more
  4 | # contributor license agreements.  See the NOTICE file distributed with
  5 | # this work for additional information regarding copyright ownership.
  6 | # The ASF licenses this file to You under the Apache License, Version 2.0
  7 | # (the "License"); you may not use this file except in compliance with
  8 | # the License.  You may obtain a copy of the License at
  9 | #
 10 | #     http://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | #
 18 | 
 19 | from .tika import parse1, callServer, ServerEndpoint
 20 | import os
 21 | import json
 22 | 
 23 | 
 24 | def from_file(
 25 |     filename,
 26 |     serverEndpoint=ServerEndpoint,
 27 |     service="all",
 28 |     xmlContent=False,
 29 |     headers=None,
 30 |     config_path=None,
 31 |     requestOptions={},
 32 | ):
 33 |     """
 34 |     Parses a file for metadata and content
 35 |     :param filename: path to file which needs to be parsed or binary file using open(path,'rb')
 36 |     :param serverEndpoint: Server endpoint url
 37 |     :param service: service requested from the tika server
 38 |                     Default is 'all', which results in recursive text content+metadata.
 39 |                     'meta' returns only metadata
 40 |                     'text' returns only content
 41 |     :param xmlContent: Whether or not XML content be requested.
 42 |                     Default is 'False', which results in text content.
 43 |     :param headers: Request headers to be sent to the tika reset server, should
 44 |                     be a dictionary. This is optional
 45 |     :return: dictionary having 'metadata' and 'content' keys.
 46 |             'content' has a str value and metadata has a dict type value.
 47 |     """
 48 |     if not xmlContent:
 49 |         output = parse1(
 50 |             service,
 51 |             filename,
 52 |             serverEndpoint,
 53 |             headers=headers,
 54 |             config_path=config_path,
 55 |             requestOptions=requestOptions,
 56 |         )
 57 |     else:
 58 |         output = parse1(
 59 |             service,
 60 |             filename,
 61 |             serverEndpoint,
 62 |             services={"meta": "/meta", "text": "/tika", "all": "/rmeta/xml"},
 63 |             headers=headers,
 64 |             config_path=config_path,
 65 |             requestOptions=requestOptions,
 66 |         )
 67 |     return _parse(output, service)
 68 | 
 69 | 
 70 | def from_buffer(
 71 |     string,
 72 |     serverEndpoint=ServerEndpoint,
 73 |     xmlContent=False,
 74 |     headers=None,
 75 |     config_path=None,
 76 |     requestOptions={},
 77 | ):
 78 |     """
 79 |     Parses the content from buffer
 80 |     :param string: Buffer value
 81 |     :param serverEndpoint: Server endpoint. This is optional
 82 |     :param xmlContent: Whether or not XML content be requested.
 83 |                     Default is 'False', which results in text content.
 84 |     :param headers: Request headers to be sent to the tika reset server, should
 85 |                     be a dictionary. This is optional
 86 |     :return:
 87 |     """
 88 |     headers = headers or {}
 89 |     headers.update({"Accept": "application/json"})
 90 | 
 91 |     if not xmlContent:
 92 |         status, response = callServer(
 93 |             "put",
 94 |             serverEndpoint,
 95 |             "/rmeta/text",
 96 |             string,
 97 |             headers,
 98 |             False,
 99 |             config_path=config_path,
100 |             requestOptions=requestOptions,
101 |         )
102 |     else:
103 |         status, response = callServer(
104 |             "put",
105 |             serverEndpoint,
106 |             "/rmeta/xml",
107 |             string,
108 |             headers,
109 |             False,
110 |             config_path=config_path,
111 |             requestOptions=requestOptions,
112 |         )
113 | 
114 |     return _parse((status, response))
115 | 
116 | 
117 | def _parse(output, service="all"):
118 |     """
119 |     Parses response from Tika REST API server
120 |     :param output: output from Tika Server
121 |     :param service: service requested from the tika server
122 |                     Default is 'all', which results in recursive text content+metadata.
123 |                     'meta' returns only metadata
124 |                     'text' returns only content
125 |     :return: a dictionary having 'metadata' and 'content' values
126 |     """
127 |     parsed = {"metadata": None, "content": None}
128 |     if not output:
129 |         return parsed
130 | 
131 |     parsed["status"] = output[0]
132 |     if output[1] == None or output[1] == "":
133 |         return parsed
134 | 
135 |     if service == "text":
136 |         parsed["content"] = output[1]
137 |         return parsed
138 | 
139 |     realJson = json.loads(output[1])
140 | 
141 |     parsed["metadata"] = {}
142 |     if service == "meta":
143 |         for key in realJson:
144 |             parsed["metadata"][key] = realJson[key]
145 |         return parsed
146 | 
147 |     content = ""
148 |     for js in realJson:
149 |         if "X-TIKA:content" in js:
150 |             content += js["X-TIKA:content"]
151 | 
152 |     if content == "":
153 |         content = None
154 | 
155 |     parsed["content"] = content
156 | 
157 |     for js in realJson:
158 |         for n in js:
159 |             if n != "X-TIKA:content":
160 |                 if n in parsed["metadata"]:
161 |                     if not isinstance(parsed["metadata"][n], list):
162 |                         parsed["metadata"][n] = [parsed["metadata"][n]]
163 |                     parsed["metadata"][n].append(js[n])
164 |                 else:
165 |                     parsed["metadata"][n] = js[n]
166 | 
167 |     return parsed
168 | 


--------------------------------------------------------------------------------
/iscc_cli/tika/tika.py:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env python
   2 | # encoding: utf-8
   3 | # Licensed to the Apache Software Foundation (ASF) under one or more
   4 | # contributor license agreements.  See the NOTICE file distributed with
   5 | # this work for additional information regarding copyright ownership.
   6 | # The ASF licenses this file to You under the Apache License, Version 2.0
   7 | # (the "License"); you may not use this file except in compliance with
   8 | # the License.  You may obtain a copy of the License at
   9 | #
  10 | #     http://www.apache.org/licenses/LICENSE-2.0
  11 | #
  12 | # Unless required by applicable law or agreed to in writing, software
  13 | # distributed under the License is distributed on an "AS IS" BASIS,
  14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 | # See the License for the specific language governing permissions and
  16 | # limitations under the License.
  17 | #
  18 | 
  19 | # Module documentation
  20 | """
  21 | Tika Python module provides Python API client to Apache Tika Server.
  22 | 
  23 | **Example usage**::
  24 | 
  25 |     import tika
  26 |     from tika import parser
  27 |     parsed = parser.from_file('/path/to/file')
  28 |     print(parsed["metadata"])
  29 |     print(parsed["content"])
  30 | 
  31 | Visit https://github.com/chrismattmann/tika-python to learn more about it.
  32 | 
  33 | **Detect IANA MIME Type**::
  34 | 
  35 |     from tika import detector
  36 |     print(detector.from_file('/path/to/file'))
  37 | 
  38 | **Detect Language**::
  39 | 
  40 |     from tika import language
  41 |     print(language.from_file('/path/to/file'))
  42 | 
  43 | **Use Tika Translate**::
  44 | 
  45 |    from tika import translate
   print(translate.from_file('/path/to/file', 'srcLang', 'destLang'))
   # Use auto Language detection feature
   print(translate.from_file('/path/to/file', 'destLang'))
  49 | 
  50 | ***Tika-Python Configuration***
  51 | You can now use custom configuration files. See https://tika.apache.org/1.18/configuring.html
  52 | for details on writing configuration files. Configuration is set the first time the server is started.
  53 | To use a configuration file with a parser, or detector:
  54 |     parsed = parser.from_file('/path/to/file', config_path='/path/to/configfile')
  55 | or:
  56 |     detected = detector.from_file('/path/to/file', config_path='/path/to/configfile')
  57 | or:
  58 |     detected = detector.from_buffer('some buffered content', config_path='/path/to/configfile')
  59 | 
  60 | """
  61 | import types
  62 | 
  63 | USAGE = """
  64 | tika.py [-v] [-e] [-o <outputDir>] [--server <TikaServerEndpoint>] [--install <UrlToTikaServerJar>] [--port <portNumber>] <command> <option> <urlOrPathToFile>
  65 | 
  66 | tika.py parse all test.pdf test2.pdf                   (write output JSON metadata files for test1.pdf_meta.json and test2.pdf_meta.json)
  67 | tika.py detect type test.pdf                           (returns mime-type as text/plain)
  68 | tika.py language file french.txt                       (returns language e.g., fr as text/plain)
  69 | tika.py translate fr:en french.txt                     (translates the file french.txt from french to english)
  70 | tika.py config mime-types                              (see what mime-types the Tika Server can handle)
  71 | 
  72 | A simple python and command-line client for Tika using the standalone Tika server (JAR file).
  73 | All commands return results in JSON format by default (except text in text/plain).
  74 | 
  75 | To parse docs, use:
  76 | tika.py parse <meta | text | all> <path>
  77 | 
  78 | To check the configuration of the Tika server, use:
  79 | tika.py config <mime-types | detectors | parsers>
  80 | 
  81 | Commands:
  82 |   parse  = parse the input file and write a JSON doc file.ext_meta.json containing the extracted metadata, text, or both
  83 |   detect type = parse the stream and 'detect' the MIME/media type, return in text/plain
  84 |   language file = parse the file stream and identify the language of the text, return its 2 character code in text/plain
  85 |   translate src:dest = parse and extract text and then translate the text from source language to destination language
  86 |   config = return a JSON doc describing the configuration of the Tika server (i.e. mime-types it
  87 |              can handle, or installed detectors or parsers)
  88 | 
  89 | Arguments:
  90 |   urlOrPathToFile = file to be parsed, if URL it will first be retrieved and then passed to Tika
  91 | 
  92 | Switches:
  93 |   --verbose, -v                  = verbose mode
  94 |   --encode, -e           = encode response in UTF-8
  95 |   --csv, -c    = report detect output in comma-delimited format
  96 |   --server <TikaServerEndpoint>  = use a remote Tika Server at this endpoint, otherwise use local server
  97 |   --install <UrlToTikaServerJar> = download and exec Tika Server (JAR file), starting server on default port 9998
  98 | 
  99 | Example usage as python client:
 100 | -- from tika import runCommand, parse1
 101 | -- jsonOutput = runCommand('parse', 'all', filename)
 102 |  or
 103 | -- jsonOutput = parse1('all', filename)
 104 | 
 105 | """
 106 | 
 107 | import sys, os, getopt, time, codecs, re
 108 | 
# Text/bytes type aliases that work on Python 2 and 3: ``unicode`` only exists
# on Python 2, so the NameError branch selects the Python 3 types.
try:
    unicode_string = unicode
    binary_string = str
except NameError:
    unicode_string = str
    binary_string = bytes
 115 | 
 116 | try:
 117 |     from urllib import urlretrieve
 118 | except ImportError:
 119 |     from urllib.request import urlretrieve
 120 | try:
 121 |     from urlparse import urlparse
 122 | except ImportError:
 123 |     from urllib.parse import urlparse as urlparse
 124 | 
# Build the Content-Disposition header with rfc6266 when that package can be
# imported; otherwise fall back to plain string formatting.
try:
    from rfc6266 import build_header

    def make_content_disposition_header(fn):
        """Return a Content-Disposition value for the base name of ``fn`` (rfc6266 based)."""
        return build_header(os.path.basename(fn)).decode("ascii")


except ImportError:

    def make_content_disposition_header(fn):
        """Return ``attachment; filename=<base name of fn>``.

        NOTE(review): the file name is inserted without quoting or escaping.
        """
        return "attachment; filename=%s" % os.path.basename(fn)
 136 | 
 137 | 
# On Python 2 replace the module-level ``open`` with ``codecs.open``
# (presumably so that ``open(..., encoding=...)`` calls work there too).
if sys.version_info[0] < 3:
    open = codecs.open
 140 | 
 141 | import requests
 142 | import socket
 143 | import tempfile
 144 | import hashlib
 145 | import platform
 146 | from subprocess import Popen
 147 | from subprocess import STDOUT
 148 | from os import walk
 149 | import signal
 150 | import logging
 151 | import io
 152 | import ctypes
 153 | 
# Log file location: directory from TIKA_LOG_PATH (default: the system temp
# directory), file name from TIKA_LOG_FILE (default: "tika.log").
log_path = os.getenv("TIKA_LOG_PATH", tempfile.gettempdir())
log_file = os.path.join(log_path, os.getenv("TIKA_LOG_FILE", "tika.log"))

logFormatter = logging.Formatter(
    "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s"
)
log = logging.getLogger("tika.tika")

# Handlers are attached unless TIKA_LOG_FILE is set to an empty string; note
# that this switch controls the console handler as well as the file handler.
if os.getenv("TIKA_LOG_FILE", "tika.log"):
    # File logs
    fileHandler = logging.FileHandler(log_file)
    fileHandler.setFormatter(logFormatter)
    log.addHandler(fileHandler)

    # Stdout logs
    # NOTE(review): StreamHandler() without arguments writes to sys.stderr.
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    log.addHandler(consoleHandler)

# Log level
log.setLevel(logging.INFO)
 175 | 
# True when platform.system() reports "Windows".
Windows = True if platform.system() == "Windows" else False
# Tika release used to build the default server JAR URL (override: TIKA_VERSION).
TikaVersion = os.getenv("TIKA_VERSION", "1.24")
# Directory taken from TIKA_PATH (default: the system temp directory).
TikaJarPath = os.getenv("TIKA_PATH", tempfile.gettempdir())
TikaFilesPath = tempfile.gettempdir()
# Same directory as the client log (see log_path).
TikaServerLogFilePath = log_path
# Location of the Tika server JAR (override: TIKA_SERVER_JAR).
# NOTE(review): the default URL uses plain http - consider https.
TikaServerJar = os.getenv(
    "TIKA_SERVER_JAR",
    "http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/"
    + TikaVersion
    + "/tika-server-"
    + TikaVersion
    + ".jar",
)
ServerHost = "localhost"
Port = "9998"
# Default endpoint is built from ServerHost and Port (override: TIKA_SERVER_ENDPOINT).
ServerEndpoint = os.getenv("TIKA_SERVER_ENDPOINT", "http://" + ServerHost + ":" + Port)
Translator = os.getenv(
    "TIKA_TRANSLATOR", "org.apache.tika.language.translate.Lingo24Translator"
)
# NOTE(review): os.getenv returns a string when the variable is set, so values
# such as "0" or "false" are truthy if this is only tested for truthiness.
TikaClientOnly = os.getenv("TIKA_CLIENT_ONLY", False)
TikaServerClasspath = os.getenv("TIKA_SERVER_CLASSPATH", "")
# Startup tuning, converted to float / int (defaults: 5 and 3).
TikaStartupSleep = float(os.getenv("TIKA_STARTUP_SLEEP", 5))
TikaStartupMaxRetry = int(os.getenv("TIKA_STARTUP_MAX_RETRY", 3))
# Java executable and extra arguments (overrides: TIKA_JAVA, TIKA_JAVA_ARGS).
TikaJava = os.getenv("TIKA_JAVA", "java")
TikaJavaArgs = os.getenv("TIKA_JAVA_ARGS", "")

# Defaults for the verbose / encode / csv switches described in USAGE.
Verbose = 0
EncodeUtf8 = 0
csvOutput = 0

# will be used later on to kill the process and free up ram
TikaServerProcess = False
 208 | 
 209 | 
 210 | class TikaException(Exception):
 211 |     pass
 212 | 
 213 | 
 214 | def echo2(*s):
 215 |     sys.stderr.write(
 216 |         unicode_string("tika.py: %s\n")
 217 |         % unicode_string(" ").join(map(unicode_string, s))
 218 |     )
 219 | 
 220 | 
 221 | def warn(*s):
 222 |     echo2("Warn:", *s)
 223 | 
 224 | 
 225 | def die(*s):
 226 |     warn("Error:", *s)
 227 |     echo2(USAGE)
 228 |     sys.exit()
 229 | 
 230 | 
 231 | def runCommand(
 232 |     cmd,
 233 |     option,
 234 |     urlOrPaths,
 235 |     port,
 236 |     outDir=None,
 237 |     serverHost=ServerHost,
 238 |     tikaServerJar=TikaServerJar,
 239 |     verbose=Verbose,
 240 |     encode=EncodeUtf8,
 241 | ):
 242 |     """
 243 |     Run the Tika command by calling the Tika server and return results in JSON format (or plain text).
 244 |     :param cmd: a command from set ``{'parse', 'detect', 'language', 'translate', 'config'}``
 245 |     :param option:
 246 |     :param urlOrPaths:
 247 |     :param port:
 248 |     :param outDir:
 249 |     :param serverHost:
 250 |     :param tikaServerJar:
 251 |     :param verbose:
 252 |     :param encode:
 253 |     :return: response for the command, usually a ``dict``
 254 |     """
 255 |     # import pdb; pdb.set_trace()
 256 |     if (cmd in "parse" or cmd in "detect") and (urlOrPaths == [] or urlOrPaths == None):
 257 |         log.exception("No URLs/paths specified.")
 258 |         raise TikaException("No URLs/paths specified.")
 259 |     serverEndpoint = "http://" + serverHost + ":" + port
 260 |     if cmd == "parse":
 261 |         return parseAndSave(
 262 |             option, urlOrPaths, outDir, serverEndpoint, verbose, tikaServerJar
 263 |         )
 264 |     elif cmd == "detect":
 265 |         return detectType(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
 266 |     elif cmd == "language":
 267 |         return detectLang(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
 268 |     elif cmd == "translate":
 269 |         return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
 270 |     elif cmd == "config":
 271 |         status, resp = getConfig(option, serverEndpoint, verbose, tikaServerJar)
 272 |         return resp
 273 |     else:
 274 |         log.exception("Bad args")
 275 |         raise TikaException("Bad args")
 276 | 
 277 | 
 278 | def getPaths(urlOrPaths):
 279 |     """
 280 |     Determines if the given URL in urlOrPaths is a URL or a file or directory. If it's
 281 |     a directory, it walks the directory and then finds all file paths in it, and ads them
 282 |     too. If it's a file, it adds it to the paths. If it's a URL it just adds it to the path.
 283 |     :param urlOrPaths: the url or path to be scanned
 284 |     :return: ``list`` of paths
 285 |     """
 286 |     if isinstance(urlOrPaths, unicode_string):
 287 |         urlOrPaths = [
 288 |             urlOrPaths
 289 |         ]  # do not recursively walk over letters of a single path which can include "/"
 290 |     paths = []
 291 |     for eachUrlOrPaths in urlOrPaths:
 292 |         if os.path.isdir(eachUrlOrPaths):
 293 |             for root, directories, filenames in walk(eachUrlOrPaths):
 294 |                 for filename in filenames:
 295 |                     paths.append(os.path.join(root, filename))
 296 |         else:
 297 |             paths.append(eachUrlOrPaths)
 298 |     return paths
 299 | 
 300 | 
 301 | def parseAndSave(
 302 |     option,
 303 |     urlOrPaths,
 304 |     outDir=None,
 305 |     serverEndpoint=ServerEndpoint,
 306 |     verbose=Verbose,
 307 |     tikaServerJar=TikaServerJar,
 308 |     responseMimeType="application/json",
 309 |     metaExtension="_meta.json",
 310 |     services={"meta": "/meta", "text": "/tika", "all": "/rmeta"},
 311 | ):
 312 |     """
 313 |     Parse the objects and write extracted metadata and/or text in JSON format to matching
 314 |     filename with an extension of '_meta.json'.
 315 |     :param option:
 316 |     :param urlOrPaths:
 317 |     :param outDir:
 318 |     :param serverEndpoint:
 319 |     :param verbose:
 320 |     :param tikaServerJar:
 321 |     :param responseMimeType:
 322 |     :param metaExtension:
 323 |     :param services:
 324 |     :return:
 325 |     """
 326 |     metaPaths = []
 327 |     paths = getPaths(urlOrPaths)
 328 |     for path in paths:
 329 |         if outDir is None:
 330 |             metaPath = path + metaExtension
 331 |         else:
 332 |             metaPath = os.path.join(outDir, os.path.split(path)[1] + metaExtension)
 333 |             log.info("Writing %s" % metaPath)
 334 |             with open(metaPath, "w", encoding="utf-8") as f:
 335 |                 f.write(
 336 |                     parse1(
 337 |                         option,
 338 |                         path,
 339 |                         serverEndpoint,
 340 |                         verbose,
 341 |                         tikaServerJar,
 342 |                         responseMimeType,
 343 |                         services,
 344 |                     )[1]
 345 |                     + u"\n"
 346 |                 )
 347 |         metaPaths.append(metaPath)
 348 |     return metaPaths
 349 | 
 350 | 
 351 | def parse(
 352 |     option,
 353 |     urlOrPaths,
 354 |     serverEndpoint=ServerEndpoint,
 355 |     verbose=Verbose,
 356 |     tikaServerJar=TikaServerJar,
 357 |     responseMimeType="application/json",
 358 |     services={"meta": "/meta", "text": "/tika", "all": "/rmeta"},
 359 |     rawResponse=False,
 360 | ):
 361 |     """
 362 |     Parse the objects and return extracted metadata and/or text in JSON format.
 363 |     :param option:
 364 |     :param urlOrPaths:
 365 |     :param serverEndpoint:
 366 |     :param verbose:
 367 |     :param tikaServerJar:
 368 |     :param responseMimeType:
 369 |     :param services:
 370 |     :return:
 371 |     """
 372 |     return [
 373 |         parse1(
 374 |             option,
 375 |             path,
 376 |             serverEndpoint,
 377 |             verbose,
 378 |             tikaServerJar,
 379 |             responseMimeType,
 380 |             services,
 381 |         )
 382 |         for path in urlOrPaths
 383 |     ]
 384 | 
 385 | 
 386 | def parse1(
 387 |     option,
 388 |     urlOrPath,
 389 |     serverEndpoint=ServerEndpoint,
 390 |     verbose=Verbose,
 391 |     tikaServerJar=TikaServerJar,
 392 |     responseMimeType="application/json",
 393 |     services={"meta": "/meta", "text": "/tika", "all": "/rmeta/text"},
 394 |     rawResponse=False,
 395 |     headers=None,
 396 |     config_path=None,
 397 |     requestOptions={},
 398 | ):
 399 |     """
 400 |     Parse the object and return extracted metadata and/or text in JSON format.
 401 |     :param option:
 402 |     :param urlOrPath:
 403 |     :param serverEndpoint:
 404 |     :param verbose:
 405 |     :param tikaServerJar:
 406 |     :param responseMimeType:
 407 |     :param services:
 408 |     :param rawResponse:
 409 |     :param headers:
 410 |     :return:
 411 |     """
 412 |     headers = headers or {}
 413 | 
 414 |     path, file_type = getRemoteFile(urlOrPath, TikaFilesPath)
 415 |     headers.update(
 416 |         {
 417 |             "Accept": responseMimeType,
 418 |             "Content-Disposition": make_content_disposition_header(
 419 |                 path.encode("utf-8") if type(path) is unicode_string else path
 420 |             ),
 421 |         }
 422 |     )
 423 | 
 424 |     if option not in services:
 425 |         log.warning("config option must be one of meta, text, or all; using all.")
 426 |     service = services.get(option, services["all"])
 427 |     if service == "/tika":
 428 |         responseMimeType = "text/plain"
 429 |     headers.update(
 430 |         {
 431 |             "Accept": responseMimeType,
 432 |             "Content-Disposition": make_content_disposition_header(
 433 |                 path.encode("utf-8") if type(path) is unicode_string else path
 434 |             ),
 435 |         }
 436 |     )
 437 |     with urlOrPath if _is_file_object(urlOrPath) else open(path, "rb") as f:
 438 |         status, response = callServer(
 439 |             "put",
 440 |             serverEndpoint,
 441 |             service,
 442 |             f,
 443 |             headers,
 444 |             verbose,
 445 |             tikaServerJar,
 446 |             config_path=config_path,
 447 |             rawResponse=rawResponse,
 448 |             requestOptions=requestOptions,
 449 |         )
 450 | 
 451 |     if file_type == "remote":
 452 |         os.unlink(path)
 453 |     return (status, response)
 454 | 
 455 | 
 456 | def detectLang(
 457 |     option,
 458 |     urlOrPaths,
 459 |     serverEndpoint=ServerEndpoint,
 460 |     verbose=Verbose,
 461 |     tikaServerJar=TikaServerJar,
 462 |     responseMimeType="text/plain",
 463 |     services={"file": "/language/stream"},
 464 | ):
 465 |     """
 466 |     Detect the language of the provided stream and return its 2 character code as text/plain.
 467 |     :param option:
 468 |     :param urlOrPaths:
 469 |     :param serverEndpoint:
 470 |     :param verbose:
 471 |     :param tikaServerJar:
 472 |     :param responseMimeType:
 473 |     :param services:
 474 |     :return:
 475 |     """
 476 |     paths = getPaths(urlOrPaths)
 477 |     return [
 478 |         detectLang1(
 479 |             option,
 480 |             path,
 481 |             serverEndpoint,
 482 |             verbose,
 483 |             tikaServerJar,
 484 |             responseMimeType,
 485 |             services,
 486 |         )
 487 |         for path in paths
 488 |     ]
 489 | 
 490 | 
 491 | def detectLang1(
 492 |     option,
 493 |     urlOrPath,
 494 |     serverEndpoint=ServerEndpoint,
 495 |     verbose=Verbose,
 496 |     tikaServerJar=TikaServerJar,
 497 |     responseMimeType="text/plain",
 498 |     services={"file": "/language/stream"},
 499 |     requestOptions={},
 500 | ):
 501 |     """
 502 |     Detect the language of the provided stream and return its 2 character code as text/plain.
 503 |     :param option:
 504 |     :param urlOrPath:
 505 |     :param serverEndpoint:
 506 |     :param verbose:
 507 |     :param tikaServerJar:
 508 |     :param responseMimeType:
 509 |     :param services:
 510 |     :return:
 511 |     """
 512 |     path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
 513 |     if option not in services:
 514 |         log.exception(
 515 |             "Language option must be one of %s " % binary_string(services.keys())
 516 |         )
 517 |         raise TikaException(
 518 |             "Language option must be one of %s " % binary_string(services.keys())
 519 |         )
 520 |     service = services[option]
 521 |     status, response = callServer(
 522 |         "put",
 523 |         serverEndpoint,
 524 |         service,
 525 |         open(path, "rb"),
 526 |         {"Accept": responseMimeType},
 527 |         verbose,
 528 |         tikaServerJar,
 529 |         requestOptions=requestOptions,
 530 |     )
 531 |     return (status, response)
 532 | 
 533 | 
 534 | def doTranslate(
 535 |     option,
 536 |     urlOrPaths,
 537 |     serverEndpoint=ServerEndpoint,
 538 |     verbose=Verbose,
 539 |     tikaServerJar=TikaServerJar,
 540 |     responseMimeType="text/plain",
 541 |     services={"all": "/translate/all"},
 542 | ):
 543 |     """
 544 |     Translate the file from source language to destination language.
 545 |     :param option:
 546 |     :param urlOrPaths:
 547 |     :param serverEndpoint:
 548 |     :param verbose:
 549 |     :param tikaServerJar:
 550 |     :param responseMimeType:
 551 |     :param services:
 552 |     :return:
 553 |     """
 554 |     paths = getPaths(urlOrPaths)
 555 |     return [
 556 |         doTranslate1(
 557 |             option,
 558 |             path,
 559 |             serverEndpoint,
 560 |             verbose,
 561 |             tikaServerJar,
 562 |             responseMimeType,
 563 |             services,
 564 |         )
 565 |         for path in paths
 566 |     ]
 567 | 
 568 | 
 569 | def doTranslate1(
 570 |     option,
 571 |     urlOrPath,
 572 |     serverEndpoint=ServerEndpoint,
 573 |     verbose=Verbose,
 574 |     tikaServerJar=TikaServerJar,
 575 |     responseMimeType="text/plain",
 576 |     services={"all": "/translate/all"},
 577 |     requestOptions={},
 578 | ):
 579 |     """
 580 | 
 581 |     :param option:
 582 |     :param urlOrPath:
 583 |     :param serverEndpoint:
 584 |     :param verbose:
 585 |     :param tikaServerJar:
 586 |     :param responseMimeType:
 587 |     :param services:
 588 |     :return:
 589 |     """
 590 |     path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
 591 |     srcLang = ""
 592 |     destLang = ""
 593 | 
 594 |     if ":" in option:
 595 |         options = option.rsplit(":")
 596 |         srcLang = options[0]
 597 |         destLang = options[1]
 598 |         if len(options) != 2:
 599 |             log.exception(
 600 |                 "Translate options are specified as srcLang:destLang or as destLang"
 601 |             )
 602 |             raise TikaException(
 603 |                 "Translate options are specified as srcLang:destLang or as destLang"
 604 |             )
 605 |     else:
 606 |         destLang = option
 607 | 
 608 |     if srcLang != "" and destLang != "":
 609 |         service = services["all"] + "/" + Translator + "/" + srcLang + "/" + destLang
 610 |     else:
 611 |         service = services["all"] + "/" + Translator + "/" + destLang
 612 |     status, response = callServer(
 613 |         "put",
 614 |         serverEndpoint,
 615 |         service,
 616 |         open(path, "rb"),
 617 |         {"Accept": responseMimeType},
 618 |         verbose,
 619 |         tikaServerJar,
 620 |         requestOptions=requestOptions,
 621 |     )
 622 |     return (status, response)
 623 | 
 624 | 
 625 | def detectType(
 626 |     option,
 627 |     urlOrPaths,
 628 |     serverEndpoint=ServerEndpoint,
 629 |     verbose=Verbose,
 630 |     tikaServerJar=TikaServerJar,
 631 |     responseMimeType="text/plain",
 632 |     services={"type": "/detect/stream"},
 633 | ):
 634 |     """
 635 |     Detect the MIME/media type of the stream and return it in text/plain.
 636 |     :param option:
 637 |     :param urlOrPaths:
 638 |     :param serverEndpoint:
 639 |     :param verbose:
 640 |     :param tikaServerJar:
 641 |     :param responseMimeType:
 642 |     :param services:
 643 |     :return:
 644 |     """
 645 |     paths = getPaths(urlOrPaths)
 646 |     return [
 647 |         detectType1(
 648 |             option,
 649 |             path,
 650 |             serverEndpoint,
 651 |             verbose,
 652 |             tikaServerJar,
 653 |             responseMimeType,
 654 |             services,
 655 |         )
 656 |         for path in paths
 657 |     ]
 658 | 
 659 | 
 660 | def detectType1(
 661 |     option,
 662 |     urlOrPath,
 663 |     serverEndpoint=ServerEndpoint,
 664 |     verbose=Verbose,
 665 |     tikaServerJar=TikaServerJar,
 666 |     responseMimeType="text/plain",
 667 |     services={"type": "/detect/stream"},
 668 |     config_path=None,
 669 |     requestOptions={},
 670 | ):
 671 |     """
 672 |     Detect the MIME/media type of the stream and return it in text/plain.
 673 |     :param option:
 674 |     :param urlOrPath:
 675 |     :param serverEndpoint:
 676 |     :param verbose:
 677 |     :param tikaServerJar:
 678 |     :param responseMimeType:
 679 |     :param services:
 680 |     :return:
 681 |     """
 682 |     path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
 683 |     if option not in services:
 684 |         log.exception(
 685 |             "Detect option must be one of %s" % binary_string(services.keys())
 686 |         )
 687 |         raise TikaException(
 688 |             "Detect option must be one of %s" % binary_string(services.keys())
 689 |         )
 690 |     service = services[option]
 691 |     status, response = callServer(
 692 |         "put",
 693 |         serverEndpoint,
 694 |         service,
 695 |         open(path, "rb"),
 696 |         {
 697 |             "Accept": responseMimeType,
 698 |             "Content-Disposition": make_content_disposition_header(
 699 |                 path.encode("utf-8") if type(path) is unicode_string else path
 700 |             ),
 701 |         },
 702 |         verbose,
 703 |         tikaServerJar,
 704 |         config_path=config_path,
 705 |         requestOptions=requestOptions,
 706 |     )
 707 |     if csvOutput == 1:
 708 |         return (status, urlOrPath.decode("UTF-8") + "," + response)
 709 |     else:
 710 |         return (status, response)
 711 | 
 712 | 
 713 | def getConfig(
 714 |     option,
 715 |     serverEndpoint=ServerEndpoint,
 716 |     verbose=Verbose,
 717 |     tikaServerJar=TikaServerJar,
 718 |     responseMimeType="application/json",
 719 |     services={
 720 |         "mime-types": "/mime-types",
 721 |         "detectors": "/detectors",
 722 |         "parsers": "/parsers/details",
 723 |     },
 724 |     requestOptions={},
 725 | ):
 726 |     """
 727 |     Get the configuration of the Tika Server (parsers, detectors, etc.) and return it in JSON format.
 728 |     :param option:
 729 |     :param serverEndpoint:
 730 |     :param verbose:
 731 |     :param tikaServerJar:
 732 |     :param responseMimeType:
 733 |     :param services:
 734 |     :return:
 735 |     """
 736 |     if option not in services:
 737 |         die("config option must be one of mime-types, detectors, or parsers")
 738 |     service = services[option]
 739 |     status, response = callServer(
 740 |         "get",
 741 |         serverEndpoint,
 742 |         service,
 743 |         None,
 744 |         {"Accept": responseMimeType},
 745 |         verbose,
 746 |         tikaServerJar,
 747 |         requestOptions=requestOptions,
 748 |     )
 749 |     return (status, response)
 750 | 
 751 | 
 752 | def callServer(
 753 |     verb,
 754 |     serverEndpoint,
 755 |     service,
 756 |     data,
 757 |     headers,
 758 |     verbose=Verbose,
 759 |     tikaServerJar=TikaServerJar,
 760 |     httpVerbs={"get": requests.get, "put": requests.put, "post": requests.post},
 761 |     classpath=None,
 762 |     rawResponse=False,
 763 |     config_path=None,
 764 |     requestOptions={},
 765 | ):
 766 |     """
 767 |     Call the Tika Server, do some error checking, and return the response.
 768 |     :param verb:
 769 |     :param serverEndpoint:
 770 |     :param service:
 771 |     :param data:
 772 |     :param headers:
 773 |     :param verbose:
 774 |     :param tikaServerJar:
 775 |     :param httpVerbs:
 776 |     :param classpath:
 777 |     :return:
 778 |     """
 779 |     parsedUrl = urlparse(serverEndpoint)
 780 |     serverHost = parsedUrl.hostname
 781 |     scheme = parsedUrl.scheme
 782 | 
 783 |     port = parsedUrl.port
 784 |     if classpath is None:
 785 |         classpath = TikaServerClasspath
 786 | 
 787 |     global TikaClientOnly
 788 |     if not TikaClientOnly:
 789 |         serverEndpoint = checkTikaServer(
 790 |             scheme, serverHost, port, tikaServerJar, classpath, config_path
 791 |         )
 792 | 
 793 |     serviceUrl = serverEndpoint + service
 794 |     if verb not in httpVerbs:
 795 |         log.exception(
 796 |             "Tika Server call must be one of %s" % binary_string(httpVerbs.keys())
 797 |         )
 798 |         raise TikaException(
 799 |             "Tika Server call must be one of %s" % binary_string(httpVerbs.keys())
 800 |         )
 801 |     verbFn = httpVerbs[verb]
 802 | 
 803 |     if Windows and hasattr(data, "read"):
 804 |         data = data.read()
 805 | 
 806 |     encodedData = data
 807 |     if type(data) is unicode_string:
 808 |         encodedData = data.encode("utf-8")
 809 | 
 810 |     requestOptionsDefault = {"timeout": 60, "headers": headers, "verify": False}
 811 |     effectiveRequestOptions = requestOptionsDefault.copy()
 812 |     effectiveRequestOptions.update(requestOptions)
 813 | 
 814 |     resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions)
 815 | 
 816 |     if verbose:
 817 |         print(sys.stderr, "Request headers: ", headers)
 818 |         print(sys.stderr, "Response headers: ", resp.headers)
 819 |     if resp.status_code != 200:
 820 |         log.warning("Tika server returned status: %d", resp.status_code)
 821 | 
 822 |     resp.encoding = "utf-8"
 823 |     if rawResponse:
 824 |         return (resp.status_code, resp.content)
 825 |     else:
 826 |         return (resp.status_code, resp.text)
 827 | 
 828 | 
 829 | def checkTikaServer(
 830 |     scheme="http",
 831 |     serverHost=ServerHost,
 832 |     port=Port,
 833 |     tikaServerJar=TikaServerJar,
 834 |     classpath=None,
 835 |     config_path=None,
 836 | ):
 837 |     """
 838 |     Check that tika-server is running.  If not, download JAR file and start it up.
 839 |     :param scheme: e.g. http or https
 840 |     :param serverHost:
 841 |     :param port:
 842 |     :param tikaServerJar:
 843 |     :param classpath:
 844 |     :return:
 845 |     """
 846 |     if classpath is None:
 847 |         classpath = TikaServerClasspath
 848 |     if port is None:
 849 |         port = "443" if scheme == "https" else "80"
 850 | 
 851 |     urlp = urlparse(tikaServerJar)
 852 |     serverEndpoint = "%s://%s:%s" % (scheme, serverHost, port)
 853 |     jarPath = os.path.join(TikaJarPath, "tika-server.jar")
 854 |     if "localhost" in serverEndpoint or "127.0.0.1" in serverEndpoint:
 855 |         alreadyRunning = checkPortIsOpen(serverHost, port)
 856 | 
 857 |         if not alreadyRunning:
 858 |             if not os.path.isfile(jarPath) and urlp.scheme != "":
 859 |                 getRemoteJar(tikaServerJar, jarPath)
 860 | 
 861 |             if not checkJarSig(tikaServerJar, jarPath):
 862 |                 os.remove(jarPath)
 863 |                 tikaServerJar = getRemoteJar(tikaServerJar, jarPath)
 864 | 
 865 |             status = startServer(
 866 |                 jarPath,
 867 |                 TikaJava,
 868 |                 TikaJavaArgs,
 869 |                 serverHost,
 870 |                 port,
 871 |                 classpath,
 872 |                 config_path,
 873 |             )
 874 |             if not status:
 875 |                 log.error("Failed to receive startup confirmation from startServer.")
 876 |                 raise RuntimeError("Unable to start Tika server.")
 877 |     return serverEndpoint
 878 | 
 879 | 
 880 | def checkJarSig(tikaServerJar, jarPath):
 881 |     """
 882 |     Checks the signature of Jar
 883 |     :param tikaServerJar:
 884 |     :param jarPath:
 885 |     :return: ``True`` if the signature of the jar matches
 886 |     """
 887 |     if not os.path.isfile(jarPath + ".md5"):
 888 |         getRemoteJar(tikaServerJar + ".md5", jarPath + ".md5")
 889 |     m = hashlib.md5()
 890 |     with open(jarPath, "rb") as f:
 891 |         binContents = f.read()
 892 |         m.update(binContents)
 893 |         with open(jarPath + ".md5", "r") as em:
 894 |             existingContents = em.read()
 895 |             return existingContents == m.hexdigest()
 896 | 
 897 | 
 898 | def startServer(
 899 |     tikaServerJar,
 900 |     java_path=TikaJava,
 901 |     java_args=TikaJavaArgs,
 902 |     serverHost=ServerHost,
 903 |     port=Port,
 904 |     classpath=None,
 905 |     config_path=None,
 906 | ):
 907 |     """
 908 |     Starts Tika Server
 909 |     :param tikaServerJar: path to tika server jar
 910 |     :param serverHost: the host interface address to be used for binding the service
 911 |     :param port: the host port to be used for binding the service
 912 |     :param classpath: Class path value to pass to JVM
 913 |     :return: None
 914 |     """
 915 |     if classpath is None:
 916 |         classpath = TikaServerClasspath
 917 | 
 918 |     host = "localhost"
 919 |     if Windows:
 920 |         host = "0.0.0.0"
 921 | 
 922 |     if classpath:
 923 |         classpath += ":" + tikaServerJar
 924 |     else:
 925 |         classpath = tikaServerJar
 926 | 
 927 |     # setup command string
 928 |     cmd_string = ""
 929 |     if not config_path:
 930 |         cmd_string = (
 931 |             '%s %s -cp "%s" org.apache.tika.server.TikaServerCli --port %s --host %s &'
 932 |             % (java_path, java_args, classpath, port, host)
 933 |         )
 934 |     else:
 935 |         cmd_string = (
 936 |             '%s %s -cp "%s" org.apache.tika.server.TikaServerCli --port %s --host %s --config %s &'
 937 |             % (java_path, java_args, classpath, port, host, config_path)
 938 |         )
 939 | 
 940 |     # Check that we can write to log path
 941 |     try:
 942 |         tika_log_file_path = os.path.join(TikaServerLogFilePath, "tika-server.log")
 943 |         logFile = open(tika_log_file_path, "w")
 944 |     except PermissionError as e:
 945 |         log.error(
 946 |             "Unable to create tika-server.log at %s due to permission error."
 947 |             % (TikaServerLogFilePath)
 948 |         )
 949 |         return False
 950 | 
 951 |     # Check that specified java binary is available on path
 952 |     try:
 953 |         _ = Popen(java_path, stdout=open(os.devnull, "w"), stderr=open(os.devnull, "w"))
 954 |     except FileNotFoundError as e:
 955 |         log.error("Unable to run java; is it installed?")
 956 |         return False
 957 | 
 958 |     # Run java with jar args
 959 |     global TikaServerProcess
 960 |     # Patch for Windows support
 961 |     if Windows:
 962 |         if sys.version.startswith("2"):
 963 |             # Python 2.x
 964 |             TikaServerProcess = Popen(
 965 |                 cmd_string, stdout=logFile, stderr=STDOUT, shell=True
 966 |             )
 967 |         elif sys.version.startswith("3"):
 968 |             # Python 3.x
 969 |             TikaServerProcess = Popen(
 970 |                 cmd_string,
 971 |                 stdout=logFile,
 972 |                 stderr=STDOUT,
 973 |                 shell=True,
 974 |                 start_new_session=True,
 975 |             )
 976 |     else:
 977 |         TikaServerProcess = Popen(
 978 |             cmd_string, stdout=logFile, stderr=STDOUT, shell=True, preexec_fn=os.setsid
 979 |         )
 980 | 
 981 |     # Check logs and retry as configured
 982 |     try_count = 0
 983 |     is_started = False
 984 |     while try_count < TikaStartupMaxRetry:
 985 |         with open(tika_log_file_path, "r") as tika_log_file_tmp:
 986 |             # check for INFO string to confirm listening endpoint
 987 |             if "Started Apache Tika server at" in tika_log_file_tmp.read():
 988 |                 is_started = True
 989 |             else:
 990 |                 log.warning("Failed to see startup log message; retrying...")
 991 |         time.sleep(TikaStartupSleep)
 992 |         try_count += 1
 993 | 
 994 |     if not is_started:
 995 |         log.error(
 996 |             "Tika startup log message not received after %d tries."
 997 |             % (TikaStartupMaxRetry)
 998 |         )
 999 |         return False
1000 |     else:
1001 |         return True
1002 | 
1003 | 
def killServer():
    """
    Kills the tika server started by the current execution instance.

    Logs an error and does nothing else when ``TikaServerProcess`` is not set.
    """
    if not TikaServerProcess:
        log.error("Server not running, or was already running before")
        return

    # The process-group kill is attempted only on the non-Windows branch:
    # running it unconditionally first logged a spurious failure on Windows
    # and signalled the group twice elsewhere.
    if Windows:
        # patch to support subprocess killing for windows
        if sys.version.startswith("2"):
            # Python 2.x
            PROCESS_TERMINATE = 1
            handle = ctypes.windll.kernel32.OpenProcess(
                PROCESS_TERMINATE, False, TikaServerProcess.pid
            )
            ctypes.windll.kernel32.TerminateProcess(handle, -1)
            ctypes.windll.kernel32.CloseHandle(handle)
            time.sleep(1)
        elif sys.version.startswith("3"):
            # Python 3.x
            os.kill(TikaServerProcess.pid, signal.SIGTERM)
            time.sleep(1)
    else:
        try:
            os.killpg(os.getpgid(TikaServerProcess.pid), signal.SIGTERM)
        except Exception:
            # Best effort: report the failure but let KeyboardInterrupt and
            # SystemExit propagate.
            log.error("Failed to kill the current server session")
        time.sleep(1)
1037 | 
1038 | 
def toFilename(url):
    """
    Derive a file name from the path component of ``url``.

    Characters other than word characters, whitespace, dots and dashes become
    dashes, the result is lower-cased, runs of dashes/whitespace collapse to
    one dash, and only the last 200 characters are kept. A URL without a path
    yields a name built from the current timestamp.
    :param url: URL to convert
    :return: sanitized file name
    """
    raw = urlparse(url).path or "file_{}".format(int(time.time()))
    sanitized = re.sub(r"[^\w\s\.\-]", "-", raw)
    sanitized = sanitized.strip().lower()
    collapsed = re.sub(r"[-\s]+", "-", sanitized)
    return collapsed.strip("-")[-200:]
1049 | 
1050 | 
1051 | def _is_file_object(f):
1052 |     try:
1053 |         file_types = (types.FileType, io.IOBase)
1054 |     except AttributeError:
1055 |         file_types = (io.IOBase,)
1056 | 
1057 |     return isinstance(f, file_types)
1058 | 
1059 | 
def getRemoteFile(urlOrPath, destPath):
    """
    Resolve an input to a local path, downloading it when it is an http(s) URL.
    :param urlOrPath: file object, URL or path of the resource
    :param destPath: directory in which downloaded resources are stored
    :return: tuple ``(path, kind)`` with kind ``'binary'`` for file objects,
        ``'local'`` for paths and non-http(s) schemes, ``'remote'`` for
        downloads
    """
    # File objects are identified by their name and never fetched.
    if _is_file_object(urlOrPath):
        return (urlOrPath.name, "binary")

    scheme = urlparse(urlOrPath).scheme
    if scheme == "":
        return (os.path.abspath(urlOrPath), "local")
    if scheme not in ("http", "https"):
        return (urlOrPath, "local")

    target = destPath + "/" + toFilename(urlOrPath)
    log.info("Retrieving %s to %s." % (urlOrPath, target))
    try:
        urlretrieve(urlOrPath, target)
    except IOError:
        # monkey patch fix for SSL/Windows per Tika-Python #54
        # https://github.com/chrismattmann/tika-python/issues/54
        # NOTE(review): this replaces the process-wide default HTTPS context
        # with an unverified one, i.e. certificate checks stay disabled for
        # later HTTPS requests made through that default as well.
        import ssl

        if hasattr(ssl, "_create_unverified_context"):
            ssl._create_default_https_context = ssl._create_unverified_context
        # drop any partial download before the second attempt
        if os.path.exists(target) and os.path.isfile(target):
            os.remove(target)
        urlretrieve(urlOrPath, target)
    return (target, "remote")
1094 | 
1095 | 
def getRemoteJar(urlOrPath, destPath):
    """
    Download ``urlOrPath`` to ``destPath`` unless it is a plain local path.
    :param urlOrPath: remote resource locator
    :param destPath: file path receiving the download
    :return: tuple ``(path, 'local')`` with the absolute input path when the
        locator has no scheme, otherwise ``(destPath, 'remote')``
    """
    if urlparse(urlOrPath).scheme == "":
        return (os.path.abspath(urlOrPath), "local")

    log.info("Retrieving %s to %s." % (urlOrPath, destPath))
    try:
        urlretrieve(urlOrPath, destPath)
    except IOError:
        # monkey patch fix for SSL/Windows per Tika-Python #54
        # https://github.com/chrismattmann/tika-python/issues/54
        # NOTE(review): this replaces the process-wide default HTTPS context
        # with an unverified one, i.e. certificate checks stay disabled for
        # later HTTPS requests made through that default as well.
        import ssl

        if hasattr(ssl, "_create_unverified_context"):
            ssl._create_default_https_context = ssl._create_unverified_context
        # drop any partial download before the second attempt
        if os.path.exists(destPath) and os.path.isfile(destPath):
            os.remove(destPath)
        urlretrieve(urlOrPath, destPath)
    return (destPath, "remote")
1123 | 
1124 | 
def checkPortIsOpen(remoteServerHost=ServerHost, port=Port):
    """
    Checks if the specified port is open
    :param remoteServerHost: the host address
    :param port: port which needs to be checked
    :return: ``True`` if port is open, ``False`` otherwise
    """
    remoteServerIP = socket.gethostbyname(remoteServerHost)
    # Bound up front so the cleanup below is safe even when creating the
    # socket itself fails.
    sock = None
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # connect_ex returns 0 on success and an error code otherwise.
        return sock.connect_ex((remoteServerIP, int(port))) == 0

    except KeyboardInterrupt:
        print("You pressed Ctrl+C")
        sys.exit()

    except socket.gaierror:
        print("Hostname could not be resolved. Exiting")
        sys.exit()

    except socket.error:
        print("Couldn't connect to server")
        sys.exit()

    finally:
        if sock is not None:
            sock.close()
1155 | 
1156 | 
def main(argv=None):
    """
    Run Tika from command line according to USAGE.
    :param argv: full argument vector including the program name; defaults
        to ``sys.argv``
    :return: whatever ``runCommand`` returns for the selected command
    :raises TikaException: on unknown options or missing positional arguments
    """
    global Verbose
    global EncodeUtf8
    global csvOutput
    if argv is None:
        argv = sys.argv

    if len(argv) < 3 and not (("-h" in argv) or ("--help" in argv)):
        # log.error, not log.exception: no exception is being handled here.
        log.error("Bad args")
        raise TikaException("Bad args")
    try:
        # -v, -e and -c are flags, matching their long forms which take no
        # value; -i, -s, -o and -p expect one.
        opts, argv = getopt.getopt(
            argv[1:],
            "hi:s:o:p:vec",
            [
                "help",
                "install=",
                "server=",
                "output=",
                "port=",
                "verbose",
                "encode",
                "csv",
            ],
        )
    except getopt.GetoptError as opt_error:
        # GetoptError is not iterable; read its documented attributes.
        msg, bad_opt = opt_error.msg, opt_error.opt
        log.exception("%s error: Bad option: %s, %s" % (argv[0], bad_opt, msg))
        raise TikaException("%s error: Bad option: %s, %s" % (argv[0], bad_opt, msg))

    tikaServerJar = TikaServerJar
    serverHost = ServerHost
    outDir = "."
    port = Port
    for opt, val in opts:
        # Tuples make these membership tests; a bare parenthesised string
        # would turn them into substring tests.
        if opt in ("-h", "--help"):
            echo2(USAGE)
            sys.exit()
        elif opt in ("-i", "--install"):
            tikaServerJar = val
        elif opt in ("-s", "--server"):
            serverHost = val
        elif opt in ("-o", "--output"):
            outDir = val
        elif opt in ("-p", "--port"):
            port = val
        elif opt in ("-v", "--verbose"):
            Verbose = 1
        elif opt in ("-e", "--encode"):
            EncodeUtf8 = 1
        elif opt in ("-c", "--csv"):
            csvOutput = 1
        else:
            raise TikaException(USAGE)

    # After option parsing argv holds only the positional arguments:
    # <command> <option> [urlOrPath ...]
    if len(argv) < 2:
        log.error("Bad args")
        raise TikaException("Bad args")
    cmd = argv[0]
    option = argv[1]
    paths = argv[2:]
    return runCommand(
        cmd,
        option,
        paths,
        port,
        outDir,
        serverHost=serverHost,
        tikaServerJar=tikaServerJar,
        verbose=Verbose,
        encode=EncodeUtf8,
    )
1230 | 
1231 | 
if __name__ == "__main__":
    log.info("Logging on '%s'" % (log_file))
    resp = main(sys.argv)

    # Wrap stdout in a UTF-8 encoding writer. Python 2 exposes a byte stream
    # directly, Python 3 exposes it as `sys.stdout.buffer`.
    utf8_writer = codecs.getwriter("UTF-8")
    if sys.version.startswith("2"):
        out = utf8_writer(sys.stdout)
    elif sys.version.startswith("3"):
        out = utf8_writer(sys.stdout.buffer)

    # A list response is emitted as the second element of each entry,
    # one entry per line; anything else is written as-is.
    if type(resp) == list:
        text = "\n".join([r[1] for r in resp])
    else:
        text = resp
    out.write(text)
    out.write("\n")
1249 | 


--------------------------------------------------------------------------------
/iscc_cli/tika/translate.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | # Licensed to the Apache Software Foundation (ASF) under one or more
  4 | # contributor license agreements.  See the NOTICE file distributed with
  5 | # this work for additional information regarding copyright ownership.
  6 | # The ASF licenses this file to You under the Apache License, Version 2.0
  7 | # (the "License"); you may not use this file except in compliance with
  8 | # the License.  You may obtain a copy of the License at
  9 | #
 10 | #     http://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | #
 18 | 
 19 | from .tika import doTranslate1, callServer, Translator, ServerEndpoint
 20 | 
 21 | 
 22 | def from_file(
 23 |     filename, srcLang, destLang, serverEndpoint=ServerEndpoint, requestOptions={}
 24 | ):
 25 |     """
 26 |     Traslates the content of source file to destination language
 27 |     :param filename: file whose contents needs translation
 28 |     :param srcLang: name of language of input file
 29 |     :param destLang: name of language of desired language
 30 |     :param serverEndpoint: Tika server end point (Optional)
 31 |     :return: translated content
 32 |     """
 33 |     jsonOutput = doTranslate1(
 34 |         srcLang + ":" + destLang,
 35 |         filename,
 36 |         serverEndpoint,
 37 |         requestOptions=requestOptions,
 38 |     )
 39 |     return jsonOutput[1]
 40 | 
 41 | 
 42 | def from_buffer(
 43 |     string, srcLang, destLang, serverEndpoint=ServerEndpoint, requestOptions={}
 44 | ):
 45 |     """
 46 |     Translates content from source language to desired destination language
 47 |     :param string: input content which needs translation
 48 |     :param srcLang: name of language of the input content
 49 |     :param destLang: name of the desired language for translation
 50 |     :param serverEndpoint:
 51 |     :return:
 52 |     """
 53 |     status, response = callServer(
 54 |         "put",
 55 |         ServerEndpoint,
 56 |         "/translate/all/" + Translator + "/" + srcLang + "/" + destLang,
 57 |         string,
 58 |         {"Accept": "text/plain"},
 59 |         False,
 60 |         requestOptions=requestOptions,
 61 |     )
 62 |     return response
 63 | 
 64 | 
 65 | def auto_from_file(
 66 |     filename, destLang, serverEndpoint=ServerEndpoint, requestOptions={}
 67 | ):
 68 |     """
 69 |     Translates contents of a file to desired language by auto detecting the source language
 70 |     :param filename: file whose contents needs translation
 71 |     :param destLang: name of the desired language for translation
 72 |     :param serverEndpoint: Tika server end point (Optional)
 73 |     :return:
 74 |     """
 75 |     jsonOutput = doTranslate1(
 76 |         destLang, filename, serverEndpoint, requestOptions=requestOptions
 77 |     )
 78 |     return jsonOutput[1]
 79 | 
 80 | 
 81 | def auto_from_buffer(
 82 |     string, destLang, serverEndpoint=ServerEndpoint, requestOptions={}
 83 | ):
 84 |     """
 85 |     Translates content to desired language by auto detecting the source language
 86 |     :param string: input content which needs translation
 87 |     :param destLang: name of the desired language for translation
 88 |     :param serverEndpoint: Tika server end point (Optional)
 89 |     :return:
 90 |     """
 91 |     status, response = callServer(
 92 |         "put",
 93 |         ServerEndpoint,
 94 |         "/translate/all/" + Translator + "/" + destLang,
 95 |         string,
 96 |         {"Accept": "text/plain"},
 97 |         False,
 98 |         requestOptions=requestOptions,
 99 |     )
100 |     return response
101 | 


--------------------------------------------------------------------------------
/iscc_cli/tika/unpack.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | # Licensed to the Apache Software Foundation (ASF) under one or more
  4 | # contributor license agreements.  See the NOTICE file distributed with
  5 | # this work for additional information regarding copyright ownership.
  6 | # The ASF licenses this file to You under the Apache License, Version 2.0
  7 | # (the "License"); you may not use this file except in compliance with
  8 | # the License.  You may obtain a copy of the License at
  9 | #
 10 | #     http://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | #
 18 | 
 19 | from .tika import parse1, callServer, ServerEndpoint
 20 | import tarfile
 21 | from io import BytesIO, TextIOWrapper
 22 | import csv
 23 | from sys import version_info
 24 | 
 25 | # Python 3 introduced .readable() to tarfile extracted files objects - this
 26 | # is required to wrap a TextIOWrapper around the object. However, wrapping
 27 | # with TextIOWrapper is only required for csv.reader() in Python 3, so the
 28 | # tarfile returned object can be used as is in earlier versions.
 29 | _text_wrapper = TextIOWrapper if version_info.major >= 3 else lambda x: x
 30 | 
 31 | 
 32 | def from_file(filename, serverEndpoint=ServerEndpoint, requestOptions={}):
 33 |     """
 34 |     Parse from file
 35 |     :param filename: file
 36 |     :param serverEndpoint: Tika server end point (optional)
 37 |     :return:
 38 |     """
 39 |     tarOutput = parse1(
 40 |         "unpack",
 41 |         filename,
 42 |         serverEndpoint,
 43 |         responseMimeType="application/x-tar",
 44 |         services={
 45 |             "meta": "/meta",
 46 |             "text": "/tika",
 47 |             "all": "/rmeta/xml",
 48 |             "unpack": "/unpack/all",
 49 |         },
 50 |         rawResponse=True,
 51 |         requestOptions=requestOptions,
 52 |     )
 53 |     return _parse(tarOutput)
 54 | 
 55 | 
 56 | def from_buffer(string, serverEndpoint=ServerEndpoint, requestOptions={}):
 57 |     """
 58 |     Parse from buffered content
 59 |     :param string:  buffered content
 60 |     :param serverEndpoint: Tika server URL (Optional)
 61 |     :return: parsed content
 62 |     """
 63 |     status, response = callServer(
 64 |         "put",
 65 |         serverEndpoint,
 66 |         "/unpack/all",
 67 |         string,
 68 |         {"Accept": "application/x-tar"},
 69 |         False,
 70 |         rawResponse=True,
 71 |         requestOptions=requestOptions,
 72 |     )
 73 | 
 74 |     return _parse((status, response))
 75 | 
 76 | 
 77 | def _parse(tarOutput):
 78 |     parsed = {}
 79 |     if not tarOutput:
 80 |         return parsed
 81 |     elif tarOutput[1] is None or tarOutput[1] == b"":
 82 |         return parsed
 83 | 
 84 |     with tarfile.open(fileobj=BytesIO(tarOutput[1])) as tarFile:
 85 |         # get the member names
 86 |         memberNames = list(tarFile.getnames())
 87 | 
 88 |         # extract the metadata
 89 |         metadata = {}
 90 |         if "__METADATA__" in memberNames:
 91 |             memberNames.remove("__METADATA__")
 92 | 
 93 |         metadataMember = tarFile.getmember("__METADATA__")
 94 |         if not metadataMember.issym() and metadataMember.isfile():
 95 |             with _text_wrapper(tarFile.extractfile(metadataMember)) as metadataFile:
 96 |                 metadataReader = csv.reader(_truncate_nulls(metadataFile))
 97 |                 for metadataLine in metadataReader:
 98 |                     # each metadata line comes as a key-value pair, with list values
 99 |                     # returned as extra values in the line - convert single values
100 |                     # to non-list values to be consistent with parser metadata
101 |                     assert len(metadataLine) >= 2
102 | 
103 |                     if len(metadataLine) > 2:
104 |                         metadata[metadataLine[0]] = metadataLine[1:]
105 |                     else:
106 |                         metadata[metadataLine[0]] = metadataLine[1]
107 | 
108 |         # get the content
109 |         content = ""
110 |         if "__TEXT__" in memberNames:
111 |             memberNames.remove("__TEXT__")
112 | 
113 |             contentMember = tarFile.getmember("__TEXT__")
114 |             if not contentMember.issym() and contentMember.isfile():
115 |                 if version_info.major >= 3:
116 |                     with _text_wrapper(
117 |                         tarFile.extractfile(contentMember), encoding="utf8"
118 |                     ) as content_file:
119 |                         content = content_file.read()
120 |                 else:
121 |                     with tarFile.extractfile(contentMember) as content_file:
122 |                         content = content_file.read().decode("utf8")
123 | 
124 |         # get the remaining files as attachments
125 |         attachments = {}
126 |         for attachment in memberNames:
127 |             attachmentMember = tarFile.getmember(attachment)
128 |             if not attachmentMember.issym() and attachmentMember.isfile():
129 |                 with tarFile.extractfile(attachmentMember) as attachment_file:
130 |                     attachments[attachment] = attachment_file.read()
131 | 
132 |         parsed["content"] = content
133 |         parsed["metadata"] = metadata
134 |         parsed["attachments"] = attachments
135 | 
136 |         return parsed
137 | 
138 | 
139 | # TODO: Remove if/when fixed. https://issues.apache.org/jira/browse/TIKA-3070
140 | def _truncate_nulls(s):
141 |     for line in s:
142 |         yield line.replace("\0", "")
143 | 


--------------------------------------------------------------------------------
/iscc_cli/uread.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import io
 3 | from typing import Union
 4 | from iscc_cli.datatypes import Readable, Uri, File, Data
 5 | from typing import BinaryIO
 6 | from iscc_cli.utils import download_file
 7 | 
 8 | 
 9 | def open_data(data):
10 |     # type: (Readable) -> Union[BinaryIO]
11 |     """Open filepath, rawdata or file-like object."""
12 |     if isinstance(data, Uri.__args__):
13 |         if isinstance(data, str) and (
14 |             data.startswith("http://") or data.startswith("https://")
15 |         ):
16 |             data = download_file(data, sanitize=True)
17 |         return open(str(data), "rb")
18 |     elif isinstance(data, Data.__args__):
19 |         return io.BytesIO(data)
20 |     elif isinstance(data, File.__args__):
21 |         data.seek(0)
22 |         return data
23 |     else:
24 |         raise ValueError(f"unsupported data type {type(data)}")
25 | 


--------------------------------------------------------------------------------
/iscc_cli/utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from typing import Union, List
  3 | import hashlib
  4 | import io
  5 | import os
  6 | import re
  7 | import textwrap
  8 | from os import getcwd, listdir, walk
  9 | from os.path import isfile, splitext, isdir, join, basename
 10 | from urllib.parse import urlparse
 11 | import click
 12 | import iscc
 13 | import requests
 14 | from PIL import Image
 15 | import iscc_cli
 16 | from iscc_cli.const import (
 17 |     SUPPORTED_EXTENSIONS,
 18 |     SUPPORTED_MIME_TYPES,
 19 |     ISCC_COMPONENT_CODES,
 20 |     GMT,
 21 | )
 22 | 
 23 | 
 24 | def iter_files(root, exts=None, recursive=False):
 25 |     """
 26 |     Iterate over file paths within root filtered by specified extensions.
 27 |     :param str root: Root folder to start collecting files
 28 |     :param iterable exts: Restrict results to given file extensions
 29 |     :param bool recursive: Wether to walk the complete directory tree
 30 |     :rtype collections.Iterable[str]: absolute file paths with given extensions
 31 |     """
 32 | 
 33 |     if exts is not None:
 34 |         exts = set((x.lower() for x in exts))
 35 | 
 36 |     def matches(e):
 37 |         return (exts is None) or (e in exts)
 38 | 
 39 |     if recursive is False:
 40 |         for entry in listdir(root):
 41 |             ext = splitext(entry)[-1].lstrip(".").lower()
 42 |             if not isdir(entry) and matches(ext):
 43 |                 yield join(root, entry)
 44 |     else:
 45 |         for root, folders, files in walk(root):
 46 |             for f in files:
 47 |                 ext = splitext(f)[-1].lstrip(".").lower()
 48 |                 if matches(ext):
 49 |                     yield join(root, f)
 50 | 
 51 | 
 52 | def get_files(path, recursive=False):
 53 |     if path is None:
 54 |         path = getcwd()
 55 |     if isfile(path):
 56 |         return [path]
 57 |     return iter_files(path, exts=SUPPORTED_EXTENSIONS, recursive=recursive)
 58 | 
 59 | 
 60 | def clean_mime(mime: Union[str, List]):
 61 |     """Returns first entry in mime and removes semicolon separated charset info"""
 62 |     if mime and isinstance(mime, List):
 63 |         mime = mime[0]
 64 |     if mime:
 65 |         mime = mime.split(";")[0]
 66 |     return mime.strip()
 67 | 
 68 | 
 69 | def mime_to_gmt(mime_type, file_path=None):
 70 |     mime_type = clean_mime(mime_type)
 71 |     if mime_type == "image/gif" and file_path:
 72 |         img = Image.open(file_path)
 73 |         if img.is_animated:
 74 |             return GMT.VIDEO
 75 |         else:
 76 |             return GMT.IMAGE
 77 |     entry = SUPPORTED_MIME_TYPES.get(mime_type)
 78 |     if entry:
 79 |         return entry["gmt"]
 80 |     gmt = mime_type.split("/")[0]
 81 |     if gmt in (GMT.TEXT, GMT.IMAGE, GMT.AUDIO, GMT.VIDEO):
 82 |         click.echo(
 83 |             "WARNING: Attempting to process unsupported media type %s" % mime_type
 84 |         )
 85 |         return gmt
 86 | 
 87 | 
 88 | def get_title(tika_result: dict, guess=False, uri=None):
 89 |     title = ""
 90 |     gmt = None
 91 |     meta = tika_result.get("metadata")
 92 | 
 93 |     if meta:
 94 |         mime_type = clean_mime(meta.get("Content-Type"))
 95 |         gmt = mime_to_gmt(mime_type)
 96 |         title = meta.get("dc:title", "")
 97 |         title = title[0].strip() if isinstance(title, list) else title.strip()
 98 |         if not title:
 99 |             title = meta.get("title", "")
100 |             title = title[0].strip() if isinstance(title, list) else title.strip()
101 | 
102 |     # See if string would survive normalization
103 |     norm_title = iscc.text_normalize(title, keep_ws=True)
104 | 
105 |     if not norm_title and guess and gmt == GMT.TEXT:
106 |         content = tika_result.get("content", "")
107 |         if content is not None:
108 |             first_line = content.strip().splitlines()[0]
109 |             title = iscc.text_trim(iscc.text_normalize(first_line, keep_ws=True))
110 | 
111 |     if not title and uri is not None:
112 |         result = urlparse(uri)
113 |         base = basename(result.path)
114 |         title = splitext(base)[0]
115 |         title = title.replace("-", " ")
116 |         title = title.replace("_", " ")
117 |     return title
118 | 
119 | 
class DefaultHelp(click.Command):
    """Command that behaves as if the help flag was passed when run without arguments."""

    def __init__(self, *args, **kwargs):
        # Make sure help option names are configured and remember the first
        # one so parse_args() can inject it later.
        settings = kwargs.setdefault("context_settings", {})
        option_names = settings.setdefault("help_option_names", ["-h", "--help"])
        self.help_flag = option_names[0]
        super().__init__(*args, **kwargs)

    def parse_args(self, ctx, args):
        # An empty argument list is replaced by the help flag.
        effective_args = args if args else [self.help_flag]
        return super().parse_args(ctx, effective_args)
132 | 
133 | 
def iscc_clean(i):
    """Strip everything up to the last colon, surrounding whitespace and all dashes."""
    _, _, tail = i.rpartition(":")
    trimmed = tail.strip()
    return trimmed.replace("-", "")
137 | 
138 | 
def iscc_verify(i):
    """Validate an ISCC code; raise ``ValueError`` on the first problem found.

    The cleaned code must consist only of characters from ``iscc.SYMBOLS``,
    and every component returned by ``iscc_split`` must pass
    ``iscc_verify_component``.
    """
    code = iscc_clean(i)
    illegal = next((ch for ch in code if ch not in iscc.SYMBOLS), None)
    if illegal is not None:
        raise ValueError('Illegal character "{}" in ISCC Code'.format(illegal))
    for part in iscc_split(code):
        iscc_verify_component(part)
146 | 
147 | 
def iscc_verify_component(component_code):
    """Check a single ISCC component code.

    Raises ``ValueError`` unless the component is exactly 13 characters long
    and its first two characters are a key of ``ISCC_COMPONENT_CODES``.
    """
    length = len(component_code)
    if length != 13:
        raise ValueError(
            "Illegal component length {} for {}".format(length, component_code)
        )

    header_code = component_code[:2]
    if header_code in ISCC_COMPONENT_CODES.keys():
        return
    raise ValueError("Illegal component header {}".format(header_code))
160 | 
161 | 
def iscc_split(i):
    """Clean an ISCC code and wrap it into chunks of at most 13 characters."""
    cleaned = iscc_clean(i)
    return textwrap.wrap(cleaned, width=13)
164 | 
165 | 
def download_file(url, md5=None, sanitize=False):
    """Download file to app dir and return path.

    :param url: URL to download; the last path segment becomes the file
        name (``temp.file`` if the path has none)
    :param md5: optional expected MD5 hex digest of the file
    :param sanitize: pass the file name through ``safe_filename`` first
    :return: path of the file inside ``iscc_cli.APP_DIR``
    :raises AssertionError: if ``md5`` is given and does not match

    A file that already exists at the target path is not downloaded again.
    """

    def md5_hexdigest(path):
        # Close the handle deterministically instead of relying on GC.
        with open(path, "rb") as infile:
            return hashlib.md5(infile.read()).hexdigest()

    url_obj = urlparse(url)
    file_name = os.path.basename(url_obj.path) or "temp.file"
    if sanitize:
        file_name = safe_filename(file_name)
    out_path = os.path.join(iscc_cli.APP_DIR, file_name)
    if os.path.exists(out_path):
        click.echo("Already downloaded: %s" % file_name)
        if md5:
            md5_calc = md5_hexdigest(out_path)
            assert md5 == md5_calc
        return out_path

    r = requests.get(url, stream=True)
    # Servers using chunked transfer encoding send no content-length.
    total = r.headers.get("content-length")
    chunk_size = 512
    with io.open(out_path, "wb") as fd:
        if total is None:
            # Size unknown: download without a progress bar.
            for chunk in r.iter_content(chunk_size):
                fd.write(chunk)
        else:
            with click.progressbar(
                length=int(total), label="Downloading %s" % file_name
            ) as bar:
                for chunk in r.iter_content(chunk_size):
                    fd.write(chunk)
                    # Advance by what was actually written; the final chunk
                    # is usually shorter than chunk_size.
                    bar.update(len(chunk))
    if md5:
        md5_calc = md5_hexdigest(out_path)
        assert md5 == md5_calc
    return out_path
195 | 
196 | 
class cd:
    """Context manager that temporarily switches the current working directory."""

    def __init__(self, newPath):
        # A leading "~" is expanded to the user's home directory.
        expanded = os.path.expanduser(newPath)
        self.newPath = expanded

    def __enter__(self):
        # Remember where we came from so __exit__ can go back.
        previous = os.getcwd()
        self.savedPath = previous
        os.chdir(self.newPath)

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Always return to the saved directory; exceptions are not suppressed.
        os.chdir(self.savedPath)
209 | 
210 | 
def safe_filename(s: str, max_len: int = 255) -> str:
    """Sanitize a string making it safe to use as a filename.
    See: https://en.wikipedia.org/wiki/Filename.

    Removes the control characters 0x00-0x1F and the characters
    ``" # $ % ' * , . / : ; < > ? \\ ^ | ~``, then truncates the result to
    ``max_len`` characters. Note that dots are removed too, so a file
    extension does not survive as such.
    """
    # range(32) covers every control character up to and including 0x1F;
    # the previous range(0, 31) let 0x1F through.
    removed = {code: None for code in range(32)}
    removed.update({ord(ch): None for ch in "\"#$%'*,./:;<>?\\^|~"})
    # A translation table does the same job as a regex alternation without
    # any escaping pitfalls.
    return s.translate(removed)[:max_len]
242 | 


--------------------------------------------------------------------------------
/iscc_cli/video_id.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | import subprocess
 4 | import sys
 5 | from os.path import basename, dirname
 6 | from statistics import mode
 7 | import iscc
 8 | from lxml import etree
 9 | 
10 | from iscc_cli import ffmpeg
11 | from iscc_cli.const import WTA_PERMUTATIONS
12 | from iscc_cli.utils import cd
13 | 
# Namespace prefixes used in XPath queries against the XML signature file.
NSMAP = dict(
    a="urn:mpeg:mpeg7:schema:2001",
    b="http://www.w3.org/2001/XMLSchema-instance",
)
18 | 
19 | 
def content_id_video(features, partial=False):
    """Compute the encoded video Content-ID from frame signature vectors.

    Duplicate signatures are dropped, the remaining vectors are summed
    column-wise, and the first 8 bytes of the WTA hash of that sum are
    appended to the video header (the partial-content header when
    ``partial`` is set) before encoding.
    """
    unique_sigs = set(features)
    column_sums = [sum(column) for column in zip(*unique_sigs)]
    digest_body = wta_hash(column_sums, 64)[:8]
    header = iscc.HEAD_CID_V_PCF if partial else iscc.HEAD_CID_V
    return iscc.encode(header + digest_body)
29 | 
30 | 
def get_frame_vectors(file):
    """Extract per-frame signature vectors from a video via ffmpeg.

    Runs ffmpeg's ``signature`` filter (preceded by the detected crop
    filter, if any) and parses the XML file it writes.

    :param file: path to the video file (relative or absolute)
    :return: tuple with one tuple of ints per ``FrameSignature`` element
    """
    # Resolve the path up front: the working directory changes below, which
    # would break a relative path, and a bare file name has no dirname.
    file = os.path.abspath(file)
    crop = get_crop(file)
    sigfile = basename(file) + ".xml"
    folder = dirname(file)
    if crop:
        vf = "{},signature=format=xml:filename={}".format(crop, sigfile)
    else:
        vf = "signature=format=xml:filename={}".format(sigfile)
    # NOTE(review): running inside the video's folder presumably keeps path
    # separators and drive colons out of the filter string - confirm.
    with cd(folder):
        cmd = [ffmpeg.exe_path(), "-i", file, "-vf", vf, "-f", "null", "-"]
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            tree = etree.parse(sigfile)
        finally:
            # Never leave the temporary signature file behind, even when
            # parsing fails.
            if os.path.exists(sigfile):
                os.remove(sigfile)
        root = tree.getroot()
    frames = []
    frame_els = root.xpath("//a:FrameSignature", namespaces=NSMAP)
    for frame_el in frame_els:
        frames.append(tuple(int(t) for t in frame_el.text.split()))
    return tuple(frames)
50 | 
51 | 
def get_crop(file) -> str:
    """Detect crop value for video.

    Runs ffmpeg's ``cropdetect`` filter and returns the most frequently
    reported crop value (the last token of each ``[Parsed_cropdetect`` line).

    :param file: path to the video file
    :return: the most common crop value, or ``""`` when ffmpeg reported none
    """
    from statistics import StatisticsError

    cmd = [ffmpeg.exe_path(), "-i", file, "-vf", "cropdetect", "-f", "null", "-"]
    res = subprocess.run(cmd, stderr=subprocess.PIPE)
    # sys.stdout.encoding can be None (e.g. when stdout is replaced), and
    # ffmpeg output is not guaranteed to be valid in that encoding.
    encoding = sys.stdout.encoding or "utf-8"
    text = res.stderr.decode(encoding=encoding, errors="replace")
    crops = [
        line.split()[-1]
        for line in text.splitlines()
        if line.startswith("[Parsed_cropdetect")
    ]
    if not crops:
        # mode() raises on empty data; callers treat a falsy result as
        # "no cropping".
        return ""
    try:
        return mode(crops)
    except StatisticsError:
        # Before Python 3.8 mode() raises when several values are equally
        # common; fall back to the first of the most common values.
        return max(crops, key=crops.count)
63 | 
64 | 
def wta_hash(vec, hl=64) -> bytes:
    """Calculate WTA Hash from vector with 380 features.

    For each index pair in ``WTA_PERMUTATIONS`` one bit is produced: 0 if
    the value at the first index is the larger one, 1 otherwise. If both
    values are equal, the second index is advanced (wrapping around) until
    they differ. Bit generation stops once ``hl`` bits were collected; the
    bits are then packed into bytes, most significant bit first.
    """
    size = len(vec)
    permutations = WTA_PERMUTATIONS
    assert len(set(vec)) > 1, "Vector for wta_hash needs at least 2 different values."

    bits = []
    for pair in permutations:
        first_idx, second_idx = pair[0], pair[1]
        first, second = vec[first_idx], vec[second_idx]
        # Walk the second index forward until the two values differ.
        while first == second:
            second_idx = (second_idx + 1) % size
            second = vec[second_idx]
        bits.append(1 if second > first else 0)
        if len(bits) == hl:
            break

    # Pack groups of up to 8 bits into one byte each.
    packed = bytearray()
    for start in range(0, len(bits), 8):
        value = 0
        for bit in bits[start : start + 8]:
            value = (value << 1) | bit
        packed.append(value)
    return bytes(packed)
86 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "iscc-cli"
 3 | version = "0.9.12"
 4 | description = "ISCC CLI - Creates ISCC Codes from Media Files"
 5 | authors = ["Titusz Pan <tp@py7.de>"]
 6 | license = "MIT"
 7 | readme = "README.md"
 8 | homepage = "https://iscc.codes/"
 9 | repository = "https://github.com/iscc/iscc-cli"
10 | keywords = ["iscc", "media", "identifier", "blockchain", "cli", "generator"]
11 | 
12 | classifiers = [
13 |     "Development Status :: 4 - Beta",
14 |     "Intended Audience :: Developers",
15 |     "Intended Audience :: Information Technology",
16 |     "Environment :: Console",
17 |     "License :: OSI Approved :: MIT License",
18 |     "Operating System :: Unix",
19 |     "Operating System :: POSIX",
20 |     "Operating System :: Microsoft :: Windows",
21 |     'Programming Language :: Python',
22 |     'Programming Language :: Python :: 3.6',
23 |     'Programming Language :: Python :: 3.7',
24 |     'Programming Language :: Python :: 3.8',
25 |     "Topic :: Multimedia",
26 |     "Topic :: System :: Archiving",
27 |     "Topic :: Utilities",
28 |     "Topic :: Software Development",
29 | ]
30 | 
31 | [tool.poetry.scripts]
32 | iscc = 'iscc_cli.cli:cli'
33 | 
34 | [tool.poetry.dependencies]
35 | python = "^3.6"
36 | click = "^7.0"
37 | iscc = "1.0.5"
38 | click-default-group = "^1.2"
39 | colorama = "^0.4"
40 | imageio-ffmpeg = "^0.4"
41 | lxml = "^4.4"
42 | mobi = "^0"
43 | requests = "^2.23.0"
44 | python-magic-bin = { version = "^0.4.14", markers = "sys_platform == 'win32' or sys_platform == 'darwin'" }
45 | python-magic = { version = "^0.4.22", markers = "sys_platform == 'linux'" }
46 | 
47 | [tool.poetry.dev-dependencies]
48 | pytest = "^5.0"
49 | black = { version = "^19.10b0", python = "^3.6" }
50 | pyinstaller = "^3.4"
51 | pywin32-ctypes = { version = "0.2.0", markers = "sys_platform == 'win32'" }
52 | pefile = { version = "2019.4.18", markers = "sys_platform == 'win32'" }
53 | pytest-ordering = "^0.6"
54 | 
[build-system]
# The backend module must exist in the pinned requirement: Poetry 1.0.x
# exposes its PEP 517 backend as `poetry.masonry.api`, whereas
# `poetry.core.masonry.api` belongs to the separate `poetry-core` package.
requires = ["poetry==1.0.5"]
build-backend = "poetry.masonry.api"
58 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# Directory containing this package (symlinks resolved).
TEST_DIR = os.path.split(os.path.realpath(__file__))[0]
# Parent directory of the tests package.
ROOT_DIR = os.path.split(TEST_DIR)[0]
5 | 


--------------------------------------------------------------------------------
/tests/audio/demo.aif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/audio/demo.aif


--------------------------------------------------------------------------------
/tests/audio/demo.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/audio/demo.mp3


--------------------------------------------------------------------------------
/tests/audio/demo.ogg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/audio/demo.ogg


--------------------------------------------------------------------------------
/tests/audio/demo.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/audio/demo.wav


--------------------------------------------------------------------------------
/tests/batch/demo.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/batch/demo.doc


--------------------------------------------------------------------------------
/tests/batch/demo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/batch/demo.pdf


--------------------------------------------------------------------------------
/tests/batch/empty.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/batch/empty.txt


--------------------------------------------------------------------------------
/tests/batch/subdir/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/batch/subdir/demo.png


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import platform
 2 | 
 3 | 
 4 | def pytest_addoption(parser):
 5 |     parser.addoption(
 6 |         "--terminate", action="store_true", default=False, help="cleanup tika proccess"
 7 |     )
 8 | 
 9 | 
def pytest_unconfigure(config):
    """On Windows, force-kill all ``java.exe`` processes if ``--terminate`` was given."""
    if platform.system() != "Windows":
        return
    if not config.getoption("--terminate"):
        return

    import subprocess

    subprocess.call("taskkill /F /T /IM java.exe")
17 | 


--------------------------------------------------------------------------------
/tests/image/demo.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/image/demo.bmp


--------------------------------------------------------------------------------
/tests/image/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/image/demo.gif


--------------------------------------------------------------------------------
/tests/image/demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/image/demo.jpg


--------------------------------------------------------------------------------
/tests/image/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/image/demo.png


--------------------------------------------------------------------------------
/tests/image/demo.psd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/image/demo.psd


--------------------------------------------------------------------------------
/tests/image/demo.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/image/demo.tif


--------------------------------------------------------------------------------
/tests/test_0_pre_init.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """Tests that should pass before external tools and dependencies are installed."""
 3 | import os
 4 | from tests import ROOT_DIR
 5 | from iscc_cli.cli import cli
 6 | from click.testing import CliRunner
 7 | 
 8 | os.chdir(ROOT_DIR)
 9 | r = CliRunner()
10 | 
11 | 
12 | def test_info():
13 |     result = r.invoke(cli, ["info"])
14 |     assert result.exit_code == 0
15 |     assert "Supported File Types" in result.output
16 | 


--------------------------------------------------------------------------------
/tests/test_1_init.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from tests import ROOT_DIR
 3 | from iscc_cli.cli import cli
 4 | from click.testing import CliRunner
 5 | 
 6 | 
 7 | os.chdir(ROOT_DIR)
 8 | r = CliRunner()
 9 | 
10 | 
11 | def test_init():
12 |     result = r.invoke(cli, ["init"])
13 |     assert result.exit_code == 0
14 |     assert "Apache Tika 1." in result.output
15 |     assert "fpcalc installed:" in result.output
16 | 


--------------------------------------------------------------------------------
/tests/test_audio_id.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os
  3 | from tests import ROOT_DIR
  4 | from iscc_cli import audio_id
  5 | from iscc_cli import fpcalc
  6 | 
  7 | os.chdir(ROOT_DIR)
  8 | 
  9 | 
 10 | def test_content_id_audio():
 11 |     assert audio_id.content_id_audio([1, 2, 3]) == "CACCCCCidbPMJ"
 12 | 
 13 | 
 14 | def test_get_chroma_vector_file_path():
 15 |     if not fpcalc.is_installed():
 16 |         fpcalc.install()
 17 |     r = audio_id.get_chroma_vector("tests/audio/demo.mp3")
 18 |     assert isinstance(r, list)
 19 |     assert r == [
 20 |         684003877,
 21 |         683946551,
 22 |         1749295639,
 23 |         2017796679,
 24 |         2026256086,
 25 |         2022066918,
 26 |         2022001639,
 27 |         2021968035,
 28 |         2038741139,
 29 |         2059709571,
 30 |         503750851,
 31 |         369541315,
 32 |         320225426,
 33 |         289292450,
 34 |         830368930,
 35 |         838789539,
 36 |         1940835201,
 37 |         1928186752,
 38 |         1651297920,
 39 |         1651283600,
 40 |         1650959072,
 41 |         1655022116,
 42 |         1722069540,
 43 |         1726259749,
 44 |         1713694254,
 45 |         1847914286,
 46 |         1847912494,
 47 |         1780832302,
 48 |         -362410962,
 49 |         -352973810,
 50 |         1809196111,
 51 |         1770397775,
 52 |         1753686797,
 53 |         683942429,
 54 |         943989277,
 55 |         943989255,
 56 |         944121430,
 57 |         952503910,
 58 |         948374246,
 59 |         948717799,
 60 |         1485621411,
 61 |         462203011,
 62 |         508470403,
 63 |         370053251,
 64 |         303988867,
 65 |         322879651,
 66 |         322892963,
 67 |         862907811,
 68 |         1928256417,
 69 |         1928317841,
 70 |         1651297152,
 71 |         1647091344,
 72 |         1650827936,
 73 |         1659216416,
 74 |         1722069540,
 75 |         1726263844,
 76 |         1717887533,
 77 |         1713696302,
 78 |         1847912494,
 79 |         1847883822,
 80 |         -366540754,
 81 |         -345633778,
 82 |         -336184242,
 83 |         1771447375,
 84 |         1753620815,
 85 |         1757684255,
 86 |         675553815,
 87 |         943989255,
 88 |         944120390,
 89 |         952508006,
 90 |         948308582,
 91 |         948718050,
 92 |         411879650,
 93 |         428648578,
 94 |         516861059,
 95 |         370057347,
 96 |         303988865,
 97 |         306086033,
 98 |         306086049,
 99 |         841919649,
100 |         846133665,
101 |         1919929264,
102 |         1647168400,
103 |         1647101584,
104 |         1650827936,
105 |         1659216484,
106 |         1671733796,
107 |         1738838588,
108 |         1717887517,
109 |         1713696302,
110 |         1847913774,
111 |         1847879726,
112 |         1780960302,
113 |         -362410978,
114 |         -336196594,
115 |         1775641678,
116 |         1770397775,
117 |         1753555743,
118 |         683942429,
119 |         943989271,
120 |         944185926,
121 |         2026255094,
122 |         2022051494,
123 |         2021919654,
124 |     ]
125 | 
126 | 
127 | def test_get_chroma_vector_file_stream():
128 |     if not fpcalc.is_installed():
129 |         fpcalc.install()
130 |     with open("tests/audio/demo.mp3", "rb") as file_obj:
131 |         r = audio_id.get_chroma_vector(file_obj)
132 |     assert isinstance(r, list)
133 |     assert r == [
134 |         684003877,
135 |         683946551,
136 |         1749295639,
137 |         2017796679,
138 |         2026256086,
139 |         2022066918,
140 |         2022001639,
141 |         2021968035,
142 |         2038741139,
143 |         2059709571,
144 |         503750851,
145 |         369541315,
146 |         320225426,
147 |         289292450,
148 |         830368930,
149 |         838789539,
150 |         1940835201,
151 |         1928186752,
152 |         1651297920,
153 |         1651283600,
154 |         1650959072,
155 |         1655022116,
156 |         1722069540,
157 |         1726259749,
158 |         1713694254,
159 |         1847914286,
160 |         1847912494,
161 |         1780832302,
162 |         -362410962,
163 |         -352973810,
164 |         1809196111,
165 |         1770397775,
166 |         1753686797,
167 |         683942429,
168 |         943989277,
169 |         943989255,
170 |         944121430,
171 |         952503910,
172 |         948374246,
173 |         948717799,
174 |         1485621411,
175 |         462203011,
176 |         508470403,
177 |         370053251,
178 |         303988867,
179 |         322879651,
180 |         322892963,
181 |         862907811,
182 |         1928256417,
183 |         1928317841,
184 |         1651297152,
185 |         1647091344,
186 |         1650827936,
187 |         1659216416,
188 |         1722069540,
189 |         1726263844,
190 |         1717887533,
191 |         1713696302,
192 |         1847912494,
193 |         1847883822,
194 |         -366540754,
195 |         -345633778,
196 |         -336184242,
197 |         1771447375,
198 |         1753620815,
199 |         1757684255,
200 |         675553815,
201 |         943989255,
202 |         944120390,
203 |         952508006,
204 |         948308582,
205 |         948718050,
206 |         411879650,
207 |         428648578,
208 |         516861059,
209 |         370057347,
210 |         303988865,
211 |         306086033,
212 |         306086049,
213 |         841919649,
214 |         846133665,
215 |         1919929264,
216 |         1647168400,
217 |         1647101584,
218 |         1650827936,
219 |         1659216484,
220 |         1671733796,
221 |         1738838588,
222 |         1717887517,
223 |         1713696302,
224 |         1847913774,
225 |         1847879726,
226 |         1780960302,
227 |         -362410978,
228 |         -336196594,
229 |         1775641678,
230 |         1770397775,
231 |         1753555743,
232 |         683942429,
233 |         943989271,
234 |         944185926,
235 |         2026255094,
236 |         2022051494,
237 |         2021919654,
238 |     ]
239 | 


--------------------------------------------------------------------------------
/tests/test_batch.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | from tests import ROOT_DIR
 4 | from iscc_cli.cli import cli
 5 | from click.testing import CliRunner
 6 | 
 7 | 
 8 | os.chdir(ROOT_DIR)
 9 | r = CliRunner()
10 | 
11 | 
12 | def test_batch():
13 |     result = r.invoke(cli, ["batch", "./tests/batch"])
14 |     assert result.exit_code == 0
15 |     assert "CCKzUpp6U5hU7,CTMjk4o5H96BV,CDM6E14HcCZjQ,CR1LUvGDVrWye" in result.output
16 | 
17 | 
def test_batch_recursive():
    """``batch -r`` on the fixture directory still prints the known codes."""
    outcome = r.invoke(cli, ["batch", "-r", "./tests/batch"])
    assert outcome.exit_code == 0
    expected_row = "CCKzUpp6U5hU7,CTMjk4o5H96BV,CDM6E14HcCZjQ,CR1LUvGDVrWye"
    assert expected_row in outcome.output
22 | 
23 | 
def test_batch_python_call():
    """Calling the ``batch`` command callback directly returns a one-item list."""
    from iscc_cli.commands.batch import batch

    entries = batch.callback("./tests/batch/subdir", False, False, False)
    assert isinstance(entries, list)
    assert len(entries) == 1
30 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from tests import ROOT_DIR
 3 | from iscc_cli import __version__
 4 | from iscc_cli.cli import cli
 5 | from click.testing import CliRunner
 6 | 
 7 | 
 8 | os.chdir(ROOT_DIR)
 9 | r = CliRunner()
10 | 
11 | 
12 | def test_iscc_no_args():
13 |     result = r.invoke(cli)
14 |     assert result.exit_code == 0
15 |     assert result.output.startswith("Usage")
16 | 
17 | 
def test_iscc_no_ars_but_opt():
    """``-v`` without an argument is a usage error (exit code 2)."""
    outcome = r.invoke(cli, ["-v"])
    assert outcome.exit_code == 2
    assert "Error: Missing argument" in outcome.output
22 | 
23 | 
def test_version():
    """``--version`` prints the ISCC name together with the package version."""
    outcome = r.invoke(cli, ["--version"])
    assert outcome.exit_code == 0
    printed = outcome.output
    assert "ISCC" in printed
    assert __version__ in printed
29 | 
30 | 
def test_no_command_with_valid_file():
    """A bare file path prints the ISCC but not the verbose-only hex value."""
    outcome = r.invoke(cli, ["tests/image/demo.jpg"])
    assert outcome.exit_code == 0
    printed = outcome.output
    assert "CC1GG3hSxtbWU-CYDfTq7Qc7Fre-CDYkLqqmQJaQk-CRAPu5NwQgAhv" in printed
    assert "7a8d0c513142c45f" not in printed
36 | 
37 | 
def test_no_command_with_valid_file_verbose():
    """With ``-v`` the output additionally contains the hex value."""
    outcome = r.invoke(cli, ["-v", "tests/image/demo.jpg"])
    assert outcome.exit_code == 0
    assert "7a8d0c513142c45f" in outcome.output
42 | 


--------------------------------------------------------------------------------
/tests/test_dump.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from tests import ROOT_DIR
 4 | from iscc_cli.cli import cli
 5 | from click.testing import CliRunner
 6 | 
 7 | 
 8 | os.chdir(ROOT_DIR)
 9 | r = CliRunner()
10 | 
11 | 
12 | def test_dump_no_arg_shows_help():
13 |     result = r.invoke(cli, ["dump"])
14 |     assert result.exit_code == 0
15 |     assert "dump [OPTIONS] PATH" in result.output
16 | 
17 | 
def test_dump_with_doc():
    """Dumping a local .doc file reports a 200 status in its output."""
    outcome = r.invoke(cli, ["dump", "tests/text/demo.doc"])
    assert outcome.exit_code == 0
    assert '"status": 200' in outcome.output
22 | 
23 | 
def test_dump_with_url():
    """Dumping a URL exits with 0 and the output contains the expected phrase."""
    outcome = r.invoke(cli, ["dump", "https://iscc.codes"])
    assert outcome.exit_code == 0
    assert "universal identifier" in outcome.output
28 | 
29 | 
def test_dump_strip():
    """``dump -s 50`` limits the extracted ``content`` field to 50 characters."""
    # Command line arguments are always strings; pass "50" rather than the
    # int 50 so the invocation mirrors a real argv.
    result = r.invoke(cli, ["dump", "-s", "50", "tests/text/demo.doc"])
    assert result.exit_code == 0
    data = json.loads(result.output)
    assert len(data.get("content", "")) == 50
35 | 
36 | 
def test_dump_meta_only():
    """``dump -m`` emits JSON without a ``content`` key."""
    outcome = r.invoke(cli, ["dump", "-m", "tests/text/demo.doc"])
    assert outcome.exit_code == 0
    payload = json.loads(outcome.output)
    assert "content" not in payload
42 | 
43 | 
def test_dump_usage_error():
    """Combining ``-m`` and ``-c`` is rejected with exit code 2."""
    outcome = r.invoke(cli, ["dump", "-m", "-c", "tests/text/demo.doc"])
    assert outcome.exit_code == 2
    assert "Use either" in outcome.output
48 | 


--------------------------------------------------------------------------------
/tests/test_ffmpeg.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import pytest
 3 | import os
 4 | import sys
 5 | from iscc_cli import ffmpeg
 6 | 
 7 | 
 8 | def is_linux():
 9 |     return sys.platform == "linux"
10 | 
11 | 
def is_py36():
    """Return True when the running interpreter is Python 3.6."""
    major_minor = sys.version_info[:2]
    return major_minor == (3, 6)
14 | 
15 | 
def test_exe_path():
    """The ffmpeg executable path contains the tool name."""
    location = ffmpeg.exe_path()
    assert "ffmpeg" in location
18 | 
19 | 
@pytest.mark.skipif(is_linux() and is_py36(), reason="custom ffmpeg")
def test_ffmpeg_exists():
    """The path reported by ``ffmpeg.exe_path()`` exists on disk."""
    location = ffmpeg.exe_path()
    assert os.path.exists(location)
23 | 
24 | 
@pytest.mark.skipif(is_linux() and is_py36(), reason="custom ffmpeg")
def test_ffmpeg_executable():
    """The ffmpeg binary is marked executable for the current user."""
    location = ffmpeg.exe_path()
    assert os.access(location, os.X_OK)
28 | 
29 | 
@pytest.mark.skipif(is_linux() and is_py36(), reason="custom ffmpeg")
def test_get_version_info():
    """The reported ffmpeg version string starts with ``4.2``."""
    version_text = ffmpeg.get_version_info()
    assert version_text.startswith("4.2")
34 | 


--------------------------------------------------------------------------------
/tests/test_formats.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from tests import ROOT_DIR
 3 | from iscc_cli.cli import cli
 4 | from click.testing import CliRunner
 5 | 
 6 | 
 7 | os.chdir(ROOT_DIR)
 8 | r = CliRunner()
 9 | 
10 | 
11 | def test_unsupported():
12 |     result = r.invoke(cli, ["gen", "tests/text/demo.sqlite"])
13 |     assert result.exit_code == 0
14 |     assert "Unsupported media type" in result.output
15 | 
16 | 
def test_xhtml():
    """``gen`` on the xhtml fixture prints the expected component code."""
    outcome = r.invoke(cli, ["gen", "tests/text/demo.xhtml"])
    assert outcome.exit_code == 0
    assert "CTMjk4o5H96BV" in outcome.output
21 | 


--------------------------------------------------------------------------------
/tests/test_fpcalc.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | import sys
 4 | import pytest
 5 | import platform
 6 | from iscc_cli import fpcalc
 7 | 
 8 | 
 9 | def test_exe_path():
10 |     assert "fpcalc" in fpcalc.exe_path()
11 | 
12 | 
def test_is_installed():
    """``fpcalc.is_installed()`` returns a bool, whatever the install state."""
    state = fpcalc.is_installed()
    assert isinstance(state, bool)
15 | 
16 | 
@pytest.mark.skipif(sys.platform == "darwin", reason="fpcalc broke naming convention")
def test_download_url():
    """The download URL names the current platform and the pinned version."""
    link = fpcalc.download_url()
    system_name = platform.system().lower()
    assert system_name in link
    assert fpcalc.FPCALC_VERSION in link
22 | 
23 | 
def test_download():
    """``fpcalc.download()`` returns a path that exists afterwards."""
    archive_path = fpcalc.download()
    assert os.path.exists(archive_path)
27 | 
28 | 
def test_install():
    """``fpcalc.install()`` returns an existing path and marks fpcalc installed."""
    binary_path = fpcalc.install()
    assert os.path.exists(binary_path)
    assert fpcalc.is_installed()
33 | 
34 | 
def test_get_version_info():
    """The installed fpcalc reports exactly the pinned ``FPCALC_VERSION``."""
    reported = fpcalc.get_version_info()
    assert reported == fpcalc.FPCALC_VERSION
38 | 


--------------------------------------------------------------------------------
/tests/test_gen.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from tests import ROOT_DIR
 3 | from iscc_cli.cli import cli
 4 | from click.testing import CliRunner
 5 | 
 6 | 
 7 | os.chdir(ROOT_DIR)
 8 | r = CliRunner()
 9 | 
10 | 
11 | def test_gen_no_arg_shows_help():
12 |     result = r.invoke(cli, ["gen"])
13 |     assert result.exit_code == 0
14 |     assert "-t, --title TEXT" in result.output
15 | 
16 | 
def test_gen_single_file():
    """``gen`` on the jpg fixture prints its known ISCC."""
    outcome = r.invoke(cli, ["gen", "tests/image/demo.jpg"])
    assert outcome.exit_code == 0
    expected_code = "CC1GG3hSxtbWU-CYDfTq7Qc7Fre-CDYkLqqmQJaQk-CRAPu5NwQgAhv"
    assert expected_code in outcome.output
21 | 
22 | 
def test_gen_empty_file():
    """``gen`` on an empty file fails with exit code 2 and says why."""
    outcome = r.invoke(cli, ["gen", "tests/batch/empty.txt"])
    assert outcome.exit_code == 2
    assert "empty file" in outcome.output
27 | 
28 | 
def test_gen_single_guess():
    """The .doc fixture yields the same ISCC with and without ``-g``."""
    expected_line = "ISCC:CCKzUpp6U5hU7-CTMjk4o5H96BV-CDM6E14HcCZjQ-CR1LUvGDVrWye"
    invocations = (
        ["gen", "tests/text/demo.doc"],
        ["gen", "-g", "tests/text/demo.doc"],
    )
    # Same order as before: plain invocation first, then with the guess flag.
    for argv in invocations:
        outcome = r.invoke(cli, argv)
        assert outcome.exit_code == 0
        assert expected_line in outcome.output
40 | 
41 | 
def test_gen_image_guess():
    """``gen -g`` on the bmp fixture exits with code 0."""
    outcome = r.invoke(cli, ["gen", "-g", "tests/image/demo.bmp"])
    assert outcome.exit_code == 0
45 | 
46 | 
def test_gen_image_no_title():
    """``gen -g`` on the png fixture exits with 0 and prints its known ISCC."""
    result = r.invoke(cli, ["gen", "-g", "tests/image/demo.png"])
    # Check the exit code as the sibling gen tests do, so a failing command
    # is reported as such rather than as a missing substring.
    assert result.exit_code == 0
    assert "CCh7QKroUdKnH-CYDfTq7Qc7Fre-CDij3vGU1BkCZ-CRNssh4Qc1x5B" in result.output
50 | 
51 | 
def test_gen_python_call():
    """Calling the ``gen`` command callback directly returns the ISCC and title."""
    from iscc_cli.commands.gen import gen

    # Use a context manager so the handle is closed even if the call raises.
    # NOTE(review): the file is opened in text mode as before; confirm whether
    # the callback expects a binary handle.
    with open("tests/text/demo.doc") as file:
        result = gen.callback(file, True, "", "", True)
    assert result["iscc"] == "CCKzUpp6U5hU7-CTMjk4o5H96BV-CDM6E14HcCZjQ-CR1LUvGDVrWye"
    assert result["norm_title"] == "demo doc title from metadata"
59 | 


--------------------------------------------------------------------------------
/tests/test_info.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | from tests import ROOT_DIR
 4 | from iscc_cli.cli import cli
 5 | from click.testing import CliRunner
 6 | 
 7 | os.chdir(ROOT_DIR)
 8 | r = CliRunner()
 9 | 
10 | 
11 | def test_info():
12 |     result = r.invoke(cli, ["info"])
13 |     assert result.exit_code == 0
14 |     assert "Supported File Types" in result.output
15 | 


--------------------------------------------------------------------------------
/tests/test_lib.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | import iscc
 4 | from tests import ROOT_DIR
 5 | from iscc_cli import lib
 6 | 
 7 | 
 8 | os.chdir(ROOT_DIR)
 9 | 
10 | 
11 | def test_iscc_from_file():
12 |     res = lib.iscc_from_file("./tests/image/demo.jpg")
13 |     assert isinstance(res, dict)
14 |     assert res["iscc"] == "CC1GG3hSxtbWU-CYDfTq7Qc7Fre-CDYkLqqmQJaQk-CRAPu5NwQgAhv"
15 | 
16 | 
def test_iscc_from_dir():
    """``isccs_from_dir`` returns a list whose first entry has the png's ISCC."""
    entries = lib.isccs_from_dir("./tests/batch/subdir")
    assert isinstance(entries, list)
    expected_code = "CCh7QKroUdKnH-CYDfTq7Qc7Fre-CDij3vGU1BkCZ-CRNssh4Qc1x5B"
    assert entries[0]["iscc"] == expected_code
21 | 
22 | 
def test_iscc_from_url():
    """``iscc_from_url`` returns a dict containing the remote image's ISCC."""
    info = lib.iscc_from_url("https://iscc.foundation/news/images/lib-arch-ottawa.jpg")
    assert isinstance(info, dict)
    expected_code = "CCbUCUSqQpyJo-CYaHPGcucqwe3-CDt4nQptEGP6M-CRestDoG7xZFy"
    assert expected_code in info["iscc"]
28 | 
29 | 
def test_iscc_from_url_no_meta():
    """The ISCC for the remote png includes the Meta-ID computed from "demo"."""
    info = lib.iscc_from_url(
        "https://github.com/iscc/iscc-cli/raw/master/tests/image/demo.png"
    )
    assert isinstance(info, dict)
    generated = info["iscc"]
    assert "CYDfTq7Qc7Fre-CDij3vGU1BkCZ-CRNssh4Qc1x5B" in generated
    # Only the first element of the three-item result is checked.
    meta_code, _, _ = iscc.meta_id("demo")
    assert meta_code in info["iscc"]
37 | 


--------------------------------------------------------------------------------
/tests/test_sim.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | from tests import ROOT_DIR
 4 | from iscc_cli.cli import cli
 5 | from click.testing import CliRunner
 6 | 
 7 | os.chdir(ROOT_DIR)
 8 | r = CliRunner()
 9 | 
10 | 
11 | def test_sim_no_args():
12 |     result = r.invoke(cli, ["sim"])
13 |     assert result.exit_code == 0
14 |     assert "$ iscc sim" in result.output
15 | 
16 | 
def test_sim_components():
    """Comparing these two component codes reports a Meta-ID similarity estimate."""
    first, second = "CCKzPWegaT3hS", "CCcdAr6GDoF3p"
    outcome = r.invoke(cli, ["sim", first, second])
    assert outcome.exit_code == 0
    assert "Estimated Similarity of Meta-ID" in outcome.output
21 | 
22 | 
def test_sim_incompatible_components():
    """These two codes are reported as incompatible, while still exiting with 0."""
    first, second = "CCKzPWegaT3hS", "CDM6E14HcCZjQ"
    outcome = r.invoke(cli, ["sim", first, second])
    assert outcome.exit_code == 0
    assert "Incompatible component types" in outcome.output
27 | 
28 | 
def test_sim_full_iscc():
    """Comparing two full ISCCs (with and without prefix) prints an average."""
    prefixed = "ISCC:CCKzPWegaT3hS-CTMjk4o5H96BV-CDM6E14HcCZjQ-CR1LUvGDVrWye"
    bare = "CCcdAr6GDoF3p-CTMjk4o5H96BV-CD6XL9SFyWgsW-CR28vgw3inZGw"
    outcome = r.invoke(cli, ["sim", prefixed, bare])
    assert outcome.exit_code == 0
    assert "Average" in outcome.output
40 | 


--------------------------------------------------------------------------------
/tests/test_test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | from tests import ROOT_DIR
 4 | from iscc_cli.cli import cli
 5 | from click.testing import CliRunner
 6 | 
 7 | 
 8 | os.chdir(ROOT_DIR)
 9 | r = CliRunner()
10 | 
11 | 
12 | def test_test_conformance():
13 |     result = r.invoke(cli, ["test"])
14 |     assert result.exit_code == 0
15 |     assert "PASSED" in result.output
16 |     assert "FAILED" not in result.output
17 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from os.path import join
 3 | 
 4 | import pytest
 5 | 
 6 | from iscc_cli.const import GMT
 7 | from tests import TEST_DIR
 8 | from iscc_cli import utils
 9 | 
10 | 
def test_iter_files_default():
    """With default arguments ``iter_files`` yields at least three entries."""
    found = list(utils.iter_files(TEST_DIR))
    assert len(found) >= 3
14 | 
15 | 
def test_iter_files_empty():
    """An extension filter that matches nothing yields no entries."""
    found = list(utils.iter_files(TEST_DIR, exts=("nofile",)))
    assert len(found) == 0
19 | 
20 | 
def test_iter_files_filter():
    """Filtering the image directory by ``jpg`` yields ``demo.jpg`` first."""
    image_dir = join(TEST_DIR, "image")
    found = list(utils.iter_files(image_dir, exts=("jpg",)))
    assert found[0].endswith("demo.jpg")
24 | 
25 | 
def test_iter_files_recursive():
    """png files are only found under TEST_DIR when recursion is enabled."""
    flat = list(utils.iter_files(TEST_DIR, exts=("png",), recursive=False))
    assert len(flat) == 0
    nested = list(utils.iter_files(TEST_DIR, exts=("png",), recursive=True))
    assert nested[0].endswith("demo.png")
31 | 
32 | 
def test_get_files():
    """The batch directory holds 3 files at top level and 4 when recursing."""
    batch_dir = join(TEST_DIR, "batch")
    top_level = list(utils.get_files(batch_dir))
    assert len(top_level) == 3
    everything = list(utils.get_files(join(TEST_DIR, "batch"), recursive=True))
    assert len(everything) == 4
38 | 
39 | 
def test_mime_to_gmt():
    """The jpeg mime type maps to ``GMT.IMAGE``."""
    kind = utils.mime_to_gmt("image/jpeg")
    assert kind == GMT.IMAGE
43 | 
44 | 
def test_mime_to_gmt_gif_image():
    """The gif in the image fixture directory is classified as ``GMT.IMAGE``."""
    gif_path = join(TEST_DIR, "image", "demo.gif")
    kind = utils.mime_to_gmt("image/gif", gif_path)
    assert kind == GMT.IMAGE
48 | 
49 | 
def test_mime_to_gmt_gif_video():
    """The gif in the video fixture directory is classified as ``GMT.VIDEO``."""
    gif_path = join(TEST_DIR, "video", "demo.gif")
    kind = utils.mime_to_gmt("image/gif", gif_path)
    assert kind == GMT.VIDEO
53 | 
54 | 
def test_iscc_clean():
    """``iscc_clean`` removes the prefix, dashes and surrounding whitespace."""
    cases = (
        ("ISCC: SOME-CODE", "SOMECODE"),
        (" SOMECODE ", "SOMECODE"),
        ("ISCC:", ""),
    )
    for raw, cleaned in cases:
        assert utils.iscc_clean(raw) == cleaned
59 | 
60 | 
def test_iscc_verify():
    """``iscc_verify`` raises ``ValueError`` for the input ``"I"``."""
    bad_code = "I"
    with pytest.raises(ValueError):
        utils.iscc_verify(bad_code)
64 | 
65 | 
def test_iscc_split():
    """``iscc_split`` returns the component codes, with or without the prefix."""
    full_code = "ISCC:CCcdAr6GDoF3p-CTMjk4o5H96BV-CD6XL9SFyWgsW-CR28vgw3inZGw"
    expected_parts = [
        "CCcdAr6GDoF3p",
        "CTMjk4o5H96BV",
        "CD6XL9SFyWgsW",
        "CR28vgw3inZGw",
    ]
    assert utils.iscc_split(full_code) == expected_parts

    # A single component comes back as a one-item list in both spellings.
    for single in ("ISCC:CCcdAr6GDoF3p", "CCcdAr6GDoF3p"):
        assert utils.iscc_split(single) == ["CCcdAr6GDoF3p"]
80 | 
81 | 
def test_clean_mime():
    """``clean_mime`` trims whitespace and parameters and picks the first of a list."""
    cases = (
        ("", ""),
        ("text/html ", "text/html"),
        (["text/html", "audio/mp3"], "text/html"),
        ([" text/html", "audio/mp3"], "text/html"),
        (" text/plain; charset=windows-1252 ", "text/plain"),
        ([" text/plain; charset=windows-1252 ", "audio/mp3"], "text/plain"),
    )
    for raw, cleaned in cases:
        assert utils.clean_mime(raw) == cleaned
92 | 


--------------------------------------------------------------------------------
/tests/test_video_id.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | from os.path import abspath
 4 | 
 5 | from tests import ROOT_DIR
 6 | from iscc_cli import video_id
 7 | import pytest
 8 | 
 9 | 
# Work from ROOT_DIR so the relative "./tests/..." paths below resolve.
os.chdir(ROOT_DIR)


def test_wta_hash():
    """``wta_hash`` returns known 8-byte digests for four 380-element vectors."""
    all_ones = b"\xff\xff\xff\xff\xff\xff\xff\xff"
    all_zeros = b"\x00\x00\x00\x00\x00\x00\x00\x00"
    cases = (
        ((0,) * 379 + (1,), all_ones),
        ((1,) + (0,) * 379, all_ones),
        ((1,) + (0,) * 378 + (1,), all_ones),
        ((0,) + (2,) * 378 + (0,), all_zeros),
    )
    for vec, digest in cases:
        assert video_id.wta_hash(vec) == digest
22 | 
23 | 
def test_crop():
    """``get_crop`` returns the expected crop filter string for the 3gp fixture."""
    crop_filter = video_id.get_crop("./tests/video/master.3gp")
    assert crop_filter == "crop=176:96:0:24"
26 | 
27 | 
def test_get_frame_vectors():
    """Frame vectors come back as a tuple of 380-element integer sequences."""
    video_path = abspath("./tests/video/master.3gp")
    vectors = video_id.get_frame_vectors(video_path)
    assert isinstance(vectors, tuple)
    first_frame = vectors[0]
    assert isinstance(first_frame[0], int)
    assert len(first_frame) == 380
33 | 
34 | 
def test_content_id_video():
    """A single frame vector of 0..379 maps to a known video Content-ID."""
    frames = [tuple(range(380))]
    code = video_id.content_id_video(frames)
    assert code == "CVEowL1rB7Z8P"
37 | 
38 | 
def test_content_id_video_0_fetures():
    """An all-zero frame vector makes ``content_id_video`` raise AssertionError."""
    blank_frames = [(0,) * 380]
    with pytest.raises(AssertionError):
        video_id.content_id_video(blank_frames)
42 | 


--------------------------------------------------------------------------------
/tests/test_web.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | from tests import ROOT_DIR
 4 | from iscc_cli.cli import cli
 5 | from click.testing import CliRunner
 6 | 
 7 | 
 8 | os.chdir(ROOT_DIR)
 9 | r = CliRunner()
10 | 
11 | 
12 | def test_iscc_web_no_args():
13 |     result = r.invoke(cli)
14 |     assert result.exit_code == 0
15 |     assert result.output.startswith("Usage")
16 | 
17 | 
def test_iscc_web_image():
    """``web`` on the remote jpg exits with 0 and prints its known ISCC."""
    image_url = "https://iscc.foundation/news/images/lib-arch-ottawa.jpg"
    outcome = r.invoke(cli, ["web", image_url])
    assert outcome.exit_code == 0
    expected_code = "CCbUCUSqQpyJo-CYaHPGcucqwe3-CDt4nQptEGP6M-CRestDoG7xZFy"
    assert expected_code in outcome.output
24 | 
25 | 
def test_iscc_web_video():
    """``web`` on the remote mp4 exits with 0 and prints the expected components."""
    video_url = "https://craft.de/iscc-grinder.mp4"
    outcome = r.invoke(cli, ["web", video_url])
    assert outcome.exit_code == 0
    assert "CV2TgqeKWE7K8-CDKaC252w9QKN-CRYKUhn2RpzF4" in outcome.output
30 | 
31 | 
def test_iscc_web_invalid_url():
    """A URL without a scheme is rejected with exit code 2."""
    outcome = r.invoke(cli, ["web", "heise.de"])
    assert outcome.exit_code == 2
    assert "Error: Invalid URL" in outcome.output
36 | 
37 | 
def test_iscc_web_python_call():
    """Calling the ``web`` command callback directly returns the ISCC and title."""
    from iscc_cli.commands.web import web

    info = web.callback(
        url="https://iscc.foundation/news/images/lib-arch-ottawa.jpg",
        guess=False,
        title="",
        extra="",
        verbose=False,
    )
    expected_code = "CCbUCUSqQpyJo-CYaHPGcucqwe3-CDt4nQptEGP6M-CRestDoG7xZFy"
    assert expected_code in info["iscc"]
    assert info["norm_title"] == "library and archives canada ottawa"
45 | 


--------------------------------------------------------------------------------
/tests/text/demo.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.doc


--------------------------------------------------------------------------------
/tests/text/demo.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.docx


--------------------------------------------------------------------------------
/tests/text/demo.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.epub


--------------------------------------------------------------------------------
/tests/text/demo.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.html


--------------------------------------------------------------------------------
/tests/text/demo.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "title": "The Neverending Story",
 3 |   "meta":
 4 |   [{
 5 |     "schema": "schema.org",
 6 |     "mediatype": "application/ld+json",
 7 |     "data":
 8 |       {
 9 |         "@context": "http://schema.org",
10 |         "@type": "Movie",
11 |         "name": "The Neverending Story",
12 |         "dateCreated": "6 April, 1984",
13 |         "director": "Wolfgang Petersen",
14 |         "actors": ["Noah Hathaway", "Barret Oliver", "Tami Stronach"],
15 |         "duration": "1:42:00"
16 |       }
17 |   }]
18 | }
19 | 


--------------------------------------------------------------------------------
/tests/text/demo.mobi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.mobi


--------------------------------------------------------------------------------
/tests/text/demo.odt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.odt


--------------------------------------------------------------------------------
/tests/text/demo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.pdf


--------------------------------------------------------------------------------
/tests/text/demo.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.sqlite


--------------------------------------------------------------------------------
/tests/text/demo.txt:
--------------------------------------------------------------------------------
 1 | ISCC Test Document!
 2 | 
 3 | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy
 4 | eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
 5 | voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita
 6 | kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
 7 | ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
 8 | invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos
 9 | et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea
10 | takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
11 | consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et
12 | dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo
13 | duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
14 | Lorem ipsum dolor sit amet.
15 | 
16 | Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie
17 | consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan
18 | et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis
19 | dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer
20 | adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna
21 | aliquam erat volutpat.
22 | 
23 | Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit
24 | lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure
25 | dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore
26 | eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
27 | blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla
28 | facilisi.
29 | 
30 | Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming
31 | id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet,
32 | consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet
33 | dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud
34 | exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo
35 | consequat.
36 | 
37 | Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie
38 | consequat, vel illum dolore eu feugiat nulla facilisis.
39 | 
40 | At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd
41 | gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum
42 | dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
43 | invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos
44 | et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea
45 | takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
46 | consetetur sadipscing elitr, At accusam aliquyam diam diam dolore dolores duo
47 | eirmod eos erat, et nonumy sed tempor et et invidunt justo labore Stet clita ea
48 | et gubergren, kasd magna no rebum. sanctus sea sed takimata ut vero voluptua.
49 | est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
50 | sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
51 | magna aliquyam erat.
52 | 
53 | Consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et
54 | dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo
55 | duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
56 | Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing
57 | elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam
58 | erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea
59 | rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor
60 | sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam
61 | nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
62 | voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita
63 | kasd gubergren, no sea takimata sanctus.
64 | 
65 | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod
66 | tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At
67 | vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,
68 | no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit
69 | amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut
70 | labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam
71 | et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata
72 | sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
73 | sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
74 | magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
75 | dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
76 | Lorem ipsum dolor sit amet.
77 | 
78 | Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie
79 | consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan
80 | et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis
81 | dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer
82 | adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna
83 | aliquam erat volutpat.
84 | 
85 | Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit
86 | lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure
87 | dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore
88 | eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
89 | blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla
90 | facilisi.
91 | 
92 | Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming
93 | id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet,
94 | consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet
95 | dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud
96 | exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo
97 | consequat.
98 | 
99 | 


--------------------------------------------------------------------------------
/tests/text/demo.xhtml:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
 3 | <head>
 4 |   <meta charset="utf-8" />
 5 |   <meta name="generator" content="pandoc" />
 6 |   <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
 7 |   <title>demo</title>
 8 |   <style>
 9 |     code{white-space: pre-wrap;}
10 |     span.smallcaps{font-variant: small-caps;}
11 |     span.underline{text-decoration: underline;}
12 |     div.column{display: inline-block; vertical-align: top; width: 50%;}
13 |     div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
14 |     ul.task-list{list-style: none;}
15 |   </style>
16 |   <!--[if lt IE 9]>
17 |     <script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
18 |   <![endif]-->
19 | </head>
20 | <body>
21 | <h1 id="iscc-test-document">ISCC Test Document</h1>
22 | <p>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</p>
23 | <p>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</p>
24 | <p>Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.</p>
25 | <p>Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat.</p>
26 | <p>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis.</p>
27 | <p>At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, At accusam aliquyam diam diam dolore dolores duo eirmod eos erat, et nonumy sed tempor et et invidunt justo labore Stet clita ea et gubergren, kasd magna no rebum. sanctus sea sed takimata ut vero voluptua. est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat.</p>
28 | <p>Consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus.</p>
29 | <p>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</p>
30 | <p>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</p>
31 | <p>Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.</p>
32 | <p>Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat.</p>
33 | </body>
34 | </html>
35 | 


--------------------------------------------------------------------------------
/tests/text/demo.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.xls


--------------------------------------------------------------------------------
/tests/text/demo.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/text/demo.xlsx


--------------------------------------------------------------------------------
/tests/text/demo.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
  2 | <note>
  3 |     <title>ISCC Test Document</title>
  4 |     <body>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy
  5 |         eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
  6 |         voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita
  7 |         kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
  8 |         ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
  9 |         invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos
 10 |         et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea
 11 |         takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
 12 |         consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et
 13 |         dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo
 14 |         duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
 15 |         Lorem ipsum dolor sit amet.
 16 | 
 17 |         Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie
 18 |         consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan
 19 |         et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis
 20 |         dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer
 21 |         adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna
 22 |         aliquam erat volutpat.
 23 | 
 24 |         Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit
 25 |         lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure
 26 |         dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore
 27 |         eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
 28 |         blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla
 29 |         facilisi.
 30 | 
 31 |         Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming
 32 |         id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet,
 33 |         consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet
 34 |         dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud
 35 |         exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo
 36 |         consequat.
 37 | 
 38 |         Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie
 39 |         consequat, vel illum dolore eu feugiat nulla facilisis.
 40 | 
 41 |         At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd
 42 |         gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum
 43 |         dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
 44 |         invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos
 45 |         et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea
 46 |         takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
 47 |         consetetur sadipscing elitr, At accusam aliquyam diam diam dolore dolores duo
 48 |         eirmod eos erat, et nonumy sed tempor et et invidunt justo labore Stet clita ea
 49 |         et gubergren, kasd magna no rebum. sanctus sea sed takimata ut vero voluptua.
 50 |         est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
 51 |         sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
 52 |         magna aliquyam erat.
 53 | 
 54 |         Consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et
 55 |         dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo
 56 |         duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
 57 |         Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing
 58 |         elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam
 59 |         erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea
 60 |         rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor
 61 |         sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam
 62 |         nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
 63 |         voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita
 64 |         kasd gubergren, no sea takimata sanctus.
 65 | 
 66 |         Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod
 67 |         tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At
 68 |         vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,
 69 |         no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit
 70 |         amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut
 71 |         labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam
 72 |         et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata
 73 |         sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
 74 |         sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
 75 |         magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
 76 |         dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
 77 |         Lorem ipsum dolor sit amet.
 78 | 
 79 |         Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie
 80 |         consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan
 81 |         et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis
 82 |         dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer
 83 |         adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna
 84 |         aliquam erat volutpat.
 85 | 
 86 |         Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit
 87 |         lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure
 88 |         dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore
 89 |         eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
 90 |         blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla
 91 |         facilisi.
 92 | 
 93 |         Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming
 94 |         id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet,
 95 |         consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet
 96 |         dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud
 97 |         exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo
 98 |         consequat.
 99 |     </body>
100 | </note>
101 | 


--------------------------------------------------------------------------------
/tests/video/build_videos.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """Build supported mediatype list based on ffmpeg & tika support.
 3 | 
 4 | See: https://en.wikipedia.org/wiki/Video_file_format
 5 | """
 6 | import subprocess
 7 | from collections import defaultdict
 8 | from os.path import exists, abspath
 9 | from iscc_cli.tika import detector
10 | from iscc_cli import ffmpeg
11 | from iscc_cli import video_id
12 | from utils import clean_mime
13 | 
14 | FORMATS = (  # file extensions; build_media_types handles one demo.<ext> per entry
15 |     "rm",
16 |     "drc",
17 |     "3gp",
18 |     "3g2",
19 |     "asf",
20 |     "avi",
21 |     "webm",
22 |     "mpeg",
23 |     "mpg",
24 |     "mp4",
25 |     "m4v",
26 |     "mkv",
27 |     "m1v",
28 |     "ogg",
29 |     "mov",
30 |     "flv",
31 |     "swf",
32 |     "f4v",
33 |     "h264",
34 |     "ogv",
35 |     "vob",
36 |     "wmv",
37 | )
38 | 
39 | 
40 | def build_media_types():
41 |     mt = defaultdict(list)
42 |     for fmt in FORMATS:
43 |         outf = "demo.{}".format(fmt)
44 |         print("Processing {}:".format(outf), end=" ")
45 |         if not exists(outf):
46 |             if fmt in ("3gp", "3g2"):
47 |                 cmd = [
48 |                     ffmpeg.exe_path(),
49 |                     "-i",
50 |                     "master.3gp",
51 |                     "-f",
52 |                     fmt,
53 |                     "-vcodec",
54 |                     "h263",
55 |                     "-vf",
56 |                     "scale=352x288",
57 |                     "-acodec",
58 |                     "amr_nb",
59 |                     "-ar",
60 |                     "8000",
61 |                     "-ac",
62 |                     "1",
63 |                     outf,
64 |                 ]
65 |             else:
66 |                 cmd = [ffmpeg.exe_path(), "-i", "master.3gp", "-loglevel", "2", outf]
67 |             subprocess.run(cmd)
68 |         media_type = clean_mime(detector.from_file(abspath(outf)))
69 |         sigs = video_id.get_frame_vectors(abspath(outf))
70 |         vid = video_id.content_id_video(sigs)
71 |         print("{} -> {} -> {}".format(vid, outf, media_type))
72 |         mt[media_type].append(fmt)
73 |     for m, e in mt.items():
74 |         if len(e) == 1:
75 |             print(f'"{m}": {{"gmt": GMT.VIDEO, "ext": "{e[0]}"}},')
76 |         else:
77 |             print(f'"{m}": {{"gmt": GMT.VIDEO, "ext": {e}}},')
78 | 
79 | 
80 | if __name__ == "__main__":  # only when run as a script, not on import
81 |     build_media_types()
82 | 


--------------------------------------------------------------------------------
/tests/video/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/video/demo.gif


--------------------------------------------------------------------------------
/tests/video/master.3gp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iscc/iscc-cli/b8ca75567e842de504752440cb79f3dcb21177d4/tests/video/master.3gp


--------------------------------------------------------------------------------
/winbuild.bat:
--------------------------------------------------------------------------------
1 | REM Build a single-file console executable named "iscc" from iscc_cli/cli.py.
2 | pyinstaller --clean --onefile --console --name iscc iscc_cli/cli.py
3 | 


--------------------------------------------------------------------------------