├── .appveyor.yml ├── .editorconfig ├── .github ├── FUNDING.yml └── workflows │ └── test.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── iscc_cli ├── __init__.py ├── audio_id.py ├── cli.py ├── commands │ ├── __init__.py │ ├── batch.py │ ├── dump.py │ ├── gen.py │ ├── info.py │ ├── init.py │ ├── sim.py │ ├── test.py │ └── web.py ├── const.py ├── datatypes.py ├── ffmpeg.py ├── fpcalc.py ├── lib.py ├── mediatype.py ├── tika │ ├── __init__.py │ ├── config.py │ ├── detector.py │ ├── language.py │ ├── parser.py │ ├── tika.py │ ├── translate.py │ └── unpack.py ├── uread.py ├── utils.py └── video_id.py ├── poetry.lock ├── pyproject.toml ├── tests ├── __init__.py ├── audio │ ├── demo.aif │ ├── demo.mp3 │ ├── demo.ogg │ └── demo.wav ├── batch │ ├── demo.doc │ ├── demo.pdf │ ├── empty.txt │ └── subdir │ │ └── demo.png ├── conftest.py ├── image │ ├── demo.bmp │ ├── demo.gif │ ├── demo.jpg │ ├── demo.png │ ├── demo.psd │ └── demo.tif ├── test_0_pre_init.py ├── test_1_init.py ├── test_audio_id.py ├── test_batch.py ├── test_cli.py ├── test_dump.py ├── test_ffmpeg.py ├── test_formats.py ├── test_fpcalc.py ├── test_gen.py ├── test_info.py ├── test_lib.py ├── test_sim.py ├── test_test.py ├── test_utils.py ├── test_video_id.py ├── test_web.py ├── text │ ├── demo.doc │ ├── demo.docx │ ├── demo.epub │ ├── demo.html │ ├── demo.json │ ├── demo.md │ ├── demo.mobi │ ├── demo.odt │ ├── demo.pdf │ ├── demo.rtf │ ├── demo.sqlite │ ├── demo.txt │ ├── demo.xhtml │ ├── demo.xls │ ├── demo.xlsx │ └── demo.xml └── video │ ├── build_videos.py │ ├── demo.gif │ └── master.3gp └── winbuild.bat /.appveyor.yml: -------------------------------------------------------------------------------- 1 | build: false 2 | 3 | environment: 4 | PYTHONIOENCODING: "UTF-8" 5 | 6 | matrix: 7 | - PYTHON: "C:/Python36-x64" 8 | - PYTHON: "C:/Python37-x64" 9 | - PYTHON: "C:/Python38-x64" 10 | 11 | 12 | install: 13 | - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" 14 | 15 | # Installing Poetry 16 | - "curl 
-sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py -o get-poetry.py" 17 | - "python get-poetry.py --yes" 18 | - "SET PATH=%USERPROFILE%\\.poetry\\bin;%PATH%" 19 | 20 | # Install dependencies 21 | - "poetry install -v" 22 | 23 | 24 | test_script: 25 | - "poetry run pytest -v --terminate tests" 26 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # https://editorconfig.org/ 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | end_of_line = lf 11 | charset = utf-8 12 | 13 | # Docstrings and comments use max_line_length = 79 14 | [*.py] 15 | max_line_length = 88 16 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: titusz 2 | custom: "https://iscc.foundation/support/" 3 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | Tests: 7 | 8 | name: ${{ matrix.os }} / ${{ matrix.python-version }} 9 | runs-on: ${{ matrix.os }}-latest 10 | strategy: 11 | matrix: 12 | os: [Ubuntu, MacOS, Windows] 13 | python-version: [3.6, 3.7, 3.8] 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v1 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | 23 | - name: Install ffmpeg 24 | if: runner.os == 'Linux' && matrix.python-version == '3.6' 25 | run: | 26 | sudo apt-get update 27 | sudo apt-get install ffmpeg 28 | 29 | - name: Update pip 30 | if: runner.os == 'Windoes' 31 | run: python -m 
pip install -U pip 32 | 33 | - name: Install poetry 34 | run: pip install poetry 35 | 36 | - name: Install dependencies 37 | run: poetry install 38 | 39 | - name: Run pytest 40 | run: poetry run pytest -q tests 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | 
.ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # IntelliJ 107 | .idea 108 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | cache: 4 | pip: true 5 | directories: 6 | - "$HOME/.cache/pypoetry" 7 | 8 | install: 9 | - curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py -o get-poetry.py 10 | - python get-poetry.py --yes 11 | - source $HOME/.poetry/env 12 | - poetry install 13 | 14 | script: pytest -q tests/ 15 | 16 | matrix: 17 | include: 18 | - python: "3.6" 19 | - python: "3.7" 20 | dist: xenial 21 | - python: "3.8" 22 | dist: bionic 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2020 Titusz Pan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # iscc-cli - Command Line Tool 2 | 3 | [![Version](https://img.shields.io/pypi/v/iscc-cli.svg)](https://pypi.python.org/pypi/iscc-cli/) 4 | [![Downloads](https://pepy.tech/badge/iscc-cli)](https://pepy.tech/project/iscc-cli) 5 | 6 | > [!CAUTION] 7 | > This implementation is currently not up to date and does **NOT** generate valid ISCCs. 8 | 9 | > A command line tool that creates **ISCC Codes** for digital media files based on the [reference implementation](). 10 | 11 | ## Table of Contents 12 | 13 | - [Background](#background) 14 | - [Install](#install) 15 | - [Usage](#usage) 16 | - [Maintainers](#maintainers) 17 | - [Contributing](#contributing) 18 | - [License](#license) 19 | 20 | ## Background 21 | 22 | The **International Standard Content Code** is a proposal for an [open standard](https://en.wikipedia.org/wiki/Open_standard) for decentralized content identification. **ISCC Codes** are generated algorithmically **from the content itself** and offer many powerful features like content similarity clustering and partial integrity checks. If you want to learn more about the **ISCC** please check out https://iscc.codes. 23 | 24 | This tool offers an easy way to generate ISCC codes from the command line. It supports content extraction via [Apache Tika](https://tika.apache.org/) and uses the [ISCC reference implementation](https://github.com/iscc/iscc-specs). 25 | 26 | 27 | ### Supported Media File Types 28 | 29 | #### Text 30 | 31 | doc, docx, epub, html, odt, pdf, rtf, txt, xml, ibooks, md, xls, mobi ... 
32 | 33 | 34 | #### Image 35 | 36 | gif, jpg, png, tif, bmp, psd, eps ... 37 | 38 | **Note**: EPS (postscript) support requires [Ghostscript](https://www.ghostscript.com/download.html) to be installed on your system and available on your PATH. (Make sure you can run `gs` from your command line.) 39 | 40 | 41 | #### Audio 42 | 43 | aif, mp3, ogg, wav ... 44 | 45 | 46 | **Note**: Support for the Audio-ID is experimental and not yet part of the [specification](https://iscc.codes/specification/) 47 | 48 | 49 | #### Video 50 | 51 | 3gp, 3g2, asf, avi, flv, gif, mpg, mp4, mkv, mov, ogv, webm, wmv ... 52 | 53 | 54 | **Note**: Support for the Video-ID is experimental and not yet part of the [specification](https://iscc.codes/specification/) 55 | 56 | ## Requirements 57 | 58 | | NOTE: Requires JAVA to be installed and on your path! | 59 | | --- | 60 | 61 | **iscc-cli** is tested on Linux, Windows, and macOS with Python 3.6/3.7/3.8. 62 | 63 | This tool depends on [tika-python](https://github.com/chrismattmann/tika-python). [Tika](https://tika.apache.org/) is used for extracting metadata and content from media files before generating ISCC Codes. On first execution of the `iscc` command line tool it will automatically download and launch the Java Tika Server in the background (this may take some time). Subsequent runs will access the existing Tika instance. You may explicitly pre-launch the Tika server with `$ iscc init` 64 | 65 | ## Install 66 | 67 | The ISCC command line tool is published with the package name `iscc-cli` on the [Python Package Index](https://pypi.python.org/pypi/iscc-cli) and can be installed with pip: 68 | 69 | ```console 70 | $ pip3 install iscc-cli 71 | ``` 72 | 73 | Self-contained Windows binary executables are available for download at: 74 | 75 | 76 | ## Usage 77 | 78 | ### Getting Help 79 | 80 | Show help overview by calling `iscc` without any arguments: 81 | 82 | ```console 83 | $ iscc 84 | Usage: iscc [OPTIONS] COMMAND [ARGS]... 
85 | 86 | Options: 87 | --version Show the version and exit. 88 | --help Show this message and exit. 89 | 90 | Commands: 91 | gen* Generate ISCC Code for FILE. 92 | batch Create ISCC Codes for all files in PATH. 93 | dump Dump Tika extraction results for PATH (file or url path). 94 | info Show information about environment. 95 | init Inititalize and check environment. 96 | sim Estimate Similarity of ISCC Codes A & B. 97 | test Test conformance with latest reference data. 98 | web Generate ISCC Code from URL. 99 | ``` 100 | 101 | Get help for a specific command by entering `iscc `: 102 | 103 | ```console 104 | $ iscc gen 105 | Usage: iscc gen [OPTIONS] FILE 106 | 107 | Generate ISCC Code for FILE. 108 | 109 | Options: 110 | -g, --guess Guess title (first line of text). 111 | -t, --title TEXT Title for Meta-ID creation. 112 | -e, --extra TEXT Extra text for Meta-ID creation. 113 | -v, --verbose Enables verbose mode. 114 | -h, --help Show this message and exit. 115 | ``` 116 | 117 | ### Generating ISCC Codes 118 | 119 | #### For local files 120 | 121 | The `gen` command generates an ISCC Code for a single file: 122 | 123 | ```console 124 | $ iscc gen tests/image/demo.jpg 125 | ISCC:CC1GG3hSxtbWU-CYDfTq7Qc7Fre-CDYkLqqmQJaQk-CRAPu5NwQgAhv 126 | ``` 127 | 128 | The `gen` command is default so you can skip it and simply do `$ iscc tests/demo.jpg` 129 | 130 | To get a more detailed result use the `-v` (`--verbose`) option: 131 | 132 | ```console 133 | $ iscc -v tests/image/demo.jpg 134 | ISCC:CC1GG3hSxtbWU-CYDfTq7Qc7Fre-CDYkLqqmQJaQk-CRAPu5NwQgAhv 135 | Norm Title: concentrated cat 136 | Tophash: 7a8d0c513142c45f417e761355bf71f11ad61d783cd8958ffc0712d00224a4d0 137 | Filepath: tests/image/demo.jpg 138 | GMT: image 139 | ``` 140 | 141 | See `iscc batch` for help on how to generate ISCC codes for multiple files at once. 
142 | 143 | #### For web urls 144 | 145 | The `web` command allows you to create ISCC codes from URLs: 146 | 147 | ```console 148 | $ iscc web https://iscc.foundation/news/images/lib-arch-ottawa.jpg 149 | ISCC:CCbUCUSqQpyJo-CYaHPGcucqwe3-CDt4nQptEGP6M-CRestDoG7xZFy 150 | ``` 151 | 152 | ### Similarity of ISCC Codes 153 | 154 | The `sim` command computes estimated similarity of two ISCC Codes: 155 | 156 | ```console 157 | $ iscc sim CCUcKwdQc1jUM CCjMmrCsKWu1D 158 | Estimated Similarity of Meta-ID: 78.00 % (56 of 64 bits match) 159 | ``` 160 | 161 | You may also compare full four-component ISCC Codes. 162 | 163 | ### Using from your python code 164 | 165 | While this package is not built to be used as a library, some of the high level commands to generate ISCC Codes are exposed as vanilla python functions: 166 | 167 | ```python 168 | from iscc_cli import lib 169 | from pprint import pprint 170 | 171 | pprint(lib.iscc_from_url("https://iscc.foundation/news/images/lib-arch-ottawa.jpg")) 172 | 173 | {'gmt': 'image', 174 | 'iscc': 'CCbUCUSqQpyJo-CYaHPGcucqwe3-CDt4nQptEGP6M-CRestDoG7xZFy', 175 | 'norm_title': 'library and archives canada ottawa', 176 | 'tophash': 'e264cc07209bfaecc291f97c7f8765229ce4c1d36ac6901c477e05b2422eea3e'} 177 | ``` 178 | 179 | ## Maintainers 180 | 181 | [@titusz](https://github.com/titusz) 182 | 183 | ## Contributing 184 | 185 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. 186 | 187 | Please make sure to update tests as appropriate. 188 | 189 | You may also want to join our developer chat on Telegram at . 
190 | 191 | ## Change Log 192 | 193 | ### [0.9.12] - 2021-07-16 194 | - Update to custom mediatype detection (without Tika requirement) 195 | - Update dependencies 196 | 197 | ### [0.9.11] - 2020-06-12 198 | - Update dependencies 199 | - Remove support for creating ISCC codes from youtube urls 200 | 201 | ### [0.9.10] - 2020-05-19 202 | - Fixed issue with mime-type detection 203 | - Changed wording of similarity output 204 | - Added CSV-compatible output for batch command 205 | - Added debug option for batch command 206 | - Updated dependencies 207 | 208 | ### [0.9.9] - 2020-05-18 209 | - Fixed issue with tika & macOS 210 | - Added macOS ci testing 211 | - Updated dependencies 212 | 213 | ### [0.9.8] - 2020-05-13 214 | - Updated Content-ID-Audio for robustness against transcoding (breaking change) 215 | - Changed similarity calculation to match with web demo 216 | - Fixed bug in mime-type detection 217 | - Updated dependencies 218 | 219 | ### [0.9.7] - 2020-05-01 220 | - Add support for flac and opus audio formats 221 | - Update dependencies 222 | 223 | ### [0.9.6] - 2020-04-24 224 | - Support urls with dump command 225 | - Updated tika 1.24 and fpcalc 1.50 226 | - Use filename for meta-id as last resort 227 | - Switch to signed audio fingerprint (breaking change) 228 | - Bugfixes and stability improvements 229 | 230 | ### [0.9.5] - 2020-03-02 231 | - Support mobi7 232 | - Support mobi print replica 233 | - Support mobi with web command 234 | 235 | ### [0.9.4] - 2020-03-02 236 | - Add experimental support for mobi files 237 | 238 | ### [0.9.3] - 2020-02-18 239 | - Add support for XHTML 240 | - Fix error on unsupported media types 241 | 242 | ### [0.9.2] - 2020-01-30 243 | - Add support for bmp, psd, xls, xlsx 244 | - Add tika server live testing 245 | - Fix error with title guess on image files 246 | 247 | ### [0.9.1] - 2020-01-05 248 | - Fix issue with APP_DIR creation 249 | 250 | ### [0.9.0] - 2020-01-05 251 | - Add experimental support for Video-ID 252 | - Add 
special handling of YouTube URLs 253 | - Add support for more Media Types (try & error) 254 | - Add support for Python 3.8 255 | - Remove support for Python 3.5 256 | 257 | ### [0.8.2] - 2019-12-22 258 | - Add new `test` command for confromance testing 259 | - Add support for .md (Markdown) files 260 | - Update to ISCC v1.0.5 261 | - Update to Apache Tika 1.23 262 | - Fix issue with non-conformant Meta-ID 263 | 264 | ### [0.8.1] - 2019-12-13 265 | - Add support for tif files 266 | - Add support for eps files 267 | - Set application directory to non-roaming path 268 | 269 | ### [0.8.0] - 2019-11-23 270 | - Add new `dump` command (dumps extraction results) 271 | - Add support for iBooks files 272 | - Fix error with tika 1.22 dependency 273 | - Store tika server in non-volatile storage 274 | 275 | ### [0.7.0] - 2019-09-12 276 | - Expose commands as python API 277 | - Fix title guessing bug 278 | 279 | ### [0.6.0] - 2019-06-11 280 | 281 | - Added new `web` command (creates ISCC Codes for URLs) 282 | 283 | ### [0.5.0] - 2019-06-06 284 | 285 | - Added experimental support for aif, mp3, ogg, wav 286 | - More verbose batch output 287 | - Fix batch output default Meta-ID 288 | 289 | ### [0.4.0] - 2019-06-03 290 | 291 | - Added support for html, odt, txt, xml, gif 292 | - Added optional guessing of title (first line of text) 293 | - Added new `info` command 294 | - Fixed wrong detection of identical Instance-ID 295 | 296 | ### [0.3.0] - 2019-06-01 297 | 298 | - Add `sim` command similarity comparison of ISCC Codes 299 | 300 | ### [0.2.0] - 2019-05-31 301 | 302 | - Add support for doc, docx and rtf documents 303 | - Update to ISCC 1.0.4 (fixes whitespace bug) 304 | 305 | ### [0.1.0] - 2019-05-31 306 | 307 | - Basic ISCC Code creation 308 | - Supported file types: jpg, png, pdf, epub 309 | 310 | ## License 311 | 312 | MIT © 2019-2021 Titusz Pan 313 | 314 | -------------------------------------------------------------------------------- /iscc_cli/__init__.py: 
# -*- coding: utf-8 -*-
"""Experimental support for Audio-ID."""
import json
import subprocess
import iscc
from iscc_cli import fpcalc


def content_id_audio(features, partial=False):
    """Create an encoded Content-ID-Audio from a sequence of integer features.

    Every item produced by ``iscc.sliding_window(features, 2)`` is unpacked
    into two integers, each serialized as a signed big-endian 4-byte value and
    concatenated into one 8-byte digest. The digests are passed to
    ``iscc.similarity_hash`` and the result is prefixed with a header constant.

    :param features: integers that each fit into 4 signed bytes.
    :param partial: when true, ``iscc.HEAD_CID_A_PCF`` is used as header,
        otherwise ``iscc.HEAD_CID_A``.
    :return: whatever ``iscc.encode`` returns for header + similarity hash.
    """
    packed_pairs = [
        left.to_bytes(4, "big", signed=True) + right.to_bytes(4, "big", signed=True)
        for left, right in iscc.sliding_window(features, 2)
    ]
    body = iscc.similarity_hash(packed_pairs)
    header = iscc.HEAD_CID_A_PCF if partial else iscc.HEAD_CID_A
    return iscc.encode(header + body)


def get_chroma_vector(file):
    """Return the ``fingerprint`` entry of fpcalc's JSON output for ``file``.

    :param file: either an object with ``read``/``seek`` (its content is piped
        to fpcalc via stdin after rewinding to position 0) or a value that is
        handed to fpcalc as the input argument (presumably a file path).
    :return: the value stored under ``"fingerprint"`` in the JSON that fpcalc
        prints when called with ``-raw -json -signed``.
    """
    is_stream = hasattr(file, "read")
    if is_stream:
        # Rewind first so the complete stream is fingerprinted.
        file.seek(0)

    # "-" tells fpcalc to read the audio data from stdin.
    source = "-" if is_stream else file
    command = [fpcalc.exe_path(), "-raw", "-json", "-signed", source]

    run_options = {"stdout": subprocess.PIPE}
    if is_stream:
        run_options["input"] = file.read()

    completed = subprocess.run(command, **run_options)
    return json.loads(completed.stdout.decode("utf-8"))["fingerprint"]
@click.command(cls=DefaultHelp)
@click.argument("path", type=click.Path(exists=True))
@click.option("-r", "--recursive", is_flag=True, help="Recurse into subdirectories.")
@click.option(
    "-g",
    "--guess",
    is_flag=True,
    default=False,
    help="Guess title (first line of text).",
    show_default=True,
)
@click.option(
    "-d",
    "--debug",
    is_flag=True,
    default=False,
    help="Show debug output",
    show_default=True,
)
def batch(path, recursive, guess, debug):
    """Create ISCC Codes for all files in PATH.

    Example:

      $ iscc batch ~/Documents

    """
    # NOTE: the docstring above doubles as the help text of the command.
    #
    # For every file yielded by ``get_files`` one comma separated line is
    # echoed (the four ISCC components, tophash, file name, GMT, normalized
    # title) and a dict with the dash-joined code is collected. Files that are
    # empty, unsupported or fail during extraction are logged and skipped.
    # Returns the list of collected result dicts.
    if debug:
        # Route loguru records to stdout so skipped files become visible.
        log.add(sys.stdout)

    results = []
    for f in get_files(path, recursive=recursive):
        filesize = os.path.getsize(f)
        if not filesize:
            msg = "Cannot proccess empty file: {}".format(f)
            log.warning(msg)
            continue

        media_type = mime_clean(mime_guess(f))
        if media_type not in SUPPORTED_MIME_TYPES:
            fname = basename(f)
            msg = "Unsupported file {} with mime type: {},,,,".format(fname, media_type)
            log.warning(msg)
            continue

        if media_type == "application/x-mobipocket-ebook":
            # Mobi files are unpacked first; Tika parses the extracted file.
            try:
                tempdir, epub_filepath = mobi.extract(f)
                try:
                    tika_result = parser.from_file(epub_filepath)
                finally:
                    # Always remove the extraction directory, even when
                    # parsing fails, so no temporary data is left behind.
                    shutil.rmtree(tempdir)
            except Exception as e:
                # Include the file and the cause; the previous message carried
                # an unfilled "%s" placeholder and dropped the exception.
                log.error("Error with mobi extraction of {}: {}", f, e)
                continue
        else:
            tika_result = parser.from_file(f)

        title = get_title(tika_result, guess=guess, uri=f)

        mid, norm_title, _ = iscc.meta_id(title)
        gmt = mime_to_gmt(media_type, file_path=f)
        if gmt == GMT.IMAGE:
            try:
                cid = iscc.content_id_image(f)
            except Exception as e:
                msg = "Clould not proccess image: {} ({})".format(f, e)
                log.error(msg)
                continue

        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                msg = "Could not extract text from {}".format(basename(f))
                log.warning(msg)
                continue
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            # fpcalc is installed on demand before the first fingerprint.
            if not fpcalc.is_installed():
                fpcalc.install()
            features = audio_id.get_chroma_vector(f)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            features = video_id.get_frame_vectors(abspath(f))
            cid = video_id.content_id_video(features)
        else:
            log.error("Could not generate ISCC")
            continue

        did = iscc.data_id(f)
        iid, tophash = iscc.instance_id(f)

        # Comma separated variant of the code for the CSV-style console line.
        iscc_code_cs = ",".join((mid, cid, did, iid))

        click.echo(
            "{iscc_code},{tophash},{fname},{gmt},{title}".format(
                iscc_code=iscc_code_cs,
                tophash=tophash,
                fname=basename(f),
                gmt=gmt,
                title=norm_title,
            )
        )
        # Dash separated variant of the code for the returned result dicts.
        iscc_code = "-".join((mid, cid, did, iid))
        results.append(
            dict(
                iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt,
                file_name=basename(f),
            )
        )

    return results
@click.command(cls=DefaultHelp)
@click.argument("file", type=click.File("rb"))
@click.option(
    "-g",
    "--guess",
    is_flag=True,
    default=False,
    help="Guess title (first line of text).",
)
@click.option("-t", "--title", type=click.STRING, help="Title for Meta-ID creation.")
@click.option(
    "-e", "--extra", type=click.STRING, help="Extra text for Meta-ID creation."
)
@click.option("-v", "--verbose", is_flag=True, help="Enables verbose mode.")
def gen(file, guess, title, extra, verbose):
    """Generate ISCC Code for FILE."""
    # NOTE: the docstring above doubles as the help text of the command.
    #
    # Echoes "ISCC:<code>" (plus details in verbose mode) and returns a dict
    # with the keys iscc, norm_title, tophash and gmt. Returns None when no
    # text could be extracted or no Content-ID could be created. Raises
    # click.BadParameter for empty files.
    filesize = os.path.getsize(file.name)
    if not filesize:
        raise click.BadParameter("Cannot proccess empty file: {}".format(file.name))

    media_type = mime_clean(mime_guess(file.name))
    if media_type not in SUPPORTED_MIME_TYPES:
        # Only a notice: processing is still attempted for unlisted types.
        click.echo("Unsupported media type {}.".format(media_type))
        click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")

    if media_type == "application/x-mobipocket-ebook":
        # Mobi files are unpacked first; Tika parses the extracted file.
        tempdir, epub_filepath = mobi.extract(file.name)
        try:
            tika_result = parser.from_file(epub_filepath)
        finally:
            # Remove the extraction directory even if parsing fails so no
            # temporary data is left behind.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_file(file.name)

    if not title:
        title = get_title(tika_result, guess=guess, uri=file.name)

    if not extra:
        extra = ""

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type, file_path=file.name)
    if gmt == GMT.IMAGE:
        cid = iscc.content_id_image(file.name)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text from {}".format(file.name))
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        # fpcalc is installed on demand before the first fingerprint.
        if not fpcalc.is_installed():
            fpcalc.install()
        features = audio_id.get_chroma_vector(file.name)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        features = video_id.get_frame_vectors(abspath(file.name))
        cid = video_id.content_id_video(features)
    else:
        click.echo("Could not generate ISCC")
        return

    did = iscc.data_id(file.name)
    iid, tophash = iscc.instance_id(file.name)

    # Without a normalized title the Meta-ID is left out of the code.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % file.name)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
@click.command(cls=DefaultHelp)
@click.argument("a", nargs=1)
@click.argument("b", nargs=1)
def sim(a, b):
    """Estimate Similarity of ISCC Codes A & B.

    Example:

        $ iscc sim CCUcKwdQc1jUM CCjMmrCsKWu1D

    You may also compare fully qualified ISCC Codes with each other.
    """
    # NOTE: the docstring above is the command's help text; keep it verbatim.

    # Both inputs must verify; the first failure is printed and the process
    # exits with status 1.
    try:
        iscc_verify(a)
        iscc_verify(b)
    except ValueError as error:
        click.echo(str(error))
        sys.exit(1)

    # Overall similarity, computed only when both cleaned codes are 52
    # characters long: the decoded component digests (header byte dropped)
    # are concatenated and compared bit by bit.
    summary = None
    if len(iscc_clean(a)) == 52 and len(iscc_clean(b)) == 52:
        raw_a = b"".join(iscc.decode(part)[1:] for part in iscc_split(a))
        raw_b = b"".join(iscc.decode(part)[1:] for part in iscc_split(b))
        bits_a = int.from_bytes(raw_a, "big", signed=False)
        bits_b = int.from_bytes(raw_b, "big", signed=False)
        differing = bin(bits_a ^ bits_b).count("1")
        percent = ((192 - differing) / 192) * 100
        summary = "Average Estimated Similarity: {:.2f} % ({} of 192 bits differnt)".format(
            percent, differing
        )

    # Per-component comparison.
    parts_a = iscc_split(a)
    parts_b = iscc_split(b)

    # Two single components of different kinds are reported as incompatible.
    if len(parts_a) == 1 and len(parts_b) == 1:
        name_a = ISCC_COMPONENT_CODES.get(parts_a[0][:2])["name"]
        name_b = ISCC_COMPONENT_CODES.get(parts_b[0][:2])["name"]
        if name_a != name_b:
            click.echo("Incompatible component types ({} & {}).".format(name_a, name_b))

    for code_a in parts_a:
        for code_b in parts_b:
            name_a = ISCC_COMPONENT_CODES.get(code_a[:2])["name"]
            name_b = ISCC_COMPONENT_CODES.get(code_b[:2])["name"]
            if name_a != name_b:
                continue
            if name_a == "Instance-ID":
                # Instance-IDs are only checked for exact equality.
                if code_a == code_b:
                    click.echo("Identical Instance-ID")
                continue
            matching = 64 - iscc.distance(code_a, code_b)
            percent = round(matching / (2 * 64 - matching) * 100)
            click.echo(
                "Estimated Similarity of {}: {:.2f} % ({} of 64 bits match)".format(
                    name_a, percent, matching
                )
            )

    if summary:
        click.echo(summary)
@click.command()
def test():
    """Test conformance with latest reference data."""
    # NOTE: the docstring above is the command's help text; keep it verbatim.
    #
    # Downloads test_data.json from const.TEST_DATA_URL and, for every
    # function group marked "required", runs each "test_*" case against the
    # function of the same name in the `iscc` module, printing PASSED/FAILED.
    click.echo("Running confromance tests.\n")
    test_data = requests.get(const.TEST_DATA_URL + "test_data.json").json()
    for funcname, tests in test_data.items():
        if not tests["required"]:
            continue
        for testname, testdata in tests.items():
            # Non-test keys (such as "required") live next to the test cases.
            if not testname.startswith("test_"):
                continue
            func = getattr(iscc, funcname)
            args = testdata["inputs"]
            # A first input starting with "file" names a fixture that has to
            # be downloaded and handed over as a binary stream.
            if isinstance(args[0], str) and args[0].startswith("file"):
                r = requests.get(const.TEST_DATA_URL + args[0])
                args[0] = io.BytesIO(r.content)

            if funcname == "data_chunks":
                # Expected chunks are "<prefix>:<hex>" strings; the function
                # result is materialised with list() for comparison.
                testdata["outputs"] = [
                    bytes.fromhex(i.split(":")[1]) for i in testdata["outputs"]
                ]
                result = list(func(*args))
            else:
                result = func(*args)
            expected = testdata["outputs"]
            # Plain comparison instead of `assert`: assert statements are
            # removed under `python -O`, which made every case report PASSED.
            if result == expected:
                click.echo("PASSED %s" % testname)
            else:
                click.echo("FAILED %s" % testname)
                click.echo("Result %s != Expected %s" % (result, expected))
@click.command(cls=DefaultHelp)
@click.argument("url", type=click.STRING)
@click.option(
    "-g",
    "--guess",
    is_flag=True,
    default=False,
    help="Guess title (first line of text).",
)
@click.option("-t", "--title", type=click.STRING, help="Title for Meta-ID creation.")
@click.option(
    "-e", "--extra", type=click.STRING, help="Extra text for Meta-ID creation."
)
@click.option("-v", "--verbose", is_flag=True, help="Enables verbose mode.")
def web(url, guess, title, extra, verbose):
    """Generate ISCC Code from URL."""
    # NOTE: the docstring above is the command's help text; keep it verbatim.
    #
    # Returns a dict with the keys "iscc", "norm_title", "tophash" and "gmt",
    # or None when the media type is unsupported, no text could be extracted,
    # or no Content-ID could be generated.

    extra = extra or ""

    try:
        resp = requests.get(url, headers=HEADERS, stream=True)
    except Exception as e:
        # click expects a message string, not an exception instance.
        raise click.BadArgumentUsage(str(e)) from e

    data = BytesIO(resp.content)
    media_type = mime_clean(mime_guess(data))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}".format(media_type))
        click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")
        return

    data.seek(0)
    if media_type == "application/x-mobipocket-ebook":
        tempdir, filepath = mobi.extract(data)
        try:
            tika_result = parser.from_file(filepath)
        finally:
            # Remove the extraction directory even if parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_buffer(data)

    if not title:
        title = get_title(tika_result, guess=guess, uri=url)

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type)
    if gmt == GMT.IMAGE:
        data.seek(0)
        cid = iscc.content_id_image(data)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text")
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        data.seek(0)
        features = audio_id.get_chroma_vector(data)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        # Video features are read from a local file, so the URL is downloaded
        # again to disk and the file is removed afterwards.
        local_path = download_file(url, sanitize=True)
        try:
            features = video_id.get_frame_vectors(local_path)
            cid = video_id.content_id_video(features)
        finally:
            os.remove(local_path)
    else:
        # Same fallback as the `gen` command; without it `cid` would be
        # unbound below.
        click.echo("Could not generate ISCC")
        return

    data.seek(0)
    did = iscc.data_id(data)
    data.seek(0)
    iid, tophash = iscc.instance_id(data)

    # The Meta-ID is left out when no normalized title is available.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % url)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
{"gmt": GMT.TEXT, "ext": "txt"}, 25 | "application/x-ibooks+zip": {"gmt": GMT.TEXT, "ext": "ibooks"}, 26 | "text/x-web-markdown": {"gmt": GMT.TEXT, "ext": "md"}, 27 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document": { 28 | "gmt": GMT.TEXT, 29 | "ext": "docx", 30 | }, 31 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": { 32 | "gmt": GMT.TEXT, 33 | "ext": "xlsx", 34 | }, 35 | "application/vnd.ms-excel": {"gmt": GMT.TEXT, "ext": "xls"}, 36 | "application/x-mobipocket-ebook": { 37 | "gmt": GMT.TEXT, 38 | "ext": ["mobi", "prc", "azw", "azw3", "azw4"], 39 | }, 40 | # Image Formats 41 | "image/bmp": {"gmt": GMT.IMAGE, "ext": "bmp"}, 42 | "image/gif": {"gmt": GMT.IMAGE, "ext": "gif"}, 43 | "image/jpeg": {"gmt": GMT.IMAGE, "ext": ["jpg", "jpeg"]}, 44 | "image/png": {"gmt": GMT.IMAGE, "ext": "png"}, 45 | "image/tiff": {"gmt": GMT.IMAGE, "ext": "tif"}, 46 | "image/vnd.adobe.photoshop": {"gmt": GMT.IMAGE, "ext": "psd"}, 47 | "application/postscript": {"gmt": GMT.IMAGE, "ext": "eps"}, 48 | # Audio Formats 49 | "audio/mpeg": {"gmt": GMT.AUDIO, "ext": "mp3"}, 50 | "audio/vnd.wave": {"gmt": GMT.AUDIO, "ext": "wav"}, 51 | "audio/vorbis": {"gmt": GMT.AUDIO, "ext": "ogg"}, 52 | "audio/x-aiff": {"gmt": GMT.AUDIO, "ext": "aif"}, 53 | "audio/x-flac": {"gmt": GMT.AUDIO, "ext": "flac"}, 54 | "audio/opus": {"gmt": GMT.AUDIO, "ext": "opus"}, 55 | # Video Formats 56 | "application/vnd.rn-realmedia": {"gmt": GMT.VIDEO, "ext": "rm"}, 57 | "video/x-dirac": {"gmt": GMT.VIDEO, "ext": "drc"}, 58 | "video/3gpp": {"gmt": GMT.VIDEO, "ext": "3gp"}, 59 | "video/3gpp2": {"gmt": GMT.VIDEO, "ext": "3g2"}, 60 | "video/x-ms-asf": {"gmt": GMT.VIDEO, "ext": "asf"}, 61 | "video/x-msvideo": {"gmt": GMT.VIDEO, "ext": "avi"}, 62 | "video/webm": {"gmt": GMT.VIDEO, "ext": "webm"}, 63 | "video/mpeg": {"gmt": GMT.VIDEO, "ext": ["mpeg", "mpg", "m1v", "vob"]}, 64 | "video/mp4": {"gmt": GMT.VIDEO, "ext": "mp4"}, 65 | "video/x-m4v": {"gmt": GMT.VIDEO, "ext": "m4v"}, 66 | 
"video/x-matroska": {"gmt": GMT.VIDEO, "ext": "mkv"}, 67 | "video/theora": {"gmt": GMT.VIDEO, "ext": ["ogg", "ogv"]}, 68 | "video/quicktime": {"gmt": GMT.VIDEO, "ext": ["mov", "f4v"]}, 69 | "video/x-flv": {"gmt": GMT.VIDEO, "ext": "flv"}, 70 | "application/x-shockwave-flash": {"gmt": GMT.VIDEO, "ext": "swf"}, 71 | "video/h264": {"gmt": GMT.VIDEO, "ext": "h264"}, 72 | "video/x-ms-wmv": {"gmt": GMT.VIDEO, "ext": "wmv"}, 73 | } 74 | 75 | 76 | SUPPORTED_EXTENSIONS = [] 77 | for v in SUPPORTED_MIME_TYPES.values(): 78 | ext = v["ext"] 79 | if isinstance(ext, str): 80 | SUPPORTED_EXTENSIONS.append(ext) 81 | else: 82 | for e in ext: 83 | SUPPORTED_EXTENSIONS.append(e) 84 | 85 | 86 | ISCC_COMPONENT_TYPES = { 87 | const.HEAD_MID: {"name": "Meta-ID", "code": "CC"}, 88 | const.HEAD_CID_T: {"name": "Content-ID Text", "code": "CT"}, 89 | const.HEAD_CID_T_PCF: {"name": "Content-ID Text", "code": "Ct"}, 90 | const.HEAD_CID_I: {"name": "Content-ID Image", "code": "CY"}, 91 | const.HEAD_CID_I_PCF: {"name": "Content-ID Image", "code": "Ci"}, 92 | const.HEAD_CID_A: {"name": "Content-ID Audio", "code": "CA"}, 93 | const.HEAD_CID_A_PCF: {"name": "Content-ID Audio", "code": "Ca"}, 94 | const.HEAD_CID_V: {"name": "Content-ID Video", "code": "CV"}, 95 | const.HEAD_CID_V_PCF: {"name": "Content-ID Video", "code": "Cv"}, 96 | const.HEAD_CID_M: {"name": "Content-ID Mixed", "code": "CM"}, 97 | const.HEAD_CID_M_PCF: {"name": "Content-ID Mixed", "code": "Cm"}, 98 | const.HEAD_DID: {"name": "Data-ID", "code": "CD"}, 99 | const.HEAD_IID: {"name": "Instance-ID", "code": "CR"}, 100 | } 101 | 102 | ISCC_COMPONENT_CODES = { 103 | value["code"]: {"name": value["name"], "marker": key} 104 | for key, value in ISCC_COMPONENT_TYPES.items() 105 | } 106 | 107 | TEST_DATA_URL = "https://raw.githubusercontent.com/iscc/iscc-specs/master/tests/" 108 | 109 | WTA_PERMUTATIONS = ( 110 | (292, 16), 111 | (219, 247), 112 | (295, 7), 113 | (105, 236), 114 | (251, 142), 115 | (334, 82), 116 | (17, 266), 117 | (250, 
167), 118 | (38, 127), 119 | (184, 22), 120 | (215, 71), 121 | (308, 181), 122 | (195, 215), 123 | (145, 345), 124 | (134, 233), 125 | (89, 351), 126 | (155, 338), 127 | (185, 68), 128 | (233, 122), 129 | (225, 314), 130 | (192, 22), 131 | (298, 2), 132 | (120, 68), 133 | (99, 155), 134 | (274, 187), 135 | (122, 160), 136 | (341, 281), 137 | (230, 223), 138 | (240, 33), 139 | (334, 299), 140 | (166, 256), 141 | (80, 114), 142 | (211, 122), 143 | (18, 16), 144 | (254, 154), 145 | (310, 336), 146 | (36, 273), 147 | (41, 76), 148 | (196, 290), 149 | (191, 307), 150 | (76, 57), 151 | (49, 226), 152 | (85, 97), 153 | (178, 221), 154 | (212, 228), 155 | (125, 348), 156 | (140, 73), 157 | (316, 267), 158 | (91, 61), 159 | (136, 233), 160 | (154, 84), 161 | (338, 332), 162 | (89, 90), 163 | (245, 177), 164 | (167, 222), 165 | (114, 2), 166 | (278, 364), 167 | (22, 169), 168 | (163, 124), 169 | (40, 134), 170 | (229, 207), 171 | (298, 81), 172 | (199, 253), 173 | (344, 123), 174 | (376, 268), 175 | (139, 266), 176 | (247, 308), 177 | (255, 32), 178 | (85, 250), 179 | (345, 236), 180 | (205, 69), 181 | (215, 277), 182 | (299, 178), 183 | (275, 198), 184 | (250, 359), 185 | (84, 286), 186 | (225, 50), 187 | (212, 18), 188 | (1, 224), 189 | (274, 33), 190 | (25, 179), 191 | (47, 77), 192 | (55, 311), 193 | (232, 248), 194 | (71, 234), 195 | (223, 256), 196 | (228, 175), 197 | (371, 132), 198 | (357, 234), 199 | (216, 168), 200 | (332, 266), 201 | (267, 78), 202 | (378, 121), 203 | (165, 316), 204 | (16, 351), 205 | (100, 329), 206 | (301, 294), 207 | (321, 245), 208 | (12, 59), 209 | (151, 222), 210 | (126, 367), 211 | (148, 45), 212 | (23, 305), 213 | (281, 54), 214 | (146, 83), 215 | (343, 244), 216 | (72, 184), 217 | (304, 205), 218 | (98, 179), 219 | (93, 40), 220 | (302, 99), 221 | (218, 106), 222 | (49, 350), 223 | (157, 237), 224 | (355, 267), 225 | (369, 216), 226 | (229, 340), 227 | (284, 106), 228 | (136, 305), 229 | (186, 59), 230 | (3, 107), 231 | (217, 312), 232 | 
(209, 195), 233 | (333, 102), 234 | (35, 216), 235 | (45, 28), 236 | (178, 130), 237 | (184, 233), 238 | (217, 99), 239 | (321, 144), 240 | (238, 355), 241 | (150, 259), 242 | (255, 259), 243 | (134, 207), 244 | (226, 327), 245 | (174, 178), 246 | (371, 141), 247 | (247, 228), 248 | (244, 300), 249 | (245, 42), 250 | (353, 276), 251 | (368, 187), 252 | (369, 207), 253 | (86, 308), 254 | (212, 368), 255 | (288, 33), 256 | (304, 375), 257 | (156, 8), 258 | (302, 167), 259 | (333, 164), 260 | (37, 379), 261 | (203, 312), 262 | (191, 144), 263 | (310, 95), 264 | (123, 86), 265 | (157, 48), 266 | (284, 27), 267 | (112, 291), 268 | (37, 215), 269 | (98, 291), 270 | (292, 224), 271 | (303, 8), 272 | (200, 103), 273 | (173, 294), 274 | (97, 267), 275 | (288, 167), 276 | (24, 336), 277 | (354, 296), 278 | (25, 18), 279 | (289, 187), 280 | (203, 166), 281 | (307, 326), 282 | (87, 80), 283 | (60, 310), 284 | (176, 84), 285 | (15, 370), 286 | (274, 261), 287 | (178, 45), 288 | (203, 224), 289 | (295, 178), 290 | (30, 74), 291 | (227, 361), 292 | (241, 312), 293 | (231, 369), 294 | (226, 309), 295 | (89, 181), 296 | (216, 175), 297 | (286, 262), 298 | (234, 198), 299 | (99, 49), 300 | (221, 328), 301 | (78, 21), 302 | (95, 327), 303 | (324, 97), 304 | (291, 219), 305 | (184, 286), 306 | (192, 25), 307 | (309, 26), 308 | (84, 159), 309 | (114, 25), 310 | (296, 90), 311 | (51, 325), 312 | (289, 184), 313 | (95, 154), 314 | (21, 202), 315 | (306, 219), 316 | (39, 176), 317 | (99, 251), 318 | (83, 86), 319 | (207, 239), 320 | (168, 19), 321 | (88, 90), 322 | (297, 361), 323 | (215, 78), 324 | (262, 328), 325 | (356, 200), 326 | (48, 203), 327 | (60, 120), 328 | (54, 216), 329 | (369, 327), 330 | (159, 370), 331 | (148, 273), 332 | (332, 50), 333 | (176, 267), 334 | (317, 243), 335 | (311, 125), 336 | (272, 148), 337 | (6, 340), 338 | (80, 346), 339 | (197, 355), 340 | (117, 49), 341 | (261, 326), 342 | (242, 51), 343 | (295, 204), 344 | (298, 111), 345 | (147, 181), 346 | (35, 96), 
class GMT(str, Enum):
    """Generic Media Type"""

    # Members also inherit from ``str``, so they compare equal to their plain
    # string values (``GMT.text == "text"``).
    text = "text"
    image = "image"
    audio = "audio"
    video = "video"
    unknown = "unknown"
def download_url():
    """Return system and version dependent download url.

    Raises:
        KeyError: if ``platform.system()`` has no entry in ``FPCALC_OS_MAP``.
    """
    # URLs always use "/" regardless of platform, so the parts are joined
    # explicitly instead of with os.path.join (which uses the OS separator).
    return FPCALC_URL_BASE.rstrip("/") + "/" + FPCALC_OS_MAP[platform.system()]
def iscc_from_file(file, guess=False, title="", extra="") -> Dict:
    """Generate an ISCC for a file via the ``gen`` command callback.

    Args:
        file: Either a path string or an already opened file object.
        guess: Forwarded to ``gen`` (guess the title from the content).
        title: Forwarded to ``gen`` (title for Meta-ID creation).
        extra: Forwarded to ``gen`` (extra text for Meta-ID creation).

    Returns:
        Whatever ``gen.callback`` returns; verbose output is always disabled.
    """
    if isinstance(file, str):
        # We opened the file ourselves, so we also close it again once the
        # command has finished (previously the handle was leaked).
        with open(file, "rb") as handle:
            return gen.callback(handle, guess, title, extra, False)
    return gen.callback(file, guess, title, extra, False)
def mime_clean(mime: Union[str, List]):
    """
    Clean mimetype/content-type string or first entry of a list of mimetype strings.
    Also removes semicolon separated encoding information.

    Empty input (``None``, ``""`` or an empty list) yields an empty string
    instead of raising ``AttributeError``.
    """
    if isinstance(mime, list):
        mime = mime[0] if mime else ""
    if not mime:
        return ""
    return mime.split(";")[0].strip()


def mime_to_gmt(mime_type: str, file_path=None):
    """Get generic mediatype from mimetype.

    Args:
        mime_type: Mimetype string (or list of strings); cleaned with
            ``mime_clean`` before lookup.
        file_path: Optional path to the file. Only used for ``image/gif`` to
            tell animated GIFs ("video") from still ones ("image").

    Returns:
        The generic media type string, or ``None`` if it cannot be determined.
    """
    mime_type = mime_clean(mime_type)
    if mime_type == "image/gif" and file_path:
        # Close the image again after inspection instead of leaking the handle.
        with Image.open(file_path) as img:
            animated = getattr(img, "is_animated", False)
        return "video" if animated else "image"
    entry = SUPPORTED_MEDIATYPES.get(mime_type)
    if entry:
        return entry["gmt"]
    # Fall back to the top-level type ("audio/x-foo" -> "audio") when it is
    # one of the generic types used in the table. The previous check used a
    # name (`GMT`) that is not defined in this module and raised NameError.
    gmt = mime_type.split("/")[0]
    known_gmts = {item["gmt"] for item in SUPPORTED_MEDIATYPES.values()}
    if gmt in known_gmts:
        logger.warning(f"Guessing GMT from {mime_type}")
        return gmt
    return None
}, 127 | # Note: pptx only detected by file extension. Sniffing gives 'application/zip' 128 | "application/vnd.openxmlformats-officedocument.presentationml.presentation": { 129 | "gmt": "text", 130 | "ext": "pptx", 131 | }, 132 | "application/vnd.ms-excel": {"gmt": "text", "ext": "xls"}, 133 | "application/x-mobipocket-ebook": { 134 | "gmt": "text", 135 | "ext": ["mobi", "prc", "azw", "azw3", "azw4"], 136 | }, 137 | # Image Formats 138 | "image/bmp": {"gmt": "image", "ext": "bmp"}, 139 | "image/gif": {"gmt": "image", "ext": "gif"}, 140 | "image/jpeg": {"gmt": "image", "ext": ["jpg", "jpeg"]}, 141 | "image/png": {"gmt": "image", "ext": "png"}, 142 | "image/tiff": {"gmt": "image", "ext": "tif"}, 143 | "image/vnd.adobe.photoshop": {"gmt": "image", "ext": "psd"}, 144 | "application/postscript": {"gmt": "image", "ext": "eps"}, 145 | # Audio Formats 146 | "audio/mpeg": {"gmt": "audio", "ext": "mp3"}, 147 | "audio/wav": {"gmt": "audio", "ext": "wav"}, 148 | "audio/x-wav": {"gmt": "audio", "ext": "wav"}, 149 | "audio/ogg": {"gmt": "audio", "ext": "ogg"}, 150 | "audio/aiff": {"gmt": "audio", "ext": "aif"}, 151 | "audio/x-aiff": {"gmt": "audio", "ext": "aif"}, 152 | "audio/x-flac": {"gmt": "audio", "ext": "flac"}, 153 | "audio/opus": {"gmt": "audio", "ext": "opus"}, 154 | # Video Formats 155 | "application/vnd.rn-realmedia": {"gmt": "video", "ext": "rm"}, 156 | "video/x-dirac": {"gmt": "video", "ext": "drc"}, 157 | "video/3gpp": {"gmt": "video", "ext": "3gp"}, 158 | "video/3gpp2": {"gmt": "video", "ext": "3g2"}, 159 | "video/x-ms-asf": {"gmt": "video", "ext": "asf"}, 160 | "video/avi": {"gmt": "video", "ext": "avi"}, 161 | "video/webm": {"gmt": "video", "ext": "webm"}, 162 | "video/mpeg": {"gmt": "video", "ext": ["mpeg", "mpg", "m1v", "vob"]}, 163 | "video/mp4": {"gmt": "video", "ext": "mp4"}, 164 | "video/x-m4v": {"gmt": "video", "ext": "m4v"}, 165 | "video/x-matroska": {"gmt": "video", "ext": "mkv"}, 166 | "video/ogg": {"gmt": "video", "ext": ["ogg", "ogv"]}, 167 | 
"video/quicktime": {"gmt": "video", "ext": ["mov", "f4v"]}, 168 | "video/x-flv": {"gmt": "video", "ext": "flv"}, 169 | "application/x-shockwave-flash": {"gmt": "video", "ext": "swf"}, 170 | "video/h264": {"gmt": "video", "ext": "h264"}, 171 | "video/x-ms-wmv": {"gmt": "video", "ext": "wmv"}, 172 | } 173 | 174 | MEDIATYPE_NORM = { 175 | "audio/x-aiff": "audio/aiff", 176 | "audio/x-wav": "audio/wav", 177 | "image/x-ms-bmp": "image/bmp", 178 | "video/x-msvideo": "video/avi", 179 | } 180 | 181 | SUPPORTED_EXTENSIONS = [] 182 | for v in SUPPORTED_MEDIATYPES.values(): 183 | ext = v["ext"] 184 | if isinstance(ext, str): 185 | SUPPORTED_EXTENSIONS.append(ext) 186 | else: 187 | for e in ext: 188 | SUPPORTED_EXTENSIONS.append(e) 189 | -------------------------------------------------------------------------------- /iscc_cli/tika/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
def initVM():
    """Do nothing; kept for backward compatibility with JCC based Tika."""
    return None
def getParsers():
    """
    Query the "parsers" configuration.
    :return: the second element of the result of getConfig("parsers")
    """
    result = getConfig("parsers")
    return result[1]


def getMimeTypes():
    """
    Query the "mime-types" configuration.
    :return: the second element of the result of getConfig("mime-types")
    """
    result = getConfig("mime-types")
    return result[1]


def getDetectors():
    """
    Query the "detectors" configuration.
    :return: the second element of the result of getConfig("detectors")
    """
    result = getConfig("detectors")
    return result[1]
def from_file(filename, config_path=None, requestOptions=None):
    """
    Detects MIME type of specified file
    :param filename: file whose type needs to be detected
    :param config_path: optional configuration file path, passed through
        to detectType1
    :param requestOptions: optional dict of request options, passed through
        to detectType1; a fresh empty dict is used when omitted
    :return: MIME type (second element of the detectType1 result)
    """
    # A None default avoids sharing one mutable dict between all calls.
    if requestOptions is None:
        requestOptions = {}
    jsonOutput = detectType1(
        "type", filename, config_path=config_path, requestOptions=requestOptions
    )
    return jsonOutput[1]


def from_buffer(string, config_path=None, requestOptions=None):
    """
    Detects MIME type of the buffered content
    :param string: buffered content whose type needs to be detected
    :param config_path: optional configuration file path, passed through
        to callServer
    :param requestOptions: optional dict of request options, passed through
        to callServer; a fresh empty dict is used when omitted
    :return: the response part of the (status, response) pair returned by
        callServer for a PUT to "/detect/stream" -- presumably the detected
        MIME type as plain text (TODO confirm against the server)
    """
    # A None default avoids sharing one mutable dict between all calls.
    if requestOptions is None:
        requestOptions = {}
    _status, response = callServer(
        "put",
        ServerEndpoint,
        "/detect/stream",
        string,
        {"Accept": "text/plain"},
        False,
        config_path=config_path,
        requestOptions=requestOptions,
    )
    return response
def from_file(filename, requestOptions=None):
    """
    Detects language of the file
    :param filename: path to file whose language needs to be detected
    :param requestOptions: optional dict of request options, passed through
        to detectLang1; a fresh empty dict is used when omitted
    :return: second element of the detectLang1 result -- presumably the
        detected language (TODO confirm against detectLang1)
    """
    # A None default avoids sharing one mutable dict between all calls.
    if requestOptions is None:
        requestOptions = {}
    jsonOutput = detectLang1("file", filename, requestOptions=requestOptions)
    return jsonOutput[1]


def from_buffer(string, requestOptions=None):
    """
    Detects language of content in the buffer
    :param string: buffered data
    :param requestOptions: optional dict of request options, passed through
        to callServer; a fresh empty dict is used when omitted
    :return: the response part of the (status, response) pair returned by
        callServer for a PUT to "/language/string"
    """
    # A None default avoids sharing one mutable dict between all calls.
    if requestOptions is None:
        requestOptions = {}
    _status, response = callServer(
        "put",
        ServerEndpoint,
        "/language/string",
        string,
        {"Accept": "text/plain"},
        False,
        requestOptions=requestOptions,
    )
    return response
def from_file(
    filename,
    serverEndpoint=ServerEndpoint,
    service="all",
    xmlContent=False,
    headers=None,
    config_path=None,
    requestOptions=None,
):
    """
    Parses a file for metadata and content
    :param filename: path to file which needs to be parsed or binary file using open(path,'rb')
    :param serverEndpoint: Server endpoint url
    :param service: service requested from the tika server
                    Default is 'all', which results in recursive text content+metadata.
                    'meta' returns only metadata
                    'text' returns only content
    :param xmlContent: Whether or not XML content be requested.
                    Default is 'False', which results in text content.
    :param headers: Request headers to be sent to the tika REST server, should
                    be a dictionary. This is optional
    :param config_path: optional configuration file path, passed through to parse1
    :param requestOptions: optional dict of request options, passed through
                    to parse1; a fresh empty dict is used when omitted
    :return: dictionary having 'metadata' and 'content' keys.
            'content' has a str value and metadata has a dict type value.
    """
    # A None default avoids sharing one mutable dict between all calls.
    if requestOptions is None:
        requestOptions = {}

    # Only override parse1's default service map when XML output is wanted.
    extra = {}
    if xmlContent:
        extra["services"] = {"meta": "/meta", "text": "/tika", "all": "/rmeta/xml"}

    output = parse1(
        service,
        filename,
        serverEndpoint,
        headers=headers,
        config_path=config_path,
        requestOptions=requestOptions,
        **extra
    )
    return _parse(output, service)


def from_buffer(
    string,
    serverEndpoint=ServerEndpoint,
    xmlContent=False,
    headers=None,
    config_path=None,
    requestOptions=None,
):
    """
    Parses the content from buffer
    :param string: Buffer value
    :param serverEndpoint: Server endpoint. This is optional
    :param xmlContent: Whether or not XML content be requested.
                    Default is 'False', which results in text content.
    :param headers: Request headers to be sent to the tika REST server, should
                    be a dictionary. This is optional. The caller's dictionary
                    is not modified.
    :param config_path: optional configuration file path, passed through to callServer
    :param requestOptions: optional dict of request options, passed through
                    to callServer; a fresh empty dict is used when omitted
    :return: dictionary as produced by _parse for the server response
    """
    # A None default avoids sharing one mutable dict between all calls.
    if requestOptions is None:
        requestOptions = {}

    # Work on a copy so the caller's headers dict is never mutated.
    headers = dict(headers or {})
    headers["Accept"] = "application/json"

    endpoint = "/rmeta/xml" if xmlContent else "/rmeta/text"
    status, response = callServer(
        "put",
        serverEndpoint,
        endpoint,
        string,
        headers,
        False,
        config_path=config_path,
        requestOptions=requestOptions,
    )

    return _parse((status, response))


def _parse(output, service="all"):
    """
    Parses response from Tika REST API server
    :param output: output from Tika Server, indexed as (status, body)
    :param service: service requested from the tika server
                    Default is 'all', which results in recursive text content+metadata.
                    'meta' returns only metadata
                    'text' returns only content
    :return: a dictionary having 'metadata' and 'content' values; a 'status'
             key is added whenever output is non-empty
    """
    parsed = {"metadata": None, "content": None}
    if not output:
        return parsed

    parsed["status"] = output[0]
    body = output[1]
    if body is None or body == "":
        return parsed

    # 'text' responses are returned verbatim, without JSON decoding.
    if service == "text":
        parsed["content"] = body
        return parsed

    realJson = json.loads(body)

    if service == "meta":
        # The decoded JSON is copied key by key into the metadata dict.
        parsed["metadata"] = {key: realJson[key] for key in realJson}
        return parsed

    # 'all': realJson is iterated as a sequence of dicts. Their
    # "X-TIKA:content" values are concatenated in order.
    content = "".join(
        js["X-TIKA:content"] for js in realJson if "X-TIKA:content" in js
    )
    parsed["content"] = content if content != "" else None

    # Every other key becomes metadata; a key seen more than once has its
    # values collected into a list.
    metadata = {}
    for js in realJson:
        for n in js:
            if n == "X-TIKA:content":
                continue
            if n in metadata:
                if not isinstance(metadata[n], list):
                    metadata[n] = [metadata[n]]
                metadata[n].append(js[n])
            else:
                metadata[n] = js[n]
    parsed["metadata"] = metadata

    return parsed
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # Module documentation 20 | """ 21 | Tika Python module provides a Python API client to Apache Tika Server. 22 | 23 | **Example usage**:: 24 | 25 | import tika 26 | from tika import parser 27 | parsed = parser.from_file('/path/to/file') 28 | print(parsed["metadata"]) 29 | print(parsed["content"]) 30 | 31 | Visit https://github.com/chrismattmann/tika-python to learn more about it. 32 | 33 | **Detect IANA MIME Type**:: 34 | 35 | from tika import detector 36 | print(detector.from_file('/path/to/file')) 37 | 38 | **Detect Language**:: 39 | 40 | from tika import language 41 | print(language.from_file('/path/to/file')) 42 | 43 | **Use Tika Translate**:: 44 | 45 | from tika import translate 46 | print(translate.from_file('/path/to/file', 'srcLang', 'destLang')) 47 | # Use auto Language detection feature 48 | print(translate.from_file('/path/to/file', 'destLang')) 49 | 50 | ***Tika-Python Configuration*** 51 | You can now use custom configuration files. See https://tika.apache.org/1.18/configuring.html 52 | for details on writing configuration files. Configuration is set the first time the server is started.
53 | To use a configuration file with a parser, or detector: 54 | parsed = parser.from_file('/path/to/file', config_path='/path/to/configfile') 55 | or: 56 | detected = detector.from_file('/path/to/file', config_path='/path/to/configfile') 57 | or: 58 | detected = detector.from_buffer('some buffered content', config_path='/path/to/configfile') 59 | 60 | """ 61 | import types 62 | 63 | USAGE = """ 64 | tika.py [-v] [-e] [-o ] [--server ] [--install ] [--port ]