├── .cargo └── config.toml ├── .dockerignore ├── .gitattributes ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── misdetection.md │ └── new_content_type_request.md ├── dependabot.yml ├── labeler.yml ├── scorecard.yml └── workflows │ ├── codeql.yml │ ├── docs-check.yml │ ├── github-issue-labeler.yml │ ├── github-pages.yml │ ├── js-docs-builder.yml │ ├── js-publish.yml │ ├── js-test.yml │ ├── python-build-package.yml │ ├── python-test-published-package.yml │ ├── python-test-published-rc-package.yml │ ├── python-test-suite.yml │ ├── rust-test.yml │ ├── scorecard.yml │ └── website-test.yml ├── .gitignore ├── CITATION.cff ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── assets ├── 2025_icse_magika.pdf ├── content_types_kb.min.json ├── magika-abusech.png ├── magika-screenshot.png ├── magika-vt.png └── models │ ├── CHANGELOG.md │ ├── begonly_v2_1 │ ├── config.min.json │ ├── metadata.json │ ├── model.keras │ └── model.onnx │ ├── fast_v2_1 │ ├── config.min.json │ ├── metadata.json │ ├── model.keras │ └── model.onnx │ ├── standard_v1 │ ├── README.md │ ├── content_types_config.json │ ├── magika_config.json │ ├── model.h5 │ ├── model_config.json │ └── thresholds.json │ ├── standard_v2_0 │ ├── README.md │ ├── config.min.json │ ├── metadata.json │ ├── model.keras │ └── model.onnx │ ├── standard_v2_1 │ ├── README.md │ ├── config.min.json │ ├── metadata.json │ ├── model.keras │ └── model.onnx │ ├── standard_v3_0 │ ├── README.md │ ├── config.min.json │ ├── metadata.json │ └── model.onnx │ ├── standard_v3_1 │ ├── README.md │ ├── config.min.json │ ├── metadata.json │ └── model.onnx │ ├── standard_v3_2 │ ├── README.md │ ├── config.min.json │ ├── metadata.json │ └── model.onnx │ └── standard_v3_3 │ ├── README.md │ ├── config.min.json │ ├── metadata.json │ └── model.onnx ├── docs ├── concepts.md ├── dev-notes.md ├── faq.md └── js.md ├── go ├── README.md ├── cli │ ├── cli.go │ ├── cli_test.go │ ├── main.go │ └── tests_data │ │ ├── magika_test.zip │ │ └── 
magika_test_pptx.txt ├── docker │ └── Dockerfile ├── go.mod ├── go.sum ├── magika │ ├── config.go │ ├── content.go │ ├── features.go │ ├── features_test.go │ ├── scanner.go │ └── scanner_test.go └── onnx │ ├── onnx.go │ ├── onnx_runtime.go │ ├── onnx_runtime.h │ ├── onnx_runtime_test.go │ └── onnx_zero.go ├── js ├── .gitignore ├── CHANGELOG.md ├── README.md ├── magika-cli.ts ├── magika-node.ts ├── magika.ts ├── package.json ├── postBuild.js ├── src │ ├── .npmignore │ ├── content-type-info.ts │ ├── content-type-label.ts │ ├── content-types-infos.ts │ ├── magika-options.ts │ ├── magika-prediction.ts │ ├── magika-result.ts │ ├── model-config-node.ts │ ├── model-config.ts │ ├── model-features.ts │ ├── model-node.ts │ ├── model-prediction.ts │ ├── model.ts │ ├── overwrite-reason.ts │ ├── prediction-mode.ts │ └── status.ts ├── test │ ├── features-extraction-vs-reference.test.ts │ ├── inference-vs-reference.test.ts │ ├── magika-cli.test.ts │ ├── magika.test.ts │ ├── tfnHook.ts │ └── utils.ts ├── tsconfig.cjs.json ├── tsconfig.esm.json ├── tsconfig.json └── yarn.lock ├── python ├── .gitignore ├── .python-version ├── CHANGELOG.md ├── README.md ├── mypy.ini ├── pyproject.toml ├── pytest.ini ├── scripts │ ├── check_copyright.sh │ ├── check_documentation.py │ ├── check_release_candidate_python_package.py │ ├── check_source.sh │ ├── fix_package_version.py │ ├── generate_reference.py │ ├── prepare_pyproject_for_pure_python_wheel.py │ ├── run_quick_test_magika_cli.py │ ├── run_quick_test_magika_module.py │ ├── sync.py │ └── test_magika_model.py ├── src │ └── magika │ │ ├── __init__.py │ │ ├── cli │ │ ├── magika_client.py │ │ └── magika_rust_client_not_found_warning.py │ │ ├── colors.py │ │ ├── config │ │ └── content_types_kb.min.json │ │ ├── logger.py │ │ ├── magika.py │ │ ├── models │ │ └── standard_v3_3 │ │ │ ├── README.md │ │ │ ├── config.min.json │ │ │ ├── metadata.json │ │ │ └── model.onnx │ │ ├── py.typed │ │ └── types │ │ ├── __init__.py │ │ ├── content_type_info.py │ │ 
├── content_type_label.py │ │ ├── magika_error.py │ │ ├── magika_prediction.py │ │ ├── magika_result.py │ │ ├── model.py │ │ ├── overwrite_reason.py │ │ ├── prediction_mode.py │ │ ├── seekable.py │ │ ├── status.py │ │ └── strenum.py ├── tests │ ├── __init__.py │ ├── test_features_extraction_vs_reference.py │ ├── test_inference_vs_reference.py │ ├── test_magika_python_module.py │ ├── test_python_magika_client.py │ └── utils.py └── uv.lock ├── rust ├── .gitignore ├── README.md ├── changelog.sh ├── cli │ ├── CHANGELOG.md │ ├── Cargo.lock │ ├── Cargo.toml │ ├── LICENSE │ ├── README.md │ ├── output │ ├── src │ │ └── main.rs │ └── test.sh ├── color.sh ├── gen │ ├── Cargo.lock │ ├── Cargo.toml │ ├── README.md │ ├── content_types │ ├── model │ ├── src │ │ └── main.rs │ └── test.sh ├── lib │ ├── CHANGELOG.md │ ├── Cargo.lock │ ├── Cargo.toml │ ├── LICENSE │ ├── README.md │ ├── src │ │ ├── builder.rs │ │ ├── config.rs │ │ ├── content.rs │ │ ├── error.rs │ │ ├── file.rs │ │ ├── future.rs │ │ ├── input.rs │ │ ├── lib.rs │ │ ├── model.onnx │ │ ├── model.rs │ │ └── session.rs │ └── test.sh ├── onnx │ ├── build.sh │ └── maturin.sh ├── publish.sh ├── rustfmt.toml ├── sync.sh ├── taplo.toml └── test.sh ├── tests_data ├── README.md ├── basic │ ├── asm │ │ └── code.asm │ ├── batch │ │ └── simple.bat │ ├── c │ │ └── code.c │ ├── css │ │ └── code.css │ ├── csv │ │ └── magika_test.csv │ ├── dockerfile │ │ └── Dockerfile │ ├── docx │ │ ├── doc.docx │ │ └── magika_test.docx │ ├── empty │ │ └── empty_file │ ├── epub │ │ ├── doc.epub │ │ └── magika_test.epub │ ├── flac │ │ └── test.flac │ ├── handlebars │ │ └── example.handlebars │ ├── html │ │ └── doc.html │ ├── ignorefile │ │ └── example.ignorefile │ ├── ini │ │ └── doc.ini │ ├── javascript │ │ └── code.js │ ├── jinja │ │ └── example.j2 │ ├── jpeg │ │ └── magika_test.jpg │ ├── json │ │ └── doc.json │ ├── latex │ │ └── sample.tex │ ├── makefile │ │ └── simple.Makefile │ ├── markdown │ │ ├── README.md │ │ ├── magika_test.md │ │ └── 
simple.md │ ├── mp3 │ │ └── test.mp3 │ ├── odp │ │ └── magika_test.odp │ ├── ods │ │ └── magika_test.ods │ ├── odt │ │ ├── doc.odt │ │ └── magika_test.odt │ ├── ogg │ │ └── test.ogg │ ├── pdf │ │ ├── magika_test.pdf │ │ ├── magika_test_pptx.pdf │ │ └── magika_test_xlsx.pdf │ ├── pem │ │ ├── doc.pem │ │ └── doc.pub │ ├── png │ │ └── magika_test.png │ ├── pptx │ │ └── magika_test.pptx │ ├── python │ │ └── code.py │ ├── pytorch │ │ └── example.pth │ ├── rtf │ │ ├── doc.rtf │ │ └── magika_test.rtf │ ├── rust │ │ ├── asm.rs │ │ ├── code.rs │ │ ├── test_case1.rs │ │ └── test_case2.rs │ ├── smali │ │ └── code.smali │ ├── srt │ │ └── code.srt │ ├── svg │ │ └── magika_test.svg │ ├── toml │ │ └── doc.toml │ ├── tsv │ │ └── magika_test.tsv │ ├── twig │ │ └── example.twig │ ├── txt │ │ ├── complex-sentence.txt │ │ ├── few-words.txt │ │ ├── lorem-big.txt │ │ ├── lorem-small.txt │ │ ├── magika_test_pptx.txt │ │ ├── many-words.txt │ │ ├── one-sentence-with-newline.txt │ │ ├── one-sentence.txt │ │ └── random-ascii.txt │ ├── typescript │ │ └── code.ts │ ├── wav │ │ └── test.wav │ ├── xlsx │ │ └── magika_test.xlsx │ ├── yaml │ │ ├── dependabot.yml │ │ └── python-test.yml │ ├── yara │ │ └── rule.yar │ ├── zig │ │ └── code.zig │ └── zip │ │ └── magika_test.zip ├── current_missdetections │ ├── html │ │ └── malformed-html-gh-521.html │ └── xls │ │ └── password-protected-example.xls ├── features_extraction │ └── reference.json.gz ├── mitra │ ├── bmp │ │ └── bmp.bmp │ ├── bzip │ │ └── bzip2.bz2 │ ├── cab │ │ └── cab.cab │ ├── elf │ │ ├── elf.elf │ │ └── elf64.elf │ ├── flac │ │ ├── flac.flac │ │ └── tiny.flac │ ├── gif │ │ ├── gif87.gif │ │ └── gif89.gif │ ├── gzip │ │ └── gzip.gz │ ├── iso │ │ └── iso.iso │ ├── javabytecode │ │ └── java.class │ ├── jpeg │ │ └── jpg.jpg │ ├── mp3 │ │ ├── id3v1.mp3 │ │ └── id3v2.mp3 │ ├── mp4 │ │ └── mp4.mp4 │ ├── ogg │ │ └── vorbis.ogg │ ├── pcap │ │ └── pcap.pcap │ ├── pdf │ │ └── pdf.pdf │ ├── pebin │ │ ├── pe32.exe │ │ └── pe64.exe │ ├── php │ │ └── 
php.php │ ├── png │ │ ├── cgbi.png │ │ └── png.png │ ├── rar │ │ ├── rar4.rar │ │ └── rar5.rar │ ├── rtf │ │ └── rich.rtf │ ├── sevenzip │ │ └── 7-zip.7z │ ├── svg │ │ └── svg.svg │ ├── tar │ │ ├── hello-gnu.tar │ │ ├── hello-pax.tar │ │ ├── hello-ustar.tar │ │ └── tar.tar │ ├── tga │ │ └── footer.tga │ ├── tiff │ │ ├── tiff-be.tif │ │ └── tiff-le.tif │ ├── wav │ │ ├── riff.wav │ │ └── rifx.wav │ ├── webm │ │ └── webm.webm │ ├── webp │ │ ├── webp.webp │ │ └── webpl.webp │ ├── xar │ │ ├── hello-world.xar │ │ └── mini.xar │ ├── xz │ │ └── xz.xz │ └── zip │ │ ├── NT.zip │ │ ├── NTFS.zip │ │ ├── PPMd.zip │ │ ├── aes.zip │ │ ├── bz2.zip │ │ ├── deflate64.zip │ │ ├── directory.zip │ │ ├── drive.zip │ │ ├── dual.zip │ │ ├── filecomment.zip │ │ ├── implode.zip │ │ ├── implodeV3.zip │ │ ├── jpeg.zip │ │ ├── lzma.zip │ │ ├── mini.zip │ │ ├── reduced1.zip │ │ ├── reduced2.zip │ │ ├── reduced3.zip │ │ ├── reduced4.zip │ │ ├── shrunk.zip │ │ ├── simple.zip │ │ ├── store.zip │ │ ├── unicode.zip │ │ ├── unicode2.zip │ │ ├── unix.zip │ │ ├── unixdesc.zip │ │ ├── volumecomment.zip │ │ ├── wavpack.zip │ │ ├── zip.zip │ │ ├── zip64.zip │ │ ├── zipcrypto.zip │ │ └── zopfli.zip ├── mitra_candidates │ ├── DS_Store │ ├── ace.ace │ ├── dicom.dcm │ ├── hdf5.h5 │ ├── html.htm │ ├── ico.ico │ ├── jp2-stream.jp2 │ ├── jp2.jp2 │ ├── lha.lzh │ ├── lzip.lz │ ├── mini.bplist │ ├── mini.plist │ ├── mini.protobuf │ ├── pcapng.pcapng │ ├── photoshop.psd │ ├── qoi.qoi │ ├── qt.mov │ ├── rar14.rar │ ├── raw.tga │ ├── tiny.avro │ ├── wad.wad │ └── wasm.wasm ├── previous_missdetections │ └── sqlite │ │ └── test-gh-616.db └── reference │ ├── features_extraction_examples.json.gz │ ├── standard_v3_2-inference_examples_by_content.json.gz │ ├── standard_v3_2-inference_examples_by_path.json.gz │ ├── standard_v3_3-inference_examples_by_content.json.gz │ └── standard_v3_3-inference_examples_by_path.json.gz └── website ├── .gitignore ├── README.md ├── assets ├── custom.scss ├── logo.svg ├── model_card.html └── 
paper.png ├── index.html ├── jsconfig.json ├── package.json ├── public ├── model │ ├── config.json │ ├── group1-shard1of1.bin │ └── model.json └── models │ ├── standard_v3_2 │ ├── config.min.json │ ├── group1-shard1of1.bin │ ├── metadata.json │ └── model.json │ └── standard_v3_3 │ ├── README.md │ ├── config.min.json │ ├── group1-shard1of1.bin │ ├── metadata.json │ └── model.json ├── src ├── App.vue ├── components │ ├── BarsVisualization.vue │ ├── FileClassifierDemo.vue │ ├── Homepage.vue │ ├── Markdown.vue │ └── TextAreaClassifierDemo.vue ├── main.js └── plugins │ ├── index.js │ ├── router.js │ └── vuetify.js ├── vite.config.js └── yarn.lock /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | target-dir = "rust/target" 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Include any files or directories that you don't want to be copied to your 2 | # container here (e.g., local build artifacts, temporary files, etc.). 
3 | # 4 | # For more help, visit the .dockerignore file reference guide at 5 | # https://docs.docker.com/go/build-context-dockerignore/ 6 | 7 | **/.DS_Store 8 | **/__pycache__ 9 | **/.venv 10 | **/.classpath 11 | **/.dockerignore 12 | **/.env 13 | **/.git 14 | **/.gitignore 15 | **/.project 16 | **/.settings 17 | **/.toolstarget 18 | **/.vs 19 | **/.vscode 20 | **/*.*proj.user 21 | **/*.dbmdl 22 | **/*.jfm 23 | **/bin 24 | **/charts 25 | **/docker-compose* 26 | **/compose* 27 | **/Dockerfile* 28 | **/node_modules 29 | **/npm-debug.log 30 | **/obj 31 | **/secrets.dev.yaml 32 | **/values.dev.yaml 33 | LICENSE 34 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | /tests_data/** -text 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Default owners (lowest precedence). 2 | * @reyammer @invernizzi 3 | 4 | # Julien owns the Rust code 5 | /rust/ @ia0 6 | 7 | # Yanick owns the Python code, all docs, and test data 8 | /python/ @reyammer 9 | *.md @reyammer 10 | /tests_data/ @reyammer 11 | 12 | # Luca owns the JS code, docs, and website 13 | /js/ @invernizzi 14 | /docs/js.md @invernizzi 15 | /website/ @invernizzi 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/misdetection.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Misdetection 3 | about: Report a file, or files, that have been misdetected as something that they 4 | aren't. 5 | title: "[Misdetection] file misdetected as " 6 | labels: misdetection, needs triage 7 | assignees: '' 8 | 9 | --- 10 | 11 | **What should the file have been detected as? What has the file been misdetected as?** 12 | Ex. 
"HTML pages are being mistaken for generic XML files.", "C# code misdetected as Java.", or "Can't tell the difference between exe and dll files." 13 | 14 | **Please link or attach the misdetected file below** (Do NOT upload PII!) 15 | Placeholder.zip 16 | 17 | **Additional context** 18 | Add any other context or screenshots about the misdetection here. 19 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | - package-ecosystem: "docker" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | - package-ecosystem: "pip" 13 | directory: "/python" 14 | schedule: 15 | interval: "daily" 16 | - package-ecosystem: "npm" 17 | directory: "/js" 18 | schedule: 19 | interval: "weekly" 20 | - package-ecosystem: "cargo" 21 | directory: "/rust" 22 | schedule: 23 | interval: "weekly" 24 | -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | needs triage: 2 | - "/.*/" 3 | -------------------------------------------------------------------------------- /.github/scorecard.yml: -------------------------------------------------------------------------------- 1 | # Scorecard maintainer annotations. 2 | # See https://github.com/ossf/scorecard/blob/main/config/README.md 3 | 4 | annotations: 5 | # Binary files in tests_data/ are only used for testing. 
6 | - checks: 7 | - binary-artifacts 8 | reasons: 9 | - reason: test-data 10 | 11 | - checks: 12 | - pinned-dependencies 13 | reasons: 14 | # Test data with unpinned dependencies: 15 | # - tests_data/basic/dockerfile/Dockerfile 16 | - reason: test-data 17 | # CI/CD containers meant to run the latest version: 18 | # - .github/workflows/python-e2e-test.yml 19 | - reason: remediated 20 | -------------------------------------------------------------------------------- /.github/workflows/docs-check.yml: -------------------------------------------------------------------------------- 1 | name: Docs - Check documentation 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - "main" 8 | pull_request: 9 | paths: 10 | - "*.md" 11 | - "assets/**/*.md" 12 | - "docs/**/*.md" 13 | - "js/**/*.md" 14 | - "python/**/*.md" 15 | - "rust/**/*.md" 16 | schedule: 17 | - cron: "42 7 * * 4" # Run weekly 18 | 19 | permissions: 20 | contents: read 21 | 22 | jobs: 23 | run-check-docs: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 27 | 28 | - name: Install uv 29 | run: curl -LsSf https://astral.sh/uv/0.5.22/install.sh | sh 30 | 31 | - name: "Run check_documentation.py script" 32 | working-directory: python 33 | run: uv run ./scripts/check_documentation.py 34 | -------------------------------------------------------------------------------- /.github/workflows/github-issue-labeler.yml: -------------------------------------------------------------------------------- 1 | name: New issue labeler 2 | on: 3 | # Runs on newly opened issues 4 | issues: 5 | types: [opened] 6 | 7 | # Sets permissions of the GITHUB_TOKEN 8 | permissions: 9 | issues: write 10 | contents: read 11 | 12 | jobs: 13 | triage: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: github/issue-labeler@c1b0f9f52a63158c4adc09425e858e87b32e9685 # pin@v3.4 17 | with: 18 | configuration-path: .github/labeler.yml 19 | enable-versioned-regex: 0 20 | 
repo-token: "${{secrets.GITHUB_TOKEN}}" 21 | -------------------------------------------------------------------------------- /.github/workflows/github-pages.yml: -------------------------------------------------------------------------------- 1 | name: Pages - deploy 2 | 3 | on: 4 | # Runs on pushes targeting the default branch 5 | push: 6 | branches: ["main"] 7 | paths: 8 | - "website/**" 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 20 | concurrency: 21 | group: "pages" 22 | cancel-in-progress: false 23 | 24 | jobs: 25 | deploy-pages: 26 | environment: 27 | name: github-pages 28 | url: ${{ steps.deployment.outputs.page_url }} 29 | runs-on: ubuntu-latest 30 | defaults: 31 | run: 32 | working-directory: ./website 33 | steps: 34 | - name: Checkout 35 | uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 36 | with: 37 | ref: main 38 | - name: Set up Node 39 | uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # pin@v4 40 | with: 41 | node-version: 18.x 42 | - name: Install dependencies 43 | run: yarn install --frozen-lockfile 44 | - name: Build 45 | run: yarn run build-github 46 | - name: Setup Pages 47 | uses: actions/configure-pages@983d7736d9b0ae728b81ab479565c72886d7745b # pin@v4 48 | - name: Upload artifact 49 | uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # pin@v3 50 | with: 51 | path: "./website/dist" 52 | - name: Deploy to GitHub Pages 53 | id: deployment 54 | uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # pin@v4 55 | -------------------------------------------------------------------------------- /.github/workflows/js-docs-builder.yml: 
-------------------------------------------------------------------------------- 1 | name: JS - generate docs 2 | 3 | on: 4 | # Runs on pushes targeting the default branch 5 | push: 6 | branches: ["main"] 7 | paths: 8 | - "js/**" 9 | - ".github/workflows/**" 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | permissions: 15 | contents: read 16 | 17 | jobs: 18 | makeDocs: 19 | permissions: 20 | contents: write 21 | id-token: write 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 26 | with: 27 | ref: main 28 | - name: Generate docs 29 | working-directory: js 30 | run: | 31 | yarn install --frozen-lockfile 32 | yarn run build 33 | yarn run make-docs 34 | 35 | - name: Commit 36 | run: | 37 | git config --local user.email "invernizzi.l@gmail.com" 38 | git config --local user.name "Luca Invernizzi" 39 | git commit -m "Update docs" -a 40 | -------------------------------------------------------------------------------- /.github/workflows/js-publish.yml: -------------------------------------------------------------------------------- 1 | name: JS - publish 2 | on: 3 | workflow_dispatch: 4 | permissions: 5 | contents: read 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 12 | - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # pin@v4 13 | with: 14 | node-version: "20.x" 15 | registry-url: "https://registry.npmjs.org" 16 | - name: Build 17 | working-directory: js 18 | run: | 19 | yarn install --frozen-lockfile 20 | yarn run build 21 | yarn run test 22 | - name: Publish 23 | working-directory: js 24 | run: yarn publish 25 | env: 26 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 27 | -------------------------------------------------------------------------------- /.github/workflows/js-test.yml: 
-------------------------------------------------------------------------------- 1 | name: JS - tests 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - "main" 8 | pull_request: 9 | paths: 10 | - "js/**" 11 | - "tests_data/**" 12 | - ".github/workflows/js-*" 13 | permissions: 14 | contents: read 15 | jobs: 16 | unit-testing: 17 | strategy: 18 | matrix: 19 | node-version: ["18", "20"] 20 | os: ["ubuntu-latest", "macos-latest"] 21 | runs-on: ${{ matrix.os }} 22 | steps: 23 | - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 24 | - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # pin@v4 25 | with: 26 | node-version: ${{ matrix.node-version }} 27 | - name: Install dependencies 28 | working-directory: js 29 | run: yarn install --frozen-lockfile 30 | - name: Build 31 | working-directory: js 32 | run: yarn run build 33 | - name: Run tests 34 | working-directory: js 35 | run: yarn test 36 | -------------------------------------------------------------------------------- /.github/workflows/python-test-published-rc-package.yml: -------------------------------------------------------------------------------- 1 | # This routinely checks that the latest published -rc packages are installable 2 | # and work properly. This makes sure that a new version of one of our 3 | # dependencies is not breaking our releases. 
4 | # TODO: test more magika package versions 5 | # TODO: check the actual predicted content types 6 | name: Python - test published -rc packages 7 | 8 | on: 9 | schedule: 10 | - cron: "42 3 * * *" # Run daily 11 | workflow_dispatch: 12 | pull_request: 13 | paths: 14 | - ".github/workflows/python-test-published-rc-package.yml" 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | unit-testing: 21 | strategy: 22 | matrix: 23 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 24 | os: ["ubuntu-latest", "macos-latest", "windows-latest"] 25 | runs-on: ${{ matrix.os }} 26 | steps: 27 | - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 28 | - name: Setup Python 29 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # pin@v5 30 | with: 31 | python-version: "${{ matrix.python-version }}" 32 | - if: matrix.os != 'windows-latest' 33 | name: Install uv 34 | run: curl -LsSf https://astral.sh/uv/0.5.22/install.sh | sh 35 | - if: matrix.os != 'windows-latest' 36 | name: Check that magika -rc can be installed with uv 37 | run: mkdir /tmp/test-uv && cd /tmp/test-uv && uv init && uv add --prerelease allow magika && cd - && rm -rf /tmp/test-uv 38 | - if: matrix.platform.runner == 'windows-latest' 39 | name: Check that magika -rc install with uv works on Windows 40 | shell: pwsh 41 | run: | 42 | mkdir C:\test-uv 43 | Copy-Item -Path dist\*.whl -Destination C:\test-uv 44 | cd C:\test-uv 45 | $env:PATH += ";$HOME/.local/bin" 46 | uv init 47 | $wheel = Get-ChildItem -Filter *.whl | Select-Object -ExpandProperty Name 48 | uv add --prerelease ".\$wheel" 49 | - name: Install magika with pip 50 | run: python3 -m pip install --pre magika 51 | - run: python3 -c 'import magika; m = magika.Magika(); print(m)' 52 | - run: magika --version 53 | # The latest published model does not necessarily support detection for 54 | # all types in our tests data; thus, for now we just check that the magika 55 | # CLI does not crash when 
scanning the files, without checking the actual 56 | # predictions. 57 | - run: magika -r tests_data/basic 58 | -------------------------------------------------------------------------------- /.github/workflows/rust-test.yml: -------------------------------------------------------------------------------- 1 | name: Rust - test 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - "main" 8 | pull_request: 9 | paths: 10 | - ".github/workflows/rust-*" 11 | - "assets/**" 12 | - "rust/**" 13 | - "tests_data/**" 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | changelog: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 23 | with: 24 | fetch-depth: 0 25 | - run: ./changelog.sh 26 | working-directory: rust 27 | test: 28 | runs-on: ubuntu-latest 29 | continue-on-error: ${{ matrix.toolchain == 'nightly' }} 30 | strategy: 31 | matrix: 32 | toolchain: [stable, nightly] 33 | steps: 34 | - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 35 | - run: rustup default ${{ matrix.toolchain }} 36 | - run: rustup component add rustfmt clippy 37 | - run: ./test.sh 38 | working-directory: rust 39 | run: 40 | runs-on: ${{ matrix.os }}-latest 41 | strategy: 42 | matrix: 43 | os: [ubuntu, macos, windows] 44 | steps: 45 | - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 46 | - run: cargo build --release 47 | working-directory: rust/cli 48 | - run: rust/target/release/magika -r tests_data/basic 49 | -------------------------------------------------------------------------------- /.github/workflows/website-test.yml: -------------------------------------------------------------------------------- 1 | name: Website - tests 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - "main" 8 | pull_request: 9 | paths: 10 | - "website/**" 11 | - "js/**" 12 | - "tests_data/**" 13 | - ".github/workflows/website-*" 14 | permissions: 15 | contents: 
read 16 | jobs: 17 | build-and-test: 18 | strategy: 19 | matrix: 20 | node-version: ["20"] 21 | os: ["ubuntu-latest"] 22 | runs-on: ${{ matrix.os }} 23 | steps: 24 | - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # pin@v4 25 | - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # pin@v4 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | - name: Install js dependencies 29 | working-directory: js 30 | run: yarn install --frozen-lockfile 31 | - name: Build js 32 | working-directory: js 33 | run: yarn run build 34 | - name: Create magika link 35 | working-directory: js 36 | run: yarn link 37 | - name: Link to local magika 38 | working-directory: website 39 | run: yarn link magika 40 | - name: Install website dependencies 41 | working-directory: website 42 | run: yarn install --frozen-lockfile 43 | - name: Build website 44 | working-directory: website 45 | run: yarn build 46 | # TODO: add some actual testing 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | .ipynb_checkpoints 4 | venv/ 5 | tmp/ 6 | .env 7 | *.swp 8 | *.egg-info 9 | dist/* 10 | *.pickle 11 | .s.yml 12 | 13 | */models-data/* 14 | 15 | .vscode 16 | 17 | scratchpad/ 18 | 19 | */node_modules/* 20 | docs/dist 21 | js/dist 22 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Fratantonio" 5 | given-names: "Yanick" 6 | - family-names: "Invernizzi" 7 | given-names: "Luca" 8 | - family-names: "Farah" 9 | given-names: "Loua" 10 | - family-names: "Kurt" 11 | given-names: "Thomas" 12 | - family-names: "Zhang" 13 | given-names: "Marina" 14 | - family-names: "Albertini" 15 | given-names: "Ange" 16 | - family-names: "Galilee" 17 | given-names: "Francois" 18 | - family-names: "Metitieri" 19 | given-names: "Giancarlo" 20 | - family-names: "Cretin" 21 | given-names: "Julien" 22 | - family-names: "Petit-Bianco" 23 | given-names: "Alexandre" 24 | - family-names: "Tao" 25 | given-names: "David" 26 | - family-names: "Bursztein" 27 | given-names: "Elie" 28 | title: "Magika: AI-Powered Content-Type Detection" 29 | url: "https://arxiv.org/abs/2409.13768" 30 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We would love to accept your patches and contributions to this project! 4 | 5 | Check [open issues labeled as "help wanted"](https://github.com/google/magika/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) as a starting point. 6 | 7 | ## Before you begin 8 | 9 | ### Sign our Contributor License Agreement 10 | 11 | Contributions to this project must be accompanied by a 12 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA). 13 | You (or your employer) retain the copyright to your contribution; this simply 14 | gives us permission to use and redistribute your contributions as part of the 15 | project. 16 | 17 | If you or your current employer have already signed the Google CLA (even if it 18 | was for a different project), you probably don't need to do it again. 19 | 20 | Visit [https://cla.developers.google.com/](https://cla.developers.google.com/) to see your current agreements or to 21 | sign a new one. 
22 | 23 | ### Review our Community Guidelines 24 | 25 | This project follows [Google's Open Source Community 26 | Guidelines](https://opensource.google/conduct/). 27 | 28 | ## Contribution process 29 | 30 | ### Code Reviews 31 | 32 | All submissions, including submissions by project members, require review. We 33 | use [GitHub pull requests](https://docs.github.com/articles/about-pull-requests) 34 | for this purpose. 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | ARG PYTHON_VERSION=3.11 4 | FROM python:${PYTHON_VERSION}-slim as base 5 | 6 | WORKDIR /magika 7 | 8 | # This requires buildx 9 | # RUN --mount=type=cache,target=/root/.cache/pip \ 10 | # pip install magika 11 | 12 | RUN pip install magika 13 | 14 | ENTRYPOINT ["magika"] 15 | -------------------------------------------------------------------------------- /assets/2025_icse_magika.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/2025_icse_magika.pdf -------------------------------------------------------------------------------- /assets/magika-abusech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/magika-abusech.png -------------------------------------------------------------------------------- /assets/magika-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/magika-screenshot.png -------------------------------------------------------------------------------- /assets/magika-vt.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/magika-vt.png -------------------------------------------------------------------------------- /assets/models/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | Here we document the main changes of the various models. 4 | 5 | The indicated inference speeds are calculated by averaging 100 inferences (within one invocation) on an AMD Ryzen 9 7950X 16-Core Processor CPU. 6 | 7 | ## `standard_v3_3` - 2025-04-11 8 | 9 | - [216 possible tool's outputs](./standard_v3_3/README.md), ~99% average accuracy, ~2ms inference speed. 10 | - Better dataset balance between javascript vs. typescript (leading to an increased accuracy for typescript, 85% => 95%). 11 | - New synthetic datasets with utf8-encoded, non-ascii characters for simple text and JSON. 12 | - More threshold tuning. 13 | 14 | ## `standard_v3_2` - 2025-03-17 15 | 16 | - [216 possible tool's outputs](./standard_v3_2/README.md), ~99% average accuracy, ~2ms inference speed. 17 | - Difference with respect to `standard_v3_1`: trained on a new (synthetic) dataset of CSV files to address a regression with CSV files (https://github.com/google/magika/issues/983); model selection now uses minimal test loss instead of other heuristics. 18 | 19 | ## `standard_v3_1` 20 | 21 | - [216 possible tool's outputs](./standard_v3_1/README.md). 22 | - Overall same average accuracy as `standard_v3_0`, ~99%, but more robust detections of short textual input and improved detection of JavaScript. 23 | - Inference speed: ~2ms (similar to `standard_v3_0`). 
24 | - Augmentation techniques used during training: CutMix, which was used for `v1` but not for `v2_1`; and "Random Snippet Selection", with which we train the model with random snippets extracted from samples in our dataset (this is only enabled for key textual content types). 25 | - Tweaked balance among content types in the training dataset. 26 | 27 | ## `standard_v3_0` 28 | 29 | - [216 possible tool's outputs](./standard_v3_0/README.md). 30 | - Overall same average accuracy as `standard_v2_1`, ~99%. 31 | - Inference speed: ~2ms (~3x faster than `standard_v2_1`, ~20% faster than `standard_v1`). 32 | 33 | ## `standard_v2_1` 34 | 35 | - [Support for 200+ content types](./standard_v2_1/README.md), almost double what was supported in `standard_v1`. 36 | - Overall average accuracy of ~99%. 37 | - Inference speed: ~6.2ms, which is slower than `standard_v1`; see `fast_v2_1` in case you need something faster (at the price of lower accuracy). 38 | 39 | ## `fast_v2_1` 40 | 41 | - Similar to `standard_v2_1`, but significantly faster (about 4x faster). 42 | - Overall average accuracy of ~98.5%. 43 | 44 | ## `standard_v1` 45 | 46 | - Initial release. 47 | - Support for about 100 content types. 48 | - Average accuracy 99%+. 49 | - Inference speed: ~2.6ms.
50 | -------------------------------------------------------------------------------- /assets/models/begonly_v2_1/config.min.json: -------------------------------------------------------------------------------- 1 | {"beg_size": 2048, "mid_size": 0, "end_size": 0, "use_inputs_at_offsets": false, "medium_confidence_threshold": 0.5, "min_file_size_for_dl": 8, "padding_token": 256, "block_size": 4096, "target_labels_space": ["3gp", "ace", "ai", "aidl", "apk", "applebplist", "appleplist", "asm", "asp", "autohotkey", "autoit", "awk", "batch", "bazel", "bib", "bmp", "bzip", "c", "cab", "cat", "chm", "clojure", "cmake", "cobol", "coff", "coffeescript", "cpp", "crt", "crx", "cs", "csproj", "css", "csv", "dart", "deb", "dex", "dicom", "diff", "dm", "dmg", "doc", "dockerfile", "docx", "dsstore", "dwg", "dxf", "elf", "elixir", "emf", "eml", "epub", "erb", "erlang", "flac", "flv", "fortran", "gemfile", "gemspec", "gif", "gitattributes", "gitmodules", "go", "gradle", "groovy", "gzip", "h5", "handlebars", "haskell", "hcl", "hlp", "htaccess", "html", "icns", "ico", "ics", "ignorefile", "ini", "internetshortcut", "ipynb", "iso", "jar", "java", "javabytecode", "javascript", "jinja", "jp2", "jpeg", "json", "jsonl", "julia", "kotlin", "latex", "lha", "lisp", "lnk", "lua", "m3u", "m4", "macho", "makefile", "markdown", "matlab", "mht", "midi", "mkv", "mp3", "mp4", "mscompress", "msi", "mum", "npy", "npz", "nupkg", "objectivec", "ocaml", "odp", "ods", "odt", "ogg", "one", "onnx", "otf", "outlook", "parquet", "pascal", "pcap", "pdb", "pdf", "pebin", "pem", "perl", "php", "pickle", "png", "po", "postscript", "powershell", "ppt", "pptx", "prolog", "proteindb", "proto", "psd", "python", "pythonbytecode", "pytorch", "qt", "r", "rar", "rdf", "rpm", "rst", "rtf", "ruby", "rust", "scala", "scss", "sevenzip", "sgml", "shell", "smali", "snap", "solidity", "sql", "sqlite", "squashfs", "srt", "stlbinary", "stltext", "sum", "svg", "swf", "swift", "tar", "tcl", "textproto", "tga", "thumbsdb", "tiff", 
"toml", "torrent", "tsv", "ttf", "twig", "txt", "typescript", "unknown", "vba", "vcxproj", "verilog", "vhdl", "vtt", "vue", "wasm", "wav", "webm", "webp", "winregistry", "wmf", "woff", "woff2", "xar", "xls", "xlsb", "xlsx", "xml", "xpi", "xz", "yaml", "yara", "zig", "zip", "zlibstream"], "thresholds": {"latex": 0.95, "pascal": 0.95}, "overwrite_map": {}} 2 | -------------------------------------------------------------------------------- /assets/models/begonly_v2_1/metadata.json: -------------------------------------------------------------------------------- 1 | {"model_name_hash": "e66844a04ae7a03bd9f228d9b778ec8429d361d0dca09b951b327ffad5beb07a", "namespace_hash": "ce3c9130af6416f40d71c5934f927acbd174f904a550fca2185aa3cd3528ca35"} 2 | -------------------------------------------------------------------------------- /assets/models/begonly_v2_1/model.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/begonly_v2_1/model.keras -------------------------------------------------------------------------------- /assets/models/begonly_v2_1/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/begonly_v2_1/model.onnx -------------------------------------------------------------------------------- /assets/models/fast_v2_1/config.min.json: -------------------------------------------------------------------------------- 1 | {"beg_size": 512, "mid_size": 0, "end_size": 512, "use_inputs_at_offsets": false, "medium_confidence_threshold": 0.5, "min_file_size_for_dl": 8, "padding_token": 256, "block_size": 4096, "target_labels_space": ["3gp", "ace", "ai", "aidl", "apk", "applebplist", "appleplist", "asm", "asp", "autohotkey", "autoit", "awk", "batch", "bazel", "bib", "bmp", "bzip", "c", "cab", "cat", 
"chm", "clojure", "cmake", "cobol", "coff", "coffeescript", "cpp", "crt", "crx", "cs", "csproj", "css", "csv", "dart", "deb", "dex", "dicom", "diff", "dm", "dmg", "doc", "dockerfile", "docx", "dsstore", "dwg", "dxf", "elf", "elixir", "emf", "eml", "epub", "erb", "erlang", "flac", "flv", "fortran", "gemfile", "gemspec", "gif", "gitattributes", "gitmodules", "go", "gradle", "groovy", "gzip", "h5", "handlebars", "haskell", "hcl", "hlp", "htaccess", "html", "icns", "ico", "ics", "ignorefile", "ini", "internetshortcut", "ipynb", "iso", "jar", "java", "javabytecode", "javascript", "jinja", "jp2", "jpeg", "json", "jsonl", "julia", "kotlin", "latex", "lha", "lisp", "lnk", "lua", "m3u", "m4", "macho", "makefile", "markdown", "matlab", "mht", "midi", "mkv", "mp3", "mp4", "mscompress", "msi", "mum", "npy", "npz", "nupkg", "objectivec", "ocaml", "odp", "ods", "odt", "ogg", "one", "onnx", "otf", "outlook", "parquet", "pascal", "pcap", "pdb", "pdf", "pebin", "pem", "perl", "php", "pickle", "png", "po", "postscript", "powershell", "ppt", "pptx", "prolog", "proteindb", "proto", "psd", "python", "pythonbytecode", "pytorch", "qt", "r", "rar", "rdf", "rpm", "rst", "rtf", "ruby", "rust", "scala", "scss", "sevenzip", "sgml", "shell", "smali", "snap", "solidity", "sql", "sqlite", "squashfs", "srt", "stlbinary", "stltext", "sum", "svg", "swf", "swift", "tar", "tcl", "textproto", "tga", "thumbsdb", "tiff", "toml", "torrent", "tsv", "ttf", "twig", "txt", "typescript", "unknown", "vba", "vcxproj", "verilog", "vhdl", "vtt", "vue", "wasm", "wav", "webm", "webp", "winregistry", "wmf", "woff", "woff2", "xar", "xls", "xlsb", "xlsx", "xml", "xpi", "xz", "yaml", "yara", "zig", "zip", "zlibstream"], "thresholds": {"latex": 0.95, "pascal": 0.95}, "overwrite_map": {}} 2 | -------------------------------------------------------------------------------- /assets/models/fast_v2_1/metadata.json: -------------------------------------------------------------------------------- 1 | {"model_name_hash": 
"83b9d2bd0c450deffc70624554c99fa63e1830db852cdce860b7e215fa176f9f", "namespace_hash": "ce3c9130af6416f40d71c5934f927acbd174f904a550fca2185aa3cd3528ca35"} 2 | -------------------------------------------------------------------------------- /assets/models/fast_v2_1/model.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/fast_v2_1/model.keras -------------------------------------------------------------------------------- /assets/models/fast_v2_1/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/fast_v2_1/model.onnx -------------------------------------------------------------------------------- /assets/models/standard_v1/README.md: -------------------------------------------------------------------------------- 1 | # Model documentation 2 | 3 | The list of supported content types is indicated by the `target_labels_space` list in the model config, which you can find [here](https://github.com/google/magika/blob/7f947319f1ebe09626368a3f989a0863fcd7c52a/assets/models/standard_v1/model_config.json#L440-L554). 4 | 5 | Note: This model will be deprecated soon, in favor of more recent models (e.g., `standard_v3_x`). 
-------------------------------------------------------------------------------- /assets/models/standard_v1/magika_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "default_model_name": "standard_v1", 3 | "medium_confidence_threshold": 0.5, 4 | "min_file_size_for_dl": 16, 5 | "padding_token": 256 6 | } -------------------------------------------------------------------------------- /assets/models/standard_v1/model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/standard_v1/model.h5 -------------------------------------------------------------------------------- /assets/models/standard_v2_0/config.min.json: -------------------------------------------------------------------------------- 1 | {"beg_size": 2048, "mid_size": 0, "end_size": 2048, "use_inputs_at_offsets": false, "medium_confidence_threshold": 0.5, "min_file_size_for_dl": 8, "padding_token": 256, "block_size": 4096, "target_labels_space": ["3gp", "ace", "ai", "aidl", "apk", "applebplist", "appleplist", "asm", "asp", "autohotkey", "autoit", "awk", "batch", "bazel", "bib", "bmp", "bzip", "c", "cab", "cat", "chm", "clojure", "cmake", "cobol", "coff", "coffeescript", "cpp", "crt", "crx", "cs", "csproj", "css", "csv", "dart", "deb", "dex", "dicom", "diff", "dm", "dmg", "doc", "dockerfile", "docx", "dsstore", "dwg", "dxf", "elf", "elixir", "emf", "eml", "epub", "erb", "erlang", "flac", "flv", "fortran", "gemfile", "gemspec", "gif", "gitattributes", "gitmodules", "go", "gradle", "groovy", "gzip", "h5", "handlebars", "haskell", "hcl", "hlp", "htaccess", "html", "icns", "ico", "ics", "ignorefile", "ini", "internetshortcut", "ipynb", "iso", "jar", "java", "javabytecode", "javascript", "jinja", "jp2", "jpeg", "json", "jsonl", "julia", "kotlin", "latex", "lha", "lisp", "lnk", "lua", "m3u", "m4", "macho", "makefile", 
"markdown", "matlab", "mht", "midi", "mkv", "mp3", "mp4", "mscompress", "msi", "mum", "npy", "npz", "nupkg", "objectivec", "ocaml", "odp", "ods", "odt", "ogg", "one", "onnx", "otf", "outlook", "parquet", "pascal", "pcap", "pdb", "pdf", "pebin", "pem", "perl", "php", "pickle", "png", "po", "postscript", "powershell", "ppt", "pptx", "prolog", "proteindb", "proto", "psd", "python", "pythonbytecode", "qt", "r", "rar", "rdf", "rpm", "rst", "rtf", "ruby", "rust", "scala", "scss", "sevenzip", "sgml", "shell", "smali", "snap", "solidity", "sql", "sqlite", "squashfs", "srt", "stlbinary", "stltext", "sum", "svg", "swf", "swift", "tar", "tcl", "textproto", "tga", "thumbsdb", "tiff", "toml", "torrent", "tsv", "ttf", "twig", "txt", "typescript", "unknown", "vba", "vcxproj", "verilog", "vhdl", "vtt", "vue", "wasm", "wav", "webm", "webp", "winregistry", "wmf", "woff", "woff2", "xar", "xls", "xlsb", "xlsx", "xml", "xpi", "xz", "yaml", "yara", "zig", "zip", "zlibstream"], "thresholds": {"latex": 0.95}, "overwrite_map": {}} 2 | -------------------------------------------------------------------------------- /assets/models/standard_v2_0/metadata.json: -------------------------------------------------------------------------------- 1 | {"model_name_hash": "5ae665b58305628b173e97edf9d3043a6021dd40a434b1f72bd88087713d8209", "namespace_hash": "c3a09b7885a7151502927c0380d2bf9a19c2999e54e8ffd282b21ad7b2aeec62"} -------------------------------------------------------------------------------- /assets/models/standard_v2_0/model.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/standard_v2_0/model.keras -------------------------------------------------------------------------------- /assets/models/standard_v2_0/model.onnx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/standard_v2_0/model.onnx -------------------------------------------------------------------------------- /assets/models/standard_v2_1/config.min.json: -------------------------------------------------------------------------------- 1 | {"beg_size": 2048, "mid_size": 0, "end_size": 2048, "use_inputs_at_offsets": false, "medium_confidence_threshold": 0.5, "min_file_size_for_dl": 8, "padding_token": 256, "block_size": 4096, "target_labels_space": ["3gp", "ace", "ai", "aidl", "apk", "applebplist", "appleplist", "asm", "asp", "autohotkey", "autoit", "awk", "batch", "bazel", "bib", "bmp", "bzip", "c", "cab", "cat", "chm", "clojure", "cmake", "cobol", "coff", "coffeescript", "cpp", "crt", "crx", "cs", "csproj", "css", "csv", "dart", "deb", "dex", "dicom", "diff", "dm", "dmg", "doc", "dockerfile", "docx", "dsstore", "dwg", "dxf", "elf", "elixir", "emf", "eml", "epub", "erb", "erlang", "flac", "flv", "fortran", "gemfile", "gemspec", "gif", "gitattributes", "gitmodules", "go", "gradle", "groovy", "gzip", "h5", "handlebars", "haskell", "hcl", "hlp", "htaccess", "html", "icns", "ico", "ics", "ignorefile", "ini", "internetshortcut", "ipynb", "iso", "jar", "java", "javabytecode", "javascript", "jinja", "jp2", "jpeg", "json", "jsonl", "julia", "kotlin", "latex", "lha", "lisp", "lnk", "lua", "m3u", "m4", "macho", "makefile", "markdown", "matlab", "mht", "midi", "mkv", "mp3", "mp4", "mscompress", "msi", "mum", "npy", "npz", "nupkg", "objectivec", "ocaml", "odp", "ods", "odt", "ogg", "one", "onnx", "otf", "outlook", "parquet", "pascal", "pcap", "pdb", "pdf", "pebin", "pem", "perl", "php", "pickle", "png", "po", "postscript", "powershell", "ppt", "pptx", "prolog", "proteindb", "proto", "psd", "python", "pythonbytecode", "pytorch", "qt", "r", "rar", "rdf", "rpm", "rst", "rtf", "ruby", "rust", "scala", "scss", "sevenzip", "sgml", "shell", "smali", "snap", "solidity", "sql", "sqlite", 
"squashfs", "srt", "stlbinary", "stltext", "sum", "svg", "swf", "swift", "tar", "tcl", "textproto", "tga", "thumbsdb", "tiff", "toml", "torrent", "tsv", "ttf", "twig", "txt", "typescript", "unknown", "vba", "vcxproj", "verilog", "vhdl", "vtt", "vue", "wasm", "wav", "webm", "webp", "winregistry", "wmf", "woff", "woff2", "xar", "xls", "xlsb", "xlsx", "xml", "xpi", "xz", "yaml", "yara", "zig", "zip", "zlibstream"], "thresholds": {"latex": 0.95, "pascal": 0.95}, "overwrite_map": {}} 2 | -------------------------------------------------------------------------------- /assets/models/standard_v2_1/metadata.json: -------------------------------------------------------------------------------- 1 | {"model_name_hash": "5ae665b58305628b173e97edf9d3043a6021dd40a434b1f72bd88087713d8209", "namespace_hash": "ce3c9130af6416f40d71c5934f927acbd174f904a550fca2185aa3cd3528ca35"} 2 | -------------------------------------------------------------------------------- /assets/models/standard_v2_1/model.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/standard_v2_1/model.keras -------------------------------------------------------------------------------- /assets/models/standard_v2_1/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/standard_v2_1/model.onnx -------------------------------------------------------------------------------- /assets/models/standard_v3_0/config.min.json: -------------------------------------------------------------------------------- 1 | 
{"beg_size":1024,"mid_size":0,"end_size":1024,"use_inputs_at_offsets":false,"medium_confidence_threshold":0.5,"min_file_size_for_dl":8,"padding_token":256,"block_size":4096,"target_labels_space":["3gp","ace","ai","aidl","apk","applebplist","appleplist","asm","asp","autohotkey","autoit","awk","batch","bazel","bib","bmp","bzip","c","cab","cat","chm","clojure","cmake","cobol","coff","coffeescript","cpp","crt","crx","cs","csproj","css","csv","dart","deb","dex","dicom","diff","dm","dmg","doc","dockerfile","docx","dsstore","dwg","dxf","elf","elixir","emf","eml","epub","erb","erlang","flac","flv","fortran","gemfile","gemspec","gif","gitattributes","gitmodules","go","gradle","groovy","gzip","h5","handlebars","haskell","hcl","hlp","htaccess","html","icns","ico","ics","ignorefile","ini","internetshortcut","ipynb","iso","jar","java","javabytecode","javascript","jinja","jp2","jpeg","json","jsonl","julia","kotlin","latex","lha","lisp","lnk","lua","m3u","m4","macho","makefile","markdown","matlab","mht","midi","mkv","mp3","mp4","mscompress","msi","mum","npy","npz","nupkg","objectivec","ocaml","odp","ods","odt","ogg","one","onnx","otf","outlook","parquet","pascal","pcap","pdb","pdf","pebin","pem","perl","php","pickle","png","po","postscript","powershell","ppt","pptx","prolog","proteindb","proto","psd","python","pythonbytecode","pytorch","qt","r","randombytes","randomtxt","rar","rdf","rpm","rst","rtf","ruby","rust","scala","scss","sevenzip","sgml","shell","smali","snap","solidity","sql","sqlite","squashfs","srt","stlbinary","stltext","sum","svg","swf","swift","tar","tcl","textproto","tga","thumbsdb","tiff","toml","torrent","tsv","ttf","twig","txt","typescript","vba","vcxproj","verilog","vhdl","vtt","vue","wasm","wav","webm","webp","winregistry","wmf","woff","woff2","xar","xls","xlsb","xlsx","xml","xpi","xz","yaml","yara","zig","zip","zlibstream"],"thresholds":{"handlebars":0.9,"ignorefile":0.95,"latex":0.95,"markdown":0.9,"ocaml":0.9,"pascal":0.95,"rst":0.9,"sql":0.9,"tsv":0.9},"ove
rwrite_map":{"randombytes":"unknown","randomtxt":"txt"},"version_major":3} -------------------------------------------------------------------------------- /assets/models/standard_v3_0/metadata.json: -------------------------------------------------------------------------------- 1 | {"namespace_hash":"7ca577b96738951c36df428f8435c81780f92c6f9ef3a73d796a792ffc817703","model_name_hash":"e5368af178b89eb94cb1927b8481d5a9dedf5d1ee810d335b018d19eb5195b45","epoch_num":"100"} -------------------------------------------------------------------------------- /assets/models/standard_v3_0/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/standard_v3_0/model.onnx -------------------------------------------------------------------------------- /assets/models/standard_v3_1/config.min.json: -------------------------------------------------------------------------------- 1 | 
{"beg_size":1024,"mid_size":0,"end_size":1024,"use_inputs_at_offsets":false,"medium_confidence_threshold":0.5,"min_file_size_for_dl":8,"padding_token":256,"block_size":4096,"target_labels_space":["3gp","ace","ai","aidl","apk","applebplist","appleplist","asm","asp","autohotkey","autoit","awk","batch","bazel","bib","bmp","bzip","c","cab","cat","chm","clojure","cmake","cobol","coff","coffeescript","cpp","crt","crx","cs","csproj","css","csv","dart","deb","dex","dicom","diff","dm","dmg","doc","dockerfile","docx","dsstore","dwg","dxf","elf","elixir","emf","eml","epub","erb","erlang","flac","flv","fortran","gemfile","gemspec","gif","gitattributes","gitmodules","go","gradle","groovy","gzip","h5","handlebars","haskell","hcl","hlp","htaccess","html","icns","ico","ics","ignorefile","ini","internetshortcut","ipynb","iso","jar","java","javabytecode","javascript","jinja","jp2","jpeg","json","jsonl","julia","kotlin","latex","lha","lisp","lnk","lua","m3u","m4","macho","makefile","markdown","matlab","mht","midi","mkv","mp3","mp4","mscompress","msi","mum","npy","npz","nupkg","objectivec","ocaml","odp","ods","odt","ogg","one","onnx","otf","outlook","parquet","pascal","pcap","pdb","pdf","pebin","pem","perl","php","pickle","png","po","postscript","powershell","ppt","pptx","prolog","proteindb","proto","psd","python","pythonbytecode","pytorch","qt","r","randombytes","randomtxt","rar","rdf","rpm","rst","rtf","ruby","rust","scala","scss","sevenzip","sgml","shell","smali","snap","solidity","sql","sqlite","squashfs","srt","stlbinary","stltext","sum","svg","swf","swift","tar","tcl","textproto","tga","thumbsdb","tiff","toml","torrent","tsv","ttf","twig","txt","typescript","vba","vcxproj","verilog","vhdl","vtt","vue","wasm","wav","webm","webp","winregistry","wmf","woff","woff2","xar","xls","xlsb","xlsx","xml","xpi","xz","yaml","yara","zig","zip","zlibstream"],"thresholds":{"crt":0.9,"handlebars":0.9,"ignorefile":0.95,"latex":0.95,"markdown":0.9,"ocaml":0.9,"pascal":0.95,"rst":0.9,"sql":0.9,"tsv"
:0.9},"overwrite_map":{"randombytes":"unknown","randomtxt":"txt"},"protection":"none","aes_key_hex":"","version_major":3} 2 | -------------------------------------------------------------------------------- /assets/models/standard_v3_1/metadata.json: -------------------------------------------------------------------------------- 1 | {"epoch_num":"200"} 2 | -------------------------------------------------------------------------------- /assets/models/standard_v3_1/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/standard_v3_1/model.onnx -------------------------------------------------------------------------------- /assets/models/standard_v3_2/config.min.json: -------------------------------------------------------------------------------- 1 | {"beg_size":1024,"mid_size":0,"end_size":1024,"use_inputs_at_offsets":false,"medium_confidence_threshold":0.5,"min_file_size_for_dl":8,"padding_token":256,"block_size":4096,"target_labels_space":["3gp","ace","ai","aidl","apk","applebplist","appleplist","asm","asp","autohotkey","autoit","awk","batch","bazel","bib","bmp","bzip","c","cab","cat","chm","clojure","cmake","cobol","coff","coffeescript","cpp","crt","crx","cs","csproj","css","csv","dart","deb","dex","dicom","diff","dm","dmg","doc","dockerfile","docx","dsstore","dwg","dxf","elf","elixir","emf","eml","epub","erb","erlang","flac","flv","fortran","gemfile","gemspec","gif","gitattributes","gitmodules","go","gradle","groovy","gzip","h5","handlebars","haskell","hcl","hlp","htaccess","html","icns","ico","ics","ignorefile","ini","internetshortcut","ipynb","iso","jar","java","javabytecode","javascript","jinja","jp2","jpeg","json","jsonl","julia","kotlin","latex","lha","lisp","lnk","lua","m3u","m4","macho","makefile","markdown","matlab","mht","midi","mkv","mp3","mp4","mscompress","msi","mum","npy","npz","nupkg","objectivec","ocaml","o
dp","ods","odt","ogg","one","onnx","otf","outlook","parquet","pascal","pcap","pdb","pdf","pebin","pem","perl","php","pickle","png","po","postscript","powershell","ppt","pptx","prolog","proteindb","proto","psd","python","pythonbytecode","pytorch","qt","r","randombytes","randomtxt","rar","rdf","rpm","rst","rtf","ruby","rust","scala","scss","sevenzip","sgml","shell","smali","snap","solidity","sql","sqlite","squashfs","srt","stlbinary","stltext","sum","svg","swf","swift","tar","tcl","textproto","tga","thumbsdb","tiff","toml","torrent","tsv","ttf","twig","txt","typescript","vba","vcxproj","verilog","vhdl","vtt","vue","wasm","wav","webm","webp","winregistry","wmf","woff","woff2","xar","xls","xlsb","xlsx","xml","xpi","xz","yaml","yara","zig","zip","zlibstream"],"thresholds":{"crt":0.9,"handlebars":0.9,"ignorefile":0.95,"latex":0.95,"markdown":0.75,"ocaml":0.9,"pascal":0.95,"rst":0.9,"sql":0.9,"tsv":0.9},"overwrite_map":{"randombytes":"unknown","randomtxt":"txt"},"protection":"none","aes_key_hex":"","version_major":3} 2 | -------------------------------------------------------------------------------- /assets/models/standard_v3_2/metadata.json: -------------------------------------------------------------------------------- 1 | {"epoch_num":"190"} 2 | -------------------------------------------------------------------------------- /assets/models/standard_v3_2/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/standard_v3_2/model.onnx -------------------------------------------------------------------------------- /assets/models/standard_v3_3/config.min.json: -------------------------------------------------------------------------------- 1 | 
{"beg_size":1024,"mid_size":0,"end_size":1024,"use_inputs_at_offsets":false,"medium_confidence_threshold":0.5,"min_file_size_for_dl":8,"padding_token":256,"block_size":4096,"target_labels_space":["3gp","ace","ai","aidl","apk","applebplist","appleplist","asm","asp","autohotkey","autoit","awk","batch","bazel","bib","bmp","bzip","c","cab","cat","chm","clojure","cmake","cobol","coff","coffeescript","cpp","crt","crx","cs","csproj","css","csv","dart","deb","dex","dicom","diff","dm","dmg","doc","dockerfile","docx","dsstore","dwg","dxf","elf","elixir","emf","eml","epub","erb","erlang","flac","flv","fortran","gemfile","gemspec","gif","gitattributes","gitmodules","go","gradle","groovy","gzip","h5","handlebars","haskell","hcl","hlp","htaccess","html","icns","ico","ics","ignorefile","ini","internetshortcut","ipynb","iso","jar","java","javabytecode","javascript","jinja","jp2","jpeg","json","jsonl","julia","kotlin","latex","lha","lisp","lnk","lua","m3u","m4","macho","makefile","markdown","matlab","mht","midi","mkv","mp3","mp4","mscompress","msi","mum","npy","npz","nupkg","objectivec","ocaml","odp","ods","odt","ogg","one","onnx","otf","outlook","parquet","pascal","pcap","pdb","pdf","pebin","pem","perl","php","pickle","png","po","postscript","powershell","ppt","pptx","prolog","proteindb","proto","psd","python","pythonbytecode","pytorch","qt","r","randombytes","randomtxt","rar","rdf","rpm","rst","rtf","ruby","rust","scala","scss","sevenzip","sgml","shell","smali","snap","solidity","sql","sqlite","squashfs","srt","stlbinary","stltext","sum","svg","swf","swift","tar","tcl","textproto","tga","thumbsdb","tiff","toml","torrent","tsv","ttf","twig","txt","typescript","vba","vcxproj","verilog","vhdl","vtt","vue","wasm","wav","webm","webp","winregistry","wmf","woff","woff2","xar","xls","xlsb","xlsx","xml","xpi","xz","yaml","yara","zig","zip","zlibstream"],"thresholds":{"crt":0.9,"handlebars":0.9,"ignorefile":0.95,"latex":0.95,"markdown":0.75,"ocaml":0.9,"pascal":0.95,"r":0.9,"rst":0.9,"sql":
0.9,"tsv":0.9,"zig":0.9},"overwrite_map":{"randombytes":"unknown","randomtxt":"txt"},"protection":"none","aes_key_hex":"","version_major":3} 2 | -------------------------------------------------------------------------------- /assets/models/standard_v3_3/metadata.json: -------------------------------------------------------------------------------- 1 | {"epoch_num":"91"} 2 | -------------------------------------------------------------------------------- /assets/models/standard_v3_3/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/assets/models/standard_v3_3/model.onnx -------------------------------------------------------------------------------- /go/README.md: -------------------------------------------------------------------------------- 1 | # Go library 2 | 3 | This directory contains the Go library for Magika. 4 | 5 | The inference relies on the [ONNX Runtime](https://onnxruntime.ai/), and it 6 | requires [cgo](https://go.dev/blog/cgo) for interfacing with the ONNX Runtime 7 | [C API](https://onnxruntime.ai/docs/api/c/). 8 | 9 | - [`docker`](./docker) contains a sample docker file that builds a 10 | container image that ties together a Magika CLI, an ONNX Runtime, 11 | and a [model](../assets/models/standard_v2_1). 12 | - [`cli`](./cli) contains a basic CLI that illustrates how 13 | the Magika Go library may be called from within an application. 14 | - [`magika`](./magika) contains the library that extracts 15 | features from a sequence of bytes. 16 | - [`onnx`](./onnx) wraps the C API of the ONNX Runtime to 17 | provide an inference engine.
-------------------------------------------------------------------------------- /go/cli/cli.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "os" 8 | 9 | "github.com/google/magika/magika" 10 | ) 11 | 12 | const ( 13 | assetsDirEnv = "MAGIKA_ASSETS_DIR" 14 | modelNameEnv = "MAGIKA_MODEL" 15 | ) 16 | 17 | // cli is a basic CLI that infers the content type of the files listed on 18 | // the command line and writes one "<file>: <label>" line per file to w. The assets dir and the model name are given via the 19 | // environment variables MAGIKA_ASSETS_DIR and MAGIKA_MODEL, respectively; an error is returned if either is unset or empty. 20 | func cli(w io.Writer, args ...string) error { 21 | assetsDir := os.Getenv(assetsDirEnv) 22 | if assetsDir == "" { 23 | return fmt.Errorf("%s environment variable not set or empty", assetsDirEnv) 24 | } 25 | modelName := os.Getenv(modelNameEnv) 26 | if modelName == "" { 27 | return fmt.Errorf("%s environment variable not set or empty", modelNameEnv) 28 | } 29 | s, err := magika.NewScanner(assetsDir, modelName) 30 | if err != nil { 31 | return fmt.Errorf("create scanner: %w", err) 32 | } 33 | 34 | // For each filename given as argument, read the file and scan its content.
35 | for _, a := range args { 36 | fmt.Fprintf(w, "%s: ", a) 37 | b, err := os.ReadFile(a) 38 | if err != nil { 39 | fmt.Fprintf(w, "%v\n", err) 40 | continue 41 | } 42 | ct, err := s.Scan(bytes.NewReader(b), len(b)) 43 | if err != nil { 44 | fmt.Fprintf(w, "scan: %v\n", err) 45 | continue 46 | } 47 | fmt.Fprintf(w, "%s\n", ct.Label) 48 | } 49 | return nil 50 | } 51 | -------------------------------------------------------------------------------- /go/cli/cli_test.go: -------------------------------------------------------------------------------- 1 | //go:build cgo && onnxruntime 2 | 3 | package main 4 | 5 | import ( 6 | "path" 7 | "strings" 8 | "testing" 9 | 10 | "github.com/google/go-cmp/cmp" 11 | ) 12 | 13 | func TestCLI(t *testing.T) { 14 | const basicDir = "../../tests_data/basic" 15 | var ( 16 | files = []string{ 17 | path.Join(basicDir, "python/code.py"), 18 | path.Join(basicDir, "zip/magika_test.zip"), 19 | } 20 | b strings.Builder 21 | ) 22 | if err := cli(&b, files...); err != nil { 23 | t.Fatal(err) 24 | } 25 | if d := cmp.Diff(strings.Join([]string{ // want goes first: cmp.Diff marks its first argument with "-" and its second with "+". 26 | "../../tests_data/basic/python/code.py: python", 27 | "../../tests_data/basic/zip/magika_test.zip: zip", 28 | }, "\n"), strings.TrimSpace(b.String())); d != "" { 29 | t.Errorf("mismatch (-want +got):\n%s", d) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /go/cli/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | CLI is a simple command line interface for magika. 3 | 4 | It takes a list of files as argument, and infers their types in sequence. 5 | For example: 6 | 7 | $ magika test.go readme.md 8 | test.go: go 9 | readme.md: markdown 10 | 11 | The primary intent is to illustrate how the magika go library can be used 12 | and compiled, using cgo and the ONNX Runtime library.
13 | */ 14 | package main 15 | 16 | import ( 17 | "fmt" 18 | "os" 19 | ) 20 | 21 | func main() { 22 | if err := cli(os.Stdout, os.Args[1:]...); err != nil { 23 | fmt.Fprintf(os.Stderr, "Error: %v\n", err) // Report failures on stderr so they do not mix with the results written to stdout. 24 | os.Exit(1) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /go/cli/tests_data/magika_test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/go/cli/tests_data/magika_test.zip -------------------------------------------------------------------------------- /go/cli/tests_data/magika_test_pptx.txt: -------------------------------------------------------------------------------- 1 | This is a test for Magika! 2 | 3 | Very cool if this can be detected correctly! 4 | -------------------------------------------------------------------------------- /go/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/google/magika 2 | 3 | go 1.22.3 4 | 5 | require github.com/google/go-cmp v0.6.0 // indirect 6 | -------------------------------------------------------------------------------- /go/go.sum: -------------------------------------------------------------------------------- 1 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 2 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 3 | -------------------------------------------------------------------------------- /go/magika/config.go: -------------------------------------------------------------------------------- 1 | package magika 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "path" 8 | ) 9 | 10 | const ( 11 | configFile = "config.min.json" 12 | contentTypesKBFile = "content_types_kb.min.json" 13 | modelFile = "model.onnx" 14 | modelsDir = "models" 15 | ) 16 | 17 | // Config holds the portion of Magika's model
configuration that is relevant 18 | // for inference. 19 | type Config struct { 20 | BegSize int `json:"beg_size"` 21 | MidSize int `json:"mid_size"` 22 | EndSize int `json:"end_size"` 23 | UseInputsAtOffsets bool `json:"use_inputs_at_offsets"` 24 | MediumConfidenceThreshold float32 `json:"medium_confidence_threshold"` 25 | MinFileSizeForDl int64 `json:"min_file_size_for_dl"` 26 | PaddingToken int `json:"padding_token"` 27 | BlockSize int `json:"block_size"` 28 | TargetLabelsSpace []string `json:"target_labels_space"` 29 | Thresholds map[string]float32 `json:"thresholds"` 30 | } 31 | 32 | // ReadConfig is a helper that reads and unmarshal a Config, given an assets 33 | // dir and a model name. 34 | func ReadConfig(assetsDir, name string) (Config, error) { 35 | var cfg Config 36 | p := configPath(assetsDir, name) 37 | b, err := os.ReadFile(p) 38 | if err != nil { 39 | return Config{}, fmt.Errorf("read %q: %w", p, err) 40 | } 41 | if err := json.Unmarshal(b, &cfg); err != nil { 42 | return Config{}, fmt.Errorf("unmarshal: %w", err) 43 | } 44 | return cfg, nil 45 | } 46 | 47 | // contentTypesKBPath returns the content types KB path for the given 48 | // asset folder. 49 | func contentTypesKBPath(assetDir string) string { 50 | return path.Join(assetDir, contentTypesKBFile) 51 | } 52 | 53 | // configPath returns the model config for the given asset folder and model 54 | // name. 55 | func configPath(assetDir, name string) string { 56 | return path.Join(assetDir, modelsDir, name, configFile) 57 | } 58 | 59 | // modelPath returns the Onnx model for the given asset folder and model name. 
60 | func modelPath(assetDir, name string) string { 61 | return path.Join(assetDir, modelsDir, name, modelFile) 62 | } 63 | -------------------------------------------------------------------------------- /go/magika/content.go: -------------------------------------------------------------------------------- 1 | package magika 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | ) 8 | 9 | const ( 10 | contentTypeLabelEmpty = "empty" 11 | contentTypeLabelTxt = "txt" 12 | contentTypeLabelUnknown = "unknown" 13 | ) 14 | 15 | // ContentType holds the definition of a content type. 16 | type ContentType struct { 17 | Label string // As keyed in the content types KB. 18 | MimeType string `json:"mime_type"` 19 | Group string `json:"group"` 20 | Description string `json:"description"` 21 | Extensions []string `json:"extensions"` 22 | IsText bool `json:"is_text"` 23 | } 24 | 25 | // readContentTypesKB is a helper that reads and unmarshal a content types KB, 26 | // given the assets dir. 27 | // It returns a dictionary that maps a label as defined in the model config 28 | // target label space to a content type. 
29 | func readContentTypesKB(assetsDir string) (map[string]ContentType, error) { 30 | var ckb map[string]ContentType 31 | p := contentTypesKBPath(assetsDir) 32 | b, err := os.ReadFile(p) 33 | if err != nil { 34 | return nil, fmt.Errorf("read %q: %w", p, err) 35 | } 36 | if err := json.Unmarshal(b, &ckb); err != nil { 37 | return nil, fmt.Errorf("unmarshal: %w", err) 38 | } 39 | for label, ct := range ckb { 40 | ct.Label = label 41 | ckb[label] = ct 42 | } 43 | return ckb, nil 44 | } 45 | -------------------------------------------------------------------------------- /go/magika/features_test.go: -------------------------------------------------------------------------------- 1 | package magika 2 | 3 | import ( 4 | "bytes" 5 | "compress/gzip" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | "os" 10 | "testing" 11 | 12 | "github.com/google/go-cmp/cmp" 13 | "github.com/google/go-cmp/cmp/cmpopts" 14 | ) 15 | 16 | func TestExtractFeatures(t *testing.T) { 17 | f, err := os.Open("../../tests_data/features_extraction/reference.json.gz") 18 | if err != nil { 19 | t.Fatal(err) 20 | } 21 | r, err := gzip.NewReader(f) 22 | if err != nil { 23 | t.Fatalf("could not uncompress test data: %s", err) 24 | } 25 | b, err := io.ReadAll(r) 26 | if err != nil { 27 | t.Fatalf("could not read uncompress test data: %s", err) 28 | } 29 | 30 | var cases []struct { 31 | TestInfo Config `json:"test_info"` 32 | Content []byte `json:"content"` 33 | FeaturesV2 Features `json:"features_v2"` 34 | } 35 | if err := json.Unmarshal(b, &cases); err != nil { 36 | t.Fatal(err) 37 | } 38 | for i, c := range cases { 39 | t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { 40 | ft, err := ExtractFeatures(c.TestInfo, bytes.NewReader(c.Content), len(c.Content)) 41 | if err != nil { 42 | t.Fatal(err) 43 | } 44 | if d := cmp.Diff(ft, c.FeaturesV2, cmpopts.IgnoreUnexported(Features{})); d != "" { 45 | t.Errorf("mismatch (-got +want):\n%s", d) 46 | } 47 | }) 48 | } 49 | } 50 | 
-------------------------------------------------------------------------------- /go/magika/scanner_test.go: -------------------------------------------------------------------------------- 1 | //go:build cgo && onnxruntime 2 | 3 | package magika 4 | 5 | import ( 6 | "bytes" 7 | "os" 8 | "path" 9 | "testing" 10 | 11 | "github.com/google/go-cmp/cmp" 12 | ) 13 | 14 | func TestScannerBasic(t *testing.T) { 15 | const basicDir = "../../tests_data/basic" 16 | es, err := os.ReadDir(basicDir) 17 | if err != nil { 18 | t.Fatalf("read tests data: %v", err) 19 | } 20 | s := newTestScanner(t) 21 | for _, e := range es { 22 | t.Run(e.Name(), func(t *testing.T) { 23 | dir := path.Join(basicDir, e.Name()) 24 | es, err := os.ReadDir(dir) 25 | if err != nil { 26 | t.Fatalf("read tests data: %v", err) 27 | } 28 | for _, ee := range es { 29 | p := path.Join(dir, ee.Name()) 30 | fi, err := os.Stat(p) 31 | if err != nil { 32 | t.Fatalf("stat %s: %v", p, err) 33 | } 34 | f, err := os.Open(p) 35 | if err != nil { 36 | t.Fatalf("open %s: %v", p, err) 37 | } 38 | ct, err := s.Scan(f, int(fi.Size())) 39 | if err != nil { 40 | t.Fatalf("scan %s: %v", p, err) 41 | } 42 | if d := cmp.Diff(ct.Label, e.Name()); d != "" { 43 | t.Errorf("unexpected content type for %s (-got +want):\n%s", ee.Name(), d) 44 | } 45 | } 46 | }) 47 | } 48 | } 49 | 50 | func TestScannerSmall(t *testing.T) { 51 | s := newTestScanner(t) 52 | for _, c := range []struct { 53 | name string 54 | data []byte 55 | want string 56 | }{{ 57 | name: "empty", 58 | data: []byte{}, 59 | want: contentTypeLabelEmpty, 60 | }, { 61 | name: "small txt", 62 | data: []byte("small"), 63 | want: contentTypeLabelTxt, 64 | }, { 65 | name: "small bin", 66 | data: []byte{0x80, 0x80, 0x80, 0x80}, 67 | want: contentTypeLabelUnknown, 68 | }} { 69 | t.Run(c.name, func(t *testing.T) { 70 | ct, err := s.Scan(bytes.NewReader(c.data), len(c.data)) 71 | if err != nil { 72 | t.Fatalf("scan: %v", err) 73 | } 74 | if d := cmp.Diff(ct, s.ckb[c.want]); d != "" { 
75 | t.Errorf("unexpected content type (-got +want):\n%s", d) 76 | } 77 | }) 78 | } 79 | } 80 | 81 | func newTestScanner(t *testing.T) *Scanner { 82 | t.Helper() 83 | const ( 84 | assetsDir = "../../assets" 85 | modelName = "standard_v2_1" 86 | ) 87 | s, err := NewScanner(assetsDir, modelName) 88 | if err != nil { 89 | t.Fatalf("new scanner: %v", err) 90 | } 91 | return s 92 | } 93 | -------------------------------------------------------------------------------- /go/onnx/onnx.go: -------------------------------------------------------------------------------- 1 | package onnx 2 | 3 | // Onnx represents something that can run inferences on features. 4 | type Onnx interface { 5 | // Run returns the result of the inference on the given features. 6 | Run(features []int32) ([]float32, error) 7 | } 8 | -------------------------------------------------------------------------------- /go/onnx/onnx_runtime.go: -------------------------------------------------------------------------------- 1 | //go:build cgo && onnxruntime 2 | 3 | package onnx 4 | 5 | // #cgo LDFLAGS: -lonnxruntime 6 | // #include "onnx_runtime.h" 7 | import "C" 8 | 9 | import ( 10 | "fmt" 11 | ) 12 | 13 | // NewOnnx returns an onnx that can perform inferences using an ONNX Runtime 14 | // (https://onnxruntime.ai/) and the given model. 15 | // It wraps the C calls to the ONNX Runtime API https://onnxruntime.ai/docs/api/c. 16 | func NewOnnx(modelPath string, sizeTarget int) (Onnx, error) { 17 | ort := &onnxRuntime{ 18 | api: C.GetApiBase(), 19 | sizeTarget: sizeTarget, 20 | } 21 | if err := C.CreateSession(ort.api, C.CString(modelPath), &ort.session, &ort.memory); err != nil { 22 | return nil, fmt.Errorf("create session: %v", C.GoString(C.GetErrorMessage(err))) 23 | } 24 | return ort, nil 25 | } 26 | 27 | // onnxRuntime implements the Onnx interface relying on a cgo call 28 | // to a C ONNX Runtime library. 
29 | type onnxRuntime struct { 30 | api *C.OrtApi 31 | session *C.OrtSession 32 | memory *C.OrtMemoryInfo 33 | sizeTarget int 34 | } 35 | 36 | func (ort *onnxRuntime) Run(features []int32) ([]float32, error) { 37 | target := make([]float32, ort.sizeTarget) 38 | if err := C.Run(ort.api, ort.session, ort.memory, (*C.int32_t)(&features[0]), C.int64_t(len(features)), (*C.float)(&target[0]), C.int64_t(len(target))); err != nil { 39 | return nil, fmt.Errorf("run: %v", C.GoString(C.GetErrorMessage(err))) 40 | } 41 | return target, nil 42 | } 43 | -------------------------------------------------------------------------------- /go/onnx/onnx_runtime.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define RETURN_ON_ERROR(expr) { \ 5 | OrtStatus* onnx_status = (expr); \ 6 | if (onnx_status != NULL) { \ 7 | return onnx_status; \ 8 | } \ 9 | } 10 | 11 | const OrtApi *GetApiBase() { 12 | return OrtGetApiBase()->GetApi(ORT_API_VERSION); 13 | } 14 | 15 | OrtStatus *CreateSession(const OrtApi *ort, const char *model, OrtSession **session, OrtMemoryInfo **memory_info) { 16 | OrtEnv *env; 17 | RETURN_ON_ERROR(ort->CreateEnv(ORT_LOGGING_LEVEL_ERROR, "onnx", &env)); 18 | RETURN_ON_ERROR(ort->DisableTelemetryEvents(env)); 19 | OrtSessionOptions *options; 20 | RETURN_ON_ERROR(ort->CreateSessionOptions(&options)); 21 | RETURN_ON_ERROR(ort->EnableCpuMemArena(options)); 22 | RETURN_ON_ERROR(ort->CreateSession(env, model, options, session)); 23 | RETURN_ON_ERROR(ort->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, memory_info)); 24 | return NULL; 25 | } 26 | 27 | OrtStatus *Run(const OrtApi *ort, OrtSession *session, OrtMemoryInfo *memory_info, int32_t features[], int64_t sizeFeatures, float target[], int64_t sizeTarget) { 28 | const char *input_names[] = {"bytes"}; 29 | const char *output_names[] = {"target_label"}; 30 | const int64_t input_shape[] = {1, sizeFeatures}; 31 | OrtValue *input_tensor = NULL; 32 | 
RETURN_ON_ERROR(ort->CreateTensorWithDataAsOrtValue(memory_info, features, sizeFeatures * sizeof(int32_t), input_shape, 2, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, &input_tensor)); 33 | OrtValue *output_tensor = NULL; 34 | RETURN_ON_ERROR(ort->Run(session, NULL, input_names, (const OrtValue *const *) &input_tensor, 1, output_names, 1, &output_tensor)); 35 | float *out = NULL; 36 | RETURN_ON_ERROR(ort->GetTensorMutableData(output_tensor, (void **) &out)); 37 | memcpy(target, out, sizeTarget * sizeof(float)); 38 | ort->ReleaseValue(input_tensor); 39 | ort->ReleaseValue(output_tensor); 40 | return NULL; 41 | } 42 | 43 | const char *GetErrorMessage(const OrtStatus* onnx_status) { 44 | if (onnx_status == NULL) { 45 | return ""; 46 | } 47 | return OrtGetApiBase()->GetApi(ORT_API_VERSION)->GetErrorMessage(onnx_status); 48 | } 49 | -------------------------------------------------------------------------------- /go/onnx/onnx_runtime_test.go: -------------------------------------------------------------------------------- 1 | //go:build cgo && onnxruntime 2 | 3 | package onnx_test 4 | 5 | import ( 6 | "math/rand/v2" 7 | "testing" 8 | 9 | "github.com/google/magika/magika" 10 | "github.com/google/magika/onnx" 11 | ) 12 | 13 | func TestONNXRuntime(t *testing.T) { 14 | const ( 15 | assetsDir = "../../assets" 16 | modelName = "standard_v2_1" 17 | modelPath = "../../assets/models/" + modelName + "/model.onnx" 18 | ) 19 | 20 | cfg, err := magika.ReadConfig(assetsDir, modelName) 21 | if err != nil { 22 | t.Fatal(err) 23 | } 24 | 25 | rt, err := onnx.NewOnnx(modelPath, len(cfg.TargetLabelsSpace)) 26 | if err != nil { 27 | t.Fatalf("Create onnx: %v", err) 28 | } 29 | 30 | // Initialize a random features tensor. 31 | features := make([]int32, cfg.BegSize+cfg.MidSize+cfg.EndSize) 32 | for i := range features { 33 | features[i] = rand.Int32() 34 | } 35 | 36 | // Get the scores and check its size. 
// NewOnnx returns a nil Onnx runtime and a nil error, ignoring its arguments.
// This stub is compiled when the `cgo && onnxruntime` build constraint is not
// satisfied, which allows for building and unit testing in a non-cgo context.
// The returned Onnx is nil, so calling Run on it panics.
func NewOnnx(string, int) (Onnx, error) {
	return nil, nil
}
/**
 * Writes a trimmed copy of a package.json for a CJS or ESM sub-build.
 *
 * The entry-point fields (main, module, browser, types, exports) are dropped
 * and the "type" field is set to the given module type.
 *
 * @param {string} source Path of the package.json to read.
 * @param {string} output Path of the package.json to write.
 * @param {string} type Value of the "type" field, e.g. 'commonjs' or 'module'.
 */
function formatPackage(source, output, type) {
    const dropped = {main: true, module: true, browser: true, types: true, exports: true};
    const manifest = JSON.parse(fs.readFileSync(source, 'utf-8'));
    const keptEntries = Object.entries(manifest).filter((entry) => !dropped[entry[0]]);
    const trimmed = Object.assign(Object.fromEntries(keptEntries), {type});
    fs.writeFileSync(output, JSON.stringify(trimmed, null, 4));
}
// Optional settings describing where the model and its configuration come
// from. Every field is optional.
// NOTE(review): how URL and path variants are prioritised is decided by the
// code consuming these options, which is not visible here — confirm there.
export interface MagikaOptions {
  // Presumably the URL the model is fetched from — TODO confirm.
  modelURL?: string;
  // Presumably the local file path of the model — TODO confirm.
  modelPath?: string;
  // Presumably the URL the model config is fetched from — TODO confirm.
  modelConfigURL?: string;
  // Presumably the local file path of the model config — TODO confirm.
  modelConfigPath?: string;
}
// The prediction part of a result.
export interface MagikaPrediction {
  // NOTE(review): presumably the content type predicted by the model itself,
  // before any overwrite — confirm against the code filling this in.
  dl: ContentTypeInfo;
  // NOTE(review): presumably the content type finally reported — confirm.
  output: ContentTypeInfo;
  // Score associated with the prediction.
  score: number;
  // Why `output` differs from `dl`, if it does (see OverwriteReason).
  overwrite_reason: OverwriteReason;
  // Optional per-label scores; not every label has to be present.
  // NOTE(review): the type arguments were lost in this copy of the file;
  // restored as a label-to-score map — confirm against the original.
  scores_map?: Partial<Record<ContentTypeLabel, number>>;
}
// ModelConfig variant that can load its configuration from a local JSON file.
export class ModelConfigNode extends ModelConfig {
  /**
   * Reads and parses the JSON config at `configPath` and applies it with
   * `setConfig`. Does nothing when a config has already been loaded.
   *
   * Rejects if the file cannot be read or is not valid JSON.
   */
  async loadFile(configPath: string): Promise<void> {
    if (this.loaded) {
      return;
    }
    const config = JSON.parse((await fs.readFile(configPath)).toString());
    this.setConfig(config);
    // Only flag as loaded once the config has been successfully applied.
    this.loaded = true;
  }
}
// Holds the features fed to the model: one buffer for the beginning of the
// content and one for its end, both pre-filled with the padding token.
export class ModelFeatures {
  beg_ints: Uint16Array;
  end_ints: Uint16Array;
  // Each buffer can be written once; later writes are ignored.
  locked: { beg: boolean; end: boolean };

  // Allocates both buffers. Throws when the config asks for a middle chunk
  // or for inputs at offsets, which this implementation does not support.
  constructor(
    beg_size: number,
    mid_size: number,
    end_size: number,
    padding_token: number,
    use_inputs_at_offsets: boolean,
  ) {
    const hasMidChunk = mid_size != 0;
    if (hasMidChunk) {
      throw new Error(
        `Assertion failed: This implementation does not support mid_size (${mid_size}) != 0 model config.`,
      );
    }
    if (use_inputs_at_offsets) {
      throw new Error(
        `Assertion failed: This implementation does not support use_inputs_at_offsets = true model config.`,
      );
    }

    const begBuffer = new Uint16Array(beg_size);
    begBuffer.fill(padding_token);
    const endBuffer = new Uint16Array(end_size);
    endBuffer.fill(padding_token);

    this.beg_ints = begBuffer;
    this.end_ints = endBuffer;
    this.locked = { beg: false, end: false };
  }

  // Copies `data` into the beginning buffer at `offset`, first call only.
  withStart(data: Uint8Array, offset: number): this {
    if (this.locked.beg) {
      return this;
    }
    this.locked.beg = true;
    this.beg_ints.set(data, offset);
    return this;
  }

  // Copies `data` into the end buffer at `offset`, first call only.
  withEnd(data: Uint8Array, offset: number): this {
    if (this.locked.end) {
      return this;
    }
    this.locked.end = true;
    this.end_ints.set(data, offset);
    return this;
  }

  // Returns the beginning values followed by the end values.
  toArray(): number[] {
    return Array.from(this.beg_ints).concat(Array.from(this.end_ints));
  }
}
// Model variant that can load its graph model from the local file system.
export class ModelNode extends Model {
  /**
   * Loads the graph model found at `modelPath` through the tfjs-node file
   * system handler. Does nothing when a model is already set.
   *
   * Rejects if the model cannot be loaded.
   */
  async loadFile(modelPath: string): Promise<void> {
    if (!this.model) {
      const handler = tfn.io.fileSystem(modelPath);
      this.model = await tf.loadGraphModel(handler);
    }
  }
}
// A prediction made of a label, a score, and a map of per-label scores.
export interface ModelPrediction {
  // Predicted content type label.
  label: ContentTypeLabel;
  // Score associated with `label`.
  score: number;
  // Per-label scores; not every label has to be present.
  // NOTE(review): the type arguments were lost in this copy of the file;
  // restored as a label-to-score map — confirm against the original.
  scores_map: Partial<Record<ContentTypeLabel, number>>;
}
// Status attached to a result; the values are the serialized status strings.
export enum Status {
  // No errors.
  OK = "ok",

  // Used when a file path does not exist.
  FILE_NOT_FOUND_ERROR = "file-not-found-error",

  // Used when a file path exists, but there are permission issues, e.g., can't read file.
  PERMISSION_ERROR = "permission-error",

  // Represents a generic error-like unknown status.
  UNKNOWN = "unknown",
}
// Test helper that wraps "@tensorflow/tfjs-node" in a jest mock counting how
// many times each of its exported keys is read.
export class TfnMock {
  // Number of reads per exported key of the real module.
  // NOTE(review): the type arguments were lost in this copy of the file;
  // restored from how the map is used below (string keys, numeric counters).
  static accessed: Record<string, number> = {};

  static mock = jest.mock(
    "@tensorflow/tfjs-node",
    () => {
      const hook = {};
      const original = jest.requireActual("@tensorflow/tfjs-node") as any;
      Object.keys(original as object).forEach((key) => {
        TfnMock.accessed[key] = 0;
        // Forward every read to the real module while counting it.
        Object.defineProperty(hook, key, {
          configurable: true, // allow spyOn to work
          enumerable: true, // so the key shows up
          get(): any {
            TfnMock.accessed[key] = (TfnMock.accessed[key] || 0) + 1;
            return original[key];
          },
        });
      });
      return hook;
    },
    { virtual: true },
  );

  // Resets every access counter to zero.
  static reset() {
    for (const i in TfnMock.accessed) {
      TfnMock.accessed[i] = 0;
    }
  }
}
/**
 * Reads a gzip-compressed JSON file and returns its content, which must be a
 * JSON array.
 *
 * NOTE(review): the type arguments were lost in this copy of the file; the
 * element type is exposed as a type parameter defaulting to `any`, which
 * accepts calls made with or without an explicit type argument.
 *
 * @param filePath Path of the gzipped JSON file.
 * @returns The parsed array. Elements are not validated against `T`.
 * @throws If the file cannot be read, gunzipped or parsed, or if the parsed
 *     value is not an array.
 */
export function parseGzippedJSON<T = any>(filePath: string): Array<T> {
  const gzippedBuffer = fs.readFileSync(filePath);
  const jsonBuffer = zlib.gunzipSync(gzippedBuffer);
  const jsonString = jsonBuffer.toString("utf-8");
  const parsedData = JSON.parse(jsonString);
  if (!Array.isArray(parsedData)) {
    throw new Error("Parsed JSON is not an array as expected for ExampleList.");
  }
  return parsedData as Array<T>;
}
"strict": true, 18 | "traceResolution": false, 19 | "types": ["node", "jest"] 20 | }, 21 | "compileOnSave": false, 22 | "exclude": ["node_modules", "dist"], 23 | "include": ["src", "magika-cli.ts", "magika.ts", "magika-node.ts"] 24 | } -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | **/.ruff_cache/ 4 | .ipynb_checkpoints 5 | venv/ 6 | .env 7 | *.swp 8 | *.h5 9 | *.egg-info 10 | dist/* 11 | *.pickle 12 | .s.yml 13 | -------------------------------------------------------------------------------- /python/.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /python/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | show_error_codes = True 3 | follow_imports = silent 4 | local_partial_types = true 5 | strict_equality = true 6 | no_implicit_optional = true 7 | warn_incomplete_stub = true 8 | warn_redundant_casts = true 9 | warn_unused_configs = true 10 | warn_unused_ignores = true 11 | enable_error_code = ignore-without-code, redundant-self, truthy-iterable 12 | disable_error_code = annotation-unchecked, import-not-found, import-untyped, type-arg, no-any-unimported 13 | extra_checks = false 14 | check_untyped_defs = true 15 | disallow_incomplete_defs = true 16 | disallow_subclassing_any = true 17 | disallow_untyped_calls = true 18 | disallow_untyped_decorators = true 19 | disallow_untyped_defs = true 20 | warn_return_any = true 21 | warn_unreachable = true 22 | allow_redefinition = false 23 | strict_optional = true 24 | 25 | [mypy-magika.*] 26 | ignore_missing_imports = true 27 | no_implicit_reexport = true 28 | disallow_untyped_calls = true 29 | disallow_any_unimported = true 30 | disallow_untyped_decorators = true 
31 | strict = true 32 | enable_error_code = ignore-without-code, redundant-self, truthy-iterable, possibly-undefined, truthy-bool, truthy-iterable, unused-ignore, mutable-override 33 | 34 | [mypy-magika.strenum.*] 35 | ignore_errors = True 36 | 37 | [mypy-tests.*] 38 | disallow_untyped_defs = false 39 | disallow_untyped_calls = false 40 | disallow_untyped_decorators = false 41 | -------------------------------------------------------------------------------- /python/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | log_cli = 1 3 | log_level = WARNING 4 | 5 | markers = 6 | smoketest 7 | slow 8 | 9 | -------------------------------------------------------------------------------- /python/scripts/check_copyright.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023-2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | # Taken from https://github.com/google/scaaml/blob/main/tools/check_copyright.sh 18 | 19 | 20 | errors=0 21 | e() { 22 | echo -e "$(tput bold)$(tput setaf 1)Error:$(tput sgr0) $*" 23 | errors=$(( errors + 1 )) 24 | } 25 | 26 | # Files we want to check for copyright 27 | EXTENSIONS="py\|sh\|ts" 28 | 29 | 30 | for file in $(git ls-files | \ 31 | grep -e '\.\('"${EXTENSIONS}"'\)$' | \ 32 | grep -v "tests_data") 33 | do 34 | sed -n 'N;/Copyright/q;q1' "$file" || e "No copyright notice in $file" 35 | done 36 | 37 | if [ $errors -gt 0 ] 38 | then 39 | exit 1 40 | fi 41 | exit 0 42 | -------------------------------------------------------------------------------- /python/scripts/check_source.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023-2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # From https://stackoverflow.com/a/246128 17 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 18 | 19 | PYTHON_ROOT_DIR=$SCRIPT_DIR/../ 20 | 21 | pushd $PYTHON_ROOT_DIR > /dev/null 22 | 23 | echo "Running ruff..." 24 | ruff check 25 | 26 | echo "Running mypy..." 
27 | mypy src/magika tests 28 | 29 | popd > /dev/null 30 | -------------------------------------------------------------------------------- /python/scripts/generate_reference.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from pathlib import Path 3 | 4 | import click 5 | 6 | python_root_dir = Path(__file__).parent.parent 7 | 8 | 9 | @click.command() 10 | def main(): 11 | test_scripts_paths = [ 12 | python_root_dir / "tests" / "test_features_extraction_vs_reference.py", 13 | python_root_dir / "tests" / "test_inference_vs_reference.py", 14 | ] 15 | 16 | for test_script_path in test_scripts_paths: 17 | assert test_script_path.is_file() 18 | cmd = [ 19 | "uv", 20 | "run", 21 | str(test_script_path), 22 | "generate-tests", 23 | ] 24 | 25 | print(f'Running CMD: {" ".join(cmd)}') 26 | subprocess.run( 27 | cmd, 28 | cwd=python_root_dir, 29 | check=True, 30 | ) 31 | 32 | print("Everything went good.") 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /python/scripts/prepare_pyproject_for_pure_python_wheel.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from pathlib import Path 17 | 18 | import tomli 19 | import tomli_w 20 | 21 | 22 | def main() -> None: 23 | pyproject_toml_path = Path(__file__).parent.parent / "pyproject.toml" 24 | 25 | pyproject_content = tomli.loads(pyproject_toml_path.read_text()) 26 | 27 | # Remove entry about maturin, we don't need it 28 | _ = pyproject_content["tool"].pop("maturin") 29 | 30 | # Tell uv we want to use the hatchling build system 31 | pyproject_content["build-system"] = { 32 | "requires": ["hatchling"], 33 | "build-backend": "hatchling.build", 34 | } 35 | 36 | # Make the python's magika client available as a script 37 | pyproject_content["project"]["scripts"] = { 38 | "magika-python-client": "magika.cli.magika_client:main", 39 | "magika": "magika.cli.magika_rust_client_not_found_warning:main", 40 | } 41 | 42 | pyproject_toml_path.write_text(tomli_w.dumps(pyproject_content)) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /python/scripts/run_quick_test_magika_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2023-2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """ 17 | This script should only rely on dependencies installed with `pip install 18 | magika`; this script is used as part of "build & test package" github action, 19 | and the dev dependencies are not available. 20 | """ 21 | 22 | import sys 23 | from pathlib import Path 24 | 25 | import click 26 | 27 | from magika import ContentTypeLabel, Magika, PredictionMode 28 | 29 | 30 | @click.command() 31 | def main() -> None: 32 | m = Magika(prediction_mode=PredictionMode.HIGH_CONFIDENCE) 33 | 34 | print(f"Magika instance details: {m}") 35 | 36 | res = m.identify_bytes(b"text") 37 | assert res.dl.label == ContentTypeLabel.UNDEFINED 38 | assert res.output.label == ContentTypeLabel.TXT 39 | assert res.score == 1.0 40 | 41 | res = m.identify_bytes(b"\xff\xff\xff") 42 | assert res.dl.label == ContentTypeLabel.UNDEFINED 43 | assert res.output.label == ContentTypeLabel.UNKNOWN 44 | assert res.score == 1.0 45 | 46 | basic_tests_dir = ( 47 | Path(__file__).parent.parent.parent / "tests_data" / "basic" 48 | ).resolve() 49 | 50 | files_paths = sorted(filter(lambda p: p.is_file(), basic_tests_dir.rglob("*"))) 51 | 52 | with_error = False 53 | for file_path in files_paths: 54 | res = m.identify_path(file_path) 55 | output_label = res.output.label 56 | expected_label = file_path.parent.name 57 | if expected_label != output_label: 58 | with_error = True 59 | print( 60 | f"ERROR: Misprediction for {file_path}: expected_label={expected_label}, output_label={output_label}" 61 | ) 62 | 63 | if with_error: 64 | print("ERROR: There was at least one misprediction") 65 | sys.exit(1) 66 | 67 | print("All examples were predicted correctly") 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /python/src/magika/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | __version__ = "0.6.3-dev" 17 | 18 | 19 | import dotenv 20 | 21 | from magika import magika 22 | from magika.types import content_type_label, magika_error, prediction_mode 23 | 24 | Magika = magika.Magika 25 | MagikaError = magika_error.MagikaError 26 | ContentTypeLabel = content_type_label.ContentTypeLabel 27 | PredictionMode = prediction_mode.PredictionMode 28 | 29 | dotenv.load_dotenv(dotenv.find_dotenv()) 30 | -------------------------------------------------------------------------------- /python/src/magika/cli/magika_rust_client_not_found_warning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import sys 17 | 18 | 19 | def main() -> None: 20 | message = """ 21 | WARNING: you have attempted to run `$ magika` (the Rust client), but this is not 22 | available in the python package you installed, likely because magika pipeline 23 | does not currently build binary wheels compatible with your platform settings. 24 | 25 | If you think this is a problem worth solving, please open an issue at 26 | https://github.com/google/magika. 27 | 28 | In the meantime, you can use the old python magika client with `$ magika-python-client`. 29 | """ 30 | 31 | print(message.strip()) 32 | sys.exit(1) 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /python/src/magika/colors.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | # Taken from https://en.wikipedia.org/wiki/ANSI_escape_code 17 | 18 | BLACK = "\033[0;30m" 19 | RED = "\033[0;31m" 20 | GREEN = "\033[0;32m" 21 | YELLOW = "\033[0;33m" 22 | BLUE = "\033[0;34m" 23 | PURPLE = "\033[0;35m" 24 | CYAN = "\033[0;36m" 25 | LIGHT_GRAY = "\033[0;37m" 26 | 27 | DARK_GRAY = "\033[1;30m" 28 | LIGHT_RED = "\033[1;31m" 29 | LIGHT_GREEN = "\033[1;32m" 30 | LIGHT_YELLOW = "\033[1;33m" 31 | LIGHT_BLUE = "\033[1;34m" 32 | LIGHT_PURPLE = "\033[1;35m" 33 | LIGHT_CYAN = "\033[1;36m" 34 | WHITE = "\033[1;37m" 35 | 36 | RESET = "\033[0;39m" 37 | -------------------------------------------------------------------------------- /python/src/magika/models/standard_v3_3/config.min.json: -------------------------------------------------------------------------------- 1 | {"beg_size":1024,"mid_size":0,"end_size":1024,"use_inputs_at_offsets":false,"medium_confidence_threshold":0.5,"min_file_size_for_dl":8,"padding_token":256,"block_size":4096,"target_labels_space":["3gp","ace","ai","aidl","apk","applebplist","appleplist","asm","asp","autohotkey","autoit","awk","batch","bazel","bib","bmp","bzip","c","cab","cat","chm","clojure","cmake","cobol","coff","coffeescript","cpp","crt","crx","cs","csproj","css","csv","dart","deb","dex","dicom","diff","dm","dmg","doc","dockerfile","docx","dsstore","dwg","dxf","elf","elixir","emf","eml","epub","erb","erlang","flac","flv","fortran","gemfile","gemspec","gif","gitattributes","gitmodules","go","gradle","groovy","gzip","h5","handlebars","haskell","hcl","hlp","htaccess","html","icns","ico","ics","ignorefile","ini","internetshortcut","ipynb","iso","jar","java","javabytecode","javascript","jinja","jp2","jpeg","json","jsonl","julia","kotlin","latex","lha","lisp","lnk","lua","m3u","m4","macho","makefile","markdown","matlab","mht","midi","mkv","mp3","mp4","mscompress","msi","mum","npy","npz","nupkg","objectivec","ocaml","odp","ods","odt","ogg","one","onnx","otf","outlook","parquet","pascal","pcap","pdb","pdf","pebin","pem"
,"perl","php","pickle","png","po","postscript","powershell","ppt","pptx","prolog","proteindb","proto","psd","python","pythonbytecode","pytorch","qt","r","randombytes","randomtxt","rar","rdf","rpm","rst","rtf","ruby","rust","scala","scss","sevenzip","sgml","shell","smali","snap","solidity","sql","sqlite","squashfs","srt","stlbinary","stltext","sum","svg","swf","swift","tar","tcl","textproto","tga","thumbsdb","tiff","toml","torrent","tsv","ttf","twig","txt","typescript","vba","vcxproj","verilog","vhdl","vtt","vue","wasm","wav","webm","webp","winregistry","wmf","woff","woff2","xar","xls","xlsb","xlsx","xml","xpi","xz","yaml","yara","zig","zip","zlibstream"],"thresholds":{"crt":0.9,"handlebars":0.9,"ignorefile":0.95,"latex":0.95,"markdown":0.75,"ocaml":0.9,"pascal":0.95,"r":0.9,"rst":0.9,"sql":0.9,"tsv":0.9,"zig":0.9},"overwrite_map":{"randombytes":"unknown","randomtxt":"txt"},"protection":"none","aes_key_hex":"","version_major":3} 2 | -------------------------------------------------------------------------------- /python/src/magika/models/standard_v3_3/metadata.json: -------------------------------------------------------------------------------- 1 | {"epoch_num":"91"} 2 | -------------------------------------------------------------------------------- /python/src/magika/models/standard_v3_3/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/python/src/magika/models/standard_v3_3/model.onnx -------------------------------------------------------------------------------- /python/src/magika/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/python/src/magika/py.typed -------------------------------------------------------------------------------- /python/src/magika/types/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from magika.types.content_type_info import ContentTypeInfo # noqa: F401 17 | from magika.types.content_type_label import ContentTypeLabel # noqa: F401 18 | from magika.types.magika_error import MagikaError # noqa: F401 19 | from magika.types.magika_prediction import MagikaPrediction # noqa: F401 20 | from magika.types.magika_result import MagikaResult # noqa: F401 21 | from magika.types.model import ( # noqa: F401 22 | ModelConfig, 23 | ModelFeatures, 24 | ModelOutput, 25 | ) 26 | from magika.types.overwrite_reason import OverwriteReason # noqa: F401 27 | from magika.types.prediction_mode import PredictionMode # noqa: F401 28 | from magika.types.seekable import Seekable # noqa: F401 29 | from magika.types.status import Status # noqa: F401 30 | 31 | __all__ = [ 32 | "ContentTypeInfo", 33 | "ContentTypeLabel", 34 | "MagikaError", 35 | "MagikaPrediction", 36 | "MagikaResult", 37 | "ModelConfig", 38 | "ModelFeatures", 39 | "ModelOutput", 40 | "OverwriteReason", 41 | "PredictionMode", 42 | "Seekable", 43 | "Status", 44 | ] 45 | -------------------------------------------------------------------------------- /python/src/magika/types/content_type_info.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from 
dataclasses import dataclass 3 | from typing import List 4 | 5 | from magika.logger import get_logger 6 | from magika.types.content_type_label import ContentTypeLabel 7 | 8 | 9 | @dataclass(frozen=True) 10 | class ContentTypeInfo: 11 | label: ContentTypeLabel 12 | mime_type: str 13 | group: str 14 | description: str 15 | extensions: List[str] 16 | is_text: bool 17 | 18 | @property 19 | def ct_label(self) -> str: 20 | warnings.warn( 21 | "`.ct_label` is deprecated and will be removed in a future version. Use `.label` instead. Consult the documentation for more information.", 22 | category=DeprecationWarning, 23 | stacklevel=2, 24 | ) 25 | return str(self.label) 26 | 27 | @property 28 | def score(self) -> float: 29 | error_msg = "Unsupported field error: `.score.` is not stored anymore in the `dl` or `output` objects; it is now stored in `MagikaResult`. Consult the documentation for more information." 30 | log = get_logger() 31 | log.error(error_msg) 32 | raise AttributeError(error_msg) 33 | 34 | @property 35 | def magic(self) -> str: 36 | warnings.warn( 37 | "`.magic` is deprecated and will be removed in a future version. Use `.description` instead. Consult the documentation for more information.", 38 | category=DeprecationWarning, 39 | stacklevel=2, 40 | ) 41 | return self.description 42 | -------------------------------------------------------------------------------- /python/src/magika/types/magika_error.py: -------------------------------------------------------------------------------- 1 | class MagikaError(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /python/src/magika/types/magika_prediction.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import annotations 16 | 17 | from dataclasses import dataclass 18 | 19 | from magika.types.content_type_info import ContentTypeInfo 20 | from magika.types.overwrite_reason import OverwriteReason 21 | 22 | 23 | @dataclass(frozen=True) 24 | class MagikaPrediction: 25 | dl: ContentTypeInfo 26 | output: ContentTypeInfo 27 | score: float 28 | overwrite_reason: OverwriteReason 29 | -------------------------------------------------------------------------------- /python/src/magika/types/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from dataclasses import dataclass 17 | from typing import Dict, List 18 | 19 | from magika.types.content_type_label import ContentTypeLabel 20 | 21 | 22 | @dataclass(frozen=True) 23 | class ModelFeatures: 24 | beg: List[int] 25 | mid: List[int] 26 | end: List[int] 27 | # for ISO 28 | offset_0x8000_0x8007: List[int] 29 | offset_0x8800_0x8807: List[int] 30 | offset_0x9000_0x9007: List[int] 31 | # for UDF 32 | offset_0x9800_0x9807: List[int] 33 | 34 | 35 | @dataclass(frozen=True) 36 | class ModelOutput: 37 | label: ContentTypeLabel 38 | score: float 39 | 40 | 41 | @dataclass(frozen=True) 42 | class ModelConfig: 43 | beg_size: int 44 | mid_size: int 45 | end_size: int 46 | use_inputs_at_offsets: bool 47 | medium_confidence_threshold: float 48 | min_file_size_for_dl: int 49 | padding_token: int 50 | block_size: int 51 | target_labels_space: List[ContentTypeLabel] 52 | thresholds: Dict[ContentTypeLabel, float] 53 | overwrite_map: Dict[ContentTypeLabel, ContentTypeLabel] 54 | -------------------------------------------------------------------------------- /python/src/magika/types/overwrite_reason.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import enum 17 | 18 | from magika.types.strenum import LowerCaseStrEnum 19 | 20 | 21 | class OverwriteReason(LowerCaseStrEnum): 22 | NONE = enum.auto() 23 | LOW_CONFIDENCE = enum.auto() 24 | OVERWRITE_MAP = enum.auto() 25 | -------------------------------------------------------------------------------- /python/src/magika/types/prediction_mode.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import annotations 16 | 17 | import enum 18 | from typing import List 19 | 20 | from magika.types.strenum import LowerCaseStrEnum 21 | 22 | 23 | class PredictionMode(LowerCaseStrEnum): 24 | BEST_GUESS = enum.auto() 25 | MEDIUM_CONFIDENCE = enum.auto() 26 | HIGH_CONFIDENCE = enum.auto() 27 | 28 | @staticmethod 29 | def get_valid_prediction_modes() -> List[str]: 30 | return [pm for pm in PredictionMode] 31 | -------------------------------------------------------------------------------- /python/src/magika/types/seekable.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import io 16 | from typing import BinaryIO 17 | 18 | 19 | class Seekable: 20 | def __init__(self, stream: BinaryIO) -> None: 21 | self._stream = stream 22 | stream.seek(0, io.SEEK_END) 23 | self._size = stream.tell() 24 | 25 | @property 26 | def size(self) -> int: 27 | return self._size 28 | 29 | def read_at(self, offset: int, size: int) -> bytes: 30 | if size == 0: 31 | return b"" 32 | 33 | assert offset + size <= self.size 34 | self._stream.seek(offset) 35 | return self._stream.read(size) 36 | -------------------------------------------------------------------------------- /python/src/magika/types/status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import enum 16 | 17 | from magika.types.strenum import LowerCaseStrEnum 18 | 19 | 20 | class Status(LowerCaseStrEnum): 21 | OK = enum.auto() 22 | 23 | # Used when a file path does not exist. 
24 | FILE_NOT_FOUND_ERROR = enum.auto() 25 | 26 | # Used when a file path exists, but there are permission issues, e.g., can't 27 | # read file. 28 | PERMISSION_ERROR = enum.auto() 29 | 30 | # Represents a generic error-like unknown status. 31 | UNKNOWN = enum.auto() 32 | -------------------------------------------------------------------------------- /python/src/magika/types/strenum.py: -------------------------------------------------------------------------------- 1 | """ 2 | We use a StrEnum backport instead of relying on the newly introduced StrEnum 3 | as we want to support at least python 3.8; StrEnum was introduced in python 4 | 3.11. 5 | 6 | The following code has been taken (and adapted) from: 7 | https://github.com/irgeek/StrEnum/blob/master/strenum/__init__.py#L21 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | import enum 13 | from typing import Union 14 | 15 | 16 | class StrEnum(str, enum.Enum): 17 | """ 18 | StrEnum is a Python ``enum.Enum`` that inherits from ``str``. The default 19 | ``auto()`` behavior uses the member name unchanged; ``LowerCaseStrEnum`` 20 | lower-cases it, mirroring `enum.StrEnum`, available from Python 3.11. 
21 | """ 22 | 23 | def __new__(cls, value: Union[str, StrEnum], *args, **kwargs): # type: ignore[no-untyped-def] 24 | if not isinstance(value, (str, enum.auto)): 25 | raise TypeError( 26 | f"Values of StrEnums must be strings: {value!r} is a {type(value)}" 27 | ) 28 | return super().__new__(cls, value, *args, **kwargs) 29 | 30 | def __str__(self) -> str: 31 | return str(self.value) 32 | 33 | def _generate_next_value_(name, *_): # type: ignore[no-untyped-def,override] 34 | return name 35 | 36 | 37 | class LowerCaseStrEnum(StrEnum): 38 | def _generate_next_value_(name, *_): # type: ignore[no-untyped-def,override] 39 | return name.lower() 40 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/python/tests/__init__.py -------------------------------------------------------------------------------- /python/tests/test_python_magika_client.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def test_python_magika_client() -> None:
    """Smoke-test the Python CLI script.

    Runs ``magika_client.py`` twice in a subprocess, once with ``--help`` and
    once scanning the script itself, and fails (via ``check=True``) if either
    invocation exits with a non-zero status.
    """
    # Local import so this block does not depend on a module-level import.
    import sys

    python_root_dir = Path(__file__).parent.parent
    python_magika_client_path = (
        python_root_dir / "src" / "magika" / "cli" / "magika_client.py"
    ).resolve()

    # Launch the script through the interpreter that is running the tests
    # instead of executing the file directly: direct execution depends on the
    # file's executable bit and shebang, and could pick up a different Python
    # than the one under test.
    base_cmd = [sys.executable, str(python_magika_client_path)]

    # quick test to check there are no obvious problems
    subprocess.run(base_cmd + ["--help"], capture_output=True, check=True)

    # quick test to check there are no crashes
    subprocess.run(
        base_cmd + [str(python_magika_client_path)],
        capture_output=True,
        check=True,
    )
set -e
. ./color.sh

# fail <kind> <dir>
#
# Reports a changelog problem of the given kind (stale, format, or diff) for
# the crate in <dir>. Outside CI the problem is fatal (via `error`); when $CI
# is set it is only printed as a workflow warning annotation on the crate's
# CHANGELOG.md and the check continues.
fail() {
  kind=$1
  dir=$2
  if [ "$kind" = stale ]; then
    message="Some changes have not been logged."
  elif [ "$kind" = format ]; then
    message="This line should be an H2 version."
  elif [ "$kind" = diff ]; then
    message="This version differs from the Cargo.toml file."
  else
    error "Unsupported kind '$kind'"
  fi
  if [ -n "$CI" ]; then
    echo "::warning file=rust/$dir/CHANGELOG.md,line=3::$message"
  else
    error "$message"
  fi
}

for dir in lib cli; do
  # Each crate is checked in a subshell so the `cd` does not leak.
  ( cd "$dir"
    info "Checking $dir"
    # Latest commit since origin/main that touched the changelog; fall back to
    # origin/main itself when the changelog was not touched.
    ref=$(git log -n1 --pretty=format:%H origin/main.. -- CHANGELOG.md)
    ref=${ref:-origin/main}
    # Any change to the manifest or sources after that commit is unlogged.
    if ! git diff --quiet "$ref" -- Cargo.toml src; then
      fail stale "$dir"
    fi
    # Line 3 of the changelog must be the H2 header of the current version.
    cver="$(sed -n '3s/^## //p' CHANGELOG.md)"
    if [ -z "$cver" ]; then
      fail format "$dir"
    fi
    # The version declared in the [package] section of the manifest.
    pver="$(sed -n '/^\[package]$/,/^$/{s/^version = "\(.*\)"$/\1/p}' Cargo.toml)"
    if [ "$pver" != "$cver" ]; then
      fail diff "$dir"
    fi
  )
done
flag `--num-tasks` from 1 to the number of CPUs 69 | 70 | ## 0.1.0-rc.0 71 | 72 | This version is the initial implementation and should be considered unstable. In particular, it 73 | ships a new model in comparison to the Python binary and we would love feedback on 74 | [GitHub](https://github.com/google/magika/issues). 75 | 76 | ## 0.0.0 77 | 78 | This version is a placeholder and does not expose anything. 79 | 80 | [model changelog]: https://github.com/google/magika/blob/main/assets/models/CHANGELOG.md 81 | -------------------------------------------------------------------------------- /rust/cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "magika-cli" 3 | version = "0.1.2" 4 | authors = ["Magika Developers "] 5 | license = "Apache-2.0" 6 | edition = "2021" 7 | description = "Determines the content type of a file with deep-learning" 8 | repository = "https://github.com/google/magika" 9 | homepage = "https://google.github.io/magika/" 10 | keywords = ["cli", "file", "magic"] 11 | categories = ["command-line-utilities", "filesystem", "parser-implementations"] 12 | include = ["/LICENSE", "/src"] 13 | 14 | [[bin]] 15 | name = "magika" 16 | path = "src/main.rs" 17 | 18 | [dependencies] 19 | anyhow = "1.0.86" 20 | async-channel = "2.3.1" 21 | clap = { version = "4.5.9", features = ["cargo", "derive", "string"] } 22 | colored = "2.1.0" 23 | magika = { version = "=0.2.0", path = "../lib", features = ["serde"] } 24 | num_cpus = "1.16.0" 25 | ort = "=2.0.0-rc.9" 26 | serde = { version = "1.0.204", features = ["derive"] } 27 | serde_json = "1.0.120" 28 | tokio = { version = "1.43.1", features = ["full"] } 29 | -------------------------------------------------------------------------------- /rust/cli/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE -------------------------------------------------------------------------------- 
set -e
. ../color.sh

x cargo check
x cargo build --release
x cargo fmt -- --check
x cargo clippy -- --deny=warnings

# Put the freshly built binary first on PATH. $PWD is quoted so that a
# checkout located under a path containing spaces still works.
PATH="$(dirname "$PWD")/target/release:$PATH"

TEST_SUITES='basic previous_missdetections'
info "Test against the test suites: $TEST_SUITES"
# Every file in the test suites lives in a directory named after its expected
# label; compare that directory name with the label printed by magika.
( cd ../../tests_data
  # $TEST_SUITES is deliberately unquoted: it is a space-separated list.
  # `IFS= read -r` keeps backslashes and surrounding whitespace of each output
  # line intact.
  magika --format='%p: %l' --recursive $TEST_SUITES | while IFS= read -r line; do
    # Split on the last ': ' for both parts so that a path containing ': '
    # cannot make the two expansions disagree.
    file=${line%: *}
    directory=${file%/*}
    expected=${directory##*/}
    actual=${line##*: }
    [ "$expected" = "$actual" ] || error "$file is detected as $actual"
  done
)

# We rely below on the fact that we don't have permission on /etc/shadow.
[ "$(id -u)" -eq 0 ] && success "No more tests in Docker"

info "Test exit code with at least one error"
# test_error <files> <expected-output>
#
# Runs magika on the space-separated <files> and checks that it exits with
# code 1 and prints exactly <expected-output> on stdout.
test_error() {
  files="$1"
  expected="$2"
  # `set +e` is scoped to the subshell: magika is expected to fail here.
  ( set +e
    # $files is deliberately unquoted so that it splits into several arguments.
    actual="$(magika $files)"
    code=$?
    [ "$code" -eq 1 ] || error "invalid exit code for magika $files"
    [ "$actual" = "$expected" ] || error "invalid output for magika $files"
  )
}
test_error '/etc/shadow' "\
/etc/shadow: Permission denied (os error 13) (error)"
test_error 'non_existent src/main.rs' "\
non_existent: No such file or directory (os error 2) (error)
src/main.rs: Rust source (code)"
# x <command> [args...]
# Runs the command in a subshell with tracing enabled, so the command line is
# echoed before it executes and `set -x` does not leak into the caller.
x() { ( set -x; "$@"; ); }

# color <sgr-parameters> <label> <message>
# Prints "<label>: <message>" with the label (and colon) rendered using the
# given ANSI SGR parameters, then resets the attributes before the message.
# printf with octal escapes is used because `echo` does not interpret escapes
# consistently across shells.
color() { printf '\033[%sm%s:\033[m %s\n' "$1" "$2" "$3"; }
# Informational message (bold cyan label).
info() { color '1;36' Info "$*"; }
# Action required from the user (bold yellow label).
todo() { color '1;33' Todo "$*"; }
# Final success message (bold green label); terminates the script with 0.
success() { color '1;32' Done "$*"; exit 0; }
# Fatal error message (bold red label); terminates the script with 1.
error() { color '1;31' Error "$*"; exit 1; }
15 | 16 | An alternative design to generating the files before publishing the crate would be to publish the 17 | model and Magika configurations and use a build script to generate the files during compilation. 18 | This has a few disadvantages: 19 | 20 | - We need to publish the model and Magika configurations which contain more information than needed 21 | to use the library (and the CLI). 22 | - We need to use a build script, which is frowned upon for security reasons, as the entity compiling 23 | the library or CLI now needs to trust the build script, which can run arbitrary code. This only 24 | matters when the entity compiling the library or CLI is not the same as the one running the 25 | library or CLI (e.g. Debian maintainers), since the library and CLI too can run arbitrary code. 26 | - Using a build script also increases compilation time (and compilation complexity) instead of 27 | having it factored before publishing. 28 | -------------------------------------------------------------------------------- /rust/gen/content_types: -------------------------------------------------------------------------------- 1 | 3gp 2 | ace 3 | ai 4 | aidl 5 | apk 6 | applebplist 7 | appleplist 8 | asm 9 | asp 10 | autohotkey 11 | autoit 12 | awk 13 | batch 14 | bazel 15 | bib 16 | bmp 17 | bzip 18 | c 19 | cab 20 | cat 21 | chm 22 | clojure 23 | cmake 24 | cobol 25 | coff 26 | coffeescript 27 | cpp 28 | crt 29 | crx 30 | cs 31 | csproj 32 | css 33 | csv 34 | dart 35 | deb 36 | dex 37 | dicom 38 | diff 39 | directory 40 | dm 41 | dmg 42 | doc 43 | dockerfile 44 | docx 45 | dsstore 46 | dwg 47 | dxf 48 | elf 49 | elixir 50 | emf 51 | eml 52 | empty 53 | epub 54 | erb 55 | erlang 56 | flac 57 | flv 58 | fortran 59 | gemfile 60 | gemspec 61 | gif 62 | gitattributes 63 | gitmodules 64 | go 65 | gradle 66 | groovy 67 | gzip 68 | h5 69 | handlebars 70 | haskell 71 | hcl 72 | hlp 73 | htaccess 74 | html 75 | icns 76 | ico 77 | ics 78 | ignorefile 79 | ini 80 | internetshortcut 81 | 
ipynb 82 | iso 83 | jar 84 | java 85 | javabytecode 86 | javascript 87 | jinja 88 | jp2 89 | jpeg 90 | json 91 | jsonl 92 | julia 93 | kotlin 94 | latex 95 | lha 96 | lisp 97 | lnk 98 | lua 99 | m3u 100 | m4 101 | macho 102 | makefile 103 | markdown 104 | matlab 105 | mht 106 | midi 107 | mkv 108 | mp3 109 | mp4 110 | mscompress 111 | msi 112 | mum 113 | npy 114 | npz 115 | nupkg 116 | objectivec 117 | ocaml 118 | odp 119 | ods 120 | odt 121 | ogg 122 | one 123 | onnx 124 | otf 125 | outlook 126 | parquet 127 | pascal 128 | pcap 129 | pdb 130 | pdf 131 | pebin 132 | pem 133 | perl 134 | php 135 | pickle 136 | png 137 | po 138 | postscript 139 | powershell 140 | ppt 141 | pptx 142 | prolog 143 | proteindb 144 | proto 145 | psd 146 | python 147 | pythonbytecode 148 | pytorch 149 | qt 150 | r 151 | randombytes 152 | randomtxt 153 | rar 154 | rdf 155 | rpm 156 | rst 157 | rtf 158 | ruby 159 | rust 160 | scala 161 | scss 162 | sevenzip 163 | sgml 164 | shell 165 | smali 166 | snap 167 | solidity 168 | sql 169 | sqlite 170 | squashfs 171 | srt 172 | stlbinary 173 | stltext 174 | sum 175 | svg 176 | swf 177 | swift 178 | symlink 179 | tar 180 | tcl 181 | textproto 182 | tga 183 | thumbsdb 184 | tiff 185 | toml 186 | torrent 187 | tsv 188 | ttf 189 | twig 190 | txt 191 | typescript 192 | undefined 193 | unknown 194 | vba 195 | vcxproj 196 | verilog 197 | vhdl 198 | vtt 199 | vue 200 | wasm 201 | wav 202 | webm 203 | webp 204 | winregistry 205 | wmf 206 | woff 207 | woff2 208 | xar 209 | xls 210 | xlsb 211 | xlsx 212 | xml 213 | xpi 214 | xz 215 | yaml 216 | yara 217 | zig 218 | zip 219 | zlibstream 220 | -------------------------------------------------------------------------------- /rust/gen/model: -------------------------------------------------------------------------------- 1 | ../../assets/models/standard_v3_3 -------------------------------------------------------------------------------- /rust/gen/test.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | . ../color.sh 18 | 19 | x cargo check 20 | x cargo fmt -- --check 21 | x cargo clippy -- --deny=warnings 22 | -------------------------------------------------------------------------------- /rust/lib/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.2.0 4 | 5 | ### Major 6 | 7 | - Change `FileType::Ruled` to take `ContentType` directly and remove `RuledType` 8 | - Change `InferredType::content_type` to describe the content type when overwritten 9 | - Add `InferredType::inferred_type` for the (possibly overwritten) inferred content type 10 | 11 | ### Minor 12 | 13 | - Remove features extraction logic of older models 14 | - Use the `standard_v3_3` model instead of `standard_v3_2` (see [model changelog]) 15 | - Add `OverwriteReason` to document why the inferred content type is overwritten 16 | 17 | ### Patch 18 | 19 | - Update dependencies 20 | - Add inference tests with the new reference files 21 | - Update features extraction test to the new reference file 22 | 23 | ## 0.1.1 24 | 25 | ### Minor 26 | 27 | - Use the `standard_v3_2` model instead of `standard_v3_1` (see [model changelog]) 28 | 29 | ## 0.1.0 30 | 31 | No changes. 
32 | 33 | ## 0.1.0-rc.5 34 | 35 | ### Minor 36 | 37 | - Use the `standard_v3_1` model instead of `standard_v3_0` (see [model changelog]) 38 | 39 | ## 0.1.0-rc.4 40 | 41 | ### Minor 42 | 43 | - Update the model thresholds 44 | 45 | ## 0.1.0-rc.3 46 | 47 | ### Minor 48 | 49 | - Use the `standard_v3_0` model instead of `standard_v2_1` (see [model changelog]) 50 | - Add content types `ContentType::Random{bytes,txt}` (those are only returned in 51 | `InferredType::content_type` and not in `RuledType::content_type`) 52 | - Add a `MODEL_MAJOR_VERSION` integer in addition to the `MODEL_NAME` string 53 | 54 | ### Patch 55 | 56 | - Update dependencies 57 | 58 | ## 0.1.0-rc.2 59 | 60 | ### Patch 61 | 62 | - Update dependencies 63 | 64 | ## 0.1.0-rc.1 65 | 66 | ### Minor 67 | 68 | - Change model from `standard_v2_0` to `standard_v2_1` 69 | 70 | ## 0.1.0-rc.0 71 | 72 | This version is the initial implementation and should be considered unstable. In particular, it 73 | ships a new model in comparison to the Python binary and we would love feedback on 74 | [GitHub](https://github.com/google/magika/issues). 75 | 76 | ## 0.0.0 77 | 78 | This version is a placeholder and does not expose anything. 
79 | 80 | [model changelog]: https://github.com/google/magika/blob/main/assets/models/CHANGELOG.md 81 | -------------------------------------------------------------------------------- /rust/lib/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "magika" 3 | version = "0.2.0" 4 | authors = ["Magika Developers "] 5 | license = "Apache-2.0" 6 | edition = "2021" 7 | description = "Determines the content type of a file with deep-learning" 8 | repository = "https://github.com/google/magika" 9 | homepage = "https://google.github.io/magika/" 10 | keywords = ["file", "magic"] 11 | categories = ["command-line-utilities", "filesystem", "parser-implementations"] 12 | include = ["/LICENSE", "/src"] 13 | 14 | [package.metadata.docs.rs] 15 | features = ["_doc"] 16 | 17 | [features] 18 | serde = ["dep:serde"] 19 | # Internal features. 20 | _doc = ["serde"] 21 | _test = ["ort/download-binaries"] 22 | 23 | [dependencies] 24 | ndarray = "0.16.1" 25 | serde = { version = "1.0.204", features = ["derive"], optional = true } 26 | thiserror = "1.0.63" 27 | tokio = { version = "1.43.1", features = ["fs", "io-util"] } 28 | 29 | [dependencies.ort] 30 | version = "=2.0.0-rc.9" 31 | default-features = false 32 | features = ["ndarray"] 33 | 34 | [dev-dependencies] 35 | data-encoding = "2.6.0" 36 | flate2 = "1.0.30" 37 | serde = { version = "1.0.204", features = ["derive"] } 38 | serde_json = "1.0.120" 39 | 40 | [lints.rust] 41 | missing_docs = "warn" 42 | unreachable_pub = "warn" 43 | unused = { level = "warn", priority = -1 } 44 | -------------------------------------------------------------------------------- /rust/lib/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE -------------------------------------------------------------------------------- /rust/lib/README.md: -------------------------------------------------------------------------------- 1 | # Magika 2 | 3 | 
This library crate provides file type detection with deep-learning. A command-line interface (CLI) 4 | for this library is provided by the [magika-cli](https://crates.io/crates/magika-cli) binary crate. 5 | 6 | ## Disclaimer 7 | 8 | This project is not an official Google project. It is not supported by Google and Google 9 | specifically disclaims all warranties as to its quality, merchantability, or fitness for a 10 | particular purpose. 11 | 12 | This `magika` library and the `magika-cli` binary are still unstable (as indicated by the major 13 | version of zero) and new versions might introduce breaking changes (all changes will follow [cargo 14 | semver compatibility](https://doc.rust-lang.org/cargo/reference/semver.html)). In particular, 15 | version 0.1.0-rc.0 ships a new model in comparison to the Python binary and we would love feedback 16 | on [GitHub](https://github.com/google/magika/issues). 17 | -------------------------------------------------------------------------------- /rust/lib/src/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2024 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::borrow::Cow; 16 | 17 | use crate::ContentType; 18 | 19 | #[derive(Debug)] 20 | pub(crate) struct ModelConfig { 21 | pub(crate) beg_size: usize, 22 | pub(crate) end_size: usize, 23 | pub(crate) min_file_size_for_dl: usize, 24 | pub(crate) padding_token: i32, 25 | pub(crate) block_size: usize, 26 | pub(crate) thresholds: Cow<'static, [f32; ContentType::SIZE]>, 27 | pub(crate) overwrite_map: Cow<'static, [ContentType; ContentType::SIZE]>, 28 | } 29 | 30 | pub(crate) struct SplitFeatures<'a> { 31 | pub(crate) beg: &'a mut [i32], 32 | pub(crate) end: &'a mut [i32], 33 | } 34 | 35 | impl ModelConfig { 36 | pub(crate) fn features_size(&self) -> usize { 37 | self.beg_size + self.end_size 38 | } 39 | 40 | pub(crate) fn split_features<'a>(&self, features: &'a mut [i32]) -> SplitFeatures<'a> { 41 | let (beg, features) = features.split_at_mut(self.beg_size); 42 | let (end, features) = features.split_at_mut(self.end_size); 43 | debug_assert!(features.is_empty()); 44 | SplitFeatures { beg, end } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rust/lib/src/error.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2024 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /// Result type of Magika functions. 
16 | pub type Result = core::result::Result; 17 | 18 | /// Errors returned by Magika functions. 19 | #[derive(Debug, thiserror::Error)] 20 | pub enum Error { 21 | /// Input/output errors reported by the standard library. 22 | #[error("{0}")] 23 | IOError(#[from] std::io::Error), 24 | 25 | /// Errors reported by the ONNX Runtime. 26 | #[error("{0}")] 27 | OrtError(#[from] ort::Error), 28 | 29 | /// Shape errors reported by the ndarray library. 30 | #[error("{0}")] 31 | ShapeError(#[from] ndarray::ShapeError), 32 | } 33 | -------------------------------------------------------------------------------- /rust/lib/src/model.onnx: -------------------------------------------------------------------------------- 1 | ../../gen/model/model.onnx -------------------------------------------------------------------------------- /rust/lib/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | . 
../color.sh 18 | 19 | x cargo check 20 | x cargo check --features=serde 21 | x cargo test --features=_test 22 | x cargo fmt -- --check 23 | x cargo clippy -- --deny=warnings 24 | if cargo --version | grep -q nightly; then 25 | x env RUSTDOCFLAGS=--deny=warnings cargo doc --features=_doc 26 | fi 27 | -------------------------------------------------------------------------------- /rust/onnx/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | . ../color.sh 18 | 19 | # This script builds ONNX Runtime as a static library to be linked in the Magika CLI. 20 | # 21 | # This is needed when building for manylinux since the prebuilt binaries provided by the ort crate 22 | # have too recent dependency requirements. 23 | 24 | if [ -e runtime ]; then 25 | info "Using cached static libraries." 26 | else 27 | info "Make sure we have Python 3.x and cmake-3.27 or higher." 28 | python3 -m venv venv 29 | source venv/bin/activate 30 | python3 -m pip install cmake 31 | 32 | info "Clone ONNX Runtime repository (recursively)." 33 | git clone --recursive https://github.com/Microsoft/onnxruntime.git runtime 34 | cd runtime 35 | 36 | info "Checkout v1.20.0 because that's what ort v2.0.0-rc.9 supports." 37 | git checkout v1.20.0 38 | 39 | # The build fails with GCC 14 due to warnings as errors. 
40 | sed -i '/function(onnxruntime_set_compile_flags/a\ 41 | target_compile_options(${target_name} PRIVATE "$<$:-Wno-maybe-uninitialized>")\ 42 | target_compile_options(${target_name} PRIVATE "$<$:-Wno-uninitialized>")' \ 43 | cmake/CMakeLists.txt 44 | 45 | info "Build the static libraries." 46 | x ./build.sh --config=Release --parallel $ONNX_RUNTIME_BUILD_FLAGS 47 | 48 | info "Only keep the static libraries to save cache space." 49 | find build/Linux -not -name '*.a' \( -not -type d -or -empty \) -delete 50 | cd .. 51 | fi 52 | 53 | info "Point the ort crate to the locally built static library." 54 | cd ../.. 55 | cat >> .cargo/config.toml < cli/output 2>&1 36 | 37 | if [ "$1" = --check ]; then 38 | if ! git diff --exit-code; then 39 | [ -n "$CI" ] && todo 'Execute ./sync.sh from the rust directory' 40 | error 'Generated files are not in sync' 41 | fi 42 | fi 43 | success "Generated files are synced" 44 | -------------------------------------------------------------------------------- /rust/taplo.toml: -------------------------------------------------------------------------------- 1 | [formatting] 2 | column_width = 100 3 | reorder_arrays = true 4 | reorder_keys = true 5 | 6 | [[rule]] 7 | formatting = { reorder_keys = false } 8 | keys = ["dependencies.*", "package"] 9 | 10 | [[rule]] 11 | formatting = { reorder_keys = true } 12 | keys = ["package.*"] 13 | -------------------------------------------------------------------------------- /rust/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | . ./color.sh 18 | 19 | TOOLCHAINS='stable nightly' 20 | [ -z "$CI" ] || TOOLCHAINS=$(rustup show active-toolchain | sed 's/-.*//') 21 | 22 | for toolchain in $TOOLCHAINS; do 23 | for dir in gen lib cli; do 24 | info "Running tests from $dir with $toolchain" 25 | ( cd $dir && rustup run $toolchain ./test.sh; ) 26 | done 27 | done 28 | 29 | ./sync.sh --check 30 | -------------------------------------------------------------------------------- /tests_data/README.md: -------------------------------------------------------------------------------- 1 | # Tests Data 2 | 3 | We use these files for regression testing. 4 | 5 | These files were not (and should not be) used for training purposes. 6 | 7 | They are organized by directory: 8 | - `basic/`: a number of simple files of various content types. 9 | - `mitra/`: a selection of the files available at [https://github.com/corkami/mitra](https://github.com/corkami/mitra/tree/master/input). 
10 | -------------------------------------------------------------------------------- /tests_data/basic/asm/code.asm: -------------------------------------------------------------------------------- 1 | .section .text 2 | xor %eax,%eax 3 | push %eax 4 | push $0x68732f2f 5 | push $0x6e69622f 6 | mov %esp,%ebx 7 | push %eax 8 | push %ebx 9 | mov %esp,%ecx 10 | mov $0xb,%al 11 | int $0x80 12 | -------------------------------------------------------------------------------- /tests_data/basic/batch/simple.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo %1 3 | echo %2 4 | echo %3 -------------------------------------------------------------------------------- /tests_data/basic/c/code.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() { 4 | char c; 5 | printf("Enter a character: "); 6 | scanf("%c", &c); 7 | 8 | if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) 9 | printf("%c is a letter"); 10 | else 11 | printf("%c is not a letter", c); 12 | 13 | return 0; 14 | } -------------------------------------------------------------------------------- /tests_data/basic/css/code.css: -------------------------------------------------------------------------------- 1 | /* table.mycv-entry { */ 2 | /* border: 3px; */ 3 | /* background-color: red; */ 4 | /* } */ 5 | 6 | table.cv-entry { 7 | border: 0px; 8 | } 9 | 10 | .cv-entry tbody tr td, 11 | tbody tr th { 12 | background-color: #ffffff; 13 | border: 0px; 14 | } 15 | 16 | div.cv-entry { 17 | margin: 0px 10px 10px 10px; 18 | text-align: left 19 | } 20 | 21 | span.doctitle { 22 | font-weight: bold; 23 | display: block; 24 | /*color:#dcb975;*/ 25 | color:#000000; 26 | text-align: left 27 | } 28 | 29 | span.docauthors { 30 | display: block; 31 | } 32 | 33 | span.docproc { 34 | font-style: italic; 35 | } 36 | 37 | span.doclink { 38 | display:block; 39 | } 40 | 41 | img.social-icon { 42 | display:inline; 43 | 
margin: 0px; 44 | vertical-align:middle; 45 | } 46 | 47 | table.contact-info { 48 | border: 0px; 49 | } 50 | 51 | .contact-info tbody tr td, 52 | tbody tr th { 53 | background-color: #ffffff; 54 | border: 0px; 55 | } 56 | -------------------------------------------------------------------------------- /tests_data/basic/csv/magika_test.csv: -------------------------------------------------------------------------------- 1 | Name,Value1,Value2,Value3 2 | Test1,1,10,100 3 | Test2,2,20,200 4 | Test3,3,30,300 5 | Test4,4,40,400 6 | Test5,5,50,500 -------------------------------------------------------------------------------- /tests_data/basic/dockerfile/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | ARG PYTHON_VERSION=3.11 4 | FROM python:${PYTHON_VERSION}-slim as base 5 | 6 | WORKDIR /magika 7 | 8 | # This requires buildx 9 | # RUN --mount=type=cache,target=/root/.cache/pip \ 10 | # pip install magika 11 | 12 | RUN pip install magika 13 | 14 | ENTRYPOINT ["magika"] 15 | -------------------------------------------------------------------------------- /tests_data/basic/docx/doc.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/docx/doc.docx -------------------------------------------------------------------------------- /tests_data/basic/docx/magika_test.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/docx/magika_test.docx -------------------------------------------------------------------------------- /tests_data/basic/empty/empty_file: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/empty/empty_file -------------------------------------------------------------------------------- /tests_data/basic/epub/doc.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/epub/doc.epub -------------------------------------------------------------------------------- /tests_data/basic/epub/magika_test.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/epub/magika_test.epub -------------------------------------------------------------------------------- /tests_data/basic/flac/test.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/flac/test.flac -------------------------------------------------------------------------------- /tests_data/basic/handlebars/example.handlebars: -------------------------------------------------------------------------------- 1 | {{#if isUserLoggedIn}} 2 | Welcome, {{username}}! 3 | {{#each notifications}} 4 |

{{this}}

5 | {{/each}} 6 | {{else}} 7 | Please log in. 8 | {{/if}} 9 | -------------------------------------------------------------------------------- /tests_data/basic/html/doc.html: -------------------------------------------------------------------------------- 1 | 2 | 301 Moved Permanently 3 | 4 |

301 Moved Permanently

5 |
nginx/1.18.0 (Ubuntu)
6 | 7 | 8 | -------------------------------------------------------------------------------- /tests_data/basic/ignorefile/example.ignorefile: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .classpath 3 | .project 4 | .target/ 5 | .settings/ 6 | -------------------------------------------------------------------------------- /tests_data/basic/ini/doc.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | log_cli = 1 3 | log_level = WARNING 4 | filterwarnings = 5 | ignore::DeprecationWarning 6 | 7 | markers = 8 | smoketest 9 | slow 10 | 11 | -------------------------------------------------------------------------------- /tests_data/basic/jinja/example.j2: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 |

Welcome, {{ user.username }}

5 | 6 | {% if user.is_admin %} 7 |

Admin privileges granted. You can go to the admin dashboard.

8 | {% else %} 9 |

Regular user access. You can view your profile.

10 | {% endif %} 11 | 12 |
    13 | {% for item in items %} 14 |
  • {{ item.name }} - {{ item.price | round(2) }} {{ currency_symbol }}
  • 15 | {% else %} 16 |
  • No items available.
  • 17 | {% endfor %} 18 |
19 | 20 | {% set now = current_time() %} 21 |

Page generated at: {{ now.strftime('%Y-%m-%d %H:%M:%S') }}

22 | {% endblock %} 23 | -------------------------------------------------------------------------------- /tests_data/basic/jpeg/magika_test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/jpeg/magika_test.jpg -------------------------------------------------------------------------------- /tests_data/basic/latex/sample.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{todonotes} 3 | 4 | \title{Example of a LaTeX document} 5 | 6 | \begin{document} 7 | \maketitle 8 | 9 | This is just an example used for testing Magika. 10 | 11 | \end{document} 12 | -------------------------------------------------------------------------------- /tests_data/basic/makefile/simple.Makefile: -------------------------------------------------------------------------------- 1 | prog: prog.o 2 | gcc -o prog prog.o 3 | 4 | prog.o: prog.c lib.c 5 | gcc -c prog.c lib.c -------------------------------------------------------------------------------- /tests_data/basic/markdown/magika_test.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Introduction 4 | 5 | This is a test document for Magika, yay\! We are going to take this file and convert it in a number of other formats. -------------------------------------------------------------------------------- /tests_data/basic/markdown/simple.md: -------------------------------------------------------------------------------- 1 | # This is the greatest markdown test sample 2 | 3 | ## Introduction 4 | 5 | Test! 6 | 7 | ## Main content 8 | 9 | Lorem ipsum? 10 | 11 | ## Conclusions 12 | 13 | Nothing much to say! 
14 | 15 | -------------------------------------------------------------------------------- /tests_data/basic/mp3/test.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/mp3/test.mp3 -------------------------------------------------------------------------------- /tests_data/basic/odp/magika_test.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/odp/magika_test.odp -------------------------------------------------------------------------------- /tests_data/basic/ods/magika_test.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/ods/magika_test.ods -------------------------------------------------------------------------------- /tests_data/basic/odt/doc.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/odt/doc.odt -------------------------------------------------------------------------------- /tests_data/basic/odt/magika_test.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/odt/magika_test.odt -------------------------------------------------------------------------------- /tests_data/basic/ogg/test.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/ogg/test.ogg 
-------------------------------------------------------------------------------- /tests_data/basic/pdf/magika_test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/pdf/magika_test.pdf -------------------------------------------------------------------------------- /tests_data/basic/pdf/magika_test_pptx.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/pdf/magika_test_pptx.pdf -------------------------------------------------------------------------------- /tests_data/basic/pdf/magika_test_xlsx.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/pdf/magika_test_xlsx.pdf -------------------------------------------------------------------------------- /tests_data/basic/pem/doc.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PUBLIC KEY BLOCK----- 2 | 3 | mFIEWhdc7RMIKoZIzj0DAQcCAwTCrR7Da5QHqFi/CtJJ6egFb48zR9bn48epqb92 4 | kfLIN/sjBc6iqvjcXQM8pfhFZnf5Bhk0ZzwvuAHzCZSJgNgNtCFFbGllIEJ1cnN6 5 | dGVpbiA8Y29udGFjdEBlbGllLm5ldD6IgAQTEwgAHAUCWhdc7QILCQIbAwQVCAkK 6 | BBYCAwECF4ACHgEAFgkQtc4ql0fc7HoLGlRSRVpPUi1HUEd3nQD/bqujXolVmt7n 7 | GmY/kIRWEro2oUp2rXL5sAbABMC/SrIA/ihJa5nfZz8wAe7IzD76cRHW0qGpUHSM 8 | ehJzdDXXsEhruFYEWhdc7RIIKoZIzj0DAQcCAwSF6kdXcDKXmK5UYjfoRV07yxQo 9 | xapjucsZcXytjdLqbPDJr+Sw7Rlz41XIM3QQzOksFdNzlNemBuXBUE/K2522AwEI 10 | B4htBBgTCAAJBQJaF1ztAhsMABYJELXOKpdH3Ox6CxpUUkVaT1ItR1BHl9QBAI55 11 | 7DxLdB2WMXemGZ0U07vqGt2jSzTtUdYhqk4DkXUeAP98LFF4syoKrxD2pcArpKzI 12 | OwBiyuQgLZqQr2mtIPFWCw== 13 | =qq2a 14 | -----END PGP PUBLIC KEY BLOCK----- 15 | 
-------------------------------------------------------------------------------- /tests_data/basic/pem/doc.pub: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PUBLIC KEY BLOCK----- 2 | 3 | mFIEWhdc7RMIKoZIzj0DAQcCAwTCrR7Da5QHqFi/CtJJ6egFb48zR9bn48epqb92 4 | kfLIN/sjBc6iqvjcXQM8pfhFZnf5Bhk0ZzwvuAHzCZSJgNgNtCFFbGllIEJ1cnN6 5 | dGVpbiA8Y29udGFjdEBlbGllLm5ldD6IgAQTEwgAHAUCWhdc7QILCQIbAwQVCAkK 6 | BBYCAwECF4ACHgEAFgkQtc4ql0fc7HoLGlRSRVpPUi1HUEd3nQD/bqujXolVmt7n 7 | GmY/kIRWEro2oUp2rXL5sAbABMC/SrIA/ihJa5nfZz8wAe7IzD76cRHW0qGpUHSM 8 | ehJzdDXXsEhruFYEWhdc7RIIKoZIzj0DAQcCAwSF6kdXcDKXmK5UYjfoRV07yxQo 9 | xapjucsZcXytjdLqbPDJr+Sw7Rlz41XIM3QQzOksFdNzlNemBuXBUE/K2522AwEI 10 | B4htBBgTCAAJBQJaF1ztAhsMABYJELXOKpdH3Ox6CxpUUkVaT1ItR1BHl9QBAI55 11 | 7DxLdB2WMXemGZ0U07vqGt2jSzTtUdYhqk4DkXUeAP98LFF4syoKrxD2pcArpKzI 12 | OwBiyuQgLZqQr2mtIPFWCw== 13 | =qq2a 14 | -----END PGP PUBLIC KEY BLOCK----- 15 | -------------------------------------------------------------------------------- /tests_data/basic/png/magika_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/png/magika_test.png -------------------------------------------------------------------------------- /tests_data/basic/pptx/magika_test.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/pptx/magika_test.pptx -------------------------------------------------------------------------------- /tests_data/basic/python/code.py: -------------------------------------------------------------------------------- 1 | def print_primes(max_n: int) -> None: 2 | for i in range(2, max_n + 1): 3 | if is_prime(i): 4 | print(i) 5 | 6 | 7 | def is_prime(n: int) -> bool: 8 | for i in range(2, n // 2 + 1): 9 | if n % i 
== 0: 10 | return False 11 | return True 12 | -------------------------------------------------------------------------------- /tests_data/basic/pytorch/example.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/pytorch/example.pth -------------------------------------------------------------------------------- /tests_data/basic/rust/asm.rs: -------------------------------------------------------------------------------- 1 | use std::arch::asm; 2 | 3 | fn main() { 4 | 5 | let mut x: u64 = 5; 6 | 7 | println!("Original value of x: {}", x); 8 | 9 | unsafe { 10 | asm!( 11 | "mov rax, {x}", 12 | "mul rax", 13 | "mov {x}, rax", 14 | x = inout(reg) x, 15 | ); 16 | } 17 | 18 | println!("Squared value of x: {}", x); 19 | 20 | assert_eq!(x, 5 * 5); 21 | } 22 | -------------------------------------------------------------------------------- /tests_data/basic/rust/code.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | println!("Hello World!"); 3 | } 4 | -------------------------------------------------------------------------------- /tests_data/basic/rust/test_case1.rs: -------------------------------------------------------------------------------- 1 | /// Sample function to load a file 2 | fn load_model() { 3 | println!("Magika model ⏳"); 4 | // Simulating a delay for loading 5 | std::thread::sleep(std::time::Duration::from_millis(200)); 6 | println!("Model loaded successfully!"); 7 | } 8 | 9 | fn main() { 10 | // Load the model 11 | load_model(); 12 | } -------------------------------------------------------------------------------- /tests_data/basic/rust/test_case2.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{Write, BufWriter}; 3 | 4 | // Function to create a CSV file 5 | fn create_csv() -> 
std::io::Result<()> { 6 | let mut file = File::create("sample.csv")?; 7 | writeln!(file, "Name,Age,City")?; 8 | writeln!(file, "Alice,30,New York")?; 9 | writeln!(file, "Bob,25,Los Angeles")?; 10 | writeln!(file, "Charlie,35,Chicago")?; 11 | Ok(()) 12 | } 13 | 14 | // Function to create a JSON file 15 | fn create_json() -> std::io::Result<()> { 16 | let mut file = File::create("sample.json")?; 17 | writeln!(file, "{{\"name\": \"Alice\", \"age\": 30, \"city\": \"New York\"}}")?; 18 | Ok(()) 19 | } 20 | 21 | // Main function 22 | fn main() -> std::io::Result<()> { 23 | // Function calls 24 | create_csv()?; 25 | create_json()?; 26 | println!("Sample files created successfully."); 27 | Ok(()) 28 | } -------------------------------------------------------------------------------- /tests_data/basic/smali/code.smali: -------------------------------------------------------------------------------- 1 | .class public LHelloWorld; 2 | 3 | .super Ljava/lang/Object; 4 | 5 | .method public static main([Ljava/lang/String;)V 6 | .registers 2 7 | 8 | sget-object v0, Ljava/lang/System;->out:Ljava/io/PrintStream; 9 | 10 | const-string v1, "Hello World!" 11 | 12 | invoke-virtual {v0, v1}, Ljava/io/PrintStream;->println(Ljava/lang/String;)V 13 | 14 | return-void 15 | .end method 16 | -------------------------------------------------------------------------------- /tests_data/basic/srt/code.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:02,000 3 | Hello, World! 
4 | -------------------------------------------------------------------------------- /tests_data/basic/toml/doc.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "magika" 3 | version = "0.6.0-dev" 4 | description = "A tool to determine the content type of a file with deep-learning" 5 | authors = ["Yanick Fratantonio "] 6 | readme = "README.md" 7 | packages = [{include = "magika"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8,<3.13" 11 | click = "^8.1.3" 12 | tqdm = "^4.66.2" 13 | onnxruntime = "^1.17.0" 14 | numpy = [ 15 | {version = "^1.24", python = ">=3.8,<3.9"}, 16 | {version = "^1.26", python = ">=3.9,<3.13"} 17 | ] 18 | tabulate = "^0.9.0" 19 | python-dotenv = "^1.0.1" 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | pytest = "^8.0.1" 23 | ipython = [ 24 | {version = "^8.12.3", python = ">=3.8,<3.9"}, 25 | {version = "^8.18.1", python = ">=3.9,<3.10"}, 26 | {version = "^8.21.0", python = ">=3.10,<3.13"} 27 | ] 28 | ruff = ">=0.2.2,<0.4.0" 29 | mypy = "^1.8.0" 30 | 31 | [build-system] 32 | requires = ["poetry-core"] 33 | build-backend = "poetry.core.masonry.api" 34 | 35 | [tool.ruff.lint] 36 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 37 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 38 | # McCabe complexity (`C901`) by default. 
39 | select = ["E4", "E7", "E9", "F", "I001"] 40 | ignore = [] 41 | -------------------------------------------------------------------------------- /tests_data/basic/tsv/magika_test.tsv: -------------------------------------------------------------------------------- 1 | Name Value1 Value2 Value3 2 | Test1 1 10 100 3 | Test2 2 20 200 4 | Test3 3 30 300 5 | Test4 4 40 400 6 | Test5 5 50 500 -------------------------------------------------------------------------------- /tests_data/basic/twig/example.twig: -------------------------------------------------------------------------------- 1 | {% set items = ['apple', 'banana', 'cherry'] %} 2 | 3 |
    4 | {% for item in items %} 5 |
  • {{ loop.index }} - {{ item|title }}
  • 6 | {% endfor %} 7 |
8 | 9 | {% block content %} 10 |

This is content from a block definition.

11 | {% endblock %} 12 | 13 | {{ dump(items) }} 14 | 15 | {{ 'hello world'|title }} 16 | 17 | {% macro input(name, value = '', type = 'text') %} 18 | 19 | {% endmacro %} 20 | 21 | -------------------------------------------------------------------------------- /tests_data/basic/txt/complex-sentence.txt: -------------------------------------------------------------------------------- 1 | This is yet another simple test, it includes one simple sentence, but it is not as trivial as other simpler tests. -------------------------------------------------------------------------------- /tests_data/basic/txt/few-words.txt: -------------------------------------------------------------------------------- 1 | this is just a test -------------------------------------------------------------------------------- /tests_data/basic/txt/lorem-small.txt: -------------------------------------------------------------------------------- 1 | Utilitatis causa amicitia est quaesita. 2 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Collatio igitur ista te nihil iuvat. Honesta oratio, Socratica, Platonis etiam. Primum in nostrane potestate est, quid meminerimus? Duo Reges: constructio interrete. Quid, si etiam iucunda memoria est praeteritorum malorum? Si quidem, inquit, tollerem, sed relinquo. An nisi populari fama? 3 | 4 | Quamquam id quidem licebit iis existimare, qui legerint. Summum a vobis bonum voluptas dicitur. At hoc in eo M. Refert tamen, quo modo. Quid sequatur, quid repugnet, vident. Iam id ipsum absurdum, maximum malum neglegi. -------------------------------------------------------------------------------- /tests_data/basic/txt/magika_test_pptx.txt: -------------------------------------------------------------------------------- 1 | This is a test for Magika! 2 | 3 | Very cool if this can be detected correctly! 
4 | -------------------------------------------------------------------------------- /tests_data/basic/txt/many-words.txt: -------------------------------------------------------------------------------- 1 | this is just a test but it contains more words than the simple test -------------------------------------------------------------------------------- /tests_data/basic/txt/one-sentence-with-newline.txt: -------------------------------------------------------------------------------- 1 | This is just a test that includes a simple sentence. 2 | -------------------------------------------------------------------------------- /tests_data/basic/txt/one-sentence.txt: -------------------------------------------------------------------------------- 1 | This is just a test that includes a simple sentence. -------------------------------------------------------------------------------- /tests_data/basic/txt/random-ascii.txt: -------------------------------------------------------------------------------- 1 | faslkdfjhasdfkljhasdfklajshdfaklsjdfhaluehzsdjvnmcnbxzcv -------------------------------------------------------------------------------- /tests_data/basic/typescript/code.ts: -------------------------------------------------------------------------------- 1 | // This is typescript, and not valid javascript. 2 | interface Person { 3 | name: string; 4 | age: number; 5 | } 6 | 7 | function greet(person: Person): string { 8 | return `Hello, ${person.name}. 
You are ${person.age} years old.`; 9 | } 10 | 11 | const user: Person = { 12 | name: "Bob", 13 | age: 42, 14 | }; 15 | 16 | console.log(greet(user)); 17 | -------------------------------------------------------------------------------- /tests_data/basic/wav/test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/wav/test.wav -------------------------------------------------------------------------------- /tests_data/basic/xlsx/magika_test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/xlsx/magika_test.xlsx -------------------------------------------------------------------------------- /tests_data/basic/yaml/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | - package-ecosystem: "docker" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | - package-ecosystem: "pip" 13 | directory: "/python" 14 | schedule: 15 | interval: "daily" 16 | - package-ecosystem: "npm" 17 | directory: "/js" 18 | schedule: 19 | interval: "weekly" 20 | - package-ecosystem: "cargo" 21 | directory: "/rust" 22 | schedule: 23 | interval: "weekly" 24 | -------------------------------------------------------------------------------- /tests_data/basic/yaml/python-test.yml: -------------------------------------------------------------------------------- 1 | name: Python - test 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - 'main' 8 | paths: 9 | - 'python/**' 10 | - 'tests_data/**' 11 | - '.github/workflows/**' 12 | pull_request: 13 | paths: 14 | - 'python/**' 15 | - 'tests_data/**' 16 | - '.github/workflows/**' 17 
| 18 | permissions: 19 | contents: read 20 | 21 | jobs: 22 | unit-testing: 23 | strategy: 24 | matrix: 25 | python-version: [ "3.8.x", "3.9.x", "3.10.x", "3.11.x", "3.12.x" ] 26 | os: [ "ubuntu-latest", "macos-latest" ] 27 | runs-on: ${{ matrix.os }} 28 | steps: 29 | - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 30 | 31 | - name: Setup Python 32 | uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # pin@v5 33 | with: 34 | python-version: '${{ matrix.python-version }}' 35 | 36 | - name: Install poetry 37 | uses: abatilo/actions-poetry@7b6d33e44b4f08d7021a1dee3c044e9c253d6439 # pin@v3 38 | with: 39 | poetry-version: "1.7.1" 40 | 41 | - name: Install the project dependencies 42 | working-directory: python 43 | run: poetry install 44 | 45 | - name: Run ruff check 46 | working-directory: python 47 | run: poetry run ruff check --verbose 48 | 49 | - name: Run ruff format check 50 | working-directory: python 51 | run: poetry run ruff format --check --verbose 52 | 53 | - name: Run mypy 54 | working-directory: python 55 | run: poetry run mypy magika tests 56 | 57 | - name: Run pytest 58 | working-directory: python 59 | run: poetry run pytest tests -m "not slow" 60 | -------------------------------------------------------------------------------- /tests_data/basic/yara/rule.yar: -------------------------------------------------------------------------------- 1 | rule Rule_485729_77379 { 2 | strings: 3 | $s1 = "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" 4 | $s2 = "Win32_Process" 5 | $s3 = "Create" wide 6 | condition: 7 | $s1 and ($s2 and $s3) 8 | meta: 9 | author = "CyberThreatResearch" 10 | date = "2019-09-23" 11 | tags = "malware, persistence, registry" 12 | } 13 | -------------------------------------------------------------------------------- /tests_data/basic/zig/code.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | pub fn 
main() void { 4 | std.debug.print("Hello, World!\n", .{}); 5 | } 6 | -------------------------------------------------------------------------------- /tests_data/basic/zip/magika_test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/basic/zip/magika_test.zip -------------------------------------------------------------------------------- /tests_data/current_missdetections/xls/password-protected-example.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/current_missdetections/xls/password-protected-example.xls -------------------------------------------------------------------------------- /tests_data/features_extraction/reference.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/features_extraction/reference.json.gz -------------------------------------------------------------------------------- /tests_data/mitra/bmp/bmp.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/bmp/bmp.bmp -------------------------------------------------------------------------------- /tests_data/mitra/bzip/bzip2.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/bzip/bzip2.bz2 -------------------------------------------------------------------------------- /tests_data/mitra/cab/cab.cab: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/cab/cab.cab -------------------------------------------------------------------------------- /tests_data/mitra/elf/elf.elf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/elf/elf.elf -------------------------------------------------------------------------------- /tests_data/mitra/elf/elf64.elf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/elf/elf64.elf -------------------------------------------------------------------------------- /tests_data/mitra/flac/flac.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/flac/flac.flac -------------------------------------------------------------------------------- /tests_data/mitra/flac/tiny.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/flac/tiny.flac -------------------------------------------------------------------------------- /tests_data/mitra/gif/gif87.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/gif/gif87.gif -------------------------------------------------------------------------------- /tests_data/mitra/gif/gif89.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/gif/gif89.gif -------------------------------------------------------------------------------- /tests_data/mitra/gzip/gzip.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/gzip/gzip.gz -------------------------------------------------------------------------------- /tests_data/mitra/iso/iso.iso: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/iso/iso.iso -------------------------------------------------------------------------------- /tests_data/mitra/javabytecode/java.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/javabytecode/java.class -------------------------------------------------------------------------------- /tests_data/mitra/jpeg/jpg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/jpeg/jpg.jpg -------------------------------------------------------------------------------- /tests_data/mitra/mp3/id3v1.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/mp3/id3v1.mp3 -------------------------------------------------------------------------------- /tests_data/mitra/mp3/id3v2.mp3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/mp3/id3v2.mp3 -------------------------------------------------------------------------------- /tests_data/mitra/mp4/mp4.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/mp4/mp4.mp4 -------------------------------------------------------------------------------- /tests_data/mitra/ogg/vorbis.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/ogg/vorbis.ogg -------------------------------------------------------------------------------- /tests_data/mitra/pcap/pcap.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/pcap/pcap.pcap -------------------------------------------------------------------------------- /tests_data/mitra/pdf/pdf.pdf: -------------------------------------------------------------------------------- 1 | %PDF-1.3 2 | %µ¶ 3 | 4 | 1 0 obj 5 | <> 6 | endobj 7 | 8 | 2 0 obj 9 | <> 10 | endobj 11 | 12 | 3 0 obj 13 | <>>>>>>> 14 | endobj 15 | 16 | 4 0 obj 17 | <> 18 | stream 19 | BT/F 270 Tf 30 300 Td(PDF)' ET 20 | 21 | endstream 22 | endobj 23 | 24 | xref 25 | 0 5 26 | 0000000000 65536 f 27 | 0000000016 00000 n 28 | 0000000062 00000 n 29 | 0000000114 00000 n 30 | 0000000241 00000 n 31 | 32 | trailer 33 | <> 34 | startxref 35 | 321 36 | %%EOF 37 | -------------------------------------------------------------------------------- /tests_data/mitra/pebin/pe32.exe: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/pebin/pe32.exe -------------------------------------------------------------------------------- /tests_data/mitra/pebin/pe64.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/pebin/pe64.exe -------------------------------------------------------------------------------- /tests_data/mitra/php/php.php: -------------------------------------------------------------------------------- 1 | PHP: Hypertext Preprocessor
"; ?> -------------------------------------------------------------------------------- /tests_data/mitra/png/cgbi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/png/cgbi.png -------------------------------------------------------------------------------- /tests_data/mitra/png/png.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/png/png.png -------------------------------------------------------------------------------- /tests_data/mitra/rar/rar4.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/rar/rar4.rar -------------------------------------------------------------------------------- /tests_data/mitra/rar/rar5.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/rar/rar5.rar -------------------------------------------------------------------------------- /tests_data/mitra/rtf/rich.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\deff0\deflang1033{\fonttbl{\f0\fnil\fcharset0 Arial;}} 2 | \f0\fs50 Rich Text Format\par 3 | } 4 | -------------------------------------------------------------------------------- /tests_data/mitra/sevenzip/7-zip.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/sevenzip/7-zip.7z -------------------------------------------------------------------------------- 
/tests_data/mitra/svg/svg.svg: -------------------------------------------------------------------------------- 1 | SVG -------------------------------------------------------------------------------- /tests_data/mitra/tar/tar.tar: -------------------------------------------------------------------------------- 1 | hello.txt000644 000000000031364433342200063250ustar TAR -------------------------------------------------------------------------------- /tests_data/mitra/tga/footer.tga: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/tga/footer.tga -------------------------------------------------------------------------------- /tests_data/mitra/tiff/tiff-be.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/tiff/tiff-be.tif -------------------------------------------------------------------------------- /tests_data/mitra/tiff/tiff-le.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/tiff/tiff-le.tif -------------------------------------------------------------------------------- /tests_data/mitra/wav/riff.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/wav/riff.wav -------------------------------------------------------------------------------- /tests_data/mitra/wav/rifx.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/wav/rifx.wav 
-------------------------------------------------------------------------------- /tests_data/mitra/webm/webm.webm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/webm/webm.webm -------------------------------------------------------------------------------- /tests_data/mitra/webp/webp.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/webp/webp.webp -------------------------------------------------------------------------------- /tests_data/mitra/webp/webpl.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/webp/webpl.webp -------------------------------------------------------------------------------- /tests_data/mitra/xar/hello-world.xar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/xar/hello-world.xar -------------------------------------------------------------------------------- /tests_data/mitra/xar/mini.xar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/xar/mini.xar -------------------------------------------------------------------------------- /tests_data/mitra/xz/xz.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/xz/xz.xz -------------------------------------------------------------------------------- 
/tests_data/mitra/zip/NT.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/NT.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/NTFS.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/NTFS.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/PPMd.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/PPMd.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/aes.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/aes.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/bz2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/bz2.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/deflate64.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/deflate64.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/directory.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/directory.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/drive.zip: -------------------------------------------------------------------------------- 1 | PK C_DRIVEPK (C_DRIVEPK5% -------------------------------------------------------------------------------- /tests_data/mitra/zip/dual.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/dual.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/filecomment.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/filecomment.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/implode.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/implode.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/implodeV3.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/implodeV3.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/jpeg.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/jpeg.zip 
-------------------------------------------------------------------------------- /tests_data/mitra/zip/lzma.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/lzma.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/mini.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/mini.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/reduced1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/reduced1.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/reduced2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/reduced2.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/reduced3.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/reduced3.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/reduced4.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/reduced4.zip -------------------------------------------------------------------------------- 
/tests_data/mitra/zip/shrunk.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/shrunk.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/simple.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/simple.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/store.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/store.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/unicode.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/unicode.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/unicode2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/unicode2.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/unix.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/unix.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/unixdesc.zip: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/unixdesc.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/volumecomment.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/volumecomment.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/wavpack.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/wavpack.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/zip.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/zip.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/zip64.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/zip64.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/zipcrypto.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/zipcrypto.zip -------------------------------------------------------------------------------- /tests_data/mitra/zip/zopfli.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra/zip/zopfli.zip -------------------------------------------------------------------------------- /tests_data/mitra_candidates/DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra_candidates/DS_Store -------------------------------------------------------------------------------- /tests_data/mitra_candidates/ace.ace: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra_candidates/ace.ace -------------------------------------------------------------------------------- /tests_data/mitra_candidates/dicom.dcm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra_candidates/dicom.dcm -------------------------------------------------------------------------------- /tests_data/mitra_candidates/hdf5.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/magika/7bd56c924ccbd5f29dc1dd1674d90cc1cc281cf4/tests_data/mitra_candidates/hdf5.h5 -------------------------------------------------------------------------------- /tests_data/mitra_candidates/html.htm: -------------------------------------------------------------------------------- 1 | --> 2 |
3 | 4 |

HTML page

5 | 6 | 14 | 15 |
16 |