├── .github └── workflows │ └── release.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── assets ├── example-output.json └── update.py ├── fpgen ├── __init__.py ├── __main__.py ├── bayesian_network.py ├── exceptions.py ├── generator.py ├── pkgman.py ├── py.typed ├── structs.py ├── trace.py ├── unpacker.py └── utils.py ├── pyproject.toml └── tests ├── README.md ├── failed_cases.py ├── generator_matches_trace.py ├── profile.py ├── speed_test.py └── user_inputs.py /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release to PyPi 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | tags: 7 | - 'v*' # Only trigger on tags that start with 'v' 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | env: 16 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Install Python 22 | run: | 23 | sudo apt-get update 24 | sudo apt-get install -y \ 25 | python3 \ 26 | python3-pip \ 27 | python3-dev 28 | 29 | - name: Install Python dependencies 30 | run: | 31 | python3 -m pip install twine build vermin 32 | 33 | - name: Release 34 | run: | 35 | make release -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Old files 2 | _old/ 3 | _old_*/ 4 | *.old 5 | 6 | # Model files 7 | fpgen/**.json 8 | *.zst 9 | *.ndjson 10 | *.dat 11 | *.zip 12 | data/ 13 | output/ 14 | 15 | # Tests 16 | test*.py 17 | diagnostics/ 18 | 19 | # Logs 20 | *.log 21 | *.kate-swp 22 | 23 | 24 | # ==== DEFAULTS ==== 25 | 26 | # Byte-compiled / optimized / DLL files 27 | __pycache__/ 28 | *.py[cod] 29 | *$py.class 30 | 31 | # C extensions 32 | *.so 33 | 34 | # Binaries 35 | *.dll 36 | *.dylib 37 | 38 | # Distribution / packaging 39 | .Python 40 | build/ 41 | develop-eggs/ 42 | dist/ 43 | downloads/ 44 | eggs/ 45 | .eggs/ 46 | lib/ 47 | lib64/ 48 | parts/ 49 | sdist/ 50 | var/ 51 | wheels/ 52 | share/python-wheels/ 53 | *.egg-info/ 54 | .installed.cfg 55 | *.egg 56 | MANIFEST 57 | 58 | # PyInstaller 59 | # Usually these files are written by a python script from a template 60 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 61 | *.manifest 62 | *.spec 63 | 64 | # Installer logs 65 | pip-log.txt 66 | pip-delete-this-directory.txt 67 | 68 | # Unit test / coverage reports 69 | htmlcov/ 70 | .tox/ 71 | .nox/ 72 | .coverage 73 | .coverage.* 74 | .cache 75 | nosetests.xml 76 | coverage.xml 77 | *.cover 78 | *.py,cover 79 | .hypothesis/ 80 | .pytest_cache/ 81 | cover/ 82 | 83 | # Translations 84 | *.mo 85 | *.pot 86 | 87 | # Django stuff: 88 | *.log 89 | local_settings.py 90 | db.sqlite3 91 | db.sqlite3-journal 92 | 93 | # Flask stuff: 94 | instance/ 95 | .webassets-cache 96 | 97 | # Scrapy stuff: 98 | .scrapy 99 | 100 | # Sphinx documentation 101 | docs/_build/ 102 | 103 | # PyBuilder 104 | .pybuilder/ 105 | target/ 106 | 107 | # Jupyter Notebook 108 | .ipynb_checkpoints 109 | 110 | # IPython 111 | profile_default/ 112 | ipython_config.py 113 | 114 | # pyenv 115 | # For a library or package, you might want to ignore these files since the code is 116 | # intended to run in multiple environments; otherwise, check them in: 117 | # .python-version 118 | 119 | # pipenv 120 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
121 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 122 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 123 | # install all needed dependencies. 124 | #Pipfile.lock 125 | 126 | # poetry 127 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 128 | # This is especially recommended for binary packages to ensure reproducibility, and is more 129 | # commonly ignored for libraries. 130 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 131 | #poetry.lock 132 | 133 | # pdm 134 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 135 | #pdm.lock 136 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 137 | # in version control. 138 | # https://pdm.fming.dev/#use-with-ide 139 | .pdm.toml 140 | 141 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 142 | __pypackages__/ 143 | 144 | # Celery stuff 145 | celerybeat-schedule 146 | celerybeat.pid 147 | 148 | # SageMath parsed files 149 | *.sage.py 150 | 151 | # Environments 152 | .env 153 | .venv 154 | env/ 155 | venv/ 156 | ENV/ 157 | env.bak/ 158 | venv.bak/ 159 | 160 | # Spyder project settings 161 | .spyderproject 162 | .spyproject 163 | 164 | # Rope project settings 165 | .ropeproject 166 | 167 | # mkdocs documentation 168 | /site 169 | 170 | # mypy 171 | .mypy_cache/ 172 | .dmypy.json 173 | dmypy.json 174 | 175 | # Pyre type checker 176 | .pyre/ 177 | 178 | # pytype static type analyzer 179 | .pytype/ 180 | 181 | # Cython debug symbols 182 | cython_debug/ 183 | 184 | # PyCharm 185 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 186 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 187 | # and can be added to the global gitignore or merged into this file. For a more nuclear 188 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 189 | #.idea/ 190 | 191 | # VsCode 192 | .vscode 193 | .trunk 194 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | vermin: 2 | vermin . --eval-annotations --target=3.8 --violations fpgen/ || exit 1 3 | 4 | clean: 5 | @echo Cleaning... 6 | find ./fpgen -type f ! -name "*.typed" ! -name "*.py" -exec rm -v {} \; 7 | rm -rf ./dist 8 | 9 | prepare: vermin clean 10 | 11 | check: prepare 12 | @echo Building... 13 | python -m build 14 | twine check dist/* 15 | 16 | release: check 17 | @echo Releasing... 18 | twine upload dist/* 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Fingerprint Generator
2 | 
3 | A fast browser data generator that mimics actual traffic patterns in the wild. With extensive data coverage.
4 | 
5 | Created by daijro. Data provided by Scrapfly.
6 | 7 | --- 8 | 9 | ## Features 10 | 11 | - Uses a Bayesian generative network to mimic real-world web traffic patterns 12 | - Extensive data coverage for **nearly all known** browser data points 13 | - Creates complete fingerprints in a few milliseconds ⚡ 14 | - Easily specify custom criteria for any data point (e.g. "only Windows + Chrome, with Intel GPUs") 15 | - Simple for humans to use 🚀 16 | 17 | ## Demo Video 18 | 19 | Here is a demonstration of what fpgen generates & its ability to filter data points: 20 | 21 | https://github.com/user-attachments/assets/5c56691a-5804-4007-b179-0bae7069a111 22 | 23 | --- 24 | 25 | # Installation 26 | 27 | Install the package using pip: 28 | 29 | ```bash 30 | pip install fpgen 31 | ``` 32 | 33 |
34 | 35 | ### Downloading the model 36 | 37 | Fetch the latest model: 38 | 39 | ```bash 40 | fpgen fetch 41 | ``` 42 | 43 | This will be ran automatically on the first import, or every 5 weeks. 44 | 45 | To decompress the model for faster generation (_up to 10-50x faster!_), run: 46 | 47 | ```bash 48 | fpgen decompress 49 | ``` 50 | 51 | Note: This action will use an additional 100mb+ of storage. 52 | 53 |
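The same steps can be run from Python if you prefer not to shell out. A minimal sketch using the helpers that back the CLI commands (these live in `fpgen.pkgman` and are not documented as public API):

```python
from fpgen.pkgman import download_model, decompress_model

download_model()    # fetch the latest model (what `fpgen fetch` calls)
decompress_model()  # unpack it for faster generation (what `fpgen decompress` calls)
```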
54 | CLI Usage 55 | 56 | ``` 57 | Usage: python -m fpgen [OPTIONS] COMMAND [ARGS]... 58 | 59 | Options: 60 | --help Show this message and exit. 61 | 62 | Commands: 63 | decompress Decompress model files for speed efficiency (will take 100mb+) 64 | fetch Fetch the latest model from GitHub 65 | recompress Compress model files after running decompress 66 | remove Remove all downloaded and/or extracted model files 67 | ``` 68 | 69 |
70 | 71 | --- 72 | 73 | # Usage 74 | 75 | ### Generate a fingerprint 76 | 77 | Simple usage: 78 | 79 | ```python 80 | >>> import fpgen 81 | >>> fpgen.generate(browser='Chrome', os='Windows') 82 | ``` 83 | 84 | Or use the Generator object to pass filters downward: 85 | 86 | ```python 87 | >>> gen = fpgen.Generator(browser='Chrome') # Filter by Chrome 88 | >>> gen.generate(os='Windows') # Generate Windows & Chrome fingerprints 89 | ``` 90 | 91 |
92 | 93 | Parameters list 94 | 95 | 96 | ``` 97 | Initializes the Generator with the given options. 98 | Values passed to the Generator object will be inherited when calling Generator.generate() 99 | 100 | Parameters: 101 | conditions (dict, optional): Conditions for the generated fingerprint. 102 | window_bounds (WindowBounds, optional): Constrain the output window size. 103 | strict (bool, optional): Whether to raise an exception if the conditions are too strict. 104 | flatten (bool, optional): Whether to flatten the output dictionary 105 | target (Optional[Union[str, StrContainer]]): Only generate specific value(s) 106 | **conditions_kwargs: Conditions for the generated fingerprint (passed as kwargs) 107 | ``` 108 | 109 |
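For example, conditions passed to the constructor are inherited by every later `generate()` call and can be combined with the per-call options above (output omitted):

```python
>>> gen = fpgen.Generator({'browser': 'Chrome'}, strict=True)
>>> gen.generate(os='Windows', target='navigator')  # browser='Chrome' is inherited from the Generator
```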
110 | 111 | [See example output.](https://raw.githubusercontent.com/scrapfly/fingerprint-generator/refs/heads/main/assets/example-output.json) 112 | 113 | --- 114 | 115 | ## Filtering the output 116 | 117 | ### Setting fingerprint criteria 118 | 119 | You can narrow down generated fingerprints by specifying filters for **any** data field. 120 | 121 | ```python 122 | # Only generate fingerprints with Windows, Chrome, and Intel GPU: 123 | >>> fpgen.generate( 124 | ... os='Windows', 125 | ... browser='Chrome', 126 | ... gpu={'vendor': 'Google Inc. (Intel)'} 127 | ... ) 128 | ``` 129 | 130 |
131 | 132 | This can also be passed as a dictionary. 133 | 134 | 135 | ```python 136 | >>> fpgen.generate({ 137 | ... 'os': 'Windows', 138 | ... 'browser': 'Chrome', 139 | ... 'gpu': {'vendor': 'Google Inc. (Intel)'}, 140 | ... }) 141 | ``` 142 | 143 |
144 | 145 | ### Multiple constraints 146 | 147 | Pass in multiple constraints for the generator to select from using a tuple. 148 | 149 | ```python 150 | >>> fpgen.generate({ 151 | ... 'os': ('Windows', 'MacOS'), 152 | ... 'browser': ('Firefox', 'Chrome'), 153 | ... }) 154 | ``` 155 | 156 | If you are passing many nested constraints, run `fpgen decompress` to improve model performance. 157 | 158 | ## Custom filters 159 | 160 | Data can be filtered by passing in callable functions. 161 | 162 | ### Examples 163 | 164 | Set the minimum browser version: 165 | 166 | ```python 167 | # Constrain client: 168 | >>> fpgen.generate(client={'browser': {'major': lambda ver: int(ver) >= 130}}) 169 | # Or, just pass a dot seperated path to client.browser.major: 170 | >>> fpgen.generate({'client.browser.major': lambda ver: int(ver) >= 130}) 171 | ``` 172 | 173 | Only allow NVIDIA GPUs: 174 | 175 | ```python 176 | # Note: Strings are lowercased before they're passed. 177 | >>> fpgen.generate(gpu={'vendor': lambda vdr: 'nvidia' in vdr}) 178 | ``` 179 | 180 | Limit the maximum/minimum window size: 181 | 182 | ```python 183 | # Set allowed ranges for outerWidth & outerHeight: 184 | >>> fpgen.generate( 185 | ... window={ 186 | ... 'outerWidth': lambda width: 1000 <= width <= 2000, 187 | ... 'outerHeight': lambda height: 500 <= height <= 1500 188 | ... } 189 | ... ) 190 | ``` 191 | 192 | 193 |
194 | 195 | Or, filter the window dictionary directly. 196 | 197 | 198 | ```python 199 | def window_filter(window): 200 | if not (1000 <= window['outerWidth'] <= 2000): 201 | return False 202 | if not (500 <= window['outerHeight'] <= 1500): 203 | return False 204 | return True 205 | 206 | fpgen.generate(window=window_filter) 207 | ``` 208 |
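Callable filters can be mixed with ordinary constraints in the same call, for example:

```python
>>> fpgen.generate(
...     os=('Windows', 'MacOS'),
...     browser='Chrome',
...     gpu={'vendor': lambda vdr: 'nvidia' in vdr},
... )
```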
209 | 210 | 211 | --- 212 | 213 | ## Only generate specific data 214 | 215 | To generate specific data fields, use the `target` parameter with a string or a list of strings. 216 | 217 | ### Examples 218 | 219 | Only generate HTTP headers: 220 | 221 | ```python 222 | >>> fpgen.generate(target='headers') 223 | {'accept': '*/*', 'accept-encoding': 'gzip, deflate, br, zstd', 'accept-language': 'en-US,en;q=0.9', 'priority': 'u=1, i', 'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', 'sec-ch-ua-mobile': None, 'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-site', 'sec-gpc': None, 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'} 224 | ``` 225 | 226 |
227 | 228 | Or, by using the generate_target shortcut. 229 | 230 | 231 | ```python 232 | >>> fpgen.generate_target('headers') 233 | {'accept': '*/*', 'accept-encoding': 'gzip, deflate, br, zstd', 'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8,sk;q=0.7', 'priority': 'u=1, i', 'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', 'sec-ch-ua-mobile': None, 'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-site', 'sec-gpc': None, 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'} 234 | ``` 235 | 236 |
237 | 238 | 239 | Generate a User-Agent for Windows & Chrome: 240 | 241 | ```python 242 | >>> fpgen.generate( 243 | ... os='Windows', 244 | ... browser='Chrome', 245 | ... # Nested targets must be seperated by dots: 246 | ... target='headers.user-agent' 247 | ... ) 248 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' 249 | ``` 250 | 251 | Generate a Firefox TLS fingerprint: 252 | 253 | ```python 254 | >>> fpgen.generate( 255 | ... browser='Firefox', 256 | ... target='network.tls.scrapfly_fp' 257 | ... ) 258 | {'version': '772', 'ch_ciphers': '4865-4867-4866-49195-49199-52393-52392-49196-49200-49162-49161-49171-49172-156-157-47-53', 'ch_extensions': '0-5-10-11-13-16-23-27-28-34-35-43-45-51-65037-65281', 'groups': '4588-29-23-24-25-256-257', 'points': '0', 'compression': '0', 'supported_versions': '772-771', 'supported_protocols': 'h2-http11', 'key_shares': '4588-29-23', 'psk': '1', 'signature_algs': '1027-1283-1539-2052-2053-2054-1025-1281-1537-515-513', 'early_data': '0'} 259 | ``` 260 | 261 | You can provide multiple targets as a list. 262 | 263 | --- 264 | 265 | ## Get the probabilities of a target 266 | 267 | Calculate the probability distribution of a target given any filter: 268 | 269 | ```python 270 | >>> fpgen.trace(target='browser', os='Windows') 271 | [, , , , , ] 272 | ``` 273 | 274 | Multiple targets can be passed as a list/tuple. 275 | Here is an example of tracking the probability of browser & OS given a GPU vendor: 276 | 277 | ```python 278 | >>> fpgen.trace( 279 | ... target=('browser', 'os'), 280 | ... gpu={'vendor': 'Google Inc. (Intel)'} 281 | ... ) 282 | {'browser': [, , , , , ], 283 | 'os': [, , , ]} 284 | ``` 285 | 286 | This also works in the Generator object: 287 | 288 | ```python 289 | >>> gen = fpgen.Generator(os='ChromeOS') 290 | >>> gen.trace(target='browser') 291 | [] 292 | ``` 293 | 294 |
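Each entry in the returned list is a `TraceResult` (see "Reading TraceResult" below), so the usual list operations apply. For example, to pick the most likely value:

```python
>>> results = fpgen.trace(target='browser', os='Windows')
>>> max(results, key=lambda r: r.probability).value  # the single most likely browser
```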
295 | 296 | Parameters for trace 297 | 298 | 299 | ``` 300 | Compute the probability distribution(s) of a target variable given conditions. 301 | 302 | Parameters: 303 | target (str): The target variable name. 304 | conditions (Dict[str, Any], optional): A dictionary mapping variable names 305 | flatten (bool, optional): If True, return a flattened dictionary. 306 | **conditions_kwargs: Additional conditions to apply 307 | 308 | Returns: 309 | A dictionary mapping probabilities to the target's possible values. 310 | ``` 311 | 312 |
313 | 314 |
315 | 316 | ### Reading TraceResult 317 | 318 | To read the output `TraceResult` object: 319 | 320 | ```python 321 | >>> chrome = fpgen.trace(target='browser', os='ChromeOS')[0] 322 | >>> chrome.probability 323 | 1.0 324 | >>> chrome.value 325 | 'Chrome' 326 | ``` 327 | 328 | --- 329 | 330 | ## Query possible values 331 | 332 | You can get a list of a target's possible values by passing it into `fpgen.query`: 333 | 334 | List all possible browsers: 335 | 336 | ```python 337 | >>> fpgen.query('browser') 338 | ['Chrome', 'Edge', 'Firefox', 'Opera', 'Safari', 'Samsung Internet', 'Yandex Browser'] 339 | ``` 340 | 341 | Passing a nested target: 342 | 343 | ```python 344 | >>> fpgen.query('navigator.maxTouchPoints') # Dot seperated path 345 | [0, 1, 2, 5, 6, 9, 10, 17, 20, 40, 256] 346 | ``` 347 | 348 |
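Query results can also be fed back in as constraints. For example, to generate a User-Agent for any browser except Safari (a sketch building on the tuple constraints shown earlier):

```python
>>> browsers = tuple(b for b in fpgen.query('browser') if b != 'Safari')
>>> fpgen.generate(browser=browsers, target='headers.user-agent')
```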
349 | 350 | Parameters for query 351 | 352 | 353 | ``` 354 | Query a list of possibilities given a target. 355 | 356 | Parameters: 357 | target (str): Target node to query possible values for 358 | flatten (bool, optional): Whether to flatten the output dictionary 359 | sort (bool, optional): Whether to sort the output arrays 360 | ``` 361 | 362 |
363 | 364 | > [!NOTE] 365 | > Since fpgen is trained on live data, queries may occasionally return invalid or anomalous values. Values lower than a 0.001% probability will not appear in traces or generated fingerprints. 366 | 367 | --- 368 | 369 | ## Generated data 370 | 371 | Here is a rough list of the data fpgen can generate: 372 | 373 | - **Browser data:** 374 | - All navigator data 375 | - All mimetype data: Audio, video, media source, play types, PDF, etc 376 | - All window viewport data (position, inner/outer viewport sizes, toolbar & scrollbar sizes, etc) 377 | - All screen data 378 | - Supported & unsupported DRM modules 379 | - Memory heap limit 380 | 381 | * **System data:** 382 | - GPU data (vendor, renderer, WebGL/WebGL2, extensions, context attributes, parameters, shader precision formats, etc) 383 | - Battery data (charging, charging time, discharging time, level) 384 | - Screen size, color depth, taskbar size, etc. 385 | - Full fonts list 386 | - Cast receiver data 387 | 388 | - **Network data:** 389 | - HTTP headers 390 | - TLS fingerprint data 391 | - HTTP/2 fingerprint & frames 392 | - RTC video & audio capabilities, codecs, clock rates, mimetypes, header extensions, etc 393 | 394 | * **Audio data:** 395 | - Audio signal 396 | - All Audio API constants (AnalyserNode, BiquadFilterNode, DynamicsCompressorNode, OscillatorNode, etc) 397 | 398 | - **Internationalization data:** 399 | - Regional internationalization (Locale, calendar, numbering system, timezone, date format, etc) 400 | - Voices 401 | 402 | * **_And much more!_** 403 | 404 | For a more complete list, see the [full example output](https://raw.githubusercontent.com/scrapfly/fingerprint-generator/refs/heads/main/assets/example-output.json). 405 | 406 | --- 407 | -------------------------------------------------------------------------------- /assets/update.py: -------------------------------------------------------------------------------- 1 | """ 2 | Refreshes example-output.json 3 | """ 4 | 5 | import json 6 | from pathlib import Path 7 | 8 | import fpgen 9 | 10 | DIR = Path('.').absolute() 11 | 12 | 13 | with open(DIR / 'assets' / 'example-output.json', 'w') as f: 14 | data = fpgen.generate() 15 | json.dump(data, f, indent=2) 16 | data = fpgen.generate() 17 | json.dump(data, f, indent=2) 18 | -------------------------------------------------------------------------------- /fpgen/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fpgen is a fast & comprehensive browser fingerprint generator. 3 | """ 4 | 5 | 6 | def __check_module__() -> None: 7 | ''' 8 | Detect if fpgen is being ran as a module. 9 | ''' 10 | import inspect 11 | import os 12 | import sys 13 | 14 | # Detect if we're running as poetry script 15 | if sys.argv and os.path.basename(sys.argv[0]) == 'fpgen': 16 | os.environ['FPGEN_NO_INIT'] = '1' 17 | return 18 | 19 | stack: list = inspect.stack(2) 20 | if len(stack) >= 2: 21 | prev, launch = stack[-2:] 22 | try: 23 | if (launch.function, prev.function) == ('_run_module_as_main', '_get_module_details'): 24 | # Enable "partial execution mode" to prevent automatic downloads, starting network, etc. 
25 | os.environ['FPGEN_NO_INIT'] = '1' 26 | except AttributeError: 27 | pass 28 | 29 | 30 | __check_module__() 31 | del __check_module__ # Remove from namespace 32 | 33 | # ruff: noqa: E402 34 | 35 | from .generator import Generator, generate, generate_target 36 | from .trace import TraceResult, trace 37 | 38 | # Expose the bayesian network interface for tests 39 | from .utils import NETWORK as __network__ 40 | from .utils import query 41 | 42 | __all__ = ( 43 | 'Generator', 44 | 'TraceResult', 45 | 'generate', 46 | 'generate_target', 47 | 'query', 48 | 'trace', 49 | '__network__', 50 | ) 51 | -------------------------------------------------------------------------------- /fpgen/__main__.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from .pkgman import ( 4 | FILE_PAIRS, 5 | decompress_model, 6 | download_model, 7 | recompress_model, 8 | remove_model, 9 | ) 10 | 11 | 12 | @click.group() 13 | def cli(): 14 | pass 15 | 16 | 17 | @cli.command() 18 | @click.option( 19 | '--decompress', is_flag=True, help='Also decompress the model files after downloading' 20 | ) 21 | def fetch(decompress): 22 | """ 23 | Fetch the latest model from GitHub 24 | """ 25 | # Remove existing files 26 | remove_model(log=False) 27 | # Download new files 28 | download_model() 29 | if decompress: 30 | decompress_model() 31 | click.echo(click.style("Complete!", fg="green")) 32 | 33 | 34 | @cli.command() 35 | def remove(): 36 | """ 37 | Remove all downloaded and/or extracted model files 38 | """ 39 | found = remove_model() 40 | if not found: 41 | click.echo(click.style("No files found to remove.", fg="yellow")) 42 | return 43 | click.echo(click.style("Complete!", fg="green")) 44 | 45 | 46 | @cli.command() 47 | def decompress(): 48 | """ 49 | Recompress model files for speed efficiency (will take 100mb+) 50 | """ 51 | # Check there's anything to decompress 52 | if any(f.exists() for f in FILE_PAIRS.keys()): 53 | click.echo(click.style("Model is already decompressed.", fg="yellow")) 54 | return 55 | decompress_model() 56 | 57 | 58 | @cli.command() 59 | def recompress(): 60 | """ 61 | Compress model files after running decompress 62 | """ 63 | # Check there's anything to compress 64 | if any(f.exists() for f in FILE_PAIRS.values()): 65 | click.echo(click.style("Model is already compressed.", fg="yellow")) 66 | return 67 | recompress_model() 68 | 69 | 70 | if __name__ == '__main__': 71 | cli() 72 | -------------------------------------------------------------------------------- /fpgen/bayesian_network.py: -------------------------------------------------------------------------------- 1 | import heapq 2 | import random 3 | from pathlib import Path 4 | from typing import Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union 5 | 6 | from .exceptions import RestrictiveConstraints 7 | from .pkgman import extract_json 8 | from .structs import CaseInsensitiveDict 9 | from .unpacker import lookup_value_list 10 | 11 | StrContainer = Union[str, List[str], Tuple[str, ...], Set[str]] 12 | 13 | # Width for beam search 14 | # This cuts off values that are way too low or contaminated 15 | BEAM_WIDTH = 1000 16 | 17 | 18 | class BayesianNode: 19 | """ 20 | A single node in a Bayesian network with methods to sample conditional probabilities 21 | """ 22 | 23 | __slots__ = ( 24 | 'node_definition', 25 | 'name', 26 | 'parent_names', 27 | 'possible_values', 28 | 'probabilities', 29 | 'index', 30 | ) 31 | 32 | def __init__(self, node_definition: Dict[str, Any], index: int): 
33 | # Node defintion info 34 | self.node_definition = node_definition 35 | self.name = node_definition['name'] 36 | self.parent_names = node_definition['parentNames'] 37 | self.possible_values = node_definition['possibleValues'] 38 | # CPT data structure 39 | self.probabilities = node_definition['conditionalProbabilities'] 40 | # Index in the sampling order 41 | self.index = index 42 | 43 | def get_probabilities_given_known_values( 44 | self, parent_values: Mapping[str, Any] 45 | ) -> Dict[Any, float]: 46 | """ 47 | Extracts the probabilities for this node's values, given known parent values 48 | """ 49 | probabilities = self.probabilities 50 | for parent_name in self.parent_names: 51 | parent_value = parent_values[parent_name] 52 | probabilities = probabilities.get(parent_value, {}) 53 | return probabilities 54 | 55 | 56 | class BayesianNetwork: 57 | """ 58 | Bayesian network implementation for probabilistic sampling 59 | """ 60 | 61 | def __init__(self, network_file: Path) -> None: 62 | network_definition = extract_json(network_file) 63 | self.nodes_in_sampling_order = [ 64 | BayesianNode(node_def, index) 65 | for index, node_def in enumerate(network_definition['nodes']) 66 | ] 67 | nodes_by_name = {node.name: node for node in self.nodes_in_sampling_order} 68 | self.nodes_by_name = CaseInsensitiveDict(nodes_by_name) 69 | # Keep a list of the original names 70 | self.node_names = tuple(nodes_by_name.keys()) 71 | # Precompute ancestors 72 | self.ancestors_by_name: Dict[str, Set[str]] = {} 73 | for node in self.nodes_in_sampling_order: 74 | self.get_all_ancestors(node.name) 75 | 76 | def generate_consistent_sample( 77 | self, evidence: Mapping[str, Set[str]] 78 | ) -> Optional[Dict[str, Any]]: 79 | """ 80 | Generate a full sample from the Bayesian network. 81 | """ 82 | result: Dict[str, str] = {} 83 | # Create a working copy of evidence that we can update in place. 84 | current_evidence = {k: set(v) for k, v in evidence.items()} 85 | 86 | for node in self.nodes_in_sampling_order: 87 | node_name = node.name 88 | 89 | # For nodes with explicit evidence, remove the node itself from the evidence for beam search. 90 | if node_name in current_evidence: 91 | allowed_values = current_evidence[node_name] 92 | search_evidence = {k: v for k, v in current_evidence.items() if k != node_name} 93 | distribution = self.trace(node_name, search_evidence) 94 | 95 | # Filter the distribution to allowed values and renormalize. 96 | filtered_dist = {k: v for k, v in distribution.items() if k in allowed_values} 97 | if not filtered_dist or sum(filtered_dist.values()) <= 0: 98 | uniform_prob = 1.0 / len(allowed_values) 99 | filtered_dist = {val: uniform_prob for val in allowed_values} 100 | else: 101 | total = sum(filtered_dist.values()) 102 | filtered_dist = {k: v / total for k, v in filtered_dist.items()} 103 | sampled_value = self.sample_value_from_distribution(filtered_dist) 104 | else: 105 | # For unconstrained nodes, use all current evidence. 106 | distribution = self.trace(node_name, current_evidence) 107 | sampled_value = self.sample_value_from_distribution(distribution) 108 | 109 | result[node_name] = sampled_value 110 | # Update current evidence with the newly sampled node value. 111 | current_evidence[node_name] = {sampled_value} 112 | 113 | return result 114 | 115 | def generate_certain_nodes( 116 | self, 117 | evidence: Mapping[str, Set[str]], 118 | targets: Optional[StrContainer] = None, 119 | ) -> Optional[Dict[str, Any]]: 120 | """ 121 | Generate values for target nodes given conditions. 
122 | """ 123 | # If no target specified, generate full sample 124 | if targets is None: 125 | return self.generate_consistent_sample(evidence) 126 | 127 | # Generate result for each target 128 | result = {} 129 | 130 | for target_node in targets: 131 | # Calculate distribution for this target 132 | distribution = self.trace(target_node, evidence) 133 | 134 | # # Handle multi-value conditions for the target 135 | if target_node in evidence: 136 | allowed_values = evidence[target_node] 137 | 138 | # Filter and renormalize 139 | filtered_dist = {k: v for k, v in distribution.items() if k in allowed_values} 140 | 141 | # If no probability mass, use uniform distribution over allowed values 142 | if not filtered_dist or sum(filtered_dist.values()) <= 0: 143 | raise RestrictiveConstraints( 144 | f"Cannot generate fingerprint: No valid values for {target_node} with current conditions." 145 | ) 146 | 147 | # Renormalize 148 | total = sum(filtered_dist.values()) 149 | filtered_dist = {k: v / total for k, v in filtered_dist.items()} 150 | 151 | distribution = filtered_dist 152 | 153 | # Sample from the distribution 154 | if distribution: 155 | result[target_node] = self.sample_value_from_distribution(distribution) 156 | else: 157 | raise RestrictiveConstraints( 158 | f"Cannot generate fingerprint: Empty distribution for {target_node}." 159 | ) 160 | 161 | return result 162 | 163 | def validate_evidence(self, evidence: Mapping[str, Set[str]]) -> None: 164 | """ 165 | Validate that evidence is compatible with each other based on the 166 | Bayesian network structure. Raises an exception if they're incompatible. 167 | """ 168 | # Skip validation for single constraint 169 | if len(evidence) <= 1: 170 | return 171 | 172 | # For each constrained node, check if it's compatible with other fixed conditions 173 | for node_name, allowed_values in evidence.items(): 174 | # Get other fixed conditions (nodes with single values) 175 | fixed_constraints: Dict[str, Union[str, Set[str]]] = {} 176 | for k, v in evidence.items(): 177 | if k != node_name and len(v) == 1: 178 | fixed_constraints[k] = next(iter(v)) 179 | 180 | # If we have fixed conditions, check compatibility 181 | if fixed_constraints: 182 | # Use beam search with fixed conditions to check compatibility 183 | dist = self.trace(node_name, fixed_constraints) 184 | 185 | # If beam search returns a distribution 186 | if dist and all(dist.get(val, 0) <= 0 for val in allowed_values): 187 | # === BUILD THE EXCEPTION MESSAGE === 188 | # Show first 5 values failing node values. 189 | values_str = ", ".join(lookup_value_list(tuple(allowed_values)[:5])) 190 | if len(allowed_values) > 5: 191 | values_str += ", ..." 
192 | # Get the constraints 193 | constraints_values = lookup_value_list(fixed_constraints.values()) 194 | constraints_str = ", ".join( 195 | f"{k}={v}" for k, v in zip(fixed_constraints.keys(), constraints_values) 196 | ) 197 | raise RestrictiveConstraints( 198 | f"Cannot generate fingerprint: {node_name}=({values_str}) " 199 | f"is impossible with constraint: {constraints_str}" 200 | ) 201 | 202 | def get_all_ancestors(self, node_name: str) -> Set[str]: 203 | """ 204 | Get all ancestors of a node (nodes that can influence its value) 205 | """ 206 | if node_name in self.ancestors_by_name: 207 | return self.ancestors_by_name[node_name] 208 | 209 | node = self.nodes_by_name[node_name] 210 | ancestors: Set[str] = set() 211 | if not node: 212 | return ancestors 213 | 214 | # Add direct parents 215 | for parent in node.parent_names: 216 | ancestors.add(parent) 217 | # Add parent's ancestors recursively 218 | ancestors.update(self.get_all_ancestors(parent)) 219 | 220 | self.ancestors_by_name[node_name] = ancestors 221 | return ancestors 222 | 223 | def trace(self, target: str, evidence: Mapping[str, Union[str, Set[str]]]) -> Dict[str, float]: 224 | """ 225 | Calculate conditional probability distribution for target given evidence 226 | using beam search. 227 | """ 228 | # Get the actual target name and build relevant nodes set. 229 | target = self.nodes_by_name[target].name 230 | relevant_nodes = self.get_all_ancestors(target).copy() 231 | relevant_nodes.add(target) 232 | 233 | # Add evidence nodes and their ancestors. 234 | for ev_node in evidence: 235 | if ev_node in self.nodes_by_name: 236 | relevant_nodes.add(ev_node) 237 | relevant_nodes.update(self.get_all_ancestors(ev_node)) 238 | 239 | # Sort nodes by sampling order 240 | ordered_nodes = [ 241 | node for node in self.nodes_in_sampling_order if node.name in relevant_nodes 242 | ] 243 | 244 | # Initialize beam 245 | beam: List[Tuple[Dict[str, Any], float]] = [({}, 1.0)] 246 | # Local cache for conditional probability lookups 247 | cpt_cache: Dict[Tuple[str, Tuple[Any, ...]], Dict[Any, float]] = {} 248 | 249 | for node in ordered_nodes: 250 | new_beam = [] 251 | node_name = node.name 252 | 253 | # Determine allowed values from evidence if present 254 | allowed_values = evidence[node_name] if node_name in evidence else None 255 | 256 | # Process each assignment in the current beam 257 | for assignment, prob in beam: 258 | # Parent order is defined by node.parent_names 259 | try: 260 | parent_values_tuple = tuple(assignment[parent] for parent in node.parent_names) 261 | except KeyError: 262 | # Should not occur if assignments are built in order 263 | parent_values_tuple = () 264 | 265 | cache_key = (node_name, parent_values_tuple) 266 | if cache_key in cpt_cache: 267 | cpt = cpt_cache[cache_key] 268 | else: 269 | parent_values = {parent: assignment[parent] for parent in node.parent_names} 270 | cpt = node.get_probabilities_given_known_values(parent_values) 271 | # Use uniform distribution if missing 272 | if not cpt and node.possible_values: 273 | uniform_prob = 1.0 / len(node.possible_values) 274 | cpt = {val: uniform_prob for val in node.possible_values} 275 | 276 | # Expand the beam with new assignments 277 | for value, p in cpt.items(): 278 | if (allowed_values is None or value in allowed_values) and p > 0: 279 | # Create a new assignment with the new node value 280 | new_assignment = assignment.copy() 281 | new_assignment[node_name] = value 282 | new_beam.append((new_assignment, prob * p)) 283 | 284 | # Prune the beam if no valid 
configurations are left 285 | if new_beam: 286 | if len(new_beam) > BEAM_WIDTH: 287 | # Get the top BEAM_WIDTH assignments 288 | beam = heapq.nlargest(BEAM_WIDTH, new_beam, key=lambda x: x[1]) 289 | else: 290 | beam = new_beam 291 | else: 292 | return {} 293 | 294 | # Extract the target distribution 295 | target_dist: Dict[str, float] = {} 296 | total_prob = 0.0 297 | for assignment, prob in beam: 298 | if target in assignment: 299 | value = assignment[target] 300 | target_dist[value] = target_dist.get(value, 0) + prob 301 | total_prob += prob 302 | 303 | if total_prob > 0: 304 | return {val: p / total_prob for val, p in target_dist.items()} 305 | return {} 306 | 307 | def sample_value_from_distribution(self, distribution: Mapping[str, float]) -> str: 308 | """ 309 | Sample a value from a probability distribution 310 | """ 311 | anchor = random.random() # nosec 312 | cumulative_probability = 0.0 313 | for value, probability in distribution.items(): 314 | cumulative_probability += probability 315 | if anchor < cumulative_probability: 316 | return value 317 | # Fall back to first value 318 | return next(iter(distribution.keys())) 319 | 320 | def get_distribution_for_node( 321 | self, 322 | node: BayesianNode, 323 | sample: Mapping[str, Any], 324 | evidence: Optional[Dict[str, Set[str]]] = None, 325 | ) -> Dict[str, float]: 326 | """ 327 | Get the probability distribution for a node given the current sample 328 | """ 329 | # For multi-value conditions, use beam search 330 | if evidence and node.name in evidence and len(evidence[node.name]) > 1: 331 | # Current evidence is what we've sampled so far 332 | current_evidence = {k: v for k, v in sample.items()} 333 | 334 | # Calculate distribution using beam search 335 | distribution = self.trace(node.name, current_evidence) 336 | # Filter by allowed values and renormalize 337 | if node.name in evidence: 338 | allowed_values = evidence[node.name] 339 | filtered_dist = {k: v for k, v in distribution.items() if k in allowed_values} 340 | 341 | # If no probability mass, the conditions are impossible 342 | if not filtered_dist or sum(filtered_dist.values()) <= 0: 343 | raise RestrictiveConstraints( 344 | f"Cannot generate fingerprint: no valid values for {node.name} with current conditions" 345 | ) 346 | 347 | # Renormalize 348 | total = sum(filtered_dist.values()) 349 | filtered_dist = {k: v / total for k, v in filtered_dist.items()} 350 | return filtered_dist 351 | 352 | return distribution 353 | 354 | # For regular nodes, use direct sampling 355 | parent_values = {parent: sample[parent] for parent in node.parent_names} 356 | 357 | cpt = node.get_probabilities_given_known_values(parent_values) 358 | if not cpt and node.possible_values: 359 | # If missing probabilities, use uniform distribution 360 | uniform_prob = 1.0 / len(node.possible_values) 361 | cpt = {v: uniform_prob for v in node.possible_values} 362 | 363 | if not cpt: 364 | raise RestrictiveConstraints( 365 | f"Cannot generate fingerprint: no probability table for {node.name}" 366 | ) 367 | 368 | return cpt 369 | 370 | def get_shared_possibilities( 371 | self, 372 | value_possibilities: Mapping[str, Set[str]], 373 | seen_nodes: Optional[Set[Tuple[str, int]]] = None, 374 | orig_parents: Optional[Tuple[str, ...]] = None, 375 | ) -> Optional[Dict[str, Set[str]]]: 376 | """ 377 | Get shared possibilities across nodes based on conditions. 378 | Returns None if conditions are contradictory. 379 | 380 | This is deprecated as of v1.3.0 but still exposed for testing. 
381 | """ 382 | # Return empty dict immediately 383 | if not value_possibilities: 384 | return {} 385 | 386 | if seen_nodes is None: 387 | seen_nodes = set() 388 | 389 | # Propagate upward to find possible parent values 390 | all_parents = {node: set(values) for node, values in value_possibilities.items()} 391 | for node, values in value_possibilities.items(): 392 | # Track nodes we've processed 393 | if (node, len(values)) in seen_nodes: 394 | continue 395 | seen_nodes.add((node, len(values))) 396 | self._intersect_parents(node, values, all_parents) 397 | 398 | if orig_parents is None: 399 | orig_parents = tuple(all_parents.keys()) 400 | 401 | # If any parent has no valid values, conditions are contradictory 402 | if any(len(parents) == 0 for parents in all_parents.values()): 403 | return None 404 | 405 | return all_parents 406 | 407 | def _intersect_parents( 408 | self, node: str, values: Set[str], all_parents: Dict[str, Set[str]] 409 | ) -> None: 410 | """ 411 | Intersect possible parent values based on child node conditions 412 | """ 413 | node_obj = self.nodes_by_name.get(node) 414 | if not node_obj: 415 | return 416 | 417 | parent_names = node_obj.parent_names 418 | num_parents = len(parent_names) 419 | 420 | # No parents exist, nothing to do 421 | if not num_parents: 422 | return 423 | 424 | # Build a set of each parent's possible values 425 | parent_values: List[Set[str]] = [set() for _ in range(num_parents)] 426 | for value in values: 427 | collect_parents( 428 | node_obj.probabilities, 429 | value, 430 | parent_values=parent_values, 431 | ) 432 | 433 | # Update all_parents with the intersection of this node's parents 434 | for n, parents in enumerate(parent_values): 435 | parent_name = parent_names[n] 436 | if parent_name not in all_parents: 437 | all_parents[parent_name] = parents 438 | else: 439 | all_parents[parent_name] = all_parents[parent_name].intersection(parents) 440 | 441 | # Recurse to earlier parents if needed 442 | if parent_names and parent_names[0] != self.nodes_in_sampling_order[0].name: 443 | self._intersect_parents( 444 | node=parent_names[0], values=parent_values[0], all_parents=all_parents 445 | ) 446 | 447 | 448 | def collect_parents( 449 | probabilities: Mapping[str, Any], 450 | target: str, 451 | parent_values: List[Set[str]], 452 | so_far: Optional[List[str]] = None, 453 | depth: int = 0, 454 | ) -> None: 455 | """ 456 | Collects all the possible parent values of a node 457 | """ 458 | if so_far is None: 459 | so_far = [] 460 | for parent, values in probabilities.items(): 461 | if isinstance(values, dict): 462 | collect_parents( 463 | probabilities=values, 464 | target=target, 465 | parent_values=parent_values, 466 | so_far=so_far + [parent], 467 | depth=depth + 1, 468 | ) 469 | elif parent == target: 470 | for n, parent in enumerate(so_far): 471 | parent_values[n].add(parent) 472 | -------------------------------------------------------------------------------- /fpgen/exceptions.py: -------------------------------------------------------------------------------- 1 | class NetworkError(ValueError): 2 | """Error with the network""" 3 | 4 | 5 | class InvalidConstraints(NetworkError): 6 | """Raises when a constraint isn't possible""" 7 | 8 | 9 | class RestrictiveConstraints(InvalidConstraints): 10 | """Raises when the passed constraints are too restrictive""" 11 | 12 | 13 | class InvalidNode(NetworkError): 14 | """Raises when a node doesn't exist""" 15 | 16 | 17 | class NodePathError(InvalidNode): 18 | """Raises when a key path doesn't exist""" 19 | 20 | 21 | 
class MissingRelease(Exception): 22 | """Raised when a required GitHub release asset is missing.""" 23 | 24 | 25 | class CannotTraceLargeConfigSpace(ValueError): 26 | """Raises when the configuration space of a node is too large to trace with exact inference""" 27 | -------------------------------------------------------------------------------- /fpgen/generator.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Dict, List, Optional, Set, Union, overload 3 | 4 | from .bayesian_network import StrContainer 5 | from .exceptions import RestrictiveConstraints 6 | from .trace import TraceResult, TraceResultDict, trace 7 | from .utils import ( 8 | NETWORK, 9 | _assert_dict_xor_kwargs, 10 | _assert_network_exists, 11 | _find_roots, 12 | _make_output_dict, 13 | _maybe_flatten, 14 | _reassemble_targets, 15 | _tupilize, 16 | build_evidence, 17 | ) 18 | 19 | 20 | class Generator: 21 | """ 22 | Generates realistic browser fingerprints 23 | """ 24 | 25 | def __init__( 26 | self, 27 | conditions: Optional[Dict[str, Any]] = None, 28 | *, 29 | strict: bool = True, 30 | flatten: bool = False, 31 | **conditions_kwargs: Any, 32 | ): 33 | """ 34 | Initializes the Generator with the given options. 35 | Values passed to the Generator object will be inherited when calling Generator.generate() 36 | 37 | Parameters: 38 | conditions (dict, optional): Conditions for the generated fingerprint. 39 | strict (bool, optional): Whether to raise an exception if the conditions are too strict. 40 | flatten (bool, optional): Whether to flatten the output dictionary 41 | target (Optional[Union[str, StrContainer]]): Only generate specific value(s) 42 | **conditions_kwargs: Conditions for the generated fingerprint (passed as kwargs) 43 | """ 44 | _assert_dict_xor_kwargs(conditions, conditions_kwargs) 45 | # Set default options 46 | self.strict: bool = strict 47 | self.flatten: bool = flatten 48 | self.evidence: Dict[str, Set[str]] = {} 49 | 50 | if conditions_kwargs: 51 | conditions = conditions_kwargs 52 | if conditions: 53 | build_evidence(conditions, self.evidence) 54 | 55 | @overload 56 | def generate( 57 | self, 58 | conditions: Optional[Dict[str, Any]] = None, 59 | *, 60 | strict: Optional[bool] = None, 61 | flatten: Optional[bool] = None, 62 | target: str, 63 | **conditions_kwargs: Any, 64 | ) -> Any: ... 65 | 66 | @overload 67 | def generate( 68 | self, 69 | conditions: Optional[Dict[str, Any]] = None, 70 | *, 71 | strict: Optional[bool] = None, 72 | flatten: Optional[bool] = None, 73 | target: Optional[StrContainer] = None, 74 | **conditions_kwargs: Any, 75 | ) -> Dict[str, Any]: ... 76 | 77 | def generate( 78 | self, 79 | conditions: Optional[Dict[str, Any]] = None, 80 | *, 81 | strict: Optional[bool] = None, 82 | flatten: Optional[bool] = None, 83 | target: Optional[Union[str, StrContainer]] = None, 84 | **conditions_kwargs: Any, 85 | ) -> Dict[str, Any]: 86 | """ 87 | Generates a fingerprint. 88 | 89 | Parameters: 90 | conditions (dict, optional): Conditions for the generated fingerprints. 91 | These conditions will be inherited by generated fingerprints. 92 | strict (bool, optional): Whether to raise an exception if the conditions are too strict. 
93 | flatten (bool, optional): Whether to flatten the output dictionary 94 | target (Optional[Union[str, StrContainer]]): Only generate specific value(s) 95 | **conditions_kwargs: Conditions for the generated fingerprints (passed as kwargs) 96 | 97 | Returns: 98 | A generated fingerprint. 99 | """ 100 | _assert_dict_xor_kwargs(conditions, conditions_kwargs) 101 | _assert_network_exists() 102 | 103 | if conditions_kwargs: 104 | conditions = conditions_kwargs 105 | 106 | # Merge new options with old 107 | strict = _first(strict, self.strict) 108 | flatten = _first(flatten, self.flatten) 109 | 110 | # Inherit the evidence from the class instance 111 | evidence = self.evidence.copy() 112 | if conditions: 113 | build_evidence(conditions, evidence, strict=strict) 114 | 115 | # Convert targets to set 116 | if target: 117 | target_tup = _tupilize(target) 118 | target_roots = set(_find_roots(target_tup)) 119 | else: 120 | target_roots = None 121 | 122 | # Generate fingerprint 123 | while True: 124 | # If we only are searching for certain targets, call generate_certain_nodes 125 | if target_roots: 126 | fingerprint = NETWORK.generate_certain_nodes(evidence, target_roots) 127 | else: 128 | fingerprint = NETWORK.generate_consistent_sample(evidence) 129 | 130 | # Found the fingerprint 131 | if fingerprint is not None: 132 | break 133 | # Raise an error if the evidence are too strict 134 | if strict: 135 | raise RestrictiveConstraints( 136 | 'Cannot generate fingerprint. Constraints are too restrictive.' 137 | ) 138 | # If no fingerprint was generated, relax the filtered values until we find one 139 | evidence.pop(next(iter(evidence.keys()))) 140 | 141 | # If we arent searching for certain targets, we can return right away 142 | if target: 143 | output = _make_output_dict(fingerprint, flatten=False) # Don't flatten yet 144 | output = _reassemble_targets(_tupilize(target), output) 145 | if isinstance(target, str): 146 | output = output[target] 147 | return _maybe_flatten(flatten, output) 148 | 149 | return _make_output_dict(fingerprint, flatten=flatten) 150 | 151 | def generate_target( 152 | self, target: str, conditions: Optional[Dict[str, Any]] = None, **kwargs: Any 153 | ) -> Any: 154 | """ 155 | Generates a specific target. Shortcut for the `generate` method. 156 | """ 157 | return self.generate(target=target, conditions=conditions, **kwargs) 158 | 159 | @overload 160 | def trace( 161 | self, 162 | target: str, 163 | conditions: Optional[Dict[str, Any]] = None, 164 | *, 165 | flatten: bool = False, 166 | **conditions_kwargs: Any, 167 | ) -> List[TraceResult]: ... 168 | 169 | @overload 170 | def trace( 171 | self, 172 | target: StrContainer, 173 | conditions: Optional[Dict[str, Any]] = None, 174 | *, 175 | flatten: bool = False, 176 | **conditions_kwargs: Any, 177 | ) -> TraceResultDict: ... 178 | 179 | def trace( 180 | self, 181 | target: Union[str, StrContainer], 182 | conditions: Optional[Dict[str, Any]] = None, 183 | *, 184 | flatten: bool = False, 185 | **conditions_kwargs: Any, 186 | ) -> Union[List[TraceResult], TraceResultDict]: 187 | """ 188 | Compute the probability distribution(s) of a target variable given conditions. 189 | 190 | Parameters: 191 | target (str): The target variable name. 192 | conditions (Dict[str, Any], optional): A dictionary mapping variable names 193 | flatten (bool, optional): If True, return a flattened dictionary. 194 | **conditions_kwargs: Additional conditions to apply 195 | 196 | Returns: 197 | A dictionary mapping probabilities to the target's possible values. 
198 | """ 199 | return trace( 200 | target=target, 201 | flatten=flatten, 202 | conditions=conditions, 203 | **conditions_kwargs, 204 | # Inherit the conditions from the class instance 205 | __evidence__=self.evidence.copy(), 206 | ) 207 | 208 | 209 | def _first(*values): 210 | """ 211 | Simple function that returns the first non-None value passed 212 | """ 213 | return next((v for v in values if v is not None), None) 214 | 215 | 216 | """ 217 | A global `generate` function for those calling 218 | fpgen.generate() directly without creating a Generator object 219 | """ 220 | 221 | GLOBAL_GENERATOR: Optional[Generator] = None 222 | 223 | 224 | def generate(*args, **kwargs) -> Dict[str, Any]: 225 | """ 226 | Generates a fingerprint. 227 | 228 | Parameters: 229 | conditions (dict, optional): Conditions for the generated fingerprints. 230 | These conditions will be inherited by generated fingerprints. 231 | strict (bool, optional): Whether to raise an exception if the conditions are too strict. 232 | flatten (bool, optional): Whether to flatten the output dictionary 233 | target (Optional[Union[str, StrContainer]]): Only generate specific value(s) 234 | **conditions_kwargs: Conditions for the generated fingerprints (passed as kwargs) 235 | 236 | Returns: 237 | A generated fingerprint. 238 | """ 239 | global GLOBAL_GENERATOR 240 | if GLOBAL_GENERATOR is None: 241 | GLOBAL_GENERATOR = Generator() 242 | return GLOBAL_GENERATOR.generate(*args, **kwargs) 243 | 244 | 245 | def generate_target(target: str, conditions: Optional[Dict[str, Any]] = None, **kwargs) -> Any: 246 | """ 247 | Generates a specific target. Shortcut for the `generate` method. 248 | """ 249 | return generate(target=target, conditions=conditions, **kwargs) 250 | 251 | 252 | __all__ = ('Generator', 'WindowBounds', 'generate', 'generate_target') 253 | -------------------------------------------------------------------------------- /fpgen/pkgman.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import zipfile 4 | from datetime import datetime, timedelta 5 | from pathlib import Path 6 | from typing import Any, Dict 7 | 8 | import click 9 | import httpx 10 | import orjson 11 | import zstandard 12 | 13 | from .exceptions import MissingRelease 14 | 15 | # Model files 16 | DATA_DIR = Path(__file__).parent / 'data' 17 | 18 | NETWORK_FILE = DATA_DIR / "fingerprint-network.json" 19 | VALUES_JSON = DATA_DIR / 'values.json' 20 | VALUES_DATA = DATA_DIR / 'values.dat' 21 | 22 | # Mapping of files to their compressed variant 23 | FILE_PAIRS = { 24 | NETWORK_FILE: NETWORK_FILE.with_suffix('.json.zst'), 25 | VALUES_JSON: VALUES_JSON.with_suffix('.json.zst'), 26 | VALUES_DATA: VALUES_DATA.with_suffix('.dat.zst'), 27 | } 28 | 29 | # Repo to pull releases from 30 | GITHUB_REPO = 'scrapfly/fingerprint-generator' 31 | 32 | 33 | class ModelPuller: 34 | """ 35 | Pulls the model from GitHub and extracts it to the data directory. 36 | """ 37 | 38 | def __init__(self) -> None: 39 | self.api_url = f"https://api.github.com/repos/{GITHUB_REPO}/releases" 40 | 41 | def check_asset(self, asset: Dict) -> Any: 42 | """ 43 | Compare the asset to determine if it's the desired asset. 
44 | 45 | Args: 46 | asset: Asset information from GitHub API 47 | 48 | Returns: 49 | Any: Data to be returned if this is the desired asset, or None/False if not 50 | """ 51 | url = asset.get('browser_download_url') 52 | if url and url.endswith('.zip'): 53 | return url 54 | 55 | def missing_asset_error(self) -> None: 56 | """ 57 | Raise a MissingRelease exception if no release is found. 58 | """ 59 | raise MissingRelease(f"Could not find a release asset in {GITHUB_REPO}.") 60 | 61 | def get_asset(self) -> Any: 62 | """ 63 | Fetch the latest release from the GitHub API. 64 | Gets the first asset that returns a truthy value from check_asset. 65 | """ 66 | resp = httpx.get(self.api_url, timeout=20, verify=False) 67 | resp.raise_for_status() 68 | 69 | releases = resp.json() 70 | 71 | for release in releases: 72 | for asset in release['assets']: 73 | if data := self.check_asset(asset): 74 | return data 75 | 76 | self.missing_asset_error() 77 | 78 | def download(self): 79 | """ 80 | Download the model from GitHub and extract it to the data directory. 81 | """ 82 | # Pull form a custom source, or the GitHub API 83 | 84 | url = os.getenv('FPGEN_MODEL_URL') 85 | if url: 86 | click.echo(f"Fetching model files from {url}...") 87 | else: 88 | click.echo("Fetching model files from GitHub...") 89 | url = self.get_asset() 90 | 91 | # Optionally get the model password 92 | password = os.getenv('FPGEN_MODEL_PASSWORD') 93 | if password: 94 | password = password.encode() 95 | 96 | # Stream to tempfile then extract using zipfile 97 | with tempfile.NamedTemporaryFile(delete=False) as temp_file: 98 | with httpx.stream( 99 | 'GET', url, timeout=20, verify=False, follow_redirects=True 100 | ) as r: # nosec 101 | for chunk in r.iter_bytes(): 102 | temp_file.write(chunk) 103 | temp_file.flush() 104 | temp_file.close() 105 | # Print extraction message if running as module 106 | if __is_module__(): 107 | click.echo(f"Extracting to {DATA_DIR}...") 108 | with zipfile.ZipFile(temp_file.name) as z: 109 | z.extractall(DATA_DIR, pwd=password) 110 | 111 | os.unlink(temp_file.name) 112 | 113 | 114 | """ 115 | File helper 116 | """ 117 | 118 | 119 | def extract_json(path: Path) -> dict: 120 | """ 121 | Reads JSON from a file (or from a zst if needed). 
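    For example, extract_json(NETWORK_FILE) falls back to the adjacent
    fingerprint-network.json.zst when the decompressed .json file is absent.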
122 | """ 123 | # Check for uncompressed json 124 | if path.exists(): 125 | with open(path, 'rb') as f: 126 | return orjson.loads(f.read()) 127 | 128 | # Check for zst json 129 | elif (zst_path := path.with_suffix('.json.zst')).exists(): 130 | with open(zst_path, 'rb') as f: 131 | decomp = zstandard.ZstdDecompressor() 132 | return orjson.loads(decomp.decompress(f.read())) 133 | 134 | raise FileNotFoundError(f'Missing required data file for: {path}') 135 | 136 | 137 | """ 138 | Model file utility functions 139 | """ 140 | 141 | 142 | def download_model(): 143 | """ 144 | Call the model puller to download files 145 | """ 146 | ModelPuller().download() 147 | 148 | 149 | def decompress_model(): 150 | """ 151 | Decompress model files 152 | """ 153 | import zstandard 154 | 155 | dctx = zstandard.ZstdDecompressor() 156 | for src_zst, dst in {v: k for k, v in FILE_PAIRS.items()}.items(): 157 | if not src_zst.exists(): 158 | click.echo(f"Warning: {src_zst} not found, skipping") 159 | continue 160 | 161 | click.echo(f"Decompressing {src_zst} -> {dst}") 162 | with open(src_zst, 'rb') as src, open(dst, 'wb') as dst_f: 163 | dctx.copy_stream(src, dst_f) 164 | src_zst.unlink() 165 | 166 | 167 | def recompress_model(): 168 | """ 169 | Recompress model files after running decompress 170 | """ 171 | import zstandard 172 | 173 | cctx = zstandard.ZstdCompressor(level=19) 174 | for src, dst_zst in FILE_PAIRS.items(): 175 | if not src.exists(): 176 | click.echo(f"Warning: {src} not found, skipping") 177 | continue 178 | 179 | click.echo(f"Compressing {src} -> {dst_zst}") 180 | with open(src, 'rb') as src_f: 181 | data = src_f.read() 182 | compressed = cctx.compress(data) 183 | with open(dst_zst, 'wb') as dst: 184 | dst.write(compressed) 185 | src.unlink() 186 | 187 | 188 | def remove_model(log=True): 189 | """ 190 | Remove all model files 191 | """ 192 | for file_pair in FILE_PAIRS.items(): 193 | found = False 194 | for file in file_pair: 195 | if not file.exists(): 196 | continue 197 | if log: 198 | click.echo(f"Removing {file}") 199 | file.unlink() 200 | found = True 201 | return found 202 | 203 | 204 | def files_are_recent(file_list): 205 | """ 206 | Checks if all passed files are <5 weeks old 207 | """ 208 | cutoff = datetime.now() - timedelta(weeks=5) 209 | return all(datetime.fromtimestamp(f.stat().st_mtime) >= cutoff for f in file_list) 210 | 211 | 212 | def assert_downloaded(): 213 | """ 214 | Checks if the model files are downloaded 215 | """ 216 | if __is_module__(): 217 | return # Skip if running as a module 218 | 219 | # Check decompressed files (FILE_PAIRS keys) 220 | if all(file.exists() for file in FILE_PAIRS.keys()): 221 | # When updating decompressed files, decompress again after redownloading 222 | if not files_are_recent(FILE_PAIRS.keys()): 223 | ModelPuller().download() 224 | decompress_model() 225 | return 226 | 227 | # Check compressed files (FILE_PAIRS values) 228 | if all(file.exists() for file in FILE_PAIRS.values()) and files_are_recent(FILE_PAIRS.values()): 229 | return 230 | 231 | # First time importing 232 | ModelPuller().download() 233 | 234 | 235 | def __is_module__() -> bool: 236 | """ 237 | Checks if fpgen is being ran as a module 238 | """ 239 | return bool(os.getenv('FPGEN_NO_INIT')) 240 | 241 | 242 | # Check model files are downloaded 243 | assert_downloaded() 244 | -------------------------------------------------------------------------------- /fpgen/py.typed: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/scrapfly/fingerprint-generator/fc7e489687821f691bae9308d73e294497d015fb/fpgen/py.typed -------------------------------------------------------------------------------- /fpgen/structs.py: -------------------------------------------------------------------------------- 1 | # https://stackoverflow.com/a/32888599 2 | class CaseInsensitiveDict(dict): 3 | @classmethod 4 | def _k(cls, key): 5 | return key.casefold() if isinstance(key, str) else key 6 | 7 | def __init__(self, *args, **kwargs): 8 | super(CaseInsensitiveDict, self).__init__(*args, **kwargs) 9 | self._convert_keys() 10 | 11 | def __getitem__(self, key): 12 | return super(CaseInsensitiveDict, self).__getitem__(self.__class__._k(key)) 13 | 14 | def __setitem__(self, key, value): 15 | super(CaseInsensitiveDict, self).__setitem__(self.__class__._k(key), value) 16 | 17 | def __delitem__(self, key): 18 | return super(CaseInsensitiveDict, self).__delitem__(self.__class__._k(key)) 19 | 20 | def __contains__(self, key): 21 | return super(CaseInsensitiveDict, self).__contains__(self.__class__._k(key)) 22 | 23 | def has_key(self, key): 24 | return super(CaseInsensitiveDict, self).has_key(self.__class__._k(key)) 25 | 26 | def pop(self, key, *args, **kwargs): 27 | return super(CaseInsensitiveDict, self).pop(self.__class__._k(key), *args, **kwargs) 28 | 29 | def get(self, key, *args, **kwargs): 30 | return super(CaseInsensitiveDict, self).get(self.__class__._k(key), *args, **kwargs) 31 | 32 | def setdefault(self, key, *args, **kwargs): 33 | return super(CaseInsensitiveDict, self).setdefault(self.__class__._k(key), *args, **kwargs) 34 | 35 | def update(self, E={}, **F): 36 | super(CaseInsensitiveDict, self).update(self.__class__(E)) 37 | super(CaseInsensitiveDict, self).update(self.__class__(**F)) 38 | 39 | def _convert_keys(self): 40 | for k in list(self.keys()): 41 | v = super(CaseInsensitiveDict, self).pop(k) 42 | self.__setitem__(k, v) 43 | -------------------------------------------------------------------------------- /fpgen/trace.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Dict, List, Optional, Set, Union, overload 3 | 4 | import orjson 5 | 6 | from .bayesian_network import StrContainer 7 | from .exceptions import RestrictiveConstraints 8 | from .unpacker import lookup_value_list 9 | from .utils import ( 10 | NETWORK, 11 | _assert_dict_xor_kwargs, 12 | _find_roots, 13 | _tupilize, 14 | build_evidence, 15 | ) 16 | 17 | 18 | @dataclass 19 | class TraceResult: 20 | value: Any 21 | probability: float 22 | 23 | def __repr__(self) -> str: 24 | return f"<{self.value}: {self.probability * 100:.5f}%>" 25 | 26 | 27 | # Recursive type for the return value 28 | TraceResultDict = Dict[str, Union[List[TraceResult], "TraceResultDict"]] 29 | 30 | 31 | @overload 32 | def trace( 33 | target: str, 34 | conditions: Optional[Dict[str, Any]] = None, 35 | *, 36 | flatten: bool = False, 37 | **conditions_kwargs, 38 | ) -> List[TraceResult]: ... 39 | 40 | 41 | @overload 42 | def trace( 43 | target: StrContainer, 44 | conditions: Optional[Dict[str, Any]] = None, 45 | *, 46 | flatten: bool = False, 47 | **conditions_kwargs, 48 | ) -> TraceResultDict: ... 
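# A minimal usage sketch for the overloads above. The node names are taken from
# the test suite; the exact values depend on the downloaded model, and the dotted
# target assumes 'navigator.language' is a root node (as exercised in tests/speed_test.py):
#
#   trace('browser', os='ChromeOS')                     # -> List[TraceResult]
#   trace(('navigator.language', 'os'))                 # -> {'navigator': {'language': [...]}, 'os': [...]}
#   trace(('navigator.language', 'os'), flatten=True)   # -> {'navigator.language': [...], 'os': [...]}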
49 |
50 |
51 | def trace(
52 | target: Union[str, StrContainer],
53 | conditions: Optional[Dict[str, Any]] = None,
54 | *,
55 | flatten: bool = False,
56 | __evidence__: Optional[Dict[str, Set[str]]] = None,
57 | **conditions_kwargs,
58 | ) -> Union[List[TraceResult], TraceResultDict]:
59 | """
60 | Compute the probability distribution(s) of a target variable given conditions.
61 |
62 | Parameters:
63 | target (str): The target variable name (or a container of names).
64 | conditions (Dict[str, Any], optional): A dictionary mapping variable names to allowed value(s).
65 | flatten (bool, optional): If True, return a flattened dictionary.
66 | **conditions_kwargs: Additional conditions to apply
67 |
68 | Returns:
69 | A list of TraceResult objects for a single target, or a dictionary of such lists keyed by target when multiple targets are passed.
70 | """
71 | _assert_dict_xor_kwargs(conditions, conditions_kwargs)
72 |
73 | # If evidence was already passed, consume it
74 | evidence: Dict[str, Set[str]] = __evidence__ or {}
75 |
76 | # Build conditions
77 | if conditions_kwargs:
78 | conditions = conditions_kwargs
79 | if conditions:
80 | build_evidence(conditions, evidence)
81 |
82 | # Get the targets
83 | target_tup = _tupilize(target)
84 | target_roots = tuple(_find_roots(target_tup))
85 |
86 | # List is empty, raise an error
87 | if not target_tup:
88 | raise ValueError("Please pass at least one valid target.")
89 |
90 | # If there is only one target, return the result
91 | if len(target_roots) == 1:
92 | return _pull_target(target_roots[0], evidence)
93 |
94 | # If flatten is true, return a dictionary of targets
95 | if flatten:
96 | return {root: _pull_target(root, evidence) for root in target_roots}
97 |
98 | # Otherwise, return an expanded dictionary of targets
99 | output: Dict[str, Any] = {}
100 | for root in target_roots:
101 | parts = root.split(".")
102 | d = output
103 | for part in parts[:-1]:
104 | if part not in d:
105 | d[part] = {}
106 | d = d[part]
107 | d[parts[-1]] = _pull_target(root, evidence)
108 | return output
109 |
110 |
111 | def _pull_target(target: str, evidence: Dict[str, Set[str]]) -> List[TraceResult]:
112 | """
113 | Gets the probability distribution for a target variable given conditions.
114 | """
115 | possibilities = NETWORK.trace(target=target, evidence=evidence)
116 | if not possibilities:
117 | raise RestrictiveConstraints(
118 | f"Constraints are too restrictive. No possible values for {target}."
119 | ) 120 | data = lookup_value_list(possibilities.keys()) 121 | data = map(orjson.loads, data) 122 | probs = possibilities.values() 123 | resp = [ 124 | TraceResult(value=value, probability=probability) for value, probability in zip(data, probs) 125 | ] 126 | resp.sort(key=lambda x: x.probability, reverse=True) 127 | return resp 128 | -------------------------------------------------------------------------------- /fpgen/unpacker.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from typing import List, Tuple 3 | 4 | import numpy as np 5 | from indexed_zstd import IndexedZstdFile 6 | 7 | from .pkgman import VALUES_DATA, VALUES_JSON, __is_module__, extract_json 8 | 9 | 10 | def load_values_json() -> List[Tuple[str, List[str]]]: 11 | values_json = extract_json(VALUES_JSON) 12 | return list(values_json.items()) 13 | 14 | 15 | if not __is_module__(): 16 | # Do not attempt to load values.json if we are running as a module 17 | VALUE_PAIRS = load_values_json() 18 | 19 | 20 | def base85_to_int(s: str) -> int: 21 | # Decode using base85 22 | decoded_bytes = base64.b85decode(s) 23 | # Convert bytes to integer 24 | return int.from_bytes(decoded_bytes, byteorder='big') 25 | 26 | 27 | def get_dat_file(): 28 | """ 29 | Returns a seekable file descriptor (or indexed zst file) 30 | """ 31 | if VALUES_DATA.exists(): 32 | return open(VALUES_DATA, 'rb') 33 | elif (zst_path := VALUES_DATA.with_suffix('.dat.zst')).exists(): 34 | return IndexedZstdFile(str(zst_path)) 35 | 36 | raise FileNotFoundError(f'Missing required file: {VALUES_DATA}') 37 | 38 | 39 | def lookup_value(index): 40 | offset, length = VALUE_PAIRS[base85_to_int(index)] 41 | file = get_dat_file() 42 | file.seek(int(offset, 16)) 43 | data = file.read(length).decode('utf-8') 44 | file.close() 45 | return data 46 | 47 | 48 | def lookup_value_list(index_list): 49 | """ 50 | Returns a list of values from the data file given a list of lookup values 51 | """ 52 | # Empty numpy array of len(index_list) 53 | value_map = np.empty(len(index_list), dtype=object) 54 | 55 | file = get_dat_file() 56 | # Read in order from lowest index to highest 57 | sorted_indices = sorted( 58 | (base85_to_int(lookup_index), n) for n, lookup_index in enumerate(index_list) 59 | ) 60 | 61 | for index, n in sorted_indices: 62 | offset, length = VALUE_PAIRS[index] 63 | file.seek(int(offset, 16)) 64 | # Set to key in order of the original list 65 | value_map[n] = file.read(length).decode('utf-8') 66 | 67 | file.close() 68 | return value_map 69 | -------------------------------------------------------------------------------- /fpgen/utils.py: -------------------------------------------------------------------------------- 1 | from collections.abc import MutableMapping 2 | from typing import ( 3 | Any, 4 | Dict, 5 | Iterable, 6 | Iterator, 7 | List, 8 | Literal, 9 | Mapping, 10 | Optional, 11 | Set, 12 | Tuple, 13 | Union, 14 | overload, 15 | ) 16 | 17 | import orjson 18 | 19 | from .bayesian_network import BayesianNetwork, StrContainer 20 | from .exceptions import ( 21 | InvalidConstraints, 22 | InvalidNode, 23 | NetworkError, 24 | NodePathError, 25 | RestrictiveConstraints, 26 | ) 27 | from .pkgman import NETWORK_FILE, __is_module__ 28 | from .structs import CaseInsensitiveDict 29 | from .unpacker import lookup_value_list 30 | 31 | # Load the network. 
(unless we're running as a module) 32 | NETWORK: BayesianNetwork 33 | if __is_module__(): 34 | NETWORK = None # type: ignore 35 | else: 36 | NETWORK = BayesianNetwork(NETWORK_FILE) 37 | 38 | 39 | def _assert_network_exists() -> None: 40 | """ 41 | Quick helper to confirm NETWORK is defined 42 | """ 43 | if NETWORK is None: 44 | raise NetworkError("FPGEN_NO_INIT is enabled. Cannot access the network.") 45 | 46 | 47 | def query( 48 | target: str, *, flatten: bool = False, sort: bool = False 49 | ) -> Union[Dict[str, Any], List[Any]]: 50 | """ 51 | Query a list of possibilities given a target. 52 | 53 | Parameters: 54 | target (str): Target node to query possible values for 55 | flatten (bool, optional): Whether to flatten the output dictionary 56 | sort (bool, optional): Whether to sort the output arrays 57 | """ 58 | _assert_network_exists() 59 | 60 | # Check node list first 61 | values = _lookup_possibilities(target, casefold=False) 62 | if values: 63 | output: Union[Tuple, map] 64 | output = tuple(map(orjson.loads, values)) 65 | # Merge dicts if data is all dicts, else just return a deduped list 66 | if all(isinstance(d, dict) for d in output): 67 | # Flatten the output dict before returning if needed 68 | return _maybe_flatten(flatten, _merge_dicts(output, sort=sort)) 69 | else: 70 | # Dedupe the list 71 | return _dedupe(output, sort=sort) 72 | 73 | # Target is within a node. Need to look up the tree 74 | nested_keys: List[str] = [] 75 | root_data = _lookup_root_possibilities( 76 | target, nested_keys=nested_keys, none_if_missing=True, casefold=False 77 | ) 78 | if root_data is not None: 79 | # Read possibile values as jsons 80 | output = map(orjson.loads, root_data[1]) 81 | # Pull the item at the target path 82 | output = map(lambda d: _at_path(d, nested_keys), output) 83 | output = tuple(output) 84 | 85 | # If they are all dicts, merge them 86 | if all(isinstance(d, dict) for d in output): 87 | # Flatten the output dict if needed 88 | return _maybe_flatten(flatten, _merge_dicts(output, sort=sort)) 89 | 90 | # Return a deduped list 91 | return _dedupe(output, sort=sort) 92 | 93 | # Search down the tree 94 | data = _search_downward(target) 95 | resp: Dict[str, List[Any]] = { 96 | # Remove the current node path 97 | key.removeprefix(f'{target}.'): [ 98 | # Parse each possible value via orjson 99 | orjson.loads(d) 100 | for d in (_lookup_possibilities(key, casefold=False) or tuple()) 101 | ] 102 | for key in data 103 | } 104 | if flatten: 105 | # May need to flatten further 106 | return _flatten({node: _dedupe(values, sort=sort) for node, values in resp.items()}) 107 | return _unflatten(resp, sort=sort) 108 | 109 | 110 | """ 111 | Helper functions for searching for nodes up/down the network 112 | """ 113 | 114 | 115 | def _at_path(data: Mapping, path: StrContainer, *, casefold=False) -> Any: 116 | """ 117 | Gets the value in nested dictionary given its path 118 | """ 119 | for key in path: 120 | if casefold: 121 | data = CaseInsensitiveDict(data) 122 | if not isinstance(data, MutableMapping) or key not in data: 123 | raise NodePathError(key) 124 | data = data[key] 125 | return data 126 | 127 | 128 | @overload 129 | def _lookup_root_possibilities( 130 | key: str, 131 | nested_keys: Optional[List[str]] = None, 132 | casefold: bool = True, 133 | none_if_missing: Literal[False] = False, 134 | ) -> Tuple[str, Dict[str, Any]]: ... 
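# The paired overloads (above and below) differ only in `none_if_missing`: with the
# default False a missing node raises InvalidNode, with True the function returns
# None, letting type checkers narrow away the Optional. Illustrative call (node
# name taken from the test suite): _lookup_root_possibilities('screen.width')
# walks up to the 'screen' node and returns ('screen', {value: lookup_index, ...}).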
135 | 136 | 137 | @overload 138 | def _lookup_root_possibilities( 139 | key: str, 140 | nested_keys: Optional[List[str]] = None, 141 | casefold: bool = True, 142 | none_if_missing: Literal[True] = True, 143 | ) -> Optional[Tuple[str, Dict[str, Any]]]: ... 144 | 145 | 146 | def _lookup_root_possibilities( 147 | key: str, 148 | nested_keys: Optional[List[str]] = None, 149 | casefold: bool = True, 150 | none_if_missing: bool = False, 151 | ) -> Optional[Tuple[str, Dict[str, Any]]]: 152 | """ 153 | Finds the first avaliable root node of a given key, and queries its possibilities 154 | """ 155 | if not key: 156 | raise InvalidNode('Key cannot be empty.') 157 | while key: 158 | keys = key.rsplit('.', 1) 159 | # Ran out of keys to parse 160 | if len(keys) != 2: 161 | if none_if_missing: 162 | return None 163 | raise InvalidNode(f'{key} is not a valid node') 164 | key, sliced_key = keys 165 | 166 | if nested_keys is not None: 167 | nested_keys.append(sliced_key) 168 | 169 | # if a nested key is avaliable, enter it 170 | possible_values = _lookup_possibilities(key, casefold) 171 | # iterate backwards until we find the node 172 | if possible_values is not None: 173 | break 174 | 175 | if possible_values is None: 176 | if none_if_missing: 177 | return None 178 | raise InvalidNode(f'{key} is not a valid node') 179 | 180 | if nested_keys: 181 | nested_keys.reverse() 182 | 183 | return key, possible_values 184 | 185 | 186 | def _lookup_possibilities(node_name: str, casefold: bool = True) -> Optional[Dict[str, Any]]: 187 | """ 188 | Returns the possible values for the given node name. 189 | Returns as a dictionary {value: lookup_index} 190 | """ 191 | if node_name not in NETWORK.nodes_by_name: 192 | return None 193 | 194 | lookup_values = NETWORK.nodes_by_name[node_name].possible_values 195 | actual_values = lookup_value_list(lookup_values) 196 | 197 | return { 198 | (actual.casefold() if casefold else actual): lookup 199 | for actual, lookup in zip(actual_values, lookup_values) 200 | } 201 | 202 | 203 | def _search_downward(domain: str) -> Iterable[str]: 204 | """ 205 | Searches for all nodes that begin with a specific key 206 | """ 207 | found = False 208 | for i, node in enumerate(NETWORK.nodes_by_name.keys()): 209 | if not node.startswith(domain): 210 | continue 211 | # Check if its a . afterward 212 | key_len = len(domain) 213 | if len(node) > key_len and node[key_len] != '.': 214 | continue 215 | if not found: 216 | found = True 217 | # Get the original case 218 | yield NETWORK.node_names[i] 219 | 220 | if not found: 221 | raise InvalidNode(f'Unknown node: "{domain}"') 222 | 223 | 224 | def _find_roots(targets: Union[str, StrContainer]) -> Iterator[str]: 225 | """ 226 | Given a list of targets, return all nodes that make up that target's data 227 | """ 228 | for target in targets: 229 | target = target.casefold() 230 | while True: 231 | # Found a valid target 232 | if target in NETWORK.nodes_by_name: 233 | yield target 234 | break 235 | 236 | keys = target.rsplit('.', 1) 237 | if len(keys) > 1: 238 | # Move target back 1 239 | target = keys[0] 240 | continue 241 | 242 | # We are at the root key. 
243 | # Find potential keys before quitting 244 | yield from _search_downward(keys[0]) 245 | break 246 | 247 | 248 | def _reassemble_targets(targets: StrContainer, fingerprint: Dict[str, Any]): 249 | result = {} 250 | for target in targets: 251 | try: 252 | data = _at_path(fingerprint, target.split('.'), casefold=True) 253 | except NodePathError as key: 254 | raise InvalidNode(f"'{target}' is not a valid key path (missing {key}).") 255 | result[target] = data 256 | return result 257 | 258 | 259 | """ 260 | Miscellaneous python list/dict helpers 261 | """ 262 | 263 | 264 | def _dedupe(lst: Iterable[Any], sort: bool) -> List[Any]: 265 | """ 266 | Group items by their type, deduping each group 267 | """ 268 | groups: Dict[type, Any] = {} 269 | for item in lst: 270 | t = type(item) 271 | if t not in groups: 272 | groups[t] = [] 273 | # Only add item if it's not already in its type group 274 | if item not in groups[t]: 275 | groups[t].append(item) 276 | 277 | result = [] 278 | # Process groups in order sorted by type name 279 | for t in sorted(groups.keys(), key=lambda typ: typ.__name__): 280 | items = groups[t] 281 | # Do not sort if `sort` is False, or if type is unhashable 282 | if not sort or t in (list, dict): 283 | result.extend(items) 284 | else: 285 | result.extend(sorted(items)) 286 | return result 287 | 288 | 289 | def _unflatten(dictionary, sort: bool) -> Dict[str, Any]: 290 | """ 291 | Unflatten dicts and dedupe any nested lists 292 | """ 293 | result_dict: Dict[str, Any] = dict() 294 | for key, value in dictionary.items(): 295 | parts = key.split(".") 296 | d = result_dict 297 | for part in parts[:-1]: 298 | if part not in d: 299 | d[part] = dict() 300 | d = d[part] 301 | # Dedupe lists 302 | if isinstance(value, list): 303 | value = _dedupe(value, sort=sort) 304 | d[parts[-1]] = value 305 | return result_dict 306 | 307 | 308 | def _flatten(dictionary: Dict[str, Any], parent_key=False) -> Dict[str, Any]: 309 | """ 310 | Turn a nested dictionary into a flattened dictionary 311 | https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys 312 | """ 313 | items: List[Tuple[str, Any]] = [] 314 | for key, value in dictionary.items(): 315 | new_key = str(parent_key) + '.' + key if parent_key else key 316 | if isinstance(value, dict): 317 | items.extend(_flatten(value, new_key).items()) 318 | else: 319 | items.append((new_key, value)) 320 | return dict(items) 321 | 322 | 323 | def _maybe_flatten(flatten: Optional[bool], data): 324 | if not isinstance(data, dict): 325 | return data 326 | if flatten: 327 | return _flatten(data) 328 | return data 329 | 330 | 331 | def _merge_dicts(dict_list: Iterable[Dict[str, Any]], sort: bool) -> Dict[str, Any]: 332 | """ 333 | Merge items in a list of dicts into a dict of merged values. 334 | For a given key, if all values are dicts, merge them recursively. 335 | If all values are lists, flatten them into a single list and dedupe. 336 | Otherwise, dedupe the list of values. 337 | """ 338 | if not dict_list: 339 | return {} 340 | 341 | merged: Dict[str, Any] = {} 342 | # Get the union of keys from all dictionaries. 
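    # Illustrative behavior with made-up inputs (not real node data):
    #   _merge_dicts([{'a': {'x': 1}}, {'a': {'x': 2}}, {'b': [1, 1]}], sort=False)
    #   -> {'a': {'x': [1, 2]}, 'b': [1]}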
343 | all_keys: Set[str] = set() 344 | for d in dict_list: 345 | all_keys.update(d.keys()) 346 | 347 | for key in all_keys: 348 | # Get the list of values for the current key, skipping dicts that don't have it 349 | values = [d[key] for d in dict_list if key in d] 350 | 351 | if all(isinstance(v, dict) for v in values): 352 | # Merge dictionaries recursively 353 | merged[key] = _merge_dicts(values, sort=sort) 354 | elif all(isinstance(v, list) for v in values): 355 | # Merge lists 356 | merged_list = [] 357 | for lst in values: 358 | merged_list.extend(lst) 359 | merged[key] = _dedupe(merged_list, sort=sort) 360 | else: 361 | # For mixed/scalar values, dedupe 362 | merged[key] = _dedupe(values, sort=sort) 363 | 364 | return merged 365 | 366 | 367 | def _tupilize(value) -> Union[List[str], Tuple[str, ...]]: 368 | """ 369 | If a value is not a tuple or list, wrap it in a tuple 370 | """ 371 | return value if isinstance(value, (tuple, list)) else (value,) 372 | 373 | 374 | """ 375 | Parse user input 376 | """ 377 | 378 | 379 | def _flatten_conditions( 380 | dictionary: Mapping[str, Any], parent_key: str = '', casefold: bool = False 381 | ) -> Dict[str, Any]: 382 | """ 383 | Flattens the passed list of conditions 384 | """ 385 | # Original flattening logic from here: 386 | # https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys 387 | items: List[Tuple[str, Any]] = [] 388 | for key, value in dictionary.items(): 389 | new_key = parent_key + '.' + key if parent_key else key 390 | if isinstance(value, MutableMapping): 391 | items.extend(_flatten_conditions(value, new_key).items()) 392 | else: 393 | # If we have a tuple or set, treat it as an array of possible values 394 | if isinstance(value, (set, tuple)): 395 | value = tuple(orjson.dumps(v).decode() for v in value) 396 | # If we have a function, don't flatten it 397 | elif not callable(value): 398 | value = orjson.dumps(value).decode() 399 | if casefold: 400 | new_key = new_key.casefold() 401 | items.append((new_key, value)) 402 | return dict(items) 403 | 404 | 405 | def build_evidence( 406 | conditions: Dict[str, Any], evidence: Dict[str, Set[str]], strict: Optional[bool] = None 407 | ) -> None: 408 | """ 409 | Builds evidence based on the user's inputted conditions 410 | """ 411 | if strict is None: 412 | strict = True 413 | 414 | # Flatten to match the format of the fingerprint network 415 | conditions = _flatten_conditions(conditions, casefold=True) 416 | 417 | for key, value in conditions.items(): 418 | possible_values = _lookup_possibilities(key) 419 | 420 | # Handle nested keys 421 | nested_keys: List[str] = [] 422 | if possible_values is None: 423 | key, possible_values = _lookup_root_possibilities(key, nested_keys) 424 | # Get the real name for the key 425 | key = NETWORK.nodes_by_name[key].name 426 | 427 | evidence[key] = set() 428 | 429 | for value_con in _tupilize(value): 430 | # Read the passed value 431 | if callable(value_con): 432 | val = value_con # Callable 433 | else: 434 | val = orjson.loads(value_con.casefold()) # Dict/list/str data 435 | 436 | # Handle nested keys by filtering out possible values that dont 437 | # match the value at the target 438 | if nested_keys: 439 | nested_keys = list(map(lambda s: s.casefold(), nested_keys)) 440 | for poss_value, lookup_index in possible_values.items(): 441 | # Parse the dictionary 442 | outputted_possible = orjson.loads(poss_value) 443 | 444 | # Check if the value is a possible value at the nested path 445 | try: 446 | target_value = 
_at_path(outputted_possible, nested_keys) 447 | except NodePathError: 448 | continue # Path didn't exist, bad data 449 | if callable(val) and val(target_value): 450 | evidence[key].add(lookup_index) 451 | elif target_value == val: 452 | evidence[key].add(lookup_index) 453 | 454 | # If nothing was found, raise an error 455 | if not evidence[key]: 456 | if callable(val): 457 | # Callable didnt work 458 | raise InvalidConstraints( 459 | f'The passed function ({val}) yielded no possible values for "{key}" ' 460 | f'at "{".".join(nested_keys)}"' 461 | ) 462 | raise InvalidConstraints( 463 | f'{value_con} is not a possible value for "{key}" ' 464 | f'at "{".".join(nested_keys)}"' 465 | ) 466 | continue 467 | 468 | # ===== NON NESTED VALUE HANDLING ===== 469 | 470 | # If callable, get all possible values then check for matches 471 | if callable(val): 472 | # Filter by val(x) 473 | found = False 474 | for possible_val, lookup_index in possible_values.items(): 475 | if val(orjson.loads(possible_val)): 476 | evidence[key].add(lookup_index) 477 | found = True 478 | if not found: 479 | raise InvalidConstraints( 480 | f'The passed function ({val}) yielded no possible values for "{key}"' 481 | ) 482 | continue 483 | 484 | # Non nested values can be handled by directly checking possible_values 485 | lookup_index = possible_values.get(value_con.casefold()) 486 | # Value is not possible 487 | if lookup_index is None: 488 | raise InvalidConstraints(f'{value_con} is not a possible value for "{key}"') 489 | evidence[key].add(lookup_index) 490 | 491 | # Validate the evidence is possible (or try to relax the evidence if strict is False) 492 | while True: 493 | try: 494 | NETWORK.validate_evidence(evidence) 495 | except RestrictiveConstraints as e: 496 | if strict: 497 | raise e 498 | # Remove the last added key 499 | evidence.pop(next(iter(evidence.keys()))) 500 | break 501 | 502 | 503 | def _assert_dict_xor_kwargs( 504 | passed_dict: Optional[Dict[str, Any]], passed_kwargs: Optional[Dict[str, Any]] 505 | ) -> None: 506 | """ 507 | Confirms a dict is either passed as an argument, xor kwargs are passed. 508 | """ 509 | # Exit if neither is passed 510 | if passed_dict is None and passed_kwargs is None: 511 | return 512 | # Exit if both are passed 513 | if passed_dict and passed_kwargs: 514 | raise ValueError( 515 | f"Cannot pass values as dict & as parameters: {passed_dict} and {passed_kwargs}" 516 | ) 517 | # Raise if incorrect type 518 | if not isinstance(passed_dict or passed_kwargs, dict): 519 | raise ValueError( 520 | "Invalid argument. Constraints must be passed as kwargs or as a dictionary." 
521 | ) 522 | 523 | 524 | """ 525 | Convert network output to human readable output 526 | """ 527 | 528 | 529 | def _make_output_dict(data: Dict[str, Any], flatten: Optional[bool]) -> Dict[str, Any]: 530 | """ 531 | Unflattens & builds the output dictionary 532 | """ 533 | if flatten: 534 | # Get key value pairs directly without building structure 535 | values = lookup_value_list(data.values()) 536 | for key, value in zip(data.keys(), values): 537 | data[key] = orjson.loads(value) 538 | # Flatten node values that themselves are dicts 539 | return _flatten(data) 540 | 541 | # Original unflattening logic from here: 542 | # https://stackoverflow.com/questions/6037503/python-unflatten-dict 543 | result_dict: Dict[str, Any] = dict() 544 | for key, value in zip(data.keys(), lookup_value_list(data.values())): 545 | parts = key.split(".") 546 | d = result_dict 547 | for part in parts[:-1]: 548 | if part not in d: 549 | d[part] = dict() 550 | d = d[part] 551 | d[parts[-1]] = orjson.loads(value) 552 | 553 | return result_dict 554 | 555 | 556 | # Only expose `query` publicly 557 | __all__ = ('query',) 558 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core>=1.0.0"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "fpgen" 7 | version = "1.3.0" 8 | description = "A fast & comprehensive browser fingerprint generator" 9 | authors = ["daijro "] 10 | license = "Apache-2.0" 11 | readme = "README.md" 12 | repository = "https://github.com/scrapfly/fingerprint-generator" 13 | keywords = [ 14 | "client", 15 | "headers", 16 | "fingerprint", 17 | "generator", 18 | "browser", 19 | "http", 20 | "scraping", 21 | "requests", 22 | "playwright", 23 | ] 24 | classifiers = [ 25 | "Topic :: Internet :: WWW/HTTP", 26 | "Topic :: Internet :: WWW/HTTP :: Browsers", 27 | "Topic :: Software Development :: Libraries :: Python Modules", 28 | ] 29 | 30 | [tool.poetry.dependencies] 31 | python = "^3.8" 32 | click = "*" 33 | indexed-zstd = "*" 34 | orjson = "*" 35 | numpy = "*" 36 | zstandard = "*" 37 | httpx = "*" 38 | 39 | [tool.poetry.scripts] 40 | fpgen = "fpgen.__main__:cli" -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | Dirty test cases for internal use. 4 | 5 | Confirms that user inputs & conditional probabilities are handled correctly. -------------------------------------------------------------------------------- /tests/failed_cases.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test for exceptions that should be raised. 
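Each case prints a '[PASSED] ...' or '[FAILED] ...' line. The file is meant to be
run directly; it appends the repository root to sys.path before importing fpgen.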
3 | """ 4 | 5 | import sys 6 | from pathlib import Path 7 | 8 | sys.path.append(str(Path(__file__).parent.parent)) 9 | from fpgen import Generator, generate, trace 10 | from fpgen.exceptions import InvalidConstraints, InvalidNode, RestrictiveConstraints 11 | 12 | # Generator with incorrect values 13 | try: 14 | generate(screen={'width': 1920, 'height': 5000}) 15 | except InvalidConstraints as e: 16 | print('[PASSED] VERIFIER:', e) 17 | else: 18 | print('[FAILED]') 19 | 20 | # Incorrect nested node 21 | try: 22 | generate(screen={'width': 1920, 'heighta': 1080}) 23 | except InvalidConstraints as e: 24 | print('[PASSED] VERIFIER:', e) 25 | else: 26 | print('[FAILED]') 27 | 28 | try: 29 | generate(screena={'width': 1920, 'height': 1080}) 30 | except InvalidNode as e: 31 | print('[PASSED] VERIFIER:', e) 32 | else: 33 | print('[FAILED]') 34 | 35 | 36 | # Test generator constructor 37 | try: 38 | Generator(screen={'width': 1920, 'height': lambda x: x > 5000}) 39 | except InvalidConstraints as e: 40 | print('[PASSED] VERIFIER:', e) 41 | else: 42 | print('[FAILED]') 43 | 44 | # Test that Chrome is 100% probability on ChromeOS 45 | result = trace(target='browser', os='ChromeOS') 46 | try: 47 | assert len(result) == 1 and result[0].value == 'Chrome' and result[0].probability == 1.0 48 | except AssertionError: 49 | print('[FAILED] TRACE: Expected Chrome 100% probability on ChromeOS, got:', result) 50 | else: 51 | print('[PASSED] TRACE: Chrome is 100% probability on ChromeOS') 52 | 53 | # Test that Firefox/Safari are impossible on ChromeOS 54 | try: 55 | result = trace(target='browser', os='ChromeOS', browser=('Firefox', 'Safari')) 56 | except RestrictiveConstraints as e: 57 | print('[PASSED] TRACE: Firefox/Safari correctly impossible on ChromeOS') 58 | else: 59 | print('[FAILED] TRACE: Expected exception for impossible Firefox/Safari on ChromeOS') 60 | 61 | # Test Firefox/Safari probabilities without OS constraint 62 | result = trace(target='browser', browser=('Firefox', 'Safari')) 63 | try: 64 | assert len(result) == 2 65 | assert all(r.value in ('Firefox', 'Safari') for r in result) 66 | assert abs(sum(r.probability for r in result) - 1.0) < 0.0001 67 | except AssertionError: 68 | print('[FAILED] TRACE: Expected valid Firefox/Safari probabilities, got:', result) 69 | else: 70 | print('[PASSED] TRACE: Valid Firefox/Safari probabilities') 71 | 72 | # Test Chrome is 100% on ChromeOS even with Firefox/Safari allowed 73 | result = trace(target='browser', os='ChromeOS', browser=('Firefox', 'Safari', 'Chrome')) 74 | try: 75 | assert len(result) == 1 and result[0].value == 'Chrome' and result[0].probability == 1.0 76 | except AssertionError: 77 | print( 78 | '[FAILED] TRACE: Expected Chrome 100% on ChromeOS with Firefox/Safari allowed, got:', result 79 | ) 80 | else: 81 | print('[PASSED] TRACE: Chrome is 100% on ChromeOS with Firefox/Safari allowed') 82 | 83 | try: 84 | trace(target='browser', os='ChromeOS', browser='Firefox') 85 | except RestrictiveConstraints as e: 86 | print('[PASSED] TRACE: Firefox cannot exist on ChromeOS') 87 | else: 88 | print('[FAILED] TRACE: Should have raised an exception.') 89 | 90 | 91 | # Basic passing case 92 | try: 93 | data = generate(os='ChromeOS') 94 | except Exception as e: 95 | print('[FAILED] GENERATE: Basic target case failed:', e) 96 | else: 97 | print('[PASSED] GENERATE: Passed basic case (control)') 98 | 99 | try: 100 | data = generate(os='ChromeOS', target='browser') 101 | except Exception as e: 102 | print('[FAILED] GENERATE: Basic target case failed:', e) 103 | 
else: 104 | print('[PASSED] GENERATE: Passed basic case (control)') 105 | 106 | # Test impossible constraint handling 107 | try: 108 | data = generate(browser='firefox', os='ChromeOS') 109 | except RestrictiveConstraints as e: 110 | print('[PASSED] GENERATE: Throws on impossible constraint', e) 111 | else: 112 | print('[FAILED] GENERATE: Firefox should not exist on ChromeOS') 113 | 114 | try: 115 | data = generate(browser='firefox', os='ChromeOS', target='browser') 116 | except RestrictiveConstraints as e: 117 | print('[PASSED] GENERATE: Throws on impossible constraint', e) 118 | else: 119 | print('[FAILED] GENERATE: Firefox should not exist on ChromeOS (target)') 120 | 121 | try: 122 | data = generate(browser=('firefox', 'safari', 'chrome'), os='ChromeOS', target='browser') 123 | assert data == 'Chrome' 124 | except AssertionError: 125 | print('[FAILED] GENERATE: Doesn\'t pick the correct constraint') 126 | else: 127 | print('[PASSED] GENERATE: Picks the correct constraint') 128 | -------------------------------------------------------------------------------- /tests/generator_matches_trace.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | sys.path.append(str(Path(__file__).parent.parent)) 5 | import fpgen 6 | 7 | # Controls whether to use `target` or not (uses a different algorithm) 8 | USE_TARGET = False 9 | # Number of trials to run 10 | TRIALS = 500 11 | 12 | TESTS = [ 13 | ('browser', {'os': ('Linux', 'MacOS'), 'browser': ('Edge', 'Safari')}), 14 | ('browser', {'os': ('Linux')}), 15 | ('browser', {'navigator': {'productsub': '20100101'}}), 16 | ('navigator.language', {'browser': 'firefox'}), 17 | ('os', {'browser': 'Firefox'}), 18 | ] 19 | 20 | LIMIT = 10 21 | 22 | for target, constraints in TESTS: 23 | pretty_constraints = ', '.join(f'{k}={v}' for k, v in constraints.items()) 24 | print(f'Expected P({target}|{pretty_constraints}):') 25 | print(fpgen.trace(target=target, **constraints)[:LIMIT]) 26 | print(f'Expected P({target}):') 27 | print(fpgen.trace(target=target)[:LIMIT]) 28 | 29 | # Collected data 30 | browser_data = {} 31 | 32 | for _ in range(TRIALS): 33 | print(f'Trial {_+1}/{TRIALS}', end='\r') 34 | if USE_TARGET: 35 | a = fpgen.generate(flatten=True, target=target, **constraints) 36 | else: 37 | a = fpgen.generate(flatten=True, **constraints)[target] 38 | browser_data[a] = browser_data.get(a, 0) + 1 39 | 40 | print(f"\nGenerator test using P({target}|{pretty_constraints}):") 41 | for browser, count in sorted(browser_data.items(), key=lambda x: x[1], reverse=True)[:LIMIT]: 42 | print(f"{browser}: {count/TRIALS*100:.2f}%") 43 | print('\n---------\n') 44 | -------------------------------------------------------------------------------- /tests/profile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Track where the generator hangs. 
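Requires pyinstrument, which is not listed as a dependency in pyproject.toml and
must be installed separately.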
3 | """ 4 | 5 | import sys 6 | from pathlib import Path 7 | 8 | sys.path.append(str(Path(__file__).parent.parent)) 9 | from pyinstrument import Profiler 10 | 11 | from fpgen import generate 12 | 13 | profiler = Profiler(interval=0.001) 14 | profiler.start() 15 | 16 | # Intensive constraint 17 | generate( 18 | browser=('Firefox', 'Chrome'), 19 | client={'browser': {'major': ('134', '133')}}, 20 | ) 21 | profiler.stop() 22 | 23 | print(profiler.output_text(show_all=True)) 24 | -------------------------------------------------------------------------------- /tests/speed_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the speed of the generate, trace, and query functions. 3 | """ 4 | 5 | import sys 6 | from pathlib import Path 7 | from timeit import timeit 8 | 9 | sys.path.append(str(Path(__file__).parent.parent)) 10 | 11 | from contextlib import contextmanager 12 | from time import time 13 | 14 | from fpgen import Generator, generate, query, trace 15 | 16 | 17 | @contextmanager 18 | def timer(description): 19 | print(description) 20 | start = time() 21 | yield 22 | print('Time to run', time() - start) 23 | 24 | 25 | # Trace operations 26 | with timer('trace (target=navigator.language)'): 27 | trace(target='navigator.language') 28 | 29 | with timer("trace (target=browser, os=('Linux', 'MacOS'), browser=('Edge', 'Safari'))"): 30 | trace('browser', os=('Linux', 'MacOS'), browser=('Edge', 'Safari')) 31 | 32 | # Generate operations 33 | with timer('generating (full fingerprint)'): 34 | generate() 35 | 36 | # with timer('generating (navigator.language=en-US)'): 37 | # generate({'navigator.language': 'en-US'}) 38 | 39 | with timer('generating (navigator.language=en-US, target=browser)'): 40 | generate({'navigator.language': ('en-US', 'en-GB', 'fr', 'de-DE')}, target='browser') 41 | 42 | with timer('generating (browser=firefox, target=browser)'): 43 | generate(browser=('firefox'), target='browser') 44 | 45 | with timer('generating (browser=firefox, target=navigator.language)'): 46 | generate(browser=('firefox'), target='navigator.language') 47 | 48 | with timer('generating with a function constraint'): 49 | generate({'window': {'innerWidth': lambda x: x > 1000}}, target='navigator.language') 50 | 51 | # Timeit tests 52 | 53 | print('\n========== TIMEIT TESTS ==========\n') 54 | 55 | print('Generator test') 56 | print(timeit(lambda: generate(), number=100), '/ 100') 57 | 58 | print('Generator test (with nested constraints)') 59 | print(timeit(lambda: generate(screen={'width': 1920, 'height': 1080}), number=10), '/ 10') 60 | 61 | gen = Generator(screen={'width': 1920, 'height': 1080}) 62 | 63 | print('Generator test with nested constraints (pre-filtered)') 64 | print(timeit(lambda: gen.generate(), number=10), '/ 10') 65 | 66 | print('Query test (large value set)') 67 | print(timeit(lambda: query('allFonts'), number=10), '/ 10') 68 | 69 | print('Trace test') 70 | print(timeit(lambda: trace('browser'), number=100), '/ 100') 71 | 72 | print('Trace test (large value set)') 73 | print(timeit(lambda: trace('allFonts'), number=10), '/ 10') 74 | -------------------------------------------------------------------------------- /tests/user_inputs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests various user inputs to confirm that they are handled correctly. 
3 | """ 4 | 5 | import sys 6 | from pathlib import Path 7 | 8 | sys.path.append(str(Path(__file__).parent.parent)) 9 | import re 10 | 11 | from fpgen import Generator, __network__, query 12 | 13 | 14 | def test(name, out, x=None): 15 | if x and not re.search(x, str(out)): 16 | print( 17 | ('> FAILED: ' + name).ljust(60, ' '), 18 | out, 19 | ) 20 | return 21 | print(('PASSED! ' + name).ljust(60, ' '), str(out)[:100]) 22 | 23 | 24 | # Test options 25 | FLATTEN_OPT = False 26 | SORT_OPT = False 27 | 28 | print(' ==== GENERATOR TESTS ==== ') 29 | 30 | gen = Generator(flatten=FLATTEN_OPT) 31 | 32 | test( 33 | 'Generate full fp', 34 | gen.generate(), 35 | '^{.*', 36 | ) 37 | test( 38 | 'Generate full fp (non strict, fallback os)', 39 | gen.generate( 40 | os='ChromeOS', 41 | browser='Firefox', 42 | strict=False, 43 | target=('os', 'browser'), 44 | ), 45 | ".*'browser': 'Firefox'.*", 46 | ) 47 | test( 48 | 'Target (at node) given constraint', 49 | gen.generate( 50 | {'navigator': {'productsub': '20100101'}}, 51 | target=('headers.user-agent'), 52 | ), 53 | 'Firefox', 54 | ) 55 | test( 56 | 'Target (within node) given constraint', 57 | gen.generate( 58 | {'navigator': {'productsub': '20100101'}}, 59 | target=['screen.width', 'screen.height'], 60 | ), 61 | r'\d+', 62 | ) 63 | test( 64 | 'Target (above node) given constraint', 65 | gen.generate( 66 | {'navigator': {'productsub': '20100101'}}, 67 | target='navigator', 68 | ), 69 | '^{.*$', 70 | ) 71 | test( 72 | 'Passing multi constraints (no target)', 73 | gen.generate( 74 | browser=('Firefox', 'Chrome'), 75 | client={'browser': {'major': ('134', '133')}}, 76 | ), 77 | r'\b13[34]\b', 78 | ) 79 | test( 80 | 'Passing multi constraints (target)', 81 | gen.generate( 82 | browser=('Firefox', 'Chrome'), 83 | client={'browser': {'major': ('134', '133')}}, 84 | target='client', 85 | ), 86 | r'\b13[34]\b', 87 | ) 88 | gpu = { 89 | 'vendor': 'Google Inc. (Apple)', 90 | 'renderer': 'ANGLE (Apple, ANGLE Metal Renderer: Apple M2, Unspecified Version)', 91 | } 92 | test( 93 | 'Constraint tgt (at node, `window`)', 94 | gen.generate(gpu=gpu, target='window'), 95 | ) 96 | test( 97 | 'Constraint tgt (above nodes, `navigator`)', 98 | gen.generate(gpu=gpu, target='navigator'), 99 | ) 100 | test( 101 | 'Constraint tgt (within node, `screen.width`)', 102 | gen.generate(gpu=gpu, target='screen.width'), 103 | ) 104 | 105 | print('\n ==== QUERY TESTS ==== ') 106 | 107 | test( 108 | 'Possibilities (at node 1, `navigator.productsub`)', 109 | query('navigator.productsub', flatten=FLATTEN_OPT, sort=SORT_OPT), 110 | ) 111 | test( 112 | 'Possibilities (at node 2, `screen`)', 113 | query('screen', flatten=FLATTEN_OPT, sort=SORT_OPT), 114 | ) 115 | test( 116 | 'Possibilities (above nodes, `navigator`)', 117 | query('navigator', flatten=FLATTEN_OPT, sort=SORT_OPT), 118 | ) 119 | test( 120 | 'Possibilities (within node, `screen.width`)', 121 | query('screen.width', flatten=FLATTEN_OPT, sort=SORT_OPT), 122 | ) 123 | 124 | 125 | print(' ==== QUERY ALL NODES ==== ') 126 | 127 | for node in __network__.nodes_by_name: 128 | # Get the possibilities 129 | print(f'Listing possibilities for {node}') 130 | a = query(node, flatten=FLATTEN_OPT, sort=SORT_OPT) 131 | print(str(a)[:100]) 132 | --------------------------------------------------------------------------------