├── .github
│   └── workflows
│       └── release.yml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── assets
│   ├── example-output.json
│   └── update.py
├── fpgen
│   ├── __init__.py
│   ├── __main__.py
│   ├── bayesian_network.py
│   ├── exceptions.py
│   ├── generator.py
│   ├── pkgman.py
│   ├── py.typed
│   ├── structs.py
│   ├── trace.py
│   ├── unpacker.py
│   └── utils.py
├── pyproject.toml
└── tests
    ├── README.md
    ├── failed_cases.py
    ├── generator_matches_trace.py
    ├── profile.py
    ├── speed_test.py
    └── user_inputs.py
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release to PyPI
2 |
3 | on:
4 | workflow_dispatch:
5 | push:
6 | tags:
7 | - 'v*' # Only trigger on tags that start with 'v'
8 |
9 | permissions:
10 | contents: read
11 |
12 | jobs:
13 | build:
14 | runs-on: ubuntu-latest
15 | env:
16 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
17 |
18 | steps:
19 | - uses: actions/checkout@v4
20 |
21 | - name: Install Python
22 | run: |
23 | sudo apt-get update
24 | sudo apt-get install -y \
25 | python3 \
26 | python3-pip \
27 | python3-dev
28 |
29 | - name: Install Python dependencies
30 | run: |
31 | python3 -m pip install twine build vermin
32 |
33 | - name: Release
34 | run: |
35 | make release
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Old files
2 | _old/
3 | _old_*/
4 | *.old
5 |
6 | # Model files
7 | fpgen/**.json
8 | *.zst
9 | *.ndjson
10 | *.dat
11 | *.zip
12 | data/
13 | output/
14 |
15 | # Tests
16 | test*.py
17 | diagnostics/
18 |
19 | # Logs
20 | *.log
21 | *.kate-swp
22 |
23 |
24 | # ==== DEFAULTS ====
25 |
26 | # Byte-compiled / optimized / DLL files
27 | __pycache__/
28 | *.py[cod]
29 | *$py.class
30 |
31 | # C extensions
32 | *.so
33 |
34 | # Binaries
35 | *.dll
36 | *.dylib
37 |
38 | # Distribution / packaging
39 | .Python
40 | build/
41 | develop-eggs/
42 | dist/
43 | downloads/
44 | eggs/
45 | .eggs/
46 | lib/
47 | lib64/
48 | parts/
49 | sdist/
50 | var/
51 | wheels/
52 | share/python-wheels/
53 | *.egg-info/
54 | .installed.cfg
55 | *.egg
56 | MANIFEST
57 |
58 | # PyInstaller
59 | # Usually these files are written by a python script from a template
60 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
61 | *.manifest
62 | *.spec
63 |
64 | # Installer logs
65 | pip-log.txt
66 | pip-delete-this-directory.txt
67 |
68 | # Unit test / coverage reports
69 | htmlcov/
70 | .tox/
71 | .nox/
72 | .coverage
73 | .coverage.*
74 | .cache
75 | nosetests.xml
76 | coverage.xml
77 | *.cover
78 | *.py,cover
79 | .hypothesis/
80 | .pytest_cache/
81 | cover/
82 |
83 | # Translations
84 | *.mo
85 | *.pot
86 |
87 | # Django stuff:
88 | *.log
89 | local_settings.py
90 | db.sqlite3
91 | db.sqlite3-journal
92 |
93 | # Flask stuff:
94 | instance/
95 | .webassets-cache
96 |
97 | # Scrapy stuff:
98 | .scrapy
99 |
100 | # Sphinx documentation
101 | docs/_build/
102 |
103 | # PyBuilder
104 | .pybuilder/
105 | target/
106 |
107 | # Jupyter Notebook
108 | .ipynb_checkpoints
109 |
110 | # IPython
111 | profile_default/
112 | ipython_config.py
113 |
114 | # pyenv
115 | # For a library or package, you might want to ignore these files since the code is
116 | # intended to run in multiple environments; otherwise, check them in:
117 | # .python-version
118 |
119 | # pipenv
120 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
121 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
122 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
123 | # install all needed dependencies.
124 | #Pipfile.lock
125 |
126 | # poetry
127 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
128 | # This is especially recommended for binary packages to ensure reproducibility, and is more
129 | # commonly ignored for libraries.
130 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
131 | #poetry.lock
132 |
133 | # pdm
134 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
135 | #pdm.lock
136 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
137 | # in version control.
138 | # https://pdm.fming.dev/#use-with-ide
139 | .pdm.toml
140 |
141 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
142 | __pypackages__/
143 |
144 | # Celery stuff
145 | celerybeat-schedule
146 | celerybeat.pid
147 |
148 | # SageMath parsed files
149 | *.sage.py
150 |
151 | # Environments
152 | .env
153 | .venv
154 | env/
155 | venv/
156 | ENV/
157 | env.bak/
158 | venv.bak/
159 |
160 | # Spyder project settings
161 | .spyderproject
162 | .spyproject
163 |
164 | # Rope project settings
165 | .ropeproject
166 |
167 | # mkdocs documentation
168 | /site
169 |
170 | # mypy
171 | .mypy_cache/
172 | .dmypy.json
173 | dmypy.json
174 |
175 | # Pyre type checker
176 | .pyre/
177 |
178 | # pytype static type analyzer
179 | .pytype/
180 |
181 | # Cython debug symbols
182 | cython_debug/
183 |
184 | # PyCharm
185 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187 | # and can be added to the global gitignore or merged into this file. For a more nuclear
188 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189 | #.idea/
190 |
191 | # VsCode
192 | .vscode
193 | .trunk
194 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | vermin:
2 | vermin . --eval-annotations --target=3.8 --violations fpgen/ || exit 1
3 |
4 | clean:
5 | @echo Cleaning...
6 | find ./fpgen -type f ! -name "*.typed" ! -name "*.py" -exec rm -v {} \;
7 | rm -rf ./dist
8 |
9 | prepare: vermin clean
10 |
11 | check: prepare
12 | @echo Building...
13 | python -m build
14 | twine check dist/*
15 |
16 | release: check
17 | @echo Releasing...
18 | twine upload dist/*
19 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fingerprint Generator
2 |
3 | A fast browser data generator that mimics actual traffic patterns in the wild, with extensive data coverage.
4 |
5 | Created by daijro. Data provided by Scrapfly.
6 |
7 | ---
8 |
9 | ## Features
10 |
11 | - Uses a Bayesian generative network to mimic real-world web traffic patterns
12 | - Extensive data coverage for **nearly all known** browser data points
13 | - Creates complete fingerprints in a few milliseconds ⚡
14 | - Easily specify custom criteria for any data point (e.g. "only Windows + Chrome, with Intel GPUs")
15 | - Simple for humans to use 🚀
16 |
17 | ## Demo Video
18 |
19 | Here is a demonstration of what fpgen generates & its ability to filter data points:
20 |
21 | https://github.com/user-attachments/assets/5c56691a-5804-4007-b179-0bae7069a111
22 |
23 | ---
24 |
25 | # Installation
26 |
27 | Install the package using pip:
28 |
29 | ```bash
30 | pip install fpgen
31 | ```
32 |
33 |
34 |
35 | ### Downloading the model
36 |
37 | Fetch the latest model:
38 |
39 | ```bash
40 | fpgen fetch
41 | ```
42 |
43 | This runs automatically on the first import, and again every 5 weeks thereafter.
44 |
45 | To decompress the model for faster generation (_up to 10-50x faster!_), run:
46 |
47 | ```bash
48 | fpgen decompress
49 | ```
50 |
51 | Note: This will use an additional 100 MB+ of storage.
52 |
53 |
### CLI Usage
55 |
56 | ```
57 | Usage: python -m fpgen [OPTIONS] COMMAND [ARGS]...
58 |
59 | Options:
60 | --help Show this message and exit.
61 |
62 | Commands:
63 | decompress Decompress model files for speed efficiency (will take 100mb+)
64 | fetch Fetch the latest model from GitHub
65 | recompress Compress model files after running decompress
66 | remove Remove all downloaded and/or extracted model files
67 | ```
68 |
69 |
70 |
71 | ---
72 |
73 | # Usage
74 |
75 | ### Generate a fingerprint
76 |
77 | Simple usage:
78 |
79 | ```python
80 | >>> import fpgen
81 | >>> fpgen.generate(browser='Chrome', os='Windows')
82 | ```
83 |
84 | Or use the Generator object to pass filters downward:
85 |
86 | ```python
87 | >>> gen = fpgen.Generator(browser='Chrome') # Filter by Chrome
88 | >>> gen.generate(os='Windows') # Generate Windows & Chrome fingerprints
89 | ```
90 |
91 |
92 |
93 | Parameters list
94 |
95 |
96 | ```
97 | Initializes the Generator with the given options.
98 | Values passed to the Generator object will be inherited when calling Generator.generate()
99 |
100 | Parameters:
101 | conditions (dict, optional): Conditions for the generated fingerprint.
102 | window_bounds (WindowBounds, optional): Constrain the output window size.
103 | strict (bool, optional): Whether to raise an exception if the conditions are too strict.
104 | flatten (bool, optional): Whether to flatten the output dictionary
105 | target (Optional[Union[str, StrContainer]]): Only generate specific value(s)
106 | **conditions_kwargs: Conditions for the generated fingerprint (passed as kwargs)
107 | ```
108 |
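As a quick illustration of the options above (the values here are arbitrary), settings passed to the `Generator` are inherited by later `generate()` calls:

```python
>>> gen = fpgen.Generator(browser='Chrome', strict=False, flatten=True)
>>> gen.generate(os='Windows')  # Windows + Chrome; with flatten=True the nested output is returned as a flat mapping
```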
109 |
110 |
111 | [See example output.](https://raw.githubusercontent.com/scrapfly/fingerprint-generator/refs/heads/main/assets/example-output.json)
112 |
113 | ---
114 |
115 | ## Filtering the output
116 |
117 | ### Setting fingerprint criteria
118 |
119 | You can narrow down generated fingerprints by specifying filters for **any** data field.
120 |
121 | ```python
122 | # Only generate fingerprints with Windows, Chrome, and Intel GPU:
123 | >>> fpgen.generate(
124 | ... os='Windows',
125 | ... browser='Chrome',
126 | ... gpu={'vendor': 'Google Inc. (Intel)'}
127 | ... )
128 | ```
129 |
130 |
131 |
132 | This can also be passed as a dictionary.
133 |
134 |
135 | ```python
136 | >>> fpgen.generate({
137 | ... 'os': 'Windows',
138 | ... 'browser': 'Chrome',
139 | ... 'gpu': {'vendor': 'Google Inc. (Intel)'},
140 | ... })
141 | ```
142 |
143 |
144 |
145 | ### Multiple constraints
146 |
147 | Pass in multiple constraints for the generator to select from using a tuple.
148 |
149 | ```python
150 | >>> fpgen.generate({
151 | ... 'os': ('Windows', 'MacOS'),
152 | ... 'browser': ('Firefox', 'Chrome'),
153 | ... })
154 | ```
155 |
156 | If you are passing many nested constraints, run `fpgen decompress` to improve model performance.
157 |
158 | ## Custom filters
159 |
160 | Data can be filtered by passing in callable functions.
161 |
162 | ### Examples
163 |
164 | Set the minimum browser version:
165 |
166 | ```python
167 | # Constrain client:
168 | >>> fpgen.generate(client={'browser': {'major': lambda ver: int(ver) >= 130}})
169 | # Or, just pass a dot-separated path to client.browser.major:
170 | >>> fpgen.generate({'client.browser.major': lambda ver: int(ver) >= 130})
171 | ```
172 |
173 | Only allow NVIDIA GPUs:
174 |
175 | ```python
176 | # Note: Strings are lowercased before they're passed.
177 | >>> fpgen.generate(gpu={'vendor': lambda vdr: 'nvidia' in vdr})
178 | ```
179 |
180 | Limit the maximum/minimum window size:
181 |
182 | ```python
183 | # Set allowed ranges for outerWidth & outerHeight:
184 | >>> fpgen.generate(
185 | ... window={
186 | ... 'outerWidth': lambda width: 1000 <= width <= 2000,
187 | ... 'outerHeight': lambda height: 500 <= height <= 1500
188 | ... }
189 | ... )
190 | ```
191 |
192 |
193 |
194 |
195 | Or, filter the window dictionary directly.
196 |
197 |
198 | ```python
199 | def window_filter(window):
200 | if not (1000 <= window['outerWidth'] <= 2000):
201 | return False
202 | if not (500 <= window['outerHeight'] <= 1500):
203 | return False
204 | return True
205 |
206 | fpgen.generate(window=window_filter)
207 | ```
208 |
209 |
210 |
211 | ---
212 |
213 | ## Only generate specific data
214 |
215 | To generate specific data fields, use the `target` parameter with a string or a list of strings.
216 |
217 | ### Examples
218 |
219 | Only generate HTTP headers:
220 |
221 | ```python
222 | >>> fpgen.generate(target='headers')
223 | {'accept': '*/*', 'accept-encoding': 'gzip, deflate, br, zstd', 'accept-language': 'en-US,en;q=0.9', 'priority': 'u=1, i', 'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', 'sec-ch-ua-mobile': None, 'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-site', 'sec-gpc': None, 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'}
224 | ```
225 |
226 |
227 |
228 | Or, use the `generate_target` shortcut:
229 |
230 |
231 | ```python
232 | >>> fpgen.generate_target('headers')
233 | {'accept': '*/*', 'accept-encoding': 'gzip, deflate, br, zstd', 'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8,sk;q=0.7', 'priority': 'u=1, i', 'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', 'sec-ch-ua-mobile': None, 'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-site', 'sec-gpc': None, 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'}
234 | ```
235 |
236 |
237 |
238 |
239 | Generate a User-Agent for Windows & Chrome:
240 |
241 | ```python
242 | >>> fpgen.generate(
243 | ... os='Windows',
244 | ... browser='Chrome',
245 | ... # Nested targets must be separated by dots:
246 | ... target='headers.user-agent'
247 | ... )
248 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
249 | ```
250 |
251 | Generate a Firefox TLS fingerprint:
252 |
253 | ```python
254 | >>> fpgen.generate(
255 | ... browser='Firefox',
256 | ... target='network.tls.scrapfly_fp'
257 | ... )
258 | {'version': '772', 'ch_ciphers': '4865-4867-4866-49195-49199-52393-52392-49196-49200-49162-49161-49171-49172-156-157-47-53', 'ch_extensions': '0-5-10-11-13-16-23-27-28-34-35-43-45-51-65037-65281', 'groups': '4588-29-23-24-25-256-257', 'points': '0', 'compression': '0', 'supported_versions': '772-771', 'supported_protocols': 'h2-http11', 'key_shares': '4588-29-23', 'psk': '1', 'signature_algs': '1027-1283-1539-2052-2053-2054-1025-1281-1537-515-513', 'early_data': '0'}
259 | ```
260 |
261 | You can provide multiple targets as a list.
262 |
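For example, a minimal sketch using target names that appear elsewhere in this README (output abridged):

```python
>>> fpgen.generate(target=['headers', 'navigator'], browser='Firefox')
{'headers': {...}, 'navigator': {...}}
```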
263 | ---
264 |
265 | ## Get the probabilities of a target
266 |
267 | Calculate the probability distribution of a target given any filter:
268 |
269 | ```python
270 | >>> fpgen.trace(target='browser', os='Windows')
271 | [, , , , , ]
272 | ```
273 |
274 | Multiple targets can be passed as a list/tuple.
275 | Here is an example of tracking the probability of browser & OS given a GPU vendor:
276 |
277 | ```python
278 | >>> fpgen.trace(
279 | ... target=('browser', 'os'),
280 | ... gpu={'vendor': 'Google Inc. (Intel)'}
281 | ... )
282 | {'browser': [, , , , , ],
283 | 'os': [, , , ]}
284 | ```
285 |
286 | This also works in the Generator object:
287 |
288 | ```python
289 | >>> gen = fpgen.Generator(os='ChromeOS')
290 | >>> gen.trace(target='browser')
291 | []
292 | ```
293 |
294 |
295 |
296 | Parameters for trace
297 |
298 |
299 | ```
300 | Compute the probability distribution(s) of a target variable given conditions.
301 |
302 | Parameters:
303 | target (str): The target variable name.
304 | conditions (Dict[str, Any], optional): A dictionary mapping variable names
305 | flatten (bool, optional): If True, return a flattened dictionary.
306 | **conditions_kwargs: Additional conditions to apply
307 |
308 | Returns:
309 | A dictionary mapping probabilities to the target's possible values.
310 | ```
311 |
312 |
313 |
314 |
315 |
316 | ### Reading TraceResult
317 |
318 | To read the output `TraceResult` object:
319 |
320 | ```python
321 | >>> chrome = fpgen.trace(target='browser', os='ChromeOS')[0]
322 | >>> chrome.probability
323 | 1.0
324 | >>> chrome.value
325 | 'Chrome'
326 | ```
327 |
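As a rough sketch, trace results can also be collected into a value-to-probability mapping (the numbers below are illustrative, not real output):

```python
>>> {r.value: round(r.probability, 3) for r in fpgen.trace(target='browser', os='Windows')}
{'Chrome': 0.7, 'Edge': 0.1, 'Firefox': 0.1, ...}
```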
328 | ---
329 |
330 | ## Query possible values
331 |
332 | You can get a list of a target's possible values by passing it into `fpgen.query`:
333 |
334 | List all possible browsers:
335 |
336 | ```python
337 | >>> fpgen.query('browser')
338 | ['Chrome', 'Edge', 'Firefox', 'Opera', 'Safari', 'Samsung Internet', 'Yandex Browser']
339 | ```
340 |
341 | Passing a nested target:
342 |
343 | ```python
344 | >>> fpgen.query('navigator.maxTouchPoints') # Dot-separated path
345 | [0, 1, 2, 5, 6, 9, 10, 17, 20, 40, 256]
346 | ```
347 |
348 |
349 |
350 | Parameters for query
351 |
352 |
353 | ```
354 | Query a list of possibilities given a target.
355 |
356 | Parameters:
357 | target (str): Target node to query possible values for
358 | flatten (bool, optional): Whether to flatten the output dictionary
359 | sort (bool, optional): Whether to sort the output arrays
360 | ```
361 |
362 |
363 |
364 | > [!NOTE]
365 | > Since fpgen is trained on live data, queries may occasionally return invalid or anomalous values. Values below a 0.001% probability will not appear in traces or generated fingerprints.
366 |
367 | ---
368 |
369 | ## Generated data
370 |
371 | Here is a rough list of the data fpgen can generate:
372 |
373 | - **Browser data:**
374 | - All navigator data
375 | - All mimetype data: Audio, video, media source, play types, PDF, etc
376 | - All window viewport data (position, inner/outer viewport sizes, toolbar & scrollbar sizes, etc)
377 | - All screen data
378 | - Supported & unsupported DRM modules
379 | - Memory heap limit
380 |
381 | - **System data:**
382 | - GPU data (vendor, renderer, WebGL/WebGL2, extensions, context attributes, parameters, shader precision formats, etc)
383 | - Battery data (charging, charging time, discharging time, level)
384 | - Screen size, color depth, taskbar size, etc.
385 | - Full fonts list
386 | - Cast receiver data
387 |
388 | - **Network data:**
389 | - HTTP headers
390 | - TLS fingerprint data
391 | - HTTP/2 fingerprint & frames
392 | - RTC video & audio capabilities, codecs, clock rates, mimetypes, header extensions, etc
393 |
394 | - **Audio data:**
395 | - Audio signal
396 | - All Audio API constants (AnalyserNode, BiquadFilterNode, DynamicsCompressorNode, OscillatorNode, etc)
397 |
398 | - **Internationalization data:**
399 | - Regional internationalization (Locale, calendar, numbering system, timezone, date format, etc)
400 | - Voices
401 |
402 | - **_And much more!_**
403 |
404 | For a more complete list, see the [full example output](https://raw.githubusercontent.com/scrapfly/fingerprint-generator/refs/heads/main/assets/example-output.json).
405 |
406 | ---
407 |
--------------------------------------------------------------------------------
/assets/update.py:
--------------------------------------------------------------------------------
1 | """
2 | Refreshes example-output.json
3 | """
4 |
5 | import json
6 | from pathlib import Path
7 |
8 | import fpgen
9 |
10 | DIR = Path('.').absolute()
11 |
12 |
13 | with open(DIR / 'assets' / 'example-output.json', 'w') as f:
14 | data = fpgen.generate()
15 | json.dump(data, f, indent=2)
18 |
--------------------------------------------------------------------------------
/fpgen/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Fpgen is a fast & comprehensive browser fingerprint generator.
3 | """
4 |
5 |
6 | def __check_module__() -> None:
7 | '''
8 | Detect if fpgen is being run as a module.
9 | '''
10 | import inspect
11 | import os
12 | import sys
13 |
14 | # Detect if we're running as poetry script
15 | if sys.argv and os.path.basename(sys.argv[0]) == 'fpgen':
16 | os.environ['FPGEN_NO_INIT'] = '1'
17 | return
18 |
19 | stack: list = inspect.stack(2)
20 | if len(stack) >= 2:
21 | prev, launch = stack[-2:]
22 | try:
23 | if (launch.function, prev.function) == ('_run_module_as_main', '_get_module_details'):
24 | # Enable "partial execution mode" to prevent automatic downloads, starting network, etc.
25 | os.environ['FPGEN_NO_INIT'] = '1'
26 | except AttributeError:
27 | pass
28 |
29 |
30 | __check_module__()
31 | del __check_module__ # Remove from namespace
32 |
33 | # ruff: noqa: E402
34 |
35 | from .generator import Generator, generate, generate_target
36 | from .trace import TraceResult, trace
37 |
38 | # Expose the bayesian network interface for tests
39 | from .utils import NETWORK as __network__
40 | from .utils import query
41 |
42 | __all__ = (
43 | 'Generator',
44 | 'TraceResult',
45 | 'generate',
46 | 'generate_target',
47 | 'query',
48 | 'trace',
49 | '__network__',
50 | )
51 |
--------------------------------------------------------------------------------
/fpgen/__main__.py:
--------------------------------------------------------------------------------
1 | import click
2 |
3 | from .pkgman import (
4 | FILE_PAIRS,
5 | decompress_model,
6 | download_model,
7 | recompress_model,
8 | remove_model,
9 | )
10 |
11 |
12 | @click.group()
13 | def cli():
14 | pass
15 |
16 |
17 | @cli.command()
18 | @click.option(
19 | '--decompress', is_flag=True, help='Also decompress the model files after downloading'
20 | )
21 | def fetch(decompress):
22 | """
23 | Fetch the latest model from GitHub
24 | """
25 | # Remove existing files
26 | remove_model(log=False)
27 | # Download new files
28 | download_model()
29 | if decompress:
30 | decompress_model()
31 | click.echo(click.style("Complete!", fg="green"))
32 |
33 |
34 | @cli.command()
35 | def remove():
36 | """
37 | Remove all downloaded and/or extracted model files
38 | """
39 | found = remove_model()
40 | if not found:
41 | click.echo(click.style("No files found to remove.", fg="yellow"))
42 | return
43 | click.echo(click.style("Complete!", fg="green"))
44 |
45 |
46 | @cli.command()
47 | def decompress():
48 | """
49 | Decompress model files for speed efficiency (will take 100mb+)
50 | """
51 | # Check there's anything to decompress
52 | if any(f.exists() for f in FILE_PAIRS.keys()):
53 | click.echo(click.style("Model is already decompressed.", fg="yellow"))
54 | return
55 | decompress_model()
56 |
57 |
58 | @cli.command()
59 | def recompress():
60 | """
61 | Compress model files after running decompress
62 | """
63 | # Check there's anything to compress
64 | if any(f.exists() for f in FILE_PAIRS.values()):
65 | click.echo(click.style("Model is already compressed.", fg="yellow"))
66 | return
67 | recompress_model()
68 |
69 |
70 | if __name__ == '__main__':
71 | cli()
72 |
--------------------------------------------------------------------------------
/fpgen/bayesian_network.py:
--------------------------------------------------------------------------------
1 | import heapq
2 | import random
3 | from pathlib import Path
4 | from typing import Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union
5 |
6 | from .exceptions import RestrictiveConstraints
7 | from .pkgman import extract_json
8 | from .structs import CaseInsensitiveDict
9 | from .unpacker import lookup_value_list
10 |
11 | StrContainer = Union[str, List[str], Tuple[str, ...], Set[str]]
12 |
13 | # Width for beam search
14 | # This cuts off values that are way too low or contaminated
15 | BEAM_WIDTH = 1000
16 |
17 |
18 | class BayesianNode:
19 | """
20 | A single node in a Bayesian network with methods to sample conditional probabilities
21 | """
22 |
23 | __slots__ = (
24 | 'node_definition',
25 | 'name',
26 | 'parent_names',
27 | 'possible_values',
28 | 'probabilities',
29 | 'index',
30 | )
31 |
32 | def __init__(self, node_definition: Dict[str, Any], index: int):
33 | # Node definition info
34 | self.node_definition = node_definition
35 | self.name = node_definition['name']
36 | self.parent_names = node_definition['parentNames']
37 | self.possible_values = node_definition['possibleValues']
38 | # CPT data structure
39 | self.probabilities = node_definition['conditionalProbabilities']
40 | # Index in the sampling order
41 | self.index = index
42 |
43 | def get_probabilities_given_known_values(
44 | self, parent_values: Mapping[str, Any]
45 | ) -> Dict[Any, float]:
46 | """
47 | Extracts the probabilities for this node's values, given known parent values
48 | """
49 | probabilities = self.probabilities
50 | for parent_name in self.parent_names:
51 | parent_value = parent_values[parent_name]
52 | probabilities = probabilities.get(parent_value, {})
53 | return probabilities
54 |
55 |
56 | class BayesianNetwork:
57 | """
58 | Bayesian network implementation for probabilistic sampling
59 | """
60 |
61 | def __init__(self, network_file: Path) -> None:
62 | network_definition = extract_json(network_file)
63 | self.nodes_in_sampling_order = [
64 | BayesianNode(node_def, index)
65 | for index, node_def in enumerate(network_definition['nodes'])
66 | ]
67 | nodes_by_name = {node.name: node for node in self.nodes_in_sampling_order}
68 | self.nodes_by_name = CaseInsensitiveDict(nodes_by_name)
69 | # Keep a list of the original names
70 | self.node_names = tuple(nodes_by_name.keys())
71 | # Precompute ancestors
72 | self.ancestors_by_name: Dict[str, Set[str]] = {}
73 | for node in self.nodes_in_sampling_order:
74 | self.get_all_ancestors(node.name)
75 |
76 | def generate_consistent_sample(
77 | self, evidence: Mapping[str, Set[str]]
78 | ) -> Optional[Dict[str, Any]]:
79 | """
80 | Generate a full sample from the Bayesian network.
81 | """
82 | result: Dict[str, str] = {}
83 | # Create a working copy of evidence that we can update in place.
84 | current_evidence = {k: set(v) for k, v in evidence.items()}
85 |
86 | for node in self.nodes_in_sampling_order:
87 | node_name = node.name
88 |
89 | # For nodes with explicit evidence, remove the node itself from the evidence for beam search.
90 | if node_name in current_evidence:
91 | allowed_values = current_evidence[node_name]
92 | search_evidence = {k: v for k, v in current_evidence.items() if k != node_name}
93 | distribution = self.trace(node_name, search_evidence)
94 |
95 | # Filter the distribution to allowed values and renormalize.
96 | filtered_dist = {k: v for k, v in distribution.items() if k in allowed_values}
97 | if not filtered_dist or sum(filtered_dist.values()) <= 0:
98 | uniform_prob = 1.0 / len(allowed_values)
99 | filtered_dist = {val: uniform_prob for val in allowed_values}
100 | else:
101 | total = sum(filtered_dist.values())
102 | filtered_dist = {k: v / total for k, v in filtered_dist.items()}
103 | sampled_value = self.sample_value_from_distribution(filtered_dist)
104 | else:
105 | # For unconstrained nodes, use all current evidence.
106 | distribution = self.trace(node_name, current_evidence)
107 | sampled_value = self.sample_value_from_distribution(distribution)
108 |
109 | result[node_name] = sampled_value
110 | # Update current evidence with the newly sampled node value.
111 | current_evidence[node_name] = {sampled_value}
112 |
113 | return result
114 |
115 | def generate_certain_nodes(
116 | self,
117 | evidence: Mapping[str, Set[str]],
118 | targets: Optional[StrContainer] = None,
119 | ) -> Optional[Dict[str, Any]]:
120 | """
121 | Generate values for target nodes given conditions.
122 | """
123 | # If no target specified, generate full sample
124 | if targets is None:
125 | return self.generate_consistent_sample(evidence)
126 |
127 | # Generate result for each target
128 | result = {}
129 |
130 | for target_node in targets:
131 | # Calculate distribution for this target
132 | distribution = self.trace(target_node, evidence)
133 |
134 | # Handle multi-value conditions for the target
135 | if target_node in evidence:
136 | allowed_values = evidence[target_node]
137 |
138 | # Filter and renormalize
139 | filtered_dist = {k: v for k, v in distribution.items() if k in allowed_values}
140 |
141 | # If no probability mass, use uniform distribution over allowed values
142 | if not filtered_dist or sum(filtered_dist.values()) <= 0:
143 | raise RestrictiveConstraints(
144 | f"Cannot generate fingerprint: No valid values for {target_node} with current conditions."
145 | )
146 |
147 | # Renormalize
148 | total = sum(filtered_dist.values())
149 | filtered_dist = {k: v / total for k, v in filtered_dist.items()}
150 |
151 | distribution = filtered_dist
152 |
153 | # Sample from the distribution
154 | if distribution:
155 | result[target_node] = self.sample_value_from_distribution(distribution)
156 | else:
157 | raise RestrictiveConstraints(
158 | f"Cannot generate fingerprint: Empty distribution for {target_node}."
159 | )
160 |
161 | return result
162 |
163 | def validate_evidence(self, evidence: Mapping[str, Set[str]]) -> None:
164 | """
165 | Validate that evidence is compatible with each other based on the
166 | Bayesian network structure. Raises an exception if they're incompatible.
167 | """
168 | # Skip validation for single constraint
169 | if len(evidence) <= 1:
170 | return
171 |
172 | # For each constrained node, check if it's compatible with other fixed conditions
173 | for node_name, allowed_values in evidence.items():
174 | # Get other fixed conditions (nodes with single values)
175 | fixed_constraints: Dict[str, Union[str, Set[str]]] = {}
176 | for k, v in evidence.items():
177 | if k != node_name and len(v) == 1:
178 | fixed_constraints[k] = next(iter(v))
179 |
180 | # If we have fixed conditions, check compatibility
181 | if fixed_constraints:
182 | # Use beam search with fixed conditions to check compatibility
183 | dist = self.trace(node_name, fixed_constraints)
184 |
185 | # If beam search returns a distribution
186 | if dist and all(dist.get(val, 0) <= 0 for val in allowed_values):
187 | # === BUILD THE EXCEPTION MESSAGE ===
188 | # Show the first 5 failing node values.
189 | values_str = ", ".join(lookup_value_list(tuple(allowed_values)[:5]))
190 | if len(allowed_values) > 5:
191 | values_str += ", ..."
192 | # Get the constraints
193 | constraints_values = lookup_value_list(fixed_constraints.values())
194 | constraints_str = ", ".join(
195 | f"{k}={v}" for k, v in zip(fixed_constraints.keys(), constraints_values)
196 | )
197 | raise RestrictiveConstraints(
198 | f"Cannot generate fingerprint: {node_name}=({values_str}) "
199 | f"is impossible with constraint: {constraints_str}"
200 | )
201 |
202 | def get_all_ancestors(self, node_name: str) -> Set[str]:
203 | """
204 | Get all ancestors of a node (nodes that can influence its value)
205 | """
206 | if node_name in self.ancestors_by_name:
207 | return self.ancestors_by_name[node_name]
208 |
209 | node = self.nodes_by_name[node_name]
210 | ancestors: Set[str] = set()
211 | if not node:
212 | return ancestors
213 |
214 | # Add direct parents
215 | for parent in node.parent_names:
216 | ancestors.add(parent)
217 | # Add parent's ancestors recursively
218 | ancestors.update(self.get_all_ancestors(parent))
219 |
220 | self.ancestors_by_name[node_name] = ancestors
221 | return ancestors
222 |
223 | def trace(self, target: str, evidence: Mapping[str, Union[str, Set[str]]]) -> Dict[str, float]:
224 | """
225 | Calculate conditional probability distribution for target given evidence
226 | using beam search.
227 | """
228 | # Get the actual target name and build relevant nodes set.
229 | target = self.nodes_by_name[target].name
230 | relevant_nodes = self.get_all_ancestors(target).copy()
231 | relevant_nodes.add(target)
232 |
233 | # Add evidence nodes and their ancestors.
234 | for ev_node in evidence:
235 | if ev_node in self.nodes_by_name:
236 | relevant_nodes.add(ev_node)
237 | relevant_nodes.update(self.get_all_ancestors(ev_node))
238 |
239 | # Sort nodes by sampling order
240 | ordered_nodes = [
241 | node for node in self.nodes_in_sampling_order if node.name in relevant_nodes
242 | ]
243 |
244 | # Initialize beam
245 | beam: List[Tuple[Dict[str, Any], float]] = [({}, 1.0)]
246 | # Local cache for conditional probability lookups
247 | cpt_cache: Dict[Tuple[str, Tuple[Any, ...]], Dict[Any, float]] = {}
248 |
249 | for node in ordered_nodes:
250 | new_beam = []
251 | node_name = node.name
252 |
253 | # Determine allowed values from evidence if present
254 | allowed_values = evidence[node_name] if node_name in evidence else None
255 |
256 | # Process each assignment in the current beam
257 | for assignment, prob in beam:
258 | # Parent order is defined by node.parent_names
259 | try:
260 | parent_values_tuple = tuple(assignment[parent] for parent in node.parent_names)
261 | except KeyError:
262 | # Should not occur if assignments are built in order
263 | parent_values_tuple = ()
264 |
265 | cache_key = (node_name, parent_values_tuple)
266 | if cache_key in cpt_cache:
267 | cpt = cpt_cache[cache_key]
268 | else:
269 | parent_values = {parent: assignment[parent] for parent in node.parent_names}
270 | cpt = node.get_probabilities_given_known_values(parent_values)
271 | # Use uniform distribution if missing
272 | if not cpt and node.possible_values:
273 | uniform_prob = 1.0 / len(node.possible_values)
274 | cpt = {val: uniform_prob for val in node.possible_values}
275 |
276 | # Expand the beam with new assignments
277 | for value, p in cpt.items():
278 | if (allowed_values is None or value in allowed_values) and p > 0:
279 | # Create a new assignment with the new node value
280 | new_assignment = assignment.copy()
281 | new_assignment[node_name] = value
282 | new_beam.append((new_assignment, prob * p))
283 |
284 | # Prune the beam if no valid configurations are left
285 | if new_beam:
286 | if len(new_beam) > BEAM_WIDTH:
287 | # Get the top BEAM_WIDTH assignments
288 | beam = heapq.nlargest(BEAM_WIDTH, new_beam, key=lambda x: x[1])
289 | else:
290 | beam = new_beam
291 | else:
292 | return {}
293 |
294 | # Extract the target distribution
295 | target_dist: Dict[str, float] = {}
296 | total_prob = 0.0
297 | for assignment, prob in beam:
298 | if target in assignment:
299 | value = assignment[target]
300 | target_dist[value] = target_dist.get(value, 0) + prob
301 | total_prob += prob
302 |
303 | if total_prob > 0:
304 | return {val: p / total_prob for val, p in target_dist.items()}
305 | return {}
306 |
307 | def sample_value_from_distribution(self, distribution: Mapping[str, float]) -> str:
308 | """
309 | Sample a value from a probability distribution
310 | """
311 | anchor = random.random() # nosec
312 | cumulative_probability = 0.0
313 | for value, probability in distribution.items():
314 | cumulative_probability += probability
315 | if anchor < cumulative_probability:
316 | return value
317 | # Fall back to first value
318 | return next(iter(distribution.keys()))
319 |
320 | def get_distribution_for_node(
321 | self,
322 | node: BayesianNode,
323 | sample: Mapping[str, Any],
324 | evidence: Optional[Dict[str, Set[str]]] = None,
325 | ) -> Dict[str, float]:
326 | """
327 | Get the probability distribution for a node given the current sample
328 | """
329 | # For multi-value conditions, use beam search
330 | if evidence and node.name in evidence and len(evidence[node.name]) > 1:
331 | # Current evidence is what we've sampled so far
332 | current_evidence = {k: v for k, v in sample.items()}
333 |
334 | # Calculate distribution using beam search
335 | distribution = self.trace(node.name, current_evidence)
336 | # Filter by allowed values and renormalize
337 | if node.name in evidence:
338 | allowed_values = evidence[node.name]
339 | filtered_dist = {k: v for k, v in distribution.items() if k in allowed_values}
340 |
341 | # If no probability mass, the conditions are impossible
342 | if not filtered_dist or sum(filtered_dist.values()) <= 0:
343 | raise RestrictiveConstraints(
344 | f"Cannot generate fingerprint: no valid values for {node.name} with current conditions"
345 | )
346 |
347 | # Renormalize
348 | total = sum(filtered_dist.values())
349 | filtered_dist = {k: v / total for k, v in filtered_dist.items()}
350 | return filtered_dist
351 |
352 | return distribution
353 |
354 | # For regular nodes, use direct sampling
355 | parent_values = {parent: sample[parent] for parent in node.parent_names}
356 |
357 | cpt = node.get_probabilities_given_known_values(parent_values)
358 | if not cpt and node.possible_values:
359 | # If missing probabilities, use uniform distribution
360 | uniform_prob = 1.0 / len(node.possible_values)
361 | cpt = {v: uniform_prob for v in node.possible_values}
362 |
363 | if not cpt:
364 | raise RestrictiveConstraints(
365 | f"Cannot generate fingerprint: no probability table for {node.name}"
366 | )
367 |
368 | return cpt
369 |
370 | def get_shared_possibilities(
371 | self,
372 | value_possibilities: Mapping[str, Set[str]],
373 | seen_nodes: Optional[Set[Tuple[str, int]]] = None,
374 | orig_parents: Optional[Tuple[str, ...]] = None,
375 | ) -> Optional[Dict[str, Set[str]]]:
376 | """
377 | Get shared possibilities across nodes based on conditions.
378 | Returns None if conditions are contradictory.
379 |
380 | This is deprecated as of v1.3.0 but still exposed for testing.
381 | """
382 | # Return empty dict immediately
383 | if not value_possibilities:
384 | return {}
385 |
386 | if seen_nodes is None:
387 | seen_nodes = set()
388 |
389 | # Propagate upward to find possible parent values
390 | all_parents = {node: set(values) for node, values in value_possibilities.items()}
391 | for node, values in value_possibilities.items():
392 | # Track nodes we've processed
393 | if (node, len(values)) in seen_nodes:
394 | continue
395 | seen_nodes.add((node, len(values)))
396 | self._intersect_parents(node, values, all_parents)
397 |
398 | if orig_parents is None:
399 | orig_parents = tuple(all_parents.keys())
400 |
401 | # If any parent has no valid values, conditions are contradictory
402 | if any(len(parents) == 0 for parents in all_parents.values()):
403 | return None
404 |
405 | return all_parents
406 |
407 | def _intersect_parents(
408 | self, node: str, values: Set[str], all_parents: Dict[str, Set[str]]
409 | ) -> None:
410 | """
411 | Intersect possible parent values based on child node conditions
412 | """
413 | node_obj = self.nodes_by_name.get(node)
414 | if not node_obj:
415 | return
416 |
417 | parent_names = node_obj.parent_names
418 | num_parents = len(parent_names)
419 |
420 | # No parents exist, nothing to do
421 | if not num_parents:
422 | return
423 |
424 | # Build a set of each parent's possible values
425 | parent_values: List[Set[str]] = [set() for _ in range(num_parents)]
426 | for value in values:
427 | collect_parents(
428 | node_obj.probabilities,
429 | value,
430 | parent_values=parent_values,
431 | )
432 |
433 | # Update all_parents with the intersection of this node's parents
434 | for n, parents in enumerate(parent_values):
435 | parent_name = parent_names[n]
436 | if parent_name not in all_parents:
437 | all_parents[parent_name] = parents
438 | else:
439 | all_parents[parent_name] = all_parents[parent_name].intersection(parents)
440 |
441 | # Recurse to earlier parents if needed
442 | if parent_names and parent_names[0] != self.nodes_in_sampling_order[0].name:
443 | self._intersect_parents(
444 | node=parent_names[0], values=parent_values[0], all_parents=all_parents
445 | )
446 |
447 |
448 | def collect_parents(
449 | probabilities: Mapping[str, Any],
450 | target: str,
451 | parent_values: List[Set[str]],
452 | so_far: Optional[List[str]] = None,
453 | depth: int = 0,
454 | ) -> None:
455 | """
456 | Collects all the possible parent values of a node
457 | """
458 | if so_far is None:
459 | so_far = []
460 | for parent, values in probabilities.items():
461 | if isinstance(values, dict):
462 | collect_parents(
463 | probabilities=values,
464 | target=target,
465 | parent_values=parent_values,
466 | so_far=so_far + [parent],
467 | depth=depth + 1,
468 | )
469 | elif parent == target:
470 | for n, parent in enumerate(so_far):
471 | parent_values[n].add(parent)
472 |
--------------------------------------------------------------------------------
/fpgen/exceptions.py:
--------------------------------------------------------------------------------
1 | class NetworkError(ValueError):
2 | """Error with the network"""
3 |
4 |
5 | class InvalidConstraints(NetworkError):
6 | """Raises when a constraint isn't possible"""
7 |
8 |
9 | class RestrictiveConstraints(InvalidConstraints):
10 | """Raises when the passed constraints are too restrictive"""
11 |
12 |
13 | class InvalidNode(NetworkError):
14 | """Raises when a node doesn't exist"""
15 |
16 |
17 | class NodePathError(InvalidNode):
18 | """Raises when a key path doesn't exist"""
19 |
20 |
21 | class MissingRelease(Exception):
22 | """Raised when a required GitHub release asset is missing."""
23 |
24 |
25 | class CannotTraceLargeConfigSpace(ValueError):
26 | """Raises when the configuration space of a node is too large to trace with exact inference"""
27 |
--------------------------------------------------------------------------------
/fpgen/generator.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Any, Dict, List, Optional, Set, Union, overload
3 |
4 | from .bayesian_network import StrContainer
5 | from .exceptions import RestrictiveConstraints
6 | from .trace import TraceResult, TraceResultDict, trace
7 | from .utils import (
8 | NETWORK,
9 | _assert_dict_xor_kwargs,
10 | _assert_network_exists,
11 | _find_roots,
12 | _make_output_dict,
13 | _maybe_flatten,
14 | _reassemble_targets,
15 | _tupilize,
16 | build_evidence,
17 | )
18 |
19 |
20 | class Generator:
21 | """
22 | Generates realistic browser fingerprints
23 | """
24 |
25 | def __init__(
26 | self,
27 | conditions: Optional[Dict[str, Any]] = None,
28 | *,
29 | strict: bool = True,
30 | flatten: bool = False,
31 | **conditions_kwargs: Any,
32 | ):
33 | """
34 | Initializes the Generator with the given options.
35 | Values passed to the Generator object will be inherited when calling Generator.generate()
36 |
37 | Parameters:
38 | conditions (dict, optional): Conditions for the generated fingerprint.
39 | strict (bool, optional): Whether to raise an exception if the conditions are too strict.
40 | flatten (bool, optional): Whether to flatten the output dictionary
41 | target (Optional[Union[str, StrContainer]]): Only generate specific value(s)
42 | **conditions_kwargs: Conditions for the generated fingerprint (passed as kwargs)
43 | """
44 | _assert_dict_xor_kwargs(conditions, conditions_kwargs)
45 | # Set default options
46 | self.strict: bool = strict
47 | self.flatten: bool = flatten
48 | self.evidence: Dict[str, Set[str]] = {}
49 |
50 | if conditions_kwargs:
51 | conditions = conditions_kwargs
52 | if conditions:
53 | build_evidence(conditions, self.evidence)
54 |
55 | @overload
56 | def generate(
57 | self,
58 | conditions: Optional[Dict[str, Any]] = None,
59 | *,
60 | strict: Optional[bool] = None,
61 | flatten: Optional[bool] = None,
62 | target: str,
63 | **conditions_kwargs: Any,
64 | ) -> Any: ...
65 |
66 | @overload
67 | def generate(
68 | self,
69 | conditions: Optional[Dict[str, Any]] = None,
70 | *,
71 | strict: Optional[bool] = None,
72 | flatten: Optional[bool] = None,
73 | target: Optional[StrContainer] = None,
74 | **conditions_kwargs: Any,
75 | ) -> Dict[str, Any]: ...
76 |
77 | def generate(
78 | self,
79 | conditions: Optional[Dict[str, Any]] = None,
80 | *,
81 | strict: Optional[bool] = None,
82 | flatten: Optional[bool] = None,
83 | target: Optional[Union[str, StrContainer]] = None,
84 | **conditions_kwargs: Any,
85 | ) -> Dict[str, Any]:
86 | """
87 | Generates a fingerprint.
88 |
89 | Parameters:
90 | conditions (dict, optional): Conditions for the generated fingerprints.
91 | These conditions will be inherited by generated fingerprints.
92 | strict (bool, optional): Whether to raise an exception if the conditions are too strict.
93 | flatten (bool, optional): Whether to flatten the output dictionary
94 | target (Optional[Union[str, StrContainer]]): Only generate specific value(s)
95 | **conditions_kwargs: Conditions for the generated fingerprints (passed as kwargs)
96 |
97 | Returns:
98 | A generated fingerprint.
99 | """
100 | _assert_dict_xor_kwargs(conditions, conditions_kwargs)
101 | _assert_network_exists()
102 |
103 | if conditions_kwargs:
104 | conditions = conditions_kwargs
105 |
106 | # Merge new options with old
107 | strict = _first(strict, self.strict)
108 | flatten = _first(flatten, self.flatten)
109 |
110 | # Inherit the evidence from the class instance
111 | evidence = self.evidence.copy()
112 | if conditions:
113 | build_evidence(conditions, evidence, strict=strict)
114 |
115 | # Convert targets to set
116 | if target:
117 | target_tup = _tupilize(target)
118 | target_roots = set(_find_roots(target_tup))
119 | else:
120 | target_roots = None
121 |
122 | # Generate fingerprint
123 | while True:
124 | # If we only are searching for certain targets, call generate_certain_nodes
125 | if target_roots:
126 | fingerprint = NETWORK.generate_certain_nodes(evidence, target_roots)
127 | else:
128 | fingerprint = NETWORK.generate_consistent_sample(evidence)
129 |
130 | # Found the fingerprint
131 | if fingerprint is not None:
132 | break
133 | # Raise an error if the evidence is too strict
134 | if strict:
135 | raise RestrictiveConstraints(
136 | 'Cannot generate fingerprint. Constraints are too restrictive.'
137 | )
138 | # If no fingerprint was generated, relax the filtered values until we find one
139 | evidence.pop(next(iter(evidence.keys())))
140 |
141 |         # If we're searching for specific targets, reassemble and return just those values
142 | if target:
143 | output = _make_output_dict(fingerprint, flatten=False) # Don't flatten yet
144 | output = _reassemble_targets(_tupilize(target), output)
145 | if isinstance(target, str):
146 | output = output[target]
147 | return _maybe_flatten(flatten, output)
148 |
149 | return _make_output_dict(fingerprint, flatten=flatten)
150 |
151 | def generate_target(
152 | self, target: str, conditions: Optional[Dict[str, Any]] = None, **kwargs: Any
153 | ) -> Any:
154 | """
155 | Generates a specific target. Shortcut for the `generate` method.
156 | """
157 | return self.generate(target=target, conditions=conditions, **kwargs)
158 |
159 | @overload
160 | def trace(
161 | self,
162 | target: str,
163 | conditions: Optional[Dict[str, Any]] = None,
164 | *,
165 | flatten: bool = False,
166 | **conditions_kwargs: Any,
167 | ) -> List[TraceResult]: ...
168 |
169 | @overload
170 | def trace(
171 | self,
172 | target: StrContainer,
173 | conditions: Optional[Dict[str, Any]] = None,
174 | *,
175 | flatten: bool = False,
176 | **conditions_kwargs: Any,
177 | ) -> TraceResultDict: ...
178 |
179 | def trace(
180 | self,
181 | target: Union[str, StrContainer],
182 | conditions: Optional[Dict[str, Any]] = None,
183 | *,
184 | flatten: bool = False,
185 | **conditions_kwargs: Any,
186 | ) -> Union[List[TraceResult], TraceResultDict]:
187 | """
188 | Compute the probability distribution(s) of a target variable given conditions.
189 |
190 | Parameters:
191 |             target (str | StrContainer): The target variable name(s).
192 |             conditions (Dict[str, Any], optional): A dictionary mapping variable names to condition values.
193 | flatten (bool, optional): If True, return a flattened dictionary.
194 | **conditions_kwargs: Additional conditions to apply
195 |
196 | Returns:
197 |             A list of TraceResult objects sorted by probability (or a dictionary of lists when multiple targets are passed).
198 | """
199 | return trace(
200 | target=target,
201 | flatten=flatten,
202 | conditions=conditions,
203 | **conditions_kwargs,
204 | # Inherit the conditions from the class instance
205 | __evidence__=self.evidence.copy(),
206 | )
207 |
208 |
209 | def _first(*values):
210 | """
211 | Simple function that returns the first non-None value passed
212 | """
213 | return next((v for v in values if v is not None), None)
214 |
215 |
216 | """
217 | A global `generate` function for those calling
218 | fpgen.generate() directly without creating a Generator object
219 | """
220 |
221 | GLOBAL_GENERATOR: Optional[Generator] = None
222 |
223 |
224 | def generate(*args, **kwargs) -> Dict[str, Any]:
225 | """
226 | Generates a fingerprint.
227 |
228 | Parameters:
229 | conditions (dict, optional): Conditions for the generated fingerprints.
230 | These conditions will be inherited by generated fingerprints.
231 | strict (bool, optional): Whether to raise an exception if the conditions are too strict.
232 | flatten (bool, optional): Whether to flatten the output dictionary
233 | target (Optional[Union[str, StrContainer]]): Only generate specific value(s)
234 | **conditions_kwargs: Conditions for the generated fingerprints (passed as kwargs)
235 |
236 | Returns:
237 | A generated fingerprint.
238 | """
239 | global GLOBAL_GENERATOR
240 | if GLOBAL_GENERATOR is None:
241 | GLOBAL_GENERATOR = Generator()
242 | return GLOBAL_GENERATOR.generate(*args, **kwargs)
243 |
244 |
245 | def generate_target(target: str, conditions: Optional[Dict[str, Any]] = None, **kwargs) -> Any:
246 | """
247 | Generates a specific target. Shortcut for the `generate` method.
248 | """
249 | return generate(target=target, conditions=conditions, **kwargs)
250 |
251 |
252 | __all__ = ('Generator', 'WindowBounds', 'generate', 'generate_target')
253 |
--------------------------------------------------------------------------------
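A minimal usage sketch of the generator API above, mirroring the calls exercised in the tests below; illustrative only, and it assumes the model files have already been downloaded:

    import fpgen

    # Module-level helper: generate a full fingerprint constrained to ChromeOS
    fp = fpgen.generate(os='ChromeOS')

    # Generate a single value (returns the value directly when `target` is a str)
    browser = fpgen.generate(os='ChromeOS', target='browser')

    # Reusable Generator: conditions passed here are inherited by every generate() call
    gen = fpgen.Generator(browser=('Firefox', 'Chrome'), strict=False)
    fp = gen.generate(flatten=True)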
/fpgen/pkgman.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | import zipfile
4 | from datetime import datetime, timedelta
5 | from pathlib import Path
6 | from typing import Any, Dict
7 |
8 | import click
9 | import httpx
10 | import orjson
11 | import zstandard
12 |
13 | from .exceptions import MissingRelease
14 |
15 | # Model files
16 | DATA_DIR = Path(__file__).parent / 'data'
17 |
18 | NETWORK_FILE = DATA_DIR / "fingerprint-network.json"
19 | VALUES_JSON = DATA_DIR / 'values.json'
20 | VALUES_DATA = DATA_DIR / 'values.dat'
21 |
22 | # Mapping of files to their compressed variant
23 | FILE_PAIRS = {
24 | NETWORK_FILE: NETWORK_FILE.with_suffix('.json.zst'),
25 | VALUES_JSON: VALUES_JSON.with_suffix('.json.zst'),
26 | VALUES_DATA: VALUES_DATA.with_suffix('.dat.zst'),
27 | }
28 |
29 | # Repo to pull releases from
30 | GITHUB_REPO = 'scrapfly/fingerprint-generator'
31 |
32 |
33 | class ModelPuller:
34 | """
35 | Pulls the model from GitHub and extracts it to the data directory.
36 | """
37 |
38 | def __init__(self) -> None:
39 | self.api_url = f"https://api.github.com/repos/{GITHUB_REPO}/releases"
40 |
41 | def check_asset(self, asset: Dict) -> Any:
42 | """
43 | Compare the asset to determine if it's the desired asset.
44 |
45 | Args:
46 | asset: Asset information from GitHub API
47 |
48 | Returns:
49 | Any: Data to be returned if this is the desired asset, or None/False if not
50 | """
51 | url = asset.get('browser_download_url')
52 | if url and url.endswith('.zip'):
53 | return url
54 |
55 | def missing_asset_error(self) -> None:
56 | """
57 | Raise a MissingRelease exception if no release is found.
58 | """
59 | raise MissingRelease(f"Could not find a release asset in {GITHUB_REPO}.")
60 |
61 | def get_asset(self) -> Any:
62 | """
63 | Fetch the latest release from the GitHub API.
64 | Gets the first asset that returns a truthy value from check_asset.
65 | """
66 | resp = httpx.get(self.api_url, timeout=20, verify=False)
67 | resp.raise_for_status()
68 |
69 | releases = resp.json()
70 |
71 | for release in releases:
72 | for asset in release['assets']:
73 | if data := self.check_asset(asset):
74 | return data
75 |
76 | self.missing_asset_error()
77 |
78 | def download(self):
79 | """
80 | Download the model from GitHub and extract it to the data directory.
81 | """
 82 |         # Pull from a custom source, or the GitHub API
83 |
84 | url = os.getenv('FPGEN_MODEL_URL')
85 | if url:
86 | click.echo(f"Fetching model files from {url}...")
87 | else:
88 | click.echo("Fetching model files from GitHub...")
89 | url = self.get_asset()
90 |
91 | # Optionally get the model password
92 | password = os.getenv('FPGEN_MODEL_PASSWORD')
93 | if password:
94 | password = password.encode()
95 |
96 | # Stream to tempfile then extract using zipfile
97 | with tempfile.NamedTemporaryFile(delete=False) as temp_file:
98 | with httpx.stream(
99 | 'GET', url, timeout=20, verify=False, follow_redirects=True
100 | ) as r: # nosec
101 | for chunk in r.iter_bytes():
102 | temp_file.write(chunk)
103 | temp_file.flush()
104 | temp_file.close()
105 | # Print extraction message if running as module
106 | if __is_module__():
107 | click.echo(f"Extracting to {DATA_DIR}...")
108 | with zipfile.ZipFile(temp_file.name) as z:
109 | z.extractall(DATA_DIR, pwd=password)
110 |
111 | os.unlink(temp_file.name)
112 |
113 |
114 | """
115 | File helper
116 | """
117 |
118 |
119 | def extract_json(path: Path) -> dict:
120 | """
121 | Reads JSON from a file (or from a zst if needed).
122 | """
123 | # Check for uncompressed json
124 | if path.exists():
125 | with open(path, 'rb') as f:
126 | return orjson.loads(f.read())
127 |
128 | # Check for zst json
129 | elif (zst_path := path.with_suffix('.json.zst')).exists():
130 | with open(zst_path, 'rb') as f:
131 | decomp = zstandard.ZstdDecompressor()
132 | return orjson.loads(decomp.decompress(f.read()))
133 |
134 | raise FileNotFoundError(f'Missing required data file for: {path}')
135 |
136 |
137 | """
138 | Model file utility functions
139 | """
140 |
141 |
142 | def download_model():
143 | """
144 | Call the model puller to download files
145 | """
146 | ModelPuller().download()
147 |
148 |
149 | def decompress_model():
150 | """
151 | Decompress model files
152 | """
153 | import zstandard
154 |
155 | dctx = zstandard.ZstdDecompressor()
156 |     for dst, src_zst in FILE_PAIRS.items():
157 | if not src_zst.exists():
158 | click.echo(f"Warning: {src_zst} not found, skipping")
159 | continue
160 |
161 | click.echo(f"Decompressing {src_zst} -> {dst}")
162 | with open(src_zst, 'rb') as src, open(dst, 'wb') as dst_f:
163 | dctx.copy_stream(src, dst_f)
164 | src_zst.unlink()
165 |
166 |
167 | def recompress_model():
168 | """
169 | Recompress model files after running decompress
170 | """
171 | import zstandard
172 |
173 | cctx = zstandard.ZstdCompressor(level=19)
174 | for src, dst_zst in FILE_PAIRS.items():
175 | if not src.exists():
176 | click.echo(f"Warning: {src} not found, skipping")
177 | continue
178 |
179 | click.echo(f"Compressing {src} -> {dst_zst}")
180 | with open(src, 'rb') as src_f:
181 | data = src_f.read()
182 | compressed = cctx.compress(data)
183 | with open(dst_zst, 'wb') as dst:
184 | dst.write(compressed)
185 | src.unlink()
186 |
187 |
188 | def remove_model(log=True):
189 | """
190 | Remove all model files
191 | """
192 |     found = False
193 |     for file_pair in FILE_PAIRS.items():
194 | for file in file_pair:
195 | if not file.exists():
196 | continue
197 | if log:
198 | click.echo(f"Removing {file}")
199 | file.unlink()
200 | found = True
201 | return found
202 |
203 |
204 | def files_are_recent(file_list):
205 | """
206 | Checks if all passed files are <5 weeks old
207 | """
208 | cutoff = datetime.now() - timedelta(weeks=5)
209 | return all(datetime.fromtimestamp(f.stat().st_mtime) >= cutoff for f in file_list)
210 |
211 |
212 | def assert_downloaded():
213 | """
214 | Checks if the model files are downloaded
215 | """
216 | if __is_module__():
217 | return # Skip if running as a module
218 |
219 | # Check decompressed files (FILE_PAIRS keys)
220 | if all(file.exists() for file in FILE_PAIRS.keys()):
221 | # When updating decompressed files, decompress again after redownloading
222 | if not files_are_recent(FILE_PAIRS.keys()):
223 | ModelPuller().download()
224 | decompress_model()
225 | return
226 |
227 | # Check compressed files (FILE_PAIRS values)
228 | if all(file.exists() for file in FILE_PAIRS.values()) and files_are_recent(FILE_PAIRS.values()):
229 | return
230 |
231 | # First time importing
232 | ModelPuller().download()
233 |
234 |
235 | def __is_module__() -> bool:
236 | """
237 |     Checks if fpgen is being run as a module (FPGEN_NO_INIT is set)
238 | """
239 | return bool(os.getenv('FPGEN_NO_INIT'))
240 |
241 |
242 | # Check model files are downloaded
243 | assert_downloaded()
244 |
--------------------------------------------------------------------------------
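A short sketch of the environment variables read by pkgman above; illustrative only (the URL is a placeholder, not a real endpoint):

    import os

    # Option A: skip model download/initialization entirely (treated as running as a module)
    os.environ['FPGEN_NO_INIT'] = '1'

    # Option B: pull the model archive from a custom source instead of the GitHub releases
    # os.environ['FPGEN_MODEL_URL'] = 'https://example.com/model.zip'   # placeholder URL
    # os.environ['FPGEN_MODEL_PASSWORD'] = 'hunter2'                    # only for protected zips

    import fpgen  # assert_downloaded() runs when the package is imported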
/fpgen/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapfly/fingerprint-generator/fc7e489687821f691bae9308d73e294497d015fb/fpgen/py.typed
--------------------------------------------------------------------------------
/fpgen/structs.py:
--------------------------------------------------------------------------------
1 | # https://stackoverflow.com/a/32888599
2 | class CaseInsensitiveDict(dict):
3 | @classmethod
4 | def _k(cls, key):
5 | return key.casefold() if isinstance(key, str) else key
6 |
7 | def __init__(self, *args, **kwargs):
8 | super(CaseInsensitiveDict, self).__init__(*args, **kwargs)
9 | self._convert_keys()
10 |
11 | def __getitem__(self, key):
12 | return super(CaseInsensitiveDict, self).__getitem__(self.__class__._k(key))
13 |
14 | def __setitem__(self, key, value):
15 | super(CaseInsensitiveDict, self).__setitem__(self.__class__._k(key), value)
16 |
17 | def __delitem__(self, key):
18 | return super(CaseInsensitiveDict, self).__delitem__(self.__class__._k(key))
19 |
20 | def __contains__(self, key):
21 | return super(CaseInsensitiveDict, self).__contains__(self.__class__._k(key))
22 |
23 |     def has_key(self, key):
24 |         return self.__class__._k(key) in self
25 |
26 | def pop(self, key, *args, **kwargs):
27 | return super(CaseInsensitiveDict, self).pop(self.__class__._k(key), *args, **kwargs)
28 |
29 | def get(self, key, *args, **kwargs):
30 | return super(CaseInsensitiveDict, self).get(self.__class__._k(key), *args, **kwargs)
31 |
32 | def setdefault(self, key, *args, **kwargs):
33 | return super(CaseInsensitiveDict, self).setdefault(self.__class__._k(key), *args, **kwargs)
34 |
35 | def update(self, E={}, **F):
36 | super(CaseInsensitiveDict, self).update(self.__class__(E))
37 | super(CaseInsensitiveDict, self).update(self.__class__(**F))
38 |
39 | def _convert_keys(self):
40 | for k in list(self.keys()):
41 | v = super(CaseInsensitiveDict, self).pop(k)
42 | self.__setitem__(k, v)
43 |
--------------------------------------------------------------------------------
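A quick illustration of the case-insensitive behaviour above: keys are casefolded on insert and on lookup, so mixed-case access works transparently.

    from fpgen.structs import CaseInsensitiveDict

    d = CaseInsensitiveDict({'User-Agent': 'Mozilla/5.0'})
    assert d['user-agent'] == 'Mozilla/5.0'
    assert 'USER-AGENT' in d
    d['Accept-Language'] = 'en-US'
    assert d.get('accept-language') == 'en-US'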
/fpgen/trace.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Any, Dict, List, Optional, Set, Union, overload
3 |
4 | import orjson
5 |
6 | from .bayesian_network import StrContainer
7 | from .exceptions import RestrictiveConstraints
8 | from .unpacker import lookup_value_list
9 | from .utils import (
10 | NETWORK,
11 | _assert_dict_xor_kwargs,
12 | _find_roots,
13 | _tupilize,
14 | build_evidence,
15 | )
16 |
17 |
18 | @dataclass
19 | class TraceResult:
20 | value: Any
21 | probability: float
22 |
23 | def __repr__(self) -> str:
24 | return f"<{self.value}: {self.probability * 100:.5f}%>"
25 |
26 |
27 | # Recursive type for the return value
28 | TraceResultDict = Dict[str, Union[List[TraceResult], "TraceResultDict"]]
29 |
30 |
31 | @overload
32 | def trace(
33 | target: str,
34 | conditions: Optional[Dict[str, Any]] = None,
35 | *,
36 | flatten: bool = False,
37 | **conditions_kwargs,
38 | ) -> List[TraceResult]: ...
39 |
40 |
41 | @overload
42 | def trace(
43 | target: StrContainer,
44 | conditions: Optional[Dict[str, Any]] = None,
45 | *,
46 | flatten: bool = False,
47 | **conditions_kwargs,
48 | ) -> TraceResultDict: ...
49 |
50 |
51 | def trace(
52 | target: Union[str, StrContainer],
53 | conditions: Optional[Dict[str, Any]] = None,
54 | *,
55 | flatten: bool = False,
56 | __evidence__: Optional[Dict[str, Set[str]]] = None,
57 | **conditions_kwargs,
58 | ) -> Union[List[TraceResult], TraceResultDict]:
59 | """
60 | Compute the probability distribution(s) of a target variable given conditions.
61 |
62 | Parameters:
63 |         target (str | StrContainer): The target variable name(s).
64 |         conditions (Dict[str, Any], optional): A dictionary mapping variable names to condition values.
65 | flatten (bool, optional): If True, return a flattened dictionary.
66 | **conditions_kwargs: Additional conditions to apply
67 |
68 | Returns:
69 |         A list of TraceResult objects sorted by probability (or a dictionary of lists when multiple targets are passed).
70 | """
71 | _assert_dict_xor_kwargs(conditions, conditions_kwargs)
72 |
73 | # If evidence was already passed, consume it
74 | evidence: Dict[str, Set[str]] = __evidence__ or {}
75 |
76 | # Build conditions
77 | if conditions_kwargs:
78 | conditions = conditions_kwargs
79 | if conditions:
80 | build_evidence(conditions, evidence)
81 |
82 | # Get the targets
83 | target_tup = _tupilize(target)
84 | target_roots = tuple(_find_roots(target_tup))
85 |
86 | # List is empty, raise an error
87 | if not target_tup:
88 | raise ValueError("Please pass at least one valid target.")
89 |
90 | # If there is only one target, return the result
91 | if len(target_roots) == 1:
92 | return _pull_target(target_roots[0], evidence)
93 |
94 | # If flatten is true, return a dictionary of targets
95 | if flatten:
96 | return {root: _pull_target(root, evidence) for root in target_roots}
97 |
98 |     # Otherwise, return an expanded dictionary of targets
99 | output: Dict[str, Any] = {}
100 | for root in target_roots:
101 | parts = root.split(".")
102 | d = output
103 | for part in parts[:-1]:
104 | if part not in d:
105 | d[part] = {}
106 | d = d[part]
107 |         d[parts[-1]] = _pull_target(root, evidence)
108 | return output
109 |
110 |
111 | def _pull_target(target: str, evidence: Dict[str, Set[str]]) -> List[TraceResult]:
112 | """
113 | Gets the probability distribution for a target variable given conditions.
114 | """
115 | possibilities = NETWORK.trace(target=target, evidence=evidence)
116 | if not possibilities:
117 | raise RestrictiveConstraints(
118 |             f"Constraints are too restrictive. No possible values for {target}."
119 | )
120 | data = lookup_value_list(possibilities.keys())
121 | data = map(orjson.loads, data)
122 | probs = possibilities.values()
123 | resp = [
124 | TraceResult(value=value, probability=probability) for value, probability in zip(data, probs)
125 | ]
126 | resp.sort(key=lambda x: x.probability, reverse=True)
127 | return resp
128 |
--------------------------------------------------------------------------------
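An illustrative sketch of the trace API above, based on the calls in tests/failed_cases.py; assumes the model files are present:

    import fpgen

    # Single target: returns a list of TraceResult objects, sorted by probability
    results = fpgen.trace('browser', os='ChromeOS')
    assert results[0].value == 'Chrome' and results[0].probability == 1.0

    # Multiple targets: returns a (possibly nested) dict of TraceResult lists
    dists = fpgen.trace(('browser', 'os'), browser='Firefox')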
/fpgen/unpacker.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from typing import List, Tuple
3 |
4 | import numpy as np
5 | from indexed_zstd import IndexedZstdFile
6 |
7 | from .pkgman import VALUES_DATA, VALUES_JSON, __is_module__, extract_json
8 |
9 |
10 | def load_values_json() -> List[Tuple[str, List[str]]]:
11 | values_json = extract_json(VALUES_JSON)
12 | return list(values_json.items())
13 |
14 |
15 | if not __is_module__():
16 | # Do not attempt to load values.json if we are running as a module
17 | VALUE_PAIRS = load_values_json()
18 |
19 |
20 | def base85_to_int(s: str) -> int:
21 | # Decode using base85
22 | decoded_bytes = base64.b85decode(s)
23 | # Convert bytes to integer
24 | return int.from_bytes(decoded_bytes, byteorder='big')
25 |
26 |
27 | def get_dat_file():
28 | """
29 | Returns a seekable file descriptor (or indexed zst file)
30 | """
31 | if VALUES_DATA.exists():
32 | return open(VALUES_DATA, 'rb')
33 | elif (zst_path := VALUES_DATA.with_suffix('.dat.zst')).exists():
34 | return IndexedZstdFile(str(zst_path))
35 |
36 | raise FileNotFoundError(f'Missing required file: {VALUES_DATA}')
37 |
38 |
39 | def lookup_value(index):
40 | offset, length = VALUE_PAIRS[base85_to_int(index)]
41 | file = get_dat_file()
42 | file.seek(int(offset, 16))
43 | data = file.read(length).decode('utf-8')
44 | file.close()
45 | return data
46 |
47 |
48 | def lookup_value_list(index_list):
49 | """
50 | Returns a list of values from the data file given a list of lookup values
51 | """
52 | # Empty numpy array of len(index_list)
53 | value_map = np.empty(len(index_list), dtype=object)
54 |
55 | file = get_dat_file()
56 | # Read in order from lowest index to highest
57 | sorted_indices = sorted(
58 | (base85_to_int(lookup_index), n) for n, lookup_index in enumerate(index_list)
59 | )
60 |
61 | for index, n in sorted_indices:
62 | offset, length = VALUE_PAIRS[index]
63 | file.seek(int(offset, 16))
64 | # Set to key in order of the original list
65 | value_map[n] = file.read(length).decode('utf-8')
66 |
67 | file.close()
68 | return value_map
69 |
--------------------------------------------------------------------------------
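The lookup indices above are base85-encoded big-endian integers used to index VALUE_PAIRS. A self-contained sketch of the round trip (int_to_base85 is a hypothetical helper for illustration, not part of the package):

    import base64

    def int_to_base85(n: int) -> str:
        # Hypothetical inverse of base85_to_int, for illustration only
        raw = n.to_bytes(max(1, (n.bit_length() + 7) // 8), byteorder='big')
        return base64.b85encode(raw).decode()

    def base85_to_int(s: str) -> int:
        # Mirrors the decoding logic in fpgen/unpacker.py
        return int.from_bytes(base64.b85decode(s), byteorder='big')

    assert base85_to_int(int_to_base85(1234)) == 1234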
/fpgen/utils.py:
--------------------------------------------------------------------------------
1 | from collections.abc import MutableMapping
2 | from typing import (
3 | Any,
4 | Dict,
5 | Iterable,
6 | Iterator,
7 | List,
8 | Literal,
9 | Mapping,
10 | Optional,
11 | Set,
12 | Tuple,
13 | Union,
14 | overload,
15 | )
16 |
17 | import orjson
18 |
19 | from .bayesian_network import BayesianNetwork, StrContainer
20 | from .exceptions import (
21 | InvalidConstraints,
22 | InvalidNode,
23 | NetworkError,
24 | NodePathError,
25 | RestrictiveConstraints,
26 | )
27 | from .pkgman import NETWORK_FILE, __is_module__
28 | from .structs import CaseInsensitiveDict
29 | from .unpacker import lookup_value_list
30 |
31 | # Load the network. (unless we're running as a module)
32 | NETWORK: BayesianNetwork
33 | if __is_module__():
34 | NETWORK = None # type: ignore
35 | else:
36 | NETWORK = BayesianNetwork(NETWORK_FILE)
37 |
38 |
39 | def _assert_network_exists() -> None:
40 | """
41 | Quick helper to confirm NETWORK is defined
42 | """
43 | if NETWORK is None:
44 | raise NetworkError("FPGEN_NO_INIT is enabled. Cannot access the network.")
45 |
46 |
47 | def query(
48 | target: str, *, flatten: bool = False, sort: bool = False
49 | ) -> Union[Dict[str, Any], List[Any]]:
50 | """
51 | Query a list of possibilities given a target.
52 |
53 | Parameters:
54 | target (str): Target node to query possible values for
55 | flatten (bool, optional): Whether to flatten the output dictionary
56 | sort (bool, optional): Whether to sort the output arrays
57 | """
58 | _assert_network_exists()
59 |
60 | # Check node list first
61 | values = _lookup_possibilities(target, casefold=False)
62 | if values:
63 | output: Union[Tuple, map]
64 | output = tuple(map(orjson.loads, values))
65 | # Merge dicts if data is all dicts, else just return a deduped list
66 | if all(isinstance(d, dict) for d in output):
67 | # Flatten the output dict before returning if needed
68 | return _maybe_flatten(flatten, _merge_dicts(output, sort=sort))
69 | else:
70 | # Dedupe the list
71 | return _dedupe(output, sort=sort)
72 |
73 | # Target is within a node. Need to look up the tree
74 | nested_keys: List[str] = []
75 | root_data = _lookup_root_possibilities(
76 | target, nested_keys=nested_keys, none_if_missing=True, casefold=False
77 | )
78 | if root_data is not None:
79 |         # Read possible values as JSON
80 | output = map(orjson.loads, root_data[1])
81 | # Pull the item at the target path
82 | output = map(lambda d: _at_path(d, nested_keys), output)
83 | output = tuple(output)
84 |
85 | # If they are all dicts, merge them
86 | if all(isinstance(d, dict) for d in output):
87 | # Flatten the output dict if needed
88 | return _maybe_flatten(flatten, _merge_dicts(output, sort=sort))
89 |
90 | # Return a deduped list
91 | return _dedupe(output, sort=sort)
92 |
93 | # Search down the tree
94 | data = _search_downward(target)
95 | resp: Dict[str, List[Any]] = {
96 | # Remove the current node path
97 | key.removeprefix(f'{target}.'): [
98 | # Parse each possible value via orjson
99 | orjson.loads(d)
100 | for d in (_lookup_possibilities(key, casefold=False) or tuple())
101 | ]
102 | for key in data
103 | }
104 | if flatten:
105 | # May need to flatten further
106 | return _flatten({node: _dedupe(values, sort=sort) for node, values in resp.items()})
107 | return _unflatten(resp, sort=sort)
108 |
109 |
110 | """
111 | Helper functions for searching for nodes up/down the network
112 | """
113 |
114 |
115 | def _at_path(data: Mapping, path: StrContainer, *, casefold=False) -> Any:
116 | """
117 | Gets the value in nested dictionary given its path
118 | """
119 | for key in path:
120 | if casefold:
121 | data = CaseInsensitiveDict(data)
122 | if not isinstance(data, MutableMapping) or key not in data:
123 | raise NodePathError(key)
124 | data = data[key]
125 | return data
126 |
127 |
128 | @overload
129 | def _lookup_root_possibilities(
130 | key: str,
131 | nested_keys: Optional[List[str]] = None,
132 | casefold: bool = True,
133 | none_if_missing: Literal[False] = False,
134 | ) -> Tuple[str, Dict[str, Any]]: ...
135 |
136 |
137 | @overload
138 | def _lookup_root_possibilities(
139 | key: str,
140 | nested_keys: Optional[List[str]] = None,
141 | casefold: bool = True,
142 | none_if_missing: Literal[True] = True,
143 | ) -> Optional[Tuple[str, Dict[str, Any]]]: ...
144 |
145 |
146 | def _lookup_root_possibilities(
147 | key: str,
148 | nested_keys: Optional[List[str]] = None,
149 | casefold: bool = True,
150 | none_if_missing: bool = False,
151 | ) -> Optional[Tuple[str, Dict[str, Any]]]:
152 | """
153 |     Finds the first available root node of a given key, and queries its possibilities
154 | """
155 | if not key:
156 | raise InvalidNode('Key cannot be empty.')
157 | while key:
158 | keys = key.rsplit('.', 1)
159 | # Ran out of keys to parse
160 | if len(keys) != 2:
161 | if none_if_missing:
162 | return None
163 | raise InvalidNode(f'{key} is not a valid node')
164 | key, sliced_key = keys
165 |
166 | if nested_keys is not None:
167 | nested_keys.append(sliced_key)
168 |
169 |         # If a nested key is available, enter it
170 | possible_values = _lookup_possibilities(key, casefold)
171 | # iterate backwards until we find the node
172 | if possible_values is not None:
173 | break
174 |
175 | if possible_values is None:
176 | if none_if_missing:
177 | return None
178 | raise InvalidNode(f'{key} is not a valid node')
179 |
180 | if nested_keys:
181 | nested_keys.reverse()
182 |
183 | return key, possible_values
184 |
185 |
186 | def _lookup_possibilities(node_name: str, casefold: bool = True) -> Optional[Dict[str, Any]]:
187 | """
188 | Returns the possible values for the given node name.
189 | Returns as a dictionary {value: lookup_index}
190 | """
191 | if node_name not in NETWORK.nodes_by_name:
192 | return None
193 |
194 | lookup_values = NETWORK.nodes_by_name[node_name].possible_values
195 | actual_values = lookup_value_list(lookup_values)
196 |
197 | return {
198 | (actual.casefold() if casefold else actual): lookup
199 | for actual, lookup in zip(actual_values, lookup_values)
200 | }
201 |
202 |
203 | def _search_downward(domain: str) -> Iterable[str]:
204 | """
205 | Searches for all nodes that begin with a specific key
206 | """
207 | found = False
208 | for i, node in enumerate(NETWORK.nodes_by_name.keys()):
209 | if not node.startswith(domain):
210 | continue
211 |         # Skip partial matches: the character after the prefix must be a '.'
212 | key_len = len(domain)
213 | if len(node) > key_len and node[key_len] != '.':
214 | continue
215 | if not found:
216 | found = True
217 | # Get the original case
218 | yield NETWORK.node_names[i]
219 |
220 | if not found:
221 | raise InvalidNode(f'Unknown node: "{domain}"')
222 |
223 |
224 | def _find_roots(targets: Union[str, StrContainer]) -> Iterator[str]:
225 | """
226 | Given a list of targets, return all nodes that make up that target's data
227 | """
228 | for target in targets:
229 | target = target.casefold()
230 | while True:
231 | # Found a valid target
232 | if target in NETWORK.nodes_by_name:
233 | yield target
234 | break
235 |
236 | keys = target.rsplit('.', 1)
237 | if len(keys) > 1:
238 | # Move target back 1
239 | target = keys[0]
240 | continue
241 |
242 | # We are at the root key.
243 | # Find potential keys before quitting
244 | yield from _search_downward(keys[0])
245 | break
246 |
247 |
248 | def _reassemble_targets(targets: StrContainer, fingerprint: Dict[str, Any]):
249 | result = {}
250 | for target in targets:
251 | try:
252 | data = _at_path(fingerprint, target.split('.'), casefold=True)
253 | except NodePathError as key:
254 | raise InvalidNode(f"'{target}' is not a valid key path (missing {key}).")
255 | result[target] = data
256 | return result
257 |
258 |
259 | """
260 | Miscellaneous python list/dict helpers
261 | """
262 |
263 |
264 | def _dedupe(lst: Iterable[Any], sort: bool) -> List[Any]:
265 | """
266 | Group items by their type, deduping each group
267 | """
268 | groups: Dict[type, Any] = {}
269 | for item in lst:
270 | t = type(item)
271 | if t not in groups:
272 | groups[t] = []
273 | # Only add item if it's not already in its type group
274 | if item not in groups[t]:
275 | groups[t].append(item)
276 |
277 | result = []
278 | # Process groups in order sorted by type name
279 | for t in sorted(groups.keys(), key=lambda typ: typ.__name__):
280 | items = groups[t]
281 |         # Do not sort if `sort` is False, or if the type cannot be sorted reliably (list/dict)
282 | if not sort or t in (list, dict):
283 | result.extend(items)
284 | else:
285 | result.extend(sorted(items))
286 | return result
287 |
288 |
289 | def _unflatten(dictionary, sort: bool) -> Dict[str, Any]:
290 | """
291 | Unflatten dicts and dedupe any nested lists
292 | """
293 | result_dict: Dict[str, Any] = dict()
294 | for key, value in dictionary.items():
295 | parts = key.split(".")
296 | d = result_dict
297 | for part in parts[:-1]:
298 | if part not in d:
299 | d[part] = dict()
300 | d = d[part]
301 | # Dedupe lists
302 | if isinstance(value, list):
303 | value = _dedupe(value, sort=sort)
304 | d[parts[-1]] = value
305 | return result_dict
306 |
307 |
308 | def _flatten(dictionary: Dict[str, Any], parent_key=False) -> Dict[str, Any]:
309 | """
310 | Turn a nested dictionary into a flattened dictionary
311 | https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys
312 | """
313 | items: List[Tuple[str, Any]] = []
314 | for key, value in dictionary.items():
315 | new_key = str(parent_key) + '.' + key if parent_key else key
316 | if isinstance(value, dict):
317 | items.extend(_flatten(value, new_key).items())
318 | else:
319 | items.append((new_key, value))
320 | return dict(items)
321 |
322 |
323 | def _maybe_flatten(flatten: Optional[bool], data):
324 | if not isinstance(data, dict):
325 | return data
326 | if flatten:
327 | return _flatten(data)
328 | return data
329 |
330 |
331 | def _merge_dicts(dict_list: Iterable[Dict[str, Any]], sort: bool) -> Dict[str, Any]:
332 | """
333 | Merge items in a list of dicts into a dict of merged values.
334 | For a given key, if all values are dicts, merge them recursively.
335 | If all values are lists, flatten them into a single list and dedupe.
336 | Otherwise, dedupe the list of values.
337 | """
338 | if not dict_list:
339 | return {}
340 |
341 | merged: Dict[str, Any] = {}
342 | # Get the union of keys from all dictionaries.
343 | all_keys: Set[str] = set()
344 | for d in dict_list:
345 | all_keys.update(d.keys())
346 |
347 | for key in all_keys:
348 | # Get the list of values for the current key, skipping dicts that don't have it
349 | values = [d[key] for d in dict_list if key in d]
350 |
351 | if all(isinstance(v, dict) for v in values):
352 | # Merge dictionaries recursively
353 | merged[key] = _merge_dicts(values, sort=sort)
354 | elif all(isinstance(v, list) for v in values):
355 | # Merge lists
356 | merged_list = []
357 | for lst in values:
358 | merged_list.extend(lst)
359 | merged[key] = _dedupe(merged_list, sort=sort)
360 | else:
361 | # For mixed/scalar values, dedupe
362 | merged[key] = _dedupe(values, sort=sort)
363 |
364 | return merged
365 |
366 |
367 | def _tupilize(value) -> Union[List[str], Tuple[str, ...]]:
368 | """
369 | If a value is not a tuple or list, wrap it in a tuple
370 | """
371 | return value if isinstance(value, (tuple, list)) else (value,)
372 |
373 |
374 | """
375 | Parse user input
376 | """
377 |
378 |
379 | def _flatten_conditions(
380 | dictionary: Mapping[str, Any], parent_key: str = '', casefold: bool = False
381 | ) -> Dict[str, Any]:
382 | """
383 | Flattens the passed list of conditions
384 | """
385 | # Original flattening logic from here:
386 | # https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys
387 | items: List[Tuple[str, Any]] = []
388 | for key, value in dictionary.items():
389 | new_key = parent_key + '.' + key if parent_key else key
390 | if isinstance(value, MutableMapping):
391 | items.extend(_flatten_conditions(value, new_key).items())
392 | else:
393 | # If we have a tuple or set, treat it as an array of possible values
394 | if isinstance(value, (set, tuple)):
395 | value = tuple(orjson.dumps(v).decode() for v in value)
396 | # If we have a function, don't flatten it
397 | elif not callable(value):
398 | value = orjson.dumps(value).decode()
399 | if casefold:
400 | new_key = new_key.casefold()
401 | items.append((new_key, value))
402 | return dict(items)
403 |
404 |
405 | def build_evidence(
406 | conditions: Dict[str, Any], evidence: Dict[str, Set[str]], strict: Optional[bool] = None
407 | ) -> None:
408 | """
409 | Builds evidence based on the user's inputted conditions
410 | """
411 | if strict is None:
412 | strict = True
413 |
414 | # Flatten to match the format of the fingerprint network
415 | conditions = _flatten_conditions(conditions, casefold=True)
416 |
417 | for key, value in conditions.items():
418 | possible_values = _lookup_possibilities(key)
419 |
420 | # Handle nested keys
421 | nested_keys: List[str] = []
422 | if possible_values is None:
423 | key, possible_values = _lookup_root_possibilities(key, nested_keys)
424 | # Get the real name for the key
425 | key = NETWORK.nodes_by_name[key].name
426 |
427 | evidence[key] = set()
428 |
429 | for value_con in _tupilize(value):
430 | # Read the passed value
431 | if callable(value_con):
432 | val = value_con # Callable
433 | else:
434 | val = orjson.loads(value_con.casefold()) # Dict/list/str data
435 |
436 |             # Handle nested keys by filtering out possible values that don't
437 |             # match the value at the target
438 | if nested_keys:
439 | nested_keys = list(map(lambda s: s.casefold(), nested_keys))
440 | for poss_value, lookup_index in possible_values.items():
441 | # Parse the dictionary
442 | outputted_possible = orjson.loads(poss_value)
443 |
444 | # Check if the value is a possible value at the nested path
445 | try:
446 | target_value = _at_path(outputted_possible, nested_keys)
447 | except NodePathError:
448 | continue # Path didn't exist, bad data
449 | if callable(val) and val(target_value):
450 | evidence[key].add(lookup_index)
451 | elif target_value == val:
452 | evidence[key].add(lookup_index)
453 |
454 | # If nothing was found, raise an error
455 | if not evidence[key]:
456 | if callable(val):
457 |                         # Callable didn't match any possible values
458 | raise InvalidConstraints(
459 | f'The passed function ({val}) yielded no possible values for "{key}" '
460 | f'at "{".".join(nested_keys)}"'
461 | )
462 | raise InvalidConstraints(
463 | f'{value_con} is not a possible value for "{key}" '
464 | f'at "{".".join(nested_keys)}"'
465 | )
466 | continue
467 |
468 | # ===== NON NESTED VALUE HANDLING =====
469 |
470 | # If callable, get all possible values then check for matches
471 | if callable(val):
472 | # Filter by val(x)
473 | found = False
474 | for possible_val, lookup_index in possible_values.items():
475 | if val(orjson.loads(possible_val)):
476 | evidence[key].add(lookup_index)
477 | found = True
478 | if not found:
479 | raise InvalidConstraints(
480 | f'The passed function ({val}) yielded no possible values for "{key}"'
481 | )
482 | continue
483 |
484 | # Non nested values can be handled by directly checking possible_values
485 | lookup_index = possible_values.get(value_con.casefold())
486 | # Value is not possible
487 | if lookup_index is None:
488 | raise InvalidConstraints(f'{value_con} is not a possible value for "{key}"')
489 | evidence[key].add(lookup_index)
490 |
491 | # Validate the evidence is possible (or try to relax the evidence if strict is False)
492 |     while True:
493 |         try:
494 |             NETWORK.validate_evidence(evidence)
495 |             break
496 |         except RestrictiveConstraints as e:
497 |             if strict:
498 |                 raise e
499 |             # Relax by removing the first added key, then re-validate
500 |             evidence.pop(next(iter(evidence.keys())))
501 |
502 |
503 | def _assert_dict_xor_kwargs(
504 | passed_dict: Optional[Dict[str, Any]], passed_kwargs: Optional[Dict[str, Any]]
505 | ) -> None:
506 | """
507 |     Confirms that conditions are passed either as a dict argument or as kwargs, but not both.
508 | """
509 | # Exit if neither is passed
510 | if passed_dict is None and passed_kwargs is None:
511 | return
512 | # Exit if both are passed
513 | if passed_dict and passed_kwargs:
514 | raise ValueError(
515 | f"Cannot pass values as dict & as parameters: {passed_dict} and {passed_kwargs}"
516 | )
517 | # Raise if incorrect type
518 | if not isinstance(passed_dict or passed_kwargs, dict):
519 | raise ValueError(
520 | "Invalid argument. Constraints must be passed as kwargs or as a dictionary."
521 | )
522 |
523 |
524 | """
525 | Convert network output to human readable output
526 | """
527 |
528 |
529 | def _make_output_dict(data: Dict[str, Any], flatten: Optional[bool]) -> Dict[str, Any]:
530 | """
531 | Unflattens & builds the output dictionary
532 | """
533 | if flatten:
534 | # Get key value pairs directly without building structure
535 | values = lookup_value_list(data.values())
536 | for key, value in zip(data.keys(), values):
537 | data[key] = orjson.loads(value)
538 | # Flatten node values that themselves are dicts
539 | return _flatten(data)
540 |
541 | # Original unflattening logic from here:
542 | # https://stackoverflow.com/questions/6037503/python-unflatten-dict
543 | result_dict: Dict[str, Any] = dict()
544 | for key, value in zip(data.keys(), lookup_value_list(data.values())):
545 | parts = key.split(".")
546 | d = result_dict
547 | for part in parts[:-1]:
548 | if part not in d:
549 | d[part] = dict()
550 | d = d[part]
551 | d[parts[-1]] = orjson.loads(value)
552 |
553 | return result_dict
554 |
555 |
556 | # Only expose `query` publicly
557 | __all__ = ('query',)
558 |
--------------------------------------------------------------------------------
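An illustrative sketch of the public query helper defined above (node names taken from tests/user_inputs.py; assumes the model is downloaded):

    import fpgen

    # Possible values for a key nested within a node
    widths = fpgen.query('screen.width')

    # Querying a node merges the possibilities of everything under it
    screen = fpgen.query('screen', flatten=True, sort=True)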
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["poetry-core>=1.0.0"]
3 | build-backend = "poetry.core.masonry.api"
4 |
5 | [tool.poetry]
6 | name = "fpgen"
7 | version = "1.3.0"
8 | description = "A fast & comprehensive browser fingerprint generator"
9 | authors = ["daijro "]
10 | license = "Apache-2.0"
11 | readme = "README.md"
12 | repository = "https://github.com/scrapfly/fingerprint-generator"
13 | keywords = [
14 | "client",
15 | "headers",
16 | "fingerprint",
17 | "generator",
18 | "browser",
19 | "http",
20 | "scraping",
21 | "requests",
22 | "playwright",
23 | ]
24 | classifiers = [
25 | "Topic :: Internet :: WWW/HTTP",
26 | "Topic :: Internet :: WWW/HTTP :: Browsers",
27 | "Topic :: Software Development :: Libraries :: Python Modules",
28 | ]
29 |
30 | [tool.poetry.dependencies]
31 | python = "^3.8"
32 | click = "*"
33 | indexed-zstd = "*"
34 | orjson = "*"
35 | numpy = "*"
36 | zstandard = "*"
37 | httpx = "*"
38 |
39 | [tool.poetry.scripts]
40 | fpgen = "fpgen.__main__:cli"
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # Tests
2 |
3 | Dirty test cases for internal use.
4 |
5 | Confirms that user inputs & conditional probabilities are handled correctly.
--------------------------------------------------------------------------------
/tests/failed_cases.py:
--------------------------------------------------------------------------------
1 | """
2 | Test for exceptions that should be raised.
3 | """
4 |
5 | import sys
6 | from pathlib import Path
7 |
8 | sys.path.append(str(Path(__file__).parent.parent))
9 | from fpgen import Generator, generate, trace
10 | from fpgen.exceptions import InvalidConstraints, InvalidNode, RestrictiveConstraints
11 |
12 | # Generate with invalid constraint values
13 | try:
14 | generate(screen={'width': 1920, 'height': 5000})
15 | except InvalidConstraints as e:
16 | print('[PASSED] VERIFIER:', e)
17 | else:
18 | print('[FAILED]')
19 |
20 | # Incorrect nested node
21 | try:
22 | generate(screen={'width': 1920, 'heighta': 1080})
23 | except InvalidConstraints as e:
24 | print('[PASSED] VERIFIER:', e)
25 | else:
26 | print('[FAILED]')
27 |
28 | try:
29 | generate(screena={'width': 1920, 'height': 1080})
30 | except InvalidNode as e:
31 | print('[PASSED] VERIFIER:', e)
32 | else:
33 | print('[FAILED]')
34 |
35 |
36 | # Test generator constructor
37 | try:
38 | Generator(screen={'width': 1920, 'height': lambda x: x > 5000})
39 | except InvalidConstraints as e:
40 | print('[PASSED] VERIFIER:', e)
41 | else:
42 | print('[FAILED]')
43 |
44 | # Test that Chrome is 100% probability on ChromeOS
45 | result = trace(target='browser', os='ChromeOS')
46 | try:
47 | assert len(result) == 1 and result[0].value == 'Chrome' and result[0].probability == 1.0
48 | except AssertionError:
49 | print('[FAILED] TRACE: Expected Chrome 100% probability on ChromeOS, got:', result)
50 | else:
51 | print('[PASSED] TRACE: Chrome is 100% probability on ChromeOS')
52 |
53 | # Test that Firefox/Safari are impossible on ChromeOS
54 | try:
55 | result = trace(target='browser', os='ChromeOS', browser=('Firefox', 'Safari'))
56 | except RestrictiveConstraints as e:
57 | print('[PASSED] TRACE: Firefox/Safari correctly impossible on ChromeOS')
58 | else:
59 | print('[FAILED] TRACE: Expected exception for impossible Firefox/Safari on ChromeOS')
60 |
61 | # Test Firefox/Safari probabilities without OS constraint
62 | result = trace(target='browser', browser=('Firefox', 'Safari'))
63 | try:
64 | assert len(result) == 2
65 | assert all(r.value in ('Firefox', 'Safari') for r in result)
66 | assert abs(sum(r.probability for r in result) - 1.0) < 0.0001
67 | except AssertionError:
68 | print('[FAILED] TRACE: Expected valid Firefox/Safari probabilities, got:', result)
69 | else:
70 | print('[PASSED] TRACE: Valid Firefox/Safari probabilities')
71 |
72 | # Test Chrome is 100% on ChromeOS even with Firefox/Safari allowed
73 | result = trace(target='browser', os='ChromeOS', browser=('Firefox', 'Safari', 'Chrome'))
74 | try:
75 | assert len(result) == 1 and result[0].value == 'Chrome' and result[0].probability == 1.0
76 | except AssertionError:
77 | print(
78 | '[FAILED] TRACE: Expected Chrome 100% on ChromeOS with Firefox/Safari allowed, got:', result
79 | )
80 | else:
81 | print('[PASSED] TRACE: Chrome is 100% on ChromeOS with Firefox/Safari allowed')
82 |
83 | try:
84 | trace(target='browser', os='ChromeOS', browser='Firefox')
85 | except RestrictiveConstraints as e:
86 | print('[PASSED] TRACE: Firefox cannot exist on ChromeOS')
87 | else:
88 | print('[FAILED] TRACE: Should have raised an exception.')
89 |
90 |
91 | # Basic passing case
92 | try:
93 | data = generate(os='ChromeOS')
94 | except Exception as e:
95 | print('[FAILED] GENERATE: Basic target case failed:', e)
96 | else:
97 | print('[PASSED] GENERATE: Passed basic case (control)')
98 |
99 | try:
100 | data = generate(os='ChromeOS', target='browser')
101 | except Exception as e:
102 | print('[FAILED] GENERATE: Basic target case failed:', e)
103 | else:
104 |     print('[PASSED] GENERATE: Passed basic case (target)')
105 |
106 | # Test impossible constraint handling
107 | try:
108 | data = generate(browser='firefox', os='ChromeOS')
109 | except RestrictiveConstraints as e:
110 | print('[PASSED] GENERATE: Throws on impossible constraint', e)
111 | else:
112 | print('[FAILED] GENERATE: Firefox should not exist on ChromeOS')
113 |
114 | try:
115 | data = generate(browser='firefox', os='ChromeOS', target='browser')
116 | except RestrictiveConstraints as e:
117 | print('[PASSED] GENERATE: Throws on impossible constraint', e)
118 | else:
119 | print('[FAILED] GENERATE: Firefox should not exist on ChromeOS (target)')
120 |
121 | try:
122 | data = generate(browser=('firefox', 'safari', 'chrome'), os='ChromeOS', target='browser')
123 | assert data == 'Chrome'
124 | except AssertionError:
125 | print('[FAILED] GENERATE: Doesn\'t pick the correct constraint')
126 | else:
127 | print('[PASSED] GENERATE: Picks the correct constraint')
128 |
--------------------------------------------------------------------------------
/tests/generator_matches_trace.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pathlib import Path
3 |
4 | sys.path.append(str(Path(__file__).parent.parent))
5 | import fpgen
6 |
7 | # Controls whether to use `target` or not (uses a different algorithm)
8 | USE_TARGET = False
9 | # Number of trials to run
10 | TRIALS = 500
11 |
12 | TESTS = [
13 | ('browser', {'os': ('Linux', 'MacOS'), 'browser': ('Edge', 'Safari')}),
14 | ('browser', {'os': ('Linux')}),
15 | ('browser', {'navigator': {'productsub': '20100101'}}),
16 | ('navigator.language', {'browser': 'firefox'}),
17 | ('os', {'browser': 'Firefox'}),
18 | ]
19 |
20 | LIMIT = 10
21 |
22 | for target, constraints in TESTS:
23 | pretty_constraints = ', '.join(f'{k}={v}' for k, v in constraints.items())
24 | print(f'Expected P({target}|{pretty_constraints}):')
25 | print(fpgen.trace(target=target, **constraints)[:LIMIT])
26 | print(f'Expected P({target}):')
27 | print(fpgen.trace(target=target)[:LIMIT])
28 |
29 | # Collected data
30 | browser_data = {}
31 |
32 | for _ in range(TRIALS):
33 | print(f'Trial {_+1}/{TRIALS}', end='\r')
34 | if USE_TARGET:
35 | a = fpgen.generate(flatten=True, target=target, **constraints)
36 | else:
37 | a = fpgen.generate(flatten=True, **constraints)[target]
38 | browser_data[a] = browser_data.get(a, 0) + 1
39 |
40 | print(f"\nGenerator test using P({target}|{pretty_constraints}):")
41 | for browser, count in sorted(browser_data.items(), key=lambda x: x[1], reverse=True)[:LIMIT]:
42 | print(f"{browser}: {count/TRIALS*100:.2f}%")
43 | print('\n---------\n')
44 |
--------------------------------------------------------------------------------
/tests/profile.py:
--------------------------------------------------------------------------------
1 | """
2 | Track where the generator hangs.
3 | """
4 |
5 | import sys
6 | from pathlib import Path
7 |
8 | sys.path.append(str(Path(__file__).parent.parent))
9 | from pyinstrument import Profiler
10 |
11 | from fpgen import generate
12 |
13 | profiler = Profiler(interval=0.001)
14 | profiler.start()
15 |
16 | # Intensive constraint
17 | generate(
18 | browser=('Firefox', 'Chrome'),
19 | client={'browser': {'major': ('134', '133')}},
20 | )
21 | profiler.stop()
22 |
23 | print(profiler.output_text(show_all=True))
24 |
--------------------------------------------------------------------------------
/tests/speed_test.py:
--------------------------------------------------------------------------------
1 | """
2 | Test the speed of the generate, trace, and query functions.
3 | """
4 |
5 | import sys
6 | from pathlib import Path
7 | from timeit import timeit
8 |
9 | sys.path.append(str(Path(__file__).parent.parent))
10 |
11 | from contextlib import contextmanager
12 | from time import time
13 |
14 | from fpgen import Generator, generate, query, trace
15 |
16 |
17 | @contextmanager
18 | def timer(description):
19 | print(description)
20 | start = time()
21 | yield
22 | print('Time to run', time() - start)
23 |
24 |
25 | # Trace operations
26 | with timer('trace (target=navigator.language)'):
27 | trace(target='navigator.language')
28 |
29 | with timer("trace (target=browser, os=('Linux', 'MacOS'), browser=('Edge', 'Safari'))"):
30 | trace('browser', os=('Linux', 'MacOS'), browser=('Edge', 'Safari'))
31 |
32 | # Generate operations
33 | with timer('generating (full fingerprint)'):
34 | generate()
35 |
36 | # with timer('generating (navigator.language=en-US)'):
37 | # generate({'navigator.language': 'en-US'})
38 |
39 | with timer('generating (navigator.language=en-US, target=browser)'):
40 | generate({'navigator.language': ('en-US', 'en-GB', 'fr', 'de-DE')}, target='browser')
41 |
42 | with timer('generating (browser=firefox, target=browser)'):
43 | generate(browser=('firefox'), target='browser')
44 |
45 | with timer('generating (browser=firefox, target=navigator.language)'):
46 | generate(browser=('firefox'), target='navigator.language')
47 |
48 | with timer('generating with a function constraint'):
49 | generate({'window': {'innerWidth': lambda x: x > 1000}}, target='navigator.language')
50 |
51 | # Timeit tests
52 |
53 | print('\n========== TIMEIT TESTS ==========\n')
54 |
55 | print('Generator test')
56 | print(timeit(lambda: generate(), number=100), '/ 100')
57 |
58 | print('Generator test (with nested constraints)')
59 | print(timeit(lambda: generate(screen={'width': 1920, 'height': 1080}), number=10), '/ 10')
60 |
61 | gen = Generator(screen={'width': 1920, 'height': 1080})
62 |
63 | print('Generator test with nested constraints (pre-filtered)')
64 | print(timeit(lambda: gen.generate(), number=10), '/ 10')
65 |
66 | print('Query test (large value set)')
67 | print(timeit(lambda: query('allFonts'), number=10), '/ 10')
68 |
69 | print('Trace test')
70 | print(timeit(lambda: trace('browser'), number=100), '/ 100')
71 |
72 | print('Trace test (large value set)')
73 | print(timeit(lambda: trace('allFonts'), number=10), '/ 10')
74 |
--------------------------------------------------------------------------------
/tests/user_inputs.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests various user inputs to confirm that they are handled correctly.
3 | """
4 |
5 | import sys
6 | from pathlib import Path
7 |
8 | sys.path.append(str(Path(__file__).parent.parent))
9 | import re
10 |
11 | from fpgen import Generator, __network__, query
12 |
13 |
14 | def test(name, out, x=None):
15 | if x and not re.search(x, str(out)):
16 | print(
17 | ('> FAILED: ' + name).ljust(60, ' '),
18 | out,
19 | )
20 | return
21 | print(('PASSED! ' + name).ljust(60, ' '), str(out)[:100])
22 |
23 |
24 | # Test options
25 | FLATTEN_OPT = False
26 | SORT_OPT = False
27 |
28 | print(' ==== GENERATOR TESTS ==== ')
29 |
30 | gen = Generator(flatten=FLATTEN_OPT)
31 |
32 | test(
33 | 'Generate full fp',
34 | gen.generate(),
35 | '^{.*',
36 | )
37 | test(
38 | 'Generate full fp (non strict, fallback os)',
39 | gen.generate(
40 | os='ChromeOS',
41 | browser='Firefox',
42 | strict=False,
43 | target=('os', 'browser'),
44 | ),
45 | ".*'browser': 'Firefox'.*",
46 | )
47 | test(
48 | 'Target (at node) given constraint',
49 | gen.generate(
50 | {'navigator': {'productsub': '20100101'}},
51 | target=('headers.user-agent'),
52 | ),
53 | 'Firefox',
54 | )
55 | test(
56 | 'Target (within node) given constraint',
57 | gen.generate(
58 | {'navigator': {'productsub': '20100101'}},
59 | target=['screen.width', 'screen.height'],
60 | ),
61 | r'\d+',
62 | )
63 | test(
64 | 'Target (above node) given constraint',
65 | gen.generate(
66 | {'navigator': {'productsub': '20100101'}},
67 | target='navigator',
68 | ),
69 | '^{.*$',
70 | )
71 | test(
72 | 'Passing multi constraints (no target)',
73 | gen.generate(
74 | browser=('Firefox', 'Chrome'),
75 | client={'browser': {'major': ('134', '133')}},
76 | ),
77 | r'\b13[34]\b',
78 | )
79 | test(
80 | 'Passing multi constraints (target)',
81 | gen.generate(
82 | browser=('Firefox', 'Chrome'),
83 | client={'browser': {'major': ('134', '133')}},
84 | target='client',
85 | ),
86 | r'\b13[34]\b',
87 | )
88 | gpu = {
89 | 'vendor': 'Google Inc. (Apple)',
90 | 'renderer': 'ANGLE (Apple, ANGLE Metal Renderer: Apple M2, Unspecified Version)',
91 | }
92 | test(
93 | 'Constraint tgt (at node, `window`)',
94 | gen.generate(gpu=gpu, target='window'),
95 | )
96 | test(
97 | 'Constraint tgt (above nodes, `navigator`)',
98 | gen.generate(gpu=gpu, target='navigator'),
99 | )
100 | test(
101 | 'Constraint tgt (within node, `screen.width`)',
102 | gen.generate(gpu=gpu, target='screen.width'),
103 | )
104 |
105 | print('\n ==== QUERY TESTS ==== ')
106 |
107 | test(
108 | 'Possibilities (at node 1, `navigator.productsub`)',
109 | query('navigator.productsub', flatten=FLATTEN_OPT, sort=SORT_OPT),
110 | )
111 | test(
112 | 'Possibilities (at node 2, `screen`)',
113 | query('screen', flatten=FLATTEN_OPT, sort=SORT_OPT),
114 | )
115 | test(
116 | 'Possibilities (above nodes, `navigator`)',
117 | query('navigator', flatten=FLATTEN_OPT, sort=SORT_OPT),
118 | )
119 | test(
120 | 'Possibilities (within node, `screen.width`)',
121 | query('screen.width', flatten=FLATTEN_OPT, sort=SORT_OPT),
122 | )
123 |
124 |
125 | print(' ==== QUERY ALL NODES ==== ')
126 |
127 | for node in __network__.nodes_by_name:
128 | # Get the possibilities
129 | print(f'Listing possibilities for {node}')
130 | a = query(node, flatten=FLATTEN_OPT, sort=SORT_OPT)
131 | print(str(a)[:100])
132 |
--------------------------------------------------------------------------------