├── .editorconfig
├── .flake8
├── .gitignore
├── .gitmodules
├── .python-version
├── .vscode
│   └── settings.json
├── CITATION
├── LICENSE
├── README.md
├── docs
│   ├── index.html
│   ├── index.js
│   └── llama-tokenizer.js
├── llama2d.png
├── pyproject.toml
├── requirements.in
├── requirements.txt
├── screenshot.png
├── script.sh
├── src
│   ├── __init__.py
│   ├── data
│   │   ├── .gitignore
│   │   ├── mind2web_example.json
│   │   ├── mind2web_example.mhtml
│   │   ├── pretraining-cache
│   │   │   └── .gitignore
│   │   ├── pretraining_urls.py
│   │   └── urls.txt
│   ├── llama2d
│   │   ├── __init__.py
│   │   ├── constants.py
│   │   ├── datasets
│   │   │   ├── __init__.py
│   │   │   ├── cached.py
│   │   │   ├── huggingface.py
│   │   │   ├── mhtml_to_hhtml.py
│   │   │   ├── mind2web.py
│   │   │   ├── mind2web_convert.py
│   │   │   ├── pretraining.py
│   │   │   └── synthetic
│   │   │       ├── top_or_bottom.py
│   │   │       ├── unscramble_words.py
│   │   │       └── zoo_compass.py
│   │   ├── find_pos_given_attr
│   │   │   ├── download_mind2web.py
│   │   │   └── find_pos_given_attr.py
│   │   ├── modal
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   ├── datasets
│   │   │   │   ├── cached_dataset.py
│   │   │   │   ├── hf_dataset.py
│   │   │   │   ├── modal_docs.jsonl
│   │   │   │   ├── new_dataset.py
│   │   │   │   ├── sql_dataset.py
│   │   │   │   └── zoo_dataset.py
│   │   │   ├── finetuning.py
│   │   │   ├── flat_param.py
│   │   │   ├── inference.py
│   │   │   ├── repro.py
│   │   │   ├── requirements.txt
│   │   │   ├── train.py
│   │   │   ├── urls.txt
│   │   │   └── validate_dataset.py
│   │   ├── tagging
│   │   │   ├── add_tags_to_page.py
│   │   │   └── tagUtils.js
│   │   └── vision
│   │       ├── __init__.py
│   │       ├── learn_mlp_on_embeds.py
│   │       ├── ocr.py
│   │       ├── render_dataset.py
│   │       ├── take_screenshot.py
│   │       ├── url_to_llama_input.py
│   │       ├── viz_pt_input.py
│   │       └── webutils
│   │           ├── LICENSE.chromedriver
│   │           ├── chromedriver
│   │           ├── playwright_browser.py
│   │           ├── selenium_action_chain.py
│   │           ├── stacked_image.png
│   │           ├── stitch_webpage.py
│   │           └── web_to_action.py
│   ├── mhtml
│   │   ├── demos
│   │   │   ├── finance.mhtml
│   │   │   ├── local.mhtml
│   │   │   ├── megabus.mhtml
│   │   │   ├── megabus2.mhtml
│   │   │   └── megabus3.mhtml
│   │   ├── download.js
│   │   ├── finance.json
│   │   ├── index.js
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── serve.js
│   │   └── serve_local_data.js
│   ├── models
│   │   └── .gitignore
│   └── secrets
│       └── .gitignore
└── tests
    └── testing.py
/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | tab_width = 4 5 | end_of_line = lf 6 | max_line_length = 88 7 | ij_visual_guides = 88 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.{js,py,html}] 12 | charset = utf-8 13 | 14 | [*.md] 15 | trim_trailing_whitespace = false 16 | 17 | [*.{yml,yaml}] 18 | indent_style = space 19 | indent_size = 2 20 | 21 | [Makefile] 22 | indent_style = tab 23 | 24 | [.flake8] 25 | indent_style = space 26 | indent_size = 2 27 | 28 | [*.py] 29 | indent_style = space 30 | indent_size = 4 31 | ij_python_from_import_parentheses_force_if_multiline = true -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-complexity = 6 3 | inline-quotes = double 4 | max-line-length = 88 5 | extend-ignore = E203 6 | docstring_style=sphinx 7 | 8 | ignore = 9 | ; too complex 10 | C901, 11 | ; redefinition of used 12 | F811, 13 | ; Found `f` string 14 | WPS305, 15 | ; Missing docstring in public module 16 | D100, 17 | ; Missing docstring in magic method 18 | D105, 19 | ; Missing docstring in __init__ 20 | D107, 21 | ; Found `__init__.py` module with logic 22 | WPS412, 23 | ; Found class without a base class 24 | WPS306, 25 | ; Missing docstring in public nested
class 26 | D106, 27 | ; First line should be in imperative mood 28 | D401, 29 | ; Found wrong variable name 30 | WPS110, 31 | ; Found implicit string concatenation 32 | WPS326, 33 | ; Found string constant over-use 34 | WPS226, 35 | ; Found upper-case constant in a class 36 | WPS115, 37 | ; Found using `@staticmethod` 38 | WPS602, 39 | ; Found method without arguments 40 | WPS605, 41 | ; Found overused expression 42 | WPS204, 43 | ; Found too many module members 44 | WPS202, 45 | ; Found too high module cognitive complexity 46 | WPS232, 47 | ; line break before binary operator 48 | W503, 49 | ; Found module with too many imports 50 | WPS201, 51 | ; Inline strong start-string without end-string. 52 | RST210, 53 | ; Found nested class 54 | WPS431, 55 | ; Found wrong module name 56 | WPS100, 57 | ; Found too many methods 58 | WPS214, 59 | ; Found too long ``try`` body 60 | WPS229, 61 | ; Found unpythonic getter or setter 62 | WPS615, 63 | ; Found a line that starts with a dot 64 | WPS348, 65 | ; Found complex default value (for dependency injection) 66 | WPS404, 67 | ; Do not perform function calls in argument defaults (for dependency injection) 68 | B008, 69 | ; Model should define verbose_name in its Meta inner class 70 | DJ10, 71 | ; Model should define verbose_name_plural in its Meta inner class 72 | DJ11, 73 | ; Found mutable module constant. 74 | WPS407, 75 | ; Found too many empty lines in `def` 76 | WPS473, 77 | ; too many no-cover comments. 78 | WPS403, 79 | 80 | per-file-ignores = 81 | ; all tests 82 | test_*.py,tests.py,tests_*.py,*/tests/*,conftest.py: 83 | ; Use of assert detected 84 | S101, 85 | ; Found outer scope names shadowing 86 | WPS442, 87 | ; Found too many local variables 88 | WPS210, 89 | ; Found magic number 90 | WPS432, 91 | ; Missing parameter(s) in Docstring 92 | DAR101, 93 | ; Found too many arguments 94 | WPS211, 95 | 96 | exclude = 97 | ./.cache, 98 | ./.git, 99 | ./.idea, 100 | ./.mypy_cache, 101 | ./.pytest_cache, 102 | ./.venv, 103 | ./venv, 104 | ./env, 105 | ./cached_venv, 106 | ./docs, 107 | ./deploy, 108 | ./var, 109 | ./.vscode, 110 | app/tests/*, 111 | *migrations*, 112 | alembic/*, 113 | transformers/*, 114 | tests/*, 115 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | task.json 2 | data/*.zip 3 | data/* 4 | node_modules 5 | secrets/* 6 | 7 | *.png 8 | 9 | # Created by https://www.toptal.com/developers/gitignore/api/python,venv,virtualenv,dotenv,visualstudiocode,pycharm+all,windows,macos 10 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,venv,virtualenv,dotenv,visualstudiocode,pycharm+all,windows,macos 11 | 12 | # ignore session_name.session 13 | *.session 14 | *.session-journal 15 | 16 | .fleet 17 | 18 | ### dotenv ### 19 | .env 20 | 21 | ### macOS ### 22 | # General 23 | .DS_Store 24 | .AppleDouble 25 | .LSOverride 26 | 27 | # Icon must end with two \r 28 | Icon 29 | 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | .com.apple.timemachine.donotpresent 42 | 43 | # Directories potentially created on remote AFP share 44 | .AppleDB 45 | .AppleDesktop 46 | Network Trash Folder 47 | Temporary Items 48 | .apdisk 49 | 50 | ### macOS Patch ### 51 | # iCloud generated files 52 | *.icloud 53 | 54 | ### PyCharm+all ### 55 | # Covers JetBrains
IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 56 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 57 | 58 | # User-specific stuff 59 | .idea/**/workspace.xml 60 | .idea/**/tasks.xml 61 | .idea/**/usage.statistics.xml 62 | .idea/**/dictionaries 63 | .idea/**/shelf 64 | 65 | # AWS User-specific 66 | .idea/**/aws.xml 67 | 68 | # Generated files 69 | .idea/**/contentModel.xml 70 | 71 | # Sensitive or high-churn files 72 | .idea/**/dataSources/ 73 | .idea/**/dataSources.ids 74 | .idea/**/dataSources.local.xml 75 | .idea/**/sqlDataSources.xml 76 | .idea/**/dynamic.xml 77 | .idea/**/uiDesigner.xml 78 | .idea/**/dbnavigator.xml 79 | 80 | # Gradle 81 | .idea/**/gradle.xml 82 | .idea/**/libraries 83 | 84 | # Gradle and Maven with auto-import 85 | # When using Gradle or Maven with auto-import, you should exclude module files, 86 | # since they will be recreated, and may cause churn. Uncomment if using 87 | # auto-import. 88 | # .idea/artifacts 89 | # .idea/compiler.xml 90 | # .idea/jarRepositories.xml 91 | # .idea/modules.xml 92 | # .idea/*.iml 93 | # .idea/modules 94 | # *.iml 95 | # *.ipr 96 | 97 | # CMake 98 | cmake-build-*/ 99 | 100 | # Mongo Explorer plugin 101 | .idea/**/mongoSettings.xml 102 | 103 | # File-based project format 104 | *.iws 105 | 106 | # IntelliJ 107 | out/ 108 | 109 | # mpeltonen/sbt-idea plugin 110 | .idea_modules/ 111 | 112 | # JIRA plugin 113 | atlassian-ide-plugin.xml 114 | 115 | # Cursive Clojure plugin 116 | .idea/replstate.xml 117 | 118 | # SonarLint plugin 119 | .idea/sonarlint/ 120 | 121 | # Crashlytics plugin (for Android Studio and IntelliJ) 122 | com_crashlytics_export_strings.xml 123 | crashlytics.properties 124 | crashlytics-build.properties 125 | fabric.properties 126 | 127 | # Editor-based Rest Client 128 | .idea/httpRequests 129 | 130 | # Android studio 3.1+ serialized cache file 131 | .idea/caches/build_file_checksums.ser 132 | 133 | ### PyCharm+all Patch ### 134 | # Ignore everything but code style settings and run configurations 135 | # that are supposed to be shared within teams. 136 | 137 | .idea/* 138 | 139 | !.idea/codeStyles 140 | !.idea/runConfigurations 141 | 142 | ### Python ### 143 | # Byte-compiled / optimized / DLL files 144 | __pycache__/ 145 | *.py[cod] 146 | *$py.class 147 | 148 | # C extensions 149 | *.so 150 | 151 | # Distribution / packaging 152 | .Python 153 | build/ 154 | develop-eggs/ 155 | dist/ 156 | downloads/ 157 | eggs/ 158 | .eggs/ 159 | lib/ 160 | lib64/ 161 | parts/ 162 | sdist/ 163 | var/ 164 | wheels/ 165 | share/python-wheels/ 166 | *.egg-info/ 167 | .installed.cfg 168 | *.egg 169 | MANIFEST 170 | 171 | # PyInstaller 172 | # Usually these files are written by a python script from a template 173 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
174 | *.manifest 175 | *.spec 176 | 177 | # Installer logs 178 | pip-log.txt 179 | pip-delete-this-directory.txt 180 | 181 | # Unit test / coverage reports 182 | htmlcov/ 183 | .tox/ 184 | .nox/ 185 | .coverage 186 | .coverage.* 187 | .cache 188 | nosetests.xml 189 | coverage.xml 190 | *.cover 191 | *.py,cover 192 | .hypothesis/ 193 | .pytest_cache/ 194 | cover/ 195 | 196 | # Translations 197 | *.mo 198 | *.pot 199 | 200 | # Django stuff: 201 | *.log 202 | local_settings.py 203 | db.sqlite3 204 | db.sqlite3-journal 205 | 206 | # Flask stuff: 207 | instance/ 208 | .webassets-cache 209 | 210 | # Scrapy stuff: 211 | .scrapy 212 | 213 | # Sphinx documentation 214 | docs/_build/ 215 | 216 | # PyBuilder 217 | .pybuilder/ 218 | target/ 219 | 220 | # Jupyter Notebook 221 | .ipynb_checkpoints 222 | 223 | # IPython 224 | profile_default/ 225 | ipython_config.py 226 | 227 | # pyenv 228 | # For a library or package, you might want to ignore these files since the code is 229 | # intended to run in multiple environments; otherwise, check them in: 230 | # .python-version 231 | 232 | # pipenv 233 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 234 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 235 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 236 | # install all needed dependencies. 237 | #Pipfile.lock 238 | 239 | # poetry 240 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 241 | # This is especially recommended for binary packages to ensure reproducibility, and is more 242 | # commonly ignored for libraries. 243 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 244 | #poetry.lock 245 | 246 | # pdm 247 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 248 | #pdm.lock 249 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 250 | # in version control. 251 | # https://pdm.fming.dev/#use-with-ide 252 | .pdm.toml 253 | 254 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 255 | __pypackages__/ 256 | 257 | # Celery stuff 258 | celerybeat-schedule 259 | celerybeat.pid 260 | 261 | # SageMath parsed files 262 | *.sage.py 263 | 264 | # Environments 265 | .venv 266 | env/ 267 | venv/ 268 | ENV/ 269 | env.bak/ 270 | venv.bak/ 271 | 272 | # Spyder project settings 273 | .spyderproject 274 | .spyproject 275 | 276 | # Rope project settings 277 | .ropeproject 278 | 279 | # mkdocs documentation 280 | /site 281 | 282 | # mypy 283 | .mypy_cache/ 284 | .dmypy.json 285 | dmypy.json 286 | 287 | # Pyre type checker 288 | .pyre/ 289 | 290 | # pytype static type analyzer 291 | .pytype/ 292 | 293 | # Cython debug symbols 294 | cython_debug/ 295 | 296 | # PyCharm 297 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 298 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 299 | # and can be added to the global gitignore or merged into this file. For a more nuclear 300 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
301 | #.idea/ 302 | 303 | ### Python Patch ### 304 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 305 | poetry.toml 306 | 307 | # ruff 308 | .ruff_cache/ 309 | 310 | # LSP config files 311 | pyrightconfig.json 312 | 313 | ### venv ### 314 | # Virtualenv 315 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 316 | [Bb]in 317 | [Ii]nclude 318 | [Ll]ib 319 | [Ll]ib64 320 | [Ll]ocal 321 | [Ss]cripts 322 | pyvenv.cfg 323 | pip-selfcheck.json 324 | 325 | ### VirtualEnv ### 326 | # Virtualenv 327 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 328 | 329 | ### VisualStudioCode ### 330 | .vscode/* 331 | !.vscode/settings.json 332 | !.vscode/tasks.json 333 | !.vscode/launch.json 334 | !.vscode/extensions.json 335 | !.vscode/*.code-snippets 336 | 337 | # Local History for Visual Studio Code 338 | .history/ 339 | 340 | # Built Visual Studio Code Extensions 341 | *.vsix 342 | 343 | ### VisualStudioCode Patch ### 344 | # Ignore all local history of files 345 | .history 346 | .ionide 347 | 348 | ### Windows ### 349 | # Windows thumbnail cache files 350 | Thumbs.db 351 | Thumbs.db:encryptable 352 | ehthumbs.db 353 | ehthumbs_vista.db 354 | 355 | # Dump file 356 | *.stackdump 357 | 358 | # Folder config file 359 | [Dd]esktop.ini 360 | 361 | # Recycle Bin used on file shares 362 | $RECYCLE.BIN/ 363 | 364 | # Windows Installer files 365 | *.cab 366 | *.msi 367 | *.msix 368 | *.msm 369 | *.msp 370 | 371 | # Windows shortcuts 372 | *.lnk 373 | 374 | 375 | # End of https://www.toptal.com/developers/gitignore/api/python,venv,virtualenv,dotenv,visualstudiocode,pycharm+all,windows,macos -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "transformers"] 2 | path = transformers 3 | url = https://github.com/llama2d/transformers.git 4 | [submodule "llama-recipes"] 5 | path = llama-recipes 6 | url = https://github.com/facebookresearch/llama-recipes.git 7 | [submodule "llama-finetuning"] 8 | path = llama-finetuning 9 | url = https://github.com/Llama2D/llama-finetuning.git 10 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.4 -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.linting.flake8Enabled": false, 3 | "python.linting.enabled": false, 4 | "python.linting.pylintEnabled": true 5 | } 6 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Liu" 5 | given-names: "Houjun" 6 | - family-names: "Healey" 7 | given-names: "Andrew" 8 | - family-names: "Stelmach" 9 | given-names: "Andrew" 10 | - family-names: "Settles" 11 | given-names: "Christopher" 12 | - family-names: "Tangirala" 13 | given-names: "Sarma" 14 | - family-names: "Pandey" 15 | given-names: "Rohan" 16 | title: "Llama2D: Two Dimensional Positional Embeddings for Webpage Structural Understanding" 17 | date-released: 2024-01-30 18 | license: "GPL-3.0" 19 | url: "https://github.com/llama2d/llama2d" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so.
This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 
174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 
234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 
296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 
414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. 
The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. 
You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 
583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail.
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 
![2D Llama line-art](llama2d.png)
2 | 
3 | 
4 | ### 2D Positional Embeddings for Webpage Structural Understanding 🦙👀
5 | 
6 | 
7 | 🐦 Twitter
8 | 
9 | 
10 | 11 | # llama2d 12 | How can we get LLM-based agents to understand the *visual structure* of a webpage? We fine-tune Llama on OCR'd screenshots of webpages, but with 2D positional embeddings, enabling it to "see" the structure of a webpage rather than just a sequence of tokens. 13 | 14 | To construct the dataset, we: 15 | - took each MHTML snapshot provided by Mind2Web 16 | - rendered it in Playwright 17 | - tagged interactable elements 18 | - ran OCR to get the (x, y) coordinates of words on the page 19 | 20 | We then calculate 2D positional embeddings for each word and fine-tune Llama! 21 | 22 | Note: this repo is still a bit disorganized and a work in progress, but we encourage community contributions & forks to explore this direction in LLM web interaction. 23 | 24 | ## Setup 25 | 26 | ```bash 27 | git clone https://github.com/llama2d/llama2d.git --recursive 28 | cd transformers && pip install -e . && cd .. 29 | pip install -r requirements.txt 30 | playwright install 31 | pre-commit install 32 | ``` 33 | 34 | ## Secrets 35 | 36 | 1. Create a Google Cloud Vision credential file and put it at `secrets/gcp-vision.json`. 37 | 38 | 2. Run the Modal login command posted in the Slack channel. It looks like this: `modal token set --token-id <token-id> --token-secret <token-secret>` 39 | 40 | ## Datasets 41 | 42 | Datasets are defined in the `src/llama2d/datasets/` directory. 43 | 44 | Every row of a dataset is defined by a prompt, a 2D "screen", and an output. 45 | 46 | However, a row is converted into pure tokens before being fed into Llama; see [this dataset]() for an example. 47 | 48 | You can visualize a dataset on Huggingface by copying all the numbers in a row and pasting them into [this webpage](). 49 | 50 | ### Synthetic datasets 51 | 52 | We will have many synthetic datasets, e.g. the Zoo Compass dataset defined in `src/llama2d/datasets/synthetic/zoo_compass.py`. 53 | 54 | These datasets are simple: each one emits rows with `prompt: str`, `screen: Llama2dScreen`, and `output: str`. 55 | 56 | It is easy to create a `Llama2dScreen`: 57 | 58 | ```py 59 | from llama2d.vision import Llama2dScreen 60 | 61 | screen = Llama2dScreen() 62 | 63 | screen.push_word(word="north", xy=(0.5, 0)) 64 | screen.push_word(word="south", xy=(0.5, 1)) 65 | screen.push_word(word="east", xy=(1, 0.5)) 66 | screen.push_word(word="west", xy=(0, 0.5)) 67 | ``` 68 | 69 | To create this dataset, look at it in your console, and publish it to Huggingface, run the following: 70 | 71 | ```bash 72 | python -m llama2d.datasets.synthetic.zoo_compass 73 | ``` 74 | 75 | We recommend reading the Zoo Compass dataset code for reference. 76 | 77 | ### Pretraining dataset 78 | 79 | This dataset contains over 600 retail websites. The task is next-token prediction. 80 | 81 | Here, the prompt and output are empty; all of the website text is in the screen. 82 | 83 | The model is trained to predict the next token of the website text. It is NOT trained to predict the position of the next token. 84 | 85 | This dataset is implemented in [`src/llama2d/datasets/pretraining.py`](https://github.com/Llama2D/llama2d/blob/main/src/llama2d/datasets/pretraining.py). 86 | 87 | To collect this dataset and upload it to Huggingface, run the file: 88 | 89 | ```bash 90 | python -m src.llama2d.datasets.pretraining 91 | ``` 92 | 93 | ### Mind2Web dataset 94 | 95 | This dataset contains ~1000 tasks from Mind2Web. 96 | 97 | The task: given an intention and a screenshot of a webpage, choose the correct action to take.
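For illustration, here is a minimal sketch of what one such row could look like, reusing the `prompt`/`screen`/`output` schema and the `Llama2dScreen` API from the synthetic-datasets section above. The prompt wording, the coordinates, and the `[12] click` action string are illustrative assumptions, not the canonical encoding used by the dataset code:

```py
# A hedged sketch of one Mind2Web-style row, assuming the prompt/screen/output
# schema described above. The prompt text and the "[12] click" action string
# are illustrative guesses, not the canonical encoding.
from llama2d.vision import Llama2dScreen

screen = Llama2dScreen()

# OCR'd words with normalized (x, y) page coordinates; interactable elements
# carry [N]-style tags (the docs/ visualizer renders such tags in red).
screen.push_word(word="[12]", xy=(0.10, 0.05))
screen.push_word(word="Sign", xy=(0.14, 0.05))
screen.push_word(word="in", xy=(0.18, 0.05))
screen.push_word(word="[13]", xy=(0.10, 0.12))
screen.push_word(word="Search", xy=(0.14, 0.12))

row = {
    "prompt": "Your task: sign in to your account. Choose the next action.",
    "screen": screen,
    "output": "[12] click",
}
```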
98 | 99 | To download this dataset, first download the Mind2Web `mhtml` files generated by Andrew Stelmach. 100 | 101 | The zip with the files is [here](https://drive.google.com/file/d/1RGNcNTlQrZhF1KuGBcGenkON1u74_IYx/view). Download it and unzip it into `src/data/mind2web-mhtml`. Your `src/data/mind2web-mhtml` directory should look like this: 102 | 103 | ``` 104 | src/data/mind2web-mhtml 105 | ├── 0004f2a7-90d6-4f96-902a-b1d25d39a93d_before.mhtml 106 | ├── 00068a1e-b6a3-4c53-a60c-3ed777d4b05d_before.mhtml 107 | ├── 00146964-4b74-4e28-8292-5810a604639a_before.mhtml 108 | ├── 0018120a-8da1-4a36-a1c4-b4642c97211b_before.mhtml 109 | ``` 110 | 111 | To process and cache the Mind2Web dataset, run the following: 112 | 113 | ```bash 114 | python -m llama2d.datasets.mind2web 115 | ``` 116 | 117 | ## Modal training 118 | 119 | To train a model with Modal, change your directory to `src/llama2d/modal/` and run, for example: 120 | 121 | ```bash 122 | modal run train.py --dataset hf_dataset.py --repo llama2d/llama2d-mind2web --no-peft --num-epochs 4 123 | ``` 124 | 125 | `peft` here refers to LoRA fine-tuning; pass `--no-peft` to fine-tune the full model instead. `hf_dataset` means we are using a dataset uploaded to Huggingface (thanks Matthew!). [`llama2d/llama2d-mind2web`](https://huggingface.co/datasets/llama2d/llama2d-mind2web/viewer/default/train?row=0) is the Huggingface repo containing the dataset. 126 | 127 | ## In the Repo 128 | 129 | To add a requirement, add it to `requirements.in`, run `pip-compile`, and run `pip-sync`. 130 | 131 | Run `black . --exclude '/transformers/|/venv/'` to format the code. 132 | 133 | Pre-commit hooks are used to maintain code quality. 134 | 135 | ## Citations 136 | 137 | ```bibtex 138 | @misc{llama2d2024, 139 | title = {Llama2D: Two Dimensional Positional Embeddings for Webpage Structural Understanding}, 140 | author = {Houjun Liu and Andrew Healey and Andrew Stelmach and Christopher Settles and Sarma Tangirala and Rohan Pandey}, 141 | year = {2024}, 142 | howpublished = {GitHub}, 143 | url = {https://github.com/llama2d/llama2d} 144 | } 145 | ``` 146 | 147 | -------------------------------------------------------------------------------- /docs/index.js: -------------------------------------------------------------------------------- 1 | document.body.onload = ()=>{ 2 | 3 | console.log("hey") 4 | 5 | // listen for updates to the textarea 6 | // when it updates, extract [input_ids,coords,labels,attention_mask] from the textarea 7 | const textarea = document.querySelector('textarea'); 8 | 9 | textarea.addEventListener('input', function () { 10 | render(); 11 | }); 12 | 13 | const canvas = document.getElementById('rendered-output'); 14 | const ctx = canvas.getContext('2d'); 15 | 16 | window.render = ()=>{ 17 | const text = textarea.value; 18 | // split the text on newlines, parse each line as JSON 19 | const lines = text.split('\n').filter(line=>line.trim().length>0); 20 | const [tokenIds, coords, labels, attentionMask] = lines.map(JSON.parse); 21 | 22 | const lastIdx = tokenIds.findLastIndex(i=>i>0) 23 | const firstIdxLastChunk = tokenIds.slice(0,lastIdx).findLastIndex(i=>i<=0)+1 24 | 25 | // console.log(lastIdx,firstIdxLastChunk) 26 | // console.log(labels.slice(firstIdxLastChunk,lastIdx+1)) 27 | // console.log(llamaTokenizer.decode([0,...tokenIds.slice(0,firstIdxLastChunk)])) 28 | 29 | const prompt = llamaTokenizer.decode([0,...tokenIds.slice(0,firstIdxLastChunk)]) 30 | 31 | const completion = llamaTokenizer.decode([0,...tokenIds.slice(firstIdxLastChunk,lastIdx+1)]) 32 | 33 | const coordTokens = 
coords.map(([x,y],i)=>[x,y,tokenIds[i]]).filter(([x,y,tokenid])=>x>=0); 34 | 35 | /* 36 | python impl: 37 | # graph tokens with coords in a matplotlib figure 38 | # print the tokens without coords 39 | 40 | # every word has a few tokens with the same coord. 41 | # we should generate the word, turn it into a string, then plot it at the coord 42 | 43 | without_coords = [input_ids[i] for i in range(len(input_ids)) if coords[i][0] == -1 and attention_mask[i] == 1] 44 | 45 | with_coords = [(input_ids[i],coords[i]) for i in range(len(input_ids)) if coords[i][0] != -1 and attention_mask[i] == 1] 46 | # split with_coords into words - where a word is a list of tokens with the same coord 47 | words = [] 48 | current_word = [] 49 | current_coord = None 50 | for token in with_coords: 51 | if current_coord is None or (token[1] != current_coord).any(): 52 | if len(current_word) > 0: 53 | words.append(current_word) 54 | current_word = [] 55 | current_coord = token[1] 56 | current_word.append(token) 57 | words.append(current_word) 58 | 59 | 60 | # plot with_coords as text on a matplotlib figure 61 | 62 | fig = plt.figure() 63 | # make fig very big 64 | fig.set_size_inches(20,20) 65 | 66 | ax = fig.add_subplot(111) 67 | ax.set_xlim([0,1]) 68 | ax.set_ylim([0,1]) 69 | ax.set_aspect('equal') 70 | 71 | for word in words: 72 | word_str = "".join(tokenizer.convert_ids_to_tokens([i[0] for i in word])) 73 | word_coord = word[0][1] 74 | # very small text 75 | ax.text(word_coord[0],-word_coord[1],word_str,fontsize=10) 76 | 77 | # save the figure 78 | fig.savefig("tokens_with_coords.png") 79 | 80 | */ 81 | 82 | const words = coordTokens.reduce((acc,[x,y,tokenid])=>{ 83 | if(acc.length === 0 || acc[acc.length-1].length === 0 || acc[acc.length-1][0][0] !== x || acc[acc.length-1][0][1] !== y){ 84 | acc.push([]) 85 | } 86 | acc[acc.length-1].push([x,y,tokenid]) 87 | return acc 88 | },[]) 89 | 90 | const wordStrings = words.map(word=>llamaTokenizer.decode([0,...word.map(([x,y,tokenid])=>tokenid)])) 91 | 92 | const wordCoords = words.map(word=>word[0].slice(0,2)) 93 | 94 | // clear canvas, map onto canvas 95 | ctx.clearRect(0, 0, canvas.width, canvas.height); 96 | ctx.textAlign = "center"; 97 | ctx.font = '10px monospace'; 98 | 99 | const canvasCoords = wordCoords.map(([x,y])=>[x*canvas.width,(1-y)*canvas.height]) 100 | wordCoords.forEach(([x,y],i)=>{ 101 | const wordString = wordStrings[i]; 102 | ctx.fillStyle = wordString.match(/^\[\d+\]/) ? 'red' : 'black'; 103 | ctx.fillText(wordStrings[i],canvasCoords[i][0],canvasCoords[i][1]) 104 | }) 105 | 106 | // paste non-coord tokens into the pre 107 | // the first line is the prompt 108 | // the second line is the completion 109 | // find prompt vs. 
completion using firstIdxLastChunk 110 | // keep only prompt-region tokens that have no on-screen coordinate (x < 0) 111 | const promptTokens = tokenIds.map((tokenId,i)=>[tokenId,coords[i][0],i<firstIdxLastChunk&&coords[i][0]<0]).filter(([_,__,b])=>b).map(([tokenId,x])=>tokenId) 112 | const completionTokens = [0,...tokenIds.slice(firstIdxLastChunk,lastIdx+1).filter(i=>i>0)]; 113 | 114 | const promptString = llamaTokenizer.decode(promptTokens); 115 | const completionString = llamaTokenizer.decode(completionTokens); 116 | 117 | const output = document.getElementById('output'); 118 | output.innerText = promptString + '\n' + completionString; 119 | 120 | console.log(llamaTokenizer.decode(tokenIds)) 121 | } 122 | 123 | setTimeout(render, 500); 124 | } -------------------------------------------------------------------------------- /llama2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Llama2D/llama2d/e28b97255d396c717fe183b96b802ff39ffd7e6d/llama2d.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | target-version = ['py311'] 4 | extend-exclude = ''' 5 | /( 6 |     \.venv 7 |   | venv 8 |   | \.git 9 |   | build 10 |   | alembic 11 |   | transformers 12 | )/ 13 | ''' 14 | 15 | [build-system] 16 | requires = ["setuptools"] 17 | build-backend = "setuptools.build_meta" 18 | 19 | [tool.isort] 20 | profile = "black" 21 | multi_line_output = 3 22 | include_trailing_comma = true 23 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | huggingface_hub[cli,torch] 2 | datasets 3 | huggingface_hub 4 | langchain 5 | 6 | wandb 7 | matplotlib 8 | playwright 9 | selenium 10 | 11 | google-cloud-vision 12 | Pillow 13 | modal 14 | 15 | faiss-cpu 16 | sentencepiece 17 | 18 | torch 19 | nest-asyncio 20 | gdown 21 | peft 22 | pre-commit 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.11 3 | # by the following command: 4 | # 5 | #    pip-compile 6 | # 7 | accelerate==0.23.0 8 | # via peft 9 | aiohttp==3.8.5 10 | # via 11 | # datasets 12 | # fsspec 13 | # langchain 14 | # modal 15 | aiosignal==1.3.1 16 | # via aiohttp 17 | aiostream==0.5.0 18 | # via modal 19 | annotated-types==0.5.0 20 | # via pydantic 21 | anyio==3.7.1 22 | # via 23 | # fastapi 24 | # langchain 25 | # starlette 26 | # watchfiles 27 | appdirs==1.4.4 28 | # via wandb 29 | asgiref==3.7.2 30 | # via modal 31 | async-timeout==4.0.3 32 | # via aiohttp 33 | attrs==23.1.0 34 | # via 35 | # aiohttp 36 | # outcome 37 | # sigtools 38 | # trio 39 | beautifulsoup4==4.12.2 40 | # via gdown 41 | cachetools==5.3.1 42 | # via google-auth 43 | certifi==2023.7.22 44 | # via 45 | # modal 46 | # requests 47 | # selenium 48 | # sentry-sdk 49 | cfgv==3.4.0 50 | # via pre-commit 51 | charset-normalizer==3.2.0 52 | # via 53 | # aiohttp 54 | # requests 55 | click==8.1.7 56 | # via 57 | # modal 58 | # typer 59 | # wandb 60 | cloudpickle==2.2.1 61 | # via modal 62 | contourpy==1.1.1 63 | # via matplotlib 64 | cycler==0.11.0 65 | # via matplotlib 66 | dataclasses-json==0.6.1 67 | # via langchain 68 | datasets==2.14.5 69 | # via -r requirements.in 70 | dill==0.3.7 71 | # via 72 | # datasets 73 | # multiprocess 74 | 
distlib==0.3.7 75 | # via virtualenv 76 | docker-pycreds==0.4.0 77 | # via wandb 78 | faiss-cpu==1.7.4 79 | # via -r requirements.in 80 | fastapi==0.103.1 81 | # via modal 82 | filelock==3.12.4 83 | # via 84 | # gdown 85 | # huggingface-hub 86 | # torch 87 | # transformers 88 | # virtualenv 89 | fonttools==4.42.1 90 | # via matplotlib 91 | frozenlist==1.4.0 92 | # via 93 | # aiohttp 94 | # aiosignal 95 | fsspec[http]==2023.6.0 96 | # via 97 | # datasets 98 | # huggingface-hub 99 | gdown==4.7.1 100 | # via -r requirements.in 101 | gitdb==4.0.10 102 | # via gitpython 103 | gitpython==3.1.37 104 | # via wandb 105 | google-api-core[grpc]==2.12.0 106 | # via google-cloud-vision 107 | google-auth==2.23.1 108 | # via google-api-core 109 | google-cloud-vision==3.4.4 110 | # via -r requirements.in 111 | googleapis-common-protos==1.60.0 112 | # via 113 | # google-api-core 114 | # grpcio-status 115 | greenlet==2.0.2 116 | # via playwright 117 | grpcio==1.58.0 118 | # via 119 | # google-api-core 120 | # grpcio-status 121 | grpcio-status==1.58.0 122 | # via google-api-core 123 | grpclib==0.4.3 124 | # via modal 125 | h11==0.14.0 126 | # via wsproto 127 | h2==4.1.0 128 | # via grpclib 129 | hpack==4.0.0 130 | # via h2 131 | huggingface-hub[cli,torch]==0.17.3 132 | # via 133 | # -r requirements.in 134 | # accelerate 135 | # datasets 136 | # transformers 137 | hyperframe==6.0.1 138 | # via h2 139 | identify==2.5.29 140 | # via pre-commit 141 | idna==3.4 142 | # via 143 | # anyio 144 | # requests 145 | # trio 146 | # yarl 147 | importlib-metadata==6.8.0 148 | # via modal 149 | inquirerpy==0.3.4 150 | # via huggingface-hub 151 | jinja2==3.1.2 152 | # via torch 153 | jsonpatch==1.33 154 | # via langchain 155 | jsonpointer==2.4 156 | # via jsonpatch 157 | kiwisolver==1.4.5 158 | # via matplotlib 159 | langchain==0.0.304 160 | # via -r requirements.in 161 | langsmith==0.0.41 162 | # via langchain 163 | markdown-it-py==3.0.0 164 | # via rich 165 | markupsafe==2.1.3 166 | # via jinja2 167 | marshmallow==3.20.1 168 | # via dataclasses-json 169 | matplotlib==3.8.0 170 | # via -r requirements.in 171 | mdurl==0.1.2 172 | # via markdown-it-py 173 | modal==0.53.3665 174 | # via -r requirements.in 175 | mpmath==1.3.0 176 | # via sympy 177 | multidict==6.0.4 178 | # via 179 | # aiohttp 180 | # grpclib 181 | # yarl 182 | multiprocess==0.70.15 183 | # via datasets 184 | mypy-extensions==1.0.0 185 | # via typing-inspect 186 | nest-asyncio==1.5.8 187 | # via -r requirements.in 188 | networkx==3.1 189 | # via torch 190 | nodeenv==1.8.0 191 | # via pre-commit 192 | numexpr==2.8.7 193 | # via langchain 194 | numpy==1.26.0 195 | # via 196 | # accelerate 197 | # contourpy 198 | # datasets 199 | # langchain 200 | # matplotlib 201 | # numexpr 202 | # pandas 203 | # peft 204 | # pyarrow 205 | # transformers 206 | outcome==1.2.0 207 | # via trio 208 | packaging==23.1 209 | # via 210 | # accelerate 211 | # datasets 212 | # huggingface-hub 213 | # marshmallow 214 | # matplotlib 215 | # peft 216 | # transformers 217 | pandas==2.1.1 218 | # via datasets 219 | pathtools==0.1.2 220 | # via wandb 221 | peft==0.5.0 222 | # via -r requirements.in 223 | pfzy==0.3.4 224 | # via inquirerpy 225 | pillow==10.0.1 226 | # via 227 | # -r requirements.in 228 | # matplotlib 229 | platformdirs==3.10.0 230 | # via virtualenv 231 | playwright==1.38.0 232 | # via -r requirements.in 233 | pre-commit==3.4.0 234 | # via -r requirements.in 235 | prompt-toolkit==3.0.39 236 | # via inquirerpy 237 | proto-plus==1.22.3 238 | # via google-cloud-vision 239 | 
protobuf==4.24.3 240 | # via 241 | # google-api-core 242 | # google-cloud-vision 243 | # googleapis-common-protos 244 | # grpcio-status 245 | # modal 246 | # proto-plus 247 | # wandb 248 | psutil==5.9.5 249 | # via 250 | # accelerate 251 | # peft 252 | # wandb 253 | pyarrow==13.0.0 254 | # via datasets 255 | pyasn1==0.5.0 256 | # via 257 | # pyasn1-modules 258 | # rsa 259 | pyasn1-modules==0.3.0 260 | # via google-auth 261 | pydantic==2.4.1 262 | # via 263 | # fastapi 264 | # langchain 265 | # langsmith 266 | pydantic-core==2.10.1 267 | # via pydantic 268 | pyee==9.0.4 269 | # via playwright 270 | pygments==2.16.1 271 | # via rich 272 | pyparsing==3.1.1 273 | # via matplotlib 274 | pysocks==1.7.1 275 | # via 276 | # requests 277 | # urllib3 278 | python-dateutil==2.8.2 279 | # via 280 | # matplotlib 281 | # pandas 282 | pytz==2023.3.post1 283 | # via pandas 284 | pyyaml==6.0.1 285 | # via 286 | # accelerate 287 | # datasets 288 | # huggingface-hub 289 | # langchain 290 | # peft 291 | # pre-commit 292 | # transformers 293 | # wandb 294 | regex==2023.8.8 295 | # via transformers 296 | requests[socks]==2.31.0 297 | # via 298 | # datasets 299 | # fsspec 300 | # gdown 301 | # google-api-core 302 | # huggingface-hub 303 | # langchain 304 | # langsmith 305 | # transformers 306 | # wandb 307 | rich==13.5.3 308 | # via modal 309 | rsa==4.9 310 | # via google-auth 311 | safetensors==0.3.3 312 | # via 313 | # peft 314 | # transformers 315 | selenium==4.13.0 316 | # via -r requirements.in 317 | sentencepiece==0.1.99 318 | # via -r requirements.in 319 | sentry-sdk==1.31.0 320 | # via wandb 321 | setproctitle==1.3.2 322 | # via wandb 323 | sigtools==4.0.1 324 | # via synchronicity 325 | six==1.16.0 326 | # via 327 | # docker-pycreds 328 | # gdown 329 | # python-dateutil 330 | smmap==5.0.1 331 | # via gitdb 332 | sniffio==1.3.0 333 | # via 334 | # anyio 335 | # trio 336 | sortedcontainers==2.4.0 337 | # via trio 338 | soupsieve==2.5 339 | # via beautifulsoup4 340 | sqlalchemy==2.0.21 341 | # via langchain 342 | starlette==0.27.0 343 | # via fastapi 344 | sympy==1.12 345 | # via torch 346 | synchronicity==0.5.3 347 | # via modal 348 | tblib==2.0.0 349 | # via modal 350 | tenacity==8.2.3 351 | # via langchain 352 | tokenizers==0.13.3 353 | # via transformers 354 | toml==0.10.2 355 | # via modal 356 | torch==2.0.1 357 | # via 358 | # -r requirements.in 359 | # accelerate 360 | # huggingface-hub 361 | # peft 362 | tqdm==4.66.1 363 | # via 364 | # datasets 365 | # gdown 366 | # huggingface-hub 367 | # peft 368 | # transformers 369 | transformers==4.33.3 370 | # via peft 371 | trio==0.22.2 372 | # via 373 | # selenium 374 | # trio-websocket 375 | trio-websocket==0.11.1 376 | # via selenium 377 | typer==0.9.0 378 | # via modal 379 | types-certifi==2021.10.8.3 380 | # via modal 381 | types-toml==0.10.8.7 382 | # via modal 383 | typing-extensions==4.8.0 384 | # via 385 | # aiostream 386 | # fastapi 387 | # huggingface-hub 388 | # modal 389 | # pydantic 390 | # pydantic-core 391 | # pyee 392 | # sqlalchemy 393 | # torch 394 | # typer 395 | # typing-inspect 396 | typing-inspect==0.9.0 397 | # via dataclasses-json 398 | tzdata==2023.3 399 | # via pandas 400 | urllib3[socks]==2.0.5 401 | # via 402 | # google-auth 403 | # requests 404 | # selenium 405 | # sentry-sdk 406 | virtualenv==20.24.5 407 | # via pre-commit 408 | wandb==0.15.11 409 | # via -r requirements.in 410 | watchfiles==0.20.0 411 | # via modal 412 | wcwidth==0.2.6 413 | # via prompt-toolkit 414 | wsproto==1.2.0 415 | # via trio-websocket 416 | 
xxhash==3.3.0 417 | # via datasets 418 | yarl==1.9.2 419 | # via aiohttp 420 | zipp==3.17.0 421 | # via importlib-metadata 422 | 423 | # The following packages are considered to be unsafe in a requirements file: 424 | # setuptools 425 | -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Llama2D/llama2d/e28b97255d396c717fe183b96b802ff39ffd7e6d/screenshot.png -------------------------------------------------------------------------------- /script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if ImageMagick is installed 4 | if ! command -v convert &> /dev/null; then 5 | echo "ImageMagick is not installed. Please install it before running this script." 6 | exit 1 7 | fi 8 | 9 | # Directory containing the images 10 | input_dir="." 11 | 12 | # Output GIF file name 13 | output_gif="output.gif" 14 | 15 | # Check if the input directory exists 16 | if [ ! -d "$input_dir" ]; then 17 | echo "Input directory not found: $input_dir" 18 | exit 1 19 | fi 20 | 21 | # Change to the input directory 22 | cd "$input_dir" || exit 23 | 24 | # Create the GIF from images 0.png through 8.png 25 | convert -delay 100 -loop 0 {0..8}.png "$output_gif" 26 | 27 | # Verify if the GIF creation was successful 28 | if [ $? -eq 0 ]; then 29 | echo "GIF file created successfully: $output_gif" 30 | else 31 | echo "Failed to create the GIF." 32 | fi 33 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | ROOT_DIR = Path(__file__).parent.parent.resolve() 4 | SRC_DIR = ROOT_DIR / "src" 5 | -------------------------------------------------------------------------------- /src/data/.gitignore: -------------------------------------------------------------------------------- 1 | # hide all files in subdirectories 2 | */**/* 3 | 4 | # allow everything in the root directory 5 | !./* -------------------------------------------------------------------------------- /src/data/mind2web_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pos_candidates": [ 3 | { 4 | "attributes": "{\"backend_node_id\": \"136\", \"bounding_box_rect\": \"110,607.390625,264,78\", \"class\": \"MuiSelect-root MuiSelect-select jss31 MuiSelect-filled jss32 MuiInputBase-input MuiFilledInput-input jss22 MuiInputBase-inputAdornedStart MuiFilledInput-inputAdornedStart\", \"id\": \"reservations-city-search-type\", \"name\": \"type\", \"data_pw_testid_buckeye_candidate\": \"1\"}", 5 | "backend_node_id": "136", 6 | "is_original_target": true, 7 | "is_top_level_target": true, 8 | "tag": "select" 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /src/data/pretraining-cache/.gitignore: -------------------------------------------------------------------------------- 1 | **/* 2 | !.gitignore -------------------------------------------------------------------------------- /src/data/pretraining_urls.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | current_dir = Path(__file__).parent 4 | 5 | with open(current_dir / "urls.txt", "r") as f: 6 | urls = f.read().splitlines() 7 | 
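# `urls` is imported by `llama2d.datasets.pretraining`
# (as `from src.data.pretraining_urls import urls`) to build the
# next-token-prediction pretraining dataset.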
-------------------------------------------------------------------------------- /src/data/urls.txt: -------------------------------------------------------------------------------- 1 | http://dannysellstampabay.com 2 | http://floridahomeswithcarrie.com 3 | https://www.tarynsellshouses.com/ 4 | https://robin.homes/signup/alex.colley 5 | https://www.demirealestates.com/ 6 | https://johngarcia.lpthomesearch.com/ 7 | https://keepingfloridamoving.com/ 8 | https://yeseniaalicea.lpthomesearch.com/ 9 | https://diannephillips.lpthomesearch.com/ 10 | https://angelwilkinson.lpthomesearch.com/ 11 | https://loreanarealestate.com/ 12 | https://www.judy-cortez.net/ 13 | https://victorialegrow.lpthomesearch.com/ 14 | https://tanyaveitch.lpthomesearch.com/ 15 | http://therealgatorrealty.com 16 | https://neldagregory.lpthomesearch.com/ 17 | http://jennysuncoast.com 18 | https://www.cendonrealtor.net/ 19 | https://appletreerealty.com 20 | https://makersolutionsinc.com/ 21 | http://www.thealsbrooksteam.com/ 22 | https://www.jandjrealtygroup.com/ 23 | https://www.beachpropertymanagerandsales.com/ 24 | http://tampabayhomessearch.com 25 | http://www.sarasotasandy.com 26 | http://www.getrichhome.com 27 | https://robin.homes/signup/amy.tejeda 28 | https://robin.homes/signup/lisa.spencer2 29 | https://robin.homes/signup/azalia.vasquez 30 | https://www.hoppersellshomes.com/ 31 | https://estradahomesales.com 32 | https://pier21realty.myrealestateplatform.com/ 33 | http://wyserhomespreferred.com 34 | https://florencezimmerman.com/ 35 | http://tampabayhomessearch.com 36 | http://therealrachelgoldman.com/ 37 | https://robin.homes/signup/william.vergara 38 | https://clermontrealestate.rezora.com/listing/demomls/155497461/6 39 | https://www.lauraschulerrealtor.com/ 40 | http://suewoodsellsleesburg.com 41 | https://joshuniversityrealty.com 42 | https://chelseacooper.lpthomesearch.com/ 43 | https://larissafloridarealtor.com 44 | https://robin.homes/signup/stavrula.crafa 45 | https://www.themillsgroupkw.com/ 46 | http://viewverobeachhomes.com 47 | http://www.sarasotasandy.com 48 | http://lauralynchristianhomes.com 49 | http://agnesrosehomes.com 50 | http://joycesellspalmcoast.com/ 51 | http://venicefloridahomes.com 52 | http://jcsunrisehomes.com 53 | http://livinparadiserealty.com 54 | https://ritapellens.com/ 55 | http://therealbrianwalsh.com 56 | http://www.everydayintampabay.com 57 | https://stefaniewargo.lpthomesearch.com/ 58 | https://mariatroncoso.lpthomesearch.com 59 | https://axelrodriguez.lpthomesearch.com/ 60 | https://buyandsellrealestateinflorida.com/ 61 | https://angelacardona.lpthomesearch.com/ 62 | https://trystanfoglia.lpthomesearch.com/ 63 | https://www.mariacastrellonrealtor.net 64 | https://yaecolon.tampabayhome.com/ 65 | https://zidniaayala.lpthomesearch.com/ 66 | https://mattsellsbrevard.com/ 67 | https://mirianriera1.lpthomesearch.com/ 68 | https://lilianaoviedo.lpthomesearch.com/ 69 | https://sandravargas.lpthomesearch.com/ 70 | https://josephbarnes.lpthomesearch.com/ 71 | https://loriwilson.lpthomesearch.com/ 72 | https://sierrarealtyfl.com/ 73 | http://katiuskaquinterorealtor.com/ 74 | https://yourkeytoflorida.com/ 75 | https://andreabishop.lpthomesearch.com/ 76 | https://davianmedina.unrealty.com/ 77 | https://closewithkhalid.net/ 78 | https://edeliosanchez.expchime.com/ 79 | https://matthewlester.lpthomesearch.com/ 80 | https://swflhomesales.com/ 81 | https://homesbyyessy.com/ 82 | http://www.cevhomes.com/ 83 | https://robin.homes/signup/daniel.hut 84 | https://robin.homes/signup/irene.guy 85 | 
http://brendaefffect.com 86 | https://robin.homes/signup/rachael.corry 87 | http://www.eddierealty.com/ 88 | https://destination.myrealtyonegroup.com/ 89 | http://www.nazarbasrealtor.com 90 | http://www.theresilienthomegroup.com 91 | https://robin.homes/signup/jeevan.hanuman 92 | http://www.thepensalagroup.com/ 93 | http://www.gulfcoastintegritygroup.com/ 94 | https://robin.homes/signup/alexis.willims 95 | https://robin.homes/signup/valentino.sanchez 96 | http://pgpcrealty.com 97 | http://buyandsellpolkhomes.com 98 | https://omarandreasen.com/ 99 | https://rebeccareadusre.com 100 | http://www.nazarbasrealtor.com 101 | https://robin.homes/signup/chodry.andre1 102 | http://realty.com/siesta-key-fl 103 | http://goldenclassrealty.com 104 | http://www.homeasap.com/1564211 105 | https://robin.homes/signup/valentina.cappetta 106 | https://robin.homes/signup/justin.owens 107 | https://robin.homes/signup/tyler.beasley 108 | http://buymultifamilyinvestments.com 109 | https://robin.homes/signup/karla.joneswilson2 110 | https://robin.homes/signup/sara.schneider 111 | http://www.homeasap.com/354522 112 | https://www.jandjrealtygroup.com 113 | http://thadismyrealtor.com 114 | https://karladeleon.lpthomesearch.com/ 115 | https://gomezhomegroup.com/ 116 | https://robin.homes/signup/krystal.crichlow 117 | https://robin.homes/signup/ashley.vanpelt 118 | https://blakeesekie.lpthomesearch.com 119 | http://www.homeasap.com/1630578 120 | http://www.homeasap.com/94340 121 | https://robin.homes/signup/justin.owens 122 | https://www.jesvalrealestate.com/ 123 | https://www.kaizen-realty.com/ 124 | https://www.yourrealtyspecialist.com/ 125 | https://www.livelocalre.com 126 | http://www.tjcosgrove.com/ 127 | http://danjoproperties.com 128 | https://robin.homes/signup/anabely.delatorre1 129 | http://www.homeasap.com/161009 130 | https://robin.homes/signup/elena.sherstikova 131 | https://taquishamccluster.lpthomesearch.com 132 | https://ahsguarantee.com 133 | http://www.homeasap.com/1647338 134 | https://argeliavidal.com/ 135 | https://tamiamirealtyllc.com 136 | https://robin.homes/signup/amy.brocco 137 | https://robin.homes/signup/pharah.dutrevil 138 | https://taylor-smalley.com/ 139 | http://mermaizinghomes.com 140 | http://properties4saleinflorida.com 141 | http://hillsboroughcountyhomes4sale.co 142 | http://tampabayareahomesforsales.com 143 | https://robin.homes/signup/suzanne.dickson2 144 | https://robin.homes/signup/david.ponte 145 | https://robin.homes/signup/emily.kirshaw 146 | https://robin.homes/signup/azalia.vasquez 147 | http://www.homeasap.com/1047411 148 | https://hirethepirate.com 149 | http://lsteuberrealty.com 150 | https://www.srqareahomefinder.com 151 | http://www.bluesunrealty.com/ 152 | https://robin.homes/signup/david.brown12 153 | http://jessicalipprealty.com 154 | https://robin.homes/signup/lori.moses 155 | https://robin.homes/signup/stacy.bracewell 156 | https://robin.homes/signup/dana.lincolnpa 157 | https://robin.homes/signup/maurice.johnson3 158 | https://robin.homes/signup/gracemary.guastella 159 | https://robin.homes/signup/angelo.marcello 160 | http://www.homeasap.com/1645736 161 | https://robin.homes/signup/julie.sbrocco 162 | https://robin.homes/signup/missy.mcamis 163 | https://robin.homes/signup/lissette.sanchez 164 | https://robin.homes/signup/lisa.kelly2 165 | https://robin.homes/signup/bobbie.robinson 166 | https://robin.homes/signup/denise.becker 167 | https://robin.homes/signup/ashley.cooley 168 | https://robin.homes/signup/brandon.johnson 169 | 
https://robin.homes/signup/khalid.inshan 170 | https://robin.homes/signup/milton.figueroa 171 | https://robin.homes/signup/laura.rodrigueztello 172 | https://robin.homes/signup/ha.benacquisto 173 | https://robin.homes/signup/don.latimer 174 | https://robin.homes/signup/francesca.wilson 175 | https://robin.homes/signup/lsalma.abdelaal 176 | http://viewveniceflhomes.com 177 | https://www.yourwayhome.net 178 | https://www.lakebrantleyhomes.com/ 179 | https://danesacolon.lpthomesearch.com 180 | http://viewbradentonflhomes.com 181 | https://issaygonzalez.lpthomesearch.com/ 182 | https://robin.homes/signup/elizabethdesiree.morales1 183 | https://yanirasuarez.lpthomesearch.com/ 184 | http://tampabayareahomesforsale.com 185 | http://www.homeasap.com/1331654 186 | http://topleesburgrealestate.com 187 | http://jillwillsell.com/ 188 | http://jennifer-sims.elevatesite.com 189 | https://unrealty.com/ 190 | https://robin.homes/signup/norma.gonsalves 191 | https://robin.homes/signup/ted.moseley 192 | https://robin.homes/signup/kellie.birmingham 193 | http://www.homeasap.com/881410 194 | https://cynthiaporpora.lpthomesearch.com 195 | https://binghamrealtyinc.com/ 196 | http://www.themeadowsteam.com/ 197 | https://sage-chaja-eedcbf.netlify.app/ 198 | https://sage-chaja-eedcbf.netlify.app/ 199 | https://www.bhhsfloridarealty.com/ 200 | https://searchpalmharbor.com 201 | https://tkc-platinum-properties.findme.homes/ 202 | http://projectmyhomeflorida.com 203 | http://www.earlsellstampa.com 204 | http://paigeboothrealty.com 205 | http://www.homesweettampabay.com 206 | https://ruthiearchie.myhomehq.biz 207 | http://mykwgb.com 208 | https://championsgate.realtor 209 | http://saintpetersburghomesfl.com 210 | http://www.homeasap.com/640516 211 | http://tampa-homesforsale.com 212 | http://genevievesproperties.com 213 | https://robin.homes/signup/abraham.mendez1 214 | https://seminoleheightsliving.com/ 215 | http://www.homeasap.com/632713 216 | http://www.blakerealestate.com/ 217 | http://www.homeasap.com/984715 218 | http://www.homeasap.com/619950 219 | http://lizcarvalho.propertyportalmarketing.com 220 | http://thebucketlistteam.com 221 | https://scottbryant.lpthomesearch.com 222 | http://flynnsellsflorida.com 223 | http://www.homeasap.com/1645802 224 | http://www.homeasap.com/1645492 225 | http://www.homeasap.com/1643639 226 | http://www.thetampapropertyfinder.com/ 227 | https://robin.homes/signup/chodry.andre1 228 | https://robin.homes/signup/william.dibernardo1 229 | https://robin.homes/signup/lisa.eichenblatt 230 | https://robin.homes/signup/jeannette.mcintosh 231 | http://www.homeasap.com/1646297 232 | http://southtampasweethome.com 233 | http://kathycongdonhomessold.com 234 | https://bricksfolios.inbestments.com/ 235 | https://www.inspiredpropertiessrq.com 236 | https://robin.homes/signup/liliana.lassalle 237 | http://stpetetropical.net 238 | https://peoplearemypassion.com/ 239 | https://www.lpsantos.com 240 | http://www.briannacapuano.com/ 241 | https://neighborhood-professionals-101944491.remax.com 242 | https://local-expert-101937672.remax.com 243 | https://capital-realty-100430055.remax.com 244 | https://legacy-100430027.remax.com 245 | https://tropical-sands-100429845.remax.com 246 | https://domenicaaraguache.lpthomesearch.com/ 247 | http://kbhomesrealty1.com 248 | http://drivingfloridahome.net 249 | https://robin.homes/signup/anabely.delatorre1 250 | https://robin.homes/signup/kristal.saladin 251 | http://davidhgoodii.com 252 | http://robyncavallaro.com/ 253 | https://www.ezhomesearch.com 254 | 
https://bursonhomeadvisors.com 255 | http://www.residethrivetampa.com/ 256 | https://www.TurnerPropertyMgmt.com 257 | https://mensnyoreste1.lpthomesearch.com/ 258 | https://victordeleon.rogtampabay.com 259 | http://www.valeriayafferealtor.com 260 | https://doreenlandi.info/ 261 | http://AmandaAlligoodsellsfl.com 262 | http://homesbydonnawilliams.com 263 | http://realty.com/lakewood-ranch-fl 264 | https://www.joenewstreet.com/ 265 | https://robin.homes/signup/cecilia.cabrales 266 | http://franciscoromerorealtor.com/ 267 | https://robin.homes/signup/danielle.kielpikowski1 268 | http://www.susanbenante.com 269 | https://hidalisnunez.lpthomesearch.com/ 270 | https://greaterlakelandhomes.com/ 271 | https://orlandoandbeyond.com/ 272 | https://www.sarasotadreamlifestyle.com/ 273 | https://elizabethcolon.tampabayhome.com 274 | https://davidnpacheco.lpthomesearch.com/ 275 | https://charliesantos.expchime.com 276 | https://ernestoperez1.lpthomesearch.com/ComplianceCheck/active/586 277 | https://robin.homes/signup/lily.aymat 278 | https://robin.homes/signup/jeevan.hanuman 279 | https://robin.homes/signup/cheryl.burcham 280 | https://chrislarue.lpthomesearch.com/ 281 | https://danielpaz.lpthomesearch.com/ 282 | http://darbiepfeifferrealestate.com 283 | http://johnkelleyflhomes.com 284 | http://merlybuysandsells.com 285 | http://justinbrandonhomes.com 286 | http://www.homeasap.com/659994 287 | http://remaxassured.comandzinnoteam.com 288 | http://www.isellbabcockranch.com 289 | https://floridarealtortony.com 290 | https://staugustine.evrealestate.com/ 291 | https://robin.homes/signup/michael.bellamy1 292 | https://robin.homes/signup/morgan.porter1 293 | https://robin.homes/signup/maria.tilton 294 | https://robin.homes/signup/tennille.moore1 295 | https://robin.homes/signup/bianca.pineda 296 | https://p-33d82e42-351e-4188-b36b-11ae45b6ac8c.presencepreview.site/ 297 | https://www.cathyrunningrealtor.com/ 298 | https://valerusre.com/ 299 | http://buyorsellsouthwestfloridahomes.com 300 | http://buyeorselleastoralndohomes.com 301 | https://www.mvprealty.com/ 302 | http://tourtampabayhomes.com 303 | http://ltrhomes.com 304 | https://vanderleelie.com 305 | https://janellepruitt.realtor 306 | https://karuna.realestate 307 | http://helensfloridahomes.com 308 | http://www.fivestarflorida.com/ 309 | http://DeannaBradley.com 310 | http://integrity1stgroup.com/ 311 | https://dallascrider.lpthomesearch.com/ 312 | http://monopolygre.com 313 | https://www.mysahomes.net 314 | http://vanderleelie.com 315 | https://robin.homes/signup/gemma.peterson 316 | https://robin.homes/signup/alex.estevez 317 | https://bursonhomeadvisors.com 318 | https://www.elliman.com 319 | http://williamroganrealtor.com/ 320 | http://www.homeasap.com/14523 321 | https://valeriemcinerney.sarasotarealestatehub.com/ 322 | https://pinpointrealtyfl.com 323 | https://www.mcsellsmanatee.com/ 324 | https://robin.homes/signup/paul.mcdonald 325 | http://www.homeasap.com/1637509 326 | https://danrojas.lpthomesearch.com 327 | https://reganpappas.com 328 | https://priscillaarzivian.lpthomesearch.com/ 329 | http://www.sunwestrealtyflorida.com/ 330 | https://www.gibbsgrouptampa.com/ 331 | http://www.homeasap.com/1401384 332 | https://robin.homes/signup/nan.robinson 333 | http://thelaygroup.com/ 334 | http://www.ivanaldearealty.com 335 | https://usa.premmedia.com/better_homes_and_gardens_flagler_beach/ 336 | https://www.baywestrealtygroup.com 337 | https://robin.homes/signup/christine.nargi 338 | https://www.garyberkson.com/ 339 | http://BillandGinger.com 340 | 
https://robin.homes/signup/evan.devorace 341 | https://gabrielhoyos.lpthomesearch.com.lpthomesearch.com 342 | https://robin.homes/signup/mariaelena.martinez 343 | https://robin.homes/signup/nicole.musgrave1 344 | https://robin.homes/signup/janet.mansfield 345 | https://robin.homes/signup/myriah.schifley 346 | https://robin.homes/signup/muneera.mohamed 347 | https://brysonwalters.findhomesintampaflorida.com/ 348 | http://jeanjannrealty.com 349 | https://johson.com 350 | http://amysellsbrevard.com 351 | http://seacowobsessedagent.com 352 | https://valriggshomes.com/ 353 | https://www.rbfloridahomes.com 354 | https://searchswflhomesforsale.com 355 | https://robin.homes/signup/ann.osullivan 356 | https://robin.homes/signup/mary.blinkhorn 357 | https://robin.homes/signup/debbie.snowden 358 | https://robin.homes/signup/brent.canevari 359 | https://jillanayas.com/ 360 | https://robin.homes/signup/william.dibernardo 361 | http://www.foxxteam.com/ 362 | http://www.homeasap.com/768207 363 | http://soldbyjosh.net 364 | https://westerberggroup.com/ 365 | http://myfloridarealestateforyou.com 366 | https://stephanieeisenbach.lpthomesearch.com/ 367 | https://davidcardona.lpthomesearch.com/ 368 | https://samdiasrealestate.com 369 | https://monicadiazquiroz.lpthomesearch.com 370 | https://anthonyrussell.lpthomesearch.com/ 371 | https://michaelatate.myrealestateplatform.com/ 372 | https://robin.homes/signup/jeanna.jackson1 373 | https://robin.homes/signup/alana.ohanlan1 374 | https://marlolaney.lpthomesearch.com 375 | https://www.hoppersellshomes.com/ 376 | https://www.anchorrealtorgroup.com/ 377 | https://lisbethbetizagatti.lpthomesearch.com/ 378 | https://victoriatejeda1.lpthomesearch.com/ 379 | http://www.teambelmonte.com 380 | https://robin.homes/signup/dinorat.querales 381 | http://buysellliveorlando.com 382 | https://mandjphamdevelopment.com/ 383 | https://gabriellechavez.lpthomesearch.com/ 384 | https://shannonhartrealtor.com/ 385 | http://www.jbricksrealty.com 386 | http://wrarealestate.com 387 | https://www.livingdilife.com 388 | https://robin.homes/signup/olga.sexson 389 | https://robin.homes/signup/lucia.yang 390 | https://paulapalomino1.lpthomesearch.com/ 391 | https://www.succesrealtyco.net/ 392 | https://janicerodriguez.lpthomesearch.com/ 393 | https://johnarroyo.lpthomesearch.com/ 394 | https://teamchristie.lpthomesearch.com 395 | https://robin.homes/signup/fanny.horn 396 | https://robin.homes/signup/kayla.durias 397 | http://searchvenicehomesfl.com 398 | https://michaelatate.myrealestateplatform.com/ 399 | https://www.yournaturecoasthomesearch.com/ 400 | https://www.yourhomegirlbeth.net/ 401 | http://www.homeasap.com/1638771 402 | https://unrealtyflorida.unrealty.com 403 | http://jazzysellsflorida.com 404 | https://amandatheiler.chime.me 405 | https://timeisoftheessencewithtosha.estate 406 | https://jessicaalopaeus.lpthomesearch.com/ 407 | https://theyinglingteam.com 408 | https://www.paradisegrpfl.com 409 | https://adamrobinson.lpthomesearch.com 410 | https://sarahsmith.lpthomesearch.com/ 411 | https://robertcruz.lpthomesearch.com 412 | https://zerahcruz.lpthomesearch.com 413 | http://www.iselldelandflorida.com/ 414 | https://www.jbsellshomes.com/ 415 | http://www.trusshomessarasota.com/ 416 | http://www.sarasotarealtor.com/ 417 | https://fcrg.backagent.net/ 418 | https://johnlstrauss.lpthomesearch.com/ 419 | http://matikglobalrealty.com 420 | https://usa.premmedia.com/better_homes_and_gardens_new_smyrna_beach/ 421 | http://www.1percentlistfl.com 422 | https://www.terraexcelsior.com/ 423 | 
https://robin.homes/signup/silvia.mozer 424 | http://www.homeasap.com/1636913 425 | https://robin.homes/signup/daisy.gonzalez 426 | http://www.homeasap.com/1636606 427 | http://veronicawhittingtonhomes.com 428 | http://www.livingdilife.com 429 | https://themarkrameygroup.com 430 | https://robin.homes/signup/jeevan.hanuman 431 | http://www.nelsoncruzteam.com/ 432 | https://thewilchergroup.com 433 | https://homeasap.com/856425 434 | https://robin.homes/signup/nathan.jacoby 435 | https://sandranaumovski.lpthomesearch.com/ 436 | https://yetseniamtorres.lpthomesearch.com/ 437 | https://robin.homes/signup/mark.langley 438 | https://www.caseytranrealestate.com/ 439 | https://robin.homes/signup/leidy.lara 440 | http://emeraldrealtycofl.com 441 | http://sagegainesville.com/ 442 | https://helloreeve.com 443 | https://windermereintrealty.com 444 | http://listwithpeteandcheryl.com 445 | http://janicesellsorlando.com 446 | http://zoyasellsflorida.com 447 | http://livingabundantlygroup.com 448 | -------------------------------------------------------------------------------- /src/llama2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Llama2D/llama2d/e28b97255d396c717fe183b96b802ff39ffd7e6d/src/llama2d/__init__.py -------------------------------------------------------------------------------- /src/llama2d/constants.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | ROOT_DIR = Path(__file__).parent.parent.parent.resolve() 4 | 5 | # 3 times the resolution of a 1080p monitor 6 | SCREEN_RESOLUTION = (1280, 1080 * 3) 7 | 8 | DATA_DIR = ROOT_DIR / "data" 9 | 10 | MIND2WEB_MHTML_DIR = DATA_DIR / "mind2web-mhtml" 11 | MIND2WEB_HHTML_DIR = DATA_DIR / "mind2web-hhtml" 12 | 13 | MIND2WEB_OUT_DIR = DATA_DIR / "mind2web-out" 14 | MIND2WEB_IN_DIR = DATA_DIR / "mind2web-in" 15 | MIND2WEB_VIZ_DIR = DATA_DIR / "mind2web-viz" 16 | 17 | MIND2WEB_CACHE_DIR = DATA_DIR / "mind2web-cache" 18 | PRETRAINING_CACHE_DIR = DATA_DIR / "pretraining-cache" 19 | 20 | # path to the Google Cloud credentials file 21 | SECRETS_FILE = ROOT_DIR / "secrets" / "gcp-vision.json" 22 | 23 | # max number of tokens allowed in a page screenshot 24 | # we will remove all page tokens after this number 25 | MAX_PAGE_LEN = 1000 26 | 27 | # max number of tokens inputted to Llama2d - between prompt, page, and completion 28 | # we will truncate big inputs to this number 29 | # we will also pad small inputs to this number 30 | MAX_SEQ_LEN = 300 31 | 32 | MAX_TAGS_LEN = 150 33 | -------------------------------------------------------------------------------- /src/llama2d/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Llama2D/llama2d/e28b97255d396c717fe183b96b802ff39ffd7e6d/src/llama2d/datasets/__init__.py -------------------------------------------------------------------------------- /src/llama2d/datasets/cached.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from pathlib import Path 3 | 4 | import torch 5 | from torch.utils.data import Dataset 6 | 7 | 8 | def save_dataset(dataset, save_dir: Path): 9 | # make the directory if it doesn't exist 10 | save_dir.mkdir(parents=True, exist_ok=True) 11 | 12 | for i in range(len(dataset)): 13 | torch.save(dataset[i], save_dir / f"{i}.pt") 14 | 15 | 16 | class CachedDataset(Dataset): 17 | def __init__(self, 
load_dir, use_2d=True, keep_fraction=1.0): 18 | self.load_dir = load_dir 19 | self.files = sorted(glob(f"{load_dir}/*.pt")) 20 | self.use_2d = use_2d 21 | self.keep_fraction = keep_fraction 22 | 23 | def __getitem__(self, i): 24 | ret = torch.load(self.files[i]) 25 | # if not self.use_2d: 26 | #     return {k: v for k, v in ret.items() if k != "coords"} 27 | return {k: v.to(torch.bfloat16) if k == "coords" else v for k, v in ret.items()} 28 | 29 | def __len__(self): 30 | return int(len(self.files) * self.keep_fraction) 31 | -------------------------------------------------------------------------------- /src/llama2d/datasets/huggingface.py: -------------------------------------------------------------------------------- 1 | import types 2 | from dataclasses import dataclass 3 | from time import time 4 | 5 | import numpy as np 6 | import torch 7 | from datasets import Dataset 8 | from torch.utils import data 9 | 10 | # 11 | from llama2d.datasets.cached import CachedDataset 12 | 13 | 14 | @dataclass 15 | class DatasetInfo: 16 | repo: str 17 | desc: str 18 | 19 | 20 | def dataset_dict_to_list(dataset_dict): 21 | """ 22 | Converts a Torch dataset stored as a dictionary to a list of dictionaries. 23 | 24 | Args: 25 | dataset_dict (dict): The input dataset dictionary with keys 'input_ids', 'coords', 'labels', and 'attention_mask'. 26 | 27 | Returns: 28 | list: A list of dictionaries where each dictionary contains values for the keys at each index. 29 | """ 30 | keys = dataset_dict.keys() 31 | num_samples = len(dataset_dict[list(keys)[0]]) 32 | # Assuming all keys have the same length 33 | dataset_list = [] 34 | for i in range(num_samples): 35 | sample_dict = dict.fromkeys(keys) 36 | for key in keys: 37 | sample_dict[key] = dataset_dict[key][i] 38 | dataset_list.append(sample_dict) 39 | return dataset_list 40 | 41 | 42 | def to(a, device):  # `device` may be a torch.device or a dtype - Tensor.to accepts either 43 | if torch.is_tensor(a): 44 | return a.to(device) 45 | elif isinstance(a, dict): 46 | return {k: to(v, device) for k, v in a.items()} 47 | elif isinstance(a, (list, tuple)): 48 | return type(a)(to(v, device) for v in a) 49 | else: 50 | return a 51 | 52 | 53 | from tqdm import tqdm 54 | 55 | 56 | def pt2hf(torch_dataset: data.Dataset, convert_type: torch.dtype = torch.float32): 57 | torch_dataset = [el for el in tqdm(torch_dataset) if el is not None] 58 | if convert_type is not None: 59 | torch_dataset = to(torch_dataset, convert_type) 60 | # import pdb; pdb.set_trace() 61 | try: 62 | dset_hf = Dataset.from_list(torch_dataset) 63 | except Exception as e: 64 | print(f"Exception while converting to hf dataset: {e}") 65 | import pdb 66 | 67 | pdb.set_trace() 68 | return dset_hf 69 | 70 | 71 | def publish_pt_dataset(ds_pt, dataset_info): 72 | try: 73 | ds = pt2hf(ds_pt)  # may require setting: convert_type=np.float32 74 | print(f"Dataset type:{ds}") 75 | ds.info.description = dataset_info.desc 76 | ds.set_format(type="torch", columns=list(ds[0].keys())) 77 | ds.push_to_hub(dataset_info.repo) 78 | print("Push succeeded.") 79 | except Exception as e: 80 | print(f"Exception while publishing: {e}") 81 | raise e 82 | 83 | 84 | import torch 85 | from datasets import load_dataset 86 | 87 | dtypes = { 88 | "coords": torch.float16, 89 | "input_ids": torch.int64, 90 | "labels": torch.int64, 91 | "attention_mask": torch.int64, 92 | } 93 | 94 | 95 | class HuggingFaceDataset(torch.utils.data.Dataset): 96 | def __init__( 97 | self, repo: str, split: str, keep_fraction: float = 1.0, use_2d: bool = True 98 | ): 99 | print("Loading dataset...") 100 | start_time = time() 101 | 
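        # Pull the dataset from the Hugging Face Hub, then keep only rows
        # whose `labels` contain at least one positive token id - rows with no
        # supervised tokens would contribute nothing to the fine-tuning loss.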
102 | hf_dataset = load_dataset(repo) 103 | 104 | print(f"Loaded dataset in {time()-start_time} seconds.") 105 | # dataset = [d for d in dataset if d is not None and sum([1 for i in d["labels"] if i>0])>0] 106 | df = hf_dataset["train"].to_pandas() 107 | df_filtered = df[df.labels.apply(lambda x: np.sum(np.array(x[::-1]) > 0) > 0)] 108 | 109 | dataset = Dataset.from_pandas(df_filtered) 110 | 111 | # split into train/val 112 | train_percent = 95 113 | train_size = int(len(dataset) * train_percent / 100) 114 | val_size = len(dataset) - train_size 115 | train_dataset, val_dataset = torch.utils.data.random_split( 116 | dataset, [train_size, val_size] 117 | ) 118 | 119 | self.dataset = train_dataset if split == "train" else val_dataset 120 | 121 | # keep only a fraction of the dataset 122 | if keep_fraction < 1.0: 123 | self.dataset = torch.utils.data.Subset( 124 | self.dataset, range(int(len(self.dataset) * keep_fraction)) 125 | ) 126 | 127 | self.use_2d = use_2d 128 | 129 | def __getitem__(self, index): 130 | hf_dict = self.dataset[index] 131 | 132 | # convert to torch tensors 133 | ret = {k: torch.tensor(v, dtype=dtypes[k]) for k, v in hf_dict.items()} 134 | 135 | # if not self.use_2d: 136 | #     del ret["coords"] 137 | 138 | return ret 139 | 140 | def __len__(self): 141 | return len(self.dataset) 142 | 143 | 144 | if __name__ == "__main__": 145 | import argparse 146 | 147 | from ..constants import PRETRAINING_CACHE_DIR 148 | 149 | parser = argparse.ArgumentParser(description="Publish a cached Llama2D dataset to the Hugging Face Hub") 150 | # cache directory to load the dataset from 151 | parser.add_argument( 152 | "-C", 153 | "--cache_dir", 154 | type=str, 155 | default=PRETRAINING_CACHE_DIR, 156 | help="Cache directory", 157 | ) 158 | # Hugging Face repo to push the dataset to 159 | parser.add_argument( 160 | "-R", 161 | "--repo", 162 | default="supermomo668/Llama2D-Pretrain", 163 | type=str, 164 | help="Name of Repo", 165 | ) 166 | # description shown on the dataset's Hub page 167 | parser.add_argument( 168 | "-D", 169 | "--desc", 170 | default="Llama2D is a project from AGI UI/UX Hackathon. 
Check our main Git Repo at : https://github.com/Llama2D/llama2d/tree/main", 171 | type=str, 172 | help="Dataset description", 173 | ) 174 | 175 | args = parser.parse_args() 176 | ds_pt = CachedDataset(args.cache_dir) 177 | publish_pt_dataset(ds_pt, args) 178 | -------------------------------------------------------------------------------- /src/llama2d/datasets/mhtml_to_hhtml.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from llama2d.constants import MIND2WEB_HHTML_DIR, MIND2WEB_MHTML_DIR 4 | 5 | mhtml_files = [f for f in MIND2WEB_MHTML_DIR.iterdir() if f.suffix == ".mhtml"] 6 | 7 | for mhtml_filename in tqdm(mhtml_files): 8 | # print(mhtml_filename) 9 | mhtml_path = MIND2WEB_MHTML_DIR / mhtml_filename.name 10 | # use .name here: joining with the absolute paths yielded by iterdir() 11 | # would silently discard MIND2WEB_HHTML_DIR 12 | html_path = MIND2WEB_HHTML_DIR / mhtml_filename.with_suffix(".html").name 13 | 14 | if html_path.exists(): 15 | html_path.unlink() 16 | 17 | mhtml_content = open(mhtml_path, "r").read() 18 | hhtml_content = mhtml_content.replace(":hover", ".hvvvr") 19 | 20 | with open(html_path, "w") as f: 21 | f.write(hhtml_content) 22 | 23 | print("Done!") 24 | -------------------------------------------------------------------------------- /src/llama2d/datasets/mind2web.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from glob import glob 4 | from random import random 5 | from time import sleep 6 | from typing import Dict 7 | 8 | import torch 9 | from datasets import load_dataset 10 | from playwright.sync_api import sync_playwright 11 | from torch.utils.data import Dataset 12 | 13 | from llama2d.constants import MIND2WEB_MHTML_DIR, SCREEN_RESOLUTION 14 | from llama2d.datasets.huggingface import DatasetInfo, publish_pt_dataset 15 | from llama2d.tagging.add_tags_to_page import add_tags_to_webpage 16 | from llama2d.vision.take_screenshot import take_screenshot 17 | from llama2d.vision.url_to_llama_input import Llama2dWebsiteFeatureExtractor 18 | from llama2d.vision.viz_pt_input import debug_dataset 19 | 20 | should_debug = False 21 | 22 | 23 | class Mind2webDataset(Dataset): 24 | def __init__( 25 | self, model="decapoda-research/llama-7b-hf", playwright=None, headless=False, show_errors=False 26 | ): 27 | assert playwright is not None, "Please pass in playwright" 28 | self.__extractor = Llama2dWebsiteFeatureExtractor(mask_out_body=True) 29 | 30 | self.uid_to_mhtml = self.get_uid_to_mhtml_map() 31 | 32 | dataset = load_dataset("osunlp/Mind2Web") 33 | self.dataset = dataset["train"] 34 | 35 | self.actions = [ 36 | (i, j) 37 | for i in range(len(self.dataset)) 38 | for j in range(len(self.dataset[i]["actions"])) 39 | ] 40 | 41 | self.browser = playwright.chromium.launch( 42 | headless=headless, args=["--disable-web-security"] 43 | ) 44 | self.page = self.browser.new_page() 45 | 46 | width, height = SCREEN_RESOLUTION 47 | self.page.set_viewport_size({"width": width, "height": height}) 48 | 49 | self.page.set_extra_http_headers( 50 | { 51 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " 52 | "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 " 53 | "Safari/537.36" 54 | } 55 | ) 56 | self.page.set_default_navigation_timeout(1000 * 10) 57 | self.show_errors = show_errors 58 | 59 | def __len__(self): 60 | return len(self.actions) 61 | 62 | def __getitem__(self, index): 63 | screenshot_path = None 64 | try: 65 | task_idx, action_idx = self.actions[index] 66 | task = self.dataset[task_idx] 67 | action = task["actions"][action_idx] 68 | 69 | pos_candidates = 
action["pos_candidates"] 70 | if len(pos_candidates) == 0: 71 | raise Exception("No positive candidates in dataset!") 72 | 73 | uid = action["action_uid"] 74 | mhtml_file = self.uid_to_mhtml[uid] 75 | 76 | mhtml_file_name = mhtml_file.split("/")[-1] 77 | mhtml_file = "http://localhost:5002/" + mhtml_file_name 78 | self.page.goto(mhtml_file) 79 | sleep(1) 80 | 81 | gt_tag, tags_and_boxes = add_tags_to_webpage(self.page, action) 82 | 83 | rand_num = random() 84 | screenshot_path = f"screenshot_{rand_num}.png" 85 | take_screenshot(self.page, None, screenshot_path) 86 | 87 | self.page.evaluate("window.demo()") 88 | take_screenshot(self.page, None, "screenshot.png") 89 | 90 | intention = task["confirmed_task"] 91 | 92 | actions_str = "\n".join(task["action_reprs"]) 93 | prompt = f""" 94 | You are a bot using a website. Your goal is: "{intention}" 95 | {"So far, you have done the following actions: " 96 | +actions_str if len(actions_str) > 0 else ""} 97 | The website looks like so:""" 98 | 99 | operation = action["operation"] 100 | op = operation["op"] 101 | value = operation["value"] 102 | 103 | completion = None 104 | if op == "CLICK": 105 | completion = f"CLICK [{gt_tag}]" 106 | elif op == "TYPE": 107 | completion = f"TYPE [{gt_tag}] {json.dumps(value)}" 108 | elif op == "SELECT": 109 | completion = f"SELECT [{gt_tag}]" 110 | else: 111 | raise NotImplementedError(f"Don't understand operation {op}") 112 | 113 | ret = self.__extractor.process( 114 | prompt, screenshot_path, completion, tags_and_boxes=tags_and_boxes 115 | ) 116 | 117 | # delete the screenshot 118 | os.remove(screenshot_path) 119 | 120 | return ret 121 | except Exception as e: 122 | # raise e 123 | if self.show_errors: 124 | print("Error in dataset:", str(e)[:100] + "...") 125 | 126 | if "ImageAnnotation" in str(e): 127 | raise e 128 | 129 | if screenshot_path is not None: 130 | if os.path.exists(screenshot_path): 131 | os.remove(screenshot_path) 132 | return None 133 | 134 | def get_uid_to_mhtml_map(self) -> Dict[str, str]: 135 | all_mhtmls = glob(f"{MIND2WEB_MHTML_DIR}/*_before.mhtml") 136 | print("mhtml count:", len(all_mhtmls)) 137 | 138 | # extract the uid from *_before.mhtml 139 | def get_uid(path): 140 | return path.split("/")[-1].split("_")[0] 141 | 142 | return {get_uid(path): path for path in all_mhtmls} 143 | 144 | 145 | mind2web_repo = "llama2d/llama2d-mind2web" 146 | 147 | if __name__ == "__main__": 148 | ds_info = DatasetInfo( 149 | repo=mind2web_repo, 150 | desc="Llama2d Mind2Web dataset - SFT dataset for" 151 | " tag interaction on diverse websites", 152 | ) 153 | 154 | with sync_playwright() as playwright: 155 | dataset = Mind2webDataset(playwright=playwright, headless=True) 156 | 157 | # debug_dataset(dataset) 158 | 159 | # publish a subset 160 | num_samples = 2_000 161 | 162 | if num_samples is not None: 163 | dataset, _ = torch.utils.data.random_split( 164 | dataset, [num_samples, len(dataset) - num_samples] 165 | ) 166 | 167 | publish_pt_dataset(dataset, ds_info) 168 | -------------------------------------------------------------------------------- /src/llama2d/datasets/mind2web_convert.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import time 4 | from glob import glob 5 | 6 | from ..constants import MIND2WEB_CACHE_DIR, MIND2WEB_OUT_DIR 7 | 8 | files = glob(f"{MIND2WEB_OUT_DIR}/*/input.pt") 9 | 10 | # copy /input.pd to MIND2WEB_CACHE_DIR/.pt 11 | # but only for input.pt files that are less than 1 day old 12 | 13 | for f in files: 14 | 
# get date modified 15 | date_modified = os.path.getmtime(f) 16 | # get current time 17 | current_time = time.time() 18 | # get difference 19 | diff = current_time - date_modified 20 | # if less than 15 hours old 21 | if diff < 60 * 60 * 15: 22 | # get uid 23 | uid = f.split("/")[-2] 24 | # copy file 25 | shutil.copy(f, f"{MIND2WEB_CACHE_DIR}/{uid}.pt") 26 | print(f"Copied {f} to {MIND2WEB_CACHE_DIR}/{uid}.pt") 27 | else: 28 | print(f"Skipping {f} because it is {diff//(60*60)} hrs old") 29 | -------------------------------------------------------------------------------- /src/llama2d/datasets/pretraining.py: -------------------------------------------------------------------------------- 1 | from playwright.sync_api import sync_playwright 2 | from torch.utils.data import Dataset 3 | 4 | from llama2d.datasets.huggingface import DatasetInfo, publish_pt_dataset 5 | from llama2d.vision.url_to_llama_input import Llama2dWebsiteFeatureExtractor 6 | from src.data.pretraining_urls import urls 7 | 8 | 9 | class Llama2dPretrainingDataset(Dataset): 10 | def __init__( 11 | self, model="decapoda-research/llama-7b-hf", urls=[], include_coords=True 12 | ): 13 | self.__extractor = Llama2dWebsiteFeatureExtractor(model, mask_out_body=False) 14 | self.__urls = urls 15 | 16 | self.__include_coords = include_coords 17 | 18 | with sync_playwright() as p: 19 | # Using the Chromium browser but you can also use 'firefox' or 'webkit' 20 | browser = p.chromium.launch() 21 | page = browser.new_page() 22 | 23 | page.set_extra_http_headers( 24 | { 25 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " 26 | "AppleWebKit/537.36 (KHTML, like Gecko)" 27 | " Chrome/116.0.0.0 Safari/537.36" 28 | } 29 | ) 30 | # exceptional() is a function calling helper that returns 31 | # None if the method errors. 32 | # we call all the functions 33 | self.extractions = [ 34 | exceptional(self.__extractor.create_inference_data, args=(page, "", i)) 35 | for i in self.__urls 36 | ] 37 | # or otherwise return None 38 | self.extractions = [i for i in self.extractions if i] 39 | 40 | def __getitem__(self, index): 41 | ret = self.extractions[index] 42 | if not self.__include_coords: 43 | return {k: v for k, v in ret.items() if k != "coords"} 44 | return ret 45 | 46 | def __len__(self): 47 | return len(self.extractions) 48 | 49 | 50 | def exceptional(call, args): 51 | """Wrapper function to return None for a function if it errors. 52 | 53 | Parameters 54 | ---------- 55 | call : callable 56 | The function to call 57 | args : List[Any] 58 | The arguments to call it with 59 | 60 | Returns 61 | ------- 62 | Any 63 | The output of the function. 64 | """ 65 | 66 | try: 67 | return call(*args) 68 | except Exception as e: 69 | print("your call to", call, "errored! 
Returning None") 70 | print(e) 71 | 72 | return None 73 | 74 | 75 | pretraining_repo = "llama2d/llama2d-pretraining" 76 | 77 | if __name__ == "__main__": 78 | print("Downloading pretraining dataset with Playwright...") 79 | 80 | ds_info = DatasetInfo( 81 | repo=pretraining_repo, 82 | desc="Llama2d pretraining dataset - next-token prediction " 83 | "on real estate websites", 84 | ) 85 | 86 | dataset = Llama2dPretrainingDataset( 87 | model="decapoda-research/llama-7b-hf", urls=urls, include_coords=True 88 | ) 89 | 90 | publish_pt_dataset(dataset, ds_info) 91 | -------------------------------------------------------------------------------- /src/llama2d/datasets/synthetic/top_or_bottom.py: -------------------------------------------------------------------------------- 1 | from math import inf 2 | from random import choice, random 3 | 4 | from torch.utils.data import Dataset 5 | 6 | from llama2d.datasets.huggingface import DatasetInfo, publish_pt_dataset 7 | from llama2d.vision import Llama2dScreen, Llama2dTokenizer, debug_dataset 8 | 9 | directions = { 10 | "t": (0.5, 0), # in -y direction 11 | "b": (0.5, 1), # in +y direction 12 | } 13 | 14 | rand_words = "bob,jane,alice,carol,ted,lisa,barry,frank,george,harold,henry,ian,john,james,kevin,mark,neil,oliver,peter,quinn,robert,steve,thomas,william".split( 15 | "," 16 | ) 17 | 18 | 19 | class TopBottomDataset(Dataset): 20 | def __init__(self, num_screens: int, tokenizer: Llama2dTokenizer = None): 21 | self.num_screens = num_screens 22 | 23 | if tokenizer is None: 24 | tokenizer = Llama2dTokenizer() 25 | self.tokenizer = tokenizer 26 | 27 | self.screens = [] 28 | for i in range(num_screens): 29 | screen = Llama2dScreen() 30 | direction, vector = choice(list(directions.items())) 31 | 32 | screen.push_word(word=choice(rand_words), xy=vector) 33 | 34 | prompt = f"Top or bottom? (t/b)" 35 | output = direction 36 | 37 | self.screens.append(self.tokenizer.process(prompt, screen, output)) 38 | 39 | def __len__(self): 40 | return self.num_screens 41 | 42 | def __getitem__(self, i: int): 43 | return self.screens[i] 44 | 45 | 46 | if __name__ == "__main__": 47 | dataset = TopBottomDataset(num_screens=500) 48 | 49 | debug_dataset(dataset) 50 | 51 | info = DatasetInfo( 52 | repo="llama2d/llama2d-top-or-bottom", desc="Identify if a person is up or down." 
53 |     )
54 |     publish_pt_dataset(dataset, info)
55 | 
--------------------------------------------------------------------------------
/src/llama2d/datasets/synthetic/unscramble_words.py:
--------------------------------------------------------------------------------
1 | from random import choice, shuffle
2 | 
3 | from torch.utils.data import Dataset
4 | 
5 | from llama2d.datasets.huggingface import DatasetInfo, publish_pt_dataset
6 | from llama2d.vision import Llama2dScreen, Llama2dTokenizer, debug_dataset
7 | 
8 | rand_words = "bob,jane,alice,carol,ted,lisa,barry,frank,george,harold,henry,ian,john,james,kevin,mark,neil,oliver,peter,quinn,robert,steve,thomas,william".split(
9 |     ","
10 | )
11 | 
12 | 
13 | class UnscrambleDataset(Dataset):
14 |     def __init__(
15 |         self,
16 |         num_screens: int,
17 |         words_per_screen: int,
18 |         words_per_line: int = 20,
19 |         lines_per_screen: int = 5,
20 |         tokenizer: Llama2dTokenizer = None,
21 |     ):
22 |         self.num_screens = num_screens
23 |         self.words_per_screen = words_per_screen
24 | 
25 |         if tokenizer is None:
26 |             tokenizer = Llama2dTokenizer()
27 |         self.tokenizer = tokenizer
28 | 
29 |         self.screens = []
30 |         for i in range(num_screens):
31 |             screen = Llama2dScreen()
32 | 
33 |             words = [choice(rand_words) for _ in range(words_per_screen)]
34 | 
35 |             # render in a grid of lines
36 |             for k, word in enumerate(words):
37 |                 i, j = k % words_per_line, k // words_per_line
38 |                 # convert i,j to x,y, where x is horizontal and y is vertical
39 |                 # x is in [0,1] and y is in [0,1]
40 | 
41 |                 x = (i + 0.5) / words_per_line
42 |                 y = (j + 0.5) / lines_per_screen
43 | 
44 |                 assert y < 1, "Too many words for the screen"
45 | 
46 |                 screen.push_word(word=word, xy=(x, y))
47 | 
48 |             shuffle(screen.words)
49 | 
50 |             prompt = "Read out the words in the order they appear."
51 |             response = " ".join(words)
52 | 
53 |             self.screens.append(self.tokenizer.process(prompt, screen, response))
54 | 
55 |     def __len__(self):
56 |         return self.num_screens
57 | 
58 |     def __getitem__(self, i: int):
59 |         return self.screens[i]
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     dataset = UnscrambleDataset(
64 |         num_screens=5000,
65 |         words_per_screen=50,
66 |         words_per_line=15,
67 |         lines_per_screen=5,
68 |     )
69 | 
70 |     debug_dataset(dataset)
71 | 
72 |     info = DatasetInfo(
73 |         repo="llama2d/llama2d-unscramble",
74 |         desc="Unscramble the words displayed on the screen.",
75 |     )
76 |     publish_pt_dataset(dataset, info)
77 | 
--------------------------------------------------------------------------------
/src/llama2d/datasets/synthetic/zoo_compass.py:
--------------------------------------------------------------------------------
1 | from math import inf
2 | from random import choice, random
3 | 
4 | from torch.utils.data import Dataset
5 | 
6 | from llama2d.datasets.huggingface import DatasetInfo, publish_pt_dataset
7 | from llama2d.vision import Llama2dScreen, Llama2dTokenizer, debug_dataset
8 | 
9 | animals = "frog,cat,bear,big lion,eagle,elephant,tiger,baboon,archerfish,gorilla,gerbil,ant colony".split(
10 |     ","
11 | )
12 | directions = {
13 |     "northernmost": (0, -1),  # in -y direction
14 |     "farthest west": (-1, 0),  # in -x direction
15 |     "southernmost": (0, 1),  # in +y direction
16 |     "farthest east": (1, 0),  # in +x direction
17 | }
18 | 
19 | 
20 | class Llama2dZooCompassDataset(Dataset):
21 |     def __init__(
22 |         self,
23 |         num_screens: int,
24 |         words_per_screen: int,
25 |         tokenizer: Llama2dTokenizer = None,
26 |     ):
27 |         self.num_screens = num_screens
28 | 
29 |         if tokenizer is None:
30 |             tokenizer = Llama2dTokenizer()
31 |         self.tokenizer = tokenizer
32 | 
33 |         self.screens = []
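        # Each screen below is built by sampling one compass direction,
        # scattering random animals at random (x, y) coords, and labeling the
        # screen with the animal whose position has the largest dot product
        # with the direction vector, i.e. the animal farthest that way.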
34 |         for i in range(num_screens):
35 |             screen = Llama2dScreen()
36 |             direction, vector = choice(list(directions.items()))
37 | 
38 |             farthest_animal = None
39 |             farthest_distance = -inf
40 |             for j in range(words_per_screen):
41 |                 animal = choice(animals)
42 |                 coords = (random(), random())
43 |                 screen.push_word(word=animal, xy=coords)
44 | 
45 |                 distance = coords[0] * vector[0] + coords[1] * vector[1]
46 |                 if distance > farthest_distance:
47 |                     farthest_animal = animal
48 |                     farthest_distance = distance
49 | 
50 |             assert farthest_animal is not None, "No animal is farthest"
51 | 
52 |             prompt = (
53 |                 f"Here is a map of the zoo. Find the {direction} animal in the zoo."
54 |             )
55 |             output = farthest_animal
56 | 
57 |             self.screens.append(self.tokenizer.process(prompt, screen, output))
58 | 
59 |     def __len__(self):
60 |         return self.num_screens
61 | 
62 |     def __getitem__(self, i: int):
63 |         return self.screens[i]
64 | 
65 | 
66 | if __name__ == "__main__":
67 |     tokenizer = Llama2dTokenizer()
68 |     dataset = Llama2dZooCompassDataset(
69 |         tokenizer=tokenizer, num_screens=10_000, words_per_screen=20
70 |     )
71 | 
72 |     debug_dataset(dataset)
73 | 
74 |     info = DatasetInfo(
75 |         repo="llama2d/llama2d-zoo-compass",
76 |         desc="Identify the animal farthest north/west/east/south in the zoo.",
77 |     )
78 |     publish_pt_dataset(dataset, info)
79 | 
--------------------------------------------------------------------------------
/src/llama2d/find_pos_given_attr/download_mind2web.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 | 
3 | from datasets import load_dataset
4 | 
5 | # Load the Mind2Web dataset
6 | dataset = load_dataset("osunlp/Mind2Web")
7 | 
8 | # Print the first sample for verification
9 | 
10 | 
11 | example = dataset["train"][0]
12 | 
13 | pprint(example)
14 | # breakpoint()
15 | 
16 | actions = example["actions"]
17 | 
18 | print(actions[0].keys())
19 | 
20 | print(example["action_reprs"])
21 | 
--------------------------------------------------------------------------------
/src/llama2d/find_pos_given_attr/find_pos_given_attr.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | import requests
4 | from bs4 import BeautifulSoup
5 | from datasets import load_dataset
6 | 
7 | # Load the Mind2Web dataset
8 | dataset = load_dataset("osunlp/Mind2Web")
9 | example = dataset["train"][0]
10 | 
11 | 
12 | attrs = example["actions"][0]["pos_candidates"][0]["attributes"]
13 | 
14 | 
15 | # URL of the webpage you want to scrape
16 | url = "http://example.com"
17 | 
18 | print(example["domain"])
19 | print(example["subdomain"])
20 | 
21 | # We might be able to assume we can append .com to the website name
22 | print(example["website"])
23 | print(len(dataset["train"]))
24 | 
25 | print("Attempting to find all tags that contain the attrs:")
26 | print(type(attrs))
27 | print(attrs)
28 | 
29 | attributes = json.loads(attrs)
30 | 
31 | 
32 | # Send a GET request to the webpage
33 | response = requests.get(url)
34 | soup = BeautifulSoup(response.content, "html.parser")
35 | 
36 | # Find all tags that match the attributes
37 | matching_tags = soup.find_all(attrs=attributes)
38 | 
39 | # Check if there are matching tags
40 | if matching_tags:
41 |     print(f"Found {len(matching_tags)} matching tag(s)!")
42 |     for tag in matching_tags:
43 |         print(tag)
44 | else:
45 |     print("No matching tags found!")
46 | 
--------------------------------------------------------------------------------
/src/llama2d/modal/__init__.py:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Llama2D/llama2d/e28b97255d396c717fe183b96b802ff39ffd7e6d/src/llama2d/modal/__init__.py -------------------------------------------------------------------------------- /src/llama2d/modal/common.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from modal import Image, Secret, Stub, Volume 3 | 4 | N_GPUS = 2 5 | GPU_MEM = 80 6 | BASE_MODELS = { 7 | "base7": "meta-llama/Llama-2-7b-hf", 8 | "chat7": "meta-llama/Llama-2-7b-chat-hf", 9 | "chat13": "meta-llama/Llama-2-13b-chat-hf", 10 | "code7": "codellama/CodeLlama-7b-hf", 11 | "code34": "codellama/CodeLlama-34b-hf", 12 | "instruct7": "codellama/CodeLlama-7b-Instruct-hf", 13 | "instruct13": "codellama/CodeLlama-13b-Instruct-hf", 14 | "instruct34": "codellama/CodeLlama-34b-Instruct-hf", 15 | # Training 70B requires experimental flag fsdp_peft_cpu_offload_for_save. 16 | "chat70": "meta-llama/Llama-2-70b-chat-hf", 17 | } 18 | 19 | import os 20 | import random 21 | 22 | own_dir = os.path.dirname(os.path.realpath(__file__)) 23 | root_dir = f"{own_dir}/../../.." 24 | 25 | secrets_dir = f"{root_dir}/secrets/" 26 | data_dir = f"{root_dir}/data/" 27 | dataset_dir = f"{own_dir}/datasets/" 28 | 29 | transformers_dir = f"{root_dir}/transformers" 30 | llama_recipes_dir = f"{root_dir}/llama-recipes" 31 | 32 | if os.path.exists(transformers_dir) and os.path.exists(llama_recipes_dir): 33 | import os 34 | 35 | transformers_commit = ( 36 | os.popen(f"cd {transformers_dir} && git rev-parse HEAD").read().strip() 37 | ) 38 | llama_recipes_commit = ( 39 | os.popen(f"cd {llama_recipes_dir} && git rev-parse HEAD").read().strip() 40 | ) 41 | 42 | assert transformers_commit != "", "Could not get transformers commit." 43 | assert llama_recipes_commit != "", "Could not get llama-recipes commit." 44 | else: 45 | transformers_commit = "overwriting-llama" 46 | llama_recipes_commit = "andrew-dev" 47 | 48 | print( 49 | f"Transformers commit: {transformers_commit}, llama-recipes commit: {llama_recipes_commit}" 50 | ) 51 | 52 | import random 53 | 54 | image = ( 55 | Image.micromamba() 56 | .micromamba_install( 57 | "cudatoolkit=11.8", 58 | "cudnn=8.1.0", 59 | "cuda-nvcc", 60 | channels=["conda-forge", "nvidia"], 61 | ) 62 | .apt_install("git", "unzip") 63 | .pip_install( 64 | "huggingface_hub==0.17.1", 65 | "hf-transfer==0.1.3", 66 | "scipy", 67 | "gdown", 68 | "google-cloud-vision", 69 | "sentencepiece", 70 | "playwright", 71 | "wandb", 72 | "transformers", 73 | "matplotlib", 74 | ) 75 | .pip_install( 76 | f"llama-recipes @ git+https://github.com/modal-labs/llama-recipes.git", 77 | extra_index_url="https://download.pytorch.org/whl/nightly/cu118", 78 | pre=True, 79 | ) 80 | .run_commands( 81 | f"pip install 'llama-recipes @ git+https://github.com/llama2d/llama-recipes.git@{llama_recipes_commit}' git+https://github.com/llama2d/transformers.git@{transformers_commit} --no-deps" 82 | ) 83 | .env(dict(HUGGINGFACE_HUB_CACHE="/pretrained", HF_HUB_ENABLE_HF_TRANSFER="1")) 84 | .copy_local_dir(secrets_dir, "/root/secrets") 85 | .copy_local_file( 86 | f"{os.path.dirname(os.path.realpath(__file__))}/finetuning.py", 87 | "/root/finetuning.py", 88 | ) 89 | ) 90 | 91 | stub = Stub( 92 | "llama-finetuning", 93 | image=image, 94 | secrets=[Secret.from_name("huggingface"), Secret.from_name("wandb")], 95 | ) 96 | 97 | stub.hf_cache_volume = Volume.persisted("hf-cache") 98 | 99 | # Download pre-trained models into this volume. 
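# All three persisted volumes are mounted into the training containers via
# VOLUME_CONFIG below: base weights at /pretrained, checkpoints and adapters
# at /results, and the Hugging Face cache at /hf_cache.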
100 | stub.pretrained_volume = Volume.persisted("example-pretrained-vol")
101 | 
102 | # Save trained models into this volume.
103 | stub.results_volume = Volume.persisted("example-results-vol")
104 | 
105 | VOLUME_CONFIG = {
106 |     "/pretrained": stub.pretrained_volume,
107 |     "/results": stub.results_volume,
108 |     "/hf_cache": stub.hf_cache_volume,
109 | }
110 | 
--------------------------------------------------------------------------------
/src/llama2d/modal/datasets/cached_dataset.py:
--------------------------------------------------------------------------------
1 | import gdown
2 | import torch
3 | 
4 | from llama2d.datasets.cached import CachedDataset
5 | 
6 | 
7 | def get_custom_dataset(dataset_config, tokenizer, split):
8 |     dataset_folder = dataset_config.dataset_folder
9 |     print(f"Using dataset folder {dataset_folder}")
10 | 
11 |     use_2d = dataset_config.use_2d
12 | 
13 |     gdown.download(id="1bgbnuVQjhRku60gCLrFfqfM66bp0Z4sI")
14 |     gdown.download(id="1LBT_gMNntS0mj-S8oTEWQE8pcJOIAXLA")
15 |     # unzip the dataset
16 |     import os
17 | 
18 |     os.system("unzip -qo cached-pretrain.zip")
19 |     os.system("unzip -qo mind2web-cache.zip")
20 | 
21 |     train_percent = 80
22 | 
23 |     full_dataset = CachedDataset(
24 |         dataset_folder, use_2d=use_2d, keep_fraction=dataset_config.keep_fraction
25 |     )
26 | 
27 |     train_size = int(len(full_dataset) * train_percent / 100)
28 |     val_size = len(full_dataset) - train_size
29 | 
30 |     train_dataset, val_dataset = torch.utils.data.random_split(
31 |         full_dataset, [train_size, val_size]
32 |     )
33 | 
34 |     return train_dataset if split == "train" else val_dataset
35 | 
--------------------------------------------------------------------------------
/src/llama2d/modal/datasets/hf_dataset.py:
--------------------------------------------------------------------------------
1 | from llama2d.datasets.huggingface import HuggingFaceDataset
2 | 
3 | 
4 | def get_custom_dataset(dataset_config, tokenizer, split):
5 |     repo = dataset_config.repo
6 |     use_2d = dataset_config.use_2d
7 |     print("get_custom_dataset, use_2d:", use_2d)
8 |     return HuggingFaceDataset(
9 |         repo, split, keep_fraction=dataset_config.keep_fraction, use_2d=use_2d
10 |     )
11 | 
--------------------------------------------------------------------------------
/src/llama2d/modal/datasets/new_dataset.py:
--------------------------------------------------------------------------------
1 | from llama2d.datasets.pretraining import Llama2dPretrainingDataset
2 | 
3 | 
4 | def format_text(row, tokenizer):
5 |     return tokenizer(row)
6 | 
7 | 
8 | def get_custom_dataset():
9 |     urls = [
10 |         "https://github.com/OSU-NLP-Group/Mind2Web",
11 |         "https://stackoverflow.com/questions/60352003/how-to-download-webpage-as-mhtml",
12 |     ]
13 |     dataset = Llama2dPretrainingDataset(
14 |         model="decapoda-research/llama-7b-hf", urls=urls
15 |     )
16 | 
17 |     return dataset
18 | 
--------------------------------------------------------------------------------
/src/llama2d/modal/datasets/sql_dataset.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | from llama_recipes.datasets.utils import Concatenator
3 | 
4 | B_INST, E_INST = "[INST] ", " [/INST]"
5 | B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
6 | 
7 | 
8 | def format_text(row, tokenizer):
9 |     text = (
10 |         B_INST
11 |         + B_SYS
12 |         + "You are an advanced SQL assistant that uses this SQL table schema "
13 |         "to generate"
14 |         " a SQL query which answers the user question.\n"
15 |         + row["context"]
16 |         + E_SYS
17 |         + row["question"]
18 |         + E_INST
19 |         + "\n[SQL]\n"
20 |         + row["answer"]
row["answer"] 21 | + "\n[/SQL]" 22 | + "" 23 | ) 24 | 25 | return tokenizer(text) 26 | 27 | 28 | def get_custom_dataset(dataset_config, tokenizer, split): 29 | full_dataset = datasets.load_dataset("b-mc2/sql-create-context", split="train") 30 | 31 | # Since the dataset has no train/test split, we create one and select it 32 | dataset = full_dataset.train_test_split( 33 | train_size=10000, 34 | test_size=200, 35 | seed=42, 36 | )["train" if split == dataset_config.train_split else "test"] 37 | 38 | dataset = dataset.map( 39 | lambda x: format_text(x, tokenizer), remove_columns=list(dataset.features) 40 | ) 41 | 42 | dataset = dataset.map(Concatenator(), batched=True, batch_size=None) 43 | 44 | return dataset 45 | -------------------------------------------------------------------------------- /src/llama2d/modal/datasets/zoo_dataset.py: -------------------------------------------------------------------------------- 1 | from llama2d.datasets.synthetic.zoo_compass import Llama2dZooCompassDataset 2 | 3 | dataset_registry = {} 4 | 5 | 6 | def get_custom_dataset(dataset_config, tokenizer, split): 7 | keep_fraction = dataset_config.keep_fraction 8 | train_size = int(5000 * keep_fraction) 9 | val_size = int(200 * keep_fraction) # make val_size very small - we're short on GPU time 10 | return Llama2dZooCompassDataset( 11 | num_screens=train_size if split == "train" else val_size, 12 | words_per_screen=20, 13 | ) 14 | -------------------------------------------------------------------------------- /src/llama2d/modal/finetuning.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the 3 | # terms of the Llama 2 Community License Agreement. 
4 | 
5 | import os
6 | 
7 | import fire
8 | import torch
9 | import torch.distributed as dist
10 | import torch.optim as optim
11 | from llama_recipes.configs import fsdp_config, train_config
12 | from llama_recipes.policies import AnyPrecisionAdamW, apply_fsdp_checkpointing
13 | from llama_recipes.utils import fsdp_auto_wrap_policy
14 | from llama_recipes.utils.config_utils import (
15 |     generate_dataset_config,
16 |     generate_peft_config,
17 |     update_config,
18 | )
19 | from llama_recipes.utils.dataset_utils import get_preprocessed_dataset
20 | from llama_recipes.utils.train_utils import (
21 |     clear_gpu_cache,
22 |     freeze_transformer_layers,
23 |     get_policies,
24 |     print_model_size,
25 |     setup,
26 |     setup_environ_flags,
27 |     train,
28 | )
29 | from peft import get_peft_model, prepare_model_for_int8_training
30 | from pkg_resources import packaging
31 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
32 | from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
33 | from torch.optim.lr_scheduler import StepLR
34 | from torch.utils.data import DistributedSampler
35 | 
36 | from transformers import AutoTokenizer, default_data_collator
37 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer
38 | from transformers.models.llama.sam_embed import PositionEmbeddingRandom
39 | 
40 | # dataclass serialization
41 | import dataclasses, json
42 | 
43 | class EnhancedJSONEncoder(json.JSONEncoder):
44 |     def default(self, o):
45 |         if dataclasses.is_dataclass(o):
46 |             return dataclasses.asdict(o)
47 |         return super().default(o)
48 | def json_dumps(obj, *args, **kwargs):
49 |     return json.dumps(obj, *args, cls=EnhancedJSONEncoder, **kwargs)
50 | 
51 | def main(Llama, LlamaCfg, **kwargs):
52 |     # Update the configuration for the training and sharding process
53 |     update_config((train_config, fsdp_config), **kwargs)
54 | 
55 |     print(f"Full config: {train_config=},{kwargs=}")
56 |     dataset_config = generate_dataset_config(train_config, kwargs)
57 |     print(f"Dataset config: {dataset_config=}")
58 | 
59 |     use_2d = train_config.use_2d
60 |     # Set the seeds for reproducibility
61 |     torch.cuda.manual_seed(train_config.seed)
62 |     torch.manual_seed(train_config.seed)
63 |     import random
64 |     random.seed(train_config.seed)
65 |     import numpy as np
66 |     np.random.seed(train_config.seed)
67 | 
68 |     if train_config.enable_fsdp:
69 |         setup()
70 |         # torchrun specific
71 |         local_rank = int(os.environ["LOCAL_RANK"])
72 |         rank = int(os.environ["RANK"])
73 |         # world_size = int(os.environ["WORLD_SIZE"])
74 | 
75 |     if torch.distributed.is_initialized():
76 |         torch.cuda.set_device(local_rank)
77 |         clear_gpu_cache(local_rank)
78 |         setup_environ_flags(rank)
79 | 
80 |     # Load the tokenizer and add special tokens
81 |     tokenizer = AutoTokenizer.from_pretrained(train_config.model_name)
82 |     tokenizer.add_special_tokens(
83 |         {
84 |             "pad_token": "<PAD>",
85 |         }
86 |     )
87 | 
88 |     # Load and preprocess the dataset for training and validation
89 |     dataset_train = get_preprocessed_dataset(
90 |         tokenizer,
91 |         dataset_config,
92 |         split="train",
93 |     )
94 | 
95 |     if not train_config.enable_fsdp or rank == 0:
96 |         print(f"--> Training Set Length = {len(dataset_train)}")
97 | 
98 |     dataset_val = get_preprocessed_dataset(
99 |         tokenizer,
100 |         dataset_config,
101 |         split="test",
102 |     )
103 |     if not train_config.enable_fsdp or rank == 0:
104 |         print(f"--> Validation Set Length = {len(dataset_val)}")
105 | 
106 |     kwargs = {
107 |         "use_2d": use_2d,
108 |         "lbd_start_value": train_config.lbd_start_value,
109 | 
"use_point_embed": train_config.use_point_embed, 110 | "separate_point_embed": train_config.separate_point_embed, 111 | } 112 | 113 | # Load the pre-trained model and setup its configuration 114 | use_cache = False if train_config.enable_fsdp else None 115 | if train_config.enable_fsdp and train_config.low_cpu_fsdp: 116 | """ 117 | for FSDP, we can save cpu memory by loading pretrained model on rank0 only. 118 | this avoids cpu oom when loading large models like llama 70B, in which case 119 | model alone would consume 2+TB cpu mem (70 * 4 * 8). This will add some comms 120 | overhead and currently requires latest nightly. 121 | """ 122 | v = packaging.version.parse(torch.__version__) 123 | verify_latest_nightly = v.is_devrelease and v.dev >= 20230701 124 | if not verify_latest_nightly: 125 | raise Exception( 126 | "latest pytorch nightly build is required to " 127 | "run with low_cpu_fsdp config, " 128 | "please install latest nightly." 129 | ) 130 | if rank == 0: 131 | model = Llama.from_pretrained( 132 | train_config.model_name, 133 | load_in_8bit=True if train_config.quantization else None, 134 | device_map="auto" if train_config.quantization else None, 135 | use_cache=use_cache, 136 | **kwargs, 137 | ) 138 | else: 139 | llama_config = LlamaCfg.from_pretrained(train_config.model_name) 140 | llama_config.use_cache = use_cache 141 | 142 | llama_config.use_2d = use_2d 143 | llama_config.lbd_start_value = train_config.lbd_start_value 144 | llama_config.use_point_embed = train_config.use_point_embed 145 | llama_config.separate_point_embed = train_config.separate_point_embed 146 | 147 | with torch.device("meta"): 148 | model = Llama(llama_config) 149 | 150 | else: 151 | model = Llama.from_pretrained( 152 | train_config.model_name, 153 | load_in_8bit=True if train_config.quantization else None, 154 | device_map="auto" if train_config.quantization else None, 155 | use_cache=use_cache, 156 | **kwargs, 157 | ) 158 | 159 | print(f"Using model type: {type(model)}") 160 | 161 | if train_config.enable_fsdp and train_config.use_fast_kernels: 162 | """ 163 | For FSDP and FSDP+PEFT, setting 'use_fast_kernels' will enable 164 | using of Flash Attention or Xformer memory-efficient kernels 165 | based on the hardware being used. This would speed up fine-tuning. 166 | """ 167 | try: 168 | from optimum.bettertransformer import BetterTransformer 169 | 170 | model = BetterTransformer.transform(model) 171 | except ImportError: 172 | print( 173 | "Module 'optimum' not found." 174 | " Please install 'optimum' it before proceeding." 
175 | ) 176 | print_model_size(model, train_config, rank if train_config.enable_fsdp else 0) 177 | 178 | # Prepare the model for int8 training if quantization is enabled 179 | if train_config.quantization: 180 | model = prepare_model_for_int8_training(model) 181 | 182 | # Convert the model to bfloat16 if fsdp and pure_bf16 is enabled 183 | if train_config.enable_fsdp and fsdp_config.pure_bf16: 184 | print("Converting to bfloat16") 185 | model.to(torch.bfloat16) 186 | 187 | if train_config.use_peft: 188 | peft_config = generate_peft_config(train_config, kwargs) 189 | print(f"PEFT config: {peft_config=}") 190 | model = get_peft_model(model, peft_config) 191 | 192 | # Llama2D weight initialization code 193 | 194 | trainable_params_before, _ = model.get_nb_trainable_parameters() 195 | 196 | print("--------IGNORE POS EMBEDS IS FALSE--------") 197 | for k, v in model.named_parameters(): 198 | if k.endswith(".lbd"): 199 | v.requires_grad = True 200 | print(k, "requires_grad=", v.requires_grad, v) 201 | 202 | trainable_params_after, _ = model.get_nb_trainable_parameters() 203 | assert trainable_params_after > trainable_params_before, ( 204 | "Looks like lambda gating parameter isn't marked as trainable." 205 | f" Before: {trainable_params_before}, after: {trainable_params_after}" 206 | ) 207 | 208 | model.print_trainable_parameters() 209 | else: 210 | for k, v in model.named_parameters(): 211 | if k.endswith(".lbd"): 212 | v.requires_grad = v.data.requires_grad = True 213 | print(k, "requires_grad=", v.requires_grad, v.data) 214 | 215 | # setting up FSDP if enable_fsdp is enabled 216 | if train_config.enable_fsdp: 217 | if not train_config.use_peft and train_config.freeze_layers: 218 | freeze_transformer_layers(train_config.num_freeze_layers) 219 | 220 | mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank) 221 | my_auto_wrapping_policy = fsdp_auto_wrap_policy( 222 | model, LlamaDecoderLayer, PositionEmbeddingRandom 223 | ) 224 | 225 | model = FSDP( 226 | model, 227 | auto_wrap_policy=my_auto_wrapping_policy 228 | if train_config.use_peft 229 | else wrapping_policy, 230 | cpu_offload=CPUOffload(offload_params=True) 231 | if fsdp_config.fsdp_cpu_offload 232 | else None, 233 | mixed_precision=mixed_precision_policy 234 | if not fsdp_config.pure_bf16 235 | else None, 236 | sharding_strategy=fsdp_config.sharding_strategy, 237 | device_id=torch.cuda.current_device(), 238 | limit_all_gathers=True, 239 | sync_module_states=train_config.low_cpu_fsdp, 240 | param_init_fn=lambda module: module.to_empty( 241 | device=torch.device("cuda"), recurse=False 242 | ) 243 | if train_config.low_cpu_fsdp and rank != 0 244 | else None, 245 | ) 246 | if fsdp_config.fsdp_activation_checkpointing: 247 | apply_fsdp_checkpointing(model) 248 | elif not train_config.quantization and not train_config.enable_fsdp: 249 | model.to("cuda") 250 | 251 | train_sampler = None 252 | val_sampler = None 253 | if train_config.enable_fsdp: 254 | train_sampler = DistributedSampler( 255 | dataset_train, 256 | rank=dist.get_rank(), 257 | num_replicas=dist.get_world_size(), 258 | shuffle=True, 259 | ) 260 | if train_config.run_validation: 261 | val_sampler = DistributedSampler( 262 | dataset_val, 263 | rank=dist.get_rank(), 264 | num_replicas=dist.get_world_size(), 265 | ) 266 | 267 | # Create DataLoaders for the training and validation dataset 268 | train_dataloader = torch.utils.data.DataLoader( 269 | dataset_train, 270 | batch_size=train_config.batch_size_training, 271 | num_workers=train_config.num_workers_dataloader, 
272 | pin_memory=True, 273 | sampler=train_sampler if train_sampler else None, 274 | drop_last=True, 275 | collate_fn=default_data_collator, 276 | ) 277 | 278 | eval_dataloader = None 279 | if train_config.run_validation: 280 | eval_dataloader = torch.utils.data.DataLoader( 281 | dataset_val, 282 | batch_size=train_config.val_batch_size, 283 | num_workers=train_config.num_workers_dataloader, 284 | pin_memory=True, 285 | sampler=val_sampler if val_sampler else None, 286 | drop_last=True, 287 | collate_fn=default_data_collator, 288 | ) 289 | 290 | # Initialize the optimizer and learning rate scheduler 291 | 292 | # make custom param groups 293 | group_substrs = { 294 | "lambda":[train_config.lambda_lr,"lbd"], 295 | "point_embed":[train_config.point_embed_lr,"is_a_point_embed"], 296 | } 297 | param_groups = [] 298 | for n,p in model.named_parameters(): 299 | for group_name,(lr,substr) in group_substrs.items(): 300 | if substr in n: 301 | param_groups.append({"params":[p],"lr":lr}) 302 | break 303 | else: 304 | param_groups.append({"params":[p],"lr":train_config.lr}) 305 | 306 | 307 | if fsdp_config.pure_bf16 and fsdp_config.optimizer == "anyprecision": 308 | optimizer = AnyPrecisionAdamW( 309 | param_groups, 310 | momentum_dtype=torch.bfloat16, 311 | variance_dtype=torch.bfloat16, 312 | use_kahan_summation=False, 313 | weight_decay=train_config.weight_decay, 314 | ) 315 | else: 316 | optimizer = optim.AdamW( 317 | param_groups, 318 | weight_decay=train_config.weight_decay, 319 | ) 320 | scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma) 321 | 322 | if train_config.num_epochs > 0: 323 | # Start the training process 324 | results = train( 325 | model, 326 | train_dataloader, 327 | eval_dataloader, 328 | tokenizer, 329 | optimizer, 330 | scheduler, 331 | train_config.gradient_accumulation_steps, 332 | train_config, 333 | fsdp_config if train_config.enable_fsdp else None, 334 | local_rank if train_config.enable_fsdp else None, 335 | rank if train_config.enable_fsdp else None, 336 | kwargs, 337 | ) 338 | if not train_config.enable_fsdp or rank == 0: 339 | [print(f"Key: {k}, Value: {v}") for k, v in results.items()] 340 | else: 341 | print("Skipping training") 342 | 343 | # print lambda values 344 | print("-----Lambda gating values-------") 345 | with FSDP.summon_full_params( 346 | model, rank0_only=True, writeback=False, with_grads=False 347 | ): 348 | print("-----full-params Lambda gating values-------") 349 | for k, v in model.named_parameters(): 350 | if k.endswith(".lbd"): 351 | print(k, v.data) 352 | print("--------------------------------") 353 | 354 | 355 | if __name__ == "__main__": 356 | fire.Fire(main) 357 | -------------------------------------------------------------------------------- /src/llama2d/modal/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | from common import BASE_MODELS, VOLUME_CONFIG, stub 5 | from modal import Image, gpu, method 6 | 7 | tgi_image = ( 8 | Image.from_registry("ghcr.io/huggingface/text-generation-inference:1.0.3") 9 | .dockerfile_commands("ENTRYPOINT []") 10 | .pip_install("text-generation", "transformers>=4.33.0") 11 | # .pip_install("git+https://github.com/Llama2D/transformers") 12 | .env(dict(HUGGINGFACE_HUB_CACHE="/pretrained")) 13 | ) 14 | 15 | 16 | @stub.function(image=tgi_image, volumes=VOLUME_CONFIG, timeout=60 * 20) 17 | def merge(run_id: str, commit: bool = False): 18 | from text_generation_server.utils.peft import download_and_unload_peft 
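    # merge() stages the adapter files in a scratch "merged" directory and
    # uses TGI's download_and_unload_peft to fold the LoRA weights into the
    # base model in place, so the server can load plain fused weights.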
19 | 20 | os.mkdir(f"/results/{run_id}/merged") 21 | subprocess.call(f"cp /results/{run_id}/*.* /results/{run_id}/merged", shell=True) 22 | 23 | print(f"Merging weights for fine-tuned {run_id=}.") 24 | download_and_unload_peft(f"/results/{run_id}/merged", None, False) 25 | 26 | if commit: 27 | print("Committing merged model permanently (can take a few minutes).") 28 | stub.results_volume.commit() 29 | 30 | 31 | @stub.cls( 32 | image=tgi_image, 33 | gpu=gpu.A100(count=1, memory=40), 34 | allow_concurrent_inputs=100, 35 | volumes=VOLUME_CONFIG, 36 | ) 37 | class Model: 38 | def __init__(self, base: str = "", run_id: str = ""): 39 | import socket 40 | import time 41 | 42 | from text_generation import AsyncClient 43 | 44 | model = f"/results/{run_id}/merged" if run_id else BASE_MODELS[base] 45 | 46 | if run_id and not os.path.isdir(model): 47 | merge.local(run_id) # local = run in the same container 48 | 49 | print(f"Loading {model} into GPU ... ") 50 | launch_cmd = ["text-generation-launcher", "--model-id", model, "--port", "8000"] 51 | self.launcher = subprocess.Popen(launch_cmd, stdout=subprocess.DEVNULL) 52 | 53 | self.client = None 54 | while not self.client and self.launcher.returncode is None: 55 | try: 56 | socket.create_connection(("127.0.0.1", 8000), timeout=1).close() 57 | self.client = AsyncClient("http://127.0.0.1:8000", timeout=60) 58 | except (socket.timeout, ConnectionRefusedError): 59 | time.sleep(1.0) 60 | 61 | assert self.launcher.returncode is None 62 | 63 | def __exit__(self, _exc_type, _exc_value, _traceback): 64 | self.launcher.terminate() 65 | 66 | @method() 67 | async def generate(self, prompt: str): 68 | result = await self.client.generate(prompt, max_new_tokens=512) 69 | 70 | return result.generated_text 71 | 72 | 73 | @stub.local_entrypoint() 74 | def main(prompt: str, base: str, run_id: str = "", batch: int = 1): 75 | print(f"Running completion for prompt:\n{prompt}") 76 | 77 | print("=" * 20 + "Generating without adapter" + "=" * 20) 78 | for output in Model(base).generate.map([prompt] * batch): 79 | print(output) 80 | 81 | if run_id: 82 | print("=" * 20 + "Generating with adapter" + "=" * 20) 83 | for output in Model(base, run_id).generate.map([prompt] * batch): 84 | print(output) 85 | -------------------------------------------------------------------------------- /src/llama2d/modal/repro.py: -------------------------------------------------------------------------------- 1 | from common import transformers_dir,llama_recipes_dir,root_dir 2 | import os 3 | import sys 4 | 5 | def check_all_code_committed(dir): 6 | 7 | old_dir = os.getcwd() 8 | os.chdir(dir) 9 | 10 | # assert that all code in current directory is committed 11 | git_diff = os.popen(f"git diff").read() 12 | git_diff_cached = os.popen("git diff --cached").read() 13 | 14 | dir_name = os.path.basename(dir) 15 | assert ( 16 | git_diff == "" and git_diff_cached == "" 17 | ), f"Please commit all code in {dir_name} before running this script." 
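    # With a clean working tree, HEAD uniquely identifies the code that
    # produced a run, so recording the commit hash below is enough to check
    # out the exact same sources later.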
18 | 19 | git_commit_hash = os.popen(f"git rev-parse HEAD").read().strip() 20 | 21 | # assert that all code in transformers is committed 22 | os.chdir(old_dir) 23 | 24 | return git_commit_hash 25 | 26 | def check_llama2d_code(): 27 | llama2d = check_all_code_committed(root_dir) 28 | transformers = check_all_code_committed(transformers_dir) 29 | llama_recipes = check_all_code_committed(llama_recipes_dir) 30 | 31 | return { 32 | "llama2d": llama2d, 33 | "transformers": transformers, 34 | "llama_recipes": llama_recipes, 35 | } 36 | 37 | def make_repro_command(): 38 | commits = check_llama2d_code() 39 | 40 | # get full command line command 41 | command = " ".join(sys.argv) 42 | 43 | # TODO: fill in HF dataset name if it's not there 44 | 45 | return f""" 46 | # run in llama2d 47 | git checkout {commits["llama2d"]} 48 | cd transformers && git checkout {commits["transformers"]} 49 | cd ../llama-recipes && git checkout {commits["llama_recipes"]} 50 | cd src/llama2d/modal 51 | {command} 52 | """ -------------------------------------------------------------------------------- /src/llama2d/modal/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.5 2 | aiosignal==1.3.1 3 | aiostream==0.4.5 4 | annotated-types==0.5.0 5 | anyio==3.7.1 6 | asgiref==3.7.2 7 | async-timeout==4.0.3 8 | attrs==23.1.0 9 | certifi==2023.7.22 10 | charset-normalizer==3.2.0 11 | click==8.1.7 12 | cloudpickle==2.0.0 13 | datasets==2.14.5 14 | dill==0.3.7 15 | exceptiongroup==1.1.3 16 | fastapi==0.103.1 17 | filelock==3.12.4 18 | frozenlist==1.4.0 19 | fsspec==2023.6.0 20 | grpclib==0.4.3 21 | h2==4.1.0 22 | hpack==4.0.0 23 | huggingface-hub==0.17.1 24 | hyperframe==6.0.1 25 | idna==3.4 26 | importlib-metadata==6.8.0 27 | markdown-it-py==3.0.0 28 | mdurl==0.1.2 29 | modal==0.52.3439 30 | multidict==6.0.4 31 | multiprocess==0.70.15 32 | numpy==1.24.4 33 | packaging==23.1 34 | pandas==2.0.3 35 | protobuf==4.24.3 36 | pyarrow==13.0.0 37 | pydantic==2.3.0 38 | pydantic_core==2.6.3 39 | Pygments==2.16.1 40 | python-dateutil==2.8.2 41 | pytz==2023.3.post1 42 | PyYAML==6.0.1 43 | regex==2023.8.8 44 | requests==2.31.0 45 | rich==13.5.2 46 | safetensors==0.3.3 47 | sigtools==4.0.1 48 | six==1.16.0 49 | sniffio==1.3.0 50 | starlette==0.27.0 51 | synchronicity==0.5.3 52 | tblib==2.0.0 53 | tokenizers==0.13.3 54 | toml==0.10.2 55 | tqdm==4.66.1 56 | transformers @ git+https://github.com/Llama2D/transformers@cdffed967e6941bf72f333b33b599da601cb21d8 57 | typer==0.9.0 58 | types-certifi==2021.10.8.3 59 | types-toml==0.10.8.7 60 | typing_extensions==4.7.1 61 | tzdata==2023.3 62 | urllib3==2.0.4 63 | watchfiles==0.20.0 64 | xxhash==3.3.0 65 | yarl==1.9.2 66 | zipp==3.16.2 67 | -------------------------------------------------------------------------------- /src/llama2d/modal/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from repro import make_repro_command 5 | from common import BASE_MODELS, GPU_MEM, N_GPUS, VOLUME_CONFIG, stub 6 | from modal import Mount, Secret, gpu 7 | 8 | # add llama2d to path 9 | sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/../../.") 10 | import llama2d 11 | 12 | 13 | @stub.function( 14 | volumes=VOLUME_CONFIG, 15 | memory=1024 * 100, 16 | timeout=3600 * 4, 17 | secrets=[Secret.from_name("huggingface")], 18 | ) 19 | def download(model_name: str): 20 | assert ( 21 | "HUGGINGFACE_TOKEN" in os.environ 22 | ), "Please set the HUGGINGFACE_TOKEN environment variable." 
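    # Persist the HF token inside the container, then pull the model snapshot
    # into the shared /pretrained volume; local_files_only=True makes re-runs
    # a cheap cache hit instead of a fresh download.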
23 | from huggingface_hub.hf_api import HfFolder 24 | 25 | HfFolder.save_token(os.environ["HUGGINGFACE_TOKEN"]) 26 | 27 | from huggingface_hub import snapshot_download 28 | 29 | from transformers.utils import move_cache 30 | 31 | try: 32 | snapshot_download(model_name, local_files_only=True) 33 | print(f"Volume contains {model_name}.") 34 | except FileNotFoundError: 35 | print(f"Downloading {model_name} (no progress bar) ...") 36 | snapshot_download(model_name) 37 | move_cache() 38 | 39 | print("Committing /pretrained directory (no progress bar) ...") 40 | stub.pretrained_volume.commit() 41 | 42 | 43 | def library_entrypoint(config): 44 | import os 45 | 46 | print(os.getcwd(), os.listdir()) 47 | assert ( 48 | "HUGGINGFACE_TOKEN" in os.environ 49 | ), "Please set the HUGGINGFACE_TOKEN environment variable." 50 | from huggingface_hub.hf_api import HfFolder 51 | 52 | HfFolder.save_token(os.environ["HUGGINGFACE_TOKEN"]) 53 | 54 | print(config) 55 | from finetuning import main 56 | 57 | from transformers import LlamaConfig, LlamaForCausalLM 58 | 59 | # from llama2d.model.modeling_llama import Llama2DForCausalLM 60 | # from llama2d.model.configuration_llama import Llama2DConfig 61 | # from llama2d.model.modeling_llama_old import LlamaForCausalLM 62 | # from llama2d.model.configuration_llama_old import LlamaConfig 63 | 64 | Llama = LlamaForCausalLM 65 | # LlamaConfig = Llama2DConfig 66 | 67 | main(Llama, LlamaConfig, **config) 68 | 69 | 70 | @stub.function( 71 | volumes=VOLUME_CONFIG, 72 | mounts=[ 73 | Mount.from_local_dir("./datasets", remote_path="/root"), 74 | ], 75 | gpu=gpu.A100(count=N_GPUS, memory=GPU_MEM), 76 | timeout=3600 * 12, 77 | ) 78 | def train(train_kwargs): 79 | from torch.distributed.run import config_from_args, elastic_launch, parse_args 80 | 81 | torch_args = parse_args(["--nnodes", "1", "--nproc_per_node", str(N_GPUS), ""]) 82 | print(f"{torch_args=}\n{train_kwargs=}") 83 | 84 | elastic_launch( 85 | config=config_from_args(torch_args)[0], 86 | entrypoint=library_entrypoint, 87 | )(train_kwargs) 88 | 89 | print("Committing results volume (no progress bar) ...") 90 | stub.results_volume.commit() 91 | 92 | @stub.local_entrypoint() # Runs locally to kick off remote training job. 
93 | def main(
94 |     dataset: str,
95 |     base: str = "base7",
96 |     run_id: str = "",
97 |     num_epochs: int = 1,
98 |     batch_size: int = 16,
99 |     repo: str = "llama2d/llama2d-mind2web",
100 |     keep_fraction: float = 1.0,
101 |     seed: int = 0,
102 | 
103 |     peft: bool = False,
104 |     use_2d: bool = True,
105 |     use_point_embed: bool = True,
106 |     lbd_start_value: float = 0.0,
107 |     lr: float = 3e-5,
108 |     lambda_lr: float = 3e-2,
109 |     point_embed_lr: float = 3e-5,
110 |     separate_point_embed: bool = False,
111 | 
112 |     # wandb args
113 |     group: str = None,
114 |     name: str = None,
115 | ):
116 |     print("Welcome to Modal Llama fine-tuning.")
117 |     print(f"Dataset is {dataset}.")
118 | 
119 |     model_name = BASE_MODELS[base]
120 |     print(f"Syncing base model {model_name} to volume.")
121 |     download.remote(model_name)
122 | 
123 |     cmd = make_repro_command()
124 |     print(cmd)  # log the exact commands needed to reproduce this run
125 | 
126 | 
127 |     if not run_id:
128 |         import secrets
129 | 
130 |         run_id = f"{base}-{secrets.token_hex(3)}"
131 |     elif not run_id.startswith(base):
132 |         run_id = f"{base}-{run_id}"
133 | 
134 |     print(f"Beginning run {run_id=}.")
135 |     train.remote(
136 |         {
137 |             "model_name": BASE_MODELS[base],
138 |             "output_dir": f"/results/{run_id}",
139 |             "batch_size_training": batch_size,
140 |             "lr": lr,
141 |             "lambda_lr": lambda_lr,
142 |             "num_epochs": num_epochs,
143 |             "val_batch_size": 1,
144 |             # --- Dataset options ---
145 |             "dataset": "custom_dataset",
146 |             "custom_dataset.file": dataset,
147 |             # --- FSDP options ---
148 |             "enable_fsdp": True,
149 |             "low_cpu_fsdp": True,  # Optimization for FSDP model loading (RAM won't scale with num GPUs)  # noqa
150 |             "fsdp_config.use_fast_kernels": True,  # Only works when FSDP is on
151 |             "fsdp_config.fsdp_activation_checkpointing": True,  # Activation checkpointing for fsdp  # noqa
152 |             "pure_bf16": True,
153 |             # --- Required for 70B ---
154 |             "fsdp_config.fsdp_cpu_offload": True,
155 |             "fsdp_peft_cpu_offload_for_save": True,  # Experimental
156 |             # --- PEFT options ---
157 |             "use_peft": peft,
158 |             "peft_method": "lora",
159 |             "lora_config.r": 8,
160 |             "lora_config.lora_alpha": 16,
161 |             # --- Llama2D options ---
162 |             "label_names": ["coords"],
163 |             "dataset_folder": "mind2web-cache",
164 |             "use_2d": use_2d,
165 |             "keep_fraction": keep_fraction,
166 |             "repo": repo,
167 |             "lbd_start_value": lbd_start_value,
168 |             "seed": seed,
169 |             "use_point_embed": use_point_embed,
170 |             "point_embed_lr": point_embed_lr,
171 |             "separate_point_embed": separate_point_embed,
172 | 
173 |             "group": group,
174 |             "name": name,
175 |         }
176 |     )
177 | 
178 |     print(f"Training completed {run_id=}.")
179 |     print(
180 |         f"Test: `modal run compare.py --base {base} --run-id {run_id} --prompt '...'`."
181 |     )
182 | 
--------------------------------------------------------------------------------
/src/llama2d/modal/validate_dataset.py:
--------------------------------------------------------------------------------
1 | from common import BASE_MODELS, stub
2 | from llama_recipes.configs.datasets import custom_dataset
3 | from llama_recipes.utils.config_utils import update_config
4 | from llama_recipes.utils.dataset_utils import get_custom_dataset
5 | from modal import Mount
6 | 
7 | 
8 | @stub.function(
9 |     volumes={
10 |         "/pretrained": stub.pretrained_volume,
11 |         "/results": stub.results_volume,
12 |     },
13 |     mounts=[
14 |         Mount.from_local_dir("./datasets", remote_path="/root"),
15 |     ],
16 | )
17 | def dataset(base: str = "chat7", dataset: str = "local_dataset.py"):
18 |     from transformers import AutoTokenizer
19 | 
20 |     tokenizer = AutoTokenizer.from_pretrained(BASE_MODELS[base])
21 |     tokenizer.add_special_tokens({"pad_token": "<PAD>"})
22 | 
23 |     config = custom_dataset()
24 |     update_config(config, file=dataset)
25 | 
26 |     BLOCK = "=" * 20
27 | 
28 |     for split in [config.train_split, config.test_split]:
29 |         dataset = get_custom_dataset(config, tokenizer, split)
30 |         print(f"{split}: {len(dataset)} sequences")
31 | 
32 |         sample = tokenizer.decode(dataset[0]["input_ids"])[:500]
33 |         print(f"{BLOCK} Sample {BLOCK}\n{sample} ...")
34 |         print(f"{BLOCK} Tokens {BLOCK}\n{dataset[0]['input_ids'][:25]} ...\n")
35 | 
--------------------------------------------------------------------------------
/src/llama2d/tagging/add_tags_to_page.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from dataclasses import dataclass
4 | from typing import List, Tuple
5 | 
6 | 
7 | @dataclass
8 | class TagAndBox:
9 |     word: str
10 |     coords: Tuple[int, int]
11 | 
12 | 
13 | def add_tags_to_webpage(page, mind2web_action) -> Tuple[int, List[TagAndBox]]:
14 |     """
15 |     Add visual tags to a webpage, and find the tag # of the desired Mind2Web action.
16 |     A visual tag looks like [12] and is superimposed on buttons, textboxes, links, etc.
17 |     """
18 | 
19 |     attrss = [
20 |         json.loads(pos_candidate["attributes"])
21 |         for pos_candidate in mind2web_action["pos_candidates"]
22 |     ]
23 | 
24 |     els = []
25 |     for attrs in attrss:
26 |         cls = attrs.get("class", None)
27 |         tag_id = attrs.get("id", None)
28 |         bbox_rect = [float(i) for i in attrs["bounding_box_rect"].split(",")]
29 |         els.append({"cls": cls, "tag_id": tag_id, "bbox_rect": bbox_rect})
30 | 
31 |     raw_html = mind2web_action["raw_html"]
32 | 
33 |     # print(f"Looking for element with class {cls}
34 |     # and id {tag_id} and bbox {bbox_rect}")
35 | 
36 |     curr_dir = os.path.dirname(os.path.realpath(__file__))
37 |     with open(f"{curr_dir}/tagUtils.js", "r") as f:
38 |         page.evaluate(f.read())
39 | 
40 |     try:
41 |         to_eval = f"tagifyWebpage({json.dumps(els)},true,{json.dumps(raw_html)})"
42 |         gt_tag_id, el_tags = page.evaluate(to_eval)
43 |     except Exception as e:
44 |         # raise e
45 |         raise Exception(f"Error evaluating:\n{to_eval}\n{e}")
46 | 
47 |     assert isinstance(gt_tag_id, int), f"gt_tag_id is {json.dumps(gt_tag_id)}!"
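    # At this point gt_tag_id is the integer index of the ground-truth
    # element's on-screen tag (e.g. a hypothetical gt_tag_id of 12 means the
    # action targets the element rendered as "[12]"), and el_tags pairs each
    # tag string with its midpoint coordinates.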
48 | 49 | return gt_tag_id, [TagAndBox(**i) for i in el_tags] 50 | 51 | 52 | if __name__ == "__main__": 53 | from playwright.sync_api import sync_playwright 54 | 55 | with sync_playwright() as p: 56 | browser = p.chromium.launch(headless=False) 57 | page = browser.new_page() 58 | 59 | # get path to current file 60 | curr_dir = os.path.dirname(os.path.realpath(__file__)) 61 | example_mhtml_path = f"{curr_dir}/../../data/mind2web_example.mhtml" 62 | example_json_path = f"{curr_dir}/../../data/mind2web_example.json" 63 | page.goto(f"file://{example_mhtml_path}") 64 | with open(example_json_path, "r") as f: 65 | dummy_action = json.load(f) 66 | 67 | try: 68 | print(add_tags_to_webpage(page, dummy_action)) 69 | except Exception as e: 70 | print(e) 71 | 72 | input("Press enter to stop the program") 73 | -------------------------------------------------------------------------------- /src/llama2d/tagging/tagUtils.js: -------------------------------------------------------------------------------- 1 | const assert = (condition, message) => { 2 | if(!condition) throw new Error(message) 3 | } 4 | 5 | const elIsClean = (el) => { 6 | if(el.style && el.style.display === 'none') return false 7 | if(el.hidden) return false 8 | if(el.disabled) return false 9 | 10 | const rect = el.getBoundingClientRect() 11 | if(rect.width === 0 || rect.height === 0) return false 12 | 13 | if(el.tagName === 'SCRIPT') return false 14 | if(el.tagName === 'STYLE') return false 15 | 16 | return true; 17 | } 18 | 19 | const isNotCovered = (el) => { 20 | const rect = el.getBoundingClientRect() 21 | const elCenter = [rect.left + rect.width/2, rect.top + rect.height/2]; 22 | 23 | const elAtPoint = document.elementFromPoint(...elCenter) 24 | 25 | return el.contains(elAtPoint) 26 | } 27 | const isInteractiveCursor = (el) => ["pointer","text"].includes(el.computedStyleMap().get("cursor")) 28 | 29 | const inputs = ['a', 'button', 'textarea', 'select', 'details', 'label'] 30 | const _isInteractible = (el) => (inputs.includes(el.tagName.toLowerCase()) || 31 | (el.tagName.toLowerCase() === 'input' && el.type !== 'hidden') || 32 | el.role === 'button' || 33 | isInteractiveCursor(el) && !(el.parentElement && isInteractiveCursor(el.parentElement))) && isNotCovered(el) 34 | 35 | const isInteractible = (el) => _isInteractible(el) || el.parentElement && isInteractible(el.parentElement); 36 | 37 | const emptyTagWhitelist = ["input","textarea","select","button","a"] 38 | const isEmpty = (el) => { 39 | 40 | const bbox = el.getBoundingClientRect() 41 | // check if center of element is offscreen 42 | const center = [bbox.left + bbox.width/2, bbox.top + bbox.height/2] 43 | if(center[0] < 0 || center[0] > window.innerWidth || center[1] < 0 || center[1] > window.innerHeight) return true 44 | 45 | const tagName = el.tagName.toLowerCase() 46 | if(emptyTagWhitelist.includes(tagName)) return false 47 | if("innerText" in el && el.innerText.trim().length === 0) { 48 | // look for svg or img in the element 49 | const svg = el.querySelector("svg") 50 | const img = el.querySelector("img") 51 | 52 | if(svg || img) return false 53 | 54 | return true 55 | } 56 | 57 | return false 58 | } 59 | 60 | window.tagifyWebpageOneEl = (gtCls, gtId, gtBbox) => tagifyWebpage([{ 61 | cls: gtCls, 62 | tag_id: gtId, 63 | bbox_rect: gtBbox 64 | }]) 65 | 66 | const convertHoverToCls = () => { 67 | [...document.styleSheets].forEach(sheet=>{ 68 | try{ 69 | [...sheet.cssRules].forEach(rule=>{ 70 | if(rule.selectorText) rule.selectorText = 
rule.selectorText.replace(/:hover/g,".mind2web-hover") 71 | }) 72 | } catch(err){ 73 | if(!(err+"").includes("Cannot access rules")) throw err; 74 | } 75 | }) 76 | } 77 | 78 | window.tagifyWebpage = (gtEls,useGt=true,rawHtml="") =>{ 79 | 80 | // Populate mHTML input values with raw_html from action JSON 81 | if(rawHtml.length>0){ 82 | // parse html 83 | const parser = new DOMParser(); 84 | const htmlDoc = parser.parseFromString(rawHtml, 'text/html'); 85 | 86 | [...htmlDoc.querySelectorAll("[input_value], [input_checked]")].forEach(el=>{ 87 | if(el.attributes.bounding_box_rect.value==="-1,-1,-1,-1") return; 88 | 89 | // get the position of the input on the page 90 | const classNames = [...el.classList].map(cls=>"."+cls).join(""); 91 | 92 | const id = [el.id].filter(e=>e).map(id=>"#"+id) 93 | console.log(el.id,el.attributes.id) 94 | const tag = el.tagName.toLowerCase(); 95 | 96 | const selector = `${tag}${classNames}${id}`; 97 | 98 | const fragmentMatches = htmlDoc.querySelectorAll(selector) 99 | const numMatchesInFragment = fragmentMatches.length; 100 | const fragmentIdx = [...fragmentMatches].indexOf(el); 101 | 102 | if(fragmentIdx<0) throw new Error("Could not find element with its own selector"); 103 | 104 | const docMatches = document.querySelectorAll(selector); 105 | if(docMatches.length != fragmentMatches.length) throw new Error(`Mismatched lengths: ${docMatches.length} vs. ${fragmentMatches.length}: ${selector}`); 106 | const docEl = docMatches[fragmentIdx]; 107 | 108 | // if has input_value, set docEl.value 109 | if("input_value" in el.attributes) { 110 | docEl.value = el.attributes.input_value.value; 111 | } 112 | else if("input_checked" in el.attributes) docEl.checked = el.attributes.input_checked.value; 113 | else { 114 | throw new Error("didn't find things"); 115 | } 116 | 117 | }) 118 | } 119 | 120 | convertHoverToCls(); 121 | 122 | let numTagsSoFar = 0; 123 | 124 | let gtCandidates = []; 125 | 126 | let elTags = []; 127 | 128 | const validEls = new Set(); 129 | const hasValidParent = el => validEls.has(el) || (el.parentElement && hasValidParent(el.parentElement)); 130 | 131 | for(let el of document.body.querySelectorAll("*")){ 132 | 133 | const stringifiedClasses = el.classList.toString(); 134 | 135 | const gtMatches = gtEls.filter(({cls,tag_id,bbox_rect})=>(cls===null || stringifiedClasses===cls) && (tag_id===null || el.id === tag_id)); 136 | const isGt = gtMatches.length > 0; 137 | 138 | el.classList.add("mind2web-hover") 139 | 140 | const empty = isEmpty(el); 141 | const dirty = !elIsClean(el); 142 | const uninteractible = !isInteractible(el); 143 | const validParent = hasValidParent(el) 144 | 145 | el.classList.remove("mind2web-hover") 146 | 147 | if(logElements.includes(el)) { 148 | console.log(`Logging ${el.innerText}, ${empty},${dirty},${uninteractible},${validParent}`) 149 | } 150 | 151 | const isGood = !(empty || dirty || uninteractible) || validParent; 152 | if(isGood) validEls.add(el); 153 | 154 | if(!isGood){ 155 | if(isGt) console.log("Skipping!", el,`empty: ${empty}, dirty: ${dirty}, uninteractible: ${uninteractible}, validParent: ${validParent}`); 156 | continue; 157 | } 158 | 159 | const elBbox = el.getBoundingClientRect(); 160 | const elCenter = [elBbox.left + elBbox.width/2, elBbox.top + elBbox.height/2]; 161 | 162 | // get closest el in elTags 163 | const [closestDist,closestEl] = elTags.map(({coords})=>coords).map(([x,y])=>Math.sqrt((x-elCenter[0])*(x-elCenter[0]) + (y-elCenter[1])*(y-elCenter[1]))).reduce((acc,cur,i)=>cur 5; 165 | 166 | if(isGt){ 167 | 
const gtTagId = useNewTag ? numTagsSoFar : closestEl; 168 | console.log("Tagging GT!", el); 169 | gtCandidates.push({ 170 | el, 171 | tagId: gtTagId, 172 | stats:{empty, dirty, uninteractible, validParent}, 173 | gtEls: gtMatches 174 | }); 175 | } 176 | 177 | if(useNewTag){ 178 | 179 | const tagStr = `[${numTagsSoFar}] ` 180 | 181 | elTags.push({ 182 | word:tagStr, 183 | coords:elCenter, 184 | }) 185 | validEls.add(el); 186 | 187 | numTagsSoFar++; 188 | } 189 | } 190 | console.log(validEls) 191 | 192 | if(!useGt) return [null, elTags]; 193 | 194 | 195 | 196 | const validGtCandidates = gtCandidates.filter(({el, stats}) => { 197 | const {empty, dirty, uninteractible, validParent} = stats 198 | return !empty && !dirty && !uninteractible || validParent 199 | }) 200 | 201 | if(validGtCandidates.length === 0){ 202 | console.log("No GT found!") 203 | // show stats for all candidates 204 | console.log(gtCandidates.map(({stats})=>`empty: ${stats.empty}, dirty: ${stats.dirty}, uninteractible: ${stats.uninteractible}`).join("\n")); 205 | throw new Error(`No GT found!\n${gtCandidates.map(({el})=>el.innerText).join("\n")}`) 206 | } 207 | 208 | if(validGtCandidates.length > 1){ 209 | console.log("Multiple GTs found!") 210 | } 211 | 212 | const elementDistancesDeep = validGtCandidates.map(({el,gtEls}) => gtEls.map(({bbox_rect})=>bbox_rect).map((gtBbox)=>{ 213 | const rect = el.getBoundingClientRect() 214 | const [x,y,w,h] = gtBbox; 215 | const gtCenter = [x+w/2, y+h/2]; 216 | const elCenter = [rect.left + rect.width/2, rect.top + rect.height/2]; 217 | 218 | const dx = gtCenter[0] - elCenter[0]; 219 | const dy = gtCenter[1] - elCenter[1]; 220 | return Math.sqrt(dx*dx + dy*dy) 221 | })) 222 | 223 | const elementDistances = elementDistancesDeep.map((distances)=>Math.min(...distances)); 224 | 225 | const closestDistance = Math.min(...elementDistances); 226 | const closestElement = validGtCandidates[elementDistances.indexOf(closestDistance)]; 227 | 228 | if(closestDistance > 20) { 229 | throw new Error(`Closest element is ${closestDistance}px away! Bboxes are ${validGtCandidates.map(({el})=>el.getBoundingClientRect()).map(({left, top, width, height})=>[left, top, width, height])})}}`); 230 | } 231 | 232 | 233 | return [closestElement.tagId, elTags]; 234 | } 235 | logElements=[]; // some elements where you can check your classification performance. useful for debugging. 
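// Debugging sketch (run in a DevTools console on a loaded page):
//   window.demo(), defined below, calls tagifyWebpage with no ground truth
//   and passes every tag midpoint to showTag(), which draws a small red box
//   at each tagged element.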
236 | 237 | window.showTag = coords => { 238 | myBox = document.createElement("div") 239 | myBox.style.width = "10px"; 240 | myBox.style.height = "10px"; 241 | myBox.style.background = "red"; 242 | myBox.style.position = "absolute"; 243 | myBox.style.top = coords[1]-5+"px"; 244 | myBox.style.left = coords[0]-5+"px"; 245 | myBox.textContent = ""; 246 | myBox.style.zIndex = 2000 247 | document.body.appendChild(myBox) 248 | } 249 | 250 | window.demo = () => tagifyWebpage([],false)[1].forEach(({coords})=>showTag(coords)) 251 | 1; 252 | 253 | -------------------------------------------------------------------------------- /src/llama2d/vision/__init__.py: -------------------------------------------------------------------------------- 1 | from .ocr import Llama2dScreen 2 | from .url_to_llama_input import Llama2dTokenizer 3 | from .viz_pt_input import debug_dataset 4 | -------------------------------------------------------------------------------- /src/llama2d/vision/learn_mlp_on_embeds.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from tqdm import tqdm 5 | 6 | from transformers.models.llama.sam_embed import PositionEmbeddingRandom 7 | 8 | 9 | class CoordMlp(nn.Module): 10 | def __init__(self, n: int, hidden: int): 11 | super().__init__() 12 | self.embed = PositionEmbeddingRandom(n, torch_dtype=torch.float32) 13 | self.a = nn.Linear(n * 2, hidden) 14 | self.b = nn.Linear(hidden, 1) 15 | 16 | self.n = n 17 | self.hidden = hidden 18 | 19 | def forward(self, x): 20 | b, c, d = x.shape 21 | assert d == 2, "Coords are not 2d" 22 | 23 | max_y_el = torch.argmax(x[:, :, 1], dim=1) 24 | 25 | pos_embeds = self.embed(x).squeeze(1) 26 | assert pos_embeds.shape == ( 27 | b, 28 | c, 29 | self.n * 2, 30 | ), f"Pos_embeds are {pos_embeds.shape}. vs. 
{(b,c,self.n*2)}" 31 | 32 | logits = self.b(F.relu(self.a(pos_embeds))) 33 | 34 | preds = logits.squeeze(dim=2) 35 | loss = F.cross_entropy(preds, F.one_hot(max_y_el).to(torch.float32)) 36 | 37 | return loss 38 | 39 | 40 | def learn_mlp_for_top_point(): 41 | rand_points = torch.rand((100, 50, 2)) 42 | 43 | model = CoordMlp(100, 100) 44 | params = model.parameters() 45 | lr = 3e-2 46 | optimizer = torch.optim.SGD(params, lr=lr) 47 | 48 | epochs = 500 49 | for epoch in tqdm(range(epochs)): 50 | loss = model(rand_points) 51 | 52 | print(loss.item(), loss.shape) 53 | 54 | optimizer.zero_grad() 55 | loss.backward() 56 | optimizer.step() 57 | 58 | 59 | if __name__ == "__main__": 60 | learn_mlp_for_top_point() 61 | -------------------------------------------------------------------------------- /src/llama2d/vision/ocr.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field, replace 2 | from typing import List, Optional, Tuple 3 | 4 | from google.cloud import vision 5 | 6 | from llama2d.constants import SCREEN_RESOLUTION, SECRETS_FILE 7 | 8 | 9 | @dataclass 10 | class ImageAnnotation: 11 | text: str # the word 12 | midpoint: Tuple[float, float] # the UNNORMALIZED midpoint of the word, (X,Y) 13 | midpoint_normalized: Tuple[ 14 | float, float 15 | ] # the normalized midpoint between 0 - 1 (X,Y) 16 | 17 | 18 | @dataclass 19 | class Llama2dScreen: 20 | full_text: str = "" # full text 21 | orig_text_dims: Tuple[float, float] = ( 22 | 1.0, 23 | 1.0, 24 | ) # the dimension of the *TEXT PORTION* of the image 25 | 26 | words: List[ImageAnnotation] = field( 27 | default_factory=list 28 | ) # a list of words and their midpoints 29 | 30 | def __add__(self, other): 31 | assert self.orig_text_dims == other.orig_text_dims 32 | return replace(self, words=self.words + other.words) 33 | 34 | def push_word( 35 | self, 36 | word: str, 37 | # must use exactly one 38 | # all 4 corners 39 | xyxy: Optional[Tuple[float, float, float, float]] = None, 40 | # midpoint 41 | xy: Optional[Tuple[float, float, float, float]] = None, 42 | ): 43 | new = self.concat_word(word=word, xyxy=xyxy, xy=xy) 44 | 45 | self.words = new.words 46 | self.full_text = new.full_text 47 | 48 | def concat_word( 49 | self, 50 | word: str, 51 | # must use exactly one 52 | # all 4 corners 53 | xyxy: Optional[Tuple[float, float, float, float]] = None, 54 | # midpoint 55 | xy: Optional[Tuple[float, float, float, float]] = None, 56 | ): 57 | full_text = self.full_text 58 | words = self.words 59 | 60 | if len(words) > 0: 61 | full_text += " " 62 | full_text += word 63 | 64 | assert (xyxy is None) != ( 65 | xy is None 66 | ), "You should specify xy (midpoint) xor xyxy (corners)." 
67 |         if xy is None:
68 |             x = (xyxy[0] + xyxy[2]) / 2
69 |             y = (xyxy[1] + xyxy[3]) / 2
70 |             xy = (x, y)
71 | 
72 |         x, y = xy
73 |         w, h = self.orig_text_dims
74 |         xy_norm = (x / w, y / h)
75 | 
76 |         new_ann = ImageAnnotation(text=word, midpoint=xy, midpoint_normalized=xy_norm)
77 |         words = words + [new_ann]
78 | 
79 |         return replace(self, words=words, full_text=full_text)
80 | 
81 |     def __getitem__(self, key: slice):
82 |         assert isinstance(key, slice), "__getitem__ only supports slice right now"
83 |         words = self.words[key]
84 | 
85 |         full_text = " ".join([word.text for word in words])
86 | 
87 |         return replace(self, words=words, full_text=full_text)
88 | 
89 | 
90 | width, height = SCREEN_RESOLUTION
91 | 
92 | 
93 | class ImageAnnotator:
94 |     def __init__(self, credentials=SECRETS_FILE):
95 |         if not credentials.exists():
96 |             raise ValueError(
97 |                 f"Place the Google Cloud credentials file in {credentials}"
98 |             )
99 | 
100 |         self.client = vision.ImageAnnotatorClient.from_service_account_file(credentials)
101 |         self.__features = [vision.Feature(type_=vision.Feature.Type.TEXT_DETECTION)]
102 | 
103 |     def __call__(self, path):
104 |         with open(path, "rb") as image_file:
105 |             content = image_file.read()
106 | 
107 |         image = vision.Image(content=content)
108 |         request = vision.AnnotateImageRequest(image=image, features=self.__features)
109 |         res = self.client.annotate_image(request)
110 | 
111 |         full_text = res.full_text_annotation.text
112 | 
113 |         annotations = res.text_annotations
114 | 
115 |         annotations_normed = Llama2dScreen(
116 |             full_text=full_text,
117 |             orig_text_dims=SCREEN_RESOLUTION,
118 |         )
119 |         for text in annotations[1:]:  # annotations[0] is the whole-page text; word boxes start at index 1
120 |             xs = [vertex.x for vertex in text.bounding_poly.vertices]
121 |             ys = [vertex.y for vertex in text.bounding_poly.vertices]
122 | 
123 |             prev_len = len(annotations_normed.words)
124 |             annotations_normed.push_word(
125 |                 word=text.description, xyxy=[min(xs), min(ys), max(xs), max(ys)]
126 |             )
127 |             assert len(annotations_normed.words) == prev_len + 1
128 | 
129 |         # optionally, sort the words by midpoint
130 |         annotations_normed.words = list(
131 |             sorted(
132 |                 annotations_normed.words,
133 |                 key=lambda x: (x.midpoint_normalized[1], x.midpoint_normalized[0]),
134 |             )
135 |         )
136 | 
137 |         return annotations_normed
138 | 
--------------------------------------------------------------------------------
/src/llama2d/vision/render_dataset.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Llama2D/llama2d/e28b97255d396c717fe183b96b802ff39ffd7e6d/src/llama2d/vision/render_dataset.py
--------------------------------------------------------------------------------
/src/llama2d/vision/take_screenshot.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlparse
2 | 
3 | from playwright.sync_api import sync_playwright
4 | 
5 | from llama2d.constants import SCREEN_RESOLUTION
6 | 
7 | width, height = SCREEN_RESOLUTION
8 | 
9 | 
10 | def take_screenshot(page, url, save_path="image_of_website.png"):
11 |     if page is None:
12 |         with sync_playwright() as p:
13 |             # Using the Chromium browser but you can also use 'firefox' or 'webkit'
14 |             browser = p.chromium.launch()
15 |             page = browser.new_page()
16 | 
17 |             page.set_extra_http_headers(
18 |                 {
19 |                     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
20 |                     " AppleWebKit/537.36 (KHTML, like Gecko)"
21 |                     " Chrome/116.0.0.0 Safari/537.36"
22 |                 }
23 |             )
24 | 
25 |             return take_screenshot(page, url, save_path)
26 | 
27 |     if url is not None:
28 |         print("going to " + url)
29 |         page.goto(url)
30 | 
31 |     # Set the viewport height to be the height of the content
32 |     content_height = page.evaluate("document.documentElement.scrollHeight")
33 |     thresholded_height = min(content_height, height)
34 | 
35 |     page.set_viewport_size({"width": width, "height": thresholded_height})
36 | 
37 |     page.screenshot(path=save_path)
38 | 
39 | 
40 | def extract_domain(url):
41 |     parsed_uri = urlparse(url)
42 |     domain = "{uri.netloc}".format(uri=parsed_uri)
43 |     domain = domain.replace(".", "_")
44 |     return domain
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     target_url = "https://www.mytampahomeagent.com/"
49 |     # target_url = "https://www.reddit.com"
50 |     path = "./extracted/" + extract_domain(target_url) + ".png"
51 |     print(path)
52 | 
53 |     take_screenshot(page=None, url=target_url, save_path=path)
54 | 
--------------------------------------------------------------------------------
/src/llama2d/vision/url_to_llama_input.py:
--------------------------------------------------------------------------------
1 | """
2 | url_to_llama_input.py
3 | Extract features using the tokenizer, including text and image
4 | """
5 | 
6 | import tempfile
7 | from pathlib import Path
8 | from typing import Dict, List, Optional
9 | 
10 | import torch
11 | 
12 | from llama2d.constants import MAX_PAGE_LEN, MAX_SEQ_LEN, MAX_TAGS_LEN
13 | from llama2d.tagging.add_tags_to_page import TagAndBox
14 | from llama2d.vision.ocr import ImageAnnotator, Llama2dScreen
15 | from llama2d.vision.take_screenshot import extract_domain, take_screenshot
16 | from transformers import LlamaTokenizer
17 | 
18 | 
19 | class Llama2dTokenizer(object):
20 |     def __init__(
21 |         self,
22 |         model_path: str = "decapoda-research/llama-7b-hf",
23 |         separator_id=None,
24 |         label_mask_id=-100,
25 |         mask_out_body=True,
26 |     ):
27 |         self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
28 | 
29 |         if separator_id is None:
30 |             self.__separator_id = (
31 |                 self.tokenizer.unk_token_id
32 |             )  # this should be kept at 0 for most uses, as it is a special token
33 |         else:
34 |             self.__separator_id = separator_id
35 | 
36 |         self.__label_mask_id = label_mask_id
37 |         self.__mask_out_body = mask_out_body
38 | 
39 |     def process(
40 |         self, prompt: str, screen: Llama2dScreen, output: str
41 |     ) -> Dict[str, torch.Tensor]:
42 |         # output tokens
43 |         output_tokens = self.tokenizer.tokenize(output)
44 |         # and use (-1,-1) for the 2d embeddings for the output
45 |         output_tokens_locs = [(-1, -1) for _ in range(len(output_tokens))]
46 | 
47 |         # extract tokens
48 |         image_tokens = [self.tokenizer.tokenize(i.text) for i in screen.words]
49 |         # and, correspondingly, get their midpoints. If a word is broken up into
50 |         # multiple pieces by the BPE, we repeat the word's location once per piece
51 |         image_token_locs = [
52 |             [annot.midpoint_normalized for j in range(len(i))]
53 |             for i, annot in zip(image_tokens, screen.words)
54 |         ]
55 | 
56 |         # extract tokens from the prompt
57 |         prompt_tokens = self.tokenizer.tokenize(prompt)
58 |         # and use (-1,-1) for the 2d embeddings for the prompt
59 |         prompt_tokens_locs = [(-1, -1) for _ in range(len(prompt_tokens))]
60 | 
61 |         # and now we stitch it together
62 |         input_ids = (
63 |             [self.tokenizer.bos_token_id]  # bos token
64 |             + self.tokenizer.convert_tokens_to_ids(prompt_tokens)
65 |             + [self.__separator_id]  # separating prompt from context
66 |             + self.tokenizer.convert_tokens_to_ids(
67 |                 [j for i in image_tokens for j in i]
68 |             )
69 |             + [self.__separator_id]  # separating context from answer
70 |             + self.tokenizer.convert_tokens_to_ids(
71 |                 output_tokens
72 |             )
73 |         )
74 | 
75 |         # mask out the prompt
76 |         label_ids = (
77 |             [self.tokenizer.bos_token_id]  # bos token
78 |             + [self.__label_mask_id for _ in range(len(prompt_tokens))]  # we do not want to predict the prompt
79 |             + [self.__label_mask_id]  # separator between prompt and context
80 |             + [
81 |                 self.__label_mask_id if self.__mask_out_body else k
82 |                 for k in self.tokenizer.convert_tokens_to_ids(
83 |                     [j for i in image_tokens for j in i]
84 |                 )
85 |             ]
86 |             + [self.__label_mask_id]  # separator between context and answer
87 |             + self.tokenizer.convert_tokens_to_ids(
88 |                 output_tokens
89 |             )
90 |         )
91 | 
92 |         # and we stitch together the image locs in the same order
93 |         input_coords = (
94 |             [(-1, -1)]  # bos token
95 |             + prompt_tokens_locs
96 |             + [(-1, -1)]  # for the separator
97 |             + [j for i in image_token_locs for j in i]
98 |             + [(-1, -1)]  # for the separator
99 |             + output_tokens_locs
100 |         )
101 |         input_coords = torch.tensor(input_coords)
102 |         input_ids = torch.tensor(input_ids)
103 |         label_ids = torch.tensor(label_ids)
104 | 
105 |         attention_mask = torch.ones_like(input_ids)
106 | 
107 |         assert (
108 |             len(input_ids) == len(label_ids) == len(input_coords) == len(attention_mask)
109 |         ), (
110 |             f"len(input_ids) = {len(input_ids)}, len(label_ids) = {len(label_ids)},"
111 |             f" len(input_coords) = {len(input_coords)},"
112 |             f" len(attention_mask) = {len(attention_mask)}"
113 |         )
114 | 
115 |         # pad or truncate
116 |         if len(input_ids) > MAX_SEQ_LEN:
117 |             input_ids = input_ids[:MAX_SEQ_LEN]
118 |             label_ids = label_ids[:MAX_SEQ_LEN]
119 |             input_coords = input_coords[:MAX_SEQ_LEN]
120 |             attention_mask = attention_mask[:MAX_SEQ_LEN]
121 |         elif len(input_ids) < MAX_SEQ_LEN:
122 |             # right-pad label_ids with the label mask id,
123 |             # input_coords with (-1,-1), and input_ids with 0
124 |             input_ids = torch.cat(
125 |                 [input_ids, torch.zeros(MAX_SEQ_LEN - len(input_ids), dtype=torch.long)]
126 |             )
127 |             label_ids = torch.cat(
128 |                 [
129 |                     label_ids,
130 |                     torch.ones(MAX_SEQ_LEN - len(label_ids), dtype=torch.long)
131 |                     * self.__label_mask_id,
132 |                 ]
133 |             )
134 |             input_coords = torch.cat(
135 |                 [input_coords, torch.ones(MAX_SEQ_LEN - len(input_coords), 2) * -1]
136 |             ).to(torch.float16)
137 |             attention_mask = torch.cat(
138 |                 [
139 |                     attention_mask,
140 |                     torch.zeros(MAX_SEQ_LEN - len(attention_mask), dtype=torch.long),
141 |                 ]
142 |             )
143 | 
144 |         # assert all tensors are the desired length
145 |         assert len(input_ids) == MAX_SEQ_LEN, f"len(input_ids) = {len(input_ids)}"
146 |         assert len(label_ids) == MAX_SEQ_LEN, f"len(label_ids) = {len(label_ids)}"
147 |         assert (
148 |             len(input_coords) == MAX_SEQ_LEN
149 |         ), f"len(input_coords) = {len(input_coords)}"
150 |         assert (
151 |             len(attention_mask) == MAX_SEQ_LEN
152 |         ), f"len(attention_mask) = {len(attention_mask)}"
153 | 
154 |         # return output
155 |         return {
156 |             "input_ids": input_ids.to(torch.long),
157 |             "coords": input_coords.to(torch.float16),
158 |             "labels": label_ids.to(torch.long),
159 |             "attention_mask": attention_mask.to(torch.long),
160 |         }
161 | 
162 | 
163 | class Llama2dWebsiteFeatureExtractor(object):
164 |     def __init__(
165 |         self,
166 |         **kwargs,
167 |     ):  # -100 is default
168 |         self.tokenizer = Llama2dTokenizer(**kwargs)
169 |         self.__annotator = ImageAnnotator()
170 | 
171 |     def process(
172 |         self, prompt, page, output, tags_and_boxes: Optional[List[TagAndBox]] = None
173 |     ):
174 |         # run OCR
175 |         annotations = self.__annotator(page)
176 |         annotations = annotations[:MAX_PAGE_LEN]
177 | 
178 |         if tags_and_boxes is not None:
179 |             for tag in tags_and_boxes[:MAX_TAGS_LEN]:
180 |                 annotations = annotations.concat_word(word=tag.word, xy=tag.coords)
181 | 
182 |         return self.tokenizer.process(prompt, annotations, output)
183 | 
184 |     def create_inference_data(self, page, prompt, uri):
185 |         with tempfile.TemporaryDirectory() as tmpdir:
186 |             path = Path(tmpdir) / (extract_domain(uri) + ".png")
187 |             # html = os.path.join(tmpdir, extract_domain(uri)+".mhtml")
188 | 
189 |             # driver = webdriver.Chrome()
190 |             # driver.get(uri)
191 | 
192 |             # # Execute Chrome dev tool command to obtain the mhtml file
193 |             # res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
194 | 
195 |             take_screenshot(page=page, url=uri, save_path=path)
196 |             return self.process(prompt, path, "")
197 | 
198 |     def from_training_data(self, page, html, uri):
199 |         with tempfile.TemporaryDirectory() as tmpdir:
200 |             path = Path(tmpdir) / (extract_domain(uri) + ".png")
201 |             prompt, label = take_screenshot(page=page, url=html, save_path=path)
202 |             return self.process(prompt, path, label)
203 | 
--------------------------------------------------------------------------------
/src/llama2d/vision/viz_pt_input.py:
--------------------------------------------------------------------------------
1 | # use the headless Agg backend (select it before pyplot draws anything)
2 | import matplotlib
3 | 
4 | matplotlib.use("Agg")
5 | 
6 | from matplotlib import pyplot as plt
7 | from playwright.sync_api import sync_playwright
8 | from transformers import LlamaTokenizer
9 | 
10 | # noqa
11 | """
12 | pytorch input is a dictionary of the form
13 | {
14 |     "input_ids": [ids of the tokens, from 0 to vocab_size-1],
15 |     "attention_mask": [0 for padding, 1 for non-padding],
16 |     "coords": [x,y] for each token - normalized to [0,1] for tokens with coords, and (-1,-1) for tokens without coords
17 |     "labels": [ids of the tokens, from 0 to vocab_size-1] - same as input_ids, but with -100 for tokens that should not be predicted  # noqa
18 | }
19 | """
20 | 
21 | 
22 | model_path = "decapoda-research/llama-7b-hf"
23 | tokenizer = LlamaTokenizer.from_pretrained(model_path)
24 | 
25 | # print(tokenizer.convert_ids_to_tokens([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]))
26 | 
27 | 
28 | def viz_pt_input(pt_input):
29 |     input_ids = pt_input["input_ids"]
30 |     attention_mask = pt_input["attention_mask"]
31 |     coords = pt_input["coords"]
32 |     # labels = pt_input["labels"]
33 | 
34 |     # graph tokens with coords in a matplotlib figure
35 |     # print the tokens without coords
36 | 
37 |     # every word has a few tokens with the same coord.
38 |     # we should generate the word, turn it into a string, then plot it at the coord
39 | 
40 |     without_coords = [
41 |         input_ids[i]
42 |         for i in range(len(input_ids))
43 |         if coords[i][0] == -1 and attention_mask[i] == 1
44 |     ]
45 | 
46 |     with_coords = [
47 |         (input_ids[i], coords[i])
48 |         for i in range(len(input_ids))
49 |         if coords[i][0] != -1 and attention_mask[i] == 1
50 |     ]
51 |     # split with_coords into words -
52 |     # where a word is a list of tokens with the same coord
53 |     words = []
54 |     current_word = []
55 |     current_coord = None
56 |     for token in with_coords:
57 |         if current_coord is None or (token[1] != current_coord).any():
58 |             if len(current_word) > 0:
59 |                 words.append(current_word)
60 |             current_word = []
61 |             current_coord = token[1]
62 |         current_word.append(token)
63 |     if len(current_word) > 0:
64 |         words.append(current_word)
65 | 
66 |     # plot with_coords as text on a matplotlib figure
67 | 
68 |     fig = plt.figure()
69 |     # make fig very big
70 |     fig.set_size_inches(20, 20)
71 | 
72 |     ax = fig.add_subplot(111)
73 |     ax.set_xlim([0, 1])
74 |     ax.set_ylim([0, 1])
75 |     ax.set_aspect("equal")
76 | 
77 |     for word in words:
78 |         word_str = "".join(tokenizer.convert_ids_to_tokens([i[0] for i in word]))
79 |         word_coord = word[0][1]
80 |         # very small text
81 |         ax.text(
82 |             word_coord[0],
83 |             1 - word_coord[1],
84 |             word_str,
85 |             fontsize=10,
86 |             horizontalalignment="center",
87 |             verticalalignment="center",
88 |         )
89 | 
90 |     # save the figure
91 |     fig.savefig("tokens_with_coords.png")
92 | 
93 |     normal_str = "".join(tokenizer.convert_ids_to_tokens(input_ids))
94 |     print(normal_str)
95 |     print()
96 | 
97 |     # as a str:
98 |     without_coords_str = "".join(tokenizer.convert_ids_to_tokens(without_coords))
99 |     print(without_coords_str)
100 | 
101 |     print("")
102 | 
103 | 
104 | from torch.utils.data import Dataset
105 | 
106 | 
107 | def debug_dataset(dataset: Dataset):
108 |     pt_input = None
109 | 
110 |     action = None
111 |     i = 0
112 |     while i < len(dataset):
113 |         pt_input = dataset[i]
114 |         if pt_input is not None:
115 |             viz_pt_input(pt_input)
116 |             action = input("Continue? [y/n/d(ebug)/<number to skip>] ")
117 |             if action == "n":
118 |                 break
119 |             if action.startswith("d"):
120 |                 import pdb
121 | 
122 |                 pdb.set_trace()
123 |             # check if action is an integer - then skip that many
124 |             if action.isdigit():
125 |                 print(f"Skipping {action}...")
126 |                 i += int(action)
127 |                 continue
128 |         i += 1
129 | 
130 |     assert pt_input is not None, "Didn't find any valid dataset entries!"
131 |     if action != "n":
132 |         input("Dataset has ended. Press enter to continue program.")
133 | 
134 | 
135 | if __name__ == "__main__":
136 |     from llama2d.datasets.huggingface import HuggingFaceDataset
137 |     from llama2d.datasets.mind2web import Mind2webDataset
138 | 
139 |     with sync_playwright() as playwright:
140 |         dataset = HuggingFaceDataset("llama2d/llama2d-mind2web", split="train")
141 |         for entry in dataset:
142 |             assert (
143 |                 entry["labels"] > 0
{entry['labels'].tolist()}" 144 | 145 | dataset = Mind2webDataset(playwright=playwright, headless=False) 146 | 147 | debug_dataset(dataset) 148 | -------------------------------------------------------------------------------- /src/llama2d/vision/webutils/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Llama2D/llama2d/e28b97255d396c717fe183b96b802ff39ffd7e6d/src/llama2d/vision/webutils/chromedriver -------------------------------------------------------------------------------- /src/llama2d/vision/webutils/playwright_browser.py: -------------------------------------------------------------------------------- 1 | import nest_asyncio 2 | from langchain.agents import AgentType, initialize_agent 3 | from langchain.agents.agent_toolkits import PlayWrightBrowserToolkit 4 | from langchain.chat_models import ChatAnthropic 5 | from langchain.tools.playwright.utils import create_async_playwright_browser 6 | 7 | nest_asyncio.apply() 8 | DEFAULT_STARTER_URL = { 9 | "url": "https://web.archive.org/web/20230428131116/https://www.cnn.com/world" 10 | } 11 | 12 | 13 | async def init_agent_chain(starter_url, llm): 14 | # tools 15 | toolkit = PlayWrightBrowserToolkit.from_browser(async_browser=async_browser) 16 | tools = toolkit.get_tools() 17 | tools_by_name = {tool.name: tool for tool in tools} 18 | navigate_tool = tools_by_name["navigate_browser"] 19 | get_elements_tool = tools_by_name["get_elements"] # 20 | 21 | await navigate_tool.arun(starter_url) 22 | # action 23 | # The browser is shared across tools, so the agent can interact in a stateful manner 24 | await get_elements_tool.arun( 25 | {"selector": ".container__headline", "attributes": ["innerText"]} 26 | ) 27 | 28 | agent_chain = initialize_agent( 29 | tools, 30 | llm, 31 | agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION, 32 | verbose=True, 33 | ) 34 | return agent_chain 35 | 36 | 37 | async def run(agent_chain, prompt): 38 | result = await agent_chain.arun(prompt) 39 | return result 40 | 41 | 42 | if __name__ == "__main__": 43 | async_browser = create_async_playwright_browser() 44 | llm = ChatAnthropic(temperature=0) # or any other LLM, e.g., ChatOpenAI(), OpenAI() 45 | init_agent_chain(async_browser, llm) 46 | -------------------------------------------------------------------------------- /src/llama2d/vision/webutils/selenium_action_chain.py: -------------------------------------------------------------------------------- 1 | # import webdriver 2 | from selenium import webdriver 3 | 4 | # import Action chains 5 | from selenium.webdriver.common.action_chains import ActionChains 6 | 7 | 8 | def run(driver): 9 | menu = driver.find_element_by_css_selector(".nav") 10 | hidden_submenu = driver.find_element_by_css_selector(".nav # submenu1") 11 | 12 | ActionChains(driver).move_to_element(menu).click(hidden_submenu).perform() 13 | # Or actions can be queued up one by one, then performed.: 14 | 15 | menu = driver.find_element_by_css_selector(".nav") 16 | hidden_submenu = driver.find_element_by_css_selector(".nav # submenu1") 17 | 18 | actions = ActionChains(driver) 19 | actions.move_to_element(menu) 20 | actions.click(hidden_submenu) 21 | actions.perform() 22 | 23 | 24 | # Project Example – 25 | # create webdriver object 26 | # get geeksforgeeks.org 27 | driver = webdriver.Chrome() 28 | driver.get("https://www.geeksforgeeks.org/") 29 | # get element 30 | element = driver.find_element_by_link_text("Courses") 31 | # create action chain object 32 | action = 
33 | action = ActionChains(driver)
34 | 
35 | # click the item
36 | action.click(on_element=element)
37 | 
38 | # perform the operation
39 | action.perform()
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     # create webdriver object
44 |     driver = webdriver.Firefox()
45 |     # exercise the hover-and-click flows defined in run()
46 |     run(driver)
47 | 
--------------------------------------------------------------------------------
/src/llama2d/vision/webutils/stacked_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Llama2D/llama2d/e28b97255d396c717fe183b96b802ff39ffd7e6d/src/llama2d/vision/webutils/stacked_image.png
--------------------------------------------------------------------------------
/src/llama2d/vision/webutils/stitch_webpage.py:
--------------------------------------------------------------------------------
1 | import io
2 | 
3 | import numpy as np
4 | from PIL import Image
5 | from selenium import webdriver
6 | from selenium.webdriver.common.by import By
7 | 
8 | user_agent = (
9 |     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
10 |     " (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
11 | )
12 | 
13 | options = webdriver.ChromeOptions()
14 | options.add_argument(f"user-agent={user_agent}")
15 | options.add_argument("--disable-blink-features=AutomationControlled")
16 | options.add_argument("--disable-extensions")
17 | options.add_experimental_option("useAutomationExtension", False)
18 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
19 | 
20 | options.add_argument("--headless")  # Optional: Run the browser in headless mode
21 | # DEFAULT_CHROMEDRIVER_PATH = 'chromedriver'
22 | 
23 | 
24 | def stitch(images):
25 |     stacked_array = np.vstack(images)
26 |     # Convert the NumPy array to a Pillow image
27 |     image = Image.fromarray(stacked_array)
28 | 
29 |     # Save the image to a file
30 |     image.save("stacked_image.png")
31 | 
32 | 
33 | def scrape_scroll(url):
34 |     driver = webdriver.Chrome(options=options)  # Make sure chromedriver is on your PATH
35 |     # driver = uc.Chrome(headless=True, use_subprocess=False, option)
36 | 
37 |     driver.get(url)
38 |     # Replace with the URL of the webpage you want to screenshot
39 |     # Track how many screens have been captured so far
40 |     screenshots = []
41 |     scroll_count = 0
42 |     try:
43 |         while True:
44 |             total_height = driver.execute_script("return document.body.scrollHeight")
45 | 
46 |             driver.set_window_size(
47 |                 1920, total_height
48 |             )  # Adjust the window size to your liking
49 |             screenshot = driver.find_element(By.TAG_NAME, "body").screenshot_as_png
50 | 
51 |             # print(type(screenshot))
52 |             image = np.array(Image.open(io.BytesIO(screenshot)))
53 |             print(image.shape)
54 |             # with open('screenshot.png', 'wb') as f:
55 |             #     f.write(screenshot)
56 |             screenshots.append(image)
57 |             # Scroll down to the bottom of the page
58 |             # Increment the scroll counter
59 |             scroll_count += 1
60 |             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
61 |             # determine if this is end of page
62 |             # Break the loop if we have reached the end of the page
63 |             if scroll_count > 10:  # You can adjust the number of scrolls as needed
64 |                 break
65 |     except Exception:
66 |         pass
67 | 
68 |     finally:
69 |         print(f"Length of screenshots:{len(screenshots)}")
70 |         stitch(screenshots)
71 |         # Close the WebDriver
72 |         driver.quit()
73 | 
74 | 
75 | if __name__ == "__main__":
76 |     scrape_scroll("https://www.mytampahomeagent.com/")
77 | 
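stitch() assumes every capture shares the same width and channel count, which holds above because each screenshot is taken at a fixed 1920px window width. A minimal self-contained sketch of the same stacking idea, with dummy tiles standing in for real screenshots:

    import numpy as np
    from PIL import Image

    # three dummy 100px-tall, 1920px-wide RGB tiles (stand-ins for screenshots)
    tiles = [np.full((100, 1920, 3), i * 80, dtype=np.uint8) for i in range(3)]

    # np.vstack stacks along the vertical axis, exactly as stitch() does
    Image.fromarray(np.vstack(tiles)).save("stacked_image.png")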
--------------------------------------------------------------------------------
/src/llama2d/vision/webutils/web_to_action.py:
--------------------------------------------------------------------------------
1 | import faiss
2 | from langchain.agents import Tool
3 | from langchain.docstore import InMemoryDocstore
4 | from langchain.embeddings import OpenAIEmbeddings
5 | from langchain.tools.file_management.read import ReadFileTool
6 | from langchain.tools.file_management.write import WriteFileTool
7 | from langchain.utilities import SerpAPIWrapper
8 | 
9 | # setup memory
10 | from langchain.vectorstores import FAISS
11 | 
12 | # search agent
13 | search = SerpAPIWrapper()
14 | tools = [
15 |     Tool(
16 |         name="search",
17 |         func=search.run,
18 |         description="useful for when you need to answer questions about current events."
19 |         " You should ask targeted questions",
20 |     ),
21 |     WriteFileTool(),
22 |     ReadFileTool(),
23 | ]
24 | 
25 | 
26 | # Define your embedding model
27 | embeddings_model = OpenAIEmbeddings()
28 | # Initialize the vectorstore as empty
29 | 
30 | embedding_size = 1536
31 | index = faiss.IndexFlatL2(embedding_size)
32 | vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})
33 | 
--------------------------------------------------------------------------------
/src/mhtml/download.js:
--------------------------------------------------------------------------------
1 | // playwright
2 | 
3 | const playwright = require('playwright');
4 | const { setTimeout } = require('timers/promises');
5 | 
6 | (async () => {
7 |     const browser = await playwright.chromium.launch({headless:false});
8 |     const context = await browser.newContext();
9 |     const page = await context.newPage();
10 | 
11 |     await page.goto("https://google.com/");
12 |     await setTimeout(20_000);
13 | 
14 |     const session = await page.context().newCDPSession(page)
15 |     const doc = await session.send('Page.captureSnapshot', { format: 'mhtml' });
16 |     console.log(doc.data);
17 | 
18 |     // save
19 |     const {writeFileSync} = require('fs');
20 |     writeFileSync('./finance.mhtml', doc.data);
21 | 
22 | })();
--------------------------------------------------------------------------------
/src/mhtml/index.js:
--------------------------------------------------------------------------------
1 | const { Parser } = require("fast-mhtml");
2 | const p = new Parser({
3 |     rewriteFn: (url)=>{
4 |         console.log(url)
5 |         return url
6 |         // set base url to localhost:8080
7 |     }, // default, urls are rewritten with this function
8 | });
9 | 
10 | 
11 | const {readFileSync,writeFileSync} = require('fs');
12 | 
13 | const mhtmlFileContents = readFileSync('./finance.mhtml'); // read file
14 | const files = p.parse(mhtmlFileContents) // parse file
15 |     .rewrite() // rewrite all links
16 |     .spit(); // return all content
17 | 
18 | console.log(files)
19 | 
20 | writeFileSync('./finance.json', JSON.stringify(files,null,2)); // write file
21 | 
22 | 
23 | // mkdir -p ./finance
24 | // const {join} = require('path');
25 | // const {mkdirSync} = require('fs');
26 | // mkdirSync('./finance',{recursive:true});
27 | 
28 | // files.forEach(({filename,content})=>{
--------------------------------------------------------------------------------
/src/mhtml/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "mhtml",
3 |   "version": "1.0.0",
4 |   "description": "",
5 |   "main": "index.js",
6 |   "scripts": {
7 |     "test": "echo \"Error: no test specified\" && exit 1"
8 |   },
9 |   "keywords": [],
"author": "", 11 | "license": "ISC", 12 | "dependencies": { 13 | "fast-mhtml": "^2.1.0", 14 | "playwright": "^1.38.1" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/mhtml/serve.js: -------------------------------------------------------------------------------- 1 | const {Processor} = require('fast-mhtml'); 2 | 3 | Processor.serve(8080) -------------------------------------------------------------------------------- /src/mhtml/serve_local_data.js: -------------------------------------------------------------------------------- 1 | const express = require('express'); 2 | const { Parser } = require("fast-mhtml"); 3 | const fs = require('fs'); 4 | 5 | const filenamify = require('filenamify'); 6 | 7 | const { join } = require('path'); 8 | const mhtmlDir = join(__dirname, '../../data/mind2web-mhtml'); 9 | // const mhtmlDir = join(__dirname, 'demos'); 10 | 11 | const sentinel = 'mind2web_local_serve:' 12 | 13 | const app = express(); 14 | const fileCache = new Map(); 15 | app.get('/:path', (req, res) => { 16 | 17 | const file = req.params.path; 18 | 19 | if (file.endsWith('mhtml')) { // main file 20 | fileCache.clear(); // empty cache 21 | 22 | let base = null; 23 | 24 | const parser = new Parser({ 25 | rewriteFn: (url) => { 26 | if(new URL(url,`http://localhost:${port}/`).protocol.startsWith('http')) { 27 | return url; 28 | } 29 | return sentinel+filenamify(url); 30 | } 31 | }); 32 | // const fp = promised(fs.readFile, `${mhtmlDir}/${file}`); 33 | const fp = fs.promises.readFile(`${mhtmlDir}/${file}`); 34 | fp.then((data) => parser.parse(data).rewrite().spit()).then((spitFiles) => { 35 | for (const result of spitFiles) { 36 | fileCache.set(result.filename.replace(/#.*/, ''), result); // remove hash and set in cache 37 | } 38 | res.setHeader('Content-Type', spitFiles[0].type); 39 | res.send(spitFiles[0].content); 40 | res.end(); 41 | }).catch((err) => { 42 | res.status(500); 43 | res.send(`Error: ${err}
41 |             res.send(`Error: ${err}<br/>${err.stack.replace(/\n/g, '<br/>')}`);
42 |             res.end();
43 |         });
44 |         return;
45 |     }
46 | 
47 |     // redirect to URL in path
48 |     if(!file.startsWith(sentinel) && (file.includes(".css") || file.includes(".js"))){
49 |         return res.redirect(file);
50 |     }
51 | 
52 |     const result = fileCache.get(file);
53 |     if (!result) {
54 |         res.status(404);
55 |         res.send(`MISS ${file} FROM ${JSON.stringify([...fileCache.keys()])}`);
56 |         res.end();
57 |         return;
58 |     }
59 |     res.setHeader('Content-Type', result.type);
60 |     res.send(result.content);
61 |     res.end();
62 | });
63 | 
64 | const port = 5002;
65 | app.listen(port,() => console.log('Listening on port '+port));
--------------------------------------------------------------------------------
/src/models/.gitignore:
--------------------------------------------------------------------------------
1 | **/*
2 | 
3 | !.gitignore
4 | 
--------------------------------------------------------------------------------
/src/secrets/.gitignore:
--------------------------------------------------------------------------------
1 | **/*
2 | !.gitignore
--------------------------------------------------------------------------------
/tests/testing.py:
--------------------------------------------------------------------------------
1 | from playwright.sync_api import sync_playwright
2 | 
3 | with sync_playwright() as p:
4 |     browser = p.chromium.launch()
5 |     page = browser.new_page()
6 |     page.goto(
7 |         "file:///Users/andrewstelmach/Desktop/llama2d/data/mind2web-mhtml/961c3a5e-f8ce-4c71-a917-aa546dcea7fb_before.mhtml"
8 |     )
9 |     # do something with the page...
10 |     browser.close()
11 | 
--------------------------------------------------------------------------------
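For reference, a minimal end-to-end sketch of how the pieces in src/llama2d/vision fit together. It skips the Google Cloud OCR step by building the Llama2dScreen by hand; the words, coordinates, prompt, and output below are invented for illustration, and Llama2dTokenizer() fetches the decapoda-research/llama-7b-hf tokenizer on first use:

    from llama2d.vision import Llama2dScreen, Llama2dTokenizer

    # build a fake "page" with two words placed in pixel space
    screen = Llama2dScreen(orig_text_dims=(1280.0, 1080.0))
    screen.push_word("Submit", xyxy=(100.0, 200.0, 180.0, 230.0))  # four corners
    screen.push_word("[1]", xy=(140.0, 215.0))                     # midpoint directly

    tokenizer = Llama2dTokenizer()
    batch = tokenizer.process(
        prompt="Click the submit button.",
        screen=screen,
        output="CLICK [1]",
    )
    # input_ids, coords, labels, and attention_mask are all padded/truncated
    # to MAX_SEQ_LEN, with (-1, -1) coords for tokens that have no position
    print(batch["input_ids"].shape, batch["coords"].shape)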