├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── community_research └── mozfest.pdf ├── data_analysis ├── README.md ├── comment-to-code-ratio │ ├── README.md │ ├── analysis_comments_ratio.ipynb │ └── text_extraction.py ├── decontamination │ ├── README.md │ ├── find_substrings.py │ ├── minhash.py │ ├── requirements.txt │ └── requirements_minhash.txt ├── github_issues_analysis │ ├── analysis.ipynb │ └── utils.py ├── kenlm │ ├── kenlm.ipynb │ ├── kenlm_analysis.ipynb │ └── setup.sh ├── mathjax │ └── mathjax.ipynb ├── near-deduplication │ ├── .gitignore │ ├── README.md │ ├── minhash_deduplication.py │ ├── minhash_deduplication_alt.py │ ├── minhash_deduplication_debug.py │ ├── near_deduplicate.py │ ├── requirements.txt │ └── requirements_alt.txt ├── notebooks │ ├── ScalingLaws.ipynb │ ├── ScalingLawsHE.ipynb │ ├── bigcode_pls.csv │ ├── chinchilla_analysis.ipynb │ ├── code_compilation.ipynb │ ├── data_filters.ipynb │ ├── embedding_clustering.ipynb │ ├── file_size_analysis.ipynb │ ├── loss_analysis.ipynb │ ├── new_extension_distribution.csv │ ├── stats.csv │ ├── unimax.ipynb │ └── utils.py ├── python_data_analysis │ ├── code_compilation │ │ ├── README.md │ │ ├── compile_py_files.py │ │ └── requirements.txt │ ├── config_test_estimation │ │ ├── README.md │ │ └── config_test.py │ └── nl_language_identification │ │ ├── README.md │ │ ├── analysis.ipynb │ │ ├── fasttext_model │ │ └── README.md │ │ ├── language_identifier.py │ │ ├── requirements.txt │ │ └── text_extraction.py └── stars_filtering │ ├── README.md │ └── stars_analysis.ipynb ├── evaluation └── README.md ├── multi_query_experiments ├── README.md ├── attention_types_imp.py ├── profile_attention_types.json ├── profile_attention_types.py ├── profile_attention_types_visualise.ipynb ├── profile_hf_generate.py └── profile_mqa.ipynb ├── requirements.txt └── tokenization └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | .vscode/ 163 | .trunk 164 | .DS_Store -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute to BigCode? 
2 | 3 | Everyone is welcome to contribute, and we value everybody's contribution. Code 4 | is thus not the only way to help the community. Answering questions, helping 5 | others, reaching out and improving the documentations are immensely valuable to 6 | the community. 7 | 8 | Whichever way you choose to contribute, please be mindful to respect our 9 | [code of conduct](https://bigcode-project.org/docs/about/code_of_conduct/). 10 | 11 | ## You can contribute in so many ways! 12 | 13 | There are 4 ways you can contribute to this repository: 14 | * Fixing outstanding issues with the existing code; 15 | * Implementing new models; 16 | * Contributing to the examples or to the documentation; 17 | * Submitting issues related to bugs or desired new features. 18 | 19 | *All are equally valuable to the community.* 20 | 21 | ## License 22 | 23 | Note that all contributions are licensed under Apache 2.0 by default. The 24 | Technical Steering Committee (TSC) may approve the use of an alternative 25 | license or licenses for inbound or outbound contributions on an exception basis. 26 | To request an exception, please describe the contribution, the alternative 27 | license, and the justification for using an alternative license for the 28 | described contribution. License exceptions must be approved by the TSC. 29 | Contributed files should contain license information indicating the open 30 | source license or licenses pertaining to the file. 31 | 32 | ## Submitting a new issue or feature request 33 | 34 | Do your best to follow these guidelines when submitting an issue or a feature 35 | request. It will make it easier for us to come back to you quickly and with good 36 | feedback. 37 | 38 | ### Did you find a bug? 39 | 40 | First, we would really appreciate it if you could **make sure the bug was not 41 | already reported** (use the search bar on Github under Issues). 42 | 43 | Did not find it? :( So we can act quickly on it, please follow these steps: 44 | 45 | * Include your **OS type and version**, the versions of **Python**, **PyTorch** and 46 | **Tensorflow** when applicable; 47 | * A short, self-contained, code snippet that allows us to reproduce the bug in 48 | less than 30s; 49 | * Provide the *full* traceback if an exception is raised. 50 | 51 | ### Do you want a new feature? 52 | 53 | A world-class feature request addresses the following points: 54 | 55 | 1. Motivation first: 56 | * Is it related to a problem/frustration with the current features? If so, please explain 57 | why. Providing a code snippet that demonstrates the problem is best. 58 | * Is it related to something you would need for a project? We'd love to hear 59 | about it! 60 | * Is it something you worked on and think could benefit the community? 61 | Awesome! Tell us what problem it solved for you. 62 | 2. Write a *full paragraph* describing the feature; 63 | 3. Provide a **code snippet** that demonstrates its future use; 64 | 4. In case this is related to a paper, please attach a link; 65 | 5. Attach any additional information (drawings, screenshots, etc.) you think may help. 66 | 67 | If your issue is well written we're already 80% of the way there by the time you 68 | post it. 69 | 70 | ## Start contributing! (Pull Requests) 71 | 72 | Before writing code, we strongly advise you to search through the existing PRs or 73 | issues to make sure that nobody is already working on the same thing. If you are 74 | unsure, it is always a good idea to open an issue to get some feedback. 
75 | 76 | You will need basic `git` proficiency to be able to contribute to 77 | BigCode. `git` is not the easiest tool to use but it has the greatest 78 | manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro 79 | Git](https://git-scm.com/book/en/v2) is a very good reference. 80 | 81 | Follow these steps to start contributing: 82 | 83 | 1. Fork the repository by 84 | clicking on the 'Fork' button on the repository's page. This creates a copy of the code 85 | under your GitHub user account. 86 | 87 | 2. Clone your fork to your local disk, and add the base repository as a remote: 88 | 89 | ```bash 90 | $ git clone git@github.com:/.git 91 | $ cd 92 | $ git remote add upstream https://github.com/bigcode-project/.git 93 | ``` 94 | 95 | 3. Create a new branch to hold your development changes: 96 | 97 | ```bash 98 | $ git checkout -b a-descriptive-name-for-my-changes 99 | ``` 100 | 101 | **Do not** work on the `main` branch. 102 | 103 | 4. Set up a development environment by running the following command in a virtual environment: 104 | 105 | ```bash 106 | $ pip install -r requirements.txt 107 | ``` 108 | 109 | 5. Develop the features on your branch. 110 | 111 | Once you're happy with your changes, add changed files using `git add` and 112 | make a commit with `git commit` to record your changes locally: 113 | 114 | ```bash 115 | $ git add modified_file.py 116 | $ git commit 117 | ``` 118 | 119 | Please write [good commit 120 | messages](https://chris.beams.io/posts/git-commit/). 121 | 122 | It is a good idea to sync your copy of the code with the original 123 | repository regularly. This way you can quickly account for changes: 124 | 125 | ```bash 126 | $ git fetch upstream 127 | $ git rebase upstream/main 128 | ``` 129 | 130 | Push the changes to your account using: 131 | 132 | ```bash 133 | $ git push -u origin a-descriptive-name-for-my-changes 134 | ``` 135 | 136 | 6. Once you are satisfied (**and the checklist below is happy too**), go to the 137 | webpage of your fork on GitHub. Click on 'Pull request' to send your changes 138 | to the project maintainers for review. 139 | 140 | 7. It's ok if maintainers ask you for changes. It happens to core contributors 141 | too! So everyone can see the changes in the Pull request, work in your local 142 | branch and push the changes to your fork. They will automatically appear in 143 | the pull request. 144 | 145 | 146 | ### Checklist 147 | 148 | 1. The title of your pull request should be a summary of its contribution; 149 | 2. If your pull request addresses an issue, please mention the issue number in 150 | the pull request description to make sure they are linked (and people 151 | consulting the issue know you are working on it); 152 | 3. To indicate a work in progress please prefix the title with `[WIP]`. These 153 | are useful to avoid duplicated work, and to differentiate it from PRs ready 154 | to be merged; 155 | 4. Make sure existing tests pass; 156 | 5. All public methods must have informative docstrings. 157 | 158 | ### Style guide 159 | 160 | For documentation strings, BigCode follows the [google style](https://google.github.io/styleguide/pyguide.html). 
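For illustration, here is a minimal sketch of what a Google-style docstring looks like (the function below is hypothetical and not part of this repository):

```python
def count_tokens(text: str, lowercase: bool = True) -> int:
    """Count whitespace-separated tokens in a string.

    Args:
        text: The input text to tokenize.
        lowercase: Whether to lowercase the text before counting.

    Returns:
        The number of whitespace-separated tokens in ``text``.

    Raises:
        TypeError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise TypeError("text must be a string")
    if lowercase:
        text = text.lower()
    return len(text.split())
```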
161 | 162 | **This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).** 163 | 164 | ### Develop on Windows 165 | 166 | On windows, you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings: 167 | 168 | `git config core.autocrlf input` 169 | 170 | One way one can run the make command on Window is to pass by MSYS2: 171 | 172 | 1. [Download MSYS2](https://www.msys2.org/), we assume to have it installed in C:\msys64 173 | 2. Open the command line C:\msys64\msys2.exe (it should be available from the start menu) 174 | 3. Run in the shell: `pacman -Syu` and install make with `pacman -S make` 175 | 4. Add `C:\msys64\usr\bin` to your PATH environment variable. 176 | 177 | You can now use `make` from any terminal (Powershell, cmd.exe, etc) 🎉 178 | 179 | ### Syncing forked main with upstream `main` 180 | 181 | To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs, 182 | when syncing the main branch of a forked repository, please, follow these steps: 183 | 1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked main. 184 | 2. If a PR is absolutely necessary, use the following steps after checking out your branch: 185 | ``` 186 | $ git checkout -b your-branch-for-syncing 187 | $ git pull --squash --no-commit upstream main 188 | $ git commit -m '' 189 | $ git push --set-upstream origin your-branch-for-syncing 190 | ``` 191 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # BigCode Analysis 3 | This repository is for the analysis done in BigCode Project. You can find analysis of datasets, models, architecture choices and more. 
4 | 5 | ## Contents 6 | * **Data analysis**: In the folder `data_analysis`, we provide code for data analysis: 7 | * Near deduplication 8 | * Python data analysis: 9 | * Natural language distribution in comments/docstrings 10 | * Data decontamination for HumanEval and MBPP benchmarks 11 | * Percentage of files that can be successfully compiled 12 | * Percentage of configuration and test files 13 | * Exploration of unimax sampling on The Stack 14 | We also provide notebooks with early data and model loss analysis. 15 | 16 | * **Multi-Query Attention experiments**, for details please refer to [multi_query_experiments/README.md](/multi_query_experiments/README.md) 17 | -------------------------------------------------------------------------------- /community_research/mozfest.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigcode-project/bigcode-analysis/e0b88d6cefa14e3b0d3fc5e3d6667e1fa1eb30ee/community_research/mozfest.pdf -------------------------------------------------------------------------------- /data_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Data analysis 2 | 3 | In this folder we provide code for analysis of code datasets: 4 | * Near deduplication using MinHash and LSH 5 | 6 | * Data decontamination from HumanEval and MBPP evaluation benchmarks 7 | 8 | * Python data analysis: 9 | * Natural language distribution in comments/docstrings 10 | * Detection of configuration and test files (also valid for languages other than Python) 11 | * Estimation of the number of files that can be successfully compiled 12 | 13 | * Comment to code ratio: analysis notebook for filtering based on the ratio of comments in a file. Filtering code available at [bigcode-dataset/preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing) 14 | 15 | * Stars filtering: analysis notebook for filtering based on the number of stars of files. Filtering code available at [bigcode-dataset/preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing) 16 | 17 | * PII Redaction: moved to [bigcode-dataset/pii](https://github.com/bigcode-project/bigcode-dataset/tree/main/pii) 18 | * PII detection of emails, IP addresses and secret keys 19 | * PII anonymization 20 | * Pipeline evaluation on an annotated benchmark 21 | 22 | * Preprocessing: moved to [bigcode-dataset/preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing) 23 | * Code for data filtering based on line length and percentage of alphanumeric characters, comment to code ratio and stars. 24 | 25 | -------------------------------------------------------------------------------- /data_analysis/comment-to-code-ratio/README.md: -------------------------------------------------------------------------------- 1 | # Filtering based on comment to code ratio 2 | 3 | Here we are interested in filtering files based on their comment to code ratio. We can expect files with a higher number of comments and docstrings to be of better quality. On the other hand, files where the majority of lines are comments may not be as useful for a code generation model. We filter with a minimum and maximum comment to code ratio, which is computed in the following way: 4 | * For Python, we extract comments using the Python tokenizer and docstrings using `ast` parsing. 5 | * For other languages (Java and JavaScript), we extract comments using the `pygments` library.
6 | * We compute the comment to code ratio of a file by counting the number of characters in comments over the total number of characters in the file. 7 | 8 | You can find clean filtering code in the `bigcode-dataset` repository under [preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing). 9 | * `analysis_comments_ratio.ipynb` contains the code for the analysis of the comment to code ratio filter, used to come up with minimum and maximum thresholds (0.01 and 0.8) for the Python, Java and JavaScript subsets of [The Stack](https://huggingface.co/datasets/bigcode/the-stack). -------------------------------------------------------------------------------- /data_analysis/comment-to-code-ratio/text_extraction.py: -------------------------------------------------------------------------------- 1 | """Extract Python comments (using the Python tokenizer) and docstrings (using AST parsing).""" 2 | 3 | import io 4 | from itertools import groupby 5 | from os.path import basename, splitext 6 | import ast 7 | import tokenize 8 | import warnings 9 | 10 | StringIO = io.StringIO 11 | 12 | NODE_TYPES = { 13 | ast.ClassDef: 'Class', 14 | ast.FunctionDef: 'Function/Method', 15 | ast.Module: 'Module' 16 | } 17 | 18 | # comment extraction 19 | def get_comments(s, clean=False): 20 | "Returns a string including all comments" 21 | comments = [] 22 | g = tokenize.generate_tokens(StringIO(s).readline) 23 | for toknum, tokval, _, _, _ in g: 24 | # print(toknum,tokval) 25 | if toknum == tokenize.COMMENT: 26 | comments.append((toknum, tokval)) 27 | result = tokenize.untokenize(comments) 28 | if clean: 29 | result = result.replace('#', '') 30 | return result 31 | 32 | # TODO: make sure extraction works well (e.g. with decorators over classes) 33 | # ast parsing, source: https://gist.github.com/SpotlightKid/1548cb6c97f2a844f72d 34 | def parse_docstrings(source): 35 | """Parse Python source code and yield a tuple of ast node instance, name, 36 | and docstring for each function/method, class and module.""" 37 | tree = ast.parse(source) 38 | 39 | for node in ast.walk(tree): 40 | if isinstance(node, tuple(NODE_TYPES)): 41 | docstring = ast.get_docstring(node) 42 | 43 | yield (node, getattr(node, 'name', None), docstring) 44 | 45 | def get_docstrings(source, module=''): 46 | """Parse Python source code from a file or string and return its docstrings.""" 47 | if hasattr(source, 'read'): 48 | filename = getattr(source, 'name', module) 49 | module = splitext(basename(filename))[0] 50 | source = source.read() 51 | 52 | docstrings = sorted(parse_docstrings(source), 53 | key=lambda x: (NODE_TYPES.get(type(x[0])), x[1])) 54 | 55 | grouped = groupby(docstrings, key=lambda x: NODE_TYPES.get(type(x[0]))) 56 | results = [] 57 | for _, group in grouped: 58 | for _, name, docstring in group: 59 | name = name if name else module 60 | #print(docstring or '') 61 | if docstring: 62 | results.append(docstring) 63 | return results 64 | 65 | def get_text(source, comments=True, clean_comments=True): 66 | """Extract all natural text in source: comments + docstrings. 67 | The extraction fails in case of syntax errors in the file. 68 | Args: 69 | source: the code to parse 70 | comments: if True, extract comments too 71 | clean_comments: if True, remove # from extracted comments 72 | Returns: 73 | a string with concatenated docstrings and comments""" 74 | 75 | try: 76 | docstrings = '\n'.join(get_docstrings(source)) 77 | except : 78 | docstrings = '' 79 | warnings.warn("code couldn't be parsed due to compilation failure, no docstring is extracted") 80 |
81 | if comments: 82 | try: 83 | comments = get_comments(source, clean=clean_comments) 84 | except : 85 | comments = '' 86 | warnings.warn("tokenization error, no comment is extracted") 87 | else: 88 | comments = '' 89 | 90 | output = docstrings + "\n\n" + comments 91 | return output.strip() -------------------------------------------------------------------------------- /data_analysis/decontamination/README.md: -------------------------------------------------------------------------------- 1 | # Decontamination 2 | 3 | This directory contains several scripts for decontamination of the data. 4 | 1. Exact prompt matching `find_substrings.py` 5 | 2. Near matching `minhash.py` 6 | 7 | ## Near Matching with MinHash and LSH 8 | 9 | This is similar to the near deduplication script `data_analysis/near-deduplication/minhash_deduplication_alt.py` with one modification: we use the benchmark datasets as the index source instead of the dataset itself. 10 | 11 | ### Usage: 12 | 1. Update the script to include any benchmark you want to check against in `DATASETS_TO_CHECK`. Be sure to create a global variable for the index using the same name in that config. Benchmark columns should be of type string or sequence of strings, so that they can be concatenated. 13 | 2. Then you can run the script: 14 | ```bash 15 | pip install -r requirements_minhash.txt 16 | # Quick example 17 | python minhash.py \ 18 | --dataset codeparrot/codeparrot-clean-valid \ 19 | --split train \ 20 | --column content \ 21 | --cache-dir .cache \ 22 | --verbose 23 | # Check parameters with the help message 24 | python minhash.py --help 25 | ``` -------------------------------------------------------------------------------- /data_analysis/decontamination/find_substrings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes a directory containing jsonl files as input. 3 | Filters out all samples that contain certain substrings.
4 | """ 5 | import sys 6 | import os 7 | import json 8 | import glob 9 | from tqdm import tqdm 10 | from multiprocessing import Pool 11 | 12 | from datasets import load_dataset 13 | 14 | 15 | # ========= data to filter out of the dataset ============ 16 | MBPP_PATH = "/data/mbpp/mbpp.jsonl" 17 | TEST_IDS = list(range(11, 511)) 18 | 19 | def mbpp_docstrings(): 20 | data = [] 21 | with open(MBPP_PATH) as f: 22 | for line in f: 23 | data.append(json.loads(line)) 24 | 25 | data = [sample for sample in data if sample["task_id"] in TEST_IDS] 26 | 27 | assert len(data) == 500 28 | 29 | # Checksum / version issues here 30 | # dataset = load_dataset("mbpp", split="test") 31 | 32 | return [sample["text"] for sample in data] 33 | 34 | 35 | def extract_docstring(prompt: str) -> str: 36 | if '"""' in prompt: 37 | if prompt.count('"""') == 2: 38 | return prompt.split('"""')[1].strip() 39 | elif prompt.count('"""') == 4: 40 | return prompt.split('"""')[3].strip() 41 | else: 42 | raise ValueError() 43 | elif '\'\'\'' in prompt: 44 | assert prompt.count('\'\'\'') == 2 45 | return prompt.split('\'\'\'')[1].strip() 46 | else: 47 | raise ValueError() 48 | 49 | 50 | def human_eval_docstrings(): 51 | ds = load_dataset("openai_humaneval", split="test") 52 | docstrings = [extract_docstring(v['prompt']) for v in ds] 53 | return docstrings 54 | 55 | FILTER_OUT = { 56 | "mbpp": mbpp_docstrings(), 57 | "human_eval": human_eval_docstrings() 58 | } 59 | # ============================================================ 60 | 61 | def add_dict(dict1: dict, dict2: dict) -> None: 62 | """ 63 | Add the values of dict2 to dict1. All values must be int, float or dictionaries that also verify this condition. 64 | Will modify dict1 and return None 65 | """ 66 | for key, value in dict2.items(): 67 | if isinstance(value, (int, float)): 68 | if key not in dict1: 69 | dict1[key] = 0 70 | dict1[key] += value 71 | elif isinstance(value, dict): 72 | if key not in dict1: 73 | dict1[key] = {} 74 | assert isinstance(dict1[key], dict) 75 | add_dict(dict1[key], value) 76 | else: 77 | raise ValueError(f"Invalid type for key/value {key}: {value}") 78 | 79 | def filter_file(data): 80 | """ 81 | Return True, None if the file should be included in the dataset. 
82 | Otherwise return False and some metadata about the file excluded 83 | """ 84 | content = data['content'].lower() 85 | # For each substring, try to find it in the file (case insensitive) 86 | for benchmark, substrings in FILTER_OUT.items(): 87 | for substring in substrings: 88 | if substring.lower() in content: 89 | return False, f"{benchmark}_match" 90 | 91 | # Return True, None if none of the substrings was found 92 | return True, None 93 | 94 | 95 | def _update_meta_dict(meta_dict, filter_reason): 96 | if filter_reason not in meta_dict: 97 | meta_dict[filter_reason] = 0 98 | meta_dict[filter_reason] += 1 99 | 100 | 101 | def filter_jsonl_file(args): 102 | """ 103 | Filter a given file and write the output to the disk 104 | """ 105 | 106 | file_name, write_to = args 107 | meta = f"{write_to}_meta" 108 | meta_dict = {} 109 | with open(file_name, "r") as f: 110 | with open(write_to, "w") as out: 111 | with open(meta, "w") as meta_file: 112 | for i, line in tqdm(enumerate(f)): 113 | data = json.loads(line) 114 | # Write line to output-file if filter has passed 115 | to_include, filter_reason = filter_file(data) 116 | if to_include: 117 | out.write(line) 118 | else: 119 | _update_meta_dict(meta_dict, filter_reason) 120 | # Dump meta dict 121 | meta_file.write(json.dumps(meta_dict)) 122 | meta_file.write("\n") 123 | 124 | 125 | def main(): 126 | num_processes = 64 127 | # The input directory containing the jsonl files 128 | input_dir = sys.argv[1] 129 | # Where to write worker files and output file 130 | output_dir = sys.argv[2] 131 | 132 | assert os.path.isdir(input_dir) 133 | 134 | tmp_files_dir = os.path.join(output_dir, "tmp") 135 | output_file = os.path.join(output_dir, "data.jsonl") 136 | os.makedirs(tmp_files_dir, exist_ok=True) 137 | 138 | # Process all the files in the input directory 139 | # Get the arguments for each worker 140 | files = glob.glob(f"{input_dir}/data_*.jsonl") 141 | filter_args = [(file, f"{tmp_files_dir}/{os.path.basename(file)}") for file in files] 142 | output_files = [arg[1] for arg in filter_args] 143 | 144 | # Process the files in parallel 145 | with Pool(num_processes) as p: 146 | for i, res in enumerate(p.imap(filter_jsonl_file, filter_args)): 147 | print(i, res) 148 | 149 | # Concatenate the outputs of all the workers into one big file 150 | with open(output_file, "w") as outfile: 151 | for fname in output_files: 152 | with open(fname) as f: 153 | for line in f: 154 | outfile.write(line) 155 | 156 | # compile meta 157 | meta = {} 158 | for fname in output_files: 159 | fmeta = json.load(open(f"{fname}_meta")) 160 | add_dict(meta, fmeta) 161 | with open(f"{output_file}_meta", "w") as outfile: 162 | json.dump(meta, outfile) 163 | 164 | 165 | if __name__ == "__main__": 166 | main() 167 | 168 | -------------------------------------------------------------------------------- /data_analysis/decontamination/minhash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author : Chenghao Mou (mouchenghao@gmail.com) 4 | # created : 10/21/2022 5 | from __future__ import annotations 6 | 7 | import glob 8 | import logging 9 | import multiprocessing 10 | import os 11 | import random 12 | import re 13 | import time 14 | from pathlib import Path 15 | from typing import Any, Dict, Set 16 | 17 | import pandas as pd 18 | 19 | multiprocessing.set_start_method("fork", force=True) 20 | 21 | import numpy as np 22 | import typer 23 | from datasets import Dataset, Features, Sequence, 
Value, concatenate_datasets, load_dataset, load_from_disk 24 | from datasketch import LeanMinHash, MinHash, MinHashLSH 25 | from rich.console import Console 26 | from rich.logging import RichHandler 27 | from tqdm import tqdm 28 | 29 | random.seed(42) 30 | MINHASH_SEED = 42 31 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 32 | console = Console() 33 | logger = logging.getLogger(__name__) 34 | logger.setLevel(logging.INFO) 35 | logger.addHandler(RichHandler(rich_tracebacks=True)) 36 | logger.propagate = False 37 | 38 | 39 | human_eval_lsh: MinHashLSH | None = None 40 | mbpp_lsh: MinHashLSH | None = None 41 | 42 | dup_ids: Set[int] = set() 43 | 44 | DATASETS_TO_CHECK = [ 45 | { 46 | "name": "openai_humaneval", 47 | "splits": ["test"], 48 | "columns": ["prompt", "canonical_solution", "test"], 49 | "codename": "human_eval", 50 | "index": "human_eval_lsh", # The same name as the global variable 51 | }, 52 | { 53 | "name": "mbpp", 54 | "splits": ["train", "validation", "test"], 55 | "columns": ["text", "code", "test_list"], 56 | "codename": "mbpp", 57 | "index": "mbpp_lsh", # The same name as the global variable 58 | }, 59 | ] 60 | 61 | 62 | def load_dataset_with_config(conf: Dict[str, Any]) -> Dataset: 63 | """ 64 | Load a dataset based on the configuration. Be careful about changing this function, 65 | as it is used for caching the intermediate results. 66 | 67 | Parameters 68 | ---------- 69 | conf : Dict[str, Any] 70 | The configuration. Mainly, there are three ways to load a dataset: 71 | 1. Directly from th ehub 72 | 2. From a local git repository 73 | 3. From a local dataset directory that was saved by `save_to_disk` before 74 | 75 | Returns 76 | ------- 77 | Dataset 78 | The loaded dataset. 79 | """ 80 | 81 | # Load from hub 82 | if not conf["lfs"]: 83 | ds = load_dataset( 84 | conf["dataset"], 85 | conf["config"], 86 | data_dir=conf["data_dir"], 87 | split=conf["split"], 88 | use_auth_token=True, 89 | cache_dir=conf["cache_dir"], 90 | ) 91 | # Or load from git lfs files 92 | elif not os.path.exists(conf["concat_output"]): 93 | datasets = [] 94 | # In practice, it might stuck here, you can hit Ctrl+C and run it again. 95 | for file in tqdm(sorted(glob.glob(conf["data_dir"] + "/*.jsonl")), desc="Loading datasets..."): 96 | datasets.append(load_dataset("json", data_files=file, split=conf["split"], cache_dir=conf["cache_dir"])) 97 | ds = concatenate_datasets(datasets) 98 | ds.save_to_disk(conf["concat_output"]) 99 | ds = load_from_disk(conf["concat_output"]) 100 | # Or load from the concatenated dataset 101 | else: 102 | ds = load_from_disk(conf["concat_output"]) 103 | 104 | ds = ds.map( 105 | lambda _, idx: {"__id__": idx}, 106 | with_indices=True, 107 | num_proc=os.cpu_count(), 108 | desc="Adding index...", 109 | ) 110 | 111 | return ds 112 | 113 | 114 | def embed_func(idx: int, content: str, *, num_perm: int) -> Dict[str, Any]: 115 | """ 116 | Embed the content of a record into a MinHash object. This function should be 117 | used with multiprocessing and it scales well with the number of cores. 118 | 119 | Parameters 120 | ---------- 121 | idx : int 122 | The index of the record. 123 | content : str 124 | The content to embed. 125 | num_perm : int 126 | The number of permutations to use in the MinHash object. 127 | seed : int 128 | The seed to use in the MinHash object. 129 | 130 | Returns 131 | ------- 132 | Dict[str, Any] 133 | The MinHash signature and the index of the record. 
134 | 135 | Examples 136 | -------- 137 | >>> result = embed_func(0, "Hello world!", num_perm=128) 138 | >>> result["__id__"] 139 | 0 140 | >>> result["__signature__"].shape 141 | (128,) 142 | >>> result["__signature__"].dtype 143 | dtype('uint64') 144 | """ 145 | m = MinHash(num_perm=num_perm, seed=MINHASH_SEED) 146 | m.update_batch([token.encode("utf-8") for token in {t for t in NON_ALPHA.split(content) if t}]) 147 | return {"__signature__": m.hashvalues, "__id__": idx} 148 | 149 | 150 | def query_func(idx: int, signature: np.ndarray, *, index: MinHashLSH) -> Dict[str, Any]: 151 | """ 152 | Query the MinHashLSH index for the record. This function can be used with multiprocessing 153 | as long as the index is shared across processes. 154 | 155 | Parameters 156 | ---------- 157 | index : MinHashLSH 158 | The MinHashLSH index. It is shared across all processes when using multiprocessing with fork without copy. 159 | record : Dict[str, Any] 160 | The record to query. 161 | 162 | Returns 163 | ------- 164 | Dict[str, Any] 165 | The query result. 166 | """ 167 | return { 168 | "__neighbors__": [ 169 | str(dup_idx) 170 | for dup_idx in index.query( 171 | LeanMinHash(seed=MINHASH_SEED, hashvalues=signature), 172 | ) 173 | ], 174 | "__id__": idx, 175 | } 176 | 177 | 178 | def jaccard_similarity(code1: str, code2: str) -> float: 179 | """ 180 | Calculate the jaccard similarity between two code snippets. 181 | 182 | Parameters 183 | ---------- 184 | code1 : str 185 | The first code snippet. 186 | code2 : str 187 | The second code snippet. 188 | 189 | Returns 190 | ------- 191 | float 192 | The jaccard similarity between the two code snippets. 193 | 194 | Examples 195 | -------- 196 | >>> jaccard_similarity("a = 1", "a = 2") 197 | 0.3333333333333333 198 | >>> jaccard_similarity("a = 1", "a = 1") 199 | 1.0 200 | """ 201 | tokens1 = set([t for t in NON_ALPHA.split(code1) if t.strip()]) 202 | tokens2 = set([t for t in NON_ALPHA.split(code2) if t.strip()]) 203 | return len(tokens1 & tokens2) / max(1, len(tokens1 | tokens2)) 204 | 205 | 206 | if __name__ == "__main__": 207 | 208 | def run( 209 | dataset: str = typer.Option("codeparrot/codeparrot-clean-valid", help="The dataset to use"), 210 | config: str = typer.Option("default", help="Dataset config"), 211 | data_dir: str = typer.Option(None, help="Dataset data directory"), 212 | split: str = typer.Option("train", help="Dataset split"), 213 | column: str = typer.Option("content", help="Dataset column"), 214 | cache_dir: str = typer.Option(".cache", help="Cache directory"), 215 | num_perm: int = typer.Option(128, help="Number of permutations"), 216 | seed: int = typer.Option(42, help="Random seed"), 217 | threshold: float = typer.Option(0.58, help="Minhash threshold"), 218 | verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose logging"), 219 | output: str = typer.Option(None, help="Store the deduplicated dataset"), 220 | lfs: bool = typer.Option(False, help="Use LFS files"), 221 | ): 222 | global dup_ids 223 | 224 | OUTPUT_BASE = Path("results") / dataset / config / (data_dir or "all") / split / column 225 | OUTPUT_BASE.mkdir(exist_ok=True, parents=True) 226 | output_concat = OUTPUT_BASE / "concat" 227 | output = output or (OUTPUT_BASE / "decontaminated") 228 | output_duplicates = OUTPUT_BASE / "duplicates" 229 | output_duplicate_results = OUTPUT_BASE / "duplicate_results.jsonl" 230 | logger.info(f"{'Output base':<30}: {OUTPUT_BASE}") 231 | logger.info(f"{'Output concat':<30}: {output_concat}") 232 | logger.info(f"{'Output 
duplicates':<30}: {output_duplicates}") 233 | logger.info(f"{'Output duplicate results':<30}: {output_duplicate_results}") 234 | logger.info(f"{'Output':<30}: {output}") 235 | 236 | conf = { 237 | "cache_dir": cache_dir, 238 | "num_perm": num_perm, 239 | "seed": seed, 240 | "threshold": threshold, 241 | "dataset": dataset, 242 | "config": config, 243 | "data_dir": data_dir, 244 | "split": split, 245 | "column": column, 246 | "verbose": verbose, 247 | "output": output, 248 | "lfs": lfs, 249 | "concat_output": output_concat, 250 | } 251 | 252 | time_measures = {} 253 | 254 | for benchmark in DATASETS_TO_CHECK: 255 | globals()[benchmark["index"]] = MinHashLSH( 256 | threshold=conf["threshold"], 257 | num_perm=conf["num_perm"], 258 | ) 259 | time_measures["load_dataset"] = time.time() 260 | ds = load_dataset_with_config(conf) 261 | time_measures["load_dataset"] = time.time() - time_measures["load_dataset"] 262 | DATA_SIZE = len(ds) 263 | start_time = time.time() 264 | 265 | embedded = ds.map( 266 | function=embed_func, 267 | fn_kwargs={"num_perm": conf["num_perm"]}, 268 | input_columns=["__id__", conf["column"]], 269 | remove_columns=[conf["column"]], 270 | num_proc=os.cpu_count(), 271 | desc=f"Fingerprinting...", 272 | ) 273 | 274 | duplicate_results = [] 275 | for _, benchmark in enumerate(DATASETS_TO_CHECK): 276 | benchmark_ds = concatenate_datasets( 277 | [ 278 | load_dataset(benchmark["name"], split=split, cache_dir=conf["cache_dir"]) 279 | for split in benchmark["splits"] 280 | ] 281 | ) 282 | benchmark_ds = benchmark_ds.map( 283 | function=lambda x, idx: { 284 | **embed_func( 285 | idx, 286 | " ".join( 287 | [x[col] if isinstance(x[col], str) else " ".join(x[col]) for col in benchmark["columns"]] 288 | ), 289 | num_perm=conf["num_perm"], 290 | ), 291 | "__content__": " ".join( 292 | [x[col] if isinstance(x[col], str) else " ".join(x[col]) for col in benchmark["columns"]] 293 | ), 294 | }, 295 | num_proc=os.cpu_count(), 296 | with_indices=True, 297 | desc=f"Fingerprinting...", 298 | ) 299 | with globals()[benchmark["index"]].insertion_session() as session: 300 | for record in benchmark_ds: 301 | session.insert(record["__id__"], LeanMinHash(seed=MINHASH_SEED, hashvalues=record["__signature__"])) 302 | 303 | queried = embedded.map( 304 | function=lambda x, y: query_func(x, y, index=globals()[benchmark["index"]]), 305 | num_proc=os.cpu_count(), 306 | input_columns=[ 307 | "__id__", 308 | "__signature__", 309 | ], 310 | remove_columns=["__signature__"], 311 | desc="Querying...", 312 | features=Features( 313 | { 314 | "__id__": Value("uint64"), 315 | "__neighbors__": Sequence(Value("string")), 316 | } 317 | ), 318 | ).filter( 319 | lambda x: len(x["__neighbors__"]) > 0, 320 | num_proc=os.cpu_count(), 321 | desc=f"Filtering...", 322 | ) 323 | 324 | for record in tqdm( 325 | queried, 326 | desc=f"Checking for false positives...", 327 | ): 328 | neighbors = set(record["__neighbors__"]) 329 | curr_text = ds[record["__id__"]][conf["column"]] 330 | for neighbor in neighbors: 331 | reference = benchmark_ds[int(neighbor)] 332 | reference_text = reference["__content__"] 333 | if jaccard_similarity(curr_text, reference_text) >= conf["threshold"]: 334 | break 335 | else: 336 | continue 337 | dup_ids.add(record["__id__"]) 338 | duplicate_results.append( 339 | { 340 | "original_record": ds[record["__id__"]], 341 | "duplicate_dataset": benchmark["name"], 342 | "duplicate_ids": [benchmark_ds[int(neighbor)] for neighbor in neighbors], 343 | } 344 | ) 345 | 346 | logger.info(f"Done querying false positives 
for {benchmark['name']}") 347 | 348 | if benchmark["name"] == "openai_humaneval": 349 | if "repository_name" not in ds.features or "path" not in ds.features: 350 | break 351 | logger.info("Checking HumanEval") 352 | KNOWN_PATH = "LaudateCorpus1/code-align-evals-data/human_eval" 353 | subset = ds.filter( 354 | lambda x: KNOWN_PATH in x["repository_name"] + "/" + x["path"], 355 | num_proc=os.cpu_count(), 356 | desc=f"Filtering for HumanEval...", 357 | ) 358 | # Find out the minimum maximum similarity 359 | thresholds = [] 360 | for record in subset: 361 | thresholds.append(0) 362 | for target in benchmark_ds: 363 | thresholds[-1] = max( 364 | thresholds[-1], jaccard_similarity(record[conf["column"]], target["__content__"]) 365 | ) 366 | 367 | logger.info(f"{'Minimum maximum similarity':<30}: {min(thresholds):.3f}") 368 | logger.info(f"{'Maximum maximum similarity':<30}: {max(thresholds):.3f}") 369 | logger.info(f"{'Mean maximum similarity':<30}: {np.mean(thresholds):.3f}") 370 | 371 | logger.info(f"Finished checking benchmark {benchmark['name']}") 372 | 373 | time_measures["total_processing_time"] = time.time() - start_time 374 | 375 | duplicates = ds.filter(lambda x: x["__id__"] in dup_ids, num_proc=os.cpu_count()) 376 | final_data = ds.filter( 377 | lambda idx: idx not in dup_ids, 378 | input_columns=["__id__"], 379 | num_proc=os.cpu_count(), 380 | desc="Filtering duplicates...", 381 | ) 382 | 383 | final_data.save_to_disk(output) 384 | duplicates.save_to_disk(output_duplicates) 385 | pd.DataFrame(duplicate_results).to_json(output_duplicate_results, lines=True, orient="records") 386 | 387 | FINAL_DATA_SIZE = len(final_data) 388 | DUP_SIZE = DATA_SIZE - FINAL_DATA_SIZE 389 | LAN = (data_dir or "all").split("/")[-1] 390 | 391 | logger.info(f"{'Language':<30}: {LAN}") 392 | logger.info(f"{'Data Number':<30}: {DATA_SIZE}") 393 | logger.info(f"{'Duplicate Number':<30}: {DUP_SIZE}") 394 | logger.info(f"{'Duplicate Rate':<30}: {DUP_SIZE / DATA_SIZE:.2%}") 395 | logger.info(f"{'Total Time':<30}: {time.time() - start_time:.2f} seconds") 396 | 397 | typer.run(run) 398 | -------------------------------------------------------------------------------- /data_analysis/decontamination/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | datasketch 3 | rich -------------------------------------------------------------------------------- /data_analysis/decontamination/requirements_minhash.txt: -------------------------------------------------------------------------------- 1 | networkit==10.0 2 | datasketch==1.5.8 3 | rich==12.6.0 4 | tqdm==4.64.1 5 | datasets==2.5.1 6 | typer==0.6.1 7 | tabulate==0.9.0 8 | dill==0.3.5.1 -------------------------------------------------------------------------------- /data_analysis/github_issues_analysis/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import datasets 4 | import regex 5 | import torch 6 | from transformers import pipeline 7 | 8 | GITHUB_EMAILS = [ 9 | re.compile(pattern, re.DOTALL) 10 | for pattern in [ 11 | "(.*)From:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)", 12 | "(.*)On.+notifications@github.com.+wrote:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)", 13 | "(.*)Signed-off-by: .+<.+>(.*?)\n?(.*)", 14 | ] 15 | ] 16 | GITHUB_EMAIL_DATE = re.compile("\d+/\d+/\d+ \d{2}:\d{2} [AP]M.+wrote") 17 | GITHUB_EMAIL_LINEBREAK = re.compile("_{20,}") 18 | 19 | 20 | BOT_AUTHORS = [ 21 | "Apache-HBase", 22 | 
"AutorestCI", 23 | "CLAassistant", 24 | "cmsbuild", 25 | "codecov-io", 26 | "codecov-commenter", 27 | "coveralls", 28 | "danger-public", 29 | "dnfclas", 30 | "msftclas", 31 | "PyDocTeur", 32 | "SparkQA", 33 | "karma-pr-reporter", 34 | "danger-public", 35 | "claassistantio", 36 | "probot-stale", 37 | ] 38 | 39 | BOT_KEYWORDS = ["[bot]", "botmanager", "bors-", "jenkins", "k8s-", "-test-", "travis"] 40 | 41 | BOT_SUFFIXES = [ 42 | "-automaton", 43 | "-automation", 44 | "-benchmark", 45 | "-build", 46 | "-deployer", 47 | "-cloud", 48 | "bot", 49 | "-ci", 50 | "-linter", 51 | "-teamcity", 52 | "-test", 53 | "-testing", 54 | "-Service-Account", 55 | ] 56 | 57 | 58 | def merge_text_columns(example): 59 | """Combines description and comment to one column (text) 60 | 61 | Descriptions are issue-level text (body of text when opening an issue), 62 | comments are replies to the parent issue or one of its comments. 63 | We merge them as an event cannot have both at the same time. 64 | """ 65 | events_new = [] 66 | text_columns = ["comment", "description"] 67 | for event_old in example["events"]: 68 | event_new = {k: v for k, v in event_old.items() if k not in text_columns} 69 | comment, description = event_old["comment"], event_old["description"] 70 | text = comment if comment else description 71 | event_new["text"] = text if text else "" 72 | events_new.append(event_new) 73 | example["events"] = events_new 74 | return example 75 | 76 | 77 | def _strip_automated_email_text(text): 78 | """Removes text auto-generated when users post in issues via email reply""" 79 | if text: 80 | text = text.strip() 81 | else: 82 | return "" 83 | # try to extract with regex directly 84 | for pattern in GITHUB_EMAILS: 85 | m = pattern.match(text) 86 | if m: 87 | break 88 | if m: 89 | text = m.group(1) + m.group(3) 90 | else: 91 | # if no exact matches, apply matching line by line and 92 | # get potential content before/after automated email text 93 | lines = text.split("\n") 94 | start, end = 0, -1 95 | for i, line in enumerate(lines): 96 | line = line.strip() 97 | if "notifications@github.com" in line or bool( 98 | GITHUB_EMAIL_DATE.search(line) 99 | ): 100 | start = i 101 | if "Reply to this email directly" in line: 102 | end = i + 1 if line.endswith(":") else i 103 | if line.startswith(">"): 104 | # remove quoted text in replies 105 | end = i 106 | text = "\n".join(lines[:start] + lines[end + 1 :]) 107 | # remove page break line 108 | return GITHUB_EMAIL_LINEBREAK.sub("", text).strip() 109 | 110 | 111 | def strip_automated_email_text(example): 112 | """Removes auto-generated text from emails in Github issues""" 113 | # assumes merge_text_columns() was already applied on dataset 114 | example["events"] = [ 115 | { 116 | k: _strip_automated_email_text(v) if k == "text" else v 117 | for k, v in event.items() 118 | } 119 | for event in example["events"] 120 | ] 121 | return example 122 | 123 | 124 | def remove_bot_comments(example): 125 | """Discard auto comments from issues based on author pattern matching""" 126 | filtered_events = [] 127 | modified = False 128 | for event in example["events"]: 129 | author = event["author"] 130 | # assumes single `text' field rather than comment/description 131 | is_bot = ( 132 | any(bp.lower() in author.lower() for bp in BOT_KEYWORDS) 133 | or any(author.lower().endswith(s) for s in BOT_SUFFIXES) 134 | or any(author == a for a in BOT_AUTHORS) 135 | ) 136 | if not is_bot: 137 | filtered_events.append(event) 138 | else: 139 | modified = True 140 | # example["old_events"] = 
example["events"] 141 | example["events"] = filtered_events 142 | example["bot_issue"] = len(example["events"]) == 0 143 | example["modified_by_bot"] = modified 144 | return example 145 | -------------------------------------------------------------------------------- /data_analysis/kenlm/kenlm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c62f3811-73a4-4de2-9800-84c148508838", 6 | "metadata": {}, 7 | "source": [ 8 | "## Install KenLM\n", 9 | "\n", 10 | "```bash\n", 11 | "sudo apt -y install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev\n", 12 | "wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz\n", 13 | "mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2\n", 14 | "ls kenlm/build/bin\n", 15 | "```" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "7a73b1c0-1349-48d6-930d-b6b1dd6e76cb", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from datasets import load_dataset" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "id": "4361baf4-75d2-4dbe-9dab-2a6bacdc3008", 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "application/vnd.jupyter.widget-view+json": { 37 | "model_id": "4382dfdd8a1542518b8f65f27d755bf9", 38 | "version_major": 2, 39 | "version_minor": 0 40 | }, 41 | "text/plain": [ 42 | "VBox(children=(HTML(value='
\"gram.arpa\"\n", 188 | "kenlm/build/bin/build_binary gram.arpa gram.binary\n", 189 | "```" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 84, 195 | "id": "88d82de8-7bf2-44ee-a9a2-18cf81b847a3", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "import kenlm\n", 200 | "model = kenlm.LanguageModel('./gram.binary')" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 85, 206 | "id": "3087cf00-acaf-4f15-a1be-63d7ba6f0763", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "-21.19799041748047" 213 | ] 214 | }, 215 | "execution_count": 85, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "model.score(\"this is a test\")" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "c2e4fda3-7e99-49d7-8a93-c5318c459bf2", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.7.12" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 5 254 | } 255 | -------------------------------------------------------------------------------- /data_analysis/kenlm/setup.sh: -------------------------------------------------------------------------------- 1 | sudo apt -y install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev 2 | wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz 3 | mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2 4 | cd kenlm 5 | python setup.py install -------------------------------------------------------------------------------- /data_analysis/near-deduplication/.gitignore: -------------------------------------------------------------------------------- 1 | results 2 | *log 3 | *json 4 | minhash_deduplication_alt_streaming.py 5 | dump.rdb 6 | data -------------------------------------------------------------------------------- /data_analysis/near-deduplication/README.md: -------------------------------------------------------------------------------- 1 | # Near deduplication 2 | ## For the lastest version of near-deduplication with speed-ups, check [bigcode-dataset/near_deduplication](https://github.com/bigcode-project/bigcode-dataset/tree/main/near_deduplication) 3 | Code for running near-deduplication with MinHash and LSH indexing 4 | 5 | ### Setup 6 | 7 | ```` 8 | pip install -r requirements.txt 9 | ```` 10 | 11 | Login to be able to push the dataset to the hub after deduplication and clone your huggingface-hub repositories: 12 | 13 | ```` 14 | huggingface-cli login 15 | ```` 16 | 17 | And make sure you have git-lfs installed. 18 | 19 | If you use datasets with different column names from the BigCode ones, you might need to change `PATH_COLUMN` and `CONTENT` variables in `minhash_deduplication.py`. 
20 | 21 | ### Usage 22 | 23 | To run near-deduplication, use the following command and adapt the arguments to your case: 24 | 25 | ```` 26 | python near_deduplicate.py \ 27 | --dataset_name bigcode-data/python_any_license_v2 \ 28 | --org bigcode-data \ 29 | --repo_name python_any_license_v2_near_dedup \ 30 | --out_path ./data/any_license-near-dedup \ 31 | --text_column content 32 | ```` 33 | 34 | To make a test run on a small subset of the data, set the `test_run` argument to True. 35 | 36 | The first time you load the dataset might be slow if it is large, but the data is saved in the cache thanks to `datasets`, so subsequent calls will be fast. 37 | 38 | ### Alternative Deduplication Script 39 | 40 | `minhash_deduplication_alt.py` is an alternative you might find useful as well. It is best suited to a single multi-core machine and uses similar parameters to the original deduplication script. 41 | 42 | ```bash 43 | pip install -r requirements_alt.txt 44 | # Quick example 45 | python minhash_deduplication_alt.py --dataset codeparrot/codeparrot-clean-valid \ 46 | --split train \ 47 | --column content \ 48 | --cache-dir .cache \ 49 | --verbose 50 | # For details on the arguments, see the help message 51 | python minhash_deduplication_alt.py --help 52 | ``` 53 | 54 | #### Implementation Analysis 55 | 56 | This analysis covers the alternative script, which is designed for a single-machine setup. 57 | 58 | ##### Scaling 59 | 60 | To understand the limitations of the current deduplication implementation, it helps to know how each step in the pipeline affects the overall time: 61 | 1. Minhashing is fast, but it takes longer for long documents. Hashing scales with both the number of cores and single-core performance (clock speed, for example). With `datasets`'s caching, it also does not require much memory. 62 | 2. Indexing is basically putting minhash signatures into different buckets. This is one bottleneck in the pipeline. In an ideal situation where MapReduce is seamlessly integrated with the other parts, it could be further improved with distributed buckets. 63 | 3. Depending on how you look at duplicates, querying can be done by iterating over the buckets or over the minhash signatures. 64 | 4. Depending on how you decide to group duplicates, you can build a graph and run connected-component analysis, or use a simple algorithm like union-find (see the sketch after this list). 65 | 5. What to do with a group of duplicates is also a wide-open question. We opt to keep one document per group/cluster in this case. 
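The sketch below is a minimal, self-contained illustration of steps 2–5 using toy MinHash signatures; it is not the project's implementation (see `minhash_deduplication_alt.py` further down for the real one). Each signature is split into bands, documents sharing a band land in the same bucket, buckets are merged with union-find, and one representative per cluster is kept.

```python
from collections import defaultdict

# Toy MinHash signatures; in the real pipeline these come from the fingerprinting step.
signatures = {
    0: [1, 7, 3, 9, 2, 8],
    1: [1, 7, 3, 9, 5, 8],   # shares two of three bands with doc 0
    2: [4, 4, 6, 0, 5, 5],   # shares no band with the others
    3: [1, 7, 3, 9, 2, 8],   # identical to doc 0
}
num_bands, rows_per_band = 3, 2  # num_bands * rows_per_band == signature length

# Step 2: indexing -- each band of a signature becomes a key in a hash table (bucket).
tables = [defaultdict(set) for _ in range(num_bands)]
for doc_id, sig in signatures.items():
    for b in range(num_bands):
        band = tuple(sig[b * rows_per_band:(b + 1) * rows_per_band])
        tables[b][band].add(doc_id)

# Step 4: grouping -- union-find over documents that share at least one bucket.
parent = {doc_id: doc_id for doc_id in signatures}

def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path halving
        x = parent[x]
    return x

def union(x, y):
    root_x, root_y = find(x), find(y)
    parent[max(root_x, root_y)] = min(root_x, root_y)

for table in tables:
    for bucket in table.values():
        smallest = min(bucket)
        for doc_id in bucket:
            union(doc_id, smallest)

# Step 5: keep one document per cluster (here, the one whose id is the cluster root).
kept = [doc_id for doc_id in signatures if find(doc_id) == doc_id]
print(kept)  # -> [0, 2]: docs 1 and 3 collapse into doc 0's cluster, doc 2 stays alone
```

Keeping the smallest index as the cluster representative mirrors the behaviour of the actual script, where `uf.union(x, min(cluster))` is applied per bucket and only records whose index equals their cluster root survive the final filter.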
66 | 67 | ##### Experiments 68 | 69 | We report here some stats on the experiments we did along the way with a 80-core machine on GCP (M1): 70 | 71 | For SantaCoder, our results can be replicated by the following commands: 72 | 73 | ```bash 74 | python minhash_deduplication_alt.py --dataset bigcode/the-stack-dedup-pjj --data-dir data/java --revision v1.1.a1 --cache-dir cache2 --ngram-size 5 --threshold 0.7 --min-token-length 10 --fast 75 | python minhash_deduplication_alt.py --dataset bigcode/the-stack-dedup-pjj --data-dir data/javascript --revision v1.1.a1 --cache-dir cache2 --ngram-size 5 --threshold 0.7 --min-token-length 10 --fast 76 | python minhash_deduplication_alt.py --dataset bigcode/the-stack-dedup-pjj --data-dir data/python --revision v1.1.a1 --cache-dir cache2 --ngram-size 5 --threshold 0.7 --min-token-length 10 --fast 77 | ``` 78 | 79 | Java Results as of Dec 20, 2022 80 | ``` 81 | load_dataset : 3414.68 seconds 82 | minhash : 22966.13 seconds 83 | clustering : 7676.72 seconds 84 | filtering : 1118.62 seconds 85 | save : 3105.66 seconds 86 | Data Number (before) : 40113161 87 | Data Number (after) : 21108567 (52.62%) 88 | Duplicate Number : 19004594 (47.38%) 89 | Total Time : 38281.88 seconds (10.6 hours) 90 | ``` 91 | 92 | 93 | Java (already deduplicated) Results as of Dec 2, 2022 94 | ``` 95 | Load Dataset : 77.18 seconds 96 | Embed : 5052.87 seconds 97 | Create Index : 16253.12 seconds 98 | Save Index : 0.00 seconds 99 | Freeze Memory : 0.00 seconds 100 | Query : 1321.61 seconds 101 | Save Neighbors : 0.00 seconds 102 | Unfreeze Memory : 0.00 seconds 103 | Clustering : 10825.30 seconds 104 | Total Processing Time : 34919.87 seconds 105 | Deduplicate : 605.83 seconds 106 | Save Deduplicated : 2356.10 seconds 107 | Language : java 108 | Data Number (before filtering) : 25124914 109 | Data Number (after filtering) : 24972491 110 | Duplicate Number : 4822205 (19.31%) 111 | Total Reduction : 4974628 (19.80%) 112 | Total Time : 37881.83 seconds (10.5 hours) 113 | ``` 114 | 115 | More details can be found on https://zippy-anise-556.notion.site/Deduplication-Log-d75d1b3f2e684e96a12b069c5aff68cb. 
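For reference, the percentages reported above follow directly from the raw counts (retention = after / before, duplicate rate = duplicates / before); a couple of lines reproduce the Dec 20 Java numbers:

```python
before, after = 40_113_161, 21_108_567
duplicates = before - after            # 19_004_594
print(f"{after / before:.2%}")         # 52.62% retained
print(f"{duplicates / before:.2%}")    # 47.38% duplicates
```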
116 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/minhash_deduplication.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing as mp 3 | import re 4 | from collections import defaultdict 5 | from functools import partial 6 | from typing import Dict, List, Optional, Set, Tuple, Type 7 | 8 | from datasets import Dataset 9 | from tqdm import tqdm 10 | 11 | from datasketch import MinHash, MinHashLSH 12 | from dpu_utils.utils.iterators import ThreadedIterator 13 | 14 | 15 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 16 | # parameters used in DuplicationIndex 17 | MIN_NUM_TOKENS = 10 18 | NUM_PERM = 256 19 | 20 | # column name of file paths, we add as file identifiers 21 | PATH_COLUMN = "original_path" 22 | # name of the "text" column used in deduplication 23 | CONTENT = "content" 24 | 25 | def get_min_hash(tokens: List[str]) -> Optional[MinHash]: 26 | """Compute the MinHash of a code snippet.""" 27 | if len(tokens) < MIN_NUM_TOKENS: 28 | return None 29 | min_hash = MinHash(num_perm=NUM_PERM) 30 | for token in set(tokens): 31 | min_hash.update(token.encode()) 32 | return min_hash 33 | 34 | 35 | def get_tokens(code: str) -> Set[str]: 36 | """Tokenize a code snippet.""" 37 | return set([t for t in NON_ALPHA.split(code) if len(t.strip()) > 0]) 38 | 39 | 40 | class DuplicationIndex: 41 | def __init__( 42 | self, 43 | *, 44 | duplication_jaccard_threshold: float = 0.85, 45 | ): 46 | self._duplication_jaccard_threshold = duplication_jaccard_threshold 47 | self._num_perm = NUM_PERM 48 | self._index = MinHashLSH(threshold=self._duplication_jaccard_threshold, num_perm=self._num_perm) 49 | 50 | self._duplicate_clusters = defaultdict(set) 51 | 52 | def add(self, code_key: Tuple, min_hash: MinHash) -> None: 53 | """Add a key to _index (MinHashLSH) 54 | the min_hash is used to query closest matches based on the jaccard_threshold. 55 | The new key is either added to a existing cluster of one close match, 56 | or a new cluster is created. The clusters created in this way, depend on the order of add. 57 | 58 | Args: 59 | code_key (Tuple of (index, repo_name, path)): 60 | Theoritically any hasbale key. Here we use a tuple to retrieve the information later. 61 | min_hash: MinHash of the code_key. 62 | """ 63 | close_duplicates = self._index.query(min_hash) 64 | if code_key in self._index.keys: 65 | print(f"Duplicate key {code_key}") 66 | return 67 | 68 | self._index.insert(code_key, min_hash) 69 | if len(close_duplicates) > 0: 70 | 71 | for base_duplicate in close_duplicates: 72 | if base_duplicate in self._duplicate_clusters: 73 | self._duplicate_clusters[base_duplicate].add(code_key) 74 | break 75 | else: 76 | self._duplicate_clusters[close_duplicates[0]].add(code_key) 77 | 78 | def get_duplicate_clusters(self) -> List[List[Dict]]: 79 | """Export the duplicate clusters. 80 | For each cluster, the first element is the base element of the cluster. 81 | The base element has an estimation jaccard similarity higher than the threshold with all the other elements. 82 | 83 | Returns: 84 | duplicate_clusters (List[List[Dict]]): 85 | List of duplicate clusters. 
86 | """ 87 | duplicate_clusters = [] 88 | for base, duplicates in self._duplicate_clusters.items(): 89 | cluster = [base] + list(duplicates) 90 | # reformat the cluster to be a list of dict 91 | cluster = [{"base_index": el[0], "original_path": el[1]} for el in cluster] 92 | duplicate_clusters.append(cluster) 93 | return duplicate_clusters 94 | 95 | def save(self, filepath) -> None: 96 | duplicate_clusters = self.get_duplicate_clusters() 97 | with open(filepath, "w") as f: 98 | json.dump(duplicate_clusters, f) 99 | 100 | 101 | def _compute_min_hash(element): 102 | index, data = element 103 | min_hash = get_min_hash([t for t in NON_ALPHA.split(data[CONTENT]) if len(t.strip()) > 0]) 104 | if min_hash is not None: 105 | return (index, data[PATH_COLUMN]), min_hash 106 | 107 | 108 | def minhash_iter(dataset_iterator: Type[Dataset]): 109 | with mp.Pool() as pool: 110 | for data in pool.imap_unordered( 111 | _compute_min_hash, 112 | ThreadedIterator(dataset_iterator, max_queue_size=10000), 113 | chunksize=100, 114 | ): 115 | if data is not None: 116 | yield data 117 | 118 | 119 | def make_duplicate_clusters(dataset_iterator: Type[Dataset], jaccard_threshold: float): 120 | """Find duplicate clusters in the dataset in two steps: 121 | 1. Compute MinHash for each code snippet. MinHash is a tool for fast jaccard similarity estimation. 122 | This step is computed using an asynchronous multiprocessing pool, minhash_iter 123 | 2. Find duplicate clusters. The computed MinHash is added sequentially to the DuplicationIndex. 124 | This step cannot be parallelized. So using asynchronous thread in the previous step helps to speed up the process. 125 | """ 126 | di = DuplicationIndex(duplication_jaccard_threshold=jaccard_threshold) 127 | 128 | for filename, min_hash in tqdm(ThreadedIterator(minhash_iter(enumerate(dataset_iterator)), max_queue_size=100)): 129 | di.add(filename, min_hash) 130 | 131 | # Returns a List[Cluster] where Cluster is List[str] with the filenames. 132 | return di.get_duplicate_clusters() 133 | 134 | 135 | def jaccard_similarity(code1: str, code2: str) -> float: 136 | """Compute the Jaccard similarity of two code snippets.""" 137 | tokens1 = get_tokens(code1) 138 | tokens2 = get_tokens(code2) 139 | return len(tokens1 & tokens2) / len(tokens1 | tokens2) 140 | 141 | 142 | _shared_dataset = None 143 | 144 | 145 | def _find_cluster_extremes_shared(cluster, jaccard_threshold): 146 | """Find a reduced cluster such that each code in the origin cluster is similar to at least one code in the reduced cluster. 147 | Two codes are similar if their Jaccard similarity is above the threshold. 148 | 149 | Args: 150 | cluster (List[dict]): 151 | cluster is a list of dict, each dict contains the following keys: 152 | - base_index 153 | - repo_name 154 | - path 155 | This is a typical output of DuplicationIndex.get_duplicate_clusters() 156 | jaccard_threshold (float): 157 | threshold for Jaccard similarity. 158 | Two codes are similar if their Jaccard similarity is above the threshold. 159 | 160 | Returns: 161 | extremes (List[dict]): 162 | A reduced representation of the cluster. The field copies is added to each dict. 163 | The copies field indicates the number of similar codes in the cluster for a extreme. 
164 | """ 165 | extremes = [] 166 | for element1 in cluster: 167 | code1 = _shared_dataset[element1["base_index"]][CONTENT] 168 | for element2 in extremes: 169 | code2 = _shared_dataset[element2["base_index"]][CONTENT] 170 | if jaccard_similarity(code1, code2) >= jaccard_threshold: 171 | element2["copies"] += 1 172 | break 173 | else: 174 | element1["copies"] = 1 175 | extremes.append(element1) 176 | return extremes 177 | 178 | 179 | def find_extremes(cluster_list, dataset, jaccard_threshold): 180 | """Call the _find_cluster_extremes_shared function in a parallel fashion. 181 | 182 | Args: 183 | cluster_list (List[List[Dict]]): 184 | each cluster is a list of dicts with the key base_index, 185 | referring to the index of the base code in the dataset. 186 | dataset (Type[Dataset]): 187 | dataset is used to access the content of the code snippets, 188 | using the base_index from the cluster_list. 189 | dataset is shared between all the processes using a glabal variable (any other way to share the dataset?), 190 | otherwise the multi processing is not speeded up. 191 | jaccard_threshold (float): 192 | the threshold for the jaccard similarity. The default value is 0.85 193 | 194 | Returns: 195 | extremes_list (List[Dict]): 196 | Each cluster is reduced to extremes. 197 | See _find_cluster_extremes_shared for the definition of extremes. 198 | """ 199 | global _shared_dataset 200 | _shared_dataset = dataset 201 | extremes_list = [] 202 | f = partial(_find_cluster_extremes_shared, jaccard_threshold=jaccard_threshold) 203 | with mp.Pool() as pool: 204 | for extremes in tqdm( 205 | pool.imap_unordered( 206 | f, 207 | cluster_list, 208 | ), 209 | total=len(cluster_list), 210 | ): 211 | extremes_list.append(extremes) 212 | return extremes_list 213 | 214 | 215 | def deduplicate_dataset( 216 | dataset: Type[Dataset], jaccard_threshold: float = 0.85 217 | ) -> Tuple[Type[Dataset], List[List[Dict]]]: 218 | """Deduplicate the dataset using minhash and jaccard similarity. 219 | This function first generate duplicate clusters, then each cluster 220 | is reduced to the extremes that are similar to the other elements in the cluster. 221 | Codes are called similar if their Jaccard similarity is greater than jaccard_threshold (0.85 default). 222 | 223 | Args: 224 | dataset (Type[Dataset]): 225 | The dataset to deduplicate. 226 | jaccard_threshold (float, default=0.85): 227 | jaccard threshold to determine if two codes are similar 228 | 229 | Returns: 230 | ds_dedup (Type[Dataset]): 231 | The deduplicated dataset. 232 | duplicate_clusters (List[List[Dict]]): 233 | The list of duplicate clusters. 234 | Each cluster is a list of dicts with the following keys: 235 | - base_index : int 236 | The index of the code in the original dataset. 237 | - repo_name : str 238 | - path : str 239 | - copies : int 240 | The number of copies of the code in the cluster. (find_cluster_extremes) 241 | - is_extreme : bool 242 | Whether the code is an extreme in the cluster. 243 | All the codes in the cluster are removed from the dataset except the extremes. 
244 | 245 | Example: 246 | >>> from datasets import load_dataset 247 | >>> from minhash_deduplication import deduplicate_dataset 248 | >>> ds = load_dataset("lvwerra/codeparrot-clean", split="train") 249 | >>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85) 250 | """ 251 | duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold) 252 | duplicate_indices = set(x["base_index"] for cluster in duplicate_clusters for x in cluster) 253 | extreme_dict = {} 254 | extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold) 255 | for extremes in extremes_clusters: 256 | for element in extremes: 257 | extreme_dict[element["base_index"]] = element 258 | remove_indices = duplicate_indices - set(extreme_dict.keys()) 259 | ds_filter = dataset.filter(lambda x, idx: idx not in remove_indices, with_indices=True) 260 | 261 | # update duplicate_clusters 262 | for cluster in duplicate_clusters: 263 | for element in cluster: 264 | element["is_extreme"] = element["base_index"] in extreme_dict 265 | if element["is_extreme"]: 266 | element["copies"] = extreme_dict[element["base_index"]]["copies"] 267 | 268 | print(f"Original dataset size: {len(dataset)}") 269 | print(f"Number of duplicate clusters: {len(duplicate_clusters)}") 270 | print(f"Files in duplicate cluster: {len(duplicate_indices)}") 271 | print(f"Unique files in duplicate cluster: {len(extreme_dict)}") 272 | print(f"Filtered dataset size: {len(ds_filter)}") 273 | 274 | return ds_filter, duplicate_clusters 275 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/minhash_deduplication_alt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author : Chenghao Mou (mouchenghao@gmail.com) 4 | # created : 10/4/22 5 | from __future__ import annotations 6 | 7 | import gc 8 | import hashlib 9 | import logging 10 | import multiprocessing as mp 11 | import os 12 | import random 13 | import re 14 | import struct 15 | import time 16 | import warnings 17 | from collections import defaultdict 18 | from itertools import tee 19 | from pathlib import Path 20 | from typing import Any 21 | from typing import Dict 22 | from typing import Iterable 23 | from typing import List 24 | from typing import Tuple 25 | 26 | with warnings.catch_warnings(): 27 | warnings.filterwarnings("ignore", category=FutureWarning) 28 | import datasets 29 | import numpy as np 30 | import typer 31 | from datasets import load_dataset 32 | from scipy.integrate import quad as integrate 33 | from tqdm import tqdm 34 | 35 | 36 | SEED = 42 37 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 38 | RNG = np.random.RandomState(SEED) 39 | MAX_HASH = np.uint64((1 << 32) - 1) 40 | MERSENNE_PRIME = np.uint64((1 << 61) - 1) 41 | logger = logging.getLogger(__name__) 42 | logger.setLevel(logging.INFO) 43 | datasets.logging.set_verbosity_error() 44 | 45 | 46 | def ngrams(sequence: List[str], n: int) -> Iterable: 47 | """ 48 | Directly taken from nltk package to avoid dependency. 49 | 50 | Parameters 51 | ---------- 52 | sequence : list 53 | The sequence of items to be n-grammed. 54 | n : int 55 | The order of the n-grams to be extracted. 56 | 57 | Returns 58 | ------- 59 | Iterable 60 | The n-grams generated from the sequence. 
61 | """ 62 | iterables = tee(sequence, n) 63 | for i, sub_iterable in enumerate(iterables): 64 | for _ in range(i): 65 | next(sub_iterable, None) 66 | return zip(*iterables) 67 | 68 | 69 | def sha1_hash32(data): 70 | """ 71 | Directly taken from datasketch package to avoid dependency. 72 | 73 | Parameters 74 | ---------- 75 | data : bytes 76 | 77 | Returns 78 | ------- 79 | int 80 | """ 81 | return struct.unpack(" Dict[str, Any]: 93 | """ 94 | Combined with some datasketch code to better parallelize computation. 95 | 96 | Parameters 97 | ---------- 98 | content : str 99 | The content to be embedded. 100 | idx : int 101 | The index of the content. 102 | num_perm : int 103 | The number of permutations. 104 | ngram_size : int 105 | The size of n-grams. 106 | hashranges : List[Tuple[int, int]] 107 | The ranges of hash values. 108 | permutations : np.ndarray 109 | The permutations for the minhash. 110 | 111 | Returns 112 | ------- 113 | Dict[str, Any] 114 | The hash values in each range and the index. 115 | """ 116 | hashvalues = np.ones(num_perm, dtype=np.uint64) * MAX_HASH 117 | tokens = {" ".join(t) for t in ngrams(NON_ALPHA.split(content), ngram_size)} 118 | hv = np.array([sha1_hash32(token.encode("utf-8")) for token in tokens], dtype=np.uint64) # noqa: E501 119 | a, b = permutations 120 | phv = np.bitwise_and(((hv * np.tile(a, (len(hv), 1)).T).T + b) % MERSENNE_PRIME, MAX_HASH) # noqa: E501 121 | hashvalues = np.vstack([phv, hashvalues]).min(axis=0) 122 | Hs = [bytes(hashvalues[start:end].byteswap().data) for start, end in hashranges] 123 | return {"__signatures__": Hs, "__id__": idx} 124 | 125 | 126 | def optimal_param( 127 | threshold: float, 128 | num_perm: int, 129 | false_positive_weight: float = 0.5, 130 | false_negative_weight: float = 0.5, 131 | ): 132 | """ 133 | Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum 134 | of probabilities of false positive and false negative, taken from datasketch. 135 | 136 | Parameters 137 | ---------- 138 | threshold : float 139 | The threshold for similarity. 140 | num_perm : int 141 | The number of permutations. 142 | false_positive_weight : float 143 | The weight of false positive. 144 | false_negative_weight : float 145 | The weight of false negative. 146 | 147 | Returns 148 | ------- 149 | Tuple[int, int] 150 | The optimal `b` and `r` parameters. 151 | The number of bands, and the number of rows per band respectively. 
152 | """ 153 | 154 | def false_positive_probability(threshold: float, b: int, r: int): 155 | """Source: `datasketch.lsh`""" 156 | 157 | def proba(s): 158 | return 1 - (1 - s ** float(r)) ** float(b) 159 | 160 | a, _ = integrate(proba, 0.0, threshold) 161 | return a 162 | 163 | def false_negative_probability(threshold: float, b: int, r: int): 164 | """Source: `datasketch.lsh`""" 165 | 166 | def proba(s): 167 | return 1 - (1 - (1 - s ** float(r)) ** float(b)) 168 | 169 | a, _ = integrate(proba, threshold, 1.0) 170 | return a 171 | 172 | min_error = float("inf") 173 | opt = (0, 0) 174 | for b in range(1, num_perm + 1): 175 | max_r = int(num_perm / b) 176 | for r in range(1, max_r + 1): 177 | fp = false_positive_probability(threshold, b, r) 178 | fn = false_negative_probability(threshold, b, r) 179 | error = fp * false_positive_weight + fn * false_negative_weight 180 | if error < min_error: 181 | min_error = error 182 | opt = (b, r) 183 | return opt 184 | 185 | 186 | class UnionFind: 187 | def __init__(self): 188 | self.parent: Dict[int, int] = {} 189 | 190 | def find(self, x): 191 | if x not in self.parent: 192 | self.parent[x] = x 193 | if self.parent[x] != x: 194 | self.parent[x] = self.find(self.parent[x]) 195 | return self.parent[x] 196 | 197 | def union(self, x, y): 198 | px = self.find(x) 199 | py = self.find(y) 200 | self.parent[px] = self.parent[py] = min(px, py) 201 | 202 | 203 | if __name__ == "__main__": 204 | 205 | def run( 206 | dataset: str = typer.Option("codeparrot/codeparrot-clean-valid", help="The dataset to use"), # noqa: E501 207 | config: str = typer.Option("default", help="Dataset config"), 208 | split: str = typer.Option("train", help="Dataset split"), 209 | data_dir: str = typer.Option(None, help="Dataset data directory"), 210 | revision: str = typer.Option("main", help="Dataset revision"), 211 | column: str = typer.Option("content", help="Dataset column"), 212 | cache_dir: str = typer.Option(".cache", help="Cache directory"), 213 | ngram_size: int = typer.Option(5, help="The ngram size to use for MinHash"), 214 | num_perm: int = typer.Option(256, help="Number of permutations"), 215 | threshold: float = typer.Option(0.7, help="Minhash threshold"), 216 | output: str = typer.Option(None, help="Store the deduplicated dataset"), 217 | ): 218 | global uf 219 | OUTPUT_BASE = Path(output or "output") 220 | OUTPUT_BASE.mkdir(exist_ok=True, parents=True) 221 | output = OUTPUT_BASE / "deduplicated" 222 | 223 | logging.basicConfig(level=logging.INFO) 224 | 225 | time_measures = {} 226 | start_time = time.time() 227 | 228 | B, R = optimal_param(threshold, num_perm) 229 | HASH_RANGES = [(i * R, (i + 1) * R) for i in range(B)] 230 | HASH_TABLES = [defaultdict(set) for _ in range(B)] 231 | 232 | time_measures["load_dataset"] = time.time() 233 | ds = load_dataset( 234 | dataset, 235 | config, 236 | data_dir=data_dir, 237 | split=split, 238 | use_auth_token=True, 239 | cache_dir=cache_dir, 240 | revision=revision, 241 | num_proc=os.cpu_count(), 242 | ) 243 | time_measures["load_dataset"] = time.time() - time_measures["load_dataset"] 244 | DATA_SIZE = len(ds) 245 | PERMUTATIONS = np.array( 246 | [ 247 | ( 248 | RNG.randint(1, MERSENNE_PRIME, dtype=np.uint64), 249 | RNG.randint(0, MERSENNE_PRIME, dtype=np.uint64), 250 | ) 251 | for _ in range(num_perm) 252 | ], 253 | dtype=np.uint64, 254 | ).T 255 | 256 | time_measures["minhash"] = time.time() 257 | embedded = ds.map( 258 | function=embed_func, 259 | fn_kwargs={ 260 | "num_perm": num_perm, 261 | "hashranges": HASH_RANGES, 262 | 
"ngram_size": ngram_size, 263 | "permutations": PERMUTATIONS, 264 | }, 265 | input_columns=[column], 266 | remove_columns=ds.column_names, 267 | num_proc=os.cpu_count(), 268 | with_indices=True, 269 | desc="Fingerprinting...", 270 | ) 271 | time_measures["minhash"] = time.time() - time_measures["minhash"] 272 | 273 | time_measures["clustering"] = time.time() 274 | batch_size: int = 10000 275 | for i in tqdm( 276 | range(0, len(embedded), batch_size), dynamic_ncols=True, desc="Iterating MinHashes..." # noqa: E501 277 | ): 278 | batch = embedded[i : i + batch_size] 279 | for key, Hs in zip(batch["__id__"], batch["__signatures__"]): 280 | for H, hashtable in zip(Hs, HASH_TABLES): 281 | hashtable[H].add(key) 282 | for table in tqdm(HASH_TABLES, dynamic_ncols=True, desc="Clustering..."): 283 | for cluster in table.values(): 284 | if len(cluster) <= 1: 285 | continue 286 | idx = min(cluster) 287 | for x in cluster: 288 | uf.union(x, idx) 289 | time_measures["clustering"] = time.time() - time_measures["clustering"] 290 | 291 | time_measures["filtering"] = time.time() 292 | gc.freeze() 293 | gc.disable() 294 | ds = ds.map( 295 | function=lambda _, idx: {"__cluster__": uf.find(idx)}, 296 | with_indices=True, 297 | num_proc=os.cpu_count(), 298 | new_fingerprint=str(random.getrandbits(128)), 299 | desc="Finding clusters...", 300 | ) 301 | gc.enable() 302 | gc.collect() 303 | # This is where the deduplication happens 304 | # Since there is no easy groupby in datasets 305 | # I will use this simple filter for now 306 | final_data = ds.filter( 307 | function=lambda record, idx: record["__cluster__"] == idx, 308 | with_indices=True, 309 | num_proc=os.cpu_count(), 310 | desc="Filtering clusters...", 311 | ) 312 | time_measures["filtering"] = time.time() - time_measures["filtering"] 313 | 314 | time_measures["save"] = time.time() 315 | final_data = final_data.remove_columns(["__cluster__"]) 316 | final_data.save_to_disk(output) 317 | time_measures["save"] = time.time() - time_measures["save"] 318 | 319 | FINAL_DATA_SIZE = len(final_data) 320 | DUP_SIZE = DATA_SIZE - FINAL_DATA_SIZE 321 | PAD = 32 322 | 323 | for key, value in time_measures.items(): 324 | logger.info(f"{key:<{PAD}}: {value:.2f} seconds") 325 | logger.info(f"{'Data Number (before)':<{PAD}}: {DATA_SIZE}") 326 | logger.info( 327 | f"{'Data Number (after)':<{PAD}}: {FINAL_DATA_SIZE} ({FINAL_DATA_SIZE / DATA_SIZE:.2%})" # noqa: E501 328 | ) 329 | logger.info(f"{'Duplicate Number':<{PAD}}: {DUP_SIZE} ({DUP_SIZE / DATA_SIZE:.2%})") # noqa: E501 330 | logger.info(f"{'Total Time':<{PAD}}: {time.time() - start_time:.2f} seconds") 331 | logger.info(f"{'Deduplicated Dataset':<{PAD}}: {output}") 332 | logger.info("🤗 Happy Deduplicating 🤗") 333 | 334 | mp.set_start_method("fork", force=True) 335 | uf = UnionFind() 336 | typer.run(run) 337 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/minhash_deduplication_debug.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing as mp 3 | import re 4 | import time 5 | from collections import defaultdict 6 | from functools import partial 7 | from typing import Dict, List, Optional, Set, Tuple, Type 8 | import numpy as np 9 | 10 | from datasets import Dataset 11 | from tqdm import tqdm 12 | 13 | from datasketch import MinHash, MinHashLSH 14 | from dpu_utils.utils.iterators import ThreadedIterator 15 | 16 | 17 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 18 | # parameters used in 
DuplicationIndex 19 | MIN_NUM_TOKENS = 10 20 | NUM_PERM = 256 21 | 22 | # column name of file paths, we add as file identifiers 23 | PATH_COLUMN = "path" 24 | # name of the "text" column used in deduplication 25 | CONTENT = "content" 26 | 27 | def get_min_hash(tokens: List[str]) -> Optional[MinHash]: 28 | """Compute the MinHash of a code snippet.""" 29 | if len(tokens) < MIN_NUM_TOKENS: 30 | return None 31 | min_hash = MinHash(num_perm=NUM_PERM) 32 | for token in set(tokens): 33 | min_hash.update(token.encode()) 34 | return min_hash 35 | 36 | 37 | def get_tokens(code: str) -> Set[str]: 38 | """Tokenize a code snippet.""" 39 | return set([t for t in NON_ALPHA.split(code) if len(t.strip()) > 0]) 40 | 41 | 42 | class DuplicationIndex: 43 | def __init__( 44 | self, 45 | *, 46 | duplication_jaccard_threshold: float = 0.85, 47 | ): 48 | self._duplication_jaccard_threshold = duplication_jaccard_threshold 49 | self._num_perm = NUM_PERM 50 | self._index = MinHashLSH(threshold=self._duplication_jaccard_threshold, num_perm=self._num_perm) 51 | 52 | self._duplicate_clusters = defaultdict(set) 53 | 54 | def add(self, code_key: Tuple, min_hash: MinHash) -> None: 55 | """Add a key to _index (MinHashLSH) 56 | the min_hash is used to query closest matches based on the jaccard_threshold. 57 | The new key is either added to a existing cluster of one close match, 58 | or a new cluster is created. The clusters created in this way, depend on the order of add. 59 | 60 | Args: 61 | code_key (Tuple of (index, repo_name, path)): 62 | Theoritically any hasbale key. Here we use a tuple to retrieve the information later. 63 | min_hash: MinHash of the code_key. 64 | """ 65 | close_duplicates = self._index.query(min_hash) 66 | if code_key in self._index.keys: 67 | print(f"Duplicate key {code_key}") 68 | return 69 | 70 | self._index.insert(code_key, min_hash) 71 | if len(close_duplicates) > 0: 72 | 73 | for base_duplicate in close_duplicates: 74 | if base_duplicate in self._duplicate_clusters: 75 | self._duplicate_clusters[base_duplicate].add(code_key) 76 | break 77 | else: 78 | self._duplicate_clusters[close_duplicates[0]].add(code_key) 79 | 80 | def get_duplicate_clusters(self) -> List[List[Dict]]: 81 | """Export the duplicate clusters. 82 | For each cluster, the first element is the base element of the cluster. 83 | The base element has an estimation jaccard similarity higher than the threshold with all the other elements. 84 | 85 | Returns: 86 | duplicate_clusters (List[List[Dict]]): 87 | List of duplicate clusters. 
88 | """ 89 | duplicate_clusters = [] 90 | for base, duplicates in self._duplicate_clusters.items(): 91 | cluster = [base] + list(duplicates) 92 | # reformat the cluster to be a list of dict 93 | cluster = [{"base_index": el[0], "original_path": el[1]} for el in cluster] 94 | duplicate_clusters.append(cluster) 95 | return duplicate_clusters 96 | 97 | def save(self, filepath) -> None: 98 | duplicate_clusters = self.get_duplicate_clusters() 99 | with open(filepath, "w") as f: 100 | json.dump(duplicate_clusters, f) 101 | 102 | 103 | def _compute_min_hash(element): 104 | index, data = element 105 | min_hash = get_min_hash([t for t in NON_ALPHA.split(data[CONTENT]) if len(t.strip()) > 0]) 106 | if min_hash is not None: 107 | return (index, data[PATH_COLUMN]), min_hash 108 | 109 | 110 | def minhash_iter(dataset_iterator: Type[Dataset]): 111 | # computing minhah hash of the samples in dataset iterator in parallel 112 | with mp.Pool() as pool: 113 | for data in pool.imap_unordered( 114 | _compute_min_hash, 115 | ThreadedIterator(dataset_iterator, max_queue_size=10000), 116 | chunksize=100, 117 | ): 118 | if data is not None: 119 | yield data 120 | 121 | 122 | def make_duplicate_clusters(dataset_iterator: Type[Dataset], jaccard_threshold: float): 123 | """Find duplicate clusters in the dataset in two steps: 124 | 1. Compute MinHash for each code snippet. MinHash is a tool for fast jaccard similarity estimation. 125 | This step is computed using an asynchronous multiprocessing pool, minhash_iter 126 | 2. Find duplicate clusters. The computed MinHash is added sequentially to the DuplicationIndex. 127 | This step cannot be parallelized. So using asynchronous thread in the previous step helps to speed up the process. 128 | """ 129 | di = DuplicationIndex(duplication_jaccard_threshold=jaccard_threshold) 130 | 131 | print("\ncomputing minhashes") 132 | t_start = time.time() 133 | hashes = [] 134 | for filename, min_hash in tqdm(ThreadedIterator(minhash_iter(enumerate(dataset_iterator)), max_queue_size=100)): 135 | hashes.append((filename, min_hash)) 136 | print(f"minhashes computed in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 137 | 138 | print("\nbuilding clusters") 139 | t_start = time.time() 140 | for filename, min_hash in tqdm(hashes): 141 | di.add(filename, min_hash) 142 | print(f"clusters built in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 143 | 144 | # Returns a List[Cluster] where Cluster is List[str] with the filenames. 
145 | print("\nexporting the clusters") 146 | t_start = time.time() 147 | clusters = di.get_duplicate_clusters() 148 | print(f"clusters exported in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 149 | 150 | stats = [len(cluster) for cluster in clusters] 151 | print(f"max, min, mean and median of the cluster sizes: {max(stats)}, {min(stats)}, {np.mean(stats)}, {np.median(stats)}") 152 | print("saving the clusters list and stats") 153 | with open("./clusters_list.json", "w") as fp: 154 | json.dump(clusters, fp) 155 | with open("./clusters_stats.json", "w") as fp: 156 | json.dump(stats, fp) 157 | 158 | return clusters 159 | 160 | 161 | def jaccard_similarity(code1: str, code2: str) -> float: 162 | """Compute the Jaccard similarity of two code snippets.""" 163 | tokens1 = get_tokens(code1) 164 | tokens2 = get_tokens(code2) 165 | return len(tokens1 & tokens2) / len(tokens1 | tokens2) 166 | 167 | 168 | _shared_dataset = None 169 | 170 | 171 | def _find_cluster_extremes_shared(cluster, jaccard_threshold): 172 | """Find a reduced cluster such that each code in the origin cluster is similar to at least one code in the reduced cluster. 173 | Two codes are similar if their Jaccard similarity is above the threshold. 174 | 175 | Args: 176 | cluster (List[dict]): 177 | cluster is a list of dict, each dict contains the following keys: 178 | - base_index 179 | - repo_name 180 | - path 181 | This is a typical output of DuplicationIndex.get_duplicate_clusters() 182 | jaccard_threshold (float): 183 | threshold for Jaccard similarity. 184 | Two codes are similar if their Jaccard similarity is above the threshold. 185 | 186 | Returns: 187 | extremes (List[dict]): 188 | A reduced representation of the cluster. The field copies is added to each dict. 189 | The copies field indicates the number of similar codes in the cluster for a extreme. 190 | """ 191 | extremes = [] 192 | for element1 in cluster: 193 | code1 = _shared_dataset[element1["base_index"]][CONTENT] 194 | for element2 in extremes: 195 | code2 = _shared_dataset[element2["base_index"]][CONTENT] 196 | if jaccard_similarity(code1, code2) >= jaccard_threshold: 197 | element2["copies"] += 1 198 | break 199 | else: 200 | element1["copies"] = 1 201 | extremes.append(element1) 202 | return extremes 203 | 204 | 205 | def find_extremes(cluster_list, dataset, jaccard_threshold): 206 | """Call the _find_cluster_extremes_shared function in a parallel fashion. 207 | 208 | Args: 209 | cluster_list (List[List[Dict]]): 210 | each cluster is a list of dicts with the key base_index, 211 | referring to the index of the base code in the dataset. 212 | dataset (Type[Dataset]): 213 | dataset is used to access the content of the code snippets, 214 | using the base_index from the cluster_list. 215 | dataset is shared between all the processes using a glabal variable (any other way to share the dataset?), 216 | otherwise the multi processing is not speeded up. 217 | jaccard_threshold (float): 218 | the threshold for the jaccard similarity. The default value is 0.85 219 | 220 | Returns: 221 | extremes_list (List[Dict]): 222 | Each cluster is reduced to extremes. 223 | See _find_cluster_extremes_shared for the definition of extremes. 
224 | """ 225 | global _shared_dataset 226 | _shared_dataset = dataset 227 | extremes_list = [] 228 | f = partial(_find_cluster_extremes_shared, jaccard_threshold=jaccard_threshold) 229 | with mp.Pool() as pool: 230 | for extremes in tqdm( 231 | pool.imap_unordered( 232 | f, 233 | cluster_list, 234 | ), 235 | total=len(cluster_list), 236 | ): 237 | extremes_list.append(extremes) 238 | return extremes_list 239 | 240 | 241 | def deduplicate_dataset( 242 | dataset: Type[Dataset], jaccard_threshold: float = 0.85 243 | ) -> Tuple[Type[Dataset], List[List[Dict]]]: 244 | """Deduplicate the dataset using minhash and jaccard similarity. 245 | This function first generate duplicate clusters, then each cluster 246 | is reduced to the extremes that are similar to the other elements in the cluster. 247 | Codes are called similar if their Jaccard similarity is greater than jaccard_threshold (0.85 default). 248 | 249 | Args: 250 | dataset (Type[Dataset]): 251 | The dataset to deduplicate. 252 | jaccard_threshold (float, default=0.85): 253 | jaccard threshold to determine if two codes are similar 254 | 255 | Returns: 256 | ds_dedup (Type[Dataset]): 257 | The deduplicated dataset. 258 | duplicate_clusters (List[List[Dict]]): 259 | The list of duplicate clusters. 260 | Each cluster is a list of dicts with the following keys: 261 | - base_index : int 262 | The index of the code in the original dataset. 263 | - repo_name : str 264 | - path : str 265 | - copies : int 266 | The number of copies of the code in the cluster. (find_cluster_extremes) 267 | - is_extreme : bool 268 | Whether the code is an extreme in the cluster. 269 | All the codes in the cluster are removed from the dataset except the extremes. 270 | 271 | Example: 272 | >>> from datasets import load_dataset 273 | >>> from minhash_deduplication import deduplicate_dataset 274 | >>> ds = load_dataset("lvwerra/codeparrot-clean", split="train") 275 | >>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85) 276 | """ 277 | duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold) 278 | print("MinHash computation done and cluster info saved") 279 | 280 | duplicate_indices = set(x["base_index"] for cluster in duplicate_clusters for x in cluster) 281 | print("\nComputing extremes for all clusters") 282 | extreme_dict = {} 283 | t_start = time.time() 284 | extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold) 285 | print(f"Extremes found in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 286 | 287 | for extremes in extremes_clusters: 288 | for element in extremes: 289 | extreme_dict[element["base_index"]] = element 290 | remove_indices = duplicate_indices - set(extreme_dict.keys()) 291 | print("\nnow filtering the duplicates(extremes) from the dataset") 292 | t_start = time.time() 293 | ds_filter = dataset.filter(lambda x, idx: idx not in remove_indices, with_indices=True) 294 | print(f"dataset filtered in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 295 | 296 | t_start = time.time() 297 | # update duplicate_clusters 298 | for cluster in duplicate_clusters: 299 | for element in cluster: 300 | element["is_extreme"] = element["base_index"] in extreme_dict 301 | if element["is_extreme"]: 302 | element["copies"] = extreme_dict[element["base_index"]]["copies"] 303 | print(f"clusters (for analysis) updated in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 304 | 305 | print(f"Original dataset size: {len(dataset)}") 306 | print(f"Number 
of duplicate clusters: {len(duplicate_clusters)}") 307 | print(f"Files in duplicate cluster: {len(duplicate_indices)}") 308 | print(f"Unique files in duplicate cluster: {len(extreme_dict)}") 309 | print(f"Filtered dataset size: {len(ds_filter)}") 310 | 311 | return ds_filter, duplicate_clusters 312 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/near_deduplicate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | from pathlib import Path 5 | import re 6 | from huggingface_hub import Repository 7 | from multiprocessing import Pool 8 | from tqdm import tqdm 9 | from argparse import Namespace, ArgumentParser 10 | 11 | from datasets import load_dataset 12 | 13 | from minhash_deduplication import deduplicate_dataset 14 | 15 | 16 | def parse_args(): 17 | parser = ArgumentParser(description='near deduplication') 18 | parser.add_argument( 19 | "--dataset_name", 20 | default="bigcode-data/python_any_license_v2", 21 | type=str, 22 | help="dataset to deduplicate, path to HF repo or local path", 23 | ) 24 | parser.add_argument( 25 | "--text_column", 26 | default="content", 27 | type=str, 28 | help="column name of the text to dedulicate", 29 | ) 30 | parser.add_argument( 31 | "--jaccard_threshold", 32 | default=0.85, 33 | type=float, 34 | help="Jaccard similarity threshold", 35 | ) 36 | # we save data locally before pushing to the Hub to avoid any issues 37 | # the remote HF repo where we want the new data is cloned inside a folder out_path 38 | # and the data is saved inside 39 | parser.add_argument( 40 | "--repo_name", 41 | default="python_any_license_v2_near_dedup", 42 | type=str, 43 | help="HF repo where deduplicated dataset will be pushed later, repo is cloned, and data is saved inside", 44 | ) 45 | parser.add_argument( 46 | "--out_path", 47 | default="./data/data-near-dedup", 48 | type=str, 49 | help="local directory where repo_name is cloned", 50 | ) 51 | parser.add_argument( 52 | "--org", 53 | default="bigcode-data", 54 | type=str, 55 | help="HF org/username where the data will be pushed", 56 | ) 57 | parser.add_argument( 58 | "--shard_size", 59 | default=1000 << 20, 60 | type=int, 61 | help="size of the dataset shards", 62 | ) 63 | parser.add_argument( 64 | "--test_run", 65 | default=False, 66 | type=bool, 67 | help="make a test run, if True we only deduplicate a small subset", 68 | ) 69 | return parser.parse_args() 70 | 71 | 72 | 73 | def save_shard(shard_tuple): 74 | """Save shard""" 75 | filename, shard = shard_tuple 76 | shard.to_parquet(filename) 77 | 78 | args = parse_args() 79 | 80 | print("setting up the repo") 81 | repo = Repository( 82 | local_dir=args.out_path, 83 | clone_from=args.org + "/" + args.repo_name, 84 | repo_type="dataset", 85 | private=True, 86 | use_auth_token=True, 87 | git_user=args.org 88 | ) 89 | output_dir = Path(args.out_path) 90 | output_dir.mkdir(exist_ok=True) 91 | os.mkdir(args.out_path + "/data") 92 | print("setup done") 93 | 94 | 95 | t_start = time.time() 96 | # the data is saved in the cache for future loadings 97 | ds = load_dataset(args.dataset_name, split="train", use_auth_token=True) 98 | #ds = load_dataset("bigcode-data/python_any_license_v2", split="train", use_auth_token=True) 99 | 100 | if args.test_run: 101 | # for a test run we only use a small subset 102 | ds = ds.select([i for i in range(7000)]) 103 | init_size = len(ds) 104 | print(f"Time to load dataset: {time.time()-t_start:.2f}") 105 | 106 | 
107 | # Deduplicate with minhash and jaccard similarity 108 | t_start = time.time() 109 | ds, duplicate_clusters = deduplicate_dataset(ds, args.jaccard_threshold) 110 | new_size = len(ds) 111 | print(f"Time to deduplicate dataset: {time.time()-t_start:.2f}") 112 | print(f"Size of deduplicated dataset: {len(ds)}, old dataset size {init_size}") 113 | with open("size_info.json", "w") as f: 114 | json.dump([init_size, new_size, (init_size-new_size)*100/init_size],f) 115 | 116 | 117 | with open(output_dir / "duplicate_clusters.json", "w") as f: 118 | json.dump(duplicate_clusters, f) 119 | 120 | 121 | if ds._indices is not None: 122 | dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data) 123 | else: 124 | dataset_nbytes = ds.data.nbytes 125 | num_shards = int(dataset_nbytes / args.shard_size) + 1 126 | 127 | 128 | t_start = time.time() 129 | shards = (ds.shard(num_shards=num_shards, index=i, contiguous=True) for i in range(num_shards)) 130 | filenames = (f"{args.out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" for index in range(num_shards)) 131 | 132 | with Pool(16) as p: 133 | list(tqdm(p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4), total=num_shards)) 134 | print(f"Time to save dataset: {time.time()-t_start:.2f}") 135 | 136 | # To push to hub run `git add data/commit/push` inside dataset repo folder (the one cloned from HF: out_path/args.repo_name) 137 | # no need to push duplicate_clusters.json 138 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.5.1 2 | huggingface-hub==0.8.1 3 | datasketch==1.5.8 4 | dpu_utils 5 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/requirements_alt.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.5.1 2 | typer>=0.6.1 -------------------------------------------------------------------------------- /data_analysis/notebooks/bigcode_pls.csv: -------------------------------------------------------------------------------- 1 | index,lang,count,size (Gb),Arjun's comments,Include,Size 2 | 1,abap,14766.00,0.097798223,,, 3 | 2,actionscript,151475.00,0.714806413,Omit: legacy PL for Flash. Flash no longer supported on any major browser. Unlikely that people are writing new Flash code.,, 4 | 3,ada,39273.00,0.577616957,"Include: used in aerospace, defense, etc.?",1,0.577616957 5 | 4,agda,18697.00,0.076828045,Include: significant in PL research. Functional. Verification.,1,0.076828045 6 | 5,ags-script,1148.00,0.010587383,,,0 7 | 6,alloy,5207.00,0.015588033,Include: significant in PL/SE/FM research. Relational.,1,0.015588033 8 | 7,ampl,75.00,7.65E-05,,,0 9 | 8,antlr,9312.00,0.061952353,Include: widely-used for specifying context-free grammars,1,0.061952353 10 | 9,apacheconf,187.00,0.000245907,,,0 11 | 10,api-blueprint,2895.00,0.032142603,,,0 12 | 11,apl,2127.00,0.007897713,undecided,,0 13 | 12,applescript,5169.00,0.010201155,Include: major Appple language,1,0.010201155 14 | 13,arc,2089.00,0.019751439,,,0 15 | 14,arduino,163528.00,0.788386157,Harm: C++ variant for programmable electronics,,0 16 | 15,asciidoc,221025.00,1.201338633,Harm: data looks good from a few samples (it's documentation style),,0 17 | 16,asp,111163.00,0.522229033,,,0 18 | 17,aspectj,2870.00,0.008254705,undecided,,0 19 | 18,assembly,262334.00,1.648889979,Assembly. 
Include,1,1.648889979 20 | 19,ats,3247.00,0.014441544,,,0 21 | 20,augeas,208.00,0.000826699,Include: minor DSL from RedHet. Declarative. Very different from other langs.,1,0.000826699 22 | 21,autohotkey,15258.00,0.084246694,,,0 23 | 22,autoit,11864.00,0.102856281,,,0 24 | 23,awk,11172.00,0.024450499,Include: widely used scripting language,1,0.024450499 25 | 24,batchfile,281187.00,0.333024122,Include: widely used scripting language,1,0.333024122 26 | 25,befunge,37.00,4.01E-06,,,0 27 | 26,bison,145.00,0.003887651,Include: widely-used for specifying lexers,1,0.003887651 28 | 27,bitbake,73167.00,0.089609174,,,0 29 | 28,blitzbasic,228.00,0.007069605,,,0 30 | 29,blitzmax,1720.00,0.013420046,,,0 31 | 30,bluespec,6923.00,0.037450439,Include: hardware description language,1,0.037450439 32 | 31,boo,4721,0.007027597,,,0 33 | 32,brainfuck,10077,0.049801268,undecided,,0 34 | 33,brightscript,2340,0.014593211,,,0 35 | 34,bro,912,0.00293523,,,0 36 | 35,c,11206308,75.92987105,Obvious,1,75.92987105 37 | 36,c++,7600356,65.96642214,Obvious,1,65.96642214 38 | 37,c-sharp,13281504,57.98494339,Obvious,1,57.98494339 39 | 38,c2hs-haskell,1084,0.011598729,,,0 40 | 39,cap'n-proto,1538,0.003322507,undecided,,0 41 | 40,cartocss,1781,0.020215215,,,0 42 | 41,ceylon,6506,0.017001518,undecided: a Red Hat language; seems abandoned?,,0 43 | 42,chapel,15912,0.037359901,undecided,,0 44 | 43,chuck,1397,0.003407997,,,0 45 | 44,cirru,1078,0.06595696,,,0 46 | 45,clarion,1039,0.010567484,,,0 47 | 46,clean,835,0.004459314,undecided,,0 48 | 47,click,371,0.00161367,,,0 49 | 48,clips,1701,0.010305217,,,0 50 | 49,clojure,137204,0.56312332,"Include: Lispy, JVM, Datomic",1,0.56312332 51 | 50,cmake,248361,0.679152478,Obvious,1,0.679152478 52 | 51,cobol,3175,0.018076838,undecided,1,0.018076838 53 | 52,coffeescript,256615,0.882924413,undecided,1,0.882924413 54 | 53,coldfusion,14649,0.072415269,,,0 55 | 54,coldfusion-cfc,14331,0.07927666,,,0 56 | 55,common-lisp,111008,1.936640009,Include: still widely used in software that is still maintained,1,1.936640009 57 | 56,component-pascal,860,0.108625083,,,0 58 | 57,coq,40,0.000229547,Misclassified. There must be more data,,0 59 | 58,creole,689,0.001459754,,,0 60 | 59,crystal,90251,0.301409398,,,0 61 | 60,csound,1608,0.04287534,,,0 62 | 61,css,3586141,34.86330899,,,0 63 | 62,csv,5992650,266.826193,Exclude,,0 64 | 63,cucumber,92741,0.256383451,,,0 65 | 64,cuda,71446,0.730044154,Obvious,1,0.730044154 66 | 65,cycript,459,0.011675073,,,0 67 | 66,cython,42621,0.403100053,Harm: looks good to me,,0 68 | 67,d,431,0.00177274,Misclassified. There must be more data,1,0.00177274 69 | 68,darcs-patch,369,0.002560221,Exclude. Probably not a PL,,0 70 | 69,dart,1070286,4.877312353,Include: major Google language,1,4.877312353 71 | 70,desktop,17075,0.009376839,not a PL?,,0 72 | 71,diff,386667,2.599852065,Exclude. Probably not a PL,,0 73 | 72,digital-command-language,19109,0.070433624,,,0 74 | 73,dm,4603,0.032649213,,,0 75 | 74,dns-zone,1457,0.023461303,,,0 76 | 75,dockerfile,639343,0.517538485,undecided. 
This is basically shell scripts,1,0.517538485 77 | 76,dogescript,364,0.00013444,,,0 78 | 77,dylan,4747,0.040813775,undecided,,0 79 | 78,eagle,45991,4.645208881,"Data, based on 3-4 samples",,0 80 | 79,ec,1152,0.023841255,,,0 81 | 80,ecere-projects,182,0.001313465,,,0 82 | 81,ecl,5120,0.028274082,,,0 83 | 82,edn,23358,1.374275976,,,0 84 | 83,eiffel,28472,0.126345794,undecided,,0 85 | 84,elixir,351916,0.93891419,"Include: Erlang, concurrent",1,0.93891419 86 | 85,elm,68002,0.367896114,"Include: functional, web",1,0.367896114 87 | 86,emacs-lisp,68984,0.551702192,Obvious,1,0.551702192 88 | 87,emberscript,1969,0.025593958,,,0 89 | 88,erlang,121296,0.931688288,Obvious,1,0.931688288 90 | 89,f#,142144,1.031253976,Obvious,1,1.031253976 91 | 90,factor,12163,0.036390924,,,0 92 | 91,fancy,538,0.001420258,,,0 93 | 92,fantom,2598,0.010674187,,,0 94 | 93,fish,33460,0.040602801,,,0 95 | 94,flux,1221,0.00724385,,,0 96 | 95,forth,4692,0.021339541,undecided,,0 97 | 96,fortran,186211,2.079444236,Obvious,1,2.079444236 98 | 97,freemarker,74678,0.317162226,,,0 99 | 98,g-code,12185,0.890362356,Harm: 3d printers,,0 100 | 99,gams,2011,0.059093651,,,0 101 | 100,gap,1210,0.012636526,,,0 102 | 101,gas,111211,1.303199948,GNU Assembler,1,1.303199948 103 | 102,gdscript,129390,0.27310127,,,0 104 | 103,genshi,398,0.00844459,,,0 105 | 104,gentoo-ebuild,45554,0.051730603,,,0 106 | 105,gentoo-eclass,379,0.002866541,,,0 107 | 106,gettext-catalog,273229,9.711336079,Harm: multi-lingual API descriptions,,0 108 | 107,glsl,206223,0.686847354,Include: shaders,1,0.686847354 109 | 108,glyph,47,0.000743859,,,0 110 | 109,gnuplot,31328,1.461753918,A language?,,0 111 | 110,go,5889635,32.01376363,Obvious,1,32.01376363 112 | 111,golo,540,0.000686322,,,0 113 | 112,gosu,1112,0.005721374,,,0 114 | 113,grace,583,0.001056405,,,0 115 | 114,grammatical-framework,4231,0.038704313,,,0 116 | 115,graphql,54946,0.179884882,"Harm: Facebook query language for APIs, 14k stars on Github",,0 117 | 116,graphviz-(dot),99748,1.074293176,"Exclude: this is data, not a PL. IMO Graphviz is usually generated and not hand-written",,0 118 | 117,groff,240689,2.938305336,"Old typesetting language. Still used of course, but mostly replaced by tex. 
This is mostly data",,0 119 | 118,groovy,306823,1.207558695,Include,1,1.207558695 120 | 119,groovy-server-pages,15392,0.097240488,,,0 121 | 120,haml,136898,0.167000394,,,0 122 | 121,handlebars,292850,0.594546259,,,0 123 | 122,harbour,819,0.001420626,,,0 124 | 123,haskell,582682,2.745870866,Include: major functional language,1,2.745870866 125 | 124,haxe,147781,0.596376245,"Harm: Game dev, mobile, ",,0 126 | 125,hcl,341067,0.807011745,Harm: data format,,0 127 | 126,hlsl,30266,0.086940443,,,0 128 | 127,html,16602372,291.4582452,,,0 129 | 128,html+django,60784,0.148110205,,,0 130 | 129,html+eex,17119,0.024966882,,,0 131 | 130,html+erb,649197,1.008359107,,,0 132 | 131,html+php,98821,0.405201841,,,0 133 | 132,http,25314,0.067294465,,,0 134 | 133,hy,1404,0.007694671,,,0 135 | 134,idl,492,0.008022681,,,0 136 | 135,idris,8449,0.034530222,"Include: significant in PL research, functional, verification",1,0.034530222 137 | 136,igor-pro,730,0.011822097,,,0 138 | 137,inform-7,429,0.015622285,,,0 139 | 138,ini,1630329,3.566209389,Probably exclude: Not a PL?,,0 140 | 139,inno-setup,3752,0.019808392,,,0 141 | 140,io,3031,0.00836172,undecided,,0 142 | 141,ioke,409,0.002303905,,,0 143 | 142,irc-log,101,0.006792731,,,0 144 | 143,isabelle,5724,0.098911734,"Include: significant in PL research, functional, verification",1,0.098911734 145 | 144,j,3093,0.015198472,,,0 146 | 145,jade,94273,0.178474889,,,0 147 | 146,jasmin,7371,0.04117105,,,0 148 | 147,java,25124914,112.8234043,Obvious,1,112.8234043 149 | 148,java-server-pages,281662,1.330394207,Obvious,1,1.330394207 150 | 149,javascript,25429179,166.2414118,Obvious,1,166.2414118 151 | 150,jflex,3398,0.014072463,,,0 152 | 151,json,36297006,627.8661835,,,0 153 | 152,json5,6048,0.075441422,,,0 154 | 153,jsoniq,4242,0.006691644,,,0 155 | 154,jsonld,30033,0.244416576,,,0 156 | 155,jsx,1094154,3.045787612,Harm: javascript react ,,0 157 | 156,julia,332174,1.751905383,"Include: HPC, concurrent, etc.",1,1.751905383 158 | 157,jupyter-notebook,1199902,162.4975517,Harm: processed separately,,0 159 | 158,kicad,11882,2.991856997,Sampled 3-4. Mostly data from what I can tell,,0 160 | 159,kit,1446,0.006728298,,,0 161 | 160,kotlin,2644255,6.822951943,Include: the Android PL,1,6.822951943 162 | 161,krl,292,0.001202317,,,0 163 | 162,labview,2176,0.091341864,undecided,,0 164 | 163,lasso,1158,0.048377834,,,0 165 | 164,latte,7211,0.014196855,,,0 166 | 165,lean,21003,0.162765045,"Include: significant in PL research, functional, verification",1,0.162765045 167 | 166,less,407086,1.145523703,Harm: markup language ,,0 168 | 167,lex,3380,0.081799333,undecided. related to Bison?,1,0.081799333 169 | 168,lfe,1048,0.002028635,,,0 170 | 169,lilypond,7406,0.031897442,,,0 171 | 170,linker-script,15072,0.081791257,,,0 172 | 171,liquid,30751,0.122617805,Harm: Shopify markup language,,0 173 | 172,literate-agda,573,0.005294123,Include if including Agda,1,0.005294123 174 | 173,literate-coffeescript,1156,0.005056236,Include if including CoffeeScript,1,0.005056236 175 | 174,literate-haskell,6703,0.067831422,Include if including Haskell,1,0.067831422 176 | 175,livescript,9699,0.039472374,,,0 177 | 176,llvm,72679,0.785443486,undecided. 
Is this just generated stuff?,,0 178 | 177,logos,21190,0.349273476,Harm: old PL,,0 179 | 178,logtalk,3125,0.009893734,,,0 180 | 179,lolcode,811,0.001302327,,,0 181 | 180,lookml,484,0.006730641,,,0 182 | 181,lsl,2753,0.015521771,,,0 183 | 182,lua,637541,3.766811033,"Include: significantly used in scripting, games",1,3.766811033 184 | 183,m,134,7.75E-05,,,0 185 | 184,m4,23018,0.153150917,,,0 186 | 185,makefile,801562,2.143090361,Include: more shell scripts,1,2.143090361 187 | 186,mako,9332,0.037311069,,,0 188 | 187,maple,2308,0.026826063,Include: scientific programming,1,0.026826063 189 | 188,markdown,25656996,95.84382086,,,0 190 | 189,mask,1448,0.009652023,,,0 191 | 190,mathematica,41260,1.877927466,Include: scientific programming,1,1.877927466 192 | 191,matlab,1046,0.047973853,Include: scientific programming,1,0.047973853 193 | 192,max,12744,0.596285678,,,0 194 | 193,maxscript,604,0.005334702,,,0 195 | 194,mediawiki,21551,0.158945413,,,0 196 | 195,metal,5042,0.0181611,,,0 197 | 196,mirah,5709,0.070237047,,,0 198 | 197,modelica,26120,0.141068962,,,0 199 | 198,module-management-system,346,0.002231462,,,0 200 | 199,monkey,1828,0.007924306,,,0 201 | 200,moonscript,5309,0.016770277,,,0 202 | 201,mtml,536,0.001453616,,,0 203 | 202,muf,561,0.001667817,,,0 204 | 203,mupad,774,0.006010949,,,0 205 | 204,myghty,11,6.29E-05,,,0 206 | 205,nesc,19216,0.138510587,,,0 207 | 206,netlinx,188,0.002778406,,,0 208 | 207,netlogo,1524,0.039246293,,,0 209 | 208,nginx,15,2.26E-05,,,0 210 | 209,nimrod,57910,0.522520055,,,0 211 | 210,ninja,3893,0.099536967,,,0 212 | 211,nit,87,0.000264594,,,0 213 | 212,nix,195025,0.649707883,,,0 214 | 213,nsis,4159,0.029944153,,,0 215 | 214,nu,927,0.002204864,,,0 216 | 215,numpy,7,1.32E-05,,,0 217 | 216,objdump,711,0.052410804,,,0 218 | 217,objective-c++,75367,0.694668418,,,0 219 | 218,objective-j,456,0.008176827,Exclude: I think this is dead?,,0 220 | 219,ocaml,187323,1.365987005,Include: major functional language,1,1.365987005 221 | 220,octave,201,0.003441835,"Undecided. Precursor to R, etc?",,0 222 | 221,omgrofl,4,3.03E-06,,,0 223 | 222,ooc,1487,0.005106774,,,0 224 | 223,opa,363,0.004085012,,,0 225 | 224,opal,158,0.000459592,,,0 226 | 225,opencl,27492,0.201742827,,,0 227 | 226,openscad,21218,0.111863208,,,0 228 | 227,org,51999,0.365003693,,,0 229 | 228,ox,250,0.001307936,,,0 230 | 229,oxygene,55,0.000240078,,,0 231 | 230,oz,1208,0.004989877,,,0 232 | 231,pan,5673,0.060544641,,,0 233 | 232,papyrus,12577,0.04664692,,,0 234 | 233,parrot,30,5.35E-05,,,0 235 | 234,parrot-assembly,431,0.000497986,,,0 236 | 235,parrot-internal-representation,1576,0.047925378,,,0 237 | 236,pascal,131974,1.877279695,Include: best PL,1,1.877279695 238 | 237,pawn,2530,0.057843412,,,0 239 | 238,perl,475344,2.987812011,Obvious,1,2.987812011 240 | 239,perl6,11279,0.050197622,,,0 241 | 240,php,22633374,89.45680388,Include: ugh,1,89.45680388 242 | 241,piglatin,1702,0.00286154,,,0 243 | 242,pike,1033,0.003621717,,,0 244 | 243,pod,13406,0.136778901,,,0 245 | 244,pogoscript,228,0.000541998,,,0 246 | 245,pony,3860,0.020518823,,,0 247 | 246,postscript,17707,1.568144315,Undecided. 
Is this just data?,,0 248 | 247,pov-ray-sdl,1225,0.021130333,,,0 249 | 248,powershell,333193,1.654198124,Include: Windows scripting languge,1,1.654198124 250 | 249,processing,59476,0.240804965,"Likely include, but remind myself -- this just Java, right?",1,0.240804965 251 | 250,prolog,1162,0.012870975,Include: GOFAI,1,0.012870975 252 | 251,propeller-spin,2335,0.022317081,,,0 253 | 252,protocol-buffer,124069,0.556668256,Include: Google engineers will be happy,1,0.556668256 254 | 253,pure-data,15018,0.104991313,,,0 255 | 254,purebasic,19536,0.054749768,,,0 256 | 255,purescript,33671,0.14750348,,,0 257 | 256,python,15148604,80.13463578,Obvious,1,80.13463578 258 | 257,python-traceback,9,3.25E-05,Exclude,,0 259 | 258,qmake,10338,0.022073257,,,0 260 | 259,qml,54070,0.238437227,,,0 261 | 260,r,41580,0.316489511,Obvious,1,0.316489511 262 | 261,racket,4318,0.039497334,"Include: PL used in teaching and research, mixed-paradigm",1,0.039497334 263 | 262,ragel-in-ruby-host,1847,0.011011379,,,0 264 | 263,raml,13598,0.048217182,,,0 265 | 264,rdoc,10940,0.031951045,,,0 266 | 265,realbasic,871,0.010335615,,,0 267 | 266,rebol,1495,0.006508232,Is this dead?,,0 268 | 267,red,3611,0.039918747,,,0 269 | 268,redcode,513,0.002673693,,,0 270 | 269,ren'py,4898,0.11411255,,,0 271 | 270,renderscript,995,0.002270658,,,0 272 | 271,restructuredtext,1122999,4.281577848,Harm points out to include,1,4.281577848 273 | 272,rhtml,7469,0.009793544,Is this R?,,0 274 | 273,rmarkdown,5831,0.061474482,"Include, this is R",1,0.061474482 275 | 274,robotframework,14924,0.063781898,,,0 276 | 275,rouge,1542,0.004224455,,,0 277 | 276,ruby,4463248,9.781043003,Obvious,1,9.781043003 278 | 277,rust,1677940,12.91866254,Obvious,1,12.91866254 279 | 278,sage,1930,0.014832049,,,0 280 | 279,saltstack,31203,0.042226819,,,0 281 | 280,sas,10755,0.160669991,"Carolyn: include, this is a stats scripting language",1,0.160669991 282 | 281,sass,108862,0.196029291,,,0 283 | 282,scala,1607698,6.061807569,Obvious,1,6.061807569 284 | 283,scaml,114,0.000120361,,,0 285 | 284,scheme,64961,0.597677147,Include: Lisp derivative,1,0.597677147 286 | 285,scilab,3129,0.011411926,,,0 287 | 286,scss,2449305,5.284018372,,,0 288 | 287,self,85,0.000509786,Inspiration for JavaScript. No more of this on the web?,1,0.000509786 289 | 288,shell,2540313,4.066701981,Obvious,1,4.066701981 290 | 289,shellsession,10,1.58E-05,,,0 291 | 290,shen,306,0.001125251,,,0 292 | 291,slash,9274,0.035859409,,,0 293 | 292,slim,52114,0.059704412,,,0 294 | 293,smali,288385,3.211701155,,,0 295 | 294,smalltalk,652835,0.762851187,Probably include. Still some people using this I believe,1,0.762851187 296 | 295,smarty,167634,0.56143135,,,0 297 | 296,smt,26696,0.404797122,Undecided. Lispy. These are likely generated files.,,0 298 | 297,solidity,216020,1.696340017,Include: smart contracts,1,1.696340017 299 | 298,sourcepawn,7288,0.130712384,,,0 300 | 299,sparql,16585,0.045287982,"Include: Spark, etc.",1,0.045287982 301 | 300,sqf,39747,0.140966975,,,0 302 | 301,sql,1066540,12.67551878,Obvious,1,12.67551878 303 | 302,squirrel,6385,0.038697967,,,0 304 | 303,stan,6556,0.015036825,Include: PPL,1,0.015036825 305 | 304,standard-ml,52443,0.538388205,Include: FP,1,0.538388205 306 | 305,stata,34926,0.418126748,"Carolyn: stata is an important data analysis lang. but we want only the .do files, not the .dta files",1,0.418126748 307 | 306,ston,1720,0.001329725,,,0 308 | 307,stylus,88412,0.157867404,,,0 309 | 308,supercollider,2761,0.01793768,,,0 310 | 309,svg,3816962,79.85220843,Undecided. 
Same category as XML,,0 311 | 310,swift,2063629,7.305693131,Obvious,1,7.305693131 312 | 311,systemverilog,56261,0.498070786,Include: HDL,1,0.498070786 313 | 312,tcl,63362,0.511421635,Include: 90s scripting language; still widely used,1,0.511421635 314 | 313,tcsh,6313,0.027505715,Include: shell scripts,1,0.027505715 315 | 314,tea,1086,0.043406292,,,0 316 | 315,tex,593741,5.863253431,Include: help me write my papers,1,5.863253431 317 | 316,text,17241649,233.4156933,what,,0 318 | 317,textile,8550,0.036826583,,,0 319 | 318,thrift,6008,0.019724138,Include: Facebook engineers want this,1,0.019724138 320 | 319,toml,603802,0.682639067,Undecided. Likely just data,,0 321 | 320,turing,281,0.003482555,,,0 322 | 321,turtle,167790,2.961368586,https://www.w3.org/TR/turtle/ Data. Ignore?,,0 323 | 322,twig,445738,1.318711043,,,0 324 | 323,txl,149,0.002262777,,,0 325 | 324,typescript,12817789,36.60518411,Obvious,1,36.60518411 326 | 325,unified-parallel-c,204,0.001573707,,,0 327 | 326,unity3d-asset,5283462,9.059069948,,,0 328 | 327,uno,4380,0.011293002,,,0 329 | 328,unrealscript,11238,0.070370664,,,0 330 | 329,urweb,1181,0.007658499,"Undecided. This is probably Adam Chlipala's language, but double-check",,0 331 | 330,vala,4830,0.024860858,,,0 332 | 331,vcl,1325,0.005250865,,,0 333 | 332,verilog,169,0.000955152,Include: HDL,1,0.000955152 334 | 333,vhdl,73888,1.457399755,Include: HDL,1,1.457399755 335 | 334,viml,110500,0.404044383,Include: VIM configurations,1,0.404044383 336 | 335,visual-basic,193066,1.709779092,Obvious,1,1.709779092 337 | 336,volt,4226,0.014539185,,,0 338 | 337,vue,1938940,8.574533644,.vue -- these have a significant HTML component,,0 339 | 338,web-ontology-language,13385,1.07383534,,,0 340 | 339,webassembly,8113,0.214897947,Undecided. Mostly generated code,,0 341 | 340,webidl,2622,0.005019794,,,0 342 | 341,wisp,130,0.00064634,,,0 343 | 342,x10,406,0.001619354,"Exclude: this project is done, right?",,0 344 | 343,xbase,7993,0.066203089,,,0 345 | 344,xc,585,0.004971472,,,0 346 | 345,xml,12683687,149.8550762,Data,,0 347 | 346,xojo,2940,0.032677002,,,0 348 | 347,xpages,315,0.002216953,,,0 349 | 348,xproc,1297,0.006994676,,,0 350 | 349,xquery,25933,0.046211199,,,0 351 | 350,xs,2025,0.03171586,,,0 352 | 351,xslt,57501,0.715532013,Include: XSLT is an XML transformation language; these are likely hand-written. 
XSLT is also a huge PITA to write by hand,1,0.715532013 353 | 352,xtend,11916,0.052799695,,,0 354 | 353,yacc,78972,0.557021816,Include: parser generator,1,0.557021816 355 | 354,yaml,7675977,40.82317001,Data,,0 356 | 355,yang,13539,0.216720152,,,0 357 | 356,zephir,3184,0.012502994,,,0 358 | 357,zig,19483,0.249499308,Include: up and coming language in the same space as Rust,1,0.249499308 359 | 358,zimpl,314,0.00236159,,,0 360 | ,,,,,92,842.6753674 -------------------------------------------------------------------------------- /data_analysis/notebooks/new_extension_distribution.csv: -------------------------------------------------------------------------------- 1 | ,extension,language,count,low_alphanum_count,long_lines_count,non_lexable_count 2 | 0,adb,ada,1000,0,1,85 3 | 1,ads,ada,1000,1,2,20 4 | 2,ada,ada,1000,0,4,31 5 | 3,agda,agda,1000,0,3,49 6 | 4,als,alloy,1000,0,1,39 7 | 5,g4,antlr,1000,3,0,443 8 | 6,scpt,applescript,1000,3,22,57 9 | 7,applescript,applescript,1000,0,38,113 10 | 8,asm,assembly,1000,2,115,0 11 | 9,nasm,assembly,159,0,0,0 12 | 10,a51,assembly,28,0,0,0 13 | 11,aug,augeas,255,0,0,0 14 | 12,awk,awk,1000,5,2,255 15 | 13,gawk,awk,225,0,1,103 16 | 14,mawk,awk,22,0,0,13 17 | 15,nawk,awk,8,0,0,1 18 | 16,auk,awk,3,0,0,3 19 | 17,cmd,batchfile,1000,0,163,14 20 | 18,bat,batchfile,1000,1,37,13 21 | 19,bison,bison,176,0,1,0 22 | 20,bsv,bluespec,1000,3,7,0 23 | 21,c,c,1000,5,4,11 24 | 22,h,c,1000,1,3,192 25 | 23,cats,c,3,0,0,2 26 | 24,w,c,5,0,0,2 27 | 25,cpp,c++,1000,2,3,3 28 | 26,ipp,c++,20,0,2,0 29 | 27,cc,c++,1000,0,1,3 30 | 28,hpp,c++,1000,1,7,3 31 | 29,inl,c++,91,0,0,0 32 | 30,cxx,c++,389,0,2,0 33 | 31,cp,c++,10,0,0,0 34 | 32,hh,c++,197,0,0,5 35 | 33,tcc,c++,19,0,0,0 36 | 34,hxx,c++,140,0,1,0 37 | 35,tpp,c++,3,0,0,0 38 | 36,c++,c++,15,0,0,0 39 | 37,h++,c++,1,0,0,0 40 | 38,cs,c-sharp,1000,0,2,31 41 | 39,cshtml,c-sharp,585,0,9,429 42 | 40,cake,c-sharp,9,0,0,8 43 | 41,csx,c-sharp,8,0,0,6 44 | 42,clj,clojure,1000,5,10,9 45 | 43,cljs,clojure,1000,2,2,3 46 | 44,cljc,clojure,1000,0,21,11 47 | 45,boot,clojure,121,0,1,23 48 | 46,cl2,clojure,7,0,0,0 49 | 47,cljx,clojure,8,0,0,0 50 | 48,cmake,cmake,1000,0,29,41 51 | 49,coffee,coffeescript,1000,2,20,47 52 | 50,cson,coffeescript,1000,3,13,0 53 | 51,cjsx,coffeescript,185,0,1,8 54 | 52,iced,coffeescript,92,0,0,9 55 | 53,_coffee,coffeescript,4,0,0,0 56 | 54,lisp,common-lisp,1000,2,21,32 57 | 55,asd,common-lisp,1000,0,1,6 58 | 56,lsp,common-lisp,1000,0,0,28 59 | 57,sexp,common-lisp,197,0,107,63 60 | 58,ny,common-lisp,52,0,5,6 61 | 59,css,css,1000,0,154,273 62 | 60,cuh,cuda,1000,1,3,0 63 | 61,cu,cuda,1000,0,4,2 64 | 62,dart,dart,1000,0,3,17 65 | 63,,dockerfile,1000,0,0,29 66 | 64,1,dockerfile,1,0,0,0 67 | 65,dockerfile,dockerfile,334,0,0,1 68 | 66,3,dockerfile,1,0,0,0 69 | 67,mustache,dockerfile,1,0,0,0 70 | 68,ex,elixir,1000,0,7,379 71 | 69,exs,elixir,1000,0,2,57 72 | 70,elm,elm,1000,2,10,82 73 | 71,el,emacs-lisp,1000,1,31,109 74 | 72,emacs,emacs-lisp,142,0,4,10 75 | 73,erl,erlang,1000,6,3,35 76 | 74,hrl,erlang,1000,2,8,13 77 | 75,yrl,erlang,57,0,0,6 78 | 76,xrl,erlang,30,0,0,30 79 | 77,escript,erlang,71,0,1,6 80 | 78,fs,f-sharp,1000,3,13,39 81 | 79,fsx,f-sharp,1000,1,32,31 82 | 80,fsi,f-sharp,516,0,0,5 83 | 81,f,fortran,1000,6,38,559 84 | 82,f90,fortran,1000,9,1,14 85 | 83,f03,fortran,145,0,0,1 86 | 84,for,fortran,549,1,0,226 87 | 85,f95,fortran,216,0,0,4 88 | 86,f08,fortran,65,0,1,4 89 | 87,fpp,fortran,75,1,0,55 90 | 88,f77,fortran,5,0,0,0 91 | 89,glsl,glsl,1000,2,22,119 92 | 90,frag,glsl,1000,1,2,120 93 | 91,vert,glsl,1000,0,1,44 94 | 
92,shader,glsl,1000,0,45,908 95 | 93,fsh,glsl,692,1,2,165 96 | 94,frg,glsl,35,0,0,28 97 | 95,vsh,glsl,326,0,1,39 98 | 96,geom,glsl,149,0,0,12 99 | 97,geo,glsl,289,0,2,215 100 | 98,fp,glsl,221,2,37,84 101 | 99,glslv,glsl,35,0,0,2 102 | 100,vshader,glsl,8,0,0,0 103 | 101,fshader,glsl,5,0,0,0 104 | 102,vrx,glsl,2,0,2,2 105 | 103,go,go,1000,0,13,0 106 | 104,groovy,groovy,1000,0,3,16 107 | 105,gtpl,groovy,21,0,0,13 108 | 106,gvy,groovy,2,0,0,0 109 | 107,grt,groovy,2,0,0,0 110 | 108,hs,haskell,1000,1,3,39 111 | 109,hsc,haskell,105,0,0,8 112 | 110,html,html,1000,0,240,131 113 | 111,htm,html,541,1,241,51 114 | 112,xhtml,html,149,0,60,1 115 | 113,xht,html,30,0,0,0 116 | 114,idr,idris,1000,1,2,195 117 | 115,lidr,idris,291,0,0,23 118 | 116,thy,isabelle,1000,0,20,399 119 | 117,java,java,1000,0,3,10 120 | 118,jsp,java-server-pages,1000,1,9,362 121 | 119,js,javascript,1000,3,113,151 122 | 120,es6,javascript,9,0,0,1 123 | 121,jsm,javascript,1,0,0,0 124 | 122,pac,javascript,2,0,0,0 125 | 123,xsjslib,javascript,1,0,0,0 126 | 124,sjs,javascript,3,0,0,0 127 | 125,stan,stan,1000,0,2,3 128 | 126,jl,julia,1000,0,12,15 129 | 127,kt,kotlin,1000,1,0,186 130 | 128,kts,kotlin,669,0,1,16 131 | 129,lean,lean,1000,0,0,6 132 | 130,hlean,lean,268,0,0,0 133 | 131,lagda,literate-agda,1000,0,0,12 134 | 132,litcoffee,literate-coffeescript,1000,0,4,0 135 | 133,lhs,literate-haskell,1000,0,2,114 136 | 134,lua,lua,1000,1,15,15 137 | 135,nse,lua,91,0,0,1 138 | 136,wlua,lua,8,0,0,0 139 | 137,,makefile,1000,2,9,80 140 | 138,mk,makefile,1000,1,20,104 141 | 139,mak,makefile,273,1,11,111 142 | 140,txt,makefile,1,0,0,0 143 | 141,cmake,makefile,1,0,0,0 144 | 142,mpl,maple,1000,1,6,0 145 | 143,md,markdown,1000,2,73,20 146 | 144,markdown,markdown,244,0,32,2 147 | 145,mkd,markdown,5,0,0,0 148 | 146,ron,markdown,2,0,0,0 149 | 147,mkdn,markdown,1,0,0,0 150 | 148,ma,mathematica,739,0,390,680 151 | 149,cdf,mathematica,610,1,8,207 152 | 150,nb,mathematica,1000,0,720,301 153 | 151,mt,mathematica,766,6,61,657 154 | 152,wl,mathematica,686,1,13,503 155 | 153,wlt,mathematica,113,1,1,103 156 | 154,nbp,mathematica,30,0,0,30 157 | 155,mathematica,mathematica,1,0,0,1 158 | 156,matlab,matlab,1000,0,338,2 159 | 157,ml,ocaml,1000,1,6,22 160 | 158,mli,ocaml,1000,0,3,0 161 | 159,mll,ocaml,163,2,2,0 162 | 160,mly,ocaml,149,0,0,2 163 | 161,eliom,ocaml,10,0,0,0 164 | 162,eliomi,ocaml,2,0,0,0 165 | 163,ml4,ocaml,11,0,0,0 166 | 164,pas,pascal,1000,0,4,62 167 | 165,dfm,pascal,1000,0,0,3 168 | 166,dpr,pascal,1000,0,0,6 169 | 167,lpr,pascal,339,0,1,18 170 | 168,pl,perl,1000,2,6,119 171 | 169,t,perl,1000,1,5,130 172 | 170,pm,perl,1000,0,8,32 173 | 171,al,perl,788,1,1,35 174 | 172,plx,perl,28,0,6,0 175 | 173,ph,perl,62,0,7,4 176 | 174,psgi,perl,11,0,0,0 177 | 175,perl,perl,62,0,1,8 178 | 176,php,php,1000,1,15,0 179 | 177,phpt,php,92,0,0,0 180 | 178,ctp,php,23,0,1,0 181 | 179,ps1,powershell,1000,0,6,15 182 | 180,psm1,powershell,1000,1,7,8 183 | 181,psd1,powershell,999,1,9,1 184 | 182,yap,prolog,1000,2,18,36 185 | 183,prolog,prolog,480,1,36,67 186 | 184,proto,protocol-buffer,1000,0,17,0 187 | 185,py,python,1000,0,11,1 188 | 186,bzl,python,50,0,0,0 189 | 187,pyw,python,6,0,0,0 190 | 188,gyp,python,15,0,0,0 191 | 189,pyde,python,2,0,0,0 192 | 190,r,r,1000,4,20,18 193 | 191,rd,r,912,0,8,67 194 | 192,rsx,r,38,1,2,0 195 | 193,scrbl,racket,1000,0,10,85 196 | 194,rktd,racket,281,1,132,0 197 | 195,rktl,racket,396,3,0,0 198 | 196,rst,restructuredtext,1000,6,21,7 199 | 197,rest,restructuredtext,41,0,2,0 200 | 198,rmd,rmarkdown,1000,0,72,0 201 | 199,rb,ruby,1000,0,6,0 202 | 
200,gemspec,ruby,600,0,24,0 203 | 201,podspec,ruby,430,0,0,0 204 | 202,rake,ruby,134,0,0,0 205 | 203,jbuilder,ruby,102,0,2,1 206 | 204,ru,ruby,59,0,12,24 207 | 205,builder,ruby,14,0,0,1 208 | 206,ruby,ruby,3,0,0,0 209 | 207,rabl,ruby,12,0,0,0 210 | 208,thor,ruby,2,0,0,0 211 | 209,rbw,ruby,1,0,0,0 212 | 210,rs,rust,1000,0,3,1 213 | 211,sas,sas,1000,200,43,230 214 | 212,scala,scala,1000,0,5,25 215 | 213,sbt,scala,913,0,1,3 216 | 214,scm,scheme,1000,5,9,86 217 | 215,sld,scheme,1000,0,99,60 218 | 216,sps,scheme,446,2,9,87 219 | 217,sh,shell,1000,0,20,1 220 | 218,bash,shell,312,1,4,0 221 | 219,tmux,shell,7,0,0,0 222 | 220,bats,shell,127,0,0,0 223 | 221,zsh,shell,232,0,1,0 224 | 222,ksh,shell,26,0,0,0 225 | 223,command,shell,24,0,0,0 226 | 224,tool,shell,1,0,0,0 227 | 225,st,smalltalk,1000,0,10,416 228 | 226,sol,solidity,1000,8,22,575 229 | 227,sparql,sparql,1000,1,43,138 230 | 228,rq,sparql,1000,2,6,126 231 | 229,sql,sql,1000,0,83,129 232 | 230,tab,sql,267,2,40,60 233 | 231,cql,sql,96,0,6,49 234 | 232,pkb,sql,26,0,0,1 235 | 233,prc,sql,23,0,2,9 236 | 234,pls,sql,17,0,0,1 237 | 235,pks,sql,16,0,0,1 238 | 236,ddl,sql,75,0,2,22 239 | 237,pck,sql,9,0,3,4 240 | 238,plsql,sql,7,0,0,1 241 | 239,db2,sql,4,0,0,0 242 | 240,udf,sql,7,0,0,7 243 | 241,plb,sql,1,0,0,0 244 | 242,stan,stan,1000,0,2,3 245 | 243,sig,standard-ml,1000,1,415,442 246 | 244,fun,standard-ml,958,0,0,463 247 | 245,sml,standard-ml,1000,1,4,367 248 | 246,do,stata,1000,3,56,29 249 | 247,ado,stata,1000,15,24,20 250 | 248,sthlp,stata,1000,0,0,17 251 | 249,ihlp,stata,406,0,0,1 252 | 250,mata,stata,392,0,4,9 253 | 251,doh,stata,5,0,0,0 254 | 252,matah,stata,8,0,0,0 255 | 253,sv,systemverilog,1000,2,3,9 256 | 254,svh,systemverilog,1000,15,0,47 257 | 255,vh,systemverilog,615,2,10,5 258 | 256,tcl,tcl,1000,0,44,501 259 | 257,tm,tcl,398,4,10,258 260 | 258,adp,tcl,322,0,0,318 261 | 259,csh,tcsh,1000,1,6,7 262 | 260,tcsh,tcsh,340,0,6,2 263 | 261,toc,tex,479,0,25,0 264 | 262,ltx,tex,151,2,2,0 265 | 263,bib,tex,1000,0,214,0 266 | 264,sty,tex,611,0,6,0 267 | 265,tex,tex,1000,4,88,2 268 | 266,ins,tex,44,1,2,0 269 | 267,dtx,tex,174,0,1,2 270 | 268,aux,tex,438,0,26,0 271 | 269,mkii,tex,12,0,0,0 272 | 270,bbx,tex,15,0,0,0 273 | 271,lbx,tex,2,0,0,0 274 | 272,cbx,tex,6,0,0,0 275 | 273,mkiv,tex,27,0,0,0 276 | 274,mkvi,tex,3,0,0,0 277 | 275,thrift,thrift,1000,0,0,32 278 | 276,ts,typescript,1000,0,9,32 279 | 277,tsx,typescript,1000,0,15,775 280 | 278,veo,verilog,1000,0,2,0 281 | 279,vhd,vhdl,1000,0,8,32 282 | 280,vhdl,vhdl,1000,0,106,335 283 | 281,vho,vhdl,309,0,0,59 284 | 282,vhi,vhdl,17,0,5,7 285 | 283,vht,vhdl,75,0,0,2 286 | 284,vhf,vhdl,33,0,4,0 287 | 285,vhw,vhdl,5,0,0,0 288 | 286,vb,visual-basic,1000,0,6,0 289 | 287,frm,visual-basic,1000,1,840,0 290 | 288,bas,visual-basic,1000,0,7,0 291 | 289,vbs,visual-basic,722,0,6,0 292 | 290,vba,visual-basic,77,0,0,0 293 | 291,vbhtml,visual-basic,57,0,1,0 294 | 292,frx,visual-basic,38,1,14,0 295 | 293,xsl,xslt,1000,0,20,2 296 | 294,xslt,xslt,1000,1,51,7 297 | 295,yy,yacc,1000,2,30,0 298 | 296,y,yacc,1000,8,0,0 299 | 297,yacc,yacc,14,0,0,0 300 | 298,zig,zig,1000,0,5,123 301 | -------------------------------------------------------------------------------- /data_analysis/notebooks/stats.csv: -------------------------------------------------------------------------------- 1 | table_id,size_before_gb,size_after_gb,row_before,row_after,delta_size_percentage,delta_row_percentage 2 | total,6814.8,2412.78,545847408,236746567,64.59,56.63 3 | abap,0.25,0.09,23512,12161,65.33,48.28 4 | 
actionscript,1.39,0.71,215655,136143,48.78,36.87 5 | ada,0.96,0.32,60325,31291,66.94,48.13 6 | agda,0.14,0.09,24996,17608,34.2,29.56 7 | ags-script,0.02,0.01,2004,1061,60.19,47.06 8 | alloy,0.17,0.01,19907,5374,92.6,73.0 9 | ampl,0.0,0.0,88,75,21.43,14.77 10 | antlr,0.19,0.05,16787,7983,71.89,52.45 11 | apacheconf,0.0,0.0,471,149,76.95,68.37 12 | api-blueprint,0.05,0.03,3714,2634,42.54,29.08 13 | apl,0.01,0.01,2396,2039,24.75,14.9 14 | applescript,0.02,0.01,6030,4906,17.87,18.64 15 | arc,0.03,0.02,2637,1548,22.72,41.3 16 | arduino,1.29,0.76,211901,144914,41.08,31.61 17 | asciidoc,5.39,1.04,474531,184331,80.72,61.16 18 | asp,0.98,0.56,159294,84104,42.34,47.2 19 | aspectj,0.01,0.01,4449,2509,39.31,43.61 20 | assembly,2.77,1.71,363453,248396,38.23,31.66 21 | ats,0.02,0.01,4234,2963,34.91,30.02 22 | augeas,0.0,0.0,254,195,40.67,23.23 23 | autohotkey,0.14,0.09,18230,14648,34.54,19.65 24 | autoit,0.19,0.1,16777,10982,43.63,34.54 25 | awk,0.04,0.03,13454,10430,36.4,22.48 26 | batchfile,1.28,0.42,440070,252514,67.34,42.62 27 | befunge,0.0,0.0,35,33,5.51,5.71 28 | bison,0.0,0.0,176,134,5.08,23.86 29 | bitbake,0.28,0.08,137980,49453,70.28,64.16 30 | blitzbasic,0.01,0.01,386,237,40.62,38.6 31 | blitzmax,0.02,0.01,2000,1695,31.39,15.25 32 | bluespec,0.07,0.04,9050,5940,47.39,34.36 33 | boo,0.01,0.01,6868,4570,30.58,33.46 34 | brainfuck,0.06,0.05,10822,3602,24.85,66.72 35 | brightscript,0.02,0.01,2862,2148,34.06,24.95 36 | bro,0.0,0.0,1100,871,21.12,20.82 37 | c,267.01,61.99,21383832,8625559,76.78,59.66 38 | c-sharp,146.43,52.5,21702269,10839399,64.15,50.05 39 | c2hs-haskell,0.02,0.01,1492,968,32.38,35.12 40 | capn-proto,0.04,0.0,3203,1308,92.31,59.16 41 | cartocss,0.05,0.02,3053,1322,69.99,56.7 42 | ceylon,0.03,0.02,8271,5900,38.22,28.67 43 | chapel,0.22,0.03,33787,13591,85.25,59.77 44 | chuck,0.0,0.0,1621,1260,22.1,22.27 45 | cirru,0.07,0.06,1197,980,13.06,18.13 46 | clarion,0.01,0.01,1237,1086,19.34,12.21 47 | clean,0.01,0.0,925,776,18.32,16.11 48 | click,0.0,0.0,483,330,38.22,31.68 49 | clips,0.01,0.01,2291,1545,26.89,32.56 50 | clojure,1.13,0.56,177398,126191,50.99,28.87 51 | cmake,2.42,0.56,586269,186517,77.02,68.19 52 | cobol,0.03,0.02,4047,2978,30.56,26.41 53 | coffeescript,2.81,0.81,331824,227889,71.19,31.32 54 | coldfusion,0.13,0.07,20990,12931,44.73,38.39 55 | coldfusion-cfc,0.18,0.08,23567,12745,57.17,45.92 56 | common-lisp,3.67,1.74,156804,101370,52.69,35.35 57 | component-pascal,0.14,0.08,1151,529,41.84,54.04 58 | coq,0.0,0.0,59,33,83.6,44.07 59 | cpp,223.03,54.23,14820829,6377914,75.69,56.97 60 | creole,0.0,0.0,767,656,17.23,14.47 61 | crystal,0.64,0.3,124741,78484,52.27,37.08 62 | csound,0.05,0.04,1845,1341,23.56,27.32 63 | css,153.05,24.03,5889790,2994829,84.3,49.15 64 | csv,411.55,314.12,7640022,6404239,23.67,16.18 65 | cucumber,0.55,0.27,132603,84276,51.69,36.44 66 | cuda,1.85,0.62,145560,58355,66.15,59.91 67 | cycript,0.02,0.01,554,434,34.06,21.66 68 | cython,1.61,0.32,86161,35870,80.46,58.37 69 | d,0.0,0.0,603,371,44.32,38.47 70 | darcs-patch,0.0,0.0,376,362,2.93,3.72 71 | dart,13.19,4.36,1734126,932583,66.95,46.22 72 | desktop,0.03,0.02,23218,19569,27.12,15.72 73 | diff,5.19,2.29,526857,320925,55.87,39.09 74 | digital-command-language,0.14,0.07,35497,17738,48.87,50.03 75 | dm,0.04,0.04,4842,4571,3.45,5.6 76 | dns-zone,0.03,0.03,1948,1471,20.48,24.49 77 | dockerfile,2.54,0.68,1265281,572186,73.25,54.78 78 | dogescript,0.0,0.0,386,286,20.15,25.91 79 | dylan,0.07,0.04,7152,4553,38.01,36.34 80 | eagle,6.27,2.69,70846,45187,57.04,36.22 81 | ec,0.06,0.02,1976,1062,63.23,46.26 82 | 
ecere-projects,0.0,0.0,311,75,63.28,75.88 83 | ecl,0.06,0.03,10899,3843,53.78,64.74 84 | edn,1.47,1.36,30683,20731,7.08,32.43 85 | eiffel,0.22,0.13,39311,24070,40.11,38.77 86 | elixir,2.54,0.88,594074,282110,65.4,52.51 87 | elm,0.63,0.37,90637,62861,41.51,30.65 88 | emacs-lisp,2.0,0.46,122317,54768,76.94,55.22 89 | emberscript,0.04,0.03,3487,1603,41.97,54.03 90 | erlang,2.8,0.78,203193,99368,72.28,51.1 91 | f-sharp,2.67,0.96,219519,127161,63.93,42.07 92 | factor,0.14,0.03,29484,10270,76.62,65.17 93 | fancy,0.0,0.0,609,511,16.82,16.09 94 | fantom,0.02,0.01,3186,2448,30.67,23.16 95 | fish,0.13,0.05,53407,32206,61.43,39.7 96 | flux,0.02,0.01,3201,998,69.87,68.82 97 | forth,0.03,0.02,5665,4425,30.33,21.89 98 | fortran,3.97,1.93,287086,165446,51.33,42.37 99 | freemarker,0.66,0.29,114988,60206,55.87,47.64 100 | g-code,1.05,0.89,16020,10004,14.95,37.55 101 | gams,0.12,0.07,5275,1814,44.64,65.61 102 | gap,0.02,0.01,2158,1124,33.53,47.91 103 | gas,3.15,1.23,186463,91662,60.93,50.84 104 | gdscript,0.44,0.31,156343,119556,29.8,23.53 105 | genshi,0.01,0.01,466,350,38.08,24.89 106 | gentoo-ebuild,0.14,0.06,75148,34122,56.93,54.59 107 | gentoo-eclass,0.0,0.0,439,364,21.81,17.08 108 | gettext-catalog,40.72,4.52,775246,139375,88.89,82.02 109 | glsl,1.34,0.67,317741,175576,49.97,44.74 110 | glyph,0.0,0.0,52,47,8.95,9.62 111 | gnuplot,2.93,2.85,53925,47178,2.51,12.51 112 | go,118.96,28.09,11653185,4730461,76.39,59.41 113 | golo,0.0,0.0,629,522,17.01,17.01 114 | gosu,0.01,0.01,1625,795,40.17,51.08 115 | grace,0.0,0.0,637,561,21.4,11.93 116 | grammatical-framework,0.07,0.04,5938,3576,41.34,39.78 117 | graphql,0.68,0.15,68237,45331,77.79,33.57 118 | graphviz_dot,6.77,1.09,561726,68824,83.87,87.75 119 | groff,6.58,2.57,490292,181789,60.94,62.92 120 | groovy,3.17,1.1,504166,251627,65.22,50.09 121 | groovy-server-pages,0.14,0.09,21695,12237,36.26,43.6 122 | haml,0.36,0.2,180504,116033,46.03,35.72 123 | handlebars,1.16,0.62,389923,243252,46.81,37.62 124 | harbour,0.0,0.0,911,787,31.96,13.61 125 | haskell,6.24,2.63,798865,544969,57.93,31.78 126 | haxe,1.63,0.52,241032,131491,68.15,45.45 127 | hcl,1.81,0.82,470644,272971,54.45,42.0 128 | hlsl,0.21,0.09,48653,25476,59.06,47.64 129 | html,837.25,153.06,35592089,9533367,81.72,73.21 130 | html_django,0.43,0.14,97815,48857,67.19,50.05 131 | html_eex,0.05,0.03,22764,13306,46.59,41.55 132 | html_erb,2.1,1.12,862407,529049,46.8,38.65 133 | html_php,0.71,0.36,142410,76790,49.59,46.08 134 | http,0.11,0.07,31307,20830,37.46,33.47 135 | hy,0.02,0.01,2090,1238,42.22,40.77 136 | idl,0.07,0.01,991,545,80.93,45.01 137 | idris,0.06,0.03,10529,8060,40.49,23.45 138 | igor-pro,0.01,0.01,819,712,18.82,13.06 139 | inform-7,0.02,0.01,514,328,29.26,36.19 140 | ini,11.82,3.44,2537776,1187507,70.93,53.21 141 | inno-setup,0.05,0.02,6567,3227,64.54,50.86 142 | io,0.01,0.01,3523,2634,24.55,25.23 143 | ioke,0.0,0.0,537,374,50.96,30.35 144 | irc-log,0.01,0.01,105,100,0.09,4.76 145 | isabelle,0.14,0.09,7738,5086,34.85,34.27 146 | j,0.02,0.02,3321,3030,10.3,8.76 147 | jade,0.33,0.2,117328,84298,38.59,28.15 148 | jasmin,0.08,0.04,11534,6293,47.33,45.44 149 | java,294.72,102.29,42429211,20151565,65.29,52.51 150 | java-server-pages,2.37,1.17,407189,214133,50.45,47.41 151 | javascript,519.8,152.95,40112121,21108587,70.57,47.38 152 | jflex,0.04,0.01,5346,1662,70.14,68.91 153 | json,1385.98,350.67,80802152,17012912,74.7,78.94 154 | json5,0.15,0.07,8462,4652,51.15,45.02 155 | jsoniq,0.01,0.01,5604,3729,26.99,33.46 156 | jsonld,0.63,0.2,52263,10836,68.16,79.27 157 | jsx,5.73,3.12,1462062,970156,45.53,33.64 
158 | julia,3.64,1.69,480267,298672,53.56,37.81 159 | jupyter-notebook,188.84,155.85,1459463,1073534,17.47,26.44 160 | kicad,3.19,2.92,12744,11448,8.5,10.17 161 | kit,0.01,0.01,1730,1324,22.15,23.47 162 | kotlin,14.67,7.19,3782188,2242771,50.96,40.7 163 | krl,0.0,0.0,366,267,28.73,27.05 164 | labview,0.13,0.05,2879,1212,58.18,57.9 165 | lasso,0.25,0.05,6232,1089,77.59,82.53 166 | latte,0.02,0.02,8614,6429,30.65,25.37 167 | lean,0.8,0.11,51125,16891,86.57,66.96 168 | less,3.34,1.1,615585,344780,67.07,43.99 169 | lex,0.11,0.09,4254,3189,22.03,25.04 170 | lfe,0.0,0.0,1194,1017,28.33,14.82 171 | lilypond,0.04,0.04,8932,6454,19.78,27.74 172 | linker-script,0.37,0.06,53158,8474,82.77,84.06 173 | liquid,0.21,0.11,40826,24759,45.44,39.35 174 | literate-agda,0.02,0.01,1211,523,67.89,56.81 175 | literate-coffeescript,0.01,0.01,1316,1138,20.83,13.53 176 | literate-haskell,0.18,0.06,10613,6135,67.98,42.19 177 | livescript,0.06,0.04,12084,9265,33.01,23.33 178 | llvm,3.13,0.77,180863,56247,75.37,68.9 179 | logos,0.51,0.35,30033,19242,31.65,35.93 180 | logtalk,0.03,0.01,8419,2185,71.3,74.05 181 | lolcode,0.0,0.0,883,798,6.33,9.63 182 | lookml,0.01,0.01,635,448,34.8,29.45 183 | lsl,0.02,0.02,3314,2536,36.8,23.48 184 | lua,7.65,3.6,925895,558861,53.01,39.64 185 | m,0.0,0.0,134,128,3.04,4.48 186 | m4,1.18,0.09,57797,17902,91.99,69.03 187 | makefile,7.6,1.8,1483161,661424,76.29,55.4 188 | mako,0.15,0.03,15939,7476,79.97,53.1 189 | maple,0.04,0.02,3472,1259,60.55,63.74 190 | markdown,282.57,90.93,40751875,21045171,67.82,48.36 191 | mask,0.02,0.01,2129,1087,43.78,48.94 192 | mathematica,2.3,1.73,63791,26895,24.72,57.84 193 | matlab,0.06,0.04,1296,967,35.0,25.39 194 | max,0.68,0.54,14305,10476,21.33,26.77 195 | maxscript,0.01,0.01,753,531,31.04,29.48 196 | mediawiki,0.69,0.2,49427,15478,70.82,68.69 197 | metal,0.04,0.02,8711,4061,51.64,53.38 198 | mirah,0.31,0.04,17005,4043,87.02,76.22 199 | modelica,0.4,0.15,43297,23853,63.19,44.91 200 | module-management-system,0.01,0.0,640,302,66.34,52.81 201 | monkey,0.01,0.01,1939,1660,8.15,14.39 202 | moonscript,0.02,0.02,6159,5068,28.78,17.71 203 | mtml,0.0,0.0,601,462,30.01,23.13 204 | muf,0.0,0.0,684,490,10.67,28.36 205 | mupad,0.01,0.01,865,643,18.51,25.66 206 | myghty,0.0,0.0,13,11,12.78,15.38 207 | nesc,0.21,0.15,27205,15109,28.91,44.46 208 | netlinx,0.0,0.0,202,177,5.23,12.38 209 | netlogo,0.09,0.03,3678,931,66.42,74.69 210 | nginx,0.0,0.0,15,14,7.1,6.67 211 | nimrod,1.18,0.48,91048,51660,59.25,43.26 212 | ninja,0.28,0.07,11995,2130,73.5,82.24 213 | nit,0.0,0.0,121,110,18.16,9.09 214 | nix,5.91,0.42,456544,121964,92.87,73.29 215 | nsis,0.09,0.03,10359,3806,66.8,63.26 216 | nu,0.01,0.0,1372,877,51.95,36.08 217 | numpy,0.0,0.0,7,7,-0.34,0.0 218 | objdump,0.08,0.07,1407,676,16.63,51.95 219 | objective-cpp,2.13,0.64,167633,65034,70.02,61.2 220 | objective-j,0.01,0.01,520,395,29.09,24.04 221 | ocaml,4.04,1.19,354810,159734,70.57,54.98 222 | octave,0.0,0.0,215,203,8.22,5.58 223 | omgrofl,0.0,0.0,4,4,-0.6,0.0 224 | ooc,0.01,0.01,1771,1426,29.46,19.48 225 | opa,0.0,0.0,385,354,1.93,8.05 226 | opal,0.0,0.0,185,152,13.93,17.84 227 | opencl,0.68,0.14,59129,19729,79.13,66.63 228 | openscad,0.17,0.11,23888,20380,34.56,14.69 229 | org,0.51,0.39,57582,50013,22.97,13.14 230 | ox,0.0,0.0,414,192,56.71,53.62 231 | oxygene,0.0,0.0,63,31,49.54,50.79 232 | oz,0.01,0.01,1258,1179,6.24,6.28 233 | pan,0.1,0.02,8901,2672,81.9,69.98 234 | papyrus,0.06,0.05,19646,10358,18.87,47.28 235 | parrot,0.0,0.0,32,28,11.35,12.5 236 | parrot-assembly,0.0,0.0,799,384,50.74,51.94 237 | 
parrot-internal-representation,0.11,0.03,2158,1447,76.8,32.95 238 | pascal,3.21,1.77,176640,118675,44.84,32.82 239 | pawn,0.08,0.06,3091,2412,26.19,21.97 240 | perl,7.52,2.84,834305,392108,62.24,53.0 241 | perl6,0.16,0.04,15602,9782,72.54,37.3 242 | php,203.92,74.87,34851418,15904518,63.28,54.36 243 | piglatin,0.01,0.0,2237,1514,33.27,32.32 244 | pike,0.0,0.0,1067,1008,4.89,5.53 245 | pod,0.49,0.12,35697,11730,76.13,67.14 246 | pogoscript,0.0,0.0,254,226,9.72,11.02 247 | pony,0.04,0.02,5222,3529,42.68,32.42 248 | postscript,1.93,1.57,24065,16096,18.6,33.11 249 | pov-ray-sdl,0.04,0.02,2364,1067,39.4,54.86 250 | powershell,3.67,1.4,527898,271487,61.91,48.57 251 | processing,0.39,0.26,70860,55528,34.21,21.64 252 | prolog,0.03,0.01,1884,1023,59.49,45.7 253 | propeller-spin,0.04,0.02,2926,2120,43.73,27.55 254 | protocol-buffer,2.14,0.49,254672,98246,77.1,61.42 255 | public-key,0.06,0.06,55301,53841,1.82,2.64 256 | pure-data,0.16,0.11,18689,14569,29.47,22.05 257 | purebasic,0.12,0.05,22424,13001,58.84,42.02 258 | purescript,0.23,0.16,42903,32331,31.92,24.64 259 | python,213.56,70.87,24214270,12962249,66.81,46.47 260 | python-traceback,0.0,0.0,13,9,12.04,30.77 261 | qmake,0.05,0.02,14696,9100,61.21,38.08 262 | qml,0.52,0.22,82710,45565,58.35,44.91 263 | r,0.41,0.32,51877,39194,20.7,24.45 264 | racket,0.07,0.04,5553,4201,44.36,24.35 265 | ragel-in-ruby-host,0.03,0.01,2891,1610,59.21,44.31 266 | raml,0.1,0.05,17876,11862,51.98,33.64 267 | raw-token-data,1.05,0.74,51067,36913,28.91,27.72 268 | rdoc,0.08,0.03,14958,9535,62.98,36.25 269 | realbasic,0.01,0.01,989,800,15.03,19.11 270 | rebol,0.01,0.01,2023,1388,44.66,31.39 271 | red,0.08,0.04,5725,3291,55.07,42.52 272 | redcode,0.0,0.0,588,495,12.02,15.82 273 | renderscript,0.01,0.0,1756,822,53.95,53.19 274 | renpy,0.16,0.12,6333,4529,24.63,28.49 275 | restructuredtext,13.54,4.06,1760914,905679,69.99,48.57 276 | rhtml,0.02,0.01,8676,6812,21.7,21.48 277 | rmarkdown,0.08,0.06,6572,5389,20.0,18.0 278 | robotframework,0.15,0.06,24996,11844,62.24,52.62 279 | rouge,0.02,0.0,6276,614,86.99,90.22 280 | ruby,29.76,8.9,7205146,3405374,70.08,52.74 281 | rust,41.35,10.2,3057230,1386585,75.33,54.65 282 | sage,0.02,0.02,2174,1857,14.77,14.58 283 | saltstack,0.09,0.05,40179,27071,41.87,32.62 284 | sas,0.34,0.14,15258,9772,60.85,35.95 285 | sass,0.35,0.23,132724,101656,34.73,23.41 286 | scala,17.34,5.71,2787552,1362426,67.07,51.12 287 | scaml,0.0,0.0,149,88,55.96,40.94 288 | scheme,0.63,0.32,64946,44261,49.12,31.85 289 | scilab,0.02,0.01,3840,2937,45.89,23.52 290 | scss,12.06,5.48,3384703,2094964,54.62,38.1 291 | self,0.0,0.0,111,82,32.18,26.13 292 | shell,10.96,4.47,3769888,2236434,59.21,40.68 293 | shellsession,0.0,0.0,12,9,10.74,25.0 294 | shen,0.0,0.0,421,298,36.38,29.22 295 | slash,0.07,0.04,15825,6960,49.84,56.02 296 | slim,0.12,0.08,64833,45887,37.86,29.22 297 | smali,4.45,2.71,403309,192445,39.08,52.28 298 | smalltalk,1.76,1.12,774045,592999,36.8,23.39 299 | smarty,1.18,0.53,256481,137110,55.15,46.54 300 | smt,8.03,0.29,144547,14877,96.41,89.71 301 | solidity,3.76,1.32,388997,164242,64.99,57.78 302 | sourcepawn,0.2,0.15,9570,6708,27.41,29.91 303 | sparql,0.07,0.05,24511,14173,27.62,42.18 304 | sqf,0.26,0.14,54949,34892,47.93,36.5 305 | sql,21.22,13.06,1386738,994019,38.46,28.32 306 | squirrel,0.06,0.04,7490,4956,30.58,33.83 307 | stan,0.2,0.02,14017,5441,91.68,61.18 308 | standard-ml,1.23,0.54,68834,48995,55.76,28.82 309 | stata,0.6,0.42,45607,31282,29.61,31.41 310 | ston,0.0,0.0,2087,946,38.81,54.67 311 | stylus,0.31,0.18,116347,81646,41.1,29.83 312 | 
supercollider,0.03,0.02,3535,2529,39.49,28.46 313 | svg,118.55,77.17,6411336,3267524,34.9,49.04 314 | swift,15.26,7.13,2941299,1756144,53.29,40.29 315 | systemverilog,1.18,0.44,100755,46915,63.16,53.44 316 | tcl,1.13,0.43,109545,50579,61.87,53.83 317 | tcsh,0.08,0.03,20547,4911,67.41,76.1 318 | tea,0.05,0.04,1292,1012,22.35,21.67 319 | tex,8.67,5.82,705363,547888,32.89,22.33 320 | text,363.02,266.01,21630263,15267582,26.72,29.42 321 | textile,0.08,0.04,10965,8196,54.12,25.25 322 | thrift,0.09,0.02,11730,4663,82.56,60.25 323 | toml,2.3,0.88,994820,417483,61.77,58.03 324 | turing,0.01,0.01,375,292,26.97,22.13 325 | turtle,4.55,2.64,219880,96432,41.9,56.14 326 | twig,2.13,1.25,575087,342973,41.45,40.36 327 | txl,0.0,0.0,180,132,3.9,26.67 328 | typescript,143.21,34.5,19589267,10637070,75.91,45.7 329 | unified-parallel-c,0.0,0.0,241,194,10.07,19.5 330 | unity3d-asset,25.1,7.87,9834153,928191,68.63,90.56 331 | uno,0.02,0.01,6159,3817,47.9,38.03 332 | unrealscript,0.12,0.07,13293,10221,42.61,23.11 333 | urweb,0.01,0.01,1341,1111,22.21,17.15 334 | vala,0.04,0.03,5837,4480,32.05,23.25 335 | vcl,0.01,0.0,1848,1152,47.73,37.66 336 | verilog,0.01,0.0,1831,77,93.36,95.79 337 | vhdl,3.14,1.15,119386,60027,63.45,49.72 338 | viml,1.39,0.36,171857,96484,73.77,43.86 339 | visual-basic,4.05,1.59,398672,163291,60.81,59.04 340 | volt,0.03,0.01,5821,3493,54.27,39.99 341 | vue,15.09,8.0,2822678,1556867,46.99,44.84 342 | web-ontology-language,2.21,0.98,42906,10784,55.58,74.87 343 | webassembly,0.62,0.1,15489,5359,83.41,65.4 344 | webidl,0.01,0.01,4674,2016,57.51,56.87 345 | wisp,0.0,0.0,160,124,38.45,22.5 346 | x10,0.0,0.0,458,373,16.2,18.56 347 | xbase,0.13,0.06,12621,6989,49.56,44.62 348 | xc,0.01,0.0,1519,462,70.28,69.59 349 | xml,309.24,118.59,23441154,6267525,61.65,73.26 350 | xojo,0.04,0.03,3791,2305,30.98,39.2 351 | xpages,0.0,0.0,939,179,45.23,80.94 352 | xproc,0.01,0.01,1791,993,37.08,44.56 353 | xquery,0.1,0.05,38949,19713,49.79,49.39 354 | xs,0.14,0.02,4049,1683,83.86,58.43 355 | xslt,1.92,0.59,101092,43095,69.46,57.37 356 | xtend,0.08,0.05,15151,9844,35.7,35.03 357 | yacc,1.32,0.42,109233,25775,68.27,76.4 358 | yaml,98.91,31.38,13439939,5282081,68.27,60.7 359 | yang,1.86,0.14,55459,9653,92.5,82.59 360 | zephir,0.06,0.01,8701,2361,83.2,72.87 361 | zig,1.18,0.19,39894,15913,84.11,60.11 362 | zimpl,0.0,0.0,356,308,4.55,13.48 -------------------------------------------------------------------------------- /data_analysis/notebooks/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import numpy as np 4 | 5 | def get_size(text): 6 | # size of a string in bytes 7 | return len(text.encode('utf-8')) 8 | 9 | def add_size(sample): 10 | sample["size"] = get_size(sample["content"]) 11 | return sample 12 | 13 | def sample_eval_losses(model_all_license, model_safe_license, tokenizer_all, tokenizer_safe, ds, n=2000, device="cuda"): 14 | """ compute losses on the first n samples for both models""" 15 | losses_all = [] 16 | losses_safe = [] 17 | model_all_license.to(device) 18 | model_safe_license.to(device) 19 | for i in tqdm(range(n)): 20 | with torch.no_grad(): 21 | tokens_all = torch.tensor(tokenizer_all(ds[i]["content"], truncation=True)['input_ids']) 22 | tokens_safe = torch.tensor(tokenizer_safe(ds[i]["content"], truncation=True)['input_ids']) 23 | 24 | outputs = model_all_license(tokens_all.to(device), labels=tokens_all.to(device)) 25 | losses_all.append(outputs.loss.item()) 26 | outputs = 
model_safe_license(tokens_safe.to(device), labels=tokens_safe.to(device)) 27 | losses_safe.append(outputs.loss.item()) 28 | 29 | return losses_all, losses_safe 30 | 31 | 32 | def get_embeddings(model, tokenizer, ds, n=200, device="cuda"): 33 | """get embeddings of n files from the iterable dataset ds 34 | as the average of token embeddings of the file""" 35 | embeddings = [] 36 | model.to(device) 37 | for i, example in tqdm(enumerate(ds)): 38 | with torch.no_grad(): 39 | inputs = torch.tensor(tokenizer(example["content"], truncation=True)['input_ids']) 40 | outputs = model(inputs.to(device), labels=inputs.to(device), output_hidden_states=True) 41 | embeddings.append(np.mean(outputs.hidden_states[-1].detach().cpu().numpy(),axis=0)) 42 | if i == n - 1: 43 | break 44 | return np.array(embeddings) 45 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/code_compilation/README.md: -------------------------------------------------------------------------------- 1 | # Code compilation 2 | Here we provide code to estimate the number of valid Python files by using the `py_compile` module on some samples from a code dataset. We try to compile files for both python2 and python3 and count how many throw syntax errors. 3 | 4 | You can execute the code using: 5 | ```bash 6 | python compile_py_files.py --dataset_name --n_samples --seed 7 | ``` 8 | where `dataset_name` is the name of the dataset you want to analyze, `n_samples` is the number of samples to use, and `seed` is the seed for the random shuffling. 9 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/code_compilation/compile_py_files.py: -------------------------------------------------------------------------------- 1 | 2 | import tempfile 3 | import subprocess 4 | from tqdm import tqdm 5 | import argparse 6 | from datasets import load_dataset 7 | 8 | 9 | def parseArgs(): 10 | parser = argparse.ArgumentParser( 11 | description="Code compilation" 12 | ) 13 | parser.add_argument( 14 | "--dataset_name", 15 | default="bigcode/python_permissive", 16 | type=str, 17 | help="HF repo name/path of the dataset.", 18 | ) 19 | parser.add_argument( 20 | "--n_samples", 21 | default=10_000, 22 | type=int, 23 | help="Number of samples in the subset to analyze", 24 | ) 25 | parser.add_argument( 26 | "--seed", 27 | default=0, 28 | type=int, 29 | help="Seed", 30 | ) 31 | return parser.parse_args() 32 | 33 | 34 | def compile_python_code(sample): 35 | string = sample["content"] 36 | tmp = tempfile.NamedTemporaryFile() 37 | with open(tmp.name, "w") as f: 38 | f.write(string) 39 | py_command = "python{v} -m py_compile " + tmp.name 40 | 41 | try: 42 | subprocess.check_call(py_command.format(v=3).split()) 43 | python3_works = True 44 | except subprocess.CalledProcessError: 45 | python3_works = False 46 | 47 | try: 48 | subprocess.check_call(py_command.format(v=2).split()) 49 | python2_works = True 50 | except subprocess.CalledProcessError: 51 | python2_works = False 52 | 53 | return python2_works or python3_works 54 | 55 | 56 | if __name__ == '__main__': 57 | args = parseArgs() 58 | 59 | print(f"Loading {args.n_samples} samples from {args.dataset_name} dataset") 60 | data = load_dataset(args.dataset_name, streaming=True, split="train", use_auth_token=True) 61 | subset = list(data.shuffle(seed=args.seed).take(args.n_samples)) 62 | 63 | valid_files = 0 64 | for i in tqdm(range(len(subset))): 65 | if compile_python_code(subset[i]): 66 |
valid_files += 1 67 | 68 | print(f"Number of valid python files in {args.n_samples} random samples: {valid_files}") 69 | print(f"Percentage of non-valid files: {(len(subset) - valid_files) * 100 / len(subset)}%") -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/code_compilation/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/config_test_estimation/README.md: -------------------------------------------------------------------------------- 1 | # Config and test files detection 2 | 3 | Here we provide the code to detect and estimate the number of configuration and test files in a code dataset. 4 | 5 | You can execute the code using: 6 | ```bash 7 | python config_test.py --dataset_name --split 8 | ``` 9 | where `dataset_name` is the name of the dataset you want to analyze and `split` is the dataset split. -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/config_test_estimation/config_test.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | from datasets import load_dataset 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | 8 | def parseArgs(): 9 | parser = argparse.ArgumentParser(description="Config and test files detection") 10 | parser.add_argument( 11 | "--dataset_name", 12 | default="bigcode/python_permissive", 13 | type=str, 14 | help="HF repo name/path of the dataset.", 15 | ) 16 | parser.add_argument( 17 | "--num_workers", 18 | default=96, 19 | type=int, 20 | help="Number of workers for multiprocessing", 21 | ) 22 | parser.add_argument( 23 | "--split", 24 | default="train", 25 | type=str, 26 | help="Dataset split to process", 27 | ) 28 | parser.add_argument( 29 | "--push_to_hub", 30 | action="store_true", 31 | help="Push the dataset to the Hub", 32 | ) 33 | return parser.parse_args() 34 | 35 | 36 | def is_config_or_test(example, scan_width=5, coeff=0.05): 37 | """Check if file is a configuration file or a unit test by: 38 | 1- looking for keywords in the first few lines of the file. 39 | 2- counting the number of occurrences of the words 'config' and 'test' with respect to number of lines.
40 | """ 41 | 42 | keywords = ["unit tests", "test file", "configuration file"] 43 | lines = example["content"].splitlines() 44 | count_config = 0 45 | count_test = 0 46 | # first test 47 | for _, line in zip(range(scan_width), lines): 48 | for keyword in keywords: 49 | if keyword in line.lower(): 50 | return {"config_or_test": True} 51 | # second test 52 | nlines = example["content"].count("\n") 53 | threshold = int(coeff * nlines) 54 | for line in lines: 55 | count_config += line.lower().count("config") 56 | count_test += line.lower().count("test") 57 | if count_config > threshold or count_test > threshold: 58 | return {"config_or_test": True} 59 | return {"config_or_test": False} 60 | 61 | 62 | def preprocess(example): 63 | results = dict() 64 | results.update(is_config_or_test(example)) 65 | return results 66 | 67 | 68 | def filter(example): 69 | """Filter files that are config or test files""" 70 | if example["config_or_test"]: 71 | return False 72 | return True 73 | 74 | 75 | args = parseArgs() 76 | 77 | # Load dataset 78 | t_start = time.time() 79 | print(f"Loading dataset {args.dataset_name}") 80 | dataset = load_dataset(args.dataset_name, split=args.split) 81 | # dataset = load_dataset("bigcode/the-stack", data_files = ["data/python/*"], split="train", use_auth_token=True, chunksize=40<<20) 82 | print(f"Time to load dataset: {time.time()-t_start:.2f}") 83 | 84 | # Run preprocessing 85 | t_start = time.time() 86 | ds = dataset.map(preprocess, num_proc=args.num_workers) 87 | print(f"Time to preprocess dataset: {time.time()-t_start:.2f}") 88 | print(ds) 89 | 90 | t_start = time.time() 91 | old_size = len(ds) 92 | ds = ds.filter(filter) 93 | print(f"Time to filter dataset: {time.time()-t_start:.2f}") 94 | print(f"\nSize of original dataset: {old_size}") 95 | print(f"Size of filtered dataset: {len(ds)}") 96 | print( 97 | f"\nPercentage of removed files: {np.round((old_size - len(ds))*100/old_size, 2)}%" 98 | ) 99 | 100 | print("\nCounting size in Gb of the new datase") 101 | new_size, old_size = 0, 0 102 | for i in tqdm(range(len(ds))): 103 | new_size += len(ds[i]["content"]) 104 | 105 | for i in tqdm(range(len(dataset))): 106 | old_size += len(dataset[i]["content"]) 107 | 108 | print(f"current size in Gb is {np.round(new_size/10**9), 4}") 109 | print(f"old size in Gb is {np.round(old_size/10**9, 4)}") 110 | print(f"volume removed: {np.round((old_size-new_size)*100/new_size, 2)}%") 111 | 112 | if args.push_to_hub: 113 | ds.push_to_hub("no_conf_test_ds") 114 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/README.md: -------------------------------------------------------------------------------- 1 | # Natural Language identification in Python code 2 | 3 | In this folder, we provide code to extract Python docstrings and comment and identify their natural language. 4 | 5 | # Setup 6 | We use `fasttext` for language identification, download the language detection model `lid.176.bin` from [fasttext.cc/docs/en/language-identification](https://fasttext.cc/docs/en/language-identification.html) and seve it in `fastext_model`folder. You need to install `fastext` and `datasets` libraries. 
7 | 8 | ``` 9 | pip install fasttext 10 | pip install datasets 11 | ``` 12 | 13 | # Usage 14 | The command below saves a dataset with additional columns giving the language of each file, the score/confidence of the model in the prediction, and the extracted natural text and its size: 15 | ```bash 16 | python language_identifier.py \ 17 | --dataset_name \ 18 | --model_path fasttext_model/lid.176.bin \ 19 | --save_path ./data/ 20 | ``` 21 | # Analysis 22 | 23 | See the notebook `analysis.ipynb`; a small sketch for recomputing the language distribution from the saved dataset is shown below. 24 | 25 | Detected language distribution on 2,000 samples from CodeParrot data: 26 |

27 | 28 |
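The distribution plots that originally followed the caption above are not included in this dump. As a rough, hypothetical illustration (not part of the repository's scripts; the `./data/` path and the `train` split are assumptions taken from the usage example above), the detected-language distribution can be recomputed from the dataset saved by `language_identifier.py`:

```python
from collections import Counter

from datasets import load_from_disk

# Load the dataset written by language_identifier.py (the --save_path used above).
ds = load_from_disk("./data/")["train"]

# Count the detected natural language of each file and print the ten most common.
counts = Counter(ds["nl_language"])
for lang, n in counts.most_common(10):
    print(f"{lang}: {n} files ({100 * n / len(ds):.1f}%)")
```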

29 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/fasttext_model/README.md: -------------------------------------------------------------------------------- 1 | Download the language detection model lid.176.bin from [fasttext.cc/docs/en/language-identification](https://fasttext.cc/docs/en/language-identification.html) 2 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/language_identifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import pathlib 4 | import fasttext 5 | 6 | from datasets import load_dataset 7 | 8 | from text_extraction import get_text 9 | 10 | #adapted from: https://github.com/bigscience-workshop/data-preparation/blob/main/sourcing/ 11 | # cc_pseudo_crawl/language_annotation/python_scripts/annotate_langid_crawl.py 12 | 13 | COLUMN = "content" 14 | 15 | def parseArgs(): 16 | parser = argparse.ArgumentParser( 17 | description="Identify natural languages in code" 18 | ) 19 | parser.add_argument( 20 | "dataset_name", 21 | type=str, 22 | help="HF repo name/path of the dataset.", 23 | ) 24 | parser.add_argument( 25 | "save_path", 26 | default="./data_with_language/", 27 | type=str, 28 | help="Path to save the new dataset with language column.", 29 | ) 30 | parser.add_argument( 31 | "model_path", 32 | default= "fasttext_model/lid.176.bin", 33 | type=str, 34 | help="Path to fasttext model.", 35 | ) 36 | args = parser.parse_args() 37 | return args 38 | 39 | def load_fasttext_model(path_fasttext_model): 40 | return fasttext.load_model(path_fasttext_model) 41 | 42 | 43 | def get_fasttext_info(line, model_lang_id): 44 | """The line should be in lower case and without \n in it.""" 45 | pred = model_lang_id.predict(line) 46 | lang_pred_fasttext_id = pred[0][0].replace("__label__", "") 47 | score_pred = pred[1][0] 48 | return lang_pred_fasttext_id, score_pred 49 | 50 | 51 | def get_all_fasttext_info(document, model_lang_id): 52 | document = document.lower() 53 | lang_pred_fasttext_id, score_pred = get_fasttext_info( 54 | document.replace("\n", " "), model_lang_id 55 | ) 56 | info = { 57 | "lang_pred_fasttext_id": lang_pred_fasttext_id, 58 | "score_pred": score_pred, 59 | "on_lines": [ 60 | { 61 | "id_line": id_line, 62 | "number_caracters_line": len(line), 63 | "lang_pred_fasttext_id_line": result_fasttext_line[0], 64 | "score_pred_line": result_fasttext_line[1], 65 | } 66 | for id_line, line in enumerate(document.split("\n")) 67 | for result_fasttext_line in [get_fasttext_info(line, model_lang_id)] 68 | ], 69 | } 70 | return info 71 | 72 | 73 | def extract_nl_text(example): 74 | text = get_text(example[COLUMN]) 75 | example["nl_text"] = text 76 | example["nl_size"] = len(text) 77 | return example 78 | 79 | 80 | class FunctionDatasetModifyingDocuments: 81 | def __init__(self, path_fasttext_model): 82 | self.path_fasttext_model = path_fasttext_model 83 | self.model_lang_id = load_fasttext_model(path_fasttext_model) 84 | 85 | def __call__(self, example): 86 | fasttext_pred = get_all_fasttext_info( 87 | example["nl_text"], self.model_lang_id 88 | ) 89 | example["nl_language"] = fasttext_pred["lang_pred_fasttext_id"] 90 | example["nl_language_score"] = fasttext_pred["score_pred"] 91 | return example 92 | 93 | def __reduce__(self): 94 | return (self.__class__, (self.path_fasttext_model,)) 95 | 96 | 97 | def 
main(): 98 | args = parseArgs() 99 | 100 | dataset = load_dataset(args.dataset_name) 101 | print("Loading dataset done") 102 | 103 | func_dataset_modifying_documents = FunctionDatasetModifyingDocuments( 104 | args.model_path 105 | ) 106 | 107 | dataset = dataset.map(extract_nl_text, num_proc=multiprocessing.cpu_count()) 108 | 109 | # Could be improved by allowing multiprocessing with map (currently doesn't work) 110 | dataset = dataset.map( 111 | func_dataset_modifying_documents, num_proc=1 112 | ) # num_proc=cpu_count() 113 | print("Fasttext done") 114 | 115 | pathlib.Path(args.save_path).mkdir(parents=True, exist_ok=True) 116 | dataset.save_to_disk(args.save_path) 117 | print("Shard successfully saved") -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.4.0 2 | fasttext==0.9.2 -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/text_extraction.py: -------------------------------------------------------------------------------- 1 | """Extract Python comments (using Python tokenizer) and docstrings (using AST parsing).""" 2 | 3 | import io 4 | from itertools import groupby 5 | from os.path import basename, splitext 6 | import ast 7 | import tokenize 8 | import warnings 9 | 10 | StringIO = io.StringIO 11 | 12 | NODE_TYPES = { 13 | ast.ClassDef: 'Class', 14 | ast.FunctionDef: 'Function/Method', 15 | ast.Module: 'Module' 16 | } 17 | 18 | # comment extraction 19 | def get_comments(s, clean=False): 20 | "Returns a string including all comments" 21 | comments = [] 22 | g = tokenize.generate_tokens(StringIO(s).readline) 23 | for toknum, tokval, _, _, _ in g: 24 | # print(toknum,tokval) 25 | if toknum == tokenize.COMMENT: 26 | comments.append((toknum, tokval)) 27 | result = tokenize.untokenize(comments) 28 | if clean: 29 | result = result.replace('#', '') 30 | return result 31 | 32 | # TODO: check that extraction works well (with decorators over classes) 33 | # ast parsing, source: https://gist.github.com/SpotlightKid/1548cb6c97f2a844f72d 34 | def parse_docstrings(source): 35 | """Parse Python source code and yield a tuple of ast node instance, name, 36 | and docstring for each function/method, class and module.""" 37 | tree = ast.parse(source) 38 | 39 | for node in ast.walk(tree): 40 | if isinstance(node, tuple(NODE_TYPES)): 41 | docstring = ast.get_docstring(node) 42 | 43 | yield (node, getattr(node, 'name', None), docstring) 44 | 45 | def get_docstrings(source, module=''): 46 | """Parse Python source code from file or string and return docstrings.""" 47 | if hasattr(source, 'read'): 48 | filename = getattr(source, 'name', module) 49 | module = splitext(basename(filename))[0] 50 | source = source.read() 51 | 52 | docstrings = sorted(parse_docstrings(source), 53 | key=lambda x: (NODE_TYPES.get(type(x[0])), x[1])) 54 | 55 | grouped = groupby(docstrings, key=lambda x: NODE_TYPES.get(type(x[0]))) 56 | results = [] 57 | for _, group in grouped: 58 | for _, name, docstring in group: 59 | name = name if name else module 60 | #print(docstring or '') 61 | if docstring: 62 | results.append(docstring) 63 | return results 64 | 65 | def get_text(source, comments=True, clean_comments=True): 66 | """Extract all natural text in source: comments + docstrings 67 | the extraction fails in case of syntax errors in the file 68 | Args: 69
| source: the code to parse 70 | comments: if True extract comments too 71 | clean_comments: if True remove # from extracted comments 72 | Returns: 73 | a string with concatenated docstrings and comments""" 74 | 75 | try: 76 | docstrings = '\n'.join(get_docstrings(source)) 77 | except : 78 | docstrings = '' 79 | warnings.warn("code couldn't be parsed due to compilation failure, no docstring is extracted") 80 | 81 | if comments: 82 | try: 83 | comments = get_comments(source, clean=clean_comments) 84 | except : 85 | comments = '' 86 | warnings.warn("tokenization error, no comment is extracted") 87 | else: 88 | comments = '' 89 | 90 | output = docstrings + "\n\n" + comments 91 | return output.strip() -------------------------------------------------------------------------------- /data_analysis/stars_filtering/README.md: -------------------------------------------------------------------------------- 1 | # Filtering based on number of stars 2 | 3 | Here we are interested in filtering files based on their number of stars (i.e. the number of stars of their parent repositories). 4 | 5 | You can find clean filtering code in the `bigcode-dataset` repository under [preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing). 6 | * `stars_analysis.ipynb` contains the code for the analysis of the stars filter, used to come up with a minimum threshold of 5 stars for the Python, Java and JavaScript subsets of [The Stack](https://huggingface.co/datasets/bigcode/the-stack). -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | ## Evaluation analysis 2 | -------------------------------------------------------------------------------- /multi_query_experiments/README.md: -------------------------------------------------------------------------------- 1 | # Profiling of multi head vs multi query attention separately 2 | - `attention_types_imp.py` contains simplistic implementations of different attention layers without normalization, masks and softmax, just matrix multiplications and rearranging of tensors (a minimal sketch of the multi-query idea is shown after this list): 3 | - `MultiHead` is a multi head variant closely following the implementation in Hugging Face. 4 | - `MultiQuery` is a multi query variant with dimension order of hidden states as in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) `[sl, bs]`. The reordering of the tensors avoids explicit copies here, however, `bmm` subsequently makes internal copies and speed suffers. TODO: try with separate tensors for `q`, `k` and `v`. 5 | - `MultiQuery1` uses the same hidden states order as in HF and one explicit `reshape`. It is the fastest and is currently ported to HF transformers. 6 | - `profile_attention_types.py` contains code to run timing experiments. Results are in `profile_attention_types.json`. 7 | - `profile_attention_types_visualise.ipynb` contains graphs. 8 | - There is some uncertainty about the accuracy of the profiler timings. CPU times, though, decrease slightly in proportion, but still remain significant even for bigger tensors. Around 33% for sequence length of ~2K. However, `MultiQuery1` is the fastest and is ported to HF transformers.
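To make the difference between the variants concrete, here is a minimal, self-contained sketch of the multi-query idea: all query heads share a single key/value head. This is an illustration only, not the repository's `MultiQuery`/`MultiQuery1` implementations; unlike the classes in `attention_types_imp.py` it includes the softmax, and the function and weight names are invented for the example.

```python
import torch

def multi_query_attention(hidden_state, w_q, w_kv, nh, hs):
    """Toy multi-query attention: nh query heads share one key/value head.

    hidden_state: [bs, sl, nh * hs], w_q: [nh * hs, nh * hs], w_kv: [nh * hs, 2 * hs]
    """
    bs, sl, _ = hidden_state.shape
    # Queries keep nh separate heads: [bs, nh, sl, hs].
    q = torch.matmul(hidden_state, w_q).view(bs, sl, nh, hs).permute(0, 2, 1, 3)
    # Keys and values are projected to a single head: [bs, sl, hs] each.
    k, v = torch.matmul(hidden_state, w_kv).split(hs, dim=2)
    # Broadcast the single key/value head across all nh query heads.
    scores = torch.matmul(q, k.transpose(1, 2).unsqueeze(1)) / hs ** 0.5  # [bs, nh, sl, sl]
    probs = torch.softmax(scores, dim=-1)
    out = torch.matmul(probs, v.unsqueeze(1))  # [bs, nh, sl, hs]
    return out.permute(0, 2, 1, 3).reshape(bs, sl, nh * hs)

if __name__ == "__main__":
    bs, sl, nh, hs = 2, 16, 4, 8
    x = torch.randn(bs, sl, nh * hs)
    out = multi_query_attention(x, torch.randn(nh * hs, nh * hs), torch.randn(nh * hs, 2 * hs), nh, hs)
    print(out.shape)  # torch.Size([2, 16, 32])
```

Because `k` and `v` carry no head dimension, the key/value projections (and the key/value cache during generation) are `nh` times smaller than in the multi-head variant, which is where the memory and speed benefit comes from.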
9 | 10 | # Profiling of multi-head vs multi-query attention in HF transformers 11 | 12 | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/bigcode-project/bigcode-analysis/multi_query_experiments/profile_mqa.ipynb) 13 | 14 | - The implementation of multi-query attention currently lives in a custom fork of `transformers`: [here](https://github.com/bigcode-project/transformers/tree/multi_query) 15 | - `profile_hf_generate.py` contains the experiments. 16 | - There are two implementation variants of multi-query attention, controlled by the `attention_type` parameter: 17 | - `AttentionType.MULTI_QUERY` with minimal changes to the code. 18 | - `AttentionType.MULTI_QUERY_1` with some reordering of dimensions from explorations with @harm-devries, and `bmm` instead of `matmul`, similar to `MultiQuery1`. 19 | - `AttentionType.MULTI_QUERY_1` is the fastest, with around a 24% speedup: 20 | ``` 21 | -------------------- attention_type == AttentionType.MULTI_QUERY--------------------- 22 | {'get_test_batch': 5.9604644775390625e-05, 'generate_text_batch': 18.453815460205078, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': False, 'pad_token_id': 50256, 'dtype': torch.int64, 'device': device(type='cuda'), 'cuda_device_name': 'Tesla V100-PCIE-16GB-LS'} 23 | -------------------- attention_type == AttentionType.MULTI_QUERY_1--------------------- 24 | {'get_test_batch': 4.172325134277344e-05, 'generate_text_batch': 15.190143346786499, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': False, 'pad_token_id': 50256, 'dtype': torch.int64, 'device': device(type='cuda'), 'cuda_device_name': 'Tesla V100-PCIE-16GB-LS'} 25 | -------------------- attention_type == AttentionType.MULTI_HEAD--------------------- 26 | {'get_test_batch': 5.459785461425781e-05, 'generate_text_batch': 19.78107237815857, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': False, 'pad_token_id': 50256, 'dtype': torch.int64, 'device': device(type='cuda'), 'cuda_device_name': 'Tesla V100-PCIE-16GB-LS'} 27 | ``` 28 | -------------------------------------------------------------------------------- /multi_query_experiments/attention_types_imp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class MultiHead: 4 | ''' 5 | bs = batch size 6 | sl = sequence length 7 | nh = number of heads 8 | hs = head size 9 | nm = number of embeddings = nh * hs 10 | ''' 11 | 12 | @classmethod 13 | def allocate_data(cls, bs, sl, nh, hs, print_shapes): 14 | nm = nh * hs 15 | hidden_state = torch.randn(bs, sl, nm, device=torch.device('cuda')) 16 | c_attn_w = torch.randn(nm, 3*nm, device=torch.device('cuda')) 17 | i0 = None 18 | i1 = None 19 | i2 = None 20 | if print_shapes: 21 | print('hidden_state', hidden_state.shape) 22 | print('c_attn_w', c_attn_w.shape) 23 | return hidden_state, c_attn_w, i0, i1, i2 24 | 25 | @classmethod 26 | def get_qkv(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 27 | return torch.matmul( 28 | hidden_state.view(bs * sl, nh * hs), 29 | c_attn_w 30 | ).view(bs, sl, -1) 31 | 32 | @classmethod 33 | def split_qkv(cls, qkv, bs, sl, nh, hs, print_shapes): 34 | q, k, v = qkv.split(nh*hs, dim=2) 35 | 36 | if print_shapes: 37 | print('q', q.shape) 38 | print('k', k.shape) 39 | print('v', v.shape) 40 | 41 | q = q.view(bs, sl, nh, hs).permute(0, 2, 1, 3) 42 | k =
k.view(bs, sl, nh, hs).permute(0, 2, 3, 1) 43 | v = v.view(bs, sl, nh, hs).permute(0, 2, 1, 3) 44 | 45 | if print_shapes: 46 | print('q', q.shape) 47 | print('k', k.shape) 48 | print('v', v.shape) 49 | 50 | return q, k, v 51 | 52 | @classmethod 53 | def get_attention_weights(cls, q, k, i1, bs, sl, nh, hs, print_shapes): 54 | attention_weights = torch.matmul(q, k) 55 | return attention_weights 56 | 57 | @classmethod 58 | def get_attention_output(cls, attention_weights, v, i2, bs, sl, nh, hs, print_shapes): 59 | attn_output = torch.matmul(attention_weights, v) 60 | if print_shapes: 61 | print('attn_output', attn_output.shape) 62 | #attn_output = attn_output.view( 63 | # bs, nh, sl, hs).permute(0, 2, 1, 3) 64 | return attn_output 65 | 66 | 67 | class MultiQuery: 68 | ''' 69 | bs = batch size 70 | sl = sequence length 71 | nh = number of heads 72 | hs = head size 73 | nm = number of embeddings = nh * hs 74 | ''' 75 | 76 | @classmethod 77 | def allocate_data(cls, bs, sl, nh, hs, print_shapes): 78 | nm = nh * hs 79 | hidden_state = torch.randn(sl, bs, nm, device=torch.device('cuda')) 80 | c_attn_w = torch.randn((nh + 2) * hs, nm, device=torch.device('cuda')) 81 | i0 = torch.zeros((nh + 2) * hs, sl * bs, device=torch.device('cuda')) 82 | i1 = torch.zeros(bs, sl * nh, sl, device=torch.device('cuda')) 83 | i2 = torch.zeros(bs, sl * nh, hs, device=torch.device('cuda')) 84 | if print_shapes: 85 | print('hidden_state', hidden_state.shape) 86 | print('c_attn_w', c_attn_w.shape) 87 | print('i0', i0.shape) 88 | print('i1', i1.shape) 89 | print('i2', i2.shape) 90 | return hidden_state, c_attn_w, i0, i1, i2 91 | 92 | @classmethod 93 | def get_qkv1(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 94 | return torch.addmm( 95 | i0, 96 | c_attn_w, 97 | hidden_state.transpose(0, 1) 98 | ) 99 | 100 | @classmethod 101 | def get_qkv(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 102 | return torch.matmul( 103 | c_attn_w, 104 | hidden_state.view(sl * bs, nh * hs).transpose(0, 1) 105 | ) 106 | 107 | @classmethod 108 | def split_qkv(cls, qkv, bs, sl, nh, hs, print_shapes): 109 | q, k, v = qkv.split((nh*hs, hs, hs), dim=0) 110 | 111 | if print_shapes: 112 | print('q', q.shape) 113 | print('k', k.shape) 114 | print('v', v.shape) 115 | 116 | q = q.view(hs, nh, sl, bs 117 | ).permute(3, 1, 2, 0).view(bs, sl*nh, hs) 118 | k = k.view(hs, sl, bs).permute(2, 0, 1) 119 | v = v.view(hs, sl, bs).permute(2, 1, 0) 120 | 121 | if print_shapes: 122 | print('q', q.shape) 123 | print('k', k.shape) 124 | print('v', v.shape) 125 | 126 | return q, k, v 127 | 128 | @classmethod 129 | def get_attention_weights(cls, q, k, i1, bs, sl, nh, hs, print_shapes): 130 | return torch.baddbmm(i1, q, k) 131 | 132 | @classmethod 133 | def get_attention_output(cls, attention_weights, v, i2, bs, sl, nh, hs, print_shapes): 134 | attn_output = torch.baddbmm(i2, attention_weights, v) 135 | if print_shapes: 136 | print('attn_output', attn_output.shape) 137 | #attn_output = attn_output.view( 138 | # bs, sl, nh, hs).permute(1, 0, 2, 3).view(sl, bs, nh * hs) 139 | return attn_output 140 | 141 | 142 | class MultiQuery1: 143 | ''' 144 | bs = batch size 145 | sl = sequence length 146 | nh = number of heads 147 | hs = head size 148 | nm = number of embeddings = nh * hs 149 | ''' 150 | 151 | @classmethod 152 | def allocate_data(cls, bs, sl, nh, hs, print_shapes): 153 | nm = nh * hs 154 | hidden_state = torch.randn(bs, sl, nm, device=torch.device('cuda')) 155 | c_attn_w = torch.randn(nm, (nh + 2) * hs, device=torch.device('cuda')) 156 
| i0 = torch.zeros(sl * bs, (nh + 2) * hs, device=torch.device('cuda')) 157 | i1 = torch.zeros(bs, sl * nh, sl, device=torch.device('cuda')) 158 | i2 = torch.zeros(bs, sl * nh, hs, device=torch.device('cuda')) 159 | if print_shapes: 160 | print('hidden_state', hidden_state.shape) 161 | print('c_attn_w', c_attn_w.shape) 162 | print('i0', i0.shape) 163 | print('i1', i1.shape) 164 | print('i2', i2.shape) 165 | return hidden_state, c_attn_w, i0, i1, i2 166 | 167 | @classmethod 168 | def get_qkv1(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 169 | return torch.addmm( 170 | i0, 171 | hidden_state, 172 | c_attn_w, 173 | ).view(bs, sl, -1) 174 | 175 | @classmethod 176 | def get_qkv(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 177 | return torch.matmul( 178 | hidden_state.view(sl * bs, nh * hs), 179 | c_attn_w, 180 | ).view(bs, sl, -1) 181 | 182 | @classmethod 183 | def split_qkv(cls, qkv, bs, sl, nh, hs, print_shapes): 184 | q, k, v = qkv.split((nh*hs, hs, hs), dim=2) 185 | 186 | if print_shapes: 187 | print('q', q.shape) 188 | print('k', k.shape) 189 | print('v', v.shape) 190 | 191 | q = q.view( 192 | bs, sl, nh, hs, 193 | ).reshape( 194 | bs, sl * nh, hs 195 | ) 196 | k = k.permute(0, 2, 1) 197 | v = v 198 | 199 | if print_shapes: 200 | print('q', q.shape) 201 | print('k', k.shape) 202 | print('v', v.shape) 203 | 204 | return q, k, v 205 | 206 | @classmethod 207 | def get_attention_weights(cls, q, k, i1, bs, sl, nh, hs, print_shapes): 208 | return torch.baddbmm(i1, q, k) 209 | 210 | @classmethod 211 | def get_attention_output(cls, attention_weights, v, i2, bs, sl, nh, hs, print_shapes): 212 | attn_output = torch.baddbmm(i2, attention_weights, v) 213 | if print_shapes: 214 | print('attn_output', attn_output.shape) 215 | #attn_output = attn_output.view( 216 | # bs, sl, nh, hs).permute(1, 0, 2, 3).view(sl, bs, nh * hs) 217 | return attn_output 218 | 219 | 220 | def get_key_totals(prof): 221 | names = set(('GET_QKV', 'SPLIT_QKV', 'GET_ATTENTION_WEIGHTS', 'GET_ATTENTION_OUTPUT')) 222 | ka = prof.key_averages() 223 | stats = [[el.key, el.cpu_time_total / el.count, el.cuda_time_total / el.count, el.cpu_time_total / el.count + el.cuda_time_total / el.count] for el in ka if el.key in names] 224 | el_total = ['TOTAL', 0, 0, 0] 225 | for el in stats: 226 | el_total[1] += el[1] 227 | el_total[2] += el[2] 228 | el_total[3] += el[3] 229 | 230 | return [['key', 'cpu us', 'cuda us', 'all us']] + stats + [el_total] 231 | 232 | 233 | def test_attention_total_time(cls, bs, sl, nh, hs, repeat_cnt): 234 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, False) 235 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 236 | for i in range(repeat_cnt): 237 | with torch.autograd.profiler.record_function("GET_QKV"): 238 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, False) 239 | with torch.autograd.profiler.record_function("SPLIT_QKV"): 240 | q, k, v = cls.split_qkv(qkv, bs, sl, nh, hs, False) 241 | with torch.autograd.profiler.record_function("GET_ATTENTION_WEIGHTS"): 242 | attention_weights = cls.get_attention_weights(q, k, i1, bs, sl, nh, hs, False) 243 | with torch.autograd.profiler.record_function("GET_ATTENTION_OUTPUT"): 244 | attention_output = cls.get_attention_output(attention_weights, v, i2, bs, sl, nh, hs, False) 245 | res = get_key_totals(prof) 246 | return res 247 | 248 | def test_attention(cls, bs, sl, nh, hs, repeat_cnt): 249 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, True) 250 | with 
torch.autograd.profiler.profile(use_cuda=True) as prof: 251 | for i in range(repeat_cnt): 252 | with torch.autograd.profiler.record_function("GET_QKV"): 253 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, i == 0) 254 | with torch.autograd.profiler.record_function("SPLIT_QKV"): 255 | q, k, v = cls.split_qkv(qkv, bs, sl, nh, hs, i == 0) 256 | with torch.autograd.profiler.record_function("GET_ATTENTION_WEIGHTS"): 257 | attention_weights = cls.get_attention_weights(q, k, i1, bs, sl, nh, hs, i == 0) 258 | with torch.autograd.profiler.record_function("GET_ATTENTION_OUTPUT"): 259 | attention_output = cls.get_attention_output(attention_weights, v, i2, bs, sl, nh, hs, i == 0) 260 | print(prof.key_averages().table(sort_by="self_cuda_time_total")) 261 | return get_key_totals(prof) 262 | 263 | 264 | def test_qkv(cls, bs, sl, nh, hs, repeat_cnt): 265 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, True) 266 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 267 | for i in range(repeat_cnt): 268 | with torch.autograd.profiler.record_function("GET_QKV"): 269 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, i == 0) 270 | print(prof.key_averages().table(sort_by="self_cuda_time_total")) 271 | return get_key_totals(prof) 272 | 273 | def test_attention_weights(cls, bs, sl, nh, hs, repeat_cnt): 274 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, True) 275 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, True) 276 | q, k, v = cls.split_qkv(qkv, bs, sl, nh, hs, True) 277 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 278 | for i in range(repeat_cnt): 279 | with torch.autograd.profiler.record_function("GET_ATTENTION_WEIGHTS"): 280 | attention_weights = cls.get_attention_weights(q, k, i1, bs, sl, nh, hs, i == 0) 281 | print(prof.key_averages().table(sort_by="self_cuda_time_total")) 282 | return get_key_totals(prof) 283 | 284 | def test_attention_weights_output(cls, bs, sl, nh, hs, repeat_cnt): 285 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, True) 286 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, True) 287 | q, k, v = cls.split_qkv(qkv, bs, sl, nh, hs, True) 288 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 289 | for i in range(repeat_cnt): 290 | with torch.autograd.profiler.record_function("GET_ATTENTION_WEIGHTS"): 291 | attention_weights = cls.get_attention_weights(q, k, i1, bs, sl, nh, hs, i == 0) 292 | with torch.autograd.profiler.record_function("GET_ATTENTION_OUTPUT"): 293 | attention_output = cls.get_attention_output(attention_weights, v, i2, bs, sl, nh, hs, i == 0) 294 | print(prof.key_averages().table(sort_by="self_cuda_time_total")) 295 | return get_key_totals(prof) 296 | -------------------------------------------------------------------------------- /multi_query_experiments/profile_attention_types.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import attention_types_imp as imp 3 | from tqdm.auto import tqdm 4 | import json 5 | import math 6 | 7 | def profile_attention_type(cls): 8 | repeat_cnt=500 9 | 10 | print(f'----------------------{cls}-------------------') 11 | 12 | res = [] 13 | for bs in tqdm(range(8, 17, 8)): 14 | sl_times = [] 15 | for sl in tqdm(range(64, 2000, 128)): 16 | rp = max(1, int(repeat_cnt * math.pow(64, 1/3.0) / math.pow(sl, 1/3.0))) 17 | totals_mh = imp.test_attention_total_time(cls, bs=bs, sl=sl, nh=16, hs=64, repeat_cnt=rp) 18 | 
sl_times.append((sl, totals_mh)) 19 | res.append((bs, sl_times)) 20 | 21 | return res 22 | 23 | 24 | # warmup 25 | imp.test_attention_total_time(imp.MultiHead, bs=24, sl=8, nh=16, hs=64, repeat_cnt=100) 26 | 27 | if True: 28 | res = { 29 | 'MultiHead': profile_attention_type(imp.MultiHead), 30 | 'MultiQuery': profile_attention_type(imp.MultiQuery), 31 | 'MultiQuery1': profile_attention_type(imp.MultiQuery1), 32 | } 33 | 34 | with open('profile_attention_types1.json', 'w') as f: 35 | json.dump(res, f) 36 | 37 | -------------------------------------------------------------------------------- /multi_query_experiments/profile_hf_generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | # we cache pretrained models in a user-writable dir (friendlier to SageMaker environments) 3 | os.environ['TRANSFORMERS_CACHE'] = os.environ['PWD'] + '/hf_transformers_cache' 4 | 5 | import torch 6 | import time 7 | import transformers 8 | 9 | from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config 10 | from transformers.models.gpt2.modeling_gpt2 import AttentionType 11 | 12 | def env(evar:str): 13 | return os.environ[evar] 14 | 15 | def dev(): 16 | if torch.cuda.is_available(): 17 | return torch.device("cuda") 18 | else: 19 | return torch.device("cpu") 20 | 21 | 22 | print(transformers.__file__) 23 | print(f'CUDA device : { torch.cuda.get_device_name(0) if torch.cuda.is_available() else None }') 24 | print(f'PWD : {env("PWD")}') 25 | print(f'transformers_cache : {env("TRANSFORMERS_CACHE")}') 26 | 27 | def get_test_batch(vocab_size, size, length, dtype=torch.int64, device=None): 28 | #TODO: eliminate special tokens, for now assumes the last one is the only special token 29 | return { 30 | 'input_ids': torch.randint(0, vocab_size-1, (size, length), dtype=dtype, device=device), 31 | 'attention_mask': torch.ones((size, length), dtype=dtype, device=device) 32 | } 33 | 34 | def generate_text_batch(model, inputs, max_length, num_beams=1, do_sample=False, pad_token_id=50256): 35 | return model.generate( 36 | **inputs, max_length=max_length, num_beams=num_beams, do_sample=do_sample, pad_token_id=pad_token_id 37 | ) 38 | 39 | def decode_batch(tokenizer, outputs): 40 | # outputs = outputs.numpy().tolist() 41 | outputs = outputs.tolist() 42 | return [ 43 | tokenizer.decode(output) 44 | for output in outputs 45 | ] 46 | 47 | def time_generate( 48 | vocab_size, model, input_batch_size, input_batch_length, max_gen_length, 49 | num_beams=1, do_sample=False, pad_token_id=50256, dtype=torch.int64, device=None, tokenizer=None 50 | ): 51 | stats = {} 52 | 53 | t1 = time.time() 54 | inputs = get_test_batch(vocab_size, input_batch_size, input_batch_length, dtype, device) 55 | stats['get_test_batch'] = time.time() - t1 56 | 57 | t1 = time.time() 58 | outputs = generate_text_batch( 59 | model, inputs, max_gen_length, num_beams=num_beams, do_sample=do_sample, pad_token_id=pad_token_id 60 | ) 61 | stats['generate_text_batch'] = time.time() - t1 62 | 63 | if do_sample: 64 | t1 = time.time() 65 | decs = decode_batch(tokenizer, outputs) 66 | dt = time.time() - t1 67 | stats['decode_batch'] = dt 68 | 69 | stats['input_batch_size'] = input_batch_size 70 | stats['input_batch_length'] = input_batch_length 71 | stats['max_gen_length'] = max_gen_length 72 | stats['num_beams'] = num_beams 73 | stats['do_sample'] = do_sample 74 | stats['pad_token_id'] = pad_token_id 75 | stats['dtype'] = dtype 76 | 77 | return inputs, outputs, stats 78 | 79 | def profile(attention_type): 80 | 
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=os.environ['TRANSFORMERS_CACHE']) 81 | 82 | config = GPT2Config( 83 | vocab_size=len(tokenizer), 84 | n_layer=24, 85 | n_embd=1024, 86 | n_head=16, 87 | n_positions=2048, 88 | #n_ctx=tokenizer.model_max_length, 89 | bos_token_id=tokenizer.bos_token_id, 90 | eos_token_id=tokenizer.eos_token_id, 91 | attention_type=attention_type, 92 | print_details=False 93 | ) 94 | model = GPT2LMHeadModel(config).to(dev()) 95 | 96 | inputs = get_test_batch(tokenizer.vocab_size, 1, 4, device=dev()) 97 | 98 | print(f'-------------------- attention_type == {attention_type} ---------------------') 99 | 100 | inputs, outputs, stats = time_generate(tokenizer.vocab_size, model, 8, 16, 1024, device=dev(), tokenizer=tokenizer, do_sample=True) 101 | print(stats) 102 | 103 | 104 | t0 = time.time() 105 | # # warm up 106 | # profile(AttentionType.MULTI_QUERY) 107 | 108 | profile(AttentionType.MULTI_QUERY) 109 | profile(AttentionType.MULTI_QUERY_1) 110 | profile(AttentionType.MULTI_HEAD) 111 | dt = time.time() - t0 112 | print(f'Total elapsed time : {dt} [s]') 113 | -------------------------------------------------------------------------------- /multi_query_experiments/profile_mqa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 14, 6 | "id": "f99e438c-1802-4573-844c-d91e4951ec19", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Python 3.9.13\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "!python --version" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "21f81c44-13cb-4bb2-8dd5-1b1e783ca3d4", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "!pip install -r requirements.txt" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "id": "4970d655-1539-4e3a-8fef-812de8af5173", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/transformers/__init__.py\n", 42 | "CUDA device Tesla T4\n", 43 | "PWD : /home/studio-lab-user/bigcode-analysis\n", 44 | "transformers_cache : /home/studio-lab-user/bigcode-analysis/hf_transformers_cache\n", 45 | "-------------------- attention_type == AttentionType.MULTI_QUERY ---------------------\n", 46 | "{'get_test_batch': 0.00037789344787597656, 'generate_text_batch': 25.9916410446167, 'decode_batch': 0.031884193420410156, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': True, 'pad_token_id': 50256, 'dtype': torch.int64}\n", 47 | "-------------------- attention_type == AttentionType.MULTI_QUERY_1 ---------------------\n", 48 | "{'get_test_batch': 0.0003807544708251953, 'generate_text_batch': 18.601619243621826, 'decode_batch': 0.021413087844848633, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': True, 'pad_token_id': 50256, 'dtype': torch.int64}\n", 49 | "-------------------- attention_type == AttentionType.MULTI_HEAD ---------------------\n", 50 | "{'get_test_batch': 0.0004012584686279297, 'generate_text_batch': 28.731690883636475, 'decode_batch': 0.021346569061279297, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': True, 'pad_token_id': 50256, 'dtype': torch.int64}\n", 51 | 
"Total elapsed time : 108.40390658378601 [s]\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "import multi_query_experiments.profile_hf_generate" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "f66365b2-f1dd-41b0-b67e-23f935390d61", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "default:Python", 71 | "language": "python", 72 | "name": "conda-env-default-py" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.9.13" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 5 89 | } 90 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.3.2 2 | torch==1.13.1 3 | transformers @ git+https://github.com/bigcode-project/transformers.git@multi_query 4 | umap-learn==0.5.3 5 | plotly==4.14.3 6 | -------------------------------------------------------------------------------- /tokenization/README.md: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------