├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── community_research └── mozfest.pdf ├── data_analysis ├── README.md ├── comment-to-code-ratio │ ├── README.md │ ├── analysis_comments_ratio.ipynb │ └── text_extraction.py ├── decontamination │ ├── README.md │ ├── find_substrings.py │ ├── minhash.py │ ├── requirements.txt │ └── requirements_minhash.txt ├── github_issues_analysis │ ├── analysis.ipynb │ └── utils.py ├── kenlm │ ├── kenlm.ipynb │ ├── kenlm_analysis.ipynb │ └── setup.sh ├── mathjax │ └── mathjax.ipynb ├── near-deduplication │ ├── .gitignore │ ├── README.md │ ├── minhash_deduplication.py │ ├── minhash_deduplication_alt.py │ ├── minhash_deduplication_debug.py │ ├── near_deduplicate.py │ ├── requirements.txt │ └── requirements_alt.txt ├── notebooks │ ├── ScalingLaws.ipynb │ ├── ScalingLawsHE.ipynb │ ├── bigcode_pls.csv │ ├── chinchilla_analysis.ipynb │ ├── code_compilation.ipynb │ ├── data_filters.ipynb │ ├── embedding_clustering.ipynb │ ├── file_size_analysis.ipynb │ ├── loss_analysis.ipynb │ ├── new_extension_distribution.csv │ ├── stats.csv │ ├── unimax.ipynb │ └── utils.py ├── python_data_analysis │ ├── code_compilation │ │ ├── README.md │ │ ├── compile_py_files.py │ │ └── requirements.txt │ ├── config_test_estimation │ │ ├── README.md │ │ └── config_test.py │ └── nl_language_identification │ │ ├── README.md │ │ ├── analysis.ipynb │ │ ├── fasttext_model │ │ └── README.md │ │ ├── language_identifier.py │ │ ├── requirements.txt │ │ └── text_extraction.py └── stars_filtering │ ├── README.md │ └── stars_analysis.ipynb ├── evaluation └── README.md ├── multi_query_experiments ├── README.md ├── attention_types_imp.py ├── profile_attention_types.json ├── profile_attention_types.py ├── profile_attention_types_visualise.ipynb ├── profile_hf_generate.py └── profile_mqa.ipynb ├── requirements.txt └── tokenization └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | .vscode/ 163 | .trunk 164 | .DS_Store -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute to BigCode? 
2 | 3 | Everyone is welcome to contribute, and we value everybody's contribution. Code 4 | is thus not the only way to help the community. Answering questions, helping 5 | others, reaching out and improving the documentations are immensely valuable to 6 | the community. 7 | 8 | Whichever way you choose to contribute, please be mindful to respect our 9 | [code of conduct](https://bigcode-project.org/docs/about/code_of_conduct/). 10 | 11 | ## You can contribute in so many ways! 12 | 13 | There are 4 ways you can contribute to this repository: 14 | * Fixing outstanding issues with the existing code; 15 | * Implementing new models; 16 | * Contributing to the examples or to the documentation; 17 | * Submitting issues related to bugs or desired new features. 18 | 19 | *All are equally valuable to the community.* 20 | 21 | ## License 22 | 23 | Note that all contributions are licensed under Apache 2.0 by default. The 24 | Technical Steering Committee (TSC) may approve the use of an alternative 25 | license or licenses for inbound or outbound contributions on an exception basis. 26 | To request an exception, please describe the contribution, the alternative 27 | license, and the justification for using an alternative license for the 28 | described contribution. License exceptions must be approved by the TSC. 29 | Contributed files should contain license information indicating the open 30 | source license or licenses pertaining to the file. 31 | 32 | ## Submitting a new issue or feature request 33 | 34 | Do your best to follow these guidelines when submitting an issue or a feature 35 | request. It will make it easier for us to come back to you quickly and with good 36 | feedback. 37 | 38 | ### Did you find a bug? 39 | 40 | First, we would really appreciate it if you could **make sure the bug was not 41 | already reported** (use the search bar on Github under Issues). 42 | 43 | Did not find it? :( So we can act quickly on it, please follow these steps: 44 | 45 | * Include your **OS type and version**, the versions of **Python**, **PyTorch** and 46 | **Tensorflow** when applicable; 47 | * A short, self-contained, code snippet that allows us to reproduce the bug in 48 | less than 30s; 49 | * Provide the *full* traceback if an exception is raised. 50 | 51 | ### Do you want a new feature? 52 | 53 | A world-class feature request addresses the following points: 54 | 55 | 1. Motivation first: 56 | * Is it related to a problem/frustration with the current features? If so, please explain 57 | why. Providing a code snippet that demonstrates the problem is best. 58 | * Is it related to something you would need for a project? We'd love to hear 59 | about it! 60 | * Is it something you worked on and think could benefit the community? 61 | Awesome! Tell us what problem it solved for you. 62 | 2. Write a *full paragraph* describing the feature; 63 | 3. Provide a **code snippet** that demonstrates its future use; 64 | 4. In case this is related to a paper, please attach a link; 65 | 5. Attach any additional information (drawings, screenshots, etc.) you think may help. 66 | 67 | If your issue is well written we're already 80% of the way there by the time you 68 | post it. 69 | 70 | ## Start contributing! (Pull Requests) 71 | 72 | Before writing code, we strongly advise you to search through the existing PRs or 73 | issues to make sure that nobody is already working on the same thing. If you are 74 | unsure, it is always a good idea to open an issue to get some feedback. 
75 | 76 | You will need basic `git` proficiency to be able to contribute to 77 | BigCode. `git` is not the easiest tool to use but it has the greatest 78 | manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro 79 | Git](https://git-scm.com/book/en/v2) is a very good reference. 80 | 81 | Follow these steps to start contributing: 82 | 83 | 1. Fork the repository by 84 | clicking on the 'Fork' button on the repository's page. This creates a copy of the code 85 | under your GitHub user account. 86 | 87 | 2. Clone your fork to your local disk, and add the base repository as a remote: 88 | 89 | ```bash 90 | $ git clone git@github.com:/.git 91 | $ cd 92 | $ git remote add upstream https://github.com/bigcode-project/.git 93 | ``` 94 | 95 | 3. Create a new branch to hold your development changes: 96 | 97 | ```bash 98 | $ git checkout -b a-descriptive-name-for-my-changes 99 | ``` 100 | 101 | **Do not** work on the `main` branch. 102 | 103 | 4. Set up a development environment by running the following command in a virtual environment: 104 | 105 | ```bash 106 | $ pip install -r requirements.txt 107 | ``` 108 | 109 | 5. Develop the features on your branch. 110 | 111 | Once you're happy with your changes, add changed files using `git add` and 112 | make a commit with `git commit` to record your changes locally: 113 | 114 | ```bash 115 | $ git add modified_file.py 116 | $ git commit 117 | ``` 118 | 119 | Please write [good commit 120 | messages](https://chris.beams.io/posts/git-commit/). 121 | 122 | It is a good idea to sync your copy of the code with the original 123 | repository regularly. This way you can quickly account for changes: 124 | 125 | ```bash 126 | $ git fetch upstream 127 | $ git rebase upstream/main 128 | ``` 129 | 130 | Push the changes to your account using: 131 | 132 | ```bash 133 | $ git push -u origin a-descriptive-name-for-my-changes 134 | ``` 135 | 136 | 6. Once you are satisfied (**and the checklist below is happy too**), go to the 137 | webpage of your fork on GitHub. Click on 'Pull request' to send your changes 138 | to the project maintainers for review. 139 | 140 | 7. It's ok if maintainers ask you for changes. It happens to core contributors 141 | too! So everyone can see the changes in the Pull request, work in your local 142 | branch and push the changes to your fork. They will automatically appear in 143 | the pull request. 144 | 145 | 146 | ### Checklist 147 | 148 | 1. The title of your pull request should be a summary of its contribution; 149 | 2. If your pull request addresses an issue, please mention the issue number in 150 | the pull request description to make sure they are linked (and people 151 | consulting the issue know you are working on it); 152 | 3. To indicate a work in progress please prefix the title with `[WIP]`. These 153 | are useful to avoid duplicated work, and to differentiate it from PRs ready 154 | to be merged; 155 | 4. Make sure existing tests pass; 156 | 5. All public methods must have informative docstrings. 157 | 158 | ### Style guide 159 | 160 | For documentation strings, BigCode follows the [google style](https://google.github.io/styleguide/pyguide.html). 
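For illustration, here is a minimal sketch of what a Google-style docstring looks like (the function below is hypothetical and not part of this repository):

```python
def count_tokens(text: str, lowercase: bool = True) -> int:
    """Count whitespace-separated tokens in a string.

    Args:
        text: The input text to tokenize.
        lowercase: Whether to lowercase the text before counting.

    Returns:
        The number of whitespace-separated tokens in ``text``.

    Raises:
        TypeError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise TypeError("text must be a string")
    if lowercase:
        text = text.lower()
    return len(text.split())
```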
161 | 162 | **This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).** 163 | 164 | ### Develop on Windows 165 | 166 | On windows, you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings: 167 | 168 | `git config core.autocrlf input` 169 | 170 | One way one can run the make command on Window is to pass by MSYS2: 171 | 172 | 1. [Download MSYS2](https://www.msys2.org/), we assume to have it installed in C:\msys64 173 | 2. Open the command line C:\msys64\msys2.exe (it should be available from the start menu) 174 | 3. Run in the shell: `pacman -Syu` and install make with `pacman -S make` 175 | 4. Add `C:\msys64\usr\bin` to your PATH environment variable. 176 | 177 | You can now use `make` from any terminal (Powershell, cmd.exe, etc) 🎉 178 | 179 | ### Syncing forked main with upstream `main` 180 | 181 | To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs, 182 | when syncing the main branch of a forked repository, please, follow these steps: 183 | 1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked main. 184 | 2. If a PR is absolutely necessary, use the following steps after checking out your branch: 185 | ``` 186 | $ git checkout -b your-branch-for-syncing 187 | $ git pull --squash --no-commit upstream main 188 | $ git commit -m '' 189 | $ git push --set-upstream origin your-branch-for-syncing 190 | ``` 191 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # BigCode Analysis 3 | This repository is for the analysis done in BigCode Project. You can find analysis of datasets, models, architecture choices and more. 
4 | 5 | ## Contents 6 | * **Data analysis**: In the folder `data_analysis`, we provide code for data analysis: 7 | * Near deduplication 8 | * Python data analysis: 9 | * Natural language distribution in comments/docstrings 10 | * Data decontamination for HumanEval and MBPP benchmarks 11 | * Percentage of files that can be successfully compiled 12 | * Percentage of configuration and test files 13 | * Exploration of unimax sampling on The Stack 14 | We also provide notebooks with early data and model loss analysis. 15 | 16 | * **Multi-Query Attention experiments**, for details please refer to [multi_query_experiments/README.md](/multi_query_experiments/README.md) 17 | -------------------------------------------------------------------------------- /community_research/mozfest.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigcode-project/bigcode-analysis/e0b88d6cefa14e3b0d3fc5e3d6667e1fa1eb30ee/community_research/mozfest.pdf -------------------------------------------------------------------------------- /data_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Data analysis 2 | 3 | In this folder we provide code for analysis of code datasets: 4 | * Near deduplication using MinHash and LSH 5 | 6 | * Data decontamination from HumanEval and MBPP evaluation benchmarks 7 | 8 | * Python data analysis: 9 | * Natural language distribution in comments/docstrings 10 | * Detection of configuration and test files (also valid for languages other than Python) 11 | * Estimation of the number of files that can be successfully compiled 12 | 13 | * Comment to code ratio: analysis notebook for filtering based on the ratio of comments in a file. Filtering code available at [bigcode-dataset/preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing) 14 | 15 | * Stars filtering: analysis notebook for filtering based on the number of stars of files. Filtering code available at [bigcode-dataset/preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing) 16 | 17 | * PII Redaction: moved to [bigcode-dataset/pii](https://github.com/bigcode-project/bigcode-dataset/tree/main/pii) 18 | * PII detection of emails, IP addresses and secret keys 19 | * PII anonymization 20 | * Pipeline evaluation on an annotated benchmark 21 | 22 | * Preprocessing: moved to [bigcode-dataset/preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing) 23 | * Code for data filtering based on line length and percentage of alphanumeric characters, comment to code ratio and stars. 24 | 25 | -------------------------------------------------------------------------------- /data_analysis/comment-to-code-ratio/README.md: -------------------------------------------------------------------------------- 1 | # Filtering based on comment to code ratio 2 | 3 | Here we are interested in filtering files based on their comment to code ratio. We can expect files with a higher number of comments and docstrings to be of better quality. On the other hand, files where the majority of lines are comments may not be as useful for a code generation model. We filter with a minimum and maximum comment to code ratio, which is computed in the following way: 4 | * For Python, we extract comments using the Python tokenizer and docstrings using `ast` parsing. 5 | * For other languages (Java and JavaScript), we extract comments using the `pygments` library.
6 | * We compute the comment to code ratio of a file by counting the number of characters in comments over the total number of characters in the file. 7 | 8 | You can find clean filtering code in the `bigcode-dataset` repository under [preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing). 9 | * `analysis_comments_ratio.ipynb` contains the code for the analysis of the comment to code ratio filter, used to come up with minimum and maximum thresholds (0.01 and 0.8) for the Python, Java and JavaScript subsets of [The Stack](https://huggingface.co/datasets/bigcode/the-stack). -------------------------------------------------------------------------------- /data_analysis/comment-to-code-ratio/text_extraction.py: -------------------------------------------------------------------------------- 1 | """Extract Python comments (using the Python tokenizer) and docstrings (using AST parsing).""" 2 | 3 | import io 4 | from itertools import groupby 5 | from os.path import basename, splitext 6 | import ast 7 | import tokenize 8 | import warnings 9 | 10 | StringIO = io.StringIO 11 | 12 | NODE_TYPES = { 13 | ast.ClassDef: 'Class', 14 | ast.FunctionDef: 'Function/Method', 15 | ast.Module: 'Module' 16 | } 17 | 18 | # comment extraction 19 | def get_comments(s, clean=False): 20 | "Returns a string including all comments" 21 | comments = [] 22 | g = tokenize.generate_tokens(StringIO(s).readline) 23 | for toknum, tokval, _, _, _ in g: 24 | # print(toknum,tokval) 25 | if toknum == tokenize.COMMENT: 26 | comments.append((toknum, tokval)) 27 | result = tokenize.untokenize(comments) 28 | if clean: 29 | result = result.replace('#', '') 30 | return result 31 | 32 | # TODO: make sure extraction works well (e.g. with decorators over classes) 33 | # ast parsing, source: https://gist.github.com/SpotlightKid/1548cb6c97f2a844f72d 34 | def parse_docstrings(source): 35 | """Parse Python source code and yield a tuple of ast node instance, name, 36 | and docstring for each function/method, class and module.""" 37 | tree = ast.parse(source) 38 | 39 | for node in ast.walk(tree): 40 | if isinstance(node, tuple(NODE_TYPES)): 41 | docstring = ast.get_docstring(node) 42 | 43 | yield (node, getattr(node, 'name', None), docstring) 44 | 45 | def get_docstrings(source, module=''): 46 | """Parse Python source code from a file or string and return its docstrings.""" 47 | if hasattr(source, 'read'): 48 | filename = getattr(source, 'name', module) 49 | module = splitext(basename(filename))[0] 50 | source = source.read() 51 | 52 | docstrings = sorted(parse_docstrings(source), 53 | key=lambda x: (NODE_TYPES.get(type(x[0])), x[1])) 54 | 55 | grouped = groupby(docstrings, key=lambda x: NODE_TYPES.get(type(x[0]))) 56 | results = [] 57 | for _, group in grouped: 58 | for _, name, docstring in group: 59 | name = name if name else module 60 | #print(docstring or '') 61 | if docstring: 62 | results.append(docstring) 63 | return results 64 | 65 | def get_text(source, comments=True, clean_comments=True): 66 | """Extract all natural text in source: comments + docstrings. 67 | The extraction fails in case of syntax errors in the file. 68 | Args: 69 | source: the code to parse 70 | comments: if True, extract comments too 71 | clean_comments: if True, remove # from extracted comments 72 | Returns: 73 | a string with concatenated docstrings and comments""" 74 | 75 | try: 76 | docstrings = '\n'.join(get_docstrings(source)) 77 | except : 78 | docstrings = '' 79 | warnings.warn("code couldn't be parsed due to compilation failure, no docstring is extracted") 80 |
81 | if comments: 82 | try: 83 | comments = get_comments(source, clean=clean_comments) 84 | except : 85 | comments = '' 86 | warnings.warn("tokenization error, no comment is extracted") 87 | else: 88 | comments = '' 89 | 90 | output = docstrings + "\n\n" + comments 91 | return output.strip() -------------------------------------------------------------------------------- /data_analysis/decontamination/README.md: -------------------------------------------------------------------------------- 1 | # Decontamination 2 | 3 | This directory contains several scripts for decontamination of the data. 4 | 1. Exact prompt matching `find_substrings.py` 5 | 2. Near matching `minhash.py` 6 | 7 | ## Near Matching with MinHash and LSH 8 | 9 | This is similar to the near deduplication script `data_analysis/near-deduplication/minhash_deduplication_alt.py` with one modification: we use the benchmark datasets as the index source instead of the dataset itself. 10 | 11 | ### Usage: 12 | 1. Update the script to include any benchmark you want to check against in `DATASETS_TO_CHECK`. Be sure to create a global variable for the index using the same name in that config. Benchmark columns should be of type string or sequence of strings, so that they can be concatenated. 13 | 2. Then you can run the script: 14 | ```bash 15 | pip install -r requirements_minhash.txt 16 | # Quick example 17 | python minhash.py \ 18 | --dataset codeparrot/codeparrot-clean-valid \ 19 | --split train \ 20 | --column content \ 21 | --cache-dir .cache \ 22 | --verbose 23 | # Check parameters with the help message 24 | python minhash.py --help 25 | ``` -------------------------------------------------------------------------------- /data_analysis/decontamination/find_substrings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes a directory containing jsonl files as input. 3 | Filters out all samples that contain certain substrings.
4 | """ 5 | import sys 6 | import os 7 | import json 8 | import glob 9 | from tqdm import tqdm 10 | from multiprocessing import Pool 11 | 12 | from datasets import load_dataset 13 | 14 | 15 | # ========= data to filter out of the dataset ============ 16 | MBPP_PATH = "/data/mbpp/mbpp.jsonl" 17 | TEST_IDS = list(range(11, 511)) 18 | 19 | def mbpp_docstrings(): 20 | data = [] 21 | with open(MBPP_PATH) as f: 22 | for line in f: 23 | data.append(json.loads(line)) 24 | 25 | data = [sample for sample in data if sample["task_id"] in TEST_IDS] 26 | 27 | assert len(data) == 500 28 | 29 | # Checksum / version issues here 30 | # dataset = load_dataset("mbpp", split="test") 31 | 32 | return [sample["text"] for sample in data] 33 | 34 | 35 | def extract_docstring(prompt: str) -> str: 36 | if '"""' in prompt: 37 | if prompt.count('"""') == 2: 38 | return prompt.split('"""')[1].strip() 39 | elif prompt.count('"""') == 4: 40 | return prompt.split('"""')[3].strip() 41 | else: 42 | raise ValueError() 43 | elif '\'\'\'' in prompt: 44 | assert prompt.count('\'\'\'') == 2 45 | return prompt.split('\'\'\'')[1].strip() 46 | else: 47 | raise ValueError() 48 | 49 | 50 | def human_eval_docstrings(): 51 | ds = load_dataset("openai_humaneval", split="test") 52 | docstrings = [extract_docstring(v['prompt']) for v in ds] 53 | return docstrings 54 | 55 | FILTER_OUT = { 56 | "mbpp": mbpp_docstrings(), 57 | "human_eval": human_eval_docstrings() 58 | } 59 | # ============================================================ 60 | 61 | def add_dict(dict1: dict, dict2: dict) -> None: 62 | """ 63 | Add the values of dict2 to dict1. All values must be int, float or dictionaries that also verify this condition. 64 | Will modify dict1 and return None 65 | """ 66 | for key, value in dict2.items(): 67 | if isinstance(value, (int, float)): 68 | if key not in dict1: 69 | dict1[key] = 0 70 | dict1[key] += value 71 | elif isinstance(value, dict): 72 | if key not in dict1: 73 | dict1[key] = {} 74 | assert isinstance(dict1[key], dict) 75 | add_dict(dict1[key], value) 76 | else: 77 | raise ValueError(f"Invalid type for key/value {key}: {value}") 78 | 79 | def filter_file(data): 80 | """ 81 | Return True, None if the file should be included in the dataset. 
82 | Otherwise return False and some metadata about the file excluded 83 | """ 84 | content = data['content'].lower() 85 | # For each substring, try to find it in the file (case insensitive) 86 | for benchmark, substrings in FILTER_OUT.items(): 87 | for substring in substrings: 88 | if substring.lower() in content: 89 | return False, f"{benchmark}_match" 90 | 91 | # Return True, None if none of the substrings was found 92 | return True, None 93 | 94 | 95 | def _update_meta_dict(meta_dict, filter_reason): 96 | if filter_reason not in meta_dict: 97 | meta_dict[filter_reason] = 0 98 | meta_dict[filter_reason] += 1 99 | 100 | 101 | def filter_jsonl_file(args): 102 | """ 103 | Filter a given file and write the output to the disk 104 | """ 105 | 106 | file_name, write_to = args 107 | meta = f"{write_to}_meta" 108 | meta_dict = {} 109 | with open(file_name, "r") as f: 110 | with open(write_to, "w") as out: 111 | with open(meta, "w") as meta_file: 112 | for i, line in tqdm(enumerate(f)): 113 | data = json.loads(line) 114 | # Write line to output-file if filter has passed 115 | to_include, filter_reason = filter_file(data) 116 | if to_include: 117 | out.write(line) 118 | else: 119 | _update_meta_dict(meta_dict, filter_reason) 120 | # Dump meta dict 121 | meta_file.write(json.dumps(meta_dict)) 122 | meta_file.write("\n") 123 | 124 | 125 | def main(): 126 | num_processes = 64 127 | # The input directory containing the jsonl files 128 | input_dir = sys.argv[1] 129 | # Where to write worker files and output file 130 | output_dir = sys.argv[2] 131 | 132 | assert os.path.isdir(input_dir) 133 | 134 | tmp_files_dir = os.path.join(output_dir, "tmp") 135 | output_file = os.path.join(output_dir, "data.jsonl") 136 | os.makedirs(tmp_files_dir, exist_ok=True) 137 | 138 | # Process all the files in the input directory 139 | # Get the arguments for each worker 140 | files = glob.glob(f"{input_dir}/data_*.jsonl") 141 | filter_args = [(file, f"{tmp_files_dir}/{os.path.basename(file)}") for file in files] 142 | output_files = [arg[1] for arg in filter_args] 143 | 144 | # Process the files in parallel 145 | with Pool(num_processes) as p: 146 | for i, res in enumerate(p.imap(filter_jsonl_file, filter_args)): 147 | print(i, res) 148 | 149 | # Concatenate the outputs of all the workers into one big file 150 | with open(output_file, "w") as outfile: 151 | for fname in output_files: 152 | with open(fname) as f: 153 | for line in f: 154 | outfile.write(line) 155 | 156 | # compile meta 157 | meta = {} 158 | for fname in output_files: 159 | fmeta = json.load(open(f"{fname}_meta")) 160 | add_dict(meta, fmeta) 161 | with open(f"{output_file}_meta", "w") as outfile: 162 | json.dump(meta, outfile) 163 | 164 | 165 | if __name__ == "__main__": 166 | main() 167 | 168 | -------------------------------------------------------------------------------- /data_analysis/decontamination/minhash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author : Chenghao Mou (mouchenghao@gmail.com) 4 | # created : 10/21/2022 5 | from __future__ import annotations 6 | 7 | import glob 8 | import logging 9 | import multiprocessing 10 | import os 11 | import random 12 | import re 13 | import time 14 | from pathlib import Path 15 | from typing import Any, Dict, Set 16 | 17 | import pandas as pd 18 | 19 | multiprocessing.set_start_method("fork", force=True) 20 | 21 | import numpy as np 22 | import typer 23 | from datasets import Dataset, Features, Sequence, 
Value, concatenate_datasets, load_dataset, load_from_disk 24 | from datasketch import LeanMinHash, MinHash, MinHashLSH 25 | from rich.console import Console 26 | from rich.logging import RichHandler 27 | from tqdm import tqdm 28 | 29 | random.seed(42) 30 | MINHASH_SEED = 42 31 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 32 | console = Console() 33 | logger = logging.getLogger(__name__) 34 | logger.setLevel(logging.INFO) 35 | logger.addHandler(RichHandler(rich_tracebacks=True)) 36 | logger.propagate = False 37 | 38 | 39 | human_eval_lsh: MinHashLSH | None = None 40 | mbpp_lsh: MinHashLSH | None = None 41 | 42 | dup_ids: Set[int] = set() 43 | 44 | DATASETS_TO_CHECK = [ 45 | { 46 | "name": "openai_humaneval", 47 | "splits": ["test"], 48 | "columns": ["prompt", "canonical_solution", "test"], 49 | "codename": "human_eval", 50 | "index": "human_eval_lsh", # The same name as the global variable 51 | }, 52 | { 53 | "name": "mbpp", 54 | "splits": ["train", "validation", "test"], 55 | "columns": ["text", "code", "test_list"], 56 | "codename": "mbpp", 57 | "index": "mbpp_lsh", # The same name as the global variable 58 | }, 59 | ] 60 | 61 | 62 | def load_dataset_with_config(conf: Dict[str, Any]) -> Dataset: 63 | """ 64 | Load a dataset based on the configuration. Be careful about changing this function, 65 | as it is used for caching the intermediate results. 66 | 67 | Parameters 68 | ---------- 69 | conf : Dict[str, Any] 70 | The configuration. Mainly, there are three ways to load a dataset: 71 | 1. Directly from th ehub 72 | 2. From a local git repository 73 | 3. From a local dataset directory that was saved by `save_to_disk` before 74 | 75 | Returns 76 | ------- 77 | Dataset 78 | The loaded dataset. 79 | """ 80 | 81 | # Load from hub 82 | if not conf["lfs"]: 83 | ds = load_dataset( 84 | conf["dataset"], 85 | conf["config"], 86 | data_dir=conf["data_dir"], 87 | split=conf["split"], 88 | use_auth_token=True, 89 | cache_dir=conf["cache_dir"], 90 | ) 91 | # Or load from git lfs files 92 | elif not os.path.exists(conf["concat_output"]): 93 | datasets = [] 94 | # In practice, it might stuck here, you can hit Ctrl+C and run it again. 95 | for file in tqdm(sorted(glob.glob(conf["data_dir"] + "/*.jsonl")), desc="Loading datasets..."): 96 | datasets.append(load_dataset("json", data_files=file, split=conf["split"], cache_dir=conf["cache_dir"])) 97 | ds = concatenate_datasets(datasets) 98 | ds.save_to_disk(conf["concat_output"]) 99 | ds = load_from_disk(conf["concat_output"]) 100 | # Or load from the concatenated dataset 101 | else: 102 | ds = load_from_disk(conf["concat_output"]) 103 | 104 | ds = ds.map( 105 | lambda _, idx: {"__id__": idx}, 106 | with_indices=True, 107 | num_proc=os.cpu_count(), 108 | desc="Adding index...", 109 | ) 110 | 111 | return ds 112 | 113 | 114 | def embed_func(idx: int, content: str, *, num_perm: int) -> Dict[str, Any]: 115 | """ 116 | Embed the content of a record into a MinHash object. This function should be 117 | used with multiprocessing and it scales well with the number of cores. 118 | 119 | Parameters 120 | ---------- 121 | idx : int 122 | The index of the record. 123 | content : str 124 | The content to embed. 125 | num_perm : int 126 | The number of permutations to use in the MinHash object. 127 | seed : int 128 | The seed to use in the MinHash object. 129 | 130 | Returns 131 | ------- 132 | Dict[str, Any] 133 | The MinHash signature and the index of the record. 
134 | 135 | Examples 136 | -------- 137 | >>> result = embed_func(0, "Hello world!", num_perm=128) 138 | >>> result["__id__"] 139 | 0 140 | >>> result["__signature__"].shape 141 | (128,) 142 | >>> result["__signature__"].dtype 143 | dtype('uint64') 144 | """ 145 | m = MinHash(num_perm=num_perm, seed=MINHASH_SEED) 146 | m.update_batch([token.encode("utf-8") for token in {t for t in NON_ALPHA.split(content) if t}]) 147 | return {"__signature__": m.hashvalues, "__id__": idx} 148 | 149 | 150 | def query_func(idx: int, signature: np.ndarray, *, index: MinHashLSH) -> Dict[str, Any]: 151 | """ 152 | Query the MinHashLSH index for the record. This function can be used with multiprocessing 153 | as long as the index is shared across processes. 154 | 155 | Parameters 156 | ---------- 157 | index : MinHashLSH 158 | The MinHashLSH index. It is shared across all processes when using multiprocessing with fork without copy. 159 | record : Dict[str, Any] 160 | The record to query. 161 | 162 | Returns 163 | ------- 164 | Dict[str, Any] 165 | The query result. 166 | """ 167 | return { 168 | "__neighbors__": [ 169 | str(dup_idx) 170 | for dup_idx in index.query( 171 | LeanMinHash(seed=MINHASH_SEED, hashvalues=signature), 172 | ) 173 | ], 174 | "__id__": idx, 175 | } 176 | 177 | 178 | def jaccard_similarity(code1: str, code2: str) -> float: 179 | """ 180 | Calculate the jaccard similarity between two code snippets. 181 | 182 | Parameters 183 | ---------- 184 | code1 : str 185 | The first code snippet. 186 | code2 : str 187 | The second code snippet. 188 | 189 | Returns 190 | ------- 191 | float 192 | The jaccard similarity between the two code snippets. 193 | 194 | Examples 195 | -------- 196 | >>> jaccard_similarity("a = 1", "a = 2") 197 | 0.3333333333333333 198 | >>> jaccard_similarity("a = 1", "a = 1") 199 | 1.0 200 | """ 201 | tokens1 = set([t for t in NON_ALPHA.split(code1) if t.strip()]) 202 | tokens2 = set([t for t in NON_ALPHA.split(code2) if t.strip()]) 203 | return len(tokens1 & tokens2) / max(1, len(tokens1 | tokens2)) 204 | 205 | 206 | if __name__ == "__main__": 207 | 208 | def run( 209 | dataset: str = typer.Option("codeparrot/codeparrot-clean-valid", help="The dataset to use"), 210 | config: str = typer.Option("default", help="Dataset config"), 211 | data_dir: str = typer.Option(None, help="Dataset data directory"), 212 | split: str = typer.Option("train", help="Dataset split"), 213 | column: str = typer.Option("content", help="Dataset column"), 214 | cache_dir: str = typer.Option(".cache", help="Cache directory"), 215 | num_perm: int = typer.Option(128, help="Number of permutations"), 216 | seed: int = typer.Option(42, help="Random seed"), 217 | threshold: float = typer.Option(0.58, help="Minhash threshold"), 218 | verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose logging"), 219 | output: str = typer.Option(None, help="Store the deduplicated dataset"), 220 | lfs: bool = typer.Option(False, help="Use LFS files"), 221 | ): 222 | global dup_ids 223 | 224 | OUTPUT_BASE = Path("results") / dataset / config / (data_dir or "all") / split / column 225 | OUTPUT_BASE.mkdir(exist_ok=True, parents=True) 226 | output_concat = OUTPUT_BASE / "concat" 227 | output = output or (OUTPUT_BASE / "decontaminated") 228 | output_duplicates = OUTPUT_BASE / "duplicates" 229 | output_duplicate_results = OUTPUT_BASE / "duplicate_results.jsonl" 230 | logger.info(f"{'Output base':<30}: {OUTPUT_BASE}") 231 | logger.info(f"{'Output concat':<30}: {output_concat}") 232 | logger.info(f"{'Output 
duplicates':<30}: {output_duplicates}") 233 | logger.info(f"{'Output duplicate results':<30}: {output_duplicate_results}") 234 | logger.info(f"{'Output':<30}: {output}") 235 | 236 | conf = { 237 | "cache_dir": cache_dir, 238 | "num_perm": num_perm, 239 | "seed": seed, 240 | "threshold": threshold, 241 | "dataset": dataset, 242 | "config": config, 243 | "data_dir": data_dir, 244 | "split": split, 245 | "column": column, 246 | "verbose": verbose, 247 | "output": output, 248 | "lfs": lfs, 249 | "concat_output": output_concat, 250 | } 251 | 252 | time_measures = {} 253 | 254 | for benchmark in DATASETS_TO_CHECK: 255 | globals()[benchmark["index"]] = MinHashLSH( 256 | threshold=conf["threshold"], 257 | num_perm=conf["num_perm"], 258 | ) 259 | time_measures["load_dataset"] = time.time() 260 | ds = load_dataset_with_config(conf) 261 | time_measures["load_dataset"] = time.time() - time_measures["load_dataset"] 262 | DATA_SIZE = len(ds) 263 | start_time = time.time() 264 | 265 | embedded = ds.map( 266 | function=embed_func, 267 | fn_kwargs={"num_perm": conf["num_perm"]}, 268 | input_columns=["__id__", conf["column"]], 269 | remove_columns=[conf["column"]], 270 | num_proc=os.cpu_count(), 271 | desc=f"Fingerprinting...", 272 | ) 273 | 274 | duplicate_results = [] 275 | for _, benchmark in enumerate(DATASETS_TO_CHECK): 276 | benchmark_ds = concatenate_datasets( 277 | [ 278 | load_dataset(benchmark["name"], split=split, cache_dir=conf["cache_dir"]) 279 | for split in benchmark["splits"] 280 | ] 281 | ) 282 | benchmark_ds = benchmark_ds.map( 283 | function=lambda x, idx: { 284 | **embed_func( 285 | idx, 286 | " ".join( 287 | [x[col] if isinstance(x[col], str) else " ".join(x[col]) for col in benchmark["columns"]] 288 | ), 289 | num_perm=conf["num_perm"], 290 | ), 291 | "__content__": " ".join( 292 | [x[col] if isinstance(x[col], str) else " ".join(x[col]) for col in benchmark["columns"]] 293 | ), 294 | }, 295 | num_proc=os.cpu_count(), 296 | with_indices=True, 297 | desc=f"Fingerprinting...", 298 | ) 299 | with globals()[benchmark["index"]].insertion_session() as session: 300 | for record in benchmark_ds: 301 | session.insert(record["__id__"], LeanMinHash(seed=MINHASH_SEED, hashvalues=record["__signature__"])) 302 | 303 | queried = embedded.map( 304 | function=lambda x, y: query_func(x, y, index=globals()[benchmark["index"]]), 305 | num_proc=os.cpu_count(), 306 | input_columns=[ 307 | "__id__", 308 | "__signature__", 309 | ], 310 | remove_columns=["__signature__"], 311 | desc="Querying...", 312 | features=Features( 313 | { 314 | "__id__": Value("uint64"), 315 | "__neighbors__": Sequence(Value("string")), 316 | } 317 | ), 318 | ).filter( 319 | lambda x: len(x["__neighbors__"]) > 0, 320 | num_proc=os.cpu_count(), 321 | desc=f"Filtering...", 322 | ) 323 | 324 | for record in tqdm( 325 | queried, 326 | desc=f"Checking for false positives...", 327 | ): 328 | neighbors = set(record["__neighbors__"]) 329 | curr_text = ds[record["__id__"]][conf["column"]] 330 | for neighbor in neighbors: 331 | reference = benchmark_ds[int(neighbor)] 332 | reference_text = reference["__content__"] 333 | if jaccard_similarity(curr_text, reference_text) >= conf["threshold"]: 334 | break 335 | else: 336 | continue 337 | dup_ids.add(record["__id__"]) 338 | duplicate_results.append( 339 | { 340 | "original_record": ds[record["__id__"]], 341 | "duplicate_dataset": benchmark["name"], 342 | "duplicate_ids": [benchmark_ds[int(neighbor)] for neighbor in neighbors], 343 | } 344 | ) 345 | 346 | logger.info(f"Done querying false positives 
for {benchmark['name']}") 347 | 348 | if benchmark["name"] == "openai_humaneval": 349 | if "repository_name" not in ds.features or "path" not in ds.features: 350 | break 351 | logger.info("Checking HumanEval") 352 | KNOWN_PATH = "LaudateCorpus1/code-align-evals-data/human_eval" 353 | subset = ds.filter( 354 | lambda x: KNOWN_PATH in x["repository_name"] + "/" + x["path"], 355 | num_proc=os.cpu_count(), 356 | desc=f"Filtering for HumanEval...", 357 | ) 358 | # Find out the minimum maximum similarity 359 | thresholds = [] 360 | for record in subset: 361 | thresholds.append(0) 362 | for target in benchmark_ds: 363 | thresholds[-1] = max( 364 | thresholds[-1], jaccard_similarity(record[conf["column"]], target["__content__"]) 365 | ) 366 | 367 | logger.info(f"{'Minimum maximum similarity':<30}: {min(thresholds):.3f}") 368 | logger.info(f"{'Maximum maximum similarity':<30}: {max(thresholds):.3f}") 369 | logger.info(f"{'Mean maximum similarity':<30}: {np.mean(thresholds):.3f}") 370 | 371 | logger.info(f"Finished checking benchmark {benchmark['name']}") 372 | 373 | time_measures["total_processing_time"] = time.time() - start_time 374 | 375 | duplicates = ds.filter(lambda x: x["__id__"] in dup_ids, num_proc=os.cpu_count()) 376 | final_data = ds.filter( 377 | lambda idx: idx not in dup_ids, 378 | input_columns=["__id__"], 379 | num_proc=os.cpu_count(), 380 | desc="Filtering duplicates...", 381 | ) 382 | 383 | final_data.save_to_disk(output) 384 | duplicates.save_to_disk(output_duplicates) 385 | pd.DataFrame(duplicate_results).to_json(output_duplicate_results, lines=True, orient="records") 386 | 387 | FINAL_DATA_SIZE = len(final_data) 388 | DUP_SIZE = DATA_SIZE - FINAL_DATA_SIZE 389 | LAN = (data_dir or "all").split("/")[-1] 390 | 391 | logger.info(f"{'Language':<30}: {LAN}") 392 | logger.info(f"{'Data Number':<30}: {DATA_SIZE}") 393 | logger.info(f"{'Duplicate Number':<30}: {DUP_SIZE}") 394 | logger.info(f"{'Duplicate Rate':<30}: {DUP_SIZE / DATA_SIZE:.2%}") 395 | logger.info(f"{'Total Time':<30}: {time.time() - start_time:.2f} seconds") 396 | 397 | typer.run(run) 398 | -------------------------------------------------------------------------------- /data_analysis/decontamination/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | datasketch 3 | rich -------------------------------------------------------------------------------- /data_analysis/decontamination/requirements_minhash.txt: -------------------------------------------------------------------------------- 1 | networkit==10.0 2 | datasketch==1.5.8 3 | rich==12.6.0 4 | tqdm==4.64.1 5 | datasets==2.5.1 6 | typer==0.6.1 7 | tabulate==0.9.0 8 | dill==0.3.5.1 -------------------------------------------------------------------------------- /data_analysis/github_issues_analysis/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import datasets 4 | import regex 5 | import torch 6 | from transformers import pipeline 7 | 8 | GITHUB_EMAILS = [ 9 | re.compile(pattern, re.DOTALL) 10 | for pattern in [ 11 | "(.*)From:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)", 12 | "(.*)On.+notifications@github.com.+wrote:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)", 13 | "(.*)Signed-off-by: .+<.+>(.*?)\n?(.*)", 14 | ] 15 | ] 16 | GITHUB_EMAIL_DATE = re.compile("\d+/\d+/\d+ \d{2}:\d{2} [AP]M.+wrote") 17 | GITHUB_EMAIL_LINEBREAK = re.compile("_{20,}") 18 | 19 | 20 | BOT_AUTHORS = [ 21 | "Apache-HBase", 22 | 
"AutorestCI", 23 | "CLAassistant", 24 | "cmsbuild", 25 | "codecov-io", 26 | "codecov-commenter", 27 | "coveralls", 28 | "danger-public", 29 | "dnfclas", 30 | "msftclas", 31 | "PyDocTeur", 32 | "SparkQA", 33 | "karma-pr-reporter", 34 | "danger-public", 35 | "claassistantio", 36 | "probot-stale", 37 | ] 38 | 39 | BOT_KEYWORDS = ["[bot]", "botmanager", "bors-", "jenkins", "k8s-", "-test-", "travis"] 40 | 41 | BOT_SUFFIXES = [ 42 | "-automaton", 43 | "-automation", 44 | "-benchmark", 45 | "-build", 46 | "-deployer", 47 | "-cloud", 48 | "bot", 49 | "-ci", 50 | "-linter", 51 | "-teamcity", 52 | "-test", 53 | "-testing", 54 | "-Service-Account", 55 | ] 56 | 57 | 58 | def merge_text_columns(example): 59 | """Combines description and comment to one column (text) 60 | 61 | Descriptions are issue-level text (body of text when opening an issue), 62 | comments are replies to the parent issue or one of its comments. 63 | We merge them as an event cannot have both at the same time. 64 | """ 65 | events_new = [] 66 | text_columns = ["comment", "description"] 67 | for event_old in example["events"]: 68 | event_new = {k: v for k, v in event_old.items() if k not in text_columns} 69 | comment, description = event_old["comment"], event_old["description"] 70 | text = comment if comment else description 71 | event_new["text"] = text if text else "" 72 | events_new.append(event_new) 73 | example["events"] = events_new 74 | return example 75 | 76 | 77 | def _strip_automated_email_text(text): 78 | """Removes text auto-generated when users post in issues via email reply""" 79 | if text: 80 | text = text.strip() 81 | else: 82 | return "" 83 | # try to extract with regex directly 84 | for pattern in GITHUB_EMAILS: 85 | m = pattern.match(text) 86 | if m: 87 | break 88 | if m: 89 | text = m.group(1) + m.group(3) 90 | else: 91 | # if no exact matches, apply matching line by line and 92 | # get potential content before/after automated email text 93 | lines = text.split("\n") 94 | start, end = 0, -1 95 | for i, line in enumerate(lines): 96 | line = line.strip() 97 | if "notifications@github.com" in line or bool( 98 | GITHUB_EMAIL_DATE.search(line) 99 | ): 100 | start = i 101 | if "Reply to this email directly" in line: 102 | end = i + 1 if line.endswith(":") else i 103 | if line.startswith(">"): 104 | # remove quoted text in replies 105 | end = i 106 | text = "\n".join(lines[:start] + lines[end + 1 :]) 107 | # remove page break line 108 | return GITHUB_EMAIL_LINEBREAK.sub("", text).strip() 109 | 110 | 111 | def strip_automated_email_text(example): 112 | """Removes auto-generated text from emails in Github issues""" 113 | # assumes merge_text_columns() was already applied on dataset 114 | example["events"] = [ 115 | { 116 | k: _strip_automated_email_text(v) if k == "text" else v 117 | for k, v in event.items() 118 | } 119 | for event in example["events"] 120 | ] 121 | return example 122 | 123 | 124 | def remove_bot_comments(example): 125 | """Discard auto comments from issues based on author pattern matching""" 126 | filtered_events = [] 127 | modified = False 128 | for event in example["events"]: 129 | author = event["author"] 130 | # assumes single `text' field rather than comment/description 131 | is_bot = ( 132 | any(bp.lower() in author.lower() for bp in BOT_KEYWORDS) 133 | or any(author.lower().endswith(s) for s in BOT_SUFFIXES) 134 | or any(author == a for a in BOT_AUTHORS) 135 | ) 136 | if not is_bot: 137 | filtered_events.append(event) 138 | else: 139 | modified = True 140 | # example["old_events"] = 
example["events"] 141 | example["events"] = filtered_events 142 | example["bot_issue"] = len(example["events"]) == 0 143 | example["modified_by_bot"] = modified 144 | return example 145 | -------------------------------------------------------------------------------- /data_analysis/kenlm/kenlm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c62f3811-73a4-4de2-9800-84c148508838", 6 | "metadata": {}, 7 | "source": [ 8 | "## Install KenLM\n", 9 | "\n", 10 | "```bash\n", 11 | "sudo apt -y install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev\n", 12 | "wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz\n", 13 | "mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2\n", 14 | "ls kenlm/build/bin\n", 15 | "```" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "7a73b1c0-1349-48d6-930d-b6b1dd6e76cb", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from datasets import load_dataset" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "id": "4361baf4-75d2-4dbe-9dab-2a6bacdc3008", 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "application/vnd.jupyter.widget-view+json": { 37 | "model_id": "4382dfdd8a1542518b8f65f27d755bf9", 38 | "version_major": 2, 39 | "version_minor": 0 40 | }, 41 | "text/plain": [ 42 | "VBox(children=(HTML(value='
\"gram.arpa\"\n", 188 | "kenlm/build/bin/build_binary gram.arpa gram.binary\n", 189 | "```" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 84, 195 | "id": "88d82de8-7bf2-44ee-a9a2-18cf81b847a3", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "import kenlm\n", 200 | "model = kenlm.LanguageModel('./gram.binary')" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 85, 206 | "id": "3087cf00-acaf-4f15-a1be-63d7ba6f0763", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "-21.19799041748047" 213 | ] 214 | }, 215 | "execution_count": 85, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "model.score(\"this is a test\")" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "c2e4fda3-7e99-49d7-8a93-c5318c459bf2", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.7.12" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 5 254 | } 255 | -------------------------------------------------------------------------------- /data_analysis/kenlm/setup.sh: -------------------------------------------------------------------------------- 1 | sudo apt -y install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev 2 | wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz 3 | mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2 4 | cd kenlm 5 | python setup.py install -------------------------------------------------------------------------------- /data_analysis/near-deduplication/.gitignore: -------------------------------------------------------------------------------- 1 | results 2 | *log 3 | *json 4 | minhash_deduplication_alt_streaming.py 5 | dump.rdb 6 | data -------------------------------------------------------------------------------- /data_analysis/near-deduplication/README.md: -------------------------------------------------------------------------------- 1 | # Near deduplication 2 | ## For the lastest version of near-deduplication with speed-ups, check [bigcode-dataset/near_deduplication](https://github.com/bigcode-project/bigcode-dataset/tree/main/near_deduplication) 3 | Code for running near-deduplication with MinHash and LSH indexing 4 | 5 | ### Setup 6 | 7 | ```` 8 | pip install -r requirements.txt 9 | ```` 10 | 11 | Login to be able to push the dataset to the hub after deduplication and clone your huggingface-hub repositories: 12 | 13 | ```` 14 | huggingface-cli login 15 | ```` 16 | 17 | And make sure you have git-lfs installed. 18 | 19 | If you use datasets with different column names from the BigCode ones, you might need to change `PATH_COLUMN` and `CONTENT` variables in `minhash_deduplication.py`. 
20 | 21 | ### Usage 22 | 23 | To run near-deduplication, use the following command and adapt the arguments to your case: 24 | 25 | ```` 26 | python near_deduplicate.py \ 27 | --dataset_name bigcode-data/python_any_license_v2 \ 28 | --org bigcode-data \ 29 | --repo_name python_any_license_v2_near_dedup \ 30 | --out_path ./data/any_license-near-dedup \ 31 | --text_column content 32 | ```` 33 | 34 | To make a test run on a small subset of the data, set the `test_run` argument to True. 35 | 36 | The first time you load the dataset might be slow if it is large, but the data is saved in the cache thanks to `datasets`, so subsequent calls will be fast. 37 | 38 | ### Alternative Deduplication Script 39 | 40 | `minhash_deduplication_alt.py` is an alternative you might find useful as well. It is best suited to a single multi-core machine and uses similar parameters to the original deduplication script. 41 | 42 | ```bash 43 | pip install -r requirements_alt.txt 44 | # Quick example 45 | python minhash_deduplication_alt.py --dataset codeparrot/codeparrot-clean-valid \ 46 | --split train \ 47 | --column content \ 48 | --cache-dir .cache \ 49 | --verbose 50 | # For details on the arguments, see the help message 51 | python minhash_deduplication_alt.py --help 52 | ``` 53 | 54 | #### Implementation Analysis 55 | 56 | This analysis covers the alternative script, which is designed for a single-machine setup. 57 | 58 | ##### Scaling 59 | 60 | To understand the limitations of the current deduplication implementation, it helps to know how each step in the pipeline affects the overall time: 61 | 1. Minhashing is fast, but it takes longer for long documents. Hashing scales with both the number of cores and single-core performance (clock speed, for example). With `datasets`'s caching, it also does not require much memory. 62 | 2. Indexing is basically putting minhash signatures into different buckets. This is one bottleneck in the pipeline. In an ideal situation where MapReduce is seamlessly integrated with the other parts, it could be further improved with distributed buckets. 63 | 3. Depending on how you look at duplicates, querying can be done by iterating over the buckets or over the minhash signatures. 64 | 4. Depending on how you decide to group duplicates, you can build a graph and run connected-component analysis, or use a simple algorithm like union-find (see the sketch after this list). 65 | 5. What to do with a group of duplicates is also a wide-open question. We opt to keep one document per group/cluster in this case. 
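The sketch below is a minimal, self-contained illustration of steps 2–5 using toy MinHash signatures; it is not the project's implementation (see `minhash_deduplication_alt.py` further down for the real one). Each signature is split into bands, documents sharing a band land in the same bucket, buckets are merged with union-find, and one representative per cluster is kept.

```python
from collections import defaultdict

# Toy MinHash signatures; in the real pipeline these come from the fingerprinting step.
signatures = {
    0: [1, 7, 3, 9, 2, 8],
    1: [1, 7, 3, 9, 5, 8],   # shares two of three bands with doc 0
    2: [4, 4, 6, 0, 5, 5],   # shares no band with the others
    3: [1, 7, 3, 9, 2, 8],   # identical to doc 0
}
num_bands, rows_per_band = 3, 2  # num_bands * rows_per_band == signature length

# Step 2: indexing -- each band of a signature becomes a key in a hash table (bucket).
tables = [defaultdict(set) for _ in range(num_bands)]
for doc_id, sig in signatures.items():
    for b in range(num_bands):
        band = tuple(sig[b * rows_per_band:(b + 1) * rows_per_band])
        tables[b][band].add(doc_id)

# Step 4: grouping -- union-find over documents that share at least one bucket.
parent = {doc_id: doc_id for doc_id in signatures}

def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path halving
        x = parent[x]
    return x

def union(x, y):
    root_x, root_y = find(x), find(y)
    parent[max(root_x, root_y)] = min(root_x, root_y)

for table in tables:
    for bucket in table.values():
        smallest = min(bucket)
        for doc_id in bucket:
            union(doc_id, smallest)

# Step 5: keep one document per cluster (here, the one whose id is the cluster root).
kept = [doc_id for doc_id in signatures if find(doc_id) == doc_id]
print(kept)  # -> [0, 2]: docs 1 and 3 collapse into doc 0's cluster, doc 2 stays alone
```

Keeping the smallest index as the cluster representative mirrors the behaviour of the actual script, where `uf.union(x, min(cluster))` is applied per bucket and only records whose index equals their cluster root survive the final filter.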
66 | 67 | ##### Experiments 68 | 69 | We report here some stats on the experiments we did along the way with a 80-core machine on GCP (M1): 70 | 71 | For SantaCoder, our results can be replicated by the following commands: 72 | 73 | ```bash 74 | python minhash_deduplication_alt.py --dataset bigcode/the-stack-dedup-pjj --data-dir data/java --revision v1.1.a1 --cache-dir cache2 --ngram-size 5 --threshold 0.7 --min-token-length 10 --fast 75 | python minhash_deduplication_alt.py --dataset bigcode/the-stack-dedup-pjj --data-dir data/javascript --revision v1.1.a1 --cache-dir cache2 --ngram-size 5 --threshold 0.7 --min-token-length 10 --fast 76 | python minhash_deduplication_alt.py --dataset bigcode/the-stack-dedup-pjj --data-dir data/python --revision v1.1.a1 --cache-dir cache2 --ngram-size 5 --threshold 0.7 --min-token-length 10 --fast 77 | ``` 78 | 79 | Java Results as of Dec 20, 2022 80 | ``` 81 | load_dataset : 3414.68 seconds 82 | minhash : 22966.13 seconds 83 | clustering : 7676.72 seconds 84 | filtering : 1118.62 seconds 85 | save : 3105.66 seconds 86 | Data Number (before) : 40113161 87 | Data Number (after) : 21108567 (52.62%) 88 | Duplicate Number : 19004594 (47.38%) 89 | Total Time : 38281.88 seconds (10.6 hours) 90 | ``` 91 | 92 | 93 | Java (already deduplicated) Results as of Dec 2, 2022 94 | ``` 95 | Load Dataset : 77.18 seconds 96 | Embed : 5052.87 seconds 97 | Create Index : 16253.12 seconds 98 | Save Index : 0.00 seconds 99 | Freeze Memory : 0.00 seconds 100 | Query : 1321.61 seconds 101 | Save Neighbors : 0.00 seconds 102 | Unfreeze Memory : 0.00 seconds 103 | Clustering : 10825.30 seconds 104 | Total Processing Time : 34919.87 seconds 105 | Deduplicate : 605.83 seconds 106 | Save Deduplicated : 2356.10 seconds 107 | Language : java 108 | Data Number (before filtering) : 25124914 109 | Data Number (after filtering) : 24972491 110 | Duplicate Number : 4822205 (19.31%) 111 | Total Reduction : 4974628 (19.80%) 112 | Total Time : 37881.83 seconds (10.5 hours) 113 | ``` 114 | 115 | More details can be found on https://zippy-anise-556.notion.site/Deduplication-Log-d75d1b3f2e684e96a12b069c5aff68cb. 
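For reference, the percentages reported above follow directly from the raw counts (retention = after / before, duplicate rate = duplicates / before); a couple of lines reproduce the Dec 20 Java numbers:

```python
before, after = 40_113_161, 21_108_567
duplicates = before - after            # 19_004_594
print(f"{after / before:.2%}")         # 52.62% retained
print(f"{duplicates / before:.2%}")    # 47.38% duplicates
```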
116 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/minhash_deduplication.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing as mp 3 | import re 4 | from collections import defaultdict 5 | from functools import partial 6 | from typing import Dict, List, Optional, Set, Tuple, Type 7 | 8 | from datasets import Dataset 9 | from tqdm import tqdm 10 | 11 | from datasketch import MinHash, MinHashLSH 12 | from dpu_utils.utils.iterators import ThreadedIterator 13 | 14 | 15 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 16 | # parameters used in DuplicationIndex 17 | MIN_NUM_TOKENS = 10 18 | NUM_PERM = 256 19 | 20 | # column name of file paths, we add as file identifiers 21 | PATH_COLUMN = "original_path" 22 | # name of the "text" column used in deduplication 23 | CONTENT = "content" 24 | 25 | def get_min_hash(tokens: List[str]) -> Optional[MinHash]: 26 | """Compute the MinHash of a code snippet.""" 27 | if len(tokens) < MIN_NUM_TOKENS: 28 | return None 29 | min_hash = MinHash(num_perm=NUM_PERM) 30 | for token in set(tokens): 31 | min_hash.update(token.encode()) 32 | return min_hash 33 | 34 | 35 | def get_tokens(code: str) -> Set[str]: 36 | """Tokenize a code snippet.""" 37 | return set([t for t in NON_ALPHA.split(code) if len(t.strip()) > 0]) 38 | 39 | 40 | class DuplicationIndex: 41 | def __init__( 42 | self, 43 | *, 44 | duplication_jaccard_threshold: float = 0.85, 45 | ): 46 | self._duplication_jaccard_threshold = duplication_jaccard_threshold 47 | self._num_perm = NUM_PERM 48 | self._index = MinHashLSH(threshold=self._duplication_jaccard_threshold, num_perm=self._num_perm) 49 | 50 | self._duplicate_clusters = defaultdict(set) 51 | 52 | def add(self, code_key: Tuple, min_hash: MinHash) -> None: 53 | """Add a key to _index (MinHashLSH) 54 | the min_hash is used to query closest matches based on the jaccard_threshold. 55 | The new key is either added to a existing cluster of one close match, 56 | or a new cluster is created. The clusters created in this way, depend on the order of add. 57 | 58 | Args: 59 | code_key (Tuple of (index, repo_name, path)): 60 | Theoritically any hasbale key. Here we use a tuple to retrieve the information later. 61 | min_hash: MinHash of the code_key. 62 | """ 63 | close_duplicates = self._index.query(min_hash) 64 | if code_key in self._index.keys: 65 | print(f"Duplicate key {code_key}") 66 | return 67 | 68 | self._index.insert(code_key, min_hash) 69 | if len(close_duplicates) > 0: 70 | 71 | for base_duplicate in close_duplicates: 72 | if base_duplicate in self._duplicate_clusters: 73 | self._duplicate_clusters[base_duplicate].add(code_key) 74 | break 75 | else: 76 | self._duplicate_clusters[close_duplicates[0]].add(code_key) 77 | 78 | def get_duplicate_clusters(self) -> List[List[Dict]]: 79 | """Export the duplicate clusters. 80 | For each cluster, the first element is the base element of the cluster. 81 | The base element has an estimation jaccard similarity higher than the threshold with all the other elements. 82 | 83 | Returns: 84 | duplicate_clusters (List[List[Dict]]): 85 | List of duplicate clusters. 
86 | """ 87 | duplicate_clusters = [] 88 | for base, duplicates in self._duplicate_clusters.items(): 89 | cluster = [base] + list(duplicates) 90 | # reformat the cluster to be a list of dict 91 | cluster = [{"base_index": el[0], "original_path": el[1]} for el in cluster] 92 | duplicate_clusters.append(cluster) 93 | return duplicate_clusters 94 | 95 | def save(self, filepath) -> None: 96 | duplicate_clusters = self.get_duplicate_clusters() 97 | with open(filepath, "w") as f: 98 | json.dump(duplicate_clusters, f) 99 | 100 | 101 | def _compute_min_hash(element): 102 | index, data = element 103 | min_hash = get_min_hash([t for t in NON_ALPHA.split(data[CONTENT]) if len(t.strip()) > 0]) 104 | if min_hash is not None: 105 | return (index, data[PATH_COLUMN]), min_hash 106 | 107 | 108 | def minhash_iter(dataset_iterator: Type[Dataset]): 109 | with mp.Pool() as pool: 110 | for data in pool.imap_unordered( 111 | _compute_min_hash, 112 | ThreadedIterator(dataset_iterator, max_queue_size=10000), 113 | chunksize=100, 114 | ): 115 | if data is not None: 116 | yield data 117 | 118 | 119 | def make_duplicate_clusters(dataset_iterator: Type[Dataset], jaccard_threshold: float): 120 | """Find duplicate clusters in the dataset in two steps: 121 | 1. Compute MinHash for each code snippet. MinHash is a tool for fast jaccard similarity estimation. 122 | This step is computed using an asynchronous multiprocessing pool, minhash_iter 123 | 2. Find duplicate clusters. The computed MinHash is added sequentially to the DuplicationIndex. 124 | This step cannot be parallelized. So using asynchronous thread in the previous step helps to speed up the process. 125 | """ 126 | di = DuplicationIndex(duplication_jaccard_threshold=jaccard_threshold) 127 | 128 | for filename, min_hash in tqdm(ThreadedIterator(minhash_iter(enumerate(dataset_iterator)), max_queue_size=100)): 129 | di.add(filename, min_hash) 130 | 131 | # Returns a List[Cluster] where Cluster is List[str] with the filenames. 132 | return di.get_duplicate_clusters() 133 | 134 | 135 | def jaccard_similarity(code1: str, code2: str) -> float: 136 | """Compute the Jaccard similarity of two code snippets.""" 137 | tokens1 = get_tokens(code1) 138 | tokens2 = get_tokens(code2) 139 | return len(tokens1 & tokens2) / len(tokens1 | tokens2) 140 | 141 | 142 | _shared_dataset = None 143 | 144 | 145 | def _find_cluster_extremes_shared(cluster, jaccard_threshold): 146 | """Find a reduced cluster such that each code in the origin cluster is similar to at least one code in the reduced cluster. 147 | Two codes are similar if their Jaccard similarity is above the threshold. 148 | 149 | Args: 150 | cluster (List[dict]): 151 | cluster is a list of dict, each dict contains the following keys: 152 | - base_index 153 | - repo_name 154 | - path 155 | This is a typical output of DuplicationIndex.get_duplicate_clusters() 156 | jaccard_threshold (float): 157 | threshold for Jaccard similarity. 158 | Two codes are similar if their Jaccard similarity is above the threshold. 159 | 160 | Returns: 161 | extremes (List[dict]): 162 | A reduced representation of the cluster. The field copies is added to each dict. 163 | The copies field indicates the number of similar codes in the cluster for a extreme. 
164 | """ 165 | extremes = [] 166 | for element1 in cluster: 167 | code1 = _shared_dataset[element1["base_index"]][CONTENT] 168 | for element2 in extremes: 169 | code2 = _shared_dataset[element2["base_index"]][CONTENT] 170 | if jaccard_similarity(code1, code2) >= jaccard_threshold: 171 | element2["copies"] += 1 172 | break 173 | else: 174 | element1["copies"] = 1 175 | extremes.append(element1) 176 | return extremes 177 | 178 | 179 | def find_extremes(cluster_list, dataset, jaccard_threshold): 180 | """Call the _find_cluster_extremes_shared function in a parallel fashion. 181 | 182 | Args: 183 | cluster_list (List[List[Dict]]): 184 | each cluster is a list of dicts with the key base_index, 185 | referring to the index of the base code in the dataset. 186 | dataset (Type[Dataset]): 187 | dataset is used to access the content of the code snippets, 188 | using the base_index from the cluster_list. 189 | dataset is shared between all the processes using a glabal variable (any other way to share the dataset?), 190 | otherwise the multi processing is not speeded up. 191 | jaccard_threshold (float): 192 | the threshold for the jaccard similarity. The default value is 0.85 193 | 194 | Returns: 195 | extremes_list (List[Dict]): 196 | Each cluster is reduced to extremes. 197 | See _find_cluster_extremes_shared for the definition of extremes. 198 | """ 199 | global _shared_dataset 200 | _shared_dataset = dataset 201 | extremes_list = [] 202 | f = partial(_find_cluster_extremes_shared, jaccard_threshold=jaccard_threshold) 203 | with mp.Pool() as pool: 204 | for extremes in tqdm( 205 | pool.imap_unordered( 206 | f, 207 | cluster_list, 208 | ), 209 | total=len(cluster_list), 210 | ): 211 | extremes_list.append(extremes) 212 | return extremes_list 213 | 214 | 215 | def deduplicate_dataset( 216 | dataset: Type[Dataset], jaccard_threshold: float = 0.85 217 | ) -> Tuple[Type[Dataset], List[List[Dict]]]: 218 | """Deduplicate the dataset using minhash and jaccard similarity. 219 | This function first generate duplicate clusters, then each cluster 220 | is reduced to the extremes that are similar to the other elements in the cluster. 221 | Codes are called similar if their Jaccard similarity is greater than jaccard_threshold (0.85 default). 222 | 223 | Args: 224 | dataset (Type[Dataset]): 225 | The dataset to deduplicate. 226 | jaccard_threshold (float, default=0.85): 227 | jaccard threshold to determine if two codes are similar 228 | 229 | Returns: 230 | ds_dedup (Type[Dataset]): 231 | The deduplicated dataset. 232 | duplicate_clusters (List[List[Dict]]): 233 | The list of duplicate clusters. 234 | Each cluster is a list of dicts with the following keys: 235 | - base_index : int 236 | The index of the code in the original dataset. 237 | - repo_name : str 238 | - path : str 239 | - copies : int 240 | The number of copies of the code in the cluster. (find_cluster_extremes) 241 | - is_extreme : bool 242 | Whether the code is an extreme in the cluster. 243 | All the codes in the cluster are removed from the dataset except the extremes. 
244 | 245 | Example: 246 | >>> from datasets import load_dataset 247 | >>> from minhash_deduplication import deduplicate_dataset 248 | >>> ds = load_dataset("lvwerra/codeparrot-clean", split="train") 249 | >>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85) 250 | """ 251 | duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold) 252 | duplicate_indices = set(x["base_index"] for cluster in duplicate_clusters for x in cluster) 253 | extreme_dict = {} 254 | extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold) 255 | for extremes in extremes_clusters: 256 | for element in extremes: 257 | extreme_dict[element["base_index"]] = element 258 | remove_indices = duplicate_indices - set(extreme_dict.keys()) 259 | ds_filter = dataset.filter(lambda x, idx: idx not in remove_indices, with_indices=True) 260 | 261 | # update duplicate_clusters 262 | for cluster in duplicate_clusters: 263 | for element in cluster: 264 | element["is_extreme"] = element["base_index"] in extreme_dict 265 | if element["is_extreme"]: 266 | element["copies"] = extreme_dict[element["base_index"]]["copies"] 267 | 268 | print(f"Original dataset size: {len(dataset)}") 269 | print(f"Number of duplicate clusters: {len(duplicate_clusters)}") 270 | print(f"Files in duplicate cluster: {len(duplicate_indices)}") 271 | print(f"Unique files in duplicate cluster: {len(extreme_dict)}") 272 | print(f"Filtered dataset size: {len(ds_filter)}") 273 | 274 | return ds_filter, duplicate_clusters 275 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/minhash_deduplication_alt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author : Chenghao Mou (mouchenghao@gmail.com) 4 | # created : 10/4/22 5 | from __future__ import annotations 6 | 7 | import gc 8 | import hashlib 9 | import logging 10 | import multiprocessing as mp 11 | import os 12 | import random 13 | import re 14 | import struct 15 | import time 16 | import warnings 17 | from collections import defaultdict 18 | from itertools import tee 19 | from pathlib import Path 20 | from typing import Any 21 | from typing import Dict 22 | from typing import Iterable 23 | from typing import List 24 | from typing import Tuple 25 | 26 | with warnings.catch_warnings(): 27 | warnings.filterwarnings("ignore", category=FutureWarning) 28 | import datasets 29 | import numpy as np 30 | import typer 31 | from datasets import load_dataset 32 | from scipy.integrate import quad as integrate 33 | from tqdm import tqdm 34 | 35 | 36 | SEED = 42 37 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 38 | RNG = np.random.RandomState(SEED) 39 | MAX_HASH = np.uint64((1 << 32) - 1) 40 | MERSENNE_PRIME = np.uint64((1 << 61) - 1) 41 | logger = logging.getLogger(__name__) 42 | logger.setLevel(logging.INFO) 43 | datasets.logging.set_verbosity_error() 44 | 45 | 46 | def ngrams(sequence: List[str], n: int) -> Iterable: 47 | """ 48 | Directly taken from nltk package to avoid dependency. 49 | 50 | Parameters 51 | ---------- 52 | sequence : list 53 | The sequence of items to be n-grammed. 54 | n : int 55 | The order of the n-grams to be extracted. 56 | 57 | Returns 58 | ------- 59 | Iterable 60 | The n-grams generated from the sequence. 
61 | """ 62 | iterables = tee(sequence, n) 63 | for i, sub_iterable in enumerate(iterables): 64 | for _ in range(i): 65 | next(sub_iterable, None) 66 | return zip(*iterables) 67 | 68 | 69 | def sha1_hash32(data): 70 | """ 71 | Directly taken from datasketch package to avoid dependency. 72 | 73 | Parameters 74 | ---------- 75 | data : bytes 76 | 77 | Returns 78 | ------- 79 | int 80 | """ 81 | return struct.unpack(" Dict[str, Any]: 93 | """ 94 | Combined with some datasketch code to better parallelize computation. 95 | 96 | Parameters 97 | ---------- 98 | content : str 99 | The content to be embedded. 100 | idx : int 101 | The index of the content. 102 | num_perm : int 103 | The number of permutations. 104 | ngram_size : int 105 | The size of n-grams. 106 | hashranges : List[Tuple[int, int]] 107 | The ranges of hash values. 108 | permutations : np.ndarray 109 | The permutations for the minhash. 110 | 111 | Returns 112 | ------- 113 | Dict[str, Any] 114 | The hash values in each range and the index. 115 | """ 116 | hashvalues = np.ones(num_perm, dtype=np.uint64) * MAX_HASH 117 | tokens = {" ".join(t) for t in ngrams(NON_ALPHA.split(content), ngram_size)} 118 | hv = np.array([sha1_hash32(token.encode("utf-8")) for token in tokens], dtype=np.uint64) # noqa: E501 119 | a, b = permutations 120 | phv = np.bitwise_and(((hv * np.tile(a, (len(hv), 1)).T).T + b) % MERSENNE_PRIME, MAX_HASH) # noqa: E501 121 | hashvalues = np.vstack([phv, hashvalues]).min(axis=0) 122 | Hs = [bytes(hashvalues[start:end].byteswap().data) for start, end in hashranges] 123 | return {"__signatures__": Hs, "__id__": idx} 124 | 125 | 126 | def optimal_param( 127 | threshold: float, 128 | num_perm: int, 129 | false_positive_weight: float = 0.5, 130 | false_negative_weight: float = 0.5, 131 | ): 132 | """ 133 | Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum 134 | of probabilities of false positive and false negative, taken from datasketch. 135 | 136 | Parameters 137 | ---------- 138 | threshold : float 139 | The threshold for similarity. 140 | num_perm : int 141 | The number of permutations. 142 | false_positive_weight : float 143 | The weight of false positive. 144 | false_negative_weight : float 145 | The weight of false negative. 146 | 147 | Returns 148 | ------- 149 | Tuple[int, int] 150 | The optimal `b` and `r` parameters. 151 | The number of bands, and the number of rows per band respectively. 
152 | """ 153 | 154 | def false_positive_probability(threshold: float, b: int, r: int): 155 | """Source: `datasketch.lsh`""" 156 | 157 | def proba(s): 158 | return 1 - (1 - s ** float(r)) ** float(b) 159 | 160 | a, _ = integrate(proba, 0.0, threshold) 161 | return a 162 | 163 | def false_negative_probability(threshold: float, b: int, r: int): 164 | """Source: `datasketch.lsh`""" 165 | 166 | def proba(s): 167 | return 1 - (1 - (1 - s ** float(r)) ** float(b)) 168 | 169 | a, _ = integrate(proba, threshold, 1.0) 170 | return a 171 | 172 | min_error = float("inf") 173 | opt = (0, 0) 174 | for b in range(1, num_perm + 1): 175 | max_r = int(num_perm / b) 176 | for r in range(1, max_r + 1): 177 | fp = false_positive_probability(threshold, b, r) 178 | fn = false_negative_probability(threshold, b, r) 179 | error = fp * false_positive_weight + fn * false_negative_weight 180 | if error < min_error: 181 | min_error = error 182 | opt = (b, r) 183 | return opt 184 | 185 | 186 | class UnionFind: 187 | def __init__(self): 188 | self.parent: Dict[int, int] = {} 189 | 190 | def find(self, x): 191 | if x not in self.parent: 192 | self.parent[x] = x 193 | if self.parent[x] != x: 194 | self.parent[x] = self.find(self.parent[x]) 195 | return self.parent[x] 196 | 197 | def union(self, x, y): 198 | px = self.find(x) 199 | py = self.find(y) 200 | self.parent[px] = self.parent[py] = min(px, py) 201 | 202 | 203 | if __name__ == "__main__": 204 | 205 | def run( 206 | dataset: str = typer.Option("codeparrot/codeparrot-clean-valid", help="The dataset to use"), # noqa: E501 207 | config: str = typer.Option("default", help="Dataset config"), 208 | split: str = typer.Option("train", help="Dataset split"), 209 | data_dir: str = typer.Option(None, help="Dataset data directory"), 210 | revision: str = typer.Option("main", help="Dataset revision"), 211 | column: str = typer.Option("content", help="Dataset column"), 212 | cache_dir: str = typer.Option(".cache", help="Cache directory"), 213 | ngram_size: int = typer.Option(5, help="The ngram size to use for MinHash"), 214 | num_perm: int = typer.Option(256, help="Number of permutations"), 215 | threshold: float = typer.Option(0.7, help="Minhash threshold"), 216 | output: str = typer.Option(None, help="Store the deduplicated dataset"), 217 | ): 218 | global uf 219 | OUTPUT_BASE = Path(output or "output") 220 | OUTPUT_BASE.mkdir(exist_ok=True, parents=True) 221 | output = OUTPUT_BASE / "deduplicated" 222 | 223 | logging.basicConfig(level=logging.INFO) 224 | 225 | time_measures = {} 226 | start_time = time.time() 227 | 228 | B, R = optimal_param(threshold, num_perm) 229 | HASH_RANGES = [(i * R, (i + 1) * R) for i in range(B)] 230 | HASH_TABLES = [defaultdict(set) for _ in range(B)] 231 | 232 | time_measures["load_dataset"] = time.time() 233 | ds = load_dataset( 234 | dataset, 235 | config, 236 | data_dir=data_dir, 237 | split=split, 238 | use_auth_token=True, 239 | cache_dir=cache_dir, 240 | revision=revision, 241 | num_proc=os.cpu_count(), 242 | ) 243 | time_measures["load_dataset"] = time.time() - time_measures["load_dataset"] 244 | DATA_SIZE = len(ds) 245 | PERMUTATIONS = np.array( 246 | [ 247 | ( 248 | RNG.randint(1, MERSENNE_PRIME, dtype=np.uint64), 249 | RNG.randint(0, MERSENNE_PRIME, dtype=np.uint64), 250 | ) 251 | for _ in range(num_perm) 252 | ], 253 | dtype=np.uint64, 254 | ).T 255 | 256 | time_measures["minhash"] = time.time() 257 | embedded = ds.map( 258 | function=embed_func, 259 | fn_kwargs={ 260 | "num_perm": num_perm, 261 | "hashranges": HASH_RANGES, 262 | 
"ngram_size": ngram_size, 263 | "permutations": PERMUTATIONS, 264 | }, 265 | input_columns=[column], 266 | remove_columns=ds.column_names, 267 | num_proc=os.cpu_count(), 268 | with_indices=True, 269 | desc="Fingerprinting...", 270 | ) 271 | time_measures["minhash"] = time.time() - time_measures["minhash"] 272 | 273 | time_measures["clustering"] = time.time() 274 | batch_size: int = 10000 275 | for i in tqdm( 276 | range(0, len(embedded), batch_size), dynamic_ncols=True, desc="Iterating MinHashes..." # noqa: E501 277 | ): 278 | batch = embedded[i : i + batch_size] 279 | for key, Hs in zip(batch["__id__"], batch["__signatures__"]): 280 | for H, hashtable in zip(Hs, HASH_TABLES): 281 | hashtable[H].add(key) 282 | for table in tqdm(HASH_TABLES, dynamic_ncols=True, desc="Clustering..."): 283 | for cluster in table.values(): 284 | if len(cluster) <= 1: 285 | continue 286 | idx = min(cluster) 287 | for x in cluster: 288 | uf.union(x, idx) 289 | time_measures["clustering"] = time.time() - time_measures["clustering"] 290 | 291 | time_measures["filtering"] = time.time() 292 | gc.freeze() 293 | gc.disable() 294 | ds = ds.map( 295 | function=lambda _, idx: {"__cluster__": uf.find(idx)}, 296 | with_indices=True, 297 | num_proc=os.cpu_count(), 298 | new_fingerprint=str(random.getrandbits(128)), 299 | desc="Finding clusters...", 300 | ) 301 | gc.enable() 302 | gc.collect() 303 | # This is where the deduplication happens 304 | # Since there is no easy groupby in datasets 305 | # I will use this simple filter for now 306 | final_data = ds.filter( 307 | function=lambda record, idx: record["__cluster__"] == idx, 308 | with_indices=True, 309 | num_proc=os.cpu_count(), 310 | desc="Filtering clusters...", 311 | ) 312 | time_measures["filtering"] = time.time() - time_measures["filtering"] 313 | 314 | time_measures["save"] = time.time() 315 | final_data = final_data.remove_columns(["__cluster__"]) 316 | final_data.save_to_disk(output) 317 | time_measures["save"] = time.time() - time_measures["save"] 318 | 319 | FINAL_DATA_SIZE = len(final_data) 320 | DUP_SIZE = DATA_SIZE - FINAL_DATA_SIZE 321 | PAD = 32 322 | 323 | for key, value in time_measures.items(): 324 | logger.info(f"{key:<{PAD}}: {value:.2f} seconds") 325 | logger.info(f"{'Data Number (before)':<{PAD}}: {DATA_SIZE}") 326 | logger.info( 327 | f"{'Data Number (after)':<{PAD}}: {FINAL_DATA_SIZE} ({FINAL_DATA_SIZE / DATA_SIZE:.2%})" # noqa: E501 328 | ) 329 | logger.info(f"{'Duplicate Number':<{PAD}}: {DUP_SIZE} ({DUP_SIZE / DATA_SIZE:.2%})") # noqa: E501 330 | logger.info(f"{'Total Time':<{PAD}}: {time.time() - start_time:.2f} seconds") 331 | logger.info(f"{'Deduplicated Dataset':<{PAD}}: {output}") 332 | logger.info("🤗 Happy Deduplicating 🤗") 333 | 334 | mp.set_start_method("fork", force=True) 335 | uf = UnionFind() 336 | typer.run(run) 337 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/minhash_deduplication_debug.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing as mp 3 | import re 4 | import time 5 | from collections import defaultdict 6 | from functools import partial 7 | from typing import Dict, List, Optional, Set, Tuple, Type 8 | import numpy as np 9 | 10 | from datasets import Dataset 11 | from tqdm import tqdm 12 | 13 | from datasketch import MinHash, MinHashLSH 14 | from dpu_utils.utils.iterators import ThreadedIterator 15 | 16 | 17 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 18 | # parameters used in 
DuplicationIndex 19 | MIN_NUM_TOKENS = 10 20 | NUM_PERM = 256 21 | 22 | # column name of file paths, we add as file identifiers 23 | PATH_COLUMN = "path" 24 | # name of the "text" column used in deduplication 25 | CONTENT = "content" 26 | 27 | def get_min_hash(tokens: List[str]) -> Optional[MinHash]: 28 | """Compute the MinHash of a code snippet.""" 29 | if len(tokens) < MIN_NUM_TOKENS: 30 | return None 31 | min_hash = MinHash(num_perm=NUM_PERM) 32 | for token in set(tokens): 33 | min_hash.update(token.encode()) 34 | return min_hash 35 | 36 | 37 | def get_tokens(code: str) -> Set[str]: 38 | """Tokenize a code snippet.""" 39 | return set([t for t in NON_ALPHA.split(code) if len(t.strip()) > 0]) 40 | 41 | 42 | class DuplicationIndex: 43 | def __init__( 44 | self, 45 | *, 46 | duplication_jaccard_threshold: float = 0.85, 47 | ): 48 | self._duplication_jaccard_threshold = duplication_jaccard_threshold 49 | self._num_perm = NUM_PERM 50 | self._index = MinHashLSH(threshold=self._duplication_jaccard_threshold, num_perm=self._num_perm) 51 | 52 | self._duplicate_clusters = defaultdict(set) 53 | 54 | def add(self, code_key: Tuple, min_hash: MinHash) -> None: 55 | """Add a key to _index (MinHashLSH) 56 | the min_hash is used to query closest matches based on the jaccard_threshold. 57 | The new key is either added to a existing cluster of one close match, 58 | or a new cluster is created. The clusters created in this way, depend on the order of add. 59 | 60 | Args: 61 | code_key (Tuple of (index, repo_name, path)): 62 | Theoritically any hasbale key. Here we use a tuple to retrieve the information later. 63 | min_hash: MinHash of the code_key. 64 | """ 65 | close_duplicates = self._index.query(min_hash) 66 | if code_key in self._index.keys: 67 | print(f"Duplicate key {code_key}") 68 | return 69 | 70 | self._index.insert(code_key, min_hash) 71 | if len(close_duplicates) > 0: 72 | 73 | for base_duplicate in close_duplicates: 74 | if base_duplicate in self._duplicate_clusters: 75 | self._duplicate_clusters[base_duplicate].add(code_key) 76 | break 77 | else: 78 | self._duplicate_clusters[close_duplicates[0]].add(code_key) 79 | 80 | def get_duplicate_clusters(self) -> List[List[Dict]]: 81 | """Export the duplicate clusters. 82 | For each cluster, the first element is the base element of the cluster. 83 | The base element has an estimation jaccard similarity higher than the threshold with all the other elements. 84 | 85 | Returns: 86 | duplicate_clusters (List[List[Dict]]): 87 | List of duplicate clusters. 
88 | """ 89 | duplicate_clusters = [] 90 | for base, duplicates in self._duplicate_clusters.items(): 91 | cluster = [base] + list(duplicates) 92 | # reformat the cluster to be a list of dict 93 | cluster = [{"base_index": el[0], "original_path": el[1]} for el in cluster] 94 | duplicate_clusters.append(cluster) 95 | return duplicate_clusters 96 | 97 | def save(self, filepath) -> None: 98 | duplicate_clusters = self.get_duplicate_clusters() 99 | with open(filepath, "w") as f: 100 | json.dump(duplicate_clusters, f) 101 | 102 | 103 | def _compute_min_hash(element): 104 | index, data = element 105 | min_hash = get_min_hash([t for t in NON_ALPHA.split(data[CONTENT]) if len(t.strip()) > 0]) 106 | if min_hash is not None: 107 | return (index, data[PATH_COLUMN]), min_hash 108 | 109 | 110 | def minhash_iter(dataset_iterator: Type[Dataset]): 111 | # computing minhah hash of the samples in dataset iterator in parallel 112 | with mp.Pool() as pool: 113 | for data in pool.imap_unordered( 114 | _compute_min_hash, 115 | ThreadedIterator(dataset_iterator, max_queue_size=10000), 116 | chunksize=100, 117 | ): 118 | if data is not None: 119 | yield data 120 | 121 | 122 | def make_duplicate_clusters(dataset_iterator: Type[Dataset], jaccard_threshold: float): 123 | """Find duplicate clusters in the dataset in two steps: 124 | 1. Compute MinHash for each code snippet. MinHash is a tool for fast jaccard similarity estimation. 125 | This step is computed using an asynchronous multiprocessing pool, minhash_iter 126 | 2. Find duplicate clusters. The computed MinHash is added sequentially to the DuplicationIndex. 127 | This step cannot be parallelized. So using asynchronous thread in the previous step helps to speed up the process. 128 | """ 129 | di = DuplicationIndex(duplication_jaccard_threshold=jaccard_threshold) 130 | 131 | print("\ncomputing minhashes") 132 | t_start = time.time() 133 | hashes = [] 134 | for filename, min_hash in tqdm(ThreadedIterator(minhash_iter(enumerate(dataset_iterator)), max_queue_size=100)): 135 | hashes.append((filename, min_hash)) 136 | print(f"minhashes computed in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 137 | 138 | print("\nbuilding clusters") 139 | t_start = time.time() 140 | for filename, min_hash in tqdm(hashes): 141 | di.add(filename, min_hash) 142 | print(f"clusters built in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 143 | 144 | # Returns a List[Cluster] where Cluster is List[str] with the filenames. 
145 | print("\nexporting the clusters") 146 | t_start = time.time() 147 | clusters = di.get_duplicate_clusters() 148 | print(f"clusters exported in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 149 | 150 | stats = [len(cluster) for cluster in clusters] 151 | print(f"max, min, mean and median of the cluster sizes: {max(stats)}, {min(stats)}, {np.mean(stats)}, {np.median(stats)}") 152 | print("saving the clusters list and stats") 153 | with open("./clusters_list.json", "w") as fp: 154 | json.dump(clusters, fp) 155 | with open("./clusters_stats.json", "w") as fp: 156 | json.dump(stats, fp) 157 | 158 | return clusters 159 | 160 | 161 | def jaccard_similarity(code1: str, code2: str) -> float: 162 | """Compute the Jaccard similarity of two code snippets.""" 163 | tokens1 = get_tokens(code1) 164 | tokens2 = get_tokens(code2) 165 | return len(tokens1 & tokens2) / len(tokens1 | tokens2) 166 | 167 | 168 | _shared_dataset = None 169 | 170 | 171 | def _find_cluster_extremes_shared(cluster, jaccard_threshold): 172 | """Find a reduced cluster such that each code in the origin cluster is similar to at least one code in the reduced cluster. 173 | Two codes are similar if their Jaccard similarity is above the threshold. 174 | 175 | Args: 176 | cluster (List[dict]): 177 | cluster is a list of dict, each dict contains the following keys: 178 | - base_index 179 | - repo_name 180 | - path 181 | This is a typical output of DuplicationIndex.get_duplicate_clusters() 182 | jaccard_threshold (float): 183 | threshold for Jaccard similarity. 184 | Two codes are similar if their Jaccard similarity is above the threshold. 185 | 186 | Returns: 187 | extremes (List[dict]): 188 | A reduced representation of the cluster. The field copies is added to each dict. 189 | The copies field indicates the number of similar codes in the cluster for a extreme. 190 | """ 191 | extremes = [] 192 | for element1 in cluster: 193 | code1 = _shared_dataset[element1["base_index"]][CONTENT] 194 | for element2 in extremes: 195 | code2 = _shared_dataset[element2["base_index"]][CONTENT] 196 | if jaccard_similarity(code1, code2) >= jaccard_threshold: 197 | element2["copies"] += 1 198 | break 199 | else: 200 | element1["copies"] = 1 201 | extremes.append(element1) 202 | return extremes 203 | 204 | 205 | def find_extremes(cluster_list, dataset, jaccard_threshold): 206 | """Call the _find_cluster_extremes_shared function in a parallel fashion. 207 | 208 | Args: 209 | cluster_list (List[List[Dict]]): 210 | each cluster is a list of dicts with the key base_index, 211 | referring to the index of the base code in the dataset. 212 | dataset (Type[Dataset]): 213 | dataset is used to access the content of the code snippets, 214 | using the base_index from the cluster_list. 215 | dataset is shared between all the processes using a glabal variable (any other way to share the dataset?), 216 | otherwise the multi processing is not speeded up. 217 | jaccard_threshold (float): 218 | the threshold for the jaccard similarity. The default value is 0.85 219 | 220 | Returns: 221 | extremes_list (List[Dict]): 222 | Each cluster is reduced to extremes. 223 | See _find_cluster_extremes_shared for the definition of extremes. 
224 | """ 225 | global _shared_dataset 226 | _shared_dataset = dataset 227 | extremes_list = [] 228 | f = partial(_find_cluster_extremes_shared, jaccard_threshold=jaccard_threshold) 229 | with mp.Pool() as pool: 230 | for extremes in tqdm( 231 | pool.imap_unordered( 232 | f, 233 | cluster_list, 234 | ), 235 | total=len(cluster_list), 236 | ): 237 | extremes_list.append(extremes) 238 | return extremes_list 239 | 240 | 241 | def deduplicate_dataset( 242 | dataset: Type[Dataset], jaccard_threshold: float = 0.85 243 | ) -> Tuple[Type[Dataset], List[List[Dict]]]: 244 | """Deduplicate the dataset using minhash and jaccard similarity. 245 | This function first generate duplicate clusters, then each cluster 246 | is reduced to the extremes that are similar to the other elements in the cluster. 247 | Codes are called similar if their Jaccard similarity is greater than jaccard_threshold (0.85 default). 248 | 249 | Args: 250 | dataset (Type[Dataset]): 251 | The dataset to deduplicate. 252 | jaccard_threshold (float, default=0.85): 253 | jaccard threshold to determine if two codes are similar 254 | 255 | Returns: 256 | ds_dedup (Type[Dataset]): 257 | The deduplicated dataset. 258 | duplicate_clusters (List[List[Dict]]): 259 | The list of duplicate clusters. 260 | Each cluster is a list of dicts with the following keys: 261 | - base_index : int 262 | The index of the code in the original dataset. 263 | - repo_name : str 264 | - path : str 265 | - copies : int 266 | The number of copies of the code in the cluster. (find_cluster_extremes) 267 | - is_extreme : bool 268 | Whether the code is an extreme in the cluster. 269 | All the codes in the cluster are removed from the dataset except the extremes. 270 | 271 | Example: 272 | >>> from datasets import load_dataset 273 | >>> from minhash_deduplication import deduplicate_dataset 274 | >>> ds = load_dataset("lvwerra/codeparrot-clean", split="train") 275 | >>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85) 276 | """ 277 | duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold) 278 | print("MinHash computation done and cluster info saved") 279 | 280 | duplicate_indices = set(x["base_index"] for cluster in duplicate_clusters for x in cluster) 281 | print("\nComputing extremes for all clusters") 282 | extreme_dict = {} 283 | t_start = time.time() 284 | extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold) 285 | print(f"Extremes found in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 286 | 287 | for extremes in extremes_clusters: 288 | for element in extremes: 289 | extreme_dict[element["base_index"]] = element 290 | remove_indices = duplicate_indices - set(extreme_dict.keys()) 291 | print("\nnow filtering the duplicates(extremes) from the dataset") 292 | t_start = time.time() 293 | ds_filter = dataset.filter(lambda x, idx: idx not in remove_indices, with_indices=True) 294 | print(f"dataset filtered in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 295 | 296 | t_start = time.time() 297 | # update duplicate_clusters 298 | for cluster in duplicate_clusters: 299 | for element in cluster: 300 | element["is_extreme"] = element["base_index"] in extreme_dict 301 | if element["is_extreme"]: 302 | element["copies"] = extreme_dict[element["base_index"]]["copies"] 303 | print(f"clusters (for analysis) updated in: {time.time()-t_start:.2f}s => {(time.time()-t_start)/60:.2f}min") 304 | 305 | print(f"Original dataset size: {len(dataset)}") 306 | print(f"Number 
of duplicate clusters: {len(duplicate_clusters)}") 307 | print(f"Files in duplicate cluster: {len(duplicate_indices)}") 308 | print(f"Unique files in duplicate cluster: {len(extreme_dict)}") 309 | print(f"Filtered dataset size: {len(ds_filter)}") 310 | 311 | return ds_filter, duplicate_clusters 312 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/near_deduplicate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | from pathlib import Path 5 | import re 6 | from huggingface_hub import Repository 7 | from multiprocessing import Pool 8 | from tqdm import tqdm 9 | from argparse import Namespace, ArgumentParser 10 | 11 | from datasets import load_dataset 12 | 13 | from minhash_deduplication import deduplicate_dataset 14 | 15 | 16 | def parse_args(): 17 | parser = ArgumentParser(description='near deduplication') 18 | parser.add_argument( 19 | "--dataset_name", 20 | default="bigcode-data/python_any_license_v2", 21 | type=str, 22 | help="dataset to deduplicate, path to HF repo or local path", 23 | ) 24 | parser.add_argument( 25 | "--text_column", 26 | default="content", 27 | type=str, 28 | help="column name of the text to dedulicate", 29 | ) 30 | parser.add_argument( 31 | "--jaccard_threshold", 32 | default=0.85, 33 | type=float, 34 | help="Jaccard similarity threshold", 35 | ) 36 | # we save data locally before pushing to the Hub to avoid any issues 37 | # the remote HF repo where we want the new data is cloned inside a folder out_path 38 | # and the data is saved inside 39 | parser.add_argument( 40 | "--repo_name", 41 | default="python_any_license_v2_near_dedup", 42 | type=str, 43 | help="HF repo where deduplicated dataset will be pushed later, repo is cloned, and data is saved inside", 44 | ) 45 | parser.add_argument( 46 | "--out_path", 47 | default="./data/data-near-dedup", 48 | type=str, 49 | help="local directory where repo_name is cloned", 50 | ) 51 | parser.add_argument( 52 | "--org", 53 | default="bigcode-data", 54 | type=str, 55 | help="HF org/username where the data will be pushed", 56 | ) 57 | parser.add_argument( 58 | "--shard_size", 59 | default=1000 << 20, 60 | type=int, 61 | help="size of the dataset shards", 62 | ) 63 | parser.add_argument( 64 | "--test_run", 65 | default=False, 66 | type=bool, 67 | help="make a test run, if True we only deduplicate a small subset", 68 | ) 69 | return parser.parse_args() 70 | 71 | 72 | 73 | def save_shard(shard_tuple): 74 | """Save shard""" 75 | filename, shard = shard_tuple 76 | shard.to_parquet(filename) 77 | 78 | args = parse_args() 79 | 80 | print("setting up the repo") 81 | repo = Repository( 82 | local_dir=args.out_path, 83 | clone_from=args.org + "/" + args.repo_name, 84 | repo_type="dataset", 85 | private=True, 86 | use_auth_token=True, 87 | git_user=args.org 88 | ) 89 | output_dir = Path(args.out_path) 90 | output_dir.mkdir(exist_ok=True) 91 | os.mkdir(args.out_path + "/data") 92 | print("setup done") 93 | 94 | 95 | t_start = time.time() 96 | # the data is saved in the cache for future loadings 97 | ds = load_dataset(args.dataset_name, split="train", use_auth_token=True) 98 | #ds = load_dataset("bigcode-data/python_any_license_v2", split="train", use_auth_token=True) 99 | 100 | if args.test_run: 101 | # for a test run we only use a small subset 102 | ds = ds.select([i for i in range(7000)]) 103 | init_size = len(ds) 104 | print(f"Time to load dataset: {time.time()-t_start:.2f}") 105 | 106 | 
107 | # Deduplicate with minhash and jaccard similarity 108 | t_start = time.time() 109 | ds, duplicate_clusters = deduplicate_dataset(ds, args.jaccard_threshold) 110 | new_size = len(ds) 111 | print(f"Time to deduplicate dataset: {time.time()-t_start:.2f}") 112 | print(f"Size of deduplicated dataset: {len(ds)}, old dataset size {init_size}") 113 | with open("size_info.json", "w") as f: 114 | json.dump([init_size, new_size, (init_size-new_size)*100/init_size],f) 115 | 116 | 117 | with open(output_dir / "duplicate_clusters.json", "w") as f: 118 | json.dump(duplicate_clusters, f) 119 | 120 | 121 | if ds._indices is not None: 122 | dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data) 123 | else: 124 | dataset_nbytes = ds.data.nbytes 125 | num_shards = int(dataset_nbytes / args.shard_size) + 1 126 | 127 | 128 | t_start = time.time() 129 | shards = (ds.shard(num_shards=num_shards, index=i, contiguous=True) for i in range(num_shards)) 130 | filenames = (f"{args.out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" for index in range(num_shards)) 131 | 132 | with Pool(16) as p: 133 | list(tqdm(p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4), total=num_shards)) 134 | print(f"Time to save dataset: {time.time()-t_start:.2f}") 135 | 136 | # To push to hub run `git add data/commit/push` inside dataset repo folder (the one cloned from HF: out_path/args.repo_name) 137 | # no need to push duplicate_clusters.json 138 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.5.1 2 | huggingface-hub==0.8.1 3 | datasketch==1.5.8 4 | dpu_utils 5 | -------------------------------------------------------------------------------- /data_analysis/near-deduplication/requirements_alt.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.5.1 2 | typer>=0.6.1 -------------------------------------------------------------------------------- /data_analysis/notebooks/bigcode_pls.csv: -------------------------------------------------------------------------------- 1 | index,lang,count,size (Gb),Arjun's comments,Include,Size 2 | 1,abap,14766.00,0.097798223,,, 3 | 2,actionscript,151475.00,0.714806413,Omit: legacy PL for Flash. Flash no longer supported on any major browser. Unlikely that people are writing new Flash code.,, 4 | 3,ada,39273.00,0.577616957,"Include: used in aerospace, defense, etc.?",1,0.577616957 5 | 4,agda,18697.00,0.076828045,Include: significant in PL research. Functional. Verification.,1,0.076828045 6 | 5,ags-script,1148.00,0.010587383,,,0 7 | 6,alloy,5207.00,0.015588033,Include: significant in PL/SE/FM research. Relational.,1,0.015588033 8 | 7,ampl,75.00,7.65E-05,,,0 9 | 8,antlr,9312.00,0.061952353,Include: widely-used for specifying context-free grammars,1,0.061952353 10 | 9,apacheconf,187.00,0.000245907,,,0 11 | 10,api-blueprint,2895.00,0.032142603,,,0 12 | 11,apl,2127.00,0.007897713,undecided,,0 13 | 12,applescript,5169.00,0.010201155,Include: major Appple language,1,0.010201155 14 | 13,arc,2089.00,0.019751439,,,0 15 | 14,arduino,163528.00,0.788386157,Harm: C++ variant for programmable electronics,,0 16 | 15,asciidoc,221025.00,1.201338633,Harm: data looks good from a few samples (it's documentation style),,0 17 | 16,asp,111163.00,0.522229033,,,0 18 | 17,aspectj,2870.00,0.008254705,undecided,,0 19 | 18,assembly,262334.00,1.648889979,Assembly. 
Include,1,1.648889979 20 | 19,ats,3247.00,0.014441544,,,0 21 | 20,augeas,208.00,0.000826699,Include: minor DSL from RedHet. Declarative. Very different from other langs.,1,0.000826699 22 | 21,autohotkey,15258.00,0.084246694,,,0 23 | 22,autoit,11864.00,0.102856281,,,0 24 | 23,awk,11172.00,0.024450499,Include: widely used scripting language,1,0.024450499 25 | 24,batchfile,281187.00,0.333024122,Include: widely used scripting language,1,0.333024122 26 | 25,befunge,37.00,4.01E-06,,,0 27 | 26,bison,145.00,0.003887651,Include: widely-used for specifying lexers,1,0.003887651 28 | 27,bitbake,73167.00,0.089609174,,,0 29 | 28,blitzbasic,228.00,0.007069605,,,0 30 | 29,blitzmax,1720.00,0.013420046,,,0 31 | 30,bluespec,6923.00,0.037450439,Include: hardware description language,1,0.037450439 32 | 31,boo,4721,0.007027597,,,0 33 | 32,brainfuck,10077,0.049801268,undecided,,0 34 | 33,brightscript,2340,0.014593211,,,0 35 | 34,bro,912,0.00293523,,,0 36 | 35,c,11206308,75.92987105,Obvious,1,75.92987105 37 | 36,c++,7600356,65.96642214,Obvious,1,65.96642214 38 | 37,c-sharp,13281504,57.98494339,Obvious,1,57.98494339 39 | 38,c2hs-haskell,1084,0.011598729,,,0 40 | 39,cap'n-proto,1538,0.003322507,undecided,,0 41 | 40,cartocss,1781,0.020215215,,,0 42 | 41,ceylon,6506,0.017001518,undecided: a Red Hat language; seems abandoned?,,0 43 | 42,chapel,15912,0.037359901,undecided,,0 44 | 43,chuck,1397,0.003407997,,,0 45 | 44,cirru,1078,0.06595696,,,0 46 | 45,clarion,1039,0.010567484,,,0 47 | 46,clean,835,0.004459314,undecided,,0 48 | 47,click,371,0.00161367,,,0 49 | 48,clips,1701,0.010305217,,,0 50 | 49,clojure,137204,0.56312332,"Include: Lispy, JVM, Datomic",1,0.56312332 51 | 50,cmake,248361,0.679152478,Obvious,1,0.679152478 52 | 51,cobol,3175,0.018076838,undecided,1,0.018076838 53 | 52,coffeescript,256615,0.882924413,undecided,1,0.882924413 54 | 53,coldfusion,14649,0.072415269,,,0 55 | 54,coldfusion-cfc,14331,0.07927666,,,0 56 | 55,common-lisp,111008,1.936640009,Include: still widely used in software that is still maintained,1,1.936640009 57 | 56,component-pascal,860,0.108625083,,,0 58 | 57,coq,40,0.000229547,Misclassified. There must be more data,,0 59 | 58,creole,689,0.001459754,,,0 60 | 59,crystal,90251,0.301409398,,,0 61 | 60,csound,1608,0.04287534,,,0 62 | 61,css,3586141,34.86330899,,,0 63 | 62,csv,5992650,266.826193,Exclude,,0 64 | 63,cucumber,92741,0.256383451,,,0 65 | 64,cuda,71446,0.730044154,Obvious,1,0.730044154 66 | 65,cycript,459,0.011675073,,,0 67 | 66,cython,42621,0.403100053,Harm: looks good to me,,0 68 | 67,d,431,0.00177274,Misclassified. There must be more data,1,0.00177274 69 | 68,darcs-patch,369,0.002560221,Exclude. Probably not a PL,,0 70 | 69,dart,1070286,4.877312353,Include: major Google language,1,4.877312353 71 | 70,desktop,17075,0.009376839,not a PL?,,0 72 | 71,diff,386667,2.599852065,Exclude. Probably not a PL,,0 73 | 72,digital-command-language,19109,0.070433624,,,0 74 | 73,dm,4603,0.032649213,,,0 75 | 74,dns-zone,1457,0.023461303,,,0 76 | 75,dockerfile,639343,0.517538485,undecided. 
This is basically shell scripts,1,0.517538485 77 | 76,dogescript,364,0.00013444,,,0 78 | 77,dylan,4747,0.040813775,undecided,,0 79 | 78,eagle,45991,4.645208881,"Data, based on 3-4 samples",,0 80 | 79,ec,1152,0.023841255,,,0 81 | 80,ecere-projects,182,0.001313465,,,0 82 | 81,ecl,5120,0.028274082,,,0 83 | 82,edn,23358,1.374275976,,,0 84 | 83,eiffel,28472,0.126345794,undecided,,0 85 | 84,elixir,351916,0.93891419,"Include: Erlang, concurrent",1,0.93891419 86 | 85,elm,68002,0.367896114,"Include: functional, web",1,0.367896114 87 | 86,emacs-lisp,68984,0.551702192,Obvious,1,0.551702192 88 | 87,emberscript,1969,0.025593958,,,0 89 | 88,erlang,121296,0.931688288,Obvious,1,0.931688288 90 | 89,f#,142144,1.031253976,Obvious,1,1.031253976 91 | 90,factor,12163,0.036390924,,,0 92 | 91,fancy,538,0.001420258,,,0 93 | 92,fantom,2598,0.010674187,,,0 94 | 93,fish,33460,0.040602801,,,0 95 | 94,flux,1221,0.00724385,,,0 96 | 95,forth,4692,0.021339541,undecided,,0 97 | 96,fortran,186211,2.079444236,Obvious,1,2.079444236 98 | 97,freemarker,74678,0.317162226,,,0 99 | 98,g-code,12185,0.890362356,Harm: 3d printers,,0 100 | 99,gams,2011,0.059093651,,,0 101 | 100,gap,1210,0.012636526,,,0 102 | 101,gas,111211,1.303199948,GNU Assembler,1,1.303199948 103 | 102,gdscript,129390,0.27310127,,,0 104 | 103,genshi,398,0.00844459,,,0 105 | 104,gentoo-ebuild,45554,0.051730603,,,0 106 | 105,gentoo-eclass,379,0.002866541,,,0 107 | 106,gettext-catalog,273229,9.711336079,Harm: multi-lingual API descriptions,,0 108 | 107,glsl,206223,0.686847354,Include: shaders,1,0.686847354 109 | 108,glyph,47,0.000743859,,,0 110 | 109,gnuplot,31328,1.461753918,A language?,,0 111 | 110,go,5889635,32.01376363,Obvious,1,32.01376363 112 | 111,golo,540,0.000686322,,,0 113 | 112,gosu,1112,0.005721374,,,0 114 | 113,grace,583,0.001056405,,,0 115 | 114,grammatical-framework,4231,0.038704313,,,0 116 | 115,graphql,54946,0.179884882,"Harm: Facebook query language for APIs, 14k stars on Github",,0 117 | 116,graphviz-(dot),99748,1.074293176,"Exclude: this is data, not a PL. IMO Graphviz is usually generated and not hand-written",,0 118 | 117,groff,240689,2.938305336,"Old typesetting language. Still used of course, but mostly replaced by tex. 
This is mostly data",,0 119 | 118,groovy,306823,1.207558695,Include,1,1.207558695 120 | 119,groovy-server-pages,15392,0.097240488,,,0 121 | 120,haml,136898,0.167000394,,,0 122 | 121,handlebars,292850,0.594546259,,,0 123 | 122,harbour,819,0.001420626,,,0 124 | 123,haskell,582682,2.745870866,Include: major functional language,1,2.745870866 125 | 124,haxe,147781,0.596376245,"Harm: Game dev, mobile, ",,0 126 | 125,hcl,341067,0.807011745,Harm: data format,,0 127 | 126,hlsl,30266,0.086940443,,,0 128 | 127,html,16602372,291.4582452,,,0 129 | 128,html+django,60784,0.148110205,,,0 130 | 129,html+eex,17119,0.024966882,,,0 131 | 130,html+erb,649197,1.008359107,,,0 132 | 131,html+php,98821,0.405201841,,,0 133 | 132,http,25314,0.067294465,,,0 134 | 133,hy,1404,0.007694671,,,0 135 | 134,idl,492,0.008022681,,,0 136 | 135,idris,8449,0.034530222,"Include: significant in PL research, functional, verification",1,0.034530222 137 | 136,igor-pro,730,0.011822097,,,0 138 | 137,inform-7,429,0.015622285,,,0 139 | 138,ini,1630329,3.566209389,Probably exclude: Not a PL?,,0 140 | 139,inno-setup,3752,0.019808392,,,0 141 | 140,io,3031,0.00836172,undecided,,0 142 | 141,ioke,409,0.002303905,,,0 143 | 142,irc-log,101,0.006792731,,,0 144 | 143,isabelle,5724,0.098911734,"Include: significant in PL research, functional, verification",1,0.098911734 145 | 144,j,3093,0.015198472,,,0 146 | 145,jade,94273,0.178474889,,,0 147 | 146,jasmin,7371,0.04117105,,,0 148 | 147,java,25124914,112.8234043,Obvious,1,112.8234043 149 | 148,java-server-pages,281662,1.330394207,Obvious,1,1.330394207 150 | 149,javascript,25429179,166.2414118,Obvious,1,166.2414118 151 | 150,jflex,3398,0.014072463,,,0 152 | 151,json,36297006,627.8661835,,,0 153 | 152,json5,6048,0.075441422,,,0 154 | 153,jsoniq,4242,0.006691644,,,0 155 | 154,jsonld,30033,0.244416576,,,0 156 | 155,jsx,1094154,3.045787612,Harm: javascript react ,,0 157 | 156,julia,332174,1.751905383,"Include: HPC, concurrent, etc.",1,1.751905383 158 | 157,jupyter-notebook,1199902,162.4975517,Harm: processed separately,,0 159 | 158,kicad,11882,2.991856997,Sampled 3-4. Mostly data from what I can tell,,0 160 | 159,kit,1446,0.006728298,,,0 161 | 160,kotlin,2644255,6.822951943,Include: the Android PL,1,6.822951943 162 | 161,krl,292,0.001202317,,,0 163 | 162,labview,2176,0.091341864,undecided,,0 164 | 163,lasso,1158,0.048377834,,,0 165 | 164,latte,7211,0.014196855,,,0 166 | 165,lean,21003,0.162765045,"Include: significant in PL research, functional, verification",1,0.162765045 167 | 166,less,407086,1.145523703,Harm: markup language ,,0 168 | 167,lex,3380,0.081799333,undecided. related to Bison?,1,0.081799333 169 | 168,lfe,1048,0.002028635,,,0 170 | 169,lilypond,7406,0.031897442,,,0 171 | 170,linker-script,15072,0.081791257,,,0 172 | 171,liquid,30751,0.122617805,Harm: Shopify markup language,,0 173 | 172,literate-agda,573,0.005294123,Include if including Agda,1,0.005294123 174 | 173,literate-coffeescript,1156,0.005056236,Include if including CoffeeScript,1,0.005056236 175 | 174,literate-haskell,6703,0.067831422,Include if including Haskell,1,0.067831422 176 | 175,livescript,9699,0.039472374,,,0 177 | 176,llvm,72679,0.785443486,undecided. 
Is this just generated stuff?,,0 178 | 177,logos,21190,0.349273476,Harm: old PL,,0 179 | 178,logtalk,3125,0.009893734,,,0 180 | 179,lolcode,811,0.001302327,,,0 181 | 180,lookml,484,0.006730641,,,0 182 | 181,lsl,2753,0.015521771,,,0 183 | 182,lua,637541,3.766811033,"Include: significantly used in scripting, games",1,3.766811033 184 | 183,m,134,7.75E-05,,,0 185 | 184,m4,23018,0.153150917,,,0 186 | 185,makefile,801562,2.143090361,Include: more shell scripts,1,2.143090361 187 | 186,mako,9332,0.037311069,,,0 188 | 187,maple,2308,0.026826063,Include: scientific programming,1,0.026826063 189 | 188,markdown,25656996,95.84382086,,,0 190 | 189,mask,1448,0.009652023,,,0 191 | 190,mathematica,41260,1.877927466,Include: scientific programming,1,1.877927466 192 | 191,matlab,1046,0.047973853,Include: scientific programming,1,0.047973853 193 | 192,max,12744,0.596285678,,,0 194 | 193,maxscript,604,0.005334702,,,0 195 | 194,mediawiki,21551,0.158945413,,,0 196 | 195,metal,5042,0.0181611,,,0 197 | 196,mirah,5709,0.070237047,,,0 198 | 197,modelica,26120,0.141068962,,,0 199 | 198,module-management-system,346,0.002231462,,,0 200 | 199,monkey,1828,0.007924306,,,0 201 | 200,moonscript,5309,0.016770277,,,0 202 | 201,mtml,536,0.001453616,,,0 203 | 202,muf,561,0.001667817,,,0 204 | 203,mupad,774,0.006010949,,,0 205 | 204,myghty,11,6.29E-05,,,0 206 | 205,nesc,19216,0.138510587,,,0 207 | 206,netlinx,188,0.002778406,,,0 208 | 207,netlogo,1524,0.039246293,,,0 209 | 208,nginx,15,2.26E-05,,,0 210 | 209,nimrod,57910,0.522520055,,,0 211 | 210,ninja,3893,0.099536967,,,0 212 | 211,nit,87,0.000264594,,,0 213 | 212,nix,195025,0.649707883,,,0 214 | 213,nsis,4159,0.029944153,,,0 215 | 214,nu,927,0.002204864,,,0 216 | 215,numpy,7,1.32E-05,,,0 217 | 216,objdump,711,0.052410804,,,0 218 | 217,objective-c++,75367,0.694668418,,,0 219 | 218,objective-j,456,0.008176827,Exclude: I think this is dead?,,0 220 | 219,ocaml,187323,1.365987005,Include: major functional language,1,1.365987005 221 | 220,octave,201,0.003441835,"Undecided. Precursor to R, etc?",,0 222 | 221,omgrofl,4,3.03E-06,,,0 223 | 222,ooc,1487,0.005106774,,,0 224 | 223,opa,363,0.004085012,,,0 225 | 224,opal,158,0.000459592,,,0 226 | 225,opencl,27492,0.201742827,,,0 227 | 226,openscad,21218,0.111863208,,,0 228 | 227,org,51999,0.365003693,,,0 229 | 228,ox,250,0.001307936,,,0 230 | 229,oxygene,55,0.000240078,,,0 231 | 230,oz,1208,0.004989877,,,0 232 | 231,pan,5673,0.060544641,,,0 233 | 232,papyrus,12577,0.04664692,,,0 234 | 233,parrot,30,5.35E-05,,,0 235 | 234,parrot-assembly,431,0.000497986,,,0 236 | 235,parrot-internal-representation,1576,0.047925378,,,0 237 | 236,pascal,131974,1.877279695,Include: best PL,1,1.877279695 238 | 237,pawn,2530,0.057843412,,,0 239 | 238,perl,475344,2.987812011,Obvious,1,2.987812011 240 | 239,perl6,11279,0.050197622,,,0 241 | 240,php,22633374,89.45680388,Include: ugh,1,89.45680388 242 | 241,piglatin,1702,0.00286154,,,0 243 | 242,pike,1033,0.003621717,,,0 244 | 243,pod,13406,0.136778901,,,0 245 | 244,pogoscript,228,0.000541998,,,0 246 | 245,pony,3860,0.020518823,,,0 247 | 246,postscript,17707,1.568144315,Undecided. 
Is this just data?,,0 248 | 247,pov-ray-sdl,1225,0.021130333,,,0 249 | 248,powershell,333193,1.654198124,Include: Windows scripting languge,1,1.654198124 250 | 249,processing,59476,0.240804965,"Likely include, but remind myself -- this just Java, right?",1,0.240804965 251 | 250,prolog,1162,0.012870975,Include: GOFAI,1,0.012870975 252 | 251,propeller-spin,2335,0.022317081,,,0 253 | 252,protocol-buffer,124069,0.556668256,Include: Google engineers will be happy,1,0.556668256 254 | 253,pure-data,15018,0.104991313,,,0 255 | 254,purebasic,19536,0.054749768,,,0 256 | 255,purescript,33671,0.14750348,,,0 257 | 256,python,15148604,80.13463578,Obvious,1,80.13463578 258 | 257,python-traceback,9,3.25E-05,Exclude,,0 259 | 258,qmake,10338,0.022073257,,,0 260 | 259,qml,54070,0.238437227,,,0 261 | 260,r,41580,0.316489511,Obvious,1,0.316489511 262 | 261,racket,4318,0.039497334,"Include: PL used in teaching and research, mixed-paradigm",1,0.039497334 263 | 262,ragel-in-ruby-host,1847,0.011011379,,,0 264 | 263,raml,13598,0.048217182,,,0 265 | 264,rdoc,10940,0.031951045,,,0 266 | 265,realbasic,871,0.010335615,,,0 267 | 266,rebol,1495,0.006508232,Is this dead?,,0 268 | 267,red,3611,0.039918747,,,0 269 | 268,redcode,513,0.002673693,,,0 270 | 269,ren'py,4898,0.11411255,,,0 271 | 270,renderscript,995,0.002270658,,,0 272 | 271,restructuredtext,1122999,4.281577848,Harm points out to include,1,4.281577848 273 | 272,rhtml,7469,0.009793544,Is this R?,,0 274 | 273,rmarkdown,5831,0.061474482,"Include, this is R",1,0.061474482 275 | 274,robotframework,14924,0.063781898,,,0 276 | 275,rouge,1542,0.004224455,,,0 277 | 276,ruby,4463248,9.781043003,Obvious,1,9.781043003 278 | 277,rust,1677940,12.91866254,Obvious,1,12.91866254 279 | 278,sage,1930,0.014832049,,,0 280 | 279,saltstack,31203,0.042226819,,,0 281 | 280,sas,10755,0.160669991,"Carolyn: include, this is a stats scripting language",1,0.160669991 282 | 281,sass,108862,0.196029291,,,0 283 | 282,scala,1607698,6.061807569,Obvious,1,6.061807569 284 | 283,scaml,114,0.000120361,,,0 285 | 284,scheme,64961,0.597677147,Include: Lisp derivative,1,0.597677147 286 | 285,scilab,3129,0.011411926,,,0 287 | 286,scss,2449305,5.284018372,,,0 288 | 287,self,85,0.000509786,Inspiration for JavaScript. No more of this on the web?,1,0.000509786 289 | 288,shell,2540313,4.066701981,Obvious,1,4.066701981 290 | 289,shellsession,10,1.58E-05,,,0 291 | 290,shen,306,0.001125251,,,0 292 | 291,slash,9274,0.035859409,,,0 293 | 292,slim,52114,0.059704412,,,0 294 | 293,smali,288385,3.211701155,,,0 295 | 294,smalltalk,652835,0.762851187,Probably include. Still some people using this I believe,1,0.762851187 296 | 295,smarty,167634,0.56143135,,,0 297 | 296,smt,26696,0.404797122,Undecided. Lispy. These are likely generated files.,,0 298 | 297,solidity,216020,1.696340017,Include: smart contracts,1,1.696340017 299 | 298,sourcepawn,7288,0.130712384,,,0 300 | 299,sparql,16585,0.045287982,"Include: Spark, etc.",1,0.045287982 301 | 300,sqf,39747,0.140966975,,,0 302 | 301,sql,1066540,12.67551878,Obvious,1,12.67551878 303 | 302,squirrel,6385,0.038697967,,,0 304 | 303,stan,6556,0.015036825,Include: PPL,1,0.015036825 305 | 304,standard-ml,52443,0.538388205,Include: FP,1,0.538388205 306 | 305,stata,34926,0.418126748,"Carolyn: stata is an important data analysis lang. but we want only the .do files, not the .dta files",1,0.418126748 307 | 306,ston,1720,0.001329725,,,0 308 | 307,stylus,88412,0.157867404,,,0 309 | 308,supercollider,2761,0.01793768,,,0 310 | 309,svg,3816962,79.85220843,Undecided. 
Same category as XML,,0 311 | 310,swift,2063629,7.305693131,Obvious,1,7.305693131 312 | 311,systemverilog,56261,0.498070786,Include: HDL,1,0.498070786 313 | 312,tcl,63362,0.511421635,Include: 90s scripting language; still widely used,1,0.511421635 314 | 313,tcsh,6313,0.027505715,Include: shell scripts,1,0.027505715 315 | 314,tea,1086,0.043406292,,,0 316 | 315,tex,593741,5.863253431,Include: help me write my papers,1,5.863253431 317 | 316,text,17241649,233.4156933,what,,0 318 | 317,textile,8550,0.036826583,,,0 319 | 318,thrift,6008,0.019724138,Include: Facebook engineers want this,1,0.019724138 320 | 319,toml,603802,0.682639067,Undecided. Likely just data,,0 321 | 320,turing,281,0.003482555,,,0 322 | 321,turtle,167790,2.961368586,https://www.w3.org/TR/turtle/ Data. Ignore?,,0 323 | 322,twig,445738,1.318711043,,,0 324 | 323,txl,149,0.002262777,,,0 325 | 324,typescript,12817789,36.60518411,Obvious,1,36.60518411 326 | 325,unified-parallel-c,204,0.001573707,,,0 327 | 326,unity3d-asset,5283462,9.059069948,,,0 328 | 327,uno,4380,0.011293002,,,0 329 | 328,unrealscript,11238,0.070370664,,,0 330 | 329,urweb,1181,0.007658499,"Undecided. This is probably Adam Chlipala's language, but double-check",,0 331 | 330,vala,4830,0.024860858,,,0 332 | 331,vcl,1325,0.005250865,,,0 333 | 332,verilog,169,0.000955152,Include: HDL,1,0.000955152 334 | 333,vhdl,73888,1.457399755,Include: HDL,1,1.457399755 335 | 334,viml,110500,0.404044383,Include: VIM configurations,1,0.404044383 336 | 335,visual-basic,193066,1.709779092,Obvious,1,1.709779092 337 | 336,volt,4226,0.014539185,,,0 338 | 337,vue,1938940,8.574533644,.vue -- these have a significant HTML component,,0 339 | 338,web-ontology-language,13385,1.07383534,,,0 340 | 339,webassembly,8113,0.214897947,Undecided. Mostly generated code,,0 341 | 340,webidl,2622,0.005019794,,,0 342 | 341,wisp,130,0.00064634,,,0 343 | 342,x10,406,0.001619354,"Exclude: this project is done, right?",,0 344 | 343,xbase,7993,0.066203089,,,0 345 | 344,xc,585,0.004971472,,,0 346 | 345,xml,12683687,149.8550762,Data,,0 347 | 346,xojo,2940,0.032677002,,,0 348 | 347,xpages,315,0.002216953,,,0 349 | 348,xproc,1297,0.006994676,,,0 350 | 349,xquery,25933,0.046211199,,,0 351 | 350,xs,2025,0.03171586,,,0 352 | 351,xslt,57501,0.715532013,Include: XSLT is an XML transformation language; these are likely hand-written. 
XSLT is also a huge PITA to write by hand,1,0.715532013 353 | 352,xtend,11916,0.052799695,,,0 354 | 353,yacc,78972,0.557021816,Include: parser generator,1,0.557021816 355 | 354,yaml,7675977,40.82317001,Data,,0 356 | 355,yang,13539,0.216720152,,,0 357 | 356,zephir,3184,0.012502994,,,0 358 | 357,zig,19483,0.249499308,Include: up and coming language in the same space as Rust,1,0.249499308 359 | 358,zimpl,314,0.00236159,,,0 360 | ,,,,,92,842.6753674 -------------------------------------------------------------------------------- /data_analysis/notebooks/new_extension_distribution.csv: -------------------------------------------------------------------------------- 1 | ,extension,language,count,low_alphanum_count,long_lines_count,non_lexable_count 2 | 0,adb,ada,1000,0,1,85 3 | 1,ads,ada,1000,1,2,20 4 | 2,ada,ada,1000,0,4,31 5 | 3,agda,agda,1000,0,3,49 6 | 4,als,alloy,1000,0,1,39 7 | 5,g4,antlr,1000,3,0,443 8 | 6,scpt,applescript,1000,3,22,57 9 | 7,applescript,applescript,1000,0,38,113 10 | 8,asm,assembly,1000,2,115,0 11 | 9,nasm,assembly,159,0,0,0 12 | 10,a51,assembly,28,0,0,0 13 | 11,aug,augeas,255,0,0,0 14 | 12,awk,awk,1000,5,2,255 15 | 13,gawk,awk,225,0,1,103 16 | 14,mawk,awk,22,0,0,13 17 | 15,nawk,awk,8,0,0,1 18 | 16,auk,awk,3,0,0,3 19 | 17,cmd,batchfile,1000,0,163,14 20 | 18,bat,batchfile,1000,1,37,13 21 | 19,bison,bison,176,0,1,0 22 | 20,bsv,bluespec,1000,3,7,0 23 | 21,c,c,1000,5,4,11 24 | 22,h,c,1000,1,3,192 25 | 23,cats,c,3,0,0,2 26 | 24,w,c,5,0,0,2 27 | 25,cpp,c++,1000,2,3,3 28 | 26,ipp,c++,20,0,2,0 29 | 27,cc,c++,1000,0,1,3 30 | 28,hpp,c++,1000,1,7,3 31 | 29,inl,c++,91,0,0,0 32 | 30,cxx,c++,389,0,2,0 33 | 31,cp,c++,10,0,0,0 34 | 32,hh,c++,197,0,0,5 35 | 33,tcc,c++,19,0,0,0 36 | 34,hxx,c++,140,0,1,0 37 | 35,tpp,c++,3,0,0,0 38 | 36,c++,c++,15,0,0,0 39 | 37,h++,c++,1,0,0,0 40 | 38,cs,c-sharp,1000,0,2,31 41 | 39,cshtml,c-sharp,585,0,9,429 42 | 40,cake,c-sharp,9,0,0,8 43 | 41,csx,c-sharp,8,0,0,6 44 | 42,clj,clojure,1000,5,10,9 45 | 43,cljs,clojure,1000,2,2,3 46 | 44,cljc,clojure,1000,0,21,11 47 | 45,boot,clojure,121,0,1,23 48 | 46,cl2,clojure,7,0,0,0 49 | 47,cljx,clojure,8,0,0,0 50 | 48,cmake,cmake,1000,0,29,41 51 | 49,coffee,coffeescript,1000,2,20,47 52 | 50,cson,coffeescript,1000,3,13,0 53 | 51,cjsx,coffeescript,185,0,1,8 54 | 52,iced,coffeescript,92,0,0,9 55 | 53,_coffee,coffeescript,4,0,0,0 56 | 54,lisp,common-lisp,1000,2,21,32 57 | 55,asd,common-lisp,1000,0,1,6 58 | 56,lsp,common-lisp,1000,0,0,28 59 | 57,sexp,common-lisp,197,0,107,63 60 | 58,ny,common-lisp,52,0,5,6 61 | 59,css,css,1000,0,154,273 62 | 60,cuh,cuda,1000,1,3,0 63 | 61,cu,cuda,1000,0,4,2 64 | 62,dart,dart,1000,0,3,17 65 | 63,,dockerfile,1000,0,0,29 66 | 64,1,dockerfile,1,0,0,0 67 | 65,dockerfile,dockerfile,334,0,0,1 68 | 66,3,dockerfile,1,0,0,0 69 | 67,mustache,dockerfile,1,0,0,0 70 | 68,ex,elixir,1000,0,7,379 71 | 69,exs,elixir,1000,0,2,57 72 | 70,elm,elm,1000,2,10,82 73 | 71,el,emacs-lisp,1000,1,31,109 74 | 72,emacs,emacs-lisp,142,0,4,10 75 | 73,erl,erlang,1000,6,3,35 76 | 74,hrl,erlang,1000,2,8,13 77 | 75,yrl,erlang,57,0,0,6 78 | 76,xrl,erlang,30,0,0,30 79 | 77,escript,erlang,71,0,1,6 80 | 78,fs,f-sharp,1000,3,13,39 81 | 79,fsx,f-sharp,1000,1,32,31 82 | 80,fsi,f-sharp,516,0,0,5 83 | 81,f,fortran,1000,6,38,559 84 | 82,f90,fortran,1000,9,1,14 85 | 83,f03,fortran,145,0,0,1 86 | 84,for,fortran,549,1,0,226 87 | 85,f95,fortran,216,0,0,4 88 | 86,f08,fortran,65,0,1,4 89 | 87,fpp,fortran,75,1,0,55 90 | 88,f77,fortran,5,0,0,0 91 | 89,glsl,glsl,1000,2,22,119 92 | 90,frag,glsl,1000,1,2,120 93 | 91,vert,glsl,1000,0,1,44 94 | 
92,shader,glsl,1000,0,45,908 95 | 93,fsh,glsl,692,1,2,165 96 | 94,frg,glsl,35,0,0,28 97 | 95,vsh,glsl,326,0,1,39 98 | 96,geom,glsl,149,0,0,12 99 | 97,geo,glsl,289,0,2,215 100 | 98,fp,glsl,221,2,37,84 101 | 99,glslv,glsl,35,0,0,2 102 | 100,vshader,glsl,8,0,0,0 103 | 101,fshader,glsl,5,0,0,0 104 | 102,vrx,glsl,2,0,2,2 105 | 103,go,go,1000,0,13,0 106 | 104,groovy,groovy,1000,0,3,16 107 | 105,gtpl,groovy,21,0,0,13 108 | 106,gvy,groovy,2,0,0,0 109 | 107,grt,groovy,2,0,0,0 110 | 108,hs,haskell,1000,1,3,39 111 | 109,hsc,haskell,105,0,0,8 112 | 110,html,html,1000,0,240,131 113 | 111,htm,html,541,1,241,51 114 | 112,xhtml,html,149,0,60,1 115 | 113,xht,html,30,0,0,0 116 | 114,idr,idris,1000,1,2,195 117 | 115,lidr,idris,291,0,0,23 118 | 116,thy,isabelle,1000,0,20,399 119 | 117,java,java,1000,0,3,10 120 | 118,jsp,java-server-pages,1000,1,9,362 121 | 119,js,javascript,1000,3,113,151 122 | 120,es6,javascript,9,0,0,1 123 | 121,jsm,javascript,1,0,0,0 124 | 122,pac,javascript,2,0,0,0 125 | 123,xsjslib,javascript,1,0,0,0 126 | 124,sjs,javascript,3,0,0,0 127 | 125,stan,stan,1000,0,2,3 128 | 126,jl,julia,1000,0,12,15 129 | 127,kt,kotlin,1000,1,0,186 130 | 128,kts,kotlin,669,0,1,16 131 | 129,lean,lean,1000,0,0,6 132 | 130,hlean,lean,268,0,0,0 133 | 131,lagda,literate-agda,1000,0,0,12 134 | 132,litcoffee,literate-coffeescript,1000,0,4,0 135 | 133,lhs,literate-haskell,1000,0,2,114 136 | 134,lua,lua,1000,1,15,15 137 | 135,nse,lua,91,0,0,1 138 | 136,wlua,lua,8,0,0,0 139 | 137,,makefile,1000,2,9,80 140 | 138,mk,makefile,1000,1,20,104 141 | 139,mak,makefile,273,1,11,111 142 | 140,txt,makefile,1,0,0,0 143 | 141,cmake,makefile,1,0,0,0 144 | 142,mpl,maple,1000,1,6,0 145 | 143,md,markdown,1000,2,73,20 146 | 144,markdown,markdown,244,0,32,2 147 | 145,mkd,markdown,5,0,0,0 148 | 146,ron,markdown,2,0,0,0 149 | 147,mkdn,markdown,1,0,0,0 150 | 148,ma,mathematica,739,0,390,680 151 | 149,cdf,mathematica,610,1,8,207 152 | 150,nb,mathematica,1000,0,720,301 153 | 151,mt,mathematica,766,6,61,657 154 | 152,wl,mathematica,686,1,13,503 155 | 153,wlt,mathematica,113,1,1,103 156 | 154,nbp,mathematica,30,0,0,30 157 | 155,mathematica,mathematica,1,0,0,1 158 | 156,matlab,matlab,1000,0,338,2 159 | 157,ml,ocaml,1000,1,6,22 160 | 158,mli,ocaml,1000,0,3,0 161 | 159,mll,ocaml,163,2,2,0 162 | 160,mly,ocaml,149,0,0,2 163 | 161,eliom,ocaml,10,0,0,0 164 | 162,eliomi,ocaml,2,0,0,0 165 | 163,ml4,ocaml,11,0,0,0 166 | 164,pas,pascal,1000,0,4,62 167 | 165,dfm,pascal,1000,0,0,3 168 | 166,dpr,pascal,1000,0,0,6 169 | 167,lpr,pascal,339,0,1,18 170 | 168,pl,perl,1000,2,6,119 171 | 169,t,perl,1000,1,5,130 172 | 170,pm,perl,1000,0,8,32 173 | 171,al,perl,788,1,1,35 174 | 172,plx,perl,28,0,6,0 175 | 173,ph,perl,62,0,7,4 176 | 174,psgi,perl,11,0,0,0 177 | 175,perl,perl,62,0,1,8 178 | 176,php,php,1000,1,15,0 179 | 177,phpt,php,92,0,0,0 180 | 178,ctp,php,23,0,1,0 181 | 179,ps1,powershell,1000,0,6,15 182 | 180,psm1,powershell,1000,1,7,8 183 | 181,psd1,powershell,999,1,9,1 184 | 182,yap,prolog,1000,2,18,36 185 | 183,prolog,prolog,480,1,36,67 186 | 184,proto,protocol-buffer,1000,0,17,0 187 | 185,py,python,1000,0,11,1 188 | 186,bzl,python,50,0,0,0 189 | 187,pyw,python,6,0,0,0 190 | 188,gyp,python,15,0,0,0 191 | 189,pyde,python,2,0,0,0 192 | 190,r,r,1000,4,20,18 193 | 191,rd,r,912,0,8,67 194 | 192,rsx,r,38,1,2,0 195 | 193,scrbl,racket,1000,0,10,85 196 | 194,rktd,racket,281,1,132,0 197 | 195,rktl,racket,396,3,0,0 198 | 196,rst,restructuredtext,1000,6,21,7 199 | 197,rest,restructuredtext,41,0,2,0 200 | 198,rmd,rmarkdown,1000,0,72,0 201 | 199,rb,ruby,1000,0,6,0 202 | 
200,gemspec,ruby,600,0,24,0 203 | 201,podspec,ruby,430,0,0,0 204 | 202,rake,ruby,134,0,0,0 205 | 203,jbuilder,ruby,102,0,2,1 206 | 204,ru,ruby,59,0,12,24 207 | 205,builder,ruby,14,0,0,1 208 | 206,ruby,ruby,3,0,0,0 209 | 207,rabl,ruby,12,0,0,0 210 | 208,thor,ruby,2,0,0,0 211 | 209,rbw,ruby,1,0,0,0 212 | 210,rs,rust,1000,0,3,1 213 | 211,sas,sas,1000,200,43,230 214 | 212,scala,scala,1000,0,5,25 215 | 213,sbt,scala,913,0,1,3 216 | 214,scm,scheme,1000,5,9,86 217 | 215,sld,scheme,1000,0,99,60 218 | 216,sps,scheme,446,2,9,87 219 | 217,sh,shell,1000,0,20,1 220 | 218,bash,shell,312,1,4,0 221 | 219,tmux,shell,7,0,0,0 222 | 220,bats,shell,127,0,0,0 223 | 221,zsh,shell,232,0,1,0 224 | 222,ksh,shell,26,0,0,0 225 | 223,command,shell,24,0,0,0 226 | 224,tool,shell,1,0,0,0 227 | 225,st,smalltalk,1000,0,10,416 228 | 226,sol,solidity,1000,8,22,575 229 | 227,sparql,sparql,1000,1,43,138 230 | 228,rq,sparql,1000,2,6,126 231 | 229,sql,sql,1000,0,83,129 232 | 230,tab,sql,267,2,40,60 233 | 231,cql,sql,96,0,6,49 234 | 232,pkb,sql,26,0,0,1 235 | 233,prc,sql,23,0,2,9 236 | 234,pls,sql,17,0,0,1 237 | 235,pks,sql,16,0,0,1 238 | 236,ddl,sql,75,0,2,22 239 | 237,pck,sql,9,0,3,4 240 | 238,plsql,sql,7,0,0,1 241 | 239,db2,sql,4,0,0,0 242 | 240,udf,sql,7,0,0,7 243 | 241,plb,sql,1,0,0,0 244 | 242,stan,stan,1000,0,2,3 245 | 243,sig,standard-ml,1000,1,415,442 246 | 244,fun,standard-ml,958,0,0,463 247 | 245,sml,standard-ml,1000,1,4,367 248 | 246,do,stata,1000,3,56,29 249 | 247,ado,stata,1000,15,24,20 250 | 248,sthlp,stata,1000,0,0,17 251 | 249,ihlp,stata,406,0,0,1 252 | 250,mata,stata,392,0,4,9 253 | 251,doh,stata,5,0,0,0 254 | 252,matah,stata,8,0,0,0 255 | 253,sv,systemverilog,1000,2,3,9 256 | 254,svh,systemverilog,1000,15,0,47 257 | 255,vh,systemverilog,615,2,10,5 258 | 256,tcl,tcl,1000,0,44,501 259 | 257,tm,tcl,398,4,10,258 260 | 258,adp,tcl,322,0,0,318 261 | 259,csh,tcsh,1000,1,6,7 262 | 260,tcsh,tcsh,340,0,6,2 263 | 261,toc,tex,479,0,25,0 264 | 262,ltx,tex,151,2,2,0 265 | 263,bib,tex,1000,0,214,0 266 | 264,sty,tex,611,0,6,0 267 | 265,tex,tex,1000,4,88,2 268 | 266,ins,tex,44,1,2,0 269 | 267,dtx,tex,174,0,1,2 270 | 268,aux,tex,438,0,26,0 271 | 269,mkii,tex,12,0,0,0 272 | 270,bbx,tex,15,0,0,0 273 | 271,lbx,tex,2,0,0,0 274 | 272,cbx,tex,6,0,0,0 275 | 273,mkiv,tex,27,0,0,0 276 | 274,mkvi,tex,3,0,0,0 277 | 275,thrift,thrift,1000,0,0,32 278 | 276,ts,typescript,1000,0,9,32 279 | 277,tsx,typescript,1000,0,15,775 280 | 278,veo,verilog,1000,0,2,0 281 | 279,vhd,vhdl,1000,0,8,32 282 | 280,vhdl,vhdl,1000,0,106,335 283 | 281,vho,vhdl,309,0,0,59 284 | 282,vhi,vhdl,17,0,5,7 285 | 283,vht,vhdl,75,0,0,2 286 | 284,vhf,vhdl,33,0,4,0 287 | 285,vhw,vhdl,5,0,0,0 288 | 286,vb,visual-basic,1000,0,6,0 289 | 287,frm,visual-basic,1000,1,840,0 290 | 288,bas,visual-basic,1000,0,7,0 291 | 289,vbs,visual-basic,722,0,6,0 292 | 290,vba,visual-basic,77,0,0,0 293 | 291,vbhtml,visual-basic,57,0,1,0 294 | 292,frx,visual-basic,38,1,14,0 295 | 293,xsl,xslt,1000,0,20,2 296 | 294,xslt,xslt,1000,1,51,7 297 | 295,yy,yacc,1000,2,30,0 298 | 296,y,yacc,1000,8,0,0 299 | 297,yacc,yacc,14,0,0,0 300 | 298,zig,zig,1000,0,5,123 301 | -------------------------------------------------------------------------------- /data_analysis/notebooks/stats.csv: -------------------------------------------------------------------------------- 1 | table_id,size_before_gb,size_after_gb,row_before,row_after,delta_size_percentage,delta_row_percentage 2 | total,6814.8,2412.78,545847408,236746567,64.59,56.63 3 | abap,0.25,0.09,23512,12161,65.33,48.28 4 | 
actionscript,1.39,0.71,215655,136143,48.78,36.87 5 | ada,0.96,0.32,60325,31291,66.94,48.13 6 | agda,0.14,0.09,24996,17608,34.2,29.56 7 | ags-script,0.02,0.01,2004,1061,60.19,47.06 8 | alloy,0.17,0.01,19907,5374,92.6,73.0 9 | ampl,0.0,0.0,88,75,21.43,14.77 10 | antlr,0.19,0.05,16787,7983,71.89,52.45 11 | apacheconf,0.0,0.0,471,149,76.95,68.37 12 | api-blueprint,0.05,0.03,3714,2634,42.54,29.08 13 | apl,0.01,0.01,2396,2039,24.75,14.9 14 | applescript,0.02,0.01,6030,4906,17.87,18.64 15 | arc,0.03,0.02,2637,1548,22.72,41.3 16 | arduino,1.29,0.76,211901,144914,41.08,31.61 17 | asciidoc,5.39,1.04,474531,184331,80.72,61.16 18 | asp,0.98,0.56,159294,84104,42.34,47.2 19 | aspectj,0.01,0.01,4449,2509,39.31,43.61 20 | assembly,2.77,1.71,363453,248396,38.23,31.66 21 | ats,0.02,0.01,4234,2963,34.91,30.02 22 | augeas,0.0,0.0,254,195,40.67,23.23 23 | autohotkey,0.14,0.09,18230,14648,34.54,19.65 24 | autoit,0.19,0.1,16777,10982,43.63,34.54 25 | awk,0.04,0.03,13454,10430,36.4,22.48 26 | batchfile,1.28,0.42,440070,252514,67.34,42.62 27 | befunge,0.0,0.0,35,33,5.51,5.71 28 | bison,0.0,0.0,176,134,5.08,23.86 29 | bitbake,0.28,0.08,137980,49453,70.28,64.16 30 | blitzbasic,0.01,0.01,386,237,40.62,38.6 31 | blitzmax,0.02,0.01,2000,1695,31.39,15.25 32 | bluespec,0.07,0.04,9050,5940,47.39,34.36 33 | boo,0.01,0.01,6868,4570,30.58,33.46 34 | brainfuck,0.06,0.05,10822,3602,24.85,66.72 35 | brightscript,0.02,0.01,2862,2148,34.06,24.95 36 | bro,0.0,0.0,1100,871,21.12,20.82 37 | c,267.01,61.99,21383832,8625559,76.78,59.66 38 | c-sharp,146.43,52.5,21702269,10839399,64.15,50.05 39 | c2hs-haskell,0.02,0.01,1492,968,32.38,35.12 40 | capn-proto,0.04,0.0,3203,1308,92.31,59.16 41 | cartocss,0.05,0.02,3053,1322,69.99,56.7 42 | ceylon,0.03,0.02,8271,5900,38.22,28.67 43 | chapel,0.22,0.03,33787,13591,85.25,59.77 44 | chuck,0.0,0.0,1621,1260,22.1,22.27 45 | cirru,0.07,0.06,1197,980,13.06,18.13 46 | clarion,0.01,0.01,1237,1086,19.34,12.21 47 | clean,0.01,0.0,925,776,18.32,16.11 48 | click,0.0,0.0,483,330,38.22,31.68 49 | clips,0.01,0.01,2291,1545,26.89,32.56 50 | clojure,1.13,0.56,177398,126191,50.99,28.87 51 | cmake,2.42,0.56,586269,186517,77.02,68.19 52 | cobol,0.03,0.02,4047,2978,30.56,26.41 53 | coffeescript,2.81,0.81,331824,227889,71.19,31.32 54 | coldfusion,0.13,0.07,20990,12931,44.73,38.39 55 | coldfusion-cfc,0.18,0.08,23567,12745,57.17,45.92 56 | common-lisp,3.67,1.74,156804,101370,52.69,35.35 57 | component-pascal,0.14,0.08,1151,529,41.84,54.04 58 | coq,0.0,0.0,59,33,83.6,44.07 59 | cpp,223.03,54.23,14820829,6377914,75.69,56.97 60 | creole,0.0,0.0,767,656,17.23,14.47 61 | crystal,0.64,0.3,124741,78484,52.27,37.08 62 | csound,0.05,0.04,1845,1341,23.56,27.32 63 | css,153.05,24.03,5889790,2994829,84.3,49.15 64 | csv,411.55,314.12,7640022,6404239,23.67,16.18 65 | cucumber,0.55,0.27,132603,84276,51.69,36.44 66 | cuda,1.85,0.62,145560,58355,66.15,59.91 67 | cycript,0.02,0.01,554,434,34.06,21.66 68 | cython,1.61,0.32,86161,35870,80.46,58.37 69 | d,0.0,0.0,603,371,44.32,38.47 70 | darcs-patch,0.0,0.0,376,362,2.93,3.72 71 | dart,13.19,4.36,1734126,932583,66.95,46.22 72 | desktop,0.03,0.02,23218,19569,27.12,15.72 73 | diff,5.19,2.29,526857,320925,55.87,39.09 74 | digital-command-language,0.14,0.07,35497,17738,48.87,50.03 75 | dm,0.04,0.04,4842,4571,3.45,5.6 76 | dns-zone,0.03,0.03,1948,1471,20.48,24.49 77 | dockerfile,2.54,0.68,1265281,572186,73.25,54.78 78 | dogescript,0.0,0.0,386,286,20.15,25.91 79 | dylan,0.07,0.04,7152,4553,38.01,36.34 80 | eagle,6.27,2.69,70846,45187,57.04,36.22 81 | ec,0.06,0.02,1976,1062,63.23,46.26 82 | 
ecere-projects,0.0,0.0,311,75,63.28,75.88 83 | ecl,0.06,0.03,10899,3843,53.78,64.74 84 | edn,1.47,1.36,30683,20731,7.08,32.43 85 | eiffel,0.22,0.13,39311,24070,40.11,38.77 86 | elixir,2.54,0.88,594074,282110,65.4,52.51 87 | elm,0.63,0.37,90637,62861,41.51,30.65 88 | emacs-lisp,2.0,0.46,122317,54768,76.94,55.22 89 | emberscript,0.04,0.03,3487,1603,41.97,54.03 90 | erlang,2.8,0.78,203193,99368,72.28,51.1 91 | f-sharp,2.67,0.96,219519,127161,63.93,42.07 92 | factor,0.14,0.03,29484,10270,76.62,65.17 93 | fancy,0.0,0.0,609,511,16.82,16.09 94 | fantom,0.02,0.01,3186,2448,30.67,23.16 95 | fish,0.13,0.05,53407,32206,61.43,39.7 96 | flux,0.02,0.01,3201,998,69.87,68.82 97 | forth,0.03,0.02,5665,4425,30.33,21.89 98 | fortran,3.97,1.93,287086,165446,51.33,42.37 99 | freemarker,0.66,0.29,114988,60206,55.87,47.64 100 | g-code,1.05,0.89,16020,10004,14.95,37.55 101 | gams,0.12,0.07,5275,1814,44.64,65.61 102 | gap,0.02,0.01,2158,1124,33.53,47.91 103 | gas,3.15,1.23,186463,91662,60.93,50.84 104 | gdscript,0.44,0.31,156343,119556,29.8,23.53 105 | genshi,0.01,0.01,466,350,38.08,24.89 106 | gentoo-ebuild,0.14,0.06,75148,34122,56.93,54.59 107 | gentoo-eclass,0.0,0.0,439,364,21.81,17.08 108 | gettext-catalog,40.72,4.52,775246,139375,88.89,82.02 109 | glsl,1.34,0.67,317741,175576,49.97,44.74 110 | glyph,0.0,0.0,52,47,8.95,9.62 111 | gnuplot,2.93,2.85,53925,47178,2.51,12.51 112 | go,118.96,28.09,11653185,4730461,76.39,59.41 113 | golo,0.0,0.0,629,522,17.01,17.01 114 | gosu,0.01,0.01,1625,795,40.17,51.08 115 | grace,0.0,0.0,637,561,21.4,11.93 116 | grammatical-framework,0.07,0.04,5938,3576,41.34,39.78 117 | graphql,0.68,0.15,68237,45331,77.79,33.57 118 | graphviz_dot,6.77,1.09,561726,68824,83.87,87.75 119 | groff,6.58,2.57,490292,181789,60.94,62.92 120 | groovy,3.17,1.1,504166,251627,65.22,50.09 121 | groovy-server-pages,0.14,0.09,21695,12237,36.26,43.6 122 | haml,0.36,0.2,180504,116033,46.03,35.72 123 | handlebars,1.16,0.62,389923,243252,46.81,37.62 124 | harbour,0.0,0.0,911,787,31.96,13.61 125 | haskell,6.24,2.63,798865,544969,57.93,31.78 126 | haxe,1.63,0.52,241032,131491,68.15,45.45 127 | hcl,1.81,0.82,470644,272971,54.45,42.0 128 | hlsl,0.21,0.09,48653,25476,59.06,47.64 129 | html,837.25,153.06,35592089,9533367,81.72,73.21 130 | html_django,0.43,0.14,97815,48857,67.19,50.05 131 | html_eex,0.05,0.03,22764,13306,46.59,41.55 132 | html_erb,2.1,1.12,862407,529049,46.8,38.65 133 | html_php,0.71,0.36,142410,76790,49.59,46.08 134 | http,0.11,0.07,31307,20830,37.46,33.47 135 | hy,0.02,0.01,2090,1238,42.22,40.77 136 | idl,0.07,0.01,991,545,80.93,45.01 137 | idris,0.06,0.03,10529,8060,40.49,23.45 138 | igor-pro,0.01,0.01,819,712,18.82,13.06 139 | inform-7,0.02,0.01,514,328,29.26,36.19 140 | ini,11.82,3.44,2537776,1187507,70.93,53.21 141 | inno-setup,0.05,0.02,6567,3227,64.54,50.86 142 | io,0.01,0.01,3523,2634,24.55,25.23 143 | ioke,0.0,0.0,537,374,50.96,30.35 144 | irc-log,0.01,0.01,105,100,0.09,4.76 145 | isabelle,0.14,0.09,7738,5086,34.85,34.27 146 | j,0.02,0.02,3321,3030,10.3,8.76 147 | jade,0.33,0.2,117328,84298,38.59,28.15 148 | jasmin,0.08,0.04,11534,6293,47.33,45.44 149 | java,294.72,102.29,42429211,20151565,65.29,52.51 150 | java-server-pages,2.37,1.17,407189,214133,50.45,47.41 151 | javascript,519.8,152.95,40112121,21108587,70.57,47.38 152 | jflex,0.04,0.01,5346,1662,70.14,68.91 153 | json,1385.98,350.67,80802152,17012912,74.7,78.94 154 | json5,0.15,0.07,8462,4652,51.15,45.02 155 | jsoniq,0.01,0.01,5604,3729,26.99,33.46 156 | jsonld,0.63,0.2,52263,10836,68.16,79.27 157 | jsx,5.73,3.12,1462062,970156,45.53,33.64 
158 | julia,3.64,1.69,480267,298672,53.56,37.81 159 | jupyter-notebook,188.84,155.85,1459463,1073534,17.47,26.44 160 | kicad,3.19,2.92,12744,11448,8.5,10.17 161 | kit,0.01,0.01,1730,1324,22.15,23.47 162 | kotlin,14.67,7.19,3782188,2242771,50.96,40.7 163 | krl,0.0,0.0,366,267,28.73,27.05 164 | labview,0.13,0.05,2879,1212,58.18,57.9 165 | lasso,0.25,0.05,6232,1089,77.59,82.53 166 | latte,0.02,0.02,8614,6429,30.65,25.37 167 | lean,0.8,0.11,51125,16891,86.57,66.96 168 | less,3.34,1.1,615585,344780,67.07,43.99 169 | lex,0.11,0.09,4254,3189,22.03,25.04 170 | lfe,0.0,0.0,1194,1017,28.33,14.82 171 | lilypond,0.04,0.04,8932,6454,19.78,27.74 172 | linker-script,0.37,0.06,53158,8474,82.77,84.06 173 | liquid,0.21,0.11,40826,24759,45.44,39.35 174 | literate-agda,0.02,0.01,1211,523,67.89,56.81 175 | literate-coffeescript,0.01,0.01,1316,1138,20.83,13.53 176 | literate-haskell,0.18,0.06,10613,6135,67.98,42.19 177 | livescript,0.06,0.04,12084,9265,33.01,23.33 178 | llvm,3.13,0.77,180863,56247,75.37,68.9 179 | logos,0.51,0.35,30033,19242,31.65,35.93 180 | logtalk,0.03,0.01,8419,2185,71.3,74.05 181 | lolcode,0.0,0.0,883,798,6.33,9.63 182 | lookml,0.01,0.01,635,448,34.8,29.45 183 | lsl,0.02,0.02,3314,2536,36.8,23.48 184 | lua,7.65,3.6,925895,558861,53.01,39.64 185 | m,0.0,0.0,134,128,3.04,4.48 186 | m4,1.18,0.09,57797,17902,91.99,69.03 187 | makefile,7.6,1.8,1483161,661424,76.29,55.4 188 | mako,0.15,0.03,15939,7476,79.97,53.1 189 | maple,0.04,0.02,3472,1259,60.55,63.74 190 | markdown,282.57,90.93,40751875,21045171,67.82,48.36 191 | mask,0.02,0.01,2129,1087,43.78,48.94 192 | mathematica,2.3,1.73,63791,26895,24.72,57.84 193 | matlab,0.06,0.04,1296,967,35.0,25.39 194 | max,0.68,0.54,14305,10476,21.33,26.77 195 | maxscript,0.01,0.01,753,531,31.04,29.48 196 | mediawiki,0.69,0.2,49427,15478,70.82,68.69 197 | metal,0.04,0.02,8711,4061,51.64,53.38 198 | mirah,0.31,0.04,17005,4043,87.02,76.22 199 | modelica,0.4,0.15,43297,23853,63.19,44.91 200 | module-management-system,0.01,0.0,640,302,66.34,52.81 201 | monkey,0.01,0.01,1939,1660,8.15,14.39 202 | moonscript,0.02,0.02,6159,5068,28.78,17.71 203 | mtml,0.0,0.0,601,462,30.01,23.13 204 | muf,0.0,0.0,684,490,10.67,28.36 205 | mupad,0.01,0.01,865,643,18.51,25.66 206 | myghty,0.0,0.0,13,11,12.78,15.38 207 | nesc,0.21,0.15,27205,15109,28.91,44.46 208 | netlinx,0.0,0.0,202,177,5.23,12.38 209 | netlogo,0.09,0.03,3678,931,66.42,74.69 210 | nginx,0.0,0.0,15,14,7.1,6.67 211 | nimrod,1.18,0.48,91048,51660,59.25,43.26 212 | ninja,0.28,0.07,11995,2130,73.5,82.24 213 | nit,0.0,0.0,121,110,18.16,9.09 214 | nix,5.91,0.42,456544,121964,92.87,73.29 215 | nsis,0.09,0.03,10359,3806,66.8,63.26 216 | nu,0.01,0.0,1372,877,51.95,36.08 217 | numpy,0.0,0.0,7,7,-0.34,0.0 218 | objdump,0.08,0.07,1407,676,16.63,51.95 219 | objective-cpp,2.13,0.64,167633,65034,70.02,61.2 220 | objective-j,0.01,0.01,520,395,29.09,24.04 221 | ocaml,4.04,1.19,354810,159734,70.57,54.98 222 | octave,0.0,0.0,215,203,8.22,5.58 223 | omgrofl,0.0,0.0,4,4,-0.6,0.0 224 | ooc,0.01,0.01,1771,1426,29.46,19.48 225 | opa,0.0,0.0,385,354,1.93,8.05 226 | opal,0.0,0.0,185,152,13.93,17.84 227 | opencl,0.68,0.14,59129,19729,79.13,66.63 228 | openscad,0.17,0.11,23888,20380,34.56,14.69 229 | org,0.51,0.39,57582,50013,22.97,13.14 230 | ox,0.0,0.0,414,192,56.71,53.62 231 | oxygene,0.0,0.0,63,31,49.54,50.79 232 | oz,0.01,0.01,1258,1179,6.24,6.28 233 | pan,0.1,0.02,8901,2672,81.9,69.98 234 | papyrus,0.06,0.05,19646,10358,18.87,47.28 235 | parrot,0.0,0.0,32,28,11.35,12.5 236 | parrot-assembly,0.0,0.0,799,384,50.74,51.94 237 | 
parrot-internal-representation,0.11,0.03,2158,1447,76.8,32.95 238 | pascal,3.21,1.77,176640,118675,44.84,32.82 239 | pawn,0.08,0.06,3091,2412,26.19,21.97 240 | perl,7.52,2.84,834305,392108,62.24,53.0 241 | perl6,0.16,0.04,15602,9782,72.54,37.3 242 | php,203.92,74.87,34851418,15904518,63.28,54.36 243 | piglatin,0.01,0.0,2237,1514,33.27,32.32 244 | pike,0.0,0.0,1067,1008,4.89,5.53 245 | pod,0.49,0.12,35697,11730,76.13,67.14 246 | pogoscript,0.0,0.0,254,226,9.72,11.02 247 | pony,0.04,0.02,5222,3529,42.68,32.42 248 | postscript,1.93,1.57,24065,16096,18.6,33.11 249 | pov-ray-sdl,0.04,0.02,2364,1067,39.4,54.86 250 | powershell,3.67,1.4,527898,271487,61.91,48.57 251 | processing,0.39,0.26,70860,55528,34.21,21.64 252 | prolog,0.03,0.01,1884,1023,59.49,45.7 253 | propeller-spin,0.04,0.02,2926,2120,43.73,27.55 254 | protocol-buffer,2.14,0.49,254672,98246,77.1,61.42 255 | public-key,0.06,0.06,55301,53841,1.82,2.64 256 | pure-data,0.16,0.11,18689,14569,29.47,22.05 257 | purebasic,0.12,0.05,22424,13001,58.84,42.02 258 | purescript,0.23,0.16,42903,32331,31.92,24.64 259 | python,213.56,70.87,24214270,12962249,66.81,46.47 260 | python-traceback,0.0,0.0,13,9,12.04,30.77 261 | qmake,0.05,0.02,14696,9100,61.21,38.08 262 | qml,0.52,0.22,82710,45565,58.35,44.91 263 | r,0.41,0.32,51877,39194,20.7,24.45 264 | racket,0.07,0.04,5553,4201,44.36,24.35 265 | ragel-in-ruby-host,0.03,0.01,2891,1610,59.21,44.31 266 | raml,0.1,0.05,17876,11862,51.98,33.64 267 | raw-token-data,1.05,0.74,51067,36913,28.91,27.72 268 | rdoc,0.08,0.03,14958,9535,62.98,36.25 269 | realbasic,0.01,0.01,989,800,15.03,19.11 270 | rebol,0.01,0.01,2023,1388,44.66,31.39 271 | red,0.08,0.04,5725,3291,55.07,42.52 272 | redcode,0.0,0.0,588,495,12.02,15.82 273 | renderscript,0.01,0.0,1756,822,53.95,53.19 274 | renpy,0.16,0.12,6333,4529,24.63,28.49 275 | restructuredtext,13.54,4.06,1760914,905679,69.99,48.57 276 | rhtml,0.02,0.01,8676,6812,21.7,21.48 277 | rmarkdown,0.08,0.06,6572,5389,20.0,18.0 278 | robotframework,0.15,0.06,24996,11844,62.24,52.62 279 | rouge,0.02,0.0,6276,614,86.99,90.22 280 | ruby,29.76,8.9,7205146,3405374,70.08,52.74 281 | rust,41.35,10.2,3057230,1386585,75.33,54.65 282 | sage,0.02,0.02,2174,1857,14.77,14.58 283 | saltstack,0.09,0.05,40179,27071,41.87,32.62 284 | sas,0.34,0.14,15258,9772,60.85,35.95 285 | sass,0.35,0.23,132724,101656,34.73,23.41 286 | scala,17.34,5.71,2787552,1362426,67.07,51.12 287 | scaml,0.0,0.0,149,88,55.96,40.94 288 | scheme,0.63,0.32,64946,44261,49.12,31.85 289 | scilab,0.02,0.01,3840,2937,45.89,23.52 290 | scss,12.06,5.48,3384703,2094964,54.62,38.1 291 | self,0.0,0.0,111,82,32.18,26.13 292 | shell,10.96,4.47,3769888,2236434,59.21,40.68 293 | shellsession,0.0,0.0,12,9,10.74,25.0 294 | shen,0.0,0.0,421,298,36.38,29.22 295 | slash,0.07,0.04,15825,6960,49.84,56.02 296 | slim,0.12,0.08,64833,45887,37.86,29.22 297 | smali,4.45,2.71,403309,192445,39.08,52.28 298 | smalltalk,1.76,1.12,774045,592999,36.8,23.39 299 | smarty,1.18,0.53,256481,137110,55.15,46.54 300 | smt,8.03,0.29,144547,14877,96.41,89.71 301 | solidity,3.76,1.32,388997,164242,64.99,57.78 302 | sourcepawn,0.2,0.15,9570,6708,27.41,29.91 303 | sparql,0.07,0.05,24511,14173,27.62,42.18 304 | sqf,0.26,0.14,54949,34892,47.93,36.5 305 | sql,21.22,13.06,1386738,994019,38.46,28.32 306 | squirrel,0.06,0.04,7490,4956,30.58,33.83 307 | stan,0.2,0.02,14017,5441,91.68,61.18 308 | standard-ml,1.23,0.54,68834,48995,55.76,28.82 309 | stata,0.6,0.42,45607,31282,29.61,31.41 310 | ston,0.0,0.0,2087,946,38.81,54.67 311 | stylus,0.31,0.18,116347,81646,41.1,29.83 312 | 
supercollider,0.03,0.02,3535,2529,39.49,28.46 313 | svg,118.55,77.17,6411336,3267524,34.9,49.04 314 | swift,15.26,7.13,2941299,1756144,53.29,40.29 315 | systemverilog,1.18,0.44,100755,46915,63.16,53.44 316 | tcl,1.13,0.43,109545,50579,61.87,53.83 317 | tcsh,0.08,0.03,20547,4911,67.41,76.1 318 | tea,0.05,0.04,1292,1012,22.35,21.67 319 | tex,8.67,5.82,705363,547888,32.89,22.33 320 | text,363.02,266.01,21630263,15267582,26.72,29.42 321 | textile,0.08,0.04,10965,8196,54.12,25.25 322 | thrift,0.09,0.02,11730,4663,82.56,60.25 323 | toml,2.3,0.88,994820,417483,61.77,58.03 324 | turing,0.01,0.01,375,292,26.97,22.13 325 | turtle,4.55,2.64,219880,96432,41.9,56.14 326 | twig,2.13,1.25,575087,342973,41.45,40.36 327 | txl,0.0,0.0,180,132,3.9,26.67 328 | typescript,143.21,34.5,19589267,10637070,75.91,45.7 329 | unified-parallel-c,0.0,0.0,241,194,10.07,19.5 330 | unity3d-asset,25.1,7.87,9834153,928191,68.63,90.56 331 | uno,0.02,0.01,6159,3817,47.9,38.03 332 | unrealscript,0.12,0.07,13293,10221,42.61,23.11 333 | urweb,0.01,0.01,1341,1111,22.21,17.15 334 | vala,0.04,0.03,5837,4480,32.05,23.25 335 | vcl,0.01,0.0,1848,1152,47.73,37.66 336 | verilog,0.01,0.0,1831,77,93.36,95.79 337 | vhdl,3.14,1.15,119386,60027,63.45,49.72 338 | viml,1.39,0.36,171857,96484,73.77,43.86 339 | visual-basic,4.05,1.59,398672,163291,60.81,59.04 340 | volt,0.03,0.01,5821,3493,54.27,39.99 341 | vue,15.09,8.0,2822678,1556867,46.99,44.84 342 | web-ontology-language,2.21,0.98,42906,10784,55.58,74.87 343 | webassembly,0.62,0.1,15489,5359,83.41,65.4 344 | webidl,0.01,0.01,4674,2016,57.51,56.87 345 | wisp,0.0,0.0,160,124,38.45,22.5 346 | x10,0.0,0.0,458,373,16.2,18.56 347 | xbase,0.13,0.06,12621,6989,49.56,44.62 348 | xc,0.01,0.0,1519,462,70.28,69.59 349 | xml,309.24,118.59,23441154,6267525,61.65,73.26 350 | xojo,0.04,0.03,3791,2305,30.98,39.2 351 | xpages,0.0,0.0,939,179,45.23,80.94 352 | xproc,0.01,0.01,1791,993,37.08,44.56 353 | xquery,0.1,0.05,38949,19713,49.79,49.39 354 | xs,0.14,0.02,4049,1683,83.86,58.43 355 | xslt,1.92,0.59,101092,43095,69.46,57.37 356 | xtend,0.08,0.05,15151,9844,35.7,35.03 357 | yacc,1.32,0.42,109233,25775,68.27,76.4 358 | yaml,98.91,31.38,13439939,5282081,68.27,60.7 359 | yang,1.86,0.14,55459,9653,92.5,82.59 360 | zephir,0.06,0.01,8701,2361,83.2,72.87 361 | zig,1.18,0.19,39894,15913,84.11,60.11 362 | zimpl,0.0,0.0,356,308,4.55,13.48 -------------------------------------------------------------------------------- /data_analysis/notebooks/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import numpy as np 4 | 5 | def get_size(text): 6 | # size of a string in bytes 7 | return len(text.encode('utf-8')) 8 | 9 | def add_size(sample): 10 | sample["size"] = get_size(sample["content"]) 11 | return sample 12 | 13 | def sample_eval_losses(model_all_license, model_safe_license, tokenizer_all, tokenizer_safe, ds, n=2000, device="cuda"): 14 | """ compute losses on the first n samples for both models""" 15 | losses_all = [] 16 | losses_safe = [] 17 | model_all_license.to(device) 18 | model_safe_license.to(device) 19 | for i in tqdm(range(n)): 20 | with torch.no_grad(): 21 | tokens_all = torch.tensor(tokenizer_all(ds[i]["content"], truncation=True)['input_ids']) 22 | tokens_safe = torch.tensor(tokenizer_safe(ds[i]["content"], truncation=True)['input_ids']) 23 | 24 | outputs = model_all_license(tokens_all.to(device), labels=tokens_all.to(device)) 25 | losses_all.append(outputs.loss.item()) 26 | outputs = 
model_safe_license(tokens_safe.to(device), labels=tokens_safe.to(device)) 27 | losses_safe.append(outputs.loss.item()) 28 | 29 | return losses_all, losses_safe 30 | 31 | 32 | def get_embeddings(model, tokenizer, ds, n=200, device="cuda"): 33 | """get embeddings of n files from the iterable dataset ds 34 | as the average of token embeddings of the file""" 35 | embeddings = [] 36 | model.to(device) 37 | for i, example in tqdm(enumerate(ds)): 38 | with torch.no_grad(): 39 | inputs = torch.tensor(tokenizer(example["content"], truncation=True)['input_ids']) 40 | outputs = model(inputs.to(device), labels=inputs.to(device), output_hidden_states=True) 41 | embeddings.append(np.mean(outputs.hidden_states[-1].detach().cpu().numpy(),axis=0)) 42 | if i == n - 1: 43 | break 44 | return np.array(embeddings) 45 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/code_compilation/README.md: -------------------------------------------------------------------------------- 1 | # Code compilation 2 | Here we provide code to estimate the number of valid Python files by using the `py_compile` module on some samples from a code dataset. We try to compile files for both python2 and python3 and count how many throw syntax errors. 3 | 4 | You can execute the code using: 5 | ```bash 6 | python compile_py_files.py --dataset_name --n_samples --seed 7 | ``` 8 | where `dataset_name` is the name of the dataset you want to analyze, `n_samples` is the number of samples to use, and `seed` is the seed for the random shuffling. 9 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/code_compilation/compile_py_files.py: -------------------------------------------------------------------------------- 1 | 2 | import tempfile 3 | import subprocess 4 | from tqdm import tqdm 5 | import argparse 6 | from datasets import load_dataset 7 | 8 | 9 | def parseArgs(): 10 | parser = argparse.ArgumentParser( 11 | description="Code compilation" 12 | ) 13 | parser.add_argument( 14 | "--dataset_name", 15 | default="bigcode/python_permissive", 16 | type=str, 17 | help="HF repo name/path of the dataset.", 18 | ) 19 | parser.add_argument( 20 | "--n_samples", 21 | default=10_000, 22 | type=int, 23 | help="Number of samples in the subset to analyze", 24 | ) 25 | parser.add_argument( 26 | "--seed", 27 | default=0, 28 | type=int, 29 | help="Seed", 30 | ) 31 | return parser.parse_args() 32 | 33 | 34 | def compile_python_code(sample): 35 | string = sample["content"] 36 | tmp = tempfile.NamedTemporaryFile() 37 | with open(tmp.name, "w") as f: 38 | f.write(string) 39 | py_command = "python{v} -m py_compile " + tmp.name 40 | 41 | try: 42 | subprocess.check_call(py_command.format(v=3).split()) 43 | python3_works = True 44 | except subprocess.CalledProcessError: 45 | python3_works = False 46 | 47 | try: 48 | subprocess.check_call(py_command.format(v=2).split()) 49 | python2_works = True 50 | except subprocess.CalledProcessError: 51 | python2_works = False 52 | 53 | return python2_works or python3_works 54 | 55 | 56 | if __name__ == '__main__': 57 | args = parseArgs() 58 | 59 | print(f"Loading {args.n_samples} samples from {args.dataset_name} dataset") 60 | data = load_dataset(args.dataset_name, streaming=True, split="train", use_auth_token=True) 61 | subset = list(data.shuffle(seed=args.seed).take(args.n_samples)) 62 | 63 | valid_files = 0 64 | for i in tqdm(range(len(subset))): 65 | if compile_python_code(subset[i]): 66 |
valid_files += 1 67 | 68 | print(f"Number of valid python files in {args.n_samples} random samples: {valid_files}") 69 | print(f"Percentage of non-valid files: {(len(subset) - valid_files) * 100 / len(subset)}%") -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/code_compilation/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/config_test_estimation/README.md: -------------------------------------------------------------------------------- 1 | # Config and test files detection 2 | 3 | Here we provide the code to detect and estimate the number of configuration and test files in a code dataset. 4 | 5 | You can execute the code using: 6 | ```bash 7 | python config_test.py --dataset_name --split 8 | ``` 9 | where `dataset_name` is the name of the dataset you want to analyze and `split` is the dataset split. -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/config_test_estimation/config_test.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | from datasets import load_dataset 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | 8 | def parseArgs(): 9 | parser = argparse.ArgumentParser(description="Config and test files detection") 10 | parser.add_argument( 11 | "--dataset_name", 12 | default="bigcode/python_permissive", 13 | type=str, 14 | help="HF repo name/path of the dataset.", 15 | ) 16 | parser.add_argument( 17 | "--num_workers", 18 | default=96, 19 | type=int, 20 | help="Number of workers for multiprocessing", 21 | ) 22 | parser.add_argument( 23 | "--split", 24 | default="train", 25 | type=str, 26 | help="Dataset split to process", 27 | ) 28 | parser.add_argument( 29 | "--push_to_hub", 30 | action="store_true", 31 | help="Push the dataset to the Hub", 32 | ) 33 | return parser.parse_args() 34 | 35 | 36 | def is_config_or_test(example, scan_width=5, coeff=0.05): 37 | """Check if file is a configuration file or a unit test by: 38 | 1- looking for keywords in the first few lines of the file. 39 | 2- counting the number of occurrences of the words 'config' and 'test' with respect to number of lines.
40 | """ 41 | 42 | keywords = ["unit tests", "test file", "configuration file"] 43 | lines = example["content"].splitlines() 44 | count_config = 0 45 | count_test = 0 46 | # first test 47 | for _, line in zip(range(scan_width), lines): 48 | for keyword in keywords: 49 | if keyword in line.lower(): 50 | return {"config_or_test": True} 51 | # second test 52 | nlines = example["content"].count("\n") 53 | threshold = int(coeff * nlines) 54 | for line in lines: 55 | count_config += line.lower().count("config") 56 | count_test += line.lower().count("test") 57 | if count_config > threshold or count_test > threshold: 58 | return {"config_or_test": True} 59 | return {"config_or_test": False} 60 | 61 | 62 | def preprocess(example): 63 | results = dict() 64 | results.update(is_config_or_test(example)) 65 | return results 66 | 67 | 68 | def filter(example): 69 | """Filter files that are config or test files""" 70 | if example["config_or_test"]: 71 | return False 72 | return True 73 | 74 | 75 | args = parseArgs() 76 | 77 | # Load dataset 78 | t_start = time.time() 79 | print(f"Loading dataset {args.dataset_name}") 80 | dataset = load_dataset(args.dataset_name, split=args.split) 81 | # dataset = load_dataset("bigcode/the-stack", data_files = ["data/python/*"], split="train", use_auth_token=True, chunksize=40<<20) 82 | print(f"Time to load dataset: {time.time()-t_start:.2f}") 83 | 84 | # Run preprocessing 85 | t_start = time.time() 86 | ds = dataset.map(preprocess, num_proc=args.num_workers) 87 | print(f"Time to preprocess dataset: {time.time()-t_start:.2f}") 88 | print(ds) 89 | 90 | t_start = time.time() 91 | old_size = len(ds) 92 | ds = ds.filter(filter) 93 | print(f"Time to filter dataset: {time.time()-t_start:.2f}") 94 | print(f"\nSize of original dataset: {old_size}") 95 | print(f"Size of filtered dataset: {len(ds)}") 96 | print( 97 | f"\nPercentage of removed files: {np.round((old_size - len(ds))*100/old_size, 2)}%" 98 | ) 99 | 100 | print("\nCounting size in Gb of the new datase") 101 | new_size, old_size = 0, 0 102 | for i in tqdm(range(len(ds))): 103 | new_size += len(ds[i]["content"]) 104 | 105 | for i in tqdm(range(len(dataset))): 106 | old_size += len(dataset[i]["content"]) 107 | 108 | print(f"current size in Gb is {np.round(new_size/10**9), 4}") 109 | print(f"old size in Gb is {np.round(old_size/10**9, 4)}") 110 | print(f"volume removed: {np.round((old_size-new_size)*100/new_size, 2)}%") 111 | 112 | if args.push_to_hub: 113 | ds.push_to_hub("no_conf_test_ds") 114 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/README.md: -------------------------------------------------------------------------------- 1 | # Natural Language identification in Python code 2 | 3 | In this folder, we provide code to extract Python docstrings and comment and identify their natural language. 4 | 5 | # Setup 6 | We use `fasttext` for language identification, download the language detection model `lid.176.bin` from [fasttext.cc/docs/en/language-identification](https://fasttext.cc/docs/en/language-identification.html) and seve it in `fastext_model`folder. You need to install `fastext` and `datasets` libraries. 
7 | 8 | ``` 9 | pip install fasttext 10 | pip install datasets 11 | ``` 12 | 13 | # Usage 14 | The command below saves a dataset with additional columns giving the language of each file, the score/confidence of the model in the prediction, and the extracted natural text and its size: 15 | ```bash 16 | python language_identifier.py \ 17 | --dataset_name \ 18 | --model_path fasttext_model/lid.176.bin \ 19 | --save_path ./data/ 20 | ``` 21 | # Analysis 22 | 23 | See the notebook `analysis.ipynb`; a small sketch for recomputing the language distribution from the saved dataset is shown below. 24 | 25 | Detected language distribution on 2,000 samples from CodeParrot data: 26 |

27 | 28 |
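The distribution plots that originally followed the caption above are not included in this dump. As a rough, hypothetical illustration (not part of the repository's scripts; the `./data/` path and the `train` split are assumptions taken from the usage example above), the detected-language distribution can be recomputed from the dataset saved by `language_identifier.py`:

```python
from collections import Counter

from datasets import load_from_disk

# Load the dataset written by language_identifier.py (the --save_path used above).
ds = load_from_disk("./data/")["train"]

# Count the detected natural language of each file and print the ten most common.
counts = Counter(ds["nl_language"])
for lang, n in counts.most_common(10):
    print(f"{lang}: {n} files ({100 * n / len(ds):.1f}%)")
```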

29 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/fasttext_model/README.md: -------------------------------------------------------------------------------- 1 | Download the language detection model lid.176.bin from [fasttext.cc/docs/en/language-identification](https://fasttext.cc/docs/en/language-identification.html) 2 | -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/language_identifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import pathlib 4 | import fasttext 5 | 6 | from datasets import load_dataset 7 | 8 | from text_extraction import get_text 9 | 10 | #adapted from: https://github.com/bigscience-workshop/data-preparation/blob/main/sourcing/ 11 | # cc_pseudo_crawl/language_annotation/python_scripts/annotate_langid_crawl.py 12 | 13 | COLUMN = "content" 14 | 15 | def parseArgs(): 16 | parser = argparse.ArgumentParser( 17 | description="Identify natural languages in code" 18 | ) 19 | parser.add_argument( 20 | "dataset_name", 21 | type=str, 22 | help="HF repo name/path of the dataset.", 23 | ) 24 | parser.add_argument( 25 | "save_path", 26 | default="./data_with_language/", 27 | type=str, 28 | help="Path to save the new dataset with language column.", 29 | ) 30 | parser.add_argument( 31 | "model_path", 32 | default= "fasttext_model/lid.176.bin", 33 | type=str, 34 | help="Path to fasttext model.", 35 | ) 36 | args = parser.parse_args() 37 | return args 38 | 39 | def load_fasttext_model(path_fasttext_model): 40 | return fasttext.load_model(path_fasttext_model) 41 | 42 | 43 | def get_fasttext_info(line, model_lang_id): 44 | """The line should be in lower case and without \n in it.""" 45 | pred = model_lang_id.predict(line) 46 | lang_pred_fasttext_id = pred[0][0].replace("__label__", "") 47 | score_pred = pred[1][0] 48 | return lang_pred_fasttext_id, score_pred 49 | 50 | 51 | def get_all_fasttext_info(document, model_lang_id): 52 | document = document.lower() 53 | lang_pred_fasttext_id, score_pred = get_fasttext_info( 54 | document.replace("\n", " "), model_lang_id 55 | ) 56 | info = { 57 | "lang_pred_fasttext_id": lang_pred_fasttext_id, 58 | "score_pred": score_pred, 59 | "on_lines": [ 60 | { 61 | "id_line": id_line, 62 | "number_caracters_line": len(line), 63 | "lang_pred_fasttext_id_line": result_fasttext_line[0], 64 | "score_pred_line": result_fasttext_line[1], 65 | } 66 | for id_line, line in enumerate(document.split("\n")) 67 | for result_fasttext_line in [get_fasttext_info(line, model_lang_id)] 68 | ], 69 | } 70 | return info 71 | 72 | 73 | def extract_nl_text(example): 74 | text = get_text(example[COLUMN]) 75 | example["nl_text"] = text 76 | example["nl_size"] = len(text) 77 | return example 78 | 79 | 80 | class FunctionDatasetModifyingDocuments: 81 | def __init__(self, path_fasttext_model): 82 | self.path_fasttext_model = path_fasttext_model 83 | self.model_lang_id = load_fasttext_model(path_fasttext_model) 84 | 85 | def __call__(self, example): 86 | fasttext_pred = get_all_fasttext_info( 87 | example["nl_text"], self.model_lang_id 88 | ) 89 | example["nl_language"] = fasttext_pred["lang_pred_fasttext_id"] 90 | example["nl_language_score"] = fasttext_pred["score_pred"] 91 | return example 92 | 93 | def __reduce__(self): 94 | return (self.__class__, (self.path_fasttext_model,)) 95 | 96 | 97 | def 
main(): 98 | args = parseArgs() 99 | 100 | dataset = load_dataset(args.dataset_name) 101 | print("Loading dataset done") 102 | 103 | func_dataset_modifying_documents = FunctionDatasetModifyingDocuments( 104 | args.model_path 105 | ) 106 | 107 | dataset = dataset.map(extract_nl_text, num_proc=multiprocessing.cpu_count()) 108 | 109 | # Could be improved by allowing multiprocessing with map (currently doesn't work) 110 | dataset = dataset.map( 111 | func_dataset_modifying_documents, num_proc=1 112 | ) # num_proc=cpu_count() 113 | print("Fasttext done") 114 | 115 | pathlib.Path(args.save_path).mkdir(parents=True, exist_ok=True) 116 | dataset.save_to_disk(args.save_path) 117 | print("Shard successfully saved") -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.4.0 2 | fasttext==0.9.2 -------------------------------------------------------------------------------- /data_analysis/python_data_analysis/nl_language_identification/text_extraction.py: -------------------------------------------------------------------------------- 1 | """Extract Python comments (using Python tokenizer) and docstrings (using AST parsing).""" 2 | 3 | import io 4 | from itertools import groupby 5 | from os.path import basename, splitext 6 | import ast 7 | import tokenize 8 | import warnings 9 | 10 | StringIO = io.StringIO 11 | 12 | NODE_TYPES = { 13 | ast.ClassDef: 'Class', 14 | ast.FunctionDef: 'Function/Method', 15 | ast.Module: 'Module' 16 | } 17 | 18 | # comment extraction 19 | def get_comments(s, clean=False): 20 | "Returns a string including all comments" 21 | comments = [] 22 | g = tokenize.generate_tokens(StringIO(s).readline) 23 | for toknum, tokval, _, _, _ in g: 24 | # print(toknum,tokval) 25 | if toknum == tokenize.COMMENT: 26 | comments.append((toknum, tokval)) 27 | result = tokenize.untokenize(comments) 28 | if clean: 29 | result = result.replace('#', '') 30 | return result 31 | 32 | # TODO: check that extraction works well (with decorators over classes) 33 | # ast parsing, source: https://gist.github.com/SpotlightKid/1548cb6c97f2a844f72d 34 | def parse_docstrings(source): 35 | """Parse Python source code and yield a tuple of ast node instance, name, 36 | and docstring for each function/method, class and module.""" 37 | tree = ast.parse(source) 38 | 39 | for node in ast.walk(tree): 40 | if isinstance(node, tuple(NODE_TYPES)): 41 | docstring = ast.get_docstring(node) 42 | 43 | yield (node, getattr(node, 'name', None), docstring) 44 | 45 | def get_docstrings(source, module=''): 46 | """Parse Python source code from file or string and return docstrings.""" 47 | if hasattr(source, 'read'): 48 | filename = getattr(source, 'name', module) 49 | module = splitext(basename(filename))[0] 50 | source = source.read() 51 | 52 | docstrings = sorted(parse_docstrings(source), 53 | key=lambda x: (NODE_TYPES.get(type(x[0])), x[1])) 54 | 55 | grouped = groupby(docstrings, key=lambda x: NODE_TYPES.get(type(x[0]))) 56 | results = [] 57 | for _, group in grouped: 58 | for _, name, docstring in group: 59 | name = name if name else module 60 | #print(docstring or '') 61 | if docstring: 62 | results.append(docstring) 63 | return results 64 | 65 | def get_text(source, comments=True, clean_comments=True): 66 | """Extract all natural text in source: comments + docstrings 67 | the extraction fails in case of syntax errors in the file 68 | Args: 69
| source: the code to parse 70 | comments: if True extract comments too 71 | clean_comments: if True remove # from extracted comments 72 | Returns: 73 | a string with concatenated docstrings and comments""" 74 | 75 | try: 76 | docstrings = '\n'.join(get_docstrings(source)) 77 | except : 78 | docstrings = '' 79 | warnings.warn("code couldn't be parsed due to compilation failure, no docstring is extracted") 80 | 81 | if comments: 82 | try: 83 | comments = get_comments(source, clean=clean_comments) 84 | except : 85 | comments = '' 86 | warnings.warn("tokenization error, no comment is extracted") 87 | else: 88 | comments = '' 89 | 90 | output = docstrings + "\n\n" + comments 91 | return output.strip() -------------------------------------------------------------------------------- /data_analysis/stars_filtering/README.md: -------------------------------------------------------------------------------- 1 | # Filtering based on number of stars 2 | 3 | Here we are interested in filtering files based on their number of stars (i.e. the number of stars of their parent repositories). 4 | 5 | You can find clean filtering code in the `bigcode-dataset` repository under [preprocessing](https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing). 6 | * `stars_analysis.ipynb` contains the code for the analysis of the stars filter, used to come up with a minimum threshold of 5 stars for the Python, Java and JavaScript subsets of [The Stack](https://huggingface.co/datasets/bigcode/the-stack). -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | ## Evaluation analysis 2 | -------------------------------------------------------------------------------- /multi_query_experiments/README.md: -------------------------------------------------------------------------------- 1 | # Profiling of multi head vs multi query attention separately 2 | - `attention_types_imp.py` contains simplistic implementations of different attention layers without normalization, masks and softmax, just matrix multiplications and rearranging of tensors (a minimal sketch of the multi-query idea is shown after this list): 3 | - `MultiHead` is a multi head variant closely following the implementation in Hugging Face. 4 | - `MultiQuery` is a multi query variant with dimension order of hidden states as in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) `[sl, bs]`. The reordering of the tensors avoids explicit copies here, however, `bmm` subsequently makes internal copies and speed suffers. TODO: try with separate tensors for `q`, `k` and `v`. 5 | - `MultiQuery1` uses the same hidden states order as in HF and one explicit `reshape`. It is the fastest and is currently ported to HF transformers. 6 | - `profile_attention_types.py` contains code to run timing experiments. Results are in `profile_attention_types.json`. 7 | - `profile_attention_types_visualise.ipynb` contains graphs. 8 | - There is some uncertainty about the accuracy of the profiler timings. CPU times, though, decrease slightly in proportion, but still remain significant even for bigger tensors. Around 33% for sequence length of ~2K. However, `MultiQuery1` is the fastest and is ported to HF transformers.
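To make the difference between the variants concrete, here is a minimal, self-contained sketch of the multi-query idea: all query heads share a single key/value head. This is an illustration only, not the repository's `MultiQuery`/`MultiQuery1` implementations; unlike the classes in `attention_types_imp.py` it includes the softmax, and the function and weight names are invented for the example.

```python
import torch

def multi_query_attention(hidden_state, w_q, w_kv, nh, hs):
    """Toy multi-query attention: nh query heads share one key/value head.

    hidden_state: [bs, sl, nh * hs], w_q: [nh * hs, nh * hs], w_kv: [nh * hs, 2 * hs]
    """
    bs, sl, _ = hidden_state.shape
    # Queries keep nh separate heads: [bs, nh, sl, hs].
    q = torch.matmul(hidden_state, w_q).view(bs, sl, nh, hs).permute(0, 2, 1, 3)
    # Keys and values are projected to a single head: [bs, sl, hs] each.
    k, v = torch.matmul(hidden_state, w_kv).split(hs, dim=2)
    # Broadcast the single key/value head across all nh query heads.
    scores = torch.matmul(q, k.transpose(1, 2).unsqueeze(1)) / hs ** 0.5  # [bs, nh, sl, sl]
    probs = torch.softmax(scores, dim=-1)
    out = torch.matmul(probs, v.unsqueeze(1))  # [bs, nh, sl, hs]
    return out.permute(0, 2, 1, 3).reshape(bs, sl, nh * hs)

if __name__ == "__main__":
    bs, sl, nh, hs = 2, 16, 4, 8
    x = torch.randn(bs, sl, nh * hs)
    out = multi_query_attention(x, torch.randn(nh * hs, nh * hs), torch.randn(nh * hs, 2 * hs), nh, hs)
    print(out.shape)  # torch.Size([2, 16, 32])
```

Because `k` and `v` carry no head dimension, the key/value projections (and the key/value cache during generation) are `nh` times smaller than in the multi-head variant, which is where the memory and speed benefit comes from.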
9 | 10 | # Profiling of multi-head vs multi-query attention in HF transformers 11 | 12 | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/bigcode-project/bigcode-analysis/multi_query_experiments/profile_mqa.ipynb) 13 | 14 | - The implementation of multi-query attention currently lives in a custom fork of `transformers`: [here](https://github.com/bigcode-project/transformers/tree/multi_query) 15 | - `profile_hf_generate.py` contains the experiments. 16 | - There are two implementation variants of multi-query attention, controlled by the `attention_type` parameter: 17 | - `AttentionType.MULTI_QUERY` with minimal changes to the code. 18 | - `AttentionType.MULTI_QUERY_1` with some reordering of dimensions from explorations with @harm-devries, and `bmm` instead of `matmul`, similar to `MultiQuery1`. 19 | - `AttentionType.MULTI_QUERY_1` is the fastest, with around a 24% speedup: 20 | ``` 21 | -------------------- attention_type == AttentionType.MULTI_QUERY--------------------- 22 | {'get_test_batch': 5.9604644775390625e-05, 'generate_text_batch': 18.453815460205078, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': False, 'pad_token_id': 50256, 'dtype': torch.int64, 'device': device(type='cuda'), 'cuda_device_name': 'Tesla V100-PCIE-16GB-LS'} 23 | -------------------- attention_type == AttentionType.MULTI_QUERY_1--------------------- 24 | {'get_test_batch': 4.172325134277344e-05, 'generate_text_batch': 15.190143346786499, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': False, 'pad_token_id': 50256, 'dtype': torch.int64, 'device': device(type='cuda'), 'cuda_device_name': 'Tesla V100-PCIE-16GB-LS'} 25 | -------------------- attention_type == AttentionType.MULTI_HEAD--------------------- 26 | {'get_test_batch': 5.459785461425781e-05, 'generate_text_batch': 19.78107237815857, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': False, 'pad_token_id': 50256, 'dtype': torch.int64, 'device': device(type='cuda'), 'cuda_device_name': 'Tesla V100-PCIE-16GB-LS'} 27 | ``` 28 | -------------------------------------------------------------------------------- /multi_query_experiments/attention_types_imp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class MultiHead: 4 | ''' 5 | bs = batch size 6 | sl = sequence length 7 | nh = number of heads 8 | hs = head size 9 | nm = number of embeddings = nh * hs 10 | ''' 11 | 12 | @classmethod 13 | def allocate_data(cls, bs, sl, nh, hs, print_shapes): 14 | nm = nh * hs 15 | hidden_state = torch.randn(bs, sl, nm, device=torch.device('cuda')) 16 | c_attn_w = torch.randn(nm, 3*nm, device=torch.device('cuda')) 17 | i0 = None 18 | i1 = None 19 | i2 = None 20 | if print_shapes: 21 | print('hidden_state', hidden_state.shape) 22 | print('c_attn_w', c_attn_w.shape) 23 | return hidden_state, c_attn_w, i0, i1, i2 24 | 25 | @classmethod 26 | def get_qkv(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 27 | return torch.matmul( 28 | hidden_state.view(bs * sl, nh * hs), 29 | c_attn_w 30 | ).view(bs, sl, -1) 31 | 32 | @classmethod 33 | def split_qkv(cls, qkv, bs, sl, nh, hs, print_shapes): 34 | q, k, v = qkv.split(nh*hs, dim=2) 35 | 36 | if print_shapes: 37 | print('q', q.shape) 38 | print('k', k.shape) 39 | print('v', v.shape) 40 | 41 | q = q.view(bs, sl, nh, hs).permute(0, 2, 1, 3) 42 | k =
k.view(bs, sl, nh, hs).permute(0, 2, 3, 1) 43 | v = v.view(bs, sl, nh, hs).permute(0, 2, 1, 3) 44 | 45 | if print_shapes: 46 | print('q', q.shape) 47 | print('k', k.shape) 48 | print('v', v.shape) 49 | 50 | return q, k, v 51 | 52 | @classmethod 53 | def get_attention_weights(cls, q, k, i1, bs, sl, nh, hs, print_shapes): 54 | attention_weights = torch.matmul(q, k) 55 | return attention_weights 56 | 57 | @classmethod 58 | def get_attention_output(cls, attention_weights, v, i2, bs, sl, nh, hs, print_shapes): 59 | attn_output = torch.matmul(attention_weights, v) 60 | if print_shapes: 61 | print('attn_output', attn_output.shape) 62 | #attn_output = attn_output.view( 63 | # bs, nh, sl, hs).permute(0, 2, 1, 3) 64 | return attn_output 65 | 66 | 67 | class MultiQuery: 68 | ''' 69 | bs = batch size 70 | sl = sequence length 71 | nh = number of heads 72 | hs = head size 73 | nm = number of embeddings = nh * hs 74 | ''' 75 | 76 | @classmethod 77 | def allocate_data(cls, bs, sl, nh, hs, print_shapes): 78 | nm = nh * hs 79 | hidden_state = torch.randn(sl, bs, nm, device=torch.device('cuda')) 80 | c_attn_w = torch.randn((nh + 2) * hs, nm, device=torch.device('cuda')) 81 | i0 = torch.zeros((nh + 2) * hs, sl * bs, device=torch.device('cuda')) 82 | i1 = torch.zeros(bs, sl * nh, sl, device=torch.device('cuda')) 83 | i2 = torch.zeros(bs, sl * nh, hs, device=torch.device('cuda')) 84 | if print_shapes: 85 | print('hidden_state', hidden_state.shape) 86 | print('c_attn_w', c_attn_w.shape) 87 | print('i0', i0.shape) 88 | print('i1', i1.shape) 89 | print('i2', i2.shape) 90 | return hidden_state, c_attn_w, i0, i1, i2 91 | 92 | @classmethod 93 | def get_qkv1(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 94 | return torch.addmm( 95 | i0, 96 | c_attn_w, 97 | hidden_state.transpose(0, 1) 98 | ) 99 | 100 | @classmethod 101 | def get_qkv(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 102 | return torch.matmul( 103 | c_attn_w, 104 | hidden_state.view(sl * bs, nh * hs).transpose(0, 1) 105 | ) 106 | 107 | @classmethod 108 | def split_qkv(cls, qkv, bs, sl, nh, hs, print_shapes): 109 | q, k, v = qkv.split((nh*hs, hs, hs), dim=0) 110 | 111 | if print_shapes: 112 | print('q', q.shape) 113 | print('k', k.shape) 114 | print('v', v.shape) 115 | 116 | q = q.view(hs, nh, sl, bs 117 | ).permute(3, 1, 2, 0).view(bs, sl*nh, hs) 118 | k = k.view(hs, sl, bs).permute(2, 0, 1) 119 | v = v.view(hs, sl, bs).permute(2, 1, 0) 120 | 121 | if print_shapes: 122 | print('q', q.shape) 123 | print('k', k.shape) 124 | print('v', v.shape) 125 | 126 | return q, k, v 127 | 128 | @classmethod 129 | def get_attention_weights(cls, q, k, i1, bs, sl, nh, hs, print_shapes): 130 | return torch.baddbmm(i1, q, k) 131 | 132 | @classmethod 133 | def get_attention_output(cls, attention_weights, v, i2, bs, sl, nh, hs, print_shapes): 134 | attn_output = torch.baddbmm(i2, attention_weights, v) 135 | if print_shapes: 136 | print('attn_output', attn_output.shape) 137 | #attn_output = attn_output.view( 138 | # bs, sl, nh, hs).permute(1, 0, 2, 3).view(sl, bs, nh * hs) 139 | return attn_output 140 | 141 | 142 | class MultiQuery1: 143 | ''' 144 | bs = batch size 145 | sl = sequence length 146 | nh = number of heads 147 | hs = head size 148 | nm = number of embeddings = nh * hs 149 | ''' 150 | 151 | @classmethod 152 | def allocate_data(cls, bs, sl, nh, hs, print_shapes): 153 | nm = nh * hs 154 | hidden_state = torch.randn(bs, sl, nm, device=torch.device('cuda')) 155 | c_attn_w = torch.randn(nm, (nh + 2) * hs, device=torch.device('cuda')) 156 
| i0 = torch.zeros(sl * bs, (nh + 2) * hs, device=torch.device('cuda')) 157 | i1 = torch.zeros(bs, sl * nh, sl, device=torch.device('cuda')) 158 | i2 = torch.zeros(bs, sl * nh, hs, device=torch.device('cuda')) 159 | if print_shapes: 160 | print('hidden_state', hidden_state.shape) 161 | print('c_attn_w', c_attn_w.shape) 162 | print('i0', i0.shape) 163 | print('i1', i1.shape) 164 | print('i2', i2.shape) 165 | return hidden_state, c_attn_w, i0, i1, i2 166 | 167 | @classmethod 168 | def get_qkv1(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 169 | return torch.addmm( 170 | i0, 171 | hidden_state, 172 | c_attn_w, 173 | ).view(bs, sl, -1) 174 | 175 | @classmethod 176 | def get_qkv(cls, hidden_state, c_attn_w, i0, bs, sl, nh, hs, print_shapes): 177 | return torch.matmul( 178 | hidden_state.view(sl * bs, nh * hs), 179 | c_attn_w, 180 | ).view(bs, sl, -1) 181 | 182 | @classmethod 183 | def split_qkv(cls, qkv, bs, sl, nh, hs, print_shapes): 184 | q, k, v = qkv.split((nh*hs, hs, hs), dim=2) 185 | 186 | if print_shapes: 187 | print('q', q.shape) 188 | print('k', k.shape) 189 | print('v', v.shape) 190 | 191 | q = q.view( 192 | bs, sl, nh, hs, 193 | ).reshape( 194 | bs, sl * nh, hs 195 | ) 196 | k = k.permute(0, 2, 1) 197 | v = v 198 | 199 | if print_shapes: 200 | print('q', q.shape) 201 | print('k', k.shape) 202 | print('v', v.shape) 203 | 204 | return q, k, v 205 | 206 | @classmethod 207 | def get_attention_weights(cls, q, k, i1, bs, sl, nh, hs, print_shapes): 208 | return torch.baddbmm(i1, q, k) 209 | 210 | @classmethod 211 | def get_attention_output(cls, attention_weights, v, i2, bs, sl, nh, hs, print_shapes): 212 | attn_output = torch.baddbmm(i2, attention_weights, v) 213 | if print_shapes: 214 | print('attn_output', attn_output.shape) 215 | #attn_output = attn_output.view( 216 | # bs, sl, nh, hs).permute(1, 0, 2, 3).view(sl, bs, nh * hs) 217 | return attn_output 218 | 219 | 220 | def get_key_totals(prof): 221 | names = set(('GET_QKV', 'SPLIT_QKV', 'GET_ATTENTION_WEIGHTS', 'GET_ATTENTION_OUTPUT')) 222 | ka = prof.key_averages() 223 | stats = [[el.key, el.cpu_time_total / el.count, el.cuda_time_total / el.count, el.cpu_time_total / el.count + el.cuda_time_total / el.count] for el in ka if el.key in names] 224 | el_total = ['TOTAL', 0, 0, 0] 225 | for el in stats: 226 | el_total[1] += el[1] 227 | el_total[2] += el[2] 228 | el_total[3] += el[3] 229 | 230 | return [['key', 'cpu us', 'cuda us', 'all us']] + stats + [el_total] 231 | 232 | 233 | def test_attention_total_time(cls, bs, sl, nh, hs, repeat_cnt): 234 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, False) 235 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 236 | for i in range(repeat_cnt): 237 | with torch.autograd.profiler.record_function("GET_QKV"): 238 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, False) 239 | with torch.autograd.profiler.record_function("SPLIT_QKV"): 240 | q, k, v = cls.split_qkv(qkv, bs, sl, nh, hs, False) 241 | with torch.autograd.profiler.record_function("GET_ATTENTION_WEIGHTS"): 242 | attention_weights = cls.get_attention_weights(q, k, i1, bs, sl, nh, hs, False) 243 | with torch.autograd.profiler.record_function("GET_ATTENTION_OUTPUT"): 244 | attention_output = cls.get_attention_output(attention_weights, v, i2, bs, sl, nh, hs, False) 245 | res = get_key_totals(prof) 246 | return res 247 | 248 | def test_attention(cls, bs, sl, nh, hs, repeat_cnt): 249 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, True) 250 | with 
torch.autograd.profiler.profile(use_cuda=True) as prof: 251 | for i in range(repeat_cnt): 252 | with torch.autograd.profiler.record_function("GET_QKV"): 253 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, i == 0) 254 | with torch.autograd.profiler.record_function("SPLIT_QKV"): 255 | q, k, v = cls.split_qkv(qkv, bs, sl, nh, hs, i == 0) 256 | with torch.autograd.profiler.record_function("GET_ATTENTION_WEIGHTS"): 257 | attention_weights = cls.get_attention_weights(q, k, i1, bs, sl, nh, hs, i == 0) 258 | with torch.autograd.profiler.record_function("GET_ATTENTION_OUTPUT"): 259 | attention_output = cls.get_attention_output(attention_weights, v, i2, bs, sl, nh, hs, i == 0) 260 | print(prof.key_averages().table(sort_by="self_cuda_time_total")) 261 | return get_key_totals(prof) 262 | 263 | 264 | def test_qkv(cls, bs, sl, nh, hs, repeat_cnt): 265 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, True) 266 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 267 | for i in range(repeat_cnt): 268 | with torch.autograd.profiler.record_function("GET_QKV"): 269 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, i == 0) 270 | print(prof.key_averages().table(sort_by="self_cuda_time_total")) 271 | return get_key_totals(prof) 272 | 273 | def test_attention_weights(cls, bs, sl, nh, hs, repeat_cnt): 274 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, True) 275 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, True) 276 | q, k, v = cls.split_qkv(qkv, bs, sl, nh, hs, True) 277 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 278 | for i in range(repeat_cnt): 279 | with torch.autograd.profiler.record_function("GET_ATTENTION_WEIGHTS"): 280 | attention_weights = cls.get_attention_weights(q, k, i1, bs, sl, nh, hs, i == 0) 281 | print(prof.key_averages().table(sort_by="self_cuda_time_total")) 282 | return get_key_totals(prof) 283 | 284 | def test_attention_weights_output(cls, bs, sl, nh, hs, repeat_cnt): 285 | hidden_state, c_attn_w, i0, i1, i2 = cls.allocate_data(bs, sl, nh, hs, True) 286 | qkv = cls.get_qkv(hidden_state, c_attn_w, i0, bs, sl, nh, hs, True) 287 | q, k, v = cls.split_qkv(qkv, bs, sl, nh, hs, True) 288 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 289 | for i in range(repeat_cnt): 290 | with torch.autograd.profiler.record_function("GET_ATTENTION_WEIGHTS"): 291 | attention_weights = cls.get_attention_weights(q, k, i1, bs, sl, nh, hs, i == 0) 292 | with torch.autograd.profiler.record_function("GET_ATTENTION_OUTPUT"): 293 | attention_output = cls.get_attention_output(attention_weights, v, i2, bs, sl, nh, hs, i == 0) 294 | print(prof.key_averages().table(sort_by="self_cuda_time_total")) 295 | return get_key_totals(prof) 296 | -------------------------------------------------------------------------------- /multi_query_experiments/profile_attention_types.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import attention_types_imp as imp 3 | from tqdm.auto import tqdm 4 | import json 5 | import math 6 | 7 | def profile_attention_type(cls): 8 | repeat_cnt=500 9 | 10 | print(f'----------------------{cls}-------------------') 11 | 12 | res = [] 13 | for bs in tqdm(range(8, 17, 8)): 14 | sl_times = [] 15 | for sl in tqdm(range(64, 2000, 128)): 16 | rp = max(1, int(repeat_cnt * math.pow(64, 1/3.0) / math.pow(sl, 1/3.0))) 17 | totals_mh = imp.test_attention_total_time(cls, bs=bs, sl=sl, nh=16, hs=64, repeat_cnt=rp) 18 | 
sl_times.append((sl, totals_mh)) 19 | res.append((bs, sl_times)) 20 | 21 | return res 22 | 23 | 24 | # warmup 25 | imp.test_attention_total_time(imp.MultiHead, bs=24, sl=8, nh=16, hs=64, repeat_cnt=100) 26 | 27 | if True: 28 | res = { 29 | 'MultiHead': profile_attention_type(imp.MultiHead), 30 | 'MultiQuery': profile_attention_type(imp.MultiQuery), 31 | 'MultiQuery1': profile_attention_type(imp.MultiQuery1), 32 | } 33 | 34 | with open('profile_attention_types1.json', 'w') as f: 35 | json.dump(res, f) 36 | 37 | -------------------------------------------------------------------------------- /multi_query_experiments/profile_hf_generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | # we cache pretrained models in a user-writable dir (friendlier to SageMaker environments) 3 | os.environ['TRANSFORMERS_CACHE'] = os.environ['PWD'] + '/hf_transformers_cache' 4 | 5 | import torch 6 | import time 7 | import transformers 8 | 9 | from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config 10 | from transformers.models.gpt2.modeling_gpt2 import AttentionType 11 | 12 | def env(evar:str): 13 | return os.environ[evar] 14 | 15 | def dev(): 16 | if torch.cuda.is_available(): 17 | return torch.device("cuda") 18 | else: 19 | return torch.device("cpu") 20 | 21 | 22 | print(transformers.__file__) 23 | print(f'CUDA device : { torch.cuda.get_device_name(0) if torch.cuda.is_available() else None }') 24 | print(f'PWD : {env("PWD")}') 25 | print(f'transformers_cache : {env("TRANSFORMERS_CACHE")}') 26 | 27 | def get_test_batch(vocab_size, size, length, dtype=torch.int64, device=None): 28 | #TODO: eliminate special tokens, for now assumes the last one is the only special token 29 | return { 30 | 'input_ids': torch.randint(0, vocab_size-1, (size, length), dtype=dtype, device=device), 31 | 'attention_mask': torch.ones((size, length), dtype=dtype, device=device) 32 | } 33 | 34 | def generate_text_batch(model, inputs, max_length, num_beams=1, do_sample=False, pad_token_id=50256): 35 | return model.generate( 36 | **inputs, max_length=max_length, num_beams=num_beams, do_sample=do_sample, pad_token_id=pad_token_id 37 | ) 38 | 39 | def decode_batch(tokenizer, outputs): 40 | # outputs = outputs.numpy().tolist() 41 | outputs = outputs.tolist() 42 | return [ 43 | tokenizer.decode(output) 44 | for output in outputs 45 | ] 46 | 47 | def time_generate( 48 | vocab_size, model, input_batch_size, input_batch_length, max_gen_length, 49 | num_beams=1, do_sample=False, pad_token_id=50256, dtype=torch.int64, device=None, tokenizer=None 50 | ): 51 | stats = {} 52 | 53 | t1 = time.time() 54 | inputs = get_test_batch(vocab_size, input_batch_size, input_batch_length, dtype, device) 55 | stats['get_test_batch'] = time.time() - t1 56 | 57 | t1 = time.time() 58 | outputs = generate_text_batch( 59 | model, inputs, max_gen_length, num_beams=num_beams, do_sample=do_sample, pad_token_id=pad_token_id 60 | ) 61 | stats['generate_text_batch'] = time.time() - t1 62 | 63 | if do_sample: 64 | t1 = time.time() 65 | decs = decode_batch(tokenizer, outputs) 66 | dt = time.time() - t1 67 | stats['decode_batch'] = dt 68 | 69 | stats['input_batch_size'] = input_batch_size 70 | stats['input_batch_length'] = input_batch_length 71 | stats['max_gen_length'] = max_gen_length 72 | stats['num_beams'] = num_beams 73 | stats['do_sample'] = do_sample 74 | stats['pad_token_id'] = pad_token_id 75 | stats['dtype'] = dtype 76 | 77 | return inputs, outputs, stats 78 | 79 | def profile(attention_type): 80 | 
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=os.environ['TRANSFORMERS_CACHE']) 81 | 82 | config = GPT2Config( 83 | vocab_size=len(tokenizer), 84 | n_layer=24, 85 | n_embd=1024, 86 | n_head=16, 87 | n_positions=2048, 88 | #n_ctx=tokenizer.model_max_length, 89 | bos_token_id=tokenizer.bos_token_id, 90 | eos_token_id=tokenizer.eos_token_id, 91 | attention_type=attention_type, 92 | print_details=False 93 | ) 94 | model = GPT2LMHeadModel(config).to(dev()) 95 | 96 | inputs = get_test_batch(tokenizer.vocab_size, 1, 4, device=dev()) 97 | 98 | print(f'-------------------- attention_type == {attention_type} ---------------------') 99 | 100 | inputs, outputs, stats = time_generate(tokenizer.vocab_size, model, 8, 16, 1024, device=dev(), tokenizer=tokenizer, do_sample=True) 101 | print(stats) 102 | 103 | 104 | t0 = time.time() 105 | # # warm up 106 | # profile(AttentionType.MULTI_QUERY) 107 | 108 | profile(AttentionType.MULTI_QUERY) 109 | profile(AttentionType.MULTI_QUERY_1) 110 | profile(AttentionType.MULTI_HEAD) 111 | dt = time.time() - t0 112 | print(f'Total elapsed time : {dt} [s]') 113 | -------------------------------------------------------------------------------- /multi_query_experiments/profile_mqa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 14, 6 | "id": "f99e438c-1802-4573-844c-d91e4951ec19", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Python 3.9.13\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "!python --version" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "21f81c44-13cb-4bb2-8dd5-1b1e783ca3d4", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "!pip install -r requirements.txt" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "id": "4970d655-1539-4e3a-8fef-812de8af5173", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/transformers/__init__.py\n", 42 | "CUDA device Tesla T4\n", 43 | "PWD : /home/studio-lab-user/bigcode-analysis\n", 44 | "transformers_cache : /home/studio-lab-user/bigcode-analysis/hf_transformers_cache\n", 45 | "-------------------- attention_type == AttentionType.MULTI_QUERY ---------------------\n", 46 | "{'get_test_batch': 0.00037789344787597656, 'generate_text_batch': 25.9916410446167, 'decode_batch': 0.031884193420410156, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': True, 'pad_token_id': 50256, 'dtype': torch.int64}\n", 47 | "-------------------- attention_type == AttentionType.MULTI_QUERY_1 ---------------------\n", 48 | "{'get_test_batch': 0.0003807544708251953, 'generate_text_batch': 18.601619243621826, 'decode_batch': 0.021413087844848633, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': True, 'pad_token_id': 50256, 'dtype': torch.int64}\n", 49 | "-------------------- attention_type == AttentionType.MULTI_HEAD ---------------------\n", 50 | "{'get_test_batch': 0.0004012584686279297, 'generate_text_batch': 28.731690883636475, 'decode_batch': 0.021346569061279297, 'input_batch_size': 8, 'input_batch_length': 16, 'max_gen_length': 1024, 'num_beams': 1, 'do_sample': True, 'pad_token_id': 50256, 'dtype': torch.int64}\n", 51 | 
"Total elapsed time : 108.40390658378601 [s]\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "import multi_query_experiments.profile_hf_generate" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "f66365b2-f1dd-41b0-b67e-23f935390d61", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "default:Python", 71 | "language": "python", 72 | "name": "conda-env-default-py" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.9.13" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 5 89 | } 90 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.3.2 2 | torch==1.13.1 3 | transformers @ git+https://github.com/bigcode-project/transformers.git@multi_query 4 | umap-learn==0.5.3 5 | plotly==4.14.3 6 | -------------------------------------------------------------------------------- /tokenization/README.md: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------