├── .github ├── ISSUE_TEMPLATE │ ├── dataset-request.md │ └── stat-request └── workflows │ └── python_starter.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── codepile ├── __init__.py ├── codepile.py ├── dataset.py ├── ghtorrent │ ├── get_github_repo.py │ └── gh_download.py └── stackexchange │ ├── __init__.py │ └── stackexchange.py ├── pyproject.toml └── tests └── test_test.py /.github/ISSUE_TEMPLATE/dataset-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Dataset-Request 3 | about: Request for an addition of new dataset in the catalogue 4 | title: '' 5 | labels: dataset-request 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Title 11 | 12 | Dataset URL - [here]() 13 | 14 | Does the dataset exists in a scraped format ? 15 | URL if Yes - [here]() 16 | 17 | ## Description 18 | 19 | 20 | ## Procedure 21 | 22 | 23 | ## Tests 24 | 25 | Include a dummy_dataset.parquet file to test your code against. This dummy_dataset should include the columns for the data and metadata associated with the dataset, which will then be converted into the final format for language model consumption, along with an example row or rows that you can verify your code correctly collects. In addition to this file, include the unit test that evaluates your code against this dummy_dataset. 26 | 27 | Give an example of the columns and data: 28 | 29 | | col1 | col2 | .... | 30 | | ---- | ---- | ---- | 31 | | row1 | row1 | .... | 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/stat-request: -------------------------------------------------------------------------------- 1 | --- 2 | name: Stat-Request 3 | about: Request for an addition of new statistics in the EDA catalogue 4 | title: '' 5 | labels: stat-request 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Title 11 | 12 | EDA Reference URL - [here]() #If there are references. 
13 | 14 | Does the dataset exists in a scraped format ? 15 | URL if Yes - [here]() 16 | 17 | ## Description 18 | 19 | 20 | ## Procedure 21 | 22 | 23 | ## Tests 24 | -------------------------------------------------------------------------------- /.github/workflows/python_starter.yml: -------------------------------------------------------------------------------- 1 | name: pytest 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.7", "3.8", "3.9", "3.10"] 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install pytest 23 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 24 | - name: Test with pytest 25 | run: | 26 | pytest -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and 
enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | louis@stability.ai. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. 
Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 CarperAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code-Pile 2 | 3 | ![pytest](https://github.com/CarperAI/Code-Pile/actions/workflows/python_starter.yml/badge.svg) 4 | 5 | 6 | This repository contains the processing scripts to scrape/process the code-pile dataset. 7 | 8 | ## Table of Contents 9 | * Project Description 10 | * How to use the Code-Pile (todo) 11 | * How to Contribute 12 | * Additional Resources 13 | 14 | ## Project Description 15 | Check out [The code pile proposal](https://carperai.notion.site/Code-Pile-Organization-adfe8babbe07451cbd489a50cc0c985a) 16 | 17 | The Code-Pile will be released similar to "the pile" as a folder of .jsonl.zst files, see [lm-dataformat](https://github.com/EleutherAI/lm_dataformat) 18 | 19 | ## How to use the Code-Pile 20 | It's not finished, ask on discord 21 | 22 | ## How to Contribute 23 | Think about the most useful Code-data for the next generation of textual Code Models. 24 | 25 | The most valuable dataset properties (use your own judgment) are: 26 | 1. Open License 27 | 2. Data quality 28 | 3. Dataset size 29 | 4. Data variance/variety/nicheness 30 | 5. Ease of obtaining/processing 31 | 32 | To add a new dataset, open an Issue using the given `dataset-request` template. Gather all the related information appropriate to it. Use the issue to track. 33 | 34 | Check if there is existing Code or someone already working on it: 35 | See Additional Resources 36 | 37 | 1. Eleuther's Pile V1 Repos 38 | 2. Ask on Carper #code-pile 39 | 3. Ask on Eleuther 40 | 4. Consult the linked Spreadsheets below 41 | 42 | Then implement it through the following steps: 43 | 44 | 1. Fork this repo 45 | 2. Use the `working` branch 46 | 3. Read the shared classes in `dataset.py` and `codepile.py` 47 | 4. Create mvp/example for your dataset 48 | 5. Create a pull request 49 | 6. 
Keep building the data-domain specific classes and repeat 50 | 51 | Citation Placeholder: 52 | ``` 53 | @misc{Code-Pile, 54 | author = {}, 55 | doi = {}, 56 | month = {}, 57 | title = {}, 58 | url = {https://github.com/CarperAI/Code-Pile}, 59 | version = {}, 60 | year = {2022} 61 | } 62 | ``` 63 | 64 | ## Additional Resources 65 | * [Preliminary spreadsheet of useful resources](https://docs.google.com/spreadsheets/d/1OrOnv-Cv1wRq0jNk4AegHiMtLk88YQDz5b1TP-o5SE8/edit#gid=0) 66 | 67 | Closely related projects: 68 | 69 | * [The Pile V2 spreadsheet](https://docs.google.com/spreadsheets/d/1nVxbXj0k-5p9kY_TlY8xMnpsqp_JNlWXpD48L8hXH8E/edit#gid=906372269) 70 | * [Pile V1 stackexchange-dataset repo](https://github.com/EleutherAI/stackexchange-dataset/tree/fc34e85c12a5a2fb41b324db1c416cdac8ca5732) 71 | * [Collins stackexchange gist](https://gist.github.com/craffel/a1e2aff893776d0ef2b0a95ed0fd7e7a) 72 | 73 | Previous work: 74 | * [Codeparrot](https://github.com/huggingface/transformers/tree/main/examples/research_projects/codeparrot) 75 | * ... 
76 | -------------------------------------------------------------------------------- /codepile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CarperAI/Code-Pile/a4474b0b7f5914a700203edd9aa2ec847babfb76/codepile/__init__.py -------------------------------------------------------------------------------- /codepile/codepile.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | from codepile.stackexchange.stackexchange import * 4 | from codepile.dataset import Dataset 5 | 6 | class CodePile(Dataset): 7 | def __init__(self, tempdir, target_dir): 8 | self.subdatasets = [] 9 | 10 | subsets = [ 11 | StackExchangeDataset 12 | ] 13 | for d in subsets: 14 | self.subdatasets.append(d(tempdir, target_dir)) 15 | 16 | def download(self): 17 | for d in self.subdatasets: 18 | d.scraper.scrape() 19 | 20 | def process(self): 21 | for d in self.subdatasets: 22 | d.processor.process() 23 | 24 | def merge(self): 25 | raise NotImplementedError() 26 | 27 | 28 | def download(args): 29 | ds = CodePile(args.tempdir, args.output_dir) 30 | ds.download() 31 | 32 | 33 | def process(args): 34 | ds = CodePile(args.tempdir, args.output_dir) 35 | ds.process() 36 | 37 | 38 | def cli(cli_args, *args, **kwargs): 39 | parser = argparse.ArgumentParser('codepile dataset') 40 | 41 | subparsers = parser.add_subparsers() 42 | 43 | download_parser = subparsers.add_parser('download') 44 | download_parser.add_argument('output_dir', type=str) 45 | download_parser.add_argument('tempdir', type=str) 46 | download_parser.set_defaults(func=download) 47 | 48 | process_parser = subparsers.add_parser('process') 49 | process_parser.add_argument('input_dir', type=str) 50 | process_parser.add_argument('output_dir', type=str) 51 | process_parser.add_argument('tempdir', type=str) 52 | 53 | process_parser.set_defaults(func=process) 54 | 55 | args = 
parser.parse_args(cli_args[1:]) 56 | args.func(args) 57 | 58 | if len(cli_args) == 1: 59 | parser.print_help() 60 | 61 | if __name__ == "__main__": 62 | cli(sys.argv) 63 | 64 | 65 | -------------------------------------------------------------------------------- /codepile/dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Optional, TypeAlias, Literal, Any 2 | from abc import ABC, abstractmethod 3 | import uuid 4 | import pydantic 5 | from pydantic import BaseModel, AnyUrl, FileUrl 6 | from datetime import datetime 7 | 8 | 9 | MODALITY = Literal['discussion', 'code_review', 'source_code', 'unittest'] 10 | 11 | class DatasetInfo(BaseModel): 12 | identifier: str 13 | description: str 14 | # the last time when new information was incorporated into the dataset 15 | # aka when was the latest sample collected 16 | data_end: datetime 17 | # the beginning of the datasets data 18 | data_start: Optional[datetime] 19 | # estimated size in bits 20 | size: int 21 | 22 | # compute cost needed for processing 23 | # usefull information for rebuilding 24 | cpu_hours: Optional[int] 25 | gpu_hours: Optional[int] 26 | ram_requirement: Optional[int] 27 | tempfile_requirement: Optional[int] 28 | 29 | # the main sources website/description/entrypoint/domain 30 | source_uri: AnyUrl 31 | 32 | # what are the advantages of including this dataset 33 | # like a good fit for the downstream modelling tasks 34 | dataset_pros: str 35 | # what are the disadvantages of including this dataset 36 | # like biases 37 | dataset_cons: str 38 | 39 | # the languages that are present from the source download 40 | languages: list[str] 41 | # the programming languages that are present from the source download 42 | coding_languages: list[str] 43 | # the language modalities that are present in the dataset: 44 | # like discussion, code_review, source_code, unittest 45 | modalities: list[MODALITY] 46 | # to track copyright 47 | source_license: 
str 48 | # a citation for acknowledging the data source 49 | # as this is convention in academia 50 | source_citation: str 51 | # a single person responsible for the dataset 52 | data_owner: str 53 | contributers: list[str] 54 | 55 | 56 | SourceType = Literal['bulk', 'api', 'staticpages', 'dynamicpages'] 57 | 58 | class DatasetSources(BaseModel): 59 | # stores the urls from where the data can be collected 60 | sources : list[AnyUrl] 61 | sourcetype : SourceType 62 | # storage format of the blobs that are captured from the source 63 | source_format: str 64 | 65 | 66 | class RawDataset(BaseModel): 67 | # where the raw dataset files is stored after the scrape 68 | storage_uris: list[Union[AnyUrl, FileUrl]] 69 | # possible locks for parallel writing to the storage_uris 70 | storage_locks: Optional[list[Any]] 71 | # wether the download is complete 72 | # if more finegrained saving of state is needed, handle it customly 73 | # in the scraper 74 | complete: bool = False 75 | 76 | # miscellanous metadata we additionally want to track 77 | metadata: Optional[str] 78 | 79 | 80 | class Scraper(ABC): 81 | # logic for downloading/scraping the datasets 82 | def __init__(self, tempdir, target_dir, *args, **kwargs): 83 | self.tempdir = tempdir 84 | self.target_dir = target_dir 85 | 86 | def scrape(self): 87 | raise NotImplementedError() 88 | 89 | 90 | class Processor(ABC): 91 | # logic for processing the datasets 92 | # filtering out bad data 93 | # data transformations 94 | # if you wanna use kind a workflow, implement it in here 95 | def process(self): 96 | raise NotImplementedError() 97 | 98 | 99 | class Analyser(ABC): 100 | # logic for getting basic statistics of the dataset 101 | def analyse(self): 102 | raise NotImplementedError() 103 | 104 | 105 | class Dataset(ABC): 106 | def __init__(self, tempdir, target_dir, *args, **kwargs): 107 | self.tempdir = tempdir 108 | self.target_dir = target_dir 109 | 110 | self.info : DatasetInfo = None 111 | 112 | self.scraper = None 113 | 
self.processor = None 114 | self.analyser = None 115 | 116 | def download(self, *args, **kwargs): 117 | self.scraper.scrape() 118 | 119 | def process(self, *args, **kwargs): 120 | self.processor.process() 121 | 122 | def analyse(self, *args, **kwargs): 123 | self.analyser.analyse() 124 | 125 | ''' 126 | @property 127 | @abstractmethod 128 | def info(self) -> DatasetInfo: 129 | if self.info is None: 130 | raise NotImplementedError() 131 | return self.info 132 | ''' 133 | 134 | 135 | -------------------------------------------------------------------------------- /codepile/ghtorrent/get_github_repo.py: -------------------------------------------------------------------------------- 1 | import re 2 | import gzip 3 | from tqdm import tqdm 4 | import os 5 | 6 | 7 | 8 | def get_repo(url): 9 | try: 10 | repo = url.replace("https://api.github.com/repos/", "").split("/") 11 | repo = "https://api.github.com/repos/" + repo[0] + "/" + repo[1] 12 | return repo 13 | except Exception as error: 14 | print("error logs: ", error) 15 | return "" 16 | 17 | 18 | def main(): 19 | 20 | regex_list = ['((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', 21 | '((http?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)'] 22 | 23 | file_mode = 'text' 24 | 25 | files = [os.path.join("ghtorrent_data", x) 26 | for x in os.listdir("ghtorrent_data/") if x.endswith(".gz")] 27 | 28 | total_urls = [] 29 | for file_name in files: 30 | urls = [] 31 | if file_mode == 'text': 32 | with open(file_name, "r", encoding="ISO-8859-1") as file: 33 | for line in tqdm(file): 34 | for link_regex in regex_list: 35 | links = re.findall(link_regex, line) 36 | urls += [x[0] for x in links] 37 | else: 38 | with gzip.open(file_name,'r') as file: 39 | for line in tqdm(file): 40 | line = line.decode('ISO-8859-1') 41 | for link_regex in regex_list: 42 | links = re.findall(link_regex, line) 43 | urls += [x[0] for x in links] 44 | 45 | urls = [x for x in urls if 'https://api.github.com/repos/' in x] 46 | 
total_urls += urls 47 | 48 | github_repos = [] 49 | for url in total_urls: 50 | repo = get_repo(url) 51 | if repo != "": 52 | github_repos.append(repo) 53 | github_repos = set(github_repos) 54 | 55 | with open("GHTorrent_github.txt", "w") as fp: 56 | for url in github_repos: 57 | fp.write(url + '\n') 58 | 59 | if __name__=="__main__": 60 | main() -------------------------------------------------------------------------------- /codepile/ghtorrent/gh_download.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | with open("ghtorrent_urls.txt", "w") as fp: 4 | for line in fp: 5 | line = line.replace("\n", "") 6 | os.system(f"wget http://ghtorrent-downloads.ewi.tudelft.nl/mysql/{line} -O ghtorrent_data/{line}") -------------------------------------------------------------------------------- /codepile/stackexchange/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CarperAI/Code-Pile/a4474b0b7f5914a700203edd9aa2ec847babfb76/codepile/stackexchange/__init__.py -------------------------------------------------------------------------------- /codepile/stackexchange/stackexchange.py: -------------------------------------------------------------------------------- 1 | from codepile.dataset import DatasetInfo, DatasetSources, RawDataset, Scraper, Processor, Analyser, Dataset 2 | 3 | from datetime import datetime 4 | 5 | import internetarchive as ia 6 | 7 | ''' 8 | # example 9 | StackExchangeInfo = DatasetInfo( 10 | identifier='StackExchange', 11 | description='', 12 | data_end=datetime(2022,1,1), 13 | data_start=10, 14 | size=10, 15 | storage_format='tar', 16 | #storage_uri='/root', 17 | cpu_hours=1, 18 | gpu_hours=1, 19 | ram_requirements=1, 20 | tempfile_requirement=1, 21 | source_uri='https://archive.org/details/stackexchange', 22 | dataset_pros='l', 23 | dataset_cons='l', 24 | languages=[''], 25 | coding_languages=[''], 26 | 
modalities=['discussion'], 27 | source_license='gpl', 28 | source_citation='this', 29 | data_owner='me', 30 | contributers=['me'] 31 | ) 32 | ''' 33 | 34 | class StackExchangeScraper(Scraper): 35 | def scrape(self) -> RawDataset: 36 | item = ia.get_item('stackexchange') 37 | metadata = item.metadata 38 | ia.download('stackexchange', checksum=True, verbose=True, destdir=self.target_dir) 39 | 40 | return RawDataset(storage_uris=['file:///{self.target_dir}'], 41 | metadata=str(metadata)) 42 | 43 | 44 | class StackExchangeDataset(Dataset): 45 | def __init__(self, tempdir, target_dir): 46 | self.scraper = StackExchangeScraper(tempdir, target_dir) 47 | def download(self): 48 | self.scraper.download() 49 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | ] 5 | build-backend = "setuptools.build_meta" 6 | 7 | [project] 8 | name = "Code-Pile" 9 | version = "0.0.1" 10 | dependencies = [ 11 | "pydantic", 12 | "internetarchive" 13 | ] 14 | -------------------------------------------------------------------------------- /tests/test_test.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | class TryTesting(TestCase): 4 | def test_always_passes(self): 5 | self.assertTrue(True) --------------------------------------------------------------------------------