├── .github ├── ISSUE_TEMPLATE │ ├── 1-bug-report.yml │ ├── 2-feature-request.yml │ ├── 3-documentation-improve.yml │ └── config.yml └── pull_request_template.md ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── LICENSE ├── Makefile ├── README.md ├── contribution └── CONTRIBUTING.md ├── dataverse ├── README.md ├── __init__.py ├── api │ ├── README.md │ ├── __init__.py │ ├── cli.py │ └── emr.py ├── config │ ├── README.md │ ├── __init__.py │ ├── etl │ │ └── sample │ │ │ ├── ETL___one_cycle.yaml │ │ │ ├── data_ingestion___1T_loading.yaml │ │ │ ├── data_ingestion___one_stage.yaml │ │ │ ├── data_ingestion___sampling.yaml │ │ │ ├── data_ingestion___two_stage.yaml │ │ │ ├── data_preprocess___dedup.yaml │ │ │ └── data_save___hf_obj.yaml │ └── interface.py ├── etl │ ├── README.md │ ├── __init__.py │ ├── __sample │ │ ├── README.md │ │ ├── __init__.py │ │ ├── ducky.py │ │ └── github.py │ ├── bias │ │ ├── README.md │ │ └── __init__.py │ ├── cleaning │ │ ├── README.md │ │ ├── __init__.py │ │ ├── char.py │ │ ├── document.py │ │ ├── html.py │ │ ├── korean.py │ │ ├── length.py │ │ ├── number.py │ │ ├── table.py │ │ └── unicode.py │ ├── data_ingestion │ │ ├── README.md │ │ ├── __init__.py │ │ ├── arrow.py │ │ ├── common_crawl.py │ │ ├── csv.py │ │ ├── cultura_x.py │ │ ├── huggingface.py │ │ ├── parquet.py │ │ ├── red_pajama.py │ │ ├── slim_pajama.py │ │ └── test.py │ ├── data_save │ │ ├── README.md │ │ ├── __init__.py │ │ ├── aws.py │ │ ├── huggingface.py │ │ └── parquet.py │ ├── decontamination │ │ ├── README.md │ │ └── __init__.py │ ├── deduplication │ │ ├── README.md │ │ ├── __init__.py │ │ ├── common_crawl.py │ │ ├── exact.py │ │ ├── minhash.py │ │ └── polyglot.py │ ├── pii │ │ ├── README.md │ │ ├── __init__.py │ │ ├── card.py │ │ └── nin.py │ ├── pipeline.py │ ├── quality │ │ ├── README.md │ │ ├── __init__.py │ │ └── language.py │ ├── registry.py │ ├── toxicity │ │ ├── README.md │ │ └── __init__.py │ └── utils │ │ ├── README.md │ │ ├── __init__.py │ │ ├── log.py │ │ ├── sampling.py │ │ └── statistics.py ├── lab │ ├── README.md │ └── __init__.py ├── tests │ ├── conftest.py │ ├── test_cleaning_accent.py │ ├── test_cleaning_char.py │ ├── test_cleaning_document.py │ ├── test_cleaning_html.py │ ├── test_cleaning_korean.py │ ├── test_cleaning_length.py │ ├── test_cleaning_number.py │ ├── test_cleaning_table.py │ ├── test_cleaning_unicode.py │ ├── test_deduplication_common_crawl.py │ ├── test_deduplication_exact.py │ ├── test_deduplication_minhash.py │ ├── test_deduplication_polyglot.py │ ├── test_pii_card.py │ └── test_pii_nin.py └── utils │ ├── README.md │ ├── __init__.py │ ├── analyze │ ├── README.md │ ├── __init__.py │ ├── pip.py │ └── python.py │ ├── api │ ├── README.md │ ├── __init__.py │ └── aws.py │ ├── format │ ├── README.md │ ├── __init__.py │ ├── huggingface.py │ └── ufl.py │ └── setting │ ├── README.md │ ├── __init__.py │ ├── system.py │ └── user.py ├── docs ├── Makefile ├── images │ ├── dataverse_hero.png │ ├── dataverse_logo-color.png │ ├── dataverse_logo-white.png │ ├── dataverse_symbol.png │ └── dataverse_system_architecture_white.jpeg ├── make.bat └── source │ ├── citation.rst │ ├── conf.py │ ├── config │ └── config.interface.rst │ ├── etl │ ├── etl.bias.rst │ ├── etl.cleaning.rst │ ├── etl.data_ingestion.rst │ ├── etl.data_save.rst │ ├── etl.decontamination.rst │ ├── etl.deduplication.rst │ ├── etl.pii.rst │ ├── etl.pipeline.rst │ ├── etl.quality.rst │ ├── etl.registry.rst │ ├── etl.rst │ ├── etl.toxicity.rst │ └── etl.utils.rst │ ├── index.rst │ ├── installation.rst │ ├── quickstart.rst │ └── requirements.txt ├── examples ├── README.md └── etl │ ├── ETL_01_how_to_run.ipynb │ ├── ETL_02_one_cycle.ipynb │ ├── ETL_03_create_new_etl_process.ipynb │ ├── ETL_04_add_new_etl_process.ipynb │ ├── ETL_05_test_etl_process.ipynb │ ├── ETL_06_scaleout_with_EMR.ipynb │ ├── EX_use_common_crawl_data.ipynb │ ├── EX_use_pyspark_ui.ipynb │ └── README.md ├── requirements.txt └── setup.py /.github/ISSUE_TEMPLATE/1-bug-report.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/.github/ISSUE_TEMPLATE/1-bug-report.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/.github/ISSUE_TEMPLATE/2-feature-request.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-documentation-improve.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/.github/ISSUE_TEMPLATE/3-documentation-improve.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/.github/pull_request_template.md -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/.readthedocs.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/README.md -------------------------------------------------------------------------------- /contribution/CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/contribution/CONTRIBUTING.md -------------------------------------------------------------------------------- /dataverse/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/README.md -------------------------------------------------------------------------------- /dataverse/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/api/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/api/README.md -------------------------------------------------------------------------------- /dataverse/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/api/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/api/cli.py -------------------------------------------------------------------------------- /dataverse/api/emr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/api/emr.py -------------------------------------------------------------------------------- /dataverse/config/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/config/README.md -------------------------------------------------------------------------------- /dataverse/config/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .interface import Config -------------------------------------------------------------------------------- /dataverse/config/etl/sample/ETL___one_cycle.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/config/etl/sample/ETL___one_cycle.yaml -------------------------------------------------------------------------------- /dataverse/config/etl/sample/data_ingestion___1T_loading.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/config/etl/sample/data_ingestion___1T_loading.yaml -------------------------------------------------------------------------------- /dataverse/config/etl/sample/data_ingestion___one_stage.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/config/etl/sample/data_ingestion___one_stage.yaml -------------------------------------------------------------------------------- /dataverse/config/etl/sample/data_ingestion___sampling.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/config/etl/sample/data_ingestion___sampling.yaml -------------------------------------------------------------------------------- /dataverse/config/etl/sample/data_ingestion___two_stage.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/config/etl/sample/data_ingestion___two_stage.yaml -------------------------------------------------------------------------------- /dataverse/config/etl/sample/data_preprocess___dedup.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/config/etl/sample/data_preprocess___dedup.yaml -------------------------------------------------------------------------------- /dataverse/config/etl/sample/data_save___hf_obj.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/config/etl/sample/data_save___hf_obj.yaml -------------------------------------------------------------------------------- /dataverse/config/interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/config/interface.py -------------------------------------------------------------------------------- /dataverse/etl/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/README.md -------------------------------------------------------------------------------- /dataverse/etl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/__init__.py -------------------------------------------------------------------------------- /dataverse/etl/__sample/README.md: -------------------------------------------------------------------------------- 1 | # Sample 2 | > This is a showcase -------------------------------------------------------------------------------- /dataverse/etl/__sample/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/__sample/ducky.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/__sample/ducky.py -------------------------------------------------------------------------------- /dataverse/etl/__sample/github.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/__sample/github.py -------------------------------------------------------------------------------- /dataverse/etl/bias/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/bias/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/cleaning/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/cleaning/README.md -------------------------------------------------------------------------------- /dataverse/etl/cleaning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/cleaning/char.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/cleaning/char.py -------------------------------------------------------------------------------- /dataverse/etl/cleaning/document.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/cleaning/document.py -------------------------------------------------------------------------------- /dataverse/etl/cleaning/html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/cleaning/html.py -------------------------------------------------------------------------------- /dataverse/etl/cleaning/korean.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/cleaning/korean.py -------------------------------------------------------------------------------- /dataverse/etl/cleaning/length.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/cleaning/length.py -------------------------------------------------------------------------------- /dataverse/etl/cleaning/number.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/cleaning/number.py -------------------------------------------------------------------------------- /dataverse/etl/cleaning/table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/cleaning/table.py -------------------------------------------------------------------------------- /dataverse/etl/cleaning/unicode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/cleaning/unicode.py -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/README.md -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/arrow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/arrow.py -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/common_crawl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/common_crawl.py -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/csv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/csv.py -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/cultura_x.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/cultura_x.py -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/huggingface.py -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/parquet.py -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/red_pajama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/red_pajama.py -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/slim_pajama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/slim_pajama.py -------------------------------------------------------------------------------- /dataverse/etl/data_ingestion/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_ingestion/test.py -------------------------------------------------------------------------------- /dataverse/etl/data_save/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_save/README.md -------------------------------------------------------------------------------- /dataverse/etl/data_save/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/data_save/aws.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_save/aws.py -------------------------------------------------------------------------------- /dataverse/etl/data_save/huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_save/huggingface.py -------------------------------------------------------------------------------- /dataverse/etl/data_save/parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/data_save/parquet.py -------------------------------------------------------------------------------- /dataverse/etl/decontamination/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/decontamination/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/deduplication/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/deduplication/README.md -------------------------------------------------------------------------------- /dataverse/etl/deduplication/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/deduplication/common_crawl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/deduplication/common_crawl.py -------------------------------------------------------------------------------- /dataverse/etl/deduplication/exact.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/deduplication/exact.py -------------------------------------------------------------------------------- /dataverse/etl/deduplication/minhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/deduplication/minhash.py -------------------------------------------------------------------------------- /dataverse/etl/deduplication/polyglot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/deduplication/polyglot.py -------------------------------------------------------------------------------- /dataverse/etl/pii/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/pii/README.md -------------------------------------------------------------------------------- /dataverse/etl/pii/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/pii/card.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/pii/card.py -------------------------------------------------------------------------------- /dataverse/etl/pii/nin.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/pii/nin.py -------------------------------------------------------------------------------- /dataverse/etl/pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/pipeline.py -------------------------------------------------------------------------------- /dataverse/etl/quality/README.md: -------------------------------------------------------------------------------- 1 | # Quality -------------------------------------------------------------------------------- /dataverse/etl/quality/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/quality/language.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/quality/language.py -------------------------------------------------------------------------------- /dataverse/etl/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/registry.py -------------------------------------------------------------------------------- /dataverse/etl/toxicity/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/toxicity/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/utils/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/utils/README.md -------------------------------------------------------------------------------- /dataverse/etl/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/etl/utils/log.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/utils/log.py -------------------------------------------------------------------------------- /dataverse/etl/utils/sampling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/utils/sampling.py -------------------------------------------------------------------------------- /dataverse/etl/utils/statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/etl/utils/statistics.py -------------------------------------------------------------------------------- /dataverse/lab/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/lab/README.md -------------------------------------------------------------------------------- /dataverse/lab/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/conftest.py -------------------------------------------------------------------------------- /dataverse/tests/test_cleaning_accent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_cleaning_accent.py -------------------------------------------------------------------------------- /dataverse/tests/test_cleaning_char.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_cleaning_char.py -------------------------------------------------------------------------------- /dataverse/tests/test_cleaning_document.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_cleaning_document.py -------------------------------------------------------------------------------- /dataverse/tests/test_cleaning_html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_cleaning_html.py -------------------------------------------------------------------------------- /dataverse/tests/test_cleaning_korean.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_cleaning_korean.py -------------------------------------------------------------------------------- /dataverse/tests/test_cleaning_length.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_cleaning_length.py -------------------------------------------------------------------------------- /dataverse/tests/test_cleaning_number.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_cleaning_number.py -------------------------------------------------------------------------------- /dataverse/tests/test_cleaning_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_cleaning_table.py -------------------------------------------------------------------------------- /dataverse/tests/test_cleaning_unicode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_cleaning_unicode.py -------------------------------------------------------------------------------- /dataverse/tests/test_deduplication_common_crawl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_deduplication_common_crawl.py -------------------------------------------------------------------------------- /dataverse/tests/test_deduplication_exact.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_deduplication_exact.py -------------------------------------------------------------------------------- /dataverse/tests/test_deduplication_minhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_deduplication_minhash.py -------------------------------------------------------------------------------- /dataverse/tests/test_deduplication_polyglot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_deduplication_polyglot.py -------------------------------------------------------------------------------- /dataverse/tests/test_pii_card.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_pii_card.py -------------------------------------------------------------------------------- /dataverse/tests/test_pii_nin.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/tests/test_pii_nin.py -------------------------------------------------------------------------------- /dataverse/utils/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/README.md -------------------------------------------------------------------------------- /dataverse/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataverse/utils/analyze/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/analyze/README.md -------------------------------------------------------------------------------- /dataverse/utils/analyze/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/analyze/__init__.py -------------------------------------------------------------------------------- /dataverse/utils/analyze/pip.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/analyze/pip.py -------------------------------------------------------------------------------- /dataverse/utils/analyze/python.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/analyze/python.py -------------------------------------------------------------------------------- /dataverse/utils/api/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/api/README.md -------------------------------------------------------------------------------- /dataverse/utils/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/api/__init__.py -------------------------------------------------------------------------------- /dataverse/utils/api/aws.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/api/aws.py -------------------------------------------------------------------------------- /dataverse/utils/format/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/format/README.md -------------------------------------------------------------------------------- /dataverse/utils/format/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/format/__init__.py -------------------------------------------------------------------------------- /dataverse/utils/format/huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/format/huggingface.py -------------------------------------------------------------------------------- /dataverse/utils/format/ufl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/format/ufl.py -------------------------------------------------------------------------------- /dataverse/utils/setting/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/setting/README.md -------------------------------------------------------------------------------- /dataverse/utils/setting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/setting/__init__.py -------------------------------------------------------------------------------- /dataverse/utils/setting/system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/setting/system.py -------------------------------------------------------------------------------- /dataverse/utils/setting/user.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/dataverse/utils/setting/user.py -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/Makefile -------------------------------------------------------------------------------- /docs/images/dataverse_hero.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/images/dataverse_hero.png -------------------------------------------------------------------------------- /docs/images/dataverse_logo-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/images/dataverse_logo-color.png -------------------------------------------------------------------------------- /docs/images/dataverse_logo-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/images/dataverse_logo-white.png -------------------------------------------------------------------------------- /docs/images/dataverse_symbol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/images/dataverse_symbol.png -------------------------------------------------------------------------------- /docs/images/dataverse_system_architecture_white.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/images/dataverse_system_architecture_white.jpeg -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/make.bat -------------------------------------------------------------------------------- /docs/source/citation.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/citation.rst -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/conf.py -------------------------------------------------------------------------------- /docs/source/config/config.interface.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/config/config.interface.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.bias.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.bias.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.cleaning.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.cleaning.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.data_ingestion.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.data_ingestion.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.data_save.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.data_save.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.decontamination.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.decontamination.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.deduplication.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.deduplication.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.pii.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.pii.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.pipeline.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.pipeline.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.quality.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.quality.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.registry.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.registry.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.toxicity.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.toxicity.rst -------------------------------------------------------------------------------- /docs/source/etl/etl.utils.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/etl/etl.utils.rst -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/index.rst -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/installation.rst -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/quickstart.rst -------------------------------------------------------------------------------- /docs/source/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/docs/source/requirements.txt -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/examples/README.md -------------------------------------------------------------------------------- /examples/etl/ETL_01_how_to_run.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/examples/etl/ETL_01_how_to_run.ipynb -------------------------------------------------------------------------------- /examples/etl/ETL_02_one_cycle.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/examples/etl/ETL_02_one_cycle.ipynb -------------------------------------------------------------------------------- /examples/etl/ETL_03_create_new_etl_process.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/examples/etl/ETL_03_create_new_etl_process.ipynb -------------------------------------------------------------------------------- /examples/etl/ETL_04_add_new_etl_process.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/examples/etl/ETL_04_add_new_etl_process.ipynb -------------------------------------------------------------------------------- /examples/etl/ETL_05_test_etl_process.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/examples/etl/ETL_05_test_etl_process.ipynb -------------------------------------------------------------------------------- /examples/etl/ETL_06_scaleout_with_EMR.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/examples/etl/ETL_06_scaleout_with_EMR.ipynb -------------------------------------------------------------------------------- /examples/etl/EX_use_common_crawl_data.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/examples/etl/EX_use_common_crawl_data.ipynb -------------------------------------------------------------------------------- /examples/etl/EX_use_pyspark_ui.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/examples/etl/EX_use_pyspark_ui.ipynb -------------------------------------------------------------------------------- /examples/etl/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 🗺️ ETL (Extract, Transform, Load) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/dataverse/HEAD/setup.py --------------------------------------------------------------------------------