├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── bin ├── spark-submit.sh └── sparkapp.py ├── configs ├── dedup_job.yaml ├── dedup_job_ja.yaml ├── dedup_job_th.yaml ├── df │ └── preproc-example.yaml ├── indonesia_job.yaml ├── ja_minhashlsh_dedup_job.yaml ├── japanese_job.yaml ├── japanese_job_v2_oscar-2301.yaml ├── japanese_job_v2_oscar2323_0-100.yaml ├── korean_job.yaml ├── sample_job.yaml └── thai_job.yaml ├── datasets ├── japanese_test_dataset │ ├── 0 │ │ └── oscar-2201_00000-0.jsonl │ └── 1 │ │ └── oscar-2201_00000-1.jsonl ├── test_ja_minhashlsh_dedup_job │ └── 0.jsonl ├── test_korean_jsonl_data │ ├── 0 │ │ ├── text_0.jsonl │ │ └── text_1.jsonl │ └── 1 │ │ ├── text_0.jsonl │ │ └── text_1.jsonl ├── test_sample_jsonl_data │ ├── 0 │ │ ├── text_0.jsonl │ │ ├── text_1.jsonl │ │ ├── text_2.jsonl │ │ ├── text_3.jsonl │ │ └── text_4.jsonl │ ├── 1 │ │ ├── text_0.jsonl │ │ ├── text_1.jsonl │ │ ├── text_2.jsonl │ │ ├── text_3.jsonl │ │ └── text_4.jsonl │ ├── 2 │ │ ├── text_0.jsonl │ │ ├── text_1.jsonl │ │ ├── text_2.jsonl │ │ ├── text_3.jsonl │ │ └── text_4.jsonl │ ├── 3 │ │ ├── text_0.jsonl │ │ ├── text_1.jsonl │ │ ├── text_2.jsonl │ │ ├── text_3.jsonl │ │ └── text_4.jsonl │ └── 4 │ │ ├── text_0.jsonl │ │ ├── text_1.jsonl │ │ ├── text_2.jsonl │ │ ├── text_3.jsonl │ │ └── text_4.jsonl └── test_thai_jsonl_data │ └── text.jsonl ├── doc ├── adding-df-processors.md ├── adding-udf-processors.md ├── dataframe.md ├── spark-session.md └── udf │ ├── langfilter.md │ └── splitter.md ├── dps ├── __init__.py ├── spark │ ├── __init__.py │ ├── jobs │ │ ├── __init__.py │ │ ├── chinese_job.py │ │ ├── dedup_job.py │ │ ├── indonesian_job.py │ │ ├── japanese_job.py │ │ ├── japanese_minhash_dedup_job.py │ │ ├── korean_job.py │ │ ├── romance_job.py │ │ ├── sample_job.py │ │ └── thai_job.py │ ├── prep │ │ ├── __init__.py │ │ ├── chinese_prep.py │ │ ├── dedup_prep.py │ │ ├── indonesia_prep.py │ │ ├── japanese_prep.py │ │ ├── korean_prep.py │ │ ├── lang_agnostic_prep.py │ │ ├── romance_prep.py │ │ └── thai_prep.py │ ├── run.py │ ├── spark_session.py │ └── utils │ │ ├── __init__.py │ │ ├── chinese_utils.py │ │ ├── indonesian_utils.py │ │ ├── io_utils.py │ │ ├── japanese_utils.py │ │ ├── korean_utils.py │ │ ├── lang_agnostic_utils.py │ │ ├── romance_utils.py │ │ ├── stopwords_th.txt │ │ ├── thai_utils.py │ │ └── token_utils.py └── spark_df │ ├── __init__.py │ ├── app │ ├── __init__.py │ └── sparkapp.py │ ├── defs.py │ ├── df │ └── __init__.py │ ├── dfprocessor.py │ ├── process.py │ ├── udf │ ├── __init__.py │ ├── langfilter.py │ └── splitter.py │ ├── udfprocessor.py │ └── utils │ ├── __init__.py │ ├── exception.py │ ├── io.py │ ├── logging.py │ ├── misc.py │ └── spark_session_utils.py ├── requirements-df.txt ├── requirements-ja.txt ├── requirements-kor.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/README.md -------------------------------------------------------------------------------- /bin/spark-submit.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/bin/spark-submit.sh -------------------------------------------------------------------------------- /bin/sparkapp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/bin/sparkapp.py -------------------------------------------------------------------------------- /configs/dedup_job.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/dedup_job.yaml -------------------------------------------------------------------------------- /configs/dedup_job_ja.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/dedup_job_ja.yaml -------------------------------------------------------------------------------- /configs/dedup_job_th.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/dedup_job_th.yaml -------------------------------------------------------------------------------- /configs/df/preproc-example.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/df/preproc-example.yaml -------------------------------------------------------------------------------- /configs/indonesia_job.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/indonesia_job.yaml -------------------------------------------------------------------------------- /configs/ja_minhashlsh_dedup_job.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/ja_minhashlsh_dedup_job.yaml -------------------------------------------------------------------------------- /configs/japanese_job.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/japanese_job.yaml -------------------------------------------------------------------------------- /configs/japanese_job_v2_oscar-2301.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/japanese_job_v2_oscar-2301.yaml -------------------------------------------------------------------------------- /configs/japanese_job_v2_oscar2323_0-100.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/japanese_job_v2_oscar2323_0-100.yaml -------------------------------------------------------------------------------- /configs/korean_job.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/korean_job.yaml -------------------------------------------------------------------------------- /configs/sample_job.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/sample_job.yaml -------------------------------------------------------------------------------- /configs/thai_job.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/configs/thai_job.yaml -------------------------------------------------------------------------------- /datasets/japanese_test_dataset/0/oscar-2201_00000-0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/japanese_test_dataset/0/oscar-2201_00000-0.jsonl -------------------------------------------------------------------------------- /datasets/japanese_test_dataset/1/oscar-2201_00000-1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/japanese_test_dataset/1/oscar-2201_00000-1.jsonl -------------------------------------------------------------------------------- /datasets/test_ja_minhashlsh_dedup_job/0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_ja_minhashlsh_dedup_job/0.jsonl -------------------------------------------------------------------------------- /datasets/test_korean_jsonl_data/0/text_0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_korean_jsonl_data/0/text_0.jsonl -------------------------------------------------------------------------------- /datasets/test_korean_jsonl_data/0/text_1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_korean_jsonl_data/0/text_1.jsonl -------------------------------------------------------------------------------- /datasets/test_korean_jsonl_data/1/text_0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_korean_jsonl_data/1/text_0.jsonl -------------------------------------------------------------------------------- /datasets/test_korean_jsonl_data/1/text_1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_korean_jsonl_data/1/text_1.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/0/text_0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/0/text_0.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/0/text_1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/0/text_1.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/0/text_2.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/0/text_2.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/0/text_3.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/0/text_3.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/0/text_4.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/0/text_4.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/1/text_0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/1/text_0.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/1/text_1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/1/text_1.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/1/text_2.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/1/text_2.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/1/text_3.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/1/text_3.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/1/text_4.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/1/text_4.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/2/text_0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/2/text_0.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/2/text_1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/2/text_1.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/2/text_2.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/2/text_2.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/2/text_3.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/2/text_3.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/2/text_4.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/2/text_4.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/3/text_0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/3/text_0.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/3/text_1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/3/text_1.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/3/text_2.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/3/text_2.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/3/text_3.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/3/text_3.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/3/text_4.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/3/text_4.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/4/text_0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/4/text_0.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/4/text_1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/4/text_1.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/4/text_2.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/4/text_2.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/4/text_3.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/4/text_3.jsonl -------------------------------------------------------------------------------- /datasets/test_sample_jsonl_data/4/text_4.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_sample_jsonl_data/4/text_4.jsonl -------------------------------------------------------------------------------- /datasets/test_thai_jsonl_data/text.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/datasets/test_thai_jsonl_data/text.jsonl -------------------------------------------------------------------------------- /doc/adding-df-processors.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/doc/adding-df-processors.md -------------------------------------------------------------------------------- /doc/adding-udf-processors.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/doc/adding-udf-processors.md -------------------------------------------------------------------------------- /doc/dataframe.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/doc/dataframe.md -------------------------------------------------------------------------------- /doc/spark-session.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/doc/spark-session.md -------------------------------------------------------------------------------- /doc/udf/langfilter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/doc/udf/langfilter.md -------------------------------------------------------------------------------- /doc/udf/splitter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/doc/udf/splitter.md -------------------------------------------------------------------------------- /dps/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dps/spark/__init__.py: -------------------------------------------------------------------------------- 1 | from .run import run -------------------------------------------------------------------------------- /dps/spark/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dps/spark/jobs/chinese_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/jobs/chinese_job.py -------------------------------------------------------------------------------- /dps/spark/jobs/dedup_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/jobs/dedup_job.py -------------------------------------------------------------------------------- /dps/spark/jobs/indonesian_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/jobs/indonesian_job.py -------------------------------------------------------------------------------- /dps/spark/jobs/japanese_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/jobs/japanese_job.py -------------------------------------------------------------------------------- /dps/spark/jobs/japanese_minhash_dedup_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/jobs/japanese_minhash_dedup_job.py -------------------------------------------------------------------------------- /dps/spark/jobs/korean_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/jobs/korean_job.py -------------------------------------------------------------------------------- /dps/spark/jobs/romance_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/jobs/romance_job.py -------------------------------------------------------------------------------- /dps/spark/jobs/sample_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/jobs/sample_job.py -------------------------------------------------------------------------------- /dps/spark/jobs/thai_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/jobs/thai_job.py -------------------------------------------------------------------------------- /dps/spark/prep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dps/spark/prep/chinese_prep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/prep/chinese_prep.py -------------------------------------------------------------------------------- /dps/spark/prep/dedup_prep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/prep/dedup_prep.py -------------------------------------------------------------------------------- /dps/spark/prep/indonesia_prep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/prep/indonesia_prep.py -------------------------------------------------------------------------------- /dps/spark/prep/japanese_prep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/prep/japanese_prep.py -------------------------------------------------------------------------------- /dps/spark/prep/korean_prep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/prep/korean_prep.py -------------------------------------------------------------------------------- /dps/spark/prep/lang_agnostic_prep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/prep/lang_agnostic_prep.py -------------------------------------------------------------------------------- /dps/spark/prep/romance_prep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/prep/romance_prep.py -------------------------------------------------------------------------------- /dps/spark/prep/thai_prep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/prep/thai_prep.py -------------------------------------------------------------------------------- /dps/spark/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/run.py -------------------------------------------------------------------------------- /dps/spark/spark_session.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/spark_session.py -------------------------------------------------------------------------------- /dps/spark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dps/spark/utils/chinese_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/chinese_utils.py -------------------------------------------------------------------------------- /dps/spark/utils/indonesian_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/indonesian_utils.py -------------------------------------------------------------------------------- /dps/spark/utils/io_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/io_utils.py -------------------------------------------------------------------------------- /dps/spark/utils/japanese_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/japanese_utils.py -------------------------------------------------------------------------------- /dps/spark/utils/korean_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/korean_utils.py -------------------------------------------------------------------------------- /dps/spark/utils/lang_agnostic_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/lang_agnostic_utils.py -------------------------------------------------------------------------------- /dps/spark/utils/romance_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/romance_utils.py -------------------------------------------------------------------------------- /dps/spark/utils/stopwords_th.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/stopwords_th.txt -------------------------------------------------------------------------------- /dps/spark/utils/thai_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/thai_utils.py -------------------------------------------------------------------------------- /dps/spark/utils/token_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark/utils/token_utils.py -------------------------------------------------------------------------------- /dps/spark_df/__init__.py: -------------------------------------------------------------------------------- 1 | VERSION = "0.2.0" 2 | -------------------------------------------------------------------------------- /dps/spark_df/app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dps/spark_df/app/sparkapp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/app/sparkapp.py -------------------------------------------------------------------------------- /dps/spark_df/defs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/defs.py -------------------------------------------------------------------------------- /dps/spark_df/df/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dps/spark_df/dfprocessor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/dfprocessor.py -------------------------------------------------------------------------------- /dps/spark_df/process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/process.py -------------------------------------------------------------------------------- /dps/spark_df/udf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dps/spark_df/udf/langfilter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/udf/langfilter.py -------------------------------------------------------------------------------- /dps/spark_df/udf/splitter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/udf/splitter.py -------------------------------------------------------------------------------- /dps/spark_df/udfprocessor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/udfprocessor.py -------------------------------------------------------------------------------- /dps/spark_df/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dps/spark_df/utils/exception.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/utils/exception.py -------------------------------------------------------------------------------- /dps/spark_df/utils/io.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/utils/io.py -------------------------------------------------------------------------------- /dps/spark_df/utils/logging.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/utils/logging.py -------------------------------------------------------------------------------- /dps/spark_df/utils/misc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/utils/misc.py -------------------------------------------------------------------------------- /dps/spark_df/utils/spark_session_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/dps/spark_df/utils/spark_session_utils.py -------------------------------------------------------------------------------- /requirements-df.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/requirements-df.txt -------------------------------------------------------------------------------- /requirements-ja.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/requirements-ja.txt -------------------------------------------------------------------------------- /requirements-kor.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/requirements-kor.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/dps/HEAD/setup.py --------------------------------------------------------------------------------