├── .gitignore ├── LICENSE ├── README.MD ├── data_evals ├── README.md ├── __init__.py └── law │ ├── __init__.py │ ├── disc-law │ ├── __init__.py │ └── process_state1.py │ └── jecqa │ ├── __init__.py │ ├── process_state1.py │ ├── process_state2.py │ └── process_state3.py ├── data_pretrain └── __init__.py ├── data_sft_alpaca ├── amake_dataset_example │ ├── make_arrow_dataset.py │ ├── make_parquet_dataset.py │ ├── make_record_dataset.py │ └── make_record_dataset_with_crypt.py ├── load_datasets.py ├── make_alpaca │ └── make_parquet_dataset.py ├── make_alpaca_common │ └── make_parquet_dataset.py ├── make_alpaca_gpt4 │ └── make_parquet_dataset.py ├── make_belle │ └── make_parquet_dataset.py ├── make_firefly │ └── make_firefly_dataset.py ├── make_moss │ ├── make_parquet_moss_sft2.py │ └── make_parquet_moss_sft3.py ├── make_sharegpt │ └── make_parquet_dataset.py ├── make_tabular │ └── make_parquet_dataset.py └── make_ultrachat │ └── make_parquet_dataset.py ├── data_sft_tools ├── __init__.py ├── base │ ├── __init__.py │ └── tool_maker.py ├── glm3 │ ├── __init__.py │ ├── main.py │ └── tools_builder.py ├── qwen │ ├── __init__.py │ ├── custom_agent.py │ ├── custom_parser.py │ ├── main.py │ └── tools_builder.py ├── readme.md └── utils │ ├── __init__.py │ └── utils.py ├── docs ├── GLM.md ├── data.md ├── img │ ├── dataset_example_1.png │ ├── dataset_example_2.png │ ├── dataset_example_3.png │ ├── dataset_example_4.png │ ├── dataset_example_5.png │ ├── dataset_figure_0.png │ ├── dataset_figure_2.png │ ├── dataset_figure_3.png │ ├── dataset_figure_4.png │ ├── dataset_pipeline.png │ ├── glm_blank_filling.png │ ├── glm_example_1.png │ ├── glm_example_2.png │ ├── glm_example_3.png │ ├── glm_example_4.png │ ├── glm_example_5.png │ ├── glm_io_1.png │ ├── glm_io_2.png │ ├── glm_io_3.png │ ├── glm_io_4.png │ ├── glm_performance.png │ ├── glm_results2.png │ ├── gpt2_writing_model.png │ ├── pet_example.png │ ├── predictor_map.png │ ├── prompt_figure_1.png │ ├── prompt_figure_2.png │ ├── semantic_matching_model.png │ ├── tokenize.png │ ├── tokenizer_example_1.png │ └── transformer.png └── tokenization.md └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/LICENSE -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/README.MD -------------------------------------------------------------------------------- /data_evals/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_evals/README.md -------------------------------------------------------------------------------- /data_evals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_evals/__init__.py -------------------------------------------------------------------------------- /data_evals/law/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_evals/law/__init__.py -------------------------------------------------------------------------------- /data_evals/law/disc-law/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_evals/law/disc-law/__init__.py -------------------------------------------------------------------------------- /data_evals/law/disc-law/process_state1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_evals/law/disc-law/process_state1.py -------------------------------------------------------------------------------- /data_evals/law/jecqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_evals/law/jecqa/__init__.py -------------------------------------------------------------------------------- /data_evals/law/jecqa/process_state1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_evals/law/jecqa/process_state1.py -------------------------------------------------------------------------------- /data_evals/law/jecqa/process_state2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_evals/law/jecqa/process_state2.py -------------------------------------------------------------------------------- /data_evals/law/jecqa/process_state3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_evals/law/jecqa/process_state3.py -------------------------------------------------------------------------------- /data_pretrain/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_pretrain/__init__.py -------------------------------------------------------------------------------- /data_sft_alpaca/amake_dataset_example/make_arrow_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/amake_dataset_example/make_arrow_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/amake_dataset_example/make_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/amake_dataset_example/make_parquet_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/amake_dataset_example/make_record_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/amake_dataset_example/make_record_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/amake_dataset_example/make_record_dataset_with_crypt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/amake_dataset_example/make_record_dataset_with_crypt.py -------------------------------------------------------------------------------- /data_sft_alpaca/load_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/load_datasets.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_alpaca/make_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_alpaca/make_parquet_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_alpaca_common/make_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_alpaca_common/make_parquet_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_alpaca_gpt4/make_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_alpaca_gpt4/make_parquet_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_belle/make_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_belle/make_parquet_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_firefly/make_firefly_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_firefly/make_firefly_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_moss/make_parquet_moss_sft2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_moss/make_parquet_moss_sft2.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_moss/make_parquet_moss_sft3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_moss/make_parquet_moss_sft3.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_sharegpt/make_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_sharegpt/make_parquet_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_tabular/make_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_tabular/make_parquet_dataset.py -------------------------------------------------------------------------------- /data_sft_alpaca/make_ultrachat/make_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_alpaca/make_ultrachat/make_parquet_dataset.py -------------------------------------------------------------------------------- /data_sft_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/__init__.py -------------------------------------------------------------------------------- /data_sft_tools/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/base/__init__.py -------------------------------------------------------------------------------- /data_sft_tools/base/tool_maker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/base/tool_maker.py -------------------------------------------------------------------------------- /data_sft_tools/glm3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/glm3/__init__.py -------------------------------------------------------------------------------- /data_sft_tools/glm3/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/glm3/main.py -------------------------------------------------------------------------------- /data_sft_tools/glm3/tools_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/glm3/tools_builder.py -------------------------------------------------------------------------------- /data_sft_tools/qwen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/qwen/__init__.py -------------------------------------------------------------------------------- /data_sft_tools/qwen/custom_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/qwen/custom_agent.py -------------------------------------------------------------------------------- /data_sft_tools/qwen/custom_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/qwen/custom_parser.py -------------------------------------------------------------------------------- /data_sft_tools/qwen/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/qwen/main.py -------------------------------------------------------------------------------- /data_sft_tools/qwen/tools_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/qwen/tools_builder.py -------------------------------------------------------------------------------- /data_sft_tools/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/readme.md -------------------------------------------------------------------------------- /data_sft_tools/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/utils/__init__.py -------------------------------------------------------------------------------- /data_sft_tools/utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/data_sft_tools/utils/utils.py -------------------------------------------------------------------------------- /docs/GLM.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/GLM.md -------------------------------------------------------------------------------- /docs/data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/data.md -------------------------------------------------------------------------------- /docs/img/dataset_example_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_example_1.png -------------------------------------------------------------------------------- /docs/img/dataset_example_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_example_2.png -------------------------------------------------------------------------------- /docs/img/dataset_example_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_example_3.png -------------------------------------------------------------------------------- /docs/img/dataset_example_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_example_4.png -------------------------------------------------------------------------------- /docs/img/dataset_example_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_example_5.png -------------------------------------------------------------------------------- /docs/img/dataset_figure_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_figure_0.png -------------------------------------------------------------------------------- /docs/img/dataset_figure_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_figure_2.png -------------------------------------------------------------------------------- /docs/img/dataset_figure_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_figure_3.png -------------------------------------------------------------------------------- /docs/img/dataset_figure_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_figure_4.png -------------------------------------------------------------------------------- /docs/img/dataset_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/dataset_pipeline.png -------------------------------------------------------------------------------- /docs/img/glm_blank_filling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_blank_filling.png -------------------------------------------------------------------------------- /docs/img/glm_example_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_example_1.png -------------------------------------------------------------------------------- /docs/img/glm_example_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_example_2.png -------------------------------------------------------------------------------- /docs/img/glm_example_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_example_3.png -------------------------------------------------------------------------------- /docs/img/glm_example_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_example_4.png -------------------------------------------------------------------------------- /docs/img/glm_example_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_example_5.png -------------------------------------------------------------------------------- /docs/img/glm_io_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_io_1.png -------------------------------------------------------------------------------- /docs/img/glm_io_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_io_2.png -------------------------------------------------------------------------------- /docs/img/glm_io_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_io_3.png -------------------------------------------------------------------------------- /docs/img/glm_io_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_io_4.png -------------------------------------------------------------------------------- /docs/img/glm_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_performance.png -------------------------------------------------------------------------------- /docs/img/glm_results2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/glm_results2.png -------------------------------------------------------------------------------- /docs/img/gpt2_writing_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/gpt2_writing_model.png -------------------------------------------------------------------------------- /docs/img/pet_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/pet_example.png -------------------------------------------------------------------------------- /docs/img/predictor_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/predictor_map.png -------------------------------------------------------------------------------- /docs/img/prompt_figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/prompt_figure_1.png -------------------------------------------------------------------------------- /docs/img/prompt_figure_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/prompt_figure_2.png -------------------------------------------------------------------------------- /docs/img/semantic_matching_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/semantic_matching_model.png -------------------------------------------------------------------------------- /docs/img/tokenize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/tokenize.png -------------------------------------------------------------------------------- /docs/img/tokenizer_example_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/tokenizer_example_1.png -------------------------------------------------------------------------------- /docs/img/transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/img/transformer.png -------------------------------------------------------------------------------- /docs/tokenization.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssbuild/aigc_data/HEAD/docs/tokenization.md -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastcrypto 2 | fastdatasets>=0.9.17 3 | tqdm --------------------------------------------------------------------------------