├── .github ├── ISSUE_TEMPLATE │ ├── 1-add_documentation_report.yml │ ├── 2-bug_python.yml │ ├── 3-feature_request.yml │ ├── 4-blank-issue-template.md │ └── config.yml └── workflows │ ├── codspeed.yml │ ├── docs-pr-close.yml │ ├── docs-pr.yml │ ├── docs.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── LICENSE_HEADER ├── Makefile ├── README.md ├── docs ├── CNAME ├── api │ ├── cli.md │ ├── distiset.md │ ├── errors.md │ ├── exceptions.md │ ├── mixins │ │ ├── requirements.md │ │ └── runtime_parameters.md │ ├── models │ │ ├── embedding │ │ │ ├── embedding_gallery.md │ │ │ └── index.md │ │ ├── image_generation │ │ │ ├── image_generation_gallery.md │ │ │ └── index.md │ │ └── llm │ │ │ ├── index.md │ │ │ └── llm_gallery.md │ ├── pipeline │ │ ├── index.md │ │ ├── routing_batch_function.md │ │ └── step_wrapper.md │ ├── step │ │ ├── decorator.md │ │ ├── generator_step.md │ │ ├── global_step.md │ │ ├── index.md │ │ └── resources.md │ ├── step_gallery │ │ ├── argilla.md │ │ ├── columns.md │ │ ├── extra.md │ │ └── hugging_face.md │ ├── task │ │ ├── generator_task.md │ │ ├── image_task.md │ │ ├── index.md │ │ └── task_gallery.md │ └── typing.md ├── assets │ ├── distilabel-badge-dark.png │ ├── distilabel-badge-light.png │ ├── distilabel-black.png │ ├── distilabel-black.svg │ ├── distilabel-icon.svg │ ├── distilabel-white.png │ ├── distilabel-white.svg │ ├── images │ │ ├── distilabel-diagram-dark.svg │ │ ├── distilabel-diagram.svg │ │ └── sections │ │ │ ├── caching │ │ │ ├── caching_1.png │ │ │ └── caching_2.png │ │ │ ├── cli │ │ │ └── cli_pipe.png │ │ │ ├── community │ │ │ ├── compare-pull-request.PNG │ │ │ ├── create-branch.PNG │ │ │ └── edit-file.PNG │ │ │ ├── examples │ │ │ └── knowledge-graph-example.png │ │ │ └── how_to_guides │ │ │ ├── basic │ │ │ └── pipeline.png │ │ │ ├── steps │ │ │ └── argilla │ │ │ │ ├── preference.png │ │ │ │ └── text_generation.png │ │ │ └── tasks │ │ │ └── task_print.png │ ├── logo.svg │ 
├── pipelines │ │ ├── arena-hard.png │ │ ├── clair.png │ │ ├── clean-dataset.png │ │ ├── deepseek.png │ │ ├── deita.png │ │ ├── generate-preference-dataset.png │ │ ├── instruction_backtranslation.png │ │ ├── knowledge_graphs.png │ │ ├── prometheus.png │ │ ├── sentence-transformer.png │ │ └── ultrafeedback.png │ └── tutorials-assets │ │ ├── deepseek_prover.png │ │ ├── deita │ │ ├── datasets.png │ │ ├── diversity.png │ │ ├── overview.png │ │ └── results.png │ │ ├── instrucion_dataset_notus_ui.png │ │ ├── math-sheperd.png │ │ ├── overview-apigen.jpg │ │ ├── preference_dataset_notus_ui.png │ │ └── wiki_transfer_learning.png ├── index.md ├── scripts │ ├── gen_popular_issues.py │ └── gen_ref_pages.py ├── sections │ ├── community │ │ ├── contributor.md │ │ ├── developer_documentation.md │ │ └── index.md │ ├── getting_started │ │ ├── faq.md │ │ ├── installation.md │ │ └── quickstart.md │ ├── how_to_guides │ │ ├── advanced │ │ │ ├── argilla.md │ │ │ ├── assigning_resources_to_step.md │ │ │ ├── caching.md │ │ │ ├── cli │ │ │ │ └── index.md │ │ │ ├── distiset.md │ │ │ ├── fs_to_pass_data.md │ │ │ ├── load_groups_and_execution_stages.md │ │ │ ├── offline_batch_generation.md │ │ │ ├── pipeline_requirements.md │ │ │ ├── saving_step_generated_artifacts.md │ │ │ ├── scaling_with_ray.md │ │ │ ├── serving_an_llm_for_reuse.md │ │ │ └── structured_generation.md │ │ ├── basic │ │ │ ├── llm │ │ │ │ └── index.md │ │ │ ├── pipeline │ │ │ │ └── index.md │ │ │ ├── step │ │ │ │ ├── generator_step.md │ │ │ │ ├── global_step.md │ │ │ │ └── index.md │ │ │ └── task │ │ │ │ ├── generator_task.md │ │ │ │ ├── image_task.md │ │ │ │ └── index.md │ │ └── index.md │ └── pipeline_samples │ │ ├── examples │ │ ├── benchmarking_with_distilabel.md │ │ ├── exam_questions.md │ │ ├── fine_personas_social_network.md │ │ ├── image_generation.md │ │ ├── llama_cpp_with_outlines.md │ │ ├── mistralai_with_instructor.md │ │ └── text_generation_with_image.md │ │ ├── index.md │ │ ├── papers │ │ ├── apigen.md │ │ ├── 
clair.md │ │ ├── deepseek_prover.md │ │ ├── deita.md │ │ ├── instruction_backtranslation.md │ │ ├── math_shepherd.md │ │ ├── prometheus.md │ │ └── ultrafeedback.md │ │ └── tutorials │ │ ├── GenerateSentencePair.ipynb │ │ ├── clean_existing_dataset.ipynb │ │ ├── generate_preference_dataset.ipynb │ │ └── generate_textcat_dataset.ipynb └── stylesheets │ ├── extra.css │ └── fonts │ ├── FontAwesome.otf │ ├── fontawesome-webfont.eot │ ├── fontawesome-webfont.svg │ ├── fontawesome-webfont.ttf │ ├── fontawesome-webfont.woff │ └── fontawesome-webfont.woff2 ├── examples ├── arena_hard.py ├── deepseek_prover.py ├── draw_kg.py ├── exam_questions.py ├── finepersonas_social_ai.py ├── image_generation.py ├── lib_apigen.py ├── pipe_math_shepherd.py ├── pipeline_apigen.py ├── structured_generation_with_instructor.py ├── structured_generation_with_outlines.py └── text_generation_with_image.py ├── mkdocs.yml ├── pyproject.toml ├── scripts ├── install_cpu_vllm.sh ├── install_dependencies.sh └── install_docs_dependencies.sh ├── src └── distilabel │ ├── __init__.py │ ├── __main__.py │ ├── cli │ ├── __init__.py │ ├── app.py │ └── pipeline │ │ ├── __init__.py │ │ ├── app.py │ │ └── utils.py │ ├── constants.py │ ├── distiset.py │ ├── embeddings.py │ ├── envs.py │ ├── errors.py │ ├── exceptions.py │ ├── llms.py │ ├── mixins │ ├── __init__.py │ ├── requirements.py │ ├── runtime_parameters.py │ └── signature.py │ ├── models │ ├── __init__.py │ ├── base_clients │ │ ├── __init__.py │ │ ├── inference_endpoints.py │ │ └── openai.py │ ├── embeddings │ │ ├── __init__.py │ │ ├── base.py │ │ ├── llamacpp.py │ │ ├── sentence_transformers.py │ │ └── vllm.py │ ├── image_generation │ │ ├── __init__.py │ │ ├── base.py │ │ ├── huggingface │ │ │ ├── __init__.py │ │ │ └── inference_endpoints.py │ │ ├── openai.py │ │ └── utils.py │ ├── llms │ │ ├── __init__.py │ │ ├── anthropic.py │ │ ├── anyscale.py │ │ ├── azure.py │ │ ├── base.py │ │ ├── cohere.py │ │ ├── groq.py │ │ ├── huggingface │ │ │ ├── __init__.py │ 
│ │ ├── inference_endpoints.py │ │ │ └── transformers.py │ │ ├── litellm.py │ │ ├── llamacpp.py │ │ ├── mistral.py │ │ ├── mlx.py │ │ ├── moa.py │ │ ├── ollama.py │ │ ├── openai.py │ │ ├── together.py │ │ ├── utils.py │ │ ├── vertexai.py │ │ └── vllm.py │ └── mixins │ │ ├── __init__.py │ │ ├── cuda_device_placement.py │ │ └── magpie.py │ ├── pipeline │ ├── __init__.py │ ├── _dag.py │ ├── base.py │ ├── batch.py │ ├── batch_manager.py │ ├── local.py │ ├── ray.py │ ├── routing_batch_function.py │ ├── step_wrapper.py │ ├── templates │ │ ├── __init__.py │ │ └── instruction.py │ └── write_buffer.py │ ├── steps │ ├── __init__.py │ ├── argilla │ │ ├── __init__.py │ │ ├── base.py │ │ ├── preference.py │ │ └── text_generation.py │ ├── base.py │ ├── clustering │ │ ├── __init__.py │ │ ├── dbscan.py │ │ ├── text_clustering.py │ │ └── umap.py │ ├── columns │ │ ├── __init__.py │ │ ├── combine.py │ │ ├── expand.py │ │ ├── group.py │ │ ├── keep.py │ │ ├── merge.py │ │ └── utils.py │ ├── decorator.py │ ├── deita.py │ ├── embeddings │ │ ├── __init__.py │ │ ├── embedding_generation.py │ │ └── nearest_neighbour.py │ ├── filtering │ │ ├── __init__.py │ │ ├── _datasketch.py │ │ ├── embedding.py │ │ └── minhash.py │ ├── formatting │ │ ├── __init__.py │ │ ├── conversation.py │ │ ├── dpo.py │ │ └── sft.py │ ├── generators │ │ ├── __init__.py │ │ ├── data.py │ │ ├── data_sampler.py │ │ ├── huggingface.py │ │ └── utils.py │ ├── globals │ │ ├── __init__.py │ │ └── huggingface.py │ ├── reward_model.py │ ├── tasks │ │ ├── __init__.py │ │ ├── apigen │ │ │ ├── __init__.py │ │ │ ├── execution_checker.py │ │ │ ├── generator.py │ │ │ ├── semantic_checker.py │ │ │ └── utils.py │ │ ├── argilla_labeller.py │ │ ├── base.py │ │ ├── clair.py │ │ ├── complexity_scorer.py │ │ ├── decorator.py │ │ ├── evol_instruct │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── english_nouns.txt │ │ │ ├── evol_complexity │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── generator.py │ │ │ │ └── utils.py │ │ │ ├── 
generator.py │ │ │ └── utils.py │ │ ├── evol_quality │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── utils.py │ │ ├── generate_embeddings.py │ │ ├── genstruct.py │ │ ├── image_generation.py │ │ ├── improving_text_embeddings.py │ │ ├── instruction_backtranslation.py │ │ ├── magpie │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── generator.py │ │ ├── math_shepherd │ │ │ ├── __init__.py │ │ │ ├── completer.py │ │ │ ├── generator.py │ │ │ └── utils.py │ │ ├── pair_rm.py │ │ ├── prometheus_eval.py │ │ ├── quality_scorer.py │ │ ├── self_instruct.py │ │ ├── sentence_transformers.py │ │ ├── structured_generation.py │ │ ├── structured_outputs │ │ │ ├── __init__.py │ │ │ ├── instructor.py │ │ │ ├── outlines.py │ │ │ └── utils.py │ │ ├── templates │ │ │ ├── apigen │ │ │ │ ├── generator.jinja2 │ │ │ │ └── semantic_checker.jinja2 │ │ │ ├── argillalabeller.jinja2 │ │ │ ├── clair.jinja2 │ │ │ ├── complexity-scorer.jinja2 │ │ │ ├── generate-sentence-pair.jinja2 │ │ │ ├── genstruct.jinja2 │ │ │ ├── improving_text_embeddings │ │ │ │ ├── bitext-retrieval.jinja2 │ │ │ │ ├── brainstorming │ │ │ │ │ ├── text-classification.jinja2 │ │ │ │ │ ├── text-matching-long.jinja2 │ │ │ │ │ ├── text-matching-short.jinja2 │ │ │ │ │ └── text-retrieval.jinja2 │ │ │ │ ├── long-text-matching.jinja2 │ │ │ │ ├── monolingual-triplet.jinja2 │ │ │ │ ├── short-text-matching.jinja2 │ │ │ │ ├── text-classification.jinja2 │ │ │ │ └── text-retrieval.jinja2 │ │ │ ├── instruction-backtranslation.jinja2 │ │ │ ├── prometheus │ │ │ │ ├── absolute_with_reference.jinja2 │ │ │ │ ├── absolute_without_reference.jinja2 │ │ │ │ ├── relative_with_reference.jinja2 │ │ │ │ └── relative_without_reference.jinja2 │ │ │ ├── quality-scorer.jinja2 │ │ │ ├── self-instruct.jinja2 │ │ │ ├── ultrafeedback │ │ │ │ ├── helpfulness.jinja2 │ │ │ │ ├── honesty.jinja2 │ │ │ │ ├── instruction-following.jinja2 │ │ │ │ ├── overall-rating.jinja2 │ │ │ │ └── truthfulness.jinja2 │ │ │ └── urial.jinja2 │ │ ├── text_classification.py │ │ ├── 
text_generation.py │ │ ├── text_generation_with_image.py │ │ ├── ultrafeedback.py │ │ └── urial.py │ └── truncate.py │ ├── typing │ ├── __init__.py │ ├── base.py │ ├── models.py │ ├── pipeline.py │ └── steps.py │ └── utils │ ├── __init__.py │ ├── card │ ├── __init__.py │ ├── dataset_card.py │ └── distilabel_template.md │ ├── chat.py │ ├── dicts.py │ ├── docstring.py │ ├── export_components_info.py │ ├── files.py │ ├── huggingface.py │ ├── image.py │ ├── itertools.py │ ├── lists.py │ ├── logging.py │ ├── mkdocs │ ├── __init__.py │ ├── components_gallery.py │ └── templates │ │ └── components-gallery │ │ ├── components-list.jinja2 │ │ ├── index.md │ │ ├── llm-detail.jinja2 │ │ └── step-detail.jinja2 │ ├── notebook.py │ ├── ray.py │ ├── requirements.py │ ├── serialization.py │ ├── template.py │ └── typing_.py └── tests ├── __init__.py ├── conftest.py ├── integration ├── __init__.py ├── conftest.py ├── test_branching_missaligmnent.py ├── test_cache.py ├── test_caching_steps.py ├── test_dataset_without_step.py ├── test_deduplication.py ├── test_dry_run.py ├── test_embedding_dedup.py ├── test_generator_and_sampler.py ├── test_load_groups.py ├── test_load_stages.py ├── test_multiple_replicas.py ├── test_offline_batch_generation.py ├── test_pipe_llms.py ├── test_pipe_simple.py ├── test_prints.py ├── test_ray_pipeline.py ├── test_routing_batch_function.py └── test_using_fs_to_pass_data.py └── unit ├── __init__.py ├── cli ├── __init__.py ├── pipeline │ ├── __init__.py │ ├── test_app.py │ └── utils.py ├── test_pipeline.yaml └── utils.py ├── conftest.py ├── helpers.py ├── mixins ├── __init__.py └── test_runtime_parameters.py ├── models ├── __init__.py ├── embeddings │ ├── __init__.py │ ├── test_llamacpp.py │ ├── test_sentence_transformers.py │ └── test_vllm.py ├── image_generation │ ├── __init__.py │ ├── huggingface │ │ ├── __init__.py │ │ └── test_inference_endpoints.py │ └── test_openai.py ├── llms │ ├── __init__.py │ ├── huggingface │ │ ├── __init__.py │ │ ├── 
test_inference_endpoints.py │ │ └── test_transformers.py │ ├── test_anthropic.py │ ├── test_anyscale.py │ ├── test_azure.py │ ├── test_base.py │ ├── test_cohere.py │ ├── test_groq.py │ ├── test_litellm.py │ ├── test_llamacpp.py │ ├── test_mistral.py │ ├── test_mlx.py │ ├── test_moa.py │ ├── test_ollama.py │ ├── test_openai.py │ ├── test_together.py │ ├── test_vertexai.py │ ├── test_vllm.py │ └── utils.py └── mixins │ ├── __init__.py │ ├── test_cuda_device_placement.py │ └── test_magpie.py ├── pipeline ├── __init__.py ├── conftest.py ├── test_base.py ├── test_batch.py ├── test_batch_manager.py ├── test_dag.py ├── test_local.py ├── test_ray.py ├── test_routing_batch_function.py ├── test_write_buffer.py └── utils.py ├── steps ├── __init__.py ├── argilla │ ├── __init__.py │ ├── test_base.py │ ├── test_preference.py │ └── test_text_generation.py ├── clustering │ ├── __init__.py │ ├── test_dbscan.py │ ├── test_text_clustering.py │ └── test_umap.py ├── columns │ ├── __init__.py │ ├── test_combine.py │ ├── test_expand.py │ ├── test_group.py │ ├── test_keep.py │ ├── test_merge.py │ └── test_utils.py ├── embeddings │ ├── __init__.py │ ├── test_embedding_generation.py │ └── test_nearest_neighbour.py ├── filtering │ ├── __init__.py │ ├── test_embeddings.py │ └── test_minhash.py ├── formatting │ ├── __init__.py │ ├── test_conversation.py │ ├── test_dpo.py │ └── test_sft.py ├── generators │ ├── sample_functions.jsonl │ ├── test_data.py │ ├── test_data_sampler.py │ ├── test_huggingface.py │ └── test_utils.py ├── tasks │ ├── __init__.py │ ├── apigen │ │ ├── __init__.py │ │ ├── _sample_lib │ │ │ ├── final_velocity.py │ │ │ └── get_value.py │ │ ├── _sample_module.py │ │ ├── test_execution_checker.py │ │ ├── test_generator.py │ │ ├── test_semantic_checker.py │ │ └── test_utils.py │ ├── evol_instruct │ │ ├── __init__.py │ │ ├── evol_complexity.py │ │ │ ├── __init__.py │ │ │ ├── test_base.py │ │ │ └── test_generator.py │ │ ├── test_base.py │ │ └── test_generator.py │ ├── evol_quality │ 
│ ├── __init__.py │ │ └── test_base.py │ ├── magpie │ │ ├── __init__.py │ │ ├── test_base.py │ │ └── test_generator.py │ ├── math_shepherd │ │ ├── __init__.py │ │ ├── test_completer.py │ │ ├── test_generator.py │ │ └── test_utils.py │ ├── structured_outputs │ │ ├── __init__.py │ │ ├── test_outlines.py │ │ └── test_utils.py │ ├── test_argilla_labeller.py │ ├── test_base.py │ ├── test_clair.py │ ├── test_complexity_scorer.py │ ├── test_decorator.py │ ├── test_generate_embeddings.py │ ├── test_genstruct.py │ ├── test_image_generation.py │ ├── test_improving_text_embeddings.py │ ├── test_instruction_backtranslation.py │ ├── test_pair_rm.py │ ├── test_prometheus_eval.py │ ├── test_quality_scorer.py │ ├── test_self_instruct.py │ ├── test_sentence_transformers.py │ ├── test_structured_generation.py │ ├── test_text_classification.py │ ├── test_text_generation.py │ ├── test_text_generation_with_image.py │ ├── test_ultrafeedback.py │ └── test_urial.py ├── test_base.py ├── test_decorator.py ├── test_deita.py ├── test_reward_model.py └── test_truncate.py ├── test_distiset.py ├── test_errors.py ├── test_imports.py └── utils ├── __init__.py ├── test_chat.py ├── test_docstring.py ├── test_files.py ├── test_lists.py ├── test_ray.py ├── test_requirements.py ├── test_serialization.py └── test_typing.py /.github/ISSUE_TEMPLATE/1-add_documentation_report.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F4DA Add a documentation report" 2 | description: "Have you spotted a typo or mistake in our docs?" 3 | title: "[DOCS]" 4 | labels: ["documentation"] 5 | assignees: [] 6 | 7 | body: 8 | - type: markdown 9 | attributes: 10 | value: "Thank you for reporting a documentation mistake! Before you get started, please [search to see](https://github.com/argilla-io/distilabel/issues) if an issue already exists for the bug you encountered." 
11 | 12 | - type: textarea 13 | id: doc_report 14 | attributes: 15 | label: "Which page or section is this issue related to?" 16 | description: "Please include the URL and/or source." 17 | validations: 18 | required: false 19 | 20 | - type: textarea 21 | id: doc_review 22 | attributes: 23 | label: "What are you documenting, or what change are you making in the documentation?" 24 | description: "If a documentation needs to be created, please specify its coverage.\n If there's a typo or something needs revisiting, please indicate it and show code/text/screenshots." 25 | validations: 26 | required: false 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-feature_request.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F195 Feature request" 2 | description: "Share cool new ideas for the project." 3 | title: "[FEATURE]" 4 | labels: ["enhancement", "ml-internal"] 5 | assignees: [] 6 | 7 | 8 | body: 9 | - type: markdown 10 | attributes: 11 | value: "Thank you for sharing your feature request! Please fill out the sections below." 12 | 13 | - type: textarea 14 | id: feature_request 15 | attributes: 16 | label: "Is your feature request related to a problem? Please describe." 17 | description: "A clear and concise description of what the problem is." 18 | placeholder: "I'm always frustrated when..." 19 | validations: 20 | required: false 21 | 22 | - type: textarea 23 | id: feature_description 24 | attributes: 25 | label: "Describe the solution you'd like" 26 | description: "A clear and concise description of what you want to happen." 27 | validations: 28 | required: false 29 | 30 | - type: textarea 31 | id: feature_alternatives 32 | attributes: 33 | label: "Describe alternatives you've considered" 34 | description: "A clear and concise description of any alternative solutions or features you've considered." 
35 | validations: 36 | required: false 37 | 38 | - type: textarea 39 | id: additional_context 40 | attributes: 41 | label: "Additional context" 42 | description: "Add any other context or screenshots about the feature request here." 43 | validations: 44 | required: false 45 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/4-blank-issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Blank issue template 3 | about: A template for all other issues. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: 🗯 Community Discussions 4 | url: http://hf.co/join/discord 5 | about: Our Discord Community loves to discuss distilabel and NLP topics 6 | -------------------------------------------------------------------------------- /.github/workflows/codspeed.yml: -------------------------------------------------------------------------------- 1 | name: Benchmarks 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | - "develop" 8 | pull_request: 9 | 10 | concurrency: 11 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 12 | cancel-in-progress: true 13 | 14 | jobs: 15 | benchmarks: 16 | runs-on: ubuntu-22.04 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Setup Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: "3.12" 24 | # Looks like it's not working very well for other people: 25 | # https://github.com/actions/setup-python/issues/436 26 | # cache: "pip" 27 | # cache-dependency-path: pyproject.toml 28 | 29 | - uses: actions/cache@v4 30 | id: cache 31 | with: 32 | path: ${{ env.pythonLocation }} 33 | key: ${{ 
runner.os }}-python-${{ env.pythonLocation }}-${{ hashFiles('pyproject.toml') }}-benchmarks-v00 34 | 35 | - name: Install dependencies 36 | if: steps.cache.outputs.cache-hit != 'true' 37 | run: ./scripts/install_dependencies.sh 38 | 39 | - name: Run benchmarks 40 | uses: CodSpeedHQ/action@v3 41 | with: 42 | token: ${{ secrets.CODSPEED_TOKEN }} 43 | run: pytest tests/ --codspeed 44 | -------------------------------------------------------------------------------- /.github/workflows/docs-pr-close.yml: -------------------------------------------------------------------------------- 1 | name: Clean up PR documentation 2 | 3 | on: 4 | pull_request: 5 | types: [closed] 6 | 7 | concurrency: 8 | group: distilabel-docs 9 | cancel-in-progress: false 10 | 11 | permissions: 12 | contents: write 13 | pull-requests: write 14 | 15 | jobs: 16 | cleanup: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout merged branch 20 | uses: actions/checkout@v4 21 | with: 22 | ref: ${{ github.event.pull_request.base.ref }} 23 | fetch-depth: 0 24 | 25 | - name: Setup Python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: "3.11" 29 | 30 | - name: Install dependencies 31 | run: ./scripts/install_docs_dependencies.sh 32 | 33 | - name: Set git credentials 34 | run: | 35 | git config --global user.name "${{ github.actor }}" 36 | git config --global user.email "${{ github.actor }}@users.noreply.github.com" 37 | 38 | - name: Remove PR documentation 39 | run: | 40 | PR_NUMBER=${{ github.event.pull_request.number }} 41 | mike delete pr-$PR_NUMBER --push 42 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Publish documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - develop 7 | - gh-pages 8 | tags: 9 | - "**" 10 | 11 | concurrency: 12 | group: distilabel-docs 13 | cancel-in-progress: false 14 | 15 | permissions: 16 | 
contents: write 17 | pull-requests: write 18 | 19 | jobs: 20 | publish: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: checkout docs-site 24 | uses: actions/checkout@v4 25 | with: 26 | ref: gh-pages 27 | 28 | - uses: actions/checkout@v4 29 | 30 | - name: Setup Python 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: "3.11" 34 | 35 | - uses: actions/cache@v4 36 | id: cache 37 | with: 38 | path: ${{ env.pythonLocation }} 39 | key: ${{ runner.os }}-python-${{ env.pythonLocation }}-${{ hashFiles('pyproject.toml') }}-docs-v00 40 | 41 | - name: Install dependencies 42 | if: steps.cache.outputs.cache-hit != 'true' 43 | run: ./scripts/install_docs_dependencies.sh 44 | 45 | - name: Check no warnings 46 | run: mkdocs build --strict 47 | 48 | - name: Set git credentials 49 | run: | 50 | git config --global user.name "${{ github.actor }}" 51 | git config --global user.email "${{ github.actor }}@users.noreply.github.com" 52 | 53 | - run: mike deploy dev --push 54 | if: github.ref == 'refs/heads/develop' 55 | env: 56 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 57 | 58 | - run: mike deploy ${{ github.ref_name }} latest --update-aliases --push 59 | if: startsWith(github.ref, 'refs/tags/') 60 | env: 61 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 62 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: 6 | - published 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Setup Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: "3.11" 18 | # Looks like it's not working very well for other people: 19 | # https://github.com/actions/setup-python/issues/436 20 | # cache: "pip" 21 | # cache-dependency-path: pyproject.toml 22 | 23 | - uses: actions/cache@v3 24 | id: cache 25 | with: 26 | 
path: ${{ env.pythonLocation }} 27 | key: ${{ runner.os }}-python-${{ env.pythonLocation }}-${{ hashFiles('pyproject.toml') }}-release-v00 28 | 29 | - name: Install dependencies 30 | if: steps.cache.outputs.cache-hit != 'true' 31 | run: pip install build 32 | 33 | - name: Build distribution 34 | run: python -m build 35 | 36 | - name: Publish 37 | uses: pypa/gh-action-pypi-publish@release/v1 38 | with: 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - develop 8 | pull_request: 9 | types: 10 | - opened 11 | - synchronize 12 | workflow_dispatch: 13 | inputs: 14 | tmate_session: 15 | description: Starts the workflow with tmate enabled. 16 | required: false 17 | default: "false" 18 | 19 | concurrency: 20 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 21 | cancel-in-progress: true 22 | 23 | jobs: 24 | test: 25 | runs-on: ubuntu-latest 26 | strategy: 27 | matrix: 28 | python-version: ["3.9", "3.10", "3.11", "3.12"] 29 | fail-fast: false 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | 34 | - name: Setup Python 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | # Looks like it's not working very well for other people: 39 | # https://github.com/actions/setup-python/issues/436 40 | # cache: "pip" 41 | # cache-dependency-path: pyproject.toml 42 | 43 | - uses: actions/cache@v4 44 | id: cache 45 | with: 46 | path: ${{ env.pythonLocation }} 47 | key: ${{ runner.os }}-python-${{ env.pythonLocation }}-${{ hashFiles('pyproject.toml') }}-test-v00 48 | 49 | - name: Install dependencies 50 | if: steps.cache.outputs.cache-hit != 'true' 51 | run: ./scripts/install_dependencies.sh 52 | 53 | - name: Setup tmate session 54 | uses: 
mxschmitt/action-tmate@v3 55 | if: ${{ matrix.python-version == '3.12' && github.event_name == 'workflow_dispatch' && inputs.tmate_session }} 56 | with: 57 | limit-access-to-actor: true 58 | 59 | - name: Lint 60 | run: make lint 61 | 62 | - name: Unit Tests 63 | run: make unit-tests 64 | 65 | - name: Integration Tests 66 | run: make integration-tests 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | share/python-wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | 26 | # IDEs and editors 27 | .idea/ 28 | .vscode/ 29 | *.sublime-project 30 | *.sublime-workspace 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .nox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | *.py,cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | cover/ 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # Jupyter Notebook 55 | .ipynb_checkpoints 56 | 57 | # pyenv 58 | # For a library or package, you might want to ignore these files since the code is 59 | # intended to run in multiple environments; otherwise, check them in: 60 | .python-version 61 | 62 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 63 | __pypackages__/ 64 | 65 | # Environments 66 | .env 67 | .venv 68 | env/ 69 | venv/ 70 | ENV/ 71 | env.bak/ 72 | venv.bak/ 73 | 74 | # mkdocs documentation 75 | /site 76 | 77 | # Other 78 | *.log 79 | *.swp 80 | .DS_Store 81 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/Lucas-C/pre-commit-hooks 3 | rev: v1.5.5 4 | hooks: 5 | - id: insert-license 6 | name: "Insert license header in Python source files" 7 | files: \.py$ 8 | args: 9 | - --license-filepath 10 | - LICENSE_HEADER 11 | - --fuzzy-match-generates-todo 12 | 13 | - repo: https://github.com/astral-sh/ruff-pre-commit 14 | rev: v0.8.1 15 | hooks: 16 | - id: ruff 17 | args: [--fix] 18 | - id: ruff-format 19 | 20 | ci: 21 | autofix_commit_msg: | 22 | [pre-commit.ci] auto fixes from pre-commit.com hooks 23 | for more information, see https://pre-commit.ci 24 | autofix_prs: true 25 | autoupdate_branch: "" 26 | autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate" 27 | autoupdate_schedule: weekly 28 | skip: [] 29 | submodules: false 30 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Bartolomé" 5 | given-names: "Álvaro" 6 | - family-names: "Martín-Blázquez" 7 | given-names: "Gabriel" 8 | - family-names: "Piqueres-Lajarín" 9 | given-names: "Agustín" 10 | - family-names: "Vila-Suero" 11 | given-names: "Daniel" 12 | title: "Distilabel: An AI Feedback (AIF) framework for building datasets with and for LLMs." 
13 | version: 1.1.1 14 | date-released: 2024-05-22 15 | url: "https://github.com/argilla-io/distilabel" 16 | -------------------------------------------------------------------------------- /LICENSE_HEADER: -------------------------------------------------------------------------------- 1 | Copyright 2023-present, Argilla, Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | sources = src/distilabel tests 2 | 3 | .PHONY: format 4 | format: 5 | ruff --version 6 | ruff check --fix $(sources) 7 | ruff format $(sources) 8 | 9 | .PHONY: lint 10 | lint: 11 | ruff --version 12 | ruff check $(sources) 13 | ruff format --check $(sources) 14 | 15 | .PHONY: unit-tests 16 | unit-tests: 17 | pytest tests/unit 18 | 19 | .PHONY: integration-tests 20 | integration-tests: 21 | pytest tests/integration 22 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | distilabel.argilla.io 2 | -------------------------------------------------------------------------------- /docs/api/cli.md: -------------------------------------------------------------------------------- 1 | # Command Line Interface (CLI) 2 | 3 | This section contains the API reference for the CLI. 
For more information on how to use the CLI, see [Tutorial - CLI](../sections/how_to_guides/advanced/cli/index.md). 4 | 5 | ## Utility functions for the `distilabel pipeline` sub-commands 6 | 7 | Here are some utility functions to help you work with pipelines in the console. 8 | 9 | :::distilabel.cli.pipeline.utils 10 | -------------------------------------------------------------------------------- /docs/api/distiset.md: -------------------------------------------------------------------------------- 1 | # Distiset 2 | 3 | This section contains the API reference for the Distiset. For more information on how to use the Distiset, see [Tutorial - Distiset](../sections/how_to_guides/advanced/distiset.md). 4 | 5 | :::distilabel.distiset.Distiset 6 | :::distilabel.distiset.create_distiset 7 | -------------------------------------------------------------------------------- /docs/api/errors.md: -------------------------------------------------------------------------------- 1 | # Errors 2 | 3 | This section contains the `distilabel` custom errors. Unlike [exceptions](exceptions.md), errors in `distilabel` are used to handle unexpected situations that can't be anticipated and that can't be handled in a controlled way. 4 | 5 | :::distilabel.errors.DistilabelError 6 | :::distilabel.errors.DistilabelUserError 7 | :::distilabel.errors.DistilabelTypeError 8 | :::distilabel.errors.DistilabelNotImplementedError 9 | -------------------------------------------------------------------------------- /docs/api/exceptions.md: -------------------------------------------------------------------------------- 1 | # Exceptions 2 | 3 | This section contains the `distilabel` custom exceptions. Unlike [errors](errors.md), exceptions in `distilabel` are used to handle specific situations that can be anticipated and that can be handled in a controlled way internally by the library.
4 | 5 | :::distilabel.exceptions.DistilabelException 6 | :::distilabel.exceptions.DistilabelGenerationException 7 | :::distilabel.exceptions.DistilabelOfflineBatchGenerationNotFinishedException 8 | -------------------------------------------------------------------------------- /docs/api/mixins/requirements.md: -------------------------------------------------------------------------------- 1 | ::: distilabel.mixins.requirements.RequirementsMixin 2 | -------------------------------------------------------------------------------- /docs/api/mixins/runtime_parameters.md: -------------------------------------------------------------------------------- 1 | ::: distilabel.mixins.runtime_parameters.RuntimeParametersMixin 2 | -------------------------------------------------------------------------------- /docs/api/models/embedding/embedding_gallery.md: -------------------------------------------------------------------------------- 1 | # Embedding Gallery 2 | 3 | This section contains the existing [`Embeddings`][distilabel.models.embeddings] subclasses implemented in `distilabel`. 4 | 5 | ::: distilabel.models.embeddings 6 | options: 7 | filters: 8 | - "!^Embeddings$" -------------------------------------------------------------------------------- /docs/api/models/embedding/index.md: -------------------------------------------------------------------------------- 1 | # Embedding 2 | 3 | This section contains the API reference for the `distilabel` embeddings. 4 | 5 | For more information on how the [`Embeddings`][distilabel.models.embeddings.base.Embeddings] class works, see the API reference below.
6 | 7 | ::: distilabel.models.embeddings.base -------------------------------------------------------------------------------- /docs/api/models/image_generation/image_generation_gallery.md: -------------------------------------------------------------------------------- 1 | # ImageGenerationModel Gallery 2 | 3 | This section contains the existing [`ImageGenerationModel`][distilabel.models.image_generation] subclasses implemented in `distilabel`. 4 | 5 | ::: distilabel.models.image_generation 6 | options: 7 | filters: 8 | - "!^ImageGenerationModel$" 9 | - "!^AsyncImageGenerationModel$" 10 | - "!typing" -------------------------------------------------------------------------------- /docs/api/models/image_generation/index.md: -------------------------------------------------------------------------------- 1 | # ImageGenerationModel 2 | 3 | This section contains the API reference for the `distilabel` image generation models, both for the [`ImageGenerationModel`][distilabel.models.image_generation.ImageGenerationModel] synchronous implementation, and for the [`AsyncImageGenerationModel`][distilabel.models.image_generation.AsyncImageGenerationModel] asynchronous one. 4 | 5 | For more information and examples on how to use existing image generation models or create custom ones, please refer to [Tutorial - ImageGenerationModel](../../../sections/how_to_guides/basic/task/image_task.md). 6 | 7 | ::: distilabel.models.image_generation.base 8 | -------------------------------------------------------------------------------- /docs/api/models/llm/index.md: -------------------------------------------------------------------------------- 1 | # LLM 2 | 3 | This section contains the API reference for the `distilabel` LLMs, both for the [`LLM`][distilabel.models.llms.LLM] synchronous implementation, and for the [`AsyncLLM`][distilabel.models.llms.AsyncLLM] asynchronous one.
4 | 5 | For more information and examples on how to use existing LLMs or create custom ones, please refer to [Tutorial - LLM](../../../sections/how_to_guides/basic/llm/index.md). 6 | 7 | ::: distilabel.models.llms.base 8 | -------------------------------------------------------------------------------- /docs/api/models/llm/llm_gallery.md: -------------------------------------------------------------------------------- 1 | # LLM Gallery 2 | 3 | This section contains the existing [`LLM`][distilabel.models.llms] subclasses implemented in `distilabel`. 4 | 5 | ::: distilabel.models.llms 6 | options: 7 | filters: 8 | - "!^LLM$" 9 | - "!^AsyncLLM$" 10 | - "!typing" -------------------------------------------------------------------------------- /docs/api/pipeline/index.md: -------------------------------------------------------------------------------- 1 | # Pipeline 2 | 3 | This section contains the API reference for the `distilabel` pipelines. For an example on how to use the pipelines, see the [Tutorial - Pipeline](../../sections/how_to_guides/basic/pipeline/index.md). 
4 | 5 | ::: distilabel.pipeline.base 6 | ::: distilabel.pipeline.local 7 | -------------------------------------------------------------------------------- /docs/api/pipeline/routing_batch_function.md: -------------------------------------------------------------------------------- 1 | # Routing batch function 2 | 3 | ::: distilabel.pipeline.routing_batch_function 4 | -------------------------------------------------------------------------------- /docs/api/pipeline/step_wrapper.md: -------------------------------------------------------------------------------- 1 | # Step Wrapper 2 | 3 | ::: distilabel.pipeline.step_wrapper._StepWrapper 4 | ::: distilabel.pipeline.step_wrapper._StepWrapperException 5 | -------------------------------------------------------------------------------- /docs/api/step/decorator.md: -------------------------------------------------------------------------------- 1 | # @step 2 | 3 | This section contains the reference for the `@step` decorator, used to create new [`Step`][distilabel.steps.Step] subclasses without having to manually define the class. 4 | 5 | For more information check the [Tutorial - Step](../../sections/how_to_guides/basic/step/index.md) page. 6 | 7 | ::: distilabel.steps.decorator 8 | -------------------------------------------------------------------------------- /docs/api/step/generator_step.md: -------------------------------------------------------------------------------- 1 | # GeneratorStep 2 | 3 | This section contains the API reference for the [`GeneratorStep`][distilabel.steps.base.GeneratorStep] class. 4 | 5 | For more information and examples on how to use existing generator steps or create custom ones, please refer to [Tutorial - Step - GeneratorStep](../../sections/how_to_guides/basic/step/generator_step.md). 
6 | 7 | ::: distilabel.steps.base.GeneratorStep 8 | 9 | ::: distilabel.steps.generators.utils.make_generator_step 10 | -------------------------------------------------------------------------------- /docs/api/step/global_step.md: -------------------------------------------------------------------------------- 1 | # GlobalStep 2 | 3 | This section contains the API reference for the [`GlobalStep`][distilabel.steps.base.GlobalStep] class. 4 | 5 | For more information and examples on how to use existing global steps or create custom ones, please refer to [Tutorial - Step - GlobalStep](../../sections/how_to_guides/basic/step/global_step.md). 6 | 7 | ::: distilabel.steps.base.GlobalStep 8 | -------------------------------------------------------------------------------- /docs/api/step/index.md: -------------------------------------------------------------------------------- 1 | # Step 2 | 3 | This section contains the API reference for the `distilabel` step, both for the [`_Step`][distilabel.steps.base._Step] base class and the [`Step`][distilabel.steps.Step] class. 4 | 5 | For more information and examples on how to use existing steps or create custom ones, please refer to [Tutorial - Step](../../sections/how_to_guides/basic/step/index.md). 6 | 7 | ::: distilabel.steps.base 8 | options: 9 | members: 10 | - _Step 11 | - Step 12 | - StepInput 13 | -------------------------------------------------------------------------------- /docs/api/step/resources.md: -------------------------------------------------------------------------------- 1 | # StepResources 2 | 3 | ::: distilabel.steps.base.StepResources 4 | -------------------------------------------------------------------------------- /docs/api/step_gallery/argilla.md: -------------------------------------------------------------------------------- 1 | # Argilla 2 | 3 | This section contains the existing steps integrated with `Argilla` so as to easily push the generated datasets to Argilla. 
4 | 5 | ::: distilabel.steps.argilla.base 6 | ::: distilabel.steps.argilla.preference 7 | ::: distilabel.steps.argilla.text_generation 8 | -------------------------------------------------------------------------------- /docs/api/step_gallery/columns.md: -------------------------------------------------------------------------------- 1 | # Columns 2 | 3 | This section contains the existing steps intended to be used for common column operations to apply to the batches. 4 | 5 | ::: distilabel.steps.columns.expand 6 | ::: distilabel.steps.columns.keep 7 | ::: distilabel.steps.columns.merge 8 | ::: distilabel.steps.columns.group 9 | ::: distilabel.steps.columns.utils 10 | -------------------------------------------------------------------------------- /docs/api/step_gallery/extra.md: -------------------------------------------------------------------------------- 1 | # Extra 2 | 3 | ::: distilabel.steps 4 | options: 5 | filters: 6 | - "!Argilla" 7 | - "!Columns" 8 | - "!From(Disk|FileSystem)" 9 | - "!Hub" 10 | - "![Ss]tep" 11 | - "!typing" 12 | -------------------------------------------------------------------------------- /docs/api/step_gallery/hugging_face.md: -------------------------------------------------------------------------------- 1 | # Hugging Face 2 | 3 | This section contains the existing steps integrated with `Hugging Face` so as to easily push the generated datasets to Hugging Face. 4 | 5 | ::: distilabel.steps.LoadDataFromDisk 6 | ::: distilabel.steps.LoadDataFromFileSystem 7 | ::: distilabel.steps.LoadDataFromHub 8 | ::: distilabel.steps.PushToHub -------------------------------------------------------------------------------- /docs/api/task/generator_task.md: -------------------------------------------------------------------------------- 1 | # GeneratorTask 2 | 3 | This section contains the API reference for the `distilabel` generator tasks. 
4 | 5 | For more information on how the [`GeneratorTask`][distilabel.steps.tasks.GeneratorTask] works and to see some examples, check the [Tutorial - Task - GeneratorTask](../../sections/how_to_guides/basic/task/generator_task.md) page. 6 | 7 | ::: distilabel.steps.tasks.base.GeneratorTask 8 | -------------------------------------------------------------------------------- /docs/api/task/image_task.md: -------------------------------------------------------------------------------- 1 | # ImageTask 2 | 3 | This section contains the API reference for the `distilabel` image generation tasks. 4 | 5 | For more information on how the [`ImageTask`][distilabel.steps.tasks.ImageTask] works and to see some examples, check the [Tutorial - Task - ImageTask](../../sections/how_to_guides/basic/task/image_task.md) page. 6 | 7 | ::: distilabel.steps.tasks.base.ImageTask 8 | -------------------------------------------------------------------------------- /docs/api/task/index.md: -------------------------------------------------------------------------------- 1 | # Task 2 | 3 | This section contains the API reference for the `distilabel` tasks. 4 | 5 | For more information on how the [`Task`][distilabel.steps.tasks.Task] works and to see some examples, check the [Tutorial - Task](../../sections/how_to_guides/basic/task/index.md) page. 6 | 7 | ::: distilabel.steps.tasks.base 8 | options: 9 | members: 10 | - _Task 11 | - Task 12 | -------------------------------------------------------------------------------- /docs/api/task/task_gallery.md: -------------------------------------------------------------------------------- 1 | # Task Gallery 2 | 3 | This section contains the existing [`Task`][distilabel.steps.tasks.Task] subclasses implemented in `distilabel`.
4 | 5 | ::: distilabel.steps.tasks 6 | options: 7 | filters: 8 | - "!Task" 9 | - "!_Task" 10 | - "!GeneratorTask" 11 | - "!ImageTask" 12 | - "!ChatType" 13 | - "!typing" -------------------------------------------------------------------------------- /docs/api/typing.md: -------------------------------------------------------------------------------- 1 | # Types 2 | 3 | This section contains the different types used across the distilabel codebase. 4 | 5 | ::: distilabel.typing.base 6 | ::: distilabel.typing.steps 7 | ::: distilabel.typing.models 8 | ::: distilabel.typing.pipeline 9 | -------------------------------------------------------------------------------- /docs/assets/distilabel-badge-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/distilabel-badge-dark.png -------------------------------------------------------------------------------- /docs/assets/distilabel-badge-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/distilabel-badge-light.png -------------------------------------------------------------------------------- /docs/assets/distilabel-black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/distilabel-black.png -------------------------------------------------------------------------------- /docs/assets/distilabel-icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/assets/distilabel-white.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/distilabel-white.png -------------------------------------------------------------------------------- /docs/assets/images/sections/caching/caching_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/caching/caching_1.png -------------------------------------------------------------------------------- /docs/assets/images/sections/caching/caching_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/caching/caching_2.png -------------------------------------------------------------------------------- /docs/assets/images/sections/cli/cli_pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/cli/cli_pipe.png -------------------------------------------------------------------------------- /docs/assets/images/sections/community/compare-pull-request.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/community/compare-pull-request.PNG -------------------------------------------------------------------------------- /docs/assets/images/sections/community/create-branch.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/community/create-branch.PNG -------------------------------------------------------------------------------- /docs/assets/images/sections/community/edit-file.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/community/edit-file.PNG -------------------------------------------------------------------------------- /docs/assets/images/sections/examples/knowledge-graph-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/examples/knowledge-graph-example.png -------------------------------------------------------------------------------- /docs/assets/images/sections/how_to_guides/basic/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/how_to_guides/basic/pipeline.png -------------------------------------------------------------------------------- /docs/assets/images/sections/how_to_guides/steps/argilla/preference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/how_to_guides/steps/argilla/preference.png -------------------------------------------------------------------------------- /docs/assets/images/sections/how_to_guides/steps/argilla/text_generation.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/how_to_guides/steps/argilla/text_generation.png -------------------------------------------------------------------------------- /docs/assets/images/sections/how_to_guides/tasks/task_print.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/images/sections/how_to_guides/tasks/task_print.png -------------------------------------------------------------------------------- /docs/assets/pipelines/arena-hard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/arena-hard.png -------------------------------------------------------------------------------- /docs/assets/pipelines/clair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/clair.png -------------------------------------------------------------------------------- /docs/assets/pipelines/clean-dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/clean-dataset.png -------------------------------------------------------------------------------- /docs/assets/pipelines/deepseek.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/deepseek.png -------------------------------------------------------------------------------- 
/docs/assets/pipelines/deita.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/deita.png -------------------------------------------------------------------------------- /docs/assets/pipelines/generate-preference-dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/generate-preference-dataset.png -------------------------------------------------------------------------------- /docs/assets/pipelines/instruction_backtranslation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/instruction_backtranslation.png -------------------------------------------------------------------------------- /docs/assets/pipelines/knowledge_graphs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/knowledge_graphs.png -------------------------------------------------------------------------------- /docs/assets/pipelines/prometheus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/prometheus.png -------------------------------------------------------------------------------- /docs/assets/pipelines/sentence-transformer.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/sentence-transformer.png -------------------------------------------------------------------------------- /docs/assets/pipelines/ultrafeedback.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/pipelines/ultrafeedback.png -------------------------------------------------------------------------------- /docs/assets/tutorials-assets/deepseek_prover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/deepseek_prover.png -------------------------------------------------------------------------------- /docs/assets/tutorials-assets/deita/datasets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/deita/datasets.png -------------------------------------------------------------------------------- /docs/assets/tutorials-assets/deita/diversity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/deita/diversity.png -------------------------------------------------------------------------------- /docs/assets/tutorials-assets/deita/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/deita/overview.png -------------------------------------------------------------------------------- 
/docs/assets/tutorials-assets/deita/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/deita/results.png -------------------------------------------------------------------------------- /docs/assets/tutorials-assets/instrucion_dataset_notus_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/instrucion_dataset_notus_ui.png -------------------------------------------------------------------------------- /docs/assets/tutorials-assets/math-sheperd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/math-sheperd.png -------------------------------------------------------------------------------- /docs/assets/tutorials-assets/overview-apigen.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/overview-apigen.jpg -------------------------------------------------------------------------------- /docs/assets/tutorials-assets/preference_dataset_notus_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/preference_dataset_notus_ui.png -------------------------------------------------------------------------------- /docs/assets/tutorials-assets/wiki_transfer_learning.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/assets/tutorials-assets/wiki_transfer_learning.png -------------------------------------------------------------------------------- /docs/scripts/gen_ref_pages.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Ported from https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages 16 | 17 | from pathlib import Path 18 | 19 | import mkdocs_gen_files 20 | 21 | nav = mkdocs_gen_files.Nav() 22 | 23 | src = Path(__file__).parent.parent.parent / "src" 24 | excluded = ["distilabel/utils"] 25 | 26 | for path in sorted(src.rglob("*.py")): 27 | if any(path.name.__contains__(exclude) for exclude in excluded): 28 | continue 29 | module_path = path.relative_to(src).with_suffix("") 30 | doc_path = path.relative_to(src).with_suffix(".md") 31 | full_doc_path = Path("reference", doc_path) 32 | 33 | parts = tuple(module_path.parts) 34 | 35 | if parts[-1] == "__init__": 36 | parts = parts[:-1] 37 | doc_path = doc_path.with_name("index.md") 38 | full_doc_path = full_doc_path.with_name("index.md") 39 | elif parts[-1] == "__main__": 40 | continue 41 | 42 | nav[parts] = doc_path.as_posix() 43 | 44 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 45 | ident = ".".join(parts) 46 | fd.write(f"::: 
{ident}") 47 | 48 | mkdocs_gen_files.set_edit_path(full_doc_path, path) 49 | 50 | with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file: 51 | nav_file.writelines(nav.build_literate_nav()) 52 | -------------------------------------------------------------------------------- /docs/sections/how_to_guides/advanced/fs_to_pass_data.md: -------------------------------------------------------------------------------- 1 | # Using a file system to pass data of batches between steps 2 | 3 | In some situations, it can happen that the batches contain so much data that it is faster to write it to disk and read it back in the next step, instead of passing it using the queue. To solve this issue, `distilabel` uses [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to allow providing a file system configuration and whether this file system should be used to pass data between steps in the `run` method of the `distilabel` pipelines: 4 | 5 | !!! WARNING 6 | 7 | In order to use a specific file system/cloud storage, you will need to install the specific package providing the `fsspec` implementation for that file system. For instance, to use Google Cloud Storage you will need to install `gcsfs`: 8 | 9 | ```bash 10 | pip install gcsfs 11 | ``` 12 | 13 | Check the available implementations: [fsspec - Other known implementations](https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations) 14 | 15 | ```python 16 | from distilabel.pipeline import Pipeline 17 | 18 | with Pipeline(name="my-pipeline") as pipeline: 19 | ... 20 | 21 | if __name__ == "__main__": 22 | distiset = pipeline.run( 23 | ..., 24 | storage_parameters={"path": "gcs://my-bucket"}, 25 | use_fs_to_pass_data=True 26 | ) 27 | ``` 28 | 29 | The code above sets up a file system (in this case Google Cloud Storage) and sets the flag `use_fs_to_pass_data` to specify that the data of the batches should be passed to the steps using the file system.
The `storage_parameters` argument is optional, and in the case it's not provided but `use_fs_to_pass_data==True`, `distilabel` will use the local file system. 30 | 31 | !!! NOTE 32 | 33 | As `GlobalStep`s receive all the data from the previous steps in one single batch accumulating all the data, it's very likely that the data of the batch will be too big to be passed using the queue. In this case and even if `use_fs_to_pass_data==False`, `distilabel` will use the file system to pass the data to the `GlobalStep`. 34 | 35 | -------------------------------------------------------------------------------- /docs/sections/pipeline_samples/examples/benchmarking_with_distilabel.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: toc 3 | --- 4 | # Benchmarking with `distilabel` 5 | 6 | Benchmark LLMs with `distilabel`: reproducing the Arena Hard benchmark. 7 | 8 | The script below first defines both the `ArenaHard` and the `ArenaHardResults` tasks, so as to generate responses for a given collection of prompts/questions with up to two LLMs, and then calculate the results as per the original implementation, respectively. Additionally, the second part of the example builds a `Pipeline` to run the generation on top of the prompts with `InferenceEndpointsLLM` while streaming the rest of the generations from a pre-computed set of GPT-4 generations, and then evaluate one against the other with `OpenAILLM` generating an alternate response, a comparison between the responses, and a result as A>>B, A>B, B>A, B>>A, or tie. 9 | 10 | ![Arena Hard](../../../assets/pipelines/arena-hard.png) 11 | 12 | To run this example you will first need to install the Arena Hard optional dependencies, being `pandas`, `scikit-learn`, and `numpy`. 13 | 14 | ???
Run 15 | 16 | ```python 17 | python examples/arena_hard.py 18 | ``` 19 | 20 | ```python title="arena_hard.py" 21 | --8<-- "examples/arena_hard.py" 22 | ``` -------------------------------------------------------------------------------- /docs/sections/pipeline_samples/examples/llama_cpp_with_outlines.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: toc 3 | --- 4 | # Structured generation with `outlines` 5 | 6 | Generate RPG characters following a `pydantic.BaseModel` with `outlines` in `distilabel`. 7 | 8 | This script makes use of [`LlamaCppLLM`][distilabel.models.llms.llamacpp.LlamaCppLLM] and the structured output capabilities thanks to [`outlines`](https://outlines-dev.github.io/outlines/welcome/) to generate RPG characters that adhere to a JSON schema. 9 | 10 | ![Arena Hard](../../../assets/pipelines/knowledge_graphs.png) 11 | 12 | It makes use of a local model which can be downloaded using curl (explained in the script itself), and can be exchanged with other `LLMs` like [`vLLM`][distilabel.models.llms.vllm.vLLM]. 13 | 14 | ??? Run 15 | 16 | ```python 17 | python examples/structured_generation_with_outlines.py 18 | ``` 19 | 20 | ```python title="structured_generation_with_outlines.py" 21 | --8<-- "examples/structured_generation_with_outlines.py" 22 | ``` -------------------------------------------------------------------------------- /docs/sections/pipeline_samples/examples/mistralai_with_instructor.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: toc 3 | --- 4 | # Structured generation with `instructor` 5 | 6 | Answer instructions with knowledge graphs defined as `pydantic.BaseModel` objects using `instructor` in `distilabel`. 
7 | 8 | This script makes use of [`MistralLLM`][distilabel.models.llms.mistral.MistralLLM] and the structured output capabilities thanks to [`instructor`](https://python.useinstructor.com/) to generate knowledge graphs from complex topics. 9 | 10 | ![Knowledge graph figure](../../../assets/pipelines/knowledge_graphs.png) 11 | 12 | This example is translated from this [awesome example](https://python.useinstructor.com/examples/knowledge_graph/) from the `instructor` cookbook. 13 | 14 | ??? Run 15 | 16 | ```python 17 | python examples/structured_generation_with_instructor.py 18 | ``` 19 | 20 | ```python title="structured_generation_with_instructor.py" 21 | --8<-- "examples/structured_generation_with_instructor.py" 22 | ``` 23 | 24 | ??? "Visualizing the graphs" 25 | 26 | Want to see how to visualize the graphs? You can test it using the following script. Generate some samples on your own and take a look: 27 | 28 | !!! NOTE 29 | 30 | This example uses graphviz to render the graph; you can install it with `pip` in the following way: 31 | 32 | ```console 33 | pip install graphviz 34 | ``` 35 | 36 | ```python 37 | python examples/draw_kg.py 2 # You can pass 0,1,2 to visualize each of the samples.
38 | ``` 39 | 40 | ![Knowledge graph figure](../../../assets/images/sections/examples/knowledge-graph-example.png) -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | @import url('https://fonts.googleapis.com/css2?family=Inter:wght@100..600&display=swap'); 2 | 3 | :root { 4 | --md-primary-fg-color: #f2a8ff; 5 | --md-primary-fg-color--light: #f2a8ff; 6 | --md-primary-fg-color--dark: #f2a8ff; 7 | --md-text-font: "Inter"; 8 | } 9 | [data-md-color-scheme="default"] { 10 | --md-primary-fg-color: #000000; 11 | --md-typeset-a-color: #9c50c2; 12 | --md-accent-fg-color: #c57fed; 13 | } 14 | [data-md-color-scheme="slate"] { 15 | --md-primary-fg-color: #000000; 16 | --md-typeset-a-color: #ca77d8; 17 | --md-accent-fg-color: #f2a8ff; 18 | } 19 | 20 | .md-sidebar__scrollwrap:focus-within, .md-sidebar__scrollwrap:hover { 21 | scrollbar-color: var(--md-default-fg-color--lighter) #0000; 22 | } 23 | -------------------------------------------------------------------------------- /docs/stylesheets/fonts/FontAwesome.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/stylesheets/fonts/FontAwesome.otf -------------------------------------------------------------------------------- /docs/stylesheets/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/stylesheets/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/stylesheets/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/stylesheets/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/stylesheets/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/stylesheets/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/stylesheets/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/distilabel/957934db33f0ee71471dff2b579bc1e99c99aed8/docs/stylesheets/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /examples/image_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from datasets import load_dataset 16 | 17 | from distilabel.models.image_generation import InferenceEndpointsImageGeneration 18 | from distilabel.pipeline import Pipeline 19 | from distilabel.steps import KeepColumns 20 | from distilabel.steps.tasks import ImageGeneration 21 | 22 | ds = load_dataset("dvilasuero/finepersonas-v0.1-tiny", split="train").select(range(3)) 23 | 24 | with Pipeline(name="image_generation_pipeline") as pipeline: 25 | igm = InferenceEndpointsImageGeneration(model_id="black-forest-labs/FLUX.1-schnell") 26 | 27 | img_generation = ImageGeneration( 28 | name="flux_schnell", 29 | image_generation_model=igm, 30 | input_mappings={"prompt": "persona"}, 31 | ) 32 | 33 | keep_columns = KeepColumns(columns=["persona", "model_name", "image"]) 34 | 35 | img_generation >> keep_columns 36 | 37 | 38 | if __name__ == "__main__": 39 | distiset = pipeline.run(use_cache=False, dataset=ds) 40 | # Save the images as `PIL.Image.Image` 41 | distiset = distiset.transform_columns_to_image("image") 42 | distiset.push_to_hub("plaguss/test-finepersonas-v0.1-tiny-flux-schnell") 43 | -------------------------------------------------------------------------------- /examples/text_generation_with_image.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from distilabel.models.llms import InferenceEndpointsLLM 16 | from distilabel.pipeline import Pipeline 17 | from distilabel.steps import LoadDataFromDicts 18 | from distilabel.steps.tasks.text_generation_with_image import TextGenerationWithImage 19 | 20 | with Pipeline(name="vision_generation_pipeline") as pipeline: 21 | loader = LoadDataFromDicts( 22 | data=[ 23 | { 24 | "instruction": "What’s in this image?", 25 | "image": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", 26 | } 27 | ], 28 | ) 29 | 30 | llm = InferenceEndpointsLLM( 31 | model_id="meta-llama/Llama-3.2-11B-Vision-Instruct", 32 | ) 33 | 34 | vision = TextGenerationWithImage(name="vision_gen", llm=llm, image_type="url") 35 | 36 | loader >> vision 37 | 38 | 39 | if __name__ == "__main__": 40 | distiset = pipeline.run(use_cache=False) 41 | distiset.push_to_hub("plaguss/test-vision-generation-Llama-3.2-11B-Vision-Instruct") 42 | -------------------------------------------------------------------------------- /scripts/install_cpu_vllm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "Updating system and installing build dependencies..." 6 | sudo apt-get update -y 7 | sudo apt-get install -y gcc-12 g++-12 libnuma-dev cmake libdnnl-dev 8 | sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 9 | 10 | echo "Python version:" 11 | python --version 12 | 13 | echo "Python executable location:" 14 | which python 15 | 16 | echo "Installing Python build dependencies..." 17 | python -m pip install --upgrade pip 18 | python -m pip install wheel packaging ninja "setuptools>=49.4.0" numpy setuptools-scm 19 | 20 | echo "Cloning 'vllm-project/vllm' GitHub repository..." 
21 | git clone https://github.com/vllm-project/vllm.git 22 | cd vllm || exit 23 | 24 | git fetch --tags 25 | latest_tag=$(git describe --tags "$(git rev-list --tags --max-count=1)") 26 | 27 | echo "Checking out to '$latest_tag' tag..." 28 | git checkout "$latest_tag" 29 | 30 | echo "Installing vLLM CPU requirements..." 31 | python -m pip install -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu 32 | 33 | echo "Installing vLLM for CPU..." 34 | export CMAKE_ARGS="-DPYTHON_EXECUTABLE=$(which python) -DPYTHON_INCLUDE_DIR=$(python -c "from sysconfig import get_path; print(get_path('include'))") -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))")" 35 | echo "CMake args: $CMAKE_ARGS" 36 | VLLM_TARGET_DEVICE=cpu python setup.py install 37 | 38 | echo "Installation complete!" 39 | -------------------------------------------------------------------------------- /scripts/install_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | python_version=$(python -c "import sys; print(sys.version_info[:2])") 6 | 7 | python -m pip install uv 8 | 9 | uv pip install --system -e ".[anthropic,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai,mistralai,instructor,sentence-transformers,faiss-cpu,minhash,text-clustering]" 10 | 11 | if [ "${python_version}" != "(3, 12)" ]; then 12 | uv pip install --system -e .[ray] 13 | fi 14 | 15 | ./scripts/install_cpu_vllm.sh 16 | 17 | uv pip install --system -e ".[dev,tests]" 18 | -------------------------------------------------------------------------------- /scripts/install_docs_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | python_version=$(python -c "import sys; print(sys.version_info[:2])") 6 | 7 | python -m pip install uv 8 | 9 | uv pip install 
--system -e ".[docs]" 10 | -------------------------------------------------------------------------------- /src/distilabel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from rich import traceback as rich_traceback 16 | 17 | __version__ = "1.5.3" 18 | 19 | rich_traceback.install(show_locals=True) 20 | -------------------------------------------------------------------------------- /src/distilabel/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from distilabel.cli.app import app 16 | 17 | if __name__ == "__main__": 18 | app(prog_name="distilabel") 19 | -------------------------------------------------------------------------------- /src/distilabel/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/cli/app.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import typer 16 | 17 | from distilabel.cli.pipeline import app as pipeline_app 18 | 19 | app = typer.Typer(name="distilabel") 20 | 21 | app.add_typer(pipeline_app, name="pipeline") 22 | -------------------------------------------------------------------------------- /src/distilabel/cli/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from distilabel.cli.pipeline.app import app 16 | 17 | __all__ = ["app"] 18 | -------------------------------------------------------------------------------- /src/distilabel/embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # ruff: noqa: E402 16 | 17 | import warnings 18 | 19 | deprecation_message = ( 20 | "Importing from 'distilabel.embeddings' is deprecated and will be removed in a version 1.7.0. " 21 | "Import from 'distilabel.models' instead." 22 | ) 23 | 24 | warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2) 25 | 26 | from distilabel.models.embeddings.base import Embeddings 27 | from distilabel.models.embeddings.sentence_transformers import ( 28 | SentenceTransformerEmbeddings, 29 | ) 30 | from distilabel.models.embeddings.vllm import vLLMEmbeddings 31 | 32 | __all__ = [ 33 | "Embeddings", 34 | "SentenceTransformerEmbeddings", 35 | "vLLMEmbeddings", 36 | ] 37 | -------------------------------------------------------------------------------- /src/distilabel/envs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 

# Idea from: https://github.com/vllm-project/vllm/blob/main/vllm/envs.py

import os
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional

from distilabel import constants

if TYPE_CHECKING:
    # Declarations for static type checkers only. At runtime these names are
    # not bound, so attribute access falls through to the module-level
    # `__getattr__` below and is resolved from the process environment.
    DISTILABEL_LOG_LEVEL: str = "INFO"
    DISTILABEL_PIPELINE_NAME: Optional[str] = None
    DISTILABEL_PIPELINE_CACHE_ID: Optional[str] = None
    DISTILABEL_CACHE_DIR: Optional[str] = None

# Maps each exposed attribute name to a zero-argument callable that reads the
# current value from the environment every time it is invoked.
ENVIRONMENT_VARIABLES: Dict[str, Callable[[], Any]] = {
    # `distilabel` logging level (upper-cased; "INFO" when unset).
    "DISTILABEL_LOG_LEVEL": lambda: os.environ.get(
        "DISTILABEL_LOG_LEVEL", "INFO"
    ).upper(),
    # The name of the `distilabel` pipeline currently running.
    constants.PIPELINE_NAME_ENV_NAME: lambda: os.environ.get(
        constants.PIPELINE_NAME_ENV_NAME
    ),
    # The cache ID of the `distilabel` pipeline currently running.
    constants.PIPELINE_CACHE_ID_ENV_NAME: lambda: os.environ.get(
        constants.PIPELINE_CACHE_ID_ENV_NAME
    ),
    # The `distilabel` cache directory (`None` when the variable is unset).
    "DISTILABEL_CACHE_DIR": lambda: os.environ.get("DISTILABEL_CACHE_DIR"),
}


def __getattr__(name: str) -> Any:
    """Resolve a module attribute from the environment on access.

    Args:
        name: attribute being looked up on this module.

    Returns:
        The value produced by the reader registered for `name` in
        `ENVIRONMENT_VARIABLES`, evaluated at call time.

    Raises:
        AttributeError: if `name` is not registered in `ENVIRONMENT_VARIABLES`.
    """
    reader = ENVIRONMENT_VARIABLES.get(name)
    if reader is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # Lazy evaluation: the environment is read now, not at import time.
    return reader()


def __dir__() -> List[str]:
    """Return the names of the environment-backed attributes of this module."""
    return [*ENVIRONMENT_VARIABLES]

# --------------------------------------------------------------------------------
# /src/distilabel/exceptions.py:
# --------------------------------------------------------------------------------
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Tuple


class DistilabelException(Exception):
    """Base exception (can be gracefully handled) for `distilabel` framework."""


class DistilabelGenerationException(DistilabelException):
    """Base exception for `LLM` generation errors."""


class DistilabelOfflineBatchGenerationNotFinishedException(
    DistilabelGenerationException
):
    """Exception raised when a batch generation is not finished.

    Attributes:
        jobs_ids: the identifiers of the jobs passed to the constructor.
    """

    jobs_ids: Tuple[str, ...]

    def __init__(self, jobs_ids: Tuple[str, ...]) -> None:
        """Store `jobs_ids` and build the human-readable error message.

        Args:
            jobs_ids: the identifiers of the jobs that are not finished.
        """
        self.jobs_ids = jobs_ids
        super().__init__(f"Batch generation with jobs_ids={jobs_ids} is not finished")

    def __reduce__(self) -> Tuple[type, tuple, dict]:
        """Describe how to rebuild this exception for `pickle` and `copy`.

        The default `BaseException.__reduce__` replays `self.args`, which here
        holds the formatted message rather than `jobs_ids`. Rebuilding with it
        would call `__init__(message)` and nest the message inside itself, so
        the original constructor argument is replayed instead.

        Returns:
            A `(callable, args, state)` tuple understood by `pickle`/`copy`.
        """
        return (type(self), (self.jobs_ids,), dict(self.__dict__))

# --------------------------------------------------------------------------------
# /src/distilabel/mixins/__init__.py:
# --------------------------------------------------------------------------------
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/distilabel/models/base_clients/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from distilabel.models.base_clients.inference_endpoints import ( 16 | InferenceEndpointsBaseClient, 17 | ) 18 | from distilabel.models.base_clients.openai import OpenAIBaseClient 19 | 20 | __all__ = ["InferenceEndpointsBaseClient", "OpenAIBaseClient"] 21 | -------------------------------------------------------------------------------- /src/distilabel/models/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from distilabel.models.embeddings.base import Embeddings 16 | from distilabel.models.embeddings.llamacpp import LlamaCppEmbeddings 17 | from distilabel.models.embeddings.sentence_transformers import ( 18 | SentenceTransformerEmbeddings, 19 | ) 20 | from distilabel.models.embeddings.vllm import vLLMEmbeddings 21 | 22 | __all__ = [ 23 | "Embeddings", 24 | "LlamaCppEmbeddings", 25 | "SentenceTransformerEmbeddings", 26 | "vLLMEmbeddings", 27 | ] 28 | -------------------------------------------------------------------------------- /src/distilabel/models/image_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from distilabel.models.image_generation.base import ( 16 | AsyncImageGenerationModel, 17 | ImageGenerationModel, 18 | ) 19 | from distilabel.models.image_generation.huggingface.inference_endpoints import ( 20 | InferenceEndpointsImageGeneration, 21 | ) 22 | from distilabel.models.image_generation.openai import OpenAIImageGeneration 23 | 24 | __all__ = [ 25 | "AsyncImageGenerationModel", 26 | "ImageGenerationModel", 27 | "InferenceEndpointsImageGeneration", 28 | "OpenAIImageGeneration", 29 | ] 30 | -------------------------------------------------------------------------------- /src/distilabel/models/image_generation/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/models/image_generation/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import io

from PIL import Image


def image_to_str(image: "Image.Image", image_format: str = "JPEG") -> str:
    """Encode a PIL image as a base64 string.

    Args:
        image: the image to serialize; it is written through `image.save`.
        image_format: format name forwarded to `image.save` (default "JPEG").

    Returns:
        The saved image bytes, base64-encoded and decoded as UTF-8 text.
    """
    stream = io.BytesIO()
    image.save(stream, format=image_format)
    encoded = base64.b64encode(stream.getvalue())
    return str(encoded, "utf-8")


def image_from_str(image_str: str) -> "Image.Image":
    """Decode a base64 string back into a PIL image.

    Args:
        image_str: base64-encoded image bytes.

    Returns:
        The image opened by `Image.open` over an in-memory buffer.
    """
    # The buffer is handed to PIL and deliberately left open for it to read.
    stream = io.BytesIO(base64.b64decode(image_str))
    return Image.open(stream)

# --------------------------------------------------------------------------------
# /src/distilabel/models/llms/huggingface/__init__.py:
# --------------------------------------------------------------------------------
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 | 15 | from distilabel.models.llms.huggingface.inference_endpoints import InferenceEndpointsLLM 16 | from distilabel.models.llms.huggingface.transformers import TransformersLLM 17 | 18 | __all__ = ["InferenceEndpointsLLM", "TransformersLLM"] 19 | -------------------------------------------------------------------------------- /src/distilabel/models/mixins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from distilabel.pipeline.local import Pipeline 16 | from distilabel.pipeline.ray import RayPipeline 17 | from distilabel.pipeline.routing_batch_function import ( 18 | routing_batch_function, 19 | sample_n_steps, 20 | ) 21 | from distilabel.pipeline.templates import ( 22 | InstructionResponsePipeline, 23 | ) 24 | 25 | __all__ = [ 26 | "InstructionResponsePipeline", 27 | "Pipeline", 28 | "RayPipeline", 29 | "routing_batch_function", 30 | "sample_n_steps", 31 | ] 32 | -------------------------------------------------------------------------------- /src/distilabel/pipeline/templates/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .instruction import InstructionResponsePipeline # noqa: F401 16 | -------------------------------------------------------------------------------- /src/distilabel/steps/argilla/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/distilabel/steps/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/steps/columns/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/steps/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/steps/filtering/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/steps/formatting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/distilabel/steps/generators/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/distilabel/steps/globals/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/apigen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/evol_instruct/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/evol_instruct/evol_complexity/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/evol_instruct/evol_complexity/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from distilabel.steps.tasks.evol_instruct.utils import ( 16 | GENERATION_MUTATION_TEMPLATES as GENERATION_MUTATION_TEMPLATES_EVOL_INSTRUCT, 17 | ) 18 | from distilabel.steps.tasks.evol_instruct.utils import ( 19 | MUTATION_TEMPLATES as MUTATION_TEMPLATES_EVOL_INSTRUCT, 20 | ) 21 | 22 | MUTATION_TEMPLATES = { 23 | "CONSTRAINTS": MUTATION_TEMPLATES_EVOL_INSTRUCT["CONSTRAINTS"], 24 | "DEEPENING": MUTATION_TEMPLATES_EVOL_INSTRUCT["DEEPENING"], 25 | "CONCRETIZING": MUTATION_TEMPLATES_EVOL_INSTRUCT["CONCRETIZING"], 26 | "INCREASED_REASONING_STEPS": MUTATION_TEMPLATES_EVOL_INSTRUCT[ 27 | "INCREASED_REASONING_STEPS" 28 | ], 29 | } 30 | 31 | GENERATION_MUTATION_TEMPLATES = { 32 | "FRESH_START": GENERATION_MUTATION_TEMPLATES_EVOL_INSTRUCT["FRESH_START"], 33 | "CONSTRAINTS": GENERATION_MUTATION_TEMPLATES_EVOL_INSTRUCT["CONSTRAINTS"], 34 | "DEEPENING": GENERATION_MUTATION_TEMPLATES_EVOL_INSTRUCT["DEEPENING"], 35 | "CONCRETIZING": GENERATION_MUTATION_TEMPLATES_EVOL_INSTRUCT["CONCRETIZING"], 36 | "INCREASED_REASONING_STEPS": GENERATION_MUTATION_TEMPLATES_EVOL_INSTRUCT[ 37 | "INCREASED_REASONING_STEPS" 38 | ], 39 | } 40 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/evol_quality/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/magpie/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/math_shepherd/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/structured_outputs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/apigen/generator.jinja2: -------------------------------------------------------------------------------- 1 | Here are examples of queries and the corresponding answers for similar functions: 2 | {{ examples }} 3 | 4 | Note that the query could be interpreted as a combination of several independent requests. 5 | {{ parallel_queries }} 6 | Based on these examples, generate {{ number }} diverse query and answer pairs for the function `{{ func_name }}`. 7 | The detailed function description is the following: 8 | {{ func_desc }} 9 | {{ format_inst }} 10 | Now please generate {{ number }} diverse query and answer pairs following the above format. -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/apigen/semantic_checker.jinja2: -------------------------------------------------------------------------------- 1 | Given Information: 2 | - All Available Functions: 3 | {{ func_desc }} 4 | - User Query: {{ query }} 5 | - Generated Function Calls: {{ func_call }} 6 | - Execution Results: {{ execution_result }} 7 | 8 | Note: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure. 
9 | 10 | The main decision factor is whether the function calls accurately reflect the query's intentions and the function descriptions. 11 | Provide your reasoning in the thought section and decide if the data passes (answer yes or no). 12 | If not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank. 13 | {{ format_inst }} -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/argillalabeller.jinja2: -------------------------------------------------------------------------------- 1 | Please provide an answer to the question based on the input fields{% if examples %} and examples{% endif %}. 2 | {% if guidelines %} 3 | # Guidelines 4 | {{ guidelines }}{% endif %} 5 | {% if examples %} 6 | # Examples 7 | {{ examples }}{% endif %} 8 | # Question 9 | {{ question }} 10 | 11 | # Fields 12 | {{ fields }} 13 | response: -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/clair.jinja2: -------------------------------------------------------------------------------- 1 | {task}: {{ task }} 2 | 3 | {student_solution}: {{ student_solution }} 4 | 5 | ----------------- 6 | 7 | Let's first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer. -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/complexity-scorer.jinja2: -------------------------------------------------------------------------------- 1 | Ranking the following questions according to the difficulty and complexity. Score 1-{{ instructions|length }}. 2 | You can give a score of {{ (instructions|length) + 1 }} if the question is too complex for you to answer it.
You should 3 | respond with the format: 4 | [1] Score: 1 5 | [2] Score: 2 6 | ... 7 | {% for instruction in instructions %} 8 | [{{ loop.index }}] {{ instruction }} 9 | {%- endfor %} 10 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/generate-sentence-pair.jinja2: -------------------------------------------------------------------------------- 1 | {% if context is not none -%} 2 | ## Context 3 | 4 | {{ context }} 5 | 6 | {% endif -%} 7 | 8 | ## Anchor 9 | 10 | {{ anchor }} 11 | 12 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/genstruct.jinja2: -------------------------------------------------------------------------------- 1 | [[[Title]]] {{ title | trim }} 2 | [[[Content]]] {{ content | trim }} 3 | 4 | The following is an interaction between a user and an AI assistant that is related to the above text. 5 | 6 | [[[User]]] 7 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/bitext-retrieval.jinja2: -------------------------------------------------------------------------------- 1 | Write a {{ unit }} triple with one {{ unit }} in {{ source_language }} and two {{ unit }}s in {{ target_language }} with varying translation qualities in JSON format. 2 | 3 | The triple is denoted as ("S1", "S2", "S3"). The translation quality score ranges from 1 to 5, with higher scores being better. 4 | 5 | Please adhere to the following guidelines: 6 | - The value of "S1" is a string in {{ source_language }}, the values of "S2" and "S3" are strings in {{ target_language }}. 7 | - There should be some word overlaps between "S2" and "S3". 8 | - The translation quality score of "S2" with respect to "S1" should be {{ high_score }}. 9 | - The translation quality score of "S3" with respect to "S1" should be {{ low_score }}.
10 | - "S3" should be grammatical and fluent, but contain some keyword or number translation errors, or miss some information, or contain some redundant information. 11 | - "S1" requires {{ difficulty }} level education to understand and should be diverse in terms of topic and length. 12 | 13 | Your output must always be a JSON object only with three keys "S1", "S2" and "S3", do not explain yourself or output anything else. Be creative! 14 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/brainstorming/text-classification.jinja2: -------------------------------------------------------------------------------- 1 | Brainstorm a list of potentially useful text classification tasks. 2 | 3 | Please adhere to the following guidelines: 4 | - Tasks should cover a diverse range of domains and task types. 5 | 6 | Your output must always be a python list of strings only, with about 20 elements, and each element corresponds to a distinct text classification task in one sentence. Do not explain yourself or output anything else. Be creative! 7 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/brainstorming/text-matching-long.jinja2: -------------------------------------------------------------------------------- 1 | Brainstorm a list of text matching tasks where the queries are long documents. 2 | 3 | Here are a few examples: 4 | - Given a document that supports a debatable argument, find another document that contains opposite arguments. 5 | - Provided a lengthy business proposal, retrieve competitive business strategies in the same industry. 6 | 7 | Your output must always be a python list of strings only, with about 20 elements, and each element corresponds to a distinct task in one sentence. Do not explain yourself or output anything else. Be creative! 
8 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/brainstorming/text-matching-short.jinja2: -------------------------------------------------------------------------------- 1 | Brainstorm a list of text matching tasks where both the queries and the groundtruth documents are very short (one or two sentences, even a short phrase). 2 | 3 | Here are a few examples: 4 | - Given a scientific paper title, retrieve the title of papers that cite the given paper. 5 | - Match a word with its definition. 6 | - Provided a notable person's name, identify their occupation or achievement. 7 | 8 | Your output must always be a python list of strings only, with about 20 elements, and each element corresponds to a distinct task in one sentence. Do not explain yourself or output anything else. Be creative! 9 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/brainstorming/text-retrieval.jinja2: -------------------------------------------------------------------------------- 1 | Brainstorm a list of potentially useful text retrieval tasks. 2 | 3 | Here are a few examples for your reference: 4 | - Provided a scientific claim as query, retrieve documents that help verify or refute the claim. 5 | - Search for documents that answer a FAQ-style query on children's nutrition. 6 | 7 | Please adhere to the following guidelines: 8 | - Specify what the query is, and what the desired documents are. 9 | - Each retrieval task should cover a wide range of queries, and should not be too specific. 10 | 11 | Your output should always be a python list of strings only, with about 20 elements, and each element corresponds to a distinct retrieval task in one sentence. Do not explain yourself or output anything else. Be creative!
12 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/long-text-matching.jinja2: -------------------------------------------------------------------------------- 1 | You have been assigned a text matching task: {{ task }} 2 | 3 | Your mission is to write one example for this task in JSON format. The JSON object must contain the following keys: 4 | - "input": a string, a random input specified by the task. 5 | - "positive_document": a string, a relevant document for the "input" according to the task. 6 | 7 | Please adhere to the following guidelines: 8 | - The values of all fields should be in {{ language }}. 9 | - Both the "input" and "positive_document" should be long documents (at least 300 words), avoid substantial word overlaps, otherwise the task would be too easy. 10 | - The "input" and "positive_document" should be independent of each other. 11 | 12 | Your output must always be a JSON object only, do not explain yourself or output anything else. Be creative! 13 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/monolingual-triplet.jinja2: -------------------------------------------------------------------------------- 1 | Write a {{ unit }} triple with varying semantic similarity scores in JSON format. The semantic similarity score ranges from 1 to 5, with 1 denoting the least similar and 5 denoting the most similar. 2 | 3 | Please adhere to the following guidelines: 4 | - The keys in JSON are "S1", "S2", and "S3", the values are all strings in {{ language }}, do not add any other keys. 5 | - There should be some word overlaps between all three {{ unit }}s. 6 | - The similarity score between S1 and S2 should be {{ high_score }}. 7 | - The similarity score between S1 and S3 should be {{ low_score }}.
8 | - The {{ unit }}s require {{ difficulty }} level education to understand and should be diverse in terms of topic and length. 9 | 10 | Your output must always be a JSON object only with three keys "S1", "S2" and "S3", do not explain yourself or output anything else. Be creative! 11 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/short-text-matching.jinja2: -------------------------------------------------------------------------------- 1 | You have been assigned a text matching task: {{ task }} 2 | 3 | Your mission is to write one example for this task in JSON format. The JSON object must contain the following keys: 4 | - "input": a string, a random input specified by the task. 5 | - "positive_document": a string, a relevant document for the "input" according to the task. 6 | 7 | Please adhere to the following guidelines: 8 | - The values of all fields should be in {{ language }}. 9 | - Both the "input" and "positive_document" should be very short (a sentence or a phrase), avoid substantial word overlaps, otherwise the task would be too easy. 10 | - The "input" and "positive_document" should be independent of each other. 11 | 12 | Your output must always be a JSON object only, do not explain yourself or output anything else. Be creative! 13 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/text-classification.jinja2: -------------------------------------------------------------------------------- 1 | You have been assigned a text classification task: {{ task }} 2 | 3 | Your mission is to write one text classification example for this task in JSON format. The JSON object must contain the following keys: 4 | - "input_text": a string, the input text specified by the classification task. 5 | - "label": a string, the correct label of the input text. 
6 | - "misleading_label": a string, an incorrect label that is related to the task. 7 | 8 | Please adhere to the following guidelines: 9 | - The "input_text" should be diverse in expression. 10 | - The "misleading_label" must be a valid label for the given task, but not as appropriate as the "label" for the "input_text". 11 | - The values for all fields should be in {{ language }}. 12 | - Avoid including the values of the "label" and "misleading_label" fields in the "input_text", that would make the task too easy. 13 | - The "input_text" is {{ clarity }} and requires {{ difficulty }} level education to comprehend. 14 | 15 | Your output must always be a JSON object only, do not explain yourself or output anything else. Be creative! 16 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/improving_text_embeddings/text-retrieval.jinja2: -------------------------------------------------------------------------------- 1 | You have been assigned a retrieval task: {{ task }} 2 | 3 | Your mission is to write one text retrieval example for this task in JSON format. The JSON object must contain the following keys: 4 | - "user_query": a string, a random user search query specified by the retrieval task. 5 | - "positive_document": a string, a relevant document for the user query. 6 | - "hard_negative_document": a string, a hard negative document that only appears relevant to the query. 7 | 8 | Please adhere to the following guidelines: 9 | - The "user_query" should be {{ query_type }}, {{ query_length }}, {{ clarity }}, and diverse in topic. 10 | - All documents must be created independent of the query. Avoid copying the query verbatim. It's acceptable if some parts of the "positive_document" are not topically related to the query. 11 | - All documents should be at least {{ num_words}} words long. 
12 | - The "hard_negative_document" contains some useful information, but it should be less useful or comprehensive compared to the "positive_document". 13 | - Both the query and documents should be in {{ language }}. 14 | - Do not provide any explanation in any document on why it is relevant or not relevant to the query. 15 | - Both the query and documents require {{ difficulty }} level education to understand. 16 | 17 | Your output must always be a JSON object only, do not explain yourself or output anything else. Be creative! 18 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/prometheus/absolute_with_reference.jinja2: -------------------------------------------------------------------------------- 1 | ###Task Description: 2 | An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given. 3 | 1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general. 4 | 2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric. 5 | 3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)" 6 | 4. Please do not generate any other opening, closing, and explanations. 
7 | 8 | ###The instruction to evaluate: 9 | {{ instruction }} 10 | 11 | ###Response to evaluate: 12 | {{ generation }} 13 | 14 | ###Reference Answer (Score 5): 15 | {{ reference }} 16 | 17 | ###Score Rubrics: 18 | {{ rubric }} 19 | 20 | ###Feedback: 21 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/prometheus/absolute_without_reference.jinja2: -------------------------------------------------------------------------------- 1 | ###Task Description: 2 | An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given. 3 | 1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general. 4 | 2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric. 5 | 3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)" 6 | 4. Please do not generate any other opening, closing, and explanations. 7 | 8 | ###The instruction to evaluate: 9 | {{ instruction }} 10 | 11 | ###Response to evaluate: 12 | {{ generation }} 13 | 14 | ###Score Rubrics: 15 | {{ rubric }} 16 | 17 | ###Feedback: 18 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/prometheus/relative_with_reference.jinja2: -------------------------------------------------------------------------------- 1 | ###Task Description: 2 | An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given. 3 | 1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general. 4 | 2. After writing a feedback, choose a better response between Response A and Response B. 
You should refer to the score rubric. 5 | 3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (A or B)" 6 | 4. Please do not generate any other opening, closing, and explanations. 7 | 8 | ###Instruction: 9 | {{ instruction }} 10 | 11 | ###Response A: 12 | {{ generations[0] }} 13 | 14 | ###Response B: 15 | {{ generations[1] }} 16 | 17 | ###Reference Answer: 18 | {{ reference }} 19 | 20 | ###Score Rubric: 21 | {{ rubric }} 22 | 23 | ###Feedback: 24 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/prometheus/relative_without_reference.jinja2: -------------------------------------------------------------------------------- 1 | ###Task Description: 2 | An instruction (might include an Input inside it), a response to evaluate, a reference answer, and a score rubric representing a evaluation criteria are given. 3 | 1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general. 4 | 2. After writing a feedback, choose a better response between Response A and Response B. You should refer to the score rubric. 5 | 3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (A or B)" 6 | 4. Please do not generate any other opening, closing, and explanations. 7 | 8 | ###Instruction: 9 | {{ instruction }} 10 | 11 | ###Response A: 12 | {{ generations[0] }} 13 | 14 | ###Response B: 15 | {{ generations[1] }} 16 | 17 | ###Score Rubric: 18 | {{ rubric }} 19 | 20 | ###Feedback: 21 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/quality-scorer.jinja2: -------------------------------------------------------------------------------- 1 | Rank the following pair of instructions and responses according to their quality. 
Your evaluation should consider factors such as helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Score 1-{{ responses|length }}. 2 | Score each response from 1 to {{ responses|length }}, with {{ (responses|length) + 1}} reserved for responses that are already very well written and cannot be improved further. You should respond with the format: 3 | [1] Score: 1 4 | [2] Score: 2 5 | ... 6 | #Question#: {{ instruction }} 7 | #Response List#: 8 | {% for response in responses %} 9 | [{{ loop.index }}] {{ response }} 10 | {%- endfor %} 11 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/self-instruct.jinja2: -------------------------------------------------------------------------------- 1 | # Task Description 2 | Develop {{ num_instructions }} user queries that can be received by the given AI application and applicable to the provided context. Emphasize diversity in verbs and linguistic structures within the model's textual capabilities. 3 | 4 | # Criteria for Queries 5 | {{ criteria_for_query_generation }} 6 | Write each query on a separate line and avoid using numbered lists or bullet points. 7 | 8 | # AI Application 9 | {{ application_description }} 10 | 11 | # Context 12 | {{ input }} 13 | 14 | # Output 15 | 16 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/ultrafeedback/instruction-following.jinja2: -------------------------------------------------------------------------------- 1 | # Instruction Following Assessment 2 | 3 | Evaluate alignment between output and intent. Assess understanding of task goal and restrictions. 4 | 5 | **Instruction Components**: Task Goal (intended outcome), Restrictions (text styles, formats, or designated methods, etc). 6 | 7 | **Scoring**: Rate outputs 1 to 5: 8 | 1. **Irrelevant**: No alignment. 9 | 2. **Partial Focus**: Addresses one aspect poorly. 
10 | 3. **Partial Compliance**: 11 | - (1) Meets goal or restrictions, neglecting other. 12 | - (2) Acknowledges both but slight deviations. 13 | 4. **Almost There**: Near alignment, minor deviations. 14 | 5. **Comprehensive Compliance**: Fully aligns, meets all requirements. 15 | 16 | ## Format: 17 | 18 | ### Input 19 | Instruction: [Clearly specify the task goal and restrictions] 20 | 21 | Texts: 22 | {%- for index in range(generations|length) %} 23 | [Text {{ index + 1}}] 24 | {%- endfor %} 25 | 26 | ### Output 27 | #### Output for Text 1 28 | Rating: [Rating for text 1] 29 | Rationale: [Rationale for the rating in short sentences] 30 | 31 | {%- for index in range(1, generations|length) %} 32 | 33 | #### Output for Text {{ index + 1}} 34 | Rating: [Rating] 35 | Rationale: [Rationale] 36 | {%- endfor %} 37 | 38 | --- 39 | 40 | ## Annotation 41 | 42 | ### Input 43 | Instruction: {{ instruction }} 44 | 45 | Texts: 46 | {%- for generation in generations %} 47 | {{ generation }} 48 | {%- endfor %} 49 | 50 | ### Output 51 | 52 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/ultrafeedback/overall-rating.jinja2: -------------------------------------------------------------------------------- 1 | # General Text Quality Assessment 2 | 3 | Evaluate the model's outputs based on various criteria: 4 | 5 | 1. **Correctness & Informativeness**: Does the output provide accurate and helpful information? 6 | 2. **Honesty & Uncertainty**: How confidently does the model convey its information, and does it express uncertainty appropriately? 7 | 3. **Truthfulness & Hallucination**: Does the model introduce misleading or fabricated details? 8 | 4. **Instruction Following**: Does the model's output align with given instructions and the user's intent? 9 | 10 | Your role is to provide a holistic assessment considering all the above factors. 
11 | 12 | **Scoring**: Rate outputs 1 to 5 based on the overall quality, considering all aspects: 13 | 1. **Low Quality**: Contains inaccuracies, may be entirely wrong or has severe hallucinations. 14 | 2. **Moderate Quality**: Addresses some aspects, but has errors or is partially aligned with instructions. 15 | 3. **Good**: Generally accurate but may contain minor errors or slight deviations. 16 | 4. **Very Good**: Near perfect, with minor issues in terms of alignment or confidence. 17 | 5. **Excellent**: Accurate, confident, aligned with instructions, and free of hallucinations. 18 | 19 | ## Format: 20 | 21 | ### Input 22 | Instruction: [Clearly specify the task goal and restrictions] 23 | 24 | Texts: 25 | {%- for index in range(generations|length) %} 26 | [Text {{ index + 1}}] 27 | {%- endfor %} 28 | 29 | ### Output 30 | #### Output for Text 1 31 | Rating: [Rating for text 1] 32 | Rationale: [Rationale for the rating in short sentences] 33 | 34 | {%- for index in range(1, generations|length) %} 35 | 36 | #### Output for Text {{ index + 1}} 37 | Rating: [Rating] 38 | Rationale: [Rationale] 39 | {%- endfor %} 40 | 41 | --- 42 | 43 | ## Annotation 44 | 45 | ### Input 46 | Instruction: {{ instruction }} 47 | 48 | Texts: 49 | {%- for generation in generations %} 50 | {{ generation }} 51 | {%- endfor %} 52 | 53 | ### Output 54 | 55 | -------------------------------------------------------------------------------- /src/distilabel/steps/tasks/templates/urial.jinja2: -------------------------------------------------------------------------------- 1 | # Instruction 2 | 3 | Below is a list of conversations between a human and an AI assistant (you). 4 | Users place their queries under "# User:", and your responses are under "# Assistant:". 5 | You are a helpful, respectful, and honest assistant. 6 | You should always answer as helpfully as possible while ensuring safety. 7 | Your answers should be well-structured and provide detailed information.
They should also have an engaging tone. 8 | Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic, dangerous, or illegal content, even if it may be helpful. 9 | Your response must be socially responsible, and thus you can refuse to answer some controversial topics. 10 | 11 | {% for message in messages %} 12 | # {{ message.role | capitalize }}: 13 | 14 | {{ message.content }} 15 | {% endfor %} 16 | # Assistant: 17 | -------------------------------------------------------------------------------- /src/distilabel/typing/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List, Literal, Union 16 | 17 | from typing_extensions import Required, TypedDict 18 | 19 | 20 | class TextContent(TypedDict, total=False): 21 | type: Required[Literal["text"]] 22 | text: Required[str] 23 | 24 | 25 | class ImageUrl(TypedDict): 26 | url: Required[str] 27 | """Either a URL of the image or the base64 encoded image data.""" 28 | 29 | 30 | class ImageContent(TypedDict, total=False): 31 | """Type alias for the user's message in a conversation that can include text or an image. 
32 | It's the standard type for vision language models: 33 | https://platform.openai.com/docs/guides/vision 34 | """ 35 | 36 | type: Required[Literal["image_url"]] 37 | image_url: Required[ImageUrl] 38 | 39 | 40 | class ChatItem(TypedDict): 41 | role: Literal["system", "user", "assistant"] 42 | content: Union[str, list[Union[TextContent, ImageContent]]] 43 | 44 | 45 | ChatType = List[ChatItem] 46 | """ChatType is a type alias for a `list` of `dict`s following the OpenAI conversational format.""" 47 | -------------------------------------------------------------------------------- /src/distilabel/typing/steps.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Any, Dict, Iterator, List, Tuple, Union 16 | 17 | StepOutput = Iterator[List[Dict[str, Any]]] 18 | """`StepOutput` is an alias of the typing `Iterator[List[Dict[str, Any]]]`""" 19 | 20 | GeneratorStepOutput = Iterator[Tuple[List[Dict[str, Any]], bool]] 21 | """`GeneratorStepOutput` is an alias of the typing `Iterator[Tuple[List[Dict[str, Any]], bool]]`""" 22 | 23 | StepColumns = Union[List[str], Dict[str, bool]] 24 | """`StepColumns` is an alias of the typing `Union[List[str], Dict[str, bool]]` used by the 25 | `inputs` and `outputs` properties of an `Step`. 
In the case of a `List[str]`, it is a list 26 | with the required columns. In the case of a `Dict[str, bool]`, it is a dictionary where 27 | the keys are the columns and the values are booleans indicating whether the column is 28 | required or not. 29 | """ 30 | -------------------------------------------------------------------------------- /src/distilabel/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/distilabel/utils/card/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/utils/card/dataset_card.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from pathlib import Path 16 | 17 | from huggingface_hub import DatasetCard 18 | 19 | TEMPLATE_DISTILABEL_DATASET_CARD_PATH = Path(__file__).parent / "distilabel_template.md" 20 | 21 | 22 | AVAILABLE_SIZE_CATEGORIES = { 23 | 1_000: "n<1K", 24 | 10_000: "1K str: 37 | for size, category in AVAILABLE_SIZE_CATEGORIES.items(): 38 | if input_size < size: 39 | return category 40 | return "n>1T" 41 | 42 | 43 | class DistilabelDatasetCard(DatasetCard): 44 | """A `DatasetCard` subclass that uses the Distilabel template by default.""" 45 | 46 | default_template_path = TEMPLATE_DISTILABEL_DATASET_CARD_PATH 47 | -------------------------------------------------------------------------------- /src/distilabel/utils/chat.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
def is_openai_format(input: Any) -> bool:
    """Tell whether `input` is shaped like an OpenAI-style conversation.

    A conversation is a `list` whose items are all `dict`s carrying both a
    `"role"` and a `"content"` key, for example:

    ```python
    [
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi! How can I help you?"},
    ]
    ```

    Only the container types and the presence of the two keys are inspected;
    the values stored under them are not validated. An empty list has no
    offending item and is therefore accepted.

    Args:
        input: The object to inspect.

    Returns:
        `True` if `input` is a list made exclusively of dictionaries with
        `"role"` and `"content"` keys, `False` otherwise.
    """
    if isinstance(input, list):
        for message in input:
            # Reject on the first item that is not a role/content mapping.
            if not isinstance(message, dict):
                return False
            if "role" not in message or "content" not in message:
                return False
        return True
    return False
def list_files_in_dir(
    dir_path: Path, key: Optional[Callable] = lambda x: int(x.stem)
) -> List[Path]:
    """List the files directly inside a directory, in sorted order.

    Args:
        dir_path: Path to the directory.
        key: A function computing the sort key of each file path. Defaults to the
            integer value of the file name without its suffix, which is useful when
            loading files from the cache, as the names will be numbered. `None`
            sorts the paths by their natural ordering.

    Returns:
        The paths of the files in the directory, sorted according to `key`.
        Sub-directories are not included.

    Raises:
        ValueError: With the default `key`, if the stem of a file is not an integer.
    """
    # Filter *before* sorting: the sort key is only meaningful for files, and
    # applying the default `int(x.stem)` key to a sub-directory with a
    # non-numeric name would raise even though that entry is discarded anyway.
    files = [entry for entry in dir_path.iterdir() if entry.is_file()]
    return sorted(files, key=key)


HF_TOKEN_ENV_VAR: Final[str] = "HF_TOKEN"


def get_hf_token(cls_name: str, token_arg: str) -> str:
    """Get the token for the Hugging Face API.

    The environment variable named by `HF_TOKEN_ENV_VAR` takes precedence. If it
    is not set, the token is read from the file located at
    `constants.HF_TOKEN_PATH` (the `huggingface_hub` constants module imported at
    the top of the file), stripped of surrounding whitespace.

    Args:
        cls_name: Name of the class/function that requires the token. Only used
            in the error message.
        token_arg: Argument name to use in the error message, normally
            is "token" or "api_key".

    Raises:
        ValueError: If the environment variable is not set and the token file
            does not exist.

    Returns:
        The token for the Hugging Face API.
    """
    token = os.getenv(HF_TOKEN_ENV_VAR)
    if token is not None:
        return token

    # No environment variable: fall back to the token file of `huggingface_hub`.
    if not Path(constants.HF_TOKEN_PATH).exists():
        raise ValueError(
            f"To use `{cls_name}` an API key must be provided via `{token_arg}`,"
            f" set the environment variable `{HF_TOKEN_ENV_VAR}` or use the"
            " `huggingface-hub` CLI to login with `huggingface-cli login`."
        )
    with open(constants.HF_TOKEN_PATH) as f:
        return f.read().strip()
def image_to_str(image: "Image.Image", image_format: str = "JPEG") -> str:
    """Serialize an image and return its bytes as a base64 string.

    Args:
        image: The image to encode. It is written through its `save` method.
        image_format: The format passed to `image.save`. Defaults to `"JPEG"`.

    Returns:
        The base64 representation of the saved image, decoded as UTF-8 text.
    """
    with io.BytesIO() as sink:
        # Serialize into memory instead of touching the filesystem.
        image.save(sink, format=image_format)
        raw_bytes = sink.getvalue()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def flatten_responses(responses: List[List[str]]) -> List[str]:
    """Reduce a list of lists of strings to one string per inner list.

    Despite the name, the inner lists are not concatenated: only the last
    item of each one is kept.

    Args:
        responses: The list of lists of strings to flatten.

    Returns:
        A single list of strings containing the last item of each list.
    """
    last_items = []
    for candidates in responses:
        last_items.append(candidates[-1])
    return last_items
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /src/distilabel/utils/mkdocs/templates/components-gallery/components-list.jinja2: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - toc 4 | - navigation 5 | --- 6 | # {{ title }} 7 | 8 | {{ description }} 9 | 10 |
11 | 12 | {% for component in components %} 13 | - {% if component.docstring.icon %}{{ component.docstring.icon }}{% else %}{{ default_icon }}{% endif %}{ .lg .middle } __{{ component.name }}__ 14 | 15 | --- 16 | 17 | {{ component.docstring.short_description }} 18 | 19 | [:octicons-arrow-right-24: {{ component.name }}]({{ component.name | lower }}.md){ .bottom } 20 | {% endfor %} 21 | 22 |
23 | -------------------------------------------------------------------------------- /src/distilabel/utils/mkdocs/templates/components-gallery/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | - toc 5 | --- 6 | # Components Gallery 7 | 8 | ??? info "Category Overview" 9 | | Icon | Category | Description | 10 | |----------------------------|------------|-------------------------------------------------------------------| 11 | | :material-step-forward: | Steps | Steps are used for data manipulation. | 12 | | :material-check-outline: | Tasks | Tasks allow performing data generation, annotation, and more. | 13 | | :material-brain: | LLMs | Explore all available Large Language Models integrated with distilabel. | 14 | | :material-vector-line: | Embeddings | Explore all available Embeddings Models integrated with distilabel. | 15 | 16 |
17 | 18 | - :material-step-forward:{ .lg .middle } __Steps__ 19 | 20 | --- 21 | 22 | Explore all the available `Step`s that can be used for data manipulation. 23 | 24 | [:octicons-arrow-right-24: Steps](steps/index.md){ .bottom } 25 | 26 | - :material-check-outline:{ .lg .middle } __Tasks__ 27 | 28 | --- 29 | 30 | Explore all the available `Task`s that can be used with an `LLM` to perform data generation, annotation, and more. 31 | 32 | [:octicons-arrow-right-24: Tasks](tasks/index.md) 33 | 34 | - :material-brain:{ .lg .middle } __LLMs__ 35 | 36 | --- 37 | 38 | Explore all the available `LLM`s integrated with `distilabel`. 39 | 40 | [:octicons-arrow-right-24: LLMs](llms/index.md){ .bottom } 41 | 42 | - :material-image:{ .lg .middle } __ImageGenerationModels__ 43 | 44 | --- 45 | 46 | Explore all the available `ImageGenerationModel`s integrated with `distilabel`. 47 | 48 | [:octicons-arrow-right-24: ImageGenerationModels](image_generation/index.md){ .bottom } 49 | 50 | - :material-vector-line:{ .lg .middle } __Embeddings__ 51 | 52 | --- 53 | 54 | Explore all the available `Embeddings` models integrated with `distilabel`. 55 | 56 | [:octicons-arrow-right-24: Embeddings](embeddings/index.md){ .bottom } 57 | 58 |
59 | -------------------------------------------------------------------------------- /src/distilabel/utils/mkdocs/templates/components-gallery/llm-detail.jinja2: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | # {{ llm.name }} 6 | 7 | {% if llm.docstring.short_description %} 8 | {{ llm.docstring.short_description }} 9 | {% endif %} 10 | 11 | {% if llm.docstring.description %} 12 | {{ llm.docstring.description }} 13 | {% endif %} 14 | 15 | {% if llm.docstring.note %} 16 | ### Note 17 | {{ llm.docstring.note }} 18 | {% endif %} 19 | 20 | {% if llm.docstring.attributes %} 21 | ### Attributes 22 | {% for attribute_name, description in llm.docstring.attributes.items() %} 23 | - **{{ attribute_name }}**: {{ description }} 24 | {% endfor %} 25 | {% endif %} 26 | 27 | 28 | {% if llm.docstring.runtime_parameters %} 29 | ### Runtime Parameters 30 | {% for parameter_name, description in llm.docstring.runtime_parameters.items() %} 31 | - **{{ parameter_name }}**: {{ description }} 32 | {% endfor %} 33 | {% endif %} 34 | 35 | {% if llm.docstring.examples %} 36 | ### Examples 37 | 38 | {% for example_title, code in llm.docstring.examples.items() %} 39 | #### {{ example_title }} 40 | ```python 41 | {{ code | replace("\n", "\n") }} 42 | ``` 43 | {% endfor %} 44 | {% endif %} 45 | 46 | {% if llm.docstring.references %} 47 | ### References 48 | {% for reference, url in llm.docstring.references.items() %} 49 | - [{{ reference }}]({{ url }}) 50 | {% endfor %} 51 | {% endif %} 52 | -------------------------------------------------------------------------------- /src/distilabel/utils/mkdocs/templates/components-gallery/step-detail.jinja2: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | # {{ step.name }} 6 | {% if step.docstring.short_description %} 7 | {{ step.docstring.short_description }} 8 | {% endif %} 9 | 10 | {% if 
step.docstring.description %} 11 | {{ step.docstring.description }} 12 | {% endif %} 13 | 14 | {% if step.docstring.note %} 15 | ### Note 16 | {{ step.docstring.note }} 17 | {% endif %} 18 | 19 | {% if step.docstring.attributes %} 20 | ### Attributes 21 | {% for attribute_name, description in step.docstring.attributes.items() %} 22 | - **{{ attribute_name }}**: {{ description }} 23 | {% endfor %} 24 | {% endif %} 25 | 26 | {% if step.docstring.runtime_parameters %} 27 | ### Runtime Parameters 28 | {% for parameter_name, description in step.docstring.runtime_parameters.items() %} 29 | - **{{ parameter_name }}**: {{ description }} 30 | {% endfor %} 31 | {% endif %} 32 | 33 | ### Input & Output Columns 34 | 35 | ``` mermaid 36 | {{ mermaid_diagram }} 37 | ``` 38 | 39 | {% if step.docstring.input_columns %} 40 | #### Inputs 41 | 42 | {% for column_name, value in step.docstring.input_columns.items() %} 43 | - **{{ column_name }}** ({{ value[0] }}): {{ value[1] }} 44 | {% endfor %} 45 | {% endif %} 46 | 47 | {% if step.docstring.output_columns %} 48 | #### Outputs 49 | 50 | {% for column_name, value in step.docstring.output_columns.items() %} 51 | - **{{ column_name }}** ({{ value[0] }}): {{ value[1] }} 52 | {% endfor %} 53 | {% endif %} 54 | 55 | 56 | {% if step.docstring.examples %} 57 | ### Examples 58 | 59 | {% for example_title, code in step.docstring.examples.items() %} 60 | #### {{ example_title }} 61 | ```python 62 | {{ code | replace("\n", "\n") }} 63 | ``` 64 | {% endfor %} 65 | {% endif %} 66 | 67 | {% if step.docstring.references %} 68 | ### References 69 | {% for reference, url in step.docstring.references.items() %} 70 | - [{{ reference }}]({{ url }}) 71 | {% endfor %} 72 | {% endif %} 73 | 74 | -------------------------------------------------------------------------------- /src/distilabel/utils/notebook.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 
def in_notebook() -> bool:
    """Report whether the code appears to run inside an IPython kernel.

    The check imports `IPython`, fetches the active shell with `get_ipython()`
    and looks for the `"IPKernelApp"` entry in its configuration. A missing
    `IPython` package, or a shell object without a `config` attribute (for
    instance when `get_ipython()` returns `None`), both mean "not a notebook".

    Returns:
        `True` if `"IPKernelApp"` is present in the active IPython
        configuration, `False` otherwise.

    References:
        - https://stackoverflow.com/a/22424821
    """
    try:
        from IPython import get_ipython

        kernel_detected = "IPKernelApp" in get_ipython().config
    except (ImportError, AttributeError):
        # IPython is not installed, or there is no configured shell.
        return False
    return kernel_detected
def script_executed_in_ray_cluster() -> bool:
    """Report whether the process appears to run inside a Ray cluster.

    The decision is based only on the presence of the environment variables
    `RAY_NODE_TYPE_NAME`, `RAY_CLUSTER_NAME` and `RAY_ADDRESS`; their values are
    not inspected.

    Returns:
        `True` if all three variables are defined, `False` as soon as one is missing.
    """
    expected_variables = ("RAY_NODE_TYPE_NAME", "RAY_CLUSTER_NAME", "RAY_ADDRESS")
    for variable_name in expected_variables:
        if variable_name not in os.environ:
            return False
    return True


if TYPE_CHECKING:
    from distilabel.steps.base import _Step

S = TypeVar("S", bound="_Step")


def requirements(requirements: Union[List[str]]) -> Callable[[S], S]:
    """Build a class decorator that attaches a list of requirements to a Step.

    Useful for custom steps that need extra packages installed, so the requirements
    travel with the step when the pipeline is distributed.

    Args:
        requirements: List of requirement strings to store on the step.

    Returns:
        A decorator that sets the `requirements` attribute of the object it receives
        and returns that same object.

    Example:

    ```python
    @requirements(["my_library>=1.0.1"])
    class CustomStep(Step):
        @property
        def inputs(self) -> List[str]:
            return ["instruction"]

        @property
        def outputs(self) -> List[str]:
            return ["response"]

        def process(self, inputs: StepInput) -> StepOutput:  # type: ignore
            for input in inputs:
                input["response"] = "unit test"
            yield inputs
    ```
    """

    def attach(step: S) -> S:
        # The decorated object is modified in place and handed back unchanged otherwise.
        setattr(step, "requirements", requirements)
        return step

    return attach


def check_column_in_template(
    column: str, template: str, page: str = "components-gallery/tasks/textgeneration/"
) -> None:
    """Ensure that `column` is referenced by the given template.

    A column counts as referenced when it appears as a whole word inside a
    `{% ... %}` statement, or alone (optionally surrounded by whitespace) inside a
    `{{ ... }}` expression.

    Args:
        column: The column name to look for in the template.
        template: The template of the Task provided by the user.
        page: The documentation page passed to the error to point the user to help.
            Defaults to "components-gallery/tasks/textgeneration/".

    Raises:
        DistilabelUserError: If the column is not referenced in the template.
    """
    escaped_column = re.escape(column)
    # Usage inside a statement block, e.g. `{% if column %}`.
    statement_usage = r"{%.*?\b" + escaped_column + r"\b.*?%}"
    # Usage as a bare expression, e.g. `{{ column }}`.
    expression_usage = r"{{\s*" + escaped_column + r"\s*}}"
    pattern = "(?:" + statement_usage + "|" + expression_usage + ")"

    if re.search(pattern, template) is not None:
        return

    raise DistilabelUserError(
        (
            f"You required column name '{column}', but is not present in the template, "
            "ensure the 'columns' match with the 'template' to avoid errors."
        ),
        page=page,
    )
def pytest_configure(config: "Config") -> None:
    """Register the custom `skip_python_versions` marker.

    Args:
        config: The pytest configuration object that receives the marker line.
    """
    marker_description = (
        "skip_python_versions(versions): "
        "mark test to be skipped on specified Python versions"
    )
    config.addinivalue_line("markers", marker_description)


def pytest_collection_modifyitems(config: "Config", items: List["Item"]) -> None:
    """Skip collected tests whose `skip_python_versions` marker lists this interpreter.

    The running version is rendered as `"<major>.<minor>"` and checked for membership
    in the first positional argument of the marker.

    Args:
        config: The pytest configuration object (not used here).
        items: The collected test items; matching ones get a `skip` marker added.
    """
    current_version = "{}.{}".format(sys.version_info.major, sys.version_info.minor)
    for test_item in items:
        version_marker = test_item.get_closest_marker("skip_python_versions")
        if not version_marker:
            # Test does not restrict Python versions.
            continue
        if current_version not in version_marker.args[0]:
            continue
        test_item.add_marker(
            pytest.mark.skip(reason=f"Test not supported on Python {current_version}")
        )
@pytest.fixture(autouse=True)
def temp_cache_dir() -> Generator[None, None, None]:
    """Point `DISTILABEL_CACHE_DIR` at a fresh temporary directory for each test.

    The previous value of the environment variable (or its absence) is restored on
    teardown, so code running after the test never sees a path to a directory that
    has already been deleted.

    Yields:
        Nothing; the fixture only manages the environment variable and the directory.
    """
    variable_name = "DISTILABEL_CACHE_DIR"
    previous_value = os.environ.get(variable_name)
    with tempfile.TemporaryDirectory() as tmpdirname:
        os.environ[variable_name] = tmpdirname
        try:
            yield
        finally:
            # Undo the override before the temporary directory is removed.
            if previous_value is None:
                os.environ.pop(variable_name, None)
            else:
                os.environ[variable_name] = previous_value
@step(inputs=["instruction"], outputs=["response"])
def FailAlways(_: StepInput) -> "StepOutput":
    """Step that raises unconditionally instead of producing a `response`."""
    raise Exception("This step always fails")


@step(inputs=["instruction"], outputs=["response"])
def SucceedAlways(inputs: StepInput) -> "StepOutput":
    """Step that writes the same fixed `response` into every row of the batch."""
    for row in inputs:
        row["response"] = "This step always succeeds"
    yield inputs


def test_branching_missalignment_because_step_fails_processing_batch() -> None:
    """Two parallel branches, one always failing, must still group row by row.

    The failing branch is expected to contribute `None` for every row while the
    succeeding branch contributes its fixed response.
    """
    num_rows = 20

    with Pipeline(name="") as pipeline:
        load_data = LoadDataFromDicts(
            data=[{"instruction": index} for index in range(num_rows)]
        )

        fail = FailAlways()
        succeed = SucceedAlways()
        combine = GroupColumns(columns=["response"])

        load_data >> [fail, succeed] >> combine

    distiset = pipeline.run(use_cache=False)

    expected_rows = [[None, "This step always succeeds"] for _ in range(num_rows)]
    grouped = distiset["default"]["train"]["grouped_response"]
    assert grouped == expected_rows
class NumpyBigArrayGenerator(GeneratorStep):
    """Generator step that emits batches of rows holding random NumPy arrays."""

    # Number of batches produced by `process`.
    num_batches: int

    @property
    def outputs(self) -> List[str]:
        """Names of the columns produced by this step."""
        return ["array"]

    def process(self, offset: int = 0) -> "GeneratorStepOutput":
        """Yield `num_batches` batches of `batch_size` rows each.

        Args:
            offset: Accepted for interface compatibility; not used by this step.

        Yields:
            A tuple with the batch (each row has an `array` of 256 random values)
            and a flag that is `True` only for the last batch.
        """
        last_batch_index = self.num_batches - 1
        for batch_index in range(self.num_batches):
            batch = [{"array": np.random.randn(256)} for _ in range(self.batch_size)]  # type: ignore
            yield batch, batch_index == last_batch_index  # type: ignore


@step(step_type="global")
def ReceiveArrays(inputs: StepInput) -> "StepOutput":
    """Global step that forwards the rows it receives unchanged."""
    yield inputs


@pytest.mark.benchmark
def test_cache_time() -> None:
    """Run a two-step pipeline moving NumPy arrays into a global step (benchmark)."""
    with Pipeline(name="dummy") as pipeline:
        generator = NumpyBigArrayGenerator(num_batches=2, batch_size=100)

        receiver = ReceiveArrays()

        generator >> receiver

    pipeline.run(use_cache=False)
def test_minhash_deduplication() -> None:
    """`MinHashDedup` must flag exactly four rows to keep in a 1000-row dataset.

    The dataset repeats five short texts; the assertion counts the rows whose
    `keep_row_after_minhash_filtering` column is truthy.
    """
    ds_size = 1000
    batch_size = 500
    texts = [
        "This is a test document.",
        "This document is a test.",
        "Test document for duplication.",
        "Document for duplication test.",
        "This is another unique document.",
    ]

    with Pipeline() as pipeline:
        rows = [{"text": text} for text in texts] * (ds_size // 5)
        data = LoadDataFromDicts(data=rows, batch_size=batch_size)
        minhash = MinHashDedup(
            tokenizer="ngrams",
            n=2,
            threshold=0.9,
            storage="disk",
            input_batch_size=batch_size,
        )
        data >> minhash

    distiset = pipeline.run(use_cache=False)
    train_split = distiset["default"]["train"]
    kept_rows = train_split.filter(
        lambda row: row["keep_row_after_minhash_filtering"]
    )
    assert len(kept_rows) == 4


if __name__ == "__main__":
    test_minhash_deduplication()
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/unit/cli/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# Absolute path of the directory that contains this module.
current_dir = os.path.split(os.path.abspath(__file__))[0]

# Location of the `test_pipeline.yaml` file expected next to this module.
TEST_PIPELINE_PATH = os.path.join(current_dir, "test_pipeline.yaml")
# Values of the environment variable that must NOT enable the slow tests.
_FALSY_ENV_VALUES = frozenset({"", "0", "false", "no", "off"})

# Opt-in switch for the slow tests, read once at import time.
# `os.getenv` returns the raw string, so without parsing a value such as "0" or
# "false" would be truthy; the value is therefore normalised into a real `bool`.
# Any other non-empty value (e.g. "1", "true", "yes") keeps enabling the slow tests.
DISTILABEL_RUN_SLOW_TESTS = (
    os.getenv("DISTILABEL_RUN_SLOW_TESTS", "").strip().lower() not in _FALSY_ENV_VALUES
)
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/models/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/models/embeddings/test_sentence_transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
class TestSentenceTransformersEmbeddings:
    """Tests for `SentenceTransformerEmbeddings` using the all-MiniLM-L6-v2 model."""

    def test_model_name(self) -> None:
        """`model_name` must echo the identifier given at construction."""
        model_id = "sentence-transformers/all-MiniLM-L6-v2"

        embeddings = SentenceTransformerEmbeddings(model=model_id)

        assert embeddings.model_name == model_id

    def test_encode(self) -> None:
        """Every encoded sentence must come back as a vector of length 384."""
        embeddings = SentenceTransformerEmbeddings(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
        embeddings.load()

        sentences = [
            "Hello, how are you?",
            "What a nice day!",
            "I hear that llamas are very popular now.",
        ]
        vectors = embeddings.encode(inputs=sentences)

        for vector in vectors:
            assert len(vector) == 384
# NOTE(review): the class exercises `vLLMEmbeddings`, so its name looks copied from
# the sentence-transformers tests — confirm before renaming.
class TestSentenceTransformersEmbeddings:
    """Tests for `vLLMEmbeddings` that run against a mocked underlying model."""

    # Placeholder identifier; no model is loaded in these tests.
    model_name = "group/model-name"

    def test_model_name(self) -> None:
        """`model_name` must echo the identifier given at construction."""
        embeddings = vLLMEmbeddings(model=self.model_name)

        assert embeddings.model_name == self.model_name

    def test_encode(self) -> None:
        """`encode` must return one 10-dimensional vector per input sentence."""
        embeddings = vLLMEmbeddings(model=self.model_name)

        # `embeddings.load()` is intentionally not called: the model is replaced
        # with a mock instead of being loaded.
        embeddings._model = MagicMock()

        fake_output = Mock(outputs=Mock(embedding=[0.1] * 10))
        # The mocked `encode` answers with one fake output per received prompt.
        embeddings._model.encode = Mock(
            side_effect=lambda prompts: [fake_output] * len(prompts)
        )

        sentences = [
            "Hello, how are you?",
            "What a nice day!",
            "I hear that llamas are very popular now.",
        ]
        vectors = embeddings.encode(inputs=sentences)

        for vector in vectors:
            assert len(vector) == 10
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/models/image_generation/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/models/llms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/unit/models/llms/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/unit/models/llms/test_base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class TestLLM:
    """Tests for behaviour exercised through `DummyLLM`."""

    def test_offline_batch_generate_raise_distilabel_not_implemented_error(
        self,
    ) -> None:
        """Calling `offline_batch_generate` on `DummyLLM` must raise the dedicated error."""
        dummy_llm = DummyLLM()

        expected_error = DistilabelNotImplementedError
        with pytest.raises(expected_error):
            dummy_llm.offline_batch_generate()
class DummyUserDetail(BaseModel):
    """Small pydantic model with `name` and `age` that also keeps a raw response.

    The value passed as `_raw_response` (if any) is stored in a private attribute,
    outside of the validated model fields.
    """

    # Validated model fields.
    name: str
    age: int
    # Private attribute: set manually in `__init__`, `None` when not provided.
    _raw_response: Any = PrivateAttr()

    def __init__(self, **data: Any) -> None:
        """Validate the fields and remember the optional `_raw_response` value.

        Args:
            **data: Field values, optionally including a `_raw_response` entry.
        """
        raw_response = data.get("_raw_response")
        super().__init__(**data)
        # Private attributes can only be assigned once the model is initialised.
        self._raw_response = raw_response
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/unit/steps/argilla/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/unit/steps/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/steps/clustering/test_dbscan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class TestDBSCAN:
    """Unit tests for the ``DBSCAN`` clustering step."""

    def test_process(self) -> None:
        """Every row of the first output batch is labelled ``-1``."""
        projections = [
            [0.1, -0.4],
            [-0.3, 0.9],
            [0.6, 0.2],
            [-0.2, -0.6],
            [0.9, 0.1],
            [0.4, -0.7],
            [-0.5, 0.3],
            [0.7, 0.5],
            [-0.1, -0.9],
        ]

        dbscan = DBSCAN(n_jobs=1, eps=0.5, min_samples=5)
        dbscan.load()

        batches = dbscan.process(
            inputs=[{"projection": point} for point in projections]
        )
        first_batch = next(batches)

        # Stops at the first row whose label differs from -1, like `all()`.
        for row in first_batch:
            assert row["cluster_label"] == -1
class TestUMAP:
    """Unit tests for the ``UMAP`` projection step."""

    def test_process(self) -> None:
        """Each output row carries an ``np.ndarray`` of ``n_components`` items."""
        n_components = 2
        embeddings = [
            [0.1, -0.4, 0.7, 0.2],
            [-0.3, 0.9, 0.1, -0.5],
            [0.6, 0.2, -0.1, 0.8],
            [-0.2, -0.6, 0.4, 0.3],
            [0.9, 0.1, -0.3, -0.2],
            [0.4, -0.7, 0.6, 0.1],
            [-0.5, 0.3, -0.2, 0.9],
            [0.7, 0.5, -0.4, -0.1],
            [-0.1, -0.9, 0.8, 0.6],
        ]

        umap_step = UMAP(n_jobs=1, n_components=n_components)
        umap_step.load()

        rows = next(
            umap_step.process(
                inputs=[{"embedding": vector} for vector in embeddings]
            )
        )

        # Type check over the whole batch first, then the dimensionality check.
        projections = [row["projection"] for row in rows]
        assert all(isinstance(projection, np.ndarray) for projection in projections)
        assert all(len(projection) == n_components for projection in projections)
class TestCombineOutputs:
    """Unit tests for the ``CombineOutputs`` step."""

    def test_process(self) -> None:
        """Two single-row batches are merged into one row.

        Regular columns from both rows end up side by side; in the metadata,
        the key present in both rows (``model``) becomes a list while the
        keys present in only one row keep their scalar value.
        """
        first_branch = [
            {
                "a": 1,
                "b": 2,
                DISTILABEL_METADATA_KEY: {"model": "model-1", "a": 1},
            }
        ]
        second_branch = [
            {
                "c": 3,
                "d": 4,
                DISTILABEL_METADATA_KEY: {"model": "model-2", "b": 1},
            }
        ]
        expected = [
            {
                "a": 1,
                "b": 2,
                "c": 3,
                "d": 4,
                DISTILABEL_METADATA_KEY: {
                    "model": ["model-1", "model-2"],
                    "a": 1,
                    "b": 1,
                },
            }
        ]

        step = CombineOutputs()
        merged = next(step.process(first_branch, second_branch))

        assert merged == expected
class TestKeepColumns:
    """Unit tests for the ``KeepColumns`` step."""

    def test_init(self) -> None:
        """The configured columns are exposed as both inputs and outputs."""
        keep_columns = KeepColumns(
            name="keep-columns",
            columns=["a", "b"],
            pipeline=Pipeline(name="unit-test-pipeline"),
        )
        assert keep_columns.inputs == ["a", "b"]
        assert keep_columns.outputs == ["a", "b"]

    def test_process(self) -> None:
        """Columns that are not listed are dropped from every row."""
        keep_columns = KeepColumns(
            name="keep-columns",
            columns=["a", "b"],
            pipeline=Pipeline(name="unit-test-pipeline"),
        )
        output = next(keep_columns.process([{"a": 1, "b": 2, "c": 3, "d": 4}]))
        assert output == [{"a": 1, "b": 2}]

    def test_process_preserve_order(self) -> None:
        """Output keys follow the order of ``columns``, not of the input row."""
        keep_columns = KeepColumns(
            name="keep-columns",
            columns=["b", "a"],
            pipeline=Pipeline(name="unit-test-pipeline"),
        )
        output = next(keep_columns.process([{"a": 1, "b": 2, "c": 3, "d": 4}]))
        assert output == [{"b": 2, "a": 1}]
        # Dict equality ignores key order, so the assertion above alone can
        # never fail on ordering; compare the key sequence explicitly.
        assert [list(row) for row in output] == [["b", "a"]]
class TestMergeColumns:
    """Unit tests for the ``MergeColumns`` step."""

    @pytest.mark.parametrize(
        "output_column, expected",
        [
            (None, "merged_column"),
            ("queries", "queries"),
        ],
    )
    def test_init(self, output_column: Optional[str], expected: str) -> None:
        """``outputs`` is ``merged_column`` when no ``output_column`` is given."""
        source_columns = ["query", "queries"]
        merger = MergeColumns(columns=source_columns, output_column=output_column)

        assert merger.inputs == ["query", "queries"]
        assert merger.outputs == [expected]

    @pytest.mark.parametrize(
        "columns",
        [
            [{"query": 1, "queries": 2}],
            [{"query": 1, "queries": [2]}],
            [{"query": [1], "queries": [2]}],
        ],
    )
    def test_process(self, columns: List[Dict[str, Any]]) -> None:
        """Scalar and list values are merged into one flat list."""
        merger = MergeColumns(columns=["query", "queries"])

        batches = merger.process(columns)
        merged: List[Dict[str, Any]] = next(batches)

        assert merged == [{"merged_column": [1, 2]}]
def test_merge_distilabel_metadata() -> None:
    """Dict metadata: values for the same key are gathered into a list."""
    first_row = {DISTILABEL_METADATA_KEY: {"a": 1, "b": 1}}
    second_row = {DISTILABEL_METADATA_KEY: {"a": 2, "b": 2}}

    merged = merge_distilabel_metadata(first_row, second_row)

    assert merged == {"a": [1, 2], "b": [1, 2]}


def test_merge_distilabel_metadata_list() -> None:
    """List metadata: the per-row lists are concatenated in row order."""
    first_metadata = [
        {"a": 1.0, "b": 1.0},
        {"a": 1.1, "b": 1.1},
        {"a": 1.2, "b": 1.2},
    ]
    second_metadata = [
        {"a": 2.0, "b": 2.0},
        {"a": 2.1, "b": 2.1},
        {"a": 2.2, "b": 2.2},
    ]
    # Snapshot the expectation before the call so it stays independent of
    # anything the function might do to its arguments.
    expected = [dict(entry) for entry in (*first_metadata, *second_metadata)]

    merged = merge_distilabel_metadata(
        {DISTILABEL_METADATA_KEY: first_metadata},
        {DISTILABEL_METADATA_KEY: second_metadata},
    )

    assert merged == expected
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/steps/embeddings/test_embedding_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class TestEmbeddingGeneration:
    """Unit tests for the ``EmbeddingGeneration`` step."""

    def test_process(self) -> None:
        """Each input row comes back with its text, an embedding and the model name."""
        model_name = "sentence-transformers/all-MiniLM-L6-v2"
        texts = [
            "Hello, how are you?",
            "What a nice day!",
            "I hear that llamas are very popular now.",
        ]

        step = EmbeddingGeneration(
            embeddings=SentenceTransformerEmbeddings(model=model_name)
        )

        step.load()

        results = next(step.process(inputs=[{"text": text} for text in texts]))

        step.unload()

        # zip() stops at the shorter iterable, so without this check an empty
        # or truncated result batch would make the loop below pass vacuously.
        assert len(results) == len(texts)

        for result, text in zip(results, texts):
            # 384 is the embedding length this test expects from the model.
            assert len(result["embedding"]) == 384
            assert result["text"] == text
            assert result["model_name"] == model_name
14 | 15 | -------------------------------------------------------------------------------- /tests/unit/steps/formatting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/unit/steps/formatting/test_conversation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class TestConversationTemplate:
    """Unit tests for the ``ConversationTemplate`` formatting step."""

    def test_process(self) -> None:
        """A ``conversation`` column is added from ``instruction`` and ``response``."""
        row = {"instruction": "Hello", "response": "Hi"}
        expected = [
            {
                "instruction": "Hello",
                "response": "Hi",
                "conversation": [
                    {"role": "user", "content": "Hello"},
                    {"role": "assistant", "content": "Hi"},
                ],
            }
        ]

        step = ConversationTemplate(
            name="conversation_template",
            pipeline=Pipeline(name="unit-test"),
        )
        batches = step.process([row])

        assert next(batches) == expected
class TestLoadDataFromDicts:
    """Unit tests for the ``LoadDataFromDicts`` generator step."""

    # NOTE(review): all ten rows are the same dict, so the assertions below
    # cannot tell batches apart by content -- only count and flag are checked.
    data = [{"instruction": "test"}] * 10

    def test_init(self) -> None:
        """Constructor arguments are stored on the step unchanged."""
        rows: list[dict[str, str]] = self.data
        step = LoadDataFromDicts(
            name="task",
            pipeline=Pipeline(name="unit-test-pipeline"),
            data=rows,
            batch_size=10,
        )
        assert step.data == rows
        assert step.batch_size == 10

    def test_process(self) -> None:
        """Batches of one row are yielded; only the final one is flagged ``True``."""
        batch_size = 1
        rows: list[dict[str, str]] = self.data
        step = LoadDataFromDicts(
            name="task",
            pipeline=Pipeline(name="unit-test-pipeline"),
            data=rows,
            batch_size=batch_size,
        )

        batches = step.process()
        final_index = len(self.data) - batch_size

        # Every batch before the final one carries the flag ``False``.
        for row in self.data[:final_index]:
            assert next(batches) == ([row], False)
        assert next(batches) == ([self.data[final_index]], True)

        # The generator must be exhausted after the flagged batch.
        with pytest.raises(StopIteration):
            next(batches)
@pytest.mark.parametrize(
    "samples, size, batch_size, expected",
    [
        (10, 2, 4, [4, 4, 2]),
        (7, 5, 6, [6, 1]),
        (20, 5, 20, [20]),
        (20, 50, 8, [8, 8, 4]),
    ],
)
def test_generator_and_sampler(
    samples: int, size: int, batch_size: int, expected: List[int]
):
    """``DataSampler`` yields batches whose lengths match ``expected``."""
    pool = [{"sample": f"sample {index}"} for index in range(30)]
    sampler = DataSampler(
        data=pool,
        size=size,
        samples=samples,
        batch_size=batch_size,
    )
    sampler.load()

    # Each yielded item is indexable; its first element is the batch itself.
    batches = [item[0] for item in sampler.process()]

    assert len(batches) == len(expected)
    assert len(batches[0]) == batch_size
    for batch, wanted_length in zip(batches, expected):
        assert len(batch) == wanted_length
# Shared fixture rows for both tests below.
data = [{"instruction": "Tell me a joke."}] * 10


@pytest.mark.parametrize("dataset", (data, Dataset.from_list(data), pd.DataFrame(data)))
def test_make_generator_step(
    dataset: Union[Dataset, pd.DataFrame, List[Dict[str, str]]],
) -> None:
    """A generator step is built from a list, a ``Dataset`` or a ``DataFrame``."""
    batch_size = 5
    generator_step = make_generator_step(
        dataset, batch_size=batch_size, output_mappings={"instruction": "other"}
    )
    generator_step.load()

    first_item = next(generator_step.process())
    assert len(first_item[0]) == batch_size

    # Plain lists stay on `.data`; the other two inputs are expected to be
    # held as a `Dataset` on `._dataset`.
    if not isinstance(dataset, (pd.DataFrame, Dataset)):
        assert isinstance(generator_step.data, list)
    else:
        assert isinstance(generator_step._dataset, Dataset)

    assert generator_step.output_mappings == {"instruction": "other"}


def test_make_generator_step_with_pipeline() -> None:
    """The pipeline passed to ``make_generator_step`` is attached to the step."""
    pipeline = Pipeline()
    generator_step = make_generator_step(data, pipeline=pipeline)
    assert generator_step.pipeline == pipeline
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/unit/steps/tasks/apigen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/steps/tasks/apigen/_sample_lib/final_velocity.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def final_velocity(initial_velocity: float, acceleration: float, time: float) -> float:
    """Calculates the final velocity of an object given its initial velocity, acceleration, and time.

    Args:
        initial_velocity: The initial velocity of the object.
        acceleration: The acceleration of the object.
        time: The time elapsed.

    Returns:
        The final velocity
    """
    # The return annotation used to be `int`, but every parameter is a float
    # and the expression below yields a float.
    # NOTE(review): docstring text is kept verbatim -- this sample function
    # looks like a fixture whose docstring may be read by the code under
    # test; confirm before rewording it.
    return initial_velocity + acceleration * time
def get_value(matrix: List[List[int]], indices: Tuple[int, int]) -> Optional[int]:
    """Gets the value at the specified index in the matrix.

    Args:
        matrix: A list of lists representing the matrix.
        indices: A tuple containing the row and column indices.
    """
    # NOTE(review): docstring text is kept verbatim -- this sample function
    # looks like a fixture whose docstring may be read by the code under
    # test; confirm before rewording it.
    row_index, col_index = indices

    # Guard clauses: an index outside [0, length) yields None. Negative
    # indices are rejected, so they never wrap around to the end.
    row_out_of_range = row_index < 0 or row_index >= len(matrix)
    if row_out_of_range:
        return None

    # The row length is only looked up once the row index is known to be valid.
    if col_index < 0 or col_index >= len(matrix[row_index]):
        return None

    return matrix[row_index][col_index]
class TestEvolComplexity:
    """Unit tests for the `EvolComplexity` task."""

    def test_mutation_templates(self, dummy_llm: LLM) -> None:
        """Check the constructor arguments and the default mutation templates."""
        task = EvolComplexity(
            name="task",
            llm=dummy_llm,
            num_evolutions=2,
            pipeline=Pipeline(name="unit-test-pipeline"),
        )

        # Constructor arguments must be stored untouched on the task.
        assert task.name == "task"
        assert task.llm is dummy_llm
        assert task.num_evolutions == 2

        # The task must use the module's MUTATION_TEMPLATES, which are
        # expected not to contain a "BREADTH" entry.
        templates = task.mutation_templates
        assert templates == MUTATION_TEMPLATES
        assert "BREADTH" not in templates
class TestEvolComplexityGenerator:
    """Unit tests for the `EvolComplexityGenerator` task."""

    def test_mutation_templates(self, dummy_llm: LLM) -> None:
        """Check the constructor arguments and the default mutation templates."""
        task = EvolComplexityGenerator(
            name="task",
            llm=dummy_llm,
            num_instructions=2,
            pipeline=Pipeline(name="unit-test-pipeline"),
        )

        # Constructor arguments must be stored untouched on the task.
        assert task.name == "task"
        assert task.llm is dummy_llm
        assert task.num_instructions == 2

        # The task must use the module's GENERATION_MUTATION_TEMPLATES, which
        # are expected not to contain a "BREADTH" entry.
        templates = task.mutation_templates
        assert templates == GENERATION_MUTATION_TEMPLATES
        assert "BREADTH" not in templates
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/unit/steps/tasks/magpie/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/steps/tasks/math_shepherd/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/steps/tasks/structured_outputs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/unit/steps/tasks/test_generate_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present, Argilla, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
@pytest.fixture(scope="module")
def transformers_llm() -> Generator[TransformersLLM, None, None]:
    """Yield a loaded `TransformersLLM`, created once per test module."""
    model = TransformersLLM(
        model="distilabel-internal-testing/tiny-random-mistral",
        cuda_devices=[],
    )
    model.load()

    yield model


class TestGenerateEmbeddings:
    """Unit tests for the `GenerateEmbeddings` task."""

    def test_process(self, transformers_llm: TransformersLLM) -> None:
        """Process a single row and check the embedding that is attached to it."""
        pipeline = Pipeline(name="unit-test-pipeline")
        task = GenerateEmbeddings(
            name="task",
            llm=transformers_llm,
            pipeline=pipeline,
        )

        # `process` is consumed as an iterator; only the first batch is needed.
        batches = task.process([{"text": "Hello, how are you?"}])
        first_row = next(batches)[0]

        assert "embedding" in first_row
        assert len(first_row["embedding"]) == 128
@pytest.mark.parametrize(
    "max_length, text, tokenizer, expected",
    [
        (
            10,
            "This is a sample text that is longer than 10 characters",
            None,
            "This is a ",
        ),
        (
            4,
            "This is a sample text that is longer than 10 characters",
            "teknium/OpenHermes-2.5-Mistral-7B",
            "This is a sample",
        ),
    ],
)
def test_truncate_row(
    max_length: int, text: str, tokenizer: Optional[str], expected: str
) -> None:
    """Truncate the `text` column with and without a tokenizer configured."""
    step = TruncateTextColumn(
        column="text", max_length=max_length, tokenizer=tokenizer
    )
    step.load()

    # `process` is consumed as an iterator; only the first batch is inspected.
    first_batch = next(step.process([{"text": text}]))

    assert first_batch == [{"text": expected}]
def test_distilabel_user_error() -> None:
    """Check the string form of `DistilabelUserError` with and without `page`."""
    # Without a page, the message is rendered as-is.
    plain_error = DistilabelUserError("This is an error message.")
    assert str(plain_error) == "This is an error message."

    # With a page, a pointer to the documentation is appended to the message.
    error_with_page = DistilabelUserError(
        "This is an error message.", page="sections/getting_started/faq/"
    )
    expected_text = "This is an error message.\n\nFor further information visit 'https://distilabel.argilla.io/latest/sections/getting_started/faq/'"
    assert str(error_with_page) == expected_text
def test_list_files_in_dir() -> None:
    """Create 20 numbered files and check they are listed in creation order."""
    with tempfile.TemporaryDirectory() as temp_dir:
        directory = Path(temp_dir)

        # Files are named 0.txt .. 19.txt and recorded in the order they are
        # written; the listing is asserted to match that order exactly.
        expected_files = []
        for index in range(20):
            target = directory / f"{index}.txt"
            target.write_text("hello")
            expected_files.append(target)

        assert list_files_in_dir(directory) == expected_files
@pytest.mark.parametrize(
    "input, expected",
    [
        ([["A"], ["B"]], ["A", "B"]),
        ([["A", "B"], ["C", "D"]], ["B", "D"]),
    ],
)
def test_flatten_responses(input: List[List[str]], expected: List[str]) -> None:
    """Check `flatten_responses` against the parametrized expectations."""
    # The parameter name `input` is bound to the parametrize argnames above,
    # so it is kept even though it shadows the builtin.
    flattened = flatten_responses(input)

    assert flattened == expected
def test_extra_serializable_fields() -> None:
    """Check the type info reported for a nested `_Serializable` attribute."""

    class DummyAttribute(BaseModel, _Serializable):
        pass

    class Dummy(BaseModel, _Serializable):
        attr: DummyAttribute

    # The expected entry names the module and class of the nested attribute.
    expected_fields = [
        {
            "attr": {
                "type_info": {
                    "module": "tests.unit.utils.test_serialization",
                    "name": "DummyAttribute",
                }
            }
        }
    ]

    instance = Dummy(attr=DummyAttribute())

    assert _extra_serializable_fields(instance) == expected_fields
def test_is_parameter_annotated_with() -> None:
    """Check metadata detection on annotated and plain parameters."""

    def dummy_function(arg: Annotated[int, "unit-test"], arg2: int) -> None:
        pass

    parameters = inspect.signature(dummy_function).parameters
    annotated_param = parameters["arg"]
    plain_param = parameters["arg2"]

    # Only the exact metadata attached through `Annotated` should match.
    assert is_parameter_annotated_with(annotated_param, "hello") is False
    assert is_parameter_annotated_with(annotated_param, "unit-test") is True
    # A parameter without `Annotated` metadata never matches.
    assert is_parameter_annotated_with(plain_param, "unit-test") is False