├── knowledge
    ├── arts
    │   ├── .gitignore
    │   └── music
    │   │   └── fandom
    │   │       └── swifties
    │   │           ├── attribution.txt
    │   │           └── qna.yaml
    ├── history
    │   └── .gitignore
    ├── science
    │   ├── .gitignore
    │   └── animals
    │   │   └── birds
    │   │       └── black_capped_chickadee
    │   │           ├── attribution.txt
    │   │           └── qna.yaml
    ├── engineering
    │   └── .gitignore
    ├── geography
    │   └── .gitignore
    ├── linguistics
    │   └── .gitignore
    ├── mathematics
    │   └── .gitignore
    ├── philosophy
    │   └── .gitignore
    ├── religion
    │   └── .gitignore
    ├── technology
    │   └── .gitignore
    └── miscellaneous_unknown
    │   └── .gitignore
├── compositional_skills
    ├── arts
    │   └── .gitignore
    ├── engineering
    │   └── .gitignore
    ├── geography
    │   └── .gitignore
    ├── history
    │   └── .gitignore
    ├── linguistics
    │   ├── .gitignore
    │   └── synonyms
    │   │   ├── attribution.txt
    │   │   └── qna.yaml
    ├── philosophy
    │   └── .gitignore
    ├── religion
    │   └── .gitignore
    ├── science
    │   └── .gitignore
    ├── technology
    │   └── .gitignore
    ├── grounded
    │   ├── arts
    │   │   └── .gitignore
    │   ├── history
    │   │   └── .gitignore
    │   ├── science
    │   │   └── .gitignore
    │   ├── engineering
    │   │   └── .gitignore
    │   ├── geography
    │   │   └── .gitignore
    │   ├── linguistics
    │   │   ├── .gitignore
    │   │   ├── writing
    │   │   │   └── rewriting
    │   │   │   │   ├── attribution.txt
    │   │   │   │   └── qna.yaml
    │   │   └── inclusion
    │   │   │   ├── attribution.txt
    │   │   │   └── qna.yaml
    │   ├── philosophy
    │   │   └── .gitignore
    │   ├── religion
    │   │   └── .gitignore
    │   └── technology
    │   │   └── .gitignore
    └── miscellaneous_unknown
    │   └── .gitignore
├── docs
    ├── assets
    │   ├── backend.png
    │   ├── qna_ex.png
    │   ├── comp_skill.png
    │   ├── new_file.png
    │   ├── tax_labels.png
    │   ├── linguistics.png
    │   ├── review-process.png
    │   ├── simile_folder.png
    │   ├── triage_schedule.png
    │   └── add_skill_branch.png
    ├── taxonomy_diagram.png
    ├── triaging
    │   ├── insights
    │   │   ├── images
    │   │   │   ├── gh-bot.png
    │   │   │   ├── gh-workflow.png
    │   │   │   ├── login-screen.png
    │   │   │   ├── gh-message_no-violation.png
    │   │   │   ├── InstructLab-Insights-report.png
    │   │   │   ├── gh-message_analysis-in-progress.png
    │   │   │   └── gh-message_potential-violation.png
    │   │   └── taxonomy-insights-documentation.md
    │   ├── safe-responses.md
    │   └── triaging-contributions.md
    ├── taxonomy_diagram.md
    ├── template_qna.yaml
    ├── README.md
    ├── contributing_via_GH_UI.md
    ├── knowledge-contribution-guide.md
    ├── KNOWLEDGE_GUIDE.md
    └── SKILLS_GUIDE.md
├── scripts
    ├── requirements.txt
    └── check-yaml.py
├── MAINTAINERS.md
├── governance.md
├── CONTRIBUTOR_ROLES.md
├── SECURITY.md
├── .github
    ├── workflows
    │   ├── actionlint.dockerfile
    │   ├── matchers
    │   │   └── actionlint.json
    │   ├── docs.yml
    │   ├── actionlint.yml
    │   ├── update-CoC-report.yml
    │   ├── insight.yml
    │   ├── label.yml
    │   └── lint.yml
    ├── dependabot.yml
    ├── labeler.yml
    ├── ISSUE_TEMPLATE
    │   ├── proposal.md
    │   └── bug_report.md
    └── pull_request_template.md
├── CODE_OF_CONDUCT.md
├── .markdownlint-cli2.yaml
├── Makefile
├── foundational_skills
    └── reasoning
    │   ├── linguistics_reasoning
    │       ├── odd_one_out
    │       │   └── qna.yaml
    │       ├── object_identification
    │       │   └── qna.yaml
    │       └── logical_sequence_of_words
    │       │   └── qna.yaml
    │   ├── logical_reasoning
    │       ├── causal
    │       │   └── qna.yaml
    │       ├── tabular
    │       │   └── qna.yaml
    │       └── general
    │       │   └── qna.yaml
    │   ├── mathematical_reasoning
    │       └── qna.yaml
    │   ├── unconventional_reasoning
    │       └── lower_score_wins
    │       │   └── qna.yaml
    │   ├── common_sense_reasoning
    │       └── qna.yaml
    │   ├── temporal_reasoning
    │       └── qna.yaml
    │   └── theory_of_mind
    │       └── qna.yaml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
└── README.md


/knowledge/arts/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/history/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/science/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/engineering/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/geography/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/linguistics/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/mathematics/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/philosophy/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/religion/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/technology/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/arts/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/engineering/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/geography/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/history/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/linguistics/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/philosophy/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/religion/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/science/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/technology/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/knowledge/miscellaneous_unknown/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/arts/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/history/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/science/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/engineering/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/geography/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/linguistics/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/philosophy/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/religion/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/technology/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/compositional_skills/miscellaneous_unknown/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/assets/backend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/backend.png


--------------------------------------------------------------------------------
/docs/assets/qna_ex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/qna_ex.png


--------------------------------------------------------------------------------
/scripts/requirements.txt:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | 
3 | instructlab-schema>=0.4.0
4 | 


--------------------------------------------------------------------------------
/docs/assets/comp_skill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/comp_skill.png


--------------------------------------------------------------------------------
/docs/assets/new_file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/new_file.png


--------------------------------------------------------------------------------
/docs/assets/tax_labels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/tax_labels.png


--------------------------------------------------------------------------------
/docs/taxonomy_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/taxonomy_diagram.png


--------------------------------------------------------------------------------
/docs/assets/linguistics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/linguistics.png


--------------------------------------------------------------------------------
/docs/assets/review-process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/review-process.png


--------------------------------------------------------------------------------
/docs/assets/simile_folder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/simile_folder.png


--------------------------------------------------------------------------------
/docs/assets/triage_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/triage_schedule.png


--------------------------------------------------------------------------------
/docs/assets/add_skill_branch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/assets/add_skill_branch.png


--------------------------------------------------------------------------------
/docs/triaging/insights/images/gh-bot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/triaging/insights/images/gh-bot.png


--------------------------------------------------------------------------------
/docs/triaging/insights/images/gh-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/triaging/insights/images/gh-workflow.png


--------------------------------------------------------------------------------
/docs/triaging/insights/images/login-screen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/triaging/insights/images/login-screen.png


--------------------------------------------------------------------------------
/MAINTAINERS.md:
--------------------------------------------------------------------------------
1 | For a complete list of InstructLab project maintainers, see [Maintainers](https://github.com/instructlab/community/blob/main/MAINTAINERS.md).


--------------------------------------------------------------------------------
/docs/triaging/insights/images/gh-message_no-violation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/triaging/insights/images/gh-message_no-violation.png


--------------------------------------------------------------------------------
/docs/triaging/insights/images/InstructLab-Insights-report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/triaging/insights/images/InstructLab-Insights-report.png


--------------------------------------------------------------------------------
/docs/triaging/insights/images/gh-message_analysis-in-progress.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/triaging/insights/images/gh-message_analysis-in-progress.png


--------------------------------------------------------------------------------
/docs/triaging/insights/images/gh-message_potential-violation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/instructlab/taxonomy/HEAD/docs/triaging/insights/images/gh-message_potential-violation.png


--------------------------------------------------------------------------------
/governance.md:
--------------------------------------------------------------------------------
1 | # InstructLab Governance
2 | 
3 | For information about how the InstructLab project governance operates, see [InstructLab Governance](https://github.com/instructlab/community/blob/main/GOVERNANCE.md).
4 | 


--------------------------------------------------------------------------------
/CONTRIBUTOR_ROLES.md:
--------------------------------------------------------------------------------
1 | # Contributor roles for InstructLab
2 | 
3 | For information about contributor roles, see [Contributor roles for InstructLab](https://github.com/instructlab/community/blob/main/CONTRIBUTOR_ROLES.md).


--------------------------------------------------------------------------------
/compositional_skills/grounded/linguistics/writing/rewriting/attribution.txt:
--------------------------------------------------------------------------------
1 | Title of work: Rewrite complex texts in an easier way
2 | Link to work: -
3 | License of the work: CC-BY-SA-4.0
4 | Creator names: Alicia Jiménez Avilés


--------------------------------------------------------------------------------
/compositional_skills/linguistics/synonyms/attribution.txt:
--------------------------------------------------------------------------------
1 | Title of work: To teach a language model how to get a certain number of synonyms.
2 | Link to work: -
3 | License of the work: CC BY-NC-SA 4.0
4 | Creator names: Yanping Liu
5 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | You can find information on how to report a potential security vulnerability, as well as where to subscribe to receive security alerts, on the project's [Security Page](https://github.com/instructlab/.github/blob/main/SECURITY.md).
2 | 


--------------------------------------------------------------------------------
/.github/workflows/actionlint.dockerfile:
--------------------------------------------------------------------------------
1 | # Dependabot cannot update docker image references inside workflow files,
2 | # so we use this indirection: dependabot can update this Dockerfile instead.
3 | FROM rhysd/actionlint:1.7.7@sha256:887a259a5a534f3c4f36cb02dca341673c6089431057242cdc931e9f133147e9
4 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # InstructLab/taxonomy - Code of Conduct and Covenant
2 | 
3 | This project adheres to the [InstructLab - Code of Conduct and Covenant](https://github.com/instructlab/community/blob/main/CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code.
4 | 


--------------------------------------------------------------------------------
/knowledge/arts/music/fandom/swifties/attribution.txt:
--------------------------------------------------------------------------------
1 | Title of work: Swifties
2 | Link to work: https://en.wikipedia.org/wiki/Swifties
3 | Revision: https://en.wikipedia.org/w/index.php?title=Swifties&oldid=1221036610
4 | License of the work: CC-BY-SA-4.0
5 | Creator names: Wikipedia Authors


--------------------------------------------------------------------------------
/knowledge/science/animals/birds/black_capped_chickadee/attribution.txt:
--------------------------------------------------------------------------------
1 | Title of work: Black-capped chickadee
2 | Link to work: https://en.wikipedia.org/wiki/Black-capped_chickadee
3 | Revision: https://en.wikipedia.org/w/index.php?title=Black-capped_chickadee&oldid=1219937468
4 | License of the work: CC-BY-SA-4.0
5 | Creator names: Wikipedia Authors
6 | 


--------------------------------------------------------------------------------
/.markdownlint-cli2.yaml:
--------------------------------------------------------------------------------
 1 | config:
 2 |   line-length: false
 3 |   no-emphasis-as-header: false
 4 |   first-line-heading: false
 5 |   code-block-style: false
 6 |   no-duplicate-header: false
 7 |   single-trailing-newline: false
 8 |   no-bare-urls: false
 9 | globs:
10 |   - "**/*.md"
11 | ignores:
12 |   - ".github/**"
13 |   - ".tox/**"
14 |   - "venv/**"
15 |   - ".venv/**"
16 |   - "knowledge/knowledge_domains.md"
17 | 


--------------------------------------------------------------------------------
/.github/workflows/matchers/actionlint.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "problemMatcher": [
 3 |     {
 4 |       "owner": "actionlint",
 5 |       "pattern": [
 6 |         {
 7 |           "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
 8 |           "file": 1,
 9 |           "line": 2,
10 |           "column": 3,
11 |           "message": 4,
12 |           "code": 5
13 |         }
14 |       ]
15 |     }
16 |   ]
17 | }
18 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | # GitHub Dependabot configuration file
 4 | version: 2
 5 | updates:
 6 | 
 7 |   # Maintain dependencies for GitHub Actions
 8 |   - package-ecosystem: "github-actions"
 9 |     directory: "/"
10 |     schedule:
11 |       interval: "daily"
12 |   - package-ecosystem: "docker"
13 |     directory: "/.github/workflows"
14 |     schedule:
15 |       interval: "daily"
16 | 
17 |   # Maintain dependencies for Python scripts
18 |   - package-ecosystem: "pip"
19 |     directory: "/scripts"
20 |     schedule:
21 |       interval: "daily"
22 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/linguistics/inclusion/attribution.txt:
--------------------------------------------------------------------------------
 1 | Title of work: Master-slave (technology)
 2 | Link to work: https://en.wikipedia.org/wiki/Master%E2%80%93slave_(technology)
 3 | License of the work: CC-BY-SA-4.0
 4 | Creator names: Wikipedia Authors
 5 | 
 6 | Title of work: Whitelist
 7 | Link to work: https://en.wikipedia.org/wiki/Whitelist
 8 | License of the work: CC-BY-SA-4.0
 9 | Creator names: Wikipedia Authors
10 | 
11 | 
12 | Title of work: Blacklist (computing)
13 | Link to work: https://en.wikipedia.org/wiki/Blacklist_(computing)
14 | License of the work: CC-BY-SA-4.0
15 | Creator names: Wikipedia Authors


--------------------------------------------------------------------------------
/.github/labeler.yml:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | ci:
 4 |   - changed-files:
 5 |       - any-glob-to-any-file:
 6 |           - scripts/**
 7 |           - .github/workflows/**
 8 |           - .github/*.yml
 9 | 
10 | documentation:
11 |   - changed-files:
12 |       - any-glob-to-any-file:
13 |           - "*.md"
14 |           - docs/**
15 | 
16 | knowledge:
17 |   - changed-files:
18 |       - any-glob-to-any-file:
19 |           - knowledge/**
20 | 
21 | skill:
22 |   - changed-files:
23 |       - any-glob-to-any-file:
24 |           - compositional_skills/**
25 | 
26 | triage-needed:
27 |   - changed-files:
28 |       - any-glob-to-any-file:
29 |           - compositional_skills/**
30 |           - knowledge/**
31 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: help
 2 | help:
 3 | 	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
 4 | 
 5 | #
 6 | # If you want to see the full commands, run:
 7 | #   NOISY_BUILD=y make
 8 | #
 9 | ifeq ($(NOISY_BUILD),)
10 |     ECHO_PREFIX=@
11 |     CMD_PREFIX=@
12 |     PIPE_DEV_NULL=> /dev/null 2> /dev/null
13 | else
14 |     ECHO_PREFIX=@\#
15 |     CMD_PREFIX=
16 |     PIPE_DEV_NULL=
17 | endif
18 | 
19 | .PHONY: md-lint
20 | md-lint: ## Lint markdown files
21 | 	$(ECHO_PREFIX) printf "  %-12s ./...\n" "[MD LINT]"
22 | 	$(CMD_PREFIX) podman run --rm -v $(CURDIR):/workdir --security-opt label=disable docker.io/davidanson/markdownlint-cli2:latest > /dev/null
23 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/proposal.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Proposal
 3 | about: Create a contribution proposal
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the proposed contribution to the taxonomy**
11 | 
12 | <!-- A concise description of what the proposed contribution would bring, replace "..." in the bullet list. -->
13 | 
14 | - ...
15 | - ...
16 | - ...
17 | 
18 | **Input given at the prompt**
19 | 
20 | <!-- What you entered, replace "..." -->
21 | 
22 | ```
23 |    ...
24 | ```
25 | 
26 | **Response from the current model**
27 | 
28 | <!-- What you received from the current model in response to your input, 
29 | replace "..." -->
30 | 
31 | ```
32 |   ...
33 | ```
34 | 
35 | **Response that you would expect instead with the contribution**
36 | <!-- What you expect to receive instead with the finetuned model, replace "...". -->
37 | 
38 | ```
39 |   ...
40 | ```
41 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/linguistics_reasoning/odd_one_out/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: 'C. important. Remaining are synonyms of each other.
 4 | 
 5 |     '
 6 |   question: 'Which word is the odd man out? Also give an explanation why
 7 | 
 8 |     A. trivial
 9 | 
10 |     B. unimportant
11 | 
12 |     C. important
13 | 
14 |     D. insignificant
15 | 
16 |     '
17 | - answer: 'D. Book. Rest are all parts of a book.
18 | 
19 |     '
20 |   question: 'Which word does NOT belong with the others? Also give an explanation
21 |     why
22 | 
23 |     A. index
24 | 
25 |     B. glossary
26 | 
27 |     C. chapter
28 | 
29 |     D. book'
30 | - answer: 'A. hate. Rest are positive emotions.
31 | 
32 |     '
33 |   question: 'Which word is the odd man out? Also give an explanation why
34 | 
35 |     A. hate
36 | 
37 |     B. fondness
38 | 
39 |     C. liking
40 | 
41 |     D. attachment
42 | 
43 |     '
44 | task_description: 'To teach a language model about logical odd-one-out.
45 | 
46 |   '
47 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug/Problem report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | <!-- If you want to report a problem with the current model or taxonomy, please fill out the following questionnaire. If the questionnaire doesn't match the type of problem you want to report, just delete the sections related to the model. -->
11 | 
12 | **Describe the bug/problem**
13 | 
14 | <!-- A concise description of what the problem is, replace "..." in the bullet list. -->
15 | 
16 | - ...
17 | - ...
18 | - ...
19 | 
20 | **Input given at the prompt**
21 | 
22 | <!-- What you entered, replace "..." -->
23 | 
24 | ```
25 |    ...
26 | ```
27 | 
28 | **Response that was received from the current model**
29 | 
30 | <!-- What you received from the current model in response to your input, 
31 | replace "..." -->
32 | 
33 | ```
34 |   ...
35 | ```
36 | 
37 | **Response that you expected instead**
38 | <!-- What you expected to receive instead, replace "...". -->
39 | 
40 | ```
41 |   ...
42 | ```
43 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/linguistics_reasoning/object_identification/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: 'The list contains two distinct colors - blue and pink.
 4 | 
 5 |     '
 6 |   question: 'How many colors are there in this list: four, five, twenty six, blue,
 7 |     alpha, pink.
 8 | 
 9 |     '
10 | - answer: 'There are 2 human body parts: bone and tongue. Depending on whether you consider
11 |     nails to be fingernails, you could also include them as a third body part.
12 | 
13 |     '
14 |   question: 'How many human body parts in this list and what are they: bone, table,
15 |     moon, tongue, mercury, postulate, America, nails.'
16 | - answer: "The following items in the list are associated with both humans and animals:\
17 |     \ intestine, cheating. \n"
18 |   question: 'In the list which items are associated with both humans and animals:
19 |     wisdom, utility, microscope, intestine, cheating, divorce, pulling, magnetism,
20 |     space, ludo, axiom?
21 | 
22 |     '
23 | task_description: 'To teach a language model about Object identification
24 | 
25 |   '
26 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/logical_reasoning/causal/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: 'While days tend to be longer in the summer, just because it is not summer
 4 |     doesn''t mean days are necessarily shorter.
 5 | 
 6 |     '
 7 |   question: 'If it is summer, then the days are longer. Are the days longer if it
 8 |     is not summer ?
 9 | 
10 |     '
11 | - answer: 'No, we cannot conclusively conclude that some cats are black based solely
12 |     on the given premises. The statement "some mammals are black" does not necessarily
13 |     guarantee that among those mammals are cats.
14 | 
15 |     '
16 |   question: If all cats are mammals and some mammals are black, can we conclude that
17 |     some cats are black?
18 | - answer: 'Yes, we can conclude that all squares have four sides based on the given
19 |     premises.
20 | 
21 |     '
22 |   question: 'If all squares are rectangles and a rectangle has four sides, can we
23 |     conclude that all squares have four sides?
24 | 
25 |     '
26 | task_description: To teach a language model about Logical Reasoning - causal relationships
27 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/mathematical_reasoning/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: 'A kilo of feathers is heavier than a pound of hammers. 1 kilo is equal
 4 |     to ~2.2046 pounds, so no matter what you are weighing 1 kilo will be heavier than
 5 |     1 pound
 6 | 
 7 |     '
 8 |   question: 'What is heavier? A kilo of feathers, or a pound of hammers?
 9 | 
10 |     '
11 | - answer: "100 servers. If 1 administrator can manage 10 servers, and we increase\
12 |     \ the number of administrators tenfold, we can assume that 10 administrators could handle 10 times\
13 |     \ the number of servers: 10 x 10 = 100. \n"
14 |   question: '10 servers require 1 administrator to manage.
15 | 
16 |     How many servers can I manage with 10 administrators?'
17 | - answer: 'You need either 1 x 2 socket subscription, or 16 x 2-core subscriptions.
18 | 
19 |     '
20 |   question: 'I have a server with 2 sockets, with 16 cores each. How many OpenShift
21 |     subscriptions do I need to purchase, with either 2 socket pair or 2 core pair
22 |     licensing model?
23 | 
24 | 
25 |     '
26 | task_description: "To teach a language model about mathematical operations. \n"
27 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/linguistics_reasoning/logical_sequence_of_words/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: 'Here is a meaningful sequence:
 4 | 
 5 |     1. Crime
 6 | 
 7 |     2. Police
 8 | 
 9 |     3. Judge
10 | 
11 |     4. Judgment
12 | 
13 |     5. Punishment
14 | 
15 |     '
16 |   question: "Arrange the words given below in a meaningful sequence.\n1.Police\_ 2.\
17 |     \ Punishment\_ 3. Crime\_ 4. Judge\_ 5. Judgment\n"
18 | - answer: 'Here is a meaningful sequence:
19 | 
20 |     1. Infection
21 | 
22 |     2. Doctor
23 | 
24 |     3. Consultation
25 | 
26 |     4. Treatment
27 | 
28 |     5. Recovery
29 | 
30 |     '
31 |   question: 'Arrange the following words in a meaningful sequence.
32 | 
33 |     1. Infection
34 | 
35 |     2. Consultation
36 | 
37 |     3. Doctor
38 | 
39 |     4. Treatment
40 | 
41 |     5. Recovery'
42 | - answer: 'Here is a meaningful sequence:
43 | 
44 |     1. Letters
45 | 
46 |     2. Word
47 | 
48 |     3. Phrase
49 | 
50 |     4. Sentence
51 | 
52 |     5. Paragraph
53 | 
54 |     '
55 |   question: 'Arrange the words given below in a meaningful sequence.
56 | 
57 |     a.Word b.Paragraph c.Sentence d.Letters e.Phrase
58 | 
59 |     '
60 | task_description: 'To teach a language model about the logical sequence of words
61 | 
62 |   '
63 | 


--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | name: Lint Markdown documents
 4 | 
 5 | on:
 6 |   push:
 7 |     branches:
 8 |       - "main"
 9 |     paths:
10 |       - '**/*.md'
11 |       - '.markdownlint-cli2.yaml'
12 |       - '.github/workflows/docs.yml' # This workflow
13 |   pull_request:
14 |     branches:
15 |       - "main"
16 |     paths:
17 |       - '**/*.md'
18 |       - '.markdownlint-cli2.yaml'
19 |       - '.github/workflows/docs.yml' # This workflow
20 | 
21 | env:
22 |   LC_ALL: en_US.UTF-8
23 | 
24 | defaults:
25 |   run:
26 |     shell: bash
27 | 
28 | permissions:
29 |   contents: read
30 | 
31 | jobs:
32 |   markdown-lint:
33 |     runs-on: ubuntu-latest
34 |     steps:
35 |       - name: "Harden Runner"
36 |         uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
37 |         with:
38 |           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
39 |       - name: "Checkout"
40 |         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
41 |         with:
42 |           fetch-depth: 0
43 |       - name: "Check Markdown documents"
44 |         uses: DavidAnson/markdownlint-cli2-action@05f32210e84442804257b2a6f20b273450ec8265 # v19.1.0
45 |         with:
46 |           globs: '**/*.md'
47 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/unconventional_reasoning/lower_score_wins/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: 'No, that is incorrect. In golf, the objective is to score as few points
 4 |     as possible through various methods such as driving, chipping, and putting. The
 5 |     player with the lowest score at the end of the round wins.
 6 | 
 7 |     '
 8 |   question: 'Part of golf is trying to get a higher point total than others. Yes or
 9 |     No?
10 | 
11 |     '
12 | - answer: "In rummy, the player with the lowest score wins. \n"
13 |   question: In the card game rummy does the player with the highest score or the lowest
14 |     score win?
15 | - answer: 'In most running competitions, including marathons, the winner is the runner
16 |     who completes the racecourse in the shortest amount of time. So, in this case,
17 |     the runner with a time of 4 hours and 20 minutes would win because they completed
18 |     the race faster than the runner with a time of 4 hours and 30 minutes.
19 | 
20 |     '
21 |   question: 'one runner has a time of 4h 30 and another a time of 4h 20, which one
22 |     wins?
23 | 
24 |     '
25 | task_description: 'To teach a language model about how different activities have
26 |   different performance measures and lower scores win in many sports, e.g. running,
27 |   golf and shooting.
28 | 
29 |   '
30 | 


--------------------------------------------------------------------------------
/docs/taxonomy_diagram.md:
--------------------------------------------------------------------------------
 1 | ## Taxonomy diagram
 2 | 
 3 | > [!Note]
 4 | > This diagram shows a subset of the taxonomy. It is not a complete representation.
 5 | 
 6 | ```mermaid
 7 |  flowchart TD;
 8 |    na[not accepting contributions\n at this time]:::na
 9 |    taxonomy --> foundational_skill & compositional_skills & knowledge
10 | 
11 |    foundational_skill:::na --> reasoning:::na
12 |    reasoning:::na --> common_sense_reasoning:::na
13 |    reasoning:::na --> mathematical_reasoning:::na
14 |    reasoning:::na --> theory_of_mind:::na
15 | 
16 |    compositional_skills --> engineering
17 |    compositional_skills --> grounded
18 |    compositional_skills --> linguistics
19 | 
20 |    grounded --> grounded/arts
21 |    grounded --> grounded/geography
22 |    grounded --> grounded/history
23 |    grounded --> grounded/science
24 | 
25 |    knowledge --> knowledge/arts
26 | 
27 |    knowledge --> knowledge/miscellaneous_unknown
28 |    knowledge --> knowledge/science
29 |    knowledge --> knowledge/technology
30 |    knowledge/science --> animals --> birds --> black_capped_chickadee --> black_capped_chikadee-a & black_capped_chikadee-q
31 |    knowledge/science --> astronomy --> constellations --> phoenix --> phoenix-a & phoenix-q
32 | 
33 |    black_capped_chikadee-a{attribution.txt}
34 |    black_capped_chikadee-q{qna.yaml}
35 |    phoenix-a{attribution.txt}
36 |    phoenix-q{qna.yaml}
37 |    classDef na fill:#EEE
38 | ```
39 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/common_sense_reasoning/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: "The answer is 3 killers. A person comes into the room and kills one of\
 4 |     \ the 3 killers, so 2 of the 3 killers remain. However the new person, having\
 5 |     \ killed one of the killers, is now a killer. Hence there are 3 killers in the\
 6 |     \ room. \n"
 7 |   question: 'There are 3 killers in a room. A new person comes into the room and kills
 8 |     one of the 3 people. How many killers are in the room now? Explain the answer
 9 |     step by step.
10 | 
11 |     '
12 | - answer: 'The answer is 4 hours. Since I am drying the shirts in the sun, it does
13 |     not matter if I dry 4 or 8 shirts, it will take the same amount of time.
14 | 
15 |     '
16 |   question: I am drying some shirts in a wide open space in the sun. If it takes 4
17 |     hours to dry 4 shirts, how many hours does it take to dry 8 shirts?
18 | - answer: 'The ball is now in the box. Jack put the ball in the basket. After he left,
19 |     Jill put the ball in the box, so it is now in the box.
20 | 
21 |     '
22 |   question: "Jack and Jill are in a room with a ball, box, and a basket. Jack put\
23 |     \ the ball in the basket and then went to school. After Jack left, Jill put the\
24 |     \ ball in the box and went to school. Where is the ball now? Explain step by step.\
25 |     \ \n"
26 | task_description: 'To teach a language model about step by step reasoning
27 | 
28 |   '
29 | 


--------------------------------------------------------------------------------
/.github/workflows/actionlint.yml:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | name: Lint GitHub Actions workflows
 4 | on:
 5 |   push:
 6 |     branches:
 7 |       - "main"
 8 |     paths:
 9 |       - '.github/workflows/*.ya?ml'
10 |       - '.github/workflows/actionlint.*' # This workflow
11 |   pull_request:
12 |     branches:
13 |       - "main"
14 |     paths:
15 |       - '.github/workflows/*.ya?ml'
16 |       - '.github/workflows/actionlint.*' # This workflow
17 | 
18 | env:
19 |   LC_ALL: en_US.UTF-8
20 | 
21 | defaults:
22 |   run:
23 |     shell: bash
24 | 
25 | permissions:
26 |   contents: read
27 | 
28 | jobs:
29 |   actionlint:
30 |     runs-on: ubuntu-latest
31 |     steps:
32 |       - name: "Harden Runner"
33 |         uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
34 |         with:
35 |           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
36 | 
37 |       - name: "Checkout"
38 |         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
39 |         with:
40 |           fetch-depth: 0
41 | 
42 |       - name: "Download actionlint"
43 |         run: |
44 |           docker build --tag actionlint - < .github/workflows/actionlint.dockerfile
45 | 
46 |       - name: "Check workflow files"
47 |         run: |
48 |           echo "::add-matcher::.github/workflows/matchers/actionlint.json"
49 |           docker run --volume="${PWD}:/repo" --workdir=/repo actionlint -color
50 | 


--------------------------------------------------------------------------------
/compositional_skills/linguistics/synonyms/qna.yaml:
--------------------------------------------------------------------------------
 1 | version: 3
 2 | created_by: yanpliu
 3 | seed_examples:
 4 |   - answer: 'Synonym for Attend is take part in
 5 | 
 6 |       '
 7 |     question: List a synonym for the word attend.
 8 |   - answer: 'Two synonyms for Attend are
 9 | 
10 |       take part in
11 | 
12 |       be present at
13 | 
14 |       '
15 |     question: List two synonyms for the word attend and separate with newline.
16 |   - answer: 'Three synonyms for Attend are
17 | 
18 |       take part in
19 | 
20 |       be present at
21 | 
22 |       be there at
23 | 
24 |       '
25 |     question: List three synonyms for the word attend and separate with newline.
26 |   - answer: 'Four synonyms for Attend are
27 | 
28 |       take part in
29 | 
30 |       be present at
31 | 
32 |       be there at
33 | 
34 |       make an appearance at
35 | 
36 |       '
37 |     question: List four synonyms for the word attend and separate with newline.
38 |   - answer: 'Five synonyms for Beautiful are
39 | 
40 |       attractive
41 | 
42 |       pretty
43 | 
44 |       gorgeous
45 | 
46 |       good-looking
47 | 
48 |       handsome
49 | 
50 |       '
51 |     question: List five synonyms for the word beautiful and separate with newline.
52 |   - answer: 'Five antonyms for attend are
53 | 
54 |      absent
55 | 
56 |      neglect
57 | 
58 |      disregard
59 | 
60 |      disown
61 | 
62 |      ignore
63 | 
64 |       '
65 |     question: List five antonyms for the word attend and separate with newline.
66 | task_description: to teach a large language model to list the synonyms for a word.
67 | 


--------------------------------------------------------------------------------
/.github/workflows/update-CoC-report.yml:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | name: Update CoC report
 4 | 
 5 | run-name: "${{ github.event.pull_request.title }} (#${{ github.event.pull_request.number}})"
 6 | on:
 7 |   pull_request_target:
 8 |     types:
 9 |       - closed
10 | 
11 | env:
12 |   LC_ALL: en_US.UTF-8
13 | 
14 | defaults:
15 |   run:
16 |     shell: bash
17 | 
18 | permissions:
19 |   contents: read
20 | 
21 | jobs:
22 |   update-coc-insight-handler:
23 |     runs-on: ubuntu-latest
24 |     if: ${{ github.repository == 'instructlab/taxonomy' }}
25 |     name: Update CoC insights handler for PR Close event
26 |     steps:
27 |       - name: "Harden Runner"
28 |         uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
29 |         with:
30 |           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
31 | 
32 |       - name: "Debug info"
33 |         run: |
34 |           jq '.' "$GITHUB_EVENT_PATH"
35 | 
36 |       - name: "Update CoC insight analysis handler"
37 |         run: |
38 |           curl -X "GET" "https://pr-analysis-handler.1fuhf5gskmng.us-east.codeengine.appdomain.cloud/pr-closed?pr_number=${PULL_REQUEST_NUMBER}&owner=${REPOSITORY_OWNER}&repo=${REPOSITORY_NAME}&merged=${IS_MERGED}" -H "accept: application/json" -H "Authorization: Bearer ${ANALYSIS_TOKEN}"
39 |         env:
40 |           REPOSITORY_OWNER: ${{ github.repository_owner }}
41 |           REPOSITORY_NAME: ${{ github.event.repository.name }}
42 |           PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number}}
43 |           IS_MERGED: ${{ github.event.pull_request.merged }}
44 |           ANALYSIS_TOKEN: ${{ secrets.COC_ANALYSIS_TOKEN }}
45 | 


--------------------------------------------------------------------------------
/docs/template_qna.yaml:
--------------------------------------------------------------------------------
 1 | version: 3
 2 | domain: <The knowledge domain which is used in prompts to the teacher model during synthetic data generation. The domain should be brief such as the title to a textbook chapter or section.>
 3 | created_by: <The GitHub username of the contributor.>
 4 | seed_examples:
 5 |   - context: |
 6 |         <Context from the document associated with this set of sample q&a pairs.>
 7 |     questions_and_answers:
 8 |       - question: |
 9 |            <A question used for synthetic data generation. Based upon the previous context.>
10 |         answer: |
11 |            <The desired response for the question. Based upon the previous context.>
12 |       - question: |
13 |         answer: |
14 |       - question: |
15 |         answer: |
16 |   - context: |
17 |     questions_and_answers:
18 |       - question: |
19 |         answer: |
20 |       - question: |
21 |         answer: |
22 |       - question: |
23 |         answer: |
24 |   - context: |
25 |     questions_and_answers:
26 |       - question: |
27 |         answer: |
28 |       - question: |
29 |         answer: |
30 |       - question: |
31 |         answer: |
32 |   - context: |
33 |     questions_and_answers:
34 |       - question: |
35 |         answer: |
36 |       - question: |
37 |         answer: |
38 |   - context: |
39 |     questions_and_answers:
40 |       - question: |
41 |         answer: |
42 |       - question: |
43 |         answer: |
44 |       - question: |
45 |         answer: |
46 | document_outline: |
47 |   <A brief summary of the document.>
48 | document:
49 |   repo: https://github.com/<username>/<repo>.git
50 |   commit: <full commit SHA>
51 |   patterns:
52 |     - <FILENAME>.md
53 |     - <FILENAME>.pdf
54 |     - "*.md"
55 |     - "*.pdf"
56 |     - "<directory>/*.md"
57 | 
58 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # Docs
 2 | 
 3 | The purpose of these documents is to inform users and maintainers about the review/triaging process and the different ways to contribute to the taxonomy repository. You can view the following:
 4 | 
 5 | - Full layout of taxonomy repository - [Taxonomy diagram](taxonomy_diagram.md)
 6 | - Flowchart of the pull request review process - [Review process](assets/review-process.png)
 7 | - Flowchart of how the backend works with reviewing PRs - [Backend process](assets/backend.png)
 8 | 
 9 | ## Overview of the Review Process
10 | 
11 | The review process starts with a contributor drafting a PR.
12 | 
13 | 1. At the PR stage, reviewers manually check its contents (e.g. making sure the examples are added to the correct path, inspecting the contents of examples).
14 |    - If not valid, go back to the contributor and ask them to take actions.
15 | 2. If a PR passes step 1, the SDG will be triggered to generate synthetic data samples.
16 |    - If not valid, go back to the contributor and ask them to take actions.
17 | 3. If step 2 passes, it will be used in the next model update.
18 | 
19 | See the [README.md in `instructlab/docs`](https://github.com/instructlab/instructlab/blob/main/docs/README.md) on how to modify and render the flowcharts.
20 | 
21 | For more information on the review process, see [Pull request review in CONTRIBUTING.md](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#pull-request-review)
22 | 
23 | ## Triaging documentation
24 | 
25 | For more information on triaging contribution pull requests, see:
26 | 
27 | - [Safe responses for common PR mistakes](triaging/safe-responses.md)
28 | - [Triaging guide](triaging/triaging-contributions.md)
29 | 
30 | ## Contributing documentation
31 | 
32 | For documentation contribution processes, see:
33 | 
34 | - [Contributing using the GitHub webpage UI](contributing_via_GH_UI.md)
35 | - [Knowledge contribution guidelines](knowledge-contribution-guide.md)
36 | - [CONTRIBUTING.md](../CONTRIBUTING.md)
37 | 


--------------------------------------------------------------------------------
/compositional_skills/grounded/linguistics/writing/rewriting/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: aliciaibm
 2 | version: 3
 3 | seed_examples:
 4 |   - answer: 'Scientists are studying how our genes and the world around us affect our
 5 |       health.'
 6 |     context: 'The elucidation of the intricate interplay between genetic
 7 |       predisposition and environmental factors remains a focal point of
 8 |       contemporary biomedical research.'
 9 |     question: Could you rewrite this text in a simpler way?
10 |   - answer: 'Technology is growing fast, and now we use digital tools to talk and
11 |       connect in new ways.'
12 |     context: 'The exponential growth of technological innovation has led to the
13 |       proliferation of digital platforms, fundamentally transforming the way we
14 |       communicate and interact.'
15 |     question: Rewrite the text in an easier way
16 |   - answer: 'Scientists have been figuring out how tiny particles act using quantum
17 |       mechanics, a big idea in physics.'
18 |     context: 'The elucidation of the implications of quantum mechanics for the
19 |       behavior of subatomic particles has been a cornerstone of modern physics.'
20 |     question: Could you give me the text rewritten for better understanding?
21 |   - answer: 'It is hard for policymakers and bioethicists to make rules for new
22 |       biotechnologies that everyone agrees are fair.'
23 |     context: 'The articulation of ethical frameworks for the regulation of emerging
24 |       biotechnologies presents a significant challenge for policymakers and
25 |       bioethicists alike.'
26 |     question: Could you tell me in a simple way what the text is saying?
27 |   - answer: 'We need ideas from lots of different fields to solve big problems and
28 |       come up with new stuff.'
29 |     context: 'The synthesis of diverse perspectives from interdisciplinary fields is
30 |       crucial for fostering innovation and addressing complex societal
31 |       challenges.'
32 |     question: Rewrite the text so that I can understand it better
33 | task_description: 'Rewrite complex texts in an easier way'
34 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | If your PR is related to a contribution to the taxonomy, please fill
 2 | out the following questionnaire. If not, replace this whole text and the
 3 | following questionnaire with whatever information is applicable to your PR.
 4 | 
 5 | 
 6 | **Describe the contribution to the taxonomy**
 7 | 
 8 | <!-- A concise description of what the contribution brings, replace "..." in the bullet list -->
 9 | 
10 | - ...
11 | - ...
12 | - ...
13 | 
14 | 
15 | **Input given at the prompt**
16 | 
17 | <!-- What you entered, replace "..." -->
18 | 
19 | ```
20 |    ...
21 | ```
22 | 
23 | 
24 | **Response from the original model**
25 | 
26 | 
27 | <!-- What you received from the original model in response to your input, 
28 | replace "..." -->
29 | 
30 | ```
31 |   ...
32 | ```
33 | 
34 | 
35 | **Response from the fine-tuned model**
36 | 
37 | 
38 | <!-- Generate a synthetic dataset based on your newly added seed data; train the model 
39 | with the synthetic data and now re-test the model's response with the same prompt.
40 | Replace "..." with what you receive from the fine-tuned model. -->
41 | 
42 | ```
43 |   ...
44 | ```
45 | 
46 | **Contribution checklist**
47 | 
48 | <!-- Insert an x between the empty brackets: [ ] >> [x] -->
49 | 
50 | - [ ] The contribution was tested with `ilab generate`
51 | - [ ] No errors or warnings were produced by `ilab generate`
52 | - [ ] All [commits are signed off](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#legal) (DCO)
53 | - [ ] The `qna.yaml` file contains at least 5 `seed_examples`
54 | - [ ] The `qna.yaml` file was [linted](https://yamllint.com) and [prettified](https://onlineyamltools.com/prettify-yaml) ([yaml-validator](https://jsonformatter.org/yaml-validator) can do both)
55 | - [ ] An `attribution.txt` file in the same folder as the `qna.yaml` file
56 | - [ ] Content does not include PII or otherwise sensitive or confidential information
57 | - [ ] Content does not include anything documented in the project's [Avoid these Topics](https://github.com/instructlab/taxonomy/blob/main/docs/SKILLS_GUIDE.md#avoid-these-topics) guidelines
58 | 


--------------------------------------------------------------------------------
/.github/workflows/insight.yml:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | name: CoC insight analysis
 4 | 
 5 | run-name: ${{ github.event.workflow_run.display_title || github.workflow }}
 6 | 
 7 | on:
 8 |   workflow_run:
 9 |     workflows:
10 |       - "Lint compositional_skills and knowledge"
11 |     types:
12 |       - completed
13 | 
14 | env:
15 |   LC_ALL: en_US.UTF-8
16 | 
17 | defaults:
18 |   run:
19 |     shell: bash
20 | 
21 | permissions:
22 |   contents: read
23 | 
24 | jobs:
25 |   insight-analysis:
26 |     runs-on: ubuntu-latest
27 |     if: ${{ (github.event.workflow_run.event == 'pull_request') && (github.repository == 'instructlab/taxonomy') && (github.event.workflow_run.conclusion == 'success') }}
28 |     name: Run insight analysis
29 |     steps:
30 |       - name: "Harden Runner"
31 |         uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
32 |         with:
33 |           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
34 | 
35 |       - name: "Debug info"
36 |         run: |
37 |           jq '.' "$GITHUB_EVENT_PATH"
38 | 
39 |       - name: "Download Pull Request number"
40 |         uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9
41 |         with:
42 |           name: pull_request_number
43 |           run-id: ${{ github.event.workflow_run.id }}
44 |           github-token: ${{ secrets.GITHUB_TOKEN }}
45 | 
46 |       - name: "Get Pull Request number"
47 |         run: |
48 |           echo "PULL_REQUEST_NUMBER=$(cat pull_request_number.txt)" >> "$GITHUB_ENV"
49 | 
50 |       - name: "Invoke insight analysis handler"
51 |         if: ${{ fromJSON(env.PULL_REQUEST_NUMBER) != 0 }}
52 |         run: |
53 |           curl -X "POST" "https://pr-analysis-handler.1fuhf5gskmng.us-east.codeengine.appdomain.cloud/analyze-pr?pr_number=${PULL_REQUEST_NUMBER}&owner=${REPOSITORY_OWNER}&repo=${REPOSITORY_NAME}" -H "accept: application/json" -H "Authorization: Bearer ${ANALYSIS_TOKEN}"
54 |         env:
55 |           REPOSITORY_OWNER: ${{ github.repository_owner }}
56 |           REPOSITORY_NAME: ${{ github.event.repository.name }}
57 |           ANALYSIS_TOKEN: ${{ secrets.COC_ANALYSIS_TOKEN }}
58 | 


--------------------------------------------------------------------------------
/.github/workflows/label.yml:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | name: "Pull Request Labeler"
 4 | 
 5 | on:
 6 |   pull_request_target:
 7 | 
 8 | env:
 9 |   LC_ALL: en_US.UTF-8
10 | 
11 | defaults:
12 |   run:
13 |     shell: bash
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   labeler:
20 |     permissions:
21 |       pull-requests: write
22 |     runs-on: ubuntu-latest
23 |     steps:
24 |     - name: "Harden Runner"
25 |       uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
26 |       with:
27 |         egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
28 | 
29 |     - name: "Labeler Action"
30 |       uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0
31 |       with:
32 |         repo-token: ${{ secrets.GITHUB_TOKEN }}
33 |         sync-labels: false
34 | 
35 |   add-comment:
36 |       if: ${{ (github.event.action == 'labeled') && (github.event.label.name == 'triage-uncertain') }}
37 |       permissions:
38 |         pull-requests: write
39 |       runs-on: ubuntu-latest
40 |       steps:
41 |         - name: "Harden Runner"
42 |           uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
43 |           with:
44 |             egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
45 | 
46 |         - name: "Add comment"
47 |           run: |
48 |             gh pr comment "${PULL_REQUEST_NUMBER}" --body "${PULL_REQUEST_COMMENT}"
49 |           env:
50 |             GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
51 |             GH_REPO: ${{ github.repository }}
52 |             PULL_REQUEST_NUMBER: ${{ github.event.number }}
53 |             PULL_REQUEST_COMMENT: >
54 |               Hi! So you've been marked as `triage-uncertain`, don't fret!
55 |               This just means the triage team or member needs to discuss your PR during one of our [public meetings](https://github.com/instructlab/community/blob/main/Collaboration.md#triager-standup) before making a decision. (Yes! You are more than welcome to join us.)
56 |               You have either made something amazing, or maybe hit a corner case that we hadn't thought of, or something like that.
57 |               :star2: Thank you for your contribution! And you are pushing our :dog: :computer: project and we can't thank you enough. :sparkles:
58 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/temporal_reasoning/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: 'Hello! I''m here to help you understand the order of arrival at a party.
 4 |     Let''s analyze the situation step by step:
 5 | 
 6 | 
 7 |     1. Joe reached first (before you).
 8 | 
 9 |     2. You arrived next.
10 | 
11 |     3. David arrived after you.
12 | 
13 |     4. John reached last (after David).
14 | 
15 | 
16 |     Based on this information, Joe was the first one to arrive at the party, followed
17 |     by you, then David, and finally John.
18 | 
19 |     '
20 |   question: 'Consider I went to a party. I reached before John. David arrived after.
21 |     Joe reached before me. John arrived after David. Who reached first and who reached
22 |     last?
23 | 
24 |     '
25 | - answer: 'Hello! I''m here to help you understand the order of arrival at a celebration.
26 |     Let''s analyze the situation step by step:
27 | 
28 | 
29 |     1. Emma reached first (before Peter).
30 | 
31 |     2. Peter arrived next.
32 | 
33 |     3. Sophia arrived after Peter.
34 | 
35 |     4. Olivia reached last (after Sophia).
36 | 
37 | 
38 |     Based on this information, Emma was the first one to arrive at the celebration,
39 |     followed by Peter, then Sophia, and finally Olivia.
40 | 
41 |     '
42 |   question: At a celebration, Peter came in before Olivia. Sophia arrived after. Emma
43 |     got there before Peter. Olivia arrived after Sophia. Who reached first, and who
44 |     reached last?
45 | - answer: 'Hello! I''m here to help you understand the order of photos in a photography
46 |     competition. Let''s analyze the situation step by step:
47 | 
48 | 
49 |     1. Emily captured her photo first (before James).
50 | 
51 |     2. James captured a photo next.
52 | 
53 |     3. Alex entered with a photo after James.
54 | 
55 |     4. Sarah submitted her photo last (after Alex).
56 | 
57 | 
58 |     Based on this information, Emily''s photo was captured first, followed by James''s
59 |     entry, then Alex''s photo, and finally Sarah''s submission.
60 | 
61 | 
62 |     '
63 |   question: 'In a photography competition, James captured a photo before Sarah. Alex''s
64 |     entry came after. Emily took a photo before James. Sarah submitted her photo after
65 |     Alex. Whose photo was captured first, and whose photo was captured last?
66 | 
67 |     '
68 | task_description: "To teach a language model about chronological order of time. \n"
69 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | scratch.ipynb
132 | 
133 | # IDEs
134 | .vscode/
135 | .idea/
136 | 
137 | # Mac personalization files
138 | .DS_Store
139 | 
140 | # Ignore config.yaml from the cli
141 | config.yaml
142 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/logical_reasoning/tabular/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: "To predict the next value in the temperature sequence, we can look at the\
 4 |     \ pattern in the previous values. The differences between consecutive temperatures\
 5 |     \ are:\n\n2 (day 2 - day 1)\n3 (day 3 - day 2)\n3 (day 4 - day 3)\n\nThe temperature\
 6 |     \ rises every day and the two most recent differences are both 3, which suggests\
 7 |     \ that it will continue to increase at that rate. To find the next temperature, we\
 8 |     \ can add the latest difference to the temperature of day 4:\n\nNext temperature = Day 4 temperature + Difference\n\
 9 |     = 28 \xB0C + 3 \xB0C\n= 31 \xB0C\n\nTherefore, the next value in the sequence\
10 |     \ is 31 degrees Celsius.\n"
11 |   question: "Given the following table, predict the next value in the sequence:\n\n\
12 |     Day\tTemperature (\xB0C)\n1\t20\n2\t22\n3\t25\n4\t28\n5\t?\n\n"
13 | - answer: 'North: Product B
14 | 
15 |     South: Product C
16 | 
17 |     East: Product A
18 | 
19 |     West: Product C
20 | 
21 |     '
22 |   question: "You're an analyst examining the performance of a retail company's product\
23 |     \ lines across different regions. Analyze the sales data provided in the table\
24 |     \ below and identify the best-performing product line in each region. For example\
25 |     \ the best performing product in North is Product B. Similarly identify for other\
26 |     \ regions as well.\n\nRegion\tProduct A Sales (in $)\tProduct B Sales (in $)\t\
27 |     Product C Sales (in $)\nNorth\t$10,000\t                            $12,000\t\
28 |     \                             $9,000\nSouth      $8,000\t                    \
29 |     \        $10,000\t                             $11,000\nEast\t        $11,000\t\
30 |     \                            $9,000\t                                     $10,000\n\
31 |     West\t$9,000\t                            $11,000\t                          \
32 |     \   $12,000"
33 | - answer: 'To calculate the total number of fruits bought on each day we first sum
34 |     up the number of fruits bought in each day.
35 | 
36 |     Total fruits on Day 1 = 5 (A) + 3 (B) + 2 (C) = 10
37 | 
38 |     Total fruits on Day 2 = 8 (A) + 6 (B) + 5 (C) = 19
39 | 
40 |     Total fruits on Day 3 = 4 (A) + 7 (B) + 6 (C) = 17
41 | 
42 | 
43 |     Hence on Day 1 the lowest number of fruits were bought.
44 | 
45 |     '
46 |   question: "Consider the following table representing the number of fruits bought\
47 |     \ by three different people (A, B, C) at a fruit stall over three days (Day 1,\
48 |     \ Day 2, Day 3):\n\nPerson\tDay 1\tDay 2\tDay 3\nA\t5\t8\t4\nB\t3\t6\t7\nC\t2\t\
49 |     5\t6\nOn which day was the lowest number of fruits bought?\n"
50 | task_description: 'To teach a language model about logical reasoning using tabular
51 |   data
52 | 
53 |   '
54 | 


--------------------------------------------------------------------------------
/docs/triaging/safe-responses.md:
--------------------------------------------------------------------------------
 1 | # Templates of safe responses to common PR mistakes
 2 | 
 3 | ## Skills Content
 4 | 
 5 | ### Jokes and Poems
 6 | 
 7 | Thank you for taking the time to submit to InstructLab! I know this wasn't easy and could have been quite time-consuming. Unfortunately, our team has decided that submissions for jokes or poems (skills and knowledge) are no longer supported or approved. We received so many at the beginning, and with jokes being "in the eye of the beholder" and puns requiring nuance for native English speakers, we realized we were possibly unconsciously biasing our model. We have discovered that working with both topics has its own challenges, and if we want something generalized, finding consensus was unsuccessful.
 8 | 
 9 | We have updated the [topics to avoid](../SKILLS_GUIDE.md#avoid-these-topics) to reflect this, and if you have more questions, don't hesitate to comment on this PR. We may revisit this in the future, but we are looking for unique skills or new knowledge for the model; poems and jokes, unfortunately, don't fit this plan.
10 | 
11 | Thank you again for taking this time, and we are looking forward to your next PR to the InstructLab project!
12 | 
13 | ### Math
14 | 
15 | LLMs aren't great at counting, math, or computation. Right now we're not seeking to improve the model's math and computation ability as this skill belongs to ["core skill"](../SKILLS_GUIDE.md#core-skills), which is not open to contribution.
16 | 
17 | This will be rejected for now and we can reconsider if we open a math / computation taxonomy.
18 | 
19 | ### Coding
20 | 
21 | Thanks for the submission! Coding belongs to ["core skill"](../SKILLS_GUIDE.md#core-skills), which is not open to contribution.
22 | 
23 | This will be rejected for now and we can reconsider if we open a programming / coding taxonomy.
24 | 
25 | ### Knowledge not Skill
26 | 
27 | Given the content of the file, it seems that this would likely fall under knowledge, which we are currently accepting contributions for under different [guidelines](../../README.md#getting-started-with-knowledge-contributions).
28 | 
29 | Closing this PR, but feel free to resubmit as knowledge following those guidelines!
30 | 
31 | ## Contribution Formatting Guidelines
32 | 
33 | ### Not Enough Examples
34 | 
35 | Could you please make sure there are five examples with good diversity, meaning having more examples that have very little overlapping content? We have recently updated the [readme](../../README.md#getting-started-with-skill-contributions) with the new requirement. Thank you for your contribution!
36 | 
37 | ### Attribution
38 | 
39 | Thanks for the submission! It looks interesting. Based on the [current guide](../../README.md#getting-started-with-skill-contributions), do you mind adding attribution to the examples?
40 | 
41 | ### Mentions of GPT, ChatGPT, Bard, or other proprietary LLMs
42 | 
43 | Since this is a non-proprietary project, please remove all mentions of the names for commercial or proprietary LLM models; we are in favor of wordings like “LLMs”, or “Foundation Models”, etc.
44 | 


--------------------------------------------------------------------------------
/docs/triaging/insights/taxonomy-insights-documentation.md:
--------------------------------------------------------------------------------
 1 | # InstructLab Contribution Insights
 2 | 
 3 | This module automatically analyzes the taxonomy of pull request contribution files, checking for potential code of conduct violations.
 4 | 
 5 | The analysis currently covers the following categories:
 6 | 
 7 | - **PII**: Privacy (Personal Identifiable Information)
 8 | - **Hap**: Hate, Abuse, Profanity
 9 | - **Adult**: Sexual and erotic content
10 | 
11 | **Note**: For knowledge contributions, analysis of referenced sources included in markdown files is out of scope for the current version but is planned for the future.
12 | 
13 | ## Automation workflow
14 | 
15 | The insights analysis is triggered by a GitHub Action on every PR creation or update. The high-level flow is described in the following diagram:
16 | 
17 | ![Insight Analysis Automation using github workflow](images/gh-workflow.png)
18 | 
19 | ### Main workflow steps
20 | 
21 | 1. **PR is Opened**: The Contributor opens a Pull Request (PR).
22 | 2. **Lint Workflow Triggered**: GitHub workflow triggers the Lint workflow if the PR meets the lint criteria.
23 | 3. **Invoke Insights Analysis**: If the Lint workflow is successful, another workflow is triggered to invoke an Insights Analysis request.
24 | 4. **Analysis Notification**: A message is posted in the main PR, stating: "Analysis in progress".
25 | 5. **Post Analysis Result**: Once the analysis is completed, a follow-up message with the result is posted in the PR, with one of two possible outcomes:
26 |     1. **No Violations Found**: No code of conduct violations detected.
27 |     2. **Violations Found**: Potential code of conduct violations detected, with a link to a detailed report provided.
28 | 
29 | #### GitHub messages
30 | 
31 | #### Analysis Notification
32 | 
33 | A message is posted in the main PR, stating: "Analysis in progress". Here is a screen capture of the message:
34 | 
35 | ![Analysis in progress](images/gh-message_analysis-in-progress.png)
36 | 
37 | #### No Violations Found
38 | 
39 | If no code of conduct violations are detected, a message is posted in the main PR, as shown in the screen capture below:
40 | 
41 | ![No Violations Found](images/gh-message_no-violation.png)
42 | 
43 | #### Potential Violations to the code of conduct
44 | 
45 | When potential violations to the code of conduct are identified, the following message is shown, and a link to a report is provided (see more below):
46 | 
47 | ![Potential Violations to the CoC](images/gh-message_potential-violation.png)
48 | 
49 | ## Insights Report Page
50 | 
51 | When potential violations of the code of conduct are identified, a separate report link is provided.
52 | 
53 | When the report link is opened, the web app asks the user to log in with GitHub OAuth - see the screen capture of the login page.
54 | 
55 | ![Login screen](images/login-screen.png)
56 | 
57 | After login, the user sees the report that lists all the potential violations found, with each violation category highlighted in a different color. Confidence scores are also provided. See the screen capture of the report below:
58 | 
59 | ![Potential Violations to the CoC](images/InstructLab-Insights-report.png)


--------------------------------------------------------------------------------
/compositional_skills/grounded/linguistics/inclusion/qna.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | version: 3
 3 | task_description: This skill helps the writer use more inclusive IT terminology.
 4 | created_by: imstilllearning
 5 | seed_examples:
 6 |   - context: >
 7 |       In database replication, the master database is regarded as the
 8 |       authoritative source, and the slave databases are synchronized to it.
 9 |     question: How would you rewrite this sentence to use more inclusive IT terminology?
10 |     answer: >
11 |       In database replication, the primary database is regarded as the
12 |       authoritative source, and the secondary databases are synchronized to it.
13 |   - context: >
14 |       Blacklisting is the action of a group or authority compiling a
15 |       blacklist of people, countries, or other entities to be avoided or
16 |       distrusted as being deemed unacceptable to those making the list.
17 |     question: Show me how to use inclusive IT terminology to replace non-inclusive
18 |       IT terminology.
19 |     answer: >
20 |       Denylisting is the action of a group or authority compiling a denylist
21 |       of people, countries, or other entities to be avoided or distrusted as
22 |       being deemed unacceptable to those making the list.
23 |   - context: >
24 |       In database replication, the master database is regarded as the
25 |       authoritative source, and the slave databases are synchronized to it.
26 |     question: How would you revise this sentence to use Inclusive IT terminology?
27 |     answer: >
28 |       In database replication, the parent database is regarded as the
29 |       authoritative source, and the child databases are synchronized to it.
30 |   - context: >
31 |       A use for whitelists is in local area network (LAN) security. Many
32 |       network admins set up MAC address whitelists, or a MAC address filter, to
33 |       control who is allowed on their networks.
34 |     question: How would you revise this sentence to use inclusive IT terminology?
35 |     answer: >
36 |       A use for allowlists is in local area network (LAN) security. Many
37 |       network admins set up MAC address allowlists, or a MAC address filter, to
38 |       control who is allowed on their networks.
39 |   - context: >
40 |       A master clock that provides time signals used to synchronize one or
41 |       more slave clocks as a part of a clock network.
42 |     question: How would you improve this sentence to use inclusive IT language?
43 |     answer: >
44 |       A primary clock that provides time signals used to synchronize one or
45 |       more secondary clocks as a part of a clock network.
46 |   - context: >
47 |       As a verb, blacklist can mean to put an individual or entity on such a
48 |       list. A blacklist is synonymous with a list of banned persons or
49 |       organizations and is the opposite of a whitelist.
50 |     question: How would you adjust the word choice to use Inclusive IT terminology?
51 |     answer: >
52 |       As a verb, denylist can mean to put an individual or entity on such a
53 |       list. A denylist is synonymous with a list of banned persons or
54 |       organizations and is the opposite of an allowlist.
55 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
  1 | # SPDX-License-Identifier: Apache-2.0
  2 | 
  3 | name: Lint compositional_skills and knowledge
  4 | 
  5 | on:
  6 |   workflow_dispatch:
  7 |   push:
  8 |     branches:
  9 |       - main
 10 |     paths:
 11 |       - compositional_skills/**/qna.yaml
 12 |       - knowledge/**/qna.yaml
 13 |       - '.github/workflows/lint.yml' # This workflow
 14 |       - 'scripts/**' # Scripts used by this workflow
 15 | 
 16 |   pull_request:
 17 |     branches:
 18 |       - main
 19 |     paths:
 20 |       - compositional_skills/**/*.yaml
 21 |       - compositional_skills/**/*.yml
 22 |       - knowledge/**/*.yaml
 23 |       - knowledge/**/*.yml
 24 |       - '.github/workflows/lint.yml' # This workflow
 25 |       - 'scripts/**' # Scripts used by this workflow
 26 | 
 27 | env:
 28 |   LC_ALL: en_US.UTF-8
 29 | 
 30 | defaults:
 31 |   run:
 32 |     shell: bash
 33 | 
 34 | permissions:
 35 |   contents: read
 36 | 
 37 | jobs:
 38 |   yamllint:
 39 |     runs-on: ubuntu-latest
 40 |     steps:
 41 |       - name: "Harden Runner"
 42 |         uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
 43 |         with:
 44 |           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 45 | 
 46 |       - name: "Checkout"
 47 |         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 48 |         with:
 49 |           fetch-depth: 0
 50 | 
 51 |       - name: "Setup Python"
 52 |         uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
 53 |         with:
 54 |           python-version: "3.11"
 55 | 
 56 |       - name: "Install Python Packages"
 57 |         run: |
 58 |           pip install -r scripts/requirements.txt
 59 | 
 60 |       - name: "Find changed skills and knowledge files"
 61 |         id: changed-files
 62 |         uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8 # v45.0.7
 63 |         with:
 64 |           files: |
 65 |             compositional_skills/**/*.yaml
 66 |             compositional_skills/**/*.yml
 67 |             knowledge/**/*.yaml
 68 |             knowledge/**/*.yml
 69 | 
 70 |       - name: "Check changed YAML file contents"
 71 |         if: ${{ fromJSON(steps.changed-files.outputs.any_changed) }}
 72 |         run: |
 73 |           scripts/check-yaml.py ${{ steps.changed-files.outputs.all_changed_files }}
 74 |         env:
 75 |           TAXONOMY_FOLDERS: >-
 76 |             compositional_skills
 77 |             knowledge
 78 | 
 79 |       - name: "Check all YAML file contents"
 80 |         if: ${{ !fromJSON(steps.changed-files.outputs.any_changed) }}
 81 |         run: |
 82 |           read -ra folders <<< "${TAXONOMY_FOLDERS}"
 83 |           # shellcheck disable=SC2046
 84 |           scripts/check-yaml.py $(find "${folders[@]}" -name "qna.yaml" -print)
 85 |         env:
 86 |           SCHEMA_VERSION: 0 # use the schema version specified in the "version" key
 87 |           TAXONOMY_FOLDERS: >-
 88 |             compositional_skills
 89 |             knowledge
 90 | 
 91 |       - name: "Save Pull Request number"
 92 |         if: ${{ (github.event_name == 'pull_request') && (github.repository == 'instructlab/taxonomy') }}
 93 |         run: |
 94 |           echo "${PULL_REQUEST_NUMBER}" > pull_request_number.txt
 95 |         env:
 96 |           PULL_REQUEST_NUMBER: ${{ fromJSON(steps.changed-files.outputs.any_changed) && github.event.number || '0' }}
 97 | 
 98 |       - name: "Upload Pull Request number"
 99 |         if: ${{ (github.event_name == 'pull_request') && (github.repository == 'instructlab/taxonomy') }}
100 |         uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
101 |         with:
102 |           name: pull_request_number
103 |           path: pull_request_number.txt
104 |           if-no-files-found: error
105 |           retention-days: 5
106 | 


--------------------------------------------------------------------------------
/docs/contributing_via_GH_UI.md:
--------------------------------------------------------------------------------
 1 | # Contributing using the GitHub webpage UI
 2 | 
 3 | There are a few ways you can create pull requests in an open source project: locally using the git CLI tool in your terminal or using the GitHub webpage user interface. The following instructions show you how to create a PR using the GitHub webpage UI.
 4 | 
 5 | ## Prerequisites
 6 | 
 7 | - You have a GitHub account
 8 | - You have access to this repo
 9 | 
10 | ## Make a copy of the taxonomy repo
11 | 
12 | 1. Go to [github.com/instructlab/taxonomy](https://github.com/instructlab/taxonomy).
13 | 
14 | 2. Click **Fork** to fork your own copy of the repo.
15 | 
16 | 3. On the **Create a new fork** page, enter the information into the following fields:
17 |     - **Repository name:** Name your fork the same as the repository, in this case `taxonomy` is the name of your fork.
18 |     - **Description:** Enter the description of _your fork_, not of the skills you will create. You can write something that makes sense to you or leave it blank.
19 |     - **Copy the main branch only:** The box is selected by default. You can choose to leave the box selected or clear it.
20 | 
21 | 4. Click **Create Fork**.
22 | 
23 | You will get a copy of the taxonomy repo in your GitHub account. This is your own copy, so don't worry about making mistakes. _If you do end up making a mistake and want to start over: you can delete the fork and create a new fork._
24 | 
25 | ## Contributing a skill
26 | 
27 | 1. IMPORTANT: Before you begin, create a working branch for your contribution and name it whatever you like. This allows you to have an up-to-date `main` branch in your fork as well as a working branch to add any changes.
28 | 
29 |     ![add_branch](assets/add_skill_branch.png)
30 | 
31 | 2. Navigate to the directory where you want your knowledge or skill. The following image shows the compositional skills directory and its contents. Skills are contributed to this directory:
32 | 
33 |     ![comp_skill](assets/comp_skill.png)
34 | 
35 |     The other top-level directory you can contribute to is the knowledge directory, which is used for knowledge contributions. You can read more about the difference between skills and knowledge in the [community documentation](https://github.com/instructlab/community/blob/main/docs/README.md).
36 | 
37 | 3. Based on the directories that exist in the tree, make a best guess at where in the tree structure to add the skill that you want to contribute. If you get to a point where you've gone deep enough into the tree and you can't find any directories that match, create a new directory (and subdirectories, if needed) to best represent your skill.
38 | 
39 |     For example, I want to train the model to learn how to create sentences using rhetorical devices, specifically similes. I started in the compositional_skills section, and navigated to linguistics.
40 | 
41 |     ![linguistics_file](assets/linguistics.png)
42 | 
43 |     There is currently no rhetorical_devices folder in the linguistics section, so I can create one called "rhetorical_devices", then click `/` to create another folder called "simile"
44 | 
45 |     ![new_file](assets/new_file.png)
46 | 
47 |     ![simile_folder](assets/simile_folder.png)
48 | 
49 | 4. It can be a little tricky mechanically to create directories in GitHub's web UI, but you can complete the process using the following steps:
50 | 
51 |     a. In the GitHub repo, click the folder that you want to create the new directory inside of.
52 | 
53 |     b. Click Add File and select Create new file from the menu.
54 | 
55 |     c. Type the name of the first directory that you want to create. The example image uses rhetorical_devices as the first directory, then metaphor as the next.
56 | 
57 |     d. After you have entered the name of all of the directories that you want to add, type the file name. The file name should always be qna.yaml (qna stands for "Question aNd Answer.")
58 | 
59 |     ![qna_example](assets/qna_ex.png)
60 | 
61 |     e. You can then click "commit changes" to your branch. The GitHub UI will prompt you to open a pull request. Select the "open pull request" button.
62 | 
63 | 5. Verify that your YAML follows the proper structure. See [Knowledge: YAML examples](https://github.com/instructlab/taxonomy/blob/main/README.md#knowledge-yaml-examples) and [Skills: YAML examples](https://github.com/instructlab/taxonomy/blob/main/README.md#skills-yaml-examples) to help with formatting. The [yamllint](https://www.yamllint.com/) tool is another great way to verify yaml.
64 | 


--------------------------------------------------------------------------------
/scripts/check-yaml.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | # Standard
  5 | import argparse
  6 | import os
  7 | import pathlib
  8 | import sys
  9 | 
 10 | # Third Party
 11 | from instructlab.schema.taxonomy import TaxonomyParser
 12 | 
 13 | 
 14 | class CheckYaml:  # Checks taxonomy YAML files through TaxonomyParser and verifies their attribution.txt.
 15 |     def __init__(
 16 |         self,
 17 |         *,
 18 |         yaml_files: list[pathlib.Path],  # files that check() parses, in order
 19 |         taxonomy_folders: list[str] | None = None,  # passed unchanged to TaxonomyParser
 20 |         yamllint_config: str | None = None,  # passed unchanged to TaxonomyParser
 21 |         schema_version: int | None = None,  # passed unchanged to TaxonomyParser
 22 |         message_format: str | None = None,  # passed unchanged to TaxonomyParser
 23 |     ) -> None:
 24 |         self.yaml_files = yaml_files
 25 |         self.taxonomy_folders = taxonomy_folders
 26 |         self.yamllint_config = yamllint_config
 27 |         self.schema_version = schema_version
 28 |         self.message_format = message_format
 29 | 
 30 |     def check(self) -> int:  # Returns 1 if any parsed file reports errors, otherwise 0.
 31 |         exit_code: int = 0
 32 |         parser = TaxonomyParser(  # one parser instance is reused for every file
 33 |                     taxonomy_folders=self.taxonomy_folders,
 34 |                     schema_version=self.schema_version,
 35 |                     message_format=self.message_format,
 36 |                     yamllint_config=self.yamllint_config,
 37 |                 )
 38 |         for file in self.yaml_files:
 39 |             taxonomy = parser.parse(file)
 40 |             if taxonomy.version > 1:  # attribution.txt is only checked for versions above 1
 41 |                 attribution_path = taxonomy.rel_path.with_name("attribution.txt")  # sibling of the YAML file (assumes rel_path is a pathlib path - confirm)
 42 |                 if not attribution_path.is_file():
 43 |                     taxonomy.error(
 44 |                         "The \"%s\" file does not exist or is not a file",
 45 |                         attribution_path.name,
 46 |                     )
 47 |                 elif os.path.getsize(attribution_path) == 0:  # file is present but has zero bytes
 48 |                     taxonomy.error(
 49 |                         "The \"%s\" file must be non-empty",
 50 |                         taxonomy.path.with_name(attribution_path.name),
 51 |                     )
 52 |             if taxonomy.errors > 0:  # presumably counts parser findings as well as the errors reported above - confirm
 53 |                 exit_code = 1
 54 |         if not self.yaml_files:  # informational only; the exit code is not changed
 55 |             print("No yaml files specified.")
 56 |         return exit_code
 57 | 
 58 | 
 59 | def cli() -> int:
 60 |     parser = argparse.ArgumentParser(
 61 |         description="""
 62 |         Check Taxonomy YAML files for linting and schema validation.
 63 |         """,
 64 |     )
 65 |     parser.add_argument(
 66 |         "-t",
 67 |         "--taxonomy-folder",
 68 |         action="append",
 69 |         metavar="TAXONOMY_FOLDER",
 70 |         dest="taxonomy_folders",
 71 |         help="""
 72 |             A taxonomy folder. This argument can be specified multiple times.
 73 |             Alternately, the TAXONOMY_FOLDERS environment variable can be used
 74 |             to specify a space-separated list of folders.
 75 |             """,
 76 |         default=os.environ.get("TAXONOMY_FOLDERS"),
 77 |     )
 78 |     parser.add_argument(
 79 |         "-v",
 80 |         "--schema-version",
 81 |         help="""
 82 |             The version of the Taxonomy schema.
 83 |             Alternately, the SCHEMA_VERSION environment variable can be used
 84 |             to specify the version.
 85 |             Specifying a version less than 1 will use the schema version
 86 |             specified by each YAML document's "version" key.
 87 |             If not specified, the highest schema version is used.
 88 |             """,
 89 |         default=os.environ.get("SCHEMA_VERSION"),
 90 |         type=int,
 91 |     )
 92 |     parser.add_argument(
 93 |         "-l",
 94 |         "--lint-config",
 95 |         dest="yamllint_config",
 96 |         help="""
 97 |             The yamllint configuration data.
 98 |             Alternately, the YAMLLINT_CONFIG environment variable can be used
 99 |             to specify the configuration data.
100 |             """,
101 |         default=os.environ.get("YAMLLINT_CONFIG"),
102 |     )
103 |     parser.add_argument(
104 |         "-f",
105 |         "--format",
106 |         help="The message format.",
107 |         dest="message_format",
108 |         choices=["standard", "github", "auto"],
109 |         default=None,
110 |     )
111 |     parser.add_argument(
112 |         "yaml_file",
113 |         help="A qna.yaml file.",
114 |         nargs="*",
115 |         type=pathlib.Path,
116 |     )
117 |     args = parser.parse_args()
118 | 
119 |     taxonomy_folders = args.taxonomy_folders
120 |     if isinstance(taxonomy_folders, str):
121 |         taxonomy_folders = taxonomy_folders.split()
122 |     check_yaml = CheckYaml(
123 |         yaml_files=args.yaml_file,
124 |         taxonomy_folders=taxonomy_folders,
125 |         yamllint_config=args.yamllint_config,
126 |         schema_version=args.schema_version,
127 |         message_format=args.message_format,
128 |     )
129 |     exit_code = check_yaml.check()
130 |     return exit_code
131 | 
132 | 
133 | if __name__ == "__main__":  # run as a script: cli()'s return value becomes the exit status
134 |     sys.exit(cli())
135 | 


--------------------------------------------------------------------------------
/docs/knowledge-contribution-guide.md:
--------------------------------------------------------------------------------
 1 | # Knowledge contribution guidelines
 2 | 
 3 | You can create a Git repository to host your knowledge contributions anywhere (GitLab, Gerrit, etc.) but it may be favorable to create one on GitHub. The following instructions show you how to create a knowledge repository in GitHub and contribute to the taxonomy.
 4 | 
 5 | ## Prerequisites
 6 | 
 7 | - You have a GitHub account
 8 | - You have a forked copy of the [taxonomy](https://github.com/instructlab/taxonomy/tree/main) repository
 9 | - Verify that the model does not already know the knowledge you want to submit
10 | 
11 | ## Creating your own knowledge repository
12 | 
13 | To create a new GitHub repository, follow the GitHub documentation in [Creating a new repository](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-new-repository).
14 | 
15 | The specific steps are listed as follows:
16 | 
17 | 1. In your GitHub profile page, navigate to the repositories tab. You will see a search bar where you can search your repositories, or create a new one.
18 | 2. This takes you to a page titled “Create a new repository”. Create a custom name for your repository and add a README.md file. For example, “knowledge_contributions” could be a good name for your repository.
19 | 3. Click “Create” when you are all set.
20 | 
21 | ## Convert your knowledge documentation to markdown
22 | 
23 | There are many online tools that can help you convert your documents to markdown. If you are using a wiki page for your contributions, you can use [pandoc](https://pandoc.org/try/) to convert the documents. For Wikipedia sources in pandoc, use `from: mediawiki` and convert `to: markdown_strict` to access the proper markdown format.
24 | 
25 | ## Add the markdown file to your repository
26 | 
27 | To add a file to your GitHub repository, follow the GitHub documentation in [Adding a file to a repository](https://docs.github.com/en/repositories/working-with-files/managing-files/adding-a-file-to-a-repository).
28 | 
29 | The specific steps are listed as follows:
30 | 
31 | 1. Navigate to “Add files”. Click “Create new file” if you want to manually add your markdown content. Click “Upload files” if you have a file locally to add.
32 | 2. Add a description and commit your changes.
33 | 
34 |     Since this is your own repository, you can commit directly to the `main` branch.
35 | 
36 | 3. You can then see your new content in your repository.
37 | 
38 | > [!IMPORTANT]
39 | > Make a note of your commit SHA; you need it for your `qna.yaml`.
40 | 
41 | ## Create a pull request in the taxonomy repository
42 | 
43 | Navigate to your forked taxonomy repository and ensure it is up-to-date.
44 | 
45 | There are a few ways you can create a pull request:
46 | 
47 | - For details on the local process, check out [The GitHub Workflow Guide](https://github.com/kubernetes/community/blob/master/contributors/guide/github-workflow.md) in the kubernetes documentation and the [GitHub flow](https://docs.github.com/en/get-started/using-github/github-flow) in the GitHub documentation.
48 | - For details on contributing using the GitHub webpage UI, see [Contributing using the GH UI](https://github.com/instructlab/taxonomy/blob/main/docs/contributing_via_GH_UI.md) or [Creating a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request?tool=webui) in the GitHub documentation.
49 | 
50 | ## Verification
51 | 
52 | Here are a few things to check before seeking reviews for your contribution:
53 | 
54 | - Your `qna.yaml` follows the proper formatting. See examples in [Knowledge: YAML examples](https://github.com/instructlab/taxonomy/blob/main/README.md#knowledge-yaml-examples)
55 | - Ensure all parameters are set, especially the `document`, `repo`, `commit` and `pattern` keys; these parameters are specific to knowledge contributions and require more analysis.
56 | - Include an `attribution.txt` file for citing your sources. See [For your attribution.txt file](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#for-your-attributiontxt-file) for more information.
57 | 
58 | ## PR Upstream Workflow
59 | 
60 | The following table outlines the expected timing for the PR(s) you have put in. The PRs go through a few steps and checks, and you should be able to map your PR's `label` to
61 | the step that it is in.
62 | 
63 | | Label | Actor | Action | Duration |
64 | | --- | --- | --- | --- |
65 | | | Contributor | Submit PR | - |
66 | | | Contributor | Fix failed PR checks | - |
67 | | https://github.com/instructlab/taxonomy/labels/triage-needed | Triager | Review PR, ask for changes | Days |
68 | | https://github.com/instructlab/taxonomy/labels/triage-requested-changes | Contributor | Make requested changes | Days |
69 | | https://github.com/instructlab/taxonomy/labels/precheck-generate-ready | Triager | Run prechecks and generate  | Days |
70 | | https://github.com/instructlab/taxonomy/labels/community-build-ready | Backend | Model gets retrained | Weeks |
71 | | | Triager | Check the numbers and PR merged or closed | - |
72 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/theory_of_mind/qna.yaml:
--------------------------------------------------------------------------------
 1 | created_by: IBM
 2 | seed_examples:
 3 | - answer: "Sarah placed her shoes under her dress before she went shopping. Her sister\
 4 |     \ borrowed the shoes and put them under Sarah\u2019s bed. Sarah doesn\u2019t know\
 5 |     \ that her sister borrowed her shoes. Therefore, the answer is yes, she assumes\
 6 |     \ that the shoes are still under her dress.\n"
 7 |   question: "Read the scenario and answer the following question: Scenario: \"The\
 8 |     \ morning of the high school dance Sarah placed her high heel shoes under her\
 9 |     \ dress and then went shopping. That afternoon, her sister borrowed the shoes\
10 |     \ and later put them under Sarah\u2019s bed.\" Question: When Sarah gets ready,\
11 |     \ does she assume her shoes are under her dress?\n"
12 | - answer: 'When the Garcia family left, the game was tied, 3-3, and they expected
13 |     the game to be postponed due to the rain. So, they don''t know that the game ended
14 |     during their commute with a score of 5-3. Therefore, the answer is no, they don''t
15 |     believe the score is 5-3.
16 | 
17 |     '
18 |   question: 'Read the scenario and answer the following question: Scenario: "Expecting
19 |     the game to be postponed because of the rain, the Garcia family took the subway
20 |     home. The score was tied, 3-3. During their commute the rain stopped, and the
21 |     game soon ended with a score of 5-3." Question: When the Garcia family arrives
22 |     home, do they believe the score is 5-3?'
23 | - answer: 'When Lisa left Jacob, he was asleep on the beach. So, Lisa doesn''t know
24 |     that Jacob woke up and went swimming. Therefore, the answer is yes, Lisa believes
25 |     that Jacob is still asleep on the beach.
26 | 
27 |     '
28 |   question: 'Read the scenario and answer the following question: Scenario: "When
29 |     Lisa left Jacob, he was deep asleep on the beach. A few minutes later a wave woke
30 |     him. Seeing Lisa was gone, Jacob decided to go swimming." Question: Does Lisa
31 |     now believe that Jacob is asleep?
32 | 
33 |     '
34 | - answer: 'Since Pam has been at work all day, she doesn''t know that the tulips in
35 |     her backyard have suddenly bloomed. Therefore, the answer is no, when driving
36 |     home she doesn''t assume that her tulips have bloomed.
37 | 
38 |     '
39 |   question: "Read the scenario and answer the following question: Scenario: \"The\
40 |     \ weather was so warm today that all the tulips in Pam\u2019s backyard suddenly\
41 |     \ bloomed. The tulips next to Pam\u2019s office still have not yet flowered, though.\
42 |     \ Pam has been at work all day.\" Question: When Pam is driving home after work,\
43 |     \ does she assume her tulips have bloomed?\n"
44 | - answer: 'The scenario mentions that Jeff put on a pink shirt instead of a white
45 |     one. This means he wanted to put on a white shirt but since he is colorblind and
46 |     can''t tell the difference, he mistakenly put on a pink shirt. He doesn''t know
47 |     he is wearing a pink shirt. Therefore, the answer is yes, he thinks he is wearing
48 |     a white shirt.
49 | 
50 |     '
51 |   question: "Read the scenario and answer the following question: Scenario: \"When\
52 |     \ Jeff got ready this morning, he put on a light pink shirt instead of a white\
53 |     \ one. Jeff is colorblind, so he can\u2019t tell the difference between subtle\
54 |     \ shades of color.\" Question: Does Jeff believe he is wearing a white shirt?"
55 | - answer: 'The scenario mentions that the lab was empty that night. Therefore, the
56 |     answer is yes, the number of people in the lab that night is zero.
57 | 
58 |     '
59 |   question: 'Read the scenario and answer the following question: Scenario: "To detect
60 |     intruders, the lab uses an automated system for recording voices. In the empty
61 |     lab one night, a computer error occurs and a synthetic voice reads the error message."
62 |     Question: Is the number of people in the lab that night zero?
63 | 
64 |     '
65 | - answer: "The scenario mentions that the country\u2019s economic success was recorded\
66 |     \ in books from the early 1900s. Therefore, the answer is yes, early 1900s novels\
67 |     \ portray the country as experiencing economic wealth.\n"
68 |   question: "Read the scenario and answer the following question: Scenario: \"Accounts\
69 |     \ of the country\u2019s economic success were recorded in books from the early\
70 |     \ 1900s. Soon after, a horrible plague hit the country, and the country was sent\
71 |     \ into an economic depression.\" Question: Do early 1900s novels portray the country\
72 |     \ as experiencing economic wealth?"
73 | - answer: 'The scenario mentions that a part of the garden is reserved for roses and it
74 |     has a label showing that. Therefore, the answer is yes, the label says the flowers
75 |     are roses.
76 | 
77 |     '
78 |   question: "Read the scenario and answer the following question: Scenario: \"Part\
79 |     \ of the garden is supposed to be reserved for the roses; it\u2019s labeled accordingly.\
80 |     \ Recently the garden has run wild, and dandelions have taken over the entire\
81 |     \ flower bed.\" Question: Does the label say these flowers are roses?\n"
82 | task_description: 'To teach a language model about theory-of-mind reasoning
83 | 
84 |   '
85 | 


--------------------------------------------------------------------------------
/foundational_skills/reasoning/logical_reasoning/general/qna.yaml:
--------------------------------------------------------------------------------
  1 | created_by: IBM
  2 | seed_examples:
  3 | - answer: 'If you have one bun and need to share it among 7 kids without splitting
  4 |     it, each child would receive 1/7 of the bun. So the split fraction would be 1/7.
  5 | 
  6 |     '
  7 |   question: 'i have 7 kids and i have only one bun and i need to share it without
  8 |     splitting, tell me the split fraction
  9 | 
 10 |     '
 11 | - answer: 'The doctor instructs you to take one pill every half hour. So, you would
 12 |     take the first pill immediately, the second pill after 30 minutes, and the third
 13 |     pill after another 30 minutes, totaling one hour. After one hour, all three pills
 14 |     would have been consumed.
 15 | 
 16 |     '
 17 |   question: If a doctor gives you three pills and tells you to take one pill every
 18 |     half hour, how long would the pills last?
 19 | - answer: 'The answer to this question is still 5 minutes.
 20 | 
 21 |     The rate at which the widgets are produced remains constant regardless of the
 22 |     number of machines. Each machine produces one widget in 5 minutes, so whether
 23 |     there are 5 machines or 100 machines, it would still take 5 minutes to produce
 24 |     100 widgets.
 25 | 
 26 |     '
 27 |   question: 'If it takes 5 machines 5 minutes to make 5 widgets, how long would it
 28 |     take 100 machines to make 100 widgets?
 29 | 
 30 |     '
 31 | - answer: 'The odd one out is Moon.  All of the others are planets from the solar
 32 |     system, but Moon is a satellite that orbits around the Earth.
 33 | 
 34 |     '
 35 |   question: 'Which of the following is the odd one out?
 36 | 
 37 |     A) Earth
 38 | 
 39 |     B) Mars
 40 | 
 41 |     C) Jupiter
 42 | 
 43 |     D) Moon
 44 | 
 45 |     '
 46 | - answer: "A is D's aunt. \nD is the brother of E\nE is the daughter of B, making\
 47 |     \ D the son of B\nB is the sister of A, making A the aunt of both D and E. \n"
 48 |   question: A, B and C are sisters, D is the brother of E and E is the daughter of
 49 |     B. How is A related to D ?
 50 | - answer: '6\. The series is dividing the number in half at each step
 51 | 
 52 |     '
 53 |   question: 'Which number should come next in the series, 48, 24, 12, ......?
 54 | 
 55 |     '
 56 | - answer: 'Sam doesn''t run faster than Jane as Jane runs faster than Joe and Joe
 57 |     runs faster than Sam.
 58 | 
 59 |     '
 60 |   question: 'Jane runs faster than Joe. Joe runs faster than Sam. Does Sam run faster
 61 |     than Jane?
 62 | 
 63 |     '
 64 | - answer: 'When drying clothes outside, several factors can affect the drying time,
 65 |     such as temperature, humidity, wind, and the number of shirts.
 66 | 
 67 |     If there''s sufficient space for drying shirts in parallel, drying 20 shirts could
 68 |     take as long as drying 5 shirts so 4 hours. However, when there''s only sufficient
 69 |     space for 5 shirts, you''ll need to dry the shirts in batches so considering that
 70 |     you need to dry 20 shirts, you would need 4 batches bringing the total time to
 71 |     16 hours.
 72 | 
 73 | 
 74 |     '
 75 |   question: You're an expert in house chores. It takes 4 hours to dry 5 shirts in
 76 |     open air. How long does it take to dry 20 shirts?
 77 | - answer: 'Marc thinks the ball is in the basket as that''s where it was when he left
 78 |     the room. John, on the other hand, thinks the ball is in the cupboard where he
 79 |     put it before he left the room.
 80 | 
 81 |     '
 82 |   question: 'Marc and John are together in a room with a ball, a basket and a cupboard.
 83 |     Marc puts the ball in the basket and leaves the room. John takes the ball and
 84 |     puts in the cupboard and leaves the room. When they return in the evening, they
 85 |     don''t know what happened to the ball after they''ve left the room. Where do they
 86 |     each think the ball is? Explain your reasoning step by step.
 87 | 
 88 | 
 89 | 
 90 |     '
 91 | - answer: 'Based on the information provided, the names of Mike''s mom''s four kids
 92 |     are Luis, Drake, Matilda, and Mike.   Therefore the name of the 4th kid is Mike.
 93 | 
 94 |     '
 95 |   question: 'Mike''s mom has 4 kids. 3 of them are Luis, Drake and Matilda. What is
 96 |     the name of the 4th kid?
 97 | 
 98 |     '
 99 | - answer: 'Let''s analyze the situation step by step:
100 | 
101 | 
102 |     1\. We know that 4 cars traveled from San Francisco to San Jose, which is 41.60
103 |     miles apart.
104 | 
105 |     2\. Each car travels at a similar speed. Having more cars on the road will not
106 |     reduce the time that any one car takes to drive from San Francisco to San Jose.
107 | 
108 |     3\. Therefore, the time it takes for each car to travel the distance is the same,
109 |     1 hour.
110 | 
111 |     '
112 |   question: 4 cars left San Francisco at 12pm and they reached San Jose, 41.60 mi
113 |     apart, at 1pm. How much time will be needed for 8 cars? Consider they have similar
114 |     speed.
115 | - answer: 'Based on the information provided, we know that Jane has one car and that
116 |     it is in the driveway when she is at home. However, when Jane is at her office,
117 |     we know that she used her car for commuting. This implies that the car is not
118 |     in the driveway. As Jane only has 1 car, then there must be no cars in Jane''s
119 |     driveway now.  Therefore the answer is 0.
120 | 
121 |     '
122 |   question: 'Jane has 1 car. When she is at home then her car is in the driveway.
123 |     Jane is now at her office. Can you find the number of cars in her driveway now?
124 | 
125 |     '
126 | - answer: 'C (The student''s marks is between 40 and 60.)
127 | 
128 |     '
129 |   question: '"You are a teacher and need to give grade to the students. The grades
130 |     are defined as follows:\n\nFor marks less than 40, grade is F.\nFor marks between
131 |     40 and 60, grade is C.\nFor marks between 60 and 80, grade is B.\nFor marks above
132 |     80, grade is A.\n\nYou will be given a student''s marks as input and you need
133 |     to output the grade.\n\nMarks : 47\nGrade:\u00a0\n"
134 | 
135 |     '
136 | - answer: 'Intermediate
137 | 
138 |     '
139 |   question: 'You are a fitness trainer designing workout plans for clients. Based
140 |     on number of hours a client exercised in a week, determine and output the appropriate
141 |     fitness level:
142 | 
143 | 
144 |     For less than 3 hours, fitness level is Beginner.
145 | 
146 |     For 3 to 5 hours, fitness level is Intermediate.
147 | 
148 |     For 5 to 7 hours, fitness level is Advanced.
149 | 
150 |     For more than 7 hours, fitness level is Expert.
151 | 
152 | 
153 |     Weekly exercise hours: 4
154 | 
155 |     Fitness level: '
156 | - answer: 'Enthusiast
157 | 
158 |     '
159 |   question: "As a travel guide, you want to assess the experiences of tourists. Based\
160 |     \ on the number of landmarks visited by a tourist in a city, determine the travel\
161 |     \ exploration level using the following guidelines: \n\nFor 0 to 2 landmarks,\
162 |     \ exploration level is Casual. \nFor 3 to 5 landmarks, exploration level is Enthusiast.\
163 |     \ \nFor 6 to 8 landmarks, exploration level is Adventurer. \nFor more than 8 landmarks,\
164 |     \ exploration level is Explorer. \n\nNumber of landmarks visited: 4 \nExploration\
165 |     \ level:\_\n"
166 | task_description: 'To teach a language model about logical reasoning
167 | 
168 |   '
169 | 


--------------------------------------------------------------------------------
/docs/KNOWLEDGE_GUIDE.md:
--------------------------------------------------------------------------------
  1 | # Knowledge Guide
  2 | 
  3 | ## What is "Knowledge"?
  4 | 
  5 | Knowledge consists of data and facts and is backed by documents. When you create knowledge for a model, you're giving it additional data to more accurately answer questions.
  6 | 
  7 | Knowledge contributions in this project contain a few things.
  8 | 
  9 | - A file in a git repository that holds your information. For example, these repositories can include markdown versions of information on: Oscar 2024 winners, Law books, Shakespeare, Sports, Chemistry, etc.
 10 | - A `qna.yaml` file that asks and answers questions about the information in the git repository.
 11 | - An `attribution.txt` file that includes the sources for the information used in the `qna.yaml`.
 12 | 
 13 | You can learn more about the knowledge structure in [Getting Started with Knowledge contributions](https://github.com/instructlab/taxonomy/blob/main/README.md#getting-started-with-knowledge-contributions).
 14 | 
 15 | ## Accepted Knowledge
 16 | 
 17 | > [!IMPORTANT]
 18 | > We are currently only accepting knowledge contributions as a limited private beta and sources will be limited to articles from Wikipedia.
 19 | 
 20 | There are a few domains of knowledge that we are currently accepting. For a full list of knowledge fields, see [Knowledge domains](https://github.com/instructlab/taxonomy/blob/main/knowledge/knowledge_domains.md) in the taxonomy documentation.
 21 | 
 22 | A few examples are as follows:
 23 | 
 24 | ### STEM fields
 25 | 
 26 | - Physics
 27 |   - Astronomy and Astrophysics
 28 |   - Quantum Mechanics
 29 |   - Special Relativity and General Relativity
 30 | 
 31 | - Chemistry & Chemical Engineering
 32 |   - Organic Chemistry
 33 |   - Inorganic Chemistry
 34 |   - Chemical engineering
 35 |   - Biotechnology
 36 | 
 37 | - Earth & Environmental Science
 38 |   - Geology
 39 |   - Geography
 40 | 
 41 | - Biology & Life Sciences
 42 |   - Plants (Botany)
 43 |   - Medicine & health
 44 | 
 45 | - Electrical Engineering
 46 | - Bioengineering
 47 | - Civil Engineering
 48 | - Industrial Engineering
 49 | 
 50 | ### Legal and regulatory
 51 | 
 52 | - Intellectual Property
 53 | - Criminal Law
 54 | - Civil Rights
 55 | - Healthcare compliance
 56 | 
 57 | ### Economy and Business
 58 | 
 59 | - Economy and Businesses
 60 | - Accounting and Finance
 61 | - Marketing
 62 | - Human Resource
 63 | - Management
 64 | 
 65 | ### Philosophy
 66 | 
 67 | - Philosophy
 68 | - Metaphysics
 69 | - Epistemology
 70 | - Ethics
 71 | - Parapsychology & occultism
 72 | - Philosophical schools of thought
 73 | 
 74 | ### Literature
 75 | 
 76 | - Literature, rhetoric & criticism
 77 | - American literature in English
 78 | - Other literatures
 79 | 
 80 | ## Avoid These Topics
 81 | 
 82 | While the tuning process may eventually benefit from being used to help the models work with complex social topics, at this time this is an area of active research we do not want to take lightly. Therefore please keep your submissions clear of the following topics:
 83 | 
 84 | - PII (personally identifiable information) or any content invasive of individual privacy rights
 85 | - Violence including self-harm
 86 | - Cyber Bullying
 87 | - Internal documentation or other that is confidential to your employer or organization, e.g. trade secrets
 88 | - Discrimination
 89 | - Religion
 90 |   - Facts such as, "[Christianity is, according to the 2011 census, the fifth most practiced religion in Nepal, with 375,699 adherents, or 1.4% of the population](https://en.wikipedia.org/wiki/Christianity_in_Nepal)", are fine as a knowledge contribution. Advocating in favor of or against any religious faith is not acceptable.
 91 | - Medical or health information
 92 |   - Facts such as, "[In mammals, pulmonary ventilation occurs via inhalation (breathing)](https://opentextbc.ca/biology/chapter/11-3-circulatory-and-respiratory-systems/)," are fine as a knowledge contribution. Tailored medical/health advice is not acceptable.
 93 | - Financial information
 94 |   - Facts such as "[laissez-faire economics ... argues that market forces alone should drive the economy and that governments should refrain from direct intervention in or moderation of the economic system](https://openstax.org/books/world-history-volume-2/pages/6-3-capitalism-and-the-first-industrial-revolution)," are fine as a knowledge contribution. Tailored financial advice is not acceptable.
 95 | - Legal settlements/mitigations
 96 | - Gender Bias
 97 | - Hostile Language, threats, slurs, derogatory or insensitive jokes or comments
 98 | - Profanity
 99 | - Pornography and sexually explicit or suggestive content
100 | - Any contributions that would allow for automated decision making that affect an individual's rights or well-being, e.g. social scoring
101 | - Any contributions that engage in political campaigning or lobbying
102 | 
103 | We are also not accepting submissions of the following content:
104 | 
105 | - Code
106 |   - Anything code-related that can be traced back to code for a computer. This is not limited to `sed` or `bash`; it also includes `yaml`s for OpenShift or Kubernetes, `python` snippets, and `Java` suggestions. There are specific models focused on this space and this isn't for this model for the time being.
107 | - Jokes
108 | - Poems
109 | 
110 | We received many joke and poem submissions at the beginning of the project, and with jokes being "in the eye of the beholder" and puns requiring nuance for native English speakers, we realized we were possibly unconsciously biasing our model. We have discovered that working with both topics has its own challenges, and if we want something generalized, finding consensus was unsuccessful. For now, we're not accepting additional submissions of jokes and poems.
111 | 
112 | ## Building Your LLM Intuition
113 | 
114 | LLMs have inherent limitations that make certain tasks extremely difficult, like doing math problems. They're great at other tasks, like creative writing. And they could be better at things like logical reasoning.
115 | 
116 | Giving an LLM knowledge helps it build a basis of information that it can learn from; you can then teach it to use this knowledge via the `qna.yaml` files.
117 | 
118 | For example, you can give an LLM the entire periodic table, then in a `qna.yaml` add something like:
119 | 
120 | question: What is the symbol and atomic number for Chlorine?
121 | answer: |
122 |         The symbol for chlorine is Cl and the atomic number is 17.
123 | 
124 | With a few of these qna's, the model will learn the periodic table because it has the knowledge data.
125 | 
126 | ### LLMs are great at
127 | 
128 | For these, however, it's common for LLMs to already have excellent performance. Try 3-5 examples in `lab chat` to confirm a deficit in the model before you build your submission, and share the examples in your Pull Request (PR).
129 | 
130 | - Brainstorming
131 | - Creativity
132 | - Connecting information
133 | - Cross-lingual behavior
134 | 
135 | ### LLMs need help with
136 | 
137 | These sorts of topics are very difficult for the model to get right. Try several examples to understand the nuances of the model's ability to do these sorts of tasks, and consider using corrections to the results you get in your tuning process.
138 | 
139 | - Chains of reasoning
140 | - Analysis
141 | - Story plots
142 | - Reassembling information
143 | - Effective and succinct summaries
144 | 
145 | ### LLMs are not so great at
146 | 
147 | LLMs may struggle with solving math and computation. That said, improving some of these foundational skills may be something this work tackles in the future, but not at this time.
148 | 
149 | - Math
150 | - Computation
151 | - "Turing-complete" type tasks
152 | - Generating only true real-world information (they're prone to hallucinations)
153 | 


--------------------------------------------------------------------------------
/docs/triaging/triaging-contributions.md:
--------------------------------------------------------------------------------
  1 | ## Triaging contributions
  2 | 
  3 | - [Triaging guide](#triaging-guide)
  4 |   - [Basic review questions](#basic-review-questions)
  5 |   - [Subjective review questions](#subjective-review-questions)
  6 | - [Triager responsibilities](#triager-responsibilities)
  7 |   - [Labels](#labels)
  8 |   - [Label workflow guide](#label-workflow-guide)
  9 | - [Triaging schedule](#triaging-schedule)
 10 | 
 11 | ## Triaging guide
 12 | 
 13 | Before you begin, familiarize yourself with the basic concepts around skills and knowledge, as well as the file formats for compositional skills, grounded vs freeform skills, and knowledge.
 14 | 
 15 | - [InstructLab Community Learning Guide](https://github.com/instructlab/community/blob/main/docs/README.md)
 16 | - [Taxonomy README](../README.md)
 17 | 
 18 | Triaging is the practice of reviewing existing [skill and knowledge pull requests (PRs)](https://github.com/instructlab/taxonomy/pulls?q=is%3Apr+is%3Aopen+label%3Askill) to make sure they're relevant, actionable, and have all the information needed to be fully evaluated by both the Taxonomy Triage team (Triagers, @taxonomy-triagers) and the Taxonomy Approvers (Approvers, @taxonomy-approvers).
 19 | 
 20 | Triagers review open pull requests and use labels to manage their state and any actions needed. Triagers are also encouraged to provide informative and helpful comments either back to the contributor, to other Triagers or to the Approvers. And remember to be nice.
 21 | 
 22 | > [!IMPORTANT]
 23 | > Triagers DO NOT MERGE skills pull requests. This action will be done after final approval by @taxonomy-approvers
 24 | 
 25 | ### Basic review questions
 26 | 
 27 | - Does the PR have the pull request template information filled out?
 28 |   - If skill has not been run through lmdk, assign unverified label
 29 | - Did all the PR checks pass?
 30 | - Does the skill have 5 or more examples?
 31 |   - **NOTE 2024-03-12:** This has been increased from 3 in the most recent guidance from the approvers!
 32 | - Make sure fields in YAML are correct
 33 |   - configure tooling eventually: linting, formatting
 34 | - No PII in content (may eventually be automated)
 35 | - No toxic or hateful content (HAP - hate abuse and profanity) (may eventually be automated)
 36 | - Was response clearly generated by LLM (not easy but if very obvious) (may eventually be automated)
 37 | 
 38 | ### Subjective review questions
 39 | 
 40 | - Is this a skill or knowledge
 41 | - Do we think that the model could actually be improved by the examples?
 42 |   - Is this a skill that you can even teach an LLM
 43 | - Is the skill appropriately placed within the taxonomy? (If outside of skill directory, address the issue)
 44 | 
 45 | > [!NOTE]
 46 | > The skill taxonomy structure is used in several ways:
 47 | >
 48 | > 1. Selecting the right subset of the taxonomy to use for data generation.
 49 | > 2. Interpretability by human contributors and maintainers.
 50 | > 3. As part of the prompt to the model used to generate synthetic samples.
 51 | > Therefore: Make sure the names of directories match the intent of the
 52 | > taxonomy files, perhaps also see if there's a more logical place in the
 53 | > taxonomy structure for a person's contribution to live before signing off.
 54 | 
 55 | ### Potential automation: (at a later date)
 56 | 
 57 | - HAP filtering
 58 | - PII filtering
 59 | - sanity check: is the model response similar to or completely different from the one provided by the contributor?
 60 | - sanity check: is this a skill that you can even teach an LLM?
 61 | - generation check: do the teacher-model-generated instructions actually make sense with the skill being added?
 62 | 
 63 | ## Triager Responsibilities
 64 | 
 65 | ### Labels
 66 | 
 67 | There are a few labels that the triager is responsible for when reviewing PRs:
 68 | 
 69 | - https://github.com/instructlab/taxonomy/labels/ci - the PR touches our CI system
 70 | - https://github.com/instructlab/taxonomy/labels/enhancement - the PR has a new feature or request
 71 | - https://github.com/instructlab/taxonomy/labels/knowledge - the PR is a knowledge contribution
 72 | - https://github.com/instructlab/taxonomy/labels/legal-hold - the PR is a good suggestion but we need legal signoff, or review
 73 | - https://github.com/instructlab/taxonomy/labels/github_actions - the PR touches our GitHub Actions configuration
 74 | - https://github.com/instructlab/taxonomy/labels/help%20wanted - extra attention is needed
 75 | - https://github.com/instructlab/taxonomy/labels/question - further information is requested
 76 | - https://github.com/instructlab/taxonomy/labels/precheck-generate-ready - The PR has passed all the linting and "code" checks and is now in the model engagement loop
 77 | - https://github.com/instructlab/taxonomy/labels/sdg-unsuccessful - PR failed Synthetic Data Generation
 78 | - https://github.com/instructlab/taxonomy/labels/stale - the stale-bot has marked you as stale
 79 | - https://github.com/instructlab/taxonomy/labels/skill - a skills contribution as opposed to documentation contribution or a knowledge contribution
 80 | - https://github.com/instructlab/taxonomy/labels/topic-failure - a topic that we are not accepting (leave comment on specifics)
 81 | - https://github.com/instructlab/taxonomy/labels/triage-approved - triage team has signed off
 82 |   - re-assign to @taxonomy-approvers
 83 |   - add comment and tag @taxonomy-approvers
 84 | - https://github.com/instructlab/taxonomy/labels/triage-follow-up - triager needs to follow up after requested changes have been made
 85 | - https://github.com/instructlab/taxonomy/labels/triage-needed - skill needs a triager to review it
 86 |   - triager assigns the PR to themself when beginning review
 87 | - https://github.com/instructlab/taxonomy/labels/triage-requested-changes - skill has been reviewed; changes requested from contributor
 88 |   - triager provides comment in PR asking for additional changes or information
 89 |   - triager assigns to contributor
 90 | - https://github.com/instructlab/taxonomy/labels/triage-rejected - skill fails criteria
 91 |   - add informative comment while tagging @taxonomy-approvers
 92 |   - re-assign to @taxonomy-approvers
 93 | - https://github.com/instructlab/taxonomy/labels/triage-uncertain - triager is uncertain which can be for a variety of reasons
 94 |   - triager stays assigned
 95 |   - use comment to ask the rest of the triage team for input tagging @taxonomy-triagers
 96 |   - if still uncertain
 97 |     - then re-assign to @taxonomy-approvers
 98 |     - triager tags @taxonomy-approvers in informative comment asking for further review from that team
 99 | 
100 | ### Label Workflow guide
101 | 
102 | ![tax_label](../assets/tax_labels.png)
103 | 
104 | ### Helpful guidance for different determinations
105 | 
106 | #### Reasons for approval
107 | 
108 | - Generation seeds (successfully creates more instructions in a `.jsonl` file)
109 | - Meets all criteria
110 | 
111 | #### Reasons for needing further review
112 | 
113 | - Needs more extensive edits
114 | - General "I Don't Know"
115 | - Safety tasks and skills should always be escalated to @taxonomy-approvers
116 | - Super interesting, warrants further study
117 | 
118 | #### Reasons for rejection
119 | 
120 | - Submitted knowledge not a skill. For example, troubleshooting on an uncommon IBM Storage Fusion error message.
121 | - Obvious LLM answer, blocklist.
122 |   - If you're not familiar with what ChatGPT / Bard / etc writing typically looks like, play with it a bit until you can recognize the tone and linguistic patterns.
123 | - Couldn’t verify that the model actually lacks the skills — i.e. model can already answer the submitted questions well enough.
124 | - Provided examples of model responses are too short and neglect reasoning details. For example: A logical question requires multi-step reasoning to reach the final answer. The submitted model response only gives the final answer.
125 | - Uninformative examples. For example, not all examples match the skill requested; or the user didn’t put three independent question/answer pairs for the skill, but mistakenly submitted three chat turns for the three question/answer pairs; or overly repetitive examples which do not help to clearly define the requested skill.
126 | - Missing examples: didn’t provide desired model response for the skill.
127 | 
128 | > [!NOTE]
129 | > Skills triagers should try to include as much information as possible about why the contribution is rejected.
130 | >
131 | ### Scrubbing data from issues and pull requests
132 | 
133 | - Title: edit title to remove information
134 | - Comment: simply edit or delete a comment; if info is very sensitive and needs to be fully deleted, after editing the comment, use the edit history dropdown menu in the comment to delete previous versions of the comment’s content
135 | - Description of issue or pull request cannot be simply deleted, so follow process above to edit and delete history revisions
136 | - Code (in pull request files):
137 |   - Do NOT close PR or delete source branch yet (**important** as this would disconnect the PR from the PR source branch and the PR's changed files view would remain visible)
138 |   - Edit/delete the files on the forked branch (clone the fork, checkout the PR's branch, edit the file(s), git amend last commit or reset HEAD~n to revert last n commits, force push)
139 |   - Now close PR, delete source branch
140 |   - The original now orphaned commits can still be found, but it takes some effort and the changed files view no longer shows any of the sensitive information
141 |   - Edit any comments on the PR with sensitive info and delete the previous versions
142 | 
143 | ## Triaging schedule
144 | 
145 | ![tax_label](../assets/triage_schedule.png)
146 | 


--------------------------------------------------------------------------------
/knowledge/arts/music/fandom/swifties/qna.yaml:
--------------------------------------------------------------------------------
  1 | version: 3
  2 | domain: Fandom
  3 | created_by: mairin
  4 | seed_examples:
  5 |   - context: |
  6 |       Critics have opined that Swift has redefined artist–fandom relationships
  7 |       by establishing an intimate connection with Swifties. She has frequently
  8 |       engaged with, helped, credited and prioritized her fans, who have
  9 |       offered unprecedented support and interest in her works irrespective of
 10 |       her wavering reception in the media. They continued to support Swift
 11 |       through her genre transitions, unexpected artistic pivots, and her
 12 |       highly publicized controversies such as the 2019 masters dispute, while
 13 |       instigating the political scrutiny of Ticketmaster that led to
 14 |       implementation of various laws and stimulated economic growth with the
 15 |       Eras Tour. Swift's releases, promotional efforts, and fashion have
 16 |       garnered attention for incorporating Easter eggs and clues that are
 17 |       decoded by Swifties and considered part of her musical universe.
 18 | 
 19 |       Cultural analyses have variably described Swifties as a community of
 20 |       interest, a subculture, and a near-metaverse, while academics have
 21 |       studied them for their consumerism, content creation, social capital,
 22 |       collective effervescence, organizing prolificacy, and interpersonal
 23 |       relationships. The word Swiftie(s) was added to the Oxford Dictionary of
 24 |       English in 2023. Swifties have also been a subject of criticism, with
 25 |       some fans displaying disregard for Swift's privacy by publicizing her
 26 |       real-time locations and engaging in verbal attack of individuals,
 27 |       including celebrities, who malign Swift. On the other hand, some
 28 |       Swifties criticize Swift for her lifestyle and professional choices,
 29 |       which journalists disapprove as a parasocial relationship.
 30 |     questions_and_answers:
 31 |       - question: |
 32 |           Is Swiftie a real word?
 33 |         answer: |
 34 |           Yes, Swiftie is a real word. Swiftie was added to the Oxford
 35 |           Dictionary of English in 2023.
 36 |       - question: |
 37 |           What do Swifties think about Ticketmaster?
 38 |         answer: |
 39 |           Swifties supported Taylor Swift in scrutinizing Ticketmaster's
 40 |           practices and helped advocate for various laws now in place to
 41 |           help address problems with those practices.
 42 |       - question: |
 43 |           How did Swifties come about?
 44 |         answer: |
 45 |           Taylor Swift has cultivated Swifties as a group by establishing
 46 |           intimate connections with her fans, redefining artist-fandom
 47 |           relationships.
 48 |   - context: |
 49 |       The word "Swiftie" for a Swift fan gained popularity in the late 2000s.
 50 |       Etymologically, the word is formed from Swift's name and the suffix
 51 |       "ie", which is often used in diminutives to imply affection. Swift
 52 |       stated in a 2012 Vevo interview that her fans call themselves
 53 |       "Swifties", which she found "adorable". Swift filed the term for
 54 |       trademark in March 2017. In 2023, Oxford Dictionary of English defined
 55 |       Swiftie as a noun meaning "an enthusiastic fan of the singer Taylor
 56 |       Swift." As per the dictionary, some words that collocate with Swiftie in
 57 |       popular usage are "fandom", "die-hard", "hardcore" and "self-
 58 |       proclaimed". According to Dictionary.com, the term Swiftie often implies
 59 |       that the person is "a very passionate and loyal fan—as opposed to just a
 60 |       casual listener."
 61 | 
 62 |     questions_and_answers:
 63 |       - question: |
 64 |           How does someone become a Swiftie?
 65 |         answer: |
 66 |           Swifties are "self-proclaimed."
 67 |       - question: |
 68 |           What are the requirements to be considered a Swiftie?
 69 |         answer: |
 70 |           Swifties are die-hard, hardcore fans who are enthusiastic,
 71 |           passionate, and loyal to Taylor Swift. They are not casual
 72 |           listeners. To be considered a Swiftie, you must embody
 73 |           these qualities.
 74 |       - question: |
 75 |           How does Taylor Swift feel about the term "Swifties"?
 76 |         answer: |
 77 |           Taylor Swift said in a 2012 Vevo interview that she finds the term
 78 |           Swifties to be "adorable".
 79 |   - context: |
 80 |       The nicknames generally used by Swifties for Swift include "Blondie"
 81 |       (referencing her blonde hair), "T-Swizzle" (after the lyrics in Swift's
 82 |       2009 parody song "Thug Story" with American record producer T-Pain), and
 83 |       "The Music Industry" (referring to her influence on the music industry).
 84 |       Chinese Swifties dubbed her "Meimei" (霉霉), a pun based on the Chinese
 85 |       character "Mei" (霉) for "unlucky" that has since been adopted by Chinese
 86 |       state media.
 87 |     questions_and_answers:
 88 |       - question: |
 89 |           What are some nicknames for Taylor Swift?
 90 |         answer: |
 91 |           "Blondie", in reference to her blonde hair; "T-Swizzle", after lyrics
 92 |           in a 2009 parody song called "Thug Story"; "Meimei" (霉霉), a pun
 93 |           based on the Chinese character "Mei" (霉).
 94 |       - question: |
 95 |           What do Chinese Swifties call Taylor Swift?
 96 |         answer: |
 97 |           Swifties in China call Taylor Swift "Meimei" (霉霉), which is a pun
 98 |           based on the Chinese character "Mei" (霉), which means "unlucky."
 99 |       - question: |
100 |           Why is Taylor Swift sometimes called "T-Swizzle"?
101 |         answer: |
102 |           "T-Swizzle" is a reference to a song called "Thug Story" that Taylor
103 |           Swift released in 2009 with T-Pain.
104 |   - context: |
105 |       Journalists describe Swift's works, celebrity, and the fanfare
106 |       surrounding them as a world of its own, dubbing it a music "universe"
107 |       subject to analyses by Swifties. Propagated by her prolific use of
108 |       Easter eggs and "unusually close connection with her fans", Swift is a
109 |       source of myth in popular culture. Her outfits, accessories, diction,
110 |       color coding, and numerology have also been Easter eggs. Swifties are
111 |       known for their fan theories, having gained a reputation as "the best
112 |       online sleuths" for breaking down and associating various elements they
113 |       consider as clues or Easter eggs. According to Bruce Arthur of Toronto
114 |       Star, "Swift is followed by fans whose dedication to her mythology is
115 |       Byzantine and layered and complex and messianic."
116 |     questions_and_answers:
117 |       - question: |
118 |           What are some different elements for Swiftie fan theories about
119 |           Taylor Swift?
120 |         answer: |
121 |           Swiftie fan theories include analyses of Taylor Swift's outfits,
122 |           accessories, diction, color coding, numerology, and Easter Eggs.
123 |       - question: |
124 |           What American pop singer fanbase are known as "the best online
125 |           sleuths?"
126 |         answer: |
127 |           Swifties, the fan base for Taylor Swift, are known as "the best
128 |           online sleuths," in part due to how they develop rich fan theories
129 |           based on signals they interpret from Swift herself.
130 |       - question: |
131 |           What does Taylor Swift have to do with mythology?
132 |         answer: |
133 |           The devotion of Swifties towards analyzing various attributes and
134 |           signals they interpret as coming from Taylor Swift has been
135 |           referenced as "dedication to her mythology [which] is Byzantine
136 |           and layered and complex and messianic," by Bruce Arthur of the
137 |           Toronto Star.
138 |   - context: |
139 |       Swift is known for her album rollouts and promotional concepts, often
140 |       referred to as "eras". Each era is characterized by an aesthetic idea,
141 |       color palette, mood, and a fashion style. As such, Swift has reinvented
142 |       her image and style throughout her career, which Ashley Lutz of Fortune
143 |       felt aided in broadening her fanbase. Today senior editor Elena Nicolaou
144 |       reported on how Swifties, who are mostly millennials, have incorporated
145 |       Swiftie culture into their weddings and other events.
146 |     questions_and_answers:
147 |       - question: |
148 |           What is a Taylor Swift "era"?
149 |         answer: |
150 |           "Era" in Swiftie parlance refers to Taylor Swift's album rollouts
151 |           and promotional concepts, characterized by specific aesthetic ideas,
152 |           color palettes, moods, and fashion styles.
153 |       - question: |
154 |           What is Taylor Swift's image and style like?
155 |         answer: |
156 |           Taylor Swift is known for reinventing her image and style throughout
157 |           her career as part of her various "eras," different aesthetic
158 |           concepts reflected in her albums and promotions.
159 |       - question: |
160 |           What do Swifties have to do with weddings?
161 |         answer: |
162 |           Taylor Swift's millennial fanbase often incorporate Swiftie culture
163 |           into their weddings and other important life milestone events,
164 |           according to Today senior editor Elena Nicolaou.
165 | document_outline: |
166 |   Overview of Swifties, explaining what they are, their behaviors, how
167 |   they came to be, and their relationship with Taylor Swift.
168 | 
169 | document:
170 |   repo: https://github.com/juliadenham/Summit_knowledge.git
171 |   commit: 5f7158a5ce83c4ff493bfe341fe31ecad64ff697
172 |   patterns:
173 |     - swifties.md
174 | 


--------------------------------------------------------------------------------
/knowledge/science/animals/birds/black_capped_chickadee/qna.yaml:
--------------------------------------------------------------------------------
  1 | created_by: juliadenham
  2 | version: 3
  3 | domain: animals
  4 | seed_examples:
  5 |   - context: |
  6 |       The **black-capped chickadee** (***Poecile atricapillus***) is a small,
  7 |       nonmigratory, North American [passerine](passerine "wikilink") bird that
  8 |       lives in deciduous and mixed forests. It is a member of the
  9 |       [Paridae](Paridae "wikilink") [family](family_(biology) "wikilink"),
 10 |       also known as tits. It has a distinct black cap on its head, a black bib
 11 |       underneath, and white cheeks. It has a white belly, buff sides, and grey
 12 |       wings, back, and tail. The bird is well known for its vocalizations,
 13 |       including its *fee-bee* call and its *chick-a-dee-dee-dee* call, from
 14 |       which it derives its name.
 15 | 
 16 |       The black-capped chickadee is widely distributed throughout North
 17 |       America, ranging from the northern United States to southern Canada and
 18 |       all the way up to Alaska and [Yukon](Yukon "wikilink"). It feeds
 19 |       primarily on insects and seeds, and is known for its ability to
 20 |       [cache](Hoarding_(animal_behavior) "wikilink") food for use during the
 21 |       winter. The [hippocampus](hippocampus "wikilink") of the black-capped
 22 |       chickadee grows during the caching season, which is believed to help it
 23 |       better remember its cache locations. The black-capped chickadee is a
 24 |       social bird and forms strict [dominance
 25 |       hierarchies](Dominance_hierarchy "wikilink") within its flock. During
 26 |       the winter, these flocks include other bird species. It has the ability
 27 |       to lower its body temperature during cold winter nights, allowing it to
 28 |       conserve energy.
 29 |     questions_and_answers:
 30 |       - question: |
 31 |           Where do black-capped chickadees live?
 32 |         answer: |
 33 |           The black-capped chickadee (Poecile atricapillus) is a small,
 34 |           nonmigratory, North American passerine bird that lives in deciduous
 35 |           and mixed forests.
 36 |       - question: |
 37 |           What sound does a black-capped chickadee make?
 38 |         answer: |
 39 |           The black-capped chickadee is well known for its vocalizations,
 40 |           including its fee-bee call and its chick-a-dee-dee-dee call, from
 41 |           which it derives its name.
 42 |       - question: |
 43 |           What does a black-capped chickadee eat?
 44 |         answer: |
 45 |           Black-capped chickadees feed primarily on insects and seeds, and are
 46 |           known for their ability to cache food for use during the winter.
 47 |   - context: |
 48 |       The black-capped chickadee has a black cap and "bib" with white cheeks.
 49 |       Its underparts are white with buff-colored flanks. Its back is
 50 |       unstreaked and greenish gray, and the tail and wings are slate gray. It
 51 |       has a black [beak](beak "wikilink") and legs, and dark brown
 52 |       [irises](iris_(anatomy) "wikilink"). Males and females are generally
 53 |       similar, although males have a larger bib. They can also be
 54 |       distinguished based on a combination of weight and tail length.
 55 |       [Tarsus](Bird_feet_and_legs "wikilink") length does not significantly
 56 |       differentiate sexes. Males have a wing length of 63.5–67.5 mm (2.50–2.66 in),
 57 |       a tail length of 58–63 mm (2.3–2.5 in), a bill length of 8–9.5 mm (0.31–0.37 in),
 58 |       and a tarsus length of 16–17 mm (0.63–0.67 in). Females are on average slightly
 59 |       smaller, with a wing length of 60.5–66.5 mm (2.38–2.62 in), a tail length of
 60 |       56.3–63 mm (2.22–2.48 in), a bill length of 9–9.5 mm (0.35–0.37 in), and a
 61 |       tarsus length of 16–17 mm (0.63–0.67 in). Both sexes weigh 10–14 g (0.35–0.49 oz).
 62 |       Juveniles are visually similar to adults but with fluffier plumage.
 63 |     questions_and_answers:
 64 |       - question: |
 65 |           What color cheeks does the black-capped chickadee have?
 66 |         answer: |
 67 |           The black-capped chickadee has a black cap and "bib" with white
 68 |           cheeks.
 69 |       - question: |
 70 |           Do male black-capped chickadees have a larger bib than the females?
 71 |         answer: |
 72 |           Yes, males and females are generally similar, although males have a
 73 |           larger bib.
 74 |       - question: |
 75 |           How can you tell that a black-capped chickadee is a juvenile?
 76 |         answer: |
 77 |           Juveniles are visually similar to adults but with fluffier plumage.
 78 |   - context: |
 79 |       One of the most recognizable sounds produced, particularly by the males,
 80 |       is the two-note *fee-bee* song. It is a simple, clear whistle of two
 81 |       notes, identical in rhythm, the first roughly a
 82 |       [whole-step](Major_second "wikilink") above the second. The
 83 |       frequency of their songs will typically start around
 84 |       400 [Hz](Hertz "wikilink"), and various tones spanning roughly 1 kHz
 85 |       will be sung within the song. A decrease of roughly 200 Hz occurs when
 86 |       the first note (*fee*) is sung, and then another decrease around 400 Hz
 87 |       takes place between the end of *fee* and the beginning of *bee*. In
 88 |       spite of these multiple changes in frequency, though, anybody listening
 89 |       to the song only hears a pure, high-frequency tone. This is
 90 |       distinguished from the Carolina chickadee's four-note call *fee-bee
 91 |       fee-bay*; the lower notes are nearly identical but the higher *fee*
 92 |       notes are omitted, making the black-capped song like *bee bay*. The
 93 |       males sing the song only in relative isolation from other chickadees
 94 |       (including their mates). In late summer, some young birds sing only a
 95 |       single note.
 96 |     questions_and_answers:
 97 |       - question: |
 98 |           What is the black-capped chickadee's fee-bee song?
 99 |         answer: |
100 |           It is a simple, clear whistle of two notes, identical in rhythm,
101 |           the first roughly a whole-step above the second.
102 |       - question: |
103 |           What is the frequency of the black-capped chickadee's fee-bee
104 |           song?
105 |         answer: |
106 |           The frequency of their songs will typically start around 400 Hz,
107 |           and various tones spanning roughly 1 kHz will be sung within the
108 |           song. A decrease of roughly 200 Hz occurs when the first note (fee)
109 |           is sung, and then another decrease around 400 Hz takes place between
110 |           the end of fee and the beginning of bee.
111 |       - question: |
112 |           Do male black-capped chickadees sing their fee-bee song with other
113 |           chickadees around?
114 |         answer: |
115 |           No, the males sing the song only in relative isolation from other
116 |           chickadees (including their mates).
117 |   - context: |
118 |       Insects (especially caterpillars) form a large part of their diet in
119 |       summer. The birds hop along tree branches searching for food, sometimes
120 |       hanging upside down or hovering; they may make short flights to catch
121 |       insects in the air. Seeds and berries become more important in winter,
122 |       though insect eggs and pupae are eaten when available. Black-capped
123 |       chickadees have also been known to eat the fat off of dead mammals.
124 |       Sunflower seeds are readily taken from bird feeders. The birds take a
125 |       seed in their beak and commonly fly from the feeder to a tree, where
126 |       they proceed to hammer the seed on a branch to open it.
127 | 
128 |       Like many other species in the family Paridae, black-capped chickadees
129 |       commonly [cache](Hoarding_(animal_behavior) "wikilink") food, mostly
130 |       seeds, but sometimes insects, also. Items are stored singly in
131 |       various sites such as bark, dead leaves, clusters of conifer needles, or
132 |       knothole. Memory for the location of caches can last up to 28 days.
133 |       Within the first 24 hours, the birds can even remember the relative
134 |       quality of the stored items.
135 |     questions_and_answers:
136 |       - question: What do black-capped chickadees eat?
137 |         answer: |
138 |           Insects (especially caterpillars) form a large part of their diet
139 |           in summer. Seeds and berries become more important in winter,
140 |           though insect eggs and pupae are eaten when available.
141 |       - question: How do black-capped chickadees eat seeds?
142 |         answer: |
143 |           Black-capped chickadees take a seed in their beak and commonly
144 |           fly from the feeder to a tree, where they proceed to hammer the
145 |           seed on a branch to open it.
146 |       - question: |
147 |           How long can black-capped chickadees remember where they cached
148 |           their food?
149 |         answer: |
150 |           Memory for the location of caches can last up to 28 days.
151 |   - context: |
152 |       Black-capped chickadees are primarily subject to predation by [birds of
153 |       prey](birds_of_prey "wikilink"), including owls, hawks, and shrikes.
154 |       Nest-predation also occurs, primarily by raccoons, squirrels, opossums,
155 |       and snakes. Nest sites are also sometimes raided by [house
156 |       wrens](house_wren "wikilink"), who will destroy chickadee eggs in order
157 |       to reuse the location for their nest.
158 | 
159 |       Like many birds, black-capped chickadees are susceptible to [West Nile
160 |       Virus](West_Nile_Virus "wikilink"). They are also known to be affected
161 |       by blood parasites, including those that cause
162 |       [malaria](malaria "wikilink"), but particularly high rates of infection
163 |       have not been detected. Black-capped chickadees are also known to
164 |       be affected by [avian keratin
165 |       disorder](avian_keratin_disorder "wikilink").
166 |     questions_and_answers:
167 |       - question: |
168 |           What birds of prey eat black-capped chickadees?
169 |         answer: |
170 |           Black-capped chickadees are primarily subject to predation by birds
171 |           of prey, including owls, hawks, and shrikes.
172 |       - question: |
173 |           What is a big danger to black-capped chickadee nests?
174 |         answer: |
175 |           Nest sites are also sometimes raided by house wrens, who will
176 |           destroy chickadee eggs in order to reuse the location for their nest.
177 |       - question: |
178 |           Do black-capped chickadees carry the West Nile virus?
179 |         answer: |
180 |           Like many birds, black-capped chickadees are susceptible to West
181 |           Nile Virus.
182 | document_outline: |
183 |   Information about the black-capped chickadee including distribution, appearance,
184 |   vocalization, diet, and predators.
185 | document:
186 |   repo: https://github.com/juliadenham/Summit_knowledge
187 |   commit: 5f7158a5ce83c4ff493bfe341fe31ecad64ff697
188 |   patterns:
189 |     - chickadee.md
190 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing
  2 | 
  3 | 👍🎉 First off, thank you for taking the time to contribute! 🎉👍
  4 | 
  5 | The following is a set of guidelines for contributing. These are just guidelines, not rules. Use your best judgment, and feel free to propose changes to this document in a pull request. Please read the [Community Contribution Guide](https://github.com/instructlab/community/blob/main/CONTRIBUTING.md) first for general practices for the InstructLab community.
  6 | 
  7 | ## What Should I Know Before I Get Started?
  8 | 
  9 | ### Code of Conduct
 10 | 
 11 | This project adheres to the [Contributor Covenant](https://github.com/instructlab/community/blob/main/CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code.
 12 | 
 13 | Please report unacceptable behavior to one of the [Maintainers](https://github.com/instructlab/community/blob/main/MAINTAINERS.md).
 14 | 
 15 | ### Related repositories
 16 | 
 17 | In addition to this repository, InstructLab has two related repositories:
 18 | 
 19 | - [CLI](https://github.com/instructlab/instructlab). This repository is responsible for the `ilab` command-line interface (CLI) tool.
 20 | - [Community](https://github.com/instructlab/community). This repository is responsible for showing collaboration details across the InstructLab community.
 21 | 
 22 | The following sections provide a general overview for contributing to the Taxonomy repository.
 23 | 
 24 | ## Ways of contributing to the taxonomy repository
 25 | 
 26 | ### Contributing skills and knowledge
 27 | 
 28 | You can contribute to the taxonomy in the following two ways:
 29 | 
 30 | 1. Adding new examples to **existing leaf nodes**:
 31 | 
 32 |     - Go to the corresponding leaf node / end of the branch and modify the YAML
 33 |     - Add a new example to the `qna.yaml` files as a new entry to the list
 34 | 
 35 | 2. Adding **new branches/skills** corresponding to the existing domain:
 36 | 
 37 |     - You can add new folders under the corresponding category (replace any spaces with underscores `_`)
 38 |     - Create a new `qna.yaml` file containing examples for the new skill
 39 | 
 40 | A detailed contribution guide is documented in the [How can I contribute](#how-can-i-contribute) section.
 41 | 
 42 | ### Contributing new features, enhancements or documentation
 43 | 
 44 | Help on open source projects is always welcome and there is always something that can be improved. For example, documentation (like the text you are reading now) can always use improvement, code can always be clarified, variables or functions can always be renamed or commented on, and there is always a need for more test coverage. If you see something that you think should be fixed, take ownership! Here is how you get started:
 45 | 
 46 | To propose a new feature, it's best to raise an issue in the appropriate repository. This way, features can be discussed with the project maintainers, ensuring that your time is not wasted working on a feature that the project developers will not accept into the codebase.
 47 | 
 48 | *How Do I Submit A (Good) Enhancement or Improvement item?:* Enhancement and improvement suggestions are tracked as [GitHub issues: Proposal](https://github.com/instructlab/taxonomy/issues/new?assignees=&labels=&projects=&template=proposal.md&title=). Create an issue and provide the information suggested in the proposal template.
 49 | 
 50 | ## How do I start contributing?
 51 | 
 52 | The following workflow is designed to help you begin your first contribution journey. It will guide you through creating and picking up issues, working through them, having your work reviewed, and then merging.
 53 | 
 54 | ### How Can I Contribute?
 55 | 
 56 | The goal of InstructLab is to enable true collaborative development around common Large Language Models (LLMs) using a technology that enables collaboration following standard open source development practices. A general overview of making a contribution to this project consists of writing an extension to the existing taxonomy, making a pull request, and getting your work reviewed and merged so that it can benefit the whole community.
 57 | 
 58 | Before you start, review the [open issues](https://github.com/instructlab/taxonomy/issues) and [open pull requests](https://github.com/instructlab/taxonomy/pulls) boards to see if your contribution or enhancements have already been proposed. You might instead be able to join forces with them by providing input to what they have started. If you are unsure about what kind of skill or knowledge to contribute, you can [open an issue](https://github.com/instructlab/taxonomy/issues/new?assignees=&labels=&template=proposal.md&title=) first to discuss your proposal idea with the maintainers.
 59 | 
 60 | To contribute to this repo, you'll use the *Fork and Pull* model common in many open source repositories. You can follow this process in a local terminal or in the GitHub web UI.
 61 | 
 62 | - For details on the local process, check out the [GitHub flow](https://docs.github.com/en/get-started/using-github/github-flow) documentation from GitHub and [The GitHub Workflow Guide](https://github.com/kubernetes/community/blob/master/contributors/guide/github-workflow.md) documentation from Kubernetes.
 63 | - For details on contributing using the GitHub webpage UI, see [Contributing using the GH UI](docs/contributing_via_GH_UI.md).
 64 | 
 65 | > [!IMPORTANT]
 66 | > For all contributions to InstructLab 🐶, you want to become familiar with the workflow described in the [InstructLab 🐶 CLI
 67 | > `ilab`](https://github.com/instructlab/instructlab) documentation. It would be best to understand how to test
 68 | > your changes, generate new data, train and download the new model, and test it to check that it gives you the desired results.
 69 | 
 70 | When your contribution is ready, you can create a pull request (PR). In general, we follow the standard [GitHub pull request](https://help.github.com/en/articles/about-pull-requests) process. Follow the template to provide details about your pull request to the maintainers. Before submitting pull requests, make sure your changes pass applicable formatting tests in the repository CI.
 71 | 
 72 | >[!NOTE]
 73 | > Always refer to the [README.md](https://github.com/instructlab/taxonomy/blob/main/README.md)
 74 | > if you are unsure on how to format your contributions.
 75 | >
 76 | ### What can I contribute?
 77 | 
 78 | You can contribute [Knowledge](https://github.com/instructlab/taxonomy/blob/main/README.md#getting-started-with-knowledge-contributions) or [Skills](https://github.com/instructlab/taxonomy/blob/main/README.md#getting-started-with-skill-contributions) to the taxonomy tree. Make sure to review the [general documentation](README.md) for a detailed explanation of these concepts and the differences between the two.
 79 | 
 80 | > [!IMPORTANT]
 81 | > You can only contribute original material. **DO NOT** contribute copyrighted content or content coming from another system.
 82 | 
 83 | ### Submitting your contribution
 84 | 
 85 | When submitting your PR, give it a title which is as explicit as possible. Include in the description of the PR on GitHub, both what the system gave you before your contribution and what it gives you with your contribution.
 86 | 
 87 | ### Pull request review
 88 | 
 89 | Once you've [created a pull request](#how-can-i-contribute), maintainers will review your proposed addition and may make suggestions to fix before merging. It will be easier for your pull request to receive reviews if you consider the criteria the reviewers follow while working. Remember to:
 90 | 
 91 | - Run tests locally and ensure that they pass
 92 | - Ensure your contribution is in the proper format (`ilab generate` shouldn't report any warnings or errors)
 93 | - Break large changes into a logical series of smaller patches, which are easy to understand individually and combine to solve a broader issue
 94 | - Follow the project coding conventions
 95 | - Include the DCO sign off; see [Legal](#legal)
 96 | 
 97 | The project maintainers use `LGTM` (Looks Good To Me) in comments on the code review to indicate acceptance. You can see more information on the triaging process in the [Triaging skills](https://github.com/instructlab/taxonomy/blob/main/docs/triaging/triaging-contributions.md) documentation.
 98 | 
 99 | For a list of the maintainers and triagers, see the [MAINTAINERS.md](https://github.com/instructlab/community/blob/main/MAINTAINERS.md) page.
100 | 
101 | ## Submitting bugs
102 | 
103 | To submit a new bug, raise an issue in the appropriate repository before creating a pull request. This ensures that the issue is properly tracked. To fix an existing bug, assign yourself a bug from the issues page of the desired repository. Then, submit a pull request for review.
104 | 
105 | Bugs are tracked as [GitHub issues using the Bug Report template](https://github.com/instructlab/taxonomy/issues/new?assignees=&labels=&template=bug_report.md&title=). Create an issue using that template and provide the information suggested in the bug report issue template.
106 | 
107 | ## Legal
108 | 
109 | We have tried to make it as easy as possible to make contributions.
110 | This applies to how we handle the legal aspects of contribution.
111 | We use the same approach - the [Developer's Certificate of Origin 1.1 (DCO)][DCO] - that [the Linux Kernel community uses][Linux-DCO] to manage code contributions. All contributions that leverage third-party content should either come from the public domain or be licensed with an open data license that does not restrict commercial use or the creation of derivative works, including the following license types:
112 | 
113 | - CC0-1.0
114 | - CDLA-Permissive-2.0
115 | - CC-BY-4.0
116 | - CC-BY-SA-4.0
117 | - Apache-2.0
118 | - MIT
119 | 
120 | Any third-party content contributed to this project undergoes modifications in order to formulate it in the templated format required for submission to this project.
121 | 
122 | We simply ask that, when submitting a patch for review, the developer include a sign-off statement in the commit message.
123 | 
124 | Here is an example `Signed-off-by` line, which indicates that the submitter accepts the DCO:
125 | 
126 | ```text
127 | Signed-off-by: John Doe <john.doe@example.com>
128 | ```
129 | 
130 | You can include this automatically when you commit a change to your local Git repository using the following command:
131 | 
132 | ```shell
133 | git commit -s
134 | ```
135 | 
136 | > [!TIP]
137 | > If you created a commit message that did not include the `-s` option, you can edit your original commit message by using the `git commit -s --amend` command. Ensure you force push the amended commit to your pull request (PR).
138 | 
139 | ### License
140 | 
141 | Unless specifically stated, this project is
142 | distributed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
143 | 
144 | SPDX-License-Identifier: [Apache-2.0](https://spdx.org/licenses/Apache-2.0)
145 | 
146 | For more details, see the [LICENSE](LICENSE).
147 | 
148 | ### For your attribution.txt file
149 | 
150 | An important part of contributing to the InstructLab project is citing your sources of information. This comes in the form of your `attribution.txt` that you add to the pull requests. Almost all instances of attribution can be covered by the parameters required for Creative Commons Attribution licenses. Some parameters are as follows:
151 | 
152 | - Title of work
153 | - Link to work
154 | - Include link to a specific revision where possible
155 | - License of the work
156 | - Include an SPDX identifier where possible
157 | - Creator names
158 | - Copyright information
159 | - Modification information
160 | - Indicate if work was itself derived from another openly licensed work
161 | 
162 | You can also see this citation style in the [Data sources documentation](https://github.com/instructlab/community/blob/main/docs/DataSources.md).
163 | 
164 | ## Development
165 | 
166 | Please consult the [`ilab` documentation](https://github.com/instructlab/instructlab) to set up your environment.
167 | 
168 | [DCO]: https://developercertificate.org/
169 | [Linux-DCO]: https://docs.kernel.org/process/submitting-patches.html#sign-your-work-the-developer-s-certificate-of-origin
170 | 


--------------------------------------------------------------------------------
/docs/SKILLS_GUIDE.md:
--------------------------------------------------------------------------------
  1 | # Skills Guide
  2 | 
  3 | ## What is a "Skill"?
  4 | 
  5 | There are various types of skills that you can contribute to the taxonomy.
  6 | 
  7 | ### Compositional Skills
  8 | 
  9 | Skills are performative. When you create a skill for the model, you're
 10 | teaching it how to do something: "write me a song," "rearrange words in a sentence" or
 11 | "summarize an email."
 12 | 
 13 | There are two types of compositional skills:
 14 | 
 15 | #### Freeform Compositional Skills
 16 | 
 17 | Freeform compositional skills are performative and do **not** require additional context. An example of a freeform skill is teaching the model words that rhyme. You could provide examples of "words that rhyme with 'tool'". By providing those examples, you're essentially tickling the latent knowledge of the LLM. In our example, you're enabling the LLM to be able to identify words that rhyme in its latent knowledge.
 18 | 
 19 | Freeform skills include things like:
 20 | 
 21 | * Speak like Yoda
 22 | * Convert to camel case
 23 | * Write me a limerick
 24 | * Generate Stable Diffusion prompts
 25 | 
 26 | #### Grounded Compositional Skills
 27 | 
 28 | Grounded skills are performative and **do** require additional context. An
 29 | example of a grounded skill would be to read the value of a cell in a table
 30 | layout, or to parse a JSON file. To create a grounded skill to read a
 31 | markdown formatted table layout, the additional context could be an example
 32 | table layout. This additional context is included in the YAML for the
 33 | skill and not external to it.
 34 | 
 35 | > [!NOTE]
 36 | > The content of the table layout will not be used in training
 37 | > or aligning the model; only the table layout format itself will be used.
 38 | 
 39 | Grounded skills include things like:
 40 | 
 41 | * Game creation like Sudoku or tic tac toe
 42 | * Summarizing or extracting from a piece of text
 43 | * Find unresolved items in a meeting transcript
 44 | 
 45 | [Example Grounded compositional skill pull request](https://github.com/instructlab/taxonomy/pull/250)
 46 | 
 47 | ### Core Skills
 48 | 
 49 | Core skills are foundational skills like math, reasoning, and coding.
 50 | 
 51 | > [!NOTE]
 52 | > Unlike **knowledge** and **compositional skills**, core skills
 53 | > are not contributable to the tree. So when you see reference to contributing
 54 | > "skills" to the taxonomy from this point forward, it is **compositional
 55 | > skills** that are being referenced.
 56 | 
 57 | ## Accepted Skills
 58 | 
 59 | ### Creative Writing / Poetics
 60 | 
 61 | Adding new types of documents and writing styles to the LLM is welcome. Consider:
 62 | 
 63 | * Song lyrics
 64 | * Soliloquies
 65 | * Five paragraph essays
 66 | * Arguments
 67 | 
 68 | ### Learning to Format Information
 69 | 
 70 | Skills to better format and reassemble information are helpful.
 71 | 
 72 | ### Table Analysis and Processing
 73 | 
 74 | Consider:
 75 | 
 76 | * Drawing verbal inferences and conclusions about what's in a table
 77 | * Sorting
 78 | * Selecting
 79 | * Joining
 80 | 
 81 | ### Qualitative Inference and Chain-of-Thought Reasoning
 82 | 
 83 | Example:
 84 | 
 85 | > Mary is taller than John.
 86 | > John is taller than Anna.
 87 | > Is Anna taller than Mary?
 88 | 
 89 | Example:
 90 | 
 91 | > An elephant, a mouse and a horse are in a room. How would they be ordered if they were standing in order by size?
 92 | 
 93 | Great skills in this category should include the correct line of reasoning in the answer, not just what the answer is.
 94 | 
 95 | ### Word Problems
 96 | 
 97 | Is your LLM smarter than a second grader?
 98 | 
 99 | ### Trust and Safety
100 | 
101 | Please avoid HAP (hate, abuse and profanity) and PII (personally identifiable information) in your examples.
102 | 
103 | Anything related to trust and safety will be flagged for higher-level review.
104 | 
105 | ### Searching, Extraction and Summarization
106 | 
107 | Skills to select odd information in a document, draw conclusions, pull out information, draw insights or generate TODOs from information provided in the "context" field are welcome.
108 | 
109 | ### Complex Rulesets and Games
110 | 
111 | > [!NOTE]
112 | > This is a good example of the need for a *grounded skill*. Grounded skills require the user to provide context containing information that the model is expected to take into account during processing. This is different from *knowledge*, where the model is expected to gain facts and background knowledge from the tuning process.
113 | >
114 | > Context added when tuning a grounded skill would need to be again provided by the end user at inference time. The skill here is better adherence to the rule set.
115 | 
116 | To add a skill for a structured game or other task with a complex rule set, use a grounded skill. Add the rules to the game as "context" in every example. Add the interpretation as a question.
117 | 
118 | ### Writing Style and Personalities
119 | 
120 | When adding a skill, expect that you're tuning a fairly general purpose LLM to behave better given particular circumstances.
121 | 
122 | If you want to add a skill to better adopt a particular personality - say, "a little boy living in the 1800s" - that context needs to be provided in either the "context" or "question" field.
123 | 
124 | ### Instruction-Following Behavior
125 | 
126 | LLMs could be better at following extra instructions in a prompt about how to do a task, such as: "Keep your response to 200 words." Or: "Only produce 10 items." Skills to improve this behavior will help the model behave with more precision.
127 | 
128 | ## Skills to Avoid
129 | 
130 | There are several types of skills that we don't expect this procedure to improve. Most skills in these categories will be rejected.
131 | 
132 | ### Math
133 | 
134 | Trying to make the LLM solve math problems will be rejected.
135 | 
136 | ### Real world knowledge-based skills
137 | 
138 | Unless it can be framed as a "grounded skill", where the user is expected to provide context, knowledge contributions will be a separate part of the taxonomy. Skills shouldn't expect the model to come up with its own facts, but instead assemble facts provided.
139 | 
140 | ### Red Teaming
141 | 
142 | Adversarial questions and answers will be rejected at this time.
143 | 
144 | ### Turing-complete style problems
145 | 
146 | These are an edge case, but things like palindromes and regexes, where getting the right answer with a non-stochastic program would be easy, aren't good targets for the moment.
147 | 
148 | Open an issue in the taxonomy repository if you have an idea in this space before submitting a PR.
149 | 
150 | ### Small Changes to Original Response
151 | 
152 | If the original LLM response is pretty close, but it's not responding to your exact expectations, a skill is not the right way to solve that problem.
153 | 
154 | ## Avoid These Topics
155 | 
156 | While the tuning process may eventually benefit from being used to help the models work with complex social topics, at this time this is an area of active research we do not want to take lightly. Therefore please keep your submissions clear of the following topics:
157 | 
158 | * PII (personally identifiable information) or any content invasive of individual privacy rights
159 | * Violence including self-harm
160 | * Cyber Bullying
161 | * Internal documentation or other that is confidential to your employer or organization, e.g. trade secrets
162 | * Discrimination
163 | * Religion
164 |   * Facts such as, "[Christianity is, according to the 2011 census, the fifth most practiced religion in Nepal, with 375,699 adherents, or 1.4% of the population](https://en.wikipedia.org/wiki/Christianity_in_Nepal)", are fine as a knowledge contribution. Advocating in favor of or against any religious faith is not acceptable.
165 | * Medical or health information
166 |   * Facts such as,  "[In mammals, pulmonary ventilation occurs via inhalation (breathing)](https://opentextbc.ca/biology/chapter/11-3-circulatory-and-respiratory-systems/)," are fine as a knowledge contribution. Tailored medical/health advice is not acceptable.
167 | * Financial information
168 |   * Facts such as "[laissez-faire economics ... argues that market forces alone should drive the economy and that governments should refrain from direct intervention in or moderation of the economic system](https://openstax.org/books/world-history-volume-2/pages/6-3-capitalism-and-the-first-industrial-revolution)," are fine as a knowledge contribution. Tailored financial advice is not acceptable.
169 | * Legal settlements/mitigations
170 | * Gender Bias
171 | * Hostile Language, threats, slurs, derogatory or insensitive jokes or comments
172 | * Profanity
173 | * Pornography and sexually explicit or suggestive content
174 | * Any contributions that would allow for automated decision making that affect an individual's rights or well-being, e.g. social scoring
175 | * Any contributions that engage in political campaigning or lobbying
176 | 
177 | We are also not accepting submissions of the following content:
178 | 
179 | * Jokes
180 | * Poems
181 | * Code
182 |   * Anything code-related that can be traced back to code for a computer. This is not limited to `sed` or `bash`; it also covers `yaml`s for OpenShift or Kubernetes, `python` snippets, and `Java` suggestions. There are specific models focused on this space, and it is out of scope for this model for the time being.
183 | * "Guard Rails" for AI
184 |   * We expect our upstream engineering team to create these types of skills and safeguards. We appreciate our community wanting to help with this, but there are underlying engineering decisions, and accepting these contributions from the community may conflict with them.
185 | 
186 | We received so many at the beginning, and with jokes being "in the eye of the beholder" and puns requiring nuance for native English speakers, we realized we were possibly unconsciously biasing our model. We have discovered that working with both topics has its own challenges, and if we want something generalized, finding consensus was unsuccessful.
187 | 
188 | ## Building Your LLM Intuition
189 | 
190 | LLMs have inherent limitations that make certain tasks extremely difficult, like doing math problems. They're great at other tasks, like creative writing. And they could be better at things like logical reasoning.
191 | 
192 | Consider these when you're generating skills. Skills in the first two categories below (what LLMs are great at and what they need help with) are welcomed. Skills in the third category (what LLMs are not so great at) are usually borderline and may be rejected.
193 | 
194 | ### LLMs are great at
195 | 
196 | Skills in this category are welcomed, as refining these abilities helps us get better at the kinds of tasks where LLMs can excel.
197 | 
198 | For these, however, it's common for LLMs to already have excellent performance. Try 3-5 examples in `ilab chat` to confirm a deficit in the model before you build your submission, and share the examples in your Pull Request (PR).
199 | 
200 | * Brainstorming
201 | * Creativity
202 | * Connecting information
203 | * Cross-lingual behavior
204 | 
205 | ### LLMs need help with
206 | 
207 | Skills in this category are welcomed, since LLM behavior in these sorts of topics is very difficult for the model to get right. Try several examples to understand the nuances of the model's ability to do these sorts of tasks, and consider using corrections to the results you get in your tuning process.
208 | 
209 | * Chains of reasoning
210 | * Analysis
211 | * Story plots
212 | * Reassembling information
213 | * Effective and succinct summaries
214 | 
215 | ### LLMs are not so great at
216 | 
217 | Skills in this category are ways in which LLMs struggle, and may always struggle. Solving math and computation problems via probability on natural language queries is probably not the best way to solve them. That said, improving some of these foundational skills may be something this work tackles in the future, but not at this time.
218 | 
219 | Most skill submissions in these categories are likely to be rejected.
220 | 
221 | For hallucinations in particular, trying to solve this with a skill is unlikely to work. Consider contributing to the Knowledge taxonomy when it opens instead to improve the model's understanding of facts.
222 | 
223 | * Math
224 | * Computation
225 | * "Turing-complete" type tasks
226 | * Generating only true real-world information (they're prone to hallucinations)
227 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 
204 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # InstructLab 🐶 Taxonomy
  2 | 
  3 | ## Contents 📖
  4 | 
  5 | - [Welcome to the InstructLab Taxonomy](#welcome-to-the-instructlab-taxonomy)
  6 | - [Learning](#learning)
  7 | - [Getting Started with Skill Contributions](#getting-started-with-skill-contributions)
  8 |   - [Skills: YAML examples](#skills-yaml-examples)
  9 | - [Getting Started with Knowledge Contributions](#getting-started-with-knowledge-contributions)
 10 |   - [Knowledge: YAML examples](#knowledge-yaml-examples)
 11 | - [Taxonomy tree layout](#taxonomy-tree-layout)
 12 | - [Contribute knowledge and skills to the taxonomy!](#contribute-knowledge-and-skills-to-the-taxonomy)
 13 |   - [Ways to contribute](#ways-to-contribute)
 14 |   - [How to contribute skills and knowledge](#how-to-contribute-skills-and-knowledge)
 15 | 
 16 | ## Welcome to the InstructLab Taxonomy
 17 | 
 18 | InstructLab 🐶 uses a novel synthetic data-based alignment tuning method for
 19 | Large Language Models (LLMs). The "**lab**" in Instruct**Lab** 🐶 stands for
 20 | [**L**arge-Scale **A**lignment for Chat**B**ots](https://arxiv.org/abs/2403.01081) [1].
 21 | 
 22 | The LAB method is driven by taxonomies, which are largely created manually and
 23 | with care.
 24 | 
 25 | This repository contains a taxonomy tree that allows you to create models
 26 | tuned with your data (enhanced via synthetic data generation) using the LAB 🐶
 27 | method.
 28 | 
 29 | [1] Shivchander Sudalairaj*, Abhishek Bhandwaldar*, Aldo Pareja*, Kai Xu, David D. Cox, Akash Srivastava*. "LAB: Large-Scale Alignment for ChatBots", arXiv preprint arXiv: 2403.01081, 2024. (* denotes equal contributions)
 30 | 
 31 | ## Choosing domains for the taxonomy
 32 | 
 33 | In general, we use the Dewey Decimal Classification (DDC) System to determine our domains (and subdomains) in the taxonomy. This [DDC SUMMARIES document](https://www.oclc.org/content/dam/oclc/dewey/resources/summaries/deweysummaries.pdf) is a great resource for determining where a topic might be classified.
 34 | 
 35 | If you are unsure where to put your knowledge or compositional skill, create a folder in the `miscellaneous_unknown` folder under the `knowledge` or `compositional_skills` folders.
 36 | 
 37 | ## Learning
 38 | 
 39 | Learn about the concepts of "skills" and "knowledge" in our [InstructLab Community Learning Guide](https://github.com/instructlab/community/blob/main/docs/README.md).
 40 | 
 41 | ## Getting Started with Skill Contributions
 42 | 
 43 | Skills require a much smaller volume of content than knowledge contributions. An entire skill contribution to the taxonomy tree can be just a few lines of YAML in the `qna.yaml` file ("qna" is short for "questions and answers") and an `attribution.txt` file for citing sources.
 44 | 
 45 | Your skills contribution pull requests must include the following:
 46 | 
 47 | - A `qna.yaml` file that contains a set of key/value entries (the keys are described later in this section)
 48 |   - Each `qna.yaml` file requires a minimum of five question and answer pairs.
 49 | - An `attribution.txt` that includes the sources for the information used in the `qna.yaml`
 50 | 
 51 | > [!TIP]
 52 | > The skill taxonomy structure is used in several ways:
 53 | >
 54 | > 1. To select the right subset of the taxonomy to use for data generation.
 55 | > 2. To determine the interpretability by human contributors and maintainers.
 56 | > 3. As part of the prompt to the LLM used to generate synthetic samples.
 57 | <!-- -->
 58 | > [!IMPORTANT]
 59 | > There is a limit to how much content can exist in the question/answer pairs for the model to process. Due to this, only add a maximum
 60 | > of around 2300 words to your question and answer seed example pairs in the `qna.yaml` file.
 61 | 
 62 | Compositional skills can either be grounded (includes a context) or ungrounded (does not include a context). Whether a skill is grounded or ungrounded is declared by its location in the taxonomy tree, for example: `linguistics/writing/poetry/haiku/` (ungrounded) or `grounded/linguistics/grammar` (grounded). The `qna.yaml` is in the final node.
 63 | 
 64 | Taxonomy skill files must be a valid [YAML](https://yaml.org/) file named `qna.yaml`. Each `qna.yaml` file contains a set of key/value entries with the following keys:
 65 | 
 66 | - `version`: The value must be the number 2. **Required**
 67 | - `task_description`: A description of the skill. **Required**
 68 | - `created_by`: The GitHub username of the contributor. **Required**
 69 | - `seed_examples`: A collection of key/value entries. New
 70 |   submissions should have at least five entries, although
 71 |   older files may have fewer. **Required**
 72 |   - `context`: Grounded skills require the user to provide context containing information that the model is expected to take into account during processing. This is different from knowledge, where the model is expected to gain facts and background knowledge from the tuning process. The context key should not be used for ungrounded skills.
 73 |   - `question`: A question for the model. **Required**
 74 |   - `answer`: The desired response from the model. **Required**
 75 | 
 76 | Other keys at any level are currently ignored.
 77 | 
 78 | ### Skills: YAML examples
 79 | 
 80 | To make the `qna.yaml` files easier and faster for humans to read, it is recommended to specify `version` first, followed by `task_description`, then `created_by`, and finally `seed_examples`.
 81 | In `seed_examples`, it is recommended to specify `context` first (if applicable), followed by `question` and `answer`.
 82 | 
 83 | *Example `qna.yaml`*
 84 | 
 85 | ```yaml
 86 | version: 2
 87 | task_description: <string>
 88 | created_by: <string>
 89 | seed_examples:
 90 |   - question: <string>
 91 |     answer: |
 92 |       <multi-line string>
 93 |   - context: |
 94 |       <multi-line string>
 95 |     question: <string>
 96 |     answer: |
 97 |       <multi-line string>
 98 |   ...
 99 | ```
100 | 
101 | Then, you create an `attribution.txt` file that includes the sources of your information. These can also be self-authored sources.
102 | 
103 | *Example `attribution.txt`*
104 | 
105 | ```text
106 | [Link to source]
107 | [Link to work]
108 | [License of the work]
109 | [Creator name]
110 | ```
111 | 
112 | For more information on what to include in your `attribution.txt` file, see [For your attribution.txt file](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#for-your-attributiontxt-file) in CONTRIBUTING.md.
113 | 
114 | If you have not written YAML before, don't be intimidated - it's just text.
115 | 
116 | > [!TIP]
117 | >
118 | > - Spaces and indentation matter in YAML. Two spaces to indent.
119 | > - Don't use tabs!
120 | > - Be careful to not have trailing spaces at the end of a line.
121 | > - Each example in `seed_examples` begins with a "-". Place this "-" in
122 |   front of the first field (`question` or `context`). The remaining keys in the
123 |   example should not have this "-".
124 | > - Some special characters such as " and ' need to be escaped with backslash. This is why some
125 |   of the lines for keys in the example YAML start the value with the '|' character followed by a new line and then an indented multi-line string.
126 |   This character disables all of the special characters in the value for the key.
127 |   You might also want to use the '|' character for multi-line strings.
128 | > - Consider quoting all values with " to avoid surprising YAML parser behavior
129 |   (e.g. an unquoted Yes answer can be interpreted by the parser as the boolean
130 |   value `True`, unless "Yes" is quoted.)
131 | > - See https://yaml-multiline.info/ for more info.
132 | 
133 | It is recommended that you **lint**, or verify, your YAML using a tool. One linter option is [yamllint.com](https://yamllint.com). You can copy/paste your YAML into the box and click **Go** to have it analyze your YAML and make recommendations. Online tools like [prettified](https://onlineyamltools.com/prettify-yaml) and [yaml-validator](https://jsonformatter.org/yaml-validator) can automatically reformat your YAML to adhere to our `yamllint` PR checks, such as breaking lines longer than 120 characters.
134 | 
135 | #### Ungrounded compositional skill: YAML example
136 | 
137 | ```yaml
138 | version: 2
139 | task_description: 'Teach the model how to rhyme.'
140 | created_by: juliadenham
141 | seed_examples:
142 |   - question: What are 5 words that rhyme with horn?
143 |     answer: warn, torn, born, thorn, and corn.
144 |   - question: What are 5 words that rhyme with cat?
145 |     answer: bat, gnat, rat, vat, and mat.
146 |   - question: What are 5 words that rhyme with poor?
147 |     answer: door, shore, core, bore, and tore.
148 |   - question: What are 5 words that rhyme with bank?
149 |     answer: tank, rank, prank, sank, and drank.
150 |   - question: What are 5 words that rhyme with bake?
151 |     answer: wake, lake, steak, make, and quake.
152 | ```
153 | 
154 | Seriously, that's it.
155 | 
156 | Here is the location of this YAML in the taxonomy tree. Note that the YAML file
157 | itself, plus any added directories that contain the file, is the entirety of the skill
158 | in terms of a taxonomy contribution:
159 | 
160 | #### Ungrounded compositional skill: Directory tree example
161 | 
162 | ```ascii
163 | [...]
164 | 
165 | └── writing
166 |     └── poetry
167 |     |   └── haiku <=== here it is :)
168 |     |   |   └── qna.yaml
169 |     |   |       attribution.txt
170 |         [...]
171 |     └── prose
172 |     |   └── debate
173 |     |   |   └── qna.yaml
174 |     |   |       attribution.txt
175 |     [...]
176 | 
177 | [...]
178 | ```
179 | 
180 | #### Grounded compositional skill: YAML example
181 | 
182 | Remember that [grounded compositional skills](docs/SKILLS_GUIDE.md) require additional context and include a `context` field.
183 | 
184 | This example snippet assumes the GitHub username `mairin` and shows some of the question/answer pairs present in the actual file:
185 | 
186 | ```yaml
187 | version: 2
188 | task_description: |
189 |     This skill provides the ability to read a markdown-formatted table.
190 | created_by: mairin # Use your GitHub username; only one creator supported
191 | seed_examples:
192 |   - context: |
193 |       | **Breed**      | **Size**     | **Barking** | **Energy** |
194 |       |----------------|--------------|-------------|------------|
195 |       | Afghan Hound   | 25-27 in     | 3/5         | 4/5        |
196 |       | Labrador       | 22.5-24.5 in | 3/5         | 5/5        |
197 |       | Cocker Spaniel | 14.5-15.5 in | 3/5         | 4/5        |
198 |       | Poodle (Toy)   | <= 10 in     | 4/5         | 4/5        |
199 |     question: |
200 |       Which breed has the most energy?
201 |     answer: |
202 |       The breed with the most energy is the Labrador.
203 |   - context: |
204 |       | **Name** | **Date** | **Color** | **Letter** | **Number** |
205 |       |----------|----------|-----------|------------|------------|
206 |       | George   | Mar 5    | Green     | A          | 1          |
207 |       | Gráinne  | Dec 31   | Red       | B          | 2          |
208 |       | Abigail  | Jan 17   | Yellow    | C          | 3          |
209 |       | Bhavna   | Apr 29   | Purple    | D          | 4          |
210 |       | Rémy     | Sep 9    | Blue      | E          | 5          |
211 |     question: |
212 |       What is Gráinne's letter and what is her color?
213 |     answer: |
214 |       Gráinne's letter is B and her color is red.
215 |   - context: |
216 |       | Banana | Apple      | Blueberry | Strawberry |
217 |       |--------|------------|-----------|------------|
218 |       | Yellow | Red, Green | Blue      | Red        |
219 |       | Large  | Medium     | Small     | Small      |
220 |       | Peel   | Peel       | No peel   | No peel    |
221 |     question: |
222 |       Which fruit is blue, small, and has no peel?
223 |     answer: |
224 |       The blueberry is blue, small, and has no peel.
225 | ```
226 | 
227 | #### Grounded compositional skill: Directory tree example
228 | 
229 | ```ascii
230 | [...]
231 | 
232 | grounded
233 | └── technology
234 |     └── machine_learning
235 |         └── natural_language_processing
236 |     |   |     └── information_extraction
237 |     |            └── inference
238 |     |   |            └── qualitative
239 |     |   |               ├── sentiment
240 |     |   |               |     └── qna.yaml
241 |     |   |               |         attribution.txt
242 |     │                   ├── quantitative
243 |     │   │                   ├── table_analysis <=== here it is :)
244 |     │   |   |               |     └── qna.yaml
245 |     │   │   │               |         attribution.txt
246 | 
247 | [...]
248 | ```
249 | 
250 | ## Getting Started with Knowledge Contributions
251 | 
252 | While skills are foundational or performative, knowledge is based more on answering questions that involve facts,
253 | data, or references.
254 | 
255 | Knowledge is supported by documents, such as a textbook, technical manual, encyclopedia, journal, or magazine.
256 | 
257 | Knowledge in the taxonomy tree consists of a few more elements than skills:
258 | 
259 | > [!IMPORTANT]
260 | > If you are using InstructLab version `0.21.0` or above, you can specify PDF files in your knowledge `qna.yaml` file as a valid document type. Any previous version of InstructLab still only consumes knowledge documents in markdown format.
261 | 
262 | - Each knowledge node in the tree has a `qna.yaml`, similar to the format of the `qna.yaml` for skills.
263 | - ⭐ Knowledge submissions require you to create a Git repository (for example, on GitHub) that contains the files of your knowledge contributions.
264 | - The `qna.yaml` includes parameters that contain information from your repository.
265 | 
266 | > [!TIP]
267 | > Guidelines for Knowledge contributions
268 | >
269 | > - Submit the most up-to-date version of the document
270 | > - All submissions must be text; images will be ignored
271 | > - Do not use tables in your markdown freeform contribution
272 | 
273 | The `qna.yaml` format must include the following fields:
274 | 
275 | - `version`: The version of the `qna.yaml` file format used for synthetic data generation (SDG). The value must be the number 3.
276 | - `created_by`: Your GitHub username.
277 | - `domain`: Specify the category of the knowledge.
278 | - `seed_examples`: A collection of key/value entries.
279 |   - `context`: A chunk of information from the knowledge document. Each `qna.yaml` needs five `context` blocks, and each `context` block has a maximum word count of 500 words.
280 |   - `questions_and_answers`: The parameter that holds your questions and answers
281 |     - `question`: Specify a question for the model. Each `qna.yaml` file needs at least three question and answer pairs per `context` chunk with a maximum word count of 250 words.
282 |     - `answer`: Specify the desired answer from the model. Each `qna.yaml` file needs at least three question and answer pairs per `context` chunk with a maximum word count of 250 words.
283 | - `document_outline`: Describe an overview of the document you are submitting.
284 | - `document`: The source of your knowledge contribution.
285 |   - `repo`: The URL to your repository that holds your knowledge files.
286 |   - `commit`: The SHA of the commit in your repository with your knowledge files.
287 |   - `patterns`: A list of glob patterns specifying the files in your repository. Any glob pattern that starts with `*`, such as `*.md`, must be quoted due to YAML rules. For example, `"*.md"`.
288 | 
289 | ### Knowledge: YAML examples
290 | 
291 | ```yaml
292 | version: 3
293 | domain: astronomy
294 | created_by: juliadenham
295 | seed_examples:
296 |   - context: |
297 |       **Phoenix** is a minor [constellation](constellation "wikilink") in the
298 |       [southern sky](southern_sky "wikilink"). Named after the mythical
299 |       [phoenix](Phoenix_(mythology) "wikilink"), it was first depicted on a
300 |       celestial atlas by [Johann Bayer](Johann_Bayer "wikilink") in his 1603
301 |       *[Uranometria](Uranometria "wikilink")*. The French explorer and
302 |       astronomer [Nicolas Louis de
303 |       Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted the brighter
304 |       stars and gave their [Bayer designations](Bayer_designation "wikilink")
305 |       in 1756. The constellation stretches from roughly −39 degrees to −57 degrees
306 |       [declination](declination "wikilink"), and from 23.5h to 2.5h of [right
307 |       ascension](right_ascension "wikilink"). The constellations Phoenix,
308 |       [Grus](Grus_(constellation) "wikilink"),
309 |       [Pavo](Pavo_(constellation) "wikilink") and [Tucana](Tucana "wikilink"),
310 |       are known as the Southern Birds.
311 |     questions_and_answers:
312 |       - question: |
313 |           What is the Phoenix constellation?
314 |         answer: |
315 |           Phoenix is a minor constellation in the southern sky.
316 |       - question: |
317 |           Who charted the Phoenix constellation?
318 |         answer: |
319 |           The Phoenix constellation was charted by French explorer and
320 |           astronomer Nicolas Louis de Lacaille.
321 |       - question: |
322 |           How far does the Phoenix constellation stretch?
323 |         answer: |
324 |           The phoenix constellation stretches from roughly −39° to −57°
325 |           declination, and from 23.5h to 2.5h of right ascension.
326 |   - context: |
327 |       Phoenix was the largest of the 12 constellations established by [Petrus
328 |       Plancius](Petrus_Plancius "wikilink") from the observations of [Pieter
329 |       Dirkszoon Keyser](Pieter_Dirkszoon_Keyser "wikilink") and [Frederick de
330 |       Houtman](Frederick_de_Houtman "wikilink"). It first appeared on a 35cm
331 |       diameter celestial globe published in 1597 (or 1598) in Amsterdam by
332 |       Plancius with [Jodocus Hondius](Jodocus_Hondius "wikilink"). The first
333 |       depiction of this constellation in a celestial atlas was in [Johann
334 |       Bayer](Johann_Bayer "wikilink")'s
335 |       *[Uranometria](Uranometria "wikilink")* of 1603. De Houtman included
336 |       it in his southern star catalog the same year under the Dutch name *Den
337 |       voghel Fenicx*, "The Bird Phoenix", symbolising the
338 |       [phoenix](Phoenix_(mythology) "wikilink") of classical mythology. One
339 |       name of the brightest star [Alpha
340 |       Phoenicis](Alpha_Phoenicis "wikilink")—Ankaa—is derived from the Arabic:
341 |       العنقاء, romanized: al-‘anqā’, lit. 'the phoenix', and
342 |       was coined sometime after 1800 in relation to the constellation.
343 |     questions_and_answers:
344 |       - question: |
345 |           What is the brightest star in the Phoenix constellation
346 |           called?
347 |         answer: |
348 |           Alpha Phoenicis or Ankaa is the brightest star in the Phoenix
349 |           Constellation.
350 |       - question: Where did the Phoenix constellation first appear?
351 |         answer: |
352 |           The Phoenix constellation first appeared on a 35-cm diameter
353 |           celestial globe published in 1597 (or 1598) in Amsterdam by
354 |           Plancius with Jodocus Hondius.
355 |       - question: |
356 |           What does "The Bird Phoenix" symbolize?
357 |         answer: |
358 |           "The Bird Phoenix" symbolizes the phoenix of classical mythology.
359 |   - context: |
360 |       Phoenix is a small constellation bordered by [Fornax](Fornax "wikilink")
361 |       and Sculptor to the north, Grus to the west, Tucana to the south,
362 |       touching on the corner of [Hydrus](Hydrus "wikilink") to the south, and
363 |       [Eridanus](Eridanus_(constellation) "wikilink") to the east and
364 |       southeast. The bright star [Achernar](Achernar "wikilink") is
365 |       nearby. The three-letter abbreviation for the constellation, as
366 |       adopted by the [International Astronomical
367 |       Union](International_Astronomical_Union "wikilink") in 1922, is
368 |       "Phe". The official constellation boundaries, as set by Belgian
369 |       astronomer [Eugène Delporte](Eugène_Joseph_Delporte "wikilink") in 1930,
370 |       are defined by a polygon of 10 segments. In the [equatorial coordinate
371 |       system](equatorial_coordinate_system "wikilink"), the [right
372 |       ascension](right_ascension "wikilink") coordinates of these borders lie
373 |       between 23<sup>h</sup> 26.5<sup>m</sup> and 02<sup>h</sup> 25.0<sup>m</sup>,
374 |       while the [declination](declination "wikilink")
375 |       coordinates are between −39.31° and −57.84°. This means it remains
376 |       below the horizon to anyone living north of the [40th
377 |       parallel](40th_parallel_north "wikilink") in the [Northern
378 |       Hemisphere](Northern_Hemisphere "wikilink"), and remains low in the sky
379 |       for anyone living north of the [equator](equator "wikilink"). It is most
380 |       visible from locations such as Australia and South Africa during late
381 |       [Southern Hemisphere](Southern_Hemisphere "wikilink") spring. Most
382 |       of the constellation lies within, and can be located by, forming a
383 |       triangle of the bright stars Achernar, [Fomalhaut](Fomalhaut "wikilink")
384 |       and [Beta Ceti](Beta_Ceti "wikilink")—Ankaa lies roughly in the centre
385 |       of this.
386 |     questions_and_answers:
387 |       - question: What are the characteristics of the Phoenix constellation?
388 |         answer: |
389 |           Phoenix is a small constellation bordered by Fornax and Sculptor to
390 |           the north, Grus to the west, Tucana to the south, touching on the
391 |           corner of Hydrus to the south, and Eridanus to the east and southeast.
392 |           The bright star Achernar is nearby.
393 |       - question: |
394 |           When is the phoenix constellation most visible?
395 |         answer: |
396 |           Phoenix is most visible from locations such as Australia and
397 |           South Africa during late Southern Hemisphere spring.
398 |       - question: |
399 |           What are the Phoenix Constellation boundaries?
400 |         answer: |
401 |           The official constellation boundaries for Phoenix, as set by Belgian
402 |           astronomer Eugène Delporte in 1930, are defined by a polygon of 10
403 |           segments.
404 |   - context: |
405 |       Ten stars have been found to have planets to date, and four planetary
406 |       systems have been discovered with the [SuperWASP](SuperWASP "wikilink")
407 |       project. [HD 142](HD_142 "wikilink") is a yellow giant that has an
408 |       apparent magnitude of 5.7, and has a planet ([HD 142b](HD_142_b
409 |       "wikilink")) 1.36 times the mass of Jupiter which orbits every 328 days.
410 |       [HD 2039](HD_2039 "wikilink") is a yellow subgiant with an apparent
411 |       magnitude of 9.0 around 330 light years away which has a planet ([HD 2039
412 |       b](HD_2039_b "wikilink")) six times the mass of Jupiter. [WASP-18](WASP-18
413 |       "wikilink") is a star of magnitude 9.29 which was discovered to have a hot
414 |       Jupiter-like planet ([WASP-18b](WASP-18b "wikilink")) taking less than a
415 |       day to orbit the star. The planet is suspected to be causing WASP-18 to
416 |       appear older than it really is. [WASP-4](WASP-4 "wikilink") and
417 |       [WASP-5](WASP-5 "wikilink") are solar-type yellow stars around 1000
418 |       light years distant and of 13th magnitude, each with a single planet
419 |       larger than Jupiter. [WASP-29](WASP-29 "wikilink") is an orange
420 |       dwarf of spectral type K4V and visual magnitude 11.3, which has a
421 |       planetary companion of similar size and mass to Saturn. The planet
422 |       completes an orbit every 3.9 days.
423 |     questions_and_answers:
424 |       - question: In the Phoenix constellation, how many stars have planets?
425 |         answer: |
426 |           In the Phoenix constellation, ten stars have been found to have
427 |           planets to date, and four planetary systems have been discovered
428 |           with the SuperWASP project.
429 |       - question: |
430 |           What is HD 142?
431 |         answer: |
432 |           HD 142 is a yellow giant that has an apparent magnitude of 5.7, and
433 |           has a planet (HD 142 b) 1.36 times the mass of Jupiter which
434 |           orbits every 328 days.
435 |       - question: |
436 |           Are WASP-4 and WASP-5 solar-type yellow stars?
437 |         answer: |
438 |           Yes, WASP-4 and WASP-5 are solar-type yellow stars around 1000 light
439 |           years distant and of 13th magnitude, each with a single planet
440 |           larger than Jupiter.
441 |   - context: |
442 |       The constellation does not lie on the
443 |       [galactic plane](galactic_plane "wikilink") of the Milky Way, and there
444 |       are no prominent star clusters. [NGC 625](NGC_625 "wikilink") is a dwarf
445 |       [irregular galaxy](irregular_galaxy "wikilink") of apparent magnitude 11.0
446 |       and lying some 12.7 million light years distant. Only 24000 light years in
447 |       diameter, it is an outlying member of the [Sculptor Group](Sculptor_Group
448 |       "wikilink"). NGC 625 is thought to have been involved in a collision and
449 |       is experiencing a burst of [active star formation](Active_galactic_nucleus
450 |       "wikilink"). [NGC 37](NGC_37 "wikilink") is a
451 |       [lenticular galaxy](lenticular_galaxy "wikilink") of apparent magnitude
452 |       14.66. It is approximately 42 [kiloparsecs](kiloparsecs "wikilink")
453 |       (137,000 [light-years](light-years "wikilink")) in diameter and about
454 |       12.9 billion years old. [Robert's Quartet](Robert's_Quartet "wikilink")
455 |       (composed of the irregular galaxy [NGC 87](NGC_87 "wikilink"), and three
456 |       spiral galaxies [NGC 88](NGC_88 "wikilink"), [NGC 89](NGC_89 "wikilink")
457 |       and [NGC 92](NGC_92 "wikilink")) is a group of four galaxies located
458 |       around 160 million light-years away which are in the process of colliding
459 |       and merging. They are within a circle of radius of 1.6 arcmin,
460 |       corresponding to about 75,000 light-years. Located in the galaxy ESO
461 |       243-49 is [HLX-1](HLX-1 "wikilink"), an
462 |       [intermediate-mass black hole](intermediate-mass_black_hole
463 |       "wikilink")—the first one of its kind identified. It is thought to be a
464 |       remnant of a dwarf galaxy that was absorbed in a
465 |       [collision](Interacting_galaxy "wikilink") with ESO 243-49. Before its
466 |       discovery, this class of black hole was only hypothesized.
467 |     questions_and_answers:
468 |       - question: |
469 |           Is the Phoenix Constellation part of the Milky Way?
470 |         answer: |
471 |           The Phoenix constellation does not lie on the galactic plane of
472 |           the Milky Way, and there are no prominent star clusters.
473 |       - question: |
474 |           How many light years away is NGC 625?
475 |         answer: |
476 |           NGC 625 is some 12.7 million light years distant. Only 24000 light
477 |           years in diameter, it is an outlying member of the Sculptor Group.
478 |       - question: |
479 |           What is Robert's Quartet composed of?
480 |         answer: |
481 |           Robert's Quartet is composed of the irregular galaxy NGC 87,
482 |           and three spiral galaxies NGC 88, NGC 89 and NGC 92.
483 | document_outline: |
484 |   Information about the Phoenix Constellation including the
485 |   history, characteristics, and features of the stars in the constellation.
486 | document:
487 |   repo: https://github.com/juliadenham/Summit_knowledge
488 |   commit: 0a1f2672b9b90582e6115333e3ed62fd628f1c0f
489 |   patterns:
490 |     - phoenix_constellation.md
491 | 
492 | ```
493 | 
494 | *Example `attribution.txt` file*
495 | 
496 | ```text
497 | Title of work: Phoenix (constellation)
498 | Link to work: https://en.wikipedia.org/wiki/Phoenix_(constellation)
499 | Revision: https://en.wikipedia.org/w/index.php?title=Phoenix_(constellation)&oldid=1237187773
500 | License of the work: CC-BY-SA-4.0
501 | Creator names: Wikipedia Authors
502 | ```
503 | 
504 | This knowledge example references one markdown file: `phoenix_constellation.md`. You can also add multiple files for knowledge contributions.
505 | 
506 | > [!NOTE]
507 | > Due to the higher volume, **it will naturally take longer to receive acceptance for
508 | > a knowledge contribution pull request than for a skill pull request**. Smaller
509 | > pull requests are simpler and require less time and effort to review.
510 | 
511 | What might these markdown files look like? They can be freeform. Here's what a
512 | snippet of `phoenix_constellation.md` might look like in your Git repository.
513 | 
514 | #### Knowledge: Markdown file example
515 | 
516 | ```markdown
517 | # Phoenix (constellation)
518 | 
519 | **Phoenix** is a minor [constellation](constellation "wikilink") in the
520 | [southern sky](southern_sky "wikilink"). Named after the mythical
521 | [phoenix](Phoenix_(mythology) "wikilink"), it was first depicted on a
522 | celestial atlas by [Johann Bayer](Johann_Bayer "wikilink") in his 1603
523 | *[Uranometria](Uranometria "wikilink")*. The French explorer and
524 | astronomer [Nicolas Louis de
525 | Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted the brighter
526 | stars and gave their [Bayer designations](Bayer_designation "wikilink")
527 | in 1756. The constellation stretches from roughly −39 degrees to −57 degrees
528 | [declination](declination "wikilink"), and from 23.5h to 2.5h of [right
529 | ascension](right_ascension "wikilink"). The constellations Phoenix,
530 | [Grus](Grus_(constellation) "wikilink"),
531 | [Pavo](Pavo_(constellation) "wikilink") and [Tucana](Tucana "wikilink"),
532 | are known as the Southern Birds.
533 | 
534 | The brightest star, [Alpha Phoenicis](Alpha_Phoenicis "wikilink"), is
535 | named Ankaa, an [Arabic](Arabic "wikilink") word meaning 'the Phoenix'.
536 | It is an orange giant of apparent magnitude 2.4. Next is [Beta
537 | Phoenicis](Beta_Phoenicis "wikilink"), actually a
538 | [binary](Binary_star "wikilink") system composed of two yellow giants
539 | with a combined apparent magnitude of 3.3. [Nu
540 | Phoenicis](Nu_Phoenicis "wikilink") has a dust disk, while the
541 | constellation has ten star systems with known planets and the recently
542 | discovered [galaxy clusters](galaxy_cluster "wikilink") [El
543 | Gordo](El_Gordo_(galaxy_cluster) "wikilink") and the [Phoenix
544 | Cluster](Phoenix_Cluster "wikilink")—located 7.2 and 5.7 billion light
545 | years away respectively, two of the largest objects in the [visible
546 | universe](visible_universe "wikilink"). Phoenix is the
547 | [radiant](radiant_(meteor_shower) "wikilink") of two annual [meteor
548 | showers](meteor_shower "wikilink"): the
549 | [Phoenicids](Phoenicids "wikilink") in December, and the July
550 | Phoenicids.
551 | ```
552 | 
553 | In the taxonomy repository, here's what the previously referenced knowledge might look like in the tree:
554 | 
555 | #### Knowledge: directory tree example
556 | 
557 | ```ascii
558 | [...]
559 | 
560 | └── knowledge
561 |     └── science
562 |         ├── astronomy
563 |         │ └── constellations
564 |         │     └── Phoenix <=== here it is :)
565 |         │     |    └── qna.yaml
566 |         |     |        attribution.txt
567 |         │     └── Orion
568 |         │          └── qna.yaml
569 |         |              attribution.txt
570 | [...]
571 | ```
572 | 
573 | For more information on what to include in your `attribution.txt` file, see [For your attribution.txt file](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#for-your-attributiontxt-file) in CONTRIBUTING.md.
574 | 
575 | You can organize the knowledge markdown files in your repository however you want. You just need to ensure the YAML is pointing to the correct file.
576 | 
577 | ## Taxonomy tree layout
578 | 
579 | The taxonomy tree is organized in a cascading directory structure. At the end of
580 | each branch, there is a YAML file (qna.yaml) that contains the examples for that
581 | domain. Maintainers can decide to change the names of the existing branches or to add new branches.
582 | 
583 | > [!IMPORTANT]
584 | > Folder names do not have spaces. Use underscores between words.
585 | 
586 | Below is an illustrative directory structure to show this layout:
587 | 
588 | ```ascii
589 | .
590 | └── linguistics
591 |     ├── writing
592 |     │   ├── brainstorming
593 |     │   │   ├── idea_generation
594 |     |   │       └── qna.yaml
595 |     │   │           attribution.txt
596 |     │   │   ├── refute_claim
597 |     |   │       └── qna.yaml
598 |     │   │           attribution.txt
599 |     │   ├── prose
600 |     │   │   ├── articles
601 |     │   │       └── qna.yaml
602 |     │   │           attribution.txt
603 |     └── grammar
604 |         └── qna.yaml
605 |         │   attribution.txt
606 |         └── spelling
607 |             └── qna.yaml
608 |                 attribution.txt
609 | ```
610 | 
611 | For an extensive example of this layout, see [taxonomy_tree_layout](docs/taxonomy_diagram.md) in the documentation folder.
612 | 
613 | ## Contribute knowledge and skills to the taxonomy
614 | 
615 | Contributing to a Large Language Model (LLM) has historically been difficult, in no small part because it is hard to get access to the necessary compute infrastructure.
616 | 
617 | This taxonomy repository will be used as the seed to synthesize the training data for InstructLab-trained models. We intend to retrain the model(s) using the main branch following InstructLab's progressive training on a regular basis. This enables fast iteration of the model(s), for the benefit of the open source community.
618 | 
619 | By contributing your skills and knowledge to this repository, you will see your changes built into an LLM within days of your contribution rather than months or years! If you are working with a model and notice its knowledge or ability lacking, you can correct it by contributing knowledge or skills and check if it's improved after your changes are built.
620 | 
621 | While public contributions are welcome to help drive community progress, you can also fork this repository under [the Apache License, Version 2.0](LICENSE), add your own internal skills, and train your own models internally. However, you might need your own access to significant compute infrastructure to perform sufficient retraining.
622 | 
623 | ## Ways to Contribute
624 | 
625 | You can contribute to the taxonomy in the following two ways:
626 | 
627 | 1. Adding new examples to **existing leaf nodes**.
628 | 2. Adding **new branches/skills** corresponding to the existing domain.
629 | 
630 | For more information, see the [Ways of contributing to the taxonomy repository](https://github.com/instructlab/taxonomy/blob/main/CONTRIBUTING.md#ways-of-contributing-to-the-taxonomy-repository) documentation.
631 | 
632 | ## How to contribute skills and knowledge
633 | 
634 | To contribute to this repo, you'll use the *Fork and Pull* model common in many open source repositories. You can add your skills and knowledge to the taxonomy in multiple ways; for additional information on how to make a contribution, see the [Documentation on contributing](CONTRIBUTING.md). You can also use the following guides to help with contributing:
635 | 
636 | - Contributing using the [GitHub webpage UI](docs/contributing_via_GH_UI.md).
637 | - Contributing knowledge to the taxonomy in the [Knowledge contribution guidelines](docs/knowledge-contribution-guide.md).
638 | 
639 | ### Why should I contribute?
640 | 
641 | This taxonomy repository will be used as the seed to synthesize the training
642 | data for InstructLab-trained models. We intend to retrain the model(s) using the main
643 | branch as often as possible (at least weekly).
644 | Fast iteration of the model(s) benefits the open source community and enables model developers who do not have access to the necessary compute infrastructure to contribute.
645 | 


--------------------------------------------------------------------------------