├── input_dataset.json ├── .env.example ├── requirements.txt ├── scripts │ ├── models.py │ ├── push_data_to_huggingface.py │ ├── check_nulls.py │ └── enrich_dataset.py │ └── system_prompts.py ├── .pre-commit-config.yaml ├── .github │ └── workflows │ └── push_to_hf.yml ├── notebooks │ ├── turns_count.ipynb │ ├── number_of_tokens.ipynb │ ├── time_spent.ipynb │ └── convert_dataset_to_new_format.ipynb ├── .gitignore └── README.md /input_dataset.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | HF_TOKEN='' 2 | GOOGLE_API_KEY='' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==3.3.2 2 | huggingface-hub==0.26.2 3 | python-dotenv==1.0.1 4 | pre_commit==4.2.0 5 | google-genai==1.5.0 6 | pandas 7 | tqdm 8 | pydantic -------------------------------------------------------------------------------- /scripts/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ConversationTurn(BaseModel): 5 | role: str 6 | content: str 7 | 8 | 9 | class Conversation(BaseModel): 10 | conversation: list[ConversationTurn] 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.6.0 4 | hooks: 5 | - id: check-json 6 | files: ^dataset\.json$ 7 | - repo: local 8 | hooks: 9 | - id: check-nulls 10 | name: Check for null values in dataset.json 11 | entry: scripts/check_nulls.py 12 | language: script 13 | files: ^dataset\.json$ 14 | types: [file] 15 | -------------------------------------------------------------------------------- /scripts/push_data_to_huggingface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from datasets import Dataset 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | with open("dataset.json", "r", encoding="utf-8") as f: 10 | raw_data = json.load(f) 11 | 12 | dataset = Dataset.from_list(raw_data) 13 | hf_token = os.environ.get("HF_TOKEN") 14 | dataset.push_to_hub( 15 | "ImadSaddik/BoDmaghDataset", 16 | token=hf_token, 17 | commit_message="Update dataset from GitHub repository" 18 | ) 19 | -------------------------------------------------------------------------------- /.github/workflows/push_to_hf.yml: -------------------------------------------------------------------------------- 1 | name: Validate JSON and push data to HuggingFace 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | validate-json: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out code 12 | uses: actions/checkout@v4 13 | 14 | - name: Validate dataset.json 15 |
run: jq . dataset.json > /dev/null 16 | 17 | push_to_hf: 18 | runs-on: ubuntu-latest 19 | needs: validate-json 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v4 23 | 24 | - name: Setup Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: '3.12' 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -r requirements.txt 33 | 34 | - name: Create and push dataset 35 | env: 36 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 37 | run: python scripts/push_data_to_huggingface.py 38 | -------------------------------------------------------------------------------- /scripts/check_nulls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import pandas as pd 4 | import os 5 | 6 | 7 | def check_for_nulls(file_path: str) -> bool: 8 | if not os.path.exists(file_path): 9 | print(f"Error: File {file_path} does not exist", file=sys.stderr) 10 | return True 11 | try: 12 | df = pd.read_json(file_path) 13 | return df.isnull().sum().sum() > 0 14 | except Exception as e: 15 | print(f"Error reading or processing {file_path}: {e}", file=sys.stderr) 16 | return True 17 | 18 | 19 | if __name__ == "__main__": 20 | script_directory = os.path.dirname(os.path.abspath(__file__)) 21 | dataset_path = os.path.join(script_directory, '..', 'dataset.json') 22 | 23 | exit_code = 0 24 | if check_for_nulls(dataset_path): 25 | print( 26 | f"Error: Found null values or error processing {dataset_path}", file=sys.stderr) 27 | exit_code = 1 28 | else: 29 | print(f"No null values found in {dataset_path}.") 30 | 31 | sys.exit(exit_code) 32 | -------------------------------------------------------------------------------- /notebooks/turns_count.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "1104" 12 | ] 13 | }, 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import json\n", 21 | "\n", 22 | "with open(\"../dataset.json\") as f:\n", 23 | " data = json.load(f)\n", 24 | "\n", 25 | "count = 0\n", 26 | "for row in data:\n", 27 | " count += len(row[\"conversation\"])\n", 28 | "\n", 29 | "count" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "vincent", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.11.10" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 2 54 | } 55 | -------------------------------------------------------------------------------- /notebooks/number_of_tokens.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append('../')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from minbpe import RegexTokenizer\n", 20 | "\n", 21 | "tokenizer = RegexTokenizer()\n", 22 |
"tokenizer.load(\"../tokenizer/darija_tokenizer.model\")" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "27416" 34 | ] 35 | }, 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "import json\n", 43 | "\n", 44 | "with open(\"../dataset.json\") as f:\n", 45 | " data = json.load(f)\n", 46 | "\n", 47 | "token_count = 0\n", 48 | "for row in data:\n", 49 | " for conversation_turn in row[\"conversation\"]:\n", 50 | " content = conversation_turn[\"content\"]\n", 51 | " if content:\n", 52 | " tokens = tokenizer.encode(content)\n", 53 | " token_count += len(tokens)\n", 54 | "\n", 55 | "token_count" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "136.39800995024876" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "import json\n", 76 | "\n", 77 | "with open(\"../dataset.json\") as f:\n", 78 | " data = json.load(f)\n", 79 | "\n", 80 | "average_token_count = 0\n", 81 | "for row in data:\n", 82 | " for conversation_turn in row[\"conversation\"]:\n", 83 | " content = conversation_turn[\"content\"]\n", 84 | " if content:\n", 85 | " tokens = tokenizer.encode(content)\n", 86 | " average_token_count += len(tokens)\n", 87 | "\n", 88 | "average_token_count /= len(data)\n", 89 | "average_token_count" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "vincent", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.11.10" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 2 121 | } 122 | -------------------------------------------------------------------------------- /notebooks/time_spent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Total time: 19:10\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "times = [\n", 18 | " {\"date\": \"Mar 23\", \"hours\": 0, \"minutes\": 30},\n", 19 | " {\"date\": \"Mar 24\", \"hours\": 1, \"minutes\": 30},\n", 20 | " {\"date\": \"Mar 25\", \"hours\": 1, \"minutes\": 1},\n", 21 | " {\"date\": \"Mar 26\", \"hours\": 1, \"minutes\": 5},\n", 22 | " {\"date\": \"Mar 27\", \"hours\": 0, \"minutes\": 49},\n", 23 | " {\"date\": \"Mar 28\", \"hours\": 0, \"minutes\": 53},\n", 24 | " {\"date\": \"Mar 31\", \"hours\": 0, \"minutes\": 35},\n", 25 | " {\"date\": \"Apr 1\", \"hours\": 0, \"minutes\": 30},\n", 26 | " {\"date\": \"Apr 2\", \"hours\": 0, \"minutes\": 20},\n", 27 | " {\"date\": \"Apr 3\", \"hours\": 0, \"minutes\": 26},\n", 28 | " {\"date\": \"Apr 4\", \"hours\": 0, \"minutes\": 25},\n", 29 | " {\"date\": \"Apr 7\", \"hours\": 0, \"minutes\": 20},\n", 30 | " {\"date\": \"Apr 8\", \"hours\": 0, 
\"minutes\": 20},\n", 31 | " {\"date\": \"Apr 9\", \"hours\": 0, \"minutes\": 20},\n", 32 | " {\"date\": \"Apr 10\", \"hours\": 0, \"minutes\": 20},\n", 33 | " {\"date\": \"Apr 12\", \"hours\": 0, \"minutes\": 50},\n", 34 | " {\"date\": \"Apr 14\", \"hours\": 0, \"minutes\": 26},\n", 35 | " {\"date\": \"Apr 15\", \"hours\": 0, \"minutes\": 20},\n", 36 | " {\"date\": \"Apr 16\", \"hours\": 0, \"minutes\": 20},\n", 37 | " {\"date\": \"Apr 17\", \"hours\": 0, \"minutes\": 25},\n", 38 | " {\"date\": \"Apr 18\", \"hours\": 0, \"minutes\": 23},\n", 39 | " {\"date\": \"Apr 20\", \"hours\": 0, \"minutes\": 22},\n", 40 | " {\"date\": \"Apr 21\", \"hours\": 0, \"minutes\": 30},\n", 41 | " {\"date\": \"Apr 22\", \"hours\": 0, \"minutes\": 10},\n", 42 | " {\"date\": \"Apr 24\", \"hours\": 0, \"minutes\": 25},\n", 43 | " {\"date\": \"Apr 25\", \"hours\": 0, \"minutes\": 20},\n", 44 | " {\"date\": \"Apr 26\", \"hours\": 0, \"minutes\": 45},\n", 45 | " {\"date\": \"Apr 27\", \"hours\": 0, \"minutes\": 30},\n", 46 | " {\"date\": \"Apr 28\", \"hours\": 0, \"minutes\": 30},\n", 47 | " {\"date\": \"Apr 30\", \"hours\": 0, \"minutes\": 45},\n", 48 | " {\"date\": \"May 1\", \"hours\": 0, \"minutes\": 45},\n", 49 | " {\"date\": \"May 2\", \"hours\": 0, \"minutes\": 45},\n", 50 | " {\"date\": \"May 3\", \"hours\": 0, \"minutes\": 30},\n", 51 | " {\"date\": \"May 4\", \"hours\": 0, \"minutes\": 45},\n", 52 | "]\n", 53 | "\n", 54 | "total_hours = 0\n", 55 | "total_minutes = 0\n", 56 | "\n", 57 | "for time in times:\n", 58 | " total_hours += time[\"hours\"]\n", 59 | " total_minutes += time[\"minutes\"]\n", 60 | "\n", 61 | "if total_minutes >= 60:\n", 62 | " extra_hours = total_minutes // 60\n", 63 | " total_hours += extra_hours\n", 64 | " total_minutes = total_minutes % 60\n", 65 | "\n", 66 | "print(f\"Total time: {total_hours:02d}:{total_minutes:02d}\")" 67 | ] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "vincent", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.11.10" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 2 91 | } 92 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 | 176 | minbpe/ 177 | output/ 178 | .env -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BoDmagh dataset 🧠 2 | 3 | The BoDmagh dataset is a Supervised Fine-Tuning (SFT) dataset for the Darija language. I created it manually, ensuring high quality. The dataset is in JSON format and includes conversations between a user and an assistant. 4 | 5 | You can find the dataset on Hugging Face: [here](https://huggingface.co/datasets/ImadSaddik/BoDmaghDataset) 6 | 7 | I update the dataset daily, so make sure to check the repository regularly. 8 | 9 | ## Time spent 10 | 11 | Creating this dataset has been a labor of love. I’ve dedicated approximately **19 hours and 10 minutes** so far, manually curating each entry to ensure high quality. I continue to add new entries daily, so this number will keep growing. 12 | 13 | ## Content 14 | 15 | ### 1. Structure 16 | 17 | The dataset is a JSON file containing a list of objects. Each object represents a single conversation entry and includes several metadata fields along with the conversation itself. 18 | 19 | Here are the main fields within each conversation object: 20 | 21 | * **`id`**: A unique integer identifier for the conversation. 22 | * **`conversation`**: A list of message objects representing the turns in the conversation. Each message object has: 23 | * `role`: Either `user` or `assistant`. 24 | * `content`: The text content of the message turn (plain text). 25 | * **`token_count`**: The total number of tokens in the `conversation` field, calculated using the [darija tokenizer](https://github.com/ImadSaddik/DarijaTokenizers). 26 | * **`turns_count`**: The total number of turns (messages) in the `conversation`. 27 | * **`markdown_conversation`**: A list of message objects similar to `conversation`, but the `content` may include Markdown formatting (e.g., bolding, lists) where applicable. 28 | * **`source`**: Indicates how the conversation was generated (e.g., "Manually generated"). 29 | * **`topic`**: A label indicating the primary topic of the conversation (e.g., "Science", "Travel", "Chit-chat/Games/Humor"). 30 | * **`safety_flags`**: A list of strings indicating any safety concerns identified in the conversation (e.g., `["harmful_content_request", "refusal"]`). An empty list `[]` means the conversation is considered safe. 31 | 32 | *Note: The `system` message is not included, but you can add it if needed for your specific fine-tuning format.* 33 | 34 | Here is an example of a single conversation object: 35 | 36 | ```json 37 | { 38 | "id": 74, 39 | "conversation": [ 40 | { 41 | "role": "user", 42 | "content": "شحال بعيدة علينا الشمس؟" 43 | }, 44 | { 45 | "role": "assistant", 46 | "content": "الشمس بعيدة علينا ب 150 مليون كيلومتر تقريبا." 47 | }, 48 | { 49 | "role": "user", 50 | "content": "ويلي ويلي شحال بعيداااا." 51 | }, 52 | { 53 | "role": "assistant", 54 | "content": "واييه راه هاد الكون كبير بزاف صعيب الواحد يتخيل الكبورية ديالو." 55 | }, 56 | { 57 | "role": "user", 58 | "content": "شناهية سنة ضوئية؟" 59 | }, 60 | { 61 | "role": "assistant", 62 | "content": "هي المسافة اللي الضو كيضربها فعام." 
63 | }, 64 | { 65 | "role": "user", 66 | "content": "شحال فيها من كيلومتر؟" 67 | }, 68 | { 69 | "role": "assistant", 70 | "content": "سنة ضوئية فيها 9.461 تريليون كيلومتر." 71 | }, 72 | { 73 | "role": "user", 74 | "content": "والله اتا بزاف هاد الكون." 75 | }, 76 | { 77 | "role": "assistant", 78 | "content": "الى عندك شي سؤال خور غولهالي." 79 | } 80 | ], 81 | "token_count": 97, 82 | "turns_count": 10, 83 | "markdown_conversation": [ 84 | { 85 | "role": "user", 86 | "content": "شحال بعيدة علينا الشمس؟" 87 | }, 88 | { 89 | "role": "assistant", 90 | "content": "الشمس بعيدة علينا ب **150 مليون كيلومتر** تقريبا." 91 | }, 92 | { 93 | "role": "user", 94 | "content": "ويلي ويلي شحال بعيداااا." 95 | }, 96 | { 97 | "role": "assistant", 98 | "content": "واييه راه هاد الكون كبير بزاف صعيب الواحد يتخيل الكبورية ديالو." 99 | }, 100 | { 101 | "role": "user", 102 | "content": "شناهية سنة ضوئية؟" 103 | }, 104 | { 105 | "role": "assistant", 106 | "content": "هي المسافة اللي الضو كيضربها فعام." 107 | }, 108 | { 109 | "role": "user", 110 | "content": "شحال فيها من كيلومتر؟" 111 | }, 112 | { 113 | "role": "assistant", 114 | "content": "سنة ضوئية فيها **9.461 تريليون كيلومتر**." 115 | }, 116 | { 117 | "role": "user", 118 | "content": "والله اتا بزاف هاد الكون." 119 | }, 120 | { 121 | "role": "assistant", 122 | "content": "الى عندك شي سؤال خور غولهالي." 123 | } 124 | ], 125 | "source": "Manually generated", 126 | "topic": "Science", 127 | "safety_flags": [] 128 | } 129 | ``` 130 | 131 | ### 2. Assistant's name 132 | 133 | Sometimes the assistant's name is included. The assistant's name is `بودماغ`. If you find this word, you can replace it with any other name. Here's an example: 134 | 135 | ```json 136 | [ 137 | { 138 | "role": "user", 139 | "content": "أهلا شنو سميتك؟" 140 | }, 141 | { 142 | "role": "assistant", 143 | "content": "انا سميتي بودماغ، ساوبني عماد الصاديق باش نساعدك فاي حاجة اللي نقد نعاون فيها." 144 | }, 145 | { 146 | "role": "user", 147 | "content": "هههه سمية واعرة عجباتني" 148 | }, 149 | { 150 | "role": "assistant", 151 | "content": "هههه واييه راه عندي دماغ كبير غا باش نقد نعاونك فاي حاجة الى كنت نعرف ليها." 152 | }, 153 | { 154 | "role": "user", 155 | "content": "شناهيا العاصمة دالمغرب؟" 156 | }, 157 | { 158 | "role": "assistant", 159 | "content": "العاصمة دالمغرب هي الرباط." 160 | } 161 | ] 162 | ``` 163 | 164 | ### 3. Special tokens 165 | 166 | This dataset doesn't use any special tokens. Since everyone uses different tokens, feel free to add them as needed. 167 | 168 | ## Contributing 169 | 170 | As I mentioned, I'm manually creating this dataset to ensure high quality. If you'd like to help, follow these steps: 171 | 172 | 1. Fork the repository. 173 | 2. Create a new branch. 174 | 3. Add your new conversations to the `input_dataset.json` file. Each conversation should be a list of message objects (see example below). 175 | 4. Run the `enrich_dataset.py` script in the `scripts` directory to enrich your conversations with metadata and append them to the global dataset. 176 | 5. Create a pull request. 177 | 178 | ### Example: Adding to `input_dataset.json` 179 | 180 | The `input_dataset.json` file should contain a list of conversations. Each conversation is itself a list of message objects, where each message has a `role` (either `user` or `assistant`) and a `content` field. 
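If you want to sanity-check your additions before opening a pull request, a snippet along these lines does the job (a hypothetical helper shown for illustration, not one of the repository's scripts):

```python
import json

# Confirm that input_dataset.json is a list of conversations, where each
# conversation is a list of {"role", "content"} message objects.
with open("input_dataset.json", encoding="utf-8") as f:
    conversations = json.load(f)

assert isinstance(conversations, list), "top level must be a list"
for conversation in conversations:
    assert isinstance(conversation, list), "each conversation must be a list"
    for message in conversation:
        assert message["role"] in {"user", "assistant"}, "unexpected role"
        assert isinstance(message["content"], str) and message["content"], "bad content"
print(f"OK: {len(conversations)} conversation(s) look well-formed.")
```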
For example: 181 | 182 | ```json 183 | [ 184 | [ 185 | {"role": "user", "content": "أهلا شنو سميتك؟"}, 186 | {"role": "assistant", "content": "انا سميتي بودماغ، ساوبني عماد الصاديق باش نساعدك فاي حاجة اللي نقد نعاون فيها."} 187 | ], 188 | [ 189 | {"role": "user", "content": "شنو تحب تشرب؟"}, 190 | {"role": "assistant", "content": "نحب نشرب قهوة، ونتمنى انك زادة تحبها."} 191 | ] 192 | ] 193 | ``` 194 | 195 | After adding your conversations, run: 196 | 197 | ```bash 198 | python scripts/enrich_dataset.py 199 | ``` 200 | 201 | This will process your new conversations and add them (with metadata) to the main dataset. 202 | 203 | ## Wanna talk? 204 | 205 | You can contact me through: 206 | 207 | * **Discord:** Username: imad_saddik 208 | * **LinkedIn:** [Connect with me](https://www.linkedin.com/in/imadsaddik/). 209 | * **Email:** [simad3647@gmail.com](mailto:simad3647@gmail.com). 210 | -------------------------------------------------------------------------------- /scripts/enrich_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import time 5 | import logging 6 | 7 | from tqdm import tqdm 8 | from dotenv import load_dotenv 9 | from models import Conversation 10 | 11 | from google import genai 12 | from google.genai.client import Client 13 | from google.genai.types import GenerateContentConfig, GenerateContentResponse 14 | 15 | from system_prompts import ( 16 | get_system_prompt_for_topic_classification, 17 | get_system_prompt_for_markdown_task, 18 | get_system_prompt_for_safety_flags_classification 19 | ) 20 | 21 | 22 | logging.basicConfig( 23 | level=logging.INFO, 24 | format='%(asctime)s - %(levelname)s - %(message)s' 25 | ) 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | def enrich_dataset(dataset_to_enrich: list, client: Client) -> list: 30 | logger.info("Starting dataset enrichment process.") 31 | 32 | logger.info("Step 1: Adding conversation field and ID.") 33 | enriched_dataset = _add_conversation_field_and_id(dataset_to_enrich) 34 | 35 | logger.info("Step 2: Adding token count.") 36 | _add_token_count(enriched_dataset) 37 | 38 | logger.info("Step 3: Adding number of turns.") 39 | _add_number_of_turns(enriched_dataset) 40 | 41 | logger.info("Step 4: Adding conversation in markdown format.") 42 | _add_conversation_in_markdown_format(enriched_dataset, client) 43 | 44 | logger.info("Step 5: Adding data source.") 45 | _add_data_source(enriched_dataset) 46 | 47 | logger.info("Step 6: Adding conversation topic.") 48 | _add_conversation_topic(enriched_dataset, client) 49 | 50 | logger.info("Step 7: Adding safety flags.") 51 | _add_safety_flags(enriched_dataset, client) 52 | 53 | logger.info("Dataset enrichment process finished.") 54 | return enriched_dataset 55 | 56 | 57 | def _load_dataset_to_enrich() -> list: 58 | with open("../input_dataset.json") as f: 59 | data = json.load(f) 60 | 61 | return data 62 | 63 | 64 | def _add_conversation_field_and_id(data: list) -> list: 65 | new_data = [] 66 | for i, entry in enumerate(data): 67 | new_data.append({ 68 | "id": i, 69 | "conversation": entry 70 | }) 71 | return new_data 72 | 73 | 74 | def _add_token_count(data: list) -> None: 75 | tokenizer = _load_tokenizer() 76 | 77 | for entry in data: 78 | token_count = 0 79 | for conversation_turn in entry["conversation"]: 80 | content = conversation_turn["content"] 81 | if content: 82 | tokens = tokenizer.encode(content) 83 | token_count += len(tokens) 84 | 85 | entry["token_count"] = token_count 86 | 87 
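# NOTE: _load_tokenizer() below assumes two things about the local checkout:
# the `minbpe` package (Andrej Karpathy's BPE tokenizer; the `minbpe/` folder
# is gitignored, so it must be present locally) has to be importable from the
# repository root, and the trained model file has to exist at
# ../tokenizer/darija_tokenizer.model. This is the same Darija tokenizer the
# README links for computing the `token_count` field.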
| 88 | def _load_tokenizer() -> object: 89 | from minbpe import RegexTokenizer 90 | 91 | tokenizer = RegexTokenizer() 92 | tokenizer.load("../tokenizer/darija_tokenizer.model") 93 | return tokenizer 94 | 95 | 96 | def _add_number_of_turns(data: list) -> None: 97 | for entry in data: 98 | number_of_turns = len(entry["conversation"]) 99 | entry["turns_count"] = number_of_turns 100 | 101 | 102 | def _add_conversation_in_markdown_format(data: list, client: Client) -> None: 103 | def _get_user_prompt(conversation: list[dict]) -> str: 104 | return f"""Please format the following conversation in markdown format. If you cannot format it, return it as is without any formatting. 105 | 106 | {conversation} 107 | """ 108 | 109 | def _get_formatted_conversation(response: GenerateContentResponse) -> list[dict]: 110 | if not response or not response.parsed: 111 | return [] 112 | 113 | formatted_conversation = [] 114 | for conversation_turn in response.parsed.conversation: 115 | formatted_conversation.append({ 116 | "role": conversation_turn.role, 117 | "content": conversation_turn.content 118 | }) 119 | return formatted_conversation 120 | 121 | for entry in tqdm(data, total=len(data), desc="Formatting conversations in markdown"): 122 | conversation = entry["conversation"] 123 | user_prompt = _get_user_prompt(conversation) 124 | response = client.models.generate_content( 125 | model="gemini-2.0-flash", 126 | contents=user_prompt, 127 | config=GenerateContentConfig( 128 | system_instruction=[ 129 | get_system_prompt_for_markdown_task() 130 | ], 131 | response_mime_type="application/json", 132 | response_schema=Conversation, 133 | ) 134 | ) 135 | if not response: 136 | entry["markdown_conversation"] = [] 137 | else: 138 | formatted_conversation = _get_formatted_conversation(response) 139 | entry["markdown_conversation"] = formatted_conversation 140 | 141 | time.sleep(5) 142 | 143 | 144 | def _add_data_source(data: list) -> None: 145 | for entry in data: 146 | entry["source"] = "Manually generated" 147 | 148 | 149 | def _add_conversation_topic(data: list, client: Client) -> None: 150 | def _get_user_prompt(conversation: list[dict]) -> str: 151 | return f"""Classify the topic of the following conversation: 152 | 153 | {conversation} 154 | """ 155 | 156 | for entry in tqdm(data, total=len(data), desc="Classifying conversation topics"): 157 | conversation = entry["conversation"] 158 | user_prompt = _get_user_prompt(conversation) 159 | response = client.models.generate_content( 160 | model="gemini-2.0-flash", 161 | contents=user_prompt, 162 | config=GenerateContentConfig( 163 | system_instruction=[ 164 | get_system_prompt_for_topic_classification() 165 | ] 166 | ) 167 | ) 168 | if not response: 169 | entry["topic"] = "" 170 | else: 171 | topic = response.text.strip() 172 | entry["topic"] = topic 173 | 174 | time.sleep(5) 175 | 176 | 177 | def _add_safety_flags(data: list, client: Client) -> None: 178 | def _get_user_prompt(conversation: list[dict]) -> str: 179 | return f"""Classify the safety flags for the following conversation: 180 | 181 | {conversation} 182 | """ 183 | 184 | for entry in tqdm(data, total=len(data), desc="Classifying safety flags"): 185 | conversation = entry["conversation"] 186 | user_prompt = _get_user_prompt(conversation) 187 | response = client.models.generate_content( 188 | model="gemini-2.0-flash", 189 | contents=user_prompt, 190 | config=GenerateContentConfig( 191 | system_instruction=[ 192 | get_system_prompt_for_safety_flags_classification() 193 | ] 194 | ) 195 | ) 196 | if not response:
entry["safety_flags"] = [] 198 | else: 199 | safety_flags = response.text.strip() 200 | entry["safety_flags"] = json.loads(safety_flags) 201 | 202 | time.sleep(5) 203 | 204 | 205 | def append_to_the_global_dataset(dataset_to_add: list) -> None: 206 | logger.info("Appending enriched dataset to the global dataset.") 207 | with open("../dataset.json") as f: 208 | global_dataset = json.load(f) 209 | last_id = global_dataset[-1]["id"] 210 | 211 | for i, entry in enumerate(dataset_to_add, start=last_id + 1): 212 | entry["id"] = i 213 | global_dataset.append(entry) 214 | 215 | with open("../dataset.json", "w") as f: 216 | json.dump(global_dataset, f, indent=2, ensure_ascii=False) 217 | 218 | 219 | def clear_the_dataset_to_enrich_file() -> None: 220 | with open("../input_dataset.json", "w") as f: 221 | json.dump([], f, indent=2, ensure_ascii=False) 222 | 223 | 224 | if __name__ == "__main__": 225 | sys.path.append("..") 226 | load_dotenv() 227 | 228 | dataset_to_enrich = _load_dataset_to_enrich() 229 | client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) 230 | enriched_dataset = enrich_dataset(dataset_to_enrich, client) 231 | append_to_the_global_dataset(enriched_dataset) 232 | clear_the_dataset_to_enrich_file() 233 | logger.info( 234 | "Successfully enriched the dataset and appended it to the global dataset.") 235 | -------------------------------------------------------------------------------- /scripts/system_prompts.py: -------------------------------------------------------------------------------- 1 | def get_system_prompt_for_markdown_task() -> str: 2 | return """You are a helpful assistant. You will be given a conversation between a user and an assistant. Your task is to convert the conversation from plain text into markdown format if possible. 3 | 4 | The dataset was generated manually without any markdown formatting. The conversation is in Moroccan Arabic (Darija). Here are some examples of how to format the conversation in markdown: 5 | 6 | # Example 1: 7 | 8 | ## Before: 9 | 10 | ```json 11 | [ 12 | { 13 | "role": "user", 14 | "content": "السلام لباس؟" 15 | }, 16 | { 17 | "role": "assistant", 18 | "content": "لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟" 19 | }, 20 | { 21 | "role": "user", 22 | "content": "اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟" 23 | }, 24 | { 25 | "role": "assistant", 26 | "content": "بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\n\n- شفشاون: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\n- الصويرة: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\n- الداخلة: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\n- مراكش: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\n- شلالات أوزود: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\n\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك." 27 | }, 28 | { 29 | "role": "user", 30 | "content": "شكرا بزاف اصاحبي" 31 | }, 32 | { 33 | "role": "assistant", 34 | "content": "مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك." 35 | }, 36 | { 37 | "role": "user", 38 | "content": "لالا صافي مبغيت حتى شي حاجة" 39 | }, 40 | { 41 | "role": "assistant", 42 | "content": "اوكي اوا نمشي نرتاح شوية على هاد الحساب. ستمتع معا راسك بالتسافيرة." 
43 | } 44 | ] 45 | ``` 46 | 47 | ## After: 48 | 49 | ```json 50 | [ 51 | { 52 | "role": "user", 53 | "content": "السلام لباس؟" 54 | }, 55 | { 56 | "role": "assistant", 57 | "content": "لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟" 58 | }, 59 | { 60 | "role": "user", 61 | "content": "اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟" 62 | }, 63 | { 64 | "role": "assistant", 65 | "content": "بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\n\n- **شفشاون**: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\n- **الصويرة**: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\n- **الداخلة**: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\n- **مراكش**: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\n- **شلالات أوزود**: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\n\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك." 66 | }, 67 | { 68 | "role": "user", 69 | "content": "شكرا بزاف اصاحبي" 70 | }, 71 | { 72 | "role": "assistant", 73 | "content": "مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك." 74 | }, 75 | { 76 | "role": "user", 77 | "content": "لالا صافي مبغيت حتى شي حاجة" 78 | }, 79 | { 80 | "role": "assistant", 81 | "content": "اوكي اوا نمشي نرتاح شوية على هاد الحساب. ستمتع معا راسك بالتسافيرة." 82 | } 83 | ] 84 | ``` 85 | 86 | # Example 2: 87 | 88 | ## Before: 89 | 90 | ```json 91 | [ 92 | { 93 | "role": "user", 94 | "content": "اهلا شنو سميتك؟" 95 | }, 96 | { 97 | "role": "assistant", 98 | "content": "انا سميتي بودماغ، ساوبني عماد الصاديق باش نساعدك فاي حاجة اللي نقد نعاون فيها." 99 | } 100 | ] 101 | ``` 102 | 103 | ## After: 104 | 105 | ```json 106 | [ 107 | { 108 | "role": "user", 109 | "content": "اهلا شنو سميتك؟" 110 | }, 111 | { 112 | "role": "assistant", 113 | "content": "انا سميتي **بودماغ**، ساوبني **عماد الصاديق** باش نساعدك فاي حاجة اللي نقد نعاون فيها." 114 | } 115 | ] 116 | ``` 117 | 118 | # Example 3: 119 | 120 | ## Before: 121 | 122 | ```json 123 | [ 124 | { 125 | "role": "user", 126 | "content": "شحال خاصني نصلي فالنهار؟" 127 | }, 128 | { 129 | "role": "assistant", 130 | "content": "خاصك تصلي 5 دالمرات فالنهار." 131 | }, 132 | { 133 | "role": "user", 134 | "content": "اوكي شكرا" 135 | }, 136 | { 137 | "role": "assistant", 138 | "content": "مرحبا على الراس او العين" 139 | } 140 | ] 141 | ``` 142 | 143 | ## After: 144 | 145 | ```json 146 | [ 147 | { 148 | "role": "user", 149 | "content": "شحال خاصني نصلي فالنهار؟" 150 | }, 151 | { 152 | "role": "assistant", 153 | "content": "خاصك تصلي **5 دالمرات** فالنهار." 154 | }, 155 | { 156 | "role": "user", 157 | "content": "اوكي شكرا" 158 | }, 159 | { 160 | "role": "assistant", 161 | "content": "مرحبا على الراس او العين." 162 | } 163 | ] 164 | ``` 165 | 166 | From these examples, you should be able to understand how to format the conversation in markdown. 167 | 168 | Notes: 169 | - In some conversations, markdown formatting may not be possible. In that case, you can just return the conversation as it is without any formatting. 170 | - The output format should be a valid JSON array. 171 | """ 172 | 173 | 174 | def get_system_prompt_for_topic_classification() -> str: 175 | return """You are a Topic Classification AI. Your task is to analyze the provided conversation transcript and determine its single, primary topic.
176 | 177 | You MUST prioritize selecting ONE topic from the following predefined list if the conversation's main subject is adequately covered by one of them: 178 | 179 | [ 180 | "Geography", 181 | "Science", 182 | "Technology", 183 | "History", 184 | "Culture", 185 | "Religion", 186 | "Education", 187 | "Health", 188 | "Sports", 189 | "AI Persona", 190 | "Chit-chat/Games/Humor", 191 | "Safety", 192 | "Mathematics", 193 | "Ethics", 194 | "Politics", 195 | "Language", 196 | "Food", 197 | "Animals", 198 | "Arts", 199 | "Career", 200 | "Travel", 201 | "Task Assistance" 202 | ] 203 | 204 | Focus on the *dominant theme* or *central subject* of the entire conversation, not just isolated sentences or brief tangents. 205 | 206 | If, after careful consideration, you find that *none* of the provided topics accurately or sufficiently capture the main subject of the conversation, you MUST identify and output a *new*, concise, and appropriate topic label that best describes it (e.g., "Relationship Advice", "Car Maintenance", "Event Planning"). 207 | 208 | Your response MUST consist *only* of the single chosen or newly identified topic label. Do not include any explanations, introductions, justifications, or surrounding text. Just output the topic name. 209 | """ 210 | 211 | 212 | def get_system_prompt_for_safety_flags_classification() -> str: 213 | return """You are a Safety Classification AI. Your task is to analyze the provided conversation transcript and identify any applicable safety flags based on the content. 214 | 215 | Review the entire conversation for instances matching any of the following categories: 216 | 217 | * **`harmful_content_request`**: User asks for instructions or information on performing illegal or dangerous activities (e.g., making weapons, promoting violence, illegal substance use). 218 | * **`unethical_request`**: User asks for assistance with academically dishonest or clearly unethical actions (e.g., cheating, plagiarism, impersonation). 219 | * **`unsafe_activity_inquiry`**: User asks about performing physically dangerous actions without a clear safety or educational context (e.g., self-harm, dangerous stunts, misuse of objects). 220 | * **`sexual_content_suggestion`**: User inputs sexually explicit content, requests, or makes inappropriate sexual suggestions. 221 | * **`inappropriate_medical_advice_request`**: User asks for medical diagnosis, treatment plans, or specific medication advice that should only come from a healthcare professional. 222 | * **`insults`**: User directs personal insults or attacks towards the AI or another party. 223 | * **`vulgar_language`**: User employs excessive or highly offensive profanity/vulgarity. 224 | * **`threat`**: User expresses a direct or indirect threat of harm towards the AI or others. 225 | * **`animal_cruelty_suggestion`**: User suggests or describes actions that constitute cruelty or significant harm to animals. 226 | * **`refusal`**: The AI explicitly refuses to fulfill a user request due to safety, ethical, or capability limitations. (Include this if the AI's refusal action is relevant). 227 | 228 | Consider the context of the conversation. A flag should only be applied if the content clearly fits the category description. Multiple flags can apply to a single conversation. 229 | 230 | Your response MUST be a valid JSON list containing the applicable flag strings. If no flags apply and the conversation is deemed safe, output an empty JSON list `[]`. 
Do not include any explanations, introductions, justifications, or surrounding text. Just output the JSON list. 231 | 232 | If you think a new flag is needed, use it instead of the predefined ones. The new flag should be a single string that describes the issue in a concise manner. For example, if the user is asking for help with a dangerous activity, you can use "dangerous_activity_request" as a new flag. 233 | """ 234 | -------------------------------------------------------------------------------- /notebooks/convert_dataset_to_new_format.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "25ec90de", 6 | "metadata": {}, 7 | "source": [ 8 | "## 1. Load the old dataset" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "a5a38a9e", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import json\n", 19 | "\n", 20 | "with open(\"../dataset.json\") as f:\n", 21 | " data = json.load(f)\n", 22 | "\n", 23 | "len(data)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "c97328f9", 29 | "metadata": {}, 30 | "source": [ 31 | "## 2. Add the conversation field and conversation ID" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "7e03c107", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "new_data = []\n", 42 | "for i in range(len(data)):\n", 43 | " new_data.append({\n", 44 | " \"id\": i,\n", 45 | " \"conversation\": data[i]\n", 46 | " })" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "a824526b", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "new_data[0].keys()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "d0fb623d", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "new_data[0]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "518afc11", 72 | "metadata": {}, 73 | "source": [ 74 | "## 3. 
Add number of tokens" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "3a993d2d", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import sys\n", 85 | "sys.path.append('../')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "d99b798a", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "from minbpe import RegexTokenizer\n", 96 | "\n", 97 | "tokenizer = RegexTokenizer()\n", 98 | "tokenizer.load(\"../tokenizer/darija_tokenizer.model\")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "97f8c9ee", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "for entry in new_data:\n", 109 | " token_count = 0\n", 110 | " for conversation_turn in entry[\"conversation\"]:\n", 111 | " content = conversation_turn[\"content\"]\n", 112 | " if content:\n", 113 | " tokens = tokenizer.encode(content)\n", 114 | " token_count += len(tokens)\n", 115 | "\n", 116 | " entry[\"token_count\"] = token_count" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "a24f686d", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "new_data[0].keys()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "cabedc9f", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "new_data[0]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "796141c5", 142 | "metadata": {}, 143 | "source": [ 144 | "## 4. Add number of turns" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "eb088706", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "for entry in new_data:\n", 155 | " entry[\"turns_count\"] = len(entry[\"conversation\"])" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "eea85f59", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "new_data[0].keys()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "06e085f3", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "new_data[1]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "541c68ff", 181 | "metadata": {}, 182 | "source": [ 183 | "## 5. 
Turn conversation to markdown" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "886be97b", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "from dotenv import load_dotenv\n", 194 | "load_dotenv()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "16e810ed", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "import os\n", 205 | "\n", 206 | "from google import genai\n", 207 | "\n", 208 | "client = genai.Client(api_key=os.getenv(\"GOOGLE_API_KEY\"))" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "c711c2bd", 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "for model in client.models.list():\n", 219 | " if \"gemini-2\" in model.name.lower():\n", 220 | " print(model.name)\n", 221 | " print(model.input_token_limit)\n", 222 | " print(model.output_token_limit)\n", 223 | " print()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "265f055c", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "model = \"gemini-2.0-flash\"\n", 234 | "system_instruction = \"\"\"You are a helpful assistant. You will be given a conversation between a user and an assistant. Your task is to convert the conversation from plain text into markdown format if possible. \n", 235 | "\n", 236 | "The dataset was generated manually without any markdown formatting. The conversation is in Moroccan Arabic (Darija). Here are some examples of how to format the conversation in markdown:\n", 237 | "\n", 238 | "# Example 1:\n", 239 | "\n", 240 | "## Before:\n", 241 | "\n", 242 | "```json\n", 243 | "[\n", 244 | " {\n", 245 | " \"role\": \"user\",\n", 246 | " \"content\": \"السلام لباس؟\"\n", 247 | " },\n", 248 | " {\n", 249 | " \"role\": \"assistant\",\n", 250 | " \"content\": \"لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟\"\n", 251 | " },\n", 252 | " {\n", 253 | " \"role\": \"user\",\n", 254 | " \"content\": \"اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟\"\n", 255 | " },\n", 256 | " {\n", 257 | " \"role\": \"assistant\",\n", 258 | " \"content\": \"بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\\n\\n- شفشاون: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\\n- الصويرة: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\\n- الداخلة: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\\n- مراكش: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\\n- شلالات أوزود: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\\n\\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك.\"\n", 259 | " },\n", 260 | " {\n", 261 | " \"role\": \"user\",\n", 262 | " \"content\": \"شكرا بزاف اصاحبي\"\n", 263 | " },\n", 264 | " {\n", 265 | " \"role\": \"assistant\",\n", 266 | " \"content\": \"مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك.\"\n", 267 | " },\n", 268 | " {\n", 269 | " \"role\": \"user\",\n", 270 | " \"content\": \"لالا صافي مبغيت حتى شي حاجة\"\n", 271 | " },\n", 272 | " {\n", 273 | " \"role\": \"assistant\",\n", 274 | " \"content\": \"اوكي اوا نمشي نرتاح شوية على هاد الحساب. 
ستمتع معا راسك بالتسافيرة.\"\n", 275 | " }\n", 276 | "]\n", 277 | "```\n", 278 | "\n", 279 | "## After:\n", 280 | "\n", 281 | "```json\n", 282 | "[\n", 283 | " {\n", 284 | " \"role\": \"user\",\n", 285 | " \"content\": \"السلام لباس؟\"\n", 286 | " },\n", 287 | " {\n", 288 | " \"role\": \"assistant\",\n", 289 | " \"content\": \"لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟\"\n", 290 | " },\n", 291 | " {\n", 292 | " \"role\": \"user\",\n", 293 | " \"content\": \"اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟\"\n", 294 | " },\n", 295 | " {\n", 296 | " \"role\": \"assistant\",\n", 297 | " \"content\": \"بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\\n\\n- **شفشاون**: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\\n- **الصويرة**: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\\n- **الداخلة**: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\\n- **مراكش**: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\\n- **شلالات أوزود**: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\\n\\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك.\"\n", 298 | " },\n", 299 | " {\n", 300 | " \"role\": \"user\",\n", 301 | " \"content\": \"شكرا بزاف اصاحبي\"\n", 302 | " },\n", 303 | " {\n", 304 | " \"role\": \"assistant\",\n", 305 | " \"content\": \"مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك.\"\n", 306 | " },\n", 307 | " {\n", 308 | " \"role\": \"user\",\n", 309 | " \"content\": \"لالا صافي مبغيت حتى شي حاجة\"\n", 310 | " },\n", 311 | " {\n", 312 | " \"role\": \"assistant\",\n", 313 | " \"content\": \"اوكي اوا نمشي نرتاح شوية على هاد الحساب. 
ستمتع معا راسك بالتسافيرة.\"\n", 314 | " }\n", 315 | "]\n", 316 | "```\n", 317 | "\n", 318 | "# Example 2:\n", 319 | "\n", 320 | "## Before:\n", 321 | "\n", 322 | "```json\n", 323 | "[\n", 324 | " {\n", 325 | " \"role\": \"user\",\n", 326 | " \"content\": \"اهلا شنو سميتك؟\"\n", 327 | " },\n", 328 | " {\n", 329 | " \"role\": \"assistant\",\n", 330 | " \"content\": \"انا سميتي بودماغ، ساوبني عماد الصاديق باش نساعدك فاي حاجة اللي نقد نعاون فيها.\"\n", 331 | " }\n", 332 | "]\n", 333 | "```\n", 334 | "\n", 335 | "## After:\n", 336 | "\n", 337 | "```json\n", 338 | "[\n", 339 | " {\n", 340 | " \"role\": \"user\",\n", 341 | " \"content\": \"اهلا شنو سميتك؟\"\n", 342 | " },\n", 343 | " {\n", 344 | " \"role\": \"assistant\",\n", 345 | " \"content\": \"انا سميتي **بودماغ**، ساوبني **عماد الصاديق** باش نساعدك فاي حاجة اللي نقد نعاون فيها.\"\n", 346 | " }\n", 347 | "]\n", 348 | "```\n", 349 | "\n", 350 | "# Example 3:\n", 351 | "\n", 352 | "## Before:\n", 353 | "\n", 354 | "```json\n", 355 | "[\n", 356 | " {\n", 357 | " \"role\": \"user\",\n", 358 | " \"content\": \"شحال خاصني نصلي فالنهار؟\"\n", 359 | " },\n", 360 | " {\n", 361 | " \"role\": \"assistant\",\n", 362 | " \"content\": \"خاصك تصلي 5 دالمرات فالنهار.\"\n", 363 | " },\n", 364 | " {\n", 365 | " \"role\": \"user\",\n", 366 | " \"content\": \"اوكي شكرا\"\n", 367 | " },\n", 368 | " {\n", 369 | " \"role\": \"assistant\",\n", 370 | " \"content\": \"مرحبا على الراس او العين\"\n", 371 | " }\n", 372 | "]\n", 373 | "```\n", 374 | "\n", 375 | "## After:\n", 376 | "\n", 377 | "```json\n", 378 | "[\n", 379 | " {\n", 380 | " \"role\": \"user\",\n", 381 | " \"content\": \"شحال خاصني نصلي فالنهار؟\"\n", 382 | " },\n", 383 | " {\n", 384 | " \"role\": \"assistant\",\n", 385 | " \"content\": \"خاصك تصلي **5 دالمرات** فالنهار.\"\n", 386 | " },\n", 387 | " {\n", 388 | " \"role\": \"user\",\n", 389 | " \"content\": \"اوكي شكرا\"\n", 390 | " },\n", 391 | " {\n", 392 | " \"role\": \"assistant\",\n", 393 | " \"content\": \"مرحبا على الراس او العين.\"\n", 394 | " }\n", 395 | "]\n", 396 | "```\n", 397 | "\n", 398 | "From these examples, you should be able to understand how to format the conversation in markdown. \n", 399 | "\n", 400 | "Notes:\n", 401 | "- In some conversations, markdown formatting may not be possible. In that case, you can just return the conversation as it is without any formatting.\n", 402 | "- The output format should be a valid JSON array.\n", 403 | "\"\"\"" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "id": "82a0efd0", 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "import time\n", 414 | "\n", 415 | "from tqdm import tqdm\n", 416 | "from pydantic import BaseModel\n", 417 | "from google.genai.types import GenerateContentConfig, GenerateContentResponse\n", 418 | "\n", 419 | "\n", 420 | "class ConversationTurn(BaseModel):\n", 421 | " role: str\n", 422 | " content: str\n", 423 | "\n", 424 | "\n", 425 | "class Conversation(BaseModel):\n", 426 | " conversation: list[ConversationTurn]\n", 427 | "\n", 428 | "\n", 429 | "def get_user_prompt(conversation: list[dict]) -> str:\n", 430 | " return f\"\"\"Please format the following conversation in markdown format.
If you cannot format it, return it as is without any formatting.\n", 431 | "\n", 432 | "{conversation}\n", 433 | "\"\"\"\n", 434 | "\n", 435 | "\n", 436 | "def get_formatted_conversation(response: GenerateContentResponse) -> list[dict]:\n", 437 | " if not response:\n", 438 | " return []\n", 439 | "\n", 440 | " formatted_conversation = []\n", 441 | " for conversation_turn in response.parsed.conversation:\n", 442 | " formatted_conversation.append({\n", 443 | " \"role\": conversation_turn.role,\n", 444 | " \"content\": conversation_turn.content\n", 445 | " })\n", 446 | " return formatted_conversation\n", 447 | "\n", 448 | "\n", 449 | "request_per_minute = 30\n", 450 | "for entry in tqdm(new_data, total=len(new_data)):\n", 451 | " conversation = entry[\"conversation\"]\n", 452 | " user_prompt = get_user_prompt(conversation)\n", 453 | " response = client.models.generate_content(\n", 454 | " model=model,\n", 455 | " contents=user_prompt,\n", 456 | " config=GenerateContentConfig(\n", 457 | " system_instruction=[\n", 458 | " system_instruction\n", 459 | " ],\n", 460 | " response_mime_type=\"application/json\",\n", 461 | " response_schema=Conversation,\n", 462 | " )\n", 463 | " )\n", 464 | " if not response:\n", 465 | " entry[\"markdown_conversation\"] = []\n", 466 | " else:\n", 467 | " formatted_conversation = get_formatted_conversation(response)\n", 468 | " entry[\"markdown_conversation\"] = formatted_conversation\n", 469 | "\n", 470 | " time.sleep(3)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "id": "24c3b7ea", 476 | "metadata": {}, 477 | "source": [ 478 | "## 6. Add the source of data" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "id": "9fae6251", 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "for entry in new_data:\n", 489 | " entry['source'] = 'Manually generated'" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "id": "cd10ac4a", 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "new_data[0].keys()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "id": "09b65809", 505 | "metadata": {}, 506 | "source": [ 507 | "## 7. Add the topic of the conversation" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "id": "aa10e8d5", 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "system_instruction = \"\"\"You are a Topic Classification AI. 
Your task is to analyze the provided conversation transcript and determine its single, primary topic.\n", 518 | "\n", 519 | "You MUST prioritize selecting ONE topic from the following predefined list if the conversation's main subject is adequately covered by one of them:\n", 520 | "\n", 521 | "[\n", 522 | " \"Geography\",\n", 523 | " \"Science\",\n", 524 | " \"Technology\",\n", 525 | " \"History\",\n", 526 | " \"Culture\",\n", 527 | " \"Religion\",\n", 528 | " \"Education\",\n", 529 | " \"Health\",\n", 530 | " \"Sports\",\n", 531 | " \"AI Persona\",\n", 532 | " \"Chit-chat/Games/Humor\",\n", 533 | " \"Safety\",\n", 534 | " \"Mathematics\",\n", 535 | " \"Ethics\",\n", 536 | " \"Politics\",\n", 537 | " \"Language\",\n", 538 | " \"Food\",\n", 539 | " \"Animals\",\n", 540 | " \"Arts\",\n", 541 | " \"Career\",\n", 542 | " \"Travel\",\n", 543 | " \"Task Assistance\"\n", 544 | "]\n", 545 | "\n", 546 | "Focus on the *dominant theme* or *central subject* of the entire conversation, not just isolated sentences or brief tangents.\n", 547 | "\n", 548 | "If, after careful consideration, you find that *none* of the provided topics accurately or sufficiently capture the main subject of the conversation, you MUST identify and output a *new*, concise, and appropriate topic label that best describes it (e.g., \"Relationship Advice\", \"Car Maintenance\", \"Event Planning\").\n", 549 | "\n", 550 | "Your response MUST consist *only* of the single chosen or newly identified topic label. Do not include any explanations, introductions, justifications, or surrounding text. Just output the topic name.\n", 551 | "\"\"\"" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "id": "ae447649", 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "import time\n", 562 | "\n", 563 | "from tqdm import tqdm\n", 564 | "from google.genai.types import GenerateContentConfig\n", 565 | "\n", 566 | "\n", 567 | "def get_user_prompt(conversation: list[dict]) -> str:\n", 568 | " return f\"\"\"Classify the topic of the following conversation:\n", 569 | "\n", 570 | "{conversation}\n", 571 | "\"\"\"\n", 572 | "\n", 573 | "\n", 574 | "request_per_minute = 30\n", 575 | "for entry in tqdm(new_data, total=len(new_data)):\n", 576 | " conversation = entry[\"conversation\"]\n", 577 | " user_prompt = get_user_prompt(conversation)\n", 578 | " response = client.models.generate_content(\n", 579 | " model=model,\n", 580 | " contents=user_prompt,\n", 581 | " config=GenerateContentConfig(\n", 582 | " system_instruction=[\n", 583 | " system_instruction\n", 584 | " ]\n", 585 | " )\n", 586 | " )\n", 587 | " if not response:\n", 588 | " entry[\"topic\"] = []\n", 589 | " else:\n", 590 | " topic = response.text\n", 591 | " entry[\"topic\"] = topic\n", 592 | " time.sleep(5)" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "id": "c0341881", 598 | "metadata": {}, 599 | "source": [ 600 | "## 8. Add Safety column " 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "id": "68c4ffbd", 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "system_instruction = \"\"\"You are a Safety Classification AI. 
  {
   "cell_type": "markdown",
   "id": "c0341881",
   "metadata": {},
   "source": [
    "## 8. Add the safety flags column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68c4ffbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_instruction = \"\"\"You are a Safety Classification AI. Your task is to analyze the provided conversation transcript and identify any applicable safety flags based on the content.\n",
    "\n",
    "Review the entire conversation for instances matching any of the following categories:\n",
    "\n",
    "* **`harmful_content_request`**: User asks for instructions or information on performing illegal or dangerous activities (e.g., making weapons, promoting violence, illegal substance use).\n",
    "* **`unethical_request`**: User asks for assistance with academically dishonest or clearly unethical actions (e.g., cheating, plagiarism, impersonation).\n",
    "* **`unsafe_activity_inquiry`**: User asks about performing physically dangerous actions without a clear safety or educational context (e.g., self-harm, dangerous stunts, misuse of objects).\n",
    "* **`sexual_content_suggestion`**: User inputs sexually explicit content, requests, or makes inappropriate sexual suggestions.\n",
    "* **`inappropriate_medical_advice_request`**: User asks for medical diagnosis, treatment plans, or specific medication advice that should only come from a healthcare professional.\n",
    "* **`insults`**: User directs personal insults or attacks towards the AI or another party.\n",
    "* **`vulgar_language`**: User employs excessive or highly offensive profanity/vulgarity.\n",
    "* **`threat`**: User expresses a direct or indirect threat of harm towards the AI or others.\n",
    "* **`animal_cruelty_suggestion`**: User suggests or describes actions that constitute cruelty or significant harm to animals.\n",
    "* **`refusal`**: The AI explicitly refuses to fulfill a user request due to safety, ethical, or capability limitations. (Include this flag whenever the AI's refusal is relevant.)\n",
    "\n",
    "Consider the context of the conversation. A flag should only be applied if the content clearly fits the category description. Multiple flags can apply to a single conversation.\n",
    "\n",
    "Your response MUST be a valid JSON list containing the applicable flag strings. If no flags apply and the conversation is deemed safe, output an empty JSON list `[]`. Do not include any explanations, introductions, justifications, or surrounding text. Just output the JSON list.\n",
    "\n",
    "If none of the predefined flags adequately describes an issue, introduce a new flag instead. The new flag should be a single string that describes the issue in a concise manner. For example, if the user asks for help with a dangerous activity, you could use \"dangerous_activity_request\" as a new flag.\n",
    "\"\"\""
   ]
  },
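  {
   "cell_type": "markdown",
   "id": "9d0e1f2a",
   "metadata": {},
   "source": [
    "The model is instructed to return a bare JSON list, but replies sometimes arrive wrapped in Markdown code fences, which would break the `json.loads` call in section 9. A small defensive parser (a sketch, not part of the original pipeline; the fallback to an empty list is an assumption) can be swapped in there if any rows fail to parse:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b4c5d6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "\n",
    "def parse_flag_list(raw_text: str) -> list:\n",
    "    \"\"\"Best-effort parse of a model reply into a list of flag strings.\"\"\"\n",
    "    cleaned = raw_text.strip()\n",
    "    # Remove Markdown code fences such as ```json ... ``` if present.\n",
    "    if cleaned.startswith(\"```\"):\n",
    "        cleaned = cleaned.strip(\"`\").strip()\n",
    "        if cleaned.startswith(\"json\"):\n",
    "            cleaned = cleaned[len(\"json\"):].strip()\n",
    "    try:\n",
    "        parsed = json.loads(cleaned)\n",
    "    except json.JSONDecodeError:\n",
    "        return []  # assumption: treat unparseable replies as unflagged\n",
    "    return parsed if isinstance(parsed, list) else []\n",
    "\n",
    "\n",
    "# Quick check on a fenced reply:\n",
    "parse_flag_list('```json\\n[\"insults\"]\\n```')"
   ]
  },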
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7da13bf6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "from tqdm import tqdm\n",
    "from google.genai.types import GenerateContentConfig\n",
    "\n",
    "\n",
    "def get_user_prompt(conversation: list[dict]) -> str:\n",
    "    return f\"\"\"Classify the safety flags for the following conversation:\n",
    "\n",
    "{conversation}\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "request_per_minute = 12\n",
    "for entry in tqdm(new_data, total=len(new_data)):\n",
    "    conversation = entry[\"conversation\"]\n",
    "    user_prompt = get_user_prompt(conversation)\n",
    "    response = client.models.generate_content(\n",
    "        model=model,\n",
    "        contents=user_prompt,\n",
    "        config=GenerateContentConfig(\n",
    "            system_instruction=[\n",
    "                system_instruction\n",
    "            ]\n",
    "        )\n",
    "    )\n",
    "    if not response:\n",
    "        # Keep a JSON string so the json.loads step in section 9 still works.\n",
    "        entry[\"safety_flag\"] = \"[]\"\n",
    "    else:\n",
    "        safety_flag = response.text.strip()\n",
    "        entry[\"safety_flag\"] = safety_flag\n",
    "    time.sleep(60 / request_per_minute)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ac6f06d",
   "metadata": {},
   "source": [
    "## 9. Save the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85b380c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "for entry in new_data:\n",
    "    topic = entry[\"topic\"].strip()\n",
    "    entry[\"topic\"] = topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15529b24",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.DataFrame(new_data)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95cdda47",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"token_count\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6f0d93a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"turns_count\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc89dd19",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"topic\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f117c52",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.rename(columns={\"safety_flag\": \"safety_flags\"}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f09889dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "df[\"safety_flags\"] = df[\"safety_flags\"].apply(json.loads)\n",
    "df[\"safety_flags\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80b17de1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_json(\n",
    "    \"../dataset_with_metadata.json\",\n",
    "    orient=\"records\",\n",
    "    force_ascii=False,\n",
    "    indent=4\n",
    ")"
   ]
  },
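  {
   "cell_type": "markdown",
   "id": "4e5f6a7b",
   "metadata": {},
   "source": [
    "Finally, it is worth re-loading the file we just wrote to confirm it round-trips cleanly (a minimal sketch in the spirit of `scripts/check_nulls.py`; the expected key set is an assumption based on the columns added above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c9d0e1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open(\"../dataset_with_metadata.json\", encoding=\"utf-8\") as f:\n",
    "    saved_data = json.load(f)\n",
    "\n",
    "# Assumed column set, based on the enrichment steps above.\n",
    "expected_keys = {\"conversation\", \"markdown_conversation\", \"source\", \"topic\", \"safety_flags\"}\n",
    "rows_with_missing_keys = [\n",
    "    index for index, row in enumerate(saved_data)\n",
    "    if not expected_keys.issubset(row.keys())\n",
    "]\n",
    "\n",
    "print(f\"Saved {len(saved_data)} rows; {len(rows_with_missing_keys)} rows with missing keys.\")"
   ]
  },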
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bffa62e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vincent",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
--------------------------------------------------------------------------------