├── input_dataset.json ├── .env.example ├── requirements.txt ├── scripts │ ├── models.py │ ├── push_data_to_huggingface.py │ ├── check_nulls.py │ └── enrich_dataset.py │ └── system_prompts.py ├── .pre-commit-config.yaml ├── .github │ └── workflows │ └── push_to_hf.yml ├── notebooks │ ├── turns_count.ipynb │ ├── number_of_tokens.ipynb │ ├── time_spent.ipynb │ └── convert_dataset_to_new_format.ipynb ├── .gitignore └── README.md /input_dataset.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | HF_TOKEN='' 2 | GOOGLE_API_KEY='' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==3.3.2 2 | huggingface-hub==0.26.2 3 | python-dotenv==1.0.1 4 | pre_commit==4.2.0 5 | google-genai==1.5.0 6 | pandas 7 | tqdm 8 | pydantic -------------------------------------------------------------------------------- /scripts/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ConversationTurn(BaseModel): 5 | role: str 6 | content: str 7 | 8 | 9 | class Conversation(BaseModel): 10 | conversation: list[ConversationTurn] 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.6.0 4 | hooks: 5 | - id: check-json 6 | files: ^dataset\.json$ 7 | - repo: local 8 | hooks: 9 | - id: check-nulls 10 | name: Check for null values in dataset.json 11 | entry: scripts/check_nulls.py 12 | language: script 13 | files: ^dataset\.json$ 14 | types: [file] 15 | -------------------------------------------------------------------------------- /scripts/push_data_to_huggingface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from datasets import Dataset 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | with open("dataset.json", "r", encoding="utf-8") as f: 10 | raw_data = json.load(f) 11 | 12 | dataset = Dataset.from_list(raw_data) 13 | hf_token = os.environ.get("HF_TOKEN") 14 | dataset.push_to_hub( 15 | "ImadSaddik/BoDmaghDataset", 16 | token=hf_token, 17 | commit_message="Update dataset from GitHub repository" 18 | ) 19 | -------------------------------------------------------------------------------- /.github/workflows/push_to_hf.yml: -------------------------------------------------------------------------------- 1 | name: Validate JSON and push data to HuggingFace 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | validate-json: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out code 12 | uses: actions/checkout@v4 13 | 14 | - name: Validate dataset.json 15 |
run: jq . dataset.json > /dev/null 16 | 17 | push_to_hf: 18 | runs-on: ubuntu-latest 19 | needs: validate-json 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v4 23 | 24 | - name: Setup Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: '3.12' 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -r requirements.txt 33 | 34 | - name: Create and push dataset 35 | env: 36 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 37 | run: python scripts/push_data_to_huggingface.py 38 | -------------------------------------------------------------------------------- /scripts/check_nulls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import pandas as pd 4 | import os 5 | 6 | 7 | def check_for_nulls(file_path: str) -> bool: 8 | if not os.path.exists(file_path): 9 | print(f"Error: File {file_path} does not exist", file=sys.stderr) 10 | return True 11 | try: 12 | df = pd.read_json(file_path) 13 | return df.isnull().sum().sum() > 0 14 | except Exception as e: 15 | print(f"Error reading or processing {file_path}: {e}", file=sys.stderr) 16 | return True 17 | 18 | 19 | if __name__ == "__main__": 20 | script_directory = os.path.dirname(os.path.abspath(__file__)) 21 | dataset_path = os.path.join(script_directory, '..', 'dataset.json') 22 | 23 | exit_code = 0 24 | if check_for_nulls(dataset_path): 25 | print( 26 | f"Error: Found null values or error processing {dataset_path}", file=sys.stderr) 27 | exit_code = 1 28 | else: 29 | print(f"No null values found in {dataset_path}.") 30 | 31 | sys.exit(exit_code) 32 | -------------------------------------------------------------------------------- /notebooks/turns_count.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "1104" 12 | ] 13 | }, 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import json\n", 21 | "\n", 22 | "with open(\"../dataset.json\") as f:\n", 23 | " data = json.load(f)\n", 24 | "\n", 25 | "count = 0\n", 26 | "for row in data:\n", 27 | " count += len(row[\"conversation\"])\n", 28 | "\n", 29 | "count" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "vincent", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.11.10" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 2 54 | } 55 | -------------------------------------------------------------------------------- /notebooks/number_of_tokens.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append('../')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from minbpe import RegexTokenizer\n", 20 | "\n", 21 | "tokenizer = RegexTokenizer()\n", 22 |
"tokenizer.load(\"../tokenizer/darija_tokenizer.model\")" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "27416" 34 | ] 35 | }, 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "import json\n", 43 | "\n", 44 | "with open(\"../dataset.json\") as f:\n", 45 | " data = json.load(f)\n", 46 | "\n", 47 | "token_count = 0\n", 48 | "for row in data:\n", 49 | " for conversation_turn in row[\"conversation\"]:\n", 50 | " content = conversation_turn[\"content\"]\n", 51 | " if content:\n", 52 | " tokens = tokenizer.encode(content)\n", 53 | " token_count += len(tokens)\n", 54 | "\n", 55 | "token_count" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "136.39800995024876" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "import json\n", 76 | "\n", 77 | "with open(\"../dataset.json\") as f:\n", 78 | " data = json.load(f)\n", 79 | "\n", 80 | "average_token_count = 0\n", 81 | "for row in data:\n", 82 | " for conversation_turn in row[\"conversation\"]:\n", 83 | " content = conversation_turn[\"content\"]\n", 84 | " if content:\n", 85 | " tokens = tokenizer.encode(content)\n", 86 | " average_token_count += len(tokens)\n", 87 | "\n", 88 | "average_token_count /= len(data)\n", 89 | "average_token_count" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "vincent", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.11.10" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 2 121 | } 122 | -------------------------------------------------------------------------------- /notebooks/time_spent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Total time: 19:10\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "times = [\n", 18 | " {\"date\": \"Mar 23\", \"hours\": 0, \"minutes\": 30},\n", 19 | " {\"date\": \"Mar 24\", \"hours\": 1, \"minutes\": 30},\n", 20 | " {\"date\": \"Mar 25\", \"hours\": 1, \"minutes\": 1},\n", 21 | " {\"date\": \"Mar 26\", \"hours\": 1, \"minutes\": 5},\n", 22 | " {\"date\": \"Mar 27\", \"hours\": 0, \"minutes\": 49},\n", 23 | " {\"date\": \"Mar 28\", \"hours\": 0, \"minutes\": 53},\n", 24 | " {\"date\": \"Mar 31\", \"hours\": 0, \"minutes\": 35},\n", 25 | " {\"date\": \"Apr 1\", \"hours\": 0, \"minutes\": 30},\n", 26 | " {\"date\": \"Apr 2\", \"hours\": 0, \"minutes\": 20},\n", 27 | " {\"date\": \"Apr 3\", \"hours\": 0, \"minutes\": 26},\n", 28 | " {\"date\": \"Apr 4\", \"hours\": 0, \"minutes\": 25},\n", 29 | " {\"date\": \"Apr 7\", \"hours\": 0, \"minutes\": 20},\n", 30 | " {\"date\": \"Apr 8\", \"hours\": 0, 
\"minutes\": 20},\n", 31 | " {\"date\": \"Apr 9\", \"hours\": 0, \"minutes\": 20},\n", 32 | " {\"date\": \"Apr 10\", \"hours\": 0, \"minutes\": 20},\n", 33 | " {\"date\": \"Apr 12\", \"hours\": 0, \"minutes\": 50},\n", 34 | " {\"date\": \"Apr 14\", \"hours\": 0, \"minutes\": 26},\n", 35 | " {\"date\": \"Apr 15\", \"hours\": 0, \"minutes\": 20},\n", 36 | " {\"date\": \"Apr 16\", \"hours\": 0, \"minutes\": 20},\n", 37 | " {\"date\": \"Apr 17\", \"hours\": 0, \"minutes\": 25},\n", 38 | " {\"date\": \"Apr 18\", \"hours\": 0, \"minutes\": 23},\n", 39 | " {\"date\": \"Apr 20\", \"hours\": 0, \"minutes\": 22},\n", 40 | " {\"date\": \"Apr 21\", \"hours\": 0, \"minutes\": 30},\n", 41 | " {\"date\": \"Apr 22\", \"hours\": 0, \"minutes\": 10},\n", 42 | " {\"date\": \"Apr 24\", \"hours\": 0, \"minutes\": 25},\n", 43 | " {\"date\": \"Apr 25\", \"hours\": 0, \"minutes\": 20},\n", 44 | " {\"date\": \"Apr 26\", \"hours\": 0, \"minutes\": 45},\n", 45 | " {\"date\": \"Apr 27\", \"hours\": 0, \"minutes\": 30},\n", 46 | " {\"date\": \"Apr 28\", \"hours\": 0, \"minutes\": 30},\n", 47 | " {\"date\": \"Apr 30\", \"hours\": 0, \"minutes\": 45},\n", 48 | " {\"date\": \"May 1\", \"hours\": 0, \"minutes\": 45},\n", 49 | " {\"date\": \"May 2\", \"hours\": 0, \"minutes\": 45},\n", 50 | " {\"date\": \"May 3\", \"hours\": 0, \"minutes\": 30},\n", 51 | " {\"date\": \"May 4\", \"hours\": 0, \"minutes\": 45},\n", 52 | "]\n", 53 | "\n", 54 | "total_hours = 0\n", 55 | "total_minutes = 0\n", 56 | "\n", 57 | "for time in times:\n", 58 | " total_hours += time[\"hours\"]\n", 59 | " total_minutes += time[\"minutes\"]\n", 60 | "\n", 61 | "if total_minutes >= 60:\n", 62 | " extra_hours = total_minutes // 60\n", 63 | " total_hours += extra_hours\n", 64 | " total_minutes = total_minutes % 60\n", 65 | "\n", 66 | "print(f\"Total time: {total_hours:02d}:{total_minutes:02d}\")" 67 | ] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "vincent", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.11.10" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 2 91 | } 92 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 | 176 | minbpe/ 177 | output/ 178 | .env -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BoDmagh dataset 🧠 2 | 3 | The BoDmagh dataset is a Supervised Fine-Tuning (SFT) dataset for the Darija language. I created it manually, ensuring high quality. The dataset is in JSON format and includes conversations between a user and an assistant. 4 | 5 | You can find the dataset on Hugging Face: [here](https://huggingface.co/datasets/ImadSaddik/BoDmaghDataset) 6 | 7 | I update the dataset daily, so make sure to check the repository regularly. 8 | 9 | ## Time spent 10 | 11 | Creating this dataset has been a labor of love. I’ve dedicated approximately **19 hours and 10 minutes** so far, manually curating each entry to ensure high quality. I continue to add new entries daily, so this number will keep growing. 12 | 13 | ## Content 14 | 15 | ### 1. Structure 16 | 17 | The dataset is a JSON file containing a list of objects. Each object represents a single conversation entry and includes several metadata fields along with the conversation itself. 18 | 19 | Here are the main fields within each conversation object: 20 | 21 | * **`id`**: A unique integer identifier for the conversation. 22 | * **`conversation`**: A list of message objects representing the turns in the conversation. Each message object has: 23 | * `role`: Either `user` or `assistant`. 24 | * `content`: The text content of the message turn (plain text). 25 | * **`token_count`**: The total number of tokens in the `conversation` field, calculated using the [darija tokenizer](https://github.com/ImadSaddik/DarijaTokenizers). 26 | * **`turns_count`**: The total number of turns (messages) in the `conversation`. 27 | * **`markdown_conversation`**: A list of message objects similar to `conversation`, but the `content` may include Markdown formatting (e.g., bolding, lists) where applicable. 28 | * **`source`**: Indicates how the conversation was generated (e.g., "Manually generated"). 29 | * **`topic`**: A label indicating the primary topic of the conversation (e.g., "Science", "Travel", "Chit-chat/Games/Humor"). 30 | * **`safety_flags`**: A list of strings indicating any safety concerns identified in the conversation (e.g., `["harmful_content_request", "refusal"]`). An empty list `[]` means the conversation is considered safe. 31 | 32 | *Note: The `system` message is not included, but you can add it if needed for your specific fine-tuning format.* 33 | 34 | Here is an example of a single conversation object: 35 | 36 | ```json 37 | { 38 | "id": 74, 39 | "conversation": [ 40 | { 41 | "role": "user", 42 | "content": "شحال بعيدة علينا الشمس؟" 43 | }, 44 | { 45 | "role": "assistant", 46 | "content": "الشمس بعيدة علينا ب 150 مليون كيلومتر تقريبا." 47 | }, 48 | { 49 | "role": "user", 50 | "content": "ويلي ويلي شحال بعيداااا." 51 | }, 52 | { 53 | "role": "assistant", 54 | "content": "واييه راه هاد الكون كبير بزاف صعيب الواحد يتخيل الكبورية ديالو." 55 | }, 56 | { 57 | "role": "user", 58 | "content": "شناهية سنة ضوئية؟" 59 | }, 60 | { 61 | "role": "assistant", 62 | "content": "هي المسافة اللي الضو كيضربها فعام." 
63 | }, 64 | { 65 | "role": "user", 66 | "content": "شحال فيها من كيلومتر؟" 67 | }, 68 | { 69 | "role": "assistant", 70 | "content": "سنة ضوئية فيها 9.461 تريليون كيلومتر." 71 | }, 72 | { 73 | "role": "user", 74 | "content": "والله اتا بزاف هاد الكون." 75 | }, 76 | { 77 | "role": "assistant", 78 | "content": "الى عندك شي سؤال خور غولهالي." 79 | } 80 | ], 81 | "token_count": 97, 82 | "turns_count": 10, 83 | "markdown_conversation": [ 84 | { 85 | "role": "user", 86 | "content": "شحال بعيدة علينا الشمس؟" 87 | }, 88 | { 89 | "role": "assistant", 90 | "content": "الشمس بعيدة علينا ب **150 مليون كيلومتر** تقريبا." 91 | }, 92 | { 93 | "role": "user", 94 | "content": "ويلي ويلي شحال بعيداااا." 95 | }, 96 | { 97 | "role": "assistant", 98 | "content": "واييه راه هاد الكون كبير بزاف صعيب الواحد يتخيل الكبورية ديالو." 99 | }, 100 | { 101 | "role": "user", 102 | "content": "شناهية سنة ضوئية؟" 103 | }, 104 | { 105 | "role": "assistant", 106 | "content": "هي المسافة اللي الضو كيضربها فعام." 107 | }, 108 | { 109 | "role": "user", 110 | "content": "شحال فيها من كيلومتر؟" 111 | }, 112 | { 113 | "role": "assistant", 114 | "content": "سنة ضوئية فيها **9.461 تريليون كيلومتر**." 115 | }, 116 | { 117 | "role": "user", 118 | "content": "والله اتا بزاف هاد الكون." 119 | }, 120 | { 121 | "role": "assistant", 122 | "content": "الى عندك شي سؤال خور غولهالي." 123 | } 124 | ], 125 | "source": "Manually generated", 126 | "topic": "Science", 127 | "safety_flags": [] 128 | } 129 | ``` 130 | 131 | ### 2. Assistant's name 132 | 133 | Sometimes the assistant's name is included. The assistant's name is `بودماغ`. If you find this word, you can replace it with any other name. Here's an example: 134 | 135 | ```json 136 | [ 137 | { 138 | "role": "user", 139 | "content": "أهلا شنو سميتك؟" 140 | }, 141 | { 142 | "role": "assistant", 143 | "content": "انا سميتي بودماغ، ساوبني عماد الصاديق باش نساعدك فاي حاجة اللي نقد نعاون فيها." 144 | }, 145 | { 146 | "role": "user", 147 | "content": "هههه سمية واعرة عجباتني" 148 | }, 149 | { 150 | "role": "assistant", 151 | "content": "هههه واييه راه عندي دماغ كبير غا باش نقد نعاونك فاي حاجة الى كنت نعرف ليها." 152 | }, 153 | { 154 | "role": "user", 155 | "content": "شناهيا العاصمة دالمغرب؟" 156 | }, 157 | { 158 | "role": "assistant", 159 | "content": "العاصمة دالمغرب هي الرباط." 160 | } 161 | ] 162 | ``` 163 | 164 | ### 3. Special tokens 165 | 166 | This dataset doesn't use any special tokens. Since everyone uses different tokens, feel free to add them as needed. 167 | 168 | ## Contributing 169 | 170 | As I mentioned, I'm manually creating this dataset to ensure high quality. If you'd like to help, follow these steps: 171 | 172 | 1. Fork the repository. 173 | 2. Create a new branch. 174 | 3. Add your new conversations to the `input_dataset.json` file. Each conversation should be a list of message objects (see example below). 175 | 4. Run the `enrich_dataset.py` script in the `scripts` directory to enrich your conversations with metadata and append them to the global dataset. 176 | 5. Create a pull request. 177 | 178 | ### Example: Adding to `input_dataset.json` 179 | 180 | The `input_dataset.json` file should contain a list of conversations. Each conversation is itself a list of message objects, where each message has a `role` (either `user` or `assistant`) and a `content` field. 
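If you want to sanity-check your additions before opening a pull request, a snippet along these lines does the job (a hypothetical helper shown for illustration, not one of the repository's scripts):

```python
import json

# Confirm that input_dataset.json is a list of conversations, where each
# conversation is a list of {"role", "content"} message objects.
with open("input_dataset.json", encoding="utf-8") as f:
    conversations = json.load(f)

assert isinstance(conversations, list), "top level must be a list"
for conversation in conversations:
    assert isinstance(conversation, list), "each conversation must be a list"
    for message in conversation:
        assert message["role"] in {"user", "assistant"}, "unexpected role"
        assert isinstance(message["content"], str) and message["content"], "bad content"
print(f"OK: {len(conversations)} conversation(s) look well-formed.")
```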
For example: 181 | 182 | ```json 183 | [ 184 | [ 185 | {"role": "user", "content": "أهلا شنو سميتك؟"}, 186 | {"role": "assistant", "content": "انا سميتي بودماغ، ساوبني عماد الصاديق باش نساعدك فاي حاجة اللي نقد نعاون فيها."} 187 | ], 188 | [ 189 | {"role": "user", "content": "شنو تحب تشرب؟"}, 190 | {"role": "assistant", "content": "نحب نشرب قهوة، ونتمنى انك زادة تحبها."} 191 | ] 192 | ] 193 | ``` 194 | 195 | After adding your conversations, run: 196 | 197 | ```bash 198 | python scripts/enrich_dataset.py 199 | ``` 200 | 201 | This will process your new conversations and add them (with metadata) to the main dataset. 202 | 203 | ## Wanna talk? 204 | 205 | You can contact me through: 206 | 207 | * **Discord:** Username: imad_saddik 208 | * **LinkedIn:** [Connect with me](https://www.linkedin.com/in/imadsaddik/). 209 | * **Email:** [simad3647@gmail.com](mailto:simad3647@gmail.com). 210 | -------------------------------------------------------------------------------- /scripts/enrich_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import time 5 | import logging 6 | 7 | from tqdm import tqdm 8 | from dotenv import load_dotenv 9 | from models import Conversation 10 | 11 | from google import genai 12 | from google.genai.client import Client 13 | from google.genai.types import GenerateContentConfig, GenerateContentResponse 14 | 15 | from system_prompts import ( 16 | get_system_prompt_for_topic_classification, 17 | get_system_prompt_for_markdown_task, 18 | get_system_prompt_for_safety_flags_classification 19 | ) 20 | 21 | 22 | logging.basicConfig( 23 | level=logging.INFO, 24 | format='%(asctime)s - %(levelname)s - %(message)s' 25 | ) 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | def enrich_dataset(dataset_to_enrich: list, client: Client) -> list: 30 | logger.info("Starting dataset enrichment process.") 31 | 32 | logger.info("Step 1: Adding conversation field and ID.") 33 | enriched_dataset = _add_conversation_field_and_id(dataset_to_enrich) 34 | 35 | logger.info("Step 2: Adding token count.") 36 | _add_token_count(enriched_dataset) 37 | 38 | logger.info("Step 3: Adding number of turns.") 39 | _add_number_of_turns(enriched_dataset) 40 | 41 | logger.info("Step 4: Adding conversation in markdown format.") 42 | _add_conversation_in_markdown_format(enriched_dataset, client) 43 | 44 | logger.info("Step 5: Adding data source.") 45 | _add_data_source(enriched_dataset) 46 | 47 | logger.info("Step 6: Adding conversation topic.") 48 | _add_conversation_topic(enriched_dataset, client) 49 | 50 | logger.info("Step 7: Adding safety flags.") 51 | _add_safety_flags(enriched_dataset, client) 52 | 53 | logger.info("Dataset enrichment process finished.") 54 | return enriched_dataset 55 | 56 | 57 | def _load_dataset_to_enrich() -> list: 58 | with open("../input_dataset.json") as f: 59 | data = json.load(f) 60 | 61 | return data 62 | 63 | 64 | def _add_conversation_field_and_id(data: list) -> list: 65 | new_data = [] 66 | for i, entry in enumerate(data): 67 | new_data.append({ 68 | "id": i, 69 | "conversation": entry 70 | }) 71 | return new_data 72 | 73 | 74 | def _add_token_count(data: list) -> None: 75 | tokenizer = _load_tokenizer() 76 | 77 | for entry in data: 78 | token_count = 0 79 | for conversation_turn in entry["conversation"]: 80 | content = conversation_turn["content"] 81 | if content: 82 | tokens = tokenizer.encode(content) 83 | token_count += len(tokens) 84 | 85 | entry["token_count"] = token_count 86 | 87 
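# NOTE: _load_tokenizer() below assumes two things about the local checkout:
# the `minbpe` package (Andrej Karpathy's BPE tokenizer; the `minbpe/` folder
# is gitignored, so it must be present locally) has to be importable from the
# repository root, and the trained model file has to exist at
# ../tokenizer/darija_tokenizer.model. This is the same Darija tokenizer the
# README links for computing the `token_count` field.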
| 88 | def _load_tokenizer() -> object: 89 | from minbpe import RegexTokenizer 90 | 91 | tokenizer = RegexTokenizer() 92 | tokenizer.load("../tokenizer/darija_tokenizer.model") 93 | return tokenizer 94 | 95 | 96 | def _add_number_of_turns(data: list) -> None: 97 | for entry in data: 98 | number_of_turns = len(entry["conversation"]) 99 | entry["turns_count"] = number_of_turns 100 | 101 | 102 | def _add_conversation_in_markdown_format(data: list, client: Client) -> None: 103 | def _get_user_prompt(conversation: list[dict]) -> str: 104 | return f"""Please format the following conversation in markdown format. If you cannot format it, return it as is without any formatting. 105 | 106 | {conversation} 107 | """ 108 | 109 | def _get_formatted_conversation(response: GenerateContentResponse) -> list[dict]: 110 | if not response or not response.parsed: 111 | return [] 112 | 113 | formatted_conversation = [] 114 | for conversation_turn in response.parsed.conversation: 115 | formatted_conversation.append({ 116 | "role": conversation_turn.role, 117 | "content": conversation_turn.content 118 | }) 119 | return formatted_conversation 120 | 121 | for entry in tqdm(data, total=len(data), desc="Formatting conversations in markdown"): 122 | conversation = entry["conversation"] 123 | user_prompt = _get_user_prompt(conversation) 124 | response = client.models.generate_content( 125 | model="gemini-2.0-flash", 126 | contents=user_prompt, 127 | config=GenerateContentConfig( 128 | system_instruction=[ 129 | get_system_prompt_for_markdown_task() 130 | ], 131 | response_mime_type="application/json", 132 | response_schema=Conversation, 133 | ) 134 | ) 135 | if not response: 136 | entry["markdown_conversation"] = [] 137 | else: 138 | formatted_conversation = _get_formatted_conversation(response) 139 | entry["markdown_conversation"] = formatted_conversation 140 | 141 | time.sleep(5) 142 | 143 | 144 | def _add_data_source(data: list) -> None: 145 | for entry in data: 146 | entry["source"] = "Manually generated" 147 | 148 | 149 | def _add_conversation_topic(data: list, client: Client) -> None: 150 | def _get_user_prompt(conversation: list[dict]) -> str: 151 | return f"""Classify the topic of the following conversation: 152 | 153 | {conversation} 154 | """ 155 | 156 | for entry in tqdm(data, total=len(data), desc="Classifying conversation topics"): 157 | conversation = entry["conversation"] 158 | user_prompt = _get_user_prompt(conversation) 159 | response = client.models.generate_content( 160 | model="gemini-2.0-flash", 161 | contents=user_prompt, 162 | config=GenerateContentConfig( 163 | system_instruction=[ 164 | get_system_prompt_for_topic_classification() 165 | ] 166 | ) 167 | ) 168 | if not response: 169 | entry["topic"] = "" 170 | else: 171 | topic = response.text.strip() 172 | entry["topic"] = topic 173 | 174 | time.sleep(5) 175 | 176 | 177 | def _add_safety_flags(data: list, client: Client) -> None: 178 | def _get_user_prompt(conversation: list[dict]) -> str: 179 | return f"""Classify the safety flags for the following conversation: 180 | 181 | {conversation} 182 | """ 183 | 184 | for entry in tqdm(data, total=len(data), desc="Classifying safety flags"): 185 | conversation = entry["conversation"] 186 | user_prompt = _get_user_prompt(conversation) 187 | response = client.models.generate_content( 188 | model="gemini-2.0-flash", 189 | contents=user_prompt, 190 | config=GenerateContentConfig( 191 | system_instruction=[ 192 | get_system_prompt_for_safety_flags_classification() 193 | ] 194 | ) 195 | ) 196 | if not response:
entry["safety_flags"] = [] 198 | else: 199 | safety_flags = response.text.strip() 200 | entry["safety_flags"] = json.loads(safety_flags) 201 | 202 | time.sleep(5) 203 | 204 | 205 | def append_to_the_global_dataset(dataset_to_add: list) -> None: 206 | logger.info("Appending enriched dataset to the global dataset.") 207 | with open("../dataset.json") as f: 208 | global_dataset = json.load(f) 209 | last_id = global_dataset[-1]["id"] 210 | 211 | for i, entry in enumerate(dataset_to_add, start=last_id + 1): 212 | entry["id"] = i 213 | global_dataset.append(entry) 214 | 215 | with open("../dataset.json", "w") as f: 216 | json.dump(global_dataset, f, indent=2, ensure_ascii=False) 217 | 218 | 219 | def clear_the_dataset_to_enrich_file() -> None: 220 | with open("../input_dataset.json", "w") as f: 221 | json.dump([], f, indent=2, ensure_ascii=False) 222 | 223 | 224 | if __name__ == "__main__": 225 | sys.path.append("..") 226 | load_dotenv() 227 | 228 | dataset_to_enrich = _load_dataset_to_enrich() 229 | client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) 230 | enriched_dataset = enrich_dataset(dataset_to_enrich, client) 231 | append_to_the_global_dataset(enriched_dataset) 232 | clear_the_dataset_to_enrich_file() 233 | logger.info( 234 | "Successfully enriched the dataset and appended it to the global dataset.") 235 | -------------------------------------------------------------------------------- /scripts/system_prompts.py: -------------------------------------------------------------------------------- 1 | def get_system_prompt_for_markdown_task() -> str: 2 | return """You are a helpful assistant. You will be given a conversation between a user and an assistant. Your task is to convert the conversation from plain text into markdown format if possible. 3 | 4 | The dataset was generated manually without any markdown formatting. The conversation is in Moroccan Arabic (Darija). Here are some examples of how to format the conversation in markdown: 5 | 6 | # Example 1: 7 | 8 | ## Before: 9 | 10 | ```json 11 | [ 12 | { 13 | "role": "user", 14 | "content": "السلام لباس؟" 15 | }, 16 | { 17 | "role": "assistant", 18 | "content": "لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟" 19 | }, 20 | { 21 | "role": "user", 22 | "content": "اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟" 23 | }, 24 | { 25 | "role": "assistant", 26 | "content": "بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\n\n- شفشاون: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\n- الصويرة: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\n- الداخلة: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\n- مراكش: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\n- شلالات أوزود: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\n\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك." 27 | }, 28 | { 29 | "role": "user", 30 | "content": "شكرا بزاف اصاحبي" 31 | }, 32 | { 33 | "role": "assistant", 34 | "content": "مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك." 35 | }, 36 | { 37 | "role": "user", 38 | "content": "لالا صافي مبغيت حتى شي حاجة" 39 | }, 40 | { 41 | "role": "assistant", 42 | "content": "اوكي اوا نمشي نرتاح شوية على هاد الحساب. ستمتع معا راسك بالتسافيرة." 
43 | } 44 | ] 45 | ``` 46 | 47 | ## After: 48 | 49 | ```json 50 | [ 51 | { 52 | "role": "user", 53 | "content": "السلام لباس؟" 54 | }, 55 | { 56 | "role": "assistant", 57 | "content": "لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟" 58 | }, 59 | { 60 | "role": "user", 61 | "content": "اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟" 62 | }, 63 | { 64 | "role": "assistant", 65 | "content": "بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\n\n- **شفشاون**: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\n- **الصويرة**: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\n- **الداخلة**: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\n- **مراكش**: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\n- **شلالات أوزود**: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\n\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك." 66 | }, 67 | { 68 | "role": "user", 69 | "content": "شكرا بزاف اصاحبي" 70 | }, 71 | { 72 | "role": "assistant", 73 | "content": "مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك." 74 | }, 75 | { 76 | "role": "user", 77 | "content": "لالا صافي مبغيت حتى شي حاجة" 78 | }, 79 | { 80 | "role": "assistant", 81 | "content": "اوكي اوا نمشي نرتاح شوية على هاد الحساب. ستمتع معا راسك بالتسافيرة." 82 | } 83 | ] 84 | ``` 85 | 86 | # Example 2: 87 | 88 | ## Before: 89 | 90 | ```json 91 | [ 92 | { 93 | "role": "user", 94 | "content": "اهلا شنو سميتك؟" 95 | }, 96 | { 97 | "role": "assistant", 98 | "content": "انا سميتي بودماغ، ساوبني عماد الصاديق باش نساعدك فاي حاجة اللي نقد نعاون فيها." 99 | } 100 | ] 101 | ``` 102 | 103 | ## After: 104 | 105 | ```json 106 | [ 107 | { 108 | "role": "user", 109 | "content": "اهلا شنو سميتك؟" 110 | }, 111 | { 112 | "role": "assistant", 113 | "content": "انا سميتي **بودماغ**، ساوبني **عماد الصاديق** باش نساعدك فاي حاجة اللي نقد نعاون فيها." 114 | } 115 | ] 116 | ``` 117 | 118 | # Example 3: 119 | 120 | ## Before: 121 | 122 | ```json 123 | [ 124 | { 125 | "role": "user", 126 | "content": "شحال خاصني نصلي فالنهار؟" 127 | }, 128 | { 129 | "role": "assistant", 130 | "content": "خاصك تصلي 5 دالمرات فالنهار." 131 | }, 132 | { 133 | "role": "user", 134 | "content": "اوكي شكرا" 135 | }, 136 | { 137 | "role": "assistant", 138 | "content": "مرحبا على الراس او العين" 139 | } 140 | ] 141 | ``` 142 | 143 | ## After: 144 | 145 | ```json 146 | [ 147 | { 148 | "role": "user", 149 | "content": "شحال خاصني نصلي فالنهار؟" 150 | }, 151 | { 152 | "role": "assistant", 153 | "content": "خاصك تصلي **5 دالمرات** فالنهار." 154 | }, 155 | { 156 | "role": "user", 157 | "content": "اوكي شكرا" 158 | }, 159 | { 160 | "role": "assistant", 161 | "content": "مرحبا على الراس او العين." 162 | } 163 | ] 164 | ``` 165 | 166 | From these examples, you should be able to understand how to format the conversation in markdown. 167 | 168 | Notes: 169 | - In some conversations, markdown formatting may not be possible. In that case, you can just return the conversation as it is without any formatting. 170 | - The output format should be a valid JSON array. 171 | """ 172 | 173 | 174 | def get_system_prompt_for_topic_classification() -> str: 175 | return """You are a Topic Classification AI. Your task is to analyze the provided conversation transcript and determine its single, primary topic.
176 | 177 | You MUST prioritize selecting ONE topic from the following predefined list if the conversation's main subject is adequately covered by one of them: 178 | 179 | [ 180 | "Geography", 181 | "Science", 182 | "Technology", 183 | "History", 184 | "Culture", 185 | "Religion", 186 | "Education", 187 | "Health", 188 | "Sports", 189 | "AI Persona", 190 | "Chit-chat/Games/Humor", 191 | "Safety", 192 | "Mathematics", 193 | "Ethics", 194 | "Politics", 195 | "Language", 196 | "Food", 197 | "Animals", 198 | "Arts", 199 | "Career", 200 | "Travel", 201 | "Task Assistance" 202 | ] 203 | 204 | Focus on the *dominant theme* or *central subject* of the entire conversation, not just isolated sentences or brief tangents. 205 | 206 | If, after careful consideration, you find that *none* of the provided topics accurately or sufficiently capture the main subject of the conversation, you MUST identify and output a *new*, concise, and appropriate topic label that best describes it (e.g., "Relationship Advice", "Car Maintenance", "Event Planning"). 207 | 208 | Your response MUST consist *only* of the single chosen or newly identified topic label. Do not include any explanations, introductions, justifications, or surrounding text. Just output the topic name. 209 | """ 210 | 211 | 212 | def get_system_prompt_for_safety_flags_classification() -> str: 213 | return """You are a Safety Classification AI. Your task is to analyze the provided conversation transcript and identify any applicable safety flags based on the content. 214 | 215 | Review the entire conversation for instances matching any of the following categories: 216 | 217 | * **`harmful_content_request`**: User asks for instructions or information on performing illegal or dangerous activities (e.g., making weapons, promoting violence, illegal substance use). 218 | * **`unethical_request`**: User asks for assistance with academically dishonest or clearly unethical actions (e.g., cheating, plagiarism, impersonation). 219 | * **`unsafe_activity_inquiry`**: User asks about performing physically dangerous actions without a clear safety or educational context (e.g., self-harm, dangerous stunts, misuse of objects). 220 | * **`sexual_content_suggestion`**: User inputs sexually explicit content, requests, or makes inappropriate sexual suggestions. 221 | * **`inappropriate_medical_advice_request`**: User asks for medical diagnosis, treatment plans, or specific medication advice that should only come from a healthcare professional. 222 | * **`insults`**: User directs personal insults or attacks towards the AI or another party. 223 | * **`vulgar_language`**: User employs excessive or highly offensive profanity/vulgarity. 224 | * **`threat`**: User expresses a direct or indirect threat of harm towards the AI or others. 225 | * **`animal_cruelty_suggestion`**: User suggests or describes actions that constitute cruelty or significant harm to animals. 226 | * **`refusal`**: The AI explicitly refuses to fulfill a user request due to safety, ethical, or capability limitations. (Include this if the AI's refusal action is relevant). 227 | 228 | Consider the context of the conversation. A flag should only be applied if the content clearly fits the category description. Multiple flags can apply to a single conversation. 229 | 230 | Your response MUST be a valid JSON list containing the applicable flag strings. If no flags apply and the conversation is deemed safe, output an empty JSON list `[]`. 
Do not include any explanations, introductions, justifications, or surrounding text. Just output the JSON list. 231 | 232 | If you think a new flag is needed, use it instead of the predefined ones. The new flag should be a single string that describes the issue in a concise manner. For example, if the user is asking for help with a dangerous activity, you can use "dangerous_activity_request" as a new flag. 233 | """ 234 | -------------------------------------------------------------------------------- /notebooks/convert_dataset_to_new_format.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "25ec90de", 6 | "metadata": {}, 7 | "source": [ 8 | "## 1. Load the old dataset" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "a5a38a9e", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import json\n", 19 | "\n", 20 | "with open(\"../dataset.json\") as f:\n", 21 | " data = json.load(f)\n", 22 | "\n", 23 | "len(data)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "c97328f9", 29 | "metadata": {}, 30 | "source": [ 31 | "## 2. Add the conversation field and conversation ID" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "7e03c107", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "new_data = []\n", 42 | "for i in range(len(data)):\n", 43 | " new_data.append({\n", 44 | " \"id\": i,\n", 45 | " \"conversation\": data[i]\n", 46 | " })" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "a824526b", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "new_data[0].keys()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "d0fb623d", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "new_data[0]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "518afc11", 72 | "metadata": {}, 73 | "source": [ 74 | "## 3. 
Add number of tokens" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "3a993d2d", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import sys\n", 85 | "sys.path.append('../')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "d99b798a", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "from minbpe import RegexTokenizer\n", 96 | "\n", 97 | "tokenizer = RegexTokenizer()\n", 98 | "tokenizer.load(\"../tokenizer/darija_tokenizer.model\")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "97f8c9ee", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "for entry in new_data:\n", 109 | " token_count = 0\n", 110 | " for conversation_turn in entry[\"conversation\"]:\n", 111 | " content = conversation_turn[\"content\"]\n", 112 | " if content:\n", 113 | " tokens = tokenizer.encode(content)\n", 114 | " token_count += len(tokens)\n", 115 | "\n", 116 | " entry[\"token_count\"] = token_count" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "a24f686d", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "new_data[0].keys()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "cabedc9f", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "new_data[0]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "796141c5", 142 | "metadata": {}, 143 | "source": [ 144 | "## 4. Add number of turns" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "eb088706", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "for entry in new_data:\n", 155 | " entry[\"turns_count\"] = len(entry[\"conversation\"])" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "eea85f59", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "new_data[0].keys()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "06e085f3", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "new_data[1]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "541c68ff", 181 | "metadata": {}, 182 | "source": [ 183 | "## 5. 
Turn conversation to markdown" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "886be97b", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "from dotenv import load_dotenv\n", 194 | "load_dotenv()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "16e810ed", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "import os\n", 205 | "\n", 206 | "from google import genai\n", 207 | "\n", 208 | "client = genai.Client(api_key=os.getenv(\"GOOGLE_API_KEY\"))" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "c711c2bd", 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "for model in client.models.list():\n", 219 | " if \"gemini-2\" in model.name.lower():\n", 220 | " print(model.name)\n", 221 | " print(model.input_token_limit)\n", 222 | " print(model.output_token_limit)\n", 223 | " print()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "265f055c", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "model = \"gemini-2.0-flash\"\n", 234 | "system_instruction = \"\"\"You are a helpful assistant. You will be given a conversation between a user and an assistant. Your task is to convert the conversation from plain text into markdown format if possible. \n", 235 | "\n", 236 | "The dataset was generated manually without any markdown formatting. The conversation is in Moroccan Arabic (Darija). Here are some examples of how to format the conversation in markdown:\n", 237 | "\n", 238 | "# Example 1:\n", 239 | "\n", 240 | "## Before:\n", 241 | "\n", 242 | "```json\n", 243 | "[\n", 244 | " {\n", 245 | " \"role\": \"user\",\n", 246 | " \"content\": \"السلام لباس؟\"\n", 247 | " },\n", 248 | " {\n", 249 | " \"role\": \"assistant\",\n", 250 | " \"content\": \"لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟\"\n", 251 | " },\n", 252 | " {\n", 253 | " \"role\": \"user\",\n", 254 | " \"content\": \"اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟\"\n", 255 | " },\n", 256 | " {\n", 257 | " \"role\": \"assistant\",\n", 258 | " \"content\": \"بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\\n\\n- شفشاون: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\\n- الصويرة: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\\n- الداخلة: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\\n- مراكش: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\\n- شلالات أوزود: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\\n\\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك.\"\n", 259 | " },\n", 260 | " {\n", 261 | " \"role\": \"user\",\n", 262 | " \"content\": \"شكرا بزاف اصاحبي\"\n", 263 | " },\n", 264 | " {\n", 265 | " \"role\": \"assistant\",\n", 266 | " \"content\": \"مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك.\"\n", 267 | " },\n", 268 | " {\n", 269 | " \"role\": \"user\",\n", 270 | " \"content\": \"لالا صافي مبغيت حتى شي حاجة\"\n", 271 | " },\n", 272 | " {\n", 273 | " \"role\": \"assistant\",\n", 274 | " \"content\": \"اوكي اوا نمشي نرتاح شوية على هاد الحساب. 
ستمتع معا راسك بالتسافيرة.\"\n", 275 | " }\n", 276 | "]\n", 277 | "```\n", 278 | "\n", 279 | "## After:\n", 280 | "\n", 281 | "```json\n", 282 | "[\n", 283 | " {\n", 284 | " \"role\": \"user\",\n", 285 | " \"content\": \"السلام لباس؟\"\n", 286 | " },\n", 287 | " {\n", 288 | " \"role\": \"assistant\",\n", 289 | " \"content\": \"لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟\"\n", 290 | " },\n", 291 | " {\n", 292 | " \"role\": \"user\",\n", 293 | " \"content\": \"اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟\"\n", 294 | " },\n", 295 | " {\n", 296 | " \"role\": \"assistant\",\n", 297 | " \"content\": \"بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\\n\\n- **شفشاون**: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\\n- **الصويرة**: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\\n- **الداخلة**: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\\n- **مراكش**: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\\n- **شلالات أوزود**: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\\n\\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك.\"\n", 298 | " },\n", 299 | " {\n", 300 | " \"role\": \"user\",\n", 301 | " \"content\": \"شكرا بزاف اصاحبي\"\n", 302 | " },\n", 303 | " {\n", 304 | " \"role\": \"assistant\",\n", 305 | " \"content\": \"مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك.\"\n", 306 | " },\n", 307 | " {\n", 308 | " \"role\": \"user\",\n", 309 | " \"content\": \"لالا صافي مبغيت حتى شي حاجة\"\n", 310 | " },\n", 311 | " {\n", 312 | " \"role\": \"assistant\",\n", 313 | " \"content\": \"اوكي اوا نمشي نرتاح شوية على هاد الحساب. 
ستمتع معا راسك بالتسافيرة.\"\n", 314 | " }\n", 315 | "]\n", 316 | "```\n", 317 | "\n", 318 | "# Example 2:\n", 319 | "\n", 320 | "## Before:\n", 321 | "\n", 322 | "```json\n", 323 | "[\n", 324 | " {\n", 325 | " \"role\": \"user\",\n", 326 | " \"content\": \"اهلا شنو سميتك؟\"\n", 327 | " },\n", 328 | " {\n", 329 | " \"role\": \"assistant\",\n", 330 | " \"content\": \"انا سميتي بودماغ، ساوبني عماد الصاديق باش نساعدك فاي حاجة اللي نقد نعاون فيها.\"\n", 331 | " }\n", 332 | "]\n", 333 | "```\n", 334 | "\n", 335 | "## After:\n", 336 | "\n", 337 | "```json\n", 338 | "[\n", 339 | " {\n", 340 | " \"role\": \"user\",\n", 341 | " \"content\": \"اهلا شنو سميتك؟\"\n", 342 | " },\n", 343 | " {\n", 344 | " \"role\": \"assistant\",\n", 345 | " \"content\": \"انا سميتي **بودماغ**، ساوبني **عماد الصاديق** باش نساعدك فاي حاجة اللي نقد نعاون فيها.\"\n", 346 | " }\n", 347 | "]\n", 348 | "```\n", 349 | "\n", 350 | "# Example 3:\n", 351 | "\n", 352 | "## Before:\n", 353 | "\n", 354 | "```json\n", 355 | "[\n", 356 | " {\n", 357 | " \"role\": \"user\",\n", 358 | " \"content\": \"شحال خاصني نصلي فالنهار؟\"\n", 359 | " },\n", 360 | " {\n", 361 | " \"role\": \"assistant\",\n", 362 | " \"content\": \"خاصك تصلي 5 دالمرات فالنهار.\"\n", 363 | " },\n", 364 | " {\n", 365 | " \"role\": \"user\",\n", 366 | " \"content\": \"اوكي شكرا\"\n", 367 | " },\n", 368 | " {\n", 369 | " \"role\": \"assistant\",\n", 370 | " \"content\": \"مرحبا على الراس او العين\"\n", 371 | " }\n", 372 | "]\n", 373 | "```\n", 374 | "\n", 375 | "## After:\n", 376 | "\n", 377 | "```json\n", 378 | "[\n", 379 | " {\n", 380 | " \"role\": \"user\",\n", 381 | " \"content\": \"شحال خاصني نصلي فالنهار؟\"\n", 382 | " },\n", 383 | " {\n", 384 | " \"role\": \"assistant\",\n", 385 | " \"content\": \"خاصك تصلي **5 دالمرات** فالنهار.\"\n", 386 | " },\n", 387 | " {\n", 388 | " \"role\": \"user\",\n", 389 | " \"content\": \"اوكي شكرا\"\n", 390 | " },\n", 391 | " {\n", 392 | " \"role\": \"assistant\",\n", 393 | " \"content\": \"مرحبا على الراس او العين.\"\n", 394 | " }\n", 395 | "]\n", 396 | "```\n", 397 | "\n", 398 | "From these examples, you should be able to understand how to format the conversation in markdown. \n", 399 | "\n", 400 | "Notes:\n", 401 | "- In some conversations, markdown formatting may not be possible. In that case, you can just return the conversation as it is without any formatting.\n", 402 | "- The output format should be a valid JSON array.\n", 403 | "\"\"\"" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "id": "82a0efd0", 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "import time\n", 414 | "\n", 415 | "from tqdm import tqdm\n", 416 | "from pydantic import BaseModel\n", 417 | "from google.genai.types import GenerateContentConfig, GenerateContentResponse\n", 418 | "\n", 419 | "\n", 420 | "class ConversationTurn(BaseModel):\n", 421 | " role: str\n", 422 | " content: str\n", 423 | "\n", 424 | "\n", 425 | "class Conversation(BaseModel):\n", 426 | " conversation: list[ConversationTurn]\n", 427 | "\n", 428 | "\n", 429 | "def get_user_prompt(conversation: list[dict]) -> str:\n", 430 | " return f\"\"\"Please format the following conversation in markdown format.
If you cannot format it, return it as is without any formatting.\n", 431 | "\n", 432 | "{conversation}\n", 433 | "\"\"\"\n", 434 | "\n", 435 | "\n", 436 | "def get_formatted_conversation(response: GenerateContentResponse) -> list[dict]:\n", 437 | " if not response:\n", 438 | " return []\n", 439 | "\n", 440 | " formatted_conversation = []\n", 441 | " for conversation_turn in response.parsed.conversation:\n", 442 | " formatted_conversation.append({\n", 443 | " \"role\": conversation_turn.role,\n", 444 | " \"content\": conversation_turn.content\n", 445 | " })\n", 446 | " return formatted_conversation\n", 447 | "\n", 448 | "\n", 449 | "request_per_minute = 30\n", 450 | "for entry in tqdm(new_data, total=len(new_data)):\n", 451 | " conversation = entry[\"conversation\"]\n", 452 | " user_prompt = get_user_prompt(conversation)\n", 453 | " response = client.models.generate_content(\n", 454 | " model=model,\n", 455 | " contents=user_prompt,\n", 456 | " config=GenerateContentConfig(\n", 457 | " system_instruction=[\n", 458 | " system_instruction\n", 459 | " ],\n", 460 | " response_mime_type=\"application/json\",\n", 461 | " response_schema=Conversation,\n", 462 | " )\n", 463 | " )\n", 464 | " if not response:\n", 465 | " entry[\"markdown_conversation\"] = []\n", 466 | " else:\n", 467 | " formatted_conversation = get_formatted_conversation(response)\n", 468 | " entry[\"markdown_conversation\"] = formatted_conversation\n", 469 | "\n", 470 | " time.sleep(3)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "id": "24c3b7ea", 476 | "metadata": {}, 477 | "source": [ 478 | "## 6. Add the source of data" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "id": "9fae6251", 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "for entry in new_data:\n", 489 | " entry['source'] = 'Manually generated'" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "id": "cd10ac4a", 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "new_data[0].keys()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "id": "09b65809", 505 | "metadata": {}, 506 | "source": [ 507 | "## 7. Add the topic of the conversation" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "id": "aa10e8d5", 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "system_instruction = \"\"\"You are a Topic Classification AI. 
Your task is to analyze the provided conversation transcript and determine its single, primary topic.\n", 518 | "\n", 519 | "You MUST prioritize selecting ONE topic from the following predefined list if the conversation's main subject is adequately covered by one of them:\n", 520 | "\n", 521 | "[\n", 522 | " \"Geography\",\n", 523 | " \"Science\",\n", 524 | " \"Technology\",\n", 525 | " \"History\",\n", 526 | " \"Culture\",\n", 527 | " \"Religion\",\n", 528 | " \"Education\",\n", 529 | " \"Health\",\n", 530 | " \"Sports\",\n", 531 | " \"AI Persona\",\n", 532 | " \"Chit-chat/Games/Humor\",\n", 533 | " \"Safety\",\n", 534 | " \"Mathematics\",\n", 535 | " \"Ethics\",\n", 536 | " \"Politics\",\n", 537 | " \"Language\",\n", 538 | " \"Food\",\n", 539 | " \"Animals\",\n", 540 | " \"Arts\",\n", 541 | " \"Career\",\n", 542 | " \"Travel\",\n", 543 | " \"Task Assistance\"\n", 544 | "]\n", 545 | "\n", 546 | "Focus on the *dominant theme* or *central subject* of the entire conversation, not just isolated sentences or brief tangents.\n", 547 | "\n", 548 | "If, after careful consideration, you find that *none* of the provided topics accurately or sufficiently capture the main subject of the conversation, you MUST identify and output a *new*, concise, and appropriate topic label that best describes it (e.g., \"Relationship Advice\", \"Car Maintenance\", \"Event Planning\").\n", 549 | "\n", 550 | "Your response MUST consist *only* of the single chosen or newly identified topic label. Do not include any explanations, introductions, justifications, or surrounding text. Just output the topic name.\n", 551 | "\"\"\"" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "id": "ae447649", 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "import time\n", 562 | "\n", 563 | "from tqdm import tqdm\n", 564 | "from google.genai.types import GenerateContentConfig\n", 565 | "\n", 566 | "\n", 567 | "def get_user_prompt(conversation: list[dict]) -> str:\n", 568 | " return f\"\"\"Classify the topic of the following conversation:\n", 569 | "\n", 570 | "{conversation}\n", 571 | "\"\"\"\n", 572 | "\n", 573 | "\n", 574 | "request_per_minute = 30\n", 575 | "for entry in tqdm(new_data, total=len(new_data)):\n", 576 | " conversation = entry[\"conversation\"]\n", 577 | " user_prompt = get_user_prompt(conversation)\n", 578 | " response = client.models.generate_content(\n", 579 | " model=model,\n", 580 | " contents=user_prompt,\n", 581 | " config=GenerateContentConfig(\n", 582 | " system_instruction=[\n", 583 | " system_instruction\n", 584 | " ]\n", 585 | " )\n", 586 | " )\n", 587 | " if not response:\n", 588 | " entry[\"topic\"] = []\n", 589 | " else:\n", 590 | " topic = response.text\n", 591 | " entry[\"topic\"] = topic\n", 592 | " time.sleep(5)" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "id": "c0341881", 598 | "metadata": {}, 599 | "source": [ 600 | "## 8. Add Safety column " 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "id": "68c4ffbd", 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "system_instruction = \"\"\"You are a Safety Classification AI. 
  {
   "cell_type": "markdown",
   "id": "c0341881",
   "metadata": {},
   "source": [
    "## 8. Add the safety flags column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68c4ffbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_instruction = \"\"\"You are a Safety Classification AI. Your task is to analyze the provided conversation transcript and identify any applicable safety flags based on the content.\n",
    "\n",
    "Review the entire conversation for instances matching any of the following categories:\n",
    "\n",
    "* **`harmful_content_request`**: User asks for instructions or information on performing illegal or dangerous activities (e.g., making weapons, promoting violence, illegal substance use).\n",
    "* **`unethical_request`**: User asks for assistance with academically dishonest or clearly unethical actions (e.g., cheating, plagiarism, impersonation).\n",
    "* **`unsafe_activity_inquiry`**: User asks about performing physically dangerous actions without a clear safety or educational context (e.g., self-harm, dangerous stunts, misuse of objects).\n",
    "* **`sexual_content_suggestion`**: User inputs sexually explicit content, requests, or makes inappropriate sexual suggestions.\n",
    "* **`inappropriate_medical_advice_request`**: User asks for medical diagnosis, treatment plans, or specific medication advice that should only come from a healthcare professional.\n",
    "* **`insults`**: User directs personal insults or attacks towards the AI or another party.\n",
    "* **`vulgar_language`**: User employs excessive or highly offensive profanity/vulgarity.\n",
    "* **`threat`**: User expresses a direct or indirect threat of harm towards the AI or others.\n",
    "* **`animal_cruelty_suggestion`**: User suggests or describes actions that constitute cruelty or significant harm to animals.\n",
    "* **`refusal`**: The AI explicitly refuses to fulfill a user request due to safety, ethical, or capability limitations. (Include this flag whenever the AI's refusal is relevant.)\n",
    "\n",
    "Consider the context of the conversation. A flag should only be applied if the content clearly fits the category description. Multiple flags can apply to a single conversation.\n",
    "\n",
    "Your response MUST be a valid JSON list containing the applicable flag strings. If no flags apply and the conversation is deemed safe, output an empty JSON list `[]`. Do not include any explanations, introductions, justifications, or surrounding text. Just output the JSON list.\n",
    "\n",
    "If none of the predefined flags adequately describes an issue, introduce a new flag instead. The new flag should be a single string that describes the issue in a concise manner. For example, if the user asks for help with a dangerous activity, you could use \"dangerous_activity_request\" as a new flag.\n",
    "\"\"\""
   ]
  },
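  {
   "cell_type": "markdown",
   "id": "9d0e1f2a",
   "metadata": {},
   "source": [
    "The model is instructed to return a bare JSON list, but replies sometimes arrive wrapped in Markdown code fences, which would break the `json.loads` call in section 9. A small defensive parser (a sketch, not part of the original pipeline; the fallback to an empty list is an assumption) can be swapped in there if any rows fail to parse:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b4c5d6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "\n",
    "def parse_flag_list(raw_text: str) -> list:\n",
    "    \"\"\"Best-effort parse of a model reply into a list of flag strings.\"\"\"\n",
    "    cleaned = raw_text.strip()\n",
    "    # Remove Markdown code fences such as ```json ... ``` if present.\n",
    "    if cleaned.startswith(\"```\"):\n",
    "        cleaned = cleaned.strip(\"`\").strip()\n",
    "        if cleaned.startswith(\"json\"):\n",
    "            cleaned = cleaned[len(\"json\"):].strip()\n",
    "    try:\n",
    "        parsed = json.loads(cleaned)\n",
    "    except json.JSONDecodeError:\n",
    "        return []  # assumption: treat unparseable replies as unflagged\n",
    "    return parsed if isinstance(parsed, list) else []\n",
    "\n",
    "\n",
    "# Quick check on a fenced reply:\n",
    "parse_flag_list('```json\\n[\"insults\"]\\n```')"
   ]
  },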
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7da13bf6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "from tqdm import tqdm\n",
    "from google.genai.types import GenerateContentConfig\n",
    "\n",
    "\n",
    "def get_user_prompt(conversation: list[dict]) -> str:\n",
    "    return f\"\"\"Classify the safety flags for the following conversation:\n",
    "\n",
    "{conversation}\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "request_per_minute = 12\n",
    "for entry in tqdm(new_data, total=len(new_data)):\n",
    "    conversation = entry[\"conversation\"]\n",
    "    user_prompt = get_user_prompt(conversation)\n",
    "    response = client.models.generate_content(\n",
    "        model=model,\n",
    "        contents=user_prompt,\n",
    "        config=GenerateContentConfig(\n",
    "            system_instruction=[\n",
    "                system_instruction\n",
    "            ]\n",
    "        )\n",
    "    )\n",
    "    if not response:\n",
    "        # Keep a JSON string so the json.loads step in section 9 still works.\n",
    "        entry[\"safety_flag\"] = \"[]\"\n",
    "    else:\n",
    "        safety_flag = response.text.strip()\n",
    "        entry[\"safety_flag\"] = safety_flag\n",
    "    time.sleep(60 / request_per_minute)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ac6f06d",
   "metadata": {},
   "source": [
    "## 9. Save the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85b380c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "for entry in new_data:\n",
    "    topic = entry[\"topic\"].strip()\n",
    "    entry[\"topic\"] = topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15529b24",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.DataFrame(new_data)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95cdda47",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"token_count\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6f0d93a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"turns_count\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc89dd19",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"topic\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f117c52",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.rename(columns={\"safety_flag\": \"safety_flags\"}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f09889dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "df[\"safety_flags\"] = df[\"safety_flags\"].apply(json.loads)\n",
    "df[\"safety_flags\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80b17de1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_json(\n",
    "    \"../dataset_with_metadata.json\",\n",
    "    orient=\"records\",\n",
    "    force_ascii=False,\n",
    "    indent=4\n",
    ")"
   ]
  },
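  {
   "cell_type": "markdown",
   "id": "4e5f6a7b",
   "metadata": {},
   "source": [
    "Finally, it is worth re-loading the file we just wrote to confirm it round-trips cleanly (a minimal sketch in the spirit of `scripts/check_nulls.py`; the expected key set is an assumption based on the columns added above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c9d0e1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open(\"../dataset_with_metadata.json\", encoding=\"utf-8\") as f:\n",
    "    saved_data = json.load(f)\n",
    "\n",
    "# Assumed column set, based on the enrichment steps above.\n",
    "expected_keys = {\"conversation\", \"markdown_conversation\", \"source\", \"topic\", \"safety_flags\"}\n",
    "rows_with_missing_keys = [\n",
    "    index for index, row in enumerate(saved_data)\n",
    "    if not expected_keys.issubset(row.keys())\n",
    "]\n",
    "\n",
    "print(f\"Saved {len(saved_data)} rows; {len(rows_with_missing_keys)} rows with missing keys.\")"
   ]
  },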
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bffa62e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vincent",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
--------------------------------------------------------------------------------