\" in the input field.\n",
274 | "9. The output should be an appropriate response to the instruction and the input. It should ideally not exceed 400 words.\n",
275 | "10. All generated output should use the metric system for measurements and UK names for medications, substances, drugs and everything else.\n",
276 | "\n",
277 | "List of {quantity} tasks (every task has the following fields: Task:, Instruction:, Input:, Output:):\n",
278 | "****************************************************************************************************\n",
279 | "\n"
280 | ]
281 | }
282 | ],
283 | "source": [
284 | "# Print the prompt db\n",
285 | "for prompt in db:\n",
286 | " print('Description: ', prompt['description'])\n",
287 | " print('Hash: ', prompt['hash'])\n",
288 | " print('Parser: ', prompt['parser'])\n",
289 | " print('Text: ', prompt['text'])\n",
290 | " print(\"*\"*100)\n",
291 | " print()"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "id": "2d4a2a9b-4de8-488e-9914-8181ec218935",
298 | "metadata": {},
299 | "outputs": [],
300 | "source": []
301 | }
302 | ],
303 | "metadata": {
304 | "kernelspec": {
305 | "display_name": "Python 3 (ipykernel)",
306 | "language": "python",
307 | "name": "python3"
308 | },
309 | "language_info": {
310 | "codemirror_mode": {
311 | "name": "ipython",
312 | "version": 3
313 | },
314 | "file_extension": ".py",
315 | "mimetype": "text/x-python",
316 | "name": "python",
317 | "nbconvert_exporter": "python",
318 | "pygments_lexer": "ipython3",
319 | "version": "3.8.0"
320 | }
321 | },
322 | "nbformat": 4,
323 | "nbformat_minor": 5
324 | }
325 |
--------------------------------------------------------------------------------
/experiments/Supervised Training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "id": "6f859a45-5ca7-47cc-8055-dabedd301963",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "outputs": [
11 | {
12 | "name": "stderr",
13 | "output_type": "stream",
14 | "text": [
15 | "2023-05-08 18:13:35.960625: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
16 | "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
17 | "2023-05-08 18:13:37.052768: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
18 | ]
19 | }
20 | ],
21 | "source": [
22 | "from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline\n",
23 | "import pickle\n",
24 | "import pandas as pd\n",
25 | "import datasets\n",
26 | "\n",
27 | "\n",
28 | "from opengpt.config import Config\n",
29 | "from opengpt.model_utils import add_tokens_to_model_and_tokenizer\n",
30 | "from opengpt.dataset_utils import create_labels, pack_examples\n",
31 | "from opengpt.data_collator import DataCollatorWithPadding"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 4,
37 | "id": "dd137647-a6d7-49c8-a241-404137ef3b08",
38 | "metadata": {
39 | "tags": []
40 | },
41 | "outputs": [],
42 | "source": [
43 | "config = Config(yaml_path='../configs/example_train_config.yaml')\n",
44 | "model = AutoModelForCausalLM.from_pretrained(config.train.model)\n",
45 | "tokenizer = AutoTokenizer.from_pretrained(config.train.model)\n",
46 | "tokenizer.model_max_length = config.train.max_seq_len"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 5,
52 | "id": "5c389455-d401-4857-a61d-0cc1e72d312b",
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "name": "stderr",
57 | "output_type": "stream",
58 | "text": [
59 | "WARNING:root:Added: 5 tokens to the tokenizer\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "add_tokens_to_model_and_tokenizer(config, tokenizer, model)"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "id": "bdb0bac3-1504-430f-a28c-0aea4ea28bbc",
70 | "metadata": {},
71 | "source": [
72 | "### Load data"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "id": "b0f6ef4f-3c62-47f8-a717-9d0a57d086ad",
78 | "metadata": {},
79 | "source": [
80 | "The datasets used for training must contain the special tokens defined in the config. By default, this means the datasets have to be organised as conversations using the `<|user|> <|ai|>` and `<|eos|> <|eod|>` special tokens. An example of a question/answer pair from the NHS-UK dataset:\n",
81 | "\n",
82 | "```\n",
83 | "<|user|> What is high blood pressure? <|eos|> <|ai|> High blood pressure is a condition where the force at which your heart pumps blood around your body is high. It is recorded with 2 numbers, the systolic pressure and the diastolic pressure, both measured in millimetres of mercury (mmHg).\n",
84 | "References:\n",
85 | "- https://www.nhs.uk/conditions/Blood-pressure-(high)/Pages/Introduction.aspx <|eos|> <|eod|>\n",
86 | "```\n",
87 | "\n",
88 | "If the data is not formatted this way, the training cells below will not work."
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 8,
94 | "id": "6a2589a6-2370-4b23-98fc-91ebcf6b24aa",
95 | "metadata": {
96 | "tags": []
97 | },
98 | "outputs": [
99 | {
100 | "name": "stdout",
101 | "output_type": "stream",
102 | "text": [
103 | "Shuffling dataset\n"
104 | ]
105 | }
106 | ],
107 | "source": [
108 | "train_dataset = datasets.Dataset.from_csv(config.train.datasets)\n",
109 | "if config.train.shuffle_dataset:\n",
110 | " train_dataset = train_dataset.shuffle()\n",
111 | "    print(\"Shuffling dataset\")"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "id": "11ab2eea-65d7-4b9e-934a-4179608bd6f4",
117 | "metadata": {},
118 | "source": [
119 | "#### Remove all columns that we do not need; filtering of the dataset can be done before removal if needed"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 9,
125 | "id": "75b45ea0-c435-4193-aa1b-622a062b4386",
126 | "metadata": {
127 | "tags": []
128 | },
129 | "outputs": [],
130 | "source": [
131 | "# Remove everything but text\n",
132 | "to_remove = list(train_dataset.column_names)\n",
133 | "to_remove.remove('text')\n",
134 | "train_dataset = train_dataset.remove_columns(to_remove)"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 11,
140 | "id": "e270be31-1643-42c7-8dc1-a9206b88a243",
141 | "metadata": {
142 | "tags": []
143 | },
144 | "outputs": [
145 | {
146 | "data": {
147 | "application/vnd.jupyter.widget-view+json": {
148 | "model_id": "",
149 | "version_major": 2,
150 | "version_minor": 0
151 | },
152 | "text/plain": [
153 | "Map: 0%| | 0/29660 [00:00, ? examples/s]"
154 | ]
155 | },
156 | "metadata": {},
157 | "output_type": "display_data"
158 | }
159 | ],
160 | "source": [
161 | "# Ignore the max_seq_len warning; it is handled by the packer or the data collator\n",
162 | "train_dataset = train_dataset.map(\n",
163 | " lambda examples: tokenizer(examples['text'], add_special_tokens=False), \n",
164 | " batched=True, \n",
165 | " num_proc=1, \n",
166 | " remove_columns=[\"text\"])\n",
167 | "# Create labels\n",
168 | "train_dataset = train_dataset.map(\n",
169 | " lambda examples: create_labels(examples, config, tokenizer),\n",
170 | " batched=True,\n",
171 | " batch_size=1000,\n",
172 | " num_proc=1,\n",
173 | ")\n",
174 | "# We only do packing for the train set\n",
175 | "train_dataset = train_dataset.map(\n",
176 | " lambda examples: pack_examples(examples, config.train.max_seq_len, packing_type=config.train.packing_type),\n",
177 | " batched=True,\n",
178 | " batch_size=1000,\n",
179 | " num_proc=1,\n",
180 | ")"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 13,
186 | "id": "6919836a-eab8-43ff-9b1e-2b1d169fe3bd",
187 | "metadata": {
188 | "tags": []
189 | },
190 | "outputs": [],
191 | "source": [
192 | "training_args = TrainingArguments(**config.train.hf_training_arguments.to_dict())\n",
193 | "dc = DataCollatorWithPadding(tokenizer.pad_token_id, config.train.ignore_index, max_seq_len=config.train.max_seq_len)\n",
194 | "\n",
195 | "trainer = Trainer(\n",
196 | " model=model,\n",
197 | " args=training_args,\n",
198 | " train_dataset=train_dataset,\n",
199 | " eval_dataset=None,\n",
200 | " data_collator=dc,\n",
201 | ")"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 16,
207 | "id": "a4bfc4d5-bbff-4b74-b75f-4ca79ce04124",
208 | "metadata": {
209 | "tags": []
210 | },
211 | "outputs": [
212 | {
213 | "data": {
214 | "text/plain": [
215 | "Dataset({\n",
216 | " features: ['input_ids', 'attention_mask', 'labels'],\n",
217 | " num_rows: 8771\n",
218 | "})"
219 | ]
220 | },
221 | "execution_count": 16,
222 | "metadata": {},
223 | "output_type": "execute_result"
224 | }
225 | ],
226 | "source": [
227 | "train_dataset"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 17,
233 | "id": "1e96d99b-c915-4ac4-8cbe-fb01081a7a42",
234 | "metadata": {
235 | "tags": []
236 | },
237 | "outputs": [
238 | {
239 | "name": "stderr",
240 | "output_type": "stream",
241 | "text": [
242 | "/data/zeljko/.venv/llama/lib/python3.8/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
243 | " warnings.warn(\n"
244 | ]
245 | },
246 | {
247 | "data": {
248 | "text/html": [
249 | "\n",
250 | "    <div>\n",
251 | "      \n",
252 | "      <progress value='548' max='548' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
253 | "      [548/548 06:27, Epoch 0/1]\n",
254 | "    </div>\n",
255 | "    <table border=\"1\" class=\"dataframe\">\n",
256 | "  <thead>\n",
257 | " <tr style=\"text-align: left;\">\n",
258 | "      <th>Step</th>\n",
259 | "      <th>Training Loss</th>\n",
260 | "    </tr>\n",
261 | "  </thead>\n",
262 | "  <tbody>\n",
263 | "    <tr>\n",
264 | "      <td>100</td>\n",
265 | "      <td>1.650500</td>\n",
266 | "    </tr>\n",
267 | "    <tr>\n",
268 | "      <td>200</td>\n",
269 | "      <td>1.489700</td>\n",
270 | "    </tr>\n",
271 | "    <tr>\n",
272 | "      <td>300</td>\n",
273 | "      <td>1.445900</td>\n",
274 | "    </tr>\n",
275 | "    <tr>\n",
276 | "      <td>400</td>\n",
277 | "      <td>1.416500</td>\n",
278 | "    </tr>\n",
279 | "    <tr>\n",
280 | "      <td>500</td>\n",
281 | "      <td>1.399800</td>\n",
282 | "    </tr>\n",
283 | "  </tbody>\n",
284 | "</table><p>"
285 | ],
286 | "text/plain": [
287 | "<IPython.core.display.HTML object>"
288 | ]
289 | },
290 | "metadata": {},
291 | "output_type": "display_data"
292 | },
293 | {
294 | "data": {
295 | "text/plain": [
296 | "TrainOutput(global_step=548, training_loss=1.4748950457050853, metrics={'train_runtime': 388.7346, 'train_samples_per_second': 22.563, 'train_steps_per_second': 1.41, 'total_flos': 1931665648896000.0, 'train_loss': 1.4748950457050853, 'epoch': 1.0})"
297 | ]
298 | },
299 | "execution_count": 17,
300 | "metadata": {},
301 | "output_type": "execute_result"
302 | }
303 | ],
304 | "source": [
305 | "trainer.train()"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "id": "11a88494-afb1-44a5-b607-6382fe5b0c9c",
311 | "metadata": {},
312 | "source": [
313 | "# Test Generation"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 18,
319 | "id": "2d37cb47-c029-40f5-bc7d-accefae42f50",
320 | "metadata": {
321 | "tags": []
322 | },
323 | "outputs": [],
324 | "source": [
325 | "gen = pipeline(model=model, tokenizer=tokenizer, task='text-generation', device=model.device)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 19,
331 | "id": "86ce8f6e-6037-4c2b-8aac-5ad7a129834a",
332 | "metadata": {
333 | "tags": []
334 | },
335 | "outputs": [],
336 | "source": [
337 | "t = \"<|user|> What is diabetes? <|eos|> <|ai|>\" # The special-token format is required because the model was trained on it"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 39,
343 | "id": "daab006f-eda3-43db-b864-a1c1de52d4d3",
344 | "metadata": {
345 | "tags": []
346 | },
347 | "outputs": [
348 | {
349 | "name": "stderr",
350 | "output_type": "stream",
351 | "text": [
352 | "Setting `pad_token_id` to `eos_token_id`:50267 for open-end generation.\n"
353 | ]
354 | },
355 | {
356 | "name": "stdout",
357 | "output_type": "stream",
358 | "text": [
359 | "<|user|> What is diabetes? <|eos|> <|ai|> Diabetes is a condition in which the body's insulin levels are too low, which can lead to high blood sugar levels.\n",
360 | "References:\n",
361 | "- https://www.nhs.uk/conditions/diabetes/ \n"
362 | ]
363 | }
364 | ],
365 | "source": [
366 | "# Temperature matters; good values depend on the model (0.2 works well for this GPT-2 model)\n",
367 | "print(gen(t, do_sample=True, max_length=128, temperature=0.2)[0]['generated_text'])"
368 | ]
369 | }
370 | ],
371 | "metadata": {
372 | "kernelspec": {
373 | "display_name": "Python 3 (ipykernel)",
374 | "language": "python",
375 | "name": "python3"
376 | },
377 | "language_info": {
378 | "codemirror_mode": {
379 | "name": "ipython",
380 | "version": 3
381 | },
382 | "file_extension": ".py",
383 | "mimetype": "text/x-python",
384 | "name": "python",
385 | "nbconvert_exporter": "python",
386 | "pygments_lexer": "ipython3",
387 | "version": "3.8.0"
388 | }
389 | },
390 | "nbformat": 4,
391 | "nbformat_minor": 5
392 | }
393 |
--------------------------------------------------------------------------------
/llama_train_requirements.txt:
--------------------------------------------------------------------------------
1 | .
2 | protobuf==3.20.3
3 | accelerate
4 | git+https://github.com/huggingface/transformers
5 | sentencepiece
6 |
--------------------------------------------------------------------------------
/opengpt/config.py:
--------------------------------------------------------------------------------
1 | from box import Box
2 | import jsonpickle
3 | import os
4 | import yaml
5 |
6 | class BaseConfig(object):
7 | def __init__(self, to_box=False):
8 | pass
9 |
10 | def _to_box(self):
11 | # Convert all dicts to boxes
12 | for key, val in self.__dict__.items():
13 | if isinstance(val, dict):
14 | self.__setattr__(key, Box(val))
15 |
16 | def _from_box(self):
17 |         # Convert all boxes back to dicts
18 | for key, val in self.__dict__.items():
19 | if isinstance(val, Box):
20 | self.__setattr__(key, val.to_dict())
21 |
22 | def save(self, save_path=None):
23 | r''' Save the config into a .json file
24 | Args:
25 | save_path (`str`):
26 |                 Where to save the created JSON file; if None, the default from `self.path` is used.
27 | '''
28 | if save_path is None:
29 | save_path = self.path.self
30 |
31 | # We want to save the dict here, not the whole class
32 | self._from_box()
33 | json_string = jsonpickle.encode({k:v for k,v in self.__dict__.items() if k != 'path'})
34 |
35 | with open(save_path, 'w') as f:
36 | f.write(json_string)
37 | self._to_box()
38 |
39 | @classmethod
40 | def load(cls, save_path):
41 | config = cls(to_box=False)
42 | # Read the jsonpickle string
43 | with open(save_path) as f:
44 | config_dict = jsonpickle.decode(f.read())
45 | config.merge_config(config_dict)
46 | config._to_box()
47 | return config
48 |
49 | def merge_config(self, config_dict):
50 | r''' Merge a config_dict with the existing config object.
51 | Args:
52 | config_dict (`dict`):
53 | A dictionary which key/values should be added to this class.
54 | '''
55 | for key in config_dict.keys():
56 | if key in self.__dict__ and isinstance(self.__dict__[key], dict):
57 | self.__dict__[key].update(config_dict[key])
58 | else:
59 | self.__dict__[key] = config_dict[key]
60 |
61 |
62 | class Config(BaseConfig):
63 | r''' There are probably nicer ways to do this, but I like this one.
64 | '''
65 | def __init__(self, yaml_path):
66 | self.yaml_path = yaml_path
67 | self.load_yaml(yaml_path)
68 |
69 | def reload_yaml(self):
70 | self.load_yaml(self.yaml_path)
71 |
72 | def load_yaml(self, yaml_path):
73 | _config = yaml.safe_load(open(yaml_path, 'r'))
74 | self.to_box = True
75 | self.base_path = './'
76 | self.datasets = {}
77 | self.name = 'opengpt'
78 |
79 | for k,v in _config.items():
80 | self.__setattr__(k, v)
81 |         # For fun, we will also keep the _config
82 | self._config = _config
83 |
84 | self.path = {'self': os.path.join(self.base_path, f'config_for_{self.name}.json')}
85 | if _config.get('static_paths', None):
86 | self.path.update(_config['static_paths'])
87 |
88 | if self.to_box:
89 | self._to_box()
90 |
91 | def create_dirs(paths):
92 | for path in paths:
93 | if isinstance(path, str):
94 | os.makedirs(os.path.dirname(path), exist_ok=True)
95 | elif isinstance(path, dict):
96 | create_dirs(path.values())
97 | create_dirs(self.path.values())
98 |
99 | # Create dirs for datasets, this is where all the data from one dataset will go
100 | for ds in self.datasets:
101 | os.makedirs(os.path.join(self.base_path, ds['name']), exist_ok=True)
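102 | 
103 | 
104 | # A minimal usage sketch (not part of the library). The YAML below is a made-up
105 | # example; the real training configs live in /configs and define many more keys.
106 | if __name__ == '__main__':
107 |     import tempfile
108 | 
109 |     example_yaml = (
110 |         "name: demo\n"
111 |         "special_tokens:\n"
112 |         "  user: '<|user|>'\n"
113 |         "  ai: '<|ai|>'\n"
114 |     )
115 |     with tempfile.NamedTemporaryFile('w', suffix='.yaml', delete=False) as f:
116 |         f.write(example_yaml)
117 |     config = Config(yaml_path=f.name)
118 |     # Dicts from the YAML are wrapped in python-box Boxes, so attribute access works
119 |     print(config.special_tokens.user)  # -> <|user|>
120 |     print(config.path.self)  # default location for the saved config json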
--------------------------------------------------------------------------------
/opengpt/data_collator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | class DataCollatorWithPadding(object):
4 | r''' Will pad or trim examples to the appropriate length.
5 | '''
6 | def __init__(self, pad_token_id, ignore_index, max_seq_len):
7 | self.pad_token_id = pad_token_id
8 | self.ignore_index = ignore_index
9 | self.max_seq_len = max_seq_len
10 |
11 | def __call__(self, instances):
12 | input_ids, labels = tuple([torch.tensor(instance[key][0:self.max_seq_len]) for instance in instances] for key in ("input_ids", "labels"))
13 | batch = {}
14 |
15 | batch['input_ids'] = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id)
16 | batch['labels'] = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=self.ignore_index)
17 | batch['attention_mask'] = batch['input_ids'].ne(self.pad_token_id)
18 |
19 | return batch
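20 | 
21 | 
22 | # A small, self-contained sketch (not part of the library) of how the collator
23 | # pads a batch; pad_token_id=0 and ignore_index=-100 are assumed example values.
24 | if __name__ == '__main__':
25 |     collator = DataCollatorWithPadding(pad_token_id=0, ignore_index=-100, max_seq_len=8)
26 |     instances = [
27 |         {'input_ids': [5, 6, 7], 'labels': [-100, 6, 7]},
28 |         {'input_ids': [5, 6, 7, 8, 9], 'labels': [-100, -100, 7, 8, 9]},
29 |     ]
30 |     batch = collator(instances)
31 |     # input_ids are right-padded with 0, labels with -100, and attention_mask
32 |     # is False exactly where padding was added
33 |     print(batch['input_ids'])
34 |     print(batch['labels'])
35 |     print(batch['attention_mask'])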
--------------------------------------------------------------------------------
/opengpt/dataset_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import math
3 | import os
4 | import json
5 | import hashlib
6 | from tqdm.auto import tqdm
7 | from opengpt import parsers, teachers
8 | import logging
9 | import random
10 |
11 |
12 | def split_csv_by_max_len(datasets, max_len, tokenizer, base_path):
13 |     r''' Given a tokenizer, split each dataset (based on the `text` column) into sequences of at most max_len tokens
14 | '''
15 | for dataset in tqdm(datasets, desc='Datasets', total=len(datasets)):
16 | csv_path = dataset['path']
17 | name = dataset['name']
18 |
19 | nrows = None
20 | if dataset.get('nrows', -1) > 0:
21 | nrows = dataset['nrows']
22 |
23 | df = pd.read_csv(csv_path, nrows=nrows)
24 | cols = df.columns
25 | assert 'text' in cols, f'The CSV for dataset {name} has no "text" column.'
26 |
27 | new_data = [list(cols) + ['len', 'part']]
28 | for _, row in tqdm(df.iterrows(), desc=dataset['name'], total=len(df)):
29 | text = row['text']
30 | tokens = tokenizer.encode(text)
31 |
32 | for i in range(math.ceil(len(tokens) / max_len)):
33 | new_text = tokenizer.decode(tokens[i*max_len:(i+1)*max_len])
34 | new_data_row = [row[c] if c != 'text' else new_text for c in cols]
35 | new_data_row.append(len(tokens[i*max_len:(i+1)*max_len]))
36 | new_data_row.append(f'part_{i}')
37 | new_data.append(new_data_row)
38 |
39 | # Save
40 | new_df = pd.DataFrame(new_data[1:], columns=new_data[0])
41 | new_df.to_csv(os.path.join(base_path, name, 'data_split_by_length.csv'), index=False)
42 | logging.warning(f'{dataset["name"]}: length before vs after: {len(df)} vs {len(new_df)}\n')
43 |
44 |
45 | def create_dataset_no_input(config):
46 |     r''' Generates a new dataset from a prompt alone; no input dataset is required
47 | '''
48 | prompt_db = json.load(open(config.path.prompt_db, 'rb'))
49 | raw_data_columns = ['id', 'raw_output', 'prompt_hash']
50 | raw_data = pd.DataFrame(None, columns=raw_data_columns)
51 | raw_data_path = os.path.join(config.base_path, config.name, f"raw_generated_data_for_{config.name}.csv")
52 | if os.path.exists(raw_data_path):
53 | raw_data = pd.read_csv(raw_data_path)
54 |         logging.warning(f"Loading an existing openai generated dataset found at: {raw_data_path}. " +
55 |                         f"There are already {len(raw_data)} rows in that dataset; the generation will continue from where it last left off. " +
56 |                         f"The script will also do all examples that were not done in the previous run.")
57 |
58 |
59 | teacher = getattr(teachers, f'ask_{config.teacher.name}')
60 | for prompt_config in config.prompts:
61 | prompts = [prompt for prompt in prompt_db if prompt['hash'] in prompt_config['hashes']] # There must be one
62 |
63 | parameters = prompt_config.get('extra_parameters', {})
64 |
65 | for language in prompt_config.get('languages', ['English']):
66 | parameters['language'] = language
67 | logging.warning(f"\nStarting prompts: {prompt_config['hashes']}\n #Runs: {prompt_config['runs']}\nLanguage: {language}")
68 | for prompt in prompts:
69 |                 # If some examples exist already, continue the generation
70 |                 # from where the previous run stopped
71 | 
72 |                 start = len(raw_data[raw_data.prompt_hash == prompt['hash']])
73 | for _ in tqdm(range(start, prompt_config['runs']), total=(prompt_config['runs'] - start)):
74 | prompt_text_template = prompt['text']
75 | prompt_text = prompt_text_template.format(**parameters)
76 | try:
77 | out = teacher(prompt_text, config)
78 | new_data = pd.DataFrame([[len(raw_data), out, prompt['hash']]], columns=raw_data_columns)
79 | raw_data = pd.concat([raw_data, new_data], ignore_index=True)
80 |
81 | if len(raw_data) % config.data_generation_checkpoint_every == 0:
82 | logging.warning("Checkpointing the generated dataset.")
83 | raw_data.to_csv(raw_data_path, index=False)
84 |
85 | except Exception as e:
86 | logging.exception(e)
87 | logging.warning(f"Skipping example for prompt: {prompt['hash']}\n")
88 |
89 | if raw_data is not None and len(raw_data) > 0:
90 | raw_data.to_csv(raw_data_path, index=False)
91 |
92 | return raw_data
93 |
94 |
95 | def create_dataset(config):
96 | prompt_db = json.load(open(config.path.prompt_db, 'rb'))
97 | raw_data_columns = ['id', 'raw_output', 'dataset', 'language', 'run', 'prompt_hash', 'prompt_text_hash', 'context']
98 | raw_data = pd.DataFrame(None, columns=raw_data_columns)
99 | prepared_data = None
100 | raw_data_path = os.path.join(config.base_path, config.name, f"raw_generated_data_for_{config.name}.csv")
101 | prepared_data_path = os.path.join(config.base_path, config.name, f"prepared_generated_data_for_{config.name}.csv")
102 | if os.path.exists(raw_data_path) and os.path.exists(prepared_data_path):
103 | raw_data = pd.read_csv(raw_data_path)
104 | prepared_data = pd.read_csv(prepared_data_path)
105 |         logging.warning(f"Loading an existing openai generated dataset found at: \n{raw_data_path}\n and\n{prepared_data_path}\n" +
106 |                         f"There are already {len(raw_data)} rows in that dataset; the generation will continue from where it last left off. " +
107 |                         f"The script will also do all examples that were not done in the previous run.\n" +
108 |                         "***Take care: if prompt_config['random_prompt'] is set to true, it can produce unwanted results.\n\n")
109 |
110 | cnt = 0
111 | for prompt_config in config.prompts:
112 | prompts = [prompt for prompt in prompt_db if prompt['hash'] in prompt_config['hashes']] # There must be one
113 | teacher = getattr(teachers, f'ask_{config.teacher.name}')
114 |
115 | for run in range(prompt_config.get('runs', 1)):
116 | parameters = prompt_config.get('extra_parameters', {})
117 | extra_data_columns = prompt_config.get('extra_data_columns', [])
118 |
119 | for language in prompt_config.get('languages', ['English']):
120 | parameters['language'] = language
121 | logging.warning(f"\nStarting prompts: {prompt_config['hashes']}\nRun: {run}\nLanguage: {language}")
122 | for dataset_name in prompt_config['datasets']:
123 | df = pd.read_csv(os.path.join(config.base_path, dataset_name, 'data_split_by_length.csv'))
124 | for row_ind, row in tqdm(df.iterrows(), desc=dataset_name, total=len(df)):
125 | # Set the context from the current row
126 | parameters['context'] = row['text']
127 | for col in extra_data_columns:
128 | parameters[col] = row[col]
129 | if prompt_config.get('random_prompt', False):
130 |                             # This means for each example in the dataset we randomly select a prompt to be used;
131 |                             # if False, every example will run through every prompt
132 | selected_prompts = [random.choice(prompts)]
133 | else:
134 | selected_prompts = prompts # Use all prompts sequentially
135 | for prompt in selected_prompts:
136 | prompt_text_template = prompt['text']
137 | # Every prompt has its own parser
138 | parser = getattr(parsers, prompt['parser'])
139 | if len(str(row['text']).split(" ")) > config.teacher.min_len:
140 | prompt_text = prompt_text_template.format(**parameters)
141 | # The hash is of everything that is used to generate the output
142 | h = hashlib.sha256(prompt_text.encode("utf-8"))
143 | h.update(str(run).encode("utf-8"))
144 | h = h.hexdigest()
145 |
146 | # Only get the output if this was not done already
147 | if h not in raw_data.prompt_text_hash.values:
148 |                                 # Get the output from OpenAI and parse it; the parser appends the parsed examples to prepared_data.
149 | try:
150 | openai_output = teacher(prompt_text, config)
151 | prepared_data = parser(data=openai_output, prepared_data=prepared_data, prompt_config=prompt_config, config=config, row=row,
152 | raw_data_id=len(raw_data), prompt_text=prompt_text) # ID is length of raw_data
153 |
154 | # Concat the current output to the data dataframe, only if not None
155 | if prepared_data is not None and len(prepared_data) > 0:
156 | new_data = pd.DataFrame([[len(raw_data), openai_output, dataset_name, language, run, prompt['hash'], h, parameters['context']]],
157 | columns=raw_data_columns)
158 | raw_data = pd.concat([raw_data, new_data], ignore_index=True)
159 | if len(raw_data) % config.data_generation_checkpoint_every == 0:
160 | logging.warning("Checkpointing the generated dataset.")
161 | raw_data.to_csv(raw_data_path, index=False)
162 | prepared_data.to_csv(prepared_data_path, index=False)
163 | except Exception as e:
164 | logging.exception(e)
165 | logging.warning(f"Skipping example at position: {row_ind} for dataset: {dataset_name}\n")
166 | # Final save
167 | if raw_data is not None and prepared_data is not None and len(raw_data) > 0 and len(prepared_data) > 0:
168 | raw_data.to_csv(raw_data_path, index=False)
169 | prepared_data.to_csv(prepared_data_path, index=False)
170 | return raw_data, prepared_data
171 |
172 |
173 | def create_labels(examples, config, tokenizer):
174 | r''' This is used with a prepared HF dataset that is already tokenized. It will add labels
175 | so that only the AI generated parts (answers) will be trained on.
176 | '''
177 |
178 | user_token_id = tokenizer.vocab[config.special_tokens.user]
179 | ai_token_id = tokenizer.vocab[config.special_tokens.ai]
180 | # Everything written by an AI will be used for training, and everything by a user will be ignored
181 |
182 | examples['labels'] = []
183 | for i in range(len(examples['input_ids'])):
184 | labels = []
185 | ignore = True
186 | for tkn_id in examples['input_ids'][i]:
187 | if tkn_id == user_token_id:
188 | ignore = True
189 | elif tkn_id == ai_token_id:
190 | ignore = False
191 |
192 | if ignore:
193 | labels.append(config.train.ignore_index)
194 | else:
195 | labels.append(tkn_id)
196 | examples['labels'].append(labels)
197 | return examples
198 |
199 |
200 | def pack_examples(examples, block_size, packing_type='partial'):
201 |     r''' Used with a prepared HF dataset; will pack/group examples. Use with care, it can mess up many things
202 |     if the input is not formatted properly (requires the <|eod|> token). See the toy example at the end of this module.
203 |
204 | packing_type: partial/full/no
205 | '''
206 | # Concatenate all texts.
207 | if packing_type == 'partial':
208 | result = {k:[] for k in examples.keys()}
209 |         _key = list(examples.keys())[0] # Take any key; all columns have the same length
210 | new_example = {k:[] for k in examples.keys()}
211 |
212 | for ind in range(len(examples[_key])):
213 | # Trim long sequences to block_size, this is required for partial packing
214 | example = {k:v[ind][0:block_size] for k,v in examples.items()}
215 | if len(new_example[_key]) + len(example[_key]) > block_size:
216 | result = {k:result[k] + [v] for k,v in new_example.items()}
217 | new_example = example
218 | else:
219 | new_example = {k:new_example[k] + v for k,v in example.items()}
220 | # Add the last example if there is something to add
221 | if len(new_example[_key]) > 0:
222 | result = {k:result[k] + [v] for k,v in new_example.items()}
223 | elif packing_type == 'full':
224 | # Full packing
225 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
226 | total_length = len(concatenated_examples[list(examples.keys())[0]])
227 | total_length = (total_length // block_size) * block_size
228 | # Split by chunks of max_len.
229 | result = {
230 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
231 | for k, t in concatenated_examples.items()
232 | }
233 | else:
234 | # Do nothing
235 | result = examples
236 | return result
237 |
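238 | 
239 | # Toy illustrations (not part of the library) of two helpers above; the token
240 | # ids and special-token vocab are made up for the demo.
241 | if __name__ == '__main__':
242 |     from box import Box
243 | 
244 |     # Partial packing: short examples are merged until block_size would be exceeded
245 |     examples = {'input_ids': [[1, 2, 3], [4, 5], [6, 7, 8, 9]]}
246 |     packed = pack_examples(examples, block_size=5, packing_type='partial')
247 |     print(packed['input_ids'])  # -> [[1, 2, 3, 4, 5], [6, 7, 8, 9]]
248 | 
249 |     # create_labels: only tokens from <|ai|> onwards are kept; the rest become ignore_index
250 |     config = Box({'special_tokens': {'user': '<|user|>', 'ai': '<|ai|>'},
251 |                   'train': {'ignore_index': -100}})
252 |     tokenizer = Box({'vocab': {'<|user|>': 10, '<|ai|>': 11}})
253 |     labelled = create_labels({'input_ids': [[10, 1, 2, 11, 3, 4]]}, config, tokenizer)
254 |     print(labelled['labels'])  # -> [[-100, -100, -100, 11, 3, 4]]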
--------------------------------------------------------------------------------
/opengpt/model_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | def add_tokens_to_model_and_tokenizer(config, tokenizer, model):
4 | ntkns = tokenizer.add_tokens(list(config.special_tokens.values()))
5 | logging.warning(f"Added: {ntkns} tokens to the tokenizer")
6 | if ntkns > 0:
7 |         # Resize first: resize_token_embeddings creates new embedding tensors, so the
8 |         # weights must be fetched after it runs; the new tokens occupy the last ntkns rows
9 |         model.resize_token_embeddings(len(tokenizer))
10 |         input_embeddings = model.get_input_embeddings().weight.data
11 |         output_embeddings = model.get_output_embeddings().weight.data
12 |         input_embeddings[-ntkns:] = input_embeddings[:-ntkns].mean(dim=0, keepdim=True)
13 |         output_embeddings[-ntkns:] = output_embeddings[:-ntkns].mean(dim=0, keepdim=True)
14 |
15 | # Set the eos and pad tokens properly
16 | tokenizer.add_special_tokens({"eos_token": config.special_tokens.eos, "pad_token": config.special_tokens.pad})
17 | model.config.eos_token_id = tokenizer.eos_token_id
18 |
19 | assert model.get_input_embeddings().num_embeddings == len(tokenizer)
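20 | 
21 | 
22 | # A hedged usage sketch (not part of the library): add the special tokens from a
23 | # config to a small model. 'gpt2' is an arbitrary example model, and the config
24 | # path assumes the script is run from the repository root.
25 | if __name__ == '__main__':
26 |     from transformers import AutoModelForCausalLM, AutoTokenizer
27 |     from opengpt.config import Config
28 | 
29 |     config = Config(yaml_path='configs/example_train_config.yaml')
30 |     tokenizer = AutoTokenizer.from_pretrained('gpt2')
31 |     model = AutoModelForCausalLM.from_pretrained('gpt2')
32 |     add_tokens_to_model_and_tokenizer(config, tokenizer, model)
33 |     # Each special token now maps to a single id at the end of the vocabulary
34 |     print(tokenizer.convert_tokens_to_ids(config.special_tokens.ai))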
--------------------------------------------------------------------------------
/opengpt/parsers.py:
--------------------------------------------------------------------------------
1 | r'''
2 | Parsers are used to parse the output from a Teacher (OpenAI, Google, ...) into the right format. The purpose of the parser is to
3 | parse the new output and append it to the prepared_data. Every parser will receive:
4 | - data: the new data output from a Teacher model
5 | - prepared_data: the dataset we are creating, in other words old data that was output by a parser
6 | - prompt_config: the prompt_config for the current prompt as a dictionary (taken from the .yaml file)
7 |  - config: general config, i.e. the whole .yaml file as a python-box (can be used as a dictionary)
8 |  - row: the row from the original CSV that was used as context to generate the `data`; can be empty depending on the use-case
9 | - raw_data_id: the ID of the `data` in the raw_data CSV (used to store the raw output from OpenAI)
10 | - prompt_text: the prepared prompt that was used to generate `data`
11 |
12 | If we are running the parser for the first time, the `prepared_data` will be empty (None) and it is up to us to define how that prepared_data (e.g. CSV) should look. Every parser can have different columns depending on the use-case.
13 |
14 | If the parser will output the final prepared data that will be used for model training, it should append the special tokens: config.special_tokens.[user, ai, eos, eod];
15 | have a look at the functions below (e.g. csv_qa_parser) and the toy run at the end of this module.
16 | '''
17 |
18 | import pandas as pd
19 | from io import StringIO
20 | import re
21 | import logging
22 |
23 | def csv_qa_parser(data, prepared_data, prompt_config, config, row, raw_data_id, prompt_text):
24 | r''' Expects data in the CSV format, with the separator `;`, the dataframe has to have two columns: `Question`, `Answer`
25 | '''
26 | qa_pairs = None
27 | df = pd.read_csv(StringIO(data), sep=';')
28 |
29 | # Strip everything
30 | df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
31 |
32 | ref_col = prompt_config.get('reference_column_to_append', None)
33 | if ref_col and row is not None and ref_col in row and row[ref_col]:
34 | # Means we want to append a reference at the end of each Answer
35 | to_append = f"\nReferences:\n- {row[ref_col]}"
36 | df['Answer'] = df['Answer'] + to_append
37 | df['Question'] += f' {config.special_tokens.eos}' # Every Q/A pair is independent
38 | df['Answer'] += f' {config.special_tokens.eos} {config.special_tokens.eod}'
39 | qa_pairs = [f'{config.special_tokens.user} {q.strip()} {config.special_tokens.ai} {a.strip()}' for q,a in df[['Question', 'Answer']].values]
40 |
41 | new_data = pd.DataFrame([[text, raw_data_id] for text in qa_pairs], columns=['text', 'raw_data_id'])
42 | if prepared_data is None:
43 | prepared_data = new_data
44 | else:
45 | prepared_data = pd.concat([prepared_data, new_data], ignore_index=True)
46 |
47 | return prepared_data
48 |
49 |
50 | instruction_text = re.compile(r'Instruction:?(.*?)Input:', re.DOTALL)
51 | input_text = re.compile(r'Input:?(.*?)Output:?', re.DOTALL)
52 | output_text = re.compile(r'Output:?(.*?)$', re.DOTALL)
53 | def task_parser(data, prepared_data, prompt_config, row, config, raw_data_id, prompt_text):
54 | r''' This parser can be used with prompts similar to Alpaca, it expects `data` in the following format:
55 | Task:
56 | Instruction:
57 | Input:
58 | Output:
59 |
60 | Task:
61 | Instruction:
62 | Input:
63 | Output:
64 | .
65 | .
66 | .
67 | '''
68 | tasks = re.split(r'[1-9 \.]*Task[:\s]*', str(data))
69 | st = config.special_tokens
70 | new_data = []
71 |     for task in tasks:
72 |         matches = [re.search(p, task) for p in (instruction_text, input_text, output_text)]
73 |         if not all(matches):
74 |             continue  # Skip empty splits and any preamble that does not contain a full task
75 |         ins, inp, out = (m.group(1).strip() for m in matches)
76 |
77 | if inp:
78 | if inp.startswith('"'):
79 | inp = inp[1:]
80 | if inp.endswith('"'):
81 | inp = inp[:-1]
82 |         if inp:
83 |             inp = '\n' + inp  # Prefix with a newline so the input sits under the instruction
84 |         # NOTE: when inp is empty, `inp in ins` below is always True,
85 |         # so an empty input is simply never appended to the instruction
86 |
87 | if ins and out:
88 | if inp in ins:
89 | new_data.append((len(prepared_data), f'{st.user} {ins} {st.eos} {st.ai} {out} {st.eos} {st.eod}', raw_data_id))
90 | else:
91 | new_data.append((len(prepared_data), f'{st.user} {ins}{inp} {st.eos} {st.ai} {out} {st.eos} {st.eod}', raw_data_id))
92 |
93 | new_data = pd.DataFrame(new_data, columns=['text', 'raw_data_id'])
94 | if prepared_data is None:
95 | prepared_data = new_data
96 | else:
97 | prepared_data = pd.concat([prepared_data, new_data], ignore_index=True)
98 |
99 | return prepared_data
100 |
101 |
102 | def simple_task_parser(data, prepared_data, prompt_config, row, config, raw_data_id, prompt_text):
103 |     r''' This parser can be used with prompts similar to Alpaca, but that only have Instructions; it expects data as:
104 |     Task Number:
105 |     Instruction:
106 | 
107 |     Task Number:
108 |     Instruction:
109 |     .
110 |     .
111 |     .
112 | 
113 |     This parser is used as an intermediate step, so the output is a CSV with columns `text`, `instruction`, `raw_data_id`
114 |     '''
115 | tasks = [x.replace("Instruction:", "").strip() for x in re.split(r'[1-9 \.]*Task Number[:\s]*[\d\n]*', str(data)) if x.strip()]
116 |     # Build one row per task; tasks were already stripped when they were split out above
117 |     new_data = pd.DataFrame(
118 |         [[row['text'], task, raw_data_id] for task in tasks],
119 |         columns=['text', 'instruction', 'raw_data_id'],
120 |     )
121 | if prepared_data is None:
122 | prepared_data = new_data
123 | else:
124 | prepared_data = pd.concat([prepared_data, new_data], ignore_index=True)
125 |
126 | return prepared_data
127 |
128 |
129 | def medical_conversation_parser(data, prepared_data, prompt_config, config, row, raw_data_id, prompt_text):
130 | r''' It expects data to be in form of a conversation, like:
131 | Patient:
132 | AI-Assistant:
133 | Patient:
134 | .
135 | .
136 | .
137 |     The actor names 'Patient' and 'AI-Assistant' have to match exactly
138 | '''
139 | conversation = None
140 |
141 | # Merge the extractions into one conversation
142 | data = re.split(r'\s*(Patient\s*:|AI-Assistant\s*:)\s*', data)[1:]
143 | if len(data) > 0:
144 | conversation = ""
145 | to_append = None
146 |
147 | ref_col = prompt_config.get('reference_column_to_append', None)
148 | if ref_col and ref_col in row and row[ref_col]:
149 | # Means we want to append a reference at the end of each Answer
150 | to_append = f"\nReferences:\n- {row[ref_col]}"
151 |
152 | actor = None
153 | for message in data:
154 | message = message.strip()
155 | if message in ['Patient:', 'AI-Assistant:', 'Patient', 'AI-Assistant', 'Patient :', 'AI-Assistant :']:
156 | actor = message
157 | elif actor is not None: #TODO: Make this nicer
158 | if actor in ['Patient:', 'Patient :', 'Patient']:
159 | conversation += f'{config.special_tokens.user} {message} {config.special_tokens.eos} '
160 | elif actor in ['AI-Assistant:', 'AI-Assistant :', 'AI-Assistant']:
161 | conversation += f'{config.special_tokens.ai} {message}'
162 | if to_append is not None and to_append:
163 | conversation += to_append
164 | conversation += f" {config.special_tokens.eos} "
165 | if conversation:
166 | conversation = conversation.strip() + f" {config.special_tokens.eod}"
167 |
168 | new_data = pd.DataFrame([[conversation, raw_data_id]], columns=['text', 'raw_data_id'])
169 | if prepared_data is None:
170 | prepared_data = new_data
171 | else:
172 | prepared_data = pd.concat([prepared_data, new_data], ignore_index=True)
173 |
174 | return prepared_data
175 |
176 |
177 | def csv_ner_parser(data, prepared_data, prompt_config, config, row, raw_data_id, prompt_text):
178 | r''' Expects data in CSV format, using the `;` separator
179 | '''
180 | df = pd.read_csv(StringIO(data), sep=';', engine='python')
181 | df['raw_data_id'] = raw_data_id
182 |
183 | if prepared_data is None:
184 | prepared_data = df
185 | else:
186 | prepared_data = pd.concat([prepared_data, df], ignore_index=True)
187 |
188 | return prepared_data
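189 | 
190 | 
191 | # A toy run of `csv_qa_parser` (not part of the library). The CSV string below
192 | # mimics a teacher response, and the config stub carries only the special tokens
193 | # the parser needs; everything here is illustrative.
194 | if __name__ == '__main__':
195 |     from box import Box
196 | 
197 |     config = Box({'special_tokens': {'user': '<|user|>', 'ai': '<|ai|>',
198 |                                      'eos': '<|eos|>', 'eod': '<|eod|>'}})
199 |     data = 'Question;Answer\nWhat is hypertension?;High blood pressure.'
200 |     out = csv_qa_parser(data=data, prepared_data=None, prompt_config={}, config=config,
201 |                         row=None, raw_data_id=0, prompt_text='')
202 |     # -> <|user|> What is hypertension? <|eos|> <|ai|> High blood pressure. <|eos|> <|eod|>
203 |     print(out['text'][0])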
--------------------------------------------------------------------------------
/opengpt/prompt_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import hashlib
3 | import os
4 | import logging
5 |
6 | def add_to_prompt_database(text, description, parser, database_path, force_replace=False):
7 | r''' The database is a simple json file where all the prompts are saved.
8 | '''
9 | if os.path.exists(database_path):
10 | logging.info(f"Loading db from: {database_path}")
11 | db = json.load(open(database_path, 'r'))
12 | hashes = set([prompt['hash'] for prompt in db])
13 | else:
14 | db = []
15 | hashes = set()
16 |
17 | # Good enough for what we need
18 | h = hashlib.sha256(text.encode("utf-8")).hexdigest()[:10]
19 | if force_replace and h in hashes:
20 | logging.warning("Found an existing prompt with the same hash, it will be replaced with the new one.")
21 |         # Remove the existing prompt with the same hash as the current one
22 | db = [prompt for prompt in db if prompt['hash'] != h]
23 | hashes = set([prompt['hash'] for prompt in db])
24 | if h not in hashes:
25 | db.append({
26 | 'hash': h,
27 | 'text': text,
28 | 'description': description,
29 | 'parser': parser
30 | })
31 |
32 | json.dump(db, open(database_path, 'w'), indent=2)
33 | logging.warning(f"Added prompt: {h}")
34 | else:
35 |         logging.warning("The prompt is already in the database. It will not be added; use force_replace if you want to replace it.")
36 |
37 | return db
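38 | 
39 | 
40 | # A short sketch (not part of the library): add a prompt to a throwaway database
41 | # file. The prompt text, parser name and path below are illustrative only.
42 | if __name__ == '__main__':
43 |     db = add_to_prompt_database(
44 |         text='Write {quantity} questions about: {context}',
45 |         description='Toy prompt for demonstration',
46 |         parser='csv_qa_parser',
47 |         database_path='demo_prompt_db.json',
48 |     )
49 |     print(db[-1]['hash'])  # The 10-character sha256 prefix used to reference the prompt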
--------------------------------------------------------------------------------
/opengpt/teachers.py:
--------------------------------------------------------------------------------
1 | import openai
2 |
3 | def ask_openai(prompt, config):
4 | response = openai.ChatCompletion.create(
5 | model = config.teacher.model,
6 | messages = [
7 | {"role": "user", "content": prompt},
8 | ]
9 | )
10 |
11 | message = None
12 | if response['choices'][0]['finish_reason'] == 'stop':
13 | message = response['choices'][0]['message']['content']
14 |
15 | return message
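16 | 
17 | 
18 | # A hedged sketch (not part of the library): calling the teacher directly. It
19 | # assumes an OPENAI_API_KEY in the environment and uses the pre-1.0
20 | # `openai.ChatCompletion` API that this module targets; the model name is an example.
21 | if __name__ == '__main__':
22 |     from box import Box
23 | 
24 |     config = Box({'teacher': {'model': 'gpt-3.5-turbo'}})
25 |     print(ask_openai('Say hello in one word.', config))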
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | .
2 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | from setuptools.command.install import install
3 | from setuptools.command.develop import develop
4 | from setuptools.command.egg_info import egg_info
5 |
6 | with open("./README.md", "r") as fh:
7 | long_description = fh.read()
8 |
9 | setuptools.setup(
10 | name="opengpt",
11 | version="0.0.5",
12 | author="w-is-h",
13 | author_email="w.kraljevic@gmail.com",
14 |     description="OpenGPT, a framework for producing grounded domain-specific LLMs, and NHS-LLM, a conversational model for healthcare made using OpenGPT.",
15 | long_description=long_description,
16 | long_description_content_type="text/markdown",
17 | url="https://github.com/cogstack/opengpt",
18 | packages=['opengpt'],
19 | install_requires=[
20 | 'datasets>=2,<3',
21 | 'transformers>=4.2,<5',
22 | 'tiktoken>=0.3.2',
23 | 'pandas',
24 | 'openai',
25 | 'numpy',
26 | 'tqdm',
27 | 'python-box',
28 | 'jsonpickle',
29 | ],
30 | classifiers=[
31 | "Programming Language :: Python :: 3",
32 | "License :: OSI Approved :: MIT License",
33 | "Operating System :: OS Independent",
34 | ],
35 | )
36 |
--------------------------------------------------------------------------------