├── .gitignore ├── MODEL_LICENSE.md ├── README.md ├── README_EN.md ├── finetune ├── data │ └── coig_10k.json ├── ds_config.json └── finetune.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /MODEL_LICENSE.md: -------------------------------------------------------------------------------- 1 | # LingoWhale-8B模型许可协议 2 | 3 | ## 1. 定义 4 | - “发布方”:指发布源模型的LingoWhale-8B模型团队。 5 | - “源模型”:指根据本许可提供的LingoWhale-8B模型参数。 6 | - “使用方”:指根据本协议使用源模型的单位或个人。 7 | 8 | ## 2. 许可内容 9 | 根据本许可的条款和条件,发布方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可。 10 | 11 | 上述版权声明和本许可声明应包含在此源模型的所有副本或重要部分中。 12 | 13 | ## 3. 限制 14 | 您不得出于任何军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建此源模型的全部或部分衍生品。 15 | 16 | 您不得利用此源模型从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。 17 | 18 | ## 4. 免责声明 19 | 此源模型“按原样”提供,不提供任何明示或暗示的保证,包括但不限于对适销性、特定用途的适用性和非侵权性的保证。在任何情况下,作者或版权持有人均不对任何索赔、损害或其他责任负责,无论是在合同诉讼、侵权行为还是其他方面,由源模型或源模型的使用或其他交易引起、由源模型引起或与之相关模型。 20 | 21 | ## 5. 责任限制 22 | 除适用法律禁止的范围外,在任何情况下且根据任何法律理论,无论是基于侵权行为、疏忽、合同、责任或其他原因,任何发布方均不对您承担任何直接、间接、特殊、偶然、示范性、 或间接损害,或任何其他商业损失,即使使用方已被告知此类损害的可能性。 23 | 24 | ## 6. 争议解决 25 | 本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议,由发布方住所地人民法院管辖。 26 | 27 | 请注意,许可证可能会更新到更全面的版本。 有关许可和版权的任何问题,请通过[license@deeplang.ai](mailto:license@deeplang.ai)与我们联系。 28 | 29 | ## 7. 商业使用 30 | 若您期望基于本协议的许可条件与限制,将此源模型或其衍生品用作商业用途,请您填写[问卷](https://deeplang.feishu.cn/share/base/form/shrcnzt705EDAvVrmAJUCyg7TIb)进行申请,通过后即可**免费商用**。 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | English  |  中文 3 |

4 |
5 | 6 |
7 |

8 | LingoWhale-8B 9 |

10 |
11 | 12 |

13 | 🤗 Hugging Face • 🤖 ModelScope • ⛵ Wisemodel 14 |

15 | 16 |
17 | 18 | 深言科技联合清华大学NLP实验室开源语鲸-8B模型 🎉 19 | 20 |
21 | 22 | # 目录 23 | 24 | - [目录](#目录) 25 | - [模型介绍](#模型介绍) 26 | - [测评结果](#测评结果) 27 | - [生成样例](#生成样例) 28 | - [部署和推理](#部署和推理) 29 | - [微调方法](#微调方法) 30 | - [开源协议](#开源协议) 31 | 32 | # 模型介绍 33 | 34 | LingoWhale-8B是由深言科技推出的语鲸系列大模型中首个开源的中英双语大语言模型。 35 | 36 | LingoWhale-8B模型在数万亿token的高质量中英数据上进行预训练,具有强大的基础能力,在多个公开评测基准上均达到领先效果。在预训练阶段,模型使用8K的上下文长度进行训练,能够完成更长上下文的理解和生成任务。 37 | 38 | LingoWhale-8B模型对学术研究完全开放,使用方通过邮件申请并获得官方商用许可后,即可免费商用。 39 | 40 | 在开源模型权重的同时,我们也提供了符合用户习惯的Huggingface推理接口以及LoRA等参数高效微调示例,便于开发者快速使用LingoWhale-8B模型。 41 | 42 | 受模型参数量影响,大模型固有的幻觉问题、数学计算能力相对较弱等问题在LingoWhale-8B模型中仍然存在。请大家在使用前了解这些问题,评估可能存在的风险。后续版本的LingoWhale模型将会针对此类问题进行重点优化。 43 | 44 | 45 | # 测评结果 46 | 47 | 我们在以下公开评测基准上进行了测试: 48 | 49 | - [C-Eval](https://arxiv.org/abs/2305.08322)是一个中文基础模型评估基准,包含了13948个多项选择题,涵盖了52个不同的学科和四个难度级别。它旨在评估中文语言模型的能力。我们使用该数据集的dev集作为few-shot的来源,在test集上进行了5-shot测试。 50 | - [MMLU](https://arxiv.org/abs/2009.03300)是一个英文基础模型评估基准,涵盖了基本数学、美国历史、计算机科学、法律等多个领域,共包含57个任务。它用于评估语言模型在不同领域任务上的表现。我们对模型进行了5-shot测试。 51 | - [CMMLU](https://arxiv.org/abs/2306.09212)是一个中文评估基准,涵盖了从基础学科到高级专业水平的67个主题。它用于评估中文语言模型在知识和推理能力方面的表现。我们使用该数据集的dev集作为few-shot的来源,在test集上进行了5-shot测试。 52 | - [Gaokao](https://arxiv.org/abs/2305.12474)是一个以中国高考题目为数据集的评估基准。它旨在提供测评中文语言模型在语言理解能力和逻辑推理能力方面的能力。我们只保留了其中的四选一的选择题,随机划分后对模型进行了5-shot测试。 53 | - [HumanEval](https://arxiv.org/abs/2107.03374)是一个包含上百个编程问题的英文评估基准。它用于评估语言模型在程序理解与生成能力方面的表现。我们采用了zero-shot计算Pass@1的方法对模型进行了测试。 54 | - [GSM8K](https://arxiv.org/abs/2110.14168)是一个由高质量、语言多样化的小学数学应用题组成的数据集。它要求根据给定的场景选择最合理的方案,用于评估语言模型在数学应用方面的能力。我们对模型进行了8-shot测试。 55 | - [BBH](https://arxiv.org/abs/2210.09261)是一个从204项Big-Bench评测基准任务中选择出的表现较差的任务单独形成的评估基准。它用于评估大型语言模型在具有挑战性的任务上的表现。我们对模型进行了3-shot测试。 56 | - [AGIEval](https://arxiv.org/abs/2304.06364)是一项考察基础模型类人能力的基准测试,专门用于评估基础模型在人类认知和问题解决相关任务中的能力。我们只保留了其中的四选一的选择题,随机划分后对模型进行了5-shot测试。 57 | 58 | 59 | 这些评估基准提供了标准化的测试和度量,用于评估语言模型在不同任务和领域上的性能和能力。评测方法和测评结果如下表所示: 60 | 61 | | **Model** |**C-Eval**| **MMLU** |**CMMLU** |**GAOKAO**|**HumanEval**|**GSM8K** | **BBH** |**AGIEval**| 62 | |:-----------------------|:--------:|:--------:|:--------:|:--------:|:-----------:|:--------:|:--------:|:---------:| 63 | | | 5-shot | 5-shot | 5-shot | 5-shot | 0-shot | 8-shot | 3-shot | 5-shot | 64 | | **GPT-4** | 68.4 | 83.9 | 70.3 | 66.2 | 69.5 | 90.0 | 75.1 | 63.3 | 65 | | **GPT-3.5 Turbo** | 51.1 | 68.5 | 54.1 | 47.1 | 52.4 | 57.8 | 61.6 | 46.1 | 66 | | **LLaMA2-7B** | 28.9 | 45.7 | 31.4 | 26.0 | 12.8 | 16.2 | 39.2 | 26.5 | 67 | | **ChatGLM2-6B*** | 51.7 | 47.9 | - | - | - | 32.4 | 33.7 | - | 68 | | **Baichuan2-7B-Base*** | 54.0 | 54.2 | 57.1 | 47.5 | 18.3 | 24.5 | 41.6 | 42.7 | 69 | | **Qwen-7B v1.1*** | 63.5 | 58.2 | 62.2 | - | 29.9 | 51.7 | 45.0 | - | 70 | | **LingoWhale-8B-base** | 63.6 | 60.2 | 62.8 | 50.3 | 32.9 | 55.0 | 47.5 | 43.8 | 71 | 72 | 73 | *表示其模型结果来自于官方, 所有的结果都精确到小数点后1位。 74 | 75 | # 生成样例 76 | 77 | > 生成以下样例的模型是使用开源SFT数据基于LingoWhale-8B模型进行微调后的对话模型 78 | 79 |
数学计算

80 | 81 | **问题**:f(x)为偶函数, f(1) = 3, f(2) = 4, 那么f(1) + f(-2) ^2 等于多少? 请一步一步说明 82 | 83 | **解答**:首先,根据题目中给出的条件,我们可以知道: 84 | 85 | - f(x) 是一个偶函数,即 f(-x) = f(x)。 86 | - f(1) = 3。 87 | - f(2) = 4。 88 | 89 | 接下来,我们需要求出 f(1) + f(-2)^2 的值。 90 | 由于 f(x) 是偶函数,所以 f(-2) = f(2),因此 f(-2) = 4。 91 | 将 f(1) 和 f(-2) 的值代入 f(1) + f(-2)^2 中,得到: 92 | f(1) + f(-2)^2 = 3 + (4)^2 = 3 + 16 = 19。 93 | 因此,f(1) + f(-2)^2 的值为 19。 94 | 95 |

96 | 97 |
角色扮演

98 | 99 | **问题**:作为有丰富营销经验的品牌经理,请为我提供一条关于以下情境的品牌营销策略: 100 | 情境:618期间,提高品牌销售额 101 | 品牌定位:国产精品眉笔 102 | 建议要求:提供具有创新性和实效性的品牌营销策略,帮助提升品牌的销售额, 并加入直播带货的方案。 103 | 104 | **解答**:在618期间,为了提高品牌销售额,我们可以采取以下品牌营销策略: 105 | 106 | 1. 推出限时优惠活动:在618活动期间,我们可以在品牌官方网站上推出限时优惠活动,例如购买眉笔送赠品、打折促销等,以吸引消费者的注意力并促进销售。 107 | 108 | 2. 利用社交媒体宣传:我们可以通过社交媒体平台(如微博、微信、抖音等)发布品牌宣传内容,包括产品介绍、使用技巧、用户评价等,以增加品牌曝光率和用户互动性。 109 | 110 | 3. 与知名博主合作推广:我们可以邀请一些知名的美妆博主或网红来试用我们的产品,并在其社交媒体上发布推荐视频或图文,以吸引更多的目标受众。 111 | 112 | 4. 直播带货:我们可以邀请一些知名的带货主播或明星来为我们的产品进行直播带货,通过他们的影响力来吸引更多的消费者关注和购买。 113 | 114 | 5. 参与电商平台活动:我们可以积极参与各大电商平台的促销活动,例如满减、满赠等,以吸引更多的消费者购买我们的产品。 115 | 116 | 综上所述,以上品牌营销策略可以帮助我们在618期间提高品牌销售额,同时加入直播带货的方案可以更好地吸引目标受众,提高销售效果。 117 | 118 |

119 | 120 |
诗歌创作

121 | 122 | **问题**:写一个七言诗,描述岳飞的诗。诗词希望是宋代诗的风格。 123 | 124 | **解答**:《岳飞》 125 | 126 | 铁马金戈壮士魂, 127 | 128 | 精忠报国志凌云。 129 | 130 | 千古英雄谁可及, 131 | 132 | 岳家军威震乾坤。 133 | 134 |

135 | 136 |
文化知识

137 | 138 | **问题**:我要去成都武侯祠参观,能帮我详细介绍下武侯的生平事迹吗? 139 | 140 | **解答**:诸葛亮(181年-234年),字孔明,号卧龙,徐州琅琊阳都(今山东临沂市沂南县)人,三国时期蜀汉丞相,杰出的政治家、军事家、文学家、书法家、发明家。早年随叔父诸葛玄到荆州,诸葛玄死后,诸葛亮就在隆中隐居。刘备三顾茅庐请出诸葛亮,联合东吴孙权于赤壁之战大败曹军,形成三国鼎足之势,又夺占荆益。刘备入蜀后,诸葛亮镇守荆州。刘备称帝后,诸葛亮被任命为丞相,主持朝政。后主刘禅继位,诸葛亮被封为武乡侯,领益州牧。勤勉谨慎,大小政事必亲自处理,赏罚严明;与东吴联盟,改善和西南各族的关系;实行屯田政策,加强战备。前后五次北伐中原,多以粮尽无功。终因积劳成疾,于234年病逝于五丈原(今陕西岐山县南),享年54岁。刘禅追封其为忠武侯,后世常以武侯尊称诸葛亮。东晋政权因其军事才能特追封他为武兴王。诸葛亮散文代表作有《出师表》、《诫子书》等。曾发明木牛流马、孔明灯等,并改造连弩,叫做诸葛连弩,可一弩十矢俱发。诸葛亮一生“鞠躬尽瘁、死而后已”,是中国传统文化中忠臣与智者的代表人物。 141 | 142 |

143 | 144 |
代码生成

145 | 146 | **问题**:你是一名编程开发者,能写一个正则帮忙判断一个字符串是否是合理的手机号码, 用Python实现。 147 | 148 | **解答**: 149 | ```python 150 | import re 151 | 152 | def is_valid_phone(phone): 153 | pattern = r'^1[3-9]\d{9}$' 154 | return bool(re.match(pattern, phone)) 155 | 156 | print(is_valid_phone('13800138000')) # True 157 | print(is_valid_phone('12345678901')) # False 158 | ``` 159 | 160 |

161 | 162 | # 部署和推理 163 | 164 | ## 安装依赖 165 | ```shell 166 | pip install -r requirements.txt 167 | ``` 168 | 169 | + 如需使用flash attention需要安装[flash-attn](https://github.com/Dao-AILab/flash-attention) 170 | + 如需使用LoRA等少参数微调方法需要额外安装[peft](https://github.com/huggingface/peft) 171 | 172 | ## Python 样例 173 | 174 | ```python 175 | >>> from transformers import AutoModelForCausalLM, AutoTokenizer 176 | >>> tokenizer = AutoTokenizer.from_pretrained("deeplang-ai/LingoWhale-8B", trust_remote_code=True) 177 | >>> model = AutoModelForCausalLM.from_pretrained("deeplang-ai/LingoWhale-8B", device_map="auto", trust_remote_code=True) 178 | >>> inputs = tokenizer("陋室铭\n唐 刘禹锡\n", return_tensors="pt") 179 | >>> inputs = inputs.to("cuda:0") 180 | >>> pred = model.generate(**inputs, max_new_tokens=100, repetition_penalty=1.1) 181 | >>> print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)) 182 | ``` 183 | 184 | # 微调方法 185 | 模型微调样例代码以`transformers.Trainer`为基础,其中大部分参数和使用方法都可以参考Huggingface中[`Trainer`](https://huggingface.co/docs/transformers/v4.34.1/en/main_classes/trainer#trainer) 的教程和介绍。 186 | 187 | > 本章节旨在展示微调过程,并不对该微调配置下进行微调后的模型效果进行保证。 188 | 189 | ## 单机训练 190 | 下面是一个单机进行微调的例子,使用的数据为从[COIG](https://huggingface.co/datasets/BAAI/COIG)数据集中随机选取的10000条指令微调数据,可以使用自己的数据进行替换。 191 | 192 | ```shell 193 | hostfile="" 194 | deepspeed --hostfile=$hostfile finetune/finetune.py \ 195 | --report_to "none" \ 196 | --data_path "finetune/data/coig_10k.json" \ 197 | --model_name_or_path deeplang-ai/LingoWhale-8B \ 198 | --output_dir "output" \ 199 | --model_max_length 2048 \ 200 | --num_train_epochs 4 \ 201 | --per_device_train_batch_size 4 \ 202 | --gradient_accumulation_steps 1 \ 203 | --save_strategy epoch \ 204 | --learning_rate 2e-5 \ 205 | --lr_scheduler_type constant \ 206 | --adam_beta1 0.9 \ 207 | --adam_beta2 0.98 \ 208 | --adam_epsilon 1e-8 \ 209 | --max_grad_norm 1.0 \ 210 | --weight_decay 1e-4 \ 211 | --warmup_ratio 0.0 \ 212 | --logging_steps 1 \ 213 | --gradient_checkpointing True \ 214 | --deepspeed finetune/ds_config.json \ 215 | --bf16 True \ 216 | --tf32 True 217 | ``` 218 | 219 | 若要替换为自己的数据,可以使用如下格式的json文件。 220 | ```json 221 | [ 222 | { 223 | "id": 0, 224 | "conversations": [ 225 | { 226 | "from": "human", 227 | "value": "请问什么是“模式年龄”?" 228 | }, 229 | { 230 | "from": "model", 231 | "value": "模式年龄是指利用放射性衰变规律假定地质样品形成时的初始同位素组成计算得到的年龄。" 232 | }, 233 | ... 234 | ] 235 | }, 236 | ... 237 | ] 238 | ``` 239 | 240 | ## 多机训练 241 | 242 | 多机器训练需要编辑如下格式的`hostfile`文件。其中,每一行表示一个机器,`ip_address-X`为各个机器对应的ip地址,`slots`内容表示机器可用GPU数量。内容格式如下: 243 | 244 | ``` 245 | ip_address-1 slots=8 246 | ip_address-2 slots=8 247 | ip_address-3 slots=8 248 | ip_address-4 slots=8 249 | ... 
250 | ``` 251 | 252 | 同时将hostfile参数指定为`hostfile`文件路径,然后运行如下命令即可启动多机训练。 253 | 254 | ```shell 255 | hostfile="/path/to/hostfile" 256 | deepspeed --hostfile=$hostfile finetune/finetune.py \ 257 | --report_to "none" \ 258 | --data_path "finetune/data/coig_10k.json" \ 259 | --model_name_or_path deeplang-ai/LingoWhale-8B \ 260 | --output_dir "output" \ 261 | --model_max_length 2048 \ 262 | --num_train_epochs 4 \ 263 | --per_device_train_batch_size 4 \ 264 | --gradient_accumulation_steps 1 \ 265 | --save_strategy epoch \ 266 | --learning_rate 2e-5 \ 267 | --lr_scheduler_type constant \ 268 | --adam_beta1 0.9 \ 269 | --adam_beta2 0.98 \ 270 | --adam_epsilon 1e-8 \ 271 | --max_grad_norm 1.0 \ 272 | --weight_decay 1e-4 \ 273 | --warmup_ratio 0.0 \ 274 | --logging_steps 1 \ 275 | --gradient_checkpointing True \ 276 | --deepspeed finetune/ds_config.json \ 277 | --bf16 True \ 278 | --tf32 True 279 | ``` 280 | ## 少参数微调 281 | 通过使用[peft](https://github.com/huggingface/peft),可以轻松使用LoRA、Prefix-Tuning等少参数微调方法。目前代码中已集成LoRA的训练方法,可以通过加入`--use_lora True`启用。 282 | 283 | 使用LoRA训练的checkpoint可以通过下面的代码读取和调用: 284 | ```python 285 | from peft import AutoPeftModelForCausalLM 286 | model = AutoPeftModelForCausalLM.from_pretrained("output", trust_remote_code=True) 287 | ``` 288 | 289 | # 开源协议 290 | 社区使用LingoWhale-8B模型需要遵循[Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0)和[《LingoWhale-8B模型许可协议》](MODEL_LICENSE.md)。若您期望将此源模型或其衍生品用作商业用途,请填写[问卷](https://deeplang.feishu.cn/share/base/form/shrcnzt705EDAvVrmAJUCyg7TIb)进行申请,通过后即可**免费商用**。 291 | -------------------------------------------------------------------------------- /README_EN.md: -------------------------------------------------------------------------------- 1 |

2 | English  |  中文 3 |

4 |
5 | 6 |
7 |

8 | LingoWhale-8B 9 |

10 |
11 | 12 |

13 | 🤗 Hugging Face • 🤖 ModelScope • ⛵ Wisemodel 14 |

15 | 16 |
17 | 18 | LingoWhale-8B model open-sourced by DeepLangAI in collaboration with THUNLP Lab 🎉 19 | 20 |
21 | 22 | # Table of Contents 23 | 24 | - [Introduction](#introduction) 25 | - [Evaluation](#evaluation) 26 | - [Generated Examples](#generated-examples) 27 | - [Deployment and Inference](#deployment-and-inference) 28 | - [Fine-tuning](#fine-tuning) 29 | - [Open Source License](#open-source-license) 30 | 31 | # Introduction 32 | 33 | LingoWhale-8B is the first open-source model in the LingoWhale series introduced by DeepLangAI. It's a bilingual (Chinese-English) large language model. 34 | 35 | LingoWhale-8B has been pre-trained on a large volume of high-quality bilingual data and exhibits powerful capabilities as a foundation model. It has achieved leading results on multiple public benchmarks. During its pre-training phase, the model was trained with a context window of 8K, allowing it to comprehend and generate longer sequences. 36 | 37 | LingoWhale-8B is fully open for academic research. Users can apply for commercial use by email, and once granted official commercial permission, they can use it for free. 38 | 39 | Along with open-sourcing the model weights, we also provide a Huggingface inference interface and parameter efficient fine-tuning examples like LoRA, making it easier for developers to use the LingoWhale-8B model. 40 | 41 | Due to the scale of model parameters, intrinsic issues of large language models like hallucination and relatively weak mathematical computation capabilities persist in LingoWhale-8B. Please understand these issues and evaluate the possible risks before using the model. Future versions of the LingoWhale model will focus on optimizing these areas. 42 | 43 | # Evaluation 44 | 45 | We tested on the following public evaluation benchmarks: 46 | 47 | - [C-Eval](https://arxiv.org/abs/2305.08322) is a Chinese foundation model evaluation benchmark consisting of 13,948 multiple-choice questions, covering 52 different subjects and four difficulty levels. It aims to assess the capability of Chinese language models. We used the dev set of this dataset as a few-shot source and conducted a 5-shot test on the test set. 48 | 49 | - [MMLU](https://arxiv.org/abs/2009.03300) is an English foundation model evaluation benchmark that spans various domains like basic mathematics, American history, computer science, law, among others, with a total of 57 tasks. It evaluates language models' performance on different domain tasks. We performed a 5-shot test on this benchmark. 50 | 51 | - [CMMLU](https://arxiv.org/abs/2306.09212) is a Chinese evaluation benchmark that encompasses 67 topics ranging from basic subjects to advanced professional levels. It evaluates Chinese language models' performance in knowledge and reasoning capabilities. We used the dev set of this dataset as a few-shot source and conducted a 5-shot test on the test set. 52 | 53 | - [Gaokao](https://arxiv.org/abs/2305.12474) is an evaluation benchmark based on the dataset of Chinese college entrance examination questions. It aims to provide an assessment of Chinese language models in terms of language comprehension and logical reasoning capabilities. We retained only the four-option multiple-choice questions from it and conducted a 5-shot test after random partitioning. 54 | 55 | - [HumanEval](https://arxiv.org/abs/2107.03374) is an English evaluation benchmark containing over one hundred coding problems. It assesses language models' abilities in code comprehension and generation. We adopted a zero-shot setting and the Pass@1 metric for testing the model. 
56 | 57 | - [GSM8K](https://arxiv.org/abs/2110.14168) is a dataset composed of high-quality, linguistically diverse grade-school math word problems. Solving them requires multi-step arithmetic reasoning with a free-form numeric answer, so the benchmark evaluates the models' capabilities in applied mathematics. We conducted an 8-shot test on this benchmark. 58 | 59 | - [BBH](https://arxiv.org/abs/2210.09261) is an evaluation benchmark formed from a selection of challenging tasks out of the 204 Big-Bench benchmark tasks. We performed a 3-shot test on this benchmark. 60 | 61 | - [AGIEval](https://arxiv.org/abs/2304.06364) is a benchmark that examines foundation models' human-like capabilities, specifically their performance on tasks related to human cognition and problem solving. We retained only the four-option multiple-choice questions from it and conducted a 5-shot test after random partitioning. 62 | 63 | These evaluation benchmarks provide standardized tests and metrics to assess language models' performance and capabilities across various tasks and domains. The evaluation results are shown in the table below: 64 | 65 | | **Model** |**C-Eval**| **MMLU** |**CMMLU** |**GAOKAO**|**HumanEval**|**GSM8K** | **BBH** |**AGIEval**| 66 | |:-----------------------|:--------:|:--------:|:--------:|:--------:|:-----------:|:--------:|:--------:|:---------:| 67 | | | 5-shot | 5-shot | 5-shot | 5-shot | 0-shot | 8-shot | 3-shot | 5-shot | 68 | | **GPT-4** | 68.4 | 83.9 | 70.3 | 66.2 | 69.5 | 90.0 | 75.1 | 63.3 | 69 | | **GPT-3.5 Turbo** | 51.1 | 68.5 | 54.1 | 47.1 | 52.4 | 57.8 | 61.6 | 46.1 | 70 | | **LLaMA2-7B** | 28.9 | 45.7 | 31.4 | 26.0 | 12.8 | 16.2 | 39.2 | 26.5 | 71 | | **ChatGLM2-6B*** | 51.7 | 47.9 | - | - | - | 32.4 | 33.7 | - | 72 | | **Baichuan2-7B-Base*** | 54.0 | 54.2 | 57.1 | 47.5 | 18.3 | 24.5 | 41.6 | 42.7 | 73 | | **Qwen-7B v1.1*** | 63.5 | 58.2 | 62.2 | - | 29.9 | 51.7 | 45.0 | - | 74 | | **LingoWhale-8B-base** | 63.6 | 60.2 | 62.8 | 50.3 | 32.9 | 55.0 | 47.5 | 43.8 | 75 | 76 | 77 | * indicates that the results are taken from the corresponding model's official report, and all results are rounded to 1 decimal place. 78 | 79 | # Generated Examples 80 | 81 | The following examples are generated by a Chat model fine-tuned from the LingoWhale-8B model on open-source SFT data (a short sketch of how such a checkpoint can be queried follows the examples). 82 | 83 |
Mathematical calculations

84 | 85 | **Query**:f(x)为偶函数, f(1) = 3, f(2) = 4, 那么f(1) + f(-2) ^2 等于多少? 请一步一步说明 86 | 87 | **Answer**:首先,根据题目中给出的条件,我们可以知道: 88 | 89 | - f(x) 是一个偶函数,即 f(-x) = f(x)。 90 | - f(1) = 3。 91 | - f(2) = 4。 92 | 93 | 接下来,我们需要求出 f(1) + f(-2)^2 的值。 94 | 由于 f(x) 是偶函数,所以 f(-2) = f(2),因此 f(-2) = 4。 95 | 将 f(1) 和 f(-2) 的值代入 f(1) + f(-2)^2 中,得到: 96 | f(1) + f(-2)^2 = 3 + (4)^2 = 3 + 16 = 19。 97 | 因此,f(1) + f(-2)^2 的值为 19。 98 | 99 |

100 | 101 |
Role playing

102 | 103 | **Query**:作为有丰富营销经验的品牌经理,请为我提供一条关于以下情境的品牌营销策略: 104 | 情境:618期间,提高品牌销售额 105 | 品牌定位:国产精品眉笔 106 | 建议要求:提供具有创新性和实效性的品牌营销策略,帮助提升品牌的销售额, 并加入直播带货的方案。 107 | 108 | **Answer**:在618期间,为了提高品牌销售额,我们可以采取以下品牌营销策略: 109 | 110 | 1. 推出限时优惠活动:在618活动期间,我们可以在品牌官方网站上推出限时优惠活动,例如购买眉笔送赠品、打折促销等,以吸引消费者的注意力并促进销售。 111 | 112 | 2. 利用社交媒体宣传:我们可以通过社交媒体平台(如微博、微信、抖音等)发布品牌宣传内容,包括产品介绍、使用技巧、用户评价等,以增加品牌曝光率和用户互动性。 113 | 114 | 3. 与知名博主合作推广:我们可以邀请一些知名的美妆博主或网红来试用我们的产品,并在其社交媒体上发布推荐视频或图文,以吸引更多的目标受众。 115 | 116 | 4. 直播带货:我们可以邀请一些知名的带货主播或明星来为我们的产品进行直播带货,通过他们的影响力来吸引更多的消费者关注和购买。 117 | 118 | 5. 参与电商平台活动:我们可以积极参与各大电商平台的促销活动,例如满减、满赠等,以吸引更多的消费者购买我们的产品。 119 | 120 | 综上所述,以上品牌营销策略可以帮助我们在618期间提高品牌销售额,同时加入直播带货的方案可以更好地吸引目标受众,提高销售效果。 121 | 122 |

123 | 124 |
Poetry Writing

125 | 126 | **Query**:写一个七言诗,描述岳飞的诗。诗词希望是宋代诗的风格。 127 | 128 | **Answer**:《岳飞》 129 | 130 | 铁马金戈壮士魂, 131 | 132 | 精忠报国志凌云。 133 | 134 | 千古英雄谁可及, 135 | 136 | 岳家军威震乾坤。 137 | 138 |

139 | 140 |
Cultural knowledge

141 | 142 | **Query**:我要去成都武侯祠参观,能帮我详细介绍下武侯的生平事迹吗? 143 | 144 | **Answer**:诸葛亮(181年-234年),字孔明,号卧龙,徐州琅琊阳都(今山东临沂市沂南县)人,三国时期蜀汉丞相,杰出的政治家、军事家、文学家、书法家、发明家。早年随叔父诸葛玄到荆州,诸葛玄死后,诸葛亮就在隆中隐居。刘备三顾茅庐请出诸葛亮,联合东吴孙权于赤壁之战大败曹军,形成三国鼎足之势,又夺占荆益。刘备入蜀后,诸葛亮镇守荆州。刘备称帝后,诸葛亮被任命为丞相,主持朝政。后主刘禅继位,诸葛亮被封为武乡侯,领益州牧。勤勉谨慎,大小政事必亲自处理,赏罚严明;与东吴联盟,改善和西南各族的关系;实行屯田政策,加强战备。前后五次北伐中原,多以粮尽无功。终因积劳成疾,于234年病逝于五丈原(今陕西岐山县南),享年54岁。刘禅追封其为忠武侯,后世常以武侯尊称诸葛亮。东晋政权因其军事才能特追封他为武兴王。诸葛亮散文代表作有《出师表》、《诫子书》等。曾发明木牛流马、孔明灯等,并改造连弩,叫做诸葛连弩,可一弩十矢俱发。诸葛亮一生“鞠躬尽瘁、死而后已”,是中国传统文化中忠臣与智者的代表人物。 145 | 146 |

147 | 148 |
Code generation

149 | 150 | **Query**:你是一名编程开发者,能写一个正则帮忙判断一个字符串是否是合理的手机号码, 用Python实现。 151 | 152 | **Answer**: 153 | ```python 154 | import re 155 | 156 | def is_valid_phone(phone): 157 | pattern = r'^1[3-9]\d{9}$' 158 | return bool(re.match(pattern, phone)) 159 | 160 | print(is_valid_phone('13800138000')) # True 161 | print(is_valid_phone('12345678901')) # False 162 | ``` 163 | 164 |

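The chat examples above come from a checkpoint fine-tuned with the script under `finetune/`, which marks user and assistant turns with the special token ids 3 and 4 (see `SupervisedDataset` in `finetune/finetune.py`). The snippet below is a minimal, unofficial sketch of how such a fine-tuned checkpoint could be queried; the checkpoint path, prompt, and decoding settings are assumptions rather than part of this repository, and the released LingoWhale-8B weights are a base model, so they will not reproduce the replies shown above.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical checkpoint produced by finetune/finetune.py (not a released model).
# For a LoRA run, load it with AutoPeftModelForCausalLM instead (see "Parameter-Efficient Fine-Tuning").
ckpt = "output"
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True,
                                          add_bos_token=False, add_eos_token=False)
model = AutoModelForCausalLM.from_pretrained(ckpt, device_map="auto", trust_remote_code=True)

USER_TOKEN_ID, ASSISTANT_TOKEN_ID = 3, 4  # same turn markers used by SupervisedDataset

query = "写一个七言诗,描述岳飞的诗。"
input_ids = [USER_TOKEN_ID] + tokenizer.encode(query) + [ASSISTANT_TOKEN_ID]
input_ids = torch.tensor([input_ids], device=model.device)

output = model.generate(input_ids, max_new_tokens=256, repetition_penalty=1.1)
# Decode only the newly generated assistant turn.
print(tokenizer.decode(output[0, input_ids.shape[1]:], skip_special_tokens=True))
```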
165 | 166 | # Deployment and Inference 167 | 168 | ## Install Dependencies 169 | ```shell 170 | pip install -r requirements.txt 171 | ``` 172 | 173 | + To use flash attention, you need to install [flash-attn](https://github.com/Dao-AILab/flash-attention) 174 | 175 | + To use LoRA or other parameter-efficient fine-tuning methods, please install [peft](https://github.com/huggingface/peft) 176 | 177 | ## Python Example 178 | 179 | ```python 180 | >>> from transformers import AutoModelForCausalLM, AutoTokenizer 181 | >>> tokenizer = AutoTokenizer.from_pretrained("deeplang-ai/LingoWhale-8B", trust_remote_code=True) 182 | >>> model = AutoModelForCausalLM.from_pretrained("deeplang-ai/LingoWhale-8B", device_map="auto", trust_remote_code=True) 183 | >>> inputs = tokenizer("陋室铭\n唐 刘禹锡\n", return_tensors="pt") 184 | >>> inputs = inputs.to("cuda:0") 185 | >>> pred = model.generate(**inputs, max_new_tokens=100, repetition_penalty=1.1) 186 | >>> print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)) 187 | ``` 188 | 189 | # Fine-tuning 190 | The fine-tuning example is based on `transformers.Trainer`. For a more detailed guide on the arguments and their usage, please refer to the Hugging Face [`Trainer`](https://huggingface.co/docs/transformers/v4.34.1/en/main_classes/trainer#trainer) tutorial. 191 | 192 | The aim of this section is to showcase the fine-tuning process. No guarantees are made about the model performance under this fine-tuning configuration. 193 | 194 | ## Single-Machine Training 195 | Below is an example of fine-tuning on a single machine. The training data consists of 10,000 instruction fine-tuning examples randomly sampled from the [COIG](https://huggingface.co/datasets/BAAI/COIG) dataset; you can replace it with your own data. 196 | 197 | ```shell 198 | hostfile="" 199 | deepspeed --hostfile=$hostfile finetune/finetune.py \ 200 | --report_to "none" \ 201 | --data_path "finetune/data/coig_10k.json" \ 202 | --model_name_or_path deeplang-ai/LingoWhale-8B \ 203 | --output_dir "output" \ 204 | --model_max_length 2048 \ 205 | --num_train_epochs 4 \ 206 | --per_device_train_batch_size 16 \ 207 | --gradient_accumulation_steps 1 \ 208 | --save_strategy epoch \ 209 | --learning_rate 2e-5 \ 210 | --lr_scheduler_type constant \ 211 | --adam_beta1 0.9 \ 212 | --adam_beta2 0.98 \ 213 | --adam_epsilon 1e-8 \ 214 | --max_grad_norm 1.0 \ 215 | --weight_decay 1e-4 \ 216 | --warmup_ratio 0.0 \ 217 | --logging_steps 1 \ 218 | --gradient_checkpointing True \ 219 | --deepspeed finetune/ds_config.json \ 220 | --bf16 True \ 221 | --tf32 True 222 | ``` 223 | 224 | To use your own data, please convert it to the JSON format below. 225 | ```json 226 | [ 227 | { 228 | "id": 0, 229 | "conversations": [ 230 | { 231 | "from": "human", 232 | "value": "请问什么是“模式年龄”?" 233 | }, 234 | { 235 | "from": "model", 236 | "value": "模式年龄是指利用放射性衰变规律假定地质样品形成时的初始同位素组成计算得到的年龄。" 237 | }, 238 | ... 239 | ] 240 | }, 241 | ... 242 | ] 243 | ``` 244 | 245 | ## Multi-Machine Training 246 | 247 | For multi-machine training, you need to create a `hostfile` in the following format. Each line represents a machine, `ip_address-X` is the IP address of that machine, and `slots` is the number of GPUs available on it. The content format is as follows: 248 | 249 | ``` 250 | ip_address-1 slots=8 251 | ip_address-2 slots=8 252 | ip_address-3 slots=8 253 | ip_address-4 slots=8 254 | ... 255 | ``` 256 | 257 | Next, set the `hostfile` argument to the path of this `hostfile` and run the following command to start multi-machine training.
258 | 259 | ```shell 260 | hostfile="/path/to/hostfile" 261 | deepspeed --hostfile=$hostfile finetune/finetune.py \ 262 | --report_to "none" \ 263 | --data_path "finetune/data/coig_10k.json" \ 264 | --model_name_or_path deeplang-ai/LingoWhale-8B \ 265 | --output_dir "output" \ 266 | --model_max_length 2048 \ 267 | --num_train_epochs 4 \ 268 | --per_device_train_batch_size 16 \ 269 | --gradient_accumulation_steps 1 \ 270 | --save_strategy epoch \ 271 | --learning_rate 2e-5 \ 272 | --lr_scheduler_type constant \ 273 | --adam_beta1 0.9 \ 274 | --adam_beta2 0.98 \ 275 | --adam_epsilon 1e-8 \ 276 | --max_grad_norm 1.0 \ 277 | --weight_decay 1e-4 \ 278 | --warmup_ratio 0.0 \ 279 | --logging_steps 1 \ 280 | --gradient_checkpointing True \ 281 | --deepspeed finetune/ds_config.json \ 282 | --bf16 True \ 283 | --tf32 True 284 | ``` 285 | 286 | ## Parameter-Efficient Fine-Tuning 287 | By using [peft](https://github.com/huggingface/peft), you can easily apply parameter-efficient fine-tuning methods like LoRA, Prefix-Tuning, etc. The training method for LoRA is currently integrated into the code, which can be activated by adding `--use_lora True`. 288 | 289 | LoRA checkpoints can be loaded using the following code: 290 | ```python 291 | from peft import AutoPeftModelForCausalLM 292 | model = AutoPeftModelForCausalLM.from_pretrained("output", trust_remote_code=True) 293 | ``` 294 | 295 | # Open Source License 296 | The community use of the LingoWhale-8B model must adhere to the [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0) and the [LingoWhale-8B Model License Agreement](MODEL_LICENSE.md). If you wish to use this source model or its derivatives for commercial purposes, please complete the [questionnaire](https://deeplang.feishu.cn/share/base/form/shrcnzt705EDAvVrmAJUCyg7TIb) to apply for free commercial use. 
297 | 298 | -------------------------------------------------------------------------------- /finetune/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_accumulation_steps": "auto", 5 | "gradient_clipping": 1.0, 6 | "bf16": { 7 | "enabled": "auto" 8 | }, 9 | "zero_optimization": { 10 | "stage": 3, 11 | "overlap_comm": true, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "flops_profiler": { 15 | "enabled": false, 16 | "profile_step": 1, 17 | "module_depth": -1, 18 | "top_modules": 1, 19 | "detailed": true, 20 | "output_file": null 21 | } 22 | } -------------------------------------------------------------------------------- /finetune/finetune.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional, Dict 3 | from dataclasses import dataclass, field 4 | import json 5 | 6 | import torch 7 | from torch.utils.data import Dataset 8 | import transformers 9 | from transformers.training_args import TrainingArguments 10 | 11 | 12 | @dataclass 13 | class ModelArguments: 14 | model_name_or_path: Optional[str] = field( 15 | default="deeplang-ai/LingoWhale-8B") 16 | 17 | 18 | @dataclass 19 | class DataArguments: 20 | data_path: str = field(default=None, 21 | metadata={"help": "Path to the training data."}) 22 | 23 | 24 | @dataclass 25 | class TrainingArguments(transformers.TrainingArguments): 26 | cache_dir: Optional[str] = field( 27 | default=None, metadata={"help": "cache dir for huggingface model."}) 28 | optim: str = field(default="adamw_torch") 29 | model_max_length: int = field( 30 | default=512, 31 | metadata={ 32 | "help": 33 | "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 34 | }, 35 | ) 36 | use_lora: bool = field( 37 | default=False, 38 | metadata={ 39 | "help": "Whether to use lora instead of full parameter finetuning." 
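            # Note: when --use_lora True is passed, train() below wraps the model with a peft
            # LoRA adapter (LoraConfig targeting "qkv_proj") instead of updating all parameters.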
40 | }) 41 | 42 | 43 | class SupervisedDataset(Dataset): 44 | """Dataset for supervised fine-tuning.""" 45 | 46 | def __init__( 47 | self, 48 | data_path, 49 | tokenizer, 50 | model_max_length, 51 | user_tokens=[3], 52 | assistant_tokens=[4], 53 | ): 54 | super(SupervisedDataset, self).__init__() 55 | self.data = json.load(open(data_path)) 56 | self.tokenizer = tokenizer 57 | self.model_max_length = model_max_length 58 | self.user_tokens = user_tokens 59 | self.assistant_tokens = assistant_tokens 60 | self.ignore_index = -100 61 | 62 | def __len__(self): 63 | return len(self.data) 64 | 65 | def preprocessing(self, example): 66 | input_ids = [] 67 | labels = [] 68 | 69 | for message in example["conversations"]: 70 | from_ = message["from"] 71 | value = message["value"] 72 | value_ids = self.tokenizer.encode(value) 73 | 74 | if from_ == "human": 75 | input_ids += self.user_tokens + value_ids 76 | labels += [self.tokenizer.eos_token_id 77 | ] + [self.ignore_index] * len(value_ids) 78 | else: 79 | input_ids += self.assistant_tokens + value_ids 80 | labels += [self.ignore_index] + value_ids 81 | input_ids.append(self.tokenizer.eos_token_id) 82 | labels.append(self.tokenizer.eos_token_id) 83 | input_ids = input_ids[:self.model_max_length] 84 | labels = labels[:self.model_max_length] 85 | input_ids += [self.tokenizer.pad_token_id 86 | ] * (self.model_max_length - len(input_ids)) 87 | labels += [self.ignore_index] * (self.model_max_length - len(labels)) 88 | input_ids = torch.LongTensor(input_ids) 89 | labels = torch.LongTensor(labels) 90 | attention_mask = input_ids.ne(self.tokenizer.pad_token_id) 91 | return { 92 | "input_ids": input_ids, 93 | "labels": labels, 94 | "attention_mask": attention_mask, 95 | } 96 | 97 | def __getitem__(self, idx) -> Dict[str, torch.Tensor]: 98 | return self.preprocessing(self.data[idx]) 99 | 100 | 101 | def train(): 102 | parser = transformers.HfArgumentParser( 103 | (ModelArguments, DataArguments, TrainingArguments)) 104 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 105 | 106 | config = transformers.AutoConfig.from_pretrained( 107 | model_args.model_name_or_path, 108 | trust_remote_code=True, 109 | cache_dir=training_args.cache_dir, 110 | ) 111 | if training_args.gradient_checkpointing: 112 | config.use_cache = False 113 | 114 | model = transformers.AutoModelForCausalLM.from_pretrained( 115 | model_args.model_name_or_path, 116 | trust_remote_code=True, 117 | cache_dir=training_args.cache_dir, 118 | config=config) 119 | tokenizer = transformers.AutoTokenizer.from_pretrained( 120 | model_args.model_name_or_path, 121 | use_fast=False, 122 | trust_remote_code=True, 123 | model_max_length=training_args.model_max_length, 124 | cache_dir=training_args.cache_dir, 125 | add_bos_token=False, 126 | add_eos_token=False, 127 | ) 128 | if training_args.use_lora: 129 | from peft import LoraConfig, TaskType, get_peft_model 130 | 131 | peft_config = LoraConfig( 132 | task_type=TaskType.CAUSAL_LM, 133 | target_modules=["qkv_proj"], 134 | inference_mode=False, 135 | r=8, 136 | lora_alpha=16, 137 | lora_dropout=0.1, 138 | ) 139 | model.enable_input_require_grads() 140 | model = get_peft_model(model, peft_config) 141 | model.print_trainable_parameters() 142 | 143 | dataset = SupervisedDataset(data_args.data_path, tokenizer, 144 | training_args.model_max_length) 145 | trainer = transformers.Trainer(model=model, 146 | args=training_args, 147 | train_dataset=dataset, 148 | tokenizer=tokenizer) 149 | trainer.train() 150 | trainer.save_state() 151 | 
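    # Persist the final weights and tokenizer to output_dir; with --use_lora this stores only the
    # LoRA adapter, which can later be loaded via peft's AutoPeftModelForCausalLM (see README).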
trainer.save_model(output_dir=training_args.output_dir) 152 | 153 | 154 | if __name__ == "__main__": 155 | train() 156 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.0 2 | transformers==4.31.0 3 | accelerate==0.23.0 4 | einops 5 | sentencepiece 6 | tokenizers 7 | deepspeed 8 | numpy --------------------------------------------------------------------------------