├── README.md ├── content ├── Qwen-blog │ ├── img │ │ ├── GQA.png │ │ ├── GQA2.png │ │ ├── MLP1.png │ │ ├── Mask1.png │ │ ├── Mask2.png │ │ ├── Mask3.png │ │ ├── Qwen2Attention.png │ │ ├── RMSNorm_formulation.jpg │ │ ├── ROPE1.png │ │ ├── ROPE2.png │ │ ├── ROPE3.png │ │ ├── ROPE4.png │ │ ├── ROPE5.png │ │ ├── ROPE6.png │ │ ├── ROPE7.png │ │ ├── ROPE8.png │ │ ├── decoderlayer.png │ │ └── framework.JPG │ └── readme.md ├── TinyAgent │ ├── agent_demo.ipynb │ ├── images │ │ ├── Agent.png │ │ └── React.png │ ├── readme.md │ ├── requirements.txt │ └── tinyAgent │ │ ├── Agent.py │ │ ├── LLM.py │ │ ├── __pycache__ │ │ ├── Agent.cpython-310.pyc │ │ ├── LLM.cpython-310.pyc │ │ └── tool.cpython-310.pyc │ │ └── tool.py ├── TinyDiffusion │ ├── datasets │ │ └── .keep │ ├── ddpm │ │ ├── dataloader.py │ │ ├── diffusion.py │ │ ├── metrics.py │ │ ├── requirements.txt │ │ ├── sample.py │ │ ├── train.py │ │ └── unet.py │ ├── fig │ │ ├── attn-200.png │ │ ├── attn.png │ │ ├── ddpm.png │ │ ├── fig1.png │ │ ├── fig2.png │ │ ├── fig3.png │ │ ├── unet-200.png │ │ └── unet.png │ └── readme.md ├── TinyEval │ ├── Eval │ │ ├── __pycache__ │ │ │ └── metrics.cpython-39.pyc │ │ ├── config │ │ │ ├── adapter2path.json │ │ │ ├── dataset2maxlen.json │ │ │ ├── dataset2prompt.json │ │ │ ├── model2maxlen.json │ │ │ └── model2path.json │ │ ├── dataset │ │ │ ├── GAOKAO-new1-math.json │ │ │ ├── GAOKAO_math.jsonl │ │ │ ├── custom_zh.jsonl │ │ │ ├── multi_news.jsonl │ │ │ ├── multifieldqa_zh.jsonl │ │ │ └── trec.jsonl │ │ ├── docs │ │ │ └── compass.png │ │ ├── metrics.py │ │ ├── model │ │ │ ├── LLM.py │ │ │ └── __pycache__ │ │ │ │ ├── LLM.cpython-310.pyc │ │ │ │ └── LLM.cpython-39.pyc │ │ └── pred │ │ │ ├── Qwen2 │ │ │ ├── GAOKAO_math.jsonl │ │ │ └── result.json │ │ │ ├── internlm │ │ │ ├── multi_news.jsonl │ │ │ ├── multifieldqa_zh.jsonl │ │ │ ├── result.json │ │ │ └── trec.jsonl │ │ │ └── internlm2 │ │ │ └── multifieldqa_zh.jsonl │ ├── eval.py │ ├── gaokao.ipynb │ ├── inference.py │ ├── pred │ │ └── results.txt │ ├── readme.md │ └── requirements.txt ├── TinyIMGRAG │ ├── IMGRAG │ │ ├── ImgEvaluator.py │ │ ├── ImgGenerator.py │ │ ├── ImgRetrieval.py │ │ ├── RewritePrompt.py │ │ └── __pycache__ │ │ │ ├── ImgEvaluator.cpython-310.pyc │ │ │ ├── ImgGenerator.cpython-310.pyc │ │ │ ├── ImgRetrieval.cpython-310.pyc │ │ │ └── RewritePrompt.cpython-310.pyc │ ├── README.md │ ├── datasets │ │ ├── imgs │ │ │ ├── apple.jpg │ │ │ ├── bamboo.jpg │ │ │ ├── brown_bear.jpg │ │ │ ├── classroom.jpg │ │ │ ├── cradle.jpg │ │ │ ├── dog.jpg │ │ │ ├── oil_painting.png │ │ │ ├── panda.jpg │ │ │ ├── shark.jpg │ │ │ ├── sketch.jpg │ │ │ ├── sunflower.jpg │ │ │ ├── teacher.jpg │ │ │ └── wash_painting.jpg │ │ └── templates │ │ │ ├── enhanced_image.png │ │ │ ├── imgrag.png │ │ │ └── initial_image.png │ ├── download_model.py │ ├── main.py │ ├── model │ │ └── __init__.py │ ├── packages │ │ └── CLIP-main │ │ │ ├── .github │ │ │ └── workflows │ │ │ │ └── test.yml │ │ │ ├── .gitignore │ │ │ ├── 0.26.0 │ │ │ ├── CLIP.png │ │ │ ├── LICENSE │ │ │ ├── MANIFEST.in │ │ │ ├── README.md │ │ │ ├── clip │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── clip.py │ │ │ ├── model.py │ │ │ └── simple_tokenizer.py │ │ │ ├── data │ │ │ ├── country211.md │ │ │ ├── prompts.md │ │ │ ├── rendered-sst2.md │ │ │ └── yfcc100m.md │ │ │ ├── hubconf.py │ │ │ ├── model-card.md │ │ │ ├── notebooks │ │ │ ├── Interacting_with_CLIP.ipynb │ │ │ └── Prompt_Engineering_for_ImageNet.ipynb │ │ │ ├── requirements.txt │ │ │ ├── setup.py │ │ │ └── tests │ │ │ └── test_consistency.py │ └── 
requirements.txt ├── TinyLLM │ ├── README.md │ ├── code │ │ ├── model.py │ │ ├── preprocess.py │ │ ├── requirements.txt │ │ ├── sample.py │ │ ├── tok4096.model │ │ ├── tokenizer.py │ │ ├── train.py │ │ └── train_vocab.py │ └── images │ │ ├── model_show.png │ │ └── nvidia.png ├── TinyRAG │ ├── .env.example │ ├── RAG │ │ ├── Embeddings.py │ │ ├── LLM.py │ │ ├── VectorBase.py │ │ ├── __pycache__ │ │ │ ├── Embeddings.cpython-310.pyc │ │ │ ├── LLM.cpython-310.pyc │ │ │ ├── VectorBase.cpython-310.pyc │ │ │ └── utils.cpython-310.pyc │ │ └── utils.py │ ├── down_model.py │ ├── images │ │ ├── RAG.png │ │ └── Retrieval-Augmented Generation(RAG-Learning).png │ ├── readme.md │ ├── requirements.txt │ └── test.ipynb └── TinyTransformer │ ├── figures │ ├── position_encoding.png │ ├── transformer_Multi-Head attention.png │ ├── transformer_Multi-Head attention_compute.png │ ├── transformer_Multi-Head visual.jpg │ ├── transformer_architecture.png │ └── transformer_datalink.png │ ├── readme.md │ ├── requirements.txt │ ├── test.ipynb │ └── tiny_transformer.py └── images ├── model_show.png ├── qrcode.jpeg ├── tiny-universe-head.png ├── tiny-universe-head2.png └── tiny-universe2.jpg /README.md: -------------------------------------------------------------------------------- 1 |
# 大模型白盒子构建指南
5 | 6 |   本项目是一个从原理出发、以“白盒”为导向、围绕大模型全链路的“手搓”大模型指南,旨在帮助有传统深度学习基础的读者从底层原理出发,“纯手搓”搭建一个清晰、可用的大模型系统,包括大模型本身、RAG 框架、Agent 系统及大模型评估体系。本项目将从基础原理出发,深入剖析每一个技术点并附以完整的代码实现,以细致讲解和代码注释帮助读者独立复现大模型核心部分,并在复现中实现对大模型的深入理解与掌握。 7 | 8 |   本项目旨在为广大学习者搭建一个清晰的、可用的、可复现的大模型世界,帮助每一位有兴趣的学习者纯手工独立搭建自己的 **Tiny LLM Universe**。 9 | 10 |   本项目的主要内容包括: 11 | 1. 手写图像生成模型--Tiny Diffusion 12 | 2. 深入剖析大模型原理——Qwen Blog 13 | 3. 逐步预训练一个手搓大模型——Tiny Llama3 14 | 4. 如何评估你的大模型——Tiny Eval 15 | 5. 纯手工搭建 RAG 框架——Tiny RAG 16 | 6. 手搓一个最小的 Agent 系统——Tiny Agent 17 | 7. 深入理解大模型基础——Tiny Transformer 18 | 19 | ## 项目意义 20 | 21 |   随着百模大战的经久不衰,开源或是闭源的大模型正不断刷新着模型能力上限,逼近 AGI 的宏伟未来。随着大模型能力的不断增强,基于大模型进行二次微调、应用开发的门槛也不断降低,大模型正在不断深入各行各业,为生产生活赋予智能力量。飞速成熟的大模型生态正不断带来更多的开源或闭源框架、API,层出不穷的各式教程让更多的开发者可以快速、便捷地实现大模型的应用。但生态愈是成熟,深入理解框架之内的细节,实现独立于框架的开发、应用能力愈是关键。只有从核心原理出发,尽可能地脱离框架,实现大模型系统的“纯手搓”,才能真正理解模型的核心能力、关键部分,也才能够对框架实现自由的修改应用,随心所欲地将大模型应用到各行各业各类任务。 22 | 23 |   目前,大部分教程目标在于指导开发者如何基于高度封装的 API、开源框架实现便捷、快速的开发和训练,有利于初学者入门,却忽视了掌握模型原理、框架内部细节的重要性。不管是大模型本身,还是基于大模型的赋能系统 RAG 或者是 Agent,又或者是开发应用大模型的必备组件评估体系,都有丰富的基于工具包的使用教程,使很多学习者“知其然而不知其所以然”,只能机械地使用工具包而无法从原理出发进行自由的魔改。本项目旨在抛弃高度封装的工具包与 API,从底层(Pytorch 层)出发,“纯手搓”一个大模型系统,完成大模型的 RAG 、 Agent 、Eval 任务,帮助具备一定的大模型基础的学习者进一步掌握大模型原理,拥抱更自由、更丰富也更精彩的大模型世界。 24 | 25 | ## 项目受众 26 | 27 |  本项目适合以下学习者: 28 | 29 | - 掌握了大模型的基本应用,想要学习原理,但不知从何开始 30 | - 好奇心旺盛、求知欲强的同学,具备一定的学习热情的同学 31 | - 对大模型的RAG、Agent、Eval任务感兴趣,并想要从原理层面了解 32 | - 喜欢动手实践写代码,想要从零开始手写大模型的RAG、Agent、Eval任务 33 | - 想要了解大模型的底层原理,提升自己的大模型技术水平 34 | 35 |   ***我们希望本项目能为广大学习者提供一个可以看得见、够得着、跟得上的大模型实践机会。让大家在实践中学习,不断提升自己的技术水平。*** 36 | 37 |   ***我们希望为更多学习者打开 LLM 的神奇世界,以“知其然更知其所以然”的心态,搭建属于自己的“Tiny LLM Universe”。*** 38 | 39 | ## 项目亮点 40 | 41 | - 本项目旨在全流程 **从零手搓** 42 | - 本项目包含LLM全流程,从Model,到RAG,Agent,Eval,打造LLM **全栈教程** 43 | - 区别于大型的算法包,我们的项目代码对初级开发者更 **简洁清晰** ,更"白盒子" 44 | - 后续会持续迭代更新项目,如动手训练一个 **Tiny-llama3** ,动手制作 **垂直领域数据集** 等等。 45 | - 欢迎大家参与贡献哦,一起打造一个更好的LLM生态! 46 | 47 | ## *News* 48 | 49 | - ***2024.12.25:TinyDiffusion,从零上手Diffusion震撼发布!两小时完成图像生成预训练!*** 50 | 51 | - ***2024.10.28:TinyLlama3,从零上手Llama预训练到加载模型推理,2G显存即可完成!*** 52 | 53 | - ***2024.6.26:增加选修内容——高考数学评测,使TinyEval接入高考题目评测!!*** 54 | 55 | - ***2024.5.20:增加选修内容——Tiny Transformer,纯手工搭建 Transformer 模型*** 56 | 57 | - ***2024.5.1:发布 Tiny-Universe V1 版本,劳动节帕鲁献礼!*** 58 | 59 | ## 项目结构 60 | 61 | ### [*TinyDiffusion*](./content/TinyDiffusion/) 62 | 63 |   Diffusion模型作为当下最流行的图像生成模型,其图像生成效果优秀、训练过程稳定,已经成为了图像生成领域的主流方案。然而,对于初学者来说,Diffusion模型的公式原理过于复杂,对于如何从公式出发实现对应的代码也很困惑。 64 | 65 |   ***本项目手工搭建了一个最简化版本的DDPM模型,从论文中的公式出发,对应到具体的训练与采样过程代码实现,旨在帮助学习者更好地理解Diffusion模型的原理,熟悉训练、推理、评估的整套流程。*** 66 | 67 |
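  为了更直观地体会"从公式到代码"这件事,这里先给出前向加噪过程的一个极简示意(非本项目源码,张量形状与变量名均为示意假设,完整实现请见 `ddpm/diffusion.py` 与 `ddpm/train.py`):

```python
import torch

# 线性 beta 调度:beta_t 从 1e-4 线性增长到 0.02,共 T 步(与 DDPM 论文设置一致)
T = 1000
betas = torch.linspace(1e-4, 0.02, T)
alphas = 1.0 - betas
alpha_bar = torch.cumprod(alphas, dim=0)          # ᾱ_t = ∏(1 - β_i)

def add_noise(x0: torch.Tensor, t: torch.Tensor):
    """前向扩散:x_t = sqrt(ᾱ_t)·x_0 + sqrt(1-ᾱ_t)·ε,其中 ε ~ N(0, I)"""
    noise = torch.randn_like(x0)
    sqrt_ab = alpha_bar[t].sqrt().view(-1, 1, 1, 1)
    sqrt_one_minus_ab = (1.0 - alpha_bar[t]).sqrt().view(-1, 1, 1, 1)
    return sqrt_ab * x0 + sqrt_one_minus_ab * noise, noise

# 假设一批 32x32 的图像,随机采样时间步后加噪
x0 = torch.randn(4, 3, 32, 32)
t = torch.randint(0, T, (4,))
x_t, eps = add_noise(x0, t)
```

  训练时只需让 UNet 去预测这里采样到的 `noise`,再与真实噪声做 MSE 损失即可,这正是正文所说的"从论文公式对应到训练代码"。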
68 | 69 |
70 | 71 | ### [*Qwen-Blog*](./content/Qwen-blog/) ([对应讲解视频](https://meeting.tencent.com/v2/cloud-record/share?id=0be29bb2-0648-4aeb-9baa-c9dc91dfc7a6&from=3&is-single=false&record_type=2)) 72 | 73 |   初学者在学习LLM时,往往对庞大的代码与各种封装的功能"谈码色变"~ 74 | 75 |   但其实,LLM的网络架构并没有想象的那么复杂! 本项目以Qwen2为例,带大家深入LLM模型结构内部,以输入tensor为第一视角,带大家经过Model的各个操作块,点亮LLM的"黑匣子"。 76 | 77 |   ***项目内除了Model内容外,也添加了嵌入模型内部的GQA,ROPE与Attention Mask等机制的细致讲解,促进大家对LLM的全面理解!*** 78 | 79 |
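  以其中的 GQA(Grouped Query Attention)为例,它的核心只是"多个 Query 头共享同一组 Key/Value 头"。下面是把 KV 头复制扩展到与 Q 头数对齐的一个极简示意(仅为帮助理解的假设示例,并非 Qwen2 源码,具体请以博客正文及 transformers 中的 `repeat_kv` 实现为准):

```python
import torch

def repeat_kv(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
    """把 (batch, num_kv_heads, seq_len, head_dim) 的 K/V 沿头维度复制 n_rep 份,
    得到 (batch, num_kv_heads * n_rep, seq_len, head_dim),与 Q 头数对齐。"""
    batch, num_kv_heads, seq_len, head_dim = hidden.shape
    if n_rep == 1:
        return hidden
    hidden = hidden[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, seq_len, head_dim)
    return hidden.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)

# 假设 16 个 Q 头、4 个 KV 头,则每 4 个 Q 头共享 1 组 KV
k = torch.randn(1, 4, 10, 64)            # (batch, kv_heads, seq, head_dim)
k_expanded = repeat_kv(k, n_rep=16 // 4)
print(k_expanded.shape)                   # torch.Size([1, 16, 10, 64])
```

  这样 4 个 KV 头即可服务 16 个 Q 头,KV 缓存随之减小,这也是 GQA 的主要动机之一。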
80 | 81 |
82 | 83 | 84 | ### [*TinyRAG*](./content/TinyRAG/)([对应讲解视频](https://meeting.tencent.com/v2/cloud-record/share?id=4306b90c-d772-4faa-baeb-1f4e0bf4569f&from=3&is-single=false&record_type=2)) 85 | 86 |   LLM会产生误导性的 “幻觉”,依赖的信息可能过时,处理特定知识时效率不高,缺乏专业领域的深度洞察,同时在推理能力上也有所欠缺。 87 | 88 |   正是在这样的背景下,检索增强生成技术(Retrieval-Augmented Generation,RAG)应时而生,成为 AI 时代的一大趋势。 89 | 90 |   RAG 通过在语言模型生成答案之前,先从广泛的文档数据库中检索相关信息,然后利用这些信息来引导生成过程,极大地提升了内容的准确性和相关性。RAG 有效地缓解了幻觉问题,提高了知识更新的速度,并增强了内容生成的可追溯性,使得大型语言模型在实际应用中变得更加实用和可信。 91 | 92 |   ***RAG 已经成为 LLM 应用的重要组成部分,但其他RAG项目都基于封装框架提供完整服务,虽然易于使用,却隐藏了 RAG 的底层原理,也难以随心所欲地魔改升级。本项目抛弃高度封装的 RAG 框架,手搓一个从零开始的RAG项目,帮助学习者更好地理解RAG的原理。*** 93 | 94 | > *镜像地址:https://www.codewithgpu.com/i/datawhalechina/tiny-universe/tiny-universe-tiny-rag* 95 | 96 |
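  为了直观展示"先检索、再生成"这一流程,这里给出一个极简示意(其中的 `embed`、`llm_chat` 均为假设的占位函数,实际实现请参考 `RAG/Embeddings.py`、`RAG/VectorBase.py` 与 `RAG/LLM.py`):

```python
import numpy as np

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """余弦相似度,用于衡量问题向量与文档片段向量的相关性"""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

def retrieve(query_vec: np.ndarray, doc_vecs: list, docs: list, k: int = 2) -> list:
    """按相似度从高到低排序,取出最相关的 k 个文档片段"""
    scores = [cosine_sim(query_vec, v) for v in doc_vecs]
    topk = sorted(range(len(docs)), key=lambda i: scores[i], reverse=True)[:k]
    return [docs[i] for i in topk]

# 整体流程示意(embed / llm_chat 为假设的占位):
# context = retrieve(embed(question), [embed(d) for d in docs], docs)
# prompt = f"请基于以下参考资料回答问题:\n{context}\n问题:{question}"
# answer = llm_chat(prompt)
```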
97 | 98 |
99 | 100 | ### [*TinyAgent*](./content/TinyAgent/)(暂无录播,Datawhale视频号搜索“动手搭建一个最小Agent系统”) 101 | 102 |   大模型具有出人意料的强大能力,却也有其固定缺陷,在逻辑推理、现实事件、高度垂直领域等方面仍然存在薄弱之处。因此,通过针对性的工具来为大模型赋能,给大模型一个抓手,让大模型和现实世界发生的事情对齐颗粒度,从而打造一个更智能、更专业、更全面的大模型应用,是大模型未来的重要发展方向。Agent 就基于这样的理念,将 LLM 打造为能自主理解、规划决策、执行复杂任务的智能体,给予其丰富的专业工具,从而真正让大模型走入现实生活,为未来赋能。 103 | 104 |   ***本项目基于 React 的方式,手动制作了一个最小的 Agent 结构(其实更多的是调用工具),通过一步一步手写`Agent`,让读者对`Agent`的构成和运作更加的了解,也让后续自由地搭建个性化的 Agent 系统成为可能。暑假的时候我们会尝试将 React 结构修改为 SOP 结构,欢迎大家一起加入进来啊~!*** 105 | 106 |
107 | 108 |
109 | 110 | 111 | ### [*TinyEval*](./content/TinyEval)([对应讲解视频](https://meeting.tencent.com/v2/cloud-record/share?id=8b9cf6ca-add6-477b-affe-5b62e2d8f27e&from=3)) 112 | 113 |   随着LLM的推广,越来越多的小伙伴已经熟悉了模型sft微调流程,但是对于微调的结果,尤其是如何判断各大模型在当前数据集上的表现,仍然是一个待解决的问题。并且,对于选择式、判别式、生成式等不同的生成任务,如何才能够客观地评价模型生成质量,仍是一个需要明确的问题。 114 | 115 |   ***基于上述问题,我们搭建了一个完善的评测体系介绍,让大家能够学会根据自身的任务量身定做合适的评测指标,并使用该指标进行客观评测,为模型能力提供准确可量化的数据支持!*** 116 | 117 |
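  以生成式中文问答为例,一个常用的客观指标是字符级 F1:把预测与参考答案拆成字符,统计重合度。下面是一个仅用于说明思路的极简示意(实际使用的各项指标请以 `Eval/metrics.py` 中的实现为准):

```python
from collections import Counter

def char_f1(prediction: str, reference: str) -> float:
    """字符级 F1:precision = 重合字符数 / 预测长度,recall = 重合字符数 / 参考长度"""
    pred_chars, ref_chars = list(prediction), list(reference)
    common = Counter(pred_chars) & Counter(ref_chars)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_chars)
    recall = num_same / len(ref_chars)
    return 2 * precision * recall / (precision + recall)

print(char_f1("北京是中国的首都", "中国的首都是北京"))  # 1.0,字符完全重合
print(char_f1("上海", "北京"))                          # 0.0,没有重合字符
```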
118 | 119 |
120 | 121 | ### [*TinyLLM*](./content/TinyLLM) 122 | 123 |   此项目在于实现一个简单的大语言模型,从训练tokenizer开始,到训练模型,再到使用模型生成文本。仅使用Numpy和Pytorch即可实现一个简单的大语言模型训练,显存使用2G左右。以下为项目效果展示。 124 | 125 |   训练模型所需要的资源也是很少的,仅需要一个显卡即可,显存使用2G左右。训练模型的时间也不长,仅需要几个小时即可完成。 126 | 127 |
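  其中"使用模型生成文本"这一步,本质上就是逐 token 的自回归采样循环。下面给出一个极简示意(`model` 为假设的占位模型对象,完整实现请见 `code/sample.py` 与 `code/model.py`):

```python
import torch

@torch.no_grad()
def generate(model, idx: torch.Tensor, max_new_tokens: int, temperature: float = 1.0):
    """idx: (batch, seq) 的 token id;每步取最后一个位置的 logits,采样下一个 token 并拼接"""
    for _ in range(max_new_tokens):
        logits = model(idx)                       # 假设输出形状为 (batch, seq, vocab)
        logits = logits[:, -1, :] / temperature   # 只取最后一个位置
        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_id], dim=1)    # 拼回序列,继续生成下一个 token
    return idx
```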
(图:TinyLLM 项目效果展示)
130 | 131 | ### [*TinyTransformer*](./content/TinyTransformer) 132 | 133 |   目前,所有 LLM 几乎都以 Transformer 提出的 Attention 机制作为基础,要深入理解 LLM 的原理和结构,深入、全面地理解 Transformer 是必经之路。 134 | 135 |   ***基于 Transformer 经典论文《Attention is All You Need》,我们在 pytorch 层手工搭建了一个完整、可复现、可运行的 Transformer 模型,帮助学习者更深入地理解 Transformer 原理结构,打牢 LLM 基础。*** 136 | 137 | ## 致谢 138 | 139 | #### 核心贡献者 140 | 141 | - [肖鸿儒-项目负责人](https://github.com/Hongru0306)(Datawhale成员-同济大学) 142 | - [宋志学-项目负责人](https://github.com/KMnO4-zx)(Datawhale成员-中国矿业大学(北京)) 143 | - [邹雨衡-项目负责人](https://github.com/logan-zou)(Datawhale成员-对外经济贸易大学) 144 | - [刘啸宇-核心贡献者](https://github.com/lxysl)(华中科技大学) 145 | 146 | #### 其他 147 | 148 | - 特别感谢[@Sm1les](https://github.com/Sm1les)对本项目的帮助与支持 149 | - 如果有任何想法可以联系我们 DataWhale 也欢迎大家多多提出 issue 150 | - 特别感谢以下为教程做出贡献的同学! 151 | 152 |
153 | 154 | 155 | 156 |
157 | 158 | ## 关注我们 159 | 160 |
161 |

扫描下方二维码关注公众号:Datawhale

162 | 163 |
164 |   Datawhale,一个专注于AI领域的学习圈子。初衷是for the learner,和学习者一起成长。目前加入学习社群的人数已经数千人,组织了机器学习,深度学习,数据分析,数据挖掘,爬虫,编程,统计学,Mysql,数据竞赛等多个领域的内容学习,微信搜索公众号Datawhale可以加入我们。 165 | 166 | ## LICENSE 167 | 168 | 知识共享许可协议
本作品采用知识共享署名-非商业性使用-相同方式共享 4.0 国际许可协议进行许可。 169 | 170 | *注:默认使用CC 4.0协议,也可根据自身项目情况选用其他协议* 171 | -------------------------------------------------------------------------------- /content/Qwen-blog/img/GQA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/GQA.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/GQA2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/GQA2.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/MLP1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/MLP1.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/Mask1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/Mask1.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/Mask2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/Mask2.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/Mask3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/Mask3.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/Qwen2Attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/Qwen2Attention.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/RMSNorm_formulation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/RMSNorm_formulation.jpg -------------------------------------------------------------------------------- /content/Qwen-blog/img/ROPE1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/ROPE1.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/ROPE2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/ROPE2.png -------------------------------------------------------------------------------- 
/content/Qwen-blog/img/ROPE3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/ROPE3.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/ROPE4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/ROPE4.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/ROPE5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/ROPE5.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/ROPE6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/ROPE6.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/ROPE7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/ROPE7.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/ROPE8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/ROPE8.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/decoderlayer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/decoderlayer.png -------------------------------------------------------------------------------- /content/Qwen-blog/img/framework.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/Qwen-blog/img/framework.JPG -------------------------------------------------------------------------------- /content/TinyAgent/agent_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/root/.conda/envs/social/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from tinyAgent.Agent import Agent" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 4, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "================ Loading model ================\n" 31 | ] 32 | }, 33 | { 34 | "name": "stderr", 35 | "output_type": "stream", 36 | "text": [ 37 | "Loading checkpoint shards: 0%| | 0/21 [00:00 论文:***[ReAct: Synergizing Reasoning and Acting in Language Models](http://arxiv.org/abs/2210.03629)*** 11 | 12 |
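在进入实现细节之前,可以先把 ReAct 的运行方式概括为"思考、调用工具、观察结果、再思考"的循环。下面用几行 Python 勾勒这个循环(仅为帮助理解的示意,`llm`、`tools`、`parse_action` 均为假设的占位参数,具体实现见下文与 `tinyAgent/Agent.py`):

```python
def react_loop(llm, tools, parse_action, question, max_turns=3):
    """ReAct 循环示意:模型输出 Thought/Action,框架执行工具并回填 Observation"""
    text = f"Question: {question}"
    reply = ""
    for _ in range(max_turns):
        reply = llm(text)                          # 模型给出 Thought / Action / Action Input
        if "Final Answer:" in reply:
            return reply                           # 模型认为已能回答,直接返回
        action, action_input = parse_action(reply) # 解析出要调用的工具及参数
        observation = tools[action](action_input)  # 执行工具
        text = reply + f"\nObservation: {observation}"  # 把结果拼回,进入下一轮
    return reply
```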
13 | 14 |
15 | 16 | ## 实现细节 17 | 18 | ### Step 1: 构造大模型 19 | 20 | 首先我们需要一个大模型,这里我们使用`InternLM2`作为我们的 Agent 模型。`InternLM2`是一个基于`Decoder-Only`的通用对话大模型,可以使用`transformers`库来加载`InternLM2`模型。 21 | 22 | 首先,还是先创建一个`BaseModel`类,我们可以在这个类中定义一些基本的方法,比如`chat`方法和`load_model`方法,方便以后扩展使用其他模型。 23 | 24 | ```python 25 | class BaseModel: 26 | def __init__(self, path: str = '') -> None: 27 | self.path = path 28 | 29 | def chat(self, prompt: str, history: List[dict]): 30 | pass 31 | 32 | def load_model(self): 33 | pass 34 | ``` 35 | 36 | 接着,我们创建一个`InternLM2`类,这个类继承自`BaseModel`类,我们在这个类中实现`chat`方法和`load_model`方法。就和正常加载`InternLM2`模型一样,来做一个简单的加载和返回即可。 37 | 38 | ```python 39 | class InternLM2Chat(BaseModel): 40 | def __init__(self, path: str = '') -> None: 41 | super().__init__(path) 42 | self.load_model() 43 | 44 | def load_model(self): 45 | print('================ Loading model ================') 46 | self.tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True) 47 | self.model = AutoModelForCausalLM.from_pretrained(self.path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval() 48 | print('================ Model loaded ================') 49 | 50 | def chat(self, prompt: str, history: List[dict], meta_instruction:str ='') -> str: 51 | response, history = self.model.chat(self.tokenizer, prompt, history, temperature=0.1, meta_instruction=meta_instruction) 52 | return response, history 53 | ``` 54 | 55 | ### Step 2: 构造工具 56 | 57 | 我们在`tools.py`文件中,构造一些工具,比如`Google搜索`。在这个文件中,构造一个`Tools`类。在这个类中,我们需要添加一些工具的描述信息和具体实现方式。 58 | 59 | 添加工具的描述信息,是为了在构造`system_prompt`的时候,让模型能够知道可以调用哪些工具,以及工具的描述信息和参数。 60 | 61 | - 首先要在 `tools` 中添加工具的描述信息 62 | - 然后在 `tools` 中添加工具的具体实现 63 | 64 | > 使用Google搜索功能的话需要去`serper`官网申请一下`token`: https://serper.dev/dashboard, 然后在tools.py文件中填写你的key,这个key每人可以免费申请一个,且有2500次的免费调用额度,足够做实验用啦~ 65 | 66 | ```python 67 | class Tools: 68 | def __init__(self) -> None: 69 | self.toolConfig = self._tools() 70 | 71 | def _tools(self): 72 | tools = [ 73 | { 74 | 'name_for_human': '谷歌搜索', 75 | 'name_for_model': 'google_search', 76 | 'description_for_model': '谷歌搜索是一个通用搜索引擎,可用于访问互联网、查询百科知识、了解时事新闻等。', 77 | 'parameters': [ 78 | { 79 | 'name': 'search_query', 80 | 'description': '搜索关键词或短语', 81 | 'required': True, 82 | 'schema': {'type': 'string'}, 83 | } 84 | ], 85 | } 86 | ] 87 | return tools 88 | 89 | def google_search(self, search_query: str): 90 | pass 91 | ``` 92 | 93 | ### Step 3: 构造Agent 94 | 95 | 我们在`Agent.py`文件中,构造一个`Agent`类,这个`Agent`是一个`React`范式的`Agent`,我们在这个`Agent`类中,实现了`text_completion`方法,这个方法是一个对话方法,我们在这个方法中,调用`InternLM2`模型,然后根据`React`的`Agent`的逻辑,来调用`Tools`中的工具。 96 | 97 | 首先我们要构造`system_prompt`, 这个是系统的提示,我们可以在这个提示中,添加一些系统的提示信息,比如`ReAct`形式的`prompt`。 98 | 99 | ```python 100 | def build_system_input(self): 101 | tool_descs, tool_names = [], [] 102 | for tool in self.tool.toolConfig: 103 | tool_descs.append(TOOL_DESC.format(**tool)) 104 | tool_names.append(tool['name_for_model']) 105 | tool_descs = '\n\n'.join(tool_descs) 106 | tool_names = ','.join(tool_names) 107 | sys_prompt = REACT_PROMPT.format(tool_descs=tool_descs, tool_names=tool_names) 108 | return sys_prompt 109 | ``` 110 | 111 | OK, 如果顺利的话,运行出来的示例应该是这样的: 112 | 113 | ``` 114 | Answer the following questions as best you can. You have access to the following tools: 115 | 116 | google_search: Call this tool to interact with the 谷歌搜索 API. What is the 谷歌搜索 API useful for? 
谷歌搜索是一个通用搜索引擎,可用于访问互联网、查询百科知识、了解时事新闻等。 Parameters: [{'name': 'search_query', 'description': '搜索关键词或短语', 'required': True, 'schema': {'type': 'string'}}] Format the arguments as a JSON object. 117 | 118 | Use the following format: 119 | 120 | Question: the input question you must answer 121 | Thought: you should always think about what to do 122 | Action: the action to take, should be one of [google_search] 123 | Action Input: the input to the action 124 | Observation: the result of the action 125 | ... (this Thought/Action/Action Input/Observation can be repeated zero or more times) 126 | Thought: I now know the final answer 127 | Final Answer: the final answer to the original input question 128 | 129 | Begin! 130 | ``` 131 | 132 | 这个`system_prompt`告诉了大模型,它可以调用哪些工具,以什么样的方式输出,以及工具的描述信息和工具应该接受什么样的参数。 133 | 134 | > 目前只是实现了一个简单的`Google搜索`工具,后续会添加更多的关于地理信息系统分析的工具,没错,我是一个地理信息系统的学生。 135 | 136 | 关于Agent的具体结构可以在`tinyAgent/Agent.py`中查看。这里就简单说一下,`Agent`的结构是一个`React`的结构,提供一个`system_prompt`,使得大模型知道自己可以调用那些工具,并以什么样的格式输出。 137 | 138 | 每次用户的提问,如果需要调用工具的话,都会进行两次的大模型调用,第一次解析用户的提问,选择调用的工具和参数,第二次将工具返回的结果与用户的提问整合。这样就可以实现一个`React`的结构。 139 | 140 | 下面为`Agent`代码的简易实现,每个函数的具体实现可以在`tinyAgent/Agent.py`中查看。 141 | 142 | ```python 143 | class Agent: 144 | def __init__(self, path: str = '') -> None: 145 | pass 146 | 147 | def build_system_input(self): 148 | # 构造上文中所说的系统提示词 149 | pass 150 | 151 | def parse_latest_plugin_call(self, text): 152 | # 解析第一次大模型返回选择的工具和工具参数 153 | pass 154 | 155 | def call_plugin(self, plugin_name, plugin_args): 156 | # 调用选择的工具 157 | pass 158 | 159 | def text_completion(self, text, history=[]): 160 | # 整合两次调用 161 | pass 162 | ``` 163 | 164 |
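为了更具体一点,下面给出一个假设的模型输出片段,并注释说明 `parse_latest_plugin_call` 会从中解析出什么(示例内容为虚构,仅用于说明解析逻辑):

```python
# 假设第一次调用后,模型返回了如下 ReAct 格式的文本(内容为虚构示例)
model_output = (
    "Thought: 我需要先搜索相关信息\n"
    "Action: google_search\n"
    "Action Input: {\"search_query\": \"周杰伦 出生年份\"}\n"
)

# parse_latest_plugin_call 会定位最后一次出现的 Action / Action Input / Observation,
# 对上面这段文本,解析结果大致为:
#   plugin_name -> "google_search"
#   plugin_args -> '{"search_query": "周杰伦 出生年份"}'
# 随后 call_plugin 执行 google_search,把 "\nObservation: ..." 拼接回文本,
# 再携带同一个 system_prompt 做第二次模型调用,由模型给出 Final Answer。
```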
165 | 166 |
167 | 168 | ### Step 4: 运行Agent 169 | 170 | 在这个案例中,使用了`InternLM2-chat-7B`模型, 如果你想要`Agent`运行的更加稳定,可以使用它的`big cup`版本`InternLM2-20b-chat`,这样可以提高`Agent`的稳定性。 171 | 172 | ```python 173 | from Agent import Agent 174 | 175 | 176 | agent = Agent('/root/share/model_repos/internlm2-chat-20b') 177 | 178 | response, _ = agent.text_completion(text='你好', history=[]) 179 | print(response) 180 | 181 | # Thought: 你好,请问有什么我可以帮助你的吗? 182 | # Action: google_search 183 | # Action Input: {'search_query': '你好'} 184 | # Observation:Many translated example sentences containing "你好" – English-Chinese dictionary and search engine for English translations. 185 | # Final Answer: 你好,请问有什么我可以帮助你的吗? 186 | 187 | response, _ = agent.text_completion(text='周杰伦是哪一年出生的?', history=_) 188 | print(response) 189 | 190 | # Final Answer: 周杰伦的出生年份是1979年。 191 | 192 | response, _ = agent.text_completion(text='周杰伦是谁?', history=_) 193 | print(response) 194 | 195 | # Thought: 根据我的搜索结果,周杰伦是一位台湾的创作男歌手、钢琴家和词曲作家。他的首张专辑《杰倫》于2000年推出,他的音乐遍及亚太区和西方国家。 196 | # Final Answer: 周杰伦是一位台湾创作男歌手、钢琴家、词曲作家和唱片制作人。他于2000年推出了首张专辑《杰伦》,他的音乐遍布亚太地区和西方国家。他的音乐风格独特,融合了流行、摇滚、嘻哈、电子等多种元素,深受全球粉丝喜爱。他的代表作品包括《稻香》、《青花瓷》、《听妈妈的话》等。 197 | 198 | response, _ = agent.text_completion(text='他的第一张专辑是什么?', history=_) 199 | print(response) 200 | 201 | # Final Answer: 周杰伦的第一张专辑是《Jay》。 202 | ``` 203 | 204 | > ***记得给仓库点个小小的 star 哦~*** 205 | 206 | ## 论文参考 207 | 208 | - [ReAct: Synergizing Reasoning and Acting in Language Models](http://arxiv.org/abs/2210.03629) -------------------------------------------------------------------------------- /content/TinyAgent/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | transformers 4 | tqdm 5 | requests 6 | json5 7 | sentencepiece -------------------------------------------------------------------------------- /content/TinyAgent/tinyAgent/Agent.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple, Union 2 | import json5 3 | 4 | from tinyAgent.LLM import InternLM2Chat 5 | from tinyAgent.tool import Tools 6 | 7 | 8 | TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters} Format the arguments as a JSON object.""" 9 | REACT_PROMPT = """Answer the following questions as best you can. You have access to the following tools: 10 | 11 | {tool_descs} 12 | 13 | Use the following format: 14 | 15 | Question: the input question you must answer 16 | Thought: you should always think about what to do 17 | Action: the action to take, should be one of [{tool_names}] 18 | Action Input: the input to the action 19 | Observation: the result of the action 20 | ... (this Thought/Action/Action Input/Observation can be repeated zero or more times) 21 | Thought: I now know the final answer 22 | Final Answer: the final answer to the original input question 23 | 24 | Begin! 
25 | """ 26 | 27 | 28 | class Agent: 29 | def __init__(self, path: str = '') -> None: 30 | self.path = path 31 | self.tool = Tools() 32 | self.system_prompt = self.build_system_input() 33 | self.model = InternLM2Chat(path) 34 | 35 | def build_system_input(self): 36 | tool_descs, tool_names = [], [] 37 | for tool in self.tool.toolConfig: 38 | tool_descs.append(TOOL_DESC.format(**tool)) 39 | tool_names.append(tool['name_for_model']) 40 | tool_descs = '\n\n'.join(tool_descs) 41 | tool_names = ','.join(tool_names) 42 | sys_prompt = REACT_PROMPT.format(tool_descs=tool_descs, tool_names=tool_names) 43 | return sys_prompt 44 | 45 | def parse_latest_plugin_call(self, text): 46 | plugin_name, plugin_args = '', '' 47 | i = text.rfind('\nAction:') 48 | j = text.rfind('\nAction Input:') 49 | k = text.rfind('\nObservation:') 50 | if 0 <= i < j: # If the text has `Action` and `Action input`, 51 | if k < j: # but does not contain `Observation`, 52 | text = text.rstrip() + '\nObservation:' # Add it back. 53 | k = text.rfind('\nObservation:') 54 | plugin_name = text[i + len('\nAction:') : j].strip() 55 | plugin_args = text[j + len('\nAction Input:') : k].strip() 56 | text = text[:k] 57 | return plugin_name, plugin_args, text 58 | 59 | def call_plugin(self, plugin_name, plugin_args): 60 | plugin_args = json5.loads(plugin_args) 61 | if plugin_name == 'google_search': 62 | return '\nObservation:' + self.tool.google_search(**plugin_args) 63 | 64 | def text_completion(self, text, history=[]): 65 | text = "\nQuestion:" + text 66 | response, his = self.model.chat(text, history, self.system_prompt) 67 | print(response) 68 | plugin_name, plugin_args, response = self.parse_latest_plugin_call(response) 69 | if plugin_name: 70 | response += self.call_plugin(plugin_name, plugin_args) 71 | response, his = self.model.chat(response, history, self.system_prompt) 72 | return response, his 73 | 74 | if __name__ == '__main__': 75 | agent = Agent('/root/share/model_repos/internlm2-chat-7b') 76 | prompt = agent.build_system_input() 77 | print(prompt) -------------------------------------------------------------------------------- /content/TinyAgent/tinyAgent/LLM.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple, Union 2 | 3 | import torch 4 | from transformers import AutoTokenizer, AutoModelForCausalLM 5 | 6 | 7 | class BaseModel: 8 | def __init__(self, path: str = '') -> None: 9 | self.path = path 10 | 11 | def chat(self, prompt: str, history: List[dict]): 12 | pass 13 | 14 | def load_model(self): 15 | pass 16 | 17 | class InternLM2Chat(BaseModel): 18 | def __init__(self, path: str = '') -> None: 19 | super().__init__(path) 20 | self.load_model() 21 | 22 | def load_model(self): 23 | print('================ Loading model ================') 24 | self.tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True) 25 | self.model = AutoModelForCausalLM.from_pretrained(self.path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval() 26 | print('================ Model loaded ================') 27 | 28 | def chat(self, prompt: str, history: List[dict], meta_instruction:str ='') -> str: 29 | response, history = self.model.chat(self.tokenizer, prompt, history, temperature=0.1, meta_instruction=meta_instruction) 30 | return response, history 31 | 32 | # if __name__ == '__main__': 33 | # model = InternLM2Chat('/root/share/model_repos/internlm2-chat-7b') 34 | # print(model.chat('Hello', [])) 
-------------------------------------------------------------------------------- /content/TinyAgent/tinyAgent/__pycache__/Agent.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyAgent/tinyAgent/__pycache__/Agent.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyAgent/tinyAgent/__pycache__/LLM.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyAgent/tinyAgent/__pycache__/LLM.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyAgent/tinyAgent/__pycache__/tool.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyAgent/tinyAgent/__pycache__/tool.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyAgent/tinyAgent/tool.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | import requests 3 | 4 | """ 5 | 工具函数 6 | 7 | - 首先要在 tools 中添加工具的描述信息 8 | - 然后在 tools 中添加工具的具体实现 9 | 10 | - https://serper.dev/dashboard 11 | """ 12 | 13 | class Tools: 14 | def __init__(self) -> None: 15 | self.toolConfig = self._tools() 16 | 17 | def _tools(self): 18 | tools = [ 19 | { 20 | 'name_for_human': '谷歌搜索', 21 | 'name_for_model': 'google_search', 22 | 'description_for_model': '谷歌搜索是一个通用搜索引擎,可用于访问互联网、查询百科知识、了解时事新闻等。', 23 | 'parameters': [ 24 | { 25 | 'name': 'search_query', 26 | 'description': '搜索关键词或短语', 27 | 'required': True, 28 | 'schema': {'type': 'string'}, 29 | } 30 | ], 31 | } 32 | ] 33 | return tools 34 | 35 | def google_search(self, search_query: str): 36 | url = "https://google.serper.dev/search" 37 | 38 | payload = json.dumps({"q": search_query}) 39 | headers = { 40 | 'X-API-KEY': '修改为你自己的key', 41 | 'Content-Type': 'application/json' 42 | } 43 | 44 | response = requests.request("POST", url, headers=headers, data=payload).json() 45 | 46 | return response['organic'][0]['snippet'] 47 | -------------------------------------------------------------------------------- /content/TinyDiffusion/datasets/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyDiffusion/datasets/.keep -------------------------------------------------------------------------------- /content/TinyDiffusion/ddpm/dataloader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torchvision 4 | from torchvision import transforms 5 | from torch.utils.data import DataLoader 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def load_transformed_dataset(img_size=32, batch_size=128) -> DataLoader: 10 | """加载并转换CIFAR10数据集""" 11 | train_data_transform = transforms.Compose([ 12 | transforms.Resize((img_size, img_size)), 13 | transforms.RandomHorizontalFlip(), # 随机水平翻转 14 | transforms.ToTensor(), # 将数据缩放到[0, 1]范围 15 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), # 将数据缩放到[-1, 1]范围 16 | ]) 17 | test_data_transform 
= transforms.Compose([ 18 | transforms.Resize((img_size, img_size)), 19 | transforms.ToTensor(), 20 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), 21 | ]) 22 | 23 | # 加载训练集和测试集 24 | train_dataset = torchvision.datasets.CIFAR10(root="./datasets", 25 | train=True, 26 | download=False, 27 | transform=train_data_transform) 28 | 29 | test_dataset = torchvision.datasets.CIFAR10(root="./datasets", 30 | train=False, 31 | download=False, 32 | transform=test_data_transform) 33 | 34 | # 创建 DataLoader 35 | train_loader = DataLoader(train_dataset, 36 | batch_size=batch_size, 37 | shuffle=True, 38 | drop_last=True) 39 | 40 | test_loader = DataLoader(test_dataset, 41 | batch_size=batch_size, 42 | shuffle=False, 43 | drop_last=True) 44 | 45 | return train_loader, test_loader 46 | 47 | 48 | def show_tensor_image(image): 49 | reverse_transforms = transforms.Compose([ 50 | transforms.Lambda(lambda t: (t + 1) / 2), # 将数据从[-1, 1]缩放到[0, 1]范围 51 | transforms.Lambda(lambda t: t.permute(1, 2, 0)), # 将通道顺序从CHW改为HWC 52 | transforms.Lambda(lambda t: t * 255.), # 将数据缩放到[0, 255]范围 53 | transforms.Lambda(lambda t: t.numpy().astype(np.uint8)), # 将数据转换为uint8类型 54 | transforms.ToPILImage(), # 将数据转换为PIL图像格式 55 | ]) 56 | 57 | # 如果图像是批次数据,则取第一个图像 58 | if len(image.shape) == 4: 59 | image = image[0, :, :, :] 60 | return reverse_transforms(image) 61 | 62 | 63 | if __name__ == "__main__": 64 | train_loader, test_loader = load_transformed_dataset() 65 | image, _ = next(iter(train_loader)) 66 | plt.imshow(show_tensor_image(image)) 67 | plt.show() 68 | -------------------------------------------------------------------------------- /content/TinyDiffusion/ddpm/diffusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class NoiseScheduler(nn.Module): 6 | def __init__(self, beta_start=0.0001, beta_end=0.02, num_steps=1000): 7 | """初始化噪声调度器 8 | Args: 9 | beta_start: β1,初始噪声水平 10 | beta_end: βT,最终噪声水平 11 | num_steps: T,扩散步数 12 | device: 运行设备 13 | """ 14 | super().__init__() 15 | self.beta_start = beta_start 16 | self.beta_end = beta_end 17 | self.num_steps = num_steps 18 | 19 | # β_t: 线性噪声调度 20 | self.register_buffer('betas', torch.linspace(beta_start, beta_end, num_steps)) 21 | # α_t = 1 - β_t 22 | self.register_buffer('alphas', 1.0 - self.betas) 23 | # α_bar_t = ∏(1-β_i) from i=1 to t 24 | self.register_buffer('alpha_bar', torch.cumprod(self.alphas, dim=0)) 25 | # α_bar_(t-1) 26 | self.register_buffer('alpha_bar_prev', torch.cat([torch.tensor([1.0]), self.alpha_bar[:-1]], dim=0)) 27 | # sqrt(α_bar_t) 28 | self.register_buffer('sqrt_alpha_bar', torch.sqrt(self.alpha_bar)) 29 | # 1/sqrt(α_t) 30 | self.register_buffer('sqrt_recip_alphas', torch.sqrt(1.0 / self.alphas)) 31 | # sqrt(1-α_bar_t) 32 | self.register_buffer('sqrt_one_minus_alpha_bar', torch.sqrt(1.0 - self.alpha_bar)) 33 | 34 | # 1/sqrt(α_bar_t) 35 | self.register_buffer('sqrt_recip_alphas_bar', torch.sqrt(1.0 / self.alpha_bar)) 36 | # sqrt(1/α_bar_t - 1) 37 | self.register_buffer('sqrt_recipm1_alphas_bar', torch.sqrt(1.0 / self.alpha_bar - 1)) 38 | # 后验分布方差 σ_t^2 39 | self.register_buffer('posterior_var', self.betas * (1.0 - self.alpha_bar_prev) / (1.0 - self.alpha_bar)) 40 | # 后验分布均值系数1: β_t * sqrt(α_bar_(t-1))/(1-α_bar_t) 41 | self.register_buffer('posterior_mean_coef1', self.betas * torch.sqrt(self.alpha_bar_prev) / (1.0 - self.alpha_bar)) 42 | # 后验分布均值系数2: (1-α_bar_(t-1)) * sqrt(α_t)/(1-α_bar_t) 43 | self.register_buffer('posterior_mean_coef2', (1.0 - self.alpha_bar_prev) 
* torch.sqrt(self.alphas) / (1.0 - self.alpha_bar)) 44 | 45 | def get(self, var, t, x_shape): 46 | """获取指定时间步的变量值并调整形状 47 | Args: 48 | var: 要查询的变量 49 | t: 时间步 50 | x_shape: 目标形状 51 | Returns: 52 | 调整后的变量值 53 | """ 54 | # 从变量张量中收集指定时间步的值 55 | out = var[t] 56 | # 调整形状为[batch_size, 1, 1, 1],以便进行广播 57 | return out.view([t.shape[0]] + [1] * (len(x_shape) - 1)) 58 | 59 | def add_noise(self, x, t): 60 | """向输入添加噪声 61 | 实现公式: x_t = sqrt(α_bar_t) * x_0 + sqrt(1-α_bar_t) * ε, ε ~ N(0,I) 62 | Args: 63 | x: 输入图像 x_0 64 | t: 时间步 65 | Returns: 66 | (noisy_x, noise): 加噪后的图像和使用的噪声 67 | """ 68 | # 获取时间步t对应的sqrt(α_bar_t) 69 | sqrt_alpha_bar = self.get(self.sqrt_alpha_bar, t, x.shape) 70 | # 获取时间步t对应的sqrt(1-α_bar_t) 71 | sqrt_one_minus_alpha_bar = self.get(self.sqrt_one_minus_alpha_bar, t, x.shape) 72 | # 从标准正态分布采样噪声 ε ~ N(0,I) 73 | noise = torch.randn_like(x) 74 | # 实现前向扩散过程: x_t = sqrt(α_bar_t) * x_0 + sqrt(1-α_bar_t) * ε 75 | return sqrt_alpha_bar * x + sqrt_one_minus_alpha_bar * noise, noise 76 | 77 | 78 | def plot_diffusion_steps(image, noise_scheduler, step_size=100): 79 | """绘制图像逐步加噪的过程 80 | Args: 81 | image: 原始图像 82 | noise_scheduler: 噪声调度器 83 | step_size: 每隔多少步绘制一次 84 | Returns: 85 | fig: 绘制的图像 86 | """ 87 | num_images = noise_scheduler.num_steps // step_size 88 | fig = plt.figure(figsize=(15, 3)) 89 | 90 | # 绘制原始图像 91 | plt.subplot(1, num_images + 1, 1) 92 | plt.imshow(show_tensor_image(image)) 93 | plt.axis('off') 94 | plt.title('Original') 95 | 96 | # 绘制不同时间步的噪声图像 97 | for idx in range(num_images): 98 | t = torch.tensor([idx * step_size]) 99 | noisy_image, _ = noise_scheduler.add_noise(image, t) 100 | plt.subplot(1, num_images + 1, idx + 2) 101 | plt.imshow(show_tensor_image(noisy_image)) 102 | plt.axis('off') 103 | plt.title(f't={t.item()}') 104 | 105 | plt.tight_layout() 106 | return fig 107 | 108 | 109 | if __name__ == "__main__": 110 | import matplotlib.pyplot as plt 111 | from dataloader import load_transformed_dataset, show_tensor_image 112 | 113 | train_loader, test_loader = load_transformed_dataset() 114 | image, _ = next(iter(train_loader)) 115 | noise_scheduler = NoiseScheduler() 116 | noisy_image, noise = noise_scheduler.add_noise(image, torch.randint(0, noise_scheduler.num_steps, (image.shape[0],))) 117 | plt.imshow(show_tensor_image(noisy_image)) 118 | 119 | # 绘制加噪过程 120 | fig = plot_diffusion_steps(image[0:1], noise_scheduler) 121 | plt.show() 122 | -------------------------------------------------------------------------------- /content/TinyDiffusion/ddpm/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models as models 4 | from torch.nn import functional as F 5 | import numpy as np 6 | from scipy import linalg 7 | from tqdm import tqdm 8 | from torchvision import transforms 9 | from torch.utils.data import DataLoader, TensorDataset 10 | 11 | 12 | class InceptionStatistics: 13 | def __init__(self, device='cuda'): 14 | self.device = device 15 | # 加载预训练的Inception v3模型 16 | self.model = models.inception_v3(weights=models.Inception_V3_Weights.IMAGENET1K_V1, transform_input=False) 17 | self.model.fc = nn.Identity() # 移除最后的全连接层 18 | self.model = self.model.to(device) 19 | self.model.eval() 20 | 21 | # 设置图像预处理 22 | self.preprocess = transforms.Compose([ 23 | transforms.Resize(299), 24 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) 25 | ]) 26 | 27 | @torch.no_grad() 28 | def get_features(self, images): 29 | """获取Inception特征""" 30 | features = [] 31 | probs = [] 32 | 33 | 
# 将图像处理为299x299大小 34 | images = self.preprocess(images) 35 | 36 | # 批量处理图像 37 | dataset = TensorDataset(images) 38 | dataloader = DataLoader(dataset, batch_size=32) 39 | 40 | for (batch,) in tqdm(dataloader): 41 | batch = batch.to(self.device) 42 | 43 | # 获取特征和logits 44 | feature = self.model(batch) 45 | prob = F.softmax(feature, dim=1) 46 | 47 | features.append(feature.cpu().numpy()) 48 | probs.append(prob.cpu().numpy()) 49 | 50 | features = np.concatenate(features, axis=0) 51 | probs = np.concatenate(probs, axis=0) 52 | 53 | return features, probs 54 | 55 | def calculate_inception_score(probs, splits=10): 56 | """计算Inception Score 57 | 58 | IS = exp(E[KL(p(y|x) || p(y))]) 59 | 60 | 其中: 61 | - p(y|x) 是生成图像通过Inception模型得到的条件类别分布(probs) 62 | - p(y) 是边缘类别分布,通过对所有图像的p(y|x)取平均得到 63 | - KL是KL散度,用于衡量两个分布的差异 64 | - E是对所有图像的期望 65 | 66 | 具体步骤: 67 | 1. 将所有图像分成splits组 68 | 2. 对每组计算: 69 | - 计算边缘分布p(y) 70 | - 计算KL散度 71 | - 取指数 72 | 3. 返回所有组得分的均值和标准差 73 | """ 74 | # 存储每个split的IS分数 75 | scores = [] 76 | # 计算每个split的大小 77 | split_size = probs.shape[0] // splits 78 | 79 | # 对每个split进行计算 80 | for i in tqdm(range(splits)): 81 | # 获取当前split的概率分布 82 | part = probs[i * split_size:(i + 1) * split_size] 83 | # 计算KL散度: KL(p(y|x) || p(y)) 84 | kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, axis=0), 0))) 85 | # 对每个样本的KL散度求平均 86 | kl = np.mean(np.sum(kl, axis=1)) 87 | # 计算exp(KL)并添加到scores列表 88 | scores.append(np.exp(kl)) 89 | 90 | # 返回所有split的IS分数的均值和标准差 91 | return np.mean(scores), np.std(scores) 92 | 93 | def calculate_fid(real_features, fake_features): 94 | """计算Fréchet Inception Distance (FID)分数 95 | 96 | FID = ||μ_r - μ_f||^2 + Tr(Σ_r + Σ_f - 2(Σ_r Σ_f)^(1/2)) 97 | 98 | 其中: 99 | - μ_r, μ_f 分别是真实图像和生成图像特征的均值向量 100 | - Σ_r, Σ_f 分别是真实图像和生成图像特征的协方差矩阵 101 | - Tr 表示矩阵的迹(对角线元素之和) 102 | - ||·||^2 表示欧几里得距离的平方 103 | 104 | FID越小表示生成图像的质量越好,分布越接近真实图像 105 | """ 106 | # 计算真实图像和生成图像特征的均值向量和协方差矩阵 107 | mu1, sigma1 = real_features.mean(axis=0), np.cov(real_features, rowvar=False) 108 | mu2, sigma2 = fake_features.mean(axis=0), np.cov(fake_features, rowvar=False) 109 | 110 | # 计算均值向量之间的欧几里得距离的平方 111 | ssdiff = np.sum((mu1 - mu2) ** 2) 112 | # 计算协方差矩阵的平方根项:(Σ_r Σ_f)^(1/2) 113 | covmean = linalg.sqrtm(sigma1.dot(sigma2)) # 耗时较长 114 | # 如果结果包含复数,取其实部 115 | if np.iscomplexobj(covmean): 116 | covmean = covmean.real 117 | 118 | # 计算最终的FID分数 119 | fid = ssdiff + np.trace(sigma1 + sigma2 - 2 * covmean) 120 | return fid 121 | 122 | def evaluate_model(model, scheduler, train_loader, num_samples, batch_size, image_size, device="cuda"): 123 | """评估模型的IS和FID分数""" 124 | # 生成样本 125 | fake_images = [] 126 | num_batches = num_samples // batch_size # 每批生成batch_size张图片 127 | 128 | print(f"生成{num_samples}张图像...") 129 | for _ in tqdm(range(num_batches)): 130 | fake_batch = sample(model, scheduler, batch_size, (3, image_size, image_size), device) 131 | fake_batch = ((fake_batch + 1) / 2) # 转换到[0,1]范围 132 | fake_images.append(fake_batch.cpu()) 133 | 134 | fake_images = torch.cat(fake_images, dim=0) 135 | 136 | # 收集所有真实图像 137 | print("收集真实图像...") 138 | real_images = [] 139 | for batch in tqdm(train_loader): 140 | real_images.append(batch[0]) 141 | real_images = torch.cat(real_images, dim=0) 142 | 143 | # 初始化Inception模型 144 | inception = InceptionStatistics(device=device) 145 | 146 | # 获取真实图像和生成图像的特征 147 | print("计算真实图像特征...") 148 | real_features, real_probs = inception.get_features(real_images) 149 | print("计算生成图像特征...") 150 | fake_features, fake_probs = inception.get_features(fake_images) 151 | 152 | # 计算IS分数 153 | print("计算IS分数...") 
154 | is_score, is_std = calculate_inception_score(fake_probs) 155 | 156 | # 计算FID分数 157 | print("计算FID分数...") 158 | fid_score = calculate_fid(real_features, fake_features) 159 | 160 | return { 161 | "is_score": is_score, 162 | "is_std": is_std, 163 | "fid_score": fid_score 164 | } 165 | 166 | if __name__ == "__main__": 167 | from unet import SimpleUnet 168 | from diffusion import NoiseScheduler 169 | from sample import sample 170 | from dataloader import load_transformed_dataset 171 | 172 | # 加载模型和数据 173 | device = "cuda" if torch.cuda.is_available() else "cpu" 174 | image_size = 32 175 | model = SimpleUnet() 176 | model.load_state_dict(torch.load(f"simple-unet-ddpm-{image_size}.pth", weights_only=True)) 177 | model = model.to(device) 178 | model.eval() 179 | 180 | scheduler = NoiseScheduler().to(device) 181 | 182 | # 加载真实图像数据 183 | train_loader, _ = load_transformed_dataset(image_size, batch_size=128) 184 | 185 | # 评估模型 186 | metrics = evaluate_model( 187 | model=model, 188 | scheduler=scheduler, 189 | train_loader=train_loader, # 传入整个train_loader 190 | num_samples=10000, # 生成10000张图片进行评估 191 | batch_size=100, 192 | image_size=image_size, 193 | device=device 194 | ) 195 | 196 | print(f"Inception Score: {metrics['is_score']:.2f} ± {metrics['is_std']:.2f}") 197 | print(f"FID Score: {metrics['fid_score']:.2f}") 198 | -------------------------------------------------------------------------------- /content/TinyDiffusion/ddpm/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | matplotlib 3 | torch 4 | torchvision 5 | tqdm -------------------------------------------------------------------------------- /content/TinyDiffusion/ddpm/sample.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from tqdm import tqdm 4 | import matplotlib.pyplot as plt 5 | from unet import SimpleUnet 6 | from diffusion import NoiseScheduler 7 | 8 | 9 | def sample(model, scheduler, num_samples, size, device="cpu"): 10 | """从噪声采样生成图像的函数 11 | Args: 12 | model: UNet模型,用于预测噪声 13 | scheduler: 噪声调度器,包含采样所需的所有系数 14 | num_samples: 要生成的样本数量 15 | size: 生成图像的大小,如(3,32,32) 16 | device: 运行设备 17 | Returns: 18 | 生成的图像张量 19 | """ 20 | model.eval() 21 | with torch.no_grad(): 22 | # 从标准正态分布采样初始噪声 x_T ~ N(0,I) 23 | x_t = torch.randn(num_samples, *size).to(device) 24 | 25 | # 逐步去噪,从t=T到t=0 26 | for t in tqdm(reversed(range(scheduler.num_steps)), desc="Sampling"): 27 | # 构造时间步batch 28 | t_batch = torch.tensor([t] * num_samples).to(device) 29 | 30 | # 获取采样需要的系数 31 | sqrt_recip_alpha_bar = scheduler.get(scheduler.sqrt_recip_alphas_bar, t_batch, x_t.shape) 32 | sqrt_recipm1_alpha_bar = scheduler.get(scheduler.sqrt_recipm1_alphas_bar, t_batch, x_t.shape) 33 | posterior_mean_coef1 = scheduler.get(scheduler.posterior_mean_coef1, t_batch, x_t.shape) 34 | posterior_mean_coef2 = scheduler.get(scheduler.posterior_mean_coef2, t_batch, x_t.shape) 35 | 36 | # 预测噪声 ε_θ(x_t,t) 37 | predicted_noise = model(x_t, t_batch) 38 | 39 | # 计算x_0的预测值: x_0 = 1/sqrt(α_bar_t) * x_t - sqrt(1/α_bar_t-1) * ε_θ(x_t,t) 40 | _x_0 = sqrt_recip_alpha_bar * x_t - sqrt_recipm1_alpha_bar * predicted_noise 41 | # 计算后验分布均值 μ_θ(x_t,t) 42 | model_mean = posterior_mean_coef1 * _x_0 + posterior_mean_coef2 * x_t 43 | # 计算后验分布方差的对数值 log(σ_t^2) 44 | model_log_var = scheduler.get(torch.log(torch.cat([scheduler.posterior_var[1:2], scheduler.betas[1:]])), t_batch, x_t.shape) 45 | 46 | if t > 0: 47 | # t>0时从后验分布采样: x_t-1 = μ_θ(x_t,t) + σ_t * z, z~N(0,I) 
48 | noise = torch.randn_like(x_t).to(device) 49 | x_t = model_mean + torch.exp(0.5 * model_log_var) * noise 50 | else: 51 | # t=0时直接使用均值作为生成结果 52 | x_t = model_mean 53 | # 将最终结果裁剪到[-1,1]范围 54 | x_0 = torch.clamp(x_t, -1.0, 1.0) 55 | return x_0 56 | 57 | 58 | def plot(images): 59 | fig = plt.figure(figsize=(12, 8)) 60 | plt.axis("off") 61 | plt.imshow(torchvision.utils.make_grid(images, nrow=5).permute(1, 2, 0)) 62 | plt.tight_layout(pad=1) 63 | return fig 64 | 65 | 66 | if __name__ == "__main__": 67 | image_size = 32 68 | model = SimpleUnet() 69 | model.load_state_dict(torch.load(f"simple-unet-ddpm-{image_size}.pth", weights_only=True)) 70 | device = "cuda" if torch.cuda.is_available() else "cpu" 71 | model.to(device) 72 | scheduler = NoiseScheduler(device=device) 73 | 74 | images = sample(model, scheduler, 10, (3, image_size, image_size), device) 75 | images = ((images + 1) / 2).detach().cpu() 76 | fig = plot(images) 77 | fig.savefig("images-simple-unet-ddpm.png", bbox_inches='tight', pad_inches=0) 78 | -------------------------------------------------------------------------------- /content/TinyDiffusion/ddpm/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import argparse 7 | from tqdm import tqdm 8 | 9 | from diffusion import NoiseScheduler 10 | from unet import SimpleUnet 11 | from dataloader import load_transformed_dataset 12 | from sample import sample, plot 13 | 14 | 15 | def test_step(model, dataloader, noise_scheduler, criterion, epoch, num_epochs, device): 16 | """测试步骤,计算测试集上的损失""" 17 | model.eval() 18 | with torch.no_grad(): 19 | loss_sum = 0 20 | num_batches = 0 21 | pbar = tqdm(dataloader) 22 | for batch in pbar: 23 | images, _ = batch 24 | images = images.to(device) 25 | t = torch.full((images.shape[0],), noise_scheduler.num_steps-1, device=device) 26 | noisy_images, noise = noise_scheduler.add_noise(images, t) 27 | 28 | predicted_noise = model(noisy_images, t) 29 | loss = criterion(noise, predicted_noise) 30 | loss_sum += loss.item() 31 | num_batches += 1 32 | pbar.set_description(f"Epoch {epoch+1}/{num_epochs}, Test Loss: {loss_sum/num_batches:.4f}") 33 | return loss_sum / len(dataloader) 34 | 35 | 36 | def train_step(model, dataloader, noise_scheduler, criterion, optimizer, epoch, num_epochs, device): 37 | """训练步骤,计算训练集上的损失并更新模型参数""" 38 | # 设置模型为训练模式 39 | model.train() 40 | loss_sum = 0 41 | num_batches = 0 42 | pbar = tqdm(dataloader) 43 | for batch in pbar: 44 | # 获取一个batch的图像数据并移至指定设备 45 | images, _ = batch 46 | images = images.to(device) 47 | 48 | # 随机采样时间步t 49 | t = torch.randint(0, noise_scheduler.num_steps, (images.shape[0],), device=device) 50 | 51 | # 对图像添加噪声,获得带噪声的图像和噪声 52 | noisy_images, noise = noise_scheduler.add_noise(images, t) 53 | 54 | # 使用模型预测噪声 55 | predicted_noise = model(noisy_images, t) 56 | 57 | # 计算预测噪声和真实噪声之间的MSE损失 58 | loss = criterion(noise, predicted_noise) 59 | 60 | # 反向传播和优化 61 | optimizer.zero_grad() # 清空梯度 62 | loss.backward() # 计算梯度 63 | torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # 梯度裁剪,防止梯度爆炸 64 | optimizer.step() # 更新参数 65 | 66 | # 累计损失并更新进度条 67 | loss_sum += loss.item() 68 | num_batches += 1 69 | pbar.set_description(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {loss_sum/num_batches:.4f}") 70 | 71 | # 返回平均损失 72 | return loss_sum / len(dataloader) 73 | 74 | 75 | def train(model, train_loader, test_loader, noise_scheduler, criterion, optimizer, device, num_epochs=100, img_size=32): 76 | 
"""训练模型""" 77 | for epoch in range(num_epochs): 78 | train_loss = train_step(model, train_loader, noise_scheduler, criterion, optimizer, epoch, num_epochs, device) 79 | test_loss = test_step(model, test_loader, noise_scheduler, criterion, epoch, num_epochs, device) 80 | if epoch % 10 == 0: 81 | # 采样10张图像 82 | images = sample(model, noise_scheduler, 10, (3, img_size, img_size), device) 83 | # 将图像从[-1, 1]范围缩放到[0, 1]范围,以便可视化 84 | images = ((images + 1) / 2).detach().cpu() 85 | fig = plot(images) 86 | os.makedirs("samples", exist_ok=True) 87 | fig.savefig(f"samples/epoch_{epoch}.png") 88 | return model 89 | 90 | 91 | if __name__ == "__main__": 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('--batch_size', type=int, default=128) 94 | parser.add_argument('--epochs', type=int, default=200) 95 | parser.add_argument('--lr', type=float, default=1e-4) 96 | parser.add_argument('--img_size', type=int, default=32) 97 | args = parser.parse_args() 98 | 99 | device = "cuda" if torch.cuda.is_available() else "cpu" 100 | 101 | train_loader, test_loader = load_transformed_dataset(args.img_size, args.batch_size) 102 | noise_scheduler = NoiseScheduler().to(device) 103 | model = SimpleUnet().to(device) 104 | optimizer = optim.Adam(model.parameters(), lr=args.lr) 105 | criterion = nn.MSELoss() 106 | model = train(model, train_loader, test_loader, noise_scheduler, criterion, optimizer, device, args.epochs, args.img_size) 107 | torch.save(model.state_dict(), f"simple-unet-ddpm-{args.img_size}.pth") 108 | -------------------------------------------------------------------------------- /content/TinyDiffusion/ddpm/unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import math 4 | 5 | 6 | class Block(nn.Module): 7 | def __init__(self, in_channels, out_channels, time_emb_dim, up=False): 8 | """UNet中的基本Block模块,包含时间嵌入和上/下采样功能""" 9 | super().__init__() 10 | self.time_mlp = nn.Linear(time_emb_dim, out_channels) 11 | if up: 12 | self.conv1 = nn.Conv2d(2 * in_channels, out_channels, kernel_size=3, padding=1) 13 | self.transform = nn.ConvTranspose2d(out_channels, out_channels, kernel_size=4, stride=2, padding=1) 14 | else: 15 | self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1) 16 | self.transform = nn.Conv2d(out_channels, out_channels, kernel_size=4, stride=2, padding=1) 17 | 18 | self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1) 19 | self.bnorm1 = nn.BatchNorm2d(out_channels) 20 | self.bnorm2 = nn.BatchNorm2d(out_channels) 21 | self.relu = nn.ReLU() 22 | 23 | def forward(self, x, t): 24 | # 第一次卷积 25 | h = self.bnorm1(self.relu(self.conv1(x))) 26 | # 时间嵌入 27 | time_emb = self.relu(self.time_mlp(t)) 28 | # 将时间信息注入特征图 29 | h = h + time_emb[..., None, None] 30 | # 第二次卷积 31 | h = self.bnorm2(self.relu(self.conv2(h))) 32 | # 上采样或下采样 33 | return self.transform(h) 34 | 35 | 36 | class SinusoidalPositionEmbeddings(nn.Module): 37 | """使用正弦位置编码实现时间步的嵌入,参考Transformer中的位置编码方法,使用正余弦函数将时间步映射到高维空间""" 38 | def __init__(self, dim): 39 | super().__init__() 40 | self.dim = dim 41 | 42 | def forward(self, time): 43 | device = time.device 44 | # 将维度分成两半,分别用于sin和cos 45 | half_dim = self.dim // 2 46 | # 计算不同频率的指数衰减 47 | embeddings = math.log(10000) / (half_dim - 1) 48 | # 生成频率序列 49 | embeddings = torch.exp(torch.arange(half_dim, device=device) * -embeddings) 50 | # 将时间步与频率序列相乘 51 | embeddings = time[:, None] * embeddings[None, :] 52 | # 拼接sin和cos得到最终的嵌入向量 53 | embeddings = 
torch.cat((embeddings.sin(), embeddings.cos()), dim=-1) 54 | return embeddings 55 | 56 | 57 | class SimpleUnet(nn.Module): 58 | """简单的UNet模型,用于扩散模型的噪声预测""" 59 | def __init__(self): 60 | super().__init__() 61 | image_channels = 3 62 | down_channels = (64, 128, 256, 512, 1024) 63 | up_channels = (1024, 512, 256, 128, 64) 64 | out_dim = 3 65 | time_emb_dim = 32 66 | 67 | # 时间嵌入层 68 | self.time_embed = nn.Sequential( 69 | SinusoidalPositionEmbeddings(time_emb_dim), 70 | nn.Linear(time_emb_dim, time_emb_dim), 71 | ) 72 | 73 | # 输入层、下采样层、上采样层和输出层 74 | self.input = nn.Conv2d(image_channels, down_channels[0], kernel_size=3, padding=1) 75 | self.downs = nn.ModuleList([Block(down_channels[i], down_channels[i + 1], time_emb_dim) for i in range(len(down_channels) - 1)]) 76 | self.ups = nn.ModuleList([Block(up_channels[i], up_channels[i + 1], time_emb_dim, up=True) for i in range(len(up_channels) - 1)]) 77 | self.output = nn.Conv2d(up_channels[-1], out_dim, kernel_size=3, padding=1) 78 | 79 | def forward(self, x, time_step): 80 | # 时间步嵌入 81 | t = self.time_embed(time_step) 82 | # 初始卷积 83 | x = self.input(x) 84 | # UNet前向传播:先下采样收集特征,再上采样恢复分辨率 85 | residual_stack = [] 86 | for down in self.downs: 87 | x = down(x, t) 88 | residual_stack.append(x) 89 | for up in self.ups: 90 | residual_x = residual_stack.pop() 91 | x = torch.cat((x, residual_x), dim=1) 92 | x = up(x, t) 93 | return self.output(x) 94 | 95 | 96 | def print_shapes(model, x, time_step): 97 | print("Input shape:", x.shape) 98 | 99 | # 时间步嵌入 100 | t = model.time_embed(time_step) 101 | print("Time embedding shape:", t.shape) 102 | 103 | # 初始卷积 104 | x = model.input(x) 105 | print("After input conv shape:", x.shape) 106 | 107 | #下采样过程 108 | residual_stack = [] 109 | print("\nDownsampling process:") 110 | for i, down in enumerate(model.downs): 111 | x = down(x, t) 112 | residual_stack.append(x) 113 | print(f"Down block {i+1} output shape:", x.shape) 114 | 115 | # 上采样过程 116 | print("\nUpsampling process:") 117 | for i, up in enumerate(model.ups): 118 | residual_x = residual_stack.pop() 119 | x = torch.cat((x, residual_x), dim=1) 120 | print(f"Concatenated input shape before up block {i+1}:", x.shape) 121 | x = up(x, t) 122 | print(f"Up block {i+1} output shape:", x.shape) 123 | 124 | # 最终输出 125 | output = model.output(x) 126 | print("\nFinal output shape:", output.shape) 127 | return output 128 | 129 | 130 | if __name__ == "__main__": 131 | model = SimpleUnet() 132 | x = torch.randn(1, 3, 32, 32) 133 | time_step = torch.tensor([10]) 134 | print_shapes(model, x, time_step) -------------------------------------------------------------------------------- /content/TinyDiffusion/fig/attn-200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyDiffusion/fig/attn-200.png -------------------------------------------------------------------------------- /content/TinyDiffusion/fig/attn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyDiffusion/fig/attn.png -------------------------------------------------------------------------------- /content/TinyDiffusion/fig/ddpm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyDiffusion/fig/ddpm.png -------------------------------------------------------------------------------- /content/TinyDiffusion/fig/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyDiffusion/fig/fig1.png -------------------------------------------------------------------------------- /content/TinyDiffusion/fig/fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyDiffusion/fig/fig2.png -------------------------------------------------------------------------------- /content/TinyDiffusion/fig/fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyDiffusion/fig/fig3.png -------------------------------------------------------------------------------- /content/TinyDiffusion/fig/unet-200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyDiffusion/fig/unet-200.png -------------------------------------------------------------------------------- /content/TinyDiffusion/fig/unet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyDiffusion/fig/unet.png -------------------------------------------------------------------------------- /content/TinyEval/Eval/__pycache__/metrics.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyEval/Eval/__pycache__/metrics.cpython-39.pyc -------------------------------------------------------------------------------- /content/TinyEval/Eval/config/adapter2path.json: -------------------------------------------------------------------------------- 1 | { 2 | "internlm2": "", 3 | "Qwen2": "", 4 | "Qwen2_moe": "" 5 | } -------------------------------------------------------------------------------- /content/TinyEval/Eval/config/dataset2maxlen.json: -------------------------------------------------------------------------------- 1 | { 2 | "multifieldqa_zh": 64, 3 | "multi_news": 512, 4 | "trec": 64, 5 | "custom_zh": 512, 6 | "custom_en": 512 7 | } -------------------------------------------------------------------------------- /content/TinyEval/Eval/config/dataset2prompt.json: -------------------------------------------------------------------------------- 1 | { 2 | "multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:", 3 | "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:", 4 | "trec": "Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}", 5 | "custom_zh": "问题是:{input} \n回答是: ", 6 | "custom_en": "Question:{input} \nAnswer: " 7 | } -------------------------------------------------------------------------------- /content/TinyEval/Eval/config/model2maxlen.json: -------------------------------------------------------------------------------- 1 | { 2 | "internlm2": 2048, 3 | "Qwen2": 2048, 4 | "Qwen2_moe": 2048 5 | } 6 | -------------------------------------------------------------------------------- /content/TinyEval/Eval/config/model2path.json: -------------------------------------------------------------------------------- 1 | { 2 | "internlm2": "path2model", 3 | "Qwen2": "path2model", 4 | "Qwen2_moe": "path2model" 5 | } 6 | -------------------------------------------------------------------------------- /content/TinyEval/Eval/dataset/GAOKAO-new1-math.json: -------------------------------------------------------------------------------- 1 | { 2 | "one-choice-question": [ 3 | { 4 | "question": "已知集合 A = {x | -5 < x^3 < 5}, B = {-3, -1, 0, 2, 3},则 A ∩ B = ( )", 5 | "options": [ 6 | {"option": "A", "value": "{-1, 0}"}, 7 | {"option": "B", "value": "{2, 3}"}, 8 | {"option": "C", "value": "{-3, -1, 0}"}, 9 | {"option": "D", "value": "{-1, 0, 2}"} 10 | ], 11 | "answer": "A" 12 | }, 13 | { 14 | "question": "若 z / (z - 1) = 1 + i,则 z = ( )", 15 | "options": [ 16 | {"option": "A", "value": "-1 - i"}, 17 | {"option": "B", "value": "-1 + i"}, 18 | {"option": "C", "value": "1 - i"}, 19 | {"option": "D", "value": "1 + i"} 20 | ], 21 | "answer": "C" 22 | }, 23 | { 24 | "question": "已知向量 ā = (0, 1), b̄ = (2, x),若 b̄ ⊥ (b̄ - 4ā),则 x = ( )", 25 | "options": [ 26 | {"option": "A", "value": "-2"}, 27 | {"option": "B", "value": "-1"}, 28 | {"option": "C", "value": "1"}, 29 | {"option": "D", "value": "2"} 30 | ], 31 | "answer": "D" 32 | }, 33 | { 34 | "question": "已知 cos(α + β) = m, tanαtan β = 2,则 cos(α - β) = ( )", 35 | "options": [ 36 | {"option": "A", "value": "-3m"}, 37 | {"option": "B", "value": "-m/3"}, 38 | {"option": "C", "value": "m/3"}, 39 | {"option": "D", "value": "3m"} 40 | ], 41 | "answer": "A" 42 | }, 43 | { 44 | "question": "已知圆柱和圆锥底面半径相等,侧面半径相等,且它们的高均为 √3,则圆锥的体积为 ( )", 45 | "options": [ 46 | {"option": "A", "value": "2√3π"}, 47 | {"option": "B", "value": "3√3π"}, 48 | {"option": "C", "value": "6√3π"}, 49 | {"option": "D", "value": "9√3π"} 50 | ], 51 | "answer": "B" 52 | }, 53 | { 54 | "question": "已知函数 f(x) = { -x^2 - 2ax - a, x < 0; e^x + ln(x + 1), x ≥ 0 },在 R 上单调递增,则 a 取值的范围是 ( )", 55 | "options": [ 56 | {"option": "A", "value": "(-∞, 0]"}, 57 | {"option": "B", "value": "[-1, 0]"}, 58 | {"option": "C", "value": "[-1, 1]"}, 59 | {"option": "D", "value": "[0, +∞]"} 60 | ], 61 | "answer": "B" 62 | }, 63 | { 64 | "question": "当 x ∈ [0, 2π] 时,曲线 y = sin x 与 y = 2 sin (3x - π/6) 的交点个数为 ( )", 65 | "options": [ 66 | {"option": "A", "value": "3"}, 67 | {"option": "B", "value": "4"}, 68 | {"option": "C", "value": "6"}, 69 | {"option": "D", "value": "8"} 70 | ], 71 | "answer": "C" 72 | }, 73 | { 74 | "question": "已知函数为 f(x) 的定义域为 R,f(x) > f(x - 1) + f(x - 2),且当 x < 3 时 f(x) = x,则下列结论中一定正确的是 ( )", 75 | "options": [ 76 | {"option": "A", "value": "f(10) > 100"}, 77 | {"option": "B", "value": "f(20) > 1000"}, 78 | {"option": "C", "value": "f(10) < 1000"}, 79 | {"option": "D", "value": "f(20) < 10000"} 80 | ], 81 | "answer": "B" 82 | } 83 | ], 84 | "multiple-choice-question": [ 85 | { 86 | "question": "为了理解动出口后的亩收入(单位:万元)情况,从该种植区抽取样本,得到推动出口后亩收入的样本均值 x̄ = 2.1,样本方差 s^2 = 
0.01,已知该种植区以往的亩收入 X 服从正态分布 N(1.8, 0.1^2),假设推动出口后的亩收入 Y 服从正态分布 N(x̄, s^2),则 ( ) 若随机变量 Z 服从正态分布 N(u, σ^2),P(Z < u + σ) ≈ 0.8413", 87 | "options": [ 88 | {"option": "A", "value": "P(X > 2) > 0.2"}, 89 | {"option": "B", "value": "P(X > 2) < 0.5"}, 90 | {"option": "C", "value": "P(Y > 2) > 0.5"}, 91 | {"option": "D", "value": "P(Y > 2) < 0.8"} 92 | ], 93 | "answer": "BC" 94 | }, 95 | { 96 | "question": "设函数 f(x) = (x - 1)^2(x - 4),则 ( )", 97 | "options": [ 98 | {"option": "A", "value": "x = 3 是 f(x) 的极小值点"}, 99 | {"option": "B", "value": "当 0 < x < 1 时,f(x) < f(x^2)"}, 100 | {"option": "C", "value": "当 1 < x < 2 时,-4 < f(2x - 1) < 0"}, 101 | {"option": "D", "value": "当 -1 < x < 0 时,f(2 - x) > f(x)"} 102 | ], 103 | "answer": "ACD" 104 | } 105 | ] 106 | } 107 | -------------------------------------------------------------------------------- /content/TinyEval/Eval/dataset/GAOKAO_math.jsonl: -------------------------------------------------------------------------------- 1 | { 2 | "instruction": "请您根据以下问题,参考相关参考选项进行作答。", 3 | "input": "问题是:已知集合 A = {x | -5 < x^3 < 5}, B = {-3, -1, 0, 2, 3},则 A ∩ B = ( ), 选项是:A: {-1, 0}, B: {2, 3}, C: {-3, -1, 0}, D: {-1, 0, 2}, 您的答案是:", 4 | "output": "A" 5 | } 6 | { 7 | "instruction": "请您根据以下问题,参考相关参考选项进行作答。", 8 | "input": "问题是:若 z / (z - 1) = 1 + i,则 z = ( ), 选项是:A: -1 - i, B: -1 + i, C: 1 - i, D: 1 + i, 您的答案是:", 9 | "output": "C" 10 | } 11 | { 12 | "instruction": "请您根据以下问题,参考相关参考选项进行作答。", 13 | "input": "问题是:已知向量 ā = (0, 1), b̄ = (2, x),若 b̄ ⊥ (b̄ - 4ā),则 x = ( ), 选项是:A: -2, B: -1, C: 1, D: 2, 您的答案是:", 14 | "output": "D" 15 | } 16 | { 17 | "instruction": "请您根据以下问题,参考相关参考选项进行作答。", 18 | "input": "问题是:已知 cos(α + β) = m, tanαtan β = 2,则 cos(α - β) = ( ), 选项是:A: -3m, B: -m/3, C: m/3, D: 3m, 您的答案是:", 19 | "output": "A" 20 | } 21 | { 22 | "instruction": "请您根据以下问题,参考相关参考选项进行作答。", 23 | "input": "问题是:已知圆柱和圆锥底面半径相等,侧面半径相等,且它们的高均为 √3,则圆锥的体积为 ( ), 选项是:A: 2√3π, B: 3√3π, C: 6√3π, D: 9√3π, 您的答案是:", 24 | "output": "B" 25 | } 26 | { 27 | "instruction": "请您根据以下问题,参考相关参考选项进行作答。", 28 | "input": "问题是:已知函数 f(x) = { -x^2 - 2ax - a, x < 0; e^x + ln(x + 1), x ≥ 0 },在 R 上单调递增,则 a 取值的范围是 ( ), 选项是:A: (-∞, 0], B: [-1, 0], C: [-1, 1], D: [0, +∞], 您的答案是:", 29 | "output": "B" 30 | } 31 | { 32 | "instruction": "请您根据以下问题,参考相关参考选项进行作答。", 33 | "input": "问题是:当 x ∈ [0, 2π] 时,曲线 y = sin x 与 y = 2 sin (3x - π/6) 的交点个数为 ( ), 选项是:A: 3, B: 4, C: 6, D: 8, 您的答案是:", 34 | "output": "C" 35 | } 36 | { 37 | "instruction": "请您根据以下问题,参考相关参考选项进行作答。", 38 | "input": "问题是:已知函数为 f(x) 的定义域为 R,f(x) > f(x - 1) + f(x - 2),且当 x < 3 时 f(x) = x,则下列结论中一定正确的是 ( ), 选项是:A: f(10) > 100, B: f(20) > 1000, C: f(10) < 1000, D: f(20) < 10000, 您的答案是:", 39 | "output": "B" 40 | } 41 | { 42 | "instruction": "请您根据以下问题,参考相关参考选项进行作答,此题为多选题。", 43 | "input": "问题是:为了理解动出口后的亩收入(单位:万元)情况,从该种植区抽取样本,得到推动出口后亩收入的样本均值 x̄ = 2.1,样本方差 s^2 = 0.01,已知该种植区以往的亩收入 X 服从正态分布 N(1.8, 0.1^2),假设推动出口后的亩收入 Y 服从正态分布 N(x̄, s^2),则 ( ) 若随机变量 Z 服从正态分布 N(u, σ^2),P(Z < u + σ) ≈ 0.8413, 选项是:A: P(X > 2) > 0.2, B: P(X > 2) < 0.5, C: P(Y > 2) > 0.5, D: P(Y > 2) < 0.8, 您的答案是:", 44 | "output": "BC" 45 | } 46 | { 47 | "instruction": "请您根据以下问题,参考相关参考选项进行作答,此题为多选题。", 48 | "input": "问题是:设函数 f(x) = (x - 1)^2(x - 4),则 ( ), 选项是:A: x = 3 是 f(x) 的极小值点, B: 当 0 < x < 1 时,f(x) < f(x^2), C: 当 1 < x < 2 时,-4 < f(2x - 1) < 0, D: 当 -1 < x < 0 时,f(2 - x) > f(x), 您的答案是:", 49 | "output": "ACD" 50 | } 51 | -------------------------------------------------------------------------------- /content/TinyEval/Eval/dataset/custom_zh.jsonl: 
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "instruction": "假设你是皇帝身边的女人--甄嬛", 4 | "input": "你是谁?", 5 | "output": "臣妾是甄嬛,家父是大理寺少卿。" 6 | } 7 | 8 | ] -------------------------------------------------------------------------------- /content/TinyEval/Eval/docs/compass.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyEval/Eval/docs/compass.png -------------------------------------------------------------------------------- /content/TinyEval/Eval/metrics.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import jieba 4 | from rouge import Rouge 5 | from collections import Counter 6 | jieba.setLogLevel(jieba.logging.INFO) 7 | 8 | 9 | def normalize_zh_aswer(s): 10 | """小写化,删除标点,删除空格""" 11 | 12 | def white_space_fix(text): 13 | return "".join(text.split()) 14 | 15 | def remove_punc(text): 16 | cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." 17 | all_punctuation = set(string.punctuation + cn_punctuation) 18 | return ''.join(ch for ch in text if ch not in all_punctuation) 19 | 20 | def lower(text): 21 | return text.lower() 22 | 23 | return white_space_fix(remove_punc(lower(s))) 24 | 25 | def normalize_en_answer(s): 26 | """小写化,删除标点,删除冠词和多余空白.""" 27 | 28 | def remove_articles(text): 29 | return re.sub(r"\b(a|an|the)\b", " ", text) 30 | 31 | def white_space_fix(text): 32 | return " ".join(text.split()) 33 | 34 | def remove_punc(text): 35 | exclude = set(string.punctuation) 36 | return "".join(ch for ch in text if ch not in exclude) 37 | 38 | def lower(text): 39 | return text.lower() 40 | 41 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 42 | 43 | def classification_score(prediction, ground_truth, **kwargs): 44 | em_match_list = [] 45 | all_classes = kwargs["all_classes"] 46 | for class_name in all_classes: 47 | if class_name in prediction: # 总类别里面的类别是否在预测中出现 48 | em_match_list.append(class_name) 49 | for match_term in em_match_list: 50 | if match_term in ground_truth and match_term != ground_truth: # 如果预测中的类别在答案中出现,但是不是答案 'two step'--'step' 51 | em_match_list.remove(match_term) 52 | if ground_truth in em_match_list: 53 | score = (1.0 / len(em_match_list)) 54 | else: 55 | score = 0.0 56 | return score 57 | 58 | def rouge_score(prediction, ground_truth, **kwargs): 59 | rouge = Rouge() 60 | try: 61 | scores = rouge.get_scores([prediction], [ground_truth], avg=True) 62 | except: 63 | return 0.0 64 | return scores["rouge-l"]["f"] 65 | 66 | def rouge_zh_score(prediction, ground_truth, **kwargs): 67 | prediction = " ".join(list(jieba.cut(prediction, cut_all=False))) 68 | ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False))) 69 | score = rouge_score(prediction, ground_truth) 70 | return score 71 | 72 | def f1_score(prediction, ground_truth, **kwargs): 73 | # Counter以dict的形式存储各个句子对应的词与其对应个数,&操作符返回两个Counter中共同的元素的键值对 74 | common = Counter(prediction) & Counter(ground_truth) 75 | num_same = sum(common.values()) # 显示prediction与gt的共同元素的个数 76 | if num_same == 0: 77 | return 0 78 | precision = 1.0 * num_same / len(prediction) # 即模型预测正确的样本数量与总预测样本数量的比值 79 | recall = 1.0 * num_same / len(ground_truth) # 模型正确预测的样本数量与总实际样本数量的比值 80 | f1 = (2 * precision * recall) / (precision + recall) 81 | return f1 82 | 83 | def qa_f1_score(prediction, ground_truth, **kwargs): 84 | 
normalized_prediction = normalize_en_answer(prediction) 85 | normalized_ground_truth = normalize_en_answer(ground_truth) 86 | 87 | prediction_tokens = normalized_prediction.split() 88 | ground_truth_tokens = normalized_ground_truth.split() 89 | return f1_score(prediction_tokens, ground_truth_tokens) 90 | 91 | def qa_f1_zh_score(prediction, ground_truth, **kwargs): 92 | prediction_tokens = list(jieba.cut(prediction, cut_all=False)) 93 | ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False)) 94 | prediction_tokens_norm = [normalize_zh_aswer(t) for t in prediction_tokens] 95 | ground_truth_tokens_norm = [normalize_zh_aswer(t) for t in ground_truth_tokens] 96 | prediction_tokens = [t for t in prediction_tokens_norm if len(t) > 0] 97 | ground_truth_tokens = [t for t in ground_truth_tokens_norm if len(t) > 0] 98 | return f1_score(prediction_tokens, ground_truth_tokens) 99 | 100 | def GAOKAO_math(prediction, ground_truth, **kwargs): 101 | score = 0 102 | # 判断是单选还是多选 103 | if len(ground_truth) > 1: 104 | # 多选 105 | pattern = r"[A-D]" 106 | matches = re.findall(pattern, prediction) 107 | predicted_answer = '' 108 | 109 | if matches: 110 | # 从后往前匹配大写字母,且满足之间长度不超过10个字符的条件 111 | reversed_prediction = prediction[::-1] 112 | if len(matches) > 1: 113 | # 从后往前遍历匹配项 114 | for i, match in enumerate(matches): 115 | if i == 0: 116 | predicted_answer += match 117 | else: 118 | # 计算当前匹配项与上一个匹配项之间的距离 119 | distance = reversed_prediction.find(matches[i-1]) - reversed_prediction.find(match) - 1 120 | # 如果距离大于5,则停止添加更多的选项 121 | if distance > 5: 122 | break 123 | predicted_answer += match 124 | # 将预测答案排序并去重 125 | predicted_answer = ''.join(sorted(set(predicted_answer))) 126 | # 计算得分 127 | if predicted_answer == ground_truth: 128 | score = 1 129 | elif all(option in ground_truth for option in predicted_answer) and len(predicted_answer) < len(ground_truth): 130 | score = 0.5 131 | else: 132 | # 单选 133 | pattern = r"[A-D]" 134 | matches = re.findall(pattern, prediction) 135 | if matches and matches[-1] == ground_truth: 136 | score = 1 137 | 138 | return score 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /content/TinyEval/Eval/model/LLM.py: -------------------------------------------------------------------------------- 1 | import json 2 | from transformers import AutoTokenizer, LlamaTokenizer, LlamaForCausalLM, AutoModelForCausalLM 3 | from peft import PeftModel 4 | from typing import Dict, List, Optional, Tuple, Union 5 | import torch 6 | import os 7 | from tqdm import tqdm 8 | 9 | 10 | class BaseLLM: 11 | def __init__(self, path: str, model_name: str, adapter_path: str) -> None: 12 | self.path = path 13 | self.model_name = model_name 14 | self.adapter_path = adapter_path 15 | 16 | def build_chat(self, tokenizer, prompt, model_name): 17 | pass 18 | 19 | def load_model_and_tokenizer(self, path, model_name, device): 20 | pass 21 | 22 | def post_process(self, response, model_name): 23 | pass 24 | 25 | def get_pred(self, data: list, max_length: int, max_gen: int, prompt_format: str, device, out_path: str): 26 | pass 27 | 28 | 29 | class internlm2Chat(BaseLLM): 30 | def __init__(self, path: str, model_name: str = '', adapter_path: str = '') -> None: 31 | super().__init__(path, model_name, adapter_path) # 调用父类初始化函数并传入参数 32 | 33 | def build_chat(self, prompt): 34 | prompt = f'<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n' 35 | return prompt 36 | 37 | def post_process(self, response): 38 | response = response.split("<|im_end|>")[0] 39 | 
return response 40 | 41 | def load_model_and_tokenizer(self, path, device, adapter_path): 42 | model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device) 43 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 44 | if adapter_path: 45 | # print(adapter_path) 46 | model = PeftModel.from_pretrained(model, model_id=adapter_path) 47 | model = model.eval() 48 | return model, tokenizer 49 | 50 | def get_pred(self, data, max_length, max_gen, prompt_format, device, out_path): 51 | model, tokenizer = self.load_model_and_tokenizer(self.path, device, self.adapter_path) 52 | for json_obj in tqdm(data): 53 | prompt = prompt_format.format(**json_obj) 54 | # 在中间截断,因为两头有关键信息. 55 | tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] 56 | if len(tokenized_prompt) > max_length: 57 | half = int(max_length/2) 58 | prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) 59 | 60 | prompt = self.build_chat(prompt) 61 | 62 | input = tokenizer(prompt, truncation=False, return_tensors="pt").to(device) 63 | context_length = input.input_ids.shape[-1] # 表示喂进去的tokens的长度 64 | eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] 65 | 66 | output = model.generate( 67 | **input, 68 | max_new_tokens=max_gen, 69 | do_sample=False, 70 | temperature=1.0, 71 | eos_token_id=eos_token_id, 72 | )[0] 73 | 74 | pred = tokenizer.decode(output[context_length:], skip_special_tokens=True) 75 | pred = self.post_process(pred) 76 | 77 | with open(out_path, "a", encoding="utf-8") as f: 78 | json.dump({"pred": pred, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"], "length": json_obj["length"]}, f, ensure_ascii=False) 79 | f.write('\n') 80 | 81 | 82 | class Qwen2Chat(BaseLLM): 83 | def __init__(self, path: str, model_name: str = '', adapter_path: str = '') -> None: 84 | super().__init__(path, model_name, adapter_path) # 调用父类初始化函数并传入参数 85 | 86 | def build_chat(self, prompt, instruct=None): 87 | if instruct is None: 88 | instruct = 'You are a helpful assistant.' 89 | prompt = f'<|im_start|>system\n{instruct}\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n' 90 | return prompt 91 | 92 | 93 | def load_model_and_tokenizer(self, path, device, adapter_path): 94 | model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device) 95 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 96 | # adapter_path = '' 97 | if adapter_path: 98 | model = PeftModel.from_pretrained(model, model_id=adapter_path) 99 | print(f"adapter loaded in {adapter_path}") 100 | model = model.eval() 101 | return model, tokenizer 102 | 103 | def get_pred(self, data, max_length, max_gen, prompt_format, device, out_path): 104 | model, tokenizer = self.load_model_and_tokenizer(self.path, device, self.adapter_path) 105 | for json_obj in tqdm(data): 106 | prompt = prompt_format.format(**json_obj) 107 | # 在中间截断,因为两头有关键信息. 
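            # The next few lines implement "middle truncation": tokenize without truncation,
            # and if the prompt is longer than max_length, keep the first max_length//2 and
            # the last max_length//2 tokens and drop the middle. For example, with
            # max_length=2048 and a ~3000-token prompt, 1024 head tokens plus 1024 tail tokens
            # are kept and the ~952 middle tokens are discarded, so the task instruction at the
            # head and the question at the tail both survive (plain end-truncation would cut
            # the question off).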
108 | tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] 109 | if len(tokenized_prompt) > max_length: 110 | half = int(max_length/2) 111 | prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) 112 | 113 | prompts = self.build_chat(prompt, json_obj.get('instruction', None)) 114 | inputs = tokenizer(prompts, truncation=False, return_tensors="pt").to(device) 115 | 116 | output = model.generate( 117 | inputs.input_ids, 118 | do_sample=True, 119 | temperature=1.0, 120 | max_new_tokens=max_gen, 121 | top_p=0.8 122 | ) 123 | 124 | pred = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output)] 125 | pred = tokenizer.batch_decode(pred, skip_special_tokens=True)[0] 126 | 127 | with open(out_path, "a", encoding="utf-8") as f: 128 | json.dump({"pred": pred, "answers": json_obj["output"], "all_classes": json_obj.get("all_classes", None), "length": json_obj.get("length", None)}, f, ensure_ascii=False) 129 | f.write('\n') -------------------------------------------------------------------------------- /content/TinyEval/Eval/model/__pycache__/LLM.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyEval/Eval/model/__pycache__/LLM.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyEval/Eval/model/__pycache__/LLM.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyEval/Eval/model/__pycache__/LLM.cpython-39.pyc -------------------------------------------------------------------------------- /content/TinyEval/Eval/pred/Qwen2/result.json: -------------------------------------------------------------------------------- 1 | { 2 | "GAOKAO_math": 10.0 3 | } -------------------------------------------------------------------------------- /content/TinyEval/Eval/pred/internlm/result.json: -------------------------------------------------------------------------------- 1 | { 2 | "multi_news": 23.66, 3 | "multifieldqa_zh": 34.97, 4 | "trec": 52.0 5 | } -------------------------------------------------------------------------------- /content/TinyEval/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | 6 | 7 | from Eval.metrics import ( 8 | qa_f1_score, 9 | qa_f1_zh_score, 10 | rouge_score, 11 | classification_score, 12 | rouge_zh_score, 13 | GAOKAO_math 14 | ) 15 | 16 | def parse_args(args=None): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--model', type=str, default='Qwen2') 19 | return parser.parse_args(args) 20 | 21 | dataset2metric = { 22 | 'multifieldqa_zh': qa_f1_zh_score, 23 | 'multi_news': rouge_score, 24 | 'trec': classification_score, 25 | 'custom_zh': rouge_zh_score, 26 | "GAOKAO_math": GAOKAO_math 27 | } 28 | 29 | # 计算得分 30 | def scorer(dataset, predictions, answers, all_classes): 31 | total_score = 0. 32 | for (prediction, ground_truths) in zip(predictions, answers): 33 | score = 0. 
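        # The branches below first trim trec predictions to their opening line, then dispatch
        # to the metric registered for this dataset in dataset2metric (e.g. classification_score
        # for trec, qa_f1_zh_score for multifieldqa_zh, rouge_zh_score for custom_zh) and keep
        # the highest score obtained for the sample.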
34 | if dataset in ["trec"]: 35 | prediction = prediction.lstrip('\n').split('\n')[0] # 格式抽取 36 | if dataset in ['custom_zh', 'custom_en']: 37 | score = max(score, dataset2metric[dataset](prediction, ground_truths, all_classes=all_classes)) 38 | else: 39 | score = max(score, dataset2metric.get(dataset, dataset2metric[dataset])(prediction, ground_truths, all_classes=all_classes)) 40 | # for ground_truth in ground_truths: 41 | # score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes)) 42 | total_score += score 43 | return round(100 * total_score / len(predictions), 2) 44 | 45 | if __name__ == '__main__': 46 | scores = dict() 47 | args = parse_args() 48 | path = f"Eval/pred/{args.model}/" 49 | all_files = os.listdir(path) 50 | print("Evaluating on:", all_files) 51 | for file in all_files: 52 | if not file.endswith(".jsonl") or file == "result.json": 53 | continue 54 | predictions, answers, lengths = [], [], [] 55 | dataset = file.split('.')[0] 56 | with open(f'{path}{file}', 'r', ) as f: 57 | for line in f: 58 | data = json.loads(line) # str转为dict 59 | predictions.append(data["pred"]) 60 | answers.append(data["answers"]) 61 | all_classes = data["all_classes"] 62 | if "length" in data: 63 | lengths.append(data["length"]) 64 | 65 | score = scorer(dataset, predictions, answers, all_classes) 66 | scores[dataset] = score 67 | 68 | # 保存结果 69 | out_path = f"Eval/pred/{args.model}/result.json" 70 | with open(out_path, "w") as f: 71 | json.dump(scores, f, ensure_ascii=False, indent=4) 72 | -------------------------------------------------------------------------------- /content/TinyEval/gaokao.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 43, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 10 | "import json\n", 11 | "import random\n", 12 | "from datasets import load_dataset\n", 13 | "device = \"cuda\" # the device to load the model onto" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 36, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00, 1.32s/it]\n", 26 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "model = AutoModelForCausalLM.from_pretrained(\n", 32 | " \"Qwen2-7B-Instruct\", # path2Qwen2\n", 33 | " torch_dtype=\"auto\",\n", 34 | " device_map=\"auto\"\n", 35 | ")\n", 36 | "\n", 37 | "tokenizer = AutoTokenizer.from_pretrained(\"Qwen2-7B-Instruct\")\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 37, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def chat(system, prompt, model=model, tokenizer=tokenizer):\n", 47 | "\n", 48 | " messages = [\n", 49 | " {\"role\": \"system\", \"content\": system},\n", 50 | " {\"role\": \"user\", \"content\": prompt}\n", 51 | " ]\n", 52 | " text = tokenizer.apply_chat_template(\n", 53 | " messages,\n", 54 | " tokenize=False,\n", 55 | " add_generation_prompt=True\n", 56 | " )\n", 57 | " model_inputs = tokenizer([text], return_tensors=\"pt\").to(device)\n", 58 | "\n", 59 | " generated_ids = model.generate(\n", 60 | " model_inputs.input_ids,\n", 61 | " max_new_tokens=768\n", 62 | " )\n", 63 | " generated_ids = [\n", 64 | " 
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n", 65 | " ]\n", 66 | "\n", 67 | " return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 46, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "Dataset({\n", 79 | " features: ['instruction', 'input', 'output'],\n", 80 | " num_rows: 10\n", 81 | "})" 82 | ] 83 | }, 84 | "execution_count": 46, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "import json\n", 91 | "datasets='GAOKAO_math'\n", 92 | "content = load_dataset('json', data_files=f'Eval/dataset/{datasets}.jsonl',split='train')\n", 93 | "content" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 47, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "The question is: 问题是:已知向量 ā = (0, 1), b̄ = (2, x),若 b̄ ⊥ (b̄ - 4ā),则 x = ( ), 选项是:A: -2, B: -1, C: 1, D: 2, 您的答案是:\n", 106 | "The prediction is: 要解决这个问题,我们首先需要理解题目中的条件 \"b̄ ⊥ (b̄ - 4ā)\",这表示向量 b̄ 与向量 (b̄ - 4ā) 是垂直的。两个向量垂直意味着它们的点积为0。\n", 107 | "\n", 108 | "给定向量 ā = (0, 1) 和 b̄ = (2, x),我们可以计算出 (b̄ - 4ā) = (2 - 4*0, x - 4*1) = (2, x - 4)。\n", 109 | "\n", 110 | "两个向量的点积公式为:\\[ \\vec{a} \\cdot \\vec{b} = a_xb_x + a_yb_y \\]\n", 111 | "\n", 112 | "所以,对于向量 b̄ 和 (b̄ - 4ā),我们有:\n", 113 | "\n", 114 | "\\[ b̄ \\cdot (b̄ - 4ā) = 2*2 + x*(x-4) = 0 \\]\n", 115 | "\n", 116 | "\\[ 4 + x^2 - 4x = 0 \\]\n", 117 | "\n", 118 | "整理得:\n", 119 | "\n", 120 | "\\[ x^2 - 4x + 4 = 0 \\]\n", 121 | "\n", 122 | "这是一个完全平方公式,可以写为:\n", 123 | "\n", 124 | "\\[ (x - 2)^2 = 0 \\]\n", 125 | "\n", 126 | "因此,解得:\n", 127 | "\n", 128 | "\\[ x = 2 \\]\n", 129 | "\n", 130 | "所以,正确答案是 D: 2。\n", 131 | "The answer is: D\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "idx = random.randint(0, len(content))\n", 137 | "instruction, question, answer = content[idx].values()\n", 138 | "prompt = '{question}'.format(question=question)\n", 139 | "response = chat(instruction, prompt)\n", 140 | "print(\"The question is:\", prompt)\n", 141 | "print(\"The prediction is: \", response)\n", 142 | "print(\"The answer is: \", answer)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [] 151 | } 152 | ], 153 | "metadata": { 154 | "kernelspec": { 155 | "display_name": "xhr", 156 | "language": "python", 157 | "name": "python3" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 3 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython3", 169 | "version": "3.9.18" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 2 174 | } 175 | -------------------------------------------------------------------------------- /content/TinyEval/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datasets import load_dataset 3 | from transformers import AutoTokenizer, LlamaTokenizer, LlamaForCausalLM, AutoModelForCausalLM 4 | import torch 5 | import json 6 | from tqdm import tqdm 7 | import numpy as np 8 | import random 9 | import argparse 10 | from Eval.model.LLM import internlm2Chat, Qwen2Chat 11 | 12 | def seed_everything(seed): 13 | torch.manual_seed(seed) 14 | 
torch.cuda.manual_seed(seed) 15 | np.random.seed(seed) 16 | random.seed(seed) 17 | torch.backends.cudnn.benchmark = False 18 | torch.backends.cudnn.deterministic = True 19 | torch.cuda.manual_seed_all(seed) 20 | 21 | 22 | def parse_args(args=None): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('--model', type=str, default='Qwen2') 25 | return parser.parse_args(args) 26 | 27 | if __name__ == '__main__': 28 | seed_everything(42) 29 | args = parse_args() 30 | 31 | model2path = json.load(open("Eval/config/model2path.json", "r")) 32 | model2maxlen = json.load(open("Eval/config/model2maxlen.json", "r")) 33 | adapter2path = json.load(open("Eval/config/adapter2path.json", "r")) 34 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 35 | model_name = args.model 36 | # define your model 37 | max_length = model2maxlen[model_name] 38 | 39 | # datasets = ["multi_news", "multifieldqa_zh", "trec"] 40 | datasets = ['GAOKAO_math'] 41 | 42 | dataset2prompt = json.load(open("Eval/config/dataset2prompt.json", "r")) 43 | dataset2maxlen = json.load(open("Eval/config/dataset2maxlen.json", "r")) 44 | pred_model = Qwen2Chat(model2path[model_name], model_name, adapter2path[model_name]) 45 | # predict on each dataset 46 | if not os.path.exists("pred"): 47 | os.makedirs("pred") 48 | 49 | for dataset in datasets: 50 | data = load_dataset('json', data_files=f'Eval/dataset/{dataset}.jsonl',split='train') 51 | if not os.path.exists(f"Eval/pred/{model_name}"): 52 | os.makedirs(f"Eval/pred/{model_name}") 53 | out_path = f"Eval/pred/{model_name}/{dataset}.jsonl" 54 | if os.path.isfile(out_path): 55 | os.remove(out_path) 56 | prompt_format = dataset2prompt.get(dataset, dataset2prompt.get('custom_zh')) 57 | max_gen = dataset2maxlen.get(dataset, dataset2maxlen.get('custom_zh')) 58 | data_all = [data_sample for data_sample in data] 59 | 60 | pred_model.get_pred(data, max_length, max_gen, prompt_format, device, out_path) 61 | 62 | -------------------------------------------------------------------------------- /content/TinyEval/pred/results.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyEval/pred/results.txt -------------------------------------------------------------------------------- /content/TinyEval/readme.md: -------------------------------------------------------------------------------- 1 | # TinyEval 2 | 手搓LLM评测系统直播:[直播链接](https://meeting.tencent.com/v2/cloud-record/share?id=8b9cf6ca-add6-477b-affe-5b62e2d8f27e&from=3) 3 | 4 | 下面我会带领大家一步一步实现一个简单的LLM评测框架,该框架是一个双阶段的评测体系,我们称之为`TinyEval`,包含了`LLM`通用评测的核心功能,支持生成式、判别式、选则式评测问题,框架主要包含`inference`与`eval`部分,目的是为了帮助大家更好的力即LLM评测的原理与实现。 5 | 6 | ## 1.项目的Motivation是什么? 7 | 初入`LLM`大门,你是否有类似的困惑: 8 | 9 | 1. 各个模型的评测指标五花八门?小白初学者看不懂,难以学习? 10 | 2. 评测`metric`不会选,除了`rouge`,`blue`想不到其他的`metric`? 11 | 3. 想让`LLM`做选择题,但是模型输出了一大堆,如何评价选择能力? 12 | 4. 模型五花八门,垂域任务也五花八门。除了`human_eval`之外,如何对个性化的任务提供有说服力的定量性能指标? 13 | 14 | So, 本项目将逐个为你解开上述的困惑! 15 | 16 | ## 2.Eval都包含哪些流程? 17 | 首先要明确评测任务的基础`pipeline`。下图是评测任务的简要流程: 18 | 19 |
20 | 21 |
22 | 23 | 24 | - 首先,根据目标数据集的任务类型指定合理的评测`metric`. 25 | - 根据目标数据的形式总结模型引导`prompt`. 26 | - 根据模型初步预测结果采纳合理的抽取方式. 27 | - 对相应的`pred`与`anwser`进行得分计算. 28 | 29 | OK,上述这些也就是TinyEval仓库的所有模块内容。 30 | 31 | ## 3.支持的评测数据集与评测Metric. 32 | 所采用的数据集在这里[here](./Eval/dataset/),目前有的数据集与类型包含(后续会持续更新!): 33 | 34 | |name|type|metric| 35 | |---|---|---| 36 | |multi_news|长文本问答|Rouge| 37 | |multifieldqa_zh|短文本问答|F1| 38 | |trec|生成式选则|accuracy| 39 | 40 | 大家可以按照需要的任务进行探索,接下来我也会手把手为大家讲解评测步骤! 41 | 42 | ## 评测过程介绍. 43 | 看到了上面的指标是否有这样的疑问: 44 | - What? `F1` 不是分类指标,怎么跑`llm`去了? 45 | - `accuracy`不是要分`label`标签的吗?怎么跑生成式里来了? 46 | 47 | Okey,这一节主要就是讲解上述的疑问,如果有基础的同学,可以先自行探索[相关代码](./Eval/metrics.py) 48 | 49 | ### 1. 生成式的F1 50 | 51 | #### 1.1 模型推理 52 | - 首先,对于一个评测数据集,我们首先要构造引导prompt,即引导llm生成我们想要的答案。对于已有的数据集,大部分都提供了相应的prompt,在自己数据集评测时,也可自行设计。以`multifieldqa_zh`为例,其引导prompt为: 53 | ``` 54 | 阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答: 55 | ``` 56 | - 之后,再指定模型的输入长度,在此主要是规定每次送进模型多少token数,一般为了追求性能可以设置为模型最大长度,可以在下载好的模型文件里面的`config.json`里面的"max_position_embeddings"查询,也可以不设置作为默认最大长度.但本项目设置为了2048,主要为了演示使用~ 57 | 58 | - 之后就是创建model整体,在此我对模型整体创建了一个class,大家可以参考对其他任意的model进行组装: 59 | ```python 60 | class BaseLLM: 61 | def __init__(self, path: str, model_name: str) -> None: 62 | self.path = path 63 | self.model_name = model_name 64 | 65 | def build_chat(self, tokenizer: str, prompt: str, model_name: str): 66 | pass 67 | 68 | def load_model_and_tokenizer(self, path: str, model_name: str, device): 69 | pass 70 | 71 | def post_process(self, response: str, model_name: str): 72 | pass 73 | 74 | def get_pred(self, data: list, max_length: int, max_gen: int, prompt_format: str, device, out_path: str): 75 | pass 76 | ``` 77 | - 参数解读,build_chat为使用模型固有的数据加载形式,以`internlm2`为例,其为 78 | ```python 79 | def build_chat(self, prompt): 80 | prompt = f'<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n' 81 | return prompt 82 | ``` 83 | - model与tokenizer不用多说,后处理根据model的形式选择性判断是否需要,重点讲一下`get_pred`函数: 84 | 85 | ```python 86 | def get_pred(self, data, max_length, max_gen, prompt_format, device, out_path): 87 | model, tokenizer = self.load_model_and_tokenizer(self.path, device) 88 | for json_obj in tqdm(data): 89 | prompt = prompt_format.format(**json_obj) 90 | # 在中间截断,因为两头有关键信息. 91 | tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] 92 | if len(tokenized_prompt) > max_length: 93 | half = int(max_length/2) 94 | prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) 95 | 96 | prompt = self.build_chat(prompt) 97 | 98 | input = tokenizer(prompt, truncation=False, return_tensors="pt").to(device) 99 | # 表示喂进去的tokens的长度 100 | context_length = input.input_ids.shape[-1] 101 | eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] 102 | 103 | output = model.generate( 104 | **input, 105 | max_new_tokens=max_gen, 106 | do_sample=False, 107 | temperature=1.0, 108 | eos_token_id=eos_token_id, 109 | )[0] 110 | 111 | pred = tokenizer.decode(output[context_length:], skip_special_tokens=True) 112 | pred = self.post_process(pred) 113 | 114 | with open(out_path, "a", encoding="utf-8") as f: 115 | json.dump({"pred": pred, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"], "length": json_obj["length"]}, f, ensure_ascii=False) 116 | f.write('\n') 117 | ``` 118 | 119 | - 有的同学可能会问,为啥要整这么一大串,直接用`model.chat()`不香吗?? 
120 | - Okey!这个函数就告诉了你答案。原因就在于截断策略,对于模型而言,尤其是制定了输入的长度,如果使用阶段命令则其会在输入的末尾进行阶段,但由于引导性`prompt`的存在,在`inputs`的两端均有关键信息,故需要对两端的信息进行保留,对中间部位进行截断操作,才能最大限度地抱持输出效果! 121 | 122 | > tips: get_pred部分,可以参考各大模型各自的`model`相关脚本中的`chat`函数(`internlm2`在`modeling_internlm2.py`里面),也可以更好的理解原始文本输入与结构化模型输出。 123 | 124 | #### 1.2 结果评测 125 | 直接show例子: 126 | ``` 127 | "pred": "57081.86元", "answers": "人民币57081.86元。" 128 | ``` 129 | - 首先,经过数据清洗与`jieba`分词,将短句分为词组,以示例文本为例,经过分词与去掉标点符号等操作,得到下列输出: 130 | ``` 131 | "pred": ['5708186', '元'], "answers": ['人民币', '5708186', '元']" 132 | ``` 133 | 将上述的两个"干净"的输出送入`f1`评分函数如下: 134 | ```python 135 | def f1_score(prediction, ground_truth, **kwargs): 136 | # Counter以dict的形式存储各个句子对应的词与其对应个数,&操作符返回两个Counter中共同的元素的键值对 137 | common = Counter(prediction) & Counter(ground_truth) 138 | # 显示prediction与gt的共同元素的个数 139 | num_same = sum(common.values()) 140 | if num_same == 0: 141 | return 0 142 | # 即模型预测正确的样本数量与总预测样本数量的比值 143 | precision = 1.0 * num_same / len(prediction) 144 | # 模型正确预测的样本数量与总实际样本数量的比值 145 | recall = 1.0 * num_same / len(ground_truth) 146 | f1 = (2 * precision * recall) / (precision + recall) 147 | return f1 148 | ``` 149 | - 首先记录两个list中相同的元素,再统计相同的元素的总数,最终再按照precision与recall的定义分别计算相应的分数。 150 | - 然后就得到该结果的对应分数啦,最后再将所有的结果取平均值,即得到该`task`的`F1_score` 151 | 152 | ### 2.思考 153 | 当然,这些只是基础的`metric`评测指标,或许细心的你已经发现了相应的漏洞,比如在上述预测中,相比较的结果都是经过了相应的规则抽取的,如果出现了比如`answer`是"厦门大学",而`pred`是"不是厦门大学"/"厦大",则二者的结果按照当前的评分指标则有失偏颇。 154 | 155 | 当然,更加准确的评测metric也是学术界一直努力的目标,本项目也会及时跟进更加先进的评测策略,也欢迎大佬PR!! 156 | 157 | ## 😆成功运行! 158 | 159 | ### 1. get inference results 160 | ```python 161 | python inference.py 162 | ``` 163 | 164 | ### 2. get eval results 165 | ```python 166 | python eval.py 167 | ``` 168 | 169 | ## support metrics 170 | 1. F1 score 171 | 2. rouge-series/blue-series 172 | 3. 
accuracy 173 | 174 | ## 支持自定义评测 175 | 我们repo也支持自定义评测,如果进行了自定义sft数据,我们命名为`custom_zh`,或如果是英文的话可以为`custom_en`,数据形式与sft格式一致,如下: 176 | ```python 177 | { 178 | "instruction": "假设你是皇帝身边的女人--甄嬛", 179 | "input": "你是谁?", 180 | "output": "臣妾是甄嬛,家父是大理寺少卿。" 181 | } 182 | ``` 183 | 即可支持自定义数据集的评测~ 184 | 185 | ## Reference & Acknowledgment 186 | [LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding](https://arxiv.org/abs/2308.14508) 187 | -------------------------------------------------------------------------------- /content/TinyEval/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | tqdm 3 | rouge 4 | jieba 5 | fuzzywuzzy 6 | torch 7 | transformers==4.38.0 8 | einops -------------------------------------------------------------------------------- /content/TinyIMGRAG/IMGRAG/ImgEvaluator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File : ImgEvaluator.py 5 | @Time : 2025/04/29 19:34:11 6 | @Author : Cecilll 7 | @Version : 1.0 8 | @Desc : Image Generation and Retrieval Pipeline with careful GPU memory management 9 | ''' 10 | 11 | import os 12 | import base64 13 | from modelscope import Qwen2_5_VLForConditionalGeneration, AutoProcessor 14 | from qwen_vl_utils import process_vision_info 15 | 16 | def load_qwen_vlm(pretrained_model="./model/Qwen/Qwen2.5-VL-3B-Instruct"): 17 | 18 | min_pixels = 256 * 28 * 28 19 | max_pixels = 1280 * 28 * 28 20 | # default: Load the model on the available device(s) 21 | model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 22 | pretrained_model, torch_dtype="auto", device_map="auto" 23 | ) 24 | 25 | # default processer 26 | processor = AutoProcessor.from_pretrained(pretrained_model, min_pixels=min_pixels, 27 | max_pixels=max_pixels) 28 | return model,processor 29 | 30 | def run_qwen_vl(image_path,prompt,model,processor): 31 | 32 | # 编码图片 33 | with open(image_path, "rb") as image_file: 34 | base64_image = base64.b64encode(image_file.read()).decode('utf-8') 35 | 36 | messages = [ 37 | { 38 | "role": "user", 39 | "content": [ 40 | { 41 | "type": "image", 42 | "image": f"data:image/jpeg;base64,{base64_image}", 43 | }, 44 | {"type": "text", "text": f"Please identify the different between the image and the description of the {prompt}, and output in the format 'The different conception is a XX'. If no inconsistent content are found, return ."}, 45 | ], 46 | } 47 | ] 48 | 49 | # Preparation for inference 50 | text = processor.apply_chat_template( 51 | messages, tokenize=False, add_generation_prompt=True 52 | ) 53 | image_inputs, video_inputs = process_vision_info(messages) 54 | inputs = processor( 55 | text=[text], 56 | images=image_inputs, 57 | videos=video_inputs, 58 | padding=True, 59 | return_tensors="pt" 60 | ) 61 | inputs = inputs.to("cuda") 62 | 63 | # Inference: Generation of the output 64 | generated_ids = model.generate(**inputs, max_new_tokens=128, temperature=0.01) 65 | generated_ids_trimmed = [ 66 | out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 67 | ] 68 | output_text = processor.batch_decode( 69 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 70 | ) 71 | 72 | return output_text[0] 73 | 74 | if __name__ == "__main__": 75 | 76 | prompt = "The brown bear is giving a lecture on the platform." 
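    # img_path below is expected to point at an image produced by the generation step of the
    # pipeline (cf. SDXLGenerator's output); run_qwen_vl asks Qwen2.5-VL to name the concept
    # in which that image and this prompt disagree, and the returned description is what the
    # prompt-rewriting step (RewritePrompt.py) then consumes.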
77 | img_path = "../datasets/results/output.png" 78 | 79 | vl_model,vl_processor = load_qwen_vlm(pretrained_model="../model/Qwen/Qwen2.5-VL-3B-Instruct") 80 | 81 | vl_prompt = run_qwen_vl(img_path,prompt,vl_model,vl_processor) 82 | 83 | print(vl_prompt) -------------------------------------------------------------------------------- /content/TinyIMGRAG/IMGRAG/ImgGenerator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File : ImgGenerator.py 5 | @Time : 2025/04/29 19:32:02 6 | @Author : Cecilll 7 | @Version : 1.0 8 | @Desc : Image Generation and Retrieval Pipeline with careful GPU memory management 9 | ''' 10 | 11 | from PIL import Image 12 | from diffusers import AutoPipelineForText2Image 13 | from transformers import CLIPVisionModelWithProjection 14 | import torch 15 | 16 | 17 | class SDXLGenerator: 18 | def __init__(self, prompt, output_path, steps=50, seed=0, 19 | use_image_guidance=False, image_path=None, ip_scale=0.5, 20 | sd_path="./model/stabilityai/stable-diffusion-xl-base-1.0", 21 | adapter_path="./model/h94/IP-Adapter"): 22 | self.prompt = prompt 23 | self.output_path = output_path 24 | self.steps = steps 25 | self.seed = seed 26 | self.use_image_guidance = use_image_guidance 27 | self.image_path = image_path 28 | self.ip_scale = ip_scale 29 | 30 | # 设备设置 31 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 32 | self.torch_dtype = torch.float16 if self.device == "cuda" else torch.float32 33 | 34 | # 初始化管道 35 | self.pipe = AutoPipelineForText2Image.from_pretrained( 36 | sd_path, 37 | torch_dtype=self.torch_dtype 38 | ).to(self.device) 39 | 40 | # 如果需要使用图像引导,加载图像编码器和IP-Adapter 41 | if self.use_image_guidance: 42 | # 初始化图像编码器 43 | self.image_encoder = CLIPVisionModelWithProjection.from_pretrained( 44 | adapter_path, 45 | subfolder="models/image_encoder", 46 | torch_dtype=self.torch_dtype 47 | ).to(self.device) 48 | 49 | # 将图像编码器设置到管道中 50 | self.pipe = AutoPipelineForText2Image.from_pretrained( 51 | sd_path, 52 | image_encoder=self.image_encoder, 53 | torch_dtype=self.torch_dtype 54 | ).to(self.device) 55 | 56 | # 加载IP-Adapter 57 | self.pipe.load_ip_adapter( 58 | adapter_path, 59 | subfolder="sdxl_models", 60 | weight_name="ip-adapter-plus_sdxl_vit-h.safetensors" 61 | ) 62 | self.pipe.set_ip_adapter_scale(self.ip_scale) 63 | 64 | def generate_image(self): 65 | # 准备生成参数 66 | generator = torch.Generator(device=self.device).manual_seed(self.seed) 67 | 68 | # 执行生成 69 | print("开始生成...") 70 | if self.use_image_guidance: 71 | # 加载参考图像 72 | ref_image = Image.open(self.image_path) 73 | 74 | # 使用图像和文本生成 75 | result = self.pipe( 76 | prompt=self.prompt, 77 | ip_adapter_image=ref_image, 78 | negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 79 | num_inference_steps=self.steps, 80 | generator=generator, 81 | ) 82 | else: 83 | # 只使用文本生成 84 | result = self.pipe( 85 | prompt=self.prompt, 86 | negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 87 | num_inference_steps=self.steps, 88 | generator=generator, 89 | ) 90 | 91 | # 保存结果 92 | result.images[0].save(self.output_path) 93 | print(f"生成完成,结果已保存至: {self.output_path}") 94 | return self.output_path 95 | 96 | 97 | # 使用示例 98 | if __name__ == "__main__": 99 | # 示例 1:只用文本生成图片 100 | generator = SDXLGenerator( 101 | prompt="A golden retriever and a cradle", 102 | output_path="output.png", 103 | steps=50, 104 | seed=0, 105 | use_image_guidance=False, # 设置为False,表示只用文本生成 106 | 
sd_path="../model/stabilityai/stable-diffusion-xl-base-1.0", 107 | adapter_path="../model/h94/IP-Adapter" 108 | ) 109 | generator.generate_image() 110 | 111 | # # 示例 2:使用图像和文本生成图片 112 | # generator = SDXLGenerator( 113 | # prompt="A golden retriever and a cradle", 114 | # image_path="./datasets/imgs/cradle.jpg", 115 | # output_path="output.png", 116 | # steps=50, 117 | # seed=0, 118 | # use_image_guidance=True, # 设置为True,表示使用图像和文本生成 119 | # ip_scale=0.5 120 | # ) 121 | # generator.generate_image() 122 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/IMGRAG/ImgRetrieval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File : ImgRetrieval.py 5 | @Time : 2025/04/29 19:31:39 6 | @Author : Cecilll 7 | @Version : 1.0 8 | @Desc : Image Generation and Retrieval Pipeline with careful GPU memory management 9 | ''' 10 | 11 | import os 12 | import torch 13 | import clip 14 | import numpy as np 15 | from PIL import Image 16 | 17 | 18 | def get_clip_similarities(prompts, image_paths, embeddings_path="./datasets/vector_bases", bs=2, k=5, device='cuda:0', 19 | model_path="./model/ViT-B-32.pt"): 20 | """ 21 | Calculate similarity between text prompts and images using CLIP model. 22 | 23 | Args: 24 | prompts: List of text prompts to compare against images 25 | image_paths: List of paths to images 26 | embeddings_path: Directory to save/load precomputed image embeddings 27 | bs: Batch size for processing images 28 | k: Number of top similar images to return 29 | device: Device to run computations on 30 | model_path: Path to CLIP model weights 31 | 32 | Returns: 33 | Tuple of (top image paths, top similarity scores) sorted by similarity 34 | """ 35 | # Load CLIP model and preprocessing 36 | model, preprocess = clip.load(model_path, device=device) 37 | text_tokens = clip.tokenize(prompts).to(device) 38 | 39 | # Initialize result containers 40 | all_scores = [] 41 | all_paths = [] 42 | all_embeddings = torch.empty((0, 512)).to(device) 43 | 44 | # Process in batches 45 | with torch.no_grad(): 46 | # Get text features once 47 | text_features = model.encode_text(text_tokens) 48 | text_features = torch.nn.functional.normalize(text_features, p=2, dim=1) 49 | 50 | # Process images in batches 51 | for batch_start in range(0, len(image_paths), bs): 52 | batch_end = min(batch_start + bs, len(image_paths)) 53 | batch_paths = image_paths[batch_start:batch_end] 54 | 55 | # Try to load precomputed embeddings or compute new ones 56 | embeddings, valid_paths = _get_image_embeddings( 57 | batch_paths, batch_start, 58 | model, preprocess, device, 59 | embeddings_path 60 | ) 61 | 62 | if embeddings is None: 63 | continue 64 | 65 | # Calculate similarities 66 | batch_scores = torch.matmul(text_features, embeddings.T) 67 | batch_scores = batch_scores.cpu().numpy().squeeze() 68 | 69 | # Update running totals 70 | if batch_scores.ndim == 0: # 如果是标量 71 | all_scores.append(batch_scores.item()) # 用 .item() 获取标量值 72 | else: 73 | all_scores.extend(batch_scores.tolist()) # 如果是多维数组,转换为列表 74 | 75 | # all_scores.extend(batch_scores) 76 | all_paths.extend(valid_paths) 77 | all_embeddings = torch.cat([all_embeddings, embeddings]) 78 | 79 | # Keep only top k results 80 | if len(all_scores) > k: 81 | top_indices = np.argsort(all_scores)[-k:] 82 | all_scores = [all_scores[i] for i in top_indices] 83 | all_paths = [all_paths[i] for i in top_indices] 84 | all_embeddings = 
all_embeddings[top_indices] 85 | 86 | # Return sorted results (highest first) 87 | sorted_indices = np.argsort(all_scores)[::-1] 88 | return [all_paths[i] for i in sorted_indices], [all_scores[i] for i in sorted_indices] 89 | 90 | 91 | def _get_image_embeddings(image_paths, batch_idx, model, preprocess, device, embeddings_path): 92 | """Helper to get embeddings either from cache or by computing them""" 93 | cache_file = os.path.join(embeddings_path, f"clip_embeddings_b{batch_idx}.pt") 94 | 95 | # Try loading from cache 96 | if os.path.exists(cache_file): 97 | cached = torch.load(cache_file, map_location=device) 98 | return cached['normalized_clip_embeddings'], cached['paths'] 99 | 100 | # Compute new embeddings 101 | images = [] 102 | valid_paths = [] 103 | 104 | for path in image_paths: 105 | try: 106 | img = preprocess(Image.open(path)).unsqueeze(0).to(device) 107 | images.append(img) 108 | valid_paths.append(path) 109 | except Exception as e: 110 | print(f"Couldn't read {path}: {str(e)}") 111 | continue 112 | 113 | if not images: 114 | return None, None 115 | 116 | images = torch.cat(images) 117 | features = model.encode_image(images) 118 | features = torch.nn.functional.normalize(features, p=2, dim=1) 119 | 120 | # Save to cache if requested 121 | if embeddings_path: 122 | os.makedirs(embeddings_path, exist_ok=True) 123 | torch.save({ 124 | 'normalized_clip_embeddings': features, 125 | 'paths': valid_paths 126 | }, cache_file) 127 | 128 | return features, valid_paths 129 | 130 | if __name__ == "__main__": 131 | 132 | clip_texts = ['a cradle'] 133 | for clip_text in clip_texts: 134 | print(clip_text) 135 | 136 | src = "../datasets/imgs" 137 | files = os.listdir(src) 138 | 139 | file_paths = [] 140 | for f in files: 141 | file_path = os.path.join(src, f) 142 | file_paths.append(file_path) 143 | 144 | top_text_im_paths, top_text_im_scores = get_clip_similarities(prompts=clip_text, image_paths=file_paths, model_path="../model/ViT-B-32.pt") 145 | 146 | print(top_text_im_paths, top_text_im_scores) 147 | print('----------------------') 148 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/IMGRAG/RewritePrompt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File : RewritePrompt.py 5 | @Time : 2025/04/29 19:34:20 6 | @Author : Cecilll 7 | @Version : 1.0 8 | @Desc : Image Generation and Retrieval Pipeline with careful GPU memory management 9 | ''' 10 | 11 | from modelscope import AutoModelForCausalLM, AutoTokenizer 12 | 13 | def load_qwen_llm(model_name = "./model/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4"): 14 | 15 | model = AutoModelForCausalLM.from_pretrained( 16 | model_name, 17 | torch_dtype="auto", 18 | device_map="auto" 19 | ) 20 | tokenizer = AutoTokenizer.from_pretrained(model_name) 21 | 22 | return model,tokenizer 23 | 24 | def run_qwen_llm(prompt,model,tokenizer): 25 | 26 | messages = [ 27 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. 
You are a helpful assistant."}, 28 | {"role": "user", "content": prompt} 29 | ] 30 | text = tokenizer.apply_chat_template( 31 | messages, 32 | tokenize=False, 33 | add_generation_prompt=True 34 | ) 35 | model_inputs = tokenizer([text], return_tensors="pt").to(model.device) 36 | generated_ids = model.generate( 37 | **model_inputs, 38 | max_new_tokens=512, 39 | temperature=0.01 40 | ) 41 | generated_ids = [ 42 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 43 | ] 44 | response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 45 | 46 | return response 47 | 48 | if __name__ == "__main__": 49 | 50 | prompt = "The brown bear is giving a lecture on the platform." 51 | vl_prompt = "A brown bear." 52 | 53 | llm_model,llm_tokenizer = load_qwen_llm(model_name = "../model/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4") 54 | llm_prompt = run_qwen_llm(f"If you want to change '{vl_prompt}' to '{prompt}', which specific concepts need to be adjusted? These concepts should be used to generate images. Please output a maximum of 3 concepts, and remember to only output the concepts without any additional information.",llm_model,llm_tokenizer) 55 | print(llm_prompt) 56 | 57 | llm_prompt = f"Please separate the concepts in '{llm_prompt}' with an '&'." 58 | print(llm_prompt) 59 | response = run_qwen_llm(llm_prompt,llm_model,llm_tokenizer) 60 | print(response) 61 | 62 | for conception_text in response.split('&'): 63 | print(conception_text) -------------------------------------------------------------------------------- /content/TinyIMGRAG/IMGRAG/__pycache__/ImgEvaluator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/IMGRAG/__pycache__/ImgEvaluator.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyIMGRAG/IMGRAG/__pycache__/ImgGenerator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/IMGRAG/__pycache__/ImgGenerator.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyIMGRAG/IMGRAG/__pycache__/ImgRetrieval.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/IMGRAG/__pycache__/ImgRetrieval.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyIMGRAG/IMGRAG/__pycache__/RewritePrompt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/IMGRAG/__pycache__/RewritePrompt.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/apple.jpg -------------------------------------------------------------------------------- 
/content/TinyIMGRAG/datasets/imgs/bamboo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/bamboo.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/brown_bear.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/brown_bear.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/classroom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/classroom.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/cradle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/cradle.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/dog.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/oil_painting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/oil_painting.png -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/panda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/panda.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/shark.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/shark.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/sketch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/sketch.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/sunflower.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/sunflower.jpg 
-------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/teacher.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/teacher.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/imgs/wash_painting.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/imgs/wash_painting.jpg -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/templates/enhanced_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/templates/enhanced_image.png -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/templates/imgrag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/templates/imgrag.png -------------------------------------------------------------------------------- /content/TinyIMGRAG/datasets/templates/initial_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/datasets/templates/initial_image.png -------------------------------------------------------------------------------- /content/TinyIMGRAG/download_model.py: -------------------------------------------------------------------------------- 1 | # 模型下载 2 | import modelscope 3 | import huggingface_hub 4 | 5 | model_dir = modelscope.snapshot_download('Qwen/Qwen2.5-VL-3B-Instruct', cache_dir='./model/', revision='master') 6 | model_dir = modelscope.snapshot_download('Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', cache_dir='./model/', revision='master') 7 | model_dir = modelscope.snapshot_download('stabilityai/stable-diffusion-xl-base-1.0', cache_dir='./model/', revision='master') 8 | model_dir = huggingface_hub.snapshot_download(repo_id="h94/IP-Adapter", local_dir="./model/", max_workers=1) 9 | 10 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/model/__init__.py -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | jobs: 10 | CLIP-test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: [3.8] 15 | pytorch-version: [1.7.1, 1.9.1, 1.10.1] 16 | include: 17 | - python-version: 3.8 18 | pytorch-version: 1.7.1 19 | torchvision-version: 0.8.2 20 | - 
python-version: 3.8 21 | pytorch-version: 1.9.1 22 | torchvision-version: 0.10.1 23 | - python-version: 3.8 24 | pytorch-version: 1.10.1 25 | torchvision-version: 0.11.2 26 | steps: 27 | - uses: conda-incubator/setup-miniconda@v2 28 | - run: conda install -n test python=${{ matrix.python-version }} pytorch=${{ matrix.pytorch-version }} torchvision=${{ matrix.torchvision-version }} cpuonly -c pytorch 29 | - uses: actions/checkout@v2 30 | - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH 31 | - run: pip install pytest 32 | - run: pip install . 33 | - run: pytest 34 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | *.egg-info 5 | .pytest_cache 6 | .ipynb_checkpoints 7 | 8 | thumbs.db 9 | .DS_Store 10 | .idea 11 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/0.26.0: -------------------------------------------------------------------------------- 1 | Collecting accelerate 2 | Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB) 3 | Requirement already satisfied: numpy<3.0.0,>=1.17 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from accelerate) (2.2.5) 4 | Requirement already satisfied: packaging>=20.0 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from accelerate) (25.0) 5 | Requirement already satisfied: psutil in c:\users\administrator\appdata\roaming\python\python310\site-packages (from accelerate) (6.1.1) 6 | Requirement already satisfied: pyyaml in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from accelerate) (6.0.2) 7 | Requirement already satisfied: torch>=2.0.0 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from accelerate) (2.2.1+cu121) 8 | Requirement already satisfied: huggingface-hub>=0.21.0 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from accelerate) (0.29.3) 9 | Requirement already satisfied: safetensors>=0.4.3 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from accelerate) (0.5.3) 10 | Requirement already satisfied: filelock in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (3.18.0) 11 | Requirement already satisfied: fsspec>=2023.5.0 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (2025.3.2) 12 | Requirement already satisfied: requests in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3) 13 | Requirement already satisfied: tqdm>=4.42.1 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (4.67.1) 14 | Requirement already satisfied: typing-extensions>=3.7.4.3 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (4.13.2) 15 | Requirement already satisfied: sympy in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from torch>=2.0.0->accelerate) (1.13.3) 16 | Requirement already satisfied: networkx in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from torch>=2.0.0->accelerate) (3.4.2) 17 | Requirement already satisfied: jinja2 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from torch>=2.0.0->accelerate) (3.1.6) 18 | Requirement already satisfied: colorama in 
e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from tqdm>=4.42.1->huggingface-hub>=0.21.0->accelerate) (0.4.6) 19 | Requirement already satisfied: MarkupSafe>=2.0 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from jinja2->torch>=2.0.0->accelerate) (3.0.2) 20 | Requirement already satisfied: charset-normalizer<4,>=2 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.4.1) 21 | Requirement already satisfied: idna<4,>=2.5 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.10) 22 | Requirement already satisfied: urllib3<3,>=1.21.1 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2.4.0) 23 | Requirement already satisfied: certifi>=2017.4.17 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2025.1.31) 24 | Requirement already satisfied: mpmath<1.4,>=1.1.0 in e:\programdata\anaconda3\envs\tinyimgrag\lib\site-packages (from sympy->torch>=2.0.0->accelerate) (1.3.0) 25 | Downloading accelerate-1.6.0-py3-none-any.whl (354 kB) 26 | Installing collected packages: accelerate 27 | Successfully installed accelerate-1.6.0 28 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/packages/CLIP-main/CLIP.png -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include clip/bpe_simple_vocab_16e6.txt.gz 2 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/README.md: -------------------------------------------------------------------------------- 1 | # CLIP 2 | 3 | [[Blog]](https://openai.com/blog/clip/) [[Paper]](https://arxiv.org/abs/2103.00020) [[Model Card]](model-card.md) [[Colab]](https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb) 4 | 5 | CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. It can be instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing for the task, similarly to the zero-shot capabilities of GPT-2 and 3. We found CLIP matches the performance of the original ResNet50 on ImageNet “zero-shot” without using any of the original 1.28M labeled examples, overcoming several major challenges in computer vision. 6 | 7 | 8 | 9 | ## Approach 10 | 11 | ![CLIP](CLIP.png) 12 | 13 | 14 | 15 | ## Usage 16 | 17 | First, [install PyTorch 1.7.1](https://pytorch.org/get-started/locally/) (or later) and torchvision, as well as small additional dependencies, and then install this repo as a Python package. On a CUDA GPU machine, the following will do the trick: 18 | 19 | ```bash 20 | $ conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0 21 | $ pip install ftfy regex tqdm 22 | $ pip install git+https://github.com/openai/CLIP.git 23 | ``` 24 | 25 | Replace `cudatoolkit=11.0` above with the appropriate CUDA version on your machine or `cpuonly` when installing on a machine without a GPU. 26 | 27 | ```python 28 | import torch 29 | import clip 30 | from PIL import Image 31 | 32 | device = "cuda" if torch.cuda.is_available() else "cpu" 33 | model, preprocess = clip.load("ViT-B/32", device=device) 34 | 35 | image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device) 36 | text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) 37 | 38 | with torch.no_grad(): 39 | image_features = model.encode_image(image) 40 | text_features = model.encode_text(text) 41 | 42 | logits_per_image, logits_per_text = model(image, text) 43 | probs = logits_per_image.softmax(dim=-1).cpu().numpy() 44 | 45 | print("Label probs:", probs) # prints: [[0.9927937 0.00421068 0.00299572]] 46 | ``` 47 | 48 | 49 | ## API 50 | 51 | The CLIP module `clip` provides the following methods: 52 | 53 | #### `clip.available_models()` 54 | 55 | Returns the names of the available CLIP models. 56 | 57 | #### `clip.load(name, device=..., jit=False)` 58 | 59 | Returns the model and the TorchVision transform needed by the model, specified by the model name returned by `clip.available_models()`. It will download the model as necessary. The `name` argument can also be a path to a local checkpoint. 60 | 61 | The device to run the model can be optionally specified, and the default is to use the first CUDA device if there is any, otherwise the CPU. When `jit` is `False`, a non-JIT version of the model will be loaded. 62 | 63 | #### `clip.tokenize(text: Union[str, List[str]], context_length=77)` 64 | 65 | Returns a LongTensor containing tokenized sequences of given text input(s). 
This can be used as the input to the model 66 | 67 | --- 68 | 69 | The model returned by `clip.load()` supports the following methods: 70 | 71 | #### `model.encode_image(image: Tensor)` 72 | 73 | Given a batch of images, returns the image features encoded by the vision portion of the CLIP model. 74 | 75 | #### `model.encode_text(text: Tensor)` 76 | 77 | Given a batch of text tokens, returns the text features encoded by the language portion of the CLIP model. 78 | 79 | #### `model(image: Tensor, text: Tensor)` 80 | 81 | Given a batch of images and a batch of text tokens, returns two Tensors, containing the logit scores corresponding to each image and text input. The values are cosine similarities between the corresponding image and text features, times 100. 82 | 83 | 84 | 85 | ## More Examples 86 | 87 | ### Zero-Shot Prediction 88 | 89 | The code below performs zero-shot prediction using CLIP, as shown in Appendix B in the paper. This example takes an image from the [CIFAR-100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html), and predicts the most likely labels among the 100 textual labels from the dataset. 90 | 91 | ```python 92 | import os 93 | import clip 94 | import torch 95 | from torchvision.datasets import CIFAR100 96 | 97 | # Load the model 98 | device = "cuda" if torch.cuda.is_available() else "cpu" 99 | model, preprocess = clip.load('ViT-B/32', device) 100 | 101 | # Download the dataset 102 | cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False) 103 | 104 | # Prepare the inputs 105 | image, class_id = cifar100[3637] 106 | image_input = preprocess(image).unsqueeze(0).to(device) 107 | text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device) 108 | 109 | # Calculate features 110 | with torch.no_grad(): 111 | image_features = model.encode_image(image_input) 112 | text_features = model.encode_text(text_inputs) 113 | 114 | # Pick the top 5 most similar labels for the image 115 | image_features /= image_features.norm(dim=-1, keepdim=True) 116 | text_features /= text_features.norm(dim=-1, keepdim=True) 117 | similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1) 118 | values, indices = similarity[0].topk(5) 119 | 120 | # Print the result 121 | print("\nTop predictions:\n") 122 | for value, index in zip(values, indices): 123 | print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%") 124 | ``` 125 | 126 | The output will look like the following (the exact numbers may be slightly different depending on the compute device): 127 | 128 | ``` 129 | Top predictions: 130 | 131 | snake: 65.31% 132 | turtle: 12.29% 133 | sweet_pepper: 3.83% 134 | lizard: 1.88% 135 | crocodile: 1.75% 136 | ``` 137 | 138 | Note that this example uses the `encode_image()` and `encode_text()` methods that return the encoded features of given inputs. 139 | 140 | 141 | ### Linear-probe evaluation 142 | 143 | The example below uses [scikit-learn](https://scikit-learn.org/) to perform logistic regression on image features. 
144 | 145 | ```python 146 | import os 147 | import clip 148 | import torch 149 | 150 | import numpy as np 151 | from sklearn.linear_model import LogisticRegression 152 | from torch.utils.data import DataLoader 153 | from torchvision.datasets import CIFAR100 154 | from tqdm import tqdm 155 | 156 | # Load the model 157 | device = "cuda" if torch.cuda.is_available() else "cpu" 158 | model, preprocess = clip.load('ViT-B/32', device) 159 | 160 | # Load the dataset 161 | root = os.path.expanduser("~/.cache") 162 | train = CIFAR100(root, download=True, train=True, transform=preprocess) 163 | test = CIFAR100(root, download=True, train=False, transform=preprocess) 164 | 165 | 166 | def get_features(dataset): 167 | all_features = [] 168 | all_labels = [] 169 | 170 | with torch.no_grad(): 171 | for images, labels in tqdm(DataLoader(dataset, batch_size=100)): 172 | features = model.encode_image(images.to(device)) 173 | 174 | all_features.append(features) 175 | all_labels.append(labels) 176 | 177 | return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy() 178 | 179 | # Calculate the image features 180 | train_features, train_labels = get_features(train) 181 | test_features, test_labels = get_features(test) 182 | 183 | # Perform logistic regression 184 | classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1) 185 | classifier.fit(train_features, train_labels) 186 | 187 | # Evaluate using the logistic regression classifier 188 | predictions = classifier.predict(test_features) 189 | accuracy = np.mean((test_labels == predictions).astype(float)) * 100. 190 | print(f"Accuracy = {accuracy:.3f}") 191 | ``` 192 | 193 | Note that the `C` value should be determined via a hyperparameter sweep using a validation split. 194 | 195 | 196 | ## See Also 197 | 198 | * [OpenCLIP](https://github.com/mlfoundations/open_clip): includes larger and independently trained CLIP models up to ViT-G/14 199 | * [Hugging Face implementation of CLIP](https://huggingface.co/docs/transformers/model_doc/clip): for easier integration with the HF ecosystem 200 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyIMGRAG/packages/CLIP-main/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 
20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a significant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'</w>' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '</w>',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'</w>' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i < len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in
token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ') 132 | return text 133 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/data/country211.md: -------------------------------------------------------------------------------- 1 | # The Country211 Dataset 2 | 3 | In the paper, we used an image classification dataset called Country211, to evaluate the model's capability on geolocation. To do so, we filtered the YFCC100m dataset for images that have GPS coordinates corresponding to an [ISO-3166 country code](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes) and created a balanced dataset by sampling 150 train images, 50 validation images, and 100 test images for each country. 4 | 5 | The following command will download an 11GB archive containing the images and extract it into a subdirectory `country211`: 6 | 7 | ```bash 8 | wget https://openaipublic.azureedge.net/clip/data/country211.tgz 9 | tar zxvf country211.tgz 10 | ``` 11 | 12 | These images are a subset of the YFCC100m dataset. Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders. For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/). -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/data/rendered-sst2.md: -------------------------------------------------------------------------------- 1 | # The Rendered SST2 Dataset 2 | 3 | In the paper, we used an image classification dataset called Rendered SST2, to evaluate the model's capability on optical character recognition. To do so, we rendered the sentences in the [Stanford Sentiment Treebank v2](https://nlp.stanford.edu/sentiment/treebank.html) dataset and used those as the input to the CLIP image encoder. 4 | 5 | The following command will download a 131MB archive containing the images and extract it into a subdirectory `rendered-sst2`: 6 | 7 | ```bash 8 | wget https://openaipublic.azureedge.net/clip/data/rendered-sst2.tgz 9 | tar zxvf rendered-sst2.tgz 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/data/yfcc100m.md: -------------------------------------------------------------------------------- 1 | # The YFCC100M Subset 2 | 3 | In the paper, we performed a dataset ablation using a subset of the YFCC100M dataset and showed that the performance remained largely similar. 4 | 5 | The subset contains 14,829,396 images, about 15% of the full dataset, which have been filtered to only keep those with natural language titles and/or descriptions in English. 6 | 7 | We provide the list of (line number, photo identifier, photo hash) of each image contained in this subset. These correspond to the first three columns in the dataset's metadata TSV file. 8 | 9 | ```bash 10 | wget https://openaipublic.azureedge.net/clip/data/yfcc100m_subset_data.tsv.bz2 11 | bunzip2 yfcc100m_subset_data.tsv.bz2 12 | ``` 13 | 14 | Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders.
For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/). -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/hubconf.py: -------------------------------------------------------------------------------- 1 | from clip.clip import tokenize as _tokenize, load as _load, available_models as _available_models 2 | import re 3 | import string 4 | 5 | dependencies = ["torch", "torchvision", "ftfy", "regex", "tqdm"] 6 | 7 | # For compatibility (cannot include special characters in function name) 8 | model_functions = { model: re.sub(f'[{string.punctuation}]', '_', model) for model in _available_models()} 9 | 10 | def _create_hub_entrypoint(model): 11 | def entrypoint(**kwargs): 12 | return _load(model, **kwargs) 13 | 14 | entrypoint.__doc__ = f"""Loads the {model} CLIP model 15 | 16 | Parameters 17 | ---------- 18 | device : Union[str, torch.device] 19 | The device to put the loaded model 20 | 21 | jit : bool 22 | Whether to load the optimized JIT model or more hackable non-JIT model (default). 23 | 24 | download_root: str 25 | path to download the model files; by default, it uses "~/.cache/clip" 26 | 27 | Returns 28 | ------- 29 | model : torch.nn.Module 30 | The {model} CLIP model 31 | 32 | preprocess : Callable[[PIL.Image], torch.Tensor] 33 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 34 | """ 35 | return entrypoint 36 | 37 | def tokenize(): 38 | return _tokenize 39 | 40 | _entrypoints = {model_functions[model]: _create_hub_entrypoint(model) for model in _available_models()} 41 | 42 | globals().update(_entrypoints) -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/model-card.md: -------------------------------------------------------------------------------- 1 | # Model Card: CLIP 2 | 3 | Inspired by [Model Cards for Model Reporting (Mitchell et al.)](https://arxiv.org/abs/1810.03993) and [Lessons from Archives (Jo & Gebru)](https://arxiv.org/pdf/1912.10389.pdf), we’re providing some accompanying information about the multimodal model. 4 | 5 | ## Model Details 6 | 7 | The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. 8 | 9 | ### Model Date 10 | 11 | January 2021 12 | 13 | ### Model Type 14 | 15 | The base model uses a ResNet50 with several modifications as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. There is also a variant of the model where the ResNet image encoder is replaced with a Vision Transformer. 16 | 17 | ### Model Versions 18 | 19 | Initially, we’ve released one CLIP model based on the Vision Transformer architecture equivalent to ViT-B/32, along with the RN50 model, using the architecture equivalent to ResNet-50. 
20 | 21 | As part of the staged release process, we have also released the RN101 model, as well as RN50x4, a RN50 scaled up 4x according to the [EfficientNet](https://arxiv.org/abs/1905.11946) scaling rule. In July 2021, we additionally released the RN50x16 and ViT-B/16 models, and in January 2022, the RN50x64 and ViT-L/14 models were released. Lastly, the ViT-L/14@336px model was released in April 2022. 22 | 23 | Please see the paper linked below for further details about their specification. 24 | 25 | ### Documents 26 | 27 | - [Blog Post](https://openai.com/blog/clip/) 28 | - [CLIP Paper](https://arxiv.org/abs/2103.00020) 29 | 30 | 31 | 32 | ## Model Use 33 | 34 | ### Intended Use 35 | 36 | The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. 37 | 38 | #### Primary intended uses 39 | 40 | The primary intended users of these models are AI researchers. 41 | 42 | We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. 43 | 44 | ### Out-of-Scope Use Cases 45 | 46 | **Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. 47 | 48 | Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. 49 | 50 | Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. 51 | 52 | 53 | 54 | ## Data 55 | 56 | The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. 57 | 58 | ### Data Mission Statement 59 | 60 | Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. 
However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. 61 | 62 | 63 | 64 | ## Performance and Limitations 65 | 66 | ### Performance 67 | 68 | We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets such as OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets: 69 | 70 | - Food101 71 | - CIFAR10 72 | - CIFAR100 73 | - Birdsnap 74 | - SUN397 75 | - Stanford Cars 76 | - FGVC Aircraft 77 | - VOC2007 78 | - DTD 79 | - Oxford-IIIT Pet dataset 80 | - Caltech101 81 | - Flowers102 82 | - MNIST 83 | - SVHN 84 | - IIIT5K 85 | - Hateful Memes 86 | - SST-2 87 | - UCF101 88 | - Kinetics700 89 | - Country211 90 | - CLEVR Counting 91 | - KITTI Distance 92 | - STL-10 93 | - RareAct 94 | - Flickr30 95 | - MSCOCO 96 | - ImageNet 97 | - ImageNet-A 98 | - ImageNet-R 99 | - ImageNet Sketch 100 | - ObjectNet (ImageNet Overlap) 101 | - Youtube-BB 102 | - ImageNet-Vid 103 | 104 | ## Limitations 105 | 106 | CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects. CLIP also poses issues with regards to fairness and bias which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation- in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. 107 | 108 | ### Bias and Fairness 109 | 110 | We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). 111 | 112 | We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 
113 | 114 | 115 | 116 | ## Feedback 117 | 118 | ### Where to send questions or comments about the model 119 | 120 | Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) 121 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/requirements.txt: -------------------------------------------------------------------------------- 1 | ftfy 2 | packaging 3 | regex 4 | tqdm 5 | torch 6 | torchvision 7 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pkg_resources 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name="clip", 8 | py_modules=["clip"], 9 | version="1.0", 10 | description="", 11 | author="OpenAI", 12 | packages=find_packages(exclude=["tests*"]), 13 | install_requires=[ 14 | str(r) 15 | for r in pkg_resources.parse_requirements( 16 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 17 | ) 18 | ], 19 | include_package_data=True, 20 | extras_require={'dev': ['pytest']}, 21 | ) 22 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/packages/CLIP-main/tests/test_consistency.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | from PIL import Image 5 | 6 | import clip 7 | 8 | 9 | @pytest.mark.parametrize('model_name', clip.available_models()) 10 | def test_consistency(model_name): 11 | device = "cpu" 12 | jit_model, transform = clip.load(model_name, device=device, jit=True) 13 | py_model, _ = clip.load(model_name, device=device, jit=False) 14 | 15 | image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) 16 | text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) 17 | 18 | with torch.no_grad(): 19 | logits_per_image, _ = jit_model(image, text) 20 | jit_probs = logits_per_image.softmax(dim=-1).cpu().numpy() 21 | 22 | logits_per_image, _ = py_model(image, text) 23 | py_probs = logits_per_image.softmax(dim=-1).cpu().numpy() 24 | 25 | assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1) 26 | -------------------------------------------------------------------------------- /content/TinyIMGRAG/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.2 2 | Pillow==10.1.0 3 | accelerate>=0.26.0 4 | optimum 5 | torch==2.2.1+cu121 6 | torchaudio==2.2.1+cu121 7 | torchvision==0.17.1+cu121 8 | transformers==4.50.1 9 | diffusers==0.33.1 10 | modelscope==1.24.0 11 | huggingface-hub==0.29.3 12 | open_clip_torch==2.29.0 13 | qwen-vl-utils==0.0.8 14 | auto-gptq 15 | 16 | # clip请从readme中查看安装方式 17 | -------------------------------------------------------------------------------- /content/TinyLLM/code/preprocess.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import random 5 | from concurrent.futures import ProcessPoolExecutor 6 | from functools import partial 7 | 8 | import numpy as np 9 | import sentencepiece as spm 10 | import torch 11 | import torch.distributed as dist 12 | from tqdm import tqdm 13 | 14 | from tokenizer import Tokenizer 15 | 16 | DATA_CACHE_DIR = 'data' 17 | TOKENIZER_MODEL = "./data/tok4096.model" 18 | 19 | 20 | # 定义分片处理函数 21 | def process_shard(args, vocab_size, 
tokenizer_model_path): 22 | """ 23 | 处理数据分片,将其中的文本进行分词并保存为二进制文件。 24 | 25 | 参数: 26 | args: tuple, 包含分片ID和分片文件名 27 | vocab_size: int, 词汇表大小,用于决定输出文件存储路径 28 | """ 29 | # 提取分片ID和文件名 30 | shard_id, shard = args 31 | 32 | # 初始化分词器 33 | enc = Tokenizer(tokenizer_model_path) 34 | 35 | # 打开并读取当前分片的JSON文件 36 | with open(shard, "r") as f: 37 | data = json.load(f) 38 | 39 | # 用于保存所有的分词后的token 40 | all_tokens = [] 41 | 42 | # 遍历每一个例子,tqdm显示进度条 43 | for example in tqdm(data, position=shard_id): 44 | # 提取故事文本,并去除首尾空白字符 45 | text = example["story"] 46 | text = text.strip() # 去掉首尾空白字符 47 | 48 | # 对文本进行编码,使用BOS(开始标志)但不使用EOS(结束标志) 49 | tokens = enc.encode(text, bos=True, eos=False) 50 | # 将当前文本的token添加到总token列表 51 | all_tokens.extend(tokens) 52 | 53 | # 将所有的token转换为uint16类型的NumPy数组 54 | all_tokens = np.array(all_tokens, dtype=np.uint16) 55 | 56 | # 根据词汇表大小确定输出文件名 57 | if vocab_size == 0: 58 | # 如果词汇表大小为0,使用默认的Llama 2分词器,将文件保存到原路径 59 | tokenized_filename = shard.replace(".json", ".bin") 60 | else: 61 | # 如果有指定词汇表大小,保存到新目录`tok{vocab_size}`下 62 | bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") 63 | shard_basename = os.path.basename(shard) 64 | bin_basename = shard_basename.replace(".json", ".bin") 65 | tokenized_filename = os.path.join(bin_dir, bin_basename) 66 | 67 | # 将token以二进制形式保存 68 | with open(tokenized_filename, "wb") as f: 69 | f.write(all_tokens.tobytes()) 70 | 71 | # 计算平均序列长度(以BOS标记`1`分隔的序列) 72 | avg_seq_len = all_tokens.size / ((all_tokens == 1).sum()) 73 | print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}") 74 | 75 | 76 | # 定义预处理函数,用于对多个数据分片进行批量处理 77 | def pretokenize(vocab_size): 78 | """ 79 | 预处理所有的数据分片,并将分词后的数据保存为二进制文件。 80 | 81 | 参数: 82 | vocab_size: int, 词汇表大小,用于决定输出文件存储路径 83 | """ 84 | # 数据所在目录 85 | data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") 86 | 87 | # 获取所有JSON文件的文件名列表,并按字典序排序 88 | shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) 89 | 90 | # 如果词汇表大小大于0,则创建对应的保存目录 91 | if vocab_size > 0: 92 | bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") 93 | os.makedirs(bin_dir, exist_ok=True) 94 | 95 | # 使用partial函数将vocab_size绑定到process_shard函数 96 | fun = partial(process_shard, vocab_size=vocab_size, tokenizer_model_path=TOKENIZER_MODEL) 97 | 98 | # 使用进程池并行处理每个分片 99 | with ProcessPoolExecutor() as executor: 100 | executor.map(fun, enumerate(shard_filenames)) 101 | 102 | print("Done.") 103 | 104 | 105 | class PretokDataset(torch.utils.data.IterableDataset): 106 | """从磁盘加载已预处理的分词数据,并将其以 PyTorch 张量的形式返回。""" 107 | 108 | def __init__(self, split, max_seq_len, vocab_size, vocab_source): 109 | """ 110 | 初始化数据集。 111 | 112 | 参数: 113 | split: str, 数据集的分割方式('train' 或 'test')。 114 | max_seq_len: int, 最大序列长度,用于生成输入输出序列。 115 | vocab_size: int, 词汇表的大小。 116 | vocab_source: str, 词汇表的来源('llama2' 或 'custom')。 117 | """ 118 | super().__init__() 119 | self.split = split # 数据集划分(训练集或测试集) 120 | self.max_seq_len = max_seq_len # 最大序列长度 121 | self.vocab_size = vocab_size # 词汇表大小 122 | self.vocab_source = vocab_source # 词汇表来源 123 | 124 | def __iter__(self): 125 | """ 126 | 返回迭代器,按批次加载数据并生成模型输入/输出。 127 | """ 128 | # 获取DataLoader的worker信息(用于并行数据加载) 129 | worker_info = torch.utils.data.get_worker_info() 130 | worker_id = worker_info.id if worker_info else 0 # worker ID 131 | # 获取分布式训练的rank信息(用于多GPU训练) 132 | rank = dist.get_rank() if dist.is_initialized() else 0 133 | # 基于worker_id和rank生成唯一的随机数种子,确保数据在每个worker和rank之间是唯一的 134 | seed = 42 + worker_id + 1337 * rank 135 | rng = random.Random(seed) 136 | print(f"Created a PretokDataset with rng seed {seed}") 
137 | 138 | # 根据词汇表来源决定数据路径 139 | if self.vocab_source == "llama2": 140 | # 如果使用 Llama 2 词汇表,.bin 文件和 .json 文件在同一目录下 141 | bin_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") 142 | shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin"))) 143 | elif self.vocab_source == "custom": 144 | # 如果使用自定义词汇表,.bin 文件在 tok{N} 目录下 145 | bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{self.vocab_size}") 146 | shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin"))) 147 | 148 | # 根据数据集划分使用不同的分片文件 149 | # 训练集使用所有分片文件,测试集只使用第一个分片 150 | shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1] 151 | assert len(shard_filenames) > 0, f"在 {bin_dir} 中未找到任何 .bin 文件" 152 | 153 | while True: 154 | # 随机打乱分片文件 155 | rng.shuffle(shard_filenames) 156 | for shard in shard_filenames: 157 | # 使用 memmap 读取文件,使得数据留在磁盘上,减少内存占用 158 | m = np.memmap(shard, dtype=np.uint16, mode="r") 159 | # 计算该分片中的批次数量 160 | num_batches = len(m) // self.max_seq_len 161 | num_batches -= 1 # 去掉最后一个不完整的批次 162 | assert num_batches > 0, "这个分片文件太小了?请检查。" 163 | # 随机打乱批次索引 164 | ixs = list(range(num_batches)) 165 | rng.shuffle(ixs) 166 | # 对每个批次生成输入 x 和目标输出 y 167 | for ix in ixs: 168 | start = ix * self.max_seq_len # 批次起始索引 169 | end = start + self.max_seq_len + 1 # 批次结束索引 170 | # 将数据转换为 NumPy 数组并拷贝到 RAM 中 171 | chunk = torch.from_numpy((m[start:end]).astype(np.int64)) 172 | # 模型输入 x 是当前批次的前 max_seq_len 个词元 173 | x = chunk[:-1] 174 | # 模型输出 y 是下一个词元 175 | y = chunk[1:] 176 | # 生成 x, y 对 177 | yield x, y 178 | 179 | 180 | class Task: 181 | @staticmethod 182 | def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs): 183 | ds = PretokDataset(**dataset_kwargs) 184 | dl = torch.utils.data.DataLoader( 185 | ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers 186 | ) 187 | for x, y in dl: 188 | x = x.to(device, non_blocking=True) 189 | y = y.to(device, non_blocking=True) 190 | yield x, y 191 | 192 | 193 | if __name__ == "__main__": 194 | pretokenize(vocab_size=4096) -------------------------------------------------------------------------------- /content/TinyLLM/code/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.5 2 | Requests==2.31.0 3 | sentencepiece==0.1.99 4 | torch==2.0.1 5 | tqdm==4.64.1 -------------------------------------------------------------------------------- /content/TinyLLM/code/sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from contextlib import nullcontext 4 | import torch 5 | from model import ModelArgs, Transformer 6 | from tokenizer import Tokenizer 7 | import argparse 8 | 9 | class TextGenerator: 10 | def __init__(self, 11 | checkpoint='output/ckpt.pt', # 模型检查点路径 12 | tokenizer_model_path='tok4096.model', # 分词器模型路径 13 | seed=1337, # 随机种子,确保可重复性 14 | device=None, # 设备,优先使用 CUDA,如果没有可用的 CUDA,则使用 CPU 15 | dtype="float32"): # 数据类型,默认为 float32,可以选择 float16 或 bfloat16 16 | """ 17 | 初始化 TextGenerator 类,加载模型、设置设备和分词器等。 18 | """ 19 | # 模型加载配置 20 | self.checkpoint = checkpoint # 保存的模型检查点路径 21 | self.tokenizer_model_path = tokenizer_model_path # 分词器模型文件路径 22 | self.seed = seed # 随机数种子,用于生成的可重复性 23 | self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu') # 根据硬件条件选择设备 24 | self.dtype = dtype # 模型的浮点数类型 25 | self.device_type = 'cuda' if 'cuda' in self.device else 'cpu' # 判断当前设备是否为 CUDA 26 | 27 | # 设置随机种子,确保生成的可重复性 28 | torch.manual_seed(seed) # 设置 CPU 随机种子 29 | torch.cuda.manual_seed(seed) # 设置 
CUDA 随机种子 30 | torch.backends.cuda.matmul.allow_tf32 = True # 允许 CUDA 使用 TF32 精度进行矩阵乘法运算 31 | torch.backends.cudnn.allow_tf32 = True # 允许 cuDNN 使用 TF32 精度加速 32 | 33 | # 根据 dtype 选择适当的自动混合精度上下文 34 | ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[self.dtype] 35 | self.ctx = nullcontext() if self.device_type == 'cpu' else torch.amp.autocast(device_type=self.device_type, dtype=ptdtype) 36 | 37 | # 加载模型检查点文件 38 | checkpoint_dict = torch.load(self.checkpoint, map_location=self.device) # 加载模型参数 39 | gptconf = ModelArgs(**checkpoint_dict['model_args']) # 初始化模型参数 40 | self.model = Transformer(gptconf) # 实例化 Transformer 模型 41 | state_dict = checkpoint_dict['model'] # 获取模型状态字典 42 | 43 | # 去除状态字典中的不必要前缀 44 | unwanted_prefix = '_orig_mod.' # 这个前缀在保存时可能被添加,现在要去除它 45 | for k, v in list(state_dict.items()): 46 | if k.startswith(unwanted_prefix): 47 | state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) # 去除不必要的前缀 48 | 49 | # 加载模型参数到模型中 50 | self.model.load_state_dict(state_dict, strict=False) 51 | # 计算模型参数量 52 | num_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad) 53 | print(f"Model has {num_params} parameters.") 54 | # 设置模型为评估模式(evaluation mode),防止训练模式下的 dropout 等操作影响结果 55 | self.model.eval() 56 | # 将模型放置到正确的设备上(GPU 或 CPU) 57 | self.model.to(self.device) 58 | # 初始化分词器 59 | self.tokenizer = Tokenizer(tokenizer_model=self.tokenizer_model_path) # 根据指定的路径加载分词器 60 | 61 | def sample(self, 62 | start="Hello!", # 生成文本的起始提示词,可以是任意字符串 63 | num_samples=3, # 生成样本的数量,默认生成 3 个样本 64 | max_new_tokens=256, # 每个样本生成的最大 token 数,默认最多生成 256 个 token 65 | temperature=1.0, # 控制生成的随机性,1.0 为标准,值越大越随机 66 | top_k=300): # 保留概率最高的 top_k 个 token,限制生成时的选择范围 67 | """ 68 | 根据给定的起始文本生成样本。 69 | 70 | :param start: 生成文本的起始提示词 71 | :param num_samples: 要生成的文本样本数 72 | :param max_new_tokens: 每个样本生成的最大 token 数 73 | :param temperature: 控制生成的随机性,值越小生成越确定,值越大生成越随机 74 | :param top_k: 限制生成时选择的 token 范围 75 | :return: 生成的文本样本列表 76 | """ 77 | # 如果 start 是以 'FILE:' 开头,表示从文件中读取起始文本 78 | if start.startswith('FILE:'): 79 | with open(start[5:], 'r', encoding='utf-8') as f: 80 | start = f.read() # 读取文件内容作为起始文本 81 | 82 | # 将起始文本编码为 token id 序列 83 | start_ids = self.tokenizer.encode(start, bos=True, eos=False) # bos=True 表示加上句首标记,eos=False 表示不加句尾标记 84 | x = (torch.tensor(start_ids, dtype=torch.long, device=self.device)[None, ...]) # 将编码后的 token id 转为 PyTorch 张量 85 | 86 | generated_texts = [] # 用于保存生成的文本样本 87 | with torch.no_grad(): # 禁用梯度计算,提升效率 88 | with self.ctx: # 进入自动混合精度的上下文(如果是 GPU 并使用 float16 时) 89 | for k in range(num_samples): # 循环生成指定数量的样本 90 | y = self.model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k) # 生成文本 91 | generated_texts.append(self.tokenizer.decode(y[0].tolist())) # 解码生成的 token 序列为可读文本 92 | 93 | return generated_texts # 返回生成的文本样本 94 | 95 | # 示例使用 96 | if __name__ == "__main__": 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument("--prompt", type=str, default="One day, Lily met a Shoggoth") 99 | args = parser.parse_args() 100 | 101 | generator = TextGenerator() # 初始化生成器 102 | samples = generator.sample(start=args.prompt, num_samples=3, max_new_tokens=256) # 生成 3 个样本 103 | for i, sample in enumerate(samples): 104 | print(f"\nSample {i+1}:\n{sample}\n{'-'*20}") # 打印生成的样本并用分隔线分割 105 | -------------------------------------------------------------------------------- /content/TinyLLM/code/tok4096.model: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyLLM/code/tok4096.model -------------------------------------------------------------------------------- /content/TinyLLM/code/tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import struct 3 | from sentencepiece import SentencePieceProcessor 4 | from typing import List 5 | 6 | TOKENIZER_MODEL = "./data/tok4096.model" 7 | 8 | class Tokenizer: 9 | def __init__(self, tokenizer_model=None): 10 | """ 11 | 初始化分词器。加载预训练的SentencePiece模型,并设置一些特殊的token ID。 12 | 13 | 参数: 14 | tokenizer_model: str, 可选,分词器模型的路径,如果不指定则使用默认路径 TOKENIZER_MODEL。 15 | """ 16 | # 如果提供了分词器模型路径,使用该路径;否则使用默认模型路径 17 | model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL 18 | # 确保模型文件存在 19 | assert os.path.isfile(model_path), model_path 20 | 21 | # 加载 SentencePiece 模型 22 | self.sp_model = SentencePieceProcessor(model_file=model_path) 23 | self.model_path = model_path 24 | 25 | # 获取分词器的特殊token和词汇表大小 26 | self.n_words: int = self.sp_model.vocab_size() # 词汇表大小 27 | self.bos_id: int = self.sp_model.bos_id() # 句子开头 (BOS) 的ID 28 | self.eos_id: int = self.sp_model.eos_id() # 句子结尾 (EOS) 的ID 29 | self.pad_id: int = self.sp_model.pad_id() # 填充 (PAD) 的ID 30 | 31 | # 验证分词器词汇表大小是否正确 32 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 33 | 34 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 35 | """ 36 | 将字符串编码为词元ID列表。可以选择是否添加句子开头 (BOS) 和句子结尾 (EOS) 标记。 37 | 38 | 参数: 39 | s: str, 要编码的字符串。 40 | bos: bool, 是否在编码的词元列表前添加 BOS 标记。 41 | eos: bool, 是否在编码的词元列表末尾添加 EOS 标记。 42 | 43 | 返回: 44 | List[int]: 编码后的词元ID列表。 45 | """ 46 | # 确保输入是字符串类型 47 | assert type(s) is str 48 | # 使用SentencePiece将字符串编码为词元ID 49 | t = self.sp_model.encode(s) 50 | # 如果需要BOS标记,将其添加到词元列表开头 51 | if bos: 52 | t = [self.bos_id] + t 53 | # 如果需要EOS标记,将其添加到词元列表末尾 54 | if eos: 55 | t = t + [self.eos_id] 56 | return t 57 | 58 | def decode(self, t: List[int]) -> str: 59 | """ 60 | 将词元ID列表解码为字符串。 61 | 62 | 参数: 63 | t: List[int], 词元ID列表。 64 | 65 | 返回: 66 | str: 解码后的字符串。 67 | """ 68 | return self.sp_model.decode(t) -------------------------------------------------------------------------------- /content/TinyLLM/code/train_vocab.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | from tqdm import tqdm 5 | import requests 6 | import sentencepiece as spm 7 | import argparse 8 | 9 | DATA_CACHE_DIR = 'data' 10 | 11 | def download_file(url: str, fname: str, chunk_size=1024): 12 | """发送HTTP GET请求以流式方式获取文件""" 13 | resp = requests.get(url, stream=True) 14 | 15 | # 获取文件的总大小(以字节为单位),默认为0如果没有提供'content-length'头信息 16 | total = int(resp.headers.get("content-length", 0)) 17 | 18 | # 以写二进制模式打开一个文件以保存下载的内容 19 | with open(fname, "wb") as file, tqdm( 20 | desc=fname, # 进度条前面的描述信息(通常是文件名) 21 | total=total, # 总的字节数,用于设置进度条的总长度 22 | unit="iB", # 进度条的单位,'iB'代表二进制字节 23 | unit_scale=True, # 启用单位缩放,如KB、MB等 24 | unit_divisor=1024, # 设置单位换算的除数,这里为1024 25 | ) as bar: 26 | # 逐块读取响应内容并写入文件 27 | for data in resp.iter_content(chunk_size=chunk_size): 28 | size = file.write(data) # 写入数据块到文件 29 | bar.update(size) # 更新进度条 30 | 31 | def download(): 32 | """在DATA_CACHE_DIR中创建目录,如果目录不存在则创建""" 33 | os.makedirs(DATA_CACHE_DIR, exist_ok=True) 34 | 35 | # 定义TinyStories数据集的下载URL和保存的文件名 36 | data_url = "https://www.modelscope.cn/datasets/AI-ModelScope/TinyStories/resolve/master/TinyStories_all_data.tar.gz" 37 | data_filename = 
os.path.join(DATA_CACHE_DIR, "TinyStories_all_data.tar.gz") 38 | 39 | # 检查数据集是否已经下载,如果没有下载则进行下载 40 | if not os.path.exists(data_filename): 41 | print(f"Downloading {data_url} to {data_filename}...") 42 | download_file(data_url, data_filename) # 使用之前定义的download_file函数进行下载 43 | else: 44 | print(f"{data_filename} already exists, skipping download...") 45 | 46 | # 定义解压缩后的数据目录 47 | data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") 48 | 49 | # 检查数据目录是否存在,如果不存在则解压缩数据集 50 | if not os.path.exists(data_dir): 51 | os.makedirs(data_dir, exist_ok=True) # 创建数据目录 52 | print(f"Unpacking {data_filename}...") 53 | os.system(f"tar -xzf {data_filename} -C {data_dir}") # 使用系统命令解压缩.tar.gz文件 54 | else: 55 | print(f"{data_dir} already exists, skipping unpacking...") 56 | 57 | # 查找解压后的所有JSON文件,排序后获取文件名列表 58 | shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) 59 | 60 | # 打开第一个JSON文件并读取内容 61 | with open(shard_filenames[0], "r") as f: 62 | data = json.load(f) # 将JSON文件内容加载到变量data中 63 | 64 | print("Download done.") # 下载完成信息 65 | print(f"Number of shards: {len(shard_filenames)}") # 打印解压后数据分片的数量 66 | print(f"Example story:\n{data[0]}") # 打印第一个分片中的一个示例故事 67 | 68 | def load_text_from_files(path): 69 | path_list = glob.glob(path) 70 | text_data = [] 71 | for file_path in path_list: 72 | with open(file_path, 'r', encoding='utf-8') as file: 73 | text_data.extend(file.readlines()) 74 | return text_data 75 | 76 | def batch_iterator(text_data, batch_size=648): 77 | for i in range(0, len(text_data), batch_size): 78 | yield text_data[i:i + batch_size] 79 | 80 | def train_vocab(vocab_size: int=32000, num_shards: int=20): 81 | """ 82 | vocab_size: int, 词汇表的大小,决定分词器的词汇量。 83 | num_shards: int, 用于加快词汇表训练的效率,指定要处理的分片数量。 84 | """ 85 | # 确保词汇表大小为正数 86 | assert vocab_size > 0, "Vocab size must be positive" 87 | 88 | # SentencePiece 模型的前缀路径,将用于保存分词器 89 | prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") 90 | 91 | # 1) 将多个分片中的文本导出为单个文本文件 tiny.txt 92 | tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt") 93 | data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") 94 | shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) 95 | 96 | # 创建 tiny.txt 文件并写入指定数量的分片中的文本 97 | print(f"Writing temporary file {tiny_file} with {num_shards} shards...") 98 | with open(tiny_file, "w", encoding="utf-8") as of: 99 | # 遍历前 num_shards 个分片 100 | for shard in tqdm(shard_filenames[:num_shards]): 101 | with open(shard, "r") as f: 102 | data = json.load(f) # 读取分片中的JSON数据 103 | # 遍历每个例子,将其中的故事文本写入 tiny.txt 文件 104 | for example in data: 105 | text = example["story"] 106 | text = text.strip() # 去除文本首尾的空白字符 107 | of.write(text + "\n") # 每个文本写入一行 108 | 109 | # 输出生成的 tiny.txt 文件的大小 110 | print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB") 111 | 112 | # 2) 使用 SentencePiece 训练分词器 113 | print("Will now train the vocab...") 114 | spm.SentencePieceTrainer.train( 115 | input=tiny_file, # 输入文件为之前生成的 tiny.txt 116 | model_prefix=prefix, # 模型前缀路径 117 | model_type="bpe", # 使用 Byte-Pair Encoding (BPE) 训练分词器 118 | vocab_size=vocab_size, # 词汇表大小 119 | self_test_sample_size=0, # 自测样本大小设置为 0 120 | input_format="text", # 输入文件格式为纯文本 121 | character_coverage=1.0, # 覆盖所有字符(包括非常见字符) 122 | num_threads=os.cpu_count(), # 使用 CPU 的线程数 123 | split_digits=True, # 拆分数字 124 | allow_whitespace_only_pieces=True, # 允许仅由空格组成的词元 125 | byte_fallback=True, # 启用字节级回退 126 | unk_surface=r" \342\201\207 ", # UNK token 表示未知字符的方式 127 | normalization_rule_name="identity" # 使用“identity”归一化规则 128 | ) 129 | 130 | # 3) 
可选的清理操作,询问用户是否删除临时文件 tiny.txt 131 | dec = input(f"Delete the temporary file {tiny_file}? [y/N] ") 132 | if dec.lower() == "y": 133 | os.remove(tiny_file) # 删除临时文件 134 | print(f"Deleted {tiny_file}") 135 | 136 | # 输出模型保存的路径 137 | print(f"Trained tokenizer is in {prefix}.model") 138 | print("Done.") 139 | 140 | if __name__ == "__main__": 141 | parser = argparse.ArgumentParser() 142 | parser.add_argument("--download", type=bool, default=True, help="download the dataset") 143 | parser.add_argument("--vocab_size", type=int, default=4096, help="vocab size") 144 | args = parser.parse_args() 145 | if args.download: 146 | download() 147 | train_vocab(args.vocab_size) -------------------------------------------------------------------------------- /content/TinyLLM/images/model_show.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyLLM/images/model_show.png -------------------------------------------------------------------------------- /content/TinyLLM/images/nvidia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyLLM/images/nvidia.png -------------------------------------------------------------------------------- /content/TinyRAG/.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY='your openai key' 2 | OPENAI_BASE_URL='https://api.openai.com/v1' # 有些国内的代理商需要加v1,有些不需要 3 | 4 | ZHIPUAI_API_KEY='your zhipuai key' -------------------------------------------------------------------------------- /content/TinyRAG/RAG/Embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File : Embeddings.py 5 | @Time : 2024/02/10 21:55:39 6 | @Author : 不要葱姜蒜 7 | @Version : 1.0 8 | @Desc : None 9 | ''' 10 | 11 | import os 12 | from copy import copy 13 | from typing import Dict, List, Optional, Tuple, Union 14 | import numpy as np 15 | 16 | os.environ['CURL_CA_BUNDLE'] = '' 17 | from dotenv import load_dotenv, find_dotenv 18 | _ = load_dotenv(find_dotenv()) 19 | 20 | 21 | class BaseEmbeddings: 22 | """ 23 | Base class for embeddings 24 | """ 25 | def __init__(self, path: str, is_api: bool) -> None: 26 | self.path = path 27 | self.is_api = is_api 28 | 29 | def get_embedding(self, text: str, model: str) -> List[float]: 30 | raise NotImplementedError 31 | 32 | @classmethod 33 | def cosine_similarity(cls, vector1: List[float], vector2: List[float]) -> float: 34 | """ 35 | calculate cosine similarity between two vectors 36 | """ 37 | dot_product = np.dot(vector1, vector2) 38 | magnitude = np.linalg.norm(vector1) * np.linalg.norm(vector2) 39 | if not magnitude: 40 | return 0 41 | return dot_product / magnitude 42 | 43 | 44 | class OpenAIEmbedding(BaseEmbeddings): 45 | """ 46 | class for OpenAI embeddings 47 | """ 48 | def __init__(self, path: str = '', is_api: bool = True) -> None: 49 | super().__init__(path, is_api) 50 | if self.is_api: 51 | from openai import OpenAI 52 | self.client = OpenAI() 53 | self.client.api_key = os.getenv("OPENAI_API_KEY") 54 | self.client.base_url = os.getenv("OPENAI_BASE_URL") 55 | 56 | def get_embedding(self, text: str, model: str = "text-embedding-3-large") -> List[float]: 57 | if self.is_api: 58 | text = text.replace("\n", " ") 59 
| return self.client.embeddings.create(input=[text], model=model).data[0].embedding 60 | else: 61 | raise NotImplementedError 62 | 63 | class JinaEmbedding(BaseEmbeddings): 64 | """ 65 | class for Jina embeddings 66 | """ 67 | def __init__(self, path: str = 'jinaai/jina-embeddings-v2-base-zh', is_api: bool = False) -> None: 68 | super().__init__(path, is_api) 69 | self._model = self.load_model() 70 | 71 | def get_embedding(self, text: str) -> List[float]: 72 | return self._model.encode([text])[0].tolist() 73 | 74 | def load_model(self): 75 | import torch 76 | from transformers import AutoModel 77 | if torch.cuda.is_available(): 78 | device = torch.device("cuda") 79 | else: 80 | device = torch.device("cpu") 81 | model = AutoModel.from_pretrained(self.path, trust_remote_code=True).to(device) 82 | return model 83 | 84 | class ZhipuEmbedding(BaseEmbeddings): 85 | """ 86 | class for Zhipu embeddings 87 | """ 88 | def __init__(self, path: str = '', is_api: bool = True) -> None: 89 | super().__init__(path, is_api) 90 | if self.is_api: 91 | from zhipuai import ZhipuAI 92 | self.client = ZhipuAI(api_key=os.getenv("ZHIPUAI_API_KEY")) 93 | 94 | def get_embedding(self, text: str) -> List[float]: 95 | response = self.client.embeddings.create( 96 | model="embedding-2", 97 | input=text, 98 | ) 99 | return response.data[0].embedding 100 | 101 | class DashscopeEmbedding(BaseEmbeddings): 102 | """ 103 | class for Dashscope embeddings 104 | """ 105 | def __init__(self, path: str = '', is_api: bool = True) -> None: 106 | super().__init__(path, is_api) 107 | if self.is_api: 108 | import dashscope 109 | dashscope.api_key = os.getenv("DASHSCOPE_API_KEY") 110 | self.client = dashscope.TextEmbedding 111 | 112 | def get_embedding(self, text: str, model: str='text-embedding-v1') -> List[float]: 113 | response = self.client.call( 114 | model=model, 115 | input=text 116 | ) 117 | return response.output['embeddings'][0]['embedding'] -------------------------------------------------------------------------------- /content/TinyRAG/RAG/LLM.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File : LLM.py 5 | @Time : 2024/02/12 13:50:47 6 | @Author : 不要葱姜蒜 7 | @Version : 1.0 8 | @Desc : None 9 | ''' 10 | import os 11 | from typing import Dict, List, Optional, Tuple, Union 12 | 13 | PROMPT_TEMPLATE = dict( 14 | RAG_PROMPT_TEMPALTE="""使用以上下文来回答用户的问题。如果你不知道答案,就说你不知道。总是使用中文回答。 15 | 问题: {question} 16 | 可参考的上下文: 17 | ··· 18 | {context} 19 | ··· 20 | 如果给定的上下文无法让你做出回答,请回答数据库中没有这个内容,你不知道。 21 | 有用的回答:""", 22 | InternLM_PROMPT_TEMPALTE="""先对上下文进行内容总结,再使用上下文来回答用户的问题。如果你不知道答案,就说你不知道。总是使用中文回答。 23 | 问题: {question} 24 | 可参考的上下文: 25 | ··· 26 | {context} 27 | ··· 28 | 如果给定的上下文无法让你做出回答,请回答数据库中没有这个内容,你不知道。 29 | 有用的回答:""" 30 | ) 31 | 32 | 33 | class BaseModel: 34 | def __init__(self, path: str = '') -> None: 35 | self.path = path 36 | 37 | def chat(self, prompt: str, history: List[dict], content: str) -> str: 38 | pass 39 | 40 | def load_model(self): 41 | pass 42 | 43 | class OpenAIChat(BaseModel): 44 | def __init__(self, path: str = '', model: str = "gpt-3.5-turbo-1106") -> None: 45 | super().__init__(path) 46 | self.model = model 47 | 48 | def chat(self, prompt: str, history: List[dict], content: str) -> str: 49 | from openai import OpenAI 50 | client = OpenAI() 51 | client.api_key = os.getenv("OPENAI_API_KEY") 52 | client.base_url = os.getenv("OPENAI_BASE_URL") 53 | history.append({'role': 'user', 'content': 
PROMPT_TEMPLATE['RAG_PROMPT_TEMPALTE'].format(question=prompt, context=content)}) 54 | response = client.chat.completions.create( 55 | model=self.model, 56 | messages=history, 57 | max_tokens=150, 58 | temperature=0.1 59 | ) 60 | return response.choices[0].message.content 61 | 62 | class InternLMChat(BaseModel): 63 | def __init__(self, path: str = '') -> None: 64 | super().__init__(path) 65 | self.load_model() 66 | 67 | def chat(self, prompt: str, history: List = [], content: str='') -> str: 68 | prompt = PROMPT_TEMPLATE['InternLM_PROMPT_TEMPALTE'].format(question=prompt, context=content) 69 | response, history = self.model.chat(self.tokenizer, prompt, history) 70 | return response 71 | 72 | 73 | def load_model(self): 74 | import torch 75 | from transformers import AutoTokenizer, AutoModelForCausalLM 76 | self.tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True) 77 | self.model = AutoModelForCausalLM.from_pretrained(self.path, torch_dtype=torch.float16, trust_remote_code=True).cuda() 78 | 79 | class DashscopeChat(BaseModel): 80 | def __init__(self, path: str = '', model: str = "qwen-turbo") -> None: 81 | super().__init__(path) 82 | self.model = model 83 | 84 | def chat(self, prompt: str, history: List[Dict], content: str) -> str: 85 | import dashscope 86 | dashscope.api_key = os.getenv("DASHSCOPE_API_KEY") 87 | history.append({'role': 'user', 'content': PROMPT_TEMPLATE['RAG_PROMPT_TEMPALTE'].format(question=prompt, context=content)}) 88 | response = dashscope.Generation.call( 89 | model=self.model, 90 | messages=history, 91 | result_format='message', 92 | max_tokens=150, 93 | temperature=0.1 94 | ) 95 | return response.output.choices[0].message.content 96 | 97 | 98 | class ZhipuChat(BaseModel): 99 | def __init__(self, path: str = '', model: str = "glm-4") -> None: 100 | super().__init__(path) 101 | from zhipuai import ZhipuAI 102 | self.client = ZhipuAI(api_key=os.getenv("ZHIPUAI_API_KEY")) 103 | self.model = model 104 | 105 | def chat(self, prompt: str, history: List[Dict], content: str) -> str: 106 | history.append({'role': 'user', 'content': PROMPT_TEMPLATE['RAG_PROMPT_TEMPALTE'].format(question=prompt, context=content)}) 107 | response = self.client.chat.completions.create( 108 | model=self.model, 109 | messages=history, 110 | max_tokens=150, 111 | temperature=0.1 112 | ) 113 | return response.choices[0].message.content  # 与其余 chat 实现保持一致,返回字符串内容而非 message 对象 -------------------------------------------------------------------------------- /content/TinyRAG/RAG/VectorBase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File : VectorBase.py 5 | @Time : 2024/02/12 10:11:13 6 | @Author : 不要葱姜蒜 7 | @Version : 1.0 8 | @Desc : None 9 | ''' 10 | 11 | import os 12 | from typing import Dict, List, Optional, Tuple, Union 13 | import json 14 | from RAG.Embeddings import BaseEmbeddings, OpenAIEmbedding, JinaEmbedding, ZhipuEmbedding 15 | import numpy as np 16 | from tqdm import tqdm 17 | 18 | 19 | class VectorStore: 20 | def __init__(self, document: List[str] = ['']) -> None: 21 | self.document = document 22 | 23 | def get_vector(self, EmbeddingModel: BaseEmbeddings) -> List[List[float]]: 24 | 25 | self.vectors = [] 26 | for doc in tqdm(self.document, desc="Calculating embeddings"): 27 | self.vectors.append(EmbeddingModel.get_embedding(doc)) 28 | return self.vectors 29 | 30 | def persist(self, path: str = 'storage'): 31 | if not os.path.exists(path): 32 | os.makedirs(path) 33 | with open(f"{path}/doecment.json", 'w', 
encoding='utf-8') as f: 34 | json.dump(self.document, f, ensure_ascii=False) 35 | if self.vectors: 36 | with open(f"{path}/vectors.json", 'w', encoding='utf-8') as f: 37 | json.dump(self.vectors, f) 38 | 39 | def load_vector(self, path: str = 'storage'): 40 | with open(f"{path}/vectors.json", 'r', encoding='utf-8') as f: 41 | self.vectors = json.load(f) 42 | with open(f"{path}/doecment.json", 'r', encoding='utf-8') as f: 43 | self.document = json.load(f) 44 | 45 | def get_similarity(self, vector1: List[float], vector2: List[float]) -> float: 46 | return BaseEmbeddings.cosine_similarity(vector1, vector2) 47 | 48 | def query(self, query: str, EmbeddingModel: BaseEmbeddings, k: int = 1) -> List[str]: 49 | query_vector = EmbeddingModel.get_embedding(query) 50 | result = np.array([self.get_similarity(query_vector, vector) 51 | for vector in self.vectors]) 52 | return np.array(self.document)[result.argsort()[-k:][::-1]].tolist() 53 | -------------------------------------------------------------------------------- /content/TinyRAG/RAG/__pycache__/Embeddings.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyRAG/RAG/__pycache__/Embeddings.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyRAG/RAG/__pycache__/LLM.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyRAG/RAG/__pycache__/LLM.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyRAG/RAG/__pycache__/VectorBase.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyRAG/RAG/__pycache__/VectorBase.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyRAG/RAG/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyRAG/RAG/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /content/TinyRAG/RAG/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File : utils.py 5 | @Time : 2024/02/11 09:52:26 6 | @Author : 不要葱姜蒜 7 | @Version : 1.0 8 | @Desc : None 9 | ''' 10 | 11 | import os 12 | from typing import Dict, List, Optional, Tuple, Union 13 | 14 | import PyPDF2 15 | import markdown 16 | import html2text 17 | import json 18 | from tqdm import tqdm 19 | import tiktoken 20 | from bs4 import BeautifulSoup 21 | import re 22 | 23 | enc = tiktoken.get_encoding("cl100k_base") 24 | 25 | 26 | class ReadFiles: 27 | """ 28 | class to read files 29 | """ 30 | 31 | def __init__(self, path: str) -> None: 32 | self._path = path 33 | self.file_list = self.get_files() 34 | 35 | def get_files(self): 36 | # args:dir_path,目标文件夹路径 37 | file_list = [] 38 | for filepath, dirnames, filenames in os.walk(self._path): 39 | # os.walk 函数将递归遍历指定文件夹 40 | for filename in filenames: 41 | # 
通过后缀名判断文件类型是否满足要求 42 | if filename.endswith(".md"): 43 | # 如果满足要求,将其绝对路径加入到结果列表 44 | file_list.append(os.path.join(filepath, filename)) 45 | elif filename.endswith(".txt"): 46 | file_list.append(os.path.join(filepath, filename)) 47 | elif filename.endswith(".pdf"): 48 | file_list.append(os.path.join(filepath, filename)) 49 | return file_list 50 | 51 | def get_content(self, max_token_len: int = 600, cover_content: int = 150): 52 | docs = [] 53 | # 读取文件内容 54 | for file in self.file_list: 55 | content = self.read_file_content(file) 56 | chunk_content = self.get_chunk( 57 | content, max_token_len=max_token_len, cover_content=cover_content) 58 | docs.extend(chunk_content) 59 | return docs 60 | 61 | @classmethod 62 | def get_chunk(cls, text: str, max_token_len: int = 600, cover_content: int = 150): 63 | chunk_text = [] 64 | 65 | curr_len = 0 66 | curr_chunk = '' 67 | 68 | token_len = max_token_len - cover_content 69 | lines = text.splitlines() # 假设以换行符分割文本为行 70 | 71 | for line in lines: 72 | line = line.replace(' ', '') 73 | line_len = len(enc.encode(line)) 74 | if line_len > max_token_len: 75 | # 如果单行长度就超过限制,则将其分割成多个块 76 | num_chunks = (line_len + token_len - 1) // token_len 77 | for i in range(num_chunks): 78 | start = i * token_len 79 | end = start + token_len 80 | # 避免跨单词分割 81 | while not line[start:end].rstrip().isspace(): 82 | start += 1 83 | end += 1 84 | if start >= line_len: 85 | break 86 | curr_chunk = curr_chunk[-cover_content:] + line[start:end] 87 | chunk_text.append(curr_chunk) 88 | # 处理最后一个块 89 | start = (num_chunks - 1) * token_len 90 | curr_chunk = curr_chunk[-cover_content:] + line[start:end] 91 | chunk_text.append(curr_chunk) 92 | 93 | if curr_len + line_len <= token_len: 94 | curr_chunk += line 95 | curr_chunk += '\n' 96 | curr_len += line_len 97 | curr_len += 1 98 | else: 99 | chunk_text.append(curr_chunk) 100 | curr_chunk = curr_chunk[-cover_content:]+line 101 | curr_len = line_len + cover_content 102 | 103 | if curr_chunk: 104 | chunk_text.append(curr_chunk) 105 | 106 | return chunk_text 107 | 108 | @classmethod 109 | def read_file_content(cls, file_path: str): 110 | # 根据文件扩展名选择读取方法 111 | if file_path.endswith('.pdf'): 112 | return cls.read_pdf(file_path) 113 | elif file_path.endswith('.md'): 114 | return cls.read_markdown(file_path) 115 | elif file_path.endswith('.txt'): 116 | return cls.read_text(file_path) 117 | else: 118 | raise ValueError("Unsupported file type") 119 | 120 | @classmethod 121 | def read_pdf(cls, file_path: str): 122 | # 读取PDF文件 123 | with open(file_path, 'rb') as file: 124 | reader = PyPDF2.PdfReader(file) 125 | text = "" 126 | for page_num in range(len(reader.pages)): 127 | text += reader.pages[page_num].extract_text() 128 | return text 129 | 130 | @classmethod 131 | def read_markdown(cls, file_path: str): 132 | # 读取Markdown文件 133 | with open(file_path, 'r', encoding='utf-8') as file: 134 | md_text = file.read() 135 | html_text = markdown.markdown(md_text) 136 | # 使用BeautifulSoup从HTML中提取纯文本 137 | soup = BeautifulSoup(html_text, 'html.parser') 138 | plain_text = soup.get_text() 139 | # 使用正则表达式移除网址链接 140 | text = re.sub(r'http\S+', '', plain_text) 141 | return text 142 | 143 | @classmethod 144 | def read_text(cls, file_path: str): 145 | # 读取文本文件 146 | with open(file_path, 'r', encoding='utf-8') as file: 147 | return file.read() 148 | 149 | 150 | class Documents: 151 | """ 152 | 获取已分好类的json格式文档 153 | """ 154 | def __init__(self, path: str = '') -> None: 155 | self.path = path 156 | 157 | def get_content(self): 158 | with open(self.path, mode='r', 
encoding='utf-8') as f: 159 | content = json.load(f) 160 | return content 161 | -------------------------------------------------------------------------------- /content/TinyRAG/down_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from modelscope import snapshot_download, AutoModel, AutoTokenizer 3 | import os 4 | 5 | model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm2-chat-7b', cache_dir='/root/autodl-tmp/', revision='master') 6 | model_dir = snapshot_download('jinaai/jina-embeddings-v2-base-zh', cache_dir='/root/autodl-tmp/', revision='master') -------------------------------------------------------------------------------- /content/TinyRAG/images/RAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyRAG/images/RAG.png -------------------------------------------------------------------------------- /content/TinyRAG/images/Retrieval-Augmented Generation(RAG-Learning).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/tiny-universe/638006d147ed6fc902e610fad50875549fa2d944/content/TinyRAG/images/Retrieval-Augmented Generation(RAG-Learning).png -------------------------------------------------------------------------------- /content/TinyRAG/requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | zhipuai 3 | numpy 4 | python-dotenv 5 | torch 6 | torchvision 7 | torchaudio 8 | transformers 9 | tqdm 10 | PyPDF2 11 | markdown 12 | html2text 13 | tiktoken 14 | beautifulsoup4 15 | sentencepiece 16 | modelscope 17 | 18 | -------------------------------------------------------------------------------- /content/TinyRAG/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from RAG.VectorBase import VectorStore\n", 10 | "from RAG.utils import ReadFiles\n", 11 | "from RAG.LLM import OpenAIChat, InternLMChat\n", 12 | "from RAG.Embeddings import JinaEmbedding" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stderr", 22 | "output_type": "stream", 23 | "text": [ 24 | "Calculating embeddings: 100%|██████████| 459/459 [00:04<00:00, 92.38it/s] \n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# 建立向量数据库\n", 30 | "docs = ReadFiles('./data').get_content(max_token_len=600, cover_content=150) # 获得data目录下的所有文件内容并分割\n", 31 | "vector = VectorStore(docs)\n", 32 | "embedding = JinaEmbedding(path='/root/autodl-tmp/jinaai/jina-embeddings-v2-base-zh') # 创建EmbeddingModel\n", 33 | "vector.get_vector(EmbeddingModel=embedding)\n", 34 | "vector.persist(path='storage') # 将向量和文档内容保存到storage目录下,下次再用就可以直接加载本地的数据库" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "Chronos:LearningtheLanguageofTimeSeries\n", 47 | "AbdulFatirAnsari1∗,LorenzoStella1∗,CanerTurkmen1,XiyuanZhang3†,PedroMercado1,\n", 48 | "HuibinShen1,OleksandrShchur1,SyamaSundarRangapuram1,SebastianPinedaArango4†,\n", 49 | "ShubhamKapoor1,JasperZschiegner†,DanielleC.Maddix1,HaoWang1,5†,MichaelW.Ma-\n", 50 | 
"honey2,6†,KariTorkkola2,AndrewGordonWilson2,7†,MichaelBohlke-Schneider1,Yuyang\n", 51 | "Wang1{ansarnd,stellalo}@amazon.com\n", 52 | "1AWSAILabs,2AmazonSupplyChainOptimizationTechnologies,3UCSanDiego,4UniversityofFreiburg,5Rutgers\n", 53 | "University,6UCBerkeley,7NewYorkUniversity\n", 54 | "Abstract\n", 55 | "WeintroduceChronos,asimpleyeteffectiveframeworkforpretrainedprobabilistictime\n", 56 | "seriesmodels.Chronostokenizestimeseriesvaluesusingscalingandquantizationinto\n", 57 | "afixedvocabularyandtrainsexistingtransformer-basedlanguagemodelarchitectureson\n", 58 | "thesetokenizedtimeseriesviathecross-entropyloss.WepretrainedChronosmodels\n", 59 | "basedontheT5family(rangingfrom20Mto710Mparameters)onalargecollectionof\n", 60 | "publiclyavailabledatasets,complementedbyasyntheticdatasetthatwegeneratedvia\n", 61 | "Gaussianprocessestoimprovegeneralization.Inacomprehensivebenchmarkconsistingof\n", 62 | "42datasets,andcomprisingbothclassicallocalmodelsanddeeplearningmethods,weshow\n", 63 | "thatChronosmodels:(a)significantlyoutperformothermethodsondatasetsthatwere\n", 64 | "partofthetrainingcorpus;and(b)havecomparableandoccasionallysuperiorzero-shot\n", 65 | "performanceonnewdatasets,relativetomethodsthatweretrainedspecificallyonthem.\n", 66 | "\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "vector = VectorStore()\n", 72 | "\n", 73 | "vector.load_vector('./storage') # 加载本地的数据库\n", 74 | "\n", 75 | "embedding = JinaEmbedding(path='/root/autodl-tmp/jinaai/jina-embeddings-v2-base-zh')\n", 76 | "\n", 77 | "question = 'chronos是什么?'\n", 78 | "\n", 79 | "content = vector.query(question, EmbeddingModel=embedding, k=1)[0]\n", 80 | "\n", 81 | "print(content)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "application/vnd.jupyter.widget-view+json": { 92 | "model_id": "de27a17e84bb48e8ae3df85e001b6458", 93 | "version_major": 2, 94 | "version_minor": 0 95 | }, 96 | "text/plain": [ 97 | "Loading checkpoint shards: 0%| | 0/8 [00:00