├── README.md
├── install.sh
├── prompt_utils.py
├── qwen-agent.py
└── vllm_server.py


/README.md:
--------------------------------------------------------------------------------
1 | # agent
2 | 
3 | 基于qwen的agent demo，支持历史对话
4 | 
5 | ## 依赖
6 | 
7 | 基于阿里云百炼的通义千问200B大模型


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Install vLLM with CUDA 11.8.
4 | export VLLM_VERSION=0.2.7
5 | export PYTHON_VERSION=39
6 | pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl -i https://mirrors.aliyun.com/pypi/simple/
7 | 
8 | pip install langchain modelscope tiktoken requests -i https://mirrors.aliyun.com/pypi/simple/


--------------------------------------------------------------------------------
/prompt_utils.py:
--------------------------------------------------------------------------------
 1 | import copy 
 2 | 
 3 | # 按chatml格式构造千问的Prompt
 4 | def _build_prompt(
 5 |                 generation_config,
 6 |                 tokenizer,
 7 |                 query,
 8 |                 history=None,
 9 |                 system=""):
10 |     if history is None:
11 |         history=[]
12 | 
13 |     # 包裹发言内容的token
14 |     im_start,im_start_tokens='<|im_start|>',[tokenizer.im_start_id]
15 |     im_end,im_end_tokens='<|im_end|>',[tokenizer.im_end_id]
16 |     # 换行符token
17 |     nl_tokens=tokenizer.encode("\n")
18 | 
19 |     # 用于编码system/user/assistant的一段发言, 格式{role}\n{content}
20 |     def _tokenize_str(role,content): # 返回元组，下标0是文本，下标1是token ids
21 |         return f"{role}\n{content}",tokenizer.encode(role)+nl_tokens+tokenizer.encode(content)
22 |     
23 |     # 剩余token数
24 |     left_token_space=generation_config.max_window_size
25 | 
26 |     # prompt头部: system发言
27 |     system_text_part,system_tokens_part=_tokenize_str("system", system) # system_tokens_part -->    system\nYou are a helpful assistant.
28 |     system_text=f'{im_start}{system_text_part}{im_end}'
29 |     system_tokens=im_start_tokens+system_tokens_part+im_end_tokens # <|im_start|>system\nYou are a helpful assistant.<|im_end|>
30 |     left_token_space-=len(system_tokens)
31 |     
32 |     # prompt尾部: user发言和assistant引导
33 |     query_text_part,query_tokens_part=_tokenize_str('user', query)
34 |     query_tokens_prefix=nl_tokens+ im_start_tokens
35 |     query_tokens_suffix=im_end_tokens+nl_tokens+im_start_tokens+tokenizer.encode('assistant')+nl_tokens
36 |     if len(query_tokens_prefix)+len(query_tokens_part)+len(query_tokens_suffix)>left_token_space: # query太长截断
37 |         query_token_len=left_token_space-len(query_tokens_prefix)-len(query_tokens_suffix)
38 |         query_tokens_part=query_tokens_part[:query_token_len]
39 |         query_text_part=tokenizer.decode(query_tokens_part)
40 |     query_tokens=query_tokens_prefix+query_tokens_part+query_tokens_suffix
41 |     query_text=f"\n{im_start}{query_text_part}{im_end}\n{im_start}assistant\n"
42 |     left_token_space-=len(query_tokens)
43 |     
44 |     # prompt腰部: 历史user+assitant对话
45 |     history_text,history_tokens='',[]
46 |     for hist_query,hist_response in reversed(history):    # 优先采用最近的对话历史
47 |         hist_query_text,hist_query_tokens_part=_tokenize_str("user",hist_query) # user\n历史提问
48 |         hist_response_text,hist_response_tokens_part=_tokenize_str("assistant",hist_response) # assistant\n历史回答
49 |         # 生成本轮对话
50 |         cur_history_tokens=nl_tokens+im_start_tokens+hist_query_tokens_part+im_end_tokens+nl_tokens+im_start_tokens+hist_response_tokens_part+im_end_tokens
51 |         cur_history_text=f"\n{im_start}{hist_query_text}{im_end}\n{im_start}{hist_response_text}{im_end}"
52 |         # 储存多轮对话
53 |         if len(cur_history_tokens)<=left_token_space:
54 |             history_text=cur_history_text+history_text
55 |             history_tokens=cur_history_tokens+history_tokens
56 |             left_token_space-=len(cur_history_tokens)
57 |         else:
58 |             break 
59 |             
60 |     # 生成完整Prompt
61 |     prompt_str=f'{system_text}{history_text}{query_text}'
62 |     prompt_tokens=system_tokens+history_tokens+query_tokens
63 |     return prompt_str,prompt_tokens
64 | 
65 | # 停用词清理
66 | def remove_stop_words(token_ids,stop_words_ids):
67 |     token_ids=copy.deepcopy(token_ids)
68 |     while len(token_ids)>0:
69 |         if token_ids[-1] in stop_words_ids:
70 |             token_ids.pop(-1)
71 |         else:
72 |             break
73 |     return token_ids


--------------------------------------------------------------------------------
/qwen-agent.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | from langchain_community.tools.tavily_search import TavilySearchResults
  4 | import broadscope_bailian
  5 | import datetime
  6 | 
  7 | def llm(query,history=[],user_stop_words=[]):    # 调用api_server
  8 |     access_key_id=os.environ.get("ACCESS_KEY_ID")
  9 |     access_key_secret=os.environ.get("ACCESS_KEY_SECRET")
 10 |     agent_key=os.environ.get("AGENT_KEY")
 11 |     app_id=os.environ.get("APP_ID")
 12 | 
 13 |     try:
 14 |         messages=[{'role':'system','content':'You are a helpful assistant.'}]
 15 |         for hist in history:
 16 |             messages.append({'role':'user','content':hist[0]})
 17 |             messages.append({'role':'assistant','content':hist[1]})
 18 |         messages.append({'role':'user','content':query})
 19 |         client=broadscope_bailian.AccessTokenClient(access_key_id=access_key_id, access_key_secret=access_key_secret,
 20 |                                                         agent_key=agent_key)
 21 |         resp=broadscope_bailian.Completions(token=client.get_token()).create(
 22 |             app_id=app_id,
 23 |             messages=messages,
 24 |             result_format="message",
 25 |             stop=user_stop_words,
 26 |         )
 27 |         # print(resp)
 28 |         content=resp.get("Data", {}).get("Choices", [])[0].get("Message", {}).get("Content")
 29 |         return content
 30 |     except Exception as e:
 31 |         return str(e)
 32 |     
 33 | # travily搜索引擎
 34 | os.environ['TAVILY_API_KEY']='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'    # travily搜索引擎api key
 35 | tavily=TavilySearchResults(max_results=5)
 36 | tavily.description='这是一个类似谷歌和百度的搜索引擎，搜索知识、天气、股票、电影、小说、百科等都是支持的哦，如果你不确定就应该搜索一下，谢谢！s'
 37 | 
 38 | # 工具列表
 39 | tools=[tavily, ]
 40 | 
 41 | tool_names='or'.join([tool.name for tool in tools])  # 拼接工具名
 42 | tool_descs=[] # 拼接工具详情
 43 | for t in tools:
 44 |     args_desc=[]
 45 |     for name,info in t.args.items():
 46 |         args_desc.append({'name':name,'description':info['description'] if 'description' in info else '','type':info['type']})
 47 |     args_desc=json.dumps(args_desc,ensure_ascii=False)
 48 |     tool_descs.append('%s: %s,args: %s'%(t.name,t.description,args_desc))
 49 | tool_descs='\n'.join(tool_descs)
 50 | 
 51 | prompt_tpl='''Today is {today}. Please Answer the following questions as best you can. You have access to the following tools:
 52 | 
 53 | {tool_descs}
 54 | 
 55 | These are chat history before:
 56 | {chat_history}
 57 | 
 58 | Use the following format:
 59 | 
 60 | Question: the input question you must answer
 61 | Thought: you should always think about what to do
 62 | Action: the action to take, should be one of [{tool_names}]
 63 | Action Input: the input to the action
 64 | Observation: the result of the action
 65 | ... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
 66 | Thought: I now know the final answer
 67 | Final Answer: the final answer to the original input question
 68 | 
 69 | Begin!
 70 | 
 71 | Question: {query}
 72 | {agent_scratchpad}
 73 | '''
 74 | 
 75 | def agent_execute(query,chat_history=[]):
 76 |     global tools,tool_names,tool_descs,prompt_tpl,llm,tokenizer
 77 |     
 78 |     agent_scratchpad='' # agent执行过程
 79 |     while True:
 80 |         # 1）触发llm思考下一步action
 81 |         history='\n'.join(['Question:%s\nAnswer:%s'%(his[0],his[1]) for his in chat_history])
 82 |         today=datetime.datetime.now().strftime('%Y-%m-%d')
 83 |         prompt=prompt_tpl.format(today=today,chat_history=history,tool_descs=tool_descs,tool_names=tool_names,query=query,agent_scratchpad=agent_scratchpad)
 84 |         print('\033[32m---等待LLM返回... ...\n%s\n\033[0m'%prompt,flush=True)
 85 |         response=llm(prompt,user_stop_words=['Observation:'])
 86 |         print('\033[34m---LLM返回---\n%s\n---\033[34m'%response,flush=True)
 87 |         
 88 |         # 2）解析thought+action+action input+observation or thought+final answer
 89 |         thought_i=response.rfind('Thought:')
 90 |         final_answer_i=response.rfind('\nFinal Answer:')
 91 |         action_i=response.rfind('\nAction:')
 92 |         action_input_i=response.rfind('\nAction Input:')
 93 |         observation_i=response.rfind('\nObservation:')
 94 |         
 95 |         # 3）返回final answer，执行完成
 96 |         if final_answer_i!=-1 and thought_i<final_answer_i:
 97 |             final_answer=response[final_answer_i+len('\nFinal Answer:'):].strip()
 98 |             chat_history.append((query,final_answer))
 99 |             return True,final_answer,chat_history
100 |         
101 |         # 4）解析action
102 |         if not (thought_i<action_i<action_input_i):
103 |             return False,'LLM回复格式异常',chat_history
104 |         if observation_i==-1:
105 |             observation_i=len(response)
106 |             response=response+'Observation: '
107 |         thought=response[thought_i+len('Thought:'):action_i].strip()
108 |         action=response[action_i+len('\nAction:'):action_input_i].strip()
109 |         action_input=response[action_input_i+len('\nAction Input:'):observation_i].strip()
110 |         
111 |         # 5）匹配tool
112 |         the_tool=None
113 |         for t in tools:
114 |             if t.name==action:
115 |                 the_tool=t
116 |                 break
117 |         if the_tool is None:
118 |             observation='the tool not exist'
119 |             agent_scratchpad=agent_scratchpad+response+observation+'\n'
120 |             continue 
121 |         
122 |         # 6）执行tool
123 |         try:
124 |             action_input=json.loads(action_input)
125 |             tool_ret=the_tool.invoke(input=json.dumps(action_input))
126 |         except Exception as e:
127 |             observation='the tool has error:{}'.format(e)
128 |         else:
129 |             observation=str(tool_ret)
130 |         agent_scratchpad=agent_scratchpad+response+observation+'\n'
131 | 
132 | def agent_execute_with_retry(query,chat_history=[],retry_times=3):
133 |     for i in range(retry_times):
134 |         success,result,chat_history=agent_execute(query,chat_history=chat_history)
135 |         if success:
136 |             return success,result,chat_history
137 |     return success,result,chat_history
138 | 
139 | my_history=[]
140 | while True:
141 |     query=input('query:')
142 |     success,result,my_history=agent_execute_with_retry(query,chat_history=my_history)
143 |     my_history=my_history[-10:]


--------------------------------------------------------------------------------
/vllm_server.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | from vllm import AsyncEngineArgs,AsyncLLMEngine
  3 | from vllm.sampling_params import SamplingParams
  4 | from modelscope import AutoTokenizer, GenerationConfig,snapshot_download
  5 | from fastapi import FastAPI, Request
  6 | from fastapi.responses import JSONResponse, Response, StreamingResponse
  7 | import uvicorn
  8 | from prompt_utils import _build_prompt,remove_stop_words
  9 | import uuid
 10 | import json 
 11 | 
 12 | # http接口服务
 13 | app=FastAPI()
 14 | 
 15 | # vLLM参数
 16 | model_dir="qwen/Qwen-7B-Chat-Int4"
 17 | tensor_parallel_size=1
 18 | gpu_memory_utilization=0.6
 19 | quantization='gptq'
 20 | dtype='float16'
 21 | 
 22 | # vLLM模型加载
 23 | def load_vllm():
 24 |     global generation_config,tokenizer,stop_words_ids,engine    
 25 |     # 模型下载
 26 |     snapshot_download(model_dir)
 27 |     # 模型基础配置
 28 |     generation_config=GenerationConfig.from_pretrained(model_dir,trust_remote_code=True)
 29 |     # 加载分词器
 30 |     tokenizer=AutoTokenizer.from_pretrained(model_dir,trust_remote_code=True)
 31 |     tokenizer.eos_token_id=generation_config.eos_token_id
 32 |     # 推理终止词
 33 |     stop_words_ids=[tokenizer.im_start_id,tokenizer.im_end_id,tokenizer.eos_token_id]
 34 |     # vLLM基础配置
 35 |     args=AsyncEngineArgs(model_dir)
 36 |     args.worker_use_ray=False
 37 |     args.engine_use_ray=False
 38 |     args.tokenizer=model_dir
 39 |     args.tensor_parallel_size=tensor_parallel_size
 40 |     args.trust_remote_code=True
 41 |     args.quantization=quantization
 42 |     args.gpu_memory_utilization=gpu_memory_utilization
 43 |     args.dtype=dtype
 44 |     args.max_num_seqs=20    # batch最大20条样本
 45 |     # 加载模型
 46 |     os.environ['VLLM_USE_MODELSCOPE']='True'
 47 |     engine=AsyncLLMEngine.from_engine_args(args)
 48 |     return generation_config,tokenizer,stop_words_ids,engine
 49 | 
 50 | generation_config,tokenizer,stop_words_ids,engine=load_vllm()
 51 | 
 52 | # 用户停止句匹配
 53 | def match_user_stop_words(response_token_ids,user_stop_tokens):
 54 |     for stop_tokens in user_stop_tokens:
 55 |         if len(response_token_ids)<len(stop_tokens):
 56 |             continue 
 57 |         if response_token_ids[-len(stop_tokens):]==stop_tokens:
 58 |             return True  # 命中停止句, 返回True
 59 |     return False
 60 | 
 61 | # chat对话接口
 62 | @app.post("/chat")
 63 | async def chat(request: Request):
 64 |     request=await request.json()
 65 |     
 66 |     query=request.get('query',None)
 67 |     history=request.get('history',[])
 68 |     system=request.get('system','You are a helpful assistant.')
 69 |     stream=request.get("stream",False)
 70 |     user_stop_words=request.get("user_stop_words",[])    # list[str]，用户自定义停止句，例如：['Observation: ', 'Action: ']定义了2个停止句，遇到任何一个都会停止
 71 |     
 72 |     if query is None:
 73 |         return Response(status_code=502,content='query is empty')
 74 | 
 75 |     # 用户停止词
 76 |     user_stop_tokens=[]
 77 |     for words in user_stop_words:
 78 |         user_stop_tokens.append(tokenizer.encode(words))
 79 |     
 80 |     # 构造prompt
 81 |     prompt_text,prompt_tokens=_build_prompt(generation_config,tokenizer,query,history=history,system=system)
 82 |         
 83 |     # vLLM请求配置
 84 |     sampling_params=SamplingParams(stop_token_ids=stop_words_ids, 
 85 |                                     early_stopping=False,
 86 |                                     top_p=generation_config.top_p,
 87 |                                     top_k=-1 if generation_config.top_k == 0 else generation_config.top_k,
 88 |                                     temperature=generation_config.temperature,
 89 |                                     repetition_penalty=generation_config.repetition_penalty,
 90 |                                     max_tokens=generation_config.max_new_tokens)
 91 |     # vLLM异步推理（在独立线程中阻塞执行推理，主线程异步等待完成通知）
 92 |     request_id=str(uuid.uuid4().hex)
 93 |     results_iter=engine.generate(prompt=None,sampling_params=sampling_params,prompt_token_ids=prompt_tokens,request_id=request_id)
 94 |     
 95 |     # 流式返回，即迭代transformer的每一步推理结果并反复返回
 96 |     if stream:
 97 |         async def streaming_resp():
 98 |             async for result in results_iter:
 99 |                 # 移除im_end,eos等系统停止词
100 |                 token_ids=remove_stop_words(result.outputs[0].token_ids,stop_words_ids)
101 |                 # 返回截止目前的tokens输出                
102 |                 text=tokenizer.decode(token_ids)
103 |                 yield (json.dumps({'text':text})+'\0').encode('utf-8')
104 |                 # 匹配用户停止词,终止推理
105 |                 if match_user_stop_words(token_ids,user_stop_tokens):
106 |                     await engine.abort(request_id)   # 终止vllm后续推理
107 |                     break
108 |         return StreamingResponse(streaming_resp())
109 | 
110 |     # 整体一次性返回模式
111 |     async for result in results_iter:
112 |         # 移除im_end,eos等系统停止词
113 |         token_ids=remove_stop_words(result.outputs[0].token_ids,stop_words_ids)
114 |         # 返回截止目前的tokens输出                
115 |         text=tokenizer.decode(token_ids)
116 |         # 匹配用户停止词,终止推理
117 |         if match_user_stop_words(token_ids,user_stop_tokens):
118 |             await engine.abort(request_id)   # 终止vllm后续推理
119 |             break
120 | 
121 |     ret={"text":text}
122 |     return JSONResponse(ret)
123 | 
124 | if __name__=='__main__':
125 |     uvicorn.run(app,
126 |                 host=None,
127 |                 port=8000,
128 |                 log_level="debug")


--------------------------------------------------------------------------------