├── Res2Net ├── __init__.py ├── config.py ├── main.py ├── data_utils.py └── res2next.py ├── 异步测试 ├── bench_singlethread.html ├── bench_singlethread.txt ├── test.png ├── images │ ├── test.png │ ├── aiohttp_rate2000.png │ ├── singlthread_rate2000.png │ ├── multiprocess_4core_rate2000.png │ └── multithread_10thread_rate2000.png ├── requirements.txt ├── test_aiohttp_api_server.sh ├── gunicorn_flask_api_server.sh ├── bench_aiohttp.txt ├── bench_multiprocess.txt ├── test_flask_api_server.py ├── README.md ├── test_flask_api_server_v2.py ├── aiohttp_api_server.py ├── benchmark.sh ├── quart_api_server.py ├── flask_api_server.py └── flask_api_server_v2.py ├── text_classsification ├── nets │ ├── __init__.py │ ├── metric.py │ ├── text_rnn.py │ ├── text_cnn.py │ ├── text_rnn_improve.py │ ├── text_rnn_improve2.py │ ├── text_transformer.py │ ├── text_cnn_rnn.py │ ├── text_rnn_transformer.py │ ├── text_cnn_transformer.py │ ├── base_model.py │ └── text_adversarial_rnn_improve.py ├── utils │ ├── __init__.py │ ├── network_utils.py │ ├── data_helpers.py │ └── vocabulary_utils.py ├── Readme.md └── eval.py ├── unsuper.png ├── translate ├── 翻译对照表总表.xlsx ├── data │ ├── custom_zh_en.py │ └── sacrebleu.py └── do_translate.py ├── TextCNN ├── Readme.md ├── config.py ├── main.py ├── data_utils.py └── model_utils.py ├── README.md ├── .gitignore ├── langconv.py ├── LICENSE └── unsuper_classification.py /Res2Net/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /异步测试/bench_singlethread.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /异步测试/bench_singlethread.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text_classsification/nets/__init__.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | -------------------------------------------------------------------------------- /text_classsification/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | -------------------------------------------------------------------------------- /unsuper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/unsuper.png -------------------------------------------------------------------------------- /异步测试/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/test.png -------------------------------------------------------------------------------- /异步测试/images/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/test.png -------------------------------------------------------------------------------- /translate/翻译对照表总表.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/translate/翻译对照表总表.xlsx -------------------------------------------------------------------------------- /异步测试/images/aiohttp_rate2000.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/aiohttp_rate2000.png -------------------------------------------------------------------------------- /异步测试/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.6.2 2 | Flask==1.1.2 3 | requests==2.24.0 4 | urllib3==1.25.9 5 | Werkzeug==1.0.1 6 | -------------------------------------------------------------------------------- /异步测试/images/singlthread_rate2000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/singlthread_rate2000.png -------------------------------------------------------------------------------- /异步测试/images/multiprocess_4core_rate2000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/multiprocess_4core_rate2000.png -------------------------------------------------------------------------------- /异步测试/images/multithread_10thread_rate2000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/multithread_10thread_rate2000.png -------------------------------------------------------------------------------- /text_classsification/nets/metric.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | from collections import namedtuple 4 | 5 | 6 | class Metrics(namedtuple('Metrics', 7 | ['accuracy', 'recall', 'f1'])): 8 | pass 9 | -------------------------------------------------------------------------------- /异步测试/test_aiohttp_api_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #测试上传文件接口 3 | echo "Test upload image" 4 | curl -X POST -H "token:ff1c1eef10cad322ddbcd842a952b46c" -H "timestamp:1592805495.355849" -H "sign:c8499156ae8acc3f2d6a1453b9315eb1" -F "image=@test.png" http://127.0.0.1:5000/upload 5 | 6 | -------------------------------------------------------------------------------- /TextCNN/Readme.md: -------------------------------------------------------------------------------- 1 | ### 主要目录结构 2 | ``` 3 | ├── Readme.md 4 | ├── config.py #模型配置文件 5 | ├── data #数据集,包括训练数据和验证数据, 每个子文件是一个类别,里面放对于文本 6 | ├── main.py #模型运行入口,里面主要包含3个函数,分别是训练,测试,和实际运行的预测接口predict 7 | ├── model #保存TextCNN模型和生成的字典 8 | ├── model_utils.py #TextCNN模型文件 9 | └── data_utils.py #文本预处理模块 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /text_classsification/Readme.md: -------------------------------------------------------------------------------- 1 | 1. data:文件夹中存储的主要是数据 2 | 2. nets: package,主要存储网络结构模型代码 3 | text_cnn: TextCNN做文本分类 4 | 3. utils: package,主要存储工具函数相关的代码 5 | data_helpers.py: 数据加载、批次构建相关函数代码 6 | network_utils.py: 优化器参数构建相关代码 7 | vocabulary_utils.py: 词汇转换相关代码 8 | 4. train.py: 模型训练入口函数 9 | 5. eval.py: 模型效果评估的入口函数 10 | 6. graph:模型执行可视化文件保存的文件夹 11 | 7. model:模型持久化保存的文件夹 12 | 8. 
deploy:模型部署相关package -------------------------------------------------------------------------------- /异步测试/gunicorn_flask_api_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PORT=5000 3 | SCRIPT=flask_api_server 4 | #进程数量 5 | WORKER=${WORKER:-2} 6 | #默认多线程个数10 7 | THREAD=${THREAD:-10} 8 | 9 | #是否使用多线程模式, 默认多线程 10 | USERTHREAD=${USERTHREAD:-0} 11 | 12 | if [ $USERTHREAD -eq 0 ]; then 13 | echo "启用多线程,线程个数 $THREAD" 14 | gunicorn -b localhost:$PORT -w $WORKER --threads $THREAD $SCRIPT:app 15 | else 16 | echo "启用多进程, 进程个数 $WORKER" 17 | gunicorn -b localhost:$PORT -w $WORKER $SCRIPT:app 18 | fi -------------------------------------------------------------------------------- /异步测试/bench_aiohttp.txt: -------------------------------------------------------------------------------- 1 | Requests [total, rate, throughput] 6000, 2000.47, 2000.25 2 | Duration [total, attack, wait] 3s, 2.999s, 320.618µs 3 | Latencies [min, mean, 50, 90, 95, 99, max] 290.812µs, 414.588µs, 344.785µs, 481.47µs, 591.848µs, 2.359ms, 5.04ms 4 | Bytes In [total, mean] 24000, 4.00 5 | Bytes Out [total, mean] 36000, 6.00 6 | Success [ratio] 100.00% 7 | Status Codes [code:count] 200:6000 8 | Error Set: 9 | -------------------------------------------------------------------------------- /异步测试/bench_multiprocess.txt: -------------------------------------------------------------------------------- 1 | Requests [total, rate, throughput] 6000, 2000.50, 1999.61 2 | Duration [total, attack, wait] 3.001s, 2.999s, 1.336ms 3 | Latencies [min, mean, 50, 90, 95, 99, max] 668.591µs, 1.377ms, 955.315µs, 2.313ms, 3.528ms, 5.872ms, 9.594ms 4 | Bytes In [total, mean] 96000, 16.00 5 | Bytes Out [total, mean] 36000, 6.00 6 | Success [ratio] 100.00% 7 | Status Codes [code:count] 200:6000 8 | Error Set: 9 | -------------------------------------------------------------------------------- /TextCNN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class Config(object): 4 | """Base configuration class.""" 5 | #训练文件夹位置 6 | train_dir = "data/train" 7 | #评估文件夹位置 8 | eval_dir = "data/eval" 9 | #模型的保存位置 10 | save_path='model/' 11 | #是否使用gpu 12 | cuda = True 13 | #训练的epoch 14 | epochs = 2 15 | batch_size = 64 16 | #学习率 17 | learning_rate = 0.001 18 | #学习率动量 19 | learning_momentum = 0.9 20 | #学习率衰减稀疏 21 | weight_decay = 0.0001 22 | dropout = 0.5 23 | #生成的词嵌入的维度 24 | embed_dim = 128 25 | #卷积核的数量 26 | kernel_num = 100 27 | #卷积核的尺寸 28 | kernel_sizes = "3,4,5" 29 | #训练多少个epoch时,模型保存 30 | save_interval = 2 31 | 32 | #初始化,是否使用gpu 33 | def __init__(self): 34 | if self.cuda: 35 | self.cuda = torch.cuda.is_available() 36 | self.device = torch.device("cuda:0" if self.cuda else "cpu") 37 | 38 | def dump(self): 39 | """打印配置信息""" 40 | print("模型配置如下:") 41 | for a in dir(self): 42 | if not a.startswith("__") and not callable(getattr(self, a)): 43 | print("\t{:30} = {}".format(a, getattr(self, a))) 44 | print() 45 | -------------------------------------------------------------------------------- /异步测试/test_flask_api_server.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | import unittest 4 | import requests 5 | 6 | url = 'http://127.0.0.1:5000' 7 | #确保Flask server已经启动 8 | 9 | def get_header(): 10 | headers = { 11 | 'token': 'ff1c1eef10cad322ddbcd842a952b46c', 12 | 'timestamp': '1592805495.355849', 13 | 'sign': 'c8499156ae8acc3f2d6a1453b9315eb1' 14 | } 15 | return headers 
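# Hedged alternative, not used by the tests below: instead of relying on the credential
# triple that is pre-seeded in the server's Tokens list, the headers could be fetched from
# the /token endpoint of flask_api_server.py, which returns a freshly generated
# token/timestamp/sign set (test_flask_api_server_v2.py takes the same approach with /gentoken).
def get_header_from_server():
    """Build auth headers from a token issued by the running flask_api_server."""
    res = requests.get(url + '/token').json()
    return {
        'token': res['token'],
        'timestamp': res['timestamp'],
        'sign': res['sign']
    }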
16 | 17 | class NamesTestCase(unittest.TestCase): 18 | def test_get_echo(self): 19 | """测试服务器正常启动""" 20 | r = requests.get(url +'/echo') 21 | self.assertEqual(r.status_code, 200) 22 | 23 | def test_get_token(self): 24 | """测试获取token""" 25 | r = requests.get(url+'/token') 26 | self.assertEqual(r.status_code, 200) 27 | self.assertIn('sign', r.json()) 28 | self.assertIn('timestamp', r.json()) 29 | self.assertIn('token', r.json()) 30 | 31 | def test_try_process(self): 32 | """测试多进程""" 33 | headers = get_header() 34 | r = requests.get(url + '/process', headers=headers) 35 | self.assertEqual(r.status_code, 200) 36 | self.assertTrue(r.json()['result']) 37 | 38 | def test_try_upload(self): 39 | """测试用获取的token上传图片""" 40 | with open("test.png", 'rb') as img: 41 | files = { 42 | 'image': img 43 | } 44 | headers = get_header() 45 | r = requests.post(url + '/upload', headers=headers, files=files) 46 | self.assertEqual(r.status_code, 200) 47 | 48 | if __name__ == '__main__': 49 | ##确保Flask server已经启动 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /Res2Net/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class Config(object): 4 | """Base configuration class.""" 5 | #指定包含训练集,验证集合测试集的文件夹 6 | data_directory = "data/zhengjian/" 7 | #模型的保存位置 8 | save_path='model/' 9 | #模型保存名称 10 | save_name='checkpoint.pth' 11 | #使用哪个模型类型,可选 ['densenet161', 'resnet18', 'vgg16', 'res2next50'] 12 | arch = 'res2next50' 13 | # classifier的 隐藏层数, 可以任意个[1024,512,256],每个是一个FC 14 | hidden_units = [256] 15 | # 评估间隔, 训练多少个epoch,进行一次评估 16 | eval_interval = 100 17 | # 是否绘图还是直接返回结果 18 | plot = False 19 | # 绘图显示的预测个数, 需要是偶数个 20 | plot_image = 6 21 | #是否使用gpu 22 | cuda = False 23 | #device name ,如果使用cpu,那么就是cpu,如果使用gpu, 可能是第几块显卡cuda:0 24 | device_name = 'cpu' 25 | #训练的epoch 26 | epochs = 2 27 | batch_size = 64 28 | #学习率 29 | learning_rate = 0.001 30 | #学习率动量 31 | learning_momentum = 0.9 32 | #学习率衰减稀疏 33 | weight_decay = 0.0001 34 | dropout = 0.5 35 | #生成的词嵌入的维度 36 | embed_dim = 128 37 | #卷积核的数量 38 | kernel_num = 100 39 | #卷积核的尺寸 40 | kernel_sizes = "3,4,5" 41 | #训练多少个epoch时,模型保存 42 | save_interval = 2 43 | 44 | #初始化,是否使用gpu 45 | def __init__(self): 46 | if self.cuda: 47 | self.cuda = torch.cuda.is_available() 48 | self.device = torch.device("cuda:0" if self.cuda else "cpu") 49 | 50 | def dump(self): 51 | """打印配置信息""" 52 | print("模型配置如下:") 53 | for a in dir(self): 54 | if not a.startswith("__") and not callable(getattr(self, a)): 55 | print("\t{:30} = {}".format(a, getattr(self, a))) 56 | print() 57 | -------------------------------------------------------------------------------- /异步测试/README.md: -------------------------------------------------------------------------------- 1 | # API 2 | 3 | #### Flask 版本介绍 4 | - flask_api_server.py 是Flask的版本 5 | - 可以使用gunicorn_flask_api_server.sh启动 6 | - 使用test_flask_api_server.py进行单元测试 7 | 8 | ### 异步版本aiohttp 9 | - aiohttp_api_server.py 是基于aiohttp的异步版本 10 | - 使用test_aiohttp_api_server.sh测试 11 | 12 | 13 | ### 压力测试Benchmark 14 | * 使用的工具是: Vegeta, Star是14.8, https://github.com/tsenart/vegeta 15 | * 当前环境:Mac OS i5, 1.4GHZ, 16GB, SSD 16 | 17 | * 测试脚本, 自动测试多进程,多线程,和异步的serverd的对比结果,结果保存为2种形式,分别为html图片格式,和txt文本格式 18 | `benchmark.sh` 19 | 20 | 21 | #### 对比结果 22 | 以下是每秒请求2000次,测试3秒的结果对比, 根据每个人的电脑或服务器的性能不同,结果也不同,这是一个相对的结果 23 | - 单线程每秒超过1000次请求时,已经无力,超2000时已经超时 24 | ![singlethread](images/singlthread_rate2000.png) 25 | - 多核,4core 对比 26 | 
![multiprocess](images/multiprocess_4core_rate2000.png) 27 | - 多线程,单核10thread 对比 28 | ![multithread](images/multithread_10thread_rate2000.png) 29 | - aiohttp异步对比 30 | ![aiohttp](images/aiohttp_rate2000.png) 31 | 32 | - 异步的延迟 rate 2000/s, 成功率100% 33 | ``` 34 | Latencies [min, mean, 50, 90, 95, 99, max] 290.812µs, 414.588µs, 344.785µs, 481.47µs, 591.848µs, 2.359ms, 5.04ms 35 | ``` 36 | - 多线程的延迟 rate 2000/s, 成功率 75.38% 37 | ``` 38 | Latencies [min, mean, 50, 90, 95, 99, max] 261.772µs, 4.391s, 799.417ms, 28.908s, 30s, 30.001s, 30.005s 39 | ``` 40 | 41 | ### 当增加请求频率到2500/s时,CPU出现瓶颈, 42 | - 异步的成功率在31.06% 43 | ``` 44 | Latencies [min, mean, 50, 90, 95, 99, max] 5.224ms, 13.68s, 8.096s, 30s, 30.001s, 30.002s, 30.005s 45 | Success [ratio] 31.06% 46 | ``` 47 | - 多线程的成功率在16.26% 48 | ``` 49 | Latencies [min, mean, 50, 90, 95, 99, max] 13.181ms, 26.705s, 30s, 30.001s, 30.001s, 30.003s, 30.006s 50 | Success [ratio] 16.26% 51 | ``` 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP 2 | 3 | ## 欢迎来我的知乎博客提问: https://www.zhihu.com/people/be_with_you 4 | 5 | ## News 6 | translate 域适应翻译模型, 微调模型命令 7 | ```buildoutcfg 8 | python run_translation.py --model_name_or_path 9 | facebook/m2m100_418M 10 | --do_train 11 | --do_eval 12 | --fp16 True 13 | --dataset_name custom_zh_en 14 | --source_lang zh 15 | --target_lang en 16 | --output_dir output/zh-en-translation 17 | --per_device_train_batch_size=8 18 | --per_device_eval_batch_size=8 19 | --overwrite_output_dir 20 | --predict_with_generate 21 | ``` 22 | 23 | ## 主要使用tfidf+doc2vec+albert实现无监督文本分类, 代码有些粗鄙,有些问题已经在代码中标注,我会逐渐修改完善,欢迎任何意见和建议! 24 | 实际测试准确率不高,还是需要结合人工制定规则+有监督训练,在结合部分无监督训练可能有更好效果 25 | 26 | ### 无监督分类算法,使用tfidf, doc2vec, albert 27 | ![构思图](unsuper.png) 28 | ### python文件 29 | unsuper_classification.py 30 | 31 | 32 | ## TextCNN 通用文本分类模型 33 | TextCNN文件夹 34 | 注意:数据目录下的每个文件夹放好对应的要训练的目录,目录里面放好单个文件就可以 35 | /data/train/ 36 | ``` 37 | ├── Readme.md 38 | ├── config.py 39 | ├── data 40 | │   ├── eval 41 | │   │   ├── 新闻 42 | │   │   ├── 科技 43 | │   │   └── 天气 44 | │   ├── predict 45 | │   └── train 46 | │   ├── 新闻 47 | │   ├── 科技 48 | │   └── 天气 49 | ├── data_utils.py 50 | ├── main.py 51 | ├── model 52 | └── model_utils.py 53 | ``` 54 | 55 | ## Res2Net 通用图片分类模型 56 | Res2Net文件夹 57 | ``` 58 | ├── __init__.py 59 | ├── config.py 60 | ├── data 61 | │   ├── eval 62 | │   │   ├── 狗 63 | │   │   ├── 猫 64 | │   │   ├── 鸡 65 | │   │   └── 鸭 66 | │   ├── predict 67 | │   ├── test 68 | │   │   ├── 狗 69 | │   │   ├── 猫 70 | │   │   ├── 鸡 71 | │   │   └── 鸭 72 | │   └── train 73 | │   ├── 狗 74 | │   ├── 猫 75 | │   ├── 鸡 76 | │   └── 鸭 77 | ├── data_utils.py #数据处理 78 | ├── main.py #程序入口,支持训练,预测,继续训练 79 | ├── model #保存模型 80 | ├── model_utils.py #模型处理 81 | └── res2next.py #res2next模型 82 | ``` 83 | -------------------------------------------------------------------------------- /异步测试/test_flask_api_server_v2.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import requests 3 | import time 4 | 5 | url = 'http://127.0.0.1:5000' 6 | #确保Flask server已经启动 7 | 8 | r = requests.get(url + '/gentoken') 9 | res = r.json() 10 | headers = { 11 | 'token': res['token'], 12 | 'timestamp': res['timestamp'], 13 | 'sign': res['sign'] 14 | } 15 | 16 | class FlaskTestCase(unittest.TestCase): 17 | def test_get_echo(self): 18 | """测试服务器正常启动""" 19 | r = requests.get(url +'/echo') 20 | self.assertEqual(r.status_code, 200) 
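    # Hedged helper, not called by the original tests: test_try_upload_async further down
    # waits a fixed 3 seconds before reading /upload_async_result. Because the server
    # answers with code == 1 while recognition is still running and code == 0 once it is
    # done, a test could instead poll until the task finishes (endpoint and field names
    # taken from flask_api_server_v2.py):
    def wait_for_async_result(self, timeout=10, interval=0.5):
        """Poll /upload_async_result until the server reports code == 0 or the timeout expires."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            r = requests.get(url + '/upload_async_result', headers=headers)
            if r.status_code == 200 and r.json().get('code') == 0:
                return r.json()
            time.sleep(interval)
        raise TimeoutError('upload_async result was not ready within {} seconds'.format(timeout))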
21 | 22 | def test_get_token(self): 23 | """测试获取token""" 24 | r = requests.get(url+'/gentoken') 25 | self.assertEqual(r.status_code, 200) 26 | self.assertIn('sign', r.json()) 27 | self.assertIn('timestamp', r.json()) 28 | self.assertIn('token', r.json()) 29 | 30 | def test_try_upload_sync(self): 31 | """上传图片文件,并立即处理,返回成功结果""" 32 | with open("test.png", 'rb') as img: 33 | files = { 34 | 'image': img 35 | } 36 | r = requests.post(url + '/upload_sync', headers=headers, files=files) 37 | self.assertEqual(r.status_code, 200) 38 | self.assertEqual(r.json()['code'], 0) 39 | 40 | def test_try_upload_async(self): 41 | """测试上传图片,不能立即处理完成,先返回成功处理的页面,等待用户调取""" 42 | with open("test.png", 'rb') as img: 43 | files = { 44 | 'image': img 45 | } 46 | r = requests.post(url + '/upload_async', headers=headers, files=files) 47 | self.assertEqual(r.status_code, 200) 48 | time.sleep(3) 49 | r = requests.get(url+'/upload_async_result', headers=headers) 50 | self.assertEqual(r.status_code, 200) 51 | self.assertEqual(r.json()['code'], 0) 52 | 53 | if __name__ == '__main__': 54 | ##确保Flask server已经启动 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /异步测试/aiohttp_api_server.py: -------------------------------------------------------------------------------- 1 | from aiohttp import web 2 | from typing import List 3 | import os 4 | 5 | 6 | Tokens = [ 7 | {"sign": "c8499156ae8acc3f2d6a1453b9315eb1", "timestamp": "1592805495.355849", "token": "ff1c1eef10cad322ddbcd842a952b46c"} 8 | ] 9 | UPLOAD_FOLDER = 'images/' 10 | 11 | def verify(token:str, timestamp:str ,sign:str)-> bool: 12 | """ 13 | 验证token,timestamp,sign 14 | :param token: token 15 | :param timestamp: 时间戳 16 | :param sign: 签名 17 | :return: bool 18 | """ 19 | for tk in Tokens: 20 | tkv = tk.values() 21 | if token in tkv and timestamp in tkv and sign in tkv: 22 | return True 23 | return False 24 | 25 | async def authorize(request: web.Request): 26 | if not 'token' in request.headers or not 'timestamp' in request.headers or not 'sign' in request.headers: 27 | return web.HTTPForbidden() 28 | 29 | token = request.headers['token'] 30 | timestamp = request.headers['timestamp'] 31 | sign = request.headers['sign'] 32 | if not verify(token,timestamp,sign): 33 | return web.HTTPForbidden() 34 | request.transport.write(b"HTTP/1.1 100 Continue\r\n\r\n") 35 | 36 | async def handle(request): 37 | name = request.match_info.get('name', "Anonymous") 38 | text = "Hello, " + name 39 | return web.Response(text=text) 40 | 41 | async def echo(request): 42 | return web.Response(text='echo') 43 | 44 | async def upload(request): 45 | post = await request.post() 46 | image = post.get("image") 47 | with open(UPLOAD_FOLDER + image.filename, 'wb') as file: 48 | file.write(image.file.read()) 49 | return web.json_response({'result': 'success upload'}) 50 | 51 | app = web.Application() 52 | app.add_routes([web.get('/', handle), 53 | web.post('/upload', upload, expect_handler=authorize), 54 | web.get('/echo', echo)]) 55 | 56 | if __name__ == '__main__': 57 | web.run_app(app, port=5000, host='0.0.0.0' ) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .idea 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 
19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /异步测试/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #每秒发送请求次数 3 | RATE=2000/s 4 | #测试时间,测试,总测试请求RATE*DURATION 5 | DURATION=3s 6 | 7 | #如果使用多线程,线程个数 8 | THREAD=10 9 | #如果使用多进程,进程个数 10 | WORKER=4 11 | 12 | #单核单线程 13 | flaskbench_singlethread(){ 14 | title=bench_singlethread 15 | echo "启动单核单线程server" 16 | python flask_api_server.py & > /dev/null 2>&1 17 | # ps aux | grep flask | grep -v grep > bench_singlethread.html 18 | echo "开始压测并生成结果" 19 | $1 $title 20 | pkill python 21 | echo "关闭server" 22 | } 23 | 24 | #单核10线程 25 | flaskbench_multithread(){ 26 | title=bench_multithread 27 | echo "启动单核多线程server" 28 | bash gunicorn_flask_api_server.sh & 29 | # ps aux | grep flask | grep -v grep > bench_multithread.html 30 | #等待gunicorn完全启动 31 | sleep 1 32 | echo "开始压测并生成结果" 33 | $1 $title 34 | pkill python 35 | sleep 1 36 | echo "关闭server" 37 | } 38 | 39 | #多核单线程 40 | flaskbench_multiprocess(){ 41 | title=bench_multiprocess 42 | #禁用线程FLAG,改用多核 43 | USERTHREAD=1 44 | echo "启动多核单线程 gunicorn_flask_api_server server" 45 | bash gunicorn_flask_api_server.sh & 46 | # ps aux | grep flask | grep -v grep > bench_multiprocess.html 47 | #等待gunicorn完全启动 48 | sleep 1 49 | echo "开始压测并生成结果" 50 | $1 $title 51 | pkill python 52 | sleep 1 53 
| echo "关闭server" 54 | } 55 | 56 | #ayncio异步server测试 57 | flaskbench_aiohttp(){ 58 | title=bench_aiohttp 59 | #禁用线程FLAG,改用多核 60 | echo "启动aiohttp_api_server server" 61 | python aiohttp_api_server.py & > /dev/null 2>&1 62 | # ps aux | grep flask | grep -v grep > bench_multiprocess.html 63 | #等待gunicorn完全启动 64 | sleep 1 65 | echo "开始压测并生成结果" 66 | $1 $title 67 | pkill python 68 | sleep 1 69 | echo "关闭server" 70 | } 71 | 72 | #无负载压力测试 73 | noload(){ 74 | jq -ncM '{method: "GET", url: "http://127.0.0.1:5000/echo", body: "Hello!" | @base64 }' | vegeta attack -format=json -rate=$RATE -duration=$DURATION > results.bin 75 | cat results.bin | vegeta report > $1.txt 76 | cat results.bin | vegeta plot -title $1 > $1.html 77 | rm results.bin 78 | } 79 | 80 | #加上传图片负载时的测试结果, 图片的上传方法暂时无文档,todo 81 | uploadimage(){ 82 | jq -ncM '{method: "POST", url: "http://127.0.0.1:5000/upload", file:"image=@test.png", header: {"token": "ff1c1eef10cad322ddbcd842a952b46c", "timestamp":"1592805495.355849", "sign":"c8499156ae8acc3f2d6a1453b9315eb1"} }' | vegeta attack -format=json -rate=$RATE -duration=$DURATION > results.bin 83 | cat results.bin | vegeta report > $1.txt 84 | cat results.bin | vegeta plot -title $1 > $1.html 85 | rm results.bin 86 | } 87 | 88 | #flaskbench_singlethread noload 89 | flaskbench_multithread noload 90 | #flaskbench_multiprocess noload 91 | flaskbench_aiohttp noload -------------------------------------------------------------------------------- /text_classsification/utils/network_utils.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def build_optimizer_parameters_func(flags): 7 | """ 8 | 构建优化器的参数 9 | :param flags: 10 | :return: 11 | """ 12 | optimizer_type = flags.optimizer_type 13 | parameters = {} 14 | 15 | # 添加各自不同优化器对应的参数 16 | if optimizer_type == 'adam': 17 | parameters['beta1'] = flags.adam_beta1 18 | parameters['beta2'] = flags.adam_beta2 19 | parameters['epsilon'] = flags.adam_epsilon 20 | elif optimizer_type == 'momentum': 21 | parameters['momentum'] = flags.momentum 22 | 23 | def build_optimizer_parameters(global_step): 24 | # 添加共同参数: learning_rate 25 | learning_rate_type = flags.learning_rate_type 26 | base_learning_rate = flags.base_learning_rate 27 | if learning_rate_type == 'exponential': 28 | tf.logging.info("使用指数变化学习率形式.....") 29 | # staircase=False:decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) 30 | # staircase=True:decayed_learning_rate = learning_rate * decay_rate ^ int(global_step / decay_steps) 31 | lr = tf.train.exponential_decay( 32 | learning_rate=base_learning_rate, # 基础学习率 33 | global_step=global_step, # 迭代的步数 34 | decay_steps=flags.lr_decay_steps, # 间隔大小 35 | decay_rate=flags.lr_decay_rate, # 缩放比例 36 | staircase=flags.lr_staircase, # 是否整间隔的进行缩放 37 | name="exponential_learning_rate") 38 | pass 39 | elif learning_rate_type == 'polynomial': 40 | tf.logging.info("使用多项式变化学习率形式.....") 41 | # global_step = min(global_step, decay_steps) 42 | # decayed_learning_rate = (learning_rate - end_learning_rate) * (1 - global_step / decay_steps) ^ (power) + end_learning_rate 43 | lr = tf.train.polynomial_decay( 44 | learning_rate=base_learning_rate, # 基础学习率 45 | global_step=global_step, # 迭代的步数 46 | decay_steps=flags.lr_decay_steps, # 间隔大小 47 | end_learning_rate=flags.end_learning_rate, # 最终学习率大小 48 | power=1.0, # 给定是否的时候是否是线性的系数 49 | cycle=True, # 当学习率为最小值的时候,是否将学习率重置设置比较大,然后再进行学习率下降的操作 50 | name="polynomial_learning_rate") 51 | else: 52 | 
tf.logging.info("使用常数不变的学习率.....") 53 | lr = tf.constant(base_learning_rate, name='lr') 54 | parameters['learning_rate'] = lr 55 | tf.summary.scalar('learning_rate', lr) 56 | 57 | return parameters 58 | 59 | return build_optimizer_parameters 60 | -------------------------------------------------------------------------------- /text_classsification/utils/data_helpers.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import numpy as np 4 | import re 5 | 6 | 7 | # 清洗字符串,字符切分 8 | def clean_str(string): 9 | """ 10 | Tokenization/string cleaning for all datasets except for SST. 11 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 12 | """ 13 | string = re.sub(r"[^\u4e00-\u9fa5A-Za-z0-9(),.!?,。?!、“”\'\`]", " ", string) # 考虑到中文 14 | string = re.sub(r"\'s", " \'s", string) 15 | string = re.sub(r"\'ve", " \'ve", string) 16 | string = re.sub(r"n\'t", " n\'t", string) 17 | string = re.sub(r"\'re", " \'re", string) 18 | string = re.sub(r"\'d", " \'d", string) 19 | string = re.sub(r"\'ll", " \'ll", string) 20 | string = re.sub(r",", " , ", string) 21 | string = re.sub(r"!", " ! ", string) 22 | string = re.sub(r"\(", " \( ", string) 23 | string = re.sub(r"\)", " \) ", string) 24 | string = re.sub(r"\?", " \? ", string) 25 | string = re.sub(r"\s{2,}", " ", string) 26 | return string.strip().lower() 27 | 28 | 29 | def load_data_and_labels(positive_data_file, negative_data_file): 30 | """ 31 | 基于给定的正例和负例文件路径加载数据 32 | :param positive_data_file: 33 | :param negative_data_file: 34 | :return: 35 | """ 36 | # 1. 加载所有数据组成list列表 37 | positive = open(positive_data_file, 'rb').read().decode('utf-8') 38 | negative = open(negative_data_file, 'rb').read().decode('utf-8') 39 | 40 | # 2.数据的划分(转换成一个一个样本) 41 | positive = positive.split("\n") 42 | negative = negative.split("\n") 43 | 44 | # 3. 数据简单处理 45 | positive = [clean_str(s.strip()) for s in positive] 46 | negative = [clean_str(s.strip()) for s in negative] 47 | positive = [s for s in positive if len(s) > 0] 48 | negative = [s for s in negative if len(s) > 0] 49 | 50 | # 4. 数据合并得到x 51 | texts = positive + negative 52 | 53 | # 5. 得到对应的id 54 | labels = [1] * len(positive) + [0] * len(negative) 55 | 56 | # 6. 
结果返回 57 | return np.asarray(texts), np.asarray(labels) 58 | 59 | 60 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 61 | """ 62 | 基于给定的data数据获取批次数据 63 | :param data: 64 | :param batch_size: 65 | :param num_epochs: 66 | :param shuffle: 67 | :return: 68 | """ 69 | data = np.array(data) 70 | data_size = len(data) 71 | # 一个epoch里面有多少个bachsize 72 | num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1 73 | for epoch in range(num_epochs): 74 | # Shuffle the data at each epoch 75 | if shuffle: 76 | # 传给permutation一个矩阵,它会返回一个洗牌后的矩阵副本 77 | shuffle_indices = np.random.permutation(np.arange(data_size)) 78 | shuffled_data = data[shuffle_indices] 79 | else: 80 | shuffled_data = data 81 | for batch_num in range(num_batches_per_epoch): 82 | start_index = batch_num * batch_size 83 | end_index = min((batch_num + 1) * batch_size, data_size) 84 | yield shuffled_data[start_index:end_index] 85 | 86 | 87 | if __name__ == '__main__': 88 | texts, labels = load_data_and_labels() 89 | # from utils.vocabulary_utils import VocabularyProcessorUtil, split_with_word 90 | # 91 | # _, vocabulary = VocabularyProcessorUtil.load_word2vec_embedding("../model/w2v.bin") 92 | # VocabularyProcessorUtil.building_model(documents=texts, save_path='../model/vocab.pkl', max_document_length=512, 93 | # vocabulary=vocabulary, 94 | # split_fn=split_with_word) 95 | # model = VocabularyProcessorUtil.load_model('../model/vocab.pkl') 96 | 97 | -------------------------------------------------------------------------------- /translate/data/custom_zh_en.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # @Date : 2021/4/27 10:30 上午 3 | # @File : custom_zh_en.py 4 | # @Author: johnson 5 | # @Contact : github: johnson7788 6 | # @Desc : 中文到英文的数据集 7 | 8 | from __future__ import absolute_import, division, print_function 9 | 10 | import csv 11 | import json 12 | import os 13 | 14 | import datasets 15 | 16 | 17 | 18 | _CITATION = """\ 19 | @InProceedings{huggingface:dataset, 20 | title = {repair test}, 21 | authors={johnson 22 | }, 23 | year={2020} 24 | } 25 | """ 26 | 27 | #数据集描述 28 | _DESCRIPTION = """\ 29 | 中文到英文的句子翻译 30 | """ 31 | 32 | _HOMEPAGE = "johnson homepage" 33 | 34 | _LICENSE = "johnson license" 35 | 36 | # 数据集下载地址 37 | _URLs = { 38 | 'custom_zh_en': "https://huggingface.co/great-new-dataset-first_domain.zip", 39 | } 40 | 41 | 42 | #通常CamelCase命名 43 | class ZhEnDataset(datasets.GeneratorBasedBuilder): 44 | """连贯性测试数据集""" 45 | 46 | VERSION = datasets.Version("1.1.0") 47 | 48 | BUILDER_CONFIGS = [ 49 | datasets.BuilderConfig(name="custom_zh_en", version=VERSION, description="正常数量数据集"), 50 | ] 51 | 52 | DEFAULT_CONFIG_NAME = "custom_zh_en" 53 | def _info(self): 54 | return datasets.DatasetInfo( 55 | description=_DESCRIPTION, 56 | features=datasets.Features( 57 | {"translation": datasets.features.Translation(languages=('zh', 'en'))} 58 | ), 59 | supervised_keys=('zh', 'en'), 60 | homepage=_HOMEPAGE, 61 | license=_LICENSE, 62 | citation=_CITATION, 63 | ) 64 | def _split_generators(self, dl_manager): 65 | """下载数据集 66 | 此方法的任务是下载/提取数据并根据配置定义拆分 67 | 根据不同的配置BUILDER_CONFIGS,和数据集的name定义 68 | """ 69 | # dl_manager是一个datasets.download.DownloadManager,可用于下载和提取URL, 70 | # 它可以接受任何类型或嵌套的列表/字典,并将返回相同的结构,url也可以替换为局部文件的路径。 71 | # 默认情况下,将提取压缩包,如果文件是压缩的,并返回提取压缩的缓存文件夹的路径,而不是压缩文件 72 | return [ 73 | datasets.SplitGenerator( 74 | name=datasets.Split.TRAIN, 75 | # 下面的参数将传给 _generate_examples 76 | gen_kwargs={ 77 | "filepath": self.config.data_files['train'], 78 | "split": "train", 79 | 
}, 80 | ), 81 | datasets.SplitGenerator( 82 | name=datasets.Split.TEST, 83 | # 下面的参数将传给 _generate_examples 84 | gen_kwargs={ 85 | "filepath": self.config.data_files['test'], 86 | "split": "test" 87 | }, 88 | ), 89 | datasets.SplitGenerator( 90 | name=datasets.Split.VALIDATION, 91 | # 下面的参数将传给 _generate_examples 92 | gen_kwargs={ 93 | "filepath": self.config.data_files['validation'], 94 | "split": "dev", 95 | }, 96 | ), 97 | ] 98 | 99 | def _generate_examples(self, filepath, split): 100 | """ Yields 方法返回每个样本. """ 101 | # 被函数_split_generators 调用,参数也是通过 gen_kwargs被传过来 102 | # 它负责打开给定的文件并从数据集中产生(key, example)元组 103 | # key是不重要的,只是习惯于这样 104 | zhfile, enfile = filepath 105 | with open(zhfile, encoding="utf-8") as zf, open(enfile, encoding="utf-8") as ef: 106 | zhlines = zf.readlines() 107 | eflines = ef.readlines() 108 | assert len(zhlines) == len(eflines), "警告:读入的2个文件总的行数不等" 109 | for id_, (zh, en) in enumerate(zip(zhlines, eflines)): 110 | yield id_, {'translation': 111 | {'zh': zh, 112 | 'en': en 113 | } 114 | } -------------------------------------------------------------------------------- /异步测试/quart_api_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | from quart import Quart, request, jsonify, abort 3 | from werkzeug.utils import secure_filename 4 | import hashlib 5 | from functools import wraps 6 | import time 7 | 8 | Tokens = [ 9 | {"sign": "c8499156ae8acc3f2d6a1453b9315eb1", "timestamp": "1592805495.355849", "token": "ff1c1eef10cad322ddbcd842a952b46c"} 10 | ] 11 | 12 | UPLOAD_FOLDER = 'images/' 13 | ALLOWED_EXTENSIONS = set(['jpg', 'png']) 14 | 15 | app = Quart(__name__) 16 | 17 | def verify(token:str, timestamp:str ,sign:str)-> bool: 18 | """ 19 | 验证token,timestamp,sign 20 | :param token: token 21 | :param timestamp: 时间戳 22 | :param sign: 签名 23 | :return: bool 24 | """ 25 | for tk in Tokens: 26 | tkv = tk.values() 27 | if token in tkv and timestamp in tkv and sign in tkv: 28 | return True 29 | return False 30 | 31 | def authorize(f): 32 | @wraps(f) 33 | def decorated_function(*args, **kws): 34 | if not 'token' in request.headers or not 'timestamp' in request.headers or not 'sign' in request.headers: 35 | abort(401) 36 | 37 | token = request.headers['token'] 38 | timestamp = request.headers['timestamp'] 39 | sign = request.headers['sign'] 40 | if not verify(token,timestamp,sign): 41 | abort(401) 42 | return f(*args, **kws) 43 | return decorated_function 44 | 45 | def cal_md5(content) -> str: 46 | """ 47 | 给定content,计算md5 48 | :param content: 49 | :return: 50 | """ 51 | md5 = hashlib.md5() 52 | content=str(content) 53 | md5.update(content.encode('UTF-8')) 54 | result = md5.hexdigest() 55 | return result 56 | 57 | @app.route("/token", methods=['GET', 'POST']) 58 | async def generate_token(): 59 | """ 60 | 生成用户token 61 | :return: 62 | """ 63 | rand = os.urandom(32) 64 | token = cal_md5(rand) 65 | timestamp = str(time.time()) 66 | sign = cal_md5(str(token)+timestamp) 67 | TK = {'token': token, 'timestamp':timestamp, 'sign':sign} 68 | Tokens.append(TK) 69 | return jsonify({'token': token, 'timestamp': timestamp, 'sign': sign}) 70 | 71 | def allowed_file(filename): 72 | return '.' 
in filename and \ 73 | filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS 74 | 75 | #做一些异步任务,测试 76 | async def blocking_io(): 77 | #文件操作,IO类型任务,例如日志等,使用线程或异步asyncio 78 | with open("/dev/urandom", "rb") as f: 79 | return f.read(100) 80 | async def cpu_bound(): 81 | # CPU-Bound, 消耗CPU的操作,使用多进程完成 82 | return sum(i * i for i in range(10 ** 6)) 83 | 84 | #模拟同步任务, 85 | @app.route("/upload_sync", methods=['POST']) 86 | async def upload_sync(): 87 | """ 88 | 上传图片文件,并立即处理,返回成功结果 89 | :return: 90 | """ 91 | if not os.path.exists(UPLOAD_FOLDER): 92 | os.mkdir(UPLOAD_FOLDER) 93 | file = request.files['image'] 94 | if file and allowed_file(file.filename): 95 | filename = secure_filename(file.filename) 96 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 97 | return jsonify({'result': 'success upload'}) 98 | 99 | #模拟异步任务 100 | @app.route("/upload_async", methods=['POST']) 101 | async def upload_async(): 102 | """ 103 | 上传图片,不能立即处理完成,先返回成功处理的页面,等待用户调取 104 | :return: 105 | """ 106 | with concurrent.futures.ProcessPoolExecutor() as pool: 107 | future = pool.submit(cpu_bound) 108 | for fut in concurrent.futures.as_completed([future]): 109 | return jsonify({'result': fut.done()}) 110 | 111 | @app.route("/echo", methods=['GET']) 112 | async def echo(): 113 | """ 114 | 测试服务器运行正常 115 | :return: 116 | """ 117 | return jsonify({'result': True}) 118 | 119 | if __name__ == "__main__": 120 | app.run(host='0.0.0.0', port=5000, debug=True, threaded=True) 121 | -------------------------------------------------------------------------------- /异步测试/flask_api_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | from flask import Flask, request, jsonify, abort 3 | from werkzeug.utils import secure_filename 4 | import hashlib 5 | from functools import wraps 6 | import time 7 | # import asyncio 8 | import concurrent.futures 9 | 10 | Tokens = [ 11 | {"sign": "c8499156ae8acc3f2d6a1453b9315eb1", "timestamp": "1592805495.355849", "token": "ff1c1eef10cad322ddbcd842a952b46c"} 12 | ] 13 | 14 | UPLOAD_FOLDER = 'images/' 15 | ALLOWED_EXTENSIONS = set(['jpg', 'png']) 16 | 17 | app = Flask(__name__) 18 | 19 | def verify(token:str, timestamp:str ,sign:str)-> bool: 20 | """ 21 | 验证token,timestamp,sign 22 | :param token: token 23 | :param timestamp: 时间戳 24 | :param sign: 签名 25 | :return: bool 26 | """ 27 | for tk in Tokens: 28 | tkv = tk.values() 29 | if token in tkv and timestamp in tkv and sign in tkv: 30 | return True 31 | return False 32 | 33 | def authorize(f): 34 | @wraps(f) 35 | def decorated_function(*args, **kws): 36 | if not 'token' in request.headers or not 'timestamp' in request.headers or not 'sign' in request.headers: 37 | abort(401) 38 | 39 | token = request.headers['token'] 40 | timestamp = request.headers['timestamp'] 41 | sign = request.headers['sign'] 42 | if not verify(token,timestamp,sign): 43 | abort(401) 44 | return f(*args, **kws) 45 | return decorated_function 46 | 47 | def cal_md5(content) -> str: 48 | """ 49 | 给定content,计算md5 50 | :param content: 51 | :return: 52 | """ 53 | md5 = hashlib.md5() 54 | content=str(content) 55 | md5.update(content.encode('UTF-8')) 56 | result = md5.hexdigest() 57 | return result 58 | 59 | @app.route("/token", methods=['GET', 'POST']) 60 | def generate_token(): 61 | """ 62 | 生成用户token 63 | :return: 64 | """ 65 | rand = os.urandom(32) 66 | token = cal_md5(rand) 67 | timestamp = str(time.time()) 68 | sign = cal_md5(str(token)+timestamp) 69 | TK = {'token': token, 'timestamp':timestamp, 'sign':sign} 70 | Tokens.append(TK) 71 | 
return jsonify({'token': token, 'timestamp': timestamp, 'sign': sign}) 72 | 73 | def allowed_file(filename): 74 | return '.' in filename and \ 75 | filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS 76 | 77 | #同步任务 78 | @app.route("/upload", methods=['POST', 'GET']) 79 | @authorize 80 | def upload(): 81 | """ 82 | 上传图片文件 83 | :return: 84 | """ 85 | if not os.path.exists(UPLOAD_FOLDER): 86 | os.mkdir(UPLOAD_FOLDER) 87 | file = request.files['image'] 88 | if file and allowed_file(file.filename): 89 | filename = secure_filename(file.filename) 90 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 91 | return jsonify({'result': 'success upload'}) 92 | 93 | #做一些异步任务,测试 94 | def blocking_io(): 95 | #文件操作,IO类型任务,例如日志等,使用线程或异步asyncio 96 | with open("/dev/urandom", "rb") as f: 97 | return f.read(100) 98 | def cpu_bound(): 99 | # CPU-Bound, 消耗CPU的操作,使用多进程完成 100 | return sum(i * i for i in range(10 ** 6)) 101 | 102 | @app.route("/process", methods=['GET']) 103 | @authorize 104 | def do_multiprocess(): 105 | """ 106 | 使用进程池处理cpu-bound型任务 107 | :return: 108 | """ 109 | with concurrent.futures.ProcessPoolExecutor() as pool: 110 | future = pool.submit(cpu_bound) 111 | for fut in concurrent.futures.as_completed([future]): 112 | return jsonify({'result': fut.done()}) 113 | 114 | @app.route("/echo", methods=['GET']) 115 | def echo(): 116 | """ 117 | 测试服务器运行正常 118 | :return: 119 | """ 120 | return jsonify({'result': True}) 121 | 122 | if __name__ == "__main__": 123 | app.run(host='0.0.0.0', port=5000, debug=True, threaded=True) 124 | -------------------------------------------------------------------------------- /TextCNN/main.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import argparse 3 | import torch 4 | import model_utils 5 | import data_utils 6 | from config import Config 7 | 8 | #TextCNN模型 9 | def dotrain(): 10 | parser = argparse.ArgumentParser(description='Text CNN 分类器') 11 | parser.add_argument('--model', type=str, default="model/textcnn.model", help='读取model继续训练') 12 | conf = Config() 13 | #打印模型配置信息 14 | conf.dump() 15 | args = parser.parse_args() 16 | if not os.path.isdir("model"): 17 | os.mkdir("model") 18 | print("处理训练数据") 19 | train_iter, text_field, label_field = data_utils.text_dataloader(conf.train_dir, conf.batch_size) 20 | #使用pickle保存字典到本地 21 | data_utils.save_vocab(text_field.vocab, "model/text.vocab") 22 | data_utils.save_vocab(label_field.vocab, "model/label.vocab") 23 | 24 | #添加新的配置,嵌入的维度vocab_num, 分类的类别数量class_num, 25 | conf.vocab_num = len(text_field.vocab) 26 | conf.class_num = len(label_field.vocab) - 1 27 | # 卷积核大小, 代表跨越的句子和字的大小, 找打相邻字直接的联系, 例如[3, 4, 5] 28 | conf.kernel_sizes = [int(k) for k in conf.kernel_sizes.split(',')] 29 | 30 | #模型加载和初始化 31 | if os.path.exists(args.model): 32 | print('发现模型文件, 加载模型: {}'.format(args.model)) 33 | cnn = torch.load(args.model) 34 | else: 35 | cnn = model_utils.TextCNN(conf) 36 | #模型训练 37 | try: 38 | model_utils.train(train_iter, cnn, conf) 39 | except KeyboardInterrupt: 40 | print('-' * 80) 41 | print('提前退出训练.') 42 | 43 | #评估模型 44 | def doeval(): 45 | parser = argparse.ArgumentParser(description='Text CNN 分类器') 46 | #必须指定已经训练好的模型 47 | parser.add_argument('--model', type=str, default="model/textcnn.model", help='读取model进行评估') 48 | conf = Config() 49 | #打印模型配置信息 50 | conf.dump() 51 | args = parser.parse_args() 52 | print("加载测试数据") 53 | #测试时不进行数据打乱操作 54 | eval_iter, text_field, label_field = data_utils.text_dataloader(conf.eval_dir, conf.batch_size, shuffle=False) 55 | # 模型加载和初始化 56 | if 
os.path.exists(args.model): 57 | print('发现模型文件, 加载模型: {}'.format(args.model)) 58 | cnn = torch.load(args.model) 59 | else: 60 | print("未找到模型文件,退出") 61 | sys.exit(-1) 62 | #加载以保存的字典 63 | text_field.vocab = data_utils.load_vocab("model/text.vocab") 64 | label_field.vocab = data_utils.load_vocab("model/label.vocab") 65 | #开始模型评估 66 | model_utils.eval(eval_iter, cnn, conf) 67 | 68 | #预测 69 | def dopredict(): 70 | """ 71 | 给定一个文件或一句话,预测结果 72 | :return: 73 | """ 74 | parser = argparse.ArgumentParser(description='Text CNN 分类器') 75 | #必须指定已经训练好的模型 76 | parser.add_argument('--path', type=str, default="data/predict/",help='要进行预测的文本文件的路径,或文件夹') 77 | parser.add_argument('--model', type=str, default="model/textcnn.model", help='读取model进行预测') 78 | conf = Config() 79 | args = parser.parse_args() 80 | #指定Field格式 81 | text_field = data_utils.TextTEXT 82 | label_field = data_utils.TextLABEL 83 | text_field.vocab = data_utils.load_vocab("model/text.vocab") 84 | label_field.vocab = data_utils.load_vocab("model/label.vocab") 85 | # 模型加载和初始化 86 | if os.path.exists(args.model): 87 | print('发现模型文件, 加载模型: {}'.format(args.model)) 88 | cnn = torch.load(args.model) 89 | else: 90 | print("未找到模型文件,退出") 91 | sys.exit(-1) 92 | #如果是文件夹,那么预测里面的文件,否则就是文件,直接预测 93 | if os.path.isdir(args.path): 94 | files = os.listdir(args.path) 95 | files_path = [args.path+f for f in files] 96 | else: 97 | files_path = [args.path] 98 | #开始预测 99 | for file in files_path: 100 | text, label = model_utils.predict(file, cnn, text_field, label_field, conf.cuda) 101 | print('[path] {}\n[Text] {}\n[Label] {}\n'.format(file, text, label)) 102 | print(f'共预测{len(files_path)}个文件') 103 | 104 | if __name__ == '__main__': 105 | dotrain() 106 | # doeval() 107 | # dopredict() -------------------------------------------------------------------------------- /TextCNN/data_utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import torchtext 4 | import jieba 5 | from typing import List 6 | 7 | class TextDataset(torchtext.data.Dataset): 8 | """ 9 | 读取数据并处理 10 | """ 11 | @staticmethod 12 | def sort_key(example): 13 | """ 14 | 用于在torchtext.data.Iterator生成批次迭代器的时候,用于example进行排序以将具有相似长度的example批次放在一起并最小化填充 15 | 如果在使用torchtext.data.Iterator时,提供了sort_key,那么就会覆盖这个Dataset的sort_key属性, 默认为None 16 | :param example: 是按example中单个样本的text属性的长度排序 17 | :return: 18 | """ 19 | return len(example.text) 20 | def __init__(self, path, text_field, label_field, **kwargs): 21 | """根据给的Field和数据集路径处理数据, 之后交给torchtext.data.Dataset处理 22 | Arguments: 23 | path: 数据集的路径 24 | text_field: text数据的Field格式 25 | label_field: label数据的Field格式 26 | **kwargs: data.Dataset的参数 27 | """ 28 | #定义fields 29 | fields = [('text', text_field), ('label', label_field)] 30 | #定义一个空的数据集 31 | examples = [] 32 | #列出当前目录下的所有文件夹,文件夹名称作为label,文件夹里面的文件内容作为text 33 | dirname = os.listdir(path) 34 | for dir in dirname: 35 | #循环一个label目录下的所有文件 36 | files = os.listdir(path + '/' + dir) 37 | for file in files: 38 | document = '' 39 | with open(path + '/' + dir + '/' + file, encoding="utf8", errors='ignore') as f: 40 | for line in f: 41 | if line != '\n': 42 | document += text_filter(line) 43 | # 如果文本长度小于10个字符,那么就过滤掉 44 | if len(document) < 10: 45 | continue 46 | text, label = document, dir 47 | #定义一个训练或测试的样本的Example格式 48 | example = torchtext.data.Example() 49 | # text_field.preprocess 是进行token的处理, 例如用jieba处理 50 | setattr(example, "text", text_field.preprocess(text)) 51 | setattr(example, "label", label_field.preprocess(label)) 52 | examples.append(example) 
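        # The Example objects collected above, together with the `fields` definition,
        # are what the torchtext.data.Dataset constructor below registers; that registration
        # is what later lets text_field.build_vocab(dataset) and torchtext.data.Iterator
        # consume this dataset inside text_dataloader().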
53 | super(TextDataset, self).__init__(examples, fields, **kwargs) 54 | 55 | def text_filter(sentence:str)-> str: 56 | """ 57 | 过滤掉非汉字和标点符号和非数字 58 | :param sentence: 59 | :return: 60 | """ 61 | line = sentence.replace('\n', '。') 62 | # 过滤掉非汉字和标点符号和非数字 63 | linelist = [word for word in line if 64 | word >= u'\u4e00' and word <= u'\u9fa5' or word in [',', '。', '?', '!', 65 | ':'] or word.isdigit()] 66 | return ''.join(linelist) 67 | 68 | #定义text的Field 69 | def text_token(sentence: str)-> List: 70 | """ 71 | 使用jieba分词 72 | :param sentence: 要分词的sentence 73 | :return: 一个text的分词后的列表 74 | """ 75 | return jieba.lcut(sentence) 76 | 77 | #sequential 是否要变成序列,tokenize表示使用的token 函数是, lower表示是否转换成小写 78 | TextTEXT = torchtext.data.Field(sequential=True, tokenize=text_token, lower=True) 79 | 80 | #定义label的Field 81 | TextLABEL = torchtext.data.Field(sequential=False, lower=True) 82 | 83 | def text_dataloader(path, batch_size, shuffle=False): 84 | """ 85 | 加载数据 86 | :param path: 训练集和测试集的文件路径 87 | :param batchsize: 批处理大小 88 | :param shuffle: 是否做shuffle 89 | :return: 90 | """ 91 | #定义text和label的 Field格式 92 | text_field = TextTEXT 93 | label_field = TextLABEL 94 | 95 | #读取数据 96 | #dataset 包含examples和fields2部分,examples保存所有的数据,field是这类数据的名字,例如field是(text,label), examples里面就是[(label的内容(纯文本),text内容(纯文本)),...] 97 | dataset = TextDataset(path, text_field, label_field) 98 | #构建字典,使用build_vocab之后text_field会多出一个vocab的属性,vocab中是字典 99 | text_field.build_vocab(dataset) 100 | label_field.build_vocab(dataset) 101 | #创建迭代器 102 | dataiter = torchtext.data.Iterator(dataset, batch_size, shuffle=shuffle, repeat=False) 103 | return dataiter, text_field, label_field 104 | 105 | def save_vocab(vocab, filename): 106 | """ 107 | 使用pickle保存字典 108 | :param vocab: 109 | :param filename: 110 | :return: 111 | """ 112 | with open(filename, 'wb') as f: 113 | pickle.dump(vocab, f) 114 | 115 | def load_vocab(filename): 116 | """ 117 | 使用pickle加载字典 118 | :param filename: 119 | :return: 120 | """ 121 | with open(filename, 'rb') as f: 122 | vocab = pickle.load(f) 123 | return vocab 124 | -------------------------------------------------------------------------------- /Res2Net/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from data_utils import load_data, display_prediction, process_image 3 | from config import Config 4 | import os 5 | from model_utils import ( 6 | create_model, #根据架构创建模型 7 | create_optimizer, # 为模型的最后的Classifier层添加优化器 8 | load_checkpoint, # 加载checkpoint,重建预训练模型 9 | plot_history, # 绘制历史的训练损失和准确率的图表 10 | save_checkpoint, #保存模型checkpoint 11 | train_model, #训练模型 12 | classify_image, #用于预测阶段,使用模型进行预测 13 | test_model) #使用测试集测试模型性能,并打印准确率 14 | 15 | def train(): 16 | conf = Config() 17 | # 打印模型配置信息 18 | conf.dump() 19 | parser = argparse.ArgumentParser(description='图片分类模型训练') 20 | parser.add_argument( 21 | '--resume_checkpoint', action='store', type=str, default='model/checkpoint.pth', 22 | help='从模型的checkpoint恢复模型,并继续训练,如果resume_checkpoint这个参数提供' 23 | '这些参数将忽略--arch, --learning_rate, --hidden_units, and --drop_p') 24 | args = parser.parse_args() 25 | 26 | #加载数据 27 | dataloaders, class_to_idx = load_data(conf.data_directory) 28 | 29 | #创建模型,如果模型文件存在 30 | if args.resume_checkpoint and os.path.exists(args.resume_checkpoint): 31 | #加载checkpoint 32 | print('resume_checkpoint已存在,开始加载模型') 33 | model, optimizer, epoch, history = load_checkpoint( 34 | checkpoint_path=args.resume_checkpoint, 35 | load_optimizer=True, gpu=conf.cuda) 36 | start_epoch = epoch + 1 37 | else: 38 | 
#创建新模型和优化器 39 | print('resume_checkpoint未设置或模型文件不存在,创建新的模型') 40 | model = create_model( 41 | arch=conf.arch, class_to_idx=class_to_idx, 42 | hidden_units=conf.hidden_units, drop_p=conf.dropout) 43 | optimizer = create_optimizer(model=model, lr=conf.learning_rate) 44 | start_epoch = 1 45 | history = None 46 | 47 | #训练模型 48 | history, best_epoch = train_model( 49 | dataloaders=dataloaders, model=model, 50 | optimizer=optimizer, gpu=conf.cuda, start_epoch=start_epoch, 51 | epochs=conf.epochs, train_history=history) 52 | 53 | #测试集上测试模型 54 | test_acc = test_model(dataloader=dataloaders['test'], model=model, gpu=conf.cuda) 55 | print(f'模型在测试集上的准确率是 {(test_acc * 100):.2f}%') 56 | 57 | #保存模型 58 | save_checkpoint( 59 | save_path=conf.save_path+conf.save_name, epoch=best_epoch, model=model, 60 | optimizer=optimizer, history=history) 61 | 62 | #绘制历史记录 63 | plot_history(history) 64 | 65 | def predict(): 66 | conf = Config() 67 | # 打印模型配置信息 68 | conf.dump() 69 | parser = argparse.ArgumentParser(description='图片分类模型训练') 70 | parser.add_argument( 71 | '--image_path', type=str,default='data/zhengjian/predict/test/3601216003722.jpg', help='指定要分类的路径') 72 | parser.add_argument( 73 | '--checkpoint', type=str, default='model/checkpoint.pth', help='指定checkpoint的模型的保存位置') 74 | parser.add_argument( 75 | '--top_k', type=int, default=2, help='选取topk概率的最大类别, dafault=2') 76 | args = parser.parse_args() 77 | 78 | # 加载转换,处理,转换图片到Tensor 79 | image_tensor = process_image(image_path=args.image_path) 80 | 81 | # 加载模型,是否使用gpu 82 | model, _, _, _ = load_checkpoint( 83 | checkpoint_path=args.checkpoint, load_optimizer=False, gpu=conf.cuda) 84 | 85 | #图片分类 86 | probabilities, predictions = classify_image( 87 | image_tensor=image_tensor, model=model, top_k=args.top_k, gpu=conf.cuda) 88 | 89 | #分类结果 90 | top_class = predictions[0] 91 | top_prob = probabilities[0] 92 | top_k = args.top_k 93 | print(f'\n预测概率最高的类别是 {top_class.capitalize()} ' 94 | f' 概率是{top_prob:.4f}') 95 | print(f'\n预测的topk是 {top_k} 类别是 {predictions}' 96 | f'概率是 {probabilities}') 97 | 98 | # 绘图 99 | display_prediction( 100 | image_path=args.image_path, 101 | probabilities=probabilities, 102 | predictions=predictions) 103 | 104 | if __name__ == '__main__': 105 | train() 106 | # predict() -------------------------------------------------------------------------------- /text_classsification/eval.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import os 4 | import csv 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from utils import data_helpers 9 | from utils.vocabulary_utils import VocabularyProcessorUtil 10 | 11 | # 数据文件 12 | tf.flags.DEFINE_string("positive_data_file", 13 | "Data source for the positive data.") 14 | tf.flags.DEFINE_string("negative_data_file", 15 | "Data source for the positive data.") 16 | # Eval Parameters 17 | tf.flags.DEFINE_string("network_name", None, "给定模型名称!!!") 18 | tf.flags.DEFINE_string("checkpoint_dir", "./model", "给定模型持久化的文件夹路径!") 19 | tf.flags.DEFINE_string("vocab_model_path", "./model/vocab.pkl", "给定词汇模型所在的磁盘路径") 20 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 21 | tf.flags.DEFINE_boolean("eval_train", True, "Evaluate on all training data") 22 | 23 | FLAGS = tf.flags.FLAGS 24 | 25 | 26 | def main(_): 27 | network_name = FLAGS.network_name 28 | if network_name is None: 29 | raise Exception("参数network_name必须给定!!!!") 30 | 31 | # 0. 
数据校验,要求训练数据文件文件存在 32 | if not (os.path.isfile(FLAGS.positive_data_file) and os.path.isfile(FLAGS.negative_data_file)): 33 | raise Exception("给定的训练数据必须是文件路径的形成!!!") 34 | 35 | with tf.Graph().as_default(): 36 | graph = tf.get_default_graph() 37 | with tf.Session() as sess: 38 | # 1. 加载词汇转换模型 39 | vocab_model_path = FLAGS.vocab_model_path 40 | if not tf.gfile.Exists(vocab_model_path): 41 | raise Exception("词汇转换模型必须存在,请检查磁盘路径:{}".format(vocab_model_path)) 42 | vocab_model = VocabularyProcessorUtil.load_model(save_path=vocab_model_path) 43 | 44 | # 2. 恢复加载网络 45 | ckpt = tf.train.get_checkpoint_state(checkpoint_dir=FLAGS.checkpoint_dir) 46 | if not (ckpt and ckpt.model_checkpoint_path): 47 | raise Exception("不存在对应的模型文件,请检查:{}".format(FLAGS.checkpoint_dir)) 48 | tf.logging.info("恢复模型:{}".format(ckpt.model_checkpoint_path)) 49 | saver = tf.train.import_meta_graph("{}.meta".format(ckpt.model_checkpoint_path)) 50 | saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path) 51 | 52 | # 3. 获取Tensor对象 53 | inputs = graph.get_tensor_by_name("{}/placeholders/input_word_id:0".format(network_name.upper())) 54 | dropout_keep_prob = graph.get_tensor_by_name("{}/placeholders/dropout_keep_prob:0".format(network_name.upper())) 55 | predictions = graph.get_tensor_by_name("{}/project/predictions:0".format(network_name.upper())) 56 | 57 | # 4. 加载数据 58 | tf.logging.info("开始加载文本数据,并转换处理......") 59 | old_texts, labels = data_helpers.load_data_and_labels( 60 | positive_data_file=FLAGS.positive_data_file, 61 | negative_data_file=FLAGS.negative_data_file 62 | ) 63 | 64 | # 4a. 文本数据id转换(截取、填充) 65 | texts = np.asarray(list(vocab_model.transform(old_texts))) 66 | # 4c. 构建批次 67 | batches = data_helpers.batch_iter( 68 | data=list(texts), 69 | batch_size=FLAGS.batch_size, # 每个批次的样本数据量 70 | num_epochs=1, # 总共迭代多少个epoch数据 71 | shuffle=False 72 | ) 73 | 74 | # 5. 遍历数据进行预测 75 | all_predictions = [] 76 | for x_test_batch in batches: 77 | batch_predictions = sess.run(predictions, {inputs: x_test_batch, dropout_keep_prob: 1.0}) 78 | # 数组拼接 79 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 80 | # 类型转换,以及格式/数据类型转换 81 | all_predictions = np.asarray(all_predictions, dtype=np.int32).reshape(-1) 82 | 83 | # 6. 
效果评估 84 | correct_predictions = float(sum(all_predictions == labels)) 85 | print("Total number of test examples: {}".format(len(labels))) 86 | print("Accuracy: {:g}".format(correct_predictions / float(len(labels)))) 87 | print("实际值为:\n{}".format(labels)) 88 | print("预测值为:\n{}".format(all_predictions)) 89 | 90 | # 将评价保存到CSV 91 | predictions_human_readable = np.column_stack((all_predictions, labels, np.array(old_texts))) 92 | out_path = os.path.join(FLAGS.checkpoint_dir, "prediction.csv") 93 | print("Saving evaluation to {0}".format(out_path)) 94 | # 参数:newline=''是给定不添加新行 95 | with open(out_path, 'w', encoding='utf-8', newline='') as f: 96 | writer = csv.writer(f) # 获取输出对象 97 | writer.writerows(predictions_human_readable) # 输出CSV格式 98 | 99 | 100 | if __name__ == '__main__': 101 | tf.logging.set_verbosity(tf.logging.INFO) 102 | tf.app.run() 103 | -------------------------------------------------------------------------------- /异步测试/flask_api_server_v2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from flask import Flask, request, jsonify, abort 3 | from werkzeug.utils import secure_filename 4 | import hashlib 5 | from functools import wraps 6 | import time 7 | from flask_executor import Executor 8 | from PIL import Image 9 | import pytesseract 10 | from flask_redis import FlaskRedis 11 | 12 | app = Flask(__name__) 13 | 14 | #redis用于存放用户的token和异步时临时存储用户的索引 15 | redis_client = FlaskRedis(app, charset='utf-8', decode_responses=True) 16 | 17 | #启动异步操作多线程 18 | executor = Executor(app) 19 | app.config['EXECUTOR_TYPE'] = 'thread' 20 | app.config['EXECUTOR_MAX_WORKERS'] = 5 21 | 22 | #存放图片位置 23 | UPLOAD_FOLDER = 'images/' 24 | ALLOWED_EXTENSIONS = set(['jpg', 'png']) 25 | 26 | 27 | @app.route("/echo", methods=['GET']) 28 | def echo(): 29 | """ 30 | 测试服务器运行正常 31 | :return: 32 | """ 33 | return jsonify({'result': True}) 34 | 35 | def verify(token:str, timestamp:str ,sign:str)-> bool: 36 | """ 37 | 验证token,timestamp,sign 38 | :param token: token 39 | :param timestamp: 时间戳 40 | :param sign: 签名 41 | :return: bool 42 | """ 43 | res = redis_client.hgetall(token) 44 | #如果token不存在,返回False 45 | if not res: 46 | return False 47 | #如果sign不正确,返回FALSE, 48 | res_vale = res.values() 49 | if timestamp in res_vale and sign in res_vale: 50 | return True 51 | return False 52 | 53 | def authorize(f): 54 | @wraps(f) 55 | def decorated_function(*args, **kws): 56 | #如果header不存在token等关键字,直接返回401 57 | if not 'token' in request.headers or not 'timestamp' in request.headers or not 'sign' in request.headers: 58 | abort(401) 59 | token = request.headers['token'] 60 | timestamp = request.headers['timestamp'] 61 | sign = request.headers['sign'] 62 | #如果token,签名验证不通过,返回401 63 | if not verify(token,timestamp,sign): 64 | abort(401) 65 | return f(*args, **kws) 66 | return decorated_function 67 | 68 | def cal_md5(content) -> str: 69 | """ 70 | 给定content,计算md5 71 | :param content: 72 | :return: 73 | """ 74 | md5 = hashlib.md5() 75 | content=str(content) 76 | md5.update(content.encode('UTF-8')) 77 | result = md5.hexdigest() 78 | return result 79 | 80 | @app.route("/gentoken", methods=['GET', 'POST']) 81 | def generate_token(): 82 | """ 83 | 生成用户token, 把token放入redis,以后可以把用户信息存入DB,临时的token放入redis 84 | :return: 85 | """ 86 | rand = os.urandom(32) 87 | token = cal_md5(rand) 88 | timestamp = str(time.time()) 89 | sign = cal_md5(str(token)+timestamp) 90 | TK = {'timestamp':timestamp, 'sign':sign} 91 | redis_client.hmset(token, TK) 92 | return jsonify({'token': token, 'timestamp': 
timestamp, 'sign': sign}) 93 | 94 | def allowed_file(filename: str)-> bool: 95 | """ 96 | 校验上传的图片格式, 如果格式正确返回True 97 | :param filename: 98 | :return: 99 | """ 100 | return '.' in filename and filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS 101 | 102 | def recognise(image: str) -> str: 103 | """ 104 | 使用tesseract识别图片 105 | :param image: 图片名字 106 | :return: 图片识别后的结果 107 | """ 108 | #tesseract路径 109 | pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract' 110 | #图片识别成文字 111 | res = pytesseract.image_to_string(Image.open(UPLOAD_FOLDER+'/'+image)) 112 | return res 113 | 114 | #同步任务 115 | @app.route("/upload_sync", methods=['POST']) 116 | @authorize 117 | def upload_sync(): 118 | """ 119 | 上传图片文件,并立即处理,返回成功结果 120 | :return: 121 | """ 122 | #存储图片 123 | if not os.path.exists(UPLOAD_FOLDER): 124 | os.mkdir(UPLOAD_FOLDER) 125 | file = request.files['image'] 126 | if file and allowed_file(file.filename): 127 | filename = secure_filename(file.filename) 128 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 129 | #开始识别图片 130 | res = recognise(filename) 131 | return jsonify({'code':0,'result': res}) 132 | 133 | 134 | @app.route("/upload_async", methods=['POST']) 135 | @authorize 136 | def upload_async(): 137 | """ 138 | 上传图片,不能立即处理完成,先返回接收成功的响应,等待用户稍后调取结果 139 | :return: 140 | """ 141 | if not os.path.exists(UPLOAD_FOLDER): 142 | os.mkdir(UPLOAD_FOLDER) 143 | file = request.files['image'] 144 | if file and allowed_file(file.filename): 145 | filename = secure_filename(file.filename) 146 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 147 | token = request.headers['token'] 148 | #参数说明,第一个参数是标识操作用户的token,后面是function和它的参数,用于并发执行图片识别 149 | executor.submit_stored(token, recognise, filename) 150 | return jsonify({'code':0, 'result':'upload success, Please get result from API upload_async_result'}) 151 | 152 | @app.route('/upload_async_result', methods=['GET']) 153 | @authorize 154 | def get_result(): 155 | token = request.headers['token'] 156 | #如果图片识别没有完成,那么返回图片正在识别中的状态,等待用户再次请求此接口 157 | if not executor.futures.done(token): 158 | return jsonify({'code':1, 'status': executor.futures._state(token), 'result': "Task is not complete, Please wait a second"}) 159 | #用户图片识别完成,获取识别结果并返回给用户 160 | future = executor.futures.pop(token) 161 | return jsonify({'code':0, 'status': 'done', 'result': future.result()}) 162 | 163 | if __name__ == "__main__": 164 | app.run(host='0.0.0.0', port=5000, debug=True, threaded=True) 165 | -------------------------------------------------------------------------------- /text_classsification/nets/text_rnn.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参数值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Lookup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 
类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_units: RNN Cell中的神经元数目 32 | :param layers: RNN的层次 33 | """ 34 | self.num_units = num_units # RNN Cell的神经元数目 35 | self.layers = layers # RNN的层次 36 | 37 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 38 | embedding_dimensions=embedding_dimensions, 39 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 40 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 41 | optimizer_type=optimizer_type, 42 | optimizer_parameters_func=optimizer_parameters_func, 43 | saver_parameters=saver_parameters) 44 | 45 | def interface(self): 46 | """ 47 | 前向网络构建 48 | batch_size: N 49 | feature height: H, 将序列长度T认为是H 50 | feature width: W,将Embedding size大小认为是W 51 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 52 | sentence_length: T 53 | embedding size: E 54 | :return: 55 | """ 56 | with tf.variable_scope(self.network_name): 57 | with slim.arg_scope(self.arg_score()): 58 | with tf.variable_scope("placeholders"): 59 | self.global_step = tf.train.get_or_create_global_step() 60 | # 输入的单词id,形状为:[N,T] 61 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 62 | # 希望输出的类别id, 形状为:[N,] 63 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 64 | # Dropout 65 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 66 | 67 | # 1. Embedding Layer 68 | embedding_inputs = self.embedding_lookup(self.inputs) 69 | 70 | # 2. 使用RNN来提取高阶特征 71 | with tf.variable_scope("rnn"): 72 | # a. 定义RNN的cell构建函数 73 | def cell(_units): 74 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 75 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 76 | 77 | # b. 构建前向的cell和反向cell 78 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 79 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 80 | 81 | # c. 获取得到序列的输出向量 82 | # 数据都是按照原始的从左往右的序列得到的最终特征 83 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 84 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 85 | cell_fw, # 前向的RNN Cell 86 | cell_bw, # 反向的RNN Cell 87 | inputs=embedding_inputs, # 输入值, [N,T,E] 88 | dtype=tf.float32, # 给定RNN状态初始化值的类型 89 | ) 90 | 91 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 92 | with tf.variable_scope("merge_feature"): 93 | # 4. 提取最后一个时刻的特征信息作为这个序列的最终特征信息 94 | features = tf.concat([output_fw[:, -1, :], output_bw[:, -1, :]], axis=-1) 95 | 96 | # 4. 
FFN+Softmax做最终的决策输出 97 | with tf.variable_scope("project"): 98 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 99 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 100 | self.logits = tf.identity(score, 'logits') 101 | # 得到N个文本分别属于各个类别的概率值 102 | self.probability = tf.nn.softmax(self.logits, name='probability') 103 | # 得到最终的预测id 104 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 105 | 106 | # 配置一个参数表示仅恢复模型参数 107 | self.saver_parameters['var_list'] = tf.global_variables() 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /Res2Net/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import json 4 | import matplotlib.pyplot as plt 5 | import matplotlib 6 | import pickle 7 | 8 | import torch 9 | from torchvision import datasets, transforms 10 | 11 | 12 | def load_data(path): 13 | """ 14 | 加载,转换,创建torch.utils.data.Dataloaders 15 | Args: 16 | path (str): 路径需要包含子文件夹,例如 17 | path/train/.. 18 | path/eval/.. 19 | path/test/.. 20 | 21 | Returns: 22 | dataloaders (dict): {'train': Dataloader(train_data), 23 | 'eval':, Dataloader(valid_data), 24 | 'test': Dataloader(test_data)} 25 | """ 26 | #训练图片的数据属性 27 | IMG_SIZE = 224 #训练数据尺寸 28 | IMG_MEAN = [0.485, 0.456, 0.406] # 图片归一化均值 29 | IMG_SDEV = [0.229, 0.224, 0.225] # 图片归一化标准差 30 | 31 | #训练阶段 32 | phases = ['train', 'eval', 'test'] 33 | 34 | #文件夹路径 35 | data_dir = {n: path + n for n in phases} 36 | 37 | #设置transforms 38 | data_transforms = { 39 | 'train': 40 | transforms.Compose([ 41 | transforms.RandomRotation(30), 42 | transforms.RandomResizedCrop(IMG_SIZE), 43 | transforms.RandomHorizontalFlip(p=0.5), 44 | transforms.ToTensor(), 45 | transforms.Normalize(IMG_MEAN, IMG_SDEV)]), 46 | 'eval': 47 | transforms.Compose([ 48 | transforms.Resize(256), 49 | transforms.CenterCrop(IMG_SIZE), 50 | transforms.ToTensor(), 51 | transforms.Normalize(IMG_MEAN, IMG_SDEV)]), 52 | 'test': 53 | transforms.Compose([ 54 | transforms.Resize(256), 55 | transforms.CenterCrop(IMG_SIZE), 56 | transforms.ToTensor(), 57 | transforms.Normalize(IMG_MEAN, IMG_SDEV)]) 58 | } 59 | 60 | #加载文件生成datasets 61 | image_datasets = {n: datasets.ImageFolder( 62 | data_dir[n], transform=data_transforms[n]) 63 | for n in phases} 64 | 65 | #创建dataloaders 66 | dataloaders = {n: torch.utils.data.DataLoader( 67 | image_datasets[n], batch_size=64, shuffle=True) 68 | for n in phases} 69 | 70 | # 类别到id 71 | class_to_idx = image_datasets['train'].class_to_idx 72 | 73 | return dataloaders, class_to_idx 74 | 75 | 76 | def display_prediction(image_path, probabilities, predictions): 77 | """ 78 | 绘制分类图像,将top预测类别作为标题,并显示预测top类别的预测概率图 79 | Args: 80 | image_path (str): 分类图片的路径 81 | probabilities ([float]): topk预测概率的列表 82 | class_idxs ([int]): topk类别id的列表 83 | class_names ([str]): topk的类别名称 84 | """ 85 | top_class = predictions[0] 86 | #设置字体 87 | matplotlib.rcParams['font.family'] = ['Kaiti'] 88 | 89 | #设置网格和标题 90 | fig = plt.figure(figsize=(4, 5.4)) 91 | ax1 = plt.subplot2grid((2, 1), (0, 0)) 92 | ax2 = plt.subplot2grid((2, 1), (1, 0)) 93 | fig.suptitle(top_class.capitalize(), x=0.6, y=1, fontsize=16) 94 | 95 | #显示图片 96 | ax1.imshow(Image.open(image_path)) 97 | ax1.set_xticks([]) 98 | ax1.set_yticks([]) 99 | 100 | # 显示预测的类别和概率 101 | #设置y轴 102 | y = np.arange(len(predictions)) 103 | ax2.barh(y, probabilities) 104 | ax2.set_yticks(y) 105 | ax2.set_yticklabels(predictions) 106 | 
#预测的最高概率 107 | ax2.invert_yaxis() 108 | ax2.set_xlabel('Prediction probability') 109 | 110 | #调整layout 111 | fig.tight_layout() 112 | plt.subplots_adjust(top=0.93) 113 | 114 | plt.show() 115 | 116 | def prediction_class_names(predictions, class_to_idx): 117 | """ 118 | 转换索引到类别名称 119 | Args: 120 | predictions ([int]): 要预测的类别索引 121 | class_to_idx (dict): 类别到id映射 122 | 123 | Returns: 124 | class_names ([str]): 返回预测的类别名称 125 | """ 126 | class_dict = {val: key for key, val in class_to_idx.items()} 127 | class_idxs = [class_dict[pred] for pred in predictions] 128 | 129 | return class_idxs 130 | 131 | def process_image(image_path): 132 | """ 133 | 缩放,裁剪,归一化PIL 图片, 返回一个Numpy数组 134 | Args: 135 | image_path : 输入PIL图片的路径 136 | 137 | Returns: 138 | image_tensor (Tensor): 处理图片,返回torch.FloatTensor 139 | """ 140 | IMG_SIZE = 224 141 | IMG_MEAN = [0.485, 0.456, 0.406] 142 | IMG_SDEV = [0.229, 0.224, 0.225] 143 | 144 | #加载图片 145 | image = Image.open(image_path) 146 | 147 | # Resize最大维度256 148 | if image.size[0] >= image.size[1]: 149 | image.thumbnail((256, image.size[1] * 256 // image.size[0])) 150 | else: 151 | image.thumbnail((image.size[0] * 256 // image.size[1], 256)) 152 | 153 | #中间裁切 154 | image = image.crop(( 155 | (image.size[0] - IMG_SIZE) // 2, 156 | (image.size[1] - IMG_SIZE) // 2, 157 | (image.size[0] + IMG_SIZE) // 2 , 158 | (image.size[1] + IMG_SIZE) // 2)) 159 | # 转换到np.array ,rescape channels到0-1之间 160 | image = np.array(image) / 255 161 | # 归一化图片 162 | image = (image - np.array(IMG_MEAN)) / np.array(IMG_SDEV) 163 | # 调整颜色通道到维度1 164 | image = image.transpose(2, 0, 1) 165 | # 转换成toch.FloatTensor 166 | image_tensor = torch.from_numpy( 167 | np.expand_dims(image, axis=0)).type(torch.FloatTensor) 168 | 169 | return image_tensor 170 | 171 | 172 | def save_label(label, filename): 173 | """ 174 | 使用pickle保存字典 175 | :param vocab: 176 | :param filename: 177 | :return: 178 | """ 179 | with open(filename, 'wb') as f: 180 | pickle.dump(label, f) 181 | 182 | def load_label(filename): 183 | """ 184 | 使用pickle加载字典 185 | :param filename: 186 | :return: 187 | """ 188 | with open(filename, 'rb') as f: 189 | vocab = pickle.load(f) 190 | return vocab -------------------------------------------------------------------------------- /translate/data/sacrebleu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Datasets Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ SACREBLEU metric. 
""" 16 | 17 | import sacrebleu as scb 18 | from packaging import version 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @inproceedings{post-2018-call, 25 | title = "A Call for Clarity in Reporting {BLEU} Scores", 26 | author = "Post, Matt", 27 | booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", 28 | month = oct, 29 | year = "2018", 30 | address = "Belgium, Brussels", 31 | publisher = "Association for Computational Linguistics", 32 | url = "https://www.aclweb.org/anthology/W18-6319", 33 | pages = "186--191", 34 | } 35 | """ 36 | 37 | _DESCRIPTION = """\ 38 | SacreBLEU provides hassle-free computation of shareable, comparable, and reproducible BLEU scores. 39 | Inspired by Rico Sennrich's `multi-bleu-detok.perl`, it produces the official WMT scores but works with plain text. 40 | It also knows all the standard test sets and handles downloading, processing, and tokenization for you. 41 | 42 | See the [README.md] file at https://github.com/mjpost/sacreBLEU for more information. 43 | """ 44 | 45 | _KWARGS_DESCRIPTION = """ 46 | Produces BLEU scores along with its sufficient statistics 47 | from a source against one or more references. 48 | 49 | Args: 50 | predictions: The system stream (a sequence of segments) 51 | references: A list of one or more reference streams (each a sequence of segments) 52 | smooth: The smoothing method to use 53 | smooth_value: For 'floor' smoothing, the floor to use 54 | force: Ignore data that looks already tokenized 55 | lowercase: Lowercase the data 56 | tokenize: The tokenizer to use 57 | Returns: 58 | 'score': BLEU score, 59 | 'counts': Counts, 60 | 'totals': Totals, 61 | 'precisions': Precisions, 62 | 'bp': Brevity penalty, 63 | 'sys_len': predictions length, 64 | 'ref_len': reference length, 65 | Examples: 66 | 67 | >>> predictions = ["hello there general kenobi", "foo bar foobar"] 68 | >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] 69 | >>> sacrebleu = datasets.load_metric("sacrebleu") 70 | >>> results = sacrebleu.compute(predictions=predictions, references=references) 71 | >>> print(list(results.keys())) 72 | ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len'] 73 | >>> print(round(results["score"], 1)) 74 | 100.0 75 | """ 76 | 77 | 78 | @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 79 | class Sacrebleu(datasets.Metric): 80 | def _info(self): 81 | if version.parse(scb.__version__) < version.parse("1.4.12"): 82 | raise ImportWarning( 83 | "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n" 84 | 'You can install it with `pip install "sacrebleu>=1.4.12"`.' 
85 | ) 86 | return datasets.MetricInfo( 87 | description=_DESCRIPTION, 88 | citation=_CITATION, 89 | homepage="https://github.com/mjpost/sacreBLEU", 90 | inputs_description=_KWARGS_DESCRIPTION, 91 | features=datasets.Features( 92 | { 93 | "predictions": datasets.Value("string", id="sequence"), 94 | "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"), 95 | } 96 | ), 97 | codebase_urls=["https://github.com/mjpost/sacreBLEU"], 98 | reference_urls=[ 99 | "https://github.com/mjpost/sacreBLEU", 100 | "https://en.wikipedia.org/wiki/BLEU", 101 | "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213", 102 | ], 103 | ) 104 | 105 | def _compute( 106 | self, 107 | predictions, 108 | references, 109 | smooth_method="exp", 110 | smooth_value=None, 111 | force=False, 112 | lowercase=False, 113 | tokenize=scb.DEFAULT_TOKENIZER, 114 | use_effective_order=False, 115 | ): 116 | references_per_prediction = len(references[0]) 117 | if any(len(refs) != references_per_prediction for refs in references): 118 | raise ValueError("Sacrebleu requires the same number of references for each prediction") 119 | transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)] 120 | output = scb.corpus_bleu( 121 | sys_stream=predictions, 122 | ref_streams=transformed_references, 123 | smooth_method=smooth_method, 124 | smooth_value=smooth_value, 125 | force=force, 126 | lowercase=lowercase, 127 | tokenize=tokenize, 128 | use_effective_order=use_effective_order, 129 | ) 130 | output_dict = { 131 | "score": output.score, 132 | "counts": output.counts, 133 | "totals": output.totals, 134 | "precisions": output.precisions, 135 | "bp": output.bp, 136 | "sys_len": output.sys_len, 137 | "ref_len": output.ref_len, 138 | } 139 | return output_dict 140 | -------------------------------------------------------------------------------- /text_classsification/nets/text_cnn.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextCNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_filters=128, region_sizes=[2, 3, 4], *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_filters: TextCNN 各个不同类型卷积核的数目,可以给定为int或者list 32 | :param region_sizes: TextCNN各个不同类别卷积核提取单词特征的单词数量范围 33 | """ 34 | self.region_sizes = region_sizes # 使用CNN提取特征信息的时候,提取范围大小 35 | if isinstance(num_filters, list): 36 | # 相当于针对每个范围给定不同的卷积核数目 37 | if len(region_sizes) != len(num_filters): 38 | raise Exception("resize_sizes和num_filters大小必须一致!!!") 39 | else: 40 
| self.num_filters = num_filters 41 | elif isinstance(num_filters, int): 42 | self.num_filters = [num_filters] * len(region_sizes) 43 | else: 44 | raise Exception("参数num_filters仅支持int类型或者list类型数据!!") 45 | 46 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 47 | embedding_dimensions=embedding_dimensions, 48 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 49 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 50 | optimizer_type=optimizer_type, 51 | optimizer_parameters_func=optimizer_parameters_func, 52 | saver_parameters=saver_parameters) 53 | 54 | def interface(self): 55 | """ 56 | 前向网络构建 57 | batch_size: N 58 | feature height: H, 将序列长度T认为是H 59 | feature width: W,将Embedding size大小认为是W 60 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 61 | sentence_length: T 62 | embedding size: E 63 | :return: 64 | """ 65 | with tf.variable_scope(self.network_name): 66 | with slim.arg_scope(self.arg_score()): 67 | with tf.variable_scope("placeholders"): 68 | self.global_step = tf.train.get_or_create_global_step() 69 | # 输入的单词id,形状为:[N,T] 70 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 71 | # 希望输出的类别id, 形状为:[N,] 72 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 73 | # Dropout 74 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 75 | 76 | # 1. Embedding Layer 77 | # 将单词id转换为单词向量,[N,T] --> [N,T,E] 78 | embedding_inputs = self.embedding_lookup(self.inputs) 79 | # 增加维度信息,将其转换为四维对象, [N,T,E] --> [N,T,E,1] 80 | expanded_embedding_inputs = tf.expand_dims(embedding_inputs, axis=-1) 81 | 82 | # 2. 使用卷积来提取高阶特征 83 | outputs = [] 84 | for idx, region_size in enumerate(self.region_sizes): 85 | with tf.variable_scope("conv-max-pooling-{}".format(idx)): 86 | # 卷积的功能相当于将region_size个单词看成一个整体,然后进行单词的特征向量信息的融合提取 87 | # 最终返回结果形状为: [N,T,1,C] 88 | # NOTE: 这里的T实际上是比原来的序列长度小, T = sequence_length - region_size + 1 89 | conv = slim.conv2d( 90 | expanded_embedding_inputs, # [N,T,E,1] 91 | num_outputs=self.num_filters[idx], # C, eg:2 92 | kernel_size=(region_size, self.embedding_dimensions) # (h,w), eg:(3,E) 93 | ) 94 | # 针对序列的每个通道获取一个最大值,相当于认为每个卷积核提取某种特征信息,这里直接获取主要特征信息出来 95 | # [N,T,1,C] --> [N,1,1,C] 96 | pooled = tf.reduce_max(conv, axis=[1, 2], keep_dims=True) 97 | # 通道压缩,因为维度1其实是无用的 98 | output = tf.squeeze(pooled, axis=[1, 2]) 99 | # 添加到临时列表中 100 | outputs.append(output) 101 | 102 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 103 | with tf.variable_scope("merge_feature"): 104 | features = tf.concat(outputs, axis=-1) 105 | features = tf.nn.dropout(features, keep_prob=self.dropout_keep_prob) 106 | 107 | # 4. 
FFN+Softmax做最终的决策输出 108 | with tf.variable_scope("project"): 109 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 110 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 111 | self.logits = tf.identity(score, 'logits') 112 | # 得到N个文本分别属于各个类别的概率值 113 | self.probability = tf.nn.softmax(self.logits, name='probability') 114 | # 得到最终的预测id 115 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 116 | 117 | # 配置一个参数表示仅恢复模型参数 118 | self.saver_parameters['var_list'] = tf.global_variables() 119 | -------------------------------------------------------------------------------- /text_classsification/nets/text_rnn_improve.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_units: RNN Cell中的神经元数目 32 | :param layers: RNN的层次 33 | """ 34 | self.num_units = num_units # RNN Cell的神经元数目 35 | self.layers = layers # RNN的层次 36 | 37 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 38 | embedding_dimensions=embedding_dimensions, 39 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 40 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 41 | optimizer_type=optimizer_type, 42 | optimizer_parameters_func=optimizer_parameters_func, 43 | saver_parameters=saver_parameters) 44 | 45 | def interface(self): 46 | """ 47 | 前向网络构建 48 | batch_size: N 49 | feature height: H, 将序列长度T认为是H 50 | feature width: W,将Embedding size大小认为是W 51 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 52 | sentence_length: T 53 | embedding size: E 54 | :return: 55 | """ 56 | with tf.variable_scope(self.network_name): 57 | with slim.arg_scope(self.arg_score()): 58 | with tf.variable_scope("placeholders"): 59 | self.global_step = tf.train.get_or_create_global_step() 60 | # 输入的单词id,形状为:[N,T] 61 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 62 | # 希望输出的类别id, 形状为:[N,] 63 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 64 | # Dropout 65 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 66 | # 计算序列实际长度, 最终形状为:[N,] 67 | sequence_length = tf.reduce_sum(tf.sign(tf.abs(self.inputs)), axis=-1) 68 | 69 | # 1. Embedding Layer 70 | embedding_inputs = self.embedding_lookup(self.inputs) 71 | 72 | # 2. 使用RNN来提取高阶特征 73 | with tf.variable_scope("rnn"): 74 | # a. 
定义RNN的cell构建函数 75 | def cell(_units): 76 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 77 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 78 | 79 | # b. 构建前向的cell和反向cell 80 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 81 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 82 | 83 | # c. 获取得到序列的输出向量 84 | # 数据都是按照原始的从左往右的序列得到的最终特征 85 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 86 | # 如果给定了序列的实际长度,那么在进行计算的时候,仅计算实际序列长度部分的内容,对于后面填充的内直接返回zero 87 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 88 | cell_fw, # 前向的RNN Cell 89 | cell_bw, # 反向的RNN Cell 90 | inputs=embedding_inputs, # 输入值, [N,T,E] 91 | dtype=tf.float32, # 给定RNN状态初始化值的类型 92 | sequence_length=sequence_length, # 给定序列的实际长度(因为序列是经过填充的) 93 | ) 94 | 95 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 96 | with tf.variable_scope("merge_feature"): 97 | # 4. 直接将所有时刻的输出特征值mean作为最终特征信息(由于填充位置输出是zero,所以求均值不会产生影响) 98 | # [N,T,E] --> [N,E] --> [N,E] 99 | div_denominator = tf.reshape(tf.to_float(sequence_length), shape=(-1, 1)) 100 | features_fw = tf.div(tf.reduce_sum(output_fw, axis=1), div_denominator) 101 | features_bw = tf.div(tf.reduce_sum(output_bw, axis=1), div_denominator) 102 | features = tf.concat([features_fw, features_bw], axis=-1) 103 | # TODO: 获取实际序列最后要给时刻的输出特征向量作为高阶向量(下周一做) 104 | 105 | # 4. FFN+Softmax做最终的决策输出 106 | with tf.variable_scope("project"): 107 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 108 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 109 | self.logits = tf.identity(score, 'logits') 110 | # 得到N个文本分别属于各个类别的概率值 111 | self.probability = tf.nn.softmax(self.logits, name='probability') 112 | # 得到最终的预测id 113 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 114 | 115 | # 配置一个参数表示仅恢复模型参数 116 | self.saver_parameters['var_list'] = tf.global_variables() 117 | -------------------------------------------------------------------------------- /text_classsification/nets/text_rnn_improve2.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_units: RNN Cell中的神经元数目 32 | :param layers: RNN的层次 33 | """ 34 | self.num_units = num_units # RNN Cell的神经元数目 35 | self.layers = layers # RNN的层次 36 | 37 | super(Network, self).__init__(with_word2vec=with_word2vec, 
vocab_size=vocab_size, 38 | embedding_dimensions=embedding_dimensions, 39 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 40 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 41 | optimizer_type=optimizer_type, 42 | optimizer_parameters_func=optimizer_parameters_func, 43 | saver_parameters=saver_parameters) 44 | 45 | def interface(self): 46 | """ 47 | 前向网络构建 48 | batch_size: N 49 | feature height: H, 将序列长度T认为是H 50 | feature width: W,将Embedding size大小认为是W 51 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 52 | sentence_length: T 53 | embedding size: E 54 | :return: 55 | """ 56 | with tf.variable_scope(self.network_name): 57 | with slim.arg_scope(self.arg_score()): 58 | with tf.variable_scope("placeholders"): 59 | self.global_step = tf.train.get_or_create_global_step() 60 | # 输入的单词id,形状为:[N,T] 61 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 62 | # 希望输出的类别id, 形状为:[N,] 63 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 64 | # Dropout 65 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 66 | # 计算序列实际长度, 最终形状为:[N,] 67 | sequence_length = tf.reduce_sum(tf.sign(tf.abs(self.inputs)), axis=-1) 68 | 69 | # 1. Embedding Layer 70 | embedding_inputs = self.embedding_lookup(self.inputs) 71 | 72 | # 2. 使用RNN来提取高阶特征 73 | with tf.variable_scope("rnn"): 74 | # a. 定义RNN的cell构建函数 75 | def cell(_units): 76 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 77 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 78 | 79 | # b. 构建前向的cell和反向cell 80 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 81 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 82 | 83 | # c. 获取得到序列的输出向量 84 | # 数据都是按照原始的从左往右的序列得到的最终特征 85 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 86 | # 如果给定了序列的实际长度,那么在进行计算的时候,仅计算实际序列长度部分的内容,对于后面填充的内直接返回zero 87 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 88 | cell_fw, # 前向的RNN Cell 89 | cell_bw, # 反向的RNN Cell 90 | inputs=embedding_inputs, # 输入值, [N,T,E] 91 | dtype=tf.float32, # 给定RNN状态初始化值的类型 92 | sequence_length=sequence_length, # 给定序列的实际长度(因为序列是经过填充的) 93 | ) 94 | 95 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 96 | with tf.variable_scope("merge_feature"): 97 | # a. 获取正向执行网络中执行序列的最后一个时刻的输出值作为正向的特征 98 | # 实现方式一:先将output_fw反转,然后获取output_fw[:,0,:]即为最终结果 99 | # 实现方式二:使用tf.gather_nd获取不固定索引位置的向量信息 100 | batch_size = tf.shape(output_fw)[0] # 批次大小 101 | indices_fw = tf.concat([ 102 | tf.reshape(tf.range(batch_size), shape=(-1, 1)), # 样本索引, [0,N-1] 103 | tf.reshape(sequence_length - 1, shape=(-1, 1)) # 样本长度最后一个时刻的索引值, 每个样本的长度信息 104 | ], axis=-1) 105 | features_fw = tf.gather_nd(output_fw, indices_fw) 106 | 107 | # b. 获取反向执行网络中执行序列的最后一个时刻的输出值作为反向的特征,也就是真实序列中的第0个时刻 108 | features_bw = output_bw[:, 0, :] 109 | 110 | # c. 将正向和方向结果合并 111 | features = tf.concat([features_fw, features_bw], axis=-1) 112 | 113 | # 4. 
FFN+Softmax做最终的决策输出 114 | with tf.variable_scope("project"): 115 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 116 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 117 | self.logits = tf.identity(score, 'logits') 118 | # 得到N个文本分别属于各个类别的概率值 119 | self.probability = tf.nn.softmax(self.logits, name='probability') 120 | # 得到最终的预测id 121 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 122 | 123 | # 配置一个参数表示仅恢复模型参数 124 | self.saver_parameters['var_list'] = tf.global_variables() 125 | -------------------------------------------------------------------------------- /Res2Net/res2next.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import math 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn import init 6 | import torch 7 | import torch.utils.model_zoo as model_zoo 8 | 9 | __all__ = ['res2next50'] 10 | model_urls = { 11 | 'res2next50': 'https://shanghuagao.oss-cn-beijing.aliyuncs.com/res2net/res2next50_4s-6ef7e7bf.pth', 12 | } 13 | 14 | class Bottle2neckX(nn.Module): 15 | expansion = 4 16 | 17 | def __init__(self, inplanes, planes, baseWidth, cardinality, stride=1, downsample=None, scale = 4, stype='normal'): 18 | """ Constructor 19 | Args: 20 | inplanes: input channel dimensionality 21 | planes: output channel dimensionality 22 | baseWidth: base width. 23 | cardinality: num of convolution groups. 24 | stride: conv stride. Replaces pooling layer. 25 | scale: number of scale. 26 | type: 'normal': normal set. 'stage': frist blokc of a new stage. 27 | """ 28 | super(Bottle2neckX, self).__init__() 29 | 30 | D = int(math.floor(planes * (baseWidth/64.0))) 31 | C = cardinality 32 | 33 | self.conv1 = nn.Conv2d(inplanes, D*C*scale, kernel_size=1, stride=1, padding=0, bias=False) 34 | self.bn1 = nn.BatchNorm2d(D*C*scale) 35 | 36 | if scale == 1: 37 | self.nums = 1 38 | else: 39 | self.nums = scale -1 40 | if stype == 'stage': 41 | self.pool = nn.AvgPool2d(kernel_size=3, stride = stride, padding=1) 42 | convs = [] 43 | bns = [] 44 | for i in range(self.nums): 45 | convs.append(nn.Conv2d(D*C, D*C, kernel_size=3, stride = stride, padding=1, groups=C, bias=False)) 46 | bns.append(nn.BatchNorm2d(D*C)) 47 | self.convs = nn.ModuleList(convs) 48 | self.bns = nn.ModuleList(bns) 49 | 50 | self.conv3 = nn.Conv2d(D*C*scale, planes * 4, kernel_size=1, stride=1, padding=0, bias=False) 51 | self.bn3 = nn.BatchNorm2d(planes * 4) 52 | self.relu = nn.ReLU(inplace=True) 53 | 54 | self.downsample = downsample 55 | self.width = D*C 56 | self.stype = stype 57 | self.scale = scale 58 | 59 | def forward(self, x): 60 | residual = x 61 | 62 | out = self.conv1(x) 63 | out = self.bn1(out) 64 | out = self.relu(out) 65 | 66 | spx = torch.split(out, self.width, 1) 67 | for i in range(self.nums): 68 | if i==0 or self.stype=='stage': 69 | sp = spx[i] 70 | else: 71 | sp = sp + spx[i] 72 | sp = self.convs[i](sp) 73 | sp = self.relu(self.bns[i](sp)) 74 | if i==0: 75 | out = sp 76 | else: 77 | out = torch.cat((out, sp), 1) 78 | if self.scale != 1 and self.stype=='normal': 79 | out = torch.cat((out, spx[self.nums]),1) 80 | elif self.scale != 1 and self.stype=='stage': 81 | out = torch.cat((out, self.pool(spx[self.nums])),1) 82 | 83 | out = self.conv3(out) 84 | out = self.bn3(out) 85 | 86 | if self.downsample is not None: 87 | residual = self.downsample(x) 88 | 89 | out += residual 90 | out = self.relu(out) 91 | 92 | return out 93 | 94 | 95 | class Res2NeXt(nn.Module): 96 | def 
__init__(self, block, baseWidth, cardinality, layers, num_classes, scale=4): 97 | """ Constructor 98 | Args: 99 | baseWidth: baseWidth for ResNeXt. 100 | cardinality: number of convolution groups. 101 | layers: config of layers, e.g., [3, 4, 6, 3] 102 | num_classes: number of classes 103 | scale: scale in res2net 104 | """ 105 | super(Res2NeXt, self).__init__() 106 | 107 | self.cardinality = cardinality 108 | self.baseWidth = baseWidth 109 | self.num_classes = num_classes 110 | self.inplanes = 64 111 | self.output_size = 64 112 | self.scale = scale 113 | 114 | self.conv1 = nn.Conv2d(3, 64, 7, 2, 3, bias=False) 115 | self.bn1 = nn.BatchNorm2d(64) 116 | self.relu = nn.ReLU(inplace=True) 117 | self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 118 | self.layer1 = self._make_layer(block, 64, layers[0]) 119 | self.layer2 = self._make_layer(block, 128, layers[1], 2) 120 | self.layer3 = self._make_layer(block, 256, layers[2], 2) 121 | self.layer4 = self._make_layer(block, 512, layers[3], 2) 122 | self.avgpool = nn.AdaptiveAvgPool2d(1) 123 | self.fc = nn.Linear(512 * block.expansion, num_classes) 124 | 125 | for m in self.modules(): 126 | if isinstance(m, nn.Conv2d): 127 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 128 | m.weight.data.normal_(0, math.sqrt(2. / n)) 129 | elif isinstance(m, nn.BatchNorm2d): 130 | m.weight.data.fill_(1) 131 | m.bias.data.zero_() 132 | 133 | def _make_layer(self, block, planes, blocks, stride=1): 134 | downsample = None 135 | if stride != 1 or self.inplanes != planes * block.expansion: 136 | downsample = nn.Sequential( 137 | nn.Conv2d(self.inplanes, planes * block.expansion, 138 | kernel_size=1, stride=stride, bias=False), 139 | nn.BatchNorm2d(planes * block.expansion), 140 | ) 141 | 142 | layers = [] 143 | layers.append(block(self.inplanes, planes, self.baseWidth, self.cardinality, stride, downsample, scale=self.scale, stype='stage')) 144 | self.inplanes = planes * block.expansion 145 | for i in range(1, blocks): 146 | layers.append(block(self.inplanes, planes, self.baseWidth, self.cardinality, scale=self.scale)) 147 | 148 | return nn.Sequential(*layers) 149 | 150 | def forward(self, x): 151 | x = self.conv1(x) 152 | x = self.bn1(x) 153 | x = self.relu(x) 154 | x = self.maxpool1(x) 155 | x = self.layer1(x) 156 | x = self.layer2(x) 157 | x = self.layer3(x) 158 | x = self.layer4(x) 159 | x = self.avgpool(x) 160 | x = x.view(x.size(0), -1) 161 | x = self.fc(x) 162 | 163 | return x 164 | def res2next50(pretrained=False, map_location='cpu', **kwargs): 165 | """ Construct Res2NeXt-50. 166 | The default scale is 4. 
167 | Args: 168 | pretrained (bool): If True, returns a model pre-trained on ImageNet 169 | """ 170 | model = Res2NeXt(Bottle2neckX, layers = [3, 4, 6, 3], baseWidth = 4, cardinality=8, scale = 4, num_classes=1000) 171 | if pretrained: 172 | model.load_state_dict(model_zoo.load_url(model_urls['res2next50'], map_location=map_location)) 173 | return model 174 | 175 | if __name__ == '__main__': 176 | images = torch.rand(1, 3, 224, 224).cuda(0) 177 | model = res2next50(pretrained=True) 178 | model = model.cuda(0) 179 | print(model(images).size()) 180 | -------------------------------------------------------------------------------- /text_classsification/utils/vocabulary_utils.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import os 4 | import itertools 5 | import jieba 6 | import numpy as np 7 | import tensorflow as tf 8 | from tensorflow.contrib.learn import preprocessing 9 | from gensim import utils 10 | from gensim.models import word2vec 11 | 12 | 13 | def default_split_fn(documents): 14 | return split_with_char(documents) 15 | 16 | 17 | def split_with_char(documents): 18 | return [list(sentence) for sentence in documents] 19 | 20 | 21 | def split_with_word(documents): 22 | return [list(filter(lambda word: len(word) > 0, jieba.cut(sentence.strip()))) for sentence in documents] 23 | 24 | 25 | class CategoricalVocabulary(preprocessing.CategoricalVocabulary): 26 | def __init__(self, unknown_token=""): 27 | super(CategoricalVocabulary, self).__init__(unknown_token, False) 28 | 29 | # 特殊值(填充0,未知1) 30 | self.padding_token = "" 31 | self._mapping[self.padding_token] = 0 32 | self._mapping[self._unknown_token] = 1 33 | # 添加一个属性 34 | self.vocab_size = 2 35 | 36 | def get(self, category): 37 | if category not in self._mapping: 38 | return 1 39 | return self._mapping[category] 40 | 41 | def set(self, category, index): 42 | self._mapping[category] = index 43 | self.vocab_size += 1 44 | 45 | 46 | class PathLineSentences(object): 47 | """Like :class:`~gensim.models.word2vec.LineSentence`, but process all files in a directory 48 | in alphabetical order by filename. 49 | 50 | The directory must only contain files that can be read by :class:`gensim.models.word2vec.LineSentence`: 51 | .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. 52 | 53 | The format of files (either text, or compressed text files) in the path is one sentence = one line, 54 | with words already preprocessed and separated by whitespace. 55 | 56 | Warnings 57 | -------- 58 | Does **not recurse** into subdirectories. 
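    Example
    -------
    A minimal usage sketch of iterating a directory with word-level splitting
    (the ``./data`` path here is only illustrative and is assumed to contain
    plain-text files, one sentence per line; ``split_with_word`` is the helper
    defined above in this module):

        >>> sentences = PathLineSentences(source='./data', split_fn=split_with_word)
        >>> for tokens in sentences:   # each item is a list of at most max_sentence_length tokens
        ...     print(tokens[:5])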
59 | 60 | """ 61 | 62 | def __init__(self, source, max_sentence_length=word2vec.MAX_WORDS_IN_BATCH, limit=None, split_fn=None): 63 | self.source = source 64 | self.max_sentence_length = max_sentence_length 65 | self.limit = limit 66 | if split_fn is None: 67 | self.split_fn = default_split_fn 68 | else: 69 | self.split_fn = split_fn 70 | 71 | if os.path.isfile(self.source): 72 | self.input_files = [self.source] # force code compatibility with list of files 73 | elif os.path.isdir(self.source): 74 | self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path 75 | self.input_files = os.listdir(self.source) 76 | self.input_files = [self.source + filename for filename in self.input_files] # make full paths 77 | self.input_files.sort() 78 | else: 79 | raise ValueError('input is neither a file nor a path') 80 | 81 | def __iter__(self): 82 | """iterate through the files""" 83 | for file_name in self.input_files: 84 | with utils.open(file_name, 'rb') as fin: 85 | for line in itertools.islice(fin, self.limit): 86 | line = self.split_fn([utils.to_unicode(line).strip()])[0] 87 | i = 0 88 | while i < len(line): 89 | yield line[i:i + self.max_sentence_length] 90 | i += self.max_sentence_length 91 | 92 | 93 | class VocabularyProcessorUtil(object): 94 | 95 | @staticmethod 96 | def building_model(documents, save_path, max_document_length=512, vocabulary=None, split_fn=default_split_fn): 97 | """ 98 | 基于传入的文档数据构建字典相关信息 99 | :param documents: 进行模型训练的时候的文本数据 100 | :param save_path: 模型持久化的路径 101 | :param vocabulary: 词汇映射表 102 | :param split_fn: 将文本转换为单词过程中的函数, 默认是将每个字当作一个单词 103 | :param max_document_length: 将文本单词id转换的时候,最长文本允许的单词数目 104 | :return: 105 | """ 106 | tf.logging.info("开始构建词汇转换模型.....") 107 | model = preprocessing.VocabularyProcessor(max_document_length=max_document_length, 108 | vocabulary=vocabulary, tokenizer_fn=split_fn) 109 | model.fit(raw_documents=documents) 110 | tf.logging.info("词汇转换模型构建完成,开始模型保存操作!!!") 111 | model.save(save_path) 112 | tf.logging.info("词汇转换模型保存完成,保存位置为:{}".format(save_path)) 113 | 114 | @staticmethod 115 | def load_model(save_path) -> preprocessing.VocabularyProcessor: 116 | """ 117 | 基于给定的路径加载模型并返回 118 | :param save_path: 119 | :return: 120 | """ 121 | if os.path.exists(save_path): 122 | tf.logging.info("从【{}】位置进行词汇转换模型的恢复!!!".format(save_path)) 123 | return preprocessing.VocabularyProcessor.restore(save_path) 124 | else: 125 | raise Exception("词汇转换模型不存在,请检查磁盘路径:{}".format(save_path)) 126 | 127 | @staticmethod 128 | def build_word2vec_embedding(data_path, save_path, embedding_dimensions): 129 | """ 130 | 基于data_path下的文件内容构建Word2Vec向量,并将向量保存到save_path这个路径中 131 | :param data_path: 原始数据所在的文件夹路径 132 | :param save_path: 训练好的数据保存路径 133 | :param embedding_dimensions: 转换的Embedding向量大小 134 | :return: 135 | """ 136 | # 0. 加载数据 137 | sentences = PathLineSentences(source=data_path, split_fn=split_with_word) 138 | # 1. 构建Word2Vec模型 139 | model = word2vec.Word2Vec(sentences=sentences, size=embedding_dimensions, 140 | window=9, min_count=2, iter=50) 141 | # 3. 模型保存(以文本形式保存) 142 | model.wv.save_word2vec_format(fname=save_path, binary=True) 143 | 144 | @staticmethod 145 | def load_word2vec_embedding(save_path): 146 | """ 147 | 加载Word2Vec训练好的embedding转换矩阵 148 | :param save_path: 数据存储的路径 149 | :param binary: 是否是二进制存储 150 | :return: embedding_table, vocabulary 151 | """ 152 | # 1. 加载数据 153 | model = word2vec.Word2VecKeyedVectors.load_word2vec_format(save_path, binary=True) 154 | # 2. 
获取embedding_table 155 | embedding_table = model.vectors 156 | embedding_dimensions = np.shape(embedding_table)[1] 157 | # 3. 获取单词和id之间的映射关系 158 | vocabulary = CategoricalVocabulary() 159 | vocab_size = vocabulary.vocab_size 160 | for word in model.vocab: 161 | vocabulary.set(word, model.vocab[word].index + vocab_size) 162 | # 4. 在embedding_table前面加入特征字符所代表的含义 163 | embedding_table = np.concatenate( 164 | [ 165 | np.zeros(shape=(1, embedding_dimensions), dtype=embedding_table.dtype), # PAD对应的的特征值 166 | np.random.normal(0, 0.01, size=(1, embedding_dimensions)), # UNK对应的特征值 167 | embedding_table # 原始单词对应的特征值 168 | ], 169 | axis=0 170 | ) 171 | return embedding_table, vocabulary 172 | 173 | 174 | if __name__ == '__main__': 175 | VocabularyProcessorUtil.build_word2vec_embedding("../data", "../model/w2v2.bin", 128) 176 | embedding_table, vob = VocabularyProcessorUtil.load_word2vec_embedding("../model/w2v.bin") 177 | print(vob.vocab_size) 178 | -------------------------------------------------------------------------------- /text_classsification/nets/text_transformer.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | attention_dimension_size=128, attention_layers=3, attention_headers=16, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param attention_dimension_size: Self Attention计算过程中的维度大小 32 | :param attention_layers: RNN的层次 33 | :param attention_headers: 头的数目 34 | """ 35 | self.attention_dimension_size = attention_dimension_size 36 | self.attention_layers = attention_layers 37 | self.attention_headers = attention_headers 38 | 39 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 40 | embedding_dimensions=embedding_dimensions, 41 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 42 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 43 | optimizer_type=optimizer_type, 44 | optimizer_parameters_func=optimizer_parameters_func, 45 | saver_parameters=saver_parameters) 46 | 47 | def interface(self): 48 | """ 49 | 前向网络构建 50 | batch_size: N 51 | feature height: H, 将序列长度T认为是H 52 | feature width: W,将Embedding size大小认为是W 53 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 54 | sentence_length: T 55 | embedding size: E 56 | :return: 57 | """ 58 | with tf.variable_scope(self.network_name): 59 | with slim.arg_scope(self.arg_score()): 60 | with tf.variable_scope("placeholders"): 61 | self.global_step = tf.train.get_or_create_global_step() 62 | # 输入的单词id,形状为:[N,T] 63 | 
self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 64 | # 希望输出的类别id, 形状为:[N,] 65 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 66 | # Dropout 67 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 68 | 69 | # 1. Embedding Layer(N,T,E) 70 | embedding_inputs = self.embedding_lookup(self.inputs) 71 | 72 | # 2. 使用Transformer来提取高阶特征 73 | with tf.variable_scope("transformer"): 74 | with tf.variable_scope("Input"): 75 | encoder_input = tf.layers.dense(embedding_inputs, units=self.attention_dimension_size, 76 | activation=tf.nn.relu) 77 | 78 | for layer in range(self.attention_layers): 79 | with tf.variable_scope("Encoder_{}".format(layer)): 80 | # 1. 得到各个头的信息 81 | attention_outputs = [] 82 | for header in range(self.attention_headers): 83 | with tf.variable_scope("Header_{}".format(header)): 84 | attention_output = self._self_attention( 85 | H=encoder_input, 86 | attention_dimension_size=self.attention_dimension_size 87 | ) 88 | attention_outputs.append(attention_output) 89 | 90 | # 2. 拼接 91 | attention_output = tf.concat(attention_outputs, axis=-1) 92 | 93 | # 3. 做一个线性转换 94 | attention_output = tf.layers.dense(attention_output, 95 | units=self.attention_dimension_size, 96 | activation=None) 97 | 98 | # 4. 将当前层的输出和当前层的输入做一个残差结构 99 | attention_output = tf.nn.relu(attention_output + encoder_input) 100 | 101 | # 5. 将当前层输出作为下一层的输入 102 | encoder_input = attention_output 103 | 104 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 105 | with tf.variable_scope("merge_feature"): 106 | # 4. 将所有时刻的特征信息求均值 107 | features = tf.reduce_mean(attention_output, axis=1) 108 | 109 | # 4. FFN+Softmax做最终的决策输出 110 | with tf.variable_scope("project"): 111 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 112 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 113 | self.logits = tf.identity(score, 'logits') 114 | # 得到N个文本分别属于各个类别的概率值 115 | self.probability = tf.nn.softmax(self.logits, name='probability') 116 | # 得到最终的预测id 117 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 118 | 119 | # 配置一个参数表示仅恢复模型参数 120 | self.saver_parameters['var_list'] = tf.global_variables() 121 | 122 | def _self_attention(self, H, attention_dimension_size): 123 | """ 124 | 计算Self-Attention 125 | :param H: [N,T,E], N个序列,每个序列T个时刻,每个时刻E维的向量 126 | :return: 127 | """ 128 | # 0. 获取大小信息 129 | hidden_size = H.shape[-1] 130 | batch_size, sequence_length, _ = tf.unstack(tf.shape(H)) 131 | # 1. 对输入数据reshape操作 132 | H = tf.reshape(H, shape=tf.stack([batch_size * sequence_length, hidden_size])) 133 | # 2. 分别计算Q、K、V 134 | Q = tf.layers.dense(H, units=attention_dimension_size) 135 | K = tf.layers.dense(H, units=attention_dimension_size) 136 | V = tf.layers.dense(H, units=attention_dimension_size, activation=tf.nn.relu) 137 | # 3. Reshape 138 | Q = tf.reshape(Q, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 139 | K = tf.reshape(K, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 140 | V = tf.reshape(V, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 141 | # 4. 计算相关性([N,T,E],[N,T,E],F,T) --> [N,T,T] 142 | scores = tf.matmul(Q, K, False, True) / np.sqrt(attention_dimension_size) 143 | # 5. 计算概率值([N,T,T]) 144 | weights = tf.nn.softmax(scores) 145 | # 6. 
计算最终结果 146 | attention = tf.matmul(weights, V) 147 | return attention 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /TextCNN/model_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import data_utils 6 | from tqdm import tqdm 7 | 8 | class TextCNN(nn.Module): 9 | """ 10 | TextCNN模型 11 | """ 12 | def __init__(self, args): 13 | super(TextCNN, self).__init__() 14 | self.args = args 15 | # 单词表的长度,用于做词嵌入lookingup查表 16 | vocab_num = args.vocab_num 17 | # 词嵌入后生成的单词的维度 18 | embed_dim = args.embed_dim 19 | # 类别的维度 20 | class_num = args.class_num 21 | # 卷积核的起始输入维度,因为开始只有1个维度输入,所以默认in_channels维度是1 22 | kernel_in = 1 23 | # 卷积核的数量,等于输出的卷积核的channel数量 24 | kernel_num = args.kernel_num 25 | #卷积核尺寸,是一个列表[3,4,5] 26 | kernel_sizes = args.kernel_sizes 27 | #单词做lookingup查表的词嵌入,将词id变成词向量 28 | self.embed = nn.Embedding(vocab_num, embed_dim) 29 | #ModuleList,子模型作为一个列表传入, kernel_size卷积核的尺寸,这里是分别是[3,embed_dim], [4,embed_dim], [5,embed_dim] 30 | #kernel_size卷积核的尺寸的形状是[H,W], 高是3,代表3个词之间的关系 31 | self.convs1 = nn.ModuleList( 32 | [nn.Conv2d(in_channels=kernel_in, out_channels=kernel_num, kernel_size=(size, embed_dim)) 33 | for size in kernel_sizes] 34 | ) 35 | #做一次dropout 36 | self.dropout = nn.Dropout(args.dropout) 37 | #做全连接, 输入维度是len(kernel_sizes) * kernel_num, 因为是把所有卷积后的结果进行拼接,所以这个是拼接后的维度,class_num是要预测的类别 38 | self.fc1 = nn.Linear(len(kernel_sizes) * kernel_num, class_num) 39 | 40 | def conv_and_pool(self, x, conv): 41 | x = F.relu(conv(x)).squeeze(3) # (N, Co, W) 42 | x = F.max_pool1d(x, x.size(2)).squeeze(2) 43 | return x 44 | 45 | def forward(self, text): 46 | """ 47 | 前向网络 48 | :param text: text的维度是[batch_size, sequence_length], 输入的是text的单词的id 49 | :return: 50 | """ 51 | # 对text进行embedding lookup,生成的维度是[batch_size,sequence_length,Embedding_demission], 即[N,W,D] 52 | x = self.embed(text) 53 | # 添加一个维度,用于卷积,在第二个维度上扩充,变成[batch_size,1,sequence_length,Embedding_demission], 54 | x = x.unsqueeze(1) 55 | # 使用ModuleList中的卷积,卷积后进行relu激活,激活后, 56 | #第一次卷积Conv2d(1, 100, kernel_size=(3, 128), stride=(1, 1)),输入的x[batch_size,1,sequence_length,Embedding_demission], 卷积后x[batch_size,kernel_num,sequence_length,1], squeeze最后一个维度 57 | #第二次Conv2d(1, 100, kernel_size=(4, 128), stride=(1, 1)), 输出的形状和第一次相同 58 | #第三次Conv2d(1, 100, kernel_size=(5, 128), stride=(1, 1)), 输出的形状和第一次相同 59 | x_conv_pool_result = [] 60 | #分别进行3次卷积,x_conv_result存储3次卷积的结果, 分布进行池化操作 61 | for conv in self.convs1: 62 | #输入的x[batch_size,1,sequence_length,Embedding_demission], 卷积后x[batch_size,kernel_num,sequence_length,1] 63 | x1 = conv(x) 64 | #激活不改变形状 65 | x1 = F.relu(x1) 66 | #squeeze后 [batch_size, kernel_num, sequence_length】 67 | x1 = x1.squeeze(3) 68 | #x1的shape是[batch_size,kernel_num,sequence_length], 设置kernel_size的大小是sequence_length * sequence_length 69 | x1 = F.max_pool1d(x1, kernel_size=x1.size(2)) 70 | # max_pool1d后输出的x1的shape是[batch_size, kernel_num, 1] 71 | x1 = x1.squeeze(2) 72 | #squeeze后的shape是[batch_size, kernel_num] 73 | x_conv_pool_result.append(x1) 74 | #拼接输出结果, 形状是[batch_size, kernel_num*卷积的次数] 75 | x = torch.cat(x_conv_pool_result, 1) 76 | #做一次dropout, 形状不变 77 | x = self.dropout(x) 78 | #做全连接后得到输出结果 [batch_size, class_num] 79 | logit = self.fc1(x) 80 | return logit 81 | 82 | 83 | def train(train_iter, model, args): 84 | """ 85 | 训练 86 | :param train_iter: 训练数据 87 | :param model: 模型,例如初始化的TextCNN 88 | 
:param args: paraser传入的config信息 89 | :return: 90 | """ 91 | print("开始训练模型") 92 | #创建优化器 93 | optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) 94 | model.train() 95 | 96 | #如果有GPU,使用gpu 97 | if args.cuda: 98 | model.cuda() 99 | 100 | for epoch in range(1, args.epochs+1): 101 | training_loss = 0.0 102 | training_acc = 0.0 103 | training_count = 0.0 104 | 105 | for batch in tqdm(train_iter): 106 | # batch.text返回的形状是[sequence_length, batch_size], batch.label[batch_size] 107 | feature, target = batch.text, batch.label 108 | #feautre进行转置,形状变成【batch_size, sequence_length] 109 | feature.t_() 110 | # 所有label的数值减去1 111 | target.sub_(1) 112 | #如果是gpu,转换成gpu资源 113 | if args.cuda: 114 | feature, target = feature.cuda(), target.cuda() 115 | 116 | optimizer.zero_grad() 117 | #得到预测结果 118 | logit = model(feature) 119 | #计算交叉熵损失 120 | loss = F.cross_entropy(logit, target) 121 | loss.backward() 122 | optimizer.step() 123 | #计算准确率 124 | corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum() 125 | #损失training_loss更新 126 | training_loss += loss.item() 127 | training_acc += corrects.item() 128 | training_count += batch.batch_size 129 | 130 | #计算平均算是和准确率 131 | training_loss /= training_count 132 | training_acc /= training_count 133 | accuracy = 100.0 * training_acc 134 | print('Training epoch [{}/{}] - loss: {:.6f} acc: {:.2f}%'.format( 135 | epoch, args.epochs, training_loss, accuracy)) 136 | #保存模型 137 | if epoch % args.save_interval == 0: 138 | torch.save(model, args.save_path + f"textcnn.model-{epoch}") 139 | print('保存模型完成') 140 | #训练完成后再次保存模型 141 | torch.save(model, args.save_path + "textcnn.model") 142 | print("训练完成") 143 | 144 | 145 | def eval(data_iter, model, args): 146 | """ 147 | 评估模型 148 | :param train_iter: 训练数据 149 | :param model: 模型,例如初始化的TextCNN 150 | :param args: paraser传入的config信息 151 | :return: 152 | """ 153 | print("开始评估模型") 154 | #设置评估模型 155 | model.eval() 156 | if args.cuda: 157 | model.cuda() 158 | #评估准确率和损失 159 | corrects, avg_loss = 0, 0 160 | for batch in data_iter: 161 | feature, target = batch.text, batch.label 162 | feature.t_() 163 | target.sub_(1) 164 | if args.cuda: 165 | feature, target = feature.cuda(), target.cuda() 166 | 167 | logit = model(feature) 168 | loss = F.cross_entropy(logit, target, size_average=False) 169 | 170 | avg_loss += loss.data.item() 171 | corrects += (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum() 172 | 173 | size = len(data_iter.dataset) 174 | avg_loss /= size 175 | accuracy = 100.0 * corrects/size 176 | print('Evaluation - loss: {:.6f} acc: {:.4f}%'.format(avg_loss, accuracy)) 177 | print('评估完成') 178 | return accuracy 179 | 180 | def predict(path, model, text_field, label_feild, cuda): 181 | """ 182 | 模型预测 183 | :param path: 要预测文本文件的路径 184 | :param model: 初始化好的模型 185 | :param text_field: text_field 文件 186 | :param label_feild: 187 | :param cuda: 是否使用gpu 188 | :return: 189 | """ 190 | model.eval() 191 | if cuda: 192 | model.cuda() 193 | 194 | document = '' 195 | with open(path, encoding="utf8", errors='ignore') as f: 196 | for line in f: 197 | if line != '\n': 198 | document += data_utils.text_filter(line) 199 | 200 | #对文本进行jieba处理 201 | text = text_field.preprocess(document) 202 | 203 | #文本转换成id 204 | text = [[text_field.vocab.stoi[x] for x in text]] 205 | x = torch.LongTensor(text) 206 | if cuda: 207 | x = x.cuda() 208 | #预测结果 209 | output = model(x) 210 | #获取概率最大的结果 211 | _, predicted = torch.max(output, 1) 212 | #预测的索引id转换成文字 213 | label = label_feild.vocab.itos[predicted.data[0] + 
1] 214 | return document, label 215 | -------------------------------------------------------------------------------- /text_classsification/nets/text_cnn_rnn.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextCNNRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_filters=128, region_sizes=[2, 3, 4], num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_filters: TextCNN 各个不同类型卷积核的数目,可以给定为int或者list 32 | :param region_sizes: TextCNN各个不同类别卷积核提取单词特征的单词数量范围 33 | :param num_units: RNN Cell中的神经元数目 34 | :param layers: RNN的层次 35 | """ 36 | self.num_units = num_units # RNN Cell的神经元数目 37 | self.layers = layers # RNN的层次 38 | self.region_sizes = region_sizes # 使用CNN提取特征信息的时候,提取范围大小 39 | if isinstance(num_filters, list): 40 | # 相当于针对每个范围给定不同的卷积核数目 41 | if len(region_sizes) != len(num_filters): 42 | raise Exception("resize_sizes和num_filters大小必须一致!!!") 43 | else: 44 | self.num_filters = num_filters 45 | elif isinstance(num_filters, int): 46 | self.num_filters = [num_filters] * len(region_sizes) 47 | else: 48 | raise Exception("参数num_filters仅支持int类型或者list类型数据!!") 49 | 50 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 51 | embedding_dimensions=embedding_dimensions, 52 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 53 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 54 | optimizer_type=optimizer_type, 55 | optimizer_parameters_func=optimizer_parameters_func, 56 | saver_parameters=saver_parameters) 57 | 58 | def interface(self): 59 | """ 60 | 前向网络构建 61 | batch_size: N 62 | feature height: H, 将序列长度T认为是H 63 | feature width: W,将Embedding size大小认为是W 64 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 65 | sentence_length: T 66 | embedding size: E 67 | :return: 68 | """ 69 | with tf.variable_scope(self.network_name): 70 | with slim.arg_scope(self.arg_score()): 71 | with tf.variable_scope("placeholders"): 72 | self.global_step = tf.train.get_or_create_global_step() 73 | # 输入的单词id,形状为:[N,T] 74 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 75 | # 希望输出的类别id, 形状为:[N,] 76 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 77 | # Dropout 78 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 79 | 80 | # 1. 
Embedding Layer 81 | # 将单词id转换为单词向量,[N,T] --> [N,T,E] 82 | embedding_inputs = self.embedding_lookup(self.inputs) 83 | # 增加维度信息,将其转换为四维对象, [N,T,E] --> [N,T,E,1] 84 | expanded_embedding_inputs = tf.expand_dims(embedding_inputs, axis=-1) 85 | 86 | # 2. 使用卷积来提取高阶特征 87 | outputs = [] 88 | with tf.variable_scope("cnn"): 89 | for idx, region_size in enumerate(self.region_sizes): 90 | with tf.variable_scope("conv-max-pooling-{}".format(idx)): 91 | conv2d_input = expanded_embedding_inputs 92 | # 卷积的功能相当于将region_size个单词看成一个整体,然后进行单词的特征向量信息的融合提取 93 | # 最终返回结果形状为: [N,T,1,C] 94 | # 为了保障卷积之后的Feature Map大小和原始大小一致(序列长度一致),所以这里进行数据的填充 95 | if region_size - 1 != 0: 96 | top = (region_size - 1) // 2 97 | bottom = region_size - 1 - top 98 | conv2d_input = tf.pad(conv2d_input, paddings=[[0, 0], [top, bottom], [0, 0], [0, 0]]) 99 | # 卷积(序列长度不变) 100 | conv = slim.conv2d( 101 | conv2d_input, # [N,T,E,1] 102 | num_outputs=self.num_filters[idx], # C, eg:2 103 | kernel_size=(region_size, self.embedding_dimensions) # (h,w), eg:(3,E) 104 | ) 105 | # 添加到临时列表中 106 | outputs.append(tf.squeeze(conv, axis=2)) 107 | with tf.variable_scope("rnn"): 108 | with tf.variable_scope("input"): 109 | # 数据合并,将不同卷积核提取的特征信息作为不同维度的特征 110 | rnn_input = tf.concat(outputs, axis=-1) 111 | 112 | with tf.variable_scope("feature"): 113 | with tf.variable_scope("rnn"): 114 | # a. 定义RNN的cell构建函数 115 | def cell(_units): 116 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 117 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, 118 | output_keep_prob=self.dropout_keep_prob) 119 | 120 | # b. 构建前向的cell和反向cell 121 | cell_fw = tf.nn.rnn_cell.MultiRNNCell( 122 | cells=[cell(self.num_units) for _ in range(self.layers)]) 123 | cell_bw = tf.nn.rnn_cell.MultiRNNCell( 124 | cells=[cell(self.num_units) for _ in range(self.layers)]) 125 | 126 | # c. 获取得到序列的输出向量 127 | # 数据都是按照原始的从左往右的序列得到的最终特征 128 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 129 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 130 | cell_fw, # 前向的RNN Cell 131 | cell_bw, # 反向的RNN Cell 132 | inputs=rnn_input, # 输入值, [N,T,E] 133 | dtype=tf.float32, # 给定RNN状态初始化值的类型 134 | ) 135 | 136 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 137 | with tf.variable_scope("merge_feature"): 138 | # 前向使用最后一个时刻,后向使用第一个时刻 139 | features = tf.concat([output_fw[:, -1, :], output_bw[:, 0, :]], axis=-1) 140 | 141 | # 4. 
FFN+Softmax做最终的决策输出 142 | with tf.variable_scope("project"): 143 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 144 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 145 | self.logits = tf.identity(score, 'logits') 146 | # 得到N个文本分别属于各个类别的概率值 147 | self.probability = tf.nn.softmax(self.logits, name='probability') 148 | # 得到最终的预测id 149 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 150 | 151 | # 配置一个参数表示仅恢复模型参数 152 | self.saver_parameters['var_list'] = tf.global_variables() 153 | -------------------------------------------------------------------------------- /langconv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from copy import deepcopy 5 | import re 6 | 7 | try: 8 | import psyco 9 | psyco.full() 10 | except: 11 | pass 12 | 13 | try: 14 | from zh_wiki import zh2Hant, zh2Hans 15 | except ImportError: 16 | from zhtools.zh_wiki import zh2Hant, zh2Hans 17 | 18 | import sys 19 | py3k = sys.version_info >= (3, 0, 0) 20 | 21 | if py3k: 22 | UEMPTY = '' 23 | else: 24 | _zh2Hant, _zh2Hans = {}, {} 25 | for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)): 26 | for k, v in old.items(): 27 | new[k.decode('utf8')] = v.decode('utf8') 28 | zh2Hant = _zh2Hant 29 | zh2Hans = _zh2Hans 30 | UEMPTY = ''.decode('utf8') 31 | 32 | # states 33 | (START, END, FAIL, WAIT_TAIL) = list(range(4)) 34 | # conditions 35 | (TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5)) 36 | 37 | MAPS = {} 38 | 39 | class Node(object): 40 | def __init__(self, from_word, to_word=None, is_tail=True, 41 | have_child=False): 42 | self.from_word = from_word 43 | if to_word is None: 44 | self.to_word = from_word 45 | self.data = (is_tail, have_child, from_word) 46 | self.is_original = True 47 | else: 48 | self.to_word = to_word or from_word 49 | self.data = (is_tail, have_child, to_word) 50 | self.is_original = False 51 | self.is_tail = is_tail 52 | self.have_child = have_child 53 | 54 | def is_original_long_word(self): 55 | return self.is_original and len(self.from_word)>1 56 | 57 | def is_follow(self, chars): 58 | return chars != self.from_word[:-1] 59 | 60 | def __str__(self): 61 | return '<Node, %s, %s, %s, %s>' % (repr(self.from_word), 62 | repr(self.to_word), self.is_tail, self.have_child) 63 | 64 | __repr__ = __str__ 65 | 66 | class ConvertMap(object): 67 | def __init__(self, name, mapping=None): 68 | self.name = name 69 | self._map = {} 70 | if mapping: 71 | self.set_convert_map(mapping) 72 | 73 | def set_convert_map(self, mapping): 74 | convert_map = {} 75 | have_child = {} 76 | max_key_length = 0 77 | for key in sorted(mapping.keys()): 78 | if len(key)>1: 79 | for i in range(1, len(key)): 80 | parent_key = key[:i] 81 | have_child[parent_key] = True 82 | have_child[key] = False 83 | max_key_length = max(max_key_length, len(key)) 84 | for key in sorted(have_child.keys()): 85 | convert_map[key] = (key in mapping, have_child[key], 86 | mapping.get(key, UEMPTY)) 87 | self._map = convert_map 88 | self.max_key_length = max_key_length 89 | 90 | def __getitem__(self, k): 91 | try: 92 | is_tail, have_child, to_word = self._map[k] 93 | return Node(k, to_word, is_tail, have_child) 94 | except: 95 | return Node(k) 96 | 97 | def __contains__(self, k): 98 | return k in self._map 99 | 100 | def __len__(self): 101 | return len(self._map) 102 | 103 | class StatesMachineException(Exception): pass 104 | 105 | class StatesMachine(object): 106 | def __init__(self): 107 | self.state = START 
108 | self.final = UEMPTY 109 | self.len = 0 110 | self.pool = UEMPTY 111 | 112 | def clone(self, pool): 113 | new = deepcopy(self) 114 | new.state = WAIT_TAIL 115 | new.pool = pool 116 | return new 117 | 118 | def feed(self, char, map): 119 | node = map[self.pool+char] 120 | 121 | if node.have_child: 122 | if node.is_tail: 123 | if node.is_original: 124 | cond = UNMATCHED_SWITCH 125 | else: 126 | cond = MATCHED_SWITCH 127 | else: 128 | cond = CONNECTOR 129 | else: 130 | if node.is_tail: 131 | cond = TAIL 132 | else: 133 | cond = ERROR 134 | 135 | new = None 136 | if cond == ERROR: 137 | self.state = FAIL 138 | elif cond == TAIL: 139 | if self.state == WAIT_TAIL and node.is_original_long_word(): 140 | self.state = FAIL 141 | else: 142 | self.final += node.to_word 143 | self.len += 1 144 | self.pool = UEMPTY 145 | self.state = END 146 | elif self.state == START or self.state == WAIT_TAIL: 147 | if cond == MATCHED_SWITCH: 148 | new = self.clone(node.from_word) 149 | self.final += node.to_word 150 | self.len += 1 151 | self.state = END 152 | self.pool = UEMPTY 153 | elif cond == UNMATCHED_SWITCH or cond == CONNECTOR: 154 | if self.state == START: 155 | new = self.clone(node.from_word) 156 | self.final += node.to_word 157 | self.len += 1 158 | self.state = END 159 | else: 160 | if node.is_follow(self.pool): 161 | self.state = FAIL 162 | else: 163 | self.pool = node.from_word 164 | elif self.state == END: 165 | # END is a new START 166 | self.state = START 167 | new = self.feed(char, map) 168 | elif self.state == FAIL: 169 | raise StatesMachineException('Translate States Machine ' 170 | 'have error with input data %s' % node) 171 | return new 172 | 173 | def __len__(self): 174 | return self.len + 1 175 | 176 | def __str__(self): 177 | return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % ( 178 | id(self), self.pool, self.state, self.final) 179 | __repr__ = __str__ 180 | 181 | class Converter(object): 182 | def __init__(self, to_encoding): 183 | self.to_encoding = to_encoding 184 | self.map = MAPS[to_encoding] 185 | self.start() 186 | 187 | def feed(self, char): 188 | branches = [] 189 | for fsm in self.machines: 190 | new = fsm.feed(char, self.map) 191 | if new: 192 | branches.append(new) 193 | if branches: 194 | self.machines.extend(branches) 195 | self.machines = [fsm for fsm in self.machines if fsm.state != FAIL] 196 | all_ok = True 197 | for fsm in self.machines: 198 | if fsm.state != END: 199 | all_ok = False 200 | if all_ok: 201 | self._clean() 202 | return self.get_result() 203 | 204 | def _clean(self): 205 | if len(self.machines): 206 | self.machines.sort(key=lambda x: len(x)) 207 | # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y))) 208 | self.final += self.machines[0].final 209 | self.machines = [StatesMachine()] 210 | 211 | def start(self): 212 | self.machines = [StatesMachine()] 213 | self.final = UEMPTY 214 | 215 | def end(self): 216 | self.machines = [fsm for fsm in self.machines 217 | if fsm.state == FAIL or fsm.state == END] 218 | self._clean() 219 | 220 | def convert(self, string): 221 | self.start() 222 | for char in string: 223 | self.feed(char) 224 | self.end() 225 | return self.get_result() 226 | 227 | def get_result(self): 228 | return self.final 229 | 230 | 231 | def registery(name, mapping): 232 | global MAPS 233 | MAPS[name] = ConvertMap(name, mapping) 234 | 235 | registery('zh-hant', zh2Hant) 236 | registery('zh-hans', zh2Hans) 237 | del zh2Hant, zh2Hans 238 | 239 | 240 | def run(): 241 | import sys 242 | from optparse import OptionParser 243 | parser = OptionParser() 244 | parser.add_option('-e', 
type='string', dest='encoding', 245 | help='encoding') 246 | parser.add_option('-f', type='string', dest='file_in', 247 | help='input file (- for stdin)') 248 | parser.add_option('-t', type='string', dest='file_out', 249 | help='output file') 250 | (options, args) = parser.parse_args() 251 | if not options.encoding: 252 | parser.error('encoding must be set') 253 | if options.file_in: 254 | if options.file_in == '-': 255 | file_in = sys.stdin 256 | else: 257 | file_in = open(options.file_in) 258 | else: 259 | file_in = sys.stdin 260 | if options.file_out: 261 | if options.file_out == '-': 262 | file_out = sys.stdout 263 | else: 264 | file_out = open(options.file_out, 'wb') 265 | else: 266 | file_out = sys.stdout 267 | 268 | c = Converter(options.encoding) 269 | for line in file_in: 270 | # print >> file_out, c.convert(line.rstrip('\n').decode( 271 | file_out.write(c.convert(line.rstrip('\n').decode( 272 | 'utf8')).encode('utf8')) 273 | 274 | 275 | if __name__ == '__main__': 276 | run() 277 | 278 | -------------------------------------------------------------------------------- /text_classsification/nets/text_rnn_transformer.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, 19 | attention_dimension_size=128, attention_layers=3, attention_headers=16, *args, **kwargs): 20 | """ 21 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 22 | :param vocab_size: 词汇数目 23 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 24 | :param embedding_table: 训练好的单词向量映射表 25 | :param train_embedding_table: 是否训练train_embedding_table的参数值 26 | :param num_class: 类别数目 27 | :param network_name: 网络名称 28 | :param weight_decay: L2正则项的系数 29 | :param optimizer_type: 优化器的类别 30 | :param optimizer_parameters_func: 构建优化器的参数的函数 31 | :param saver_parameters: 模型持久化器的参数 32 | :param num_units: RNN Cell中的神经元数目 33 | :param layers: RNN的层次 34 | :param attention_dimension_size: Self Attention计算过程中的维度大小 35 | :param attention_layers: Transformer的层次 36 | :param attention_headers: 头的数目 37 | """ 38 | self.attention_dimension_size = attention_dimension_size 39 | self.attention_layers = attention_layers 40 | self.attention_headers = attention_headers 41 | self.num_units = num_units # RNN Cell的神经元数目 42 | self.layers = layers # RNN的层次 43 | 44 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 45 | embedding_dimensions=embedding_dimensions, 46 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 47 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 48 | optimizer_type=optimizer_type, 49 | optimizer_parameters_func=optimizer_parameters_func, 50 | saver_parameters=saver_parameters) 51 | 52 | def interface(self): 53 | """ 54 | 前向网络构建 55 | batch_size: N 56 | feature height: H, 将序列长度T认为是H 57 | feature width: W,将Embedding size大小认为是W 58 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 59 | sentence_length: T 60 | embedding size: E 
61 | :return: 62 | """ 63 | with tf.variable_scope(self.network_name): 64 | with slim.arg_scope(self.arg_score()): 65 | with tf.variable_scope("placeholders"): 66 | self.global_step = tf.train.get_or_create_global_step() 67 | # 输入的单词id,形状为:[N,T] 68 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 69 | # 希望输出的类别id, 形状为:[N,] 70 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 71 | # Dropout 72 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 73 | 74 | # 1. Embedding Layer(N,T,E) 75 | embedding_inputs = self.embedding_lookup(self.inputs) 76 | 77 | # 2. 使用Transformer来提取高阶特征 78 | with tf.variable_scope("rnn"): 79 | # a. 定义RNN的cell构建函数 80 | def cell(_units): 81 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 82 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 83 | 84 | # b. 构建前向的cell和反向cell 85 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 86 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 87 | 88 | # c. 获取得到序列的输出向量 89 | # 数据都是按照原始的从左往右的序列得到的最终特征 90 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 91 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 92 | cell_fw, # 前向的RNN Cell 93 | cell_bw, # 反向的RNN Cell 94 | inputs=embedding_inputs, # 输入值, [N,T,E] 95 | dtype=tf.float32, # 给定RNN状态初始化值的类型 96 | ) 97 | 98 | with tf.variable_scope("transformer"): 99 | with tf.variable_scope("Input"): 100 | encoder_input = tf.layers.dense(tf.concat([output_fw, output_bw], axis=-1), 101 | units=self.attention_dimension_size, 102 | activation=tf.nn.relu) 103 | 104 | for layer in range(self.attention_layers): 105 | with tf.variable_scope("Encoder_{}".format(layer)): 106 | # 1. 得到各个头的信息 107 | attention_outputs = [] 108 | for header in range(self.attention_headers): 109 | with tf.variable_scope("Header_{}".format(header)): 110 | attention_output = self._self_attention( 111 | H=encoder_input, 112 | attention_dimension_size=self.attention_dimension_size 113 | ) 114 | attention_outputs.append(attention_output) 115 | 116 | # 2. 拼接 117 | attention_output = tf.concat(attention_outputs, axis=-1) 118 | 119 | # 3. 做一个线性转换 120 | attention_output = tf.layers.dense(attention_output, 121 | units=self.attention_dimension_size, 122 | activation=None) 123 | 124 | # 4. 将当前层的输出和当前层的输入做一个残差结构 125 | attention_output = tf.nn.relu(attention_output + encoder_input) 126 | 127 | # 5. 将当前层输出作为下一层的输入 128 | encoder_input = attention_output 129 | 130 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 131 | with tf.variable_scope("merge_feature"): 132 | # 4. 将所有时刻的特征信息求均值 133 | features = tf.reduce_mean(attention_output, axis=1) 134 | 135 | # 4. 
FFN+Softmax做最终的决策输出 136 | with tf.variable_scope("project"): 137 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 138 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 139 | self.logits = tf.identity(score, 'logits') 140 | # 得到N个文本分别属于各个类别的概率值 141 | self.probability = tf.nn.softmax(self.logits, name='probability') 142 | # 得到最终的预测id 143 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 144 | 145 | # 配置一个参数表示仅恢复模型参数 146 | self.saver_parameters['var_list'] = tf.global_variables() 147 | 148 | def _self_attention(self, H, attention_dimension_size): 149 | """ 150 | 计算Self-Attention 151 | :param H: [N,T,E], N个序列,每个序列T个时刻,每个时刻E维的向量 152 | :return: 153 | """ 154 | # 0. 获取大小信息 155 | hidden_size = H.shape[-1] 156 | batch_size, sequence_length, _ = tf.unstack(tf.shape(H)) 157 | # 1. 对输入数据reshape操作 158 | H = tf.reshape(H, shape=tf.stack([batch_size * sequence_length, hidden_size])) 159 | # 2. 分别计算Q、K、V 160 | Q = tf.layers.dense(H, units=attention_dimension_size) 161 | K = tf.layers.dense(H, units=attention_dimension_size) 162 | V = tf.layers.dense(H, units=attention_dimension_size, activation=tf.nn.relu) 163 | # 3. Reshape 164 | Q = tf.reshape(Q, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 165 | K = tf.reshape(K, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 166 | V = tf.reshape(V, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 167 | # 4. 计算相关性([N,T,E],[N,T,E],F,T) --> [N,T,T] 168 | scores = tf.matmul(Q, K, False, True) / np.sqrt(attention_dimension_size) 169 | # 5. 计算概率值([N,T,T]) 170 | weights = tf.nn.softmax(scores) 171 | # 6. 计算最终结果 172 | attention = tf.matmul(weights, V) 173 | return attention 174 | -------------------------------------------------------------------------------- /text_classsification/nets/text_cnn_transformer.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextCNNRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_filters=128, region_sizes=[2, 3, 4], 19 | attention_dimension_size=128, attention_layers=3, attention_headers=16, *args, **kwargs): 20 | """ 21 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 22 | :param vocab_size: 词汇数目 23 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 24 | :param embedding_table: 训练好的单词向量映射表 25 | :param train_embedding_table: 是否训练train_embedding_table的参数值 26 | :param num_class: 类别数目 27 | :param network_name: 网络名称 28 | :param weight_decay: L2正则项的系数 29 | :param optimizer_type: 优化器的类别 30 | :param optimizer_parameters_func: 构建优化器的参数的函数 31 | :param saver_parameters: 模型持久化器的参数 32 | :param num_filters: TextCNN 各个不同类型卷积核的数目,可以给定为int或者list 33 | :param region_sizes: TextCNN各个不同类别卷积核提取单词特征的单词数量范围 34 | :param attention_dimension_size: Self Attention计算过程中的维度大小 35 | :param attention_layers: Transformer的层次 36 | :param attention_headers: 头的数目 37 | """ 38 | self.attention_dimension_size = attention_dimension_size 39 | self.attention_layers = attention_layers 40 | 
self.attention_headers = attention_headers 41 | self.region_sizes = region_sizes # 使用CNN提取特征信息的时候,提取范围大小 42 | if isinstance(num_filters, list): 43 | # 相当于针对每个范围给定不同的卷积核数目 44 | if len(region_sizes) != len(num_filters): 45 | raise Exception("resize_sizes和num_filters大小必须一致!!!") 46 | else: 47 | self.num_filters = num_filters 48 | elif isinstance(num_filters, int): 49 | self.num_filters = [num_filters] * len(region_sizes) 50 | else: 51 | raise Exception("参数num_filters仅支持int类型或者list类型数据!!") 52 | 53 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 54 | embedding_dimensions=embedding_dimensions, 55 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 56 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 57 | optimizer_type=optimizer_type, 58 | optimizer_parameters_func=optimizer_parameters_func, 59 | saver_parameters=saver_parameters) 60 | 61 | def interface(self): 62 | """ 63 | 前向网络构建 64 | batch_size: N 65 | feature height: H, 将序列长度T认为是H 66 | feature width: W,将Embedding size大小认为是W 67 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 68 | sentence_length: T 69 | embedding size: E 70 | :return: 71 | """ 72 | with tf.variable_scope(self.network_name): 73 | with slim.arg_scope(self.arg_score()): 74 | with tf.variable_scope("placeholders"): 75 | self.global_step = tf.train.get_or_create_global_step() 76 | # 输入的单词id,形状为:[N,T] 77 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 78 | # 希望输出的类别id, 形状为:[N,] 79 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 80 | # Dropout 81 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 82 | 83 | # 1. Embedding Layer 84 | # 将单词id转换为单词向量,[N,T] --> [N,T,E] 85 | embedding_inputs = self.embedding_lookup(self.inputs) 86 | # 增加维度信息,将其转换为四维对象, [N,T,E] --> [N,T,E,1] 87 | expanded_embedding_inputs = tf.expand_dims(embedding_inputs, axis=-1) 88 | 89 | # 2. 使用卷积来提取高阶特征 90 | outputs = [] 91 | with tf.variable_scope("cnn"): 92 | for idx, region_size in enumerate(self.region_sizes): 93 | with tf.variable_scope("conv-max-pooling-{}".format(idx)): 94 | conv2d_input = expanded_embedding_inputs 95 | # 卷积的功能相当于将region_size个单词看成一个整体,然后进行单词的特征向量信息的融合提取 96 | # 最终返回结果形状为: [N,T,1,C] 97 | # 为了保障卷积之后的Feature Map大小和原始大小一致(序列长度一致),所以这里进行数据的填充 98 | if region_size - 1 != 0: 99 | top = (region_size - 1) // 2 100 | bottom = region_size - 1 - top 101 | conv2d_input = tf.pad(conv2d_input, paddings=[[0, 0], [top, bottom], [0, 0], [0, 0]]) 102 | # 卷积(序列长度不变) 103 | conv = slim.conv2d( 104 | conv2d_input, # [N,T,E,1] 105 | num_outputs=self.num_filters[idx], # C, eg:2 106 | kernel_size=(region_size, self.embedding_dimensions) # (h,w), eg:(3,E) 107 | ) 108 | 109 | # 最大池化(变成原来的一半), 将相邻单词的特征提取主要特征信息 110 | pool = slim.max_pool2d(conv, (3, 1), stride=[2, 1]) 111 | 112 | # 添加到临时列表中 113 | outputs.append(tf.squeeze(pool, axis=2)) 114 | 115 | with tf.variable_scope("transformer"): 116 | with tf.variable_scope("Input"): 117 | encoder_input = tf.layers.dense(tf.concat(outputs, axis=-1), 118 | units=self.attention_dimension_size, 119 | activation=tf.nn.relu) 120 | 121 | for layer in range(self.attention_layers): 122 | with tf.variable_scope("Encoder_{}".format(layer)): 123 | # 1. 
得到各个头的信息 124 | attention_outputs = [] 125 | for header in range(self.attention_headers): 126 | with tf.variable_scope("Header_{}".format(header)): 127 | attention_output = self._self_attention( 128 | H=encoder_input, 129 | attention_dimension_size=self.attention_dimension_size 130 | ) 131 | attention_outputs.append(attention_output) 132 | 133 | # 2. 拼接 134 | attention_output = tf.concat(attention_outputs, axis=-1) 135 | 136 | # 3. 做一个线性转换 137 | attention_output = tf.layers.dense(attention_output, 138 | units=self.attention_dimension_size, 139 | activation=None) 140 | 141 | # 4. 将当前层的输出和当前层的输入做一个残差结构 142 | attention_output = tf.nn.relu(attention_output + encoder_input) 143 | 144 | # 5. 将当前层输出作为下一层的输入 145 | encoder_input = attention_output 146 | 147 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 148 | with tf.variable_scope("merge_feature"): 149 | features = tf.reduce_mean(attention_output, axis=1) 150 | 151 | # 4. FFN+Softmax做最终的决策输出 152 | with tf.variable_scope("project"): 153 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 154 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 155 | self.logits = tf.identity(score, 'logits') 156 | # 得到N个文本分别属于各个类别的概率值 157 | self.probability = tf.nn.softmax(self.logits, name='probability') 158 | # 得到最终的预测id 159 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 160 | 161 | # 配置一个参数表示仅恢复模型参数 162 | self.saver_parameters['var_list'] = tf.global_variables() 163 | 164 | def _self_attention(self, H, attention_dimension_size): 165 | """ 166 | 计算Self-Attention 167 | :param H: [N,T,E], N个序列,每个序列T个时刻,每个时刻E维的向量 168 | :return: 169 | """ 170 | # 0. 获取大小信息 171 | hidden_size = H.shape[-1] 172 | batch_size, sequence_length, _ = tf.unstack(tf.shape(H)) 173 | # 1. 对输入数据reshape操作 174 | H = tf.reshape(H, shape=tf.stack([batch_size * sequence_length, hidden_size])) 175 | # 2. 分别计算Q、K、V 176 | Q = tf.layers.dense(H, units=attention_dimension_size) 177 | K = tf.layers.dense(H, units=attention_dimension_size) 178 | V = tf.layers.dense(H, units=attention_dimension_size, activation=tf.nn.relu) 179 | # 3. Reshape 180 | Q = tf.reshape(Q, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 181 | K = tf.reshape(K, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 182 | V = tf.reshape(V, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 183 | # 4. 计算相关性([N,T,E],[N,T,E],F,T) --> [N,T,T] 184 | scores = tf.matmul(Q, K, False, True) / np.sqrt(attention_dimension_size) 185 | # 5. 计算概率值([N,T,T]) 186 | weights = tf.nn.softmax(scores) 187 | # 6. 
计算最终结果 188 | attention = tf.matmul(weights, V) 189 | return attention 190 | -------------------------------------------------------------------------------- /text_classsification/nets/base_model.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from tensorflow.contrib import slim 6 | 7 | from nets.metric import Metrics 8 | 9 | 10 | class Network(object): 11 | 12 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 13 | embedding_table=None, train_embedding_table=False, 14 | num_class=2, network_name="TextCNN", weight_decay=0.01, 15 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 16 | *args, **kwargs): 17 | """ 18 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 19 | :param vocab_size: 词汇数目 20 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 21 | :param embedding_table: 训练好的单词向量映射表 22 | :param train_embedding_table: 是否训练train_embedding_table的参数值 23 | :param num_class: 类别数目 24 | :param network_name: 网络名称 25 | :param weight_decay: L2正则项的系数 26 | :param optimizer_type: 优化器的类别 27 | :param optimizer_parameters_func: 构建优化器的参数的函数 28 | :param saver_parameters: 模型持久化器的参数 29 | :param num_filters: TextCNN 各个不同类型卷积核的数目,可以给定为int或者list 30 | :param region_sizes: TextCNN各个不同类别卷积核提取单词特征的单词数量范围 31 | """ 32 | self.with_word2vec = with_word2vec 33 | self.weight_decay = weight_decay # 正则的权重系数 34 | self.network_name = network_name # 网络名称 35 | self.vocab_size = vocab_size # 词汇表大小 36 | self.embedding_dimensions = embedding_dimensions # 做单词id转换为向量的时候,向量维度大小 37 | self.input_embedding_table = embedding_table 38 | self.train_embedding_table = train_embedding_table 39 | self.num_class = num_class # 类别数目 40 | 41 | if self.with_word2vec: 42 | if self.input_embedding_table is None or np.ndim(self.input_embedding_table) != 2: 43 | tf.logging.warn("当参数with_word2vec为True的时候,必须给定embedding_table的2维转换矩阵值!!") 44 | self.with_word2vec = False 45 | else: 46 | self.vocab_size, self.embedding_dimensions = np.shape(self.input_embedding_table) 47 | else: 48 | if self.embedding_dimensions is None or self.vocab_size is None: 49 | raise Exception("当参数with_word2vec为False的时候,必须给定embedding_dimensions和vocab_size的参数值!!") 50 | 51 | self.global_step = None # Tensor变量对象,用于记录模型的更新次数 52 | self.embedding_table = None # 做词嵌入的变量 53 | self.inputs = None # 输入的文本单词id,[None,None] 54 | self.targets = None # 实际标签下标对象, [None,] 55 | self.dropout_keep_prob = None # Drouout系数 56 | self.logits = None # 模型前向网络执行之后得到的预测置信度信息,[None, num_class] 57 | self.probability = None # 模型前向网络执行之后得到的预测概率信息, [None, num_class] 58 | self.predictions = None # 模型前向网络的预测结果/类别下标,[None,] 59 | self.saver = None # 模型持久化的对象 60 | self.saver_parameters = saver_parameters # 初始化模型持久化对象的参数 61 | 62 | self.optimizer_type = optimizer_type # 优化器类型 63 | self.optimizer_parameters_func = optimizer_parameters_func # 优化器参数 64 | 65 | self.interface() 66 | 67 | def arg_score(self): 68 | """ 69 | 作用域默认参数给定 70 | :return: 71 | """ 72 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 73 | activation_fn=tf.nn.relu, 74 | weights_regularizer=slim.l2_regularizer(self.weight_decay), 75 | weights_initializer=tf.contrib.layers.xavier_initializer(), 76 | biases_initializer=tf.zeros_initializer()): 77 | with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='VALID', stride=1) as sc: 78 | return sc 79 | 80 | def embedding_lookup(self, inputs): 81 | """ 82 | 对输入做一个Embedding转换处理 
83 | :param inputs: 输入的Tensor对象 84 | :return: 85 | """ 86 | with tf.device('/cpu:0'), tf.variable_scope('embedding'): 87 | if self.with_word2vec: 88 | tf.logging.info("Embedding Table初始化使用Word2Vec训练好的转换参数.....") 89 | _embedding = tf.get_variable( 90 | name='embedding_table', 91 | shape=[self.vocab_size, self.embedding_dimensions], 92 | initializer=tf.constant_initializer(value=self.input_embedding_table), 93 | trainable=self.train_embedding_table # 给定是否参与模型训练 94 | ) 95 | else: 96 | tf.logging.info("Embedding Table初始化使用随机初始化值.....") 97 | _embedding = tf.get_variable(name='embedding_table', 98 | shape=[self.vocab_size, self.embedding_dimensions]) 99 | self.embedding_table = _embedding 100 | # 将单词id转换为单词向量,[N,T] --> [N,T,E] 101 | embedding_inputs = tf.nn.embedding_lookup(self.embedding_table, inputs) 102 | return embedding_inputs 103 | 104 | def interface(self): 105 | raise NotImplementedError("请实现具体的interface代码,用于构建前向网络结构!!!") 106 | 107 | def losses(self): 108 | """ 109 | 计算损失函数,并返回对应的Tensor对象值 110 | 基于预测的置信度logits以及实际的标签值来构建分类损失函数 111 | :return: 112 | """ 113 | with tf.name_scope("Loss"): 114 | # 1. 计算实际值和预测值之间差值所导致的损失值 115 | if self.num_class == 2: 116 | # 二分类,可以考虑使用sigmoid交叉熵损失函数 117 | # 将id哑编码: [None,] --> [None,num_class] 118 | labels = tf.one_hot(self.targets, depth=self.num_class) 119 | # 计算损失:([None,num_class], [None,num_class]) --> [None,] 120 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=self.logits) 121 | # 所有样本损失合并求均值:[None,] --> [] 122 | loss = tf.reduce_mean(loss) 123 | else: 124 | # 多分类,考虑使用softmax交叉熵损失函数 125 | # 基于id和logits置信度直接计算损失: ([None,], [None,num_class]) --> [None,] 126 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.targets, logits=self.logits) 127 | # 所有样本损失合并求均值:[None,] --> [] 128 | loss = tf.reduce_mean(loss) 129 | 130 | # 2. 将损失添加到collection中 131 | tf.losses.add_loss(loss) 132 | 133 | # 3. 获取所有损失合并之后的值(分类损失、正则损失等等) 134 | total_loss = tf.losses.get_total_loss(name='total_loss') 135 | 136 | # 4. 可视化操作 137 | tf.summary.scalar('loss', loss) 138 | tf.summary.scalar('total_loss', total_loss) 139 | return total_loss 140 | 141 | def optimizer(self, loss=None, return_train_operation=True): 142 | """ 143 | 构建优化器,并根据参数return_train_operation决定是否返回训练对象 144 | :param loss: 如果return_train_operation为True,那么loss参数必须有值,并且表示为损失值 145 | :param return_train_operation: True or False,True表示返回训练对象,False表示不返回 146 | :return: 如果return_train_operation为True,返回优化器以及训练操作对象,否则仅返回优化器本身 147 | """ 148 | if return_train_operation and loss is None: 149 | raise Exception("当需要返回训练对象的时候,loss参数必须有值!!") 150 | 151 | with tf.name_scope("optimizer"): 152 | # 1. 构建优化器 153 | parameters = self.optimizer_parameters_func(self.global_step) 154 | if self.optimizer_type == 'adam': 155 | opt = tf.train.AdamOptimizer(**parameters) 156 | elif self.optimizer_type == 'adadelta': 157 | opt = tf.train.AdadeltaOptimizer(**parameters) 158 | elif self.optimizer_type == 'adagrad': 159 | opt = tf.train.AdagradOptimizer(**parameters) 160 | elif self.optimizer_type == 'ftrl': 161 | opt = tf.train.FtrlOptimizer(**parameters) 162 | elif self.optimizer_type == 'momentum': 163 | opt = tf.train.MomentumOptimizer(**parameters) 164 | else: 165 | opt = tf.train.GradientDescentOptimizer(**parameters) 166 | 167 | # 2. 
构建训练对象 168 | train_op = None 169 | if return_train_operation: 170 | train_op = opt.minimize(loss=loss, global_step=self.global_step) 171 | return opt, train_op 172 | 173 | def metrics(self, loss=None): 174 | """ 175 | 构建模型的评估指标,并返回对象 176 | :param loss: 177 | :return: 178 | """ 179 | 180 | def accuracy(true_y, pre_y): 181 | with tf.name_scope("accuracy"): 182 | is_correct = tf.to_float(tf.equal(true_y, pre_y)) 183 | return tf.reduce_mean(is_correct) 184 | 185 | with tf.name_scope("metrics"): 186 | labels = self.targets 187 | predictions = self.predictions 188 | # 要求shape形状一致 189 | predictions.get_shape().assert_is_compatible_with(labels.get_shape()) 190 | # 要求数据类型一致,不一致进行转换 191 | if labels.dtype != predictions.dtype: 192 | predictions = tf.cast(predictions, labels.dtype) 193 | # 基于预测索引id和实际的索引id,构建这个准确率 194 | accuracy_ = accuracy(true_y=labels, pre_y=predictions) 195 | tf.summary.scalar('accuracy', accuracy_) 196 | 197 | metrics = Metrics(accuracy=accuracy_, recall=None, f1=None) 198 | return metrics 199 | 200 | def restore(self, checkpoint_dir, session): 201 | """ 202 | 进行模型参数恢复操作(直接恢复) 203 | :param checkpoint_dir: 204 | :param session: 205 | :return: 206 | """ 207 | # 0. 相关参数初始化 208 | if self.saver is None: 209 | self.saver = tf.train.Saver(**self.saver_parameters) 210 | 211 | # 1. 检查是否存在持久化的模型文件 212 | ckpt = tf.train.get_checkpoint_state(checkpoint_dir) 213 | # 2. 进行判断 214 | if ckpt and ckpt.model_checkpoint_path: 215 | tf.logging.info("开始进行模型恢复操作:{}".format(ckpt.model_checkpoint_path)) 216 | # 参数恢复 217 | self.saver.restore(sess=session, save_path=ckpt.model_checkpoint_path) 218 | # 恢复模型管理(保存磁盘中最多存在max_to_keep个模型) 219 | self.saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths) 220 | else: 221 | tf.logging.warn("从文件夹【{}】没有发现训练好的模型文件,不能进行模型恢复操作!!".format(checkpoint_dir)) 222 | 223 | def save(self, session, save_path): 224 | # 0. 相关参数初始化 225 | if self.saver is None: 226 | self.saver = tf.train.Saver(**self.saver_parameters) 227 | 228 | # 1. 
模型持久化 229 | tf.logging.info("进行模型持久化操作, 持久化路径为:{}".format(save_path)) 230 | self.saver.save(sess=session, save_path=save_path, global_step=self.global_step) 231 | tf.logging.info("模型持久化完成!!!") 232 | -------------------------------------------------------------------------------- /text_classsification/nets/text_adversarial_rnn_improve.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_units: RNN Cell中的神经元数目 32 | :param layers: RNN的层次 33 | """ 34 | self.num_units = num_units # RNN Cell的神经元数目 35 | self.layers = layers # RNN的层次 36 | self.embedding_inputs = None 37 | self.sequence_length = None 38 | 39 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 40 | embedding_dimensions=embedding_dimensions, 41 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 42 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 43 | optimizer_type=optimizer_type, 44 | optimizer_parameters_func=optimizer_parameters_func, 45 | saver_parameters=saver_parameters) 46 | 47 | def interface(self): 48 | """ 49 | 前向网络构建 50 | batch_size: N 51 | feature height: H, 将序列长度T认为是H 52 | feature width: W,将Embedding size大小认为是W 53 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 54 | sentence_length: T 55 | embedding size: E 56 | :return: 57 | """ 58 | with tf.variable_scope(self.network_name): 59 | with slim.arg_scope(self.arg_score()): 60 | with tf.variable_scope("placeholders"): 61 | self.global_step = tf.train.get_or_create_global_step() 62 | # 输入的单词id,形状为:[N,T] 63 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 64 | # 希望输出的类别id, 形状为:[N,] 65 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 66 | # Dropout 67 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 68 | # 计算序列实际长度, 最终形状为:[N,] 69 | self.sequence_length = tf.reduce_sum(tf.sign(tf.abs(self.inputs)), axis=-1) 70 | 71 | # 1. Embedding Layer 72 | self.embedding_inputs = self.embedding_lookup(self.inputs) 73 | 74 | # 2. 使用RNN来提取高阶特征 75 | with tf.variable_scope("rnn"): 76 | # a. 定义RNN的cell构建函数 77 | def cell(_units): 78 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 79 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 80 | 81 | # b. 
构建前向的cell和反向cell 82 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 83 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 84 | 85 | # c. 获取得到序列的输出向量 86 | # 数据都是按照原始的从左往右的序列得到的最终特征 87 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 88 | # 如果给定了序列的实际长度,那么在进行计算的时候,仅计算实际序列长度部分的内容,对于后面填充的内直接返回zero 89 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 90 | cell_fw, # 前向的RNN Cell 91 | cell_bw, # 反向的RNN Cell 92 | inputs=self.embedding_inputs, # 输入值, [N,T,E] 93 | dtype=tf.float32, # 给定RNN状态初始化值的类型 94 | sequence_length=self.sequence_length, # 给定序列的实际长度(因为序列是经过填充的) 95 | ) 96 | 97 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 98 | with tf.variable_scope("merge_feature"): 99 | # 4. 直接将所有时刻的输出特征值mean作为最终特征信息(由于填充位置输出是zero,所以求均值不会产生影响) 100 | # [N,T,E] --> [N,E] --> [N,E] 101 | div_denominator = tf.reshape(tf.to_float(self.sequence_length), shape=(-1, 1)) 102 | features_fw = tf.div(tf.reduce_sum(output_fw, axis=1), div_denominator) 103 | features_bw = tf.div(tf.reduce_sum(output_bw, axis=1), div_denominator) 104 | features = tf.concat([features_fw, features_bw], axis=-1) 105 | # TODO: 获取实际序列最后要给时刻的输出特征向量作为高阶向量(下周一做) 106 | 107 | # 4. FFN+Softmax做最终的决策输出 108 | with tf.variable_scope("project"): 109 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 110 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 111 | self.logits = tf.identity(score, 'logits') 112 | # 得到N个文本分别属于各个类别的概率值 113 | self.probability = tf.nn.softmax(self.logits, name='probability') 114 | # 得到最终的预测id 115 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 116 | 117 | # 配置一个参数表示仅恢复模型参数 118 | self.saver_parameters['var_list'] = tf.global_variables() 119 | 120 | def losses(self): 121 | with tf.name_scope("loss"): 122 | # 1. 调用父类获得正常的分类损失函数 123 | total_loss = super(Network, self).losses() 124 | # 2. 加入对抗学习部分的损失函数 125 | with tf.name_scope("perturLoss"): 126 | with tf.variable_scope(self.network_name, reuse=True): 127 | # a. 在Embedding上加入噪声信息 128 | pertur_embedding_inputs = self._add_perturbation(total_loss) 129 | # b. 正常网络结构的构建 130 | with tf.variable_scope("rnn"): 131 | # a. 定义RNN的cell构建函数 132 | def cell(_units): 133 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 134 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 135 | 136 | # b. 构建前向的cell和反向cell 137 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 138 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 139 | 140 | # c. 获取得到序列的输出向量 141 | # 数据都是按照原始的从左往右的序列得到的最终特征 142 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 143 | # 如果给定了序列的实际长度,那么在进行计算的时候,仅计算实际序列长度部分的内容,对于后面填充的内直接返回zero 144 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 145 | cell_fw, # 前向的RNN Cell 146 | cell_bw, # 反向的RNN Cell 147 | inputs=pertur_embedding_inputs, # 输入值, [N,T,E] 148 | dtype=tf.float32, # 给定RNN状态初始化值的类型 149 | sequence_length=self.sequence_length, # 给定序列的实际长度(因为序列是经过填充的) 150 | ) 151 | 152 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 153 | with tf.variable_scope("merge_feature"): 154 | # 4. 
直接将所有时刻的输出特征值mean作为最终特征信息(由于填充位置输出是zero,所以求均值不会产生影响) 155 | # [N,T,E] --> [N,E] --> [N,E] 156 | div_denominator = tf.reshape(tf.to_float(self.sequence_length), shape=(-1, 1)) 157 | features_fw = tf.div(tf.reduce_sum(output_fw, axis=1), div_denominator) 158 | features_bw = tf.div(tf.reduce_sum(output_bw, axis=1), div_denominator) 159 | features = tf.concat([features_fw, features_bw], axis=-1) 160 | 161 | # 4. FFN+Softmax做最终的决策输出 162 | with tf.variable_scope("project"): 163 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 164 | 165 | # 构建损失 166 | if self.num_class == 2: 167 | # 二分类,可以考虑使用sigmoid交叉熵损失函数 168 | # 将id哑编码: [None,] --> [None,num_class] 169 | labels = tf.one_hot(self.targets, depth=self.num_class) 170 | # 计算损失:([None,num_class], [None,num_class]) --> [None,] 171 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=score) 172 | # 所有样本损失合并求均值:[None,] --> [] 173 | perturLoss = tf.reduce_mean(loss) 174 | else: 175 | # 多分类,考虑使用softmax交叉熵损失函数 176 | # 基于id和logits置信度直接计算损失: ([None,], [None,num_class]) --> [None,] 177 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.targets, logits=score) 178 | # 所有样本损失合并求均值:[None,] --> [] 179 | perturLoss = tf.reduce_mean(loss) 180 | 181 | pass 182 | # 3. 合并损失 183 | total_loss = total_loss + perturLoss 184 | tf.summary.scalar('total_loss2', total_loss) 185 | tf.summary.scalar('pertur_loss', perturLoss) 186 | return total_loss 187 | 188 | def _add_perturbation(self, loss): 189 | """ 190 | 给词向量添加噪声信息 191 | :param loss: 192 | :return: 193 | """ 194 | with tf.name_scope("add_noise"): 195 | # 求解loss关于embedding input值求解对应梯度值 196 | grad, = tf.gradients(loss, self.embedding_inputs, 197 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 198 | # 停止反向传播([N,T,E]) 199 | grad = tf.stop_gradient(grad) 200 | # 计算噪声信息 201 | with tf.name_scope("noise"): 202 | # a. 求每个样本的梯度均值([N,T,E] --> [N,1,1]) 203 | alpha = tf.reduce_mean(tf.abs(grad), axis=[1, 2], keep_dims=True) + 1e-12 204 | # b. 求解L2 norm值 205 | l2_norm = alpha * tf.sqrt(tf.reduce_mean(tf.pow(grad / alpha, 2), [1, 2], keep_dims=True) + 1e-6) 206 | # c. 将grad除以l2_norm 207 | x_unit = grad / l2_norm 208 | # d. 扩展一下数据 209 | perturb = x_unit * 2.0 210 | return self.embedding_inputs + perturb 211 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /unsuper_classification.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from typing import List 4 | import re 5 | import os 6 | import jieba 7 | from langconv import Converter 8 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 9 | from gensim.models.tfidfmodel import TfidfModel 10 | from gensim import corpora, similarities 11 | from gensim.models.keyedvectors import Word2VecKeyedVectors 12 | from transformers import AlbertModel, AlbertTokenizer, BertTokenizer, AlbertForMaskedLM 13 | import torch 14 | 15 | ###################################################### 16 | # 参数, 使用tfidf+doc2vec+albert实现无监督文本分类 17 | #对内存影响较大的是文件读取, 可以改用迭代器(todo), 影响运行时间的doc2vec训练次数,albert使用的模型 18 | ###################################################### 19 | # 加载nlpcc的中文数据 20 | source_file = '/Users/admin/Downloads/nlpcc2017textsummarization/train_without_summ.txt' 21 | #把nlpcc的数据过滤后放入des_file文件夹中 22 | des_file = 'data/test/' 23 | #停止词文件 24 | stopword_file = 'stopwords.txt' 25 | #经过预处理后的文件缓存位置 26 | final_file = 'data/documents.txt' 27 | #doc2vec 模型保存文章 28 | docmodel = 'data/doc.model' 29 | #tfidf模型保存位置 30 | tfidfmodel = 'data/tfidf.model' 31 | #人工定义的文章分类的类别,标签 32 | finTags = ['明星', '诗歌', '故事', '美食', '企业', '个人', '证件', '新闻'] 33 | #停止词过滤 34 | stopwords_list = [line.rstrip() for line in open(stopword_file, encoding="utf-8")] 35 | 36 | def percent_chinese(sentence: str)-> bool: 37 | """ 38 | 过滤掉英文字符和数字占30%的文档 39 | :param sentence: 40 | :return: 41 | """ 42 | #文本总的长度 43 | tol = len(list(sentence.split())) 44 | pattern = '[a-z0-9]+' 45 | #英文和数字的长度 46 | english_count = len(re.findall(pattern, sentence)) 47 | return english_count/tol < 0.3 48 | 49 | def filter_data(): 50 | """ 51 | 过滤掉单词小于10000的文本,并且中文占比过低的文本, 保存到des_file文件夹 52 | :return: 53 | """ 54 | count = 0 55 | with open(source_file) as f: 56 | for line in f: 57 | line_dict = json.loads(line) 58 | article = line_dict['article'] 59 | if len(article) > 5000 and percent_chinese(article) : 60 | count += 1 61 | des = des_file + str(count) + '.txt' 62 | with open(des, 'w', encoding='utf-8') as wf: 63 | wf.write(article + "\n") 64 | print('生成文档的个数',count) 65 | 66 | def filter_chinese(sentence: str)-> str: 67 | """ 68 | 中文的一些预处理 69 | :param sentence: 输入的句子或文本 70 | :return: 71 | """ 72 | # 去除文本中的url 73 | # sentence = re.sub(r"http\S+", "", sentence) 74 | #剔除所有数字 75 | # decimal_regex = re.compile(r"[^a-zA-Z]\d+") 76 | # sentence = decimal_regex.sub(r"", sentence) 77 | #删除英文字符 78 | # eng_regex = re.compile(r'[a-zA-z]') 79 | # sentence = eng_regex.sub(r"", sentence) 80 | #只保留中文和标点符号 81 | words = [word for word in sentence if word >= u'\u4e00' and word <= u'\u9fa5' or word in [',','。','?','!']] 82 | sentence = ''.join(words) 83 | # 去除空格 84 | space_regex = re.compile(r"\s+") 85 | sentence = space_regex.sub(r"", sentence) 86 | # 繁体字转换成简体字 87 | sentence = Converter('zh-hans').convert(sentence) 88 | return sentence.strip().lower() 89 | 90 | def jieba_segment(sentence: str)-> str: 91 | """ 92 | jieba分词,并去掉停止词 93 | :param sentence: 94 | :return: 95 | """ 96 | sentence_list = jieba.cut(sentence) 97 | sentence_list = [w for w in sentence_list if w not in stopwords_list] 98 | sentence = ' '.join(sentence_list) 99 | return sentence 100 | 101 | def get_documents(cache=True, jieba=True)-> List: 102 | """ 103 | 返回所有文档预处理和jieba分词后的一个列表 104 | :param cache: 是否使用缓存的文件 105 | :param jieba: 是否进行分词 106 | :return: 107 | """ 108 | 
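    # Two code paths follow: if the cached corpus file final_file already exists and cache=True,
    # each cached line is read back as one document (optionally segmented with jieba); otherwise
    # every raw file under des_file is cleaned sentence by sentence with filter_chinese, joined
    # with '。', written once into final_file, and then collected (again optionally segmented).
    # Illustrative calls (matching how the script uses this function elsewhere):
    #   get_documents(cache=False, jieba=True)   # rebuild the cache and return segmented docs
    #   get_documents(cache=True, jieba=False)   # reuse the cache and keep raw text for display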
documents = [] 109 | #使用缓存文件 110 | if os.path.isfile(final_file) and cache: 111 | with open(final_file, 'r', encoding='utf-8') as file: 112 | for document in file: 113 | if jieba: 114 | document = jieba_segment(document) 115 | documents.append(document) 116 | else: 117 | #读取要处理的文件列表 118 | desfiles = os.listdir(des_file) 119 | #处理后存入到final_file单个文件 120 | with open(final_file, 'w', encoding='utf-8') as wf: 121 | for des in desfiles: 122 | document = '' 123 | with open(des_file+des, 'r', encoding='utf-8', errors='ignore') as file: 124 | for sentence in file: 125 | sentence = filter_chinese(sentence) 126 | if sentence: 127 | document = document + sentence + '。' 128 | if document: 129 | wf.write(document + "\n") 130 | if jieba: 131 | document = jieba_segment(document) 132 | documents.append(document) 133 | print("文档的个数:",len(documents)) 134 | return documents 135 | 136 | def cal_tfidf(documents, topk=10)-> List: 137 | """ 138 | tfidf模型训练 139 | :param documents: 要进行训练的文档 140 | :param topk: 提取tfidf score 的前多少个单词, 如果topk大于提取到的单词个数,返回所有单词 141 | :return: 142 | """ 143 | # 单个文档分成列表 144 | docs = [[word for word in document.split(' ')] for document in documents] 145 | # 生成字典 146 | dictionary = corpora.Dictionary(docs) 147 | # 生成bag of word 148 | docs_bow = [dictionary.doc2bow(doc) for doc in docs] 149 | if os.path.isfile(tfidfmodel): 150 | model = TfidfModel.load(tfidfmodel) 151 | else: 152 | model = TfidfModel(docs_bow) 153 | model.save(tfidfmodel) 154 | # 生成文本向量 155 | docs_vector = list(model[docs_bow]) 156 | # 对所有的文本向量进行排序,取钱topk 157 | docs_sort_vector = [sorted(doc, key=lambda x: x[1], reverse=True)[:topk] for doc in docs_vector] 158 | # 把对应的向量id转换成中文单词,docs_sort_chinese是中文单词和tfidf的score的列表 159 | docs_sort_chinese = [[(dictionary[vec[0]],vec[1]) for vec in doc] for doc in docs_sort_vector] 160 | return docs_sort_chinese 161 | 162 | def albert_model(seq_length=510, model_name='voidful/albert_chinese_xxlarge'): 163 | """ 164 | albert模型计算fintags和文档的相似度(使用余弦相似度) 165 | :param seq_length: 一个序列的最长长度 166 | :param model_name: 使用的albert的模型名称, 可选模型如下 167 | voidful/albert_chinese_tiny 168 | voidful/albert_chinese_small 169 | voidful/albert_chinese_base 170 | voidful/albert_chinese_large 171 | voidful/albert_chinese_xlarge 172 | voidful/albert_chinese_xxlarge 173 | :return: 返回所有文档和每个fintags的相似度列表 174 | """ 175 | tokenizer = BertTokenizer.from_pretrained(model_name) 176 | model = AlbertModel.from_pretrained(model_name) 177 | #不是用jieba分词 178 | docs = get_documents(cache=True, jieba=False) 179 | #用于保存所有tags的向量 180 | tags_cls = [] 181 | for tag in finTags: 182 | #对单个单词encode,生成单词对应的字典的id,是逐个字的id 183 | tag_token = tokenizer.encode(tag, add_special_tokens=True) 184 | # 转变成tensor向量,并扩充一个batch_size维度 185 | tagid = torch.tensor(tag_token).unsqueeze(0) 186 | #获取模型的输出结果 187 | outputs = model(tagid) 188 | #获取hidden_states的向量 189 | last_hidden_states = outputs[0] 190 | #获取单词的cls向量,代表整个单词的向量 191 | tag_cls = last_hidden_states[:, :1, :].squeeze(1) 192 | tags_cls.append(tag_cls) 193 | # 初始化余弦相速度 194 | cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) 195 | # 用于保存计算所有关键字和所有文档计算余弦相似度后的结果 196 | docs_similiarity = [] 197 | for doc in docs: 198 | # 对每个文档进行拆分分长度固定的句子 199 | doc_tup = [doc[i:i + seq_length] for i in range(0, len(doc), seq_length)] 200 | # 对每个文档进行换成token id,如果最后一个句子不够512,就padding到512位 201 | doc_token = tokenizer.batch_encode_plus(doc_tup, pad_to_max_length=True) 202 | # 获取生成的token id 203 | docid = torch.tensor(doc_token['input_ids']) 204 | # 放入模型 205 | outputs = model(docid) 206 | # 获取隐藏层的状态 207 | last_hidden_states = 
outputs[0] 208 | # 获取文档的cls向量,维度是[batch_size, Hidden_dimension], 这个batch_size就是上面的一个文档拆出来的每个句子,Hidden_dimension是模型的输出维度 209 | doc_cls = last_hidden_states[:,:1,:].squeeze(1) 210 | #用于保存每个fintags对这个句子的余弦相似度,就是这个类别关键字和这个句子的相似度 211 | tags_similiarity = [] 212 | for tag_cls in tags_cls: 213 | #计算余弦相似度,tag_cls的维度是[1,Hidden_dimension], doc_cls维度[batch_size, Hidden_dimension], tag_doc_simliarity的维度[batch_size] 214 | # tag_doc_simliarity 这个关键字和每个句子的余弦相似度 215 | tag_doc_simliarity = cos(tag_cls,doc_cls) 216 | # 对比这个关键字和所有句子,取最大相似度 217 | tags_similiarity.append(torch.mean(tag_doc_simliarity)) 218 | docs_similiarity.append(tags_similiarity) 219 | #用于测试,否则太慢 220 | if len(docs_similiarity) ==10: 221 | break 222 | return docs_similiarity 223 | 224 | def test_tfidf(): 225 | """ 226 | 测试tfidf的效果 227 | :return: 输出结果 228 | """ 229 | documents = get_documents() 230 | #取前20个tfidf分数最大的值 231 | res = cal_tfidf(documents, topk=100) 232 | #用于打印文档,有标点符号,比较好看 233 | documents = get_documents(cache=True, jieba=False) 234 | keywords = [] 235 | for idx, doc in enumerate(res): 236 | #取出关键tfidf文档计算得到的的关键字 237 | docword = [vec[0] for vec in doc] 238 | # 如果我们自定义的关键字在tfidf关键字列表中,就打印出来 239 | tags = [tag for tag in finTags if tag in docword] 240 | if not tags: 241 | #如果没有和给定的类别关键字重合,打印tfidf给出的前3个关键字 242 | print('没有找到和给定关键字匹配的,取tfidf的前3个关键字') 243 | tags = docword[:3] 244 | print(f"tfidf计算的最接近的keyword是: {tags}, 文档是: {documents[idx]}") 245 | keywords.append(tags[0]) 246 | print(keywords) 247 | return keywords 248 | 249 | def train_doc2vec(documents, training=False, epoch=300): 250 | """ 251 | 训练doc2vec 252 | :param documents:预处理后的文档 253 | :param training:是否继续训练 254 | :param epoch: 训练次数 255 | :return: 256 | """ 257 | # 单个文档分成列表 258 | docs = [[word for word in document.split(' ')] for document in documents] 259 | # 是否使用已缓存的模型 260 | if os.path.isfile(docmodel): 261 | model = Doc2Vec.load(docmodel) 262 | else: 263 | #使用TaggedDocument处理成文档和文档名称索引处理数据 264 | documents = [TaggedDocument(doc, tags = [i]) for i, doc in enumerate(docs)] 265 | model = Doc2Vec(documents, vector_size=100, window=6, min_count=1, workers=3, dm=1, negative=20, epochs=epoch) 266 | model.save(docmodel) 267 | #是否继续训练, 这里有bug,需要改进 268 | if training: 269 | documents = [TaggedDocument(doc, tags = [i]) for i, doc in enumerate(docs)] 270 | model.train(documents, total_examples=model.corpus_count, epochs=epoch) 271 | return model 272 | 273 | def test_doc2vec(): 274 | """ 275 | 测试doc2vec的效果 276 | :return: 输出结果 277 | """ 278 | documents = get_documents(cache=True, jieba=True) 279 | #加载模型, training继续训练模型 280 | model = train_doc2vec(documents, training=True, epoch=200) 281 | #用于打印 282 | documents = get_documents(cache=True, jieba=False) 283 | # 过滤出给的关键字fintags不在字典中的词语 ,所以这个词语没有词向量,无法计算相似度 284 | filter_tags = [tag for tag in finTags if tag in model.wv] 285 | if finTags != filter_tags: 286 | print('给定的fintags这写关键字不在doc2vec生成的字典中, 请更改关键字或者扩充训练文档, 使得训练文档包含这个关键字', set(finTags) - set(filter_tags)) 287 | tagsvec = model.wv[filter_tags] 288 | keywords = [] 289 | for idx, doc in enumerate(documents): 290 | docvec = model.docvecs[idx] 291 | #计算所有tag与这个文档的相似度 292 | tagssim = Word2VecKeyedVectors.cosine_similarities(docvec, tagsvec) 293 | maxsim = max(tagssim) 294 | keyword = finTags[list(tagssim).index(maxsim)] 295 | print(f"doc2vec计算的最接近的keyword是: {keyword}, 相似度是: {maxsim}, 文档是: {doc}") 296 | keywords.append(keyword) 297 | print(keywords) 298 | return keywords 299 | 300 | def test_albert(): 301 | """ 302 | 测试albert模型的效果 303 | :return: 304 | """ 305 | docs_similiarity = 
albert_model(model_name='voidful/albert_chinese_tiny') 306 | # docs_similiarity = albert_model(model_name='voidful/albert_chinese_base') 307 | #获取所有文档列表 308 | docs = get_documents(cache=True, jieba=False) 309 | keywords = [] 310 | for idx, doc_similiarity in enumerate(docs_similiarity): 311 | #找出最高的相似度 312 | maxsim = max(doc_similiarity) 313 | #找出最高相似度所对应的单词 314 | keyword = finTags[doc_similiarity.index(maxsim)] 315 | print(f'albert计算后的结果最相似的标签是{keyword}, 相似度是:{maxsim}, 文档是: {docs[idx]}') 316 | keywords.append(keyword) 317 | print(keywords) 318 | return keywords 319 | 320 | if __name__ == '__main__': 321 | # filter_data() 322 | # docs = get_documents(cache=False, jieba=True) 323 | # twords = test_tfidf() 324 | dwords = test_doc2vec() 325 | # awords = test_albert() 326 | # print(twords,dwords,awords) 327 | -------------------------------------------------------------------------------- /translate/do_translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2021/4/28 5:44 下午 4 | # @File : translate_ppt.py 5 | # @Author: johnson 6 | # @Desc : m2m100的翻译的API接口 7 | 8 | from flask import Flask, request, jsonify, abort 9 | import os 10 | from pptx import Presentation 11 | from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer 12 | import logging 13 | import re 14 | import torch 15 | import gc 16 | import atexit,time 17 | from apscheduler.schedulers.background import BackgroundScheduler 18 | import pandas as pd 19 | import pymysql 20 | 21 | def unload_model_schedule(): 22 | global nobusy_count 23 | global model 24 | if model.model: 25 | nobusy_count += 1 26 | app.logger.info(f"Model 是存在的,不使用GPU的时间是 {nobusy_count*10}秒") 27 | if nobusy_count >= 30: 28 | #开始清理GPU, 30次,一共300秒,即5分钟不使用GPU,就卸载 29 | app.logger.info(f"开始清理模型") 30 | model.unload_model() 31 | nobusy_count = 0 32 | 33 | scheduler = BackgroundScheduler() 34 | scheduler.add_job(func=unload_model_schedule, trigger="interval", seconds=10) 35 | scheduler.start() 36 | 37 | atexit.register(lambda: scheduler.shutdown()) 38 | 39 | app = Flask(__name__) 40 | app.config['DEBUG'] = False 41 | if os.path.exists('/data/var/log/'): 42 | app.config['API_LOG_FILE'] = '/data/var/log/translate_api.log' 43 | else: 44 | app.config['API_LOG_FILE'] = 'translate_api.log' 45 | # 日志配置信息, Running on之类的显示在日志里面 46 | if app.config['DEBUG']: 47 | logging.basicConfig(filename=app.config['API_LOG_FILE'], level=logging.DEBUG) 48 | else: 49 | logging.basicConfig(filename=app.config['API_LOG_FILE'], level=logging.INFO) 50 | 51 | class TranslateModel(object): 52 | def __init__(self, verbose=False): 53 | self.model_name = "./translate_model" 54 | self.excel_file = '翻译对照表总表.xlsx' 55 | # 判断使用的设备 56 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 57 | self.model = None 58 | self.tokenizer = None 59 | # 提取中英文的替换的关键字列表 60 | self.corres_keys = None 61 | self.cn_keys = None 62 | self.en_keys = None 63 | self.connet_mysql() 64 | # beam size 为3 65 | self.num_beams = 3 66 | # 返回句子为3 67 | self.num_return_sequences = 3 68 | # 防止重复生成单词的ngram 69 | self.no_repeat_ngram_size = 2 70 | def load_keywords_from_mysql(self): 71 | """ 72 | 不同于extract_dict从excel中加载,这里从mysql数据库加载 73 | :return: 74 | :rtype: 75 | """ 76 | self.connet_mysql() 77 | sql = 'select * from pro_translate_dict' 78 | df = pd.read_sql(sql,self.db) 79 | self.cn_keys = df['cnword'].tolist() 80 | self.en_keys = df['enword'].tolist() 81 | self.corres_keys = df['modelword'].tolist() 82 | 
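    # Note on the shared dictionary data: cn_keys, en_keys and corres_keys are three parallel
    # lists holding, per entry, the Chinese keyword, the preferred English term, and the model's
    # own translation of the Chinese keyword. start_replace() walks the three lists and, whenever
    # cn_keys[i] occurs in the Chinese source, substitutes corres_keys[i] with en_keys[i] in the
    # translated text (case-insensitive re.sub). The lists can be filled either from the Excel
    # sheet (extract_dict) or from the pro_translate_dict MySQL table, whose columns cnword,
    # enword and modelword are read in load_keywords_from_mysql and written in submit_mysql.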
83 | def connet_mysql(self): 84 | db = pymysql.connect(host="192.168.xx.xx", 85 | user="xx", 86 | password="xxx", 87 | port=3306, 88 | db='xxx', 89 | charset='utf8mb4', 90 | ) 91 | self.db = db 92 | def submit_mysql(self,cn_keys,en_keys,corres_keys): 93 | # 提交数据data到数据库表pro_translate_dict 94 | total = 0 95 | loss = 0 96 | for cnword, enword, modelword in zip(cn_keys,en_keys,corres_keys): 97 | cursor = self.db.cursor() 98 | sql = "INSERT INTO `pro_translate_dict` (`cnword`, `enword`, `modelword`) VALUES ('%s','%s', '%s')" % ( 99 | cnword, enword, modelword) 100 | try: 101 | cursor.execute(sql) 102 | self.db.commit() 103 | total += 1 104 | app.logger.info(f"提交{cnword},{enword},{modelword}数据到mysql成功") 105 | except Exception as e: 106 | self.db.rollback() 107 | app.logger.info(f"提交{cnword},{enword},{modelword}数据到mysql失败") 108 | print(e) 109 | loss += 1 110 | self.db.commit() 111 | app.logger.info(f"提交{total}条数据到mysql成功, {loss}条失败") 112 | 113 | def load_model(self): 114 | """ 115 | 加载模型 116 | :return: 117 | """ 118 | app.logger.info(f"开始加载模型") 119 | model = M2M100ForConditionalGeneration.from_pretrained(self.model_name) 120 | model.to(self.device) 121 | self.tokenizer = M2M100Tokenizer.from_pretrained(self.model_name) 122 | self.model = model 123 | 124 | def extract_dict(self, excel_file, write2mysql=False): 125 | """ 126 | 提取excel文件中的中英文关键字 127 | :param excel_file: excel文件 128 | :param write2mysql: 是否把结果写到mysql 129 | :return: 130 | :rtype: 131 | """ 132 | df = pd.read_excel(excel_file) 133 | cn_keywords = df['中文'].tolist() 134 | en_keywords = df['英文'].tolist() 135 | app.logger.info(f"处理前中文和英文单词个数分别是 {len(cn_keywords)}, {len(en_keywords)}") 136 | cn_keys = [] 137 | en_keys = [] 138 | # 处理一下/分隔的词,这样的词是多个的 139 | for cn, en in zip(cn_keywords, en_keywords): 140 | if '/' in cn: 141 | cn_words = cn.split('/') 142 | else: 143 | cn_words = [cn] 144 | if '/' in en: 145 | en_words = en.split('/') 146 | else: 147 | en_words = [en] 148 | # 中英文单词都加入单词表 149 | for c in cn_words: 150 | for e in en_words: 151 | cn_keys.append(c) 152 | en_keys.append(e) 153 | app.logger.info(f"处理后中文和英文单词个数分别是 {len(cn_keys)}, {len(en_keys)}") 154 | app.logger.info(f"中文关键字cn_keys是{cn_keys}") 155 | app.logger.info(f"英文关键字en_keys是{en_keys}") 156 | self.cn_keys = cn_keys 157 | self.en_keys = en_keys 158 | # 每个中文关键字对应的翻译结果,从模型中获取 159 | app.logger.info(f"首先处理对应的关键字的中文到英文的翻译结果") 160 | corres_keys = [] 161 | for cnkey in cn_keys: 162 | #每个中文对应的英文翻译结果 163 | corres_key = self.translate2en(text=cnkey,do_replace=False) 164 | corres_keys.append(corres_key) 165 | app.logger.info(f"corres_keys是 {corres_keys}") 166 | self.corres_keys = corres_keys 167 | if write2mysql: 168 | #开始同步到mysql数据库 169 | self.submit_mysql(cn_keys,en_keys,corres_keys) 170 | 171 | def start_replace(self, cntext, entext): 172 | """ 173 | 使用cn_keys和en_keys进行正则替换 174 | :param cntext: 175 | :param entext: 176 | :return: 177 | """ 178 | result = entext 179 | for cnkey, enkey, corres_key in zip(self.cn_keys, self.en_keys, self.corres_keys): 180 | if cnkey in cntext: 181 | result = re.sub(corres_key,enkey,entext,flags=re.I) 182 | if result != entext: 183 | app.logger.info(f"进行了替换: {entext},被从{corres_key}替换成{enkey}") 184 | return result 185 | def translate2en(self, text, do_replace=True): 186 | """ 187 | 翻译中文text到英文 188 | :param do_replace: 默认使用excel中的关键字进行替换 189 | """ 190 | global nobusy_count 191 | nobusy_count = 0 192 | if self.model is None: 193 | #如果predict_model没有加载,自动加载默认的模型 194 | self.load_model() 195 | res = re.findall('[\u4e00-\u9fa5]+', text) 196 | if not res: 197 | 
app.logger.info(f"原文是: {text}, 不包含中文字符,不需要翻译") 198 | return text 199 | self.tokenizer.src_lang = "zh" 200 | encoded_zh = self.tokenizer(text, return_tensors="pt") 201 | encoded_zh.data['attention_mask'] = encoded_zh.data['attention_mask'].to(self.device) 202 | encoded_zh.data['input_ids'] = encoded_zh.data['input_ids'].to(self.device) 203 | generated_tokens = self.model.generate(**encoded_zh, num_beams=self.num_beams, num_return_sequences=self.num_return_sequences,no_repeat_ngram_size=self.no_repeat_ngram_size, forced_bos_token_id=self.tokenizer.get_lang_id("en")) 204 | entext = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) 205 | app.logger.info(f"原文是: {text}\n模型翻译的结果是: {entext}") 206 | result_text = entext[0] 207 | for ent in entext: 208 | #如果没发现中文,就用这个翻译结果,如果发现中文,就换一个翻译结果,如果仍然有中文,那也没办法了 209 | if not re.findall('[\u4e00-\u9fa5]+', ent): 210 | result_text = ent 211 | break 212 | if do_replace: 213 | result_text = self.start_replace(cntext=text, entext=result_text) 214 | app.logger.info(f"原文是: {text}\n最终翻译的结果是: {entext}") 215 | return result_text 216 | def unload_model(self): 217 | """ 218 | 卸载模型,不把模型加载到GPU中 219 | :return: 220 | :rtype: 221 | """ 222 | app.logger.info(f"开始卸载模型") 223 | self.model = None 224 | self.tokenizer = None 225 | gc.collect() 226 | torch.cuda.empty_cache() 227 | app.logger.info(f"模型卸载完成") 228 | 229 | def slade_change(SRC_PPT, TARGET_PPT, DICT_FILE=None): 230 | """ 231 | 中文到英文PPT的翻译,SRC_PPT中文PPT,TARGET_PPT英文PPT 232 | :param DICT_FILE: excel格式的,包含中文和英文2列,用于正则替换 233 | """ 234 | if DICT_FILE: 235 | # 如果传了excel,那么用用户传的excel做替换,否则用默认的 236 | model.extract_dict(excel_file=DICT_FILE) 237 | else: 238 | model.load_keywords_from_mysql() 239 | prs = Presentation(SRC_PPT) 240 | for slide_idx, slide in enumerate(prs.slides): 241 | for shape_idx, shape in enumerate(slide.shapes): 242 | # 翻译表格 243 | if shape.has_table: 244 | for row_idx, row in enumerate(shape.table.rows): 245 | for cell_idx, cell in enumerate(row.cells): 246 | cell_text = cell.text_frame.text 247 | cell_entext = model.translate2en(cell_text) 248 | prs.slides[slide_idx].shapes[shape_idx].table.rows[row_idx].cells[cell_idx].text_frame.text = cell_entext 249 | # 图表翻译 250 | if shape.has_chart: 251 | # 图表的标题翻译 252 | title_txt = shape.chart.chart_title.text_frame.text 253 | title_entext = model.translate2en(title_txt) 254 | shape.chart.chart_title.text_frame.text = title_entext 255 | #翻译其它文本 256 | if not shape.has_text_frame: 257 | continue 258 | for paragraph_idx,paragraph in enumerate(shape.text_frame.paragraphs): 259 | paragraph_text = "" 260 | for run_idx, run in enumerate(paragraph.runs): 261 | paragraph_text = paragraph_text + run.text 262 | # 只要第一个run的txt,其它的都设为空 263 | if run_idx != 0: 264 | prs.slides[slide_idx].shapes[shape_idx].text_frame.paragraphs[paragraph_idx].runs[run_idx].text = '' 265 | if paragraph_text: 266 | entext = model.translate2en(paragraph_text) 267 | prs.slides[slide_idx].shapes[shape_idx].text_frame.paragraphs[paragraph_idx].runs[0].text = entext 268 | prs.save(TARGET_PPT) 269 | app.logger.info(f"读取{SRC_PPT},修改完成了{len(prs.slides)}页的PPT{TARGET_PPT}") 270 | return 200, "成功" 271 | 272 | @app.route("/fileTranslate", methods=['POST']) 273 | def translate(): 274 | """ 275 | 翻译api 276 | :return: 277 | """ 278 | form_dict = request.form.to_dict() 279 | json_dict = request.get_json() 280 | app.logger.info(f"用户请求的form内容是{form_dict}") 281 | app.logger.info(f"用户请求的json内容是{json_dict}") 282 | # dict_file是翻译后检查和替换的字典 283 | if json_dict: 284 | source_ppt = 
json_dict.get('inputFilePath', None) 285 | des_ppt = json_dict.get('resultFilePath', None) 286 | dict_file = json_dict.get('dictFilePath', None) 287 | else: 288 | source_ppt = form_dict.get('inputFilePath', None) 289 | des_ppt = form_dict.get('resultFilePath', None) 290 | dict_file = form_dict.get('dictFilePath', None) 291 | if source_ppt.split('.')[-1].lower() not in ['pptx']: 292 | app.logger.warning('ppt格式不符合要求') 293 | return jsonify({'ret': -101, 'msg': 'ppt格式不符合要求'}) 294 | if not os.path.exists(source_ppt): 295 | app.logger.warning('ppt文件不存在,请检查服务器上是否存在这个ppt') 296 | return jsonify({'ret': -102, 'msg': 'ppt文件不存在,请检查服务器上是否存在这个ppt'}) 297 | if dict_file and not os.path.exists(dict_file): 298 | app.logger.warning('给了dictFilePath参数,但文件不在服务器上') 299 | return jsonify({'ret': -103, 'msg': '给了dictFilePath参数,但文件不在服务器上'}) 300 | # 翻译PPT并保存到目标路径 301 | code, msg = slade_change(SRC_PPT=source_ppt, TARGET_PPT=des_ppt, DICT_FILE=dict_file) 302 | # 判断PPT翻译是否成功 303 | if code == 200: 304 | return jsonify({'ret': 0, 'msg': '成功'}) 305 | else: 306 | return jsonify({'ret': -104, 'msg': msg}) 307 | 308 | 309 | @app.route("/syncMysql", methods=['POST']) 310 | def syncmysql(): 311 | """ 312 | 把dictfile的内容写到mysql 313 | :return: 314 | """ 315 | form_dict = request.form.to_dict() 316 | json_dict = request.get_json() 317 | app.logger.info(f"准备同步excel内容到mysql数据库") 318 | app.logger.info(f"用户请求的form内容是{form_dict}") 319 | app.logger.info(f"用户请求的json内容是{json_dict}") 320 | # dict_file是翻译后检查和替换的字典 321 | if json_dict: 322 | source_ppt = json_dict.get('inputFilePath', None) 323 | des_ppt = json_dict.get('resultFilePath', None) 324 | dict_file = json_dict.get('dictFilePath', None) 325 | else: 326 | source_ppt = form_dict.get('inputFilePath', None) 327 | des_ppt = form_dict.get('resultFilePath', None) 328 | dict_file = form_dict.get('dictFilePath', None) 329 | if dict_file and not os.path.exists(dict_file): 330 | app.logger.warning('给了dictFilePath参数,但文件不在服务器上') 331 | return jsonify({'ret': -103, 'msg': '给了dictFilePath参数,但文件不在服务器上'}) 332 | # 提取excel中的中英文对照并写入mysql 333 | model.extract_dict(excel_file=dict_file,write2mysql=True) 334 | return jsonify({'ret': 0, 'msg': '成功'}) 335 | 336 | if __name__ == "__main__": 337 | # 预训练模型 338 | model = TranslateModel() 339 | nobusy_count = 0 340 | # slade_change(SRC_PPT="图表翻译.pptx",TARGET_PPT="表格翻译-英文.pptx") 341 | app.run(host='0.0.0.0', port=3325, debug=False, threaded=True) 342 | --------------------------------------------------------------------------------
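For reference, a minimal client-side sketch of calling the /fileTranslate endpoint above. The field names (inputFilePath, resultFilePath, dictFilePath), the port 3325 and the ret/msg response format come from the Flask handlers in do_translate.py; the host and the concrete file paths are placeholders, and every path must already exist on the machine that runs the API.

```python
import requests

# Hypothetical server-side paths; replace with real paths on the API host.
payload = {
    "inputFilePath": "/data/ppt/源文件.pptx",        # Chinese PPT to translate (must exist on the server)
    "resultFilePath": "/data/ppt/result_en.pptx",    # where the translated PPT will be written
    "dictFilePath": "/data/ppt/翻译对照表总表.xlsx",  # optional Excel dictionary with 中文/英文 columns
}

resp = requests.post("http://127.0.0.1:3325/fileTranslate", json=payload)
print(resp.json())  # {'ret': 0, 'msg': '成功'} on success; negative ret codes signal errors
```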