├── Res2Net ├── __init__.py ├── config.py ├── main.py ├── data_utils.py └── res2next.py ├── 异步测试 ├── bench_singlethread.html ├── bench_singlethread.txt ├── test.png ├── images │ ├── test.png │ ├── aiohttp_rate2000.png │ ├── singlthread_rate2000.png │ ├── multiprocess_4core_rate2000.png │ └── multithread_10thread_rate2000.png ├── requirements.txt ├── test_aiohttp_api_server.sh ├── gunicorn_flask_api_server.sh ├── bench_aiohttp.txt ├── bench_multiprocess.txt ├── test_flask_api_server.py ├── README.md ├── test_flask_api_server_v2.py ├── aiohttp_api_server.py ├── benchmark.sh ├── quart_api_server.py ├── flask_api_server.py └── flask_api_server_v2.py ├── text_classsification ├── nets │ ├── __init__.py │ ├── metric.py │ ├── text_rnn.py │ ├── text_cnn.py │ ├── text_rnn_improve.py │ ├── text_rnn_improve2.py │ ├── text_transformer.py │ ├── text_cnn_rnn.py │ ├── text_rnn_transformer.py │ ├── text_cnn_transformer.py │ ├── base_model.py │ └── text_adversarial_rnn_improve.py ├── utils │ ├── __init__.py │ ├── network_utils.py │ ├── data_helpers.py │ └── vocabulary_utils.py ├── Readme.md └── eval.py ├── unsuper.png ├── translate ├── 翻译对照表总表.xlsx ├── data │ ├── custom_zh_en.py │ └── sacrebleu.py └── do_translate.py ├── TextCNN ├── Readme.md ├── config.py ├── main.py ├── data_utils.py └── model_utils.py ├── README.md ├── .gitignore ├── langconv.py ├── LICENSE └── unsuper_classification.py /Res2Net/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /异步测试/bench_singlethread.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /异步测试/bench_singlethread.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text_classsification/nets/__init__.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | -------------------------------------------------------------------------------- /text_classsification/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | -------------------------------------------------------------------------------- /unsuper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/unsuper.png -------------------------------------------------------------------------------- /异步测试/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/test.png -------------------------------------------------------------------------------- /异步测试/images/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/test.png -------------------------------------------------------------------------------- /translate/翻译对照表总表.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/translate/翻译对照表总表.xlsx -------------------------------------------------------------------------------- /异步测试/images/aiohttp_rate2000.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/aiohttp_rate2000.png -------------------------------------------------------------------------------- /异步测试/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.6.2 2 | Flask==1.1.2 3 | requests==2.24.0 4 | urllib3==1.25.9 5 | Werkzeug==1.0.1 6 | -------------------------------------------------------------------------------- /异步测试/images/singlthread_rate2000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/singlthread_rate2000.png -------------------------------------------------------------------------------- /异步测试/images/multiprocess_4core_rate2000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/multiprocess_4core_rate2000.png -------------------------------------------------------------------------------- /异步测试/images/multithread_10thread_rate2000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnson7788/NLP/HEAD/异步测试/images/multithread_10thread_rate2000.png -------------------------------------------------------------------------------- /text_classsification/nets/metric.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | from collections import namedtuple 4 | 5 | 6 | class Metrics(namedtuple('Metrics', 7 | ['accuracy', 'recall', 'f1'])): 8 | pass 9 | -------------------------------------------------------------------------------- /异步测试/test_aiohttp_api_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #测试上传文件接口 3 | echo "Test upload image" 4 | curl -X POST -H "token:ff1c1eef10cad322ddbcd842a952b46c" -H "timestamp:1592805495.355849" -H "sign:c8499156ae8acc3f2d6a1453b9315eb1" -F "image=@test.png" http://127.0.0.1:5000/upload 5 | 6 | -------------------------------------------------------------------------------- /TextCNN/Readme.md: -------------------------------------------------------------------------------- 1 | ### 主要目录结构 2 | ``` 3 | ├── Readme.md 4 | ├── config.py #模型配置文件 5 | ├── data #数据集,包括训练数据和验证数据, 每个子文件是一个类别,里面放对于文本 6 | ├── main.py #模型运行入口,里面主要包含3个函数,分别是训练,测试,和实际运行的预测接口predict 7 | ├── model #保存TextCNN模型和生成的字典 8 | ├── model_utils.py #TextCNN模型文件 9 | └── data_utils.py #文本预处理模块 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /text_classsification/Readme.md: -------------------------------------------------------------------------------- 1 | 1. data:文件夹中存储的主要是数据 2 | 2. nets: package,主要存储网络结构模型代码 3 | text_cnn: TextCNN做文本分类 4 | 3. utils: package,主要存储工具函数相关的代码 5 | data_helpers.py: 数据加载、批次构建相关函数代码 6 | network_utils.py: 优化器参数构建相关代码 7 | vocabulary_utils.py: 词汇转换相关代码 8 | 4. train.py: 模型训练入口函数 9 | 5. eval.py: 模型效果评估的入口函数 10 | 6. graph:模型执行可视化文件保存的文件夹 11 | 7. model:模型持久化保存的文件夹 12 | 8. 
deploy:模型部署相关package -------------------------------------------------------------------------------- /异步测试/gunicorn_flask_api_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PORT=5000 3 | SCRIPT=flask_api_server 4 | #进程数量 5 | WORKER=${WORKER:-2} 6 | #默认多线程个数10 7 | THREAD=${THREAD:-10} 8 | 9 | #是否使用多线程模式, 默认多线程 10 | USERTHREAD=${USERTHREAD:-0} 11 | 12 | if [ $USERTHREAD -eq 0 ]; then 13 | echo "启用多线程,线程个数 $THREAD" 14 | gunicorn -b localhost:$PORT -w $WORKER --threads $THREAD $SCRIPT:app 15 | else 16 | echo "启用多进程, 进程个数 $WORKER" 17 | gunicorn -b localhost:$PORT -w $WORKER $SCRIPT:app 18 | fi -------------------------------------------------------------------------------- /异步测试/bench_aiohttp.txt: -------------------------------------------------------------------------------- 1 | Requests [total, rate, throughput] 6000, 2000.47, 2000.25 2 | Duration [total, attack, wait] 3s, 2.999s, 320.618µs 3 | Latencies [min, mean, 50, 90, 95, 99, max] 290.812µs, 414.588µs, 344.785µs, 481.47µs, 591.848µs, 2.359ms, 5.04ms 4 | Bytes In [total, mean] 24000, 4.00 5 | Bytes Out [total, mean] 36000, 6.00 6 | Success [ratio] 100.00% 7 | Status Codes [code:count] 200:6000 8 | Error Set: 9 | -------------------------------------------------------------------------------- /异步测试/bench_multiprocess.txt: -------------------------------------------------------------------------------- 1 | Requests [total, rate, throughput] 6000, 2000.50, 1999.61 2 | Duration [total, attack, wait] 3.001s, 2.999s, 1.336ms 3 | Latencies [min, mean, 50, 90, 95, 99, max] 668.591µs, 1.377ms, 955.315µs, 2.313ms, 3.528ms, 5.872ms, 9.594ms 4 | Bytes In [total, mean] 96000, 16.00 5 | Bytes Out [total, mean] 36000, 6.00 6 | Success [ratio] 100.00% 7 | Status Codes [code:count] 200:6000 8 | Error Set: 9 | -------------------------------------------------------------------------------- /TextCNN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class Config(object): 4 | """Base configuration class.""" 5 | #训练文件夹位置 6 | train_dir = "data/train" 7 | #评估文件夹位置 8 | eval_dir = "data/eval" 9 | #模型的保存位置 10 | save_path='model/' 11 | #是否使用gpu 12 | cuda = True 13 | #训练的epoch 14 | epochs = 2 15 | batch_size = 64 16 | #学习率 17 | learning_rate = 0.001 18 | #学习率动量 19 | learning_momentum = 0.9 20 | #学习率衰减稀疏 21 | weight_decay = 0.0001 22 | dropout = 0.5 23 | #生成的词嵌入的维度 24 | embed_dim = 128 25 | #卷积核的数量 26 | kernel_num = 100 27 | #卷积核的尺寸 28 | kernel_sizes = "3,4,5" 29 | #训练多少个epoch时,模型保存 30 | save_interval = 2 31 | 32 | #初始化,是否使用gpu 33 | def __init__(self): 34 | if self.cuda: 35 | self.cuda = torch.cuda.is_available() 36 | self.device = torch.device("cuda:0" if self.cuda else "cpu") 37 | 38 | def dump(self): 39 | """打印配置信息""" 40 | print("模型配置如下:") 41 | for a in dir(self): 42 | if not a.startswith("__") and not callable(getattr(self, a)): 43 | print("\t{:30} = {}".format(a, getattr(self, a))) 44 | print() 45 | -------------------------------------------------------------------------------- /异步测试/test_flask_api_server.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | import unittest 4 | import requests 5 | 6 | url = 'http://127.0.0.1:5000' 7 | #确保Flask server已经启动 8 | 9 | def get_header(): 10 | headers = { 11 | 'token': 'ff1c1eef10cad322ddbcd842a952b46c', 12 | 'timestamp': '1592805495.355849', 13 | 'sign': 'c8499156ae8acc3f2d6a1453b9315eb1' 14 | } 15 | return headers 
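# Hedged alternative, not used by the tests below: instead of relying on the credential
# triple that is pre-seeded in the server's Tokens list, the headers could be fetched from
# the /token endpoint of flask_api_server.py, which returns a freshly generated
# token/timestamp/sign set (test_flask_api_server_v2.py takes the same approach with /gentoken).
def get_header_from_server():
    """Build auth headers from a token issued by the running flask_api_server."""
    res = requests.get(url + '/token').json()
    return {
        'token': res['token'],
        'timestamp': res['timestamp'],
        'sign': res['sign']
    }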
16 | 17 | class NamesTestCase(unittest.TestCase): 18 | def test_get_echo(self): 19 | """测试服务器正常启动""" 20 | r = requests.get(url +'/echo') 21 | self.assertEqual(r.status_code, 200) 22 | 23 | def test_get_token(self): 24 | """测试获取token""" 25 | r = requests.get(url+'/token') 26 | self.assertEqual(r.status_code, 200) 27 | self.assertIn('sign', r.json()) 28 | self.assertIn('timestamp', r.json()) 29 | self.assertIn('token', r.json()) 30 | 31 | def test_try_process(self): 32 | """测试多进程""" 33 | headers = get_header() 34 | r = requests.get(url + '/process', headers=headers) 35 | self.assertEqual(r.status_code, 200) 36 | self.assertTrue(r.json()['result']) 37 | 38 | def test_try_upload(self): 39 | """测试用获取的token上传图片""" 40 | with open("test.png", 'rb') as img: 41 | files = { 42 | 'image': img 43 | } 44 | headers = get_header() 45 | r = requests.post(url + '/upload', headers=headers, files=files) 46 | self.assertEqual(r.status_code, 200) 47 | 48 | if __name__ == '__main__': 49 | ##确保Flask server已经启动 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /Res2Net/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class Config(object): 4 | """Base configuration class.""" 5 | #指定包含训练集,验证集合测试集的文件夹 6 | data_directory = "data/zhengjian/" 7 | #模型的保存位置 8 | save_path='model/' 9 | #模型保存名称 10 | save_name='checkpoint.pth' 11 | #使用哪个模型类型,可选 ['densenet161', 'resnet18', 'vgg16', 'res2next50'] 12 | arch = 'res2next50' 13 | # classifier的 隐藏层数, 可以任意个[1024,512,256],每个是一个FC 14 | hidden_units = [256] 15 | # 评估间隔, 训练多少个epoch,进行一次评估 16 | eval_interval = 100 17 | # 是否绘图还是直接返回结果 18 | plot = False 19 | # 绘图显示的预测个数, 需要是偶数个 20 | plot_image = 6 21 | #是否使用gpu 22 | cuda = False 23 | #device name ,如果使用cpu,那么就是cpu,如果使用gpu, 可能是第几块显卡cuda:0 24 | device_name = 'cpu' 25 | #训练的epoch 26 | epochs = 2 27 | batch_size = 64 28 | #学习率 29 | learning_rate = 0.001 30 | #学习率动量 31 | learning_momentum = 0.9 32 | #学习率衰减稀疏 33 | weight_decay = 0.0001 34 | dropout = 0.5 35 | #生成的词嵌入的维度 36 | embed_dim = 128 37 | #卷积核的数量 38 | kernel_num = 100 39 | #卷积核的尺寸 40 | kernel_sizes = "3,4,5" 41 | #训练多少个epoch时,模型保存 42 | save_interval = 2 43 | 44 | #初始化,是否使用gpu 45 | def __init__(self): 46 | if self.cuda: 47 | self.cuda = torch.cuda.is_available() 48 | self.device = torch.device("cuda:0" if self.cuda else "cpu") 49 | 50 | def dump(self): 51 | """打印配置信息""" 52 | print("模型配置如下:") 53 | for a in dir(self): 54 | if not a.startswith("__") and not callable(getattr(self, a)): 55 | print("\t{:30} = {}".format(a, getattr(self, a))) 56 | print() 57 | -------------------------------------------------------------------------------- /异步测试/README.md: -------------------------------------------------------------------------------- 1 | # API 2 | 3 | #### Flask 版本介绍 4 | - flask_api_server.py 是Flask的版本 5 | - 可以使用gunicorn_flask_api_server.sh启动 6 | - 使用test_flask_api_server.py进行单元测试 7 | 8 | ### 异步版本aiohttp 9 | - aiohttp_api_server.py 是基于aiohttp的异步版本 10 | - 使用test_aiohttp_api_server.sh测试 11 | 12 | 13 | ### 压力测试Benchmark 14 | * 使用的工具是: Vegeta, Star是14.8, https://github.com/tsenart/vegeta 15 | * 当前环境:Mac OS i5, 1.4GHZ, 16GB, SSD 16 | 17 | * 测试脚本, 自动测试多进程,多线程,和异步的serverd的对比结果,结果保存为2种形式,分别为html图片格式,和txt文本格式 18 | `benchmark.sh` 19 | 20 | 21 | #### 对比结果 22 | 以下是每秒请求2000次,测试3秒的结果对比, 根据每个人的电脑或服务器的性能不同,结果也不同,这是一个相对的结果 23 | - 单线程每秒超过1000次请求时,已经无力,超2000时已经超时 24 | ![singlethread](images/singlthread_rate2000.png) 25 | - 多核,4core 对比 26 | 
![multiprocess](images/multiprocess_4core_rate2000.png) 27 | - 多线程,单核10thread 对比 28 | ![multithread](images/multithread_10thread_rate2000.png) 29 | - aiohttp异步对比 30 | ![aiohttp](images/aiohttp_rate2000.png) 31 | 32 | - 异步的延迟 rate 2000/s, 成功率100% 33 | ``` 34 | Latencies [min, mean, 50, 90, 95, 99, max] 290.812µs, 414.588µs, 344.785µs, 481.47µs, 591.848µs, 2.359ms, 5.04ms 35 | ``` 36 | - 多线程的延迟 rate 2000/s, 成功率 75.38% 37 | ``` 38 | Latencies [min, mean, 50, 90, 95, 99, max] 261.772µs, 4.391s, 799.417ms, 28.908s, 30s, 30.001s, 30.005s 39 | ``` 40 | 41 | ### 当增加请求频率到2500/s时,CPU出现瓶颈, 42 | - 异步的成功率在31.06% 43 | ``` 44 | Latencies [min, mean, 50, 90, 95, 99, max] 5.224ms, 13.68s, 8.096s, 30s, 30.001s, 30.002s, 30.005s 45 | Success [ratio] 31.06% 46 | ``` 47 | - 多线程的成功率在16.26% 48 | ``` 49 | Latencies [min, mean, 50, 90, 95, 99, max] 13.181ms, 26.705s, 30s, 30.001s, 30.001s, 30.003s, 30.006s 50 | Success [ratio] 16.26% 51 | ``` 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP 2 | 3 | ## 欢迎来我的知乎博客提问: https://www.zhihu.com/people/be_with_you 4 | 5 | ## News 6 | translate 域适应翻译模型, 微调模型命令 7 | ```buildoutcfg 8 | python run_translation.py --model_name_or_path 9 | facebook/m2m100_418M 10 | --do_train 11 | --do_eval 12 | --fp16 True 13 | --dataset_name custom_zh_en 14 | --source_lang zh 15 | --target_lang en 16 | --output_dir output/zh-en-translation 17 | --per_device_train_batch_size=8 18 | --per_device_eval_batch_size=8 19 | --overwrite_output_dir 20 | --predict_with_generate 21 | ``` 22 | 23 | ## 主要使用tfidf+doc2vec+albert实现无监督文本分类, 代码有些粗鄙,有些问题已经在代码中标注,我会逐渐修改完善,欢迎任何意见和建议! 24 | 实际测试准确率不高,还是需要结合人工制定规则+有监督训练,在结合部分无监督训练可能有更好效果 25 | 26 | ### 无监督分类算法,使用tfidf, doc2vec, albert 27 | ![构思图](unsuper.png) 28 | ### python文件 29 | unsuper_classification.py 30 | 31 | 32 | ## TextCNN 通用文本分类模型 33 | TextCNN文件夹 34 | 注意:数据目录下的每个文件夹放好对应的要训练的目录,目录里面放好单个文件就可以 35 | /data/train/ 36 | ``` 37 | ├── Readme.md 38 | ├── config.py 39 | ├── data 40 | │   ├── eval 41 | │   │   ├── 新闻 42 | │   │   ├── 科技 43 | │   │   └── 天气 44 | │   ├── predict 45 | │   └── train 46 | │   ├── 新闻 47 | │   ├── 科技 48 | │   └── 天气 49 | ├── data_utils.py 50 | ├── main.py 51 | ├── model 52 | └── model_utils.py 53 | ``` 54 | 55 | ## Res2Net 通用图片分类模型 56 | Res2Net文件夹 57 | ``` 58 | ├── __init__.py 59 | ├── config.py 60 | ├── data 61 | │   ├── eval 62 | │   │   ├── 狗 63 | │   │   ├── 猫 64 | │   │   ├── 鸡 65 | │   │   └── 鸭 66 | │   ├── predict 67 | │   ├── test 68 | │   │   ├── 狗 69 | │   │   ├── 猫 70 | │   │   ├── 鸡 71 | │   │   └── 鸭 72 | │   └── train 73 | │   ├── 狗 74 | │   ├── 猫 75 | │   ├── 鸡 76 | │   └── 鸭 77 | ├── data_utils.py #数据处理 78 | ├── main.py #程序入口,支持训练,预测,继续训练 79 | ├── model #保存模型 80 | ├── model_utils.py #模型处理 81 | └── res2next.py #res2next模型 82 | ``` 83 | -------------------------------------------------------------------------------- /异步测试/test_flask_api_server_v2.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import requests 3 | import time 4 | 5 | url = 'http://127.0.0.1:5000' 6 | #确保Flask server已经启动 7 | 8 | r = requests.get(url + '/gentoken') 9 | res = r.json() 10 | headers = { 11 | 'token': res['token'], 12 | 'timestamp': res['timestamp'], 13 | 'sign': res['sign'] 14 | } 15 | 16 | class FlaskTestCase(unittest.TestCase): 17 | def test_get_echo(self): 18 | """测试服务器正常启动""" 19 | r = requests.get(url +'/echo') 20 | self.assertEqual(r.status_code, 200) 
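    # Hedged helper, not called by the original tests: test_try_upload_async further down
    # waits a fixed 3 seconds before reading /upload_async_result. Because the server
    # answers with code == 1 while recognition is still running and code == 0 once it is
    # done, a test could instead poll until the task finishes (endpoint and field names
    # taken from flask_api_server_v2.py):
    def wait_for_async_result(self, timeout=10, interval=0.5):
        """Poll /upload_async_result until the server reports code == 0 or the timeout expires."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            r = requests.get(url + '/upload_async_result', headers=headers)
            if r.status_code == 200 and r.json().get('code') == 0:
                return r.json()
            time.sleep(interval)
        raise TimeoutError('upload_async result was not ready within {} seconds'.format(timeout))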
21 | 22 | def test_get_token(self): 23 | """测试获取token""" 24 | r = requests.get(url+'/gentoken') 25 | self.assertEqual(r.status_code, 200) 26 | self.assertIn('sign', r.json()) 27 | self.assertIn('timestamp', r.json()) 28 | self.assertIn('token', r.json()) 29 | 30 | def test_try_upload_sync(self): 31 | """上传图片文件,并立即处理,返回成功结果""" 32 | with open("test.png", 'rb') as img: 33 | files = { 34 | 'image': img 35 | } 36 | r = requests.post(url + '/upload_sync', headers=headers, files=files) 37 | self.assertEqual(r.status_code, 200) 38 | self.assertEqual(r.json()['code'], 0) 39 | 40 | def test_try_upload_async(self): 41 | """测试上传图片,不能立即处理完成,先返回成功处理的页面,等待用户调取""" 42 | with open("test.png", 'rb') as img: 43 | files = { 44 | 'image': img 45 | } 46 | r = requests.post(url + '/upload_async', headers=headers, files=files) 47 | self.assertEqual(r.status_code, 200) 48 | time.sleep(3) 49 | r = requests.get(url+'/upload_async_result', headers=headers) 50 | self.assertEqual(r.status_code, 200) 51 | self.assertEqual(r.json()['code'], 0) 52 | 53 | if __name__ == '__main__': 54 | ##确保Flask server已经启动 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /异步测试/aiohttp_api_server.py: -------------------------------------------------------------------------------- 1 | from aiohttp import web 2 | from typing import List 3 | import os 4 | 5 | 6 | Tokens = [ 7 | {"sign": "c8499156ae8acc3f2d6a1453b9315eb1", "timestamp": "1592805495.355849", "token": "ff1c1eef10cad322ddbcd842a952b46c"} 8 | ] 9 | UPLOAD_FOLDER = 'images/' 10 | 11 | def verify(token:str, timestamp:str ,sign:str)-> bool: 12 | """ 13 | 验证token,timestamp,sign 14 | :param token: token 15 | :param timestamp: 时间戳 16 | :param sign: 签名 17 | :return: bool 18 | """ 19 | for tk in Tokens: 20 | tkv = tk.values() 21 | if token in tkv and timestamp in tkv and sign in tkv: 22 | return True 23 | return False 24 | 25 | async def authorize(request: web.Request): 26 | if not 'token' in request.headers or not 'timestamp' in request.headers or not 'sign' in request.headers: 27 | return web.HTTPForbidden() 28 | 29 | token = request.headers['token'] 30 | timestamp = request.headers['timestamp'] 31 | sign = request.headers['sign'] 32 | if not verify(token,timestamp,sign): 33 | return web.HTTPForbidden() 34 | request.transport.write(b"HTTP/1.1 100 Continue\r\n\r\n") 35 | 36 | async def handle(request): 37 | name = request.match_info.get('name', "Anonymous") 38 | text = "Hello, " + name 39 | return web.Response(text=text) 40 | 41 | async def echo(request): 42 | return web.Response(text='echo') 43 | 44 | async def upload(request): 45 | post = await request.post() 46 | image = post.get("image") 47 | with open(UPLOAD_FOLDER + image.filename, 'wb') as file: 48 | file.write(image.file.read()) 49 | return web.json_response({'result': 'success upload'}) 50 | 51 | app = web.Application() 52 | app.add_routes([web.get('/', handle), 53 | web.post('/upload', upload, expect_handler=authorize), 54 | web.get('/echo', echo)]) 55 | 56 | if __name__ == '__main__': 57 | web.run_app(app, port=5000, host='0.0.0.0' ) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .idea 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 
19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /异步测试/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #每秒发送请求次数 3 | RATE=2000/s 4 | #测试时间,测试,总测试请求RATE*DURATION 5 | DURATION=3s 6 | 7 | #如果使用多线程,线程个数 8 | THREAD=10 9 | #如果使用多进程,进程个数 10 | WORKER=4 11 | 12 | #单核单线程 13 | flaskbench_singlethread(){ 14 | title=bench_singlethread 15 | echo "启动单核单线程server" 16 | python flask_api_server.py & > /dev/null 2>&1 17 | # ps aux | grep flask | grep -v grep > bench_singlethread.html 18 | echo "开始压测并生成结果" 19 | $1 $title 20 | pkill python 21 | echo "关闭server" 22 | } 23 | 24 | #单核10线程 25 | flaskbench_multithread(){ 26 | title=bench_multithread 27 | echo "启动单核多线程server" 28 | bash gunicorn_flask_api_server.sh & 29 | # ps aux | grep flask | grep -v grep > bench_multithread.html 30 | #等待gunicorn完全启动 31 | sleep 1 32 | echo "开始压测并生成结果" 33 | $1 $title 34 | pkill python 35 | sleep 1 36 | echo "关闭server" 37 | } 38 | 39 | #多核单线程 40 | flaskbench_multiprocess(){ 41 | title=bench_multiprocess 42 | #禁用线程FLAG,改用多核 43 | USERTHREAD=1 44 | echo "启动多核单线程 gunicorn_flask_api_server server" 45 | bash gunicorn_flask_api_server.sh & 46 | # ps aux | grep flask | grep -v grep > bench_multiprocess.html 47 | #等待gunicorn完全启动 48 | sleep 1 49 | echo "开始压测并生成结果" 50 | $1 $title 51 | pkill python 52 | sleep 1 53 
| echo "关闭server" 54 | } 55 | 56 | #ayncio异步server测试 57 | flaskbench_aiohttp(){ 58 | title=bench_aiohttp 59 | #禁用线程FLAG,改用多核 60 | echo "启动aiohttp_api_server server" 61 | python aiohttp_api_server.py & > /dev/null 2>&1 62 | # ps aux | grep flask | grep -v grep > bench_multiprocess.html 63 | #等待gunicorn完全启动 64 | sleep 1 65 | echo "开始压测并生成结果" 66 | $1 $title 67 | pkill python 68 | sleep 1 69 | echo "关闭server" 70 | } 71 | 72 | #无负载压力测试 73 | noload(){ 74 | jq -ncM '{method: "GET", url: "http://127.0.0.1:5000/echo", body: "Hello!" | @base64 }' | vegeta attack -format=json -rate=$RATE -duration=$DURATION > results.bin 75 | cat results.bin | vegeta report > $1.txt 76 | cat results.bin | vegeta plot -title $1 > $1.html 77 | rm results.bin 78 | } 79 | 80 | #加上传图片负载时的测试结果, 图片的上传方法暂时无文档,todo 81 | uploadimage(){ 82 | jq -ncM '{method: "POST", url: "http://127.0.0.1:5000/upload", file:"image=@test.png", header: {"token": "ff1c1eef10cad322ddbcd842a952b46c", "timestamp":"1592805495.355849", "sign":"c8499156ae8acc3f2d6a1453b9315eb1"} }' | vegeta attack -format=json -rate=$RATE -duration=$DURATION > results.bin 83 | cat results.bin | vegeta report > $1.txt 84 | cat results.bin | vegeta plot -title $1 > $1.html 85 | rm results.bin 86 | } 87 | 88 | #flaskbench_singlethread noload 89 | flaskbench_multithread noload 90 | #flaskbench_multiprocess noload 91 | flaskbench_aiohttp noload -------------------------------------------------------------------------------- /text_classsification/utils/network_utils.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def build_optimizer_parameters_func(flags): 7 | """ 8 | 构建优化器的参数 9 | :param flags: 10 | :return: 11 | """ 12 | optimizer_type = flags.optimizer_type 13 | parameters = {} 14 | 15 | # 添加各自不同优化器对应的参数 16 | if optimizer_type == 'adam': 17 | parameters['beta1'] = flags.adam_beta1 18 | parameters['beta2'] = flags.adam_beta2 19 | parameters['epsilon'] = flags.adam_epsilon 20 | elif optimizer_type == 'momentum': 21 | parameters['momentum'] = flags.momentum 22 | 23 | def build_optimizer_parameters(global_step): 24 | # 添加共同参数: learning_rate 25 | learning_rate_type = flags.learning_rate_type 26 | base_learning_rate = flags.base_learning_rate 27 | if learning_rate_type == 'exponential': 28 | tf.logging.info("使用指数变化学习率形式.....") 29 | # staircase=False:decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) 30 | # staircase=True:decayed_learning_rate = learning_rate * decay_rate ^ int(global_step / decay_steps) 31 | lr = tf.train.exponential_decay( 32 | learning_rate=base_learning_rate, # 基础学习率 33 | global_step=global_step, # 迭代的步数 34 | decay_steps=flags.lr_decay_steps, # 间隔大小 35 | decay_rate=flags.lr_decay_rate, # 缩放比例 36 | staircase=flags.lr_staircase, # 是否整间隔的进行缩放 37 | name="exponential_learning_rate") 38 | pass 39 | elif learning_rate_type == 'polynomial': 40 | tf.logging.info("使用多项式变化学习率形式.....") 41 | # global_step = min(global_step, decay_steps) 42 | # decayed_learning_rate = (learning_rate - end_learning_rate) * (1 - global_step / decay_steps) ^ (power) + end_learning_rate 43 | lr = tf.train.polynomial_decay( 44 | learning_rate=base_learning_rate, # 基础学习率 45 | global_step=global_step, # 迭代的步数 46 | decay_steps=flags.lr_decay_steps, # 间隔大小 47 | end_learning_rate=flags.end_learning_rate, # 最终学习率大小 48 | power=1.0, # 给定是否的时候是否是线性的系数 49 | cycle=True, # 当学习率为最小值的时候,是否将学习率重置设置比较大,然后再进行学习率下降的操作 50 | name="polynomial_learning_rate") 51 | else: 52 | 
tf.logging.info("使用常数不变的学习率.....") 53 | lr = tf.constant(base_learning_rate, name='lr') 54 | parameters['learning_rate'] = lr 55 | tf.summary.scalar('learning_rate', lr) 56 | 57 | return parameters 58 | 59 | return build_optimizer_parameters 60 | -------------------------------------------------------------------------------- /text_classsification/utils/data_helpers.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import numpy as np 4 | import re 5 | 6 | 7 | # 清洗字符串,字符切分 8 | def clean_str(string): 9 | """ 10 | Tokenization/string cleaning for all datasets except for SST. 11 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 12 | """ 13 | string = re.sub(r"[^\u4e00-\u9fa5A-Za-z0-9(),.!?,。?!、“”\'\`]", " ", string) # 考虑到中文 14 | string = re.sub(r"\'s", " \'s", string) 15 | string = re.sub(r"\'ve", " \'ve", string) 16 | string = re.sub(r"n\'t", " n\'t", string) 17 | string = re.sub(r"\'re", " \'re", string) 18 | string = re.sub(r"\'d", " \'d", string) 19 | string = re.sub(r"\'ll", " \'ll", string) 20 | string = re.sub(r",", " , ", string) 21 | string = re.sub(r"!", " ! ", string) 22 | string = re.sub(r"\(", " \( ", string) 23 | string = re.sub(r"\)", " \) ", string) 24 | string = re.sub(r"\?", " \? ", string) 25 | string = re.sub(r"\s{2,}", " ", string) 26 | return string.strip().lower() 27 | 28 | 29 | def load_data_and_labels(positive_data_file, negative_data_file): 30 | """ 31 | 基于给定的正例和负例文件路径加载数据 32 | :param positive_data_file: 33 | :param negative_data_file: 34 | :return: 35 | """ 36 | # 1. 加载所有数据组成list列表 37 | positive = open(positive_data_file, 'rb').read().decode('utf-8') 38 | negative = open(negative_data_file, 'rb').read().decode('utf-8') 39 | 40 | # 2.数据的划分(转换成一个一个样本) 41 | positive = positive.split("\n") 42 | negative = negative.split("\n") 43 | 44 | # 3. 数据简单处理 45 | positive = [clean_str(s.strip()) for s in positive] 46 | negative = [clean_str(s.strip()) for s in negative] 47 | positive = [s for s in positive if len(s) > 0] 48 | negative = [s for s in negative if len(s) > 0] 49 | 50 | # 4. 数据合并得到x 51 | texts = positive + negative 52 | 53 | # 5. 得到对应的id 54 | labels = [1] * len(positive) + [0] * len(negative) 55 | 56 | # 6. 
结果返回 57 | return np.asarray(texts), np.asarray(labels) 58 | 59 | 60 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 61 | """ 62 | 基于给定的data数据获取批次数据 63 | :param data: 64 | :param batch_size: 65 | :param num_epochs: 66 | :param shuffle: 67 | :return: 68 | """ 69 | data = np.array(data) 70 | data_size = len(data) 71 | # 一个epoch里面有多少个bachsize 72 | num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1 73 | for epoch in range(num_epochs): 74 | # Shuffle the data at each epoch 75 | if shuffle: 76 | # 传给permutation一个矩阵,它会返回一个洗牌后的矩阵副本 77 | shuffle_indices = np.random.permutation(np.arange(data_size)) 78 | shuffled_data = data[shuffle_indices] 79 | else: 80 | shuffled_data = data 81 | for batch_num in range(num_batches_per_epoch): 82 | start_index = batch_num * batch_size 83 | end_index = min((batch_num + 1) * batch_size, data_size) 84 | yield shuffled_data[start_index:end_index] 85 | 86 | 87 | if __name__ == '__main__': 88 | texts, labels = load_data_and_labels() 89 | # from utils.vocabulary_utils import VocabularyProcessorUtil, split_with_word 90 | # 91 | # _, vocabulary = VocabularyProcessorUtil.load_word2vec_embedding("../model/w2v.bin") 92 | # VocabularyProcessorUtil.building_model(documents=texts, save_path='../model/vocab.pkl', max_document_length=512, 93 | # vocabulary=vocabulary, 94 | # split_fn=split_with_word) 95 | # model = VocabularyProcessorUtil.load_model('../model/vocab.pkl') 96 | 97 | -------------------------------------------------------------------------------- /translate/data/custom_zh_en.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # @Date : 2021/4/27 10:30 上午 3 | # @File : custom_zh_en.py 4 | # @Author: johnson 5 | # @Contact : github: johnson7788 6 | # @Desc : 中文到英文的数据集 7 | 8 | from __future__ import absolute_import, division, print_function 9 | 10 | import csv 11 | import json 12 | import os 13 | 14 | import datasets 15 | 16 | 17 | 18 | _CITATION = """\ 19 | @InProceedings{huggingface:dataset, 20 | title = {repair test}, 21 | authors={johnson 22 | }, 23 | year={2020} 24 | } 25 | """ 26 | 27 | #数据集描述 28 | _DESCRIPTION = """\ 29 | 中文到英文的句子翻译 30 | """ 31 | 32 | _HOMEPAGE = "johnson homepage" 33 | 34 | _LICENSE = "johnson license" 35 | 36 | # 数据集下载地址 37 | _URLs = { 38 | 'custom_zh_en': "https://huggingface.co/great-new-dataset-first_domain.zip", 39 | } 40 | 41 | 42 | #通常CamelCase命名 43 | class ZhEnDataset(datasets.GeneratorBasedBuilder): 44 | """连贯性测试数据集""" 45 | 46 | VERSION = datasets.Version("1.1.0") 47 | 48 | BUILDER_CONFIGS = [ 49 | datasets.BuilderConfig(name="custom_zh_en", version=VERSION, description="正常数量数据集"), 50 | ] 51 | 52 | DEFAULT_CONFIG_NAME = "custom_zh_en" 53 | def _info(self): 54 | return datasets.DatasetInfo( 55 | description=_DESCRIPTION, 56 | features=datasets.Features( 57 | {"translation": datasets.features.Translation(languages=('zh', 'en'))} 58 | ), 59 | supervised_keys=('zh', 'en'), 60 | homepage=_HOMEPAGE, 61 | license=_LICENSE, 62 | citation=_CITATION, 63 | ) 64 | def _split_generators(self, dl_manager): 65 | """下载数据集 66 | 此方法的任务是下载/提取数据并根据配置定义拆分 67 | 根据不同的配置BUILDER_CONFIGS,和数据集的name定义 68 | """ 69 | # dl_manager是一个datasets.download.DownloadManager,可用于下载和提取URL, 70 | # 它可以接受任何类型或嵌套的列表/字典,并将返回相同的结构,url也可以替换为局部文件的路径。 71 | # 默认情况下,将提取压缩包,如果文件是压缩的,并返回提取压缩的缓存文件夹的路径,而不是压缩文件 72 | return [ 73 | datasets.SplitGenerator( 74 | name=datasets.Split.TRAIN, 75 | # 下面的参数将传给 _generate_examples 76 | gen_kwargs={ 77 | "filepath": self.config.data_files['train'], 78 | "split": "train", 79 | 
}, 80 | ), 81 | datasets.SplitGenerator( 82 | name=datasets.Split.TEST, 83 | # 下面的参数将传给 _generate_examples 84 | gen_kwargs={ 85 | "filepath": self.config.data_files['test'], 86 | "split": "test" 87 | }, 88 | ), 89 | datasets.SplitGenerator( 90 | name=datasets.Split.VALIDATION, 91 | # 下面的参数将传给 _generate_examples 92 | gen_kwargs={ 93 | "filepath": self.config.data_files['validation'], 94 | "split": "dev", 95 | }, 96 | ), 97 | ] 98 | 99 | def _generate_examples(self, filepath, split): 100 | """ Yields 方法返回每个样本. """ 101 | # 被函数_split_generators 调用,参数也是通过 gen_kwargs被传过来 102 | # 它负责打开给定的文件并从数据集中产生(key, example)元组 103 | # key是不重要的,只是习惯于这样 104 | zhfile, enfile = filepath 105 | with open(zhfile, encoding="utf-8") as zf, open(enfile, encoding="utf-8") as ef: 106 | zhlines = zf.readlines() 107 | eflines = ef.readlines() 108 | assert len(zhlines) == len(eflines), "警告:读入的2个文件总的行数不等" 109 | for id_, (zh, en) in enumerate(zip(zhlines, eflines)): 110 | yield id_, {'translation': 111 | {'zh': zh, 112 | 'en': en 113 | } 114 | } -------------------------------------------------------------------------------- /异步测试/quart_api_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | from quart import Quart, request, jsonify, abort 3 | from werkzeug.utils import secure_filename 4 | import hashlib 5 | from functools import wraps 6 | import time 7 | 8 | Tokens = [ 9 | {"sign": "c8499156ae8acc3f2d6a1453b9315eb1", "timestamp": "1592805495.355849", "token": "ff1c1eef10cad322ddbcd842a952b46c"} 10 | ] 11 | 12 | UPLOAD_FOLDER = 'images/' 13 | ALLOWED_EXTENSIONS = set(['jpg', 'png']) 14 | 15 | app = Quart(__name__) 16 | 17 | def verify(token:str, timestamp:str ,sign:str)-> bool: 18 | """ 19 | 验证token,timestamp,sign 20 | :param token: token 21 | :param timestamp: 时间戳 22 | :param sign: 签名 23 | :return: bool 24 | """ 25 | for tk in Tokens: 26 | tkv = tk.values() 27 | if token in tkv and timestamp in tkv and sign in tkv: 28 | return True 29 | return False 30 | 31 | def authorize(f): 32 | @wraps(f) 33 | def decorated_function(*args, **kws): 34 | if not 'token' in request.headers or not 'timestamp' in request.headers or not 'sign' in request.headers: 35 | abort(401) 36 | 37 | token = request.headers['token'] 38 | timestamp = request.headers['timestamp'] 39 | sign = request.headers['sign'] 40 | if not verify(token,timestamp,sign): 41 | abort(401) 42 | return f(*args, **kws) 43 | return decorated_function 44 | 45 | def cal_md5(content) -> str: 46 | """ 47 | 给定content,计算md5 48 | :param content: 49 | :return: 50 | """ 51 | md5 = hashlib.md5() 52 | content=str(content) 53 | md5.update(content.encode('UTF-8')) 54 | result = md5.hexdigest() 55 | return result 56 | 57 | @app.route("/token", methods=['GET', 'POST']) 58 | async def generate_token(): 59 | """ 60 | 生成用户token 61 | :return: 62 | """ 63 | rand = os.urandom(32) 64 | token = cal_md5(rand) 65 | timestamp = str(time.time()) 66 | sign = cal_md5(str(token)+timestamp) 67 | TK = {'token': token, 'timestamp':timestamp, 'sign':sign} 68 | Tokens.append(TK) 69 | return jsonify({'token': token, 'timestamp': timestamp, 'sign': sign}) 70 | 71 | def allowed_file(filename): 72 | return '.' 
in filename and \ 73 | filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS 74 | 75 | #做一些异步任务,测试 76 | async def blocking_io(): 77 | #文件操作,IO类型任务,例如日志等,使用线程或异步asyncio 78 | with open("/dev/urandom", "rb") as f: 79 | return f.read(100) 80 | async def cpu_bound(): 81 | # CPU-Bound, 消耗CPU的操作,使用多进程完成 82 | return sum(i * i for i in range(10 ** 6)) 83 | 84 | #模拟同步任务, 85 | @app.route("/upload_sync", methods=['POST']) 86 | async def upload_sync(): 87 | """ 88 | 上传图片文件,并立即处理,返回成功结果 89 | :return: 90 | """ 91 | if not os.path.exists(UPLOAD_FOLDER): 92 | os.mkdir(UPLOAD_FOLDER) 93 | file = request.files['image'] 94 | if file and allowed_file(file.filename): 95 | filename = secure_filename(file.filename) 96 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 97 | return jsonify({'result': 'success upload'}) 98 | 99 | #模拟异步任务 100 | @app.route("/upload_async", methods=['POST']) 101 | async def upload_async(): 102 | """ 103 | 上传图片,不能立即处理完成,先返回成功处理的页面,等待用户调取 104 | :return: 105 | """ 106 | with concurrent.futures.ProcessPoolExecutor() as pool: 107 | future = pool.submit(cpu_bound) 108 | for fut in concurrent.futures.as_completed([future]): 109 | return jsonify({'result': fut.done()}) 110 | 111 | @app.route("/echo", methods=['GET']) 112 | async def echo(): 113 | """ 114 | 测试服务器运行正常 115 | :return: 116 | """ 117 | return jsonify({'result': True}) 118 | 119 | if __name__ == "__main__": 120 | app.run(host='0.0.0.0', port=5000, debug=True, threaded=True) 121 | -------------------------------------------------------------------------------- /异步测试/flask_api_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | from flask import Flask, request, jsonify, abort 3 | from werkzeug.utils import secure_filename 4 | import hashlib 5 | from functools import wraps 6 | import time 7 | # import asyncio 8 | import concurrent.futures 9 | 10 | Tokens = [ 11 | {"sign": "c8499156ae8acc3f2d6a1453b9315eb1", "timestamp": "1592805495.355849", "token": "ff1c1eef10cad322ddbcd842a952b46c"} 12 | ] 13 | 14 | UPLOAD_FOLDER = 'images/' 15 | ALLOWED_EXTENSIONS = set(['jpg', 'png']) 16 | 17 | app = Flask(__name__) 18 | 19 | def verify(token:str, timestamp:str ,sign:str)-> bool: 20 | """ 21 | 验证token,timestamp,sign 22 | :param token: token 23 | :param timestamp: 时间戳 24 | :param sign: 签名 25 | :return: bool 26 | """ 27 | for tk in Tokens: 28 | tkv = tk.values() 29 | if token in tkv and timestamp in tkv and sign in tkv: 30 | return True 31 | return False 32 | 33 | def authorize(f): 34 | @wraps(f) 35 | def decorated_function(*args, **kws): 36 | if not 'token' in request.headers or not 'timestamp' in request.headers or not 'sign' in request.headers: 37 | abort(401) 38 | 39 | token = request.headers['token'] 40 | timestamp = request.headers['timestamp'] 41 | sign = request.headers['sign'] 42 | if not verify(token,timestamp,sign): 43 | abort(401) 44 | return f(*args, **kws) 45 | return decorated_function 46 | 47 | def cal_md5(content) -> str: 48 | """ 49 | 给定content,计算md5 50 | :param content: 51 | :return: 52 | """ 53 | md5 = hashlib.md5() 54 | content=str(content) 55 | md5.update(content.encode('UTF-8')) 56 | result = md5.hexdigest() 57 | return result 58 | 59 | @app.route("/token", methods=['GET', 'POST']) 60 | def generate_token(): 61 | """ 62 | 生成用户token 63 | :return: 64 | """ 65 | rand = os.urandom(32) 66 | token = cal_md5(rand) 67 | timestamp = str(time.time()) 68 | sign = cal_md5(str(token)+timestamp) 69 | TK = {'token': token, 'timestamp':timestamp, 'sign':sign} 70 | Tokens.append(TK) 71 | 
return jsonify({'token': token, 'timestamp': timestamp, 'sign': sign}) 72 | 73 | def allowed_file(filename): 74 | return '.' in filename and \ 75 | filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS 76 | 77 | #同步任务 78 | @app.route("/upload", methods=['POST', 'GET']) 79 | @authorize 80 | def upload(): 81 | """ 82 | 上传图片文件 83 | :return: 84 | """ 85 | if not os.path.exists(UPLOAD_FOLDER): 86 | os.mkdir(UPLOAD_FOLDER) 87 | file = request.files['image'] 88 | if file and allowed_file(file.filename): 89 | filename = secure_filename(file.filename) 90 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 91 | return jsonify({'result': 'success upload'}) 92 | 93 | #做一些异步任务,测试 94 | def blocking_io(): 95 | #文件操作,IO类型任务,例如日志等,使用线程或异步asyncio 96 | with open("/dev/urandom", "rb") as f: 97 | return f.read(100) 98 | def cpu_bound(): 99 | # CPU-Bound, 消耗CPU的操作,使用多进程完成 100 | return sum(i * i for i in range(10 ** 6)) 101 | 102 | @app.route("/process", methods=['GET']) 103 | @authorize 104 | def do_multiprocess(): 105 | """ 106 | 使用进程池处理cpu-bound型任务 107 | :return: 108 | """ 109 | with concurrent.futures.ProcessPoolExecutor() as pool: 110 | future = pool.submit(cpu_bound) 111 | for fut in concurrent.futures.as_completed([future]): 112 | return jsonify({'result': fut.done()}) 113 | 114 | @app.route("/echo", methods=['GET']) 115 | def echo(): 116 | """ 117 | 测试服务器运行正常 118 | :return: 119 | """ 120 | return jsonify({'result': True}) 121 | 122 | if __name__ == "__main__": 123 | app.run(host='0.0.0.0', port=5000, debug=True, threaded=True) 124 | -------------------------------------------------------------------------------- /TextCNN/main.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import argparse 3 | import torch 4 | import model_utils 5 | import data_utils 6 | from config import Config 7 | 8 | #TextCNN模型 9 | def dotrain(): 10 | parser = argparse.ArgumentParser(description='Text CNN 分类器') 11 | parser.add_argument('--model', type=str, default="model/textcnn.model", help='读取model继续训练') 12 | conf = Config() 13 | #打印模型配置信息 14 | conf.dump() 15 | args = parser.parse_args() 16 | if not os.path.isdir("model"): 17 | os.mkdir("model") 18 | print("处理训练数据") 19 | train_iter, text_field, label_field = data_utils.text_dataloader(conf.train_dir, conf.batch_size) 20 | #使用pickle保存字典到本地 21 | data_utils.save_vocab(text_field.vocab, "model/text.vocab") 22 | data_utils.save_vocab(label_field.vocab, "model/label.vocab") 23 | 24 | #添加新的配置,嵌入的维度vocab_num, 分类的类别数量class_num, 25 | conf.vocab_num = len(text_field.vocab) 26 | conf.class_num = len(label_field.vocab) - 1 27 | # 卷积核大小, 代表跨越的句子和字的大小, 找打相邻字直接的联系, 例如[3, 4, 5] 28 | conf.kernel_sizes = [int(k) for k in conf.kernel_sizes.split(',')] 29 | 30 | #模型加载和初始化 31 | if os.path.exists(args.model): 32 | print('发现模型文件, 加载模型: {}'.format(args.model)) 33 | cnn = torch.load(args.model) 34 | else: 35 | cnn = model_utils.TextCNN(conf) 36 | #模型训练 37 | try: 38 | model_utils.train(train_iter, cnn, conf) 39 | except KeyboardInterrupt: 40 | print('-' * 80) 41 | print('提前退出训练.') 42 | 43 | #评估模型 44 | def doeval(): 45 | parser = argparse.ArgumentParser(description='Text CNN 分类器') 46 | #必须指定已经训练好的模型 47 | parser.add_argument('--model', type=str, default="model/textcnn.model", help='读取model进行评估') 48 | conf = Config() 49 | #打印模型配置信息 50 | conf.dump() 51 | args = parser.parse_args() 52 | print("加载测试数据") 53 | #测试时不进行数据打乱操作 54 | eval_iter, text_field, label_field = data_utils.text_dataloader(conf.eval_dir, conf.batch_size, shuffle=False) 55 | # 模型加载和初始化 56 | if 
os.path.exists(args.model): 57 | print('发现模型文件, 加载模型: {}'.format(args.model)) 58 | cnn = torch.load(args.model) 59 | else: 60 | print("未找到模型文件,退出") 61 | sys.exit(-1) 62 | #加载以保存的字典 63 | text_field.vocab = data_utils.load_vocab("model/text.vocab") 64 | label_field.vocab = data_utils.load_vocab("model/label.vocab") 65 | #开始模型评估 66 | model_utils.eval(eval_iter, cnn, conf) 67 | 68 | #预测 69 | def dopredict(): 70 | """ 71 | 给定一个文件或一句话,预测结果 72 | :return: 73 | """ 74 | parser = argparse.ArgumentParser(description='Text CNN 分类器') 75 | #必须指定已经训练好的模型 76 | parser.add_argument('--path', type=str, default="data/predict/",help='要进行预测的文本文件的路径,或文件夹') 77 | parser.add_argument('--model', type=str, default="model/textcnn.model", help='读取model进行预测') 78 | conf = Config() 79 | args = parser.parse_args() 80 | #指定Field格式 81 | text_field = data_utils.TextTEXT 82 | label_field = data_utils.TextLABEL 83 | text_field.vocab = data_utils.load_vocab("model/text.vocab") 84 | label_field.vocab = data_utils.load_vocab("model/label.vocab") 85 | # 模型加载和初始化 86 | if os.path.exists(args.model): 87 | print('发现模型文件, 加载模型: {}'.format(args.model)) 88 | cnn = torch.load(args.model) 89 | else: 90 | print("未找到模型文件,退出") 91 | sys.exit(-1) 92 | #如果是文件夹,那么预测里面的文件,否则就是文件,直接预测 93 | if os.path.isdir(args.path): 94 | files = os.listdir(args.path) 95 | files_path = [args.path+f for f in files] 96 | else: 97 | files_path = [args.path] 98 | #开始预测 99 | for file in files_path: 100 | text, label = model_utils.predict(file, cnn, text_field, label_field, conf.cuda) 101 | print('[path] {}\n[Text] {}\n[Label] {}\n'.format(file, text, label)) 102 | print(f'共预测{len(files_path)}个文件') 103 | 104 | if __name__ == '__main__': 105 | dotrain() 106 | # doeval() 107 | # dopredict() -------------------------------------------------------------------------------- /TextCNN/data_utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import torchtext 4 | import jieba 5 | from typing import List 6 | 7 | class TextDataset(torchtext.data.Dataset): 8 | """ 9 | 读取数据并处理 10 | """ 11 | @staticmethod 12 | def sort_key(example): 13 | """ 14 | 用于在torchtext.data.Iterator生成批次迭代器的时候,用于example进行排序以将具有相似长度的example批次放在一起并最小化填充 15 | 如果在使用torchtext.data.Iterator时,提供了sort_key,那么就会覆盖这个Dataset的sort_key属性, 默认为None 16 | :param example: 是按example中单个样本的text属性的长度排序 17 | :return: 18 | """ 19 | return len(example.text) 20 | def __init__(self, path, text_field, label_field, **kwargs): 21 | """根据给的Field和数据集路径处理数据, 之后交给torchtext.data.Dataset处理 22 | Arguments: 23 | path: 数据集的路径 24 | text_field: text数据的Field格式 25 | label_field: label数据的Field格式 26 | **kwargs: data.Dataset的参数 27 | """ 28 | #定义fields 29 | fields = [('text', text_field), ('label', label_field)] 30 | #定义一个空的数据集 31 | examples = [] 32 | #列出当前目录下的所有文件夹,文件夹名称作为label,文件夹里面的文件内容作为text 33 | dirname = os.listdir(path) 34 | for dir in dirname: 35 | #循环一个label目录下的所有文件 36 | files = os.listdir(path + '/' + dir) 37 | for file in files: 38 | document = '' 39 | with open(path + '/' + dir + '/' + file, encoding="utf8", errors='ignore') as f: 40 | for line in f: 41 | if line != '\n': 42 | document += text_filter(line) 43 | # 如果文本长度小于10个字符,那么就过滤掉 44 | if len(document) < 10: 45 | continue 46 | text, label = document, dir 47 | #定义一个训练或测试的样本的Example格式 48 | example = torchtext.data.Example() 49 | # text_field.preprocess 是进行token的处理, 例如用jieba处理 50 | setattr(example, "text", text_field.preprocess(text)) 51 | setattr(example, "label", label_field.preprocess(label)) 52 | examples.append(example) 
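        # The Example objects collected above, together with the `fields` definition,
        # are what the torchtext.data.Dataset constructor below registers; that registration
        # is what later lets text_field.build_vocab(dataset) and torchtext.data.Iterator
        # consume this dataset inside text_dataloader().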
53 | super(TextDataset, self).__init__(examples, fields, **kwargs) 54 | 55 | def text_filter(sentence:str)-> str: 56 | """ 57 | 过滤掉非汉字和标点符号和非数字 58 | :param sentence: 59 | :return: 60 | """ 61 | line = sentence.replace('\n', '。') 62 | # 过滤掉非汉字和标点符号和非数字 63 | linelist = [word for word in line if 64 | word >= u'\u4e00' and word <= u'\u9fa5' or word in [',', '。', '?', '!', 65 | ':'] or word.isdigit()] 66 | return ''.join(linelist) 67 | 68 | #定义text的Field 69 | def text_token(sentence: str)-> List: 70 | """ 71 | 使用jieba分词 72 | :param sentence: 要分词的sentence 73 | :return: 一个text的分词后的列表 74 | """ 75 | return jieba.lcut(sentence) 76 | 77 | #sequential 是否要变成序列,tokenize表示使用的token 函数是, lower表示是否转换成小写 78 | TextTEXT = torchtext.data.Field(sequential=True, tokenize=text_token, lower=True) 79 | 80 | #定义label的Field 81 | TextLABEL = torchtext.data.Field(sequential=False, lower=True) 82 | 83 | def text_dataloader(path, batch_size, shuffle=False): 84 | """ 85 | 加载数据 86 | :param path: 训练集和测试集的文件路径 87 | :param batchsize: 批处理大小 88 | :param shuffle: 是否做shuffle 89 | :return: 90 | """ 91 | #定义text和label的 Field格式 92 | text_field = TextTEXT 93 | label_field = TextLABEL 94 | 95 | #读取数据 96 | #dataset 包含examples和fields2部分,examples保存所有的数据,field是这类数据的名字,例如field是(text,label), examples里面就是[(label的内容(纯文本),text内容(纯文本)),...] 97 | dataset = TextDataset(path, text_field, label_field) 98 | #构建字典,使用build_vocab之后text_field会多出一个vocab的属性,vocab中是字典 99 | text_field.build_vocab(dataset) 100 | label_field.build_vocab(dataset) 101 | #创建迭代器 102 | dataiter = torchtext.data.Iterator(dataset, batch_size, shuffle=shuffle, repeat=False) 103 | return dataiter, text_field, label_field 104 | 105 | def save_vocab(vocab, filename): 106 | """ 107 | 使用pickle保存字典 108 | :param vocab: 109 | :param filename: 110 | :return: 111 | """ 112 | with open(filename, 'wb') as f: 113 | pickle.dump(vocab, f) 114 | 115 | def load_vocab(filename): 116 | """ 117 | 使用pickle加载字典 118 | :param filename: 119 | :return: 120 | """ 121 | with open(filename, 'rb') as f: 122 | vocab = pickle.load(f) 123 | return vocab 124 | -------------------------------------------------------------------------------- /Res2Net/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from data_utils import load_data, display_prediction, process_image 3 | from config import Config 4 | import os 5 | from model_utils import ( 6 | create_model, #根据架构创建模型 7 | create_optimizer, # 为模型的最后的Classifier层添加优化器 8 | load_checkpoint, # 加载checkpoint,重建预训练模型 9 | plot_history, # 绘制历史的训练损失和准确率的图表 10 | save_checkpoint, #保存模型checkpoint 11 | train_model, #训练模型 12 | classify_image, #用于预测阶段,使用模型进行预测 13 | test_model) #使用测试集测试模型性能,并打印准确率 14 | 15 | def train(): 16 | conf = Config() 17 | # 打印模型配置信息 18 | conf.dump() 19 | parser = argparse.ArgumentParser(description='图片分类模型训练') 20 | parser.add_argument( 21 | '--resume_checkpoint', action='store', type=str, default='model/checkpoint.pth', 22 | help='从模型的checkpoint恢复模型,并继续训练,如果resume_checkpoint这个参数提供' 23 | '这些参数将忽略--arch, --learning_rate, --hidden_units, and --drop_p') 24 | args = parser.parse_args() 25 | 26 | #加载数据 27 | dataloaders, class_to_idx = load_data(conf.data_directory) 28 | 29 | #创建模型,如果模型文件存在 30 | if args.resume_checkpoint and os.path.exists(args.resume_checkpoint): 31 | #加载checkpoint 32 | print('resume_checkpoint已存在,开始加载模型') 33 | model, optimizer, epoch, history = load_checkpoint( 34 | checkpoint_path=args.resume_checkpoint, 35 | load_optimizer=True, gpu=conf.cuda) 36 | start_epoch = epoch + 1 37 | else: 38 | 
#创建新模型和优化器 39 | print('resume_checkpoint未设置或模型文件不存在,创建新的模型') 40 | model = create_model( 41 | arch=conf.arch, class_to_idx=class_to_idx, 42 | hidden_units=conf.hidden_units, drop_p=conf.dropout) 43 | optimizer = create_optimizer(model=model, lr=conf.learning_rate) 44 | start_epoch = 1 45 | history = None 46 | 47 | #训练模型 48 | history, best_epoch = train_model( 49 | dataloaders=dataloaders, model=model, 50 | optimizer=optimizer, gpu=conf.cuda, start_epoch=start_epoch, 51 | epochs=conf.epochs, train_history=history) 52 | 53 | #测试集上测试模型 54 | test_acc = test_model(dataloader=dataloaders['test'], model=model, gpu=conf.cuda) 55 | print(f'模型在测试集上的准确率是 {(test_acc * 100):.2f}%') 56 | 57 | #保存模型 58 | save_checkpoint( 59 | save_path=conf.save_path+conf.save_name, epoch=best_epoch, model=model, 60 | optimizer=optimizer, history=history) 61 | 62 | #绘制历史记录 63 | plot_history(history) 64 | 65 | def predict(): 66 | conf = Config() 67 | # 打印模型配置信息 68 | conf.dump() 69 | parser = argparse.ArgumentParser(description='图片分类模型训练') 70 | parser.add_argument( 71 | '--image_path', type=str,default='data/zhengjian/predict/test/3601216003722.jpg', help='指定要分类的路径') 72 | parser.add_argument( 73 | '--checkpoint', type=str, default='model/checkpoint.pth', help='指定checkpoint的模型的保存位置') 74 | parser.add_argument( 75 | '--top_k', type=int, default=2, help='选取topk概率的最大类别, dafault=2') 76 | args = parser.parse_args() 77 | 78 | # 加载转换,处理,转换图片到Tensor 79 | image_tensor = process_image(image_path=args.image_path) 80 | 81 | # 加载模型,是否使用gpu 82 | model, _, _, _ = load_checkpoint( 83 | checkpoint_path=args.checkpoint, load_optimizer=False, gpu=conf.cuda) 84 | 85 | #图片分类 86 | probabilities, predictions = classify_image( 87 | image_tensor=image_tensor, model=model, top_k=args.top_k, gpu=conf.cuda) 88 | 89 | #分类结果 90 | top_class = predictions[0] 91 | top_prob = probabilities[0] 92 | top_k = args.top_k 93 | print(f'\n预测概率最高的类别是 {top_class.capitalize()} ' 94 | f' 概率是{top_prob:.4f}') 95 | print(f'\n预测的topk是 {top_k} 类别是 {predictions}' 96 | f'概率是 {probabilities}') 97 | 98 | # 绘图 99 | display_prediction( 100 | image_path=args.image_path, 101 | probabilities=probabilities, 102 | predictions=predictions) 103 | 104 | if __name__ == '__main__': 105 | train() 106 | # predict() -------------------------------------------------------------------------------- /text_classsification/eval.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import os 4 | import csv 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from utils import data_helpers 9 | from utils.vocabulary_utils import VocabularyProcessorUtil 10 | 11 | # 数据文件 12 | tf.flags.DEFINE_string("positive_data_file", 13 | "Data source for the positive data.") 14 | tf.flags.DEFINE_string("negative_data_file", 15 | "Data source for the positive data.") 16 | # Eval Parameters 17 | tf.flags.DEFINE_string("network_name", None, "给定模型名称!!!") 18 | tf.flags.DEFINE_string("checkpoint_dir", "./model", "给定模型持久化的文件夹路径!") 19 | tf.flags.DEFINE_string("vocab_model_path", "./model/vocab.pkl", "给定词汇模型所在的磁盘路径") 20 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 21 | tf.flags.DEFINE_boolean("eval_train", True, "Evaluate on all training data") 22 | 23 | FLAGS = tf.flags.FLAGS 24 | 25 | 26 | def main(_): 27 | network_name = FLAGS.network_name 28 | if network_name is None: 29 | raise Exception("参数network_name必须给定!!!!") 30 | 31 | # 0. 
数据校验,要求训练数据文件文件存在 32 | if not (os.path.isfile(FLAGS.positive_data_file) and os.path.isfile(FLAGS.negative_data_file)): 33 | raise Exception("给定的训练数据必须是文件路径的形成!!!") 34 | 35 | with tf.Graph().as_default(): 36 | graph = tf.get_default_graph() 37 | with tf.Session() as sess: 38 | # 1. 加载词汇转换模型 39 | vocab_model_path = FLAGS.vocab_model_path 40 | if not tf.gfile.Exists(vocab_model_path): 41 | raise Exception("词汇转换模型必须存在,请检查磁盘路径:{}".format(vocab_model_path)) 42 | vocab_model = VocabularyProcessorUtil.load_model(save_path=vocab_model_path) 43 | 44 | # 2. 恢复加载网络 45 | ckpt = tf.train.get_checkpoint_state(checkpoint_dir=FLAGS.checkpoint_dir) 46 | if not (ckpt and ckpt.model_checkpoint_path): 47 | raise Exception("不存在对应的模型文件,请检查:{}".format(FLAGS.checkpoint_dir)) 48 | tf.logging.info("恢复模型:{}".format(ckpt.model_checkpoint_path)) 49 | saver = tf.train.import_meta_graph("{}.meta".format(ckpt.model_checkpoint_path)) 50 | saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path) 51 | 52 | # 3. 获取Tensor对象 53 | inputs = graph.get_tensor_by_name("{}/placeholders/input_word_id:0".format(network_name.upper())) 54 | dropout_keep_prob = graph.get_tensor_by_name("{}/placeholders/dropout_keep_prob:0".format(network_name.upper())) 55 | predictions = graph.get_tensor_by_name("{}/project/predictions:0".format(network_name.upper())) 56 | 57 | # 4. 加载数据 58 | tf.logging.info("开始加载文本数据,并转换处理......") 59 | old_texts, labels = data_helpers.load_data_and_labels( 60 | positive_data_file=FLAGS.positive_data_file, 61 | negative_data_file=FLAGS.negative_data_file 62 | ) 63 | 64 | # 4a. 文本数据id转换(截取、填充) 65 | texts = np.asarray(list(vocab_model.transform(old_texts))) 66 | # 4c. 构建批次 67 | batches = data_helpers.batch_iter( 68 | data=list(texts), 69 | batch_size=FLAGS.batch_size, # 每个批次的样本数据量 70 | num_epochs=1, # 总共迭代多少个epoch数据 71 | shuffle=False 72 | ) 73 | 74 | # 5. 遍历数据进行预测 75 | all_predictions = [] 76 | for x_test_batch in batches: 77 | batch_predictions = sess.run(predictions, {inputs: x_test_batch, dropout_keep_prob: 1.0}) 78 | # 数组拼接 79 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 80 | # 类型转换,以及格式/数据类型转换 81 | all_predictions = np.asarray(all_predictions, dtype=np.int32).reshape(-1) 82 | 83 | # 6. 
效果评估 84 | correct_predictions = float(sum(all_predictions == labels)) 85 | print("Total number of test examples: {}".format(len(labels))) 86 | print("Accuracy: {:g}".format(correct_predictions / float(len(labels)))) 87 | print("实际值为:\n{}".format(labels)) 88 | print("预测值为:\n{}".format(all_predictions)) 89 | 90 | # 将评价保存到CSV 91 | predictions_human_readable = np.column_stack((all_predictions, labels, np.array(old_texts))) 92 | out_path = os.path.join(FLAGS.checkpoint_dir, "prediction.csv") 93 | print("Saving evaluation to {0}".format(out_path)) 94 | # 参数:newline=''是给定不添加新行 95 | with open(out_path, 'w', encoding='utf-8', newline='') as f: 96 | writer = csv.writer(f) # 获取输出对象 97 | writer.writerows(predictions_human_readable) # 输出CSV格式 98 | 99 | 100 | if __name__ == '__main__': 101 | tf.logging.set_verbosity(tf.logging.INFO) 102 | tf.app.run() 103 | -------------------------------------------------------------------------------- /异步测试/flask_api_server_v2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from flask import Flask, request, jsonify, abort 3 | from werkzeug.utils import secure_filename 4 | import hashlib 5 | from functools import wraps 6 | import time 7 | from flask_executor import Executor 8 | from PIL import Image 9 | import pytesseract 10 | from flask_redis import FlaskRedis 11 | 12 | app = Flask(__name__) 13 | 14 | #redis用于存放用户的token和异步时临时存储用户的索引 15 | redis_client = FlaskRedis(app, charset='utf-8', decode_responses=True) 16 | 17 | #启动异步操作多线程 18 | executor = Executor(app) 19 | app.config['EXECUTOR_TYPE'] = 'thread' 20 | app.config['EXECUTOR_MAX_WORKERS'] = 5 21 | 22 | #存放图片位置 23 | UPLOAD_FOLDER = 'images/' 24 | ALLOWED_EXTENSIONS = set(['jpg', 'png']) 25 | 26 | 27 | @app.route("/echo", methods=['GET']) 28 | def echo(): 29 | """ 30 | 测试服务器运行正常 31 | :return: 32 | """ 33 | return jsonify({'result': True}) 34 | 35 | def verify(token:str, timestamp:str ,sign:str)-> bool: 36 | """ 37 | 验证token,timestamp,sign 38 | :param token: token 39 | :param timestamp: 时间戳 40 | :param sign: 签名 41 | :return: bool 42 | """ 43 | res = redis_client.hgetall(token) 44 | #如果token不存在,返回False 45 | if not res: 46 | return False 47 | #如果sign不正确,返回FALSE, 48 | res_vale = res.values() 49 | if timestamp in res_vale and sign in res_vale: 50 | return True 51 | return False 52 | 53 | def authorize(f): 54 | @wraps(f) 55 | def decorated_function(*args, **kws): 56 | #如果header不存在token等关键字,直接返回401 57 | if not 'token' in request.headers or not 'timestamp' in request.headers or not 'sign' in request.headers: 58 | abort(401) 59 | token = request.headers['token'] 60 | timestamp = request.headers['timestamp'] 61 | sign = request.headers['sign'] 62 | #如果token,签名验证不通过,返回401 63 | if not verify(token,timestamp,sign): 64 | abort(401) 65 | return f(*args, **kws) 66 | return decorated_function 67 | 68 | def cal_md5(content) -> str: 69 | """ 70 | 给定content,计算md5 71 | :param content: 72 | :return: 73 | """ 74 | md5 = hashlib.md5() 75 | content=str(content) 76 | md5.update(content.encode('UTF-8')) 77 | result = md5.hexdigest() 78 | return result 79 | 80 | @app.route("/gentoken", methods=['GET', 'POST']) 81 | def generate_token(): 82 | """ 83 | 生成用户token, 把token放入redis,以后可以把用户信息存入DB,临时的token放入redis 84 | :return: 85 | """ 86 | rand = os.urandom(32) 87 | token = cal_md5(rand) 88 | timestamp = str(time.time()) 89 | sign = cal_md5(str(token)+timestamp) 90 | TK = {'timestamp':timestamp, 'sign':sign} 91 | redis_client.hmset(token, TK) 92 | return jsonify({'token': token, 'timestamp': 
timestamp, 'sign': sign}) 93 | 94 | def allowed_file(filename: str)-> bool: 95 | """ 96 | 校验上传的图片格式, 如果格式正确返回True 97 | :param filename: 98 | :return: 99 | """ 100 | return '.' in filename and filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS 101 | 102 | def recognise(image: str) -> str: 103 | """ 104 | 使用tesseract识别图片 105 | :param image: 图片名字 106 | :return: 图片识别后的结果 107 | """ 108 | #tesseract路径 109 | pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract' 110 | #图片识别成文字 111 | res = pytesseract.image_to_string(Image.open(UPLOAD_FOLDER+'/'+image)) 112 | return res 113 | 114 | #同步任务 115 | @app.route("/upload_sync", methods=['POST']) 116 | @authorize 117 | def upload_sync(): 118 | """ 119 | 上传图片文件,并立即处理,返回成功结果 120 | :return: 121 | """ 122 | #存储图片 123 | if not os.path.exists(UPLOAD_FOLDER): 124 | os.mkdir(UPLOAD_FOLDER) 125 | file = request.files['image'] 126 | if file and allowed_file(file.filename): 127 | filename = secure_filename(file.filename) 128 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 129 | #开始识别图片 130 | res = recognise(filename) 131 | return jsonify({'code':0,'result': res}) 132 | 133 | 134 | @app.route("/upload_async", methods=['POST']) 135 | @authorize 136 | def upload_async(): 137 | """ 138 | 上传图片,不能立即处理完成,先返回接收成功的响应,等待用户稍后调取结果 139 | :return: 140 | """ 141 | if not os.path.exists(UPLOAD_FOLDER): 142 | os.mkdir(UPLOAD_FOLDER) 143 | file = request.files['image'] 144 | if file and allowed_file(file.filename): 145 | filename = secure_filename(file.filename) 146 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 147 | token = request.headers['token'] 148 | #参数说明,第一个参数是标识操作用户的token,后面是function和它的参数,用于并发执行图片识别 149 | executor.submit_stored(token, recognise, filename) 150 | return jsonify({'code':0, 'result':'upload success, Please get result from API upload_async_result'}) 151 | 152 | @app.route('/upload_async_result', methods=['GET']) 153 | @authorize 154 | def get_result(): 155 | token = request.headers['token'] 156 | #如果图片识别没有完成,那么返回图片正在识别中的状态,等待用户再次请求此接口 157 | if not executor.futures.done(token): 158 | return jsonify({'code':1, 'status': executor.futures._state(token), 'result': "Task is not complete, Please wait a second"}) 159 | #用户图片识别完成,获取识别结果并返回给用户 160 | future = executor.futures.pop(token) 161 | return jsonify({'code':0, 'status': 'done', 'result': future.result()}) 162 | 163 | if __name__ == "__main__": 164 | app.run(host='0.0.0.0', port=5000, debug=True, threaded=True) 165 | -------------------------------------------------------------------------------- /text_classsification/nets/text_rnn.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参数值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Lookup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 
类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_units: RNN Cell中的神经元数目 32 | :param layers: RNN的层次 33 | """ 34 | self.num_units = num_units # RNN Cell的神经元数目 35 | self.layers = layers # RNN的层次 36 | 37 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 38 | embedding_dimensions=embedding_dimensions, 39 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 40 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 41 | optimizer_type=optimizer_type, 42 | optimizer_parameters_func=optimizer_parameters_func, 43 | saver_parameters=saver_parameters) 44 | 45 | def interface(self): 46 | """ 47 | 前向网络构建 48 | batch_size: N 49 | feature height: H, 将序列长度T认为是H 50 | feature width: W,将Embedding size大小认为是W 51 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 52 | sentence_length: T 53 | embedding size: E 54 | :return: 55 | """ 56 | with tf.variable_scope(self.network_name): 57 | with slim.arg_scope(self.arg_score()): 58 | with tf.variable_scope("placeholders"): 59 | self.global_step = tf.train.get_or_create_global_step() 60 | # 输入的单词id,形状为:[N,T] 61 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 62 | # 希望输出的类别id, 形状为:[N,] 63 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 64 | # Dropout 65 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 66 | 67 | # 1. Embedding Layer 68 | embedding_inputs = self.embedding_lookup(self.inputs) 69 | 70 | # 2. 使用RNN来提取高阶特征 71 | with tf.variable_scope("rnn"): 72 | # a. 定义RNN的cell构建函数 73 | def cell(_units): 74 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 75 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 76 | 77 | # b. 构建前向的cell和反向cell 78 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 79 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 80 | 81 | # c. 获取得到序列的输出向量 82 | # 数据都是按照原始的从左往右的序列得到的最终特征 83 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 84 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 85 | cell_fw, # 前向的RNN Cell 86 | cell_bw, # 反向的RNN Cell 87 | inputs=embedding_inputs, # 输入值, [N,T,E] 88 | dtype=tf.float32, # 给定RNN状态初始化值的类型 89 | ) 90 | 91 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 92 | with tf.variable_scope("merge_feature"): 93 | # 4. 提取最后一个时刻的特征信息作为这个序列的最终特征信息 94 | features = tf.concat([output_fw[:, -1, :], output_bw[:, -1, :]], axis=-1) 95 | 96 | # 4. 
FFN+Softmax做最终的决策输出 97 | with tf.variable_scope("project"): 98 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 99 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 100 | self.logits = tf.identity(score, 'logits') 101 | # 得到N个文本分别属于各个类别的概率值 102 | self.probability = tf.nn.softmax(self.logits, name='probability') 103 | # 得到最终的预测id 104 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 105 | 106 | # 配置一个参数表示仅恢复模型参数 107 | self.saver_parameters['var_list'] = tf.global_variables() 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /Res2Net/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import json 4 | import matplotlib.pyplot as plt 5 | import matplotlib 6 | import pickle 7 | 8 | import torch 9 | from torchvision import datasets, transforms 10 | 11 | 12 | def load_data(path): 13 | """ 14 | 加载,转换,创建torch.utils.data.Dataloaders 15 | Args: 16 | path (str): 路径需要包含子文件夹,例如 17 | path/train/.. 18 | path/eval/.. 19 | path/test/.. 20 | 21 | Returns: 22 | dataloaders (dict): {'train': Dataloader(train_data), 23 | 'eval':, Dataloader(valid_data), 24 | 'test': Dataloader(test_data)} 25 | """ 26 | #训练图片的数据属性 27 | IMG_SIZE = 224 #训练数据尺寸 28 | IMG_MEAN = [0.485, 0.456, 0.406] # 图片归一化均值 29 | IMG_SDEV = [0.229, 0.224, 0.225] # 图片归一化标准差 30 | 31 | #训练阶段 32 | phases = ['train', 'eval', 'test'] 33 | 34 | #文件夹路径 35 | data_dir = {n: path + n for n in phases} 36 | 37 | #设置transforms 38 | data_transforms = { 39 | 'train': 40 | transforms.Compose([ 41 | transforms.RandomRotation(30), 42 | transforms.RandomResizedCrop(IMG_SIZE), 43 | transforms.RandomHorizontalFlip(p=0.5), 44 | transforms.ToTensor(), 45 | transforms.Normalize(IMG_MEAN, IMG_SDEV)]), 46 | 'eval': 47 | transforms.Compose([ 48 | transforms.Resize(256), 49 | transforms.CenterCrop(IMG_SIZE), 50 | transforms.ToTensor(), 51 | transforms.Normalize(IMG_MEAN, IMG_SDEV)]), 52 | 'test': 53 | transforms.Compose([ 54 | transforms.Resize(256), 55 | transforms.CenterCrop(IMG_SIZE), 56 | transforms.ToTensor(), 57 | transforms.Normalize(IMG_MEAN, IMG_SDEV)]) 58 | } 59 | 60 | #加载文件生成datasets 61 | image_datasets = {n: datasets.ImageFolder( 62 | data_dir[n], transform=data_transforms[n]) 63 | for n in phases} 64 | 65 | #创建dataloaders 66 | dataloaders = {n: torch.utils.data.DataLoader( 67 | image_datasets[n], batch_size=64, shuffle=True) 68 | for n in phases} 69 | 70 | # 类别到id 71 | class_to_idx = image_datasets['train'].class_to_idx 72 | 73 | return dataloaders, class_to_idx 74 | 75 | 76 | def display_prediction(image_path, probabilities, predictions): 77 | """ 78 | 绘制分类图像,将top预测类别作为标题,并显示预测top类别的预测概率图 79 | Args: 80 | image_path (str): 分类图片的路径 81 | probabilities ([float]): topk预测概率的列表 82 | class_idxs ([int]): topk类别id的列表 83 | class_names ([str]): topk的类别名称 84 | """ 85 | top_class = predictions[0] 86 | #设置字体 87 | matplotlib.rcParams['font.family'] = ['Kaiti'] 88 | 89 | #设置网格和标题 90 | fig = plt.figure(figsize=(4, 5.4)) 91 | ax1 = plt.subplot2grid((2, 1), (0, 0)) 92 | ax2 = plt.subplot2grid((2, 1), (1, 0)) 93 | fig.suptitle(top_class.capitalize(), x=0.6, y=1, fontsize=16) 94 | 95 | #显示图片 96 | ax1.imshow(Image.open(image_path)) 97 | ax1.set_xticks([]) 98 | ax1.set_yticks([]) 99 | 100 | # 显示预测的类别和概率 101 | #设置y轴 102 | y = np.arange(len(predictions)) 103 | ax2.barh(y, probabilities) 104 | ax2.set_yticks(y) 105 | ax2.set_yticklabels(predictions) 106 | 
#预测的最高概率 107 | ax2.invert_yaxis() 108 | ax2.set_xlabel('Prediction probability') 109 | 110 | #调整layout 111 | fig.tight_layout() 112 | plt.subplots_adjust(top=0.93) 113 | 114 | plt.show() 115 | 116 | def prediction_class_names(predictions, class_to_idx): 117 | """ 118 | 转换索引到类别名称 119 | Args: 120 | predictions ([int]): 要预测的类别索引 121 | class_to_idx (dict): 类别到id映射 122 | 123 | Returns: 124 | class_names ([str]): 返回预测的类别名称 125 | """ 126 | class_dict = {val: key for key, val in class_to_idx.items()} 127 | class_idxs = [class_dict[pred] for pred in predictions] 128 | 129 | return class_idxs 130 | 131 | def process_image(image_path): 132 | """ 133 | 缩放,裁剪,归一化PIL 图片, 返回一个Numpy数组 134 | Args: 135 | image_path : 输入PIL图片的路径 136 | 137 | Returns: 138 | image_tensor (Tensor): 处理图片,返回torch.FloatTensor 139 | """ 140 | IMG_SIZE = 224 141 | IMG_MEAN = [0.485, 0.456, 0.406] 142 | IMG_SDEV = [0.229, 0.224, 0.225] 143 | 144 | #加载图片 145 | image = Image.open(image_path) 146 | 147 | # Resize最大维度256 148 | if image.size[0] >= image.size[1]: 149 | image.thumbnail((256, image.size[1] * 256 // image.size[0])) 150 | else: 151 | image.thumbnail((image.size[0] * 256 // image.size[1], 256)) 152 | 153 | #中间裁切 154 | image = image.crop(( 155 | (image.size[0] - IMG_SIZE) // 2, 156 | (image.size[1] - IMG_SIZE) // 2, 157 | (image.size[0] + IMG_SIZE) // 2 , 158 | (image.size[1] + IMG_SIZE) // 2)) 159 | # 转换到np.array ,rescape channels到0-1之间 160 | image = np.array(image) / 255 161 | # 归一化图片 162 | image = (image - np.array(IMG_MEAN)) / np.array(IMG_SDEV) 163 | # 调整颜色通道到维度1 164 | image = image.transpose(2, 0, 1) 165 | # 转换成toch.FloatTensor 166 | image_tensor = torch.from_numpy( 167 | np.expand_dims(image, axis=0)).type(torch.FloatTensor) 168 | 169 | return image_tensor 170 | 171 | 172 | def save_label(label, filename): 173 | """ 174 | 使用pickle保存字典 175 | :param vocab: 176 | :param filename: 177 | :return: 178 | """ 179 | with open(filename, 'wb') as f: 180 | pickle.dump(label, f) 181 | 182 | def load_label(filename): 183 | """ 184 | 使用pickle加载字典 185 | :param filename: 186 | :return: 187 | """ 188 | with open(filename, 'rb') as f: 189 | vocab = pickle.load(f) 190 | return vocab -------------------------------------------------------------------------------- /translate/data/sacrebleu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Datasets Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ SACREBLEU metric. 
""" 16 | 17 | import sacrebleu as scb 18 | from packaging import version 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @inproceedings{post-2018-call, 25 | title = "A Call for Clarity in Reporting {BLEU} Scores", 26 | author = "Post, Matt", 27 | booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", 28 | month = oct, 29 | year = "2018", 30 | address = "Belgium, Brussels", 31 | publisher = "Association for Computational Linguistics", 32 | url = "https://www.aclweb.org/anthology/W18-6319", 33 | pages = "186--191", 34 | } 35 | """ 36 | 37 | _DESCRIPTION = """\ 38 | SacreBLEU provides hassle-free computation of shareable, comparable, and reproducible BLEU scores. 39 | Inspired by Rico Sennrich's `multi-bleu-detok.perl`, it produces the official WMT scores but works with plain text. 40 | It also knows all the standard test sets and handles downloading, processing, and tokenization for you. 41 | 42 | See the [README.md] file at https://github.com/mjpost/sacreBLEU for more information. 43 | """ 44 | 45 | _KWARGS_DESCRIPTION = """ 46 | Produces BLEU scores along with its sufficient statistics 47 | from a source against one or more references. 48 | 49 | Args: 50 | predictions: The system stream (a sequence of segments) 51 | references: A list of one or more reference streams (each a sequence of segments) 52 | smooth: The smoothing method to use 53 | smooth_value: For 'floor' smoothing, the floor to use 54 | force: Ignore data that looks already tokenized 55 | lowercase: Lowercase the data 56 | tokenize: The tokenizer to use 57 | Returns: 58 | 'score': BLEU score, 59 | 'counts': Counts, 60 | 'totals': Totals, 61 | 'precisions': Precisions, 62 | 'bp': Brevity penalty, 63 | 'sys_len': predictions length, 64 | 'ref_len': reference length, 65 | Examples: 66 | 67 | >>> predictions = ["hello there general kenobi", "foo bar foobar"] 68 | >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] 69 | >>> sacrebleu = datasets.load_metric("sacrebleu") 70 | >>> results = sacrebleu.compute(predictions=predictions, references=references) 71 | >>> print(list(results.keys())) 72 | ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len'] 73 | >>> print(round(results["score"], 1)) 74 | 100.0 75 | """ 76 | 77 | 78 | @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 79 | class Sacrebleu(datasets.Metric): 80 | def _info(self): 81 | if version.parse(scb.__version__) < version.parse("1.4.12"): 82 | raise ImportWarning( 83 | "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n" 84 | 'You can install it with `pip install "sacrebleu>=1.4.12"`.' 
85 | ) 86 | return datasets.MetricInfo( 87 | description=_DESCRIPTION, 88 | citation=_CITATION, 89 | homepage="https://github.com/mjpost/sacreBLEU", 90 | inputs_description=_KWARGS_DESCRIPTION, 91 | features=datasets.Features( 92 | { 93 | "predictions": datasets.Value("string", id="sequence"), 94 | "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"), 95 | } 96 | ), 97 | codebase_urls=["https://github.com/mjpost/sacreBLEU"], 98 | reference_urls=[ 99 | "https://github.com/mjpost/sacreBLEU", 100 | "https://en.wikipedia.org/wiki/BLEU", 101 | "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213", 102 | ], 103 | ) 104 | 105 | def _compute( 106 | self, 107 | predictions, 108 | references, 109 | smooth_method="exp", 110 | smooth_value=None, 111 | force=False, 112 | lowercase=False, 113 | tokenize=scb.DEFAULT_TOKENIZER, 114 | use_effective_order=False, 115 | ): 116 | references_per_prediction = len(references[0]) 117 | if any(len(refs) != references_per_prediction for refs in references): 118 | raise ValueError("Sacrebleu requires the same number of references for each prediction") 119 | transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)] 120 | output = scb.corpus_bleu( 121 | sys_stream=predictions, 122 | ref_streams=transformed_references, 123 | smooth_method=smooth_method, 124 | smooth_value=smooth_value, 125 | force=force, 126 | lowercase=lowercase, 127 | tokenize=tokenize, 128 | use_effective_order=use_effective_order, 129 | ) 130 | output_dict = { 131 | "score": output.score, 132 | "counts": output.counts, 133 | "totals": output.totals, 134 | "precisions": output.precisions, 135 | "bp": output.bp, 136 | "sys_len": output.sys_len, 137 | "ref_len": output.ref_len, 138 | } 139 | return output_dict 140 | -------------------------------------------------------------------------------- /text_classsification/nets/text_cnn.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextCNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_filters=128, region_sizes=[2, 3, 4], *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_filters: TextCNN 各个不同类型卷积核的数目,可以给定为int或者list 32 | :param region_sizes: TextCNN各个不同类别卷积核提取单词特征的单词数量范围 33 | """ 34 | self.region_sizes = region_sizes # 使用CNN提取特征信息的时候,提取范围大小 35 | if isinstance(num_filters, list): 36 | # 相当于针对每个范围给定不同的卷积核数目 37 | if len(region_sizes) != len(num_filters): 38 | raise Exception("resize_sizes和num_filters大小必须一致!!!") 39 | else: 40 
| self.num_filters = num_filters 41 | elif isinstance(num_filters, int): 42 | self.num_filters = [num_filters] * len(region_sizes) 43 | else: 44 | raise Exception("参数num_filters仅支持int类型或者list类型数据!!") 45 | 46 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 47 | embedding_dimensions=embedding_dimensions, 48 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 49 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 50 | optimizer_type=optimizer_type, 51 | optimizer_parameters_func=optimizer_parameters_func, 52 | saver_parameters=saver_parameters) 53 | 54 | def interface(self): 55 | """ 56 | 前向网络构建 57 | batch_size: N 58 | feature height: H, 将序列长度T认为是H 59 | feature width: W,将Embedding size大小认为是W 60 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 61 | sentence_length: T 62 | embedding size: E 63 | :return: 64 | """ 65 | with tf.variable_scope(self.network_name): 66 | with slim.arg_scope(self.arg_score()): 67 | with tf.variable_scope("placeholders"): 68 | self.global_step = tf.train.get_or_create_global_step() 69 | # 输入的单词id,形状为:[N,T] 70 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 71 | # 希望输出的类别id, 形状为:[N,] 72 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 73 | # Dropout 74 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 75 | 76 | # 1. Embedding Layer 77 | # 将单词id转换为单词向量,[N,T] --> [N,T,E] 78 | embedding_inputs = self.embedding_lookup(self.inputs) 79 | # 增加维度信息,将其转换为四维对象, [N,T,E] --> [N,T,E,1] 80 | expanded_embedding_inputs = tf.expand_dims(embedding_inputs, axis=-1) 81 | 82 | # 2. 使用卷积来提取高阶特征 83 | outputs = [] 84 | for idx, region_size in enumerate(self.region_sizes): 85 | with tf.variable_scope("conv-max-pooling-{}".format(idx)): 86 | # 卷积的功能相当于将region_size个单词看成一个整体,然后进行单词的特征向量信息的融合提取 87 | # 最终返回结果形状为: [N,T,1,C] 88 | # NOTE: 这里的T实际上是比原来的序列长度小, T = sequence_length - region_size + 1 89 | conv = slim.conv2d( 90 | expanded_embedding_inputs, # [N,T,E,1] 91 | num_outputs=self.num_filters[idx], # C, eg:2 92 | kernel_size=(region_size, self.embedding_dimensions) # (h,w), eg:(3,E) 93 | ) 94 | # 针对序列的每个通道获取一个最大值,相当于认为每个卷积核提取某种特征信息,这里直接获取主要特征信息出来 95 | # [N,T,1,C] --> [N,1,1,C] 96 | pooled = tf.reduce_max(conv, axis=[1, 2], keep_dims=True) 97 | # 通道压缩,因为维度1其实是无用的 98 | output = tf.squeeze(pooled, axis=[1, 2]) 99 | # 添加到临时列表中 100 | outputs.append(output) 101 | 102 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 103 | with tf.variable_scope("merge_feature"): 104 | features = tf.concat(outputs, axis=-1) 105 | features = tf.nn.dropout(features, keep_prob=self.dropout_keep_prob) 106 | 107 | # 4. 
FFN+Softmax做最终的决策输出 108 | with tf.variable_scope("project"): 109 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 110 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 111 | self.logits = tf.identity(score, 'logits') 112 | # 得到N个文本分别属于各个类别的概率值 113 | self.probability = tf.nn.softmax(self.logits, name='probability') 114 | # 得到最终的预测id 115 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 116 | 117 | # 配置一个参数表示仅恢复模型参数 118 | self.saver_parameters['var_list'] = tf.global_variables() 119 | -------------------------------------------------------------------------------- /text_classsification/nets/text_rnn_improve.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_units: RNN Cell中的神经元数目 32 | :param layers: RNN的层次 33 | """ 34 | self.num_units = num_units # RNN Cell的神经元数目 35 | self.layers = layers # RNN的层次 36 | 37 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 38 | embedding_dimensions=embedding_dimensions, 39 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 40 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 41 | optimizer_type=optimizer_type, 42 | optimizer_parameters_func=optimizer_parameters_func, 43 | saver_parameters=saver_parameters) 44 | 45 | def interface(self): 46 | """ 47 | 前向网络构建 48 | batch_size: N 49 | feature height: H, 将序列长度T认为是H 50 | feature width: W,将Embedding size大小认为是W 51 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 52 | sentence_length: T 53 | embedding size: E 54 | :return: 55 | """ 56 | with tf.variable_scope(self.network_name): 57 | with slim.arg_scope(self.arg_score()): 58 | with tf.variable_scope("placeholders"): 59 | self.global_step = tf.train.get_or_create_global_step() 60 | # 输入的单词id,形状为:[N,T] 61 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 62 | # 希望输出的类别id, 形状为:[N,] 63 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 64 | # Dropout 65 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 66 | # 计算序列实际长度, 最终形状为:[N,] 67 | sequence_length = tf.reduce_sum(tf.sign(tf.abs(self.inputs)), axis=-1) 68 | 69 | # 1. Embedding Layer 70 | embedding_inputs = self.embedding_lookup(self.inputs) 71 | 72 | # 2. 使用RNN来提取高阶特征 73 | with tf.variable_scope("rnn"): 74 | # a. 
定义RNN的cell构建函数 75 | def cell(_units): 76 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 77 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 78 | 79 | # b. 构建前向的cell和反向cell 80 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 81 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 82 | 83 | # c. 获取得到序列的输出向量 84 | # 数据都是按照原始的从左往右的序列得到的最终特征 85 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 86 | # 如果给定了序列的实际长度,那么在进行计算的时候,仅计算实际序列长度部分的内容,对于后面填充的内直接返回zero 87 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 88 | cell_fw, # 前向的RNN Cell 89 | cell_bw, # 反向的RNN Cell 90 | inputs=embedding_inputs, # 输入值, [N,T,E] 91 | dtype=tf.float32, # 给定RNN状态初始化值的类型 92 | sequence_length=sequence_length, # 给定序列的实际长度(因为序列是经过填充的) 93 | ) 94 | 95 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 96 | with tf.variable_scope("merge_feature"): 97 | # 4. 直接将所有时刻的输出特征值mean作为最终特征信息(由于填充位置输出是zero,所以求均值不会产生影响) 98 | # [N,T,E] --> [N,E] --> [N,E] 99 | div_denominator = tf.reshape(tf.to_float(sequence_length), shape=(-1, 1)) 100 | features_fw = tf.div(tf.reduce_sum(output_fw, axis=1), div_denominator) 101 | features_bw = tf.div(tf.reduce_sum(output_bw, axis=1), div_denominator) 102 | features = tf.concat([features_fw, features_bw], axis=-1) 103 | # TODO: 获取实际序列最后要给时刻的输出特征向量作为高阶向量(下周一做) 104 | 105 | # 4. FFN+Softmax做最终的决策输出 106 | with tf.variable_scope("project"): 107 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 108 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 109 | self.logits = tf.identity(score, 'logits') 110 | # 得到N个文本分别属于各个类别的概率值 111 | self.probability = tf.nn.softmax(self.logits, name='probability') 112 | # 得到最终的预测id 113 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 114 | 115 | # 配置一个参数表示仅恢复模型参数 116 | self.saver_parameters['var_list'] = tf.global_variables() 117 | -------------------------------------------------------------------------------- /text_classsification/nets/text_rnn_improve2.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_units: RNN Cell中的神经元数目 32 | :param layers: RNN的层次 33 | """ 34 | self.num_units = num_units # RNN Cell的神经元数目 35 | self.layers = layers # RNN的层次 36 | 37 | super(Network, self).__init__(with_word2vec=with_word2vec, 
vocab_size=vocab_size, 38 | embedding_dimensions=embedding_dimensions, 39 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 40 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 41 | optimizer_type=optimizer_type, 42 | optimizer_parameters_func=optimizer_parameters_func, 43 | saver_parameters=saver_parameters) 44 | 45 | def interface(self): 46 | """ 47 | 前向网络构建 48 | batch_size: N 49 | feature height: H, 将序列长度T认为是H 50 | feature width: W,将Embedding size大小认为是W 51 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 52 | sentence_length: T 53 | embedding size: E 54 | :return: 55 | """ 56 | with tf.variable_scope(self.network_name): 57 | with slim.arg_scope(self.arg_score()): 58 | with tf.variable_scope("placeholders"): 59 | self.global_step = tf.train.get_or_create_global_step() 60 | # 输入的单词id,形状为:[N,T] 61 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 62 | # 希望输出的类别id, 形状为:[N,] 63 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 64 | # Dropout 65 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 66 | # 计算序列实际长度, 最终形状为:[N,] 67 | sequence_length = tf.reduce_sum(tf.sign(tf.abs(self.inputs)), axis=-1) 68 | 69 | # 1. Embedding Layer 70 | embedding_inputs = self.embedding_lookup(self.inputs) 71 | 72 | # 2. 使用RNN来提取高阶特征 73 | with tf.variable_scope("rnn"): 74 | # a. 定义RNN的cell构建函数 75 | def cell(_units): 76 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 77 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 78 | 79 | # b. 构建前向的cell和反向cell 80 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 81 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 82 | 83 | # c. 获取得到序列的输出向量 84 | # 数据都是按照原始的从左往右的序列得到的最终特征 85 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 86 | # 如果给定了序列的实际长度,那么在进行计算的时候,仅计算实际序列长度部分的内容,对于后面填充的内直接返回zero 87 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 88 | cell_fw, # 前向的RNN Cell 89 | cell_bw, # 反向的RNN Cell 90 | inputs=embedding_inputs, # 输入值, [N,T,E] 91 | dtype=tf.float32, # 给定RNN状态初始化值的类型 92 | sequence_length=sequence_length, # 给定序列的实际长度(因为序列是经过填充的) 93 | ) 94 | 95 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 96 | with tf.variable_scope("merge_feature"): 97 | # a. 获取正向执行网络中执行序列的最后一个时刻的输出值作为正向的特征 98 | # 实现方式一:先将output_fw反转,然后获取output_fw[:,0,:]即为最终结果 99 | # 实现方式二:使用tf.gather_nd获取不固定索引位置的向量信息 100 | batch_size = tf.shape(output_fw)[0] # 批次大小 101 | indices_fw = tf.concat([ 102 | tf.reshape(tf.range(batch_size), shape=(-1, 1)), # 样本索引, [0,N-1] 103 | tf.reshape(sequence_length - 1, shape=(-1, 1)) # 样本长度最后一个时刻的索引值, 每个样本的长度信息 104 | ], axis=-1) 105 | features_fw = tf.gather_nd(output_fw, indices_fw) 106 | 107 | # b. 获取反向执行网络中执行序列的最后一个时刻的输出值作为反向的特征,也就是真实序列中的第0个时刻 108 | features_bw = output_bw[:, 0, :] 109 | 110 | # c. 将正向和方向结果合并 111 | features = tf.concat([features_fw, features_bw], axis=-1) 112 | 113 | # 4. 
FFN+Softmax做最终的决策输出 114 | with tf.variable_scope("project"): 115 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 116 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 117 | self.logits = tf.identity(score, 'logits') 118 | # 得到N个文本分别属于各个类别的概率值 119 | self.probability = tf.nn.softmax(self.logits, name='probability') 120 | # 得到最终的预测id 121 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 122 | 123 | # 配置一个参数表示仅恢复模型参数 124 | self.saver_parameters['var_list'] = tf.global_variables() 125 | -------------------------------------------------------------------------------- /Res2Net/res2next.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import math 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn import init 6 | import torch 7 | import torch.utils.model_zoo as model_zoo 8 | 9 | __all__ = ['res2next50'] 10 | model_urls = { 11 | 'res2next50': 'https://shanghuagao.oss-cn-beijing.aliyuncs.com/res2net/res2next50_4s-6ef7e7bf.pth', 12 | } 13 | 14 | class Bottle2neckX(nn.Module): 15 | expansion = 4 16 | 17 | def __init__(self, inplanes, planes, baseWidth, cardinality, stride=1, downsample=None, scale = 4, stype='normal'): 18 | """ Constructor 19 | Args: 20 | inplanes: input channel dimensionality 21 | planes: output channel dimensionality 22 | baseWidth: base width. 23 | cardinality: num of convolution groups. 24 | stride: conv stride. Replaces pooling layer. 25 | scale: number of scale. 26 | type: 'normal': normal set. 'stage': frist blokc of a new stage. 27 | """ 28 | super(Bottle2neckX, self).__init__() 29 | 30 | D = int(math.floor(planes * (baseWidth/64.0))) 31 | C = cardinality 32 | 33 | self.conv1 = nn.Conv2d(inplanes, D*C*scale, kernel_size=1, stride=1, padding=0, bias=False) 34 | self.bn1 = nn.BatchNorm2d(D*C*scale) 35 | 36 | if scale == 1: 37 | self.nums = 1 38 | else: 39 | self.nums = scale -1 40 | if stype == 'stage': 41 | self.pool = nn.AvgPool2d(kernel_size=3, stride = stride, padding=1) 42 | convs = [] 43 | bns = [] 44 | for i in range(self.nums): 45 | convs.append(nn.Conv2d(D*C, D*C, kernel_size=3, stride = stride, padding=1, groups=C, bias=False)) 46 | bns.append(nn.BatchNorm2d(D*C)) 47 | self.convs = nn.ModuleList(convs) 48 | self.bns = nn.ModuleList(bns) 49 | 50 | self.conv3 = nn.Conv2d(D*C*scale, planes * 4, kernel_size=1, stride=1, padding=0, bias=False) 51 | self.bn3 = nn.BatchNorm2d(planes * 4) 52 | self.relu = nn.ReLU(inplace=True) 53 | 54 | self.downsample = downsample 55 | self.width = D*C 56 | self.stype = stype 57 | self.scale = scale 58 | 59 | def forward(self, x): 60 | residual = x 61 | 62 | out = self.conv1(x) 63 | out = self.bn1(out) 64 | out = self.relu(out) 65 | 66 | spx = torch.split(out, self.width, 1) 67 | for i in range(self.nums): 68 | if i==0 or self.stype=='stage': 69 | sp = spx[i] 70 | else: 71 | sp = sp + spx[i] 72 | sp = self.convs[i](sp) 73 | sp = self.relu(self.bns[i](sp)) 74 | if i==0: 75 | out = sp 76 | else: 77 | out = torch.cat((out, sp), 1) 78 | if self.scale != 1 and self.stype=='normal': 79 | out = torch.cat((out, spx[self.nums]),1) 80 | elif self.scale != 1 and self.stype=='stage': 81 | out = torch.cat((out, self.pool(spx[self.nums])),1) 82 | 83 | out = self.conv3(out) 84 | out = self.bn3(out) 85 | 86 | if self.downsample is not None: 87 | residual = self.downsample(x) 88 | 89 | out += residual 90 | out = self.relu(out) 91 | 92 | return out 93 | 94 | 95 | class Res2NeXt(nn.Module): 96 | def 
__init__(self, block, baseWidth, cardinality, layers, num_classes, scale=4): 97 | """ Constructor 98 | Args: 99 | baseWidth: baseWidth for ResNeXt. 100 | cardinality: number of convolution groups. 101 | layers: config of layers, e.g., [3, 4, 6, 3] 102 | num_classes: number of classes 103 | scale: scale in res2net 104 | """ 105 | super(Res2NeXt, self).__init__() 106 | 107 | self.cardinality = cardinality 108 | self.baseWidth = baseWidth 109 | self.num_classes = num_classes 110 | self.inplanes = 64 111 | self.output_size = 64 112 | self.scale = scale 113 | 114 | self.conv1 = nn.Conv2d(3, 64, 7, 2, 3, bias=False) 115 | self.bn1 = nn.BatchNorm2d(64) 116 | self.relu = nn.ReLU(inplace=True) 117 | self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 118 | self.layer1 = self._make_layer(block, 64, layers[0]) 119 | self.layer2 = self._make_layer(block, 128, layers[1], 2) 120 | self.layer3 = self._make_layer(block, 256, layers[2], 2) 121 | self.layer4 = self._make_layer(block, 512, layers[3], 2) 122 | self.avgpool = nn.AdaptiveAvgPool2d(1) 123 | self.fc = nn.Linear(512 * block.expansion, num_classes) 124 | 125 | for m in self.modules(): 126 | if isinstance(m, nn.Conv2d): 127 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 128 | m.weight.data.normal_(0, math.sqrt(2. / n)) 129 | elif isinstance(m, nn.BatchNorm2d): 130 | m.weight.data.fill_(1) 131 | m.bias.data.zero_() 132 | 133 | def _make_layer(self, block, planes, blocks, stride=1): 134 | downsample = None 135 | if stride != 1 or self.inplanes != planes * block.expansion: 136 | downsample = nn.Sequential( 137 | nn.Conv2d(self.inplanes, planes * block.expansion, 138 | kernel_size=1, stride=stride, bias=False), 139 | nn.BatchNorm2d(planes * block.expansion), 140 | ) 141 | 142 | layers = [] 143 | layers.append(block(self.inplanes, planes, self.baseWidth, self.cardinality, stride, downsample, scale=self.scale, stype='stage')) 144 | self.inplanes = planes * block.expansion 145 | for i in range(1, blocks): 146 | layers.append(block(self.inplanes, planes, self.baseWidth, self.cardinality, scale=self.scale)) 147 | 148 | return nn.Sequential(*layers) 149 | 150 | def forward(self, x): 151 | x = self.conv1(x) 152 | x = self.bn1(x) 153 | x = self.relu(x) 154 | x = self.maxpool1(x) 155 | x = self.layer1(x) 156 | x = self.layer2(x) 157 | x = self.layer3(x) 158 | x = self.layer4(x) 159 | x = self.avgpool(x) 160 | x = x.view(x.size(0), -1) 161 | x = self.fc(x) 162 | 163 | return x 164 | def res2next50(pretrained=False, map_location='cpu', **kwargs): 165 | """ Construct Res2NeXt-50. 166 | The default scale is 4. 
167 | Args: 168 | pretrained (bool): If True, returns a model pre-trained on ImageNet 169 | """ 170 | model = Res2NeXt(Bottle2neckX, layers = [3, 4, 6, 3], baseWidth = 4, cardinality=8, scale = 4, num_classes=1000) 171 | if pretrained: 172 | model.load_state_dict(model_zoo.load_url(model_urls['res2next50'], map_location=map_location)) 173 | return model 174 | 175 | if __name__ == '__main__': 176 | images = torch.rand(1, 3, 224, 224).cuda(0) 177 | model = res2next50(pretrained=True) 178 | model = model.cuda(0) 179 | print(model(images).size()) 180 | -------------------------------------------------------------------------------- /text_classsification/utils/vocabulary_utils.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import os 4 | import itertools 5 | import jieba 6 | import numpy as np 7 | import tensorflow as tf 8 | from tensorflow.contrib.learn import preprocessing 9 | from gensim import utils 10 | from gensim.models import word2vec 11 | 12 | 13 | def default_split_fn(documents): 14 | return split_with_char(documents) 15 | 16 | 17 | def split_with_char(documents): 18 | return [list(sentence) for sentence in documents] 19 | 20 | 21 | def split_with_word(documents): 22 | return [list(filter(lambda word: len(word) > 0, jieba.cut(sentence.strip()))) for sentence in documents] 23 | 24 | 25 | class CategoricalVocabulary(preprocessing.CategoricalVocabulary): 26 | def __init__(self, unknown_token=""): 27 | super(CategoricalVocabulary, self).__init__(unknown_token, False) 28 | 29 | # 特殊值(填充0,未知1) 30 | self.padding_token = "" 31 | self._mapping[self.padding_token] = 0 32 | self._mapping[self._unknown_token] = 1 33 | # 添加一个属性 34 | self.vocab_size = 2 35 | 36 | def get(self, category): 37 | if category not in self._mapping: 38 | return 1 39 | return self._mapping[category] 40 | 41 | def set(self, category, index): 42 | self._mapping[category] = index 43 | self.vocab_size += 1 44 | 45 | 46 | class PathLineSentences(object): 47 | """Like :class:`~gensim.models.word2vec.LineSentence`, but process all files in a directory 48 | in alphabetical order by filename. 49 | 50 | The directory must only contain files that can be read by :class:`gensim.models.word2vec.LineSentence`: 51 | .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. 52 | 53 | The format of files (either text, or compressed text files) in the path is one sentence = one line, 54 | with words already preprocessed and separated by whitespace. 55 | 56 | Warnings 57 | -------- 58 | Does **not recurse** into subdirectories. 
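    Example
    -------
    A minimal usage sketch of iterating a directory with word-level splitting
    (the ``./data`` path here is only illustrative and is assumed to contain
    plain-text files, one sentence per line; ``split_with_word`` is the helper
    defined above in this module):

        >>> sentences = PathLineSentences(source='./data', split_fn=split_with_word)
        >>> for tokens in sentences:   # each item is a list of at most max_sentence_length tokens
        ...     print(tokens[:5])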
59 | 60 | """ 61 | 62 | def __init__(self, source, max_sentence_length=word2vec.MAX_WORDS_IN_BATCH, limit=None, split_fn=None): 63 | self.source = source 64 | self.max_sentence_length = max_sentence_length 65 | self.limit = limit 66 | if split_fn is None: 67 | self.split_fn = default_split_fn 68 | else: 69 | self.split_fn = split_fn 70 | 71 | if os.path.isfile(self.source): 72 | self.input_files = [self.source] # force code compatibility with list of files 73 | elif os.path.isdir(self.source): 74 | self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path 75 | self.input_files = os.listdir(self.source) 76 | self.input_files = [self.source + filename for filename in self.input_files] # make full paths 77 | self.input_files.sort() 78 | else: 79 | raise ValueError('input is neither a file nor a path') 80 | 81 | def __iter__(self): 82 | """iterate through the files""" 83 | for file_name in self.input_files: 84 | with utils.open(file_name, 'rb') as fin: 85 | for line in itertools.islice(fin, self.limit): 86 | line = self.split_fn([utils.to_unicode(line).strip()])[0] 87 | i = 0 88 | while i < len(line): 89 | yield line[i:i + self.max_sentence_length] 90 | i += self.max_sentence_length 91 | 92 | 93 | class VocabularyProcessorUtil(object): 94 | 95 | @staticmethod 96 | def building_model(documents, save_path, max_document_length=512, vocabulary=None, split_fn=default_split_fn): 97 | """ 98 | 基于传入的文档数据构建字典相关信息 99 | :param documents: 进行模型训练的时候的文本数据 100 | :param save_path: 模型持久化的路径 101 | :param vocabulary: 词汇映射表 102 | :param split_fn: 将文本转换为单词过程中的函数, 默认是将每个字当作一个单词 103 | :param max_document_length: 将文本单词id转换的时候,最长文本允许的单词数目 104 | :return: 105 | """ 106 | tf.logging.info("开始构建词汇转换模型.....") 107 | model = preprocessing.VocabularyProcessor(max_document_length=max_document_length, 108 | vocabulary=vocabulary, tokenizer_fn=split_fn) 109 | model.fit(raw_documents=documents) 110 | tf.logging.info("词汇转换模型构建完成,开始模型保存操作!!!") 111 | model.save(save_path) 112 | tf.logging.info("词汇转换模型保存完成,保存位置为:{}".format(save_path)) 113 | 114 | @staticmethod 115 | def load_model(save_path) -> preprocessing.VocabularyProcessor: 116 | """ 117 | 基于给定的路径加载模型并返回 118 | :param save_path: 119 | :return: 120 | """ 121 | if os.path.exists(save_path): 122 | tf.logging.info("从【{}】位置进行词汇转换模型的恢复!!!".format(save_path)) 123 | return preprocessing.VocabularyProcessor.restore(save_path) 124 | else: 125 | raise Exception("词汇转换模型不存在,请检查磁盘路径:{}".format(save_path)) 126 | 127 | @staticmethod 128 | def build_word2vec_embedding(data_path, save_path, embedding_dimensions): 129 | """ 130 | 基于data_path下的文件内容构建Word2Vec向量,并将向量保存到save_path这个路径中 131 | :param data_path: 原始数据所在的文件夹路径 132 | :param save_path: 训练好的数据保存路径 133 | :param embedding_dimensions: 转换的Embedding向量大小 134 | :return: 135 | """ 136 | # 0. 加载数据 137 | sentences = PathLineSentences(source=data_path, split_fn=split_with_word) 138 | # 1. 构建Word2Vec模型 139 | model = word2vec.Word2Vec(sentences=sentences, size=embedding_dimensions, 140 | window=9, min_count=2, iter=50) 141 | # 3. 模型保存(以文本形式保存) 142 | model.wv.save_word2vec_format(fname=save_path, binary=True) 143 | 144 | @staticmethod 145 | def load_word2vec_embedding(save_path): 146 | """ 147 | 加载Word2Vec训练好的embedding转换矩阵 148 | :param save_path: 数据存储的路径 149 | :param binary: 是否是二进制存储 150 | :return: embedding_table, vocabulary 151 | """ 152 | # 1. 加载数据 153 | model = word2vec.Word2VecKeyedVectors.load_word2vec_format(save_path, binary=True) 154 | # 2. 
获取embedding_table 155 | embedding_table = model.vectors 156 | embedding_dimensions = np.shape(embedding_table)[1] 157 | # 3. 获取单词和id之间的映射关系 158 | vocabulary = CategoricalVocabulary() 159 | vocab_size = vocabulary.vocab_size 160 | for word in model.vocab: 161 | vocabulary.set(word, model.vocab[word].index + vocab_size) 162 | # 4. 在embedding_table前面加入特征字符所代表的含义 163 | embedding_table = np.concatenate( 164 | [ 165 | np.zeros(shape=(1, embedding_dimensions), dtype=embedding_table.dtype), # PAD对应的的特征值 166 | np.random.normal(0, 0.01, size=(1, embedding_dimensions)), # UNK对应的特征值 167 | embedding_table # 原始单词对应的特征值 168 | ], 169 | axis=0 170 | ) 171 | return embedding_table, vocabulary 172 | 173 | 174 | if __name__ == '__main__': 175 | VocabularyProcessorUtil.build_word2vec_embedding("../data", "../model/w2v2.bin", 128) 176 | embedding_table, vob = VocabularyProcessorUtil.load_word2vec_embedding("../model/w2v.bin") 177 | print(vob.vocab_size) 178 | -------------------------------------------------------------------------------- /text_classsification/nets/text_transformer.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | attention_dimension_size=128, attention_layers=3, attention_headers=16, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param attention_dimension_size: Self Attention计算过程中的维度大小 32 | :param attention_layers: RNN的层次 33 | :param attention_headers: 头的数目 34 | """ 35 | self.attention_dimension_size = attention_dimension_size 36 | self.attention_layers = attention_layers 37 | self.attention_headers = attention_headers 38 | 39 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 40 | embedding_dimensions=embedding_dimensions, 41 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 42 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 43 | optimizer_type=optimizer_type, 44 | optimizer_parameters_func=optimizer_parameters_func, 45 | saver_parameters=saver_parameters) 46 | 47 | def interface(self): 48 | """ 49 | 前向网络构建 50 | batch_size: N 51 | feature height: H, 将序列长度T认为是H 52 | feature width: W,将Embedding size大小认为是W 53 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 54 | sentence_length: T 55 | embedding size: E 56 | :return: 57 | """ 58 | with tf.variable_scope(self.network_name): 59 | with slim.arg_scope(self.arg_score()): 60 | with tf.variable_scope("placeholders"): 61 | self.global_step = tf.train.get_or_create_global_step() 62 | # 输入的单词id,形状为:[N,T] 63 | 
self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 64 | # 希望输出的类别id, 形状为:[N,] 65 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 66 | # Dropout 67 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 68 | 69 | # 1. Embedding Layer(N,T,E) 70 | embedding_inputs = self.embedding_lookup(self.inputs) 71 | 72 | # 2. 使用Transformer来提取高阶特征 73 | with tf.variable_scope("transformer"): 74 | with tf.variable_scope("Input"): 75 | encoder_input = tf.layers.dense(embedding_inputs, units=self.attention_dimension_size, 76 | activation=tf.nn.relu) 77 | 78 | for layer in range(self.attention_layers): 79 | with tf.variable_scope("Encoder_{}".format(layer)): 80 | # 1. 得到各个头的信息 81 | attention_outputs = [] 82 | for header in range(self.attention_headers): 83 | with tf.variable_scope("Header_{}".format(header)): 84 | attention_output = self._self_attention( 85 | H=encoder_input, 86 | attention_dimension_size=self.attention_dimension_size 87 | ) 88 | attention_outputs.append(attention_output) 89 | 90 | # 2. 拼接 91 | attention_output = tf.concat(attention_outputs, axis=-1) 92 | 93 | # 3. 做一个线性转换 94 | attention_output = tf.layers.dense(attention_output, 95 | units=self.attention_dimension_size, 96 | activation=None) 97 | 98 | # 4. 将当前层的输出和当前层的输入做一个残差结构 99 | attention_output = tf.nn.relu(attention_output + encoder_input) 100 | 101 | # 5. 将当前层输出作为下一层的输入 102 | encoder_input = attention_output 103 | 104 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 105 | with tf.variable_scope("merge_feature"): 106 | # 4. 将所有时刻的特征信息求均值 107 | features = tf.reduce_mean(attention_output, axis=1) 108 | 109 | # 4. FFN+Softmax做最终的决策输出 110 | with tf.variable_scope("project"): 111 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 112 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 113 | self.logits = tf.identity(score, 'logits') 114 | # 得到N个文本分别属于各个类别的概率值 115 | self.probability = tf.nn.softmax(self.logits, name='probability') 116 | # 得到最终的预测id 117 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 118 | 119 | # 配置一个参数表示仅恢复模型参数 120 | self.saver_parameters['var_list'] = tf.global_variables() 121 | 122 | def _self_attention(self, H, attention_dimension_size): 123 | """ 124 | 计算Self-Attention 125 | :param H: [N,T,E], N个序列,每个序列T个时刻,每个时刻E维的向量 126 | :return: 127 | """ 128 | # 0. 获取大小信息 129 | hidden_size = H.shape[-1] 130 | batch_size, sequence_length, _ = tf.unstack(tf.shape(H)) 131 | # 1. 对输入数据reshape操作 132 | H = tf.reshape(H, shape=tf.stack([batch_size * sequence_length, hidden_size])) 133 | # 2. 分别计算Q、K、V 134 | Q = tf.layers.dense(H, units=attention_dimension_size) 135 | K = tf.layers.dense(H, units=attention_dimension_size) 136 | V = tf.layers.dense(H, units=attention_dimension_size, activation=tf.nn.relu) 137 | # 3. Reshape 138 | Q = tf.reshape(Q, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 139 | K = tf.reshape(K, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 140 | V = tf.reshape(V, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 141 | # 4. 计算相关性([N,T,E],[N,T,E],F,T) --> [N,T,T] 142 | scores = tf.matmul(Q, K, False, True) / np.sqrt(attention_dimension_size) 143 | # 5. 计算概率值([N,T,T]) 144 | weights = tf.nn.softmax(scores) 145 | # 6. 
计算最终结果 146 | attention = tf.matmul(weights, V) 147 | return attention 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /TextCNN/model_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import data_utils 6 | from tqdm import tqdm 7 | 8 | class TextCNN(nn.Module): 9 | """ 10 | TextCNN模型 11 | """ 12 | def __init__(self, args): 13 | super(TextCNN, self).__init__() 14 | self.args = args 15 | # 单词表的长度,用于做词嵌入lookingup查表 16 | vocab_num = args.vocab_num 17 | # 词嵌入后生成的单词的维度 18 | embed_dim = args.embed_dim 19 | # 类别的维度 20 | class_num = args.class_num 21 | # 卷积核的起始输入维度,因为开始只有1个维度输入,所以默认in_channels维度是1 22 | kernel_in = 1 23 | # 卷积核的数量,等于输出的卷积核的channel数量 24 | kernel_num = args.kernel_num 25 | #卷积核尺寸,是一个列表[3,4,5] 26 | kernel_sizes = args.kernel_sizes 27 | #单词做lookingup查表的词嵌入,将词id变成词向量 28 | self.embed = nn.Embedding(vocab_num, embed_dim) 29 | #ModuleList,子模型作为一个列表传入, kernel_size卷积核的尺寸,这里是分别是[3,embed_dim], [4,embed_dim], [5,embed_dim] 30 | #kernel_size卷积核的尺寸的形状是[H,W], 高是3,代表3个词之间的关系 31 | self.convs1 = nn.ModuleList( 32 | [nn.Conv2d(in_channels=kernel_in, out_channels=kernel_num, kernel_size=(size, embed_dim)) 33 | for size in kernel_sizes] 34 | ) 35 | #做一次dropout 36 | self.dropout = nn.Dropout(args.dropout) 37 | #做全连接, 输入维度是len(kernel_sizes) * kernel_num, 因为是把所有卷积后的结果进行拼接,所以这个是拼接后的维度,class_num是要预测的类别 38 | self.fc1 = nn.Linear(len(kernel_sizes) * kernel_num, class_num) 39 | 40 | def conv_and_pool(self, x, conv): 41 | x = F.relu(conv(x)).squeeze(3) # (N, Co, W) 42 | x = F.max_pool1d(x, x.size(2)).squeeze(2) 43 | return x 44 | 45 | def forward(self, text): 46 | """ 47 | 前向网络 48 | :param text: text的维度是[batch_size, sequence_length], 输入的是text的单词的id 49 | :return: 50 | """ 51 | # 对text进行embedding lookup,生成的维度是[batch_size,sequence_length,Embedding_demission], 即[N,W,D] 52 | x = self.embed(text) 53 | # 添加一个维度,用于卷积,在第二个维度上扩充,变成[batch_size,1,sequence_length,Embedding_demission], 54 | x = x.unsqueeze(1) 55 | # 使用ModuleList中的卷积,卷积后进行relu激活,激活后, 56 | #第一次卷积Conv2d(1, 100, kernel_size=(3, 128), stride=(1, 1)),输入的x[batch_size,1,sequence_length,Embedding_demission], 卷积后x[batch_size,kernel_num,sequence_length,1], squeeze最后一个维度 57 | #第二次Conv2d(1, 100, kernel_size=(4, 128), stride=(1, 1)), 输出的形状和第一次相同 58 | #第三次Conv2d(1, 100, kernel_size=(5, 128), stride=(1, 1)), 输出的形状和第一次相同 59 | x_conv_pool_result = [] 60 | #分别进行3次卷积,x_conv_result存储3次卷积的结果, 分布进行池化操作 61 | for conv in self.convs1: 62 | #输入的x[batch_size,1,sequence_length,Embedding_demission], 卷积后x[batch_size,kernel_num,sequence_length,1] 63 | x1 = conv(x) 64 | #激活不改变形状 65 | x1 = F.relu(x1) 66 | #squeeze后 [batch_size, kernel_num, sequence_length】 67 | x1 = x1.squeeze(3) 68 | #x1的shape是[batch_size,kernel_num,sequence_length], 设置kernel_size的大小是sequence_length * sequence_length 69 | x1 = F.max_pool1d(x1, kernel_size=x1.size(2)) 70 | # max_pool1d后输出的x1的shape是[batch_size, kernel_num, 1] 71 | x1 = x1.squeeze(2) 72 | #squeeze后的shape是[batch_size, kernel_num] 73 | x_conv_pool_result.append(x1) 74 | #拼接输出结果, 形状是[batch_size, kernel_num*卷积的次数] 75 | x = torch.cat(x_conv_pool_result, 1) 76 | #做一次dropout, 形状不变 77 | x = self.dropout(x) 78 | #做全连接后得到输出结果 [batch_size, class_num] 79 | logit = self.fc1(x) 80 | return logit 81 | 82 | 83 | def train(train_iter, model, args): 84 | """ 85 | 训练 86 | :param train_iter: 训练数据 87 | :param model: 模型,例如初始化的TextCNN 88 | 
:param args: paraser传入的config信息 89 | :return: 90 | """ 91 | print("开始训练模型") 92 | #创建优化器 93 | optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) 94 | model.train() 95 | 96 | #如果有GPU,使用gpu 97 | if args.cuda: 98 | model.cuda() 99 | 100 | for epoch in range(1, args.epochs+1): 101 | training_loss = 0.0 102 | training_acc = 0.0 103 | training_count = 0.0 104 | 105 | for batch in tqdm(train_iter): 106 | # batch.text返回的形状是[sequence_length, batch_size], batch.label[batch_size] 107 | feature, target = batch.text, batch.label 108 | #feautre进行转置,形状变成【batch_size, sequence_length] 109 | feature.t_() 110 | # 所有label的数值减去1 111 | target.sub_(1) 112 | #如果是gpu,转换成gpu资源 113 | if args.cuda: 114 | feature, target = feature.cuda(), target.cuda() 115 | 116 | optimizer.zero_grad() 117 | #得到预测结果 118 | logit = model(feature) 119 | #计算交叉熵损失 120 | loss = F.cross_entropy(logit, target) 121 | loss.backward() 122 | optimizer.step() 123 | #计算准确率 124 | corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum() 125 | #损失training_loss更新 126 | training_loss += loss.item() 127 | training_acc += corrects.item() 128 | training_count += batch.batch_size 129 | 130 | #计算平均算是和准确率 131 | training_loss /= training_count 132 | training_acc /= training_count 133 | accuracy = 100.0 * training_acc 134 | print('Training epoch [{}/{}] - loss: {:.6f} acc: {:.2f}%'.format( 135 | epoch, args.epochs, training_loss, accuracy)) 136 | #保存模型 137 | if epoch % args.save_interval == 0: 138 | torch.save(model, args.save_path + f"textcnn.model-{epoch}") 139 | print('保存模型完成') 140 | #训练完成后再次保存模型 141 | torch.save(model, args.save_path + "textcnn.model") 142 | print("训练完成") 143 | 144 | 145 | def eval(data_iter, model, args): 146 | """ 147 | 评估模型 148 | :param train_iter: 训练数据 149 | :param model: 模型,例如初始化的TextCNN 150 | :param args: paraser传入的config信息 151 | :return: 152 | """ 153 | print("开始评估模型") 154 | #设置评估模型 155 | model.eval() 156 | if args.cuda: 157 | model.cuda() 158 | #评估准确率和损失 159 | corrects, avg_loss = 0, 0 160 | for batch in data_iter: 161 | feature, target = batch.text, batch.label 162 | feature.t_() 163 | target.sub_(1) 164 | if args.cuda: 165 | feature, target = feature.cuda(), target.cuda() 166 | 167 | logit = model(feature) 168 | loss = F.cross_entropy(logit, target, size_average=False) 169 | 170 | avg_loss += loss.data.item() 171 | corrects += (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum() 172 | 173 | size = len(data_iter.dataset) 174 | avg_loss /= size 175 | accuracy = 100.0 * corrects/size 176 | print('Evaluation - loss: {:.6f} acc: {:.4f}%'.format(avg_loss, accuracy)) 177 | print('评估完成') 178 | return accuracy 179 | 180 | def predict(path, model, text_field, label_feild, cuda): 181 | """ 182 | 模型预测 183 | :param path: 要预测文本文件的路径 184 | :param model: 初始化好的模型 185 | :param text_field: text_field 文件 186 | :param label_feild: 187 | :param cuda: 是否使用gpu 188 | :return: 189 | """ 190 | model.eval() 191 | if cuda: 192 | model.cuda() 193 | 194 | document = '' 195 | with open(path, encoding="utf8", errors='ignore') as f: 196 | for line in f: 197 | if line != '\n': 198 | document += data_utils.text_filter(line) 199 | 200 | #对文本进行jieba处理 201 | text = text_field.preprocess(document) 202 | 203 | #文本转换成id 204 | text = [[text_field.vocab.stoi[x] for x in text]] 205 | x = torch.LongTensor(text) 206 | if cuda: 207 | x = x.cuda() 208 | #预测结果 209 | output = model(x) 210 | #获取概率最大的结果 211 | _, predicted = torch.max(output, 1) 212 | #预测的索引id转换成文字 213 | label = label_feild.vocab.itos[predicted.data[0] + 
1] 214 | return document, label 215 | -------------------------------------------------------------------------------- /text_classsification/nets/text_cnn_rnn.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextCNNRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_filters=128, region_sizes=[2, 3, 4], num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_filters: TextCNN 各个不同类型卷积核的数目,可以给定为int或者list 32 | :param region_sizes: TextCNN各个不同类别卷积核提取单词特征的单词数量范围 33 | :param num_units: RNN Cell中的神经元数目 34 | :param layers: RNN的层次 35 | """ 36 | self.num_units = num_units # RNN Cell的神经元数目 37 | self.layers = layers # RNN的层次 38 | self.region_sizes = region_sizes # 使用CNN提取特征信息的时候,提取范围大小 39 | if isinstance(num_filters, list): 40 | # 相当于针对每个范围给定不同的卷积核数目 41 | if len(region_sizes) != len(num_filters): 42 | raise Exception("resize_sizes和num_filters大小必须一致!!!") 43 | else: 44 | self.num_filters = num_filters 45 | elif isinstance(num_filters, int): 46 | self.num_filters = [num_filters] * len(region_sizes) 47 | else: 48 | raise Exception("参数num_filters仅支持int类型或者list类型数据!!") 49 | 50 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 51 | embedding_dimensions=embedding_dimensions, 52 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 53 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 54 | optimizer_type=optimizer_type, 55 | optimizer_parameters_func=optimizer_parameters_func, 56 | saver_parameters=saver_parameters) 57 | 58 | def interface(self): 59 | """ 60 | 前向网络构建 61 | batch_size: N 62 | feature height: H, 将序列长度T认为是H 63 | feature width: W,将Embedding size大小认为是W 64 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 65 | sentence_length: T 66 | embedding size: E 67 | :return: 68 | """ 69 | with tf.variable_scope(self.network_name): 70 | with slim.arg_scope(self.arg_score()): 71 | with tf.variable_scope("placeholders"): 72 | self.global_step = tf.train.get_or_create_global_step() 73 | # 输入的单词id,形状为:[N,T] 74 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 75 | # 希望输出的类别id, 形状为:[N,] 76 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 77 | # Dropout 78 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 79 | 80 | # 1. 
Embedding Layer 81 | # 将单词id转换为单词向量,[N,T] --> [N,T,E] 82 | embedding_inputs = self.embedding_lookup(self.inputs) 83 | # 增加维度信息,将其转换为四维对象, [N,T,E] --> [N,T,E,1] 84 | expanded_embedding_inputs = tf.expand_dims(embedding_inputs, axis=-1) 85 | 86 | # 2. 使用卷积来提取高阶特征 87 | outputs = [] 88 | with tf.variable_scope("cnn"): 89 | for idx, region_size in enumerate(self.region_sizes): 90 | with tf.variable_scope("conv-max-pooling-{}".format(idx)): 91 | conv2d_input = expanded_embedding_inputs 92 | # 卷积的功能相当于将region_size个单词看成一个整体,然后进行单词的特征向量信息的融合提取 93 | # 最终返回结果形状为: [N,T,1,C] 94 | # 为了保障卷积之后的Feature Map大小和原始大小一致(序列长度一致),所以这里进行数据的填充 95 | if region_size - 1 != 0: 96 | top = (region_size - 1) // 2 97 | bottom = region_size - 1 - top 98 | conv2d_input = tf.pad(conv2d_input, paddings=[[0, 0], [top, bottom], [0, 0], [0, 0]]) 99 | # 卷积(序列长度不变) 100 | conv = slim.conv2d( 101 | conv2d_input, # [N,T,E,1] 102 | num_outputs=self.num_filters[idx], # C, eg:2 103 | kernel_size=(region_size, self.embedding_dimensions) # (h,w), eg:(3,E) 104 | ) 105 | # 添加到临时列表中 106 | outputs.append(tf.squeeze(conv, axis=2)) 107 | with tf.variable_scope("rnn"): 108 | with tf.variable_scope("input"): 109 | # 数据合并,将不同卷积核提取的特征信息作为不同维度的特征 110 | rnn_input = tf.concat(outputs, axis=-1) 111 | 112 | with tf.variable_scope("feature"): 113 | with tf.variable_scope("rnn"): 114 | # a. 定义RNN的cell构建函数 115 | def cell(_units): 116 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 117 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, 118 | output_keep_prob=self.dropout_keep_prob) 119 | 120 | # b. 构建前向的cell和反向cell 121 | cell_fw = tf.nn.rnn_cell.MultiRNNCell( 122 | cells=[cell(self.num_units) for _ in range(self.layers)]) 123 | cell_bw = tf.nn.rnn_cell.MultiRNNCell( 124 | cells=[cell(self.num_units) for _ in range(self.layers)]) 125 | 126 | # c. 获取得到序列的输出向量 127 | # 数据都是按照原始的从左往右的序列得到的最终特征 128 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 129 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 130 | cell_fw, # 前向的RNN Cell 131 | cell_bw, # 反向的RNN Cell 132 | inputs=rnn_input, # 输入值, [N,T,E] 133 | dtype=tf.float32, # 给定RNN状态初始化值的类型 134 | ) 135 | 136 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 137 | with tf.variable_scope("merge_feature"): 138 | # 前向使用最后一个时刻,后向使用第一个时刻 139 | features = tf.concat([output_fw[:, -1, :], output_bw[:, 0, :]], axis=-1) 140 | 141 | # 4. 
FFN+Softmax做最终的决策输出 142 | with tf.variable_scope("project"): 143 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 144 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 145 | self.logits = tf.identity(score, 'logits') 146 | # 得到N个文本分别属于各个类别的概率值 147 | self.probability = tf.nn.softmax(self.logits, name='probability') 148 | # 得到最终的预测id 149 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 150 | 151 | # 配置一个参数表示仅恢复模型参数 152 | self.saver_parameters['var_list'] = tf.global_variables() 153 | -------------------------------------------------------------------------------- /langconv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from copy import deepcopy 5 | import re 6 | 7 | try: 8 | import psyco 9 | psyco.full() 10 | except: 11 | pass 12 | 13 | try: 14 | from zh_wiki import zh2Hant, zh2Hans 15 | except ImportError: 16 | from zhtools.zh_wiki import zh2Hant, zh2Hans 17 | 18 | import sys 19 | py3k = sys.version_info >= (3, 0, 0) 20 | 21 | if py3k: 22 | UEMPTY = '' 23 | else: 24 | _zh2Hant, _zh2Hans = {}, {} 25 | for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)): 26 | for k, v in old.items(): 27 | new[k.decode('utf8')] = v.decode('utf8') 28 | zh2Hant = _zh2Hant 29 | zh2Hans = _zh2Hans 30 | UEMPTY = ''.decode('utf8') 31 | 32 | # states 33 | (START, END, FAIL, WAIT_TAIL) = list(range(4)) 34 | # conditions 35 | (TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5)) 36 | 37 | MAPS = {} 38 | 39 | class Node(object): 40 | def __init__(self, from_word, to_word=None, is_tail=True, 41 | have_child=False): 42 | self.from_word = from_word 43 | if to_word is None: 44 | self.to_word = from_word 45 | self.data = (is_tail, have_child, from_word) 46 | self.is_original = True 47 | else: 48 | self.to_word = to_word or from_word 49 | self.data = (is_tail, have_child, to_word) 50 | self.is_original = False 51 | self.is_tail = is_tail 52 | self.have_child = have_child 53 | 54 | def is_original_long_word(self): 55 | return self.is_original and len(self.from_word)>1 56 | 57 | def is_follow(self, chars): 58 | return chars != self.from_word[:-1] 59 | 60 | def __str__(self): 61 | return '<Node, %s, %s, %s, %s>' % (repr(self.from_word), 62 | repr(self.to_word), self.is_tail, self.have_child) 63 | 64 | __repr__ = __str__ 65 | 66 | class ConvertMap(object): 67 | def __init__(self, name, mapping=None): 68 | self.name = name 69 | self._map = {} 70 | if mapping: 71 | self.set_convert_map(mapping) 72 | 73 | def set_convert_map(self, mapping): 74 | convert_map = {} 75 | have_child = {} 76 | max_key_length = 0 77 | for key in sorted(mapping.keys()): 78 | if len(key)>1: 79 | for i in range(1, len(key)): 80 | parent_key = key[:i] 81 | have_child[parent_key] = True 82 | have_child[key] = False 83 | max_key_length = max(max_key_length, len(key)) 84 | for key in sorted(have_child.keys()): 85 | convert_map[key] = (key in mapping, have_child[key], 86 | mapping.get(key, UEMPTY)) 87 | self._map = convert_map 88 | self.max_key_length = max_key_length 89 | 90 | def __getitem__(self, k): 91 | try: 92 | is_tail, have_child, to_word = self._map[k] 93 | return Node(k, to_word, is_tail, have_child) 94 | except: 95 | return Node(k) 96 | 97 | def __contains__(self, k): 98 | return k in self._map 99 | 100 | def __len__(self): 101 | return len(self._map) 102 | 103 | class StatesMachineException(Exception): pass 104 | 105 | class StatesMachine(object): 106 | def __init__(self): 107 | self.state = START 
108 | self.final = UEMPTY 109 | self.len = 0 110 | self.pool = UEMPTY 111 | 112 | def clone(self, pool): 113 | new = deepcopy(self) 114 | new.state = WAIT_TAIL 115 | new.pool = pool 116 | return new 117 | 118 | def feed(self, char, map): 119 | node = map[self.pool+char] 120 | 121 | if node.have_child: 122 | if node.is_tail: 123 | if node.is_original: 124 | cond = UNMATCHED_SWITCH 125 | else: 126 | cond = MATCHED_SWITCH 127 | else: 128 | cond = CONNECTOR 129 | else: 130 | if node.is_tail: 131 | cond = TAIL 132 | else: 133 | cond = ERROR 134 | 135 | new = None 136 | if cond == ERROR: 137 | self.state = FAIL 138 | elif cond == TAIL: 139 | if self.state == WAIT_TAIL and node.is_original_long_word(): 140 | self.state = FAIL 141 | else: 142 | self.final += node.to_word 143 | self.len += 1 144 | self.pool = UEMPTY 145 | self.state = END 146 | elif self.state == START or self.state == WAIT_TAIL: 147 | if cond == MATCHED_SWITCH: 148 | new = self.clone(node.from_word) 149 | self.final += node.to_word 150 | self.len += 1 151 | self.state = END 152 | self.pool = UEMPTY 153 | elif cond == UNMATCHED_SWITCH or cond == CONNECTOR: 154 | if self.state == START: 155 | new = self.clone(node.from_word) 156 | self.final += node.to_word 157 | self.len += 1 158 | self.state = END 159 | else: 160 | if node.is_follow(self.pool): 161 | self.state = FAIL 162 | else: 163 | self.pool = node.from_word 164 | elif self.state == END: 165 | # END is a new START 166 | self.state = START 167 | new = self.feed(char, map) 168 | elif self.state == FAIL: 169 | raise StatesMachineException('Translate States Machine ' 170 | 'have error with input data %s' % node) 171 | return new 172 | 173 | def __len__(self): 174 | return self.len + 1 175 | 176 | def __str__(self): 177 | return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % ( 178 | id(self), self.pool, self.state, self.final) 179 | __repr__ = __str__ 180 | 181 | class Converter(object): 182 | def __init__(self, to_encoding): 183 | self.to_encoding = to_encoding 184 | self.map = MAPS[to_encoding] 185 | self.start() 186 | 187 | def feed(self, char): 188 | branches = [] 189 | for fsm in self.machines: 190 | new = fsm.feed(char, self.map) 191 | if new: 192 | branches.append(new) 193 | if branches: 194 | self.machines.extend(branches) 195 | self.machines = [fsm for fsm in self.machines if fsm.state != FAIL] 196 | all_ok = True 197 | for fsm in self.machines: 198 | if fsm.state != END: 199 | all_ok = False 200 | if all_ok: 201 | self._clean() 202 | return self.get_result() 203 | 204 | def _clean(self): 205 | if len(self.machines): 206 | self.machines.sort(key=lambda x: len(x)) 207 | # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y))) 208 | self.final += self.machines[0].final 209 | self.machines = [StatesMachine()] 210 | 211 | def start(self): 212 | self.machines = [StatesMachine()] 213 | self.final = UEMPTY 214 | 215 | def end(self): 216 | self.machines = [fsm for fsm in self.machines 217 | if fsm.state == FAIL or fsm.state == END] 218 | self._clean() 219 | 220 | def convert(self, string): 221 | self.start() 222 | for char in string: 223 | self.feed(char) 224 | self.end() 225 | return self.get_result() 226 | 227 | def get_result(self): 228 | return self.final 229 | 230 | 231 | def registery(name, mapping): 232 | global MAPS 233 | MAPS[name] = ConvertMap(name, mapping) 234 | 235 | registery('zh-hant', zh2Hant) 236 | registery('zh-hans', zh2Hans) 237 | del zh2Hant, zh2Hans 238 | 239 | 240 | def run(): 241 | import sys 242 | from optparse import OptionParser 243 | parser = OptionParser() 244 | parser.add_option('-e', 
type='string', dest='encoding', 245 | help='encoding') 246 | parser.add_option('-f', type='string', dest='file_in', 247 | help='input file (- for stdin)') 248 | parser.add_option('-t', type='string', dest='file_out', 249 | help='output file') 250 | (options, args) = parser.parse_args() 251 | if not options.encoding: 252 | parser.error('encoding must be set') 253 | if options.file_in: 254 | if options.file_in == '-': 255 | file_in = sys.stdin 256 | else: 257 | file_in = open(options.file_in) 258 | else: 259 | file_in = sys.stdin 260 | if options.file_out: 261 | if options.file_out == '-': 262 | file_out = sys.stdout 263 | else: 264 | file_out = open(options.file_out, 'wb') 265 | else: 266 | file_out = sys.stdout 267 | 268 | c = Converter(options.encoding) 269 | for line in file_in: 270 | # print >> file_out, c.convert(line.rstrip('\n').decode( 271 | file_out.write(c.convert(line.rstrip('\n').decode( 272 | 'utf8')).encode('utf8')) 273 | 274 | 275 | if __name__ == '__main__': 276 | run() 277 | 278 | -------------------------------------------------------------------------------- /text_classsification/nets/text_rnn_transformer.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, 19 | attention_dimension_size=128, attention_layers=3, attention_headers=16, *args, **kwargs): 20 | """ 21 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 22 | :param vocab_size: 词汇数目 23 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 24 | :param embedding_table: 训练好的单词向量映射表 25 | :param train_embedding_table: 是否训练train_embedding_table的参数值 26 | :param num_class: 类别数目 27 | :param network_name: 网络名称 28 | :param weight_decay: L2正则项的系数 29 | :param optimizer_type: 优化器的类别 30 | :param optimizer_parameters_func: 构建优化器的参数的函数 31 | :param saver_parameters: 模型持久化器的参数 32 | :param num_units: RNN Cell中的神经元数目 33 | :param layers: RNN的层次 34 | :param attention_dimension_size: Self Attention计算过程中的维度大小 35 | :param attention_layers: Transformer的层次 36 | :param attention_headers: 头的数目 37 | """ 38 | self.attention_dimension_size = attention_dimension_size 39 | self.attention_layers = attention_layers 40 | self.attention_headers = attention_headers 41 | self.num_units = num_units # RNN Cell的神经元数目 42 | self.layers = layers # RNN的层次 43 | 44 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 45 | embedding_dimensions=embedding_dimensions, 46 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 47 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 48 | optimizer_type=optimizer_type, 49 | optimizer_parameters_func=optimizer_parameters_func, 50 | saver_parameters=saver_parameters) 51 | 52 | def interface(self): 53 | """ 54 | 前向网络构建 55 | batch_size: N 56 | feature height: H, 将序列长度T认为是H 57 | feature width: W,将Embedding size大小认为是W 58 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 59 | sentence_length: T 60 | embedding size: E 
61 | :return: 62 | """ 63 | with tf.variable_scope(self.network_name): 64 | with slim.arg_scope(self.arg_score()): 65 | with tf.variable_scope("placeholders"): 66 | self.global_step = tf.train.get_or_create_global_step() 67 | # 输入的单词id,形状为:[N,T] 68 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 69 | # 希望输出的类别id, 形状为:[N,] 70 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 71 | # Dropout 72 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 73 | 74 | # 1. Embedding Layer(N,T,E) 75 | embedding_inputs = self.embedding_lookup(self.inputs) 76 | 77 | # 2. 使用Transformer来提取高阶特征 78 | with tf.variable_scope("rnn"): 79 | # a. 定义RNN的cell构建函数 80 | def cell(_units): 81 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 82 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 83 | 84 | # b. 构建前向的cell和反向cell 85 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 86 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 87 | 88 | # c. 获取得到序列的输出向量 89 | # 数据都是按照原始的从左往右的序列得到的最终特征 90 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 91 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 92 | cell_fw, # 前向的RNN Cell 93 | cell_bw, # 反向的RNN Cell 94 | inputs=embedding_inputs, # 输入值, [N,T,E] 95 | dtype=tf.float32, # 给定RNN状态初始化值的类型 96 | ) 97 | 98 | with tf.variable_scope("transformer"): 99 | with tf.variable_scope("Input"): 100 | encoder_input = tf.layers.dense(tf.concat([output_fw, output_bw], axis=-1), 101 | units=self.attention_dimension_size, 102 | activation=tf.nn.relu) 103 | 104 | for layer in range(self.attention_layers): 105 | with tf.variable_scope("Encoder_{}".format(layer)): 106 | # 1. 得到各个头的信息 107 | attention_outputs = [] 108 | for header in range(self.attention_headers): 109 | with tf.variable_scope("Header_{}".format(header)): 110 | attention_output = self._self_attention( 111 | H=encoder_input, 112 | attention_dimension_size=self.attention_dimension_size 113 | ) 114 | attention_outputs.append(attention_output) 115 | 116 | # 2. 拼接 117 | attention_output = tf.concat(attention_outputs, axis=-1) 118 | 119 | # 3. 做一个线性转换 120 | attention_output = tf.layers.dense(attention_output, 121 | units=self.attention_dimension_size, 122 | activation=None) 123 | 124 | # 4. 将当前层的输出和当前层的输入做一个残差结构 125 | attention_output = tf.nn.relu(attention_output + encoder_input) 126 | 127 | # 5. 将当前层输出作为下一层的输入 128 | encoder_input = attention_output 129 | 130 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 131 | with tf.variable_scope("merge_feature"): 132 | # 4. 将所有时刻的特征信息求均值 133 | features = tf.reduce_mean(attention_output, axis=1) 134 | 135 | # 4. 
FFN+Softmax做最终的决策输出 136 | with tf.variable_scope("project"): 137 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 138 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 139 | self.logits = tf.identity(score, 'logits') 140 | # 得到N个文本分别属于各个类别的概率值 141 | self.probability = tf.nn.softmax(self.logits, name='probability') 142 | # 得到最终的预测id 143 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 144 | 145 | # 配置一个参数表示仅恢复模型参数 146 | self.saver_parameters['var_list'] = tf.global_variables() 147 | 148 | def _self_attention(self, H, attention_dimension_size): 149 | """ 150 | 计算Self-Attention 151 | :param H: [N,T,E], N个序列,每个序列T个时刻,每个时刻E维的向量 152 | :return: 153 | """ 154 | # 0. 获取大小信息 155 | hidden_size = H.shape[-1] 156 | batch_size, sequence_length, _ = tf.unstack(tf.shape(H)) 157 | # 1. 对输入数据reshape操作 158 | H = tf.reshape(H, shape=tf.stack([batch_size * sequence_length, hidden_size])) 159 | # 2. 分别计算Q、K、V 160 | Q = tf.layers.dense(H, units=attention_dimension_size) 161 | K = tf.layers.dense(H, units=attention_dimension_size) 162 | V = tf.layers.dense(H, units=attention_dimension_size, activation=tf.nn.relu) 163 | # 3. Reshape 164 | Q = tf.reshape(Q, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 165 | K = tf.reshape(K, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 166 | V = tf.reshape(V, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 167 | # 4. 计算相关性([N,T,E],[N,T,E],F,T) --> [N,T,T] 168 | scores = tf.matmul(Q, K, False, True) / np.sqrt(attention_dimension_size) 169 | # 5. 计算概率值([N,T,T]) 170 | weights = tf.nn.softmax(scores) 171 | # 6. 计算最终结果 172 | attention = tf.matmul(weights, V) 173 | return attention 174 | -------------------------------------------------------------------------------- /text_classsification/nets/text_cnn_transformer.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextCNNRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_filters=128, region_sizes=[2, 3, 4], 19 | attention_dimension_size=128, attention_layers=3, attention_headers=16, *args, **kwargs): 20 | """ 21 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 22 | :param vocab_size: 词汇数目 23 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 24 | :param embedding_table: 训练好的单词向量映射表 25 | :param train_embedding_table: 是否训练train_embedding_table的参数值 26 | :param num_class: 类别数目 27 | :param network_name: 网络名称 28 | :param weight_decay: L2正则项的系数 29 | :param optimizer_type: 优化器的类别 30 | :param optimizer_parameters_func: 构建优化器的参数的函数 31 | :param saver_parameters: 模型持久化器的参数 32 | :param num_filters: TextCNN 各个不同类型卷积核的数目,可以给定为int或者list 33 | :param region_sizes: TextCNN各个不同类别卷积核提取单词特征的单词数量范围 34 | :param attention_dimension_size: Self Attention计算过程中的维度大小 35 | :param attention_layers: Transformer的层次 36 | :param attention_headers: 头的数目 37 | """ 38 | self.attention_dimension_size = attention_dimension_size 39 | self.attention_layers = attention_layers 40 | 
self.attention_headers = attention_headers 41 | self.region_sizes = region_sizes # 使用CNN提取特征信息的时候,提取范围大小 42 | if isinstance(num_filters, list): 43 | # 相当于针对每个范围给定不同的卷积核数目 44 | if len(region_sizes) != len(num_filters): 45 | raise Exception("resize_sizes和num_filters大小必须一致!!!") 46 | else: 47 | self.num_filters = num_filters 48 | elif isinstance(num_filters, int): 49 | self.num_filters = [num_filters] * len(region_sizes) 50 | else: 51 | raise Exception("参数num_filters仅支持int类型或者list类型数据!!") 52 | 53 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 54 | embedding_dimensions=embedding_dimensions, 55 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 56 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 57 | optimizer_type=optimizer_type, 58 | optimizer_parameters_func=optimizer_parameters_func, 59 | saver_parameters=saver_parameters) 60 | 61 | def interface(self): 62 | """ 63 | 前向网络构建 64 | batch_size: N 65 | feature height: H, 将序列长度T认为是H 66 | feature width: W,将Embedding size大小认为是W 67 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 68 | sentence_length: T 69 | embedding size: E 70 | :return: 71 | """ 72 | with tf.variable_scope(self.network_name): 73 | with slim.arg_scope(self.arg_score()): 74 | with tf.variable_scope("placeholders"): 75 | self.global_step = tf.train.get_or_create_global_step() 76 | # 输入的单词id,形状为:[N,T] 77 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 78 | # 希望输出的类别id, 形状为:[N,] 79 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 80 | # Dropout 81 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 82 | 83 | # 1. Embedding Layer 84 | # 将单词id转换为单词向量,[N,T] --> [N,T,E] 85 | embedding_inputs = self.embedding_lookup(self.inputs) 86 | # 增加维度信息,将其转换为四维对象, [N,T,E] --> [N,T,E,1] 87 | expanded_embedding_inputs = tf.expand_dims(embedding_inputs, axis=-1) 88 | 89 | # 2. 使用卷积来提取高阶特征 90 | outputs = [] 91 | with tf.variable_scope("cnn"): 92 | for idx, region_size in enumerate(self.region_sizes): 93 | with tf.variable_scope("conv-max-pooling-{}".format(idx)): 94 | conv2d_input = expanded_embedding_inputs 95 | # 卷积的功能相当于将region_size个单词看成一个整体,然后进行单词的特征向量信息的融合提取 96 | # 最终返回结果形状为: [N,T,1,C] 97 | # 为了保障卷积之后的Feature Map大小和原始大小一致(序列长度一致),所以这里进行数据的填充 98 | if region_size - 1 != 0: 99 | top = (region_size - 1) // 2 100 | bottom = region_size - 1 - top 101 | conv2d_input = tf.pad(conv2d_input, paddings=[[0, 0], [top, bottom], [0, 0], [0, 0]]) 102 | # 卷积(序列长度不变) 103 | conv = slim.conv2d( 104 | conv2d_input, # [N,T,E,1] 105 | num_outputs=self.num_filters[idx], # C, eg:2 106 | kernel_size=(region_size, self.embedding_dimensions) # (h,w), eg:(3,E) 107 | ) 108 | 109 | # 最大池化(变成原来的一半), 将相邻单词的特征提取主要特征信息 110 | pool = slim.max_pool2d(conv, (3, 1), stride=[2, 1]) 111 | 112 | # 添加到临时列表中 113 | outputs.append(tf.squeeze(pool, axis=2)) 114 | 115 | with tf.variable_scope("transformer"): 116 | with tf.variable_scope("Input"): 117 | encoder_input = tf.layers.dense(tf.concat(outputs, axis=-1), 118 | units=self.attention_dimension_size, 119 | activation=tf.nn.relu) 120 | 121 | for layer in range(self.attention_layers): 122 | with tf.variable_scope("Encoder_{}".format(layer)): 123 | # 1. 
得到各个头的信息 124 | attention_outputs = [] 125 | for header in range(self.attention_headers): 126 | with tf.variable_scope("Header_{}".format(header)): 127 | attention_output = self._self_attention( 128 | H=encoder_input, 129 | attention_dimension_size=self.attention_dimension_size 130 | ) 131 | attention_outputs.append(attention_output) 132 | 133 | # 2. 拼接 134 | attention_output = tf.concat(attention_outputs, axis=-1) 135 | 136 | # 3. 做一个线性转换 137 | attention_output = tf.layers.dense(attention_output, 138 | units=self.attention_dimension_size, 139 | activation=None) 140 | 141 | # 4. 将当前层的输出和当前层的输入做一个残差结构 142 | attention_output = tf.nn.relu(attention_output + encoder_input) 143 | 144 | # 5. 将当前层输出作为下一层的输入 145 | encoder_input = attention_output 146 | 147 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 148 | with tf.variable_scope("merge_feature"): 149 | features = tf.reduce_mean(attention_output, axis=1) 150 | 151 | # 4. FFN+Softmax做最终的决策输出 152 | with tf.variable_scope("project"): 153 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 154 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 155 | self.logits = tf.identity(score, 'logits') 156 | # 得到N个文本分别属于各个类别的概率值 157 | self.probability = tf.nn.softmax(self.logits, name='probability') 158 | # 得到最终的预测id 159 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 160 | 161 | # 配置一个参数表示仅恢复模型参数 162 | self.saver_parameters['var_list'] = tf.global_variables() 163 | 164 | def _self_attention(self, H, attention_dimension_size): 165 | """ 166 | 计算Self-Attention 167 | :param H: [N,T,E], N个序列,每个序列T个时刻,每个时刻E维的向量 168 | :return: 169 | """ 170 | # 0. 获取大小信息 171 | hidden_size = H.shape[-1] 172 | batch_size, sequence_length, _ = tf.unstack(tf.shape(H)) 173 | # 1. 对输入数据reshape操作 174 | H = tf.reshape(H, shape=tf.stack([batch_size * sequence_length, hidden_size])) 175 | # 2. 分别计算Q、K、V 176 | Q = tf.layers.dense(H, units=attention_dimension_size) 177 | K = tf.layers.dense(H, units=attention_dimension_size) 178 | V = tf.layers.dense(H, units=attention_dimension_size, activation=tf.nn.relu) 179 | # 3. Reshape 180 | Q = tf.reshape(Q, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 181 | K = tf.reshape(K, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 182 | V = tf.reshape(V, shape=tf.stack([batch_size, sequence_length, attention_dimension_size])) 183 | # 4. 计算相关性([N,T,E],[N,T,E],F,T) --> [N,T,T] 184 | scores = tf.matmul(Q, K, False, True) / np.sqrt(attention_dimension_size) 185 | # 5. 计算概率值([N,T,T]) 186 | weights = tf.nn.softmax(scores) 187 | # 6. 
计算最终结果 188 | attention = tf.matmul(weights, V) 189 | return attention 190 | -------------------------------------------------------------------------------- /text_classsification/nets/base_model.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from tensorflow.contrib import slim 6 | 7 | from nets.metric import Metrics 8 | 9 | 10 | class Network(object): 11 | 12 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 13 | embedding_table=None, train_embedding_table=False, 14 | num_class=2, network_name="TextCNN", weight_decay=0.01, 15 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 16 | *args, **kwargs): 17 | """ 18 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 19 | :param vocab_size: 词汇数目 20 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 21 | :param embedding_table: 训练好的单词向量映射表 22 | :param train_embedding_table: 是否训练train_embedding_table的参数值 23 | :param num_class: 类别数目 24 | :param network_name: 网络名称 25 | :param weight_decay: L2正则项的系数 26 | :param optimizer_type: 优化器的类别 27 | :param optimizer_parameters_func: 构建优化器的参数的函数 28 | :param saver_parameters: 模型持久化器的参数 29 | :param num_filters: TextCNN 各个不同类型卷积核的数目,可以给定为int或者list 30 | :param region_sizes: TextCNN各个不同类别卷积核提取单词特征的单词数量范围 31 | """ 32 | self.with_word2vec = with_word2vec 33 | self.weight_decay = weight_decay # 正则的权重系数 34 | self.network_name = network_name # 网络名称 35 | self.vocab_size = vocab_size # 词汇表大小 36 | self.embedding_dimensions = embedding_dimensions # 做单词id转换为向量的时候,向量维度大小 37 | self.input_embedding_table = embedding_table 38 | self.train_embedding_table = train_embedding_table 39 | self.num_class = num_class # 类别数目 40 | 41 | if self.with_word2vec: 42 | if self.input_embedding_table is None or np.ndim(self.input_embedding_table) != 2: 43 | tf.logging.warn("当参数with_word2vec为True的时候,必须给定embedding_table的2维转换矩阵值!!") 44 | self.with_word2vec = False 45 | else: 46 | self.vocab_size, self.embedding_dimensions = np.shape(self.input_embedding_table) 47 | else: 48 | if self.embedding_dimensions is None or self.vocab_size is None: 49 | raise Exception("当参数with_word2vec为False的时候,必须给定embedding_dimensions和vocab_size的参数值!!") 50 | 51 | self.global_step = None # Tensor变量对象,用于记录模型的更新次数 52 | self.embedding_table = None # 做词嵌入的变量 53 | self.inputs = None # 输入的文本单词id,[None,None] 54 | self.targets = None # 实际标签下标对象, [None,] 55 | self.dropout_keep_prob = None # Drouout系数 56 | self.logits = None # 模型前向网络执行之后得到的预测置信度信息,[None, num_class] 57 | self.probability = None # 模型前向网络执行之后得到的预测概率信息, [None, num_class] 58 | self.predictions = None # 模型前向网络的预测结果/类别下标,[None,] 59 | self.saver = None # 模型持久化的对象 60 | self.saver_parameters = saver_parameters # 初始化模型持久化对象的参数 61 | 62 | self.optimizer_type = optimizer_type # 优化器类型 63 | self.optimizer_parameters_func = optimizer_parameters_func # 优化器参数 64 | 65 | self.interface() 66 | 67 | def arg_score(self): 68 | """ 69 | 作用域默认参数给定 70 | :return: 71 | """ 72 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 73 | activation_fn=tf.nn.relu, 74 | weights_regularizer=slim.l2_regularizer(self.weight_decay), 75 | weights_initializer=tf.contrib.layers.xavier_initializer(), 76 | biases_initializer=tf.zeros_initializer()): 77 | with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='VALID', stride=1) as sc: 78 | return sc 79 | 80 | def embedding_lookup(self, inputs): 81 | """ 82 | 对输入做一个Embedding转换处理 
83 | :param inputs: 输入的Tensor对象 84 | :return: 85 | """ 86 | with tf.device('/cpu:0'), tf.variable_scope('embedding'): 87 | if self.with_word2vec: 88 | tf.logging.info("Embedding Table初始化使用Word2Vec训练好的转换参数.....") 89 | _embedding = tf.get_variable( 90 | name='embedding_table', 91 | shape=[self.vocab_size, self.embedding_dimensions], 92 | initializer=tf.constant_initializer(value=self.input_embedding_table), 93 | trainable=self.train_embedding_table # 给定是否参与模型训练 94 | ) 95 | else: 96 | tf.logging.info("Embedding Table初始化使用随机初始化值.....") 97 | _embedding = tf.get_variable(name='embedding_table', 98 | shape=[self.vocab_size, self.embedding_dimensions]) 99 | self.embedding_table = _embedding 100 | # 将单词id转换为单词向量,[N,T] --> [N,T,E] 101 | embedding_inputs = tf.nn.embedding_lookup(self.embedding_table, inputs) 102 | return embedding_inputs 103 | 104 | def interface(self): 105 | raise NotImplementedError("请实现具体的interface代码,用于构建前向网络结构!!!") 106 | 107 | def losses(self): 108 | """ 109 | 计算损失函数,并返回对应的Tensor对象值 110 | 基于预测的置信度logits以及实际的标签值来构建分类损失函数 111 | :return: 112 | """ 113 | with tf.name_scope("Loss"): 114 | # 1. 计算实际值和预测值之间差值所导致的损失值 115 | if self.num_class == 2: 116 | # 二分类,可以考虑使用sigmoid交叉熵损失函数 117 | # 将id哑编码: [None,] --> [None,num_class] 118 | labels = tf.one_hot(self.targets, depth=self.num_class) 119 | # 计算损失:([None,num_class], [None,num_class]) --> [None,] 120 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=self.logits) 121 | # 所有样本损失合并求均值:[None,] --> [] 122 | loss = tf.reduce_mean(loss) 123 | else: 124 | # 多分类,考虑使用softmax交叉熵损失函数 125 | # 基于id和logits置信度直接计算损失: ([None,], [None,num_class]) --> [None,] 126 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.targets, logits=self.logits) 127 | # 所有样本损失合并求均值:[None,] --> [] 128 | loss = tf.reduce_mean(loss) 129 | 130 | # 2. 将损失添加到collection中 131 | tf.losses.add_loss(loss) 132 | 133 | # 3. 获取所有损失合并之后的值(分类损失、正则损失等等) 134 | total_loss = tf.losses.get_total_loss(name='total_loss') 135 | 136 | # 4. 可视化操作 137 | tf.summary.scalar('loss', loss) 138 | tf.summary.scalar('total_loss', total_loss) 139 | return total_loss 140 | 141 | def optimizer(self, loss=None, return_train_operation=True): 142 | """ 143 | 构建优化器,并根据参数return_train_operation决定是否返回训练对象 144 | :param loss: 如果return_train_operation为True,那么loss参数必须有值,并且表示为损失值 145 | :param return_train_operation: True or False,True表示返回训练对象,False表示不返回 146 | :return: 如果return_train_operation为True,返回优化器以及训练操作对象,否则仅返回优化器本身 147 | """ 148 | if return_train_operation and loss is None: 149 | raise Exception("当需要返回训练对象的时候,loss参数必须有值!!") 150 | 151 | with tf.name_scope("optimizer"): 152 | # 1. 构建优化器 153 | parameters = self.optimizer_parameters_func(self.global_step) 154 | if self.optimizer_type == 'adam': 155 | opt = tf.train.AdamOptimizer(**parameters) 156 | elif self.optimizer_type == 'adadelta': 157 | opt = tf.train.AdadeltaOptimizer(**parameters) 158 | elif self.optimizer_type == 'adagrad': 159 | opt = tf.train.AdagradOptimizer(**parameters) 160 | elif self.optimizer_type == 'ftrl': 161 | opt = tf.train.FtrlOptimizer(**parameters) 162 | elif self.optimizer_type == 'momentum': 163 | opt = tf.train.MomentumOptimizer(**parameters) 164 | else: 165 | opt = tf.train.GradientDescentOptimizer(**parameters) 166 | 167 | # 2. 
构建训练对象 168 | train_op = None 169 | if return_train_operation: 170 | train_op = opt.minimize(loss=loss, global_step=self.global_step) 171 | return opt, train_op 172 | 173 | def metrics(self, loss=None): 174 | """ 175 | 构建模型的评估指标,并返回对象 176 | :param loss: 177 | :return: 178 | """ 179 | 180 | def accuracy(true_y, pre_y): 181 | with tf.name_scope("accuracy"): 182 | is_correct = tf.to_float(tf.equal(true_y, pre_y)) 183 | return tf.reduce_mean(is_correct) 184 | 185 | with tf.name_scope("metrics"): 186 | labels = self.targets 187 | predictions = self.predictions 188 | # 要求shape形状一致 189 | predictions.get_shape().assert_is_compatible_with(labels.get_shape()) 190 | # 要求数据类型一致,不一致进行转换 191 | if labels.dtype != predictions.dtype: 192 | predictions = tf.cast(predictions, labels.dtype) 193 | # 基于预测索引id和实际的索引id,构建这个准确率 194 | accuracy_ = accuracy(true_y=labels, pre_y=predictions) 195 | tf.summary.scalar('accuracy', accuracy_) 196 | 197 | metrics = Metrics(accuracy=accuracy_, recall=None, f1=None) 198 | return metrics 199 | 200 | def restore(self, checkpoint_dir, session): 201 | """ 202 | 进行模型参数恢复操作(直接恢复) 203 | :param checkpoint_dir: 204 | :param session: 205 | :return: 206 | """ 207 | # 0. 相关参数初始化 208 | if self.saver is None: 209 | self.saver = tf.train.Saver(**self.saver_parameters) 210 | 211 | # 1. 检查是否存在持久化的模型文件 212 | ckpt = tf.train.get_checkpoint_state(checkpoint_dir) 213 | # 2. 进行判断 214 | if ckpt and ckpt.model_checkpoint_path: 215 | tf.logging.info("开始进行模型恢复操作:{}".format(ckpt.model_checkpoint_path)) 216 | # 参数恢复 217 | self.saver.restore(sess=session, save_path=ckpt.model_checkpoint_path) 218 | # 恢复模型管理(保存磁盘中最多存在max_to_keep个模型) 219 | self.saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths) 220 | else: 221 | tf.logging.warn("从文件夹【{}】没有发现训练好的模型文件,不能进行模型恢复操作!!".format(checkpoint_dir)) 222 | 223 | def save(self, session, save_path): 224 | # 0. 相关参数初始化 225 | if self.saver is None: 226 | self.saver = tf.train.Saver(**self.saver_parameters) 227 | 228 | # 1. 
模型持久化 229 | tf.logging.info("进行模型持久化操作, 持久化路径为:{}".format(save_path)) 230 | self.saver.save(sess=session, save_path=save_path, global_step=self.global_step) 231 | tf.logging.info("模型持久化完成!!!") 232 | -------------------------------------------------------------------------------- /text_classsification/nets/text_adversarial_rnn_improve.py: -------------------------------------------------------------------------------- 1 | # -- encoding:utf-8 -- 2 | 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.contrib import slim 7 | 8 | from nets.metric import Metrics 9 | from nets import base_model 10 | 11 | 12 | class Network(base_model.Network): 13 | 14 | def __init__(self, with_word2vec=False, vocab_size=None, embedding_dimensions=None, 15 | embedding_table=None, train_embedding_table=False, 16 | num_class=2, network_name="TextRNN", weight_decay=0.01, 17 | optimizer_type="adam", optimizer_parameters_func=None, saver_parameters={'max_to_keep': 2}, 18 | num_units=128, layers=3, *args, **kwargs): 19 | """ 20 | :param with_word2vec: 是否使用Word2Vec训练好的转换参数作为Embedding Lookup的参赛值 21 | :param vocab_size: 词汇数目 22 | :param embedding_dimensions: Embedding Loopup转换的时候,单词转换的词向量大小 23 | :param embedding_table: 训练好的单词向量映射表 24 | :param train_embedding_table: 是否训练train_embedding_table的参数值 25 | :param num_class: 类别数目 26 | :param network_name: 网络名称 27 | :param weight_decay: L2正则项的系数 28 | :param optimizer_type: 优化器的类别 29 | :param optimizer_parameters_func: 构建优化器的参数的函数 30 | :param saver_parameters: 模型持久化器的参数 31 | :param num_units: RNN Cell中的神经元数目 32 | :param layers: RNN的层次 33 | """ 34 | self.num_units = num_units # RNN Cell的神经元数目 35 | self.layers = layers # RNN的层次 36 | self.embedding_inputs = None 37 | self.sequence_length = None 38 | 39 | super(Network, self).__init__(with_word2vec=with_word2vec, vocab_size=vocab_size, 40 | embedding_dimensions=embedding_dimensions, 41 | embedding_table=embedding_table, train_embedding_table=train_embedding_table, 42 | num_class=num_class, network_name=network_name, weight_decay=weight_decay, 43 | optimizer_type=optimizer_type, 44 | optimizer_parameters_func=optimizer_parameters_func, 45 | saver_parameters=saver_parameters) 46 | 47 | def interface(self): 48 | """ 49 | 前向网络构建 50 | batch_size: N 51 | feature height: H, 将序列长度T认为是H 52 | feature width: W,将Embedding size大小认为是W 53 | feature channel : C,一个文本就相当于一个Feature Map,通道数为1 54 | sentence_length: T 55 | embedding size: E 56 | :return: 57 | """ 58 | with tf.variable_scope(self.network_name): 59 | with slim.arg_scope(self.arg_score()): 60 | with tf.variable_scope("placeholders"): 61 | self.global_step = tf.train.get_or_create_global_step() 62 | # 输入的单词id,形状为:[N,T] 63 | self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id') 64 | # 希望输出的类别id, 形状为:[N,] 65 | self.targets = tf.placeholder(dtype=tf.int32, shape=[None], name='target_class_id') 66 | # Dropout 67 | self.dropout_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_keep_prob') 68 | # 计算序列实际长度, 最终形状为:[N,] 69 | self.sequence_length = tf.reduce_sum(tf.sign(tf.abs(self.inputs)), axis=-1) 70 | 71 | # 1. Embedding Layer 72 | self.embedding_inputs = self.embedding_lookup(self.inputs) 73 | 74 | # 2. 使用RNN来提取高阶特征 75 | with tf.variable_scope("rnn"): 76 | # a. 定义RNN的cell构建函数 77 | def cell(_units): 78 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 79 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 80 | 81 | # b. 
构建前向的cell和反向cell 82 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 83 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 84 | 85 | # c. 获取得到序列的输出向量 86 | # 数据都是按照原始的从左往右的序列得到的最终特征 87 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 88 | # 如果给定了序列的实际长度,那么在进行计算的时候,仅计算实际序列长度部分的内容,对于后面填充的内直接返回zero 89 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 90 | cell_fw, # 前向的RNN Cell 91 | cell_bw, # 反向的RNN Cell 92 | inputs=self.embedding_inputs, # 输入值, [N,T,E] 93 | dtype=tf.float32, # 给定RNN状态初始化值的类型 94 | sequence_length=self.sequence_length, # 给定序列的实际长度(因为序列是经过填充的) 95 | ) 96 | 97 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 98 | with tf.variable_scope("merge_feature"): 99 | # 4. 直接将所有时刻的输出特征值mean作为最终特征信息(由于填充位置输出是zero,所以求均值不会产生影响) 100 | # [N,T,E] --> [N,E] --> [N,E] 101 | div_denominator = tf.reshape(tf.to_float(self.sequence_length), shape=(-1, 1)) 102 | features_fw = tf.div(tf.reduce_sum(output_fw, axis=1), div_denominator) 103 | features_bw = tf.div(tf.reduce_sum(output_bw, axis=1), div_denominator) 104 | features = tf.concat([features_fw, features_bw], axis=-1) 105 | # TODO: 获取实际序列最后要给时刻的输出特征向量作为高阶向量(下周一做) 106 | 107 | # 4. FFN+Softmax做最终的决策输出 108 | with tf.variable_scope("project"): 109 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 110 | # 重命名, 得到的是N个文本属于num_class个类别的置信度 111 | self.logits = tf.identity(score, 'logits') 112 | # 得到N个文本分别属于各个类别的概率值 113 | self.probability = tf.nn.softmax(self.logits, name='probability') 114 | # 得到最终的预测id 115 | self.predictions = tf.argmax(self.logits, axis=-1, name='predictions') 116 | 117 | # 配置一个参数表示仅恢复模型参数 118 | self.saver_parameters['var_list'] = tf.global_variables() 119 | 120 | def losses(self): 121 | with tf.name_scope("loss"): 122 | # 1. 调用父类获得正常的分类损失函数 123 | total_loss = super(Network, self).losses() 124 | # 2. 加入对抗学习部分的损失函数 125 | with tf.name_scope("perturLoss"): 126 | with tf.variable_scope(self.network_name, reuse=True): 127 | # a. 在Embedding上加入噪声信息 128 | pertur_embedding_inputs = self._add_perturbation(total_loss) 129 | # b. 正常网络结构的构建 130 | with tf.variable_scope("rnn"): 131 | # a. 定义RNN的cell构建函数 132 | def cell(_units): 133 | _cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=_units) 134 | return tf.nn.rnn_cell.DropoutWrapper(cell=_cell, output_keep_prob=self.dropout_keep_prob) 135 | 136 | # b. 构建前向的cell和反向cell 137 | cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 138 | cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[cell(self.num_units) for _ in range(self.layers)]) 139 | 140 | # c. 获取得到序列的输出向量 141 | # 数据都是按照原始的从左往右的序列得到的最终特征 142 | # (正向提取特征信息[N,T,E], 反向提取特征信息[N,T,E]),(正向最终的状态信息,反向最终的状态信息) 143 | # 如果给定了序列的实际长度,那么在进行计算的时候,仅计算实际序列长度部分的内容,对于后面填充的内直接返回zero 144 | (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 145 | cell_fw, # 前向的RNN Cell 146 | cell_bw, # 反向的RNN Cell 147 | inputs=pertur_embedding_inputs, # 输入值, [N,T,E] 148 | dtype=tf.float32, # 给定RNN状态初始化值的类型 149 | sequence_length=self.sequence_length, # 给定序列的实际长度(因为序列是经过填充的) 150 | ) 151 | 152 | # 3. 将高阶特征拼接到一起,作为CNN提取出来的最终高阶特征信息 153 | with tf.variable_scope("merge_feature"): 154 | # 4. 
直接将所有时刻的输出特征值mean作为最终特征信息(由于填充位置输出是zero,所以求均值不会产生影响) 155 | # [N,T,E] --> [N,E] --> [N,E] 156 | div_denominator = tf.reshape(tf.to_float(self.sequence_length), shape=(-1, 1)) 157 | features_fw = tf.div(tf.reduce_sum(output_fw, axis=1), div_denominator) 158 | features_bw = tf.div(tf.reduce_sum(output_bw, axis=1), div_denominator) 159 | features = tf.concat([features_fw, features_bw], axis=-1) 160 | 161 | # 4. FFN+Softmax做最终的决策输出 162 | with tf.variable_scope("project"): 163 | score = slim.fully_connected(features, num_outputs=self.num_class, activation_fn=None) 164 | 165 | # 构建损失 166 | if self.num_class == 2: 167 | # 二分类,可以考虑使用sigmoid交叉熵损失函数 168 | # 将id哑编码: [None,] --> [None,num_class] 169 | labels = tf.one_hot(self.targets, depth=self.num_class) 170 | # 计算损失:([None,num_class], [None,num_class]) --> [None,] 171 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=score) 172 | # 所有样本损失合并求均值:[None,] --> [] 173 | perturLoss = tf.reduce_mean(loss) 174 | else: 175 | # 多分类,考虑使用softmax交叉熵损失函数 176 | # 基于id和logits置信度直接计算损失: ([None,], [None,num_class]) --> [None,] 177 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.targets, logits=score) 178 | # 所有样本损失合并求均值:[None,] --> [] 179 | perturLoss = tf.reduce_mean(loss) 180 | 181 | pass 182 | # 3. 合并损失 183 | total_loss = total_loss + perturLoss 184 | tf.summary.scalar('total_loss2', total_loss) 185 | tf.summary.scalar('pertur_loss', perturLoss) 186 | return total_loss 187 | 188 | def _add_perturbation(self, loss): 189 | """ 190 | 给词向量添加噪声信息 191 | :param loss: 192 | :return: 193 | """ 194 | with tf.name_scope("add_noise"): 195 | # 求解loss关于embedding input值求解对应梯度值 196 | grad, = tf.gradients(loss, self.embedding_inputs, 197 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 198 | # 停止反向传播([N,T,E]) 199 | grad = tf.stop_gradient(grad) 200 | # 计算噪声信息 201 | with tf.name_scope("noise"): 202 | # a. 求每个样本的梯度均值([N,T,E] --> [N,1,1]) 203 | alpha = tf.reduce_mean(tf.abs(grad), axis=[1, 2], keep_dims=True) + 1e-12 204 | # b. 求解L2 norm值 205 | l2_norm = alpha * tf.sqrt(tf.reduce_mean(tf.pow(grad / alpha, 2), [1, 2], keep_dims=True) + 1e-6) 206 | # c. 将grad除以l2_norm 207 | x_unit = grad / l2_norm 208 | # d. 扩展一下数据 209 | perturb = x_unit * 2.0 210 | return self.embedding_inputs + perturb 211 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /unsuper_classification.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from typing import List 4 | import re 5 | import os 6 | import jieba 7 | from langconv import Converter 8 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 9 | from gensim.models.tfidfmodel import TfidfModel 10 | from gensim import corpora, similarities 11 | from gensim.models.keyedvectors import Word2VecKeyedVectors 12 | from transformers import AlbertModel, AlbertTokenizer, BertTokenizer, AlbertForMaskedLM 13 | import torch 14 | 15 | ###################################################### 16 | # 参数, 使用tfidf+doc2vec+albert实现无监督文本分类 17 | #对内存影响较大的是文件读取, 可以改用迭代器(todo), 影响运行时间的doc2vec训练次数,albert使用的模型 18 | ###################################################### 19 | # 加载nlpcc的中文数据 20 | source_file = '/Users/admin/Downloads/nlpcc2017textsummarization/train_without_summ.txt' 21 | #把nlpcc的数据过滤后放入des_file文件夹中 22 | des_file = 'data/test/' 23 | #停止词文件 24 | stopword_file = 'stopwords.txt' 25 | #经过预处理后的文件缓存位置 26 | final_file = 'data/documents.txt' 27 | #doc2vec 模型保存文章 28 | docmodel = 'data/doc.model' 29 | #tfidf模型保存位置 30 | tfidfmodel = 'data/tfidf.model' 31 | #人工定义的文章分类的类别,标签 32 | finTags = ['明星', '诗歌', '故事', '美食', '企业', '个人', '证件', '新闻'] 33 | #停止词过滤 34 | stopwords_list = [line.rstrip() for line in open(stopword_file, encoding="utf-8")] 35 | 36 | def percent_chinese(sentence: str)-> bool: 37 | """ 38 | 过滤掉英文字符和数字占30%的文档 39 | :param sentence: 40 | :return: 41 | """ 42 | #文本总的长度 43 | tol = len(list(sentence.split())) 44 | pattern = '[a-z0-9]+' 45 | #英文和数字的长度 46 | english_count = len(re.findall(pattern, sentence)) 47 | return english_count/tol < 0.3 48 | 49 | def filter_data(): 50 | """ 51 | 过滤掉单词小于10000的文本,并且中文占比过低的文本, 保存到des_file文件夹 52 | :return: 53 | """ 54 | count = 0 55 | with open(source_file) as f: 56 | for line in f: 57 | line_dict = json.loads(line) 58 | article = line_dict['article'] 59 | if len(article) > 5000 and percent_chinese(article) : 60 | count += 1 61 | des = des_file + str(count) + '.txt' 62 | with open(des, 'w', encoding='utf-8') as wf: 63 | wf.write(article + "\n") 64 | print('生成文档的个数',count) 65 | 66 | def filter_chinese(sentence: str)-> str: 67 | """ 68 | 中文的一些预处理 69 | :param sentence: 输入的句子或文本 70 | :return: 71 | """ 72 | # 去除文本中的url 73 | # sentence = re.sub(r"http\S+", "", sentence) 74 | #剔除所有数字 75 | # decimal_regex = re.compile(r"[^a-zA-Z]\d+") 76 | # sentence = decimal_regex.sub(r"", sentence) 77 | #删除英文字符 78 | # eng_regex = re.compile(r'[a-zA-z]') 79 | # sentence = eng_regex.sub(r"", sentence) 80 | #只保留中文和标点符号 81 | words = [word for word in sentence if word >= u'\u4e00' and word <= u'\u9fa5' or word in [',','。','?','!']] 82 | sentence = ''.join(words) 83 | # 去除空格 84 | space_regex = re.compile(r"\s+") 85 | sentence = space_regex.sub(r"", sentence) 86 | # 繁体字转换成简体字 87 | sentence = Converter('zh-hans').convert(sentence) 88 | return sentence.strip().lower() 89 | 90 | def jieba_segment(sentence: str)-> str: 91 | """ 92 | jieba分词,并去掉停止词 93 | :param sentence: 94 | :return: 95 | """ 96 | sentence_list = jieba.cut(sentence) 97 | sentence_list = [w for w in sentence_list if w not in stopwords_list] 98 | sentence = ' '.join(sentence_list) 99 | return sentence 100 | 101 | def get_documents(cache=True, jieba=True)-> List: 102 | """ 103 | 返回所有文档预处理和jieba分词后的一个列表 104 | :param cache: 是否使用缓存的文件 105 | :param jieba: 是否进行分词 106 | :return: 107 | """ 108 | 
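    # Two code paths follow: if the cached corpus file final_file already exists and cache=True,
    # each cached line is read back as one document (optionally segmented with jieba); otherwise
    # every raw file under des_file is cleaned sentence by sentence with filter_chinese, joined
    # with '。', written once into final_file, and then collected (again optionally segmented).
    # Illustrative calls (matching how the script uses this function elsewhere):
    #   get_documents(cache=False, jieba=True)   # rebuild the cache and return segmented docs
    #   get_documents(cache=True, jieba=False)   # reuse the cache and keep raw text for display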
documents = [] 109 | #使用缓存文件 110 | if os.path.isfile(final_file) and cache: 111 | with open(final_file, 'r', encoding='utf-8') as file: 112 | for document in file: 113 | if jieba: 114 | document = jieba_segment(document) 115 | documents.append(document) 116 | else: 117 | #读取要处理的文件列表 118 | desfiles = os.listdir(des_file) 119 | #处理后存入到final_file单个文件 120 | with open(final_file, 'w', encoding='utf-8') as wf: 121 | for des in desfiles: 122 | document = '' 123 | with open(des_file+des, 'r', encoding='utf-8', errors='ignore') as file: 124 | for sentence in file: 125 | sentence = filter_chinese(sentence) 126 | if sentence: 127 | document = document + sentence + '。' 128 | if document: 129 | wf.write(document + "\n") 130 | if jieba: 131 | document = jieba_segment(document) 132 | documents.append(document) 133 | print("文档的个数:",len(documents)) 134 | return documents 135 | 136 | def cal_tfidf(documents, topk=10)-> List: 137 | """ 138 | tfidf模型训练 139 | :param documents: 要进行训练的文档 140 | :param topk: 提取tfidf score 的前多少个单词, 如果topk大于提取到的单词个数,返回所有单词 141 | :return: 142 | """ 143 | # 单个文档分成列表 144 | docs = [[word for word in document.split(' ')] for document in documents] 145 | # 生成字典 146 | dictionary = corpora.Dictionary(docs) 147 | # 生成bag of word 148 | docs_bow = [dictionary.doc2bow(doc) for doc in docs] 149 | if os.path.isfile(tfidfmodel): 150 | model = TfidfModel.load(tfidfmodel) 151 | else: 152 | model = TfidfModel(docs_bow) 153 | model.save(tfidfmodel) 154 | # 生成文本向量 155 | docs_vector = list(model[docs_bow]) 156 | # 对所有的文本向量进行排序,取钱topk 157 | docs_sort_vector = [sorted(doc, key=lambda x: x[1], reverse=True)[:topk] for doc in docs_vector] 158 | # 把对应的向量id转换成中文单词,docs_sort_chinese是中文单词和tfidf的score的列表 159 | docs_sort_chinese = [[(dictionary[vec[0]],vec[1]) for vec in doc] for doc in docs_sort_vector] 160 | return docs_sort_chinese 161 | 162 | def albert_model(seq_length=510, model_name='voidful/albert_chinese_xxlarge'): 163 | """ 164 | albert模型计算fintags和文档的相似度(使用余弦相似度) 165 | :param seq_length: 一个序列的最长长度 166 | :param model_name: 使用的albert的模型名称, 可选模型如下 167 | voidful/albert_chinese_tiny 168 | voidful/albert_chinese_small 169 | voidful/albert_chinese_base 170 | voidful/albert_chinese_large 171 | voidful/albert_chinese_xlarge 172 | voidful/albert_chinese_xxlarge 173 | :return: 返回所有文档和每个fintags的相似度列表 174 | """ 175 | tokenizer = BertTokenizer.from_pretrained(model_name) 176 | model = AlbertModel.from_pretrained(model_name) 177 | #不是用jieba分词 178 | docs = get_documents(cache=True, jieba=False) 179 | #用于保存所有tags的向量 180 | tags_cls = [] 181 | for tag in finTags: 182 | #对单个单词encode,生成单词对应的字典的id,是逐个字的id 183 | tag_token = tokenizer.encode(tag, add_special_tokens=True) 184 | # 转变成tensor向量,并扩充一个batch_size维度 185 | tagid = torch.tensor(tag_token).unsqueeze(0) 186 | #获取模型的输出结果 187 | outputs = model(tagid) 188 | #获取hidden_states的向量 189 | last_hidden_states = outputs[0] 190 | #获取单词的cls向量,代表整个单词的向量 191 | tag_cls = last_hidden_states[:, :1, :].squeeze(1) 192 | tags_cls.append(tag_cls) 193 | # 初始化余弦相速度 194 | cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) 195 | # 用于保存计算所有关键字和所有文档计算余弦相似度后的结果 196 | docs_similiarity = [] 197 | for doc in docs: 198 | # 对每个文档进行拆分分长度固定的句子 199 | doc_tup = [doc[i:i + seq_length] for i in range(0, len(doc), seq_length)] 200 | # 对每个文档进行换成token id,如果最后一个句子不够512,就padding到512位 201 | doc_token = tokenizer.batch_encode_plus(doc_tup, pad_to_max_length=True) 202 | # 获取生成的token id 203 | docid = torch.tensor(doc_token['input_ids']) 204 | # 放入模型 205 | outputs = model(docid) 206 | # 获取隐藏层的状态 207 | last_hidden_states = 
outputs[0] 208 | # 获取文档的cls向量,维度是[batch_size, Hidden_dimension], 这个batch_size就是上面的一个文档拆出来的每个句子,Hidden_dimension是模型的输出维度 209 | doc_cls = last_hidden_states[:,:1,:].squeeze(1) 210 | #用于保存每个fintags对这个句子的余弦相似度,就是这个类别关键字和这个句子的相似度 211 | tags_similiarity = [] 212 | for tag_cls in tags_cls: 213 | #计算余弦相似度,tag_cls的维度是[1,Hidden_dimension], doc_cls维度[batch_size, Hidden_dimension], tag_doc_simliarity的维度[batch_size] 214 | # tag_doc_simliarity 这个关键字和每个句子的余弦相似度 215 | tag_doc_simliarity = cos(tag_cls,doc_cls) 216 | # 对比这个关键字和所有句子,取最大相似度 217 | tags_similiarity.append(torch.mean(tag_doc_simliarity)) 218 | docs_similiarity.append(tags_similiarity) 219 | #用于测试,否则太慢 220 | if len(docs_similiarity) ==10: 221 | break 222 | return docs_similiarity 223 | 224 | def test_tfidf(): 225 | """ 226 | 测试tfidf的效果 227 | :return: 输出结果 228 | """ 229 | documents = get_documents() 230 | #取前20个tfidf分数最大的值 231 | res = cal_tfidf(documents, topk=100) 232 | #用于打印文档,有标点符号,比较好看 233 | documents = get_documents(cache=True, jieba=False) 234 | keywords = [] 235 | for idx, doc in enumerate(res): 236 | #取出关键tfidf文档计算得到的的关键字 237 | docword = [vec[0] for vec in doc] 238 | # 如果我们自定义的关键字在tfidf关键字列表中,就打印出来 239 | tags = [tag for tag in finTags if tag in docword] 240 | if not tags: 241 | #如果没有和给定的类别关键字重合,打印tfidf给出的前3个关键字 242 | print('没有找到和给定关键字匹配的,取tfidf的前3个关键字') 243 | tags = docword[:3] 244 | print(f"tfidf计算的最接近的keyword是: {tags}, 文档是: {documents[idx]}") 245 | keywords.append(tags[0]) 246 | print(keywords) 247 | return keywords 248 | 249 | def train_doc2vec(documents, training=False, epoch=300): 250 | """ 251 | 训练doc2vec 252 | :param documents:预处理后的文档 253 | :param training:是否继续训练 254 | :param epoch: 训练次数 255 | :return: 256 | """ 257 | # 单个文档分成列表 258 | docs = [[word for word in document.split(' ')] for document in documents] 259 | # 是否使用已缓存的模型 260 | if os.path.isfile(docmodel): 261 | model = Doc2Vec.load(docmodel) 262 | else: 263 | #使用TaggedDocument处理成文档和文档名称索引处理数据 264 | documents = [TaggedDocument(doc, tags = [i]) for i, doc in enumerate(docs)] 265 | model = Doc2Vec(documents, vector_size=100, window=6, min_count=1, workers=3, dm=1, negative=20, epochs=epoch) 266 | model.save(docmodel) 267 | #是否继续训练, 这里有bug,需要改进 268 | if training: 269 | documents = [TaggedDocument(doc, tags = [i]) for i, doc in enumerate(docs)] 270 | model.train(documents, total_examples=model.corpus_count, epochs=epoch) 271 | return model 272 | 273 | def test_doc2vec(): 274 | """ 275 | 测试doc2vec的效果 276 | :return: 输出结果 277 | """ 278 | documents = get_documents(cache=True, jieba=True) 279 | #加载模型, training继续训练模型 280 | model = train_doc2vec(documents, training=True, epoch=200) 281 | #用于打印 282 | documents = get_documents(cache=True, jieba=False) 283 | # 过滤出给的关键字fintags不在字典中的词语 ,所以这个词语没有词向量,无法计算相似度 284 | filter_tags = [tag for tag in finTags if tag in model.wv] 285 | if finTags != filter_tags: 286 | print('给定的fintags这写关键字不在doc2vec生成的字典中, 请更改关键字或者扩充训练文档, 使得训练文档包含这个关键字', set(finTags) - set(filter_tags)) 287 | tagsvec = model.wv[filter_tags] 288 | keywords = [] 289 | for idx, doc in enumerate(documents): 290 | docvec = model.docvecs[idx] 291 | #计算所有tag与这个文档的相似度 292 | tagssim = Word2VecKeyedVectors.cosine_similarities(docvec, tagsvec) 293 | maxsim = max(tagssim) 294 | keyword = finTags[list(tagssim).index(maxsim)] 295 | print(f"doc2vec计算的最接近的keyword是: {keyword}, 相似度是: {maxsim}, 文档是: {doc}") 296 | keywords.append(keyword) 297 | print(keywords) 298 | return keywords 299 | 300 | def test_albert(): 301 | """ 302 | 测试albert模型的效果 303 | :return: 304 | """ 305 | docs_similiarity = 
albert_model(model_name='voidful/albert_chinese_tiny') 306 | # docs_similiarity = albert_model(model_name='voidful/albert_chinese_base') 307 | #获取所有文档列表 308 | docs = get_documents(cache=True, jieba=False) 309 | keywords = [] 310 | for idx, doc_similiarity in enumerate(docs_similiarity): 311 | #找出最高的相似度 312 | maxsim = max(doc_similiarity) 313 | #找出最高相似度所对应的单词 314 | keyword = finTags[doc_similiarity.index(maxsim)] 315 | print(f'albert计算后的结果最相似的标签是{keyword}, 相似度是:{maxsim}, 文档是: {docs[idx]}') 316 | keywords.append(keyword) 317 | print(keywords) 318 | return keywords 319 | 320 | if __name__ == '__main__': 321 | # filter_data() 322 | # docs = get_documents(cache=False, jieba=True) 323 | # twords = test_tfidf() 324 | dwords = test_doc2vec() 325 | # awords = test_albert() 326 | # print(twords,dwords,awords) 327 | -------------------------------------------------------------------------------- /translate/do_translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2021/4/28 5:44 下午 4 | # @File : translate_ppt.py 5 | # @Author: johnson 6 | # @Desc : m2m100的翻译的API接口 7 | 8 | from flask import Flask, request, jsonify, abort 9 | import os 10 | from pptx import Presentation 11 | from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer 12 | import logging 13 | import re 14 | import torch 15 | import gc 16 | import atexit,time 17 | from apscheduler.schedulers.background import BackgroundScheduler 18 | import pandas as pd 19 | import pymysql 20 | 21 | def unload_model_schedule(): 22 | global nobusy_count 23 | global model 24 | if model.model: 25 | nobusy_count += 1 26 | app.logger.info(f"Model 是存在的,不使用GPU的时间是 {nobusy_count*10}秒") 27 | if nobusy_count >= 30: 28 | #开始清理GPU, 30次,一共300秒,即5分钟不使用GPU,就卸载 29 | app.logger.info(f"开始清理模型") 30 | model.unload_model() 31 | nobusy_count = 0 32 | 33 | scheduler = BackgroundScheduler() 34 | scheduler.add_job(func=unload_model_schedule, trigger="interval", seconds=10) 35 | scheduler.start() 36 | 37 | atexit.register(lambda: scheduler.shutdown()) 38 | 39 | app = Flask(__name__) 40 | app.config['DEBUG'] = False 41 | if os.path.exists('/data/var/log/'): 42 | app.config['API_LOG_FILE'] = '/data/var/log/translate_api.log' 43 | else: 44 | app.config['API_LOG_FILE'] = 'translate_api.log' 45 | # 日志配置信息, Running on之类的显示在日志里面 46 | if app.config['DEBUG']: 47 | logging.basicConfig(filename=app.config['API_LOG_FILE'], level=logging.DEBUG) 48 | else: 49 | logging.basicConfig(filename=app.config['API_LOG_FILE'], level=logging.INFO) 50 | 51 | class TranslateModel(object): 52 | def __init__(self, verbose=False): 53 | self.model_name = "./translate_model" 54 | self.excel_file = '翻译对照表总表.xlsx' 55 | # 判断使用的设备 56 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 57 | self.model = None 58 | self.tokenizer = None 59 | # 提取中英文的替换的关键字列表 60 | self.corres_keys = None 61 | self.cn_keys = None 62 | self.en_keys = None 63 | self.connet_mysql() 64 | # beam size 为3 65 | self.num_beams = 3 66 | # 返回句子为3 67 | self.num_return_sequences = 3 68 | # 防止重复生成单词的ngram 69 | self.no_repeat_ngram_size = 2 70 | def load_keywords_from_mysql(self): 71 | """ 72 | 不同于extract_dict从excel中加载,这里从mysql数据库加载 73 | :return: 74 | :rtype: 75 | """ 76 | self.connet_mysql() 77 | sql = 'select * from pro_translate_dict' 78 | df = pd.read_sql(sql,self.db) 79 | self.cn_keys = df['cnword'].tolist() 80 | self.en_keys = df['enword'].tolist() 81 | self.corres_keys = df['modelword'].tolist() 82 | 
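    # Note on the shared dictionary data: cn_keys, en_keys and corres_keys are three parallel
    # lists holding, per entry, the Chinese keyword, the preferred English term, and the model's
    # own translation of the Chinese keyword. start_replace() walks the three lists and, whenever
    # cn_keys[i] occurs in the Chinese source, substitutes corres_keys[i] with en_keys[i] in the
    # translated text (case-insensitive re.sub). The lists can be filled either from the Excel
    # sheet (extract_dict) or from the pro_translate_dict MySQL table, whose columns cnword,
    # enword and modelword are read in load_keywords_from_mysql and written in submit_mysql.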
83 | def connet_mysql(self): 84 | db = pymysql.connect(host="192.168.xx.xx", 85 | user="xx", 86 | password="xxx", 87 | port=3306, 88 | db='xxx', 89 | charset='utf8mb4', 90 | ) 91 | self.db = db 92 | def submit_mysql(self,cn_keys,en_keys,corres_keys): 93 | # 提交数据data到数据库表pro_translate_dict 94 | total = 0 95 | loss = 0 96 | for cnword, enword, modelword in zip(cn_keys,en_keys,corres_keys): 97 | cursor = self.db.cursor() 98 | sql = "INSERT INTO `pro_translate_dict` (`cnword`, `enword`, `modelword`) VALUES ('%s','%s', '%s')" % ( 99 | cnword, enword, modelword) 100 | try: 101 | cursor.execute(sql) 102 | self.db.commit() 103 | total += 1 104 | app.logger.info(f"提交{cnword},{enword},{modelword}数据到mysql成功") 105 | except Exception as e: 106 | self.db.rollback() 107 | app.logger.info(f"提交{cnword},{enword},{modelword}数据到mysql失败") 108 | print(e) 109 | loss += 1 110 | self.db.commit() 111 | app.logger.info(f"提交{total}条数据到mysql成功, {loss}条失败") 112 | 113 | def load_model(self): 114 | """ 115 | 加载模型 116 | :return: 117 | """ 118 | app.logger.info(f"开始加载模型") 119 | model = M2M100ForConditionalGeneration.from_pretrained(self.model_name) 120 | model.to(self.device) 121 | self.tokenizer = M2M100Tokenizer.from_pretrained(self.model_name) 122 | self.model = model 123 | 124 | def extract_dict(self, excel_file, write2mysql=False): 125 | """ 126 | 提取excel文件中的中英文关键字 127 | :param excel_file: excel文件 128 | :param write2mysql: 是否把结果写到mysql 129 | :return: 130 | :rtype: 131 | """ 132 | df = pd.read_excel(excel_file) 133 | cn_keywords = df['中文'].tolist() 134 | en_keywords = df['英文'].tolist() 135 | app.logger.info(f"处理前中文和英文单词个数分别是 {len(cn_keywords)}, {len(en_keywords)}") 136 | cn_keys = [] 137 | en_keys = [] 138 | # 处理一下/分隔的词,这样的词是多个的 139 | for cn, en in zip(cn_keywords, en_keywords): 140 | if '/' in cn: 141 | cn_words = cn.split('/') 142 | else: 143 | cn_words = [cn] 144 | if '/' in en: 145 | en_words = en.split('/') 146 | else: 147 | en_words = [en] 148 | # 中英文单词都加入单词表 149 | for c in cn_words: 150 | for e in en_words: 151 | cn_keys.append(c) 152 | en_keys.append(e) 153 | app.logger.info(f"处理后中文和英文单词个数分别是 {len(cn_keys)}, {len(en_keys)}") 154 | app.logger.info(f"中文关键字cn_keys是{cn_keys}") 155 | app.logger.info(f"英文关键字en_keys是{en_keys}") 156 | self.cn_keys = cn_keys 157 | self.en_keys = en_keys 158 | # 每个中文关键字对应的翻译结果,从模型中获取 159 | app.logger.info(f"首先处理对应的关键字的中文到英文的翻译结果") 160 | corres_keys = [] 161 | for cnkey in cn_keys: 162 | #每个中文对应的英文翻译结果 163 | corres_key = self.translate2en(text=cnkey,do_replace=False) 164 | corres_keys.append(corres_key) 165 | app.logger.info(f"corres_keys是 {corres_keys}") 166 | self.corres_keys = corres_keys 167 | if write2mysql: 168 | #开始同步到mysql数据库 169 | self.submit_mysql(cn_keys,en_keys,corres_keys) 170 | 171 | def start_replace(self, cntext, entext): 172 | """ 173 | 使用cn_keys和en_keys进行正则替换 174 | :param cntext: 175 | :param entext: 176 | :return: 177 | """ 178 | result = entext 179 | for cnkey, enkey, corres_key in zip(self.cn_keys, self.en_keys, self.corres_keys): 180 | if cnkey in cntext: 181 | result = re.sub(corres_key,enkey,entext,flags=re.I) 182 | if result != entext: 183 | app.logger.info(f"进行了替换: {entext},被从{corres_key}替换成{enkey}") 184 | return result 185 | def translate2en(self, text, do_replace=True): 186 | """ 187 | 翻译中文text到英文 188 | :param do_replace: 默认使用excel中的关键字进行替换 189 | """ 190 | global nobusy_count 191 | nobusy_count = 0 192 | if self.model is None: 193 | #如果predict_model没有加载,自动加载默认的模型 194 | self.load_model() 195 | res = re.findall('[\u4e00-\u9fa5]+', text) 196 | if not res: 197 | 
app.logger.info(f"原文是: {text}, 不包含中文字符,不需要翻译") 198 | return text 199 | self.tokenizer.src_lang = "zh" 200 | encoded_zh = self.tokenizer(text, return_tensors="pt") 201 | encoded_zh.data['attention_mask'] = encoded_zh.data['attention_mask'].to(self.device) 202 | encoded_zh.data['input_ids'] = encoded_zh.data['input_ids'].to(self.device) 203 | generated_tokens = self.model.generate(**encoded_zh, num_beams=self.num_beams, num_return_sequences=self.num_return_sequences,no_repeat_ngram_size=self.no_repeat_ngram_size, forced_bos_token_id=self.tokenizer.get_lang_id("en")) 204 | entext = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) 205 | app.logger.info(f"原文是: {text}\n模型翻译的结果是: {entext}") 206 | result_text = entext[0] 207 | for ent in entext: 208 | #如果没发现中文,就用这个翻译结果,如果发现中文,就换一个翻译结果,如果仍然有中文,那也没办法了 209 | if not re.findall('[\u4e00-\u9fa5]+', ent): 210 | result_text = ent 211 | break 212 | if do_replace: 213 | result_text = self.start_replace(cntext=text, entext=result_text) 214 | app.logger.info(f"原文是: {text}\n最终翻译的结果是: {entext}") 215 | return result_text 216 | def unload_model(self): 217 | """ 218 | 卸载模型,不把模型加载到GPU中 219 | :return: 220 | :rtype: 221 | """ 222 | app.logger.info(f"开始卸载模型") 223 | self.model = None 224 | self.tokenizer = None 225 | gc.collect() 226 | torch.cuda.empty_cache() 227 | app.logger.info(f"模型卸载完成") 228 | 229 | def slade_change(SRC_PPT, TARGET_PPT, DICT_FILE=None): 230 | """ 231 | 中文到英文PPT的翻译,SRC_PPT中文PPT,TARGET_PPT英文PPT 232 | :param DICT_FILE: excel格式的,包含中文和英文2列,用于正则替换 233 | """ 234 | if DICT_FILE: 235 | # 如果传了excel,那么用用户传的excel做替换,否则用默认的 236 | model.extract_dict(excel_file=DICT_FILE) 237 | else: 238 | model.load_keywords_from_mysql() 239 | prs = Presentation(SRC_PPT) 240 | for slide_idx, slide in enumerate(prs.slides): 241 | for shape_idx, shape in enumerate(slide.shapes): 242 | # 翻译表格 243 | if shape.has_table: 244 | for row_idx, row in enumerate(shape.table.rows): 245 | for cell_idx, cell in enumerate(row.cells): 246 | cell_text = cell.text_frame.text 247 | cell_entext = model.translate2en(cell_text) 248 | prs.slides[slide_idx].shapes[shape_idx].table.rows[row_idx].cells[cell_idx].text_frame.text = cell_entext 249 | # 图表翻译 250 | if shape.has_chart: 251 | # 图表的标题翻译 252 | title_txt = shape.chart.chart_title.text_frame.text 253 | title_entext = model.translate2en(title_txt) 254 | shape.chart.chart_title.text_frame.text = title_entext 255 | #翻译其它文本 256 | if not shape.has_text_frame: 257 | continue 258 | for paragraph_idx,paragraph in enumerate(shape.text_frame.paragraphs): 259 | paragraph_text = "" 260 | for run_idx, run in enumerate(paragraph.runs): 261 | paragraph_text = paragraph_text + run.text 262 | # 只要第一个run的txt,其它的都设为空 263 | if run_idx != 0: 264 | prs.slides[slide_idx].shapes[shape_idx].text_frame.paragraphs[paragraph_idx].runs[run_idx].text = '' 265 | if paragraph_text: 266 | entext = model.translate2en(paragraph_text) 267 | prs.slides[slide_idx].shapes[shape_idx].text_frame.paragraphs[paragraph_idx].runs[0].text = entext 268 | prs.save(TARGET_PPT) 269 | app.logger.info(f"读取{SRC_PPT},修改完成了{len(prs.slides)}页的PPT{TARGET_PPT}") 270 | return 200, "成功" 271 | 272 | @app.route("/fileTranslate", methods=['POST']) 273 | def translate(): 274 | """ 275 | 翻译api 276 | :return: 277 | """ 278 | form_dict = request.form.to_dict() 279 | json_dict = request.get_json() 280 | app.logger.info(f"用户请求的form内容是{form_dict}") 281 | app.logger.info(f"用户请求的json内容是{json_dict}") 282 | # dict_file是翻译后检查和替换的字典 283 | if json_dict: 284 | source_ppt = 
json_dict.get('inputFilePath', None) 285 | des_ppt = json_dict.get('resultFilePath', None) 286 | dict_file = json_dict.get('dictFilePath', None) 287 | else: 288 | source_ppt = form_dict.get('inputFilePath', None) 289 | des_ppt = form_dict.get('resultFilePath', None) 290 | dict_file = form_dict.get('dictFilePath', None) 291 | if source_ppt.split('.')[-1].lower() not in ['pptx']: 292 | app.logger.warning('ppt格式不符合要求') 293 | return jsonify({'ret': -101, 'msg': 'ppt格式不符合要求'}) 294 | if not os.path.exists(source_ppt): 295 | app.logger.warning('ppt文件不存在,请检查服务器上是否存在这个ppt') 296 | return jsonify({'ret': -102, 'msg': 'ppt文件不存在,请检查服务器上是否存在这个ppt'}) 297 | if dict_file and not os.path.exists(dict_file): 298 | app.logger.warning('给了dictFilePath参数,但文件不在服务器上') 299 | return jsonify({'ret': -103, 'msg': '给了dictFilePath参数,但文件不在服务器上'}) 300 | # 翻译PPT并保存到目标路径 301 | code, msg = slade_change(SRC_PPT=source_ppt, TARGET_PPT=des_ppt, DICT_FILE=dict_file) 302 | # 判断PPT翻译是否成功 303 | if code == 200: 304 | return jsonify({'ret': 0, 'msg': '成功'}) 305 | else: 306 | return jsonify({'ret': -104, 'msg': msg}) 307 | 308 | 309 | @app.route("/syncMysql", methods=['POST']) 310 | def syncmysql(): 311 | """ 312 | 把dictfile的内容写到mysql 313 | :return: 314 | """ 315 | form_dict = request.form.to_dict() 316 | json_dict = request.get_json() 317 | app.logger.info(f"准备同步excel内容到mysql数据库") 318 | app.logger.info(f"用户请求的form内容是{form_dict}") 319 | app.logger.info(f"用户请求的json内容是{json_dict}") 320 | # dict_file是翻译后检查和替换的字典 321 | if json_dict: 322 | source_ppt = json_dict.get('inputFilePath', None) 323 | des_ppt = json_dict.get('resultFilePath', None) 324 | dict_file = json_dict.get('dictFilePath', None) 325 | else: 326 | source_ppt = form_dict.get('inputFilePath', None) 327 | des_ppt = form_dict.get('resultFilePath', None) 328 | dict_file = form_dict.get('dictFilePath', None) 329 | if dict_file and not os.path.exists(dict_file): 330 | app.logger.warning('给了dictFilePath参数,但文件不在服务器上') 331 | return jsonify({'ret': -103, 'msg': '给了dictFilePath参数,但文件不在服务器上'}) 332 | # 提取excel中的中英文对照并写入mysql 333 | model.extract_dict(excel_file=dict_file,write2mysql=True) 334 | return jsonify({'ret': 0, 'msg': '成功'}) 335 | 336 | if __name__ == "__main__": 337 | # 预训练模型 338 | model = TranslateModel() 339 | nobusy_count = 0 340 | # slade_change(SRC_PPT="图表翻译.pptx",TARGET_PPT="表格翻译-英文.pptx") 341 | app.run(host='0.0.0.0', port=3325, debug=False, threaded=True) 342 | --------------------------------------------------------------------------------
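For reference, a minimal client-side sketch of calling the /fileTranslate endpoint above. The field names (inputFilePath, resultFilePath, dictFilePath), the port 3325 and the ret/msg response format come from the Flask handlers in do_translate.py; the host and the concrete file paths are placeholders, and every path must already exist on the machine that runs the API.

```python
import requests

# Hypothetical server-side paths; replace with real paths on the API host.
payload = {
    "inputFilePath": "/data/ppt/源文件.pptx",        # Chinese PPT to translate (must exist on the server)
    "resultFilePath": "/data/ppt/result_en.pptx",    # where the translated PPT will be written
    "dictFilePath": "/data/ppt/翻译对照表总表.xlsx",  # optional Excel dictionary with 中文/英文 columns
}

resp = requests.post("http://127.0.0.1:3325/fileTranslate", json=payload)
print(resp.json())  # {'ret': 0, 'msg': '成功'} on success; negative ret codes signal errors
```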