├── README.md ├── data ├── example.json └── records │ └── Info.txt ├── data_process ├── file2file.py ├── generate_json.py └── info.txt ├── tool ├── basewin.py └── mainwin.py ├── 标注主界面.png └── 标注结果示例.jpg /README.md: -------------------------------------------------------------------------------- 1 | # =======标注工具======= 2 | # *功能介绍* 3 | ## 1、为当前文本打标签(可以多个) 4 | ## 2、提取文本中的“地名”、“人名”、“中心词”。(每项均可有多个值) 5 | ### --------------------------------------------------------------------------------------------------------------- 6 | # *python环境配置及外部依赖包* 7 | ## 1、python2.7 8 | ## 2、wxPython4.0.4 9 | ## 3、其余python版本也可以,代码适用python2及python3 10 | ### --------------------------------------------------------------------------------------------------------------- 11 | # *文件说明* 12 | ## data文件夹 13 | ### ----data/example.json:包含待标注文件 14 | ### ----data/records/record_01.txt:标注结果 15 | ## data_process文件夹 16 | ### ----file2file.py:文件处理类 17 | ### ----generate_json.py:生成最终的符合标注要求的.json数据 18 | ## tool文件夹 19 | ### ----basewin.py:主要是界面布局 20 | ### ----mainwin.py:包含标注程序的主要逻辑,包含主函数,继承了basewin.py中的类。 21 | ### --------------------------------------------------------------------------------------------------------------- 22 | # *使用步骤* 23 | ## 运行mainwin.py: 24 | ### 1、在运行mainwin.py前,需要依据自己的.json数据以及标签(tags)需求,对标注工具进行一些设置。 25 | ### 2、设置分为3步 26 | ### 3、查看mainwin.py,即可查看3步设置的提示,依次进行设置。 27 | ## 标注具体操作: 28 | ### 1、单击tag按钮,就为该文本选择了该标签(可点击多个)。 29 | ### 2、点击选中“地名”或“人名”或“中心词”后,鼠标在上述文本中,拖动,选中内容,可选多个内容,就为这个选中的标签赋值了。 30 | ### 3、点击“下一条”,进入下一条数据的标注。 31 | ### 4、关闭主窗口后(程序结束运行),已标注文件会保存,下次重新运行程序时,程序会自动从上次标注位置开始载入数据。 32 | ### 5、标注完成。 33 | ### --------------------------------------------------------------------------------------------------------------- 34 | # *界面展示* 35 | ## 主界面 36 | ![](https://github.com/GHY73/BiaoZhuTool/blob/master/%E6%A0%87%E6%B3%A8%E4%B8%BB%E7%95%8C%E9%9D%A2.png?raw=true) 37 | ## 标注结果 38 | ![](https://github.com/GHY73/BiaoZhuTool/blob/master/%E6%A0%87%E6%B3%A8%E7%BB%93%E6%9E%9C%E7%A4%BA%E4%BE%8B.jpg?raw=true) 39 | ### --------------------------------------------------------------------------------------------------------------- 40 | # *致谢* 41 | ## 本标注工具代码改编自下面这位大佬,附上大佬的博客:https://www.cnblogs.com/JohnRain/ 42 | -------------------------------------------------------------------------------- /data/example.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "0": { 4 | "id": "01050243571530377778", 5 | "time": "1921-05-02", 6 | "author": "鲁迅", 7 | "title": "《故乡》", 8 | "content": "《故乡》是现代文学家鲁迅于1921年创作的一篇短篇小说。小说以“我”回故乡的活动为线索,按照“回故乡”——“在故乡”——“离故乡”的情节安排,依据“我”的所见所闻所忆所感,着重描写了闰土和杨二嫂的人物形象,从而反映了辛亥革命前后农村破产、农民痛苦生活的现实;同时深刻指出了由于受封建社会传统观念的影响,劳苦大众所受的精神上的束缚,造成纯真的人性的扭曲,造成人与人之间的冷漠、隔膜,表达了作者对现实的强烈不满和改造旧社会、创造新生活的强烈愿望。该小说入选初中语文,人民教育出版社九年级(上册)" 9 | }, 10 | "1": { 11 | "id": "01050251921530374299", 12 | "time": "2019-05-02", 13 | "author": "老舍", 14 | "title": "《骆驼祥子》", 15 | "content": "《骆驼祥子》是老舍的代表作之一,以现实主义的笔法与悲天悯人的情怀,塑造了祥子、虎妞等一批令人难忘的艺术形象,在中国现代文学历史上拥有重要地位。" 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /data/records/Info.txt: -------------------------------------------------------------------------------- 1 | # 这里保存标注好的文件。 2 | -------------------------------------------------------------------------------- /data_process/file2file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 19-3-20 下午3:18 3 | # @Author : HaiYang Gao 4 | # @OS: ubuntu 16.04 5 | # @File : file2file.py 6 | import pandas as pd 7 | import re 8 | import jieba 9 | 10 | class file2file: 11 | 12 | def gbk_csv2utf8_csv(self, inpath, outpath, splitchar=','): 13 | """国标csv文件-->utf-8csv文件(处理乱码)""" 14 | all_datas = [] 15 | with open(inpath, mode='r') as f: 16 | for line in f: 17 | # ,分行 18 | data = line.strip().split(splitchar) 19 | # 添加数据 20 | all_datas.append(data) 21 | # 每行转为指定编码的字符串 22 | str1 = ','.join(data).encode('utf-8') 23 | # 解码写入,添加写:a+ 24 | open(outpath, mode='a+').write('\n' + str1.decode('utf-8')) 25 | 26 | def csv2json(self,inpath,outpath,orient_str='index'): 27 | """以csv列号为key,构建对应的json数据,默认orient='index'""" 28 | # 读入csv文件 29 | data = pd.read_csv(inpath) 30 | # 指定orient及其他参数 31 | data.to_json(outpath, orient=orient_str, force_ascii=False) 32 | 33 | def csv2pickle(self,inpath,outpath): 34 | """csv文件转为pickle文件""" 35 | tempdata = pd.read_csv(inpath) 36 | tempdata.to_pickle(outpath) 37 | 38 | def full_csv2part_csv_bycolumn(self,inpath,outpath,column_number_list,head_name_list,index=False): 39 | """column_number_list,head_name_list大小相同,列号与名字一一对应""" 40 | data = pd.read_csv(inpath, usecols=column_number_list) 41 | data.to_csv(outpath, encoding="utf-8-sig", index=index,header=head_name_list) 42 | 43 | def full_csv2part_csv_byrows(self,inpath,outpath,skiprow,nrow,index=False,index_label='index'): 44 | """读取指定行,skip表示跳过多少行,nrow表示读多少行,第一行header不计入行数""" 45 | data = pd.read_csv(inpath, skiprows=skiprow, nrows=nrow) 46 | data.to_csv(outpath,index=index,index_label=index_label) 47 | 48 | def full_csv2part_txt(self,inpath,outpath,column_name_list): 49 | """从csv中提取出某几列,构造新的txt,column_name_list指定列名""" 50 | data = pd.read_csv(inpath) 51 | content = [] 52 | # 自定义正则,去掉括号及内容 53 | pattern = '\\(.*?\\)|\\{|\\}|\\[.*?]|\\<.*?>' 54 | column_num = len(column_name_list) 55 | for i in range(column_num): 56 | content.append(list(data[column_name_list[i]])) 57 | # csv文件:每列长度是一样的 58 | length = len(content[0]) 59 | for i in range(0, length): 60 | temp_str = '' 61 | for j in range(column_num): 62 | if j > 0: 63 | # '+x+'给后面的空格占个位置,if控制位置,不要占错 64 | temp_str = temp_str + '+×+' + str(content[j][i]) 65 | else: 66 | temp_str = temp_str + str(content[j][i]) 67 | # 去掉乱七八糟的字符,手动定义 68 | temp_str = temp_str.replace('\r', '').replace('\n', '').replace(' ', '').replace(' ','' ) 69 | temp_str = re.sub(pattern,'',temp_str) 70 | # 加上换行符 71 | temp_str = temp_str + '\n' 72 | # 更换占位符为' ' 73 | temp_str = temp_str.replace('+×+', ' ') 74 | # 每次写入1行,io没有优化 75 | open(outpath, 'a+').write(temp_str) 76 | 77 | def txt2csv(self,inpath,outpath,columnnamelist,index=False,splitchar=' ',index_label='index'): 78 | """读取txt文件,按txt的列,转为csv文件,手动指定csv各列列名list""" 79 | datas = self.readtxt(inpath,splitchar=splitchar) 80 | columns = columnnamelist 81 | # 构造DataFrame格式数据 82 | csvdatas = pd.DataFrame(columns=columns, data=datas) 83 | csvdatas.to_csv(outpath,index=index, sep='\t', index_label=index_label) 84 | 85 | def txt2pickle(self,inpath,outpath,columnnamelist): 86 | """读取txt文件,按txt的列,转为pickle文件,手动指定pickle各列列名list""" 87 | datas = self.readtxt(inpath,splitchar=' ') 88 | columns = columnnamelist 89 | # 构造DataFrame格式数据 90 | pkdatas = pd.DataFrame(columns=columns,data=datas) 91 | pkdatas.to_pickle(outpath) 92 | 93 | def readjson(self,inpath,orient_str): 94 | """读json数据,不同格式的json数据,orient_str不一样""" 95 | data = pd.read_json(inpath, orient=orient_str) 96 | return data 97 | 98 | def readtxt(self,inpath,splitchar=' '): 99 | """读取txt,返回list,txt默认以‘空格’分行""" 100 | datas = [] 101 | with open(inpath, 'r') as f: 102 | for line in f: 103 | content = line.strip().split(splitchar) 104 | # 不空 105 | if content: 106 | datas.append(content) 107 | return datas 108 | 109 | def readcsv(self,inpath): 110 | """读取csv,返回DataFrame""" 111 | data = pd.read_csv(inpath) 112 | return data 113 | 114 | def readpickle(self,inpath): 115 | """读取pickle,返回DataFrame""" 116 | data = pd.read_pickle(inpath) 117 | return data 118 | 119 | def twoDlist2txt(self,twoDlist, outpath, linesplitor=' ', joinlink=','): 120 | temp = '' 121 | count = 0 122 | for i in twoDlist: 123 | # 每个子list字符化,构成字符串 124 | temp = temp + joinlink.join(i) + '\n' 125 | count += 1 126 | # 分批写入,注意temp重置 127 | if count % 200 == 0: 128 | open(outpath, 'a+').write(temp) 129 | temp = '' 130 | open(outpath, 'a+').write(temp) 131 | 132 | def txt2jieba_stopwords(self,txtinpath,stopwordspath,outpath,splitchar=' '): 133 | # 读入待处理数据 134 | datas = self.readtxt(txtinpath, splitchar=splitchar) 135 | # 读入停用词,并且使list一维化 136 | stopwords = set(sum(self.readtxt(stopwordspath), [])) 137 | labels = [] 138 | contents = [] 139 | # 循环处理 140 | count = 0 141 | for i in datas: 142 | count += 1 143 | labels.append(i[0]) 144 | # 分词 145 | temp = i[1].replace(' ','') 146 | content = jieba.lcut(temp.replace(' ','')) 147 | # k = [] 148 | # for each in content: 149 | # k.append(each.encode('utf-8')) 150 | # 去停用词 151 | cut_data = [word for word in content if word not in stopwords] 152 | contents.append(' '.join(cut_data)) 153 | if count % 50 == 0: 154 | print('分词到第{}条'.format(count)) 155 | # 写入本地 156 | write_txt = '' 157 | for i in range(len(labels)): 158 | write_txt = write_txt + labels[i] + '\t' + contents[i] + '\n' 159 | if i % 50 == 0: 160 | print('deal with {}'.format(i)) 161 | open(outpath, 'a+').write(write_txt) 162 | write_txt = '' 163 | open(outpath, 'a+').write(write_txt) 164 | 165 | 166 | if __name__ == '__main__': 167 | print('file2file.py is running...') 168 | -------------------------------------------------------------------------------- /data_process/generate_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | 把.csv格式的数据,转化成标注工具需要的.json数据 3 | 把.csv文件也放在这个目录下,运行程序后,.json会生成在data文件夹下 4 | """ 5 | from utils import file2file 6 | f2f = file2file.file2file() 7 | 8 | # 固定参数 9 | outpath = '../data/' 10 | # 可变参数 11 | infilename = 'GuangQing_10w_V1.csv' 12 | outfilename = 'GuangQing_10w_V1_BiaoZhu.json' 13 | # 不变语句 14 | f2f.csv2json(inpath=infilename,outpath=outpath+outfilename) 15 | # 对生成好的json外面嵌套一层{'data:'my.json} 16 | datas = open(outpath + outfilename,'r').read() 17 | new_datas = '{"data": ' + datas + '}' 18 | open(outpath + outfilename,'w').write(new_datas) 19 | print('==========Translate Over==========') 20 | -------------------------------------------------------------------------------- /data_process/info.txt: -------------------------------------------------------------------------------- 1 | ## 具体处理细节还需要自己来,file2file.py只提供功能有限的部分函数 2 | -------------------------------------------------------------------------------- /tool/basewin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 19-5-2 下午5:07 3 | # @Author : HaiYang Gao 4 | # @OS: ubuntu 16.04 5 | # @File : basewin.py 6 | 7 | import wx 8 | 9 | """ 10 | =============================step2:修改tag按钮的值================================= 11 | 选择性更改从line43到line108的‘自定义按钮区’的按钮值 12 | 1个修改的例子: 13 | 原句: 14 | self.m_toggleBtn1 = wx.ToggleButton(self, wx.ID_ANY, u"1.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 15 | 要改的参数:u"1.自定义按钮"--->u"体育" 16 | """ 17 | 18 | 19 | class MyFrame1(wx.Frame): 20 | 21 | def __init__(self, parent): 22 | wx.Frame.__init__(self, parent, id=wx.ID_ANY, title=wx.EmptyString, pos=wx.DefaultPosition, 23 | size=wx.Size(748, 748), style=wx.DEFAULT_FRAME_STYLE | wx.TAB_TRAVERSAL) 24 | 25 | self.SetSizeHint(wx.DefaultSize, wx.DefaultSize) 26 | self.SetBackgroundColour(wx.SystemSettings.GetColour(wx.SYS_COLOUR_ACTIVECAPTION)) 27 | 28 | fgSizer3 = wx.FlexGridSizer(4, 1, 0, 0) 29 | fgSizer3.SetFlexibleDirection(wx.BOTH) 30 | fgSizer3.SetNonFlexibleGrowMode(wx.FLEX_GROWMODE_SPECIFIED) 31 | 32 | main_text = wx.FlexGridSizer(2, 1, 0, 0) 33 | main_text.SetFlexibleDirection(wx.BOTH) 34 | main_text.SetNonFlexibleGrowMode(wx.FLEX_GROWMODE_SPECIFIED) 35 | 36 | self.title = wx.StaticText(self, wx.ID_ANY, u"title", wx.DefaultPosition, wx.DefaultSize, wx.ALIGN_CENTRE) 37 | self.title.Wrap(-1) 38 | self.title.SetMinSize(wx.Size(730, -1)) 39 | 40 | main_text.Add(self.title, 0, wx.ALIGN_CENTER | wx.ALL, 5) 41 | 42 | self.content = wx.TextCtrl(self, wx.ID_ANY, wx.EmptyString, wx.DefaultPosition, wx.Size(730, 200), 43 | wx.TE_MULTILINE | wx.TE_RICH | wx.TE_RICH2) 44 | main_text.Add(self.content, 0, wx.ALL, 5) 45 | 46 | fgSizer3.Add(main_text, 1, wx.ALIGN_CENTER, 5) 47 | 48 | sbSizer12 = wx.StaticBoxSizer(wx.StaticBox(self, wx.ID_ANY, u"tag类别"), wx.VERTICAL) 49 | 50 | gSizer8 = wx.GridSizer(4, 5, 0, 0) 51 | 52 | """ 53 | ========================自定义按钮区========================================= 54 | 自己修改tag按钮的名字,本次提供20个可选的tag。 55 | 其余代码不要变动有任何变动,包括顺序 56 | """ 57 | self.m_toggleBtn1 = wx.ToggleButton(self, wx.ID_ANY, u"1.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 58 | 59 | gSizer8.Add(self.m_toggleBtn1, 0, wx.ALIGN_CENTER | wx.ALL, 5) 60 | 61 | self.m_toggleBtn2 = wx.ToggleButton(self, wx.ID_ANY, u"2.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 62 | gSizer8.Add(self.m_toggleBtn2, 0, wx.ALIGN_CENTER | wx.ALL, 5) 63 | 64 | self.m_toggleBtn3 = wx.ToggleButton(self, wx.ID_ANY, u"3.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 65 | gSizer8.Add(self.m_toggleBtn3, 0, wx.ALIGN_CENTER | wx.ALL, 5) 66 | 67 | self.m_toggleBtn4 = wx.ToggleButton(self, wx.ID_ANY, u"4.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 68 | gSizer8.Add(self.m_toggleBtn4, 0, wx.ALIGN_CENTER | wx.ALL, 5) 69 | 70 | self.m_toggleBtn5 = wx.ToggleButton(self, wx.ID_ANY, u"5.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 71 | gSizer8.Add(self.m_toggleBtn5, 0, wx.ALIGN_CENTER | wx.ALL, 5) 72 | 73 | self.m_toggleBtn6 = wx.ToggleButton(self, wx.ID_ANY, u"6.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 74 | gSizer8.Add(self.m_toggleBtn6, 0, wx.ALIGN_CENTER | wx.ALL, 5) 75 | 76 | self.m_toggleBtn7 = wx.ToggleButton(self, wx.ID_ANY, u"7.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 77 | gSizer8.Add(self.m_toggleBtn7, 0, wx.ALIGN_CENTER | wx.ALL, 5) 78 | 79 | self.m_toggleBtn8 = wx.ToggleButton(self, wx.ID_ANY, u"8.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 80 | gSizer8.Add(self.m_toggleBtn8, 0, wx.ALIGN_CENTER | wx.ALL, 5) 81 | 82 | self.m_toggleBtn9 = wx.ToggleButton(self, wx.ID_ANY, u"9.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 83 | gSizer8.Add(self.m_toggleBtn9, 0, wx.ALIGN_CENTER | wx.ALL, 5) 84 | 85 | self.m_toggleBtn10 = wx.ToggleButton(self, wx.ID_ANY, u"10.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 86 | gSizer8.Add(self.m_toggleBtn10, 0, wx.ALIGN_CENTER | wx.ALL, 5) 87 | 88 | self.m_toggleBtn11 = wx.ToggleButton(self, wx.ID_ANY, u"11.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 89 | gSizer8.Add(self.m_toggleBtn11, 0, wx.ALIGN_CENTER | wx.ALL, 5) 90 | 91 | self.m_toggleBtn12 = wx.ToggleButton(self, wx.ID_ANY, u"12.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 92 | gSizer8.Add(self.m_toggleBtn12, 0, wx.ALIGN_CENTER | wx.ALL, 5) 93 | 94 | self.m_toggleBtn13 = wx.ToggleButton(self, wx.ID_ANY, u"13.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 95 | gSizer8.Add(self.m_toggleBtn13, 0, wx.ALIGN_CENTER | wx.ALL, 5) 96 | 97 | self.m_toggleBtn14 = wx.ToggleButton(self, wx.ID_ANY, u"14.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 98 | gSizer8.Add(self.m_toggleBtn14, 0, wx.ALIGN_CENTER | wx.ALL, 5) 99 | 100 | self.m_toggleBtn15 = wx.ToggleButton(self, wx.ID_ANY, u"15.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 101 | gSizer8.Add(self.m_toggleBtn15, 0, wx.ALIGN_CENTER | wx.ALL, 5) 102 | 103 | self.m_toggleBtn16 = wx.ToggleButton(self, wx.ID_ANY, u"16.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 104 | gSizer8.Add(self.m_toggleBtn16, 0, wx.ALIGN_CENTER | wx.ALL, 5) 105 | 106 | self.m_toggleBtn17 = wx.ToggleButton(self, wx.ID_ANY, u"17.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 107 | gSizer8.Add(self.m_toggleBtn17, 0, wx.ALIGN_CENTER | wx.ALL, 5) 108 | 109 | self.m_toggleBtn18 = wx.ToggleButton(self, wx.ID_ANY, u"18.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 110 | gSizer8.Add(self.m_toggleBtn18, 0, wx.ALIGN_CENTER | wx.ALL, 5) 111 | 112 | self.m_toggleBtn19 = wx.ToggleButton(self, wx.ID_ANY, u"19.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 113 | gSizer8.Add(self.m_toggleBtn19, 0, wx.ALIGN_CENTER | wx.ALL, 5) 114 | 115 | self.m_toggleBtn20 = wx.ToggleButton(self, wx.ID_ANY, u"20.自定义按钮", wx.DefaultPosition, wx.DefaultSize, 0) 116 | gSizer8.Add(self.m_toggleBtn20, 0, wx.ALIGN_CENTER | wx.ALL, 5) 117 | """ 118 | ========================自定义按钮区末尾========================================= 119 | =======step2调整结束,step3在mainwin.py中,对write_a_record()函数进行更改========== 120 | """ 121 | 122 | self.m_toggleBtn1.SetValue(True) 123 | 124 | sbSizer12.Add(gSizer8, 1, wx.EXPAND, 5) 125 | 126 | fgSizer3.Add(sbSizer12, 1, wx.EXPAND, 5) 127 | 128 | sbSizer13 = wx.StaticBoxSizer(wx.StaticBox(self, wx.ID_ANY, u"关键词抽取"), wx.VERTICAL) 129 | 130 | gSizer12 = wx.GridSizer(6, 1, 0, 0) 131 | 132 | self.address_radio = wx.RadioButton(self, wx.ID_ANY, u"地名: ", wx.DefaultPosition, wx.DefaultSize, 0) 133 | gSizer12.Add(self.address_radio, 0, wx.ALL, 5) 134 | 135 | self.add_words = wx.TextCtrl(self, wx.ID_ANY, wx.EmptyString, wx.DefaultPosition, wx.DefaultSize, 136 | 0 | wx.NO_BORDER) 137 | self.add_words.SetBackgroundColour(wx.SystemSettings.GetColour(wx.SYS_COLOUR_ACTIVECAPTION)) 138 | self.add_words.SetMinSize(wx.Size(710, -1)) 139 | 140 | gSizer12.Add(self.add_words, 0, wx.ALL, 5) 141 | 142 | self.name_radio = wx.RadioButton(self, wx.ID_ANY, u"人员: ", wx.DefaultPosition, wx.DefaultSize, 0) 143 | gSizer12.Add(self.name_radio, 0, wx.ALL, 5) 144 | 145 | self.name_words = wx.TextCtrl(self, wx.ID_ANY, wx.EmptyString, wx.DefaultPosition, wx.Size(710, 20), 146 | 0 | wx.NO_BORDER) 147 | self.name_words.SetBackgroundColour(wx.SystemSettings.GetColour(wx.SYS_COLOUR_ACTIVECAPTION)) 148 | 149 | gSizer12.Add(self.name_words, 0, wx.ALL, 5) 150 | 151 | self.keyword_radio = wx.RadioButton(self, wx.ID_ANY, u"中心词: ", wx.DefaultPosition, wx.DefaultSize, 0) 152 | gSizer12.Add(self.keyword_radio, 0, wx.ALL, 5) 153 | 154 | self.key_words = wx.TextCtrl(self, wx.ID_ANY, wx.EmptyString, wx.DefaultPosition, wx.Size(710, -1), 155 | 0 | wx.NO_BORDER) 156 | self.key_words.SetBackgroundColour(wx.SystemSettings.GetColour(wx.SYS_COLOUR_ACTIVECAPTION)) 157 | 158 | gSizer12.Add(self.key_words, 0, wx.ALL, 5) 159 | 160 | sbSizer13.Add(gSizer12, 1, wx.EXPAND, 5) 161 | 162 | fgSizer3.Add(sbSizer13, 1, wx.EXPAND, 5) 163 | 164 | gSizer9 = wx.GridSizer(1, 2, 10, 0) 165 | 166 | gSizer9.SetMinSize(wx.Size(-1, 50)) 167 | self.m_button40 = wx.Button(self, wx.ID_ANY, u"下一条", wx.DefaultPosition, wx.Size(200, -1), 0) 168 | gSizer9.Add(self.m_button40, 0, wx.ALIGN_CENTER | wx.ALL, 5) 169 | 170 | self.m_button41 = wx.Button(self, wx.ID_ANY, u"重载(暂时无用)", wx.DefaultPosition, wx.Size(200, -1), 0) 171 | gSizer9.Add(self.m_button41, 0, wx.ALIGN_CENTER | wx.ALL, 5) 172 | 173 | fgSizer3.Add(gSizer9, 1, wx.EXPAND, 5) 174 | 175 | self.SetSizer(fgSizer3) 176 | self.Layout() 177 | 178 | self.Centre(wx.BOTH) 179 | 180 | # Connect Events 181 | self.content.Bind(wx.EVT_LEFT_UP, self.get_sellect_words) 182 | self.m_toggleBtn1.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 183 | self.m_toggleBtn2.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 184 | self.m_toggleBtn3.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 185 | self.m_toggleBtn4.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 186 | self.m_toggleBtn5.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 187 | self.m_toggleBtn7.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 188 | self.m_toggleBtn9.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 189 | self.m_toggleBtn10.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 190 | self.m_toggleBtn11.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 191 | self.m_toggleBtn13.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 192 | self.m_toggleBtn14.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 193 | self.m_toggleBtn15.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 194 | self.m_toggleBtn17.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 195 | self.m_toggleBtn19.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 196 | self.m_toggleBtn20.Bind(wx.EVT_TOGGLEBUTTON, self.toggle_label) 197 | self.address_radio.Bind(wx.EVT_RADIOBUTTON, self.select_address) 198 | self.name_radio.Bind(wx.EVT_RADIOBUTTON, self.select_name) 199 | self.keyword_radio.Bind(wx.EVT_RADIOBUTTON, self.select_keyword) 200 | self.m_button40.Bind(wx.EVT_BUTTON, self.next_doc) 201 | self.m_button41.Bind(wx.EVT_BUTTON, self.reload) 202 | 203 | def __del__(self): 204 | pass 205 | 206 | def get_sellect_words(self, event): 207 | event.Skip() 208 | 209 | def toggle_label(self, event): 210 | event.Skip() 211 | 212 | def select_address(self, event): 213 | event.Skip() 214 | 215 | def select_name(self, event): 216 | event.Skip() 217 | 218 | def select_keyword(self, event): 219 | event.Skip() 220 | 221 | def next_doc(self, event): 222 | event.Skip() 223 | 224 | def reload(self, event): 225 | event.Skip() 226 | -------------------------------------------------------------------------------- /tool/mainwin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 19-5-2 下午5:06 3 | # @Author : HaiYang Gao 4 | # @OS: ubuntu 16.04 5 | # @File : mainwin.py 6 | 7 | from basewin import MyFrame1 8 | import wx 9 | import os 10 | import codecs 11 | import platform 12 | import json 13 | import sys 14 | import collections 15 | 16 | """ 17 | ==============================step1:参数自定义======================================= 18 | 自定义标注参数,自行调整; 19 | 通常修改record_path,id_str,content_str就可以了 20 | """ 21 | Numbers = 99999 # 待标注的数据条数 22 | MaxLengthOfKeyWord = 15 # 中心词最大长度 23 | Tag_flag = 0 # value = 0 or -1; 如果Tag_flag=-1,则可以不选tag,也可以进行下一条内容的标注 24 | data_path = '../data/example.json' # 待标注文件的路径 25 | record_path = '../data/records/record_01.txt' # 标注文件的位置 26 | # id_str改为自己json文件对应的键值,content_str改为想显示的内容 27 | # 中文键要在字符串前加上u,比如id_str = u'序号' 28 | id_str = 'id' # 主键,唯一 29 | content_str = 'content' # 文本内容 30 | """===============================step1调整结束,step2在basewin.py中=====================""" 31 | sys.setrecursionlimit(Numbers + 100) 32 | 33 | 34 | class MainWindows(MyFrame1): 35 | record_file = record_path 36 | data_file = data_path 37 | 38 | def __init__(self, title): 39 | MyFrame1.__init__(self, None) 40 | self.cur_progress = 0 # 记录当前标注的条数 41 | self.which_words_items = -1 # 记录哪种类型的关键词 42 | self.textboxfont = wx.Font(12, wx.DEFAULT, wx.NORMAL, wx.NORMAL) 43 | self.content.SetFont(self.textboxfont) 44 | self.read_data() 45 | self.load_record() 46 | self.reinit_record() 47 | self.print_content() 48 | self.keyword_radio.item_name = 'keyword' 49 | self.name_radio.item_name = 'name' 50 | self.address_radio.item_name = 'address' 51 | self.add_words.item_name = 'address' 52 | self.name_words.item_name = 'name' 53 | self.key_words.item_name = 'keyword' 54 | 55 | # 初始化 56 | def reinit_record(self): 57 | self.tags = [] 58 | self.address = [] 59 | self.person = [] 60 | self.keyword = [] 61 | self.which_words_items = -1 62 | self.print_keywords() 63 | 64 | for i in range(1, 21): 65 | self.__getattribute__('m_toggleBtn' + str(i)).SetValue(False) 66 | 67 | self.keyword_radio.SetValue(False) 68 | self.name_radio.SetValue(False) 69 | self.address_radio.SetValue(False) 70 | 71 | def read_data(self): 72 | with codecs.open(self.data_file, 'r', encoding='utf-8') as f: 73 | data = json.load(f, object_pairs_hook=collections.OrderedDict) 74 | self.data = [] 75 | for _, v in data['data'].items(): 76 | self.data.append(v) 77 | self.data_size = len(self.data) 78 | 79 | def print_content(self): 80 | cur_doc = self.data[self.cur_progress] 81 | if cur_doc[id_str] in self.ids: 82 | self.cur_progress += 1 83 | return self.print_content() 84 | # 标注界面显示的标题 85 | title = str(cur_doc[id_str]) + '-----' + u'这里可以继续添加标题内容,当前显示了数据的id' 86 | content = cur_doc[content_str] 87 | self.title.SetLabel(title) 88 | self.redefine_Textctrl_write(self.content, content) 89 | 90 | # 定义按钮事件 91 | def toggle_label(self, event): 92 | eo = event.GetEventObject() 93 | if eo.GetValue(): 94 | self.tags.append(eo.GetLabel()) 95 | else: 96 | idx = self.tags.index(eo.GetLabel()) 97 | del (self.tags[idx]) 98 | 99 | # 加载标注记录, 获得续标断点 100 | def load_record(self): 101 | self.ids = [] 102 | if os.path.exists(self.record_file): 103 | with codecs.open(self.record_file, 'r', encoding='utf-8') as f: 104 | records = [d for d in f.read().split('\n') if len(d)] 105 | for rd in records: 106 | self.ids.append(json.loads(rd)[id_str]) 107 | 108 | # 显示选择的数据 109 | def print_keywords(self): 110 | if len(self.address): 111 | self.redefine_Textctrl_write(self.add_words, '(' + '),('.join(self.address) + ')') 112 | else: 113 | self.redefine_Textctrl_write(self.add_words, '') 114 | if len(self.person): 115 | self.redefine_Textctrl_write(self.name_words, '(' + '),('.join(self.person) + ')') 116 | else: 117 | self.redefine_Textctrl_write(self.name_words, '') 118 | if len(self.keyword): 119 | self.redefine_Textctrl_write(self.key_words, '(' + '),('.join(self.keyword) + ')') 120 | else: 121 | self.redefine_Textctrl_write(self.key_words, '') 122 | 123 | """ 124 | =============================step3:选择写入文件的内容============================= 125 | """ 126 | 127 | # 写入记录 128 | def write_a_record(self): 129 | save_record = collections.OrderedDict() 130 | 131 | save_record[id_str] = self.data[self.cur_progress][id_str] 132 | save_record[content_str] = self.data[self.cur_progress][content_str] 133 | save_record[u'地名'] = self.address 134 | save_record[u'人员'] = self.person 135 | save_record[u'中心词'] = self.keyword 136 | save_record['tags'] = self.tags 137 | """ 138 | 例子:添加一个写入。 139 | save_record[u'作者'] = self.data[self.cur_progress]['author'] 140 | save_record['time'] = self.data[self.cur_progress]['time'] 141 | # 这里的'author'必须是json文件中的键,中文前要加'u' 142 | 143 | 不写入: 144 | 不想写入,就注释或删掉某一句,如 # line126,则不会写入地名信息 145 | """ 146 | save_text = json.dumps(save_record, ensure_ascii=False) 147 | with codecs.open(self.record_file, 'a', encoding='utf-8') as f: 148 | f.write(save_text + '\n') 149 | 150 | """ 151 | ========step3调整结束,可运行"mainwin.py",开始标注================================ 152 | """ 153 | 154 | def next_doc(self, event): 155 | if len(self.tags) == Tag_flag: 156 | wx.MessageBox(u'您没有选择任何标签', u'警告', wx.OK | wx.ICON_EXCLAMATION) 157 | else: 158 | self.write_a_record() 159 | self.cur_progress += 1 160 | if self.cur_progress == self.data_size: 161 | self.exit(0) 162 | self.reinit_record() 163 | self.print_content() 164 | self.print_keywords() 165 | 166 | def get_sellect_words(self, event): 167 | words = self.content.GetStringSelection() 168 | if self.which_words_items == -1 and len(words) > 0: 169 | wx.MessageBox(u'请选择一种要标注的关键字类型', u'警告', wx.OK | wx.ICON_EXCLAMATION) 170 | return False 171 | elif len(words) > MaxLengthOfKeyWord: 172 | a = wx.MessageBox(words + u' \n\n是否选为关键词?', u'关键词过长验证', wx.YES_NO | wx.ICON_EXCLAMATION) 173 | if not a == wx.YES: 174 | return False 175 | elif len(words) < 2: 176 | return False 177 | 178 | if self.which_words_items == 1: 179 | self.address.append(words) 180 | elif self.which_words_items == 2: 181 | self.person.append(words) 182 | elif self.which_words_items == 3: 183 | self.keyword.append(words) 184 | self.print_keywords() 185 | 186 | def select_address(self, event): 187 | self.which_words_items = 1 188 | 189 | def select_name(self, event): 190 | self.which_words_items = 2 191 | 192 | def select_keyword(self, event): 193 | self.which_words_items = 3 194 | 195 | def del_words(self, event): 196 | print(dir(event)) 197 | 198 | def exit(self, flag): 199 | if flag == 0: 200 | wx.MessageBox(u'当前标注已经完成, 感谢您的参与', u'警告', wx.OK | wx.ICON_EXCLAMATION) 201 | 202 | # 根据不同平台使用不同的Textctrl方法 203 | def redefine_Textctrl_write(self, object, string): 204 | if 'windows' in platform.system().lower(): 205 | object.SetLabel(string) 206 | else: 207 | object.SetValue(string) 208 | 209 | 210 | if __name__ == '__main__': 211 | app = wx.App() 212 | main_win = MainWindows(title='文本标注工具') 213 | main_win.Show() 214 | app.MainLoop() 215 | -------------------------------------------------------------------------------- /标注主界面.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GHY73/BiaoZhuTool/9e0cce297b3caa20bd56184929abb6a07e6e03ca/标注主界面.png -------------------------------------------------------------------------------- /标注结果示例.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GHY73/BiaoZhuTool/9e0cce297b3caa20bd56184929abb6a07e6e03ca/标注结果示例.jpg --------------------------------------------------------------------------------