├── baidu_search ├── baidu_search │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── baidu_search.py │ ├── pipelines.py │ ├── items.py │ └── settings.py └── scrapy.cfg ├── subtitle ├── subtitle_crawler │ ├── __init__.py │ ├── items.pyc │ ├── __init__.pyc │ ├── settings.pyc │ ├── pipelines.pyc │ ├── spiders │ │ ├── __init__.pyc │ │ ├── subtitle_spider.pyc │ │ ├── __init__.py │ │ └── subtitle_spider.py │ ├── items.py │ ├── pipelines.py │ └── settings.py ├── preprocess │ ├── conv_big5.sh │ ├── unzip.sh │ ├── conv.sh │ ├── clear_empty_dir.py │ ├── get_file_charset.py │ ├── change_name.py │ ├── del_file.py │ ├── conv2simple.py │ ├── mv_ass.py │ ├── mv_lrc.py │ ├── mv_rar.py │ ├── mv_smi.py │ ├── mv_srt.py │ ├── mv_ssa.py │ ├── mv_str.py │ ├── mv_sup.py │ ├── mv_vtt.py │ ├── mv_zip.py │ ├── get_charset.py │ ├── get_charset_and_conv.py │ ├── extract_sentence_srt.py │ ├── extract_sentence_ass.py │ ├── extract_sentence_ssa.py │ └── filter.py └── scrapy.cfg ├── chatbotv1 ├── src │ ├── main │ │ ├── resources │ │ │ ├── ext.dic │ │ │ ├── IKAnalyzer.cfg.xml │ │ │ ├── stopword.dic │ │ │ └── quantifier.dic │ │ ├── .DS_Store │ │ └── java │ │ │ ├── com │ │ │ └── shareditor │ │ │ │ └── chatbotv1 │ │ │ │ ├── HttpServerInboundHandler.java │ │ │ │ ├── Searcher.java │ │ │ │ ├── NettyHttpServletResponse.java │ │ │ │ └── Indexer.java │ │ │ └── org │ │ │ └── wltea │ │ │ └── analyzer │ │ │ ├── core │ │ │ ├── ISegmenter.java │ │ │ ├── CharacterUtil.java │ │ │ ├── CJKSegmenter.java │ │ │ ├── IKArbitrator.java │ │ │ ├── IKSegmenter.java │ │ │ ├── QuickSortSet.java │ │ │ ├── CN_QuantifierSegmenter.java │ │ │ ├── LexemePath.java │ │ │ └── Lexeme.java │ │ │ ├── cfg │ │ │ ├── Configuration.java │ │ │ └── DefaultConfig.java │ │ │ ├── lucene │ │ │ ├── IKAnalyzer.java │ │ │ └── IKTokenizer.java │ │ │ ├── dic │ │ │ ├── quantifier.dic │ │ │ └── Hit.java │ │ │ ├── sample │ │ │ ├── IKAnalzyerDemo.java │ │ │ └── LuceneIndexAndSearchDemo.java │ │ │ └── query │ │ │ └── SWMCQueryBuilder.java │ └── test │ │ └── java │ │ └── com │ │ └── shareditor │ │ └── chatbotv1 │ │ └── AppTest.java └── pom.xml ├── lstm_code ├── tensorflow │ └── test.py ├── nicodjimenez │ ├── test.py │ ├── README.md │ └── test2.py └── iamtrask │ └── lstm.py ├── learning_tensorflow ├── tmp │ └── events.out.tfevents.1481183189.localhost ├── 1.py ├── 3.py └── 2.py ├── word2vec ├── demo-word.sh ├── demo-classes.sh ├── demo-word-accuracy.sh ├── demo-analogy.sh ├── makefile ├── demo-phrases.sh ├── demo-phrase-accuracy.sh ├── README.txt ├── distance.c ├── word-analogy.c ├── demo-train-big-model-v1.sh └── compute-accuracy.c ├── gensim_word2vec.py ├── chatbotv2 ├── readme.txt └── my_seq2seq.py ├── word_segment.py ├── digital_recognition.py ├── word_vectors_loader.py ├── seq2seq ├── tflearn_prj │ ├── my_tflearn_demo.py │ ├── 07_lstm.py │ └── my_lstm_test.py └── hello_sequence.py ├── digital_recognition_cnn.py ├── read_images.c ├── pattern_recognition.lua └── README.md /baidu_search/baidu_search/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chatbotv1/src/main/resources/ext.dic: -------------------------------------------------------------------------------- 1 | 诛仙 2 | 诛仙2 3 | 梦幻诛仙 4 | 梦幻诛仙2 
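The tree above contains two standard Scrapy projects, baidu_search and subtitle, each with its own scrapy.cfg, settings.py, pipelines.py and spiders package. A minimal sketch of how such projects are normally launched, assuming the usual scrapy crawl workflow: the baidu_search spider name is defined in spiders/baidu_search.py further down, subtitle_spider is only an assumption based on the file name subtitle_spider.py, and result/ has to exist beforehand because SubtitleCrawlerPipeline writes every crawled page into it.

    # crawl Baidu search results (spider name taken from spiders/baidu_search.py)
    cd baidu_search && scrapy crawl baidu_search

    # crawl subtitle pages; the pipeline saves each response body under result/
    cd subtitle && mkdir -p result && scrapy crawl subtitle_spider    # spider name assumed from the file name
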
-------------------------------------------------------------------------------- /chatbotv1/src/main/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/chatbotv1/src/main/.DS_Store -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/items.pyc -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/__init__.pyc -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/settings.pyc -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/pipelines.pyc -------------------------------------------------------------------------------- /lstm_code/tensorflow/test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def main(_): 4 | pass 5 | 6 | if __name__ == "__main__": 7 | tf.app.run() 8 | -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/spiders/__init__.pyc -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/spiders/subtitle_spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/spiders/subtitle_spider.pyc -------------------------------------------------------------------------------- /learning_tensorflow/tmp/events.out.tfevents.1481183189.localhost: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/learning_tensorflow/tmp/events.out.tfevents.1481183189.localhost -------------------------------------------------------------------------------- /baidu_search/baidu_search/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /subtitle/preprocess/conv_big5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while read line 4 | do 5 | file=`echo $line|awk '{print $1}'`; echo $file; 6 | iconv -f big5 -t utf-8 $file > ${file}.2 7 | if [ $? -eq 0 ];then 8 | mv ${file}.2 ${file} 9 | else 10 | rm ${file}.2 11 | fi 12 | done 13 | -------------------------------------------------------------------------------- /word2vec/demo-word.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./distance vectors.bin 8 | -------------------------------------------------------------------------------- /subtitle/preprocess/unzip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | i=0; for file in `ls`; do mkdir output/${i}; echo "unzip $file -d output/${i}";unzip -P abc $file -d output/${i} > /dev/null; ((i++)); done 4 | i=0; for file in `ls`; do mkdir output/${i}; echo "${i} unrar x $file output/${i}";unrar x $file output/${i} > /dev/null; ((i++)); done 5 | -------------------------------------------------------------------------------- /baidu_search/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidu_search.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidu_search 12 | -------------------------------------------------------------------------------- /subtitle/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = subtitle_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = subtitle_crawler 12 | -------------------------------------------------------------------------------- /subtitle/preprocess/conv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while read line 4 | do 5 | file=`echo $line|awk '{print $1}'` 6 | iconv -f gb2312 -t utf-8 $file > ${file}.2 2>/dev/null 7 | if [ $? 
-eq 0 ];then 8 | mv ${file}.2 ${file} 9 | echo "mv ${file}.2 ${file}" 10 | else 11 | rm ${file}.2 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /learning_tensorflow/1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | sess = tf.Session() 4 | 5 | a = tf.placeholder("float") 6 | b = tf.placeholder("float") 7 | c = tf.constant(6.0) 8 | d = tf.mul(a, b) 9 | y = tf.mul(d, c) 10 | print sess.run(y, feed_dict={a: 3, b: 3}) 11 | 12 | A = [[1.1,2.3],[3.4,4.1]] 13 | Y = tf.matrix_inverse(A) 14 | print sess.run(Y) 15 | sess.close() 16 | -------------------------------------------------------------------------------- /subtitle/preprocess/clear_empty_dir.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | if 0 == len(files) and len(dirs) == 0: 11 | print root 12 | os.rmdir(root) 13 | 14 | iterfindfiles(r"./input/", "*.srt") 15 | -------------------------------------------------------------------------------- /baidu_search/baidu_search/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BaiduSearchPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /baidu_search/baidu_search/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaiduSearchItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SubtitleCrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | url = scrapy.Field() 14 | body = scrapy.Field() 15 | -------------------------------------------------------------------------------- /word2vec/demo-classes.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /chatbotv1/src/main/resources/IKAnalyzer.cfg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | IK Analyzer 扩展配置 5 | 6 | ext.dic; 7 | 8 | 9 | stopword.dic; 10 | 11 | -------------------------------------------------------------------------------- /subtitle/preprocess/get_file_charset.py: -------------------------------------------------------------------------------- 1 | import chardet 2 | import sys 3 | import os 4 | 5 | if __name__ == '__main__': 6 | if len(sys.argv) == 2: 7 | file_path = sys.argv[1] 8 | f = open(file_path,'r') 9 | data = f.read() 10 | encoding = chardet.detect(data)["encoding"] 11 | if encoding not in ("UTF-8-SIG", "UTF-16LE", "utf-8"): 12 | print file_path, encoding 13 | f.close() 14 | -------------------------------------------------------------------------------- /word2vec/demo-word-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /subtitle/preprocess/change_name.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./", "*"): 16 | i=i+1 17 | newfilename = str(i) + ".vtt" 18 | #print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | -------------------------------------------------------------------------------- /subtitle/preprocess/del_file.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | 15 | 16 | for suffix in ("*.mp4", "*.txt", "*.JPG", "*.htm", "*.doc", "*.docx", "*.nfo", "*.sub", "*.idx"): 17 | for filename in iterfindfiles(r"./input/", suffix): 18 | print filename 19 | os.remove(filename) 20 | -------------------------------------------------------------------------------- /subtitle/preprocess/conv2simple.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from langconv import * 3 | import sys 4 | 5 | def tradition2simple(line): 6 | line = 
Converter('zh-hans').convert(line.decode('utf-8')) 7 | line = line.encode('utf-8') 8 | return line 9 | 10 | if __name__ == '__main__': 11 | if len(sys.argv) == 2: 12 | f = open(sys.argv[1], "r") 13 | while True: 14 | line = f.readline() 15 | if line: 16 | print tradition2simple(line).strip() 17 | else: 18 | break 19 | f.close() 20 | -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class SubtitleCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | url = item['url'] 12 | file_name = url.replace('/','_').replace(':','_') 13 | fp = open('result/'+file_name, 'w') 14 | fp.write(item['body']) 15 | fp.close() 16 | return item 17 | 18 | -------------------------------------------------------------------------------- /chatbotv1/src/main/resources/stopword.dic: -------------------------------------------------------------------------------- 1 | a 2 | an 3 | and 4 | are 5 | as 6 | at 7 | be 8 | but 9 | by 10 | for 11 | if 12 | in 13 | into 14 | is 15 | it 16 | no 17 | not 18 | of 19 | on 20 | or 21 | such 22 | that 23 | the 24 | their 25 | then 26 | there 27 | these 28 | they 29 | this 30 | to 31 | was 32 | will 33 | with 34 | 也 35 | 了 36 | 仍 37 | 从 38 | 以 39 | 使 40 | 则 41 | 却 42 | 又 43 | 及 44 | 对 45 | 就 46 | 并 47 | 很 48 | 或 49 | 把 50 | 是 51 | 的 52 | 着 53 | 给 54 | 而 55 | 被 56 | 让 57 | 在 58 | 还 59 | 比 60 | 等 61 | 当 62 | 与 63 | 于 64 | 但 -------------------------------------------------------------------------------- /subtitle/preprocess/mv_ass.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.ass"): 16 | i=i+1 17 | newfilename = "ass/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /subtitle/preprocess/mv_lrc.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.LRC"): 16 | i=i+1 17 | newfilename = "lrc/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /subtitle/preprocess/mv_rar.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in 
os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.RAR"): 16 | i=i+1 17 | newfilename = "rar/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /subtitle/preprocess/mv_smi.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.SMI"): 16 | i=i+1 17 | newfilename = "smi/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /subtitle/preprocess/mv_srt.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.SRT"): 16 | i=i+1 17 | newfilename = "srt/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /subtitle/preprocess/mv_ssa.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.ssa"): 16 | i=i+1 17 | newfilename = "ssa/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /subtitle/preprocess/mv_str.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.str"): 16 | i=i+1 17 | newfilename = "str/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /subtitle/preprocess/mv_sup.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | 
def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.sup"): 16 | i=i+1 17 | newfilename = "sup/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /subtitle/preprocess/mv_vtt.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.vtt"): 16 | i=i+1 17 | newfilename = "vtt/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /subtitle/preprocess/mv_zip.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import fnmatch 4 | import shutil 5 | import sys 6 | 7 | 8 | def iterfindfiles(path, fnexp): 9 | for root, dirs, files in os.walk(path): 10 | for filename in fnmatch.filter(files, fnexp): 11 | yield os.path.join(root, filename) 12 | 13 | 14 | i=0 15 | for filename in iterfindfiles(r"./input/", "*.ZIP"): 16 | i=i+1 17 | newfilename = "zip/" + str(i) + "_" + os.path.basename(filename) 18 | print filename + " <===> " + newfilename 19 | shutil.move(filename, newfilename) 20 | #sys.exit(-1) 21 | -------------------------------------------------------------------------------- /gensim_word2vec.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from gensim.models import word2vec 4 | import logging 5 | 6 | #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 7 | #sentences = word2vec.LineSentence('segment_result_lined') 8 | #model = word2vec.Word2Vec(sentences, size=200, workers=4, iter=20) 9 | #model.save("word_vec_model/model") 10 | model_2 = word2vec.Word2Vec.load("word_vec_model/model") 11 | y = model_2.most_similar(u"学习", topn=10) 12 | for (word, score) in y: 13 | print word 14 | print score 15 | #print model_2.syn0norm[model_2.vocab[u"小兔"].index] 16 | -------------------------------------------------------------------------------- /subtitle/preprocess/get_charset.py: -------------------------------------------------------------------------------- 1 | import chardet 2 | import sys 3 | import os 4 | 5 | if __name__ == '__main__': 6 | for dir in ("srt", "ass", "lrc", "ssa", "str", "vtt"): 7 | for root, dirs, files in os.walk(dir): 8 | for file in files: 9 | file_path = root + "/" + file 10 | f = open(file_path,'r') 11 | data = f.read() 12 | encoding = chardet.detect(data)["encoding"] 13 | if encoding not in ("UTF-8-SIG", "UTF-16LE", "utf-8"): 14 | print file_path, encoding 15 | f.close() 16 | -------------------------------------------------------------------------------- /word2vec/demo-analogy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /chatbotv2/readme.txt: -------------------------------------------------------------------------------- 1 | python ../word_segment.py zhenhuanzhuan.txt zhenhuanzhuan.segment 2 | ../word2vec/word2vec -train ./zhenhuanzhuan.segment -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 3 | 4 | 5 | 6 | head -10000 ../subtitle/raw_subtitles/subtitle.corpus > subtitle.corpus.10000 7 | python ../word_segment.py subtitle.corpus.10000 subtitle.corpus.10000.segment 8 | ../word2vec/word2vec -train ./subtitle.corpus.10000.segment -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 9 | cat subtitle.corpus.10000.segment | awk '{if(last!="")print last"|"$0;last=$0}' | sed 's/| /|/g' > subtitle.corpus.10000.segment.pair 10 | -------------------------------------------------------------------------------- /word2vec/makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: word2vec word2phrase distance word-analogy compute-accuracy 6 | 7 | word2vec : word2vec.c 8 | $(CC) word2vec.c -o word2vec $(CFLAGS) 9 | word2phrase : word2phrase.c 10 | $(CC) word2phrase.c -o word2phrase $(CFLAGS) 11 | distance : distance.c 12 | $(CC) distance.c -o distance $(CFLAGS) 13 | word-analogy : word-analogy.c 14 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 15 | compute-accuracy : compute-accuracy.c 16 | $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS) 17 | chmod +x *.sh 18 | 19 | clean: 20 | rm -rf word2vec word2phrase distance word-analogy compute-accuracy -------------------------------------------------------------------------------- /chatbotv1/src/test/java/com/shareditor/chatbotv1/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.shareditor.chatbotv1; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /word2vec/demo-phrases.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /word_segment.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding( "utf-8" ) 6 | 7 | import jieba 8 | from jieba import analyse 9 | 10 | def segment(input, output): 11 | input_file = open(input, "r") 12 | output_file = open(output, "w") 13 | while True: 14 | line = input_file.readline() 15 | if line: 16 | line = line.strip() 17 | seg_list = jieba.cut(line) 18 | segments = "" 19 | for str in seg_list: 20 | segments = segments + " " + str 21 | segments = segments + "\n" 22 | output_file.write(segments) 23 | else: 24 | break 25 | input_file.close() 26 | output_file.close() 27 | 28 | if __name__ == '__main__': 29 | if 3 != len(sys.argv): 30 | print "Usage: ", sys.argv[0], "input output" 31 | sys.exit(-1) 32 | segment(sys.argv[1], sys.argv[2]); 33 | -------------------------------------------------------------------------------- /subtitle/preprocess/get_charset_and_conv.py: -------------------------------------------------------------------------------- 1 | import chardet 2 | import sys 3 | import os 4 | 5 | if __name__ == '__main__': 6 | if len(sys.argv) == 2: 7 | for root, dirs, files in os.walk(sys.argv[1]): 8 | for file in files: 9 | file_path = root + "/" + file 10 | f = open(file_path,'r') 11 | data = f.read() 12 | f.close() 13 | encoding = chardet.detect(data)["encoding"] 14 | if encoding not in ("UTF-8-SIG", "UTF-16LE", "utf-8", "ascii"): 15 | try: 16 | gb_content = data.decode("gb18030") 17 | gb_content.encode('utf-8') 18 | f = open(file_path, 'w') 19 | f.write(gb_content.encode('utf-8')) 20 | f.close() 21 | except: 22 | print "except:", file_path 23 | -------------------------------------------------------------------------------- /word2vec/demo-phrase-accuracy.sh: 
-------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/com/shareditor/chatbotv1/HttpServerInboundHandler.java: -------------------------------------------------------------------------------- 1 | package com.shareditor.chatbotv1; 2 | 3 | import io.netty.channel.ChannelFuture; 4 | import io.netty.channel.ChannelFutureListener; 5 | import io.netty.channel.ChannelHandlerContext; 6 | import io.netty.channel.SimpleChannelInboundHandler; 7 | import io.netty.handler.codec.http.FullHttpRequest; 8 | import io.netty.handler.codec.http.HttpResponseStatus; 9 | import io.netty.handler.codec.http.HttpVersion; 10 | 11 | public class HttpServerInboundHandler extends SimpleChannelInboundHandler { 12 | 13 | @Override 14 | protected void messageReceived(ChannelHandlerContext ctx, FullHttpRequest msg) throws Exception { 15 | NettyHttpServletResponse res = new NettyHttpServletResponse(HttpVersion.HTTP_1_1, HttpResponseStatus.OK); 16 | Action.doServlet(msg, res); 17 | ChannelFuture future = ctx.channel().writeAndFlush(res); 18 | future.addListener(ChannelFutureListener.CLOSE); 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /digital_recognition.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding( "utf-8" ) 6 | 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | import tensorflow as tf 9 | 10 | flags = tf.app.flags 11 | FLAGS = flags.FLAGS 12 | flags.DEFINE_string('data_dir', './', 'Directory for storing data') 13 | 14 | mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) 15 | 16 | 17 | x = tf.placeholder(tf.float32, [None, 784]) 18 | W = tf.Variable(tf.zeros([784,10])) 19 | b = tf.Variable(tf.zeros([10])) 20 | y = tf.nn.softmax(tf.matmul(x,W) + b) 21 | y_ = tf.placeholder("float", [None,10]) 22 | cross_entropy = -tf.reduce_sum(y_*tf.log(y)) 23 | train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) 24 | 25 | init = tf.initialize_all_variables() 26 | sess = tf.InteractiveSession() 27 | sess.run(init) 28 | for i in range(1000): 29 | batch_xs, batch_ys = mnist.train.next_batch(100) 30 | sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys}) 31 | 32 | correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) 33 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 34 | print(accuracy.eval({x: mnist.test.images, y_: 
mnist.test.labels})) 35 | -------------------------------------------------------------------------------- /word2vec/README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representtion of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts. 5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should to specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and / or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /subtitle/preprocess/extract_sentence_srt.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import chardet 3 | import os 4 | import re 5 | 6 | cn=ur"([\u4e00-\u9fa5]+)" 7 | pattern_cn = re.compile(cn) 8 | jp1=ur"([\u3040-\u309F]+)" 9 | pattern_jp1 = re.compile(jp1) 10 | jp2=ur"([\u30A0-\u30FF]+)" 11 | pattern_jp2 = re.compile(jp2) 12 | 13 | for root, dirs, files in os.walk("./srt"): 14 | file_count = len(files) 15 | if file_count > 0: 16 | for index, file in enumerate(files): 17 | f = open(root + "/" + file, "r") 18 | content = f.read() 19 | f.close() 20 | encoding = chardet.detect(content)["encoding"] 21 | try: 22 | for sentence in content.decode(encoding).split('\n'): 23 | if len(sentence) > 0: 24 | match_cn = pattern_cn.findall(sentence) 25 | match_jp1 = pattern_jp1.findall(sentence) 26 | match_jp2 = pattern_jp2.findall(sentence) 27 | sentence = sentence.strip() 28 | if len(match_cn)>0 and len(match_jp1)==0 and len(match_jp2) == 0 and len(sentence)>1 and len(sentence.split(' ')) < 10: 29 | print sentence.encode('utf-8') 30 | except: 31 | continue 32 | -------------------------------------------------------------------------------- /learning_tensorflow/3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | # 随机生成1000个点,围绕在y=0.1x+0.3的直线周围 7 | num_points = 4 8 | vectors_set = [] 9 | for i in xrange(num_points): 10 | x1 = np.random.normal(0.0, 0.55) 11 | y1 = x1 * 0.1 + 0.3 + np.random.normal(0.0, 0.03) 12 | vectors_set.append([x1, y1]) 13 | 14 | # 生成一些样本 15 | x_data = [v[0] for v in vectors_set] 16 | y_data = [v[1] for v in vectors_set] 17 | print "x_data=", x_data 18 | 19 | 20 | # 生成1维的W矩阵,取值是[-1,1]之间的随机数 21 | W = tf.Variable(tf.random_uniform([1], -1.0, 1.0), name='W') 22 | # 生成1维的b矩阵,初始值是0 23 | b = tf.Variable(tf.zeros([1]), name='b') 24 | # 经过计算得出预估值y 
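# The lines below fit a one-variable linear regression: the prediction is y = W * x_data + b,
# the loss is the mean squared error between y and y_data, and GradientDescentOptimizer(0.5)
# updates W and b over 20 training steps, so they should drift toward the generating line
# y = 0.1 * x + 0.3 used to build vectors_set above (only roughly here, since num_points is 4).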
25 | y = W * x_data + b 26 | print "y=", y 27 | 28 | # 以预估值y和实际值y_data之间的均方误差作为损失 29 | loss = tf.reduce_mean(tf.square(y - y_data), name='loss') 30 | # 采用梯度下降法来优化参数 31 | optimizer = tf.train.GradientDescentOptimizer(0.5) 32 | # 训练的过程就是最小化这个误差值 33 | train = optimizer.minimize(loss, name='train') 34 | 35 | sess = tf.Session() 36 | # 输出图结构 37 | #print sess.graph_def 38 | 39 | init = tf.initialize_all_variables() 40 | sess.run(init) 41 | 42 | # 初始化的W和b是多少 43 | print "W =", sess.run(W), "b =", sess.run(b), "loss =", sess.run(loss) 44 | # 执行20次训练 45 | for step in xrange(20): 46 | sess.run(train) 47 | # 输出训练好的W和b 48 | print "W =", sess.run(W), "b =", sess.run(b), "loss =", sess.run(loss) 49 | # 生成summary文件,用于tensorboard使用 50 | writer = tf.train.SummaryWriter("./tmp", sess.graph) 51 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/core/ISegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | 28 | /** 29 | * 30 | * 子分词器接口 31 | */ 32 | interface ISegmenter { 33 | 34 | /** 35 | * 从分析器读取下一个可能分解的词元对象 36 | * @param context 分词算法上下文 37 | */ 38 | void analyze(AnalyzeContext context); 39 | 40 | 41 | /** 42 | * 重置子分析器状态 43 | */ 44 | void reset(); 45 | 46 | } 47 | -------------------------------------------------------------------------------- /baidu_search/baidu_search/spiders/baidu_search.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding( "utf-8" ) 6 | 7 | import scrapy 8 | from w3lib.html import remove_tags 9 | 10 | class BaiduSearchSpider(scrapy.Spider): 11 | name = "baidu_search" 12 | allowed_domains = ["baidu.com"] 13 | start_urls = [ 14 | "https://www.baidu.com/s?wd=机器学习" 15 | ] 16 | 17 | def parse(self, response): 18 | hrefs = response.selector.xpath('//div[contains(@class, "c-container")]/h3/a/@href').extract() 19 | containers = response.selector.xpath('//div[contains(@class, "c-container")]') 20 | for container in containers: 21 | href = container.xpath('h3/a/@href').extract()[0] 22 | title = remove_tags(container.xpath('h3/a').extract()[0]) 23 | c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract() 24 | abstract = "" 25 | if len(c_abstract) > 0: 26 | abstract = remove_tags(c_abstract[0]) 27 | request = scrapy.Request(href, callback=self.parse_url) 28 | request.meta['title'] = title 29 | request.meta['abstract'] = abstract 30 | yield request 31 | 32 | def parse_url(self, response): 33 | print "url:", response.url 34 | print "title:", response.meta['title'] 35 | print "abstract:", response.meta['abstract'] 36 | content = remove_tags(response.selector.xpath('//body').extract()[0]) 37 | print "content_len:", len(content) 38 | -------------------------------------------------------------------------------- /lstm_code/nicodjimenez/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | from lstm import LstmParam, LstmNetwork 5 | 6 | class ToyLossLayer: 7 | """ 8 | Computes square loss with first element of hidden layer array. 
9 | """ 10 | @classmethod 11 | def loss(self, pred, label): 12 | return (pred[0] - label) ** 2 13 | 14 | @classmethod 15 | def bottom_diff(self, pred, label): 16 | diff = np.zeros_like(pred) 17 | diff[0] = 2 * (pred[0] - label) 18 | return diff 19 | 20 | def example_0(): 21 | # learns to repeat simple sequence from random inputs 22 | np.random.seed(0) 23 | 24 | # parameters for input data dimension and lstm cell count 25 | mem_cell_ct = 100 26 | x_dim = 50 27 | concat_len = x_dim + mem_cell_ct 28 | lstm_param = LstmParam(mem_cell_ct, x_dim) 29 | lstm_net = LstmNetwork(lstm_param) 30 | y_list = [-0.5,0.2,0.1, -0.5] 31 | input_val_arr = [np.random.random(x_dim) for _ in y_list] 32 | 33 | for cur_iter in range(100): 34 | print "cur iter: ", cur_iter 35 | print "input_val_arr=", input_val_arr 36 | print "y_list=", y_list 37 | for ind in range(len(y_list)): 38 | lstm_net.x_list_add(input_val_arr[ind]) 39 | print "y_pred[%d] : %f" % (ind, lstm_net.lstm_node_list[ind].state.h[0]) 40 | 41 | loss = lstm_net.y_list_is(y_list, ToyLossLayer) 42 | print "loss: ", loss 43 | lstm_param.apply_diff(lr=0.1) 44 | lstm_net.x_list_clear() 45 | 46 | if __name__ == "__main__": 47 | example_0() 48 | 49 | -------------------------------------------------------------------------------- /word_vectors_loader.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import sys 4 | import struct 5 | import math 6 | import numpy as np 7 | 8 | reload(sys) 9 | sys.setdefaultencoding( "utf-8" ) 10 | 11 | max_w = 50 12 | float_size = 4 13 | 14 | def load_vectors(input): 15 | print "begin load vectors" 16 | 17 | input_file = open(input, "rb") 18 | 19 | # 获取词表数目及向量维度 20 | words_and_size = input_file.readline() 21 | words_and_size = words_and_size.strip() 22 | words = long(words_and_size.split(' ')[0]) 23 | size = long(words_and_size.split(' ')[1]) 24 | print "words =", words 25 | print "size =", size 26 | 27 | word_vector = {} 28 | 29 | for b in range(0, words): 30 | a = 0 31 | word = '' 32 | # 读取一个词 33 | while True: 34 | c = input_file.read(1) 35 | word = word + c 36 | if False == c or c == ' ': 37 | break 38 | if a < max_w and c != '\n': 39 | a = a + 1 40 | word = word.strip() 41 | 42 | # 读取词向量 43 | vector = np.empty([200]) 44 | for index in range(0, size): 45 | m = input_file.read(float_size) 46 | (weight,) = struct.unpack('f', m) 47 | vector[index] = weight 48 | 49 | # 将词及其对应的向量存到dict中 50 | word_vector[word.decode('utf-8')] = vector 51 | 52 | input_file.close() 53 | 54 | print "load vectors finish" 55 | return word_vector 56 | 57 | if __name__ == '__main__': 58 | if 2 != len(sys.argv): 59 | print "Usage: ", sys.argv[0], "vectors.bin" 60 | sys.exit(-1) 61 | d = load_vectors(sys.argv[1]) 62 | print d[u'真的'] 63 | -------------------------------------------------------------------------------- /subtitle/preprocess/extract_sentence_ass.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import chardet 3 | import os 4 | import re 5 | 6 | cn=ur"([\u4e00-\u9fa5]+)" 7 | pattern_cn = re.compile(cn) 8 | jp1=ur"([\u3040-\u309F]+)" 9 | pattern_jp1 = re.compile(jp1) 10 | jp2=ur"([\u30A0-\u30FF]+)" 11 | pattern_jp2 = re.compile(jp2) 12 | 13 | for root, dirs, files in os.walk("./ass"): 14 | file_count = len(files) 15 | if file_count > 0: 16 | for index, file in enumerate(files): 17 | f = open(root + "/" + file, "r") 18 | content = f.read() 19 | f.close() 20 | encoding = chardet.detect(content)["encoding"] 21 | try: 22 | for line 
in content.decode(encoding).split('\n'): 23 | if line.find('Dialogue') == 0 and len(line) < 500: 24 | fields = line.split(',') 25 | sentence = fields[len(fields)-1] 26 | tag_fields = sentence.split('}') 27 | if len(tag_fields) > 1: 28 | sentence = tag_fields[len(tag_fields)-1] 29 | match_cn = pattern_cn.findall(sentence) 30 | match_jp1 = pattern_jp1.findall(sentence) 31 | match_jp2 = pattern_jp2.findall(sentence) 32 | sentence = sentence.strip() 33 | if len(match_cn)>0 and len(match_jp1)==0 and len(match_jp2) == 0 and len(sentence)>1 and len(sentence.split(' ')) < 10: 34 | sentence = sentence.replace('\N', '') 35 | print sentence.encode('utf-8') 36 | except: 37 | continue 38 | -------------------------------------------------------------------------------- /subtitle/preprocess/extract_sentence_ssa.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import chardet 3 | import os 4 | import re 5 | 6 | cn=ur"([\u4e00-\u9fa5]+)" 7 | pattern_cn = re.compile(cn) 8 | jp1=ur"([\u3040-\u309F]+)" 9 | pattern_jp1 = re.compile(jp1) 10 | jp2=ur"([\u30A0-\u30FF]+)" 11 | pattern_jp2 = re.compile(jp2) 12 | 13 | for root, dirs, files in os.walk("./ssa"): 14 | file_count = len(files) 15 | if file_count > 0: 16 | for index, file in enumerate(files): 17 | f = open(root + "/" + file, "r") 18 | content = f.read() 19 | f.close() 20 | encoding = chardet.detect(content)["encoding"] 21 | try: 22 | for line in content.decode(encoding).split('\n'): 23 | if line.find('Dialogue') == 0 and len(line) < 500: 24 | fields = line.split(',') 25 | sentence = fields[len(fields)-1] 26 | tag_fields = sentence.split('}') 27 | if len(tag_fields) > 1: 28 | sentence = tag_fields[len(tag_fields)-1] 29 | match_cn = pattern_cn.findall(sentence) 30 | match_jp1 = pattern_jp1.findall(sentence) 31 | match_jp2 = pattern_jp2.findall(sentence) 32 | sentence = sentence.strip() 33 | if len(match_cn)>0 and len(match_jp1)==0 and len(match_jp2) == 0 and len(sentence)>1 and len(sentence.split(' ')) < 10: 34 | sentence = sentence.replace('\N', '') 35 | print sentence.encode('utf-8') 36 | except: 37 | continue 38 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/com/shareditor/chatbotv1/Searcher.java: -------------------------------------------------------------------------------- 1 | package com.shareditor.chatbotv1; 2 | 3 | import io.netty.bootstrap.ServerBootstrap; 4 | import io.netty.channel.ChannelFuture; 5 | import io.netty.channel.ChannelInitializer; 6 | import io.netty.channel.ChannelOption; 7 | import io.netty.channel.ChannelPipeline; 8 | import io.netty.channel.EventLoopGroup; 9 | import io.netty.channel.nio.NioEventLoopGroup; 10 | import io.netty.channel.socket.SocketChannel; 11 | import io.netty.channel.socket.nio.NioServerSocketChannel; 12 | import io.netty.handler.codec.http.HttpObjectAggregator; 13 | import io.netty.handler.codec.http.HttpRequestDecoder; 14 | import io.netty.handler.codec.http.HttpResponseEncoder; 15 | import io.netty.handler.logging.LogLevel; 16 | import io.netty.handler.logging.LoggingHandler; 17 | 18 | public class Searcher { 19 | 20 | 21 | public static void main(String[] args) throws InterruptedException { 22 | EventLoopGroup bossGroup = new NioEventLoopGroup(1); 23 | EventLoopGroup workerGroup = new NioEventLoopGroup(); 24 | ServerBootstrap b = new ServerBootstrap(); 25 | b.group(bossGroup, workerGroup) 26 | .channel(NioServerSocketChannel.class) 27 | .option(ChannelOption.SO_BACKLOG, 128) 28 | 
.handler(new LoggingHandler(LogLevel.TRACE)) 29 | .childHandler(new ChannelInitializer() { 30 | @Override 31 | public void initChannel(SocketChannel ch) throws Exception { 32 | ChannelPipeline p = ch.pipeline(); 33 | p.addLast("http-decoder", new HttpRequestDecoder()); 34 | p.addLast("http-aggregator", new HttpObjectAggregator(65535)); 35 | p.addLast("http-encoder", new HttpResponseEncoder()); 36 | p.addLast("handler", new HttpServerInboundHandler()); 37 | } 38 | }); 39 | ChannelFuture f = b.bind("0.0.0.0", 8765).sync(); 40 | f.channel().closeFuture().sync(); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /seq2seq/tflearn_prj/my_tflearn_demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import tflearn 5 | import sys 6 | 7 | # Download the Titanic dataset 8 | from tflearn.datasets import titanic 9 | titanic.download_dataset('titanic_dataset.csv') 10 | 11 | # Load CSV file, indicate that the first column represents labels 12 | from tflearn.data_utils import load_csv 13 | data, labels = load_csv('titanic_dataset.csv', target_column=0, 14 | categorical_labels=True, n_classes=2) 15 | 16 | # Preprocessing function 17 | def preprocess(data, columns_to_ignore): 18 | # Sort by descending id and delete columns 19 | for id in sorted(columns_to_ignore, reverse=True): 20 | [r.pop(id) for r in data] 21 | for i in range(len(data)): 22 | # Converting 'sex' field to float (id is 1 after removing labels column) 23 | data[i][1] = 1. if data[i][1] == 'female' else 0. 24 | return np.array(data, dtype=np.float32) 25 | 26 | # Ignore 'name' and 'ticket' columns (id 1 & 6 of data array) 27 | to_ignore=[1, 6] 28 | 29 | # Preprocess data 30 | data = preprocess(data, to_ignore) 31 | 32 | # Build neural network 33 | net = tflearn.input_data(shape=[None, 6]) 34 | net = tflearn.fully_connected(net, 32) 35 | net = tflearn.fully_connected(net, 32) 36 | net = tflearn.fully_connected(net, 2, activation='softmax') 37 | net = tflearn.regression(net) 38 | 39 | # Define model 40 | model = tflearn.DNN(net) 41 | # Start training (apply gradient descent algorithm) 42 | model.fit(data, labels, n_epoch=10, batch_size=16, show_metric=True) 43 | 44 | # Let's create some data for DiCaprio and Winslet 45 | dicaprio = [3, 'Jack Dawson', 'male', 19, 0, 0, 'N/A', 5.0000] 46 | winslet = [1, 'Rose DeWitt Bukater', 'female', 17, 1, 2, 'N/A', 100.0000] 47 | # Preprocess data 48 | dicaprio, winslet = preprocess([dicaprio, winslet], to_ignore) 49 | # Predict surviving chances (class 1 results) 50 | pred = model.predict([dicaprio, winslet]) 51 | print("DiCaprio Surviving Rate:", pred[0][1]) 52 | print("Winslet Surviving Rate:", pred[1][1]) 53 | 54 | -------------------------------------------------------------------------------- /lstm_code/nicodjimenez/README.md: -------------------------------------------------------------------------------- 1 | # lstm 2 | A basic lstm network can be written from scratch in a few hundred lines of python, yet most of us have a hard time figuring out how lstm's actually work. 
The original Neural Computation [paper](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=3&cad=rja&uact=8&ved=0CDAQFjACahUKEwj1iZLX5efGAhVMpIgKHbv3DiI&url=http%3A%2F%2Fdeeplearning.cs.cmu.edu%2Fpdfs%2FHochreiter97_lstm.pdf&ei=ZuirVfW-GMzIogS777uQAg&usg=AFQjCNGoFvqrva4rDCNIcqNe_SiPL_VPxg&sig2=ZYnsGpdfHjRbK8xdr1thBg&bvm=bv.98197061,d.cGU) is too technical for non experts. Most blogs online on the topic seem to be written by people 3 | who have never implemented lstm's for people who will not implement them either. Other blogs are written by experts (like this [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)) and lack simplified illustrative source code that actually does something. The [Apollo](https://github.com/Russell91/apollo) library built on top of caffe is terrific and features a fast lstm implementation. However, the downside of efficient implementations is that the source code is hard to follow. 4 | 5 | This repo features a minimal lstm implementation for people that are curious about lstms to the point of wanting to know how lstm's might be implemented. The code here follows notational conventions set forth in [this](http://arxiv.org/abs/1506.00019) 6 | well written tutorial introduction. This article should be read before trying to understand this code (at least the part about lstm's). By running `python test.py` you will have a minimal example of an lstm network learning to predict an output sequence of numbers in [-1,1] by using a Euclidean loss on the first element of each node's hidden layer. 7 | 8 | Play with code, add functionality, and try it on different datasets. Pull requests welcome. 9 | 10 | Please read [my blog article](http://nicodjimenez.github.io/2014/08/08/lstm.html) if you want details on the backprop part of the code. 11 | 12 | Also, check out a version of this code written in the D programming language by Mathias Baumann: https://github.com/Marenz/lstm 13 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/cfg/Configuration.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.cfg; 26 | 27 | import java.util.List; 28 | 29 | /** 30 | * 31 | * 配置管理类接口 32 | * 33 | */ 34 | public interface Configuration { 35 | 36 | 37 | 38 | /** 39 | * 返回useSmart标志位 40 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 41 | * @return useSmart 42 | */ 43 | public boolean useSmart(); 44 | 45 | /** 46 | * 设置useSmart标志位 47 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 48 | * @param useSmart 49 | */ 50 | public void setUseSmart(boolean useSmart); 51 | 52 | 53 | /** 54 | * 获取主词典路径 55 | * 56 | * @return String 主词典路径 57 | */ 58 | public String getMainDictionary(); 59 | 60 | /** 61 | * 获取量词词典路径 62 | * @return String 量词词典路径 63 | */ 64 | public String getQuantifierDicionary(); 65 | 66 | /** 67 | * 获取扩展字典配置路径 68 | * @return List 相对类加载器的路径 69 | */ 70 | public List getExtDictionarys(); 71 | 72 | 73 | /** 74 | * 获取扩展停止词典配置路径 75 | * @return List 相对类加载器的路径 76 | */ 77 | public List getExtStopWordDictionarys(); 78 | 79 | } 80 | -------------------------------------------------------------------------------- /lstm_code/nicodjimenez/test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | from lstm import LstmParam, LstmNetwork 5 | 6 | class ToyLossLayer: 7 | """ 8 | Computes square loss with first element of hidden layer array. 9 | """ 10 | @classmethod 11 | def loss(self, pred, label): 12 | return (pred[0] - label) ** 2 13 | 14 | @classmethod 15 | def bottom_diff(self, pred, label): 16 | diff = np.zeros_like(pred) 17 | diff[0] = 2 * (pred[0] - label) 18 | return diff 19 | 20 | class Primes: 21 | def __init__(self): 22 | self.primes = list() 23 | for i in range(2, 100): 24 | is_prime = True 25 | for j in range(2, i-1): 26 | if i % j == 0: 27 | is_prime = False 28 | if is_prime: 29 | self.primes.append(i) 30 | self.primes_count = len(self.primes) 31 | def get_sample(self, x_dim, y_dim, index): 32 | result = np.zeros((x_dim+y_dim)) 33 | for i in range(index, index + x_dim + y_dim): 34 | result[i-index] = self.primes[i%self.primes_count]/100.0 35 | return result 36 | 37 | 38 | def example_0(): 39 | mem_cell_ct = 100 40 | x_dim = 50 41 | concat_len = x_dim + mem_cell_ct 42 | lstm_param = LstmParam(mem_cell_ct, x_dim) 43 | lstm_net = LstmNetwork(lstm_param) 44 | 45 | primes = Primes() 46 | x_list = [] 47 | y_list = [] 48 | for i in range(0, 10): 49 | sample = primes.get_sample(x_dim, 1, i) 50 | x = sample[0:x_dim] 51 | y = sample[x_dim:x_dim+1].tolist()[0] 52 | x_list.append(x) 53 | y_list.append(y) 54 | 55 | for cur_iter in range(10000): 56 | if cur_iter % 1000 == 0: 57 | print "y_list=", y_list 58 | for ind in range(len(y_list)): 59 | lstm_net.x_list_add(x_list[ind]) 60 | if cur_iter % 1000 == 0: 61 | print "y_pred[%d] : %f" % (ind, lstm_net.lstm_node_list[ind].state.h[0]) 62 | 63 | loss = lstm_net.y_list_is(y_list, ToyLossLayer) 64 | if cur_iter % 1000 == 0: 65 | print "loss: ", loss 66 | lstm_param.apply_diff(lr=0.01) 67 | lstm_net.x_list_clear() 68 | 69 | if __name__ == "__main__": 70 | example_0() 71 | -------------------------------------------------------------------------------- /learning_tensorflow/2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | with tf.Graph().as_default() as g: 6 | with g.name_scope("myscope") as scope: # 
有了这个scope,下面的op的name都是类似myscope/Placeholder这样的前缀 7 | sess = tf.Session(target='', graph = g, config=None) # target表示要连接的tf执行引擎 8 | print "graph version:", g.version # 0 9 | a = tf.placeholder("float") 10 | print a.op # 输出整个operation信息,跟下面g.get_operations返回结果一样 11 | print "graph version:", g.version # 1 12 | b = tf.placeholder("float") 13 | print "graph version:", g.version # 2 14 | c = tf.placeholder("float") 15 | print "graph version:", g.version # 3 16 | y1 = tf.mul(a, b) # 也可以写成a * b 17 | print "graph version:", g.version # 4 18 | y2 = tf.mul(y1, c) # 也可以写成y1 * c 19 | print "graph version:", g.version # 5 20 | operations = g.get_operations() 21 | for (i, op) in enumerate(operations): 22 | print "============ operation", i+1, "===========" 23 | print op # 一个结构,包括:name、op、attr、input等,不同op不一样 24 | assert y1.graph is g 25 | assert sess.graph is g 26 | print "================ graph object address ================" 27 | print sess.graph 28 | print "================ graph define ================" 29 | print sess.graph_def 30 | print "================ sess str ================" 31 | print sess.sess_str 32 | print sess.run(y1, feed_dict={a: 3, b: 3}) # 9.0 feed_dictgraph中的元素和值的映射 33 | print sess.run(fetches=[b,y1], feed_dict={a: 3, b: 3}, options=None, run_metadata=None) # 传入的feches和返回值的shape相同 34 | print sess.run({'ret_name':y1}, feed_dict={a: 3, b: 3}) # {'ret_name': 9.0} 传入的feches和返回值的shape相同 35 | 36 | assert tf.get_default_session() is not sess 37 | with sess.as_default(): # 把sess作为默认的session,那么tf.get_default_session就是sess, 否则不是 38 | assert tf.get_default_session() is sess 39 | 40 | h = sess.partial_run_setup([y1, y2], [a, b, c]) # 分阶段运行,参数指明了feches和feed_dict列表 41 | res = sess.partial_run(h, y1, feed_dict={a: 3, b: 4}) # 12 运行第一阶段 42 | res = sess.partial_run(h, y2, feed_dict={c: res}) # 144.0 运行第二阶段,其中使用了第一阶段的执行结果 43 | print "partial_run res:", res 44 | sess.close() 45 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.lucene; 26 | 27 | import java.io.Reader; 28 | 29 | import org.apache.lucene.analysis.Analyzer; 30 | import org.apache.lucene.analysis.Tokenizer; 31 | 32 | /** 33 | * IK分词器,Lucene Analyzer接口实现 34 | * 兼容Lucene 4.0版本 35 | */ 36 | public final class IKAnalyzer extends Analyzer{ 37 | 38 | private boolean useSmart; 39 | 40 | public boolean useSmart() { 41 | return useSmart; 42 | } 43 | 44 | public void setUseSmart(boolean useSmart) { 45 | this.useSmart = useSmart; 46 | } 47 | 48 | /** 49 | * IK分词器Lucene Analyzer接口实现类 50 | * 51 | * 默认细粒度切分算法 52 | */ 53 | public IKAnalyzer(){ 54 | this(false); 55 | } 56 | 57 | /** 58 | * IK分词器Lucene Analyzer接口实现类 59 | * 60 | * @param useSmart 当为true时,分词器进行智能切分 61 | */ 62 | public IKAnalyzer(boolean useSmart){ 63 | super(); 64 | this.useSmart = useSmart; 65 | } 66 | 67 | /** 68 | * 重载Analyzer接口,构造分词组件 69 | */ 70 | @Override 71 | protected TokenStreamComponents createComponents(String fieldName, final Reader in) { 72 | Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart()); 73 | return new TokenStreamComponents(_IKTokenizer); 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/com/shareditor/chatbotv1/NettyHttpServletResponse.java: -------------------------------------------------------------------------------- 1 | package com.shareditor.chatbotv1; 2 | 3 | import io.netty.buffer.ByteBuf; 4 | import io.netty.handler.codec.http.DefaultHttpResponse; 5 | import io.netty.handler.codec.http.FullHttpResponse; 6 | import io.netty.handler.codec.http.HttpHeaders; 7 | import io.netty.handler.codec.http.HttpResponseStatus; 8 | import io.netty.handler.codec.http.HttpVersion; 9 | 10 | public class NettyHttpServletResponse extends DefaultHttpResponse implements FullHttpResponse { 11 | 12 | private ByteBuf content; 13 | 14 | public NettyHttpServletResponse(HttpVersion version, HttpResponseStatus status) { 15 | super(version, status); 16 | } 17 | 18 | public HttpHeaders trailingHeaders() { 19 | // TODO Auto-generated method stub 20 | return null; 21 | } 22 | 23 | public void setContent(ByteBuf buf) { 24 | this.content = buf; 25 | } 26 | 27 | public ByteBuf content() { 28 | return content; 29 | } 30 | 31 | public int refCnt() { 32 | // TODO Auto-generated method stub 33 | return 0; 34 | } 35 | 36 | public boolean release() { 37 | // TODO Auto-generated method stub 38 | return false; 39 | } 40 | 41 | public boolean release(int decrement) { 42 | // TODO Auto-generated method stub 43 | return false; 44 | } 45 | 46 | public FullHttpResponse copy(ByteBuf newContent) { 47 | // TODO Auto-generated method stub 48 | return null; 49 | } 50 | 51 | public FullHttpResponse copy() { 52 | // TODO Auto-generated method stub 53 | return null; 54 | } 55 | 56 | public FullHttpResponse retain(int increment) { 57 | // TODO Auto-generated method stub 58 | return null; 59 | } 60 | 61 | public FullHttpResponse retain() { 62 | // TODO Auto-generated method stub 63 | return null; 64 | } 65 | 66 | public FullHttpResponse touch() { 67 | // TODO Auto-generated method stub 68 | return null; 69 | } 70 | 71 | public FullHttpResponse touch(Object hint) { 72 | // TODO Auto-generated method stub 73 | return null; 74 | } 75 | 76 | public FullHttpResponse duplicate() { 77 | // TODO Auto-generated method stub 78 | return null; 79 | } 80 | 81 | public 
FullHttpResponse setProtocolVersion(HttpVersion version) { 82 | // TODO Auto-generated method stub 83 | return null; 84 | } 85 | 86 | public FullHttpResponse setStatus(HttpResponseStatus status) { 87 | // TODO Auto-generated method stub 88 | return null; 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /chatbotv1/src/main/resources/quantifier.dic: -------------------------------------------------------------------------------- 1 | 丈 2 | 下 3 | 世 4 | 世纪 5 | 两 6 | 个 7 | 中 8 | 串 9 | 亩 10 | 人 11 | 介 12 | 付 13 | 代 14 | 件 15 | 任 16 | 份 17 | 伏 18 | 伙 19 | 位 20 | 位数 21 | 例 22 | 倍 23 | 像素 24 | 元 25 | 克 26 | 克拉 27 | 公亩 28 | 公克 29 | 公分 30 | 公升 31 | 公尺 32 | 公担 33 | 公斤 34 | 公里 35 | 公顷 36 | 具 37 | 册 38 | 出 39 | 刀 40 | 分 41 | 分钟 42 | 分米 43 | 划 44 | 列 45 | 则 46 | 刻 47 | 剂 48 | 剑 49 | 副 50 | 加仑 51 | 勺 52 | 包 53 | 匙 54 | 匹 55 | 区 56 | 千克 57 | 千米 58 | 升 59 | 卷 60 | 厅 61 | 厘 62 | 厘米 63 | 双 64 | 发 65 | 口 66 | 句 67 | 只 68 | 台 69 | 叶 70 | 号 71 | 名 72 | 吨 73 | 听 74 | 员 75 | 周 76 | 周年 77 | 品 78 | 回 79 | 团 80 | 圆 81 | 圈 82 | 地 83 | 场 84 | 块 85 | 坪 86 | 堆 87 | 声 88 | 壶 89 | 处 90 | 夜 91 | 大 92 | 天 93 | 头 94 | 套 95 | 女 96 | 孔 97 | 字 98 | 宗 99 | 室 100 | 家 101 | 寸 102 | 对 103 | 封 104 | 尊 105 | 小时 106 | 尺 107 | 尾 108 | 局 109 | 层 110 | 届 111 | 岁 112 | 师 113 | 帧 114 | 幅 115 | 幕 116 | 幢 117 | 平方 118 | 平方公尺 119 | 平方公里 120 | 平方分米 121 | 平方厘米 122 | 平方码 123 | 平方米 124 | 平方英寸 125 | 平方英尺 126 | 平方英里 127 | 平米 128 | 年 129 | 年代 130 | 年级 131 | 度 132 | 座 133 | 式 134 | 引 135 | 张 136 | 成 137 | 战 138 | 截 139 | 户 140 | 房 141 | 所 142 | 扇 143 | 手 144 | 打 145 | 批 146 | 把 147 | 折 148 | 担 149 | 拍 150 | 招 151 | 拨 152 | 拳 153 | 指 154 | 掌 155 | 排 156 | 撮 157 | 支 158 | 文 159 | 斗 160 | 斤 161 | 方 162 | 族 163 | 日 164 | 时 165 | 曲 166 | 月 167 | 月份 168 | 期 169 | 本 170 | 朵 171 | 村 172 | 束 173 | 条 174 | 来 175 | 杯 176 | 枚 177 | 枝 178 | 枪 179 | 架 180 | 柄 181 | 柜 182 | 栋 183 | 栏 184 | 株 185 | 样 186 | 根 187 | 格 188 | 案 189 | 桌 190 | 档 191 | 桩 192 | 桶 193 | 梯 194 | 棵 195 | 楼 196 | 次 197 | 款 198 | 步 199 | 段 200 | 毛 201 | 毫 202 | 毫升 203 | 毫米 204 | 毫克 205 | 池 206 | 洲 207 | 派 208 | 海里 209 | 滴 210 | 炮 211 | 点 212 | 点钟 213 | 片 214 | 版 215 | 环 216 | 班 217 | 瓣 218 | 瓶 219 | 生 220 | 男 221 | 画 222 | 界 223 | 盆 224 | 盎司 225 | 盏 226 | 盒 227 | 盘 228 | 相 229 | 眼 230 | 石 231 | 码 232 | 碗 233 | 碟 234 | 磅 235 | 种 236 | 科 237 | 秒 238 | 秒钟 239 | 窝 240 | 立方公尺 241 | 立方分米 242 | 立方厘米 243 | 立方码 244 | 立方米 245 | 立方英寸 246 | 立方英尺 247 | 站 248 | 章 249 | 笔 250 | 等 251 | 筐 252 | 筒 253 | 箱 254 | 篇 255 | 篓 256 | 篮 257 | 簇 258 | 米 259 | 类 260 | 粒 261 | 级 262 | 组 263 | 维 264 | 缕 265 | 缸 266 | 罐 267 | 网 268 | 群 269 | 股 270 | 脚 271 | 船 272 | 艇 273 | 艘 274 | 色 275 | 节 276 | 英亩 277 | 英寸 278 | 英尺 279 | 英里 280 | 行 281 | 袋 282 | 角 283 | 言 284 | 课 285 | 起 286 | 趟 287 | 路 288 | 车 289 | 转 290 | 轮 291 | 辆 292 | 辈 293 | 连 294 | 通 295 | 遍 296 | 部 297 | 里 298 | 重 299 | 针 300 | 钟 301 | 钱 302 | 锅 303 | 门 304 | 间 305 | 队 306 | 阶段 307 | 隅 308 | 集 309 | 页 310 | 顶 311 | 顷 312 | 项 313 | 顿 314 | 颗 315 | 餐 316 | 首 -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/dic/quantifier.dic: -------------------------------------------------------------------------------- 1 | 丈 2 | 下 3 | 世 4 | 世纪 5 | 两 6 | 个 7 | 中 8 | 串 9 | 亩 10 | 人 11 | 介 12 | 付 13 | 代 14 | 件 15 | 任 16 | 份 17 | 伏 18 | 伙 19 | 位 20 | 位数 21 | 例 22 | 倍 23 | 像素 24 | 元 25 | 克 26 | 克拉 27 | 公亩 28 | 公克 29 | 公分 30 | 公升 31 | 公尺 32 | 公担 33 | 公斤 34 | 公里 35 | 公顷 36 | 具 37 | 册 38 | 出 39 | 刀 40 | 分 41 | 分钟 42 | 分米 43 | 划 44 | 列 45 | 则 46 | 刻 47 | 剂 48 | 
剑 49 | 副 50 | 加仑 51 | 勺 52 | 包 53 | 匙 54 | 匹 55 | 区 56 | 千克 57 | 千米 58 | 升 59 | 卷 60 | 厅 61 | 厘 62 | 厘米 63 | 双 64 | 发 65 | 口 66 | 句 67 | 只 68 | 台 69 | 叶 70 | 号 71 | 名 72 | 吨 73 | 听 74 | 员 75 | 周 76 | 周年 77 | 品 78 | 回 79 | 团 80 | 圆 81 | 圈 82 | 地 83 | 场 84 | 块 85 | 坪 86 | 堆 87 | 声 88 | 壶 89 | 处 90 | 夜 91 | 大 92 | 天 93 | 头 94 | 套 95 | 女 96 | 孔 97 | 字 98 | 宗 99 | 室 100 | 家 101 | 寸 102 | 对 103 | 封 104 | 尊 105 | 小时 106 | 尺 107 | 尾 108 | 局 109 | 层 110 | 届 111 | 岁 112 | 师 113 | 帧 114 | 幅 115 | 幕 116 | 幢 117 | 平方 118 | 平方公尺 119 | 平方公里 120 | 平方分米 121 | 平方厘米 122 | 平方码 123 | 平方米 124 | 平方英寸 125 | 平方英尺 126 | 平方英里 127 | 平米 128 | 年 129 | 年代 130 | 年级 131 | 度 132 | 座 133 | 式 134 | 引 135 | 张 136 | 成 137 | 战 138 | 截 139 | 户 140 | 房 141 | 所 142 | 扇 143 | 手 144 | 打 145 | 批 146 | 把 147 | 折 148 | 担 149 | 拍 150 | 招 151 | 拨 152 | 拳 153 | 指 154 | 掌 155 | 排 156 | 撮 157 | 支 158 | 文 159 | 斗 160 | 斤 161 | 方 162 | 族 163 | 日 164 | 时 165 | 曲 166 | 月 167 | 月份 168 | 期 169 | 本 170 | 朵 171 | 村 172 | 束 173 | 条 174 | 来 175 | 杯 176 | 枚 177 | 枝 178 | 枪 179 | 架 180 | 柄 181 | 柜 182 | 栋 183 | 栏 184 | 株 185 | 样 186 | 根 187 | 格 188 | 案 189 | 桌 190 | 档 191 | 桩 192 | 桶 193 | 梯 194 | 棵 195 | 楼 196 | 次 197 | 款 198 | 步 199 | 段 200 | 毛 201 | 毫 202 | 毫升 203 | 毫米 204 | 毫克 205 | 池 206 | 洲 207 | 派 208 | 海里 209 | 滴 210 | 炮 211 | 点 212 | 点钟 213 | 片 214 | 版 215 | 环 216 | 班 217 | 瓣 218 | 瓶 219 | 生 220 | 男 221 | 画 222 | 界 223 | 盆 224 | 盎司 225 | 盏 226 | 盒 227 | 盘 228 | 相 229 | 眼 230 | 石 231 | 码 232 | 碗 233 | 碟 234 | 磅 235 | 种 236 | 科 237 | 秒 238 | 秒钟 239 | 窝 240 | 立方公尺 241 | 立方分米 242 | 立方厘米 243 | 立方码 244 | 立方米 245 | 立方英寸 246 | 立方英尺 247 | 站 248 | 章 249 | 笔 250 | 等 251 | 筐 252 | 筒 253 | 箱 254 | 篇 255 | 篓 256 | 篮 257 | 簇 258 | 米 259 | 类 260 | 粒 261 | 级 262 | 组 263 | 维 264 | 缕 265 | 缸 266 | 罐 267 | 网 268 | 群 269 | 股 270 | 脚 271 | 船 272 | 艇 273 | 艘 274 | 色 275 | 节 276 | 英亩 277 | 英寸 278 | 英尺 279 | 英里 280 | 行 281 | 袋 282 | 角 283 | 言 284 | 课 285 | 起 286 | 趟 287 | 路 288 | 车 289 | 转 290 | 轮 291 | 辆 292 | 辈 293 | 连 294 | 通 295 | 遍 296 | 部 297 | 里 298 | 重 299 | 针 300 | 钟 301 | 钱 302 | 锅 303 | 门 304 | 间 305 | 队 306 | 阶段 307 | 隅 308 | 集 309 | 页 310 | 顶 311 | 顷 312 | 项 313 | 顿 314 | 颗 315 | 餐 316 | 首 -------------------------------------------------------------------------------- /chatbotv1/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.shareditor 6 | chatbotv1 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | chatbotv1 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | 26 | org.apache.lucene 27 | lucene-core 28 | 4.9.0 29 | 30 | 31 | org.apache.lucene 32 | lucene-queryparser 33 | 4.9.0 34 | 35 | 36 | org.apache.lucene 37 | lucene-analyzers-common 38 | 4.9.0 39 | 40 | 41 | io.netty 42 | netty-all 43 | 5.0.0.Alpha2 44 | 45 | 46 | com.alibaba 47 | fastjson 48 | 1.1.41 49 | 50 | 51 | log4j 52 | log4j 53 | 1.2.14 54 | 55 | 56 | 57 | 58 | 59 | 60 | org.apache.maven.plugins 61 | maven-dependency-plugin 62 | 63 | 64 | copy-dependencies 65 | prepare-package 66 | 67 | copy-dependencies 68 | 69 | 70 | ${project.build.directory}/lib 71 | false 72 | false 73 | true 74 | 75 | 76 | 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-jar-plugin 81 | 82 | 83 | 84 | true 85 | lib/ 86 | theMainClass 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java: 
-------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.sample; 27 | 28 | import java.io.IOException; 29 | import java.io.StringReader; 30 | 31 | import org.apache.lucene.analysis.Analyzer; 32 | import org.apache.lucene.analysis.TokenStream; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 36 | import org.wltea.analyzer.lucene.IKAnalyzer; 37 | 38 | /** 39 | * 使用IKAnalyzer进行分词的演示 40 | * 2012-10-22 41 | * 42 | */ 43 | public class IKAnalzyerDemo { 44 | 45 | public static void main(String[] args){ 46 | //构建IK分词器,使用smart分词模式 47 | Analyzer analyzer = new IKAnalyzer(true); 48 | 49 | //获取Lucene的TokenStream对象 50 | TokenStream ts = null; 51 | try { 52 | ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too")); 53 | //获取词元位置属性 54 | OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); 55 | //获取词元文本属性 56 | CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); 57 | //获取词元文本属性 58 | TypeAttribute type = ts.addAttribute(TypeAttribute.class); 59 | 60 | 61 | //重置TokenStream(重置StringReader) 62 | ts.reset(); 63 | //迭代获取分词结果 64 | while (ts.incrementToken()) { 65 | System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); 66 | } 67 | //关闭TokenStream(关闭StringReader) 68 | ts.end(); // Perform end-of-stream operations, e.g. set the final offset. 69 | 70 | } catch (IOException e) { 71 | e.printStackTrace(); 72 | } finally { 73 | //释放TokenStream的所有资源 74 | if(ts != null){ 75 | try { 76 | ts.close(); 77 | } catch (IOException e) { 78 | e.printStackTrace(); 79 | } 80 | } 81 | } 82 | 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/dic/Hit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 
9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | /** 29 | * 表示一次词典匹配的命中 30 | */ 31 | public class Hit { 32 | //Hit不匹配 33 | private static final int UNMATCH = 0x00000000; 34 | //Hit完全匹配 35 | private static final int MATCH = 0x00000001; 36 | //Hit前缀匹配 37 | private static final int PREFIX = 0x00000010; 38 | 39 | 40 | //该HIT当前状态,默认未匹配 41 | private int hitState = UNMATCH; 42 | 43 | //记录词典匹配过程中,当前匹配到的词典分支节点 44 | private DictSegment matchedDictSegment; 45 | /* 46 | * 词段开始位置 47 | */ 48 | private int begin; 49 | /* 50 | * 词段的结束位置 51 | */ 52 | private int end; 53 | 54 | 55 | /** 56 | * 判断是否完全匹配 57 | */ 58 | public boolean isMatch() { 59 | return (this.hitState & MATCH) > 0; 60 | } 61 | /** 62 | * 63 | */ 64 | public void setMatch() { 65 | this.hitState = this.hitState | MATCH; 66 | } 67 | 68 | /** 69 | * 判断是否是词的前缀 70 | */ 71 | public boolean isPrefix() { 72 | return (this.hitState & PREFIX) > 0; 73 | } 74 | /** 75 | * 76 | */ 77 | public void setPrefix() { 78 | this.hitState = this.hitState | PREFIX; 79 | } 80 | /** 81 | * 判断是否是不匹配 82 | */ 83 | public boolean isUnmatch() { 84 | return this.hitState == UNMATCH ; 85 | } 86 | /** 87 | * 88 | */ 89 | public void setUnmatch() { 90 | this.hitState = UNMATCH; 91 | } 92 | 93 | public DictSegment getMatchedDictSegment() { 94 | return matchedDictSegment; 95 | } 96 | 97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) { 98 | this.matchedDictSegment = matchedDictSegment; 99 | } 100 | 101 | public int getBegin() { 102 | return begin; 103 | } 104 | 105 | public void setBegin(int begin) { 106 | this.begin = begin; 107 | } 108 | 109 | public int getEnd() { 110 | return end; 111 | } 112 | 113 | public void setEnd(int end) { 114 | this.end = end; 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /digital_recognition_cnn.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding( "utf-8" ) 6 | 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | import tensorflow as tf 9 | 10 | flags = tf.app.flags 11 | FLAGS = flags.FLAGS 12 | flags.DEFINE_string('data_dir', './', 'Directory for storing data') 13 | 14 | mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) 15 | 16 | # 初始化生成随机的权重(变量),避免神经元输出恒为0 17 | def weight_variable(shape): 18 | # 以正态分布生成随机值 19 | initial = tf.truncated_normal(shape, stddev=0.1) 20 | return tf.Variable(initial) 21 | 22 | # 初始化生成随机的偏置项(常量),避免神经元输出恒为0 23 | def bias_variable(shape): 24 | initial = tf.constant(0.1, shape=shape) 25 | return tf.Variable(initial) 26 | 27 | # 卷积采用1步长,0边距,保证输入输出大小相同 28 | def conv2d(x, W): 29 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 30 | 
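# Size bookkeeping for the layers below: the 5x5 convolutions use 'SAME' padding, so they
# keep the 28x28 spatial size, while each 2x2 max-pool with stride 2 halves it.
# 28x28 -> conv1/pool1 -> 14x14 -> conv2/pool2 -> 7x7, which is why the first fully
# connected layer further down takes 7*7*64 = 3136 inputs.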
31 | # 池化采用2×2模板 32 | def max_pool_2x2(x): 33 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], 34 | strides=[1, 2, 2, 1], padding='SAME') 35 | 36 | # 28*28=784 37 | x = tf.placeholder(tf.float32, [None, 784]) 38 | # 输出类别共10个:0-9 39 | y_ = tf.placeholder("float", [None,10]) 40 | 41 | # 第一层卷积权重,视野是5*5,输入通道1个,输出通道32个 42 | W_conv1 = weight_variable([5, 5, 1, 32]) 43 | # 第一层卷积偏置项有32个 44 | b_conv1 = bias_variable([32]) 45 | 46 | # 把x变成4d向量,第二维和第三维是图像尺寸,第四维是颜色通道数1 47 | x_image = tf.reshape(x, [-1,28,28,1]) 48 | 49 | h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) 50 | h_pool1 = max_pool_2x2(h_conv1) 51 | 52 | # 第二层卷积权重,视野是5*5,输入通道32个,输出通道64个 53 | W_conv2 = weight_variable([5, 5, 32, 64]) 54 | # 第二层卷积偏置项有64个 55 | b_conv2 = bias_variable([64]) 56 | 57 | h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) 58 | h_pool2 = max_pool_2x2(h_conv2) 59 | 60 | # 第二层池化后尺寸编程7*7,第三层是全连接,输入是64个通道,输出是1024个神经元 61 | W_fc1 = weight_variable([7 * 7 * 64, 1024]) 62 | # 第三层全连接偏置项有1024个 63 | b_fc1 = bias_variable([1024]) 64 | 65 | h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64]) 66 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) 67 | 68 | # 按float做dropout,以减少过拟合 69 | keep_prob = tf.placeholder("float") 70 | h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) 71 | 72 | # 最后的softmax层生成10种分类 73 | W_fc2 = weight_variable([1024, 10]) 74 | b_fc2 = bias_variable([10]) 75 | 76 | y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2) 77 | 78 | cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv)) 79 | # Adam优化器来做梯度最速下降 80 | train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) 81 | correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1)) 82 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 83 | 84 | sess = tf.InteractiveSession() 85 | sess.run(tf.initialize_all_variables()) 86 | 87 | for i in range(20000): 88 | batch = mnist.train.next_batch(50) 89 | if i%100 == 0: 90 | train_accuracy = accuracy.eval(feed_dict={ 91 | x:batch[0], y_: batch[1], keep_prob: 1.0}) 92 | print "step %d, training accuracy %g"%(i, train_accuracy) 93 | train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5}) 94 | 95 | print "test accuracy %g"%accuracy.eval(feed_dict={ 96 | x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}) 97 | -------------------------------------------------------------------------------- /subtitle/preprocess/filter.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import sys 3 | import re 4 | import chardet 5 | 6 | if __name__ == '__main__': 7 | #illegal=ur"([\u2000-\u2010]+)" 8 | illegal=ur"([\u0000-\u2010]+)" 9 | pattern_illegals = [re.compile(ur"([\u2000-\u2010]+)"), re.compile(ur"([\u0090-\u0099]+)")] 10 | filters = ["字幕", "时间轴:", "校对:", "翻译:", "后期:", "监制:"] 11 | filters.append("时间轴:") 12 | filters.append("校对:") 13 | filters.append("翻译:") 14 | filters.append("后期:") 15 | filters.append("监制:") 16 | filters.append("禁止用作任何商业盈利行为") 17 | filters.append("http") 18 | htmltagregex = re.compile(r'<[^>]+>',re.S) 19 | brace_regex = re.compile(r'\{.*\}',re.S) 20 | slash_regex = re.compile(r'\\\w',re.S) 21 | repeat_regex = re.compile(r'[-=]{10}',re.S) 22 | f = open("./corpus/all.out", "r") 23 | count=0 24 | while True: 25 | line = f.readline() 26 | if line: 27 | line = line.strip() 28 | 29 | # 编码识别,不是utf-8就过滤 30 | gb_content = '' 31 | try: 32 | gb_content = line.decode("utf-8") 33 | except Exception as e: 34 | sys.stderr.write("decode error: ", line) 35 | continue 36 | 37 | # 中文识别,不是中文就过滤 38 | need_continue = 
False 39 | for pattern_illegal in pattern_illegals: 40 | match_illegal = pattern_illegal.findall(gb_content) 41 | if len(match_illegal) > 0: 42 | sys.stderr.write("match_illegal error: %s\n" % line) 43 | need_continue = True 44 | break 45 | if need_continue: 46 | continue 47 | 48 | # 关键词过滤 49 | need_continue = False 50 | for filter in filters: 51 | try: 52 | line.index(filter) 53 | sys.stderr.write("filter keyword of %s %s\n" % (filter, line)) 54 | need_continue = True 55 | break 56 | except: 57 | pass 58 | if need_continue: 59 | continue 60 | 61 | # 去掉剧集信息 62 | if re.match('.*第.*季.*', line): 63 | sys.stderr.write("filter copora %s\n" % line) 64 | continue 65 | if re.match('.*第.*集.*', line): 66 | sys.stderr.write("filter copora %s\n" % line) 67 | continue 68 | if re.match('.*第.*帧.*', line): 69 | sys.stderr.write("filter copora %s\n" % line) 70 | continue 71 | 72 | # 去html标签 73 | line = htmltagregex.sub('',line) 74 | 75 | # 去花括号修饰 76 | line = brace_regex.sub('', line) 77 | 78 | # 去转义 79 | line = slash_regex.sub('', line) 80 | 81 | # 去重复 82 | new_line = repeat_regex.sub('', line) 83 | if len(new_line) != len(line): 84 | continue 85 | 86 | # 去特殊字符 87 | line = line.replace('-', '').strip() 88 | 89 | if len(line) > 0: 90 | sys.stdout.write("%s\n" % line) 91 | count+=1 92 | else: 93 | break 94 | f.close() 95 | pass 96 | -------------------------------------------------------------------------------- /read_images.c: -------------------------------------------------------------------------------- 1 | /************************ 2 | * author: SharEDITor 3 | * date: 2016-08-02 4 | * brief: read MNIST data 5 | ************************/ 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | unsigned char *lables = NULL; 12 | 13 | /** 14 | * All the integers in the files are stored in the MSB first (high endian) format 15 | */ 16 | void copy_int(uint32_t *target, unsigned char *src) 17 | { 18 | *(((unsigned char*)target)+0) = src[3]; 19 | *(((unsigned char*)target)+1) = src[2]; 20 | *(((unsigned char*)target)+2) = src[1]; 21 | *(((unsigned char*)target)+3) = src[0]; 22 | } 23 | 24 | int read_lables() 25 | { 26 | FILE *fp = fopen("./train-labels-idx1-ubyte", "r"); 27 | if (NULL == fp) 28 | { 29 | return -1; 30 | } 31 | unsigned char head[8]; 32 | fread(head, sizeof(unsigned char), 8, fp); 33 | uint32_t magic_number = 0; 34 | uint32_t item_num = 0; 35 | copy_int(&magic_number, &head[0]); 36 | // magic number check 37 | assert(magic_number == 2049); 38 | copy_int(&item_num, &head[4]); 39 | 40 | uint64_t values_size = sizeof(unsigned char) * item_num; 41 | lables = (unsigned char*)malloc(values_size); 42 | fread(lables, sizeof(unsigned char), values_size, fp); 43 | 44 | fclose(fp); 45 | return 0; 46 | } 47 | 48 | int read_images() 49 | { 50 | FILE *fp = fopen("./train-images-idx3-ubyte", "r"); 51 | if (NULL == fp) 52 | { 53 | return -1; 54 | } 55 | unsigned char head[16]; 56 | fread(head, sizeof(unsigned char), 16, fp); 57 | uint32_t magic_number = 0; 58 | uint32_t images_num = 0; 59 | uint32_t rows = 0; 60 | uint32_t cols = 0; 61 | copy_int(&magic_number, &head[0]); 62 | // magic number check 63 | assert(magic_number == 2051); 64 | copy_int(&images_num, &head[4]); 65 | copy_int(&rows, &head[8]); 66 | copy_int(&cols, &head[12]); 67 | 68 | printf("rows=%d cols=%d\n", rows, cols); 69 | 70 | uint64_t image_size = rows * cols; 71 | uint64_t values_size = sizeof(unsigned char) * images_num * rows * cols; 72 | unsigned char *values = (unsigned char*)malloc(values_size); 73 | fread(values, sizeof(unsigned 
char), values_size, fp); 74 | 75 | for (int image_index = 0; image_index < images_num; image_index++) 76 | { 77 | // print the label 78 | printf("========================================= %d ======================================\n", lables[image_index]); 79 | for (int row_index = 0; row_index < rows; row_index++) 80 | { 81 | for (int col_index = 0; col_index < cols; col_index++) 82 | { 83 | // print the pixels of image 84 | printf("%3d", values[image_index*image_size+row_index*cols+col_index]); 85 | } 86 | printf("\n"); 87 | } 88 | printf("\n"); 89 | } 90 | 91 | free(values); 92 | fclose(fp); 93 | return 0; 94 | } 95 | 96 | int main(int argc, char *argv[]) 97 | { 98 | if (-1 == read_lables()) 99 | { 100 | return -1; 101 | } 102 | if (-1 == read_images()) 103 | { 104 | return -1; 105 | } 106 | return 0; 107 | } 108 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/core/CharacterUtil.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 字符集识别工具类 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | /** 29 | * 30 | * 字符集识别工具类 31 | */ 32 | class CharacterUtil { 33 | 34 | public static final int CHAR_USELESS = 0; 35 | 36 | public static final int CHAR_ARABIC = 0X00000001; 37 | 38 | public static final int CHAR_ENGLISH = 0X00000002; 39 | 40 | public static final int CHAR_CHINESE = 0X00000004; 41 | 42 | public static final int CHAR_OTHER_CJK = 0X00000008; 43 | 44 | 45 | /** 46 | * 识别字符类型 47 | * @param input 48 | * @return int CharacterUtil定义的字符类型常量 49 | */ 50 | static int identifyCharType(char input){ 51 | if(input >= '0' && input <= '9'){ 52 | return CHAR_ARABIC; 53 | 54 | }else if((input >= 'a' && input <= 'z') 55 | || (input >= 'A' && input <= 'Z')){ 56 | return CHAR_ENGLISH; 57 | 58 | }else { 59 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); 60 | 61 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 62 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 63 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){ 64 | //目前已知的中文字符UTF-8集合 65 | return CHAR_CHINESE; 66 | 67 | }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符 68 | //韩文字符集 69 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES 70 | || ub == Character.UnicodeBlock.HANGUL_JAMO 71 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO 72 | //日文字符集 73 | || ub == Character.UnicodeBlock.HIRAGANA //平假名 74 | || ub == Character.UnicodeBlock.KATAKANA //片假名 75 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){ 76 | return CHAR_OTHER_CJK; 77 | 78 | } 79 | } 80 | //其他的不做处理的字符 81 | return CHAR_USELESS; 82 | } 83 | 84 | /** 85 | * 进行字符规格化(全角转半角,大写转小写处理) 86 | * @param input 87 | * @return char 88 | */ 89 | static char regularize(char input){ 90 | if (input == 12288) { 91 | input = (char) 32; 92 | 93 | }else if (input > 65280 && input < 65375) { 94 | input = (char) (input - 65248); 95 | 96 | }else if (input >= 'A' && input <= 'Z') { 97 | input += 32; 98 | } 99 | 100 | return input; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /seq2seq/hello_sequence.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from __future__ import print_function 4 | import numpy as np 5 | import tensorflow as tf 6 | import sys 7 | 8 | vocab_size=256 9 | learning_rate=0.1 10 | # 暂时只试验一个bucket 11 | buckets=[(10, 10)] 12 | bucket_id=0 13 | # 填充0 14 | PAD=[0] 15 | sample_size=20 16 | # LSTM中的记忆单元数目 17 | num_units=100 18 | # 多少层的lstm 19 | num_layers=2 20 | 21 | # sample_size个样本,每个样本有一个question、answer、weights,question、answer分别是10维的向量 22 | # 这sample_size个样本有时间序上的依赖关系 23 | question_sample_list = [map(ord, "hello?") + PAD * 4] * sample_size 24 | answer_sample_list = [map(ord, "world!") + PAD * 4] * sample_size 25 | init_weights_list = [[1.0]*7 + [0.0]*3] *sample_size # mask padding. 
todo: redundant -- 26 | 27 | with tf.Session() as session: 28 | 29 | # 初始化神经网络单元 30 | cell = single_cell = tf.nn.rnn_cell.LSTMCell(num_units) 31 | if num_layers > 1: 32 | cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers) 33 | 34 | # 定义函数 35 | def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): 36 | return tf.nn.seq2seq.embedding_rnn_seq2seq( 37 | encoder_inputs, decoder_inputs, cell, 38 | num_encoder_symbols=vocab_size, 39 | num_decoder_symbols=vocab_size, 40 | embedding_size=num_units, 41 | feed_previous=do_decode) 42 | 43 | # 初始化训练用的变量,如果是多个层,权重共享 44 | encoder_inputs = [] 45 | decoder_inputs = [] 46 | weights = [] 47 | for i in xrange(sample_size): 48 | encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i))) 49 | for i in xrange(sample_size): 50 | decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i))) 51 | weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i))) 52 | targets = [decoder_inputs[i] for i in xrange(len(decoder_inputs))] 53 | 54 | # 创建模型及损失计算方法 55 | buckets_outputs, losses = tf.nn.seq2seq.model_with_buckets( 56 | encoder_inputs, decoder_inputs, targets, 57 | weights, buckets, 58 | lambda x, y: seq2seq_f(x, y, False)) 59 | 60 | 61 | # 梯度更新算法 62 | updates=[] 63 | for b in xrange(len(buckets)): 64 | updates.append(tf.train.AdamOptimizer(learning_rate).minimize(losses[b])) 65 | 66 | # 用于保存模型 67 | saver = tf.train.Saver(tf.all_variables()) 68 | 69 | # 初始化 70 | session.run(tf.initialize_all_variables()) 71 | 72 | while True: 73 | encoder_size = len(encoder_inputs) 74 | decoder_size = len(decoder_inputs) 75 | 76 | # 初始化feed_dict数据 77 | feed_dict = {} 78 | for i in xrange(encoder_size): 79 | feed_dict[encoder_inputs[i].name] = question_sample_list[i] 80 | for i in xrange(decoder_size): 81 | feed_dict[decoder_inputs[i].name] = answer_sample_list[i] 82 | feed_dict[weights[i].name] = init_weights_list[i] 83 | 84 | # 初始化fetches模型相关信息,fetches就是想拿什么就拿什么,比如updates就是拿更新值,losses就是拿损失值,buckets_outputs就是拿输出值 85 | fetches = [updates[bucket_id], losses[bucket_id]] 86 | fetches.append(buckets_outputs[bucket_id][0]) 87 | # 这一句是为了拿输出,训练过程可以不要 88 | for i in xrange(len(buckets_outputs[bucket_id])): 89 | fetches.append(buckets_outputs[bucket_id][i]) 90 | 91 | # 参数传递进去的是数据和计算逻辑,具体执行时可以传到各种介质中执行 92 | fetches_outputs = session.run(fetches, feed_dict) 93 | perplexity = fetches_outputs[1] 94 | outputs = fetches_outputs[2:] 95 | print ("perplexity =", perplexity) 96 | words = np.argmax(outputs, axis=2) 97 | word = "".join(map(chr, words[0])).replace('\x00', '').replace('\n', '') 98 | print("output: %s" % word) 99 | -------------------------------------------------------------------------------- /baidu_search/baidu_search/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidu_search project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidu_search' 13 | 14 | SPIDER_MODULES = ['baidu_search.spiders'] 15 | NEWSPIDER_MODULE = 'baidu_search.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'baidu_search (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | DOWNLOAD_TIMEOUT = 5 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'baidu_search.middlewares.MyCustomSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'baidu_search.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | #ITEM_PIPELINES = { 70 | # 'baidu_search.pipelines.SomePipeline': 300, 71 | #} 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 
'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /seq2seq/tflearn_prj/07_lstm.py: -------------------------------------------------------------------------------- 1 | #Inspired by https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3%20-%20Neural%20Networks/recurrent_network.py 2 | import tensorflow as tf 3 | 4 | import numpy as np 5 | from tensorflow.examples.tutorials.mnist import input_data 6 | 7 | # configuration 8 | # O * W + b -> 10 labels for each image, O[? 28], W[28 10], B[10] 9 | # ^ (O: output 28 vec from 28 vec input) 10 | # | 11 | # +-+ +-+ +--+ 12 | # |1|->|2|-> ... |28| time_step_size = 28 13 | # +-+ +-+ +--+ 14 | # ^ ^ ... ^ 15 | # | | | 16 | # img1:[28] [28] ... [28] 17 | # img2:[28] [28] ... [28] 18 | # img3:[28] [28] ... [28] 19 | # ... 20 | # img128 or img256 (batch_size or test_size 256) 21 | # each input size = input_vec_size=lstm_size=28 22 | 23 | # configuration variables 24 | input_vec_size = lstm_size = 28 25 | time_step_size = 28 26 | 27 | batch_size = 128 28 | test_size = 256 29 | 30 | def init_weights(shape): 31 | return tf.Variable(tf.random_normal(shape, stddev=0.01)) 32 | 33 | 34 | def model(X, W, B, lstm_size): 35 | # X, input shape: (batch_size, time_step_size, input_vec_size) 36 | print "X=", X 37 | XT = tf.transpose(X, [1, 0, 2]) # permute time_step_size and batch_size 38 | print "XT=", XT 39 | # XT shape: (time_step_size, batch_size, input_vec_size) 40 | XR = tf.reshape(XT, [-1, lstm_size]) # each row has input for each lstm cell (lstm_size=input_vec_size) 41 | print "XR=", XR 42 | # XR shape: (time_step_size * batch_size, input_vec_size) 43 | X_split = tf.split(0, time_step_size, XR) # split them to time_step_size (28 arrays) 44 | print "X_split=", X_split 45 | # Each array shape: (batch_size, input_vec_size) 46 | 47 | # Make lstm with lstm_size (each input vector size) 48 | lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size, forget_bias=1.0, state_is_tuple=True) 49 | 50 | # Get lstm cell output, time_step_size (28) arrays with lstm_size output: (batch_size, lstm_size) 51 | outputs, _states = tf.nn.rnn(lstm, X_split, dtype=tf.float32) 52 | 53 | # Linear activation 54 | # Get the last output 55 | return tf.matmul(outputs[-1], W) + B, lstm.state_size # State size to initialize the stat 56 | 57 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 58 | trX, trY, teX, teY = mnist.train.images, mnist.train.labels, mnist.test.images, mnist.test.labels 59 | trX = trX.reshape(-1, 28, 28) 60 | teX = teX.reshape(-1, 28, 28) 61 | 62 | X = tf.placeholder("float", [None, 28, 28]) 63 | Y = tf.placeholder("float", [None, 10]) 64 | 65 | # get lstm_size and output 10 labels 66 | W = init_weights([lstm_size, 10]) 67 | B = init_weights([10]) 68 | 69 | py_x, state_size = model(X, W, B, lstm_size) 70 | 71 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(py_x, Y)) 72 | train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost) 73 | predict_op = tf.argmax(py_x, 1) 74 | 75 | # Launch the graph in a session 76 | with tf.Session() as sess: 77 | # you need to initialize all variables 78 | tf.initialize_all_variables().run() 79 | 80 | for i in range(100): 81 | for start, end in zip(range(0, len(trX), batch_size), range(batch_size, len(trX)+1, batch_size)): 82 | sess.run(train_op, feed_dict={X: trX[start:end], Y: trY[start:end]}) 83 | 84 | test_indices = np.arange(len(teX)) # Get A Test Batch 85 | np.random.shuffle(test_indices) 86 | 
test_indices = test_indices[0:test_size] 87 | 88 | print(i, np.mean(np.argmax(teY[test_indices], axis=1) == 89 | sess.run(predict_op, feed_dict={X: teX[test_indices]}))) 90 | -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for subtitle_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'subtitle' 13 | 14 | SPIDER_MODULES = ['subtitle_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'subtitle_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'subtitle_crawler (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 5 32 | DOWNLOAD_TIMEOUT = 60 33 | # The download delay setting will honor only one of: 34 | CONCURRENT_REQUESTS_PER_DOMAIN = 3 35 | CONCURRENT_REQUESTS_PER_IP = 1 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'subtitle_crawler.middlewares.MyCustomSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'subtitle_crawler.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'subtitle_crawler.pipelines.SubtitleCrawlerPipeline': 300, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be 
sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 94 | LOG_LEVEL = 'INFO' 95 | -------------------------------------------------------------------------------- /seq2seq/tflearn_prj/my_lstm_test.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | 4 | import sys 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow.python.ops import seq2seq 8 | from tensorflow.python.ops import rnn_cell 9 | import tflearn 10 | 11 | np.set_printoptions(threshold=np.nan) 12 | 13 | class Primes: 14 | def __init__(self): 15 | self.primes = list() 16 | for i in range(2, 100): 17 | is_prime = True 18 | for j in range(2, i-1): 19 | if i % j == 0: 20 | is_prime = False 21 | if is_prime: 22 | self.primes.append(i) 23 | self.primes_count = len(self.primes) 24 | def get_sample(self, x_dim, y_dim, index): 25 | result = np.zeros((x_dim+y_dim)) 26 | for i in range(index, index + x_dim + y_dim): 27 | result[i-index] = self.primes[i%self.primes_count] 28 | return result 29 | 30 | 31 | max_input_len = 10 32 | max_output_len = 10 33 | embedding_size = 20 34 | max_int = 100 35 | GO_VALUE = max_int + 1 36 | learning_rate = 0.01 37 | 38 | network = tflearn.input_data(shape=[None, max_input_len + max_output_len], dtype=tf.int32, name="XY") 39 | encoder_inputs = tf.slice(network, [0, 0], [-1, max_input_len], name="enc_in") 40 | encoder_inputs = tf.unpack(encoder_inputs, axis=1) 41 | decoder_inputs = tf.slice(network, [0, max_input_len], [-1, max_output_len], name="dec_in") 42 | decoder_inputs = tf.unpack(decoder_inputs, axis=1) 43 | go_input = tf.mul( tf.ones_like(decoder_inputs[0], dtype=tf.int32), GO_VALUE ) 44 | decoder_inputs = [go_input] + decoder_inputs[: max_output_len-1] 45 | num_encoder_symbols = max_int + 1 # 从0起始 46 | num_decoder_symbols = max_int + 2 # 包括GO 47 | print encoder_inputs 48 | print decoder_inputs 49 | 50 | cell = rnn_cell.BasicLSTMCell(16, state_is_tuple=True) 51 | 52 | model_outputs, states = seq2seq.embedding_rnn_seq2seq( 53 | encoder_inputs, 54 | decoder_inputs, 55 | cell, 56 | num_encoder_symbols=num_encoder_symbols, 57 | num_decoder_symbols=num_decoder_symbols, 58 | embedding_size=embedding_size, 59 | feed_previous=False) 60 | 61 | network = tf.pack(model_outputs, axis=1) 62 | 63 | 64 | 65 | def sequence_loss(y_pred, y_true): 66 | logits = tf.unpack(y_pred, axis=1) 67 | targets = tf.unpack(y_true, axis=1) 68 | weights = [tf.ones_like(yp, dtype=tf.float32) for yp in targets] 69 | return seq2seq.sequence_loss(logits, targets, weights) 70 | 71 | def accuracy(y_pred, y_true, x_in): 72 | pred_idx = tf.to_int32(tf.argmax(y_pred, 2)) 73 | return tf.reduce_mean(tf.cast(tf.equal(pred_idx, y_true), tf.float32), name='acc') 74 | 75 | targetY = tf.placeholder(shape=[None, max_output_len], dtype=tf.int32, name="Y") 76 | 77 | network = tflearn.regression( 78 | network, 79 | placeholder=targetY, 80 | optimizer='adam', 81 | learning_rate=learning_rate, 82 | loss=sequence_loss, 83 | metric=accuracy, 84 
| name="Y") 85 | 86 | model = tflearn.DNN(network, tensorboard_verbose=0, checkpoint_path=None) 87 | 88 | primes = Primes() 89 | XY = [ primes.get_sample(10, 10, i)[0:20] for i in range(10) ] 90 | Y = [ primes.get_sample(10, 10, i)[10:20] for i in range(10) ] 91 | model.fit( 92 | XY, 93 | Y, 94 | n_epoch=10, 95 | validation_set=0.01, 96 | batch_size=1, 97 | shuffle=True, 98 | show_metric=True, 99 | snapshot_step=50, 100 | snapshot_epoch=False, 101 | run_id="my_lstm_test") 102 | 103 | 104 | TEST_XY = [XY[0]] 105 | TEST_XY[0][10:20]=0 106 | res = model.predict(TEST_XY) 107 | print TEST_XY 108 | res = np.array(res) 109 | print res.shape 110 | y = res.reshape(max_output_len, num_decoder_symbols) 111 | prediction = np.argmax(y, axis=1) 112 | print prediction 113 | 114 | 115 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/com/shareditor/chatbotv1/Indexer.java: -------------------------------------------------------------------------------- 1 | package com.shareditor.chatbotv1; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.nio.charset.Charset; 9 | import java.security.MessageDigest; 10 | import java.security.NoSuchAlgorithmException; 11 | import java.util.HashSet; 12 | 13 | import org.apache.lucene.analysis.Analyzer; 14 | import org.apache.lucene.document.Document; 15 | import org.apache.lucene.document.Field.Store; 16 | import org.apache.lucene.document.StoredField; 17 | import org.apache.lucene.document.TextField; 18 | import org.apache.lucene.index.IndexWriter; 19 | import org.apache.lucene.index.IndexWriterConfig; 20 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 21 | import org.apache.lucene.store.FSDirectory; 22 | import org.apache.lucene.util.Version; 23 | import org.wltea.analyzer.lucene.IKAnalyzer; 24 | 25 | public class Indexer 26 | { 27 | 28 | public static final Charset UTF8 = Charset.forName("utf8"); 29 | 30 | public static String hexString(byte[] b) { 31 | String ret = ""; 32 | for (int i = 0; i < b.length; i++) { 33 | String hex = Integer.toHexString(b[i] & 0xF); 34 | ret += hex.toUpperCase(); 35 | } 36 | return ret; 37 | } 38 | 39 | public static void main( String[] args ) throws IOException, NoSuchAlgorithmException 40 | { 41 | if (args.length != 2) { 42 | System.err.println("Usage: " + Indexer.class.getSimpleName() + " corpus_path index_path"); 43 | System.exit(-1); 44 | } 45 | 46 | String corpusPath = args[0]; 47 | String indexPath = args[1]; 48 | 49 | Analyzer analyzer = new IKAnalyzer(true); 50 | IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer); 51 | iwc.setOpenMode(OpenMode.CREATE); 52 | iwc.setUseCompoundFile(true); 53 | IndexWriter indexWriter = new IndexWriter(FSDirectory.open(new File(indexPath)), iwc); 54 | 55 | BufferedReader br = new BufferedReader(new InputStreamReader( 56 | new FileInputStream(corpusPath), "UTF-8")); 57 | String line = ""; 58 | String last = ""; 59 | long lineNum = 0; 60 | MessageDigest md = MessageDigest.getInstance("MD5"); 61 | HashSet mc = new HashSet(); 62 | int dupCount = 0; 63 | int totalCount = 0; 64 | long last_t = 0; 65 | while ((line = br.readLine()) != null) { 66 | totalCount++; 67 | if (totalCount % 15000000 == 0) { 68 | System.out.println("clear set"); 69 | mc.clear(); 70 | } 71 | line = line.trim(); 72 | 73 | if (0 == line.length()) { 74 | continue; 75 | } 76 | 77 | if (!last.equals("")) { 78 | 
String pair = last + line; 79 | 80 | byte[] md5 = md.digest(pair.getBytes(UTF8)); 81 | String md5_str = hexString(md5); 82 | 83 | if (mc.contains(md5_str)) { 84 | dupCount++; 85 | continue; 86 | } else { 87 | mc.add(md5_str); 88 | } 89 | Document doc = new Document(); 90 | doc.add(new TextField("question", last, Store.YES)); 91 | doc.add(new StoredField("answer", line)); 92 | indexWriter.addDocument(doc); 93 | } 94 | last = line; 95 | lineNum++; 96 | if (lineNum % 100000 == 0) { 97 | long t = System.currentTimeMillis(); 98 | System.out.println("elapse second: " + (t-last_t)/1000 + " add doc " + lineNum + " totalCount:" + totalCount + " dup:" + dupCount); 99 | last_t = t; 100 | } 101 | } 102 | br.close(); 103 | 104 | indexWriter.forceMerge(1); 105 | indexWriter.close(); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | 25 | * 26 | */ 27 | package org.wltea.analyzer.lucene; 28 | 29 | import java.io.IOException; 30 | import java.io.Reader; 31 | 32 | import org.apache.lucene.analysis.Tokenizer; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 36 | 37 | import org.wltea.analyzer.core.IKSegmenter; 38 | import org.wltea.analyzer.core.Lexeme; 39 | 40 | /** 41 | * IK分词器 Lucene Tokenizer适配器类 42 | * 兼容Lucene 4.0版本 43 | */ 44 | public final class IKTokenizer extends Tokenizer { 45 | 46 | //IK分词器实现 47 | private IKSegmenter _IKImplement; 48 | 49 | //词元文本属性 50 | private final CharTermAttribute termAtt; 51 | //词元位移属性 52 | private final OffsetAttribute offsetAtt; 53 | //词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) 54 | private final TypeAttribute typeAtt; 55 | //记录最后一个词元的结束位置 56 | private int endPosition; 57 | 58 | /** 59 | * Lucene 4.0 Tokenizer适配器类构造函数 60 | * @param in 61 | * @param useSmart 62 | */ 63 | public IKTokenizer(Reader in , boolean useSmart){ 64 | super(in); 65 | offsetAtt = addAttribute(OffsetAttribute.class); 66 | termAtt = addAttribute(CharTermAttribute.class); 67 | typeAtt = addAttribute(TypeAttribute.class); 68 | _IKImplement = new IKSegmenter(input , useSmart); 69 | } 70 | 71 | /* (non-Javadoc) 72 | * @see org.apache.lucene.analysis.TokenStream#incrementToken() 73 | */ 74 | @Override 75 | public boolean incrementToken() throws IOException { 76 | //清除所有的词元属性 77 | clearAttributes(); 78 | Lexeme nextLexeme = _IKImplement.next(); 79 | if(nextLexeme != null){ 80 | //将Lexeme转成Attributes 81 | //设置词元文本 82 | termAtt.append(nextLexeme.getLexemeText()); 83 | //设置词元长度 84 | termAtt.setLength(nextLexeme.getLength()); 85 | //设置词元位移 86 | offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); 87 | //记录分词的最后位置 88 | endPosition = nextLexeme.getEndPosition(); 89 | //记录词元分类 90 | typeAtt.setType(nextLexeme.getLexemeTypeString()); 91 | //返会true告知还有下个词元 92 | return true; 93 | } 94 | //返会false告知词元输出完毕 95 | return false; 96 | } 97 | 98 | /* 99 | * (non-Javadoc) 100 | * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 101 | */ 102 | @Override 103 | public void reset() throws IOException { 104 | super.reset(); 105 | _IKImplement.reset(input); 106 | } 107 | 108 | @Override 109 | public final void end() { 110 | // set final offset 111 | int finalOffset = correctOffset(this.endPosition); 112 | offsetAtt.setOffset(finalOffset, finalOffset); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /pattern_recognition.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'paths' 3 | if (not paths.filep("cifar10torchsmall.zip")) then 4 | os.execute('wget -c https://s3.amazonaws.com/torch7/data/cifar10torchsmall.zip') 5 | os.execute('unzip cifar10torchsmall.zip') 6 | end 7 | trainset = torch.load('cifar10-train.t7') 8 | testset = torch.load('cifar10-test.t7') 9 | classes = {'airplane', 'automobile', 'bird', 'cat', 10 | 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'} 11 | setmetatable(trainset, 12 | {__index = function(t, i) 13 | return {t.data[i], t.label[i]} 14 | end} 15 | ); 16 | trainset.data = trainset.data:double() -- convert the data from a ByteTensor to a 
DoubleTensor. 17 | 18 | function trainset:size() 19 | return self.data:size(1) 20 | end 21 | mean = {} -- store the mean, to normalize the test set in the future 22 | stdv = {} -- store the standard-deviation for the future 23 | for i=1,3 do -- over each image channel 24 | mean[i] = trainset.data[{ {}, {i}, {}, {} }]:mean() -- mean estimation 25 | print('Channel ' .. i .. ', Mean: ' .. mean[i]) 26 | trainset.data[{ {}, {i}, {}, {} }]:add(-mean[i]) -- mean subtraction 27 | 28 | stdv[i] = trainset.data[{ {}, {i}, {}, {} }]:std() -- std estimation 29 | print('Channel ' .. i .. ', Standard Deviation: ' .. stdv[i]) 30 | trainset.data[{ {}, {i}, {}, {} }]:div(stdv[i]) -- std scaling 31 | end 32 | net = nn.Sequential() 33 | net:add(nn.SpatialConvolution(3, 6, 5, 5)) -- 3 input image channels, 6 output channels, 5x5 convolution kernel 34 | net:add(nn.ReLU()) -- non-linearity 35 | net:add(nn.SpatialMaxPooling(2,2,2,2)) -- A max-pooling operation that looks at 2x2 windows and finds the max. 36 | net:add(nn.SpatialConvolution(6, 16, 5, 5)) 37 | net:add(nn.ReLU()) -- non-linearity 38 | net:add(nn.SpatialMaxPooling(2,2,2,2)) 39 | net:add(nn.View(16*5*5)) -- reshapes from a 3D tensor of 16x5x5 into 1D tensor of 16*5*5 40 | net:add(nn.Linear(16*5*5, 120)) -- fully connected layer (matrix multiplication between input and weights) 41 | net:add(nn.ReLU()) -- non-linearity 42 | net:add(nn.Linear(120, 84)) 43 | net:add(nn.ReLU()) -- non-linearity 44 | net:add(nn.Linear(84, 10)) -- 10 is the number of outputs of the network (in this case, 10 digits) 45 | net:add(nn.LogSoftMax()) -- converts the output to a log-probability. Useful for classification problems 46 | criterion = nn.ClassNLLCriterion() 47 | trainer = nn.StochasticGradient(net, criterion) 48 | trainer.learningRate = 0.001 49 | trainer.maxIteration = 5 50 | trainer:train(trainset) 51 | testset.data = testset.data:double() -- convert from Byte tensor to Double tensor 52 | for i=1,3 do -- over each image channel 53 | testset.data[{ {}, {i}, {}, {} }]:add(-mean[i]) -- mean subtraction 54 | testset.data[{ {}, {i}, {}, {} }]:div(stdv[i]) -- std scaling 55 | end 56 | predicted = net:forward(testset.data[100]) 57 | print(classes[testset.label[100]]) 58 | print(predicted:exp()) 59 | for i=1,predicted:size(1) do 60 | print(classes[i], predicted[i]) 61 | end 62 | correct = 0 63 | for i=1,10000 do 64 | local groundtruth = testset.label[i] 65 | local prediction = net:forward(testset.data[i]) 66 | local confidences, indices = torch.sort(prediction, true) -- true means sort in descending order 67 | if groundtruth == indices[1] then 68 | correct = correct + 1 69 | end 70 | end 71 | 72 | print(correct, 100*correct/10000 .. ' % ') 73 | class_performance = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 74 | for i=1,10000 do 75 | local groundtruth = testset.label[i] 76 | local prediction = net:forward(testset.data[i]) 77 | local confidences, indices = torch.sort(prediction, true) -- true means sort in descending order 78 | if groundtruth == indices[1] then 79 | class_performance[groundtruth] = class_performance[groundtruth] + 1 80 | end 81 | end 82 | 83 | for i=1,#classes do 84 | print(classes[i], 100*class_performance[i]/1000 .. 
' %') 85 | end 86 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | 31 | import org.wltea.analyzer.dic.Dictionary; 32 | import org.wltea.analyzer.dic.Hit; 33 | 34 | 35 | /** 36 | * 中文-日韩文子分词器 37 | */ 38 | class CJKSegmenter implements ISegmenter { 39 | 40 | //子分词器标签 41 | static final String SEGMENTER_NAME = "CJK_SEGMENTER"; 42 | //待处理的分词hit队列 43 | private List tmpHits; 44 | 45 | 46 | CJKSegmenter(){ 47 | this.tmpHits = new LinkedList(); 48 | } 49 | 50 | /* (non-Javadoc) 51 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 52 | */ 53 | public void analyze(AnalyzeContext context) { 54 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){ 55 | 56 | //优先处理tmpHits中的hit 57 | if(!this.tmpHits.isEmpty()){ 58 | //处理词段队列 59 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); 60 | for(Hit hit : tmpArray){ 61 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 62 | if(hit.isMatch()){ 63 | //输出当前的词 64 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); 65 | context.addLexeme(newLexeme); 66 | 67 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 68 | this.tmpHits.remove(hit); 69 | } 70 | 71 | }else if(hit.isUnmatch()){ 72 | //hit不是词,移除 73 | this.tmpHits.remove(hit); 74 | } 75 | } 76 | } 77 | 78 | //********************************* 79 | //再对当前指针位置的字符进行单字匹配 80 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); 81 | if(singleCharHit.isMatch()){//首字成词 82 | //输出当前的词 83 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); 84 | context.addLexeme(newLexeme); 85 | 86 | //同时也是词前缀 87 | if(singleCharHit.isPrefix()){ 88 | //前缀匹配则放入hit列表 89 | this.tmpHits.add(singleCharHit); 90 | } 91 | }else if(singleCharHit.isPrefix()){//首字为词前缀 92 | //前缀匹配则放入hit列表 93 | this.tmpHits.add(singleCharHit); 94 | } 95 | 96 | 97 | }else{ 98 | //遇到CHAR_USELESS字符 99 | //清空队列 100 | this.tmpHits.clear(); 101 | } 102 | 103 | //判断缓冲区是否已经读完 104 | if(context.isBufferConsumed()){ 105 | //清空队列 106 | 
this.tmpHits.clear(); 107 | } 108 | 109 | //判断是否锁定缓冲区 110 | if(this.tmpHits.size() == 0){ 111 | context.unlockBuffer(SEGMENTER_NAME); 112 | 113 | }else{ 114 | context.lockBuffer(SEGMENTER_NAME); 115 | } 116 | } 117 | 118 | /* (non-Javadoc) 119 | * @see org.wltea.analyzer.core.ISegmenter#reset() 120 | */ 121 | public void reset() { 122 | //清空队列 123 | this.tmpHits.clear(); 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ChatBotCourse 2 | ============== 3 | _读本人更多原创文章,欢迎关注微信订阅号_ 4 | 5 | SharEDITor 6 | 7 | _欢迎关注我的另外两个github项目_ 8 | * [_教你成为全栈工程师_](https://github.com/warmheartli/FullStackDeveloperCourse) 9 | * [_机器学习精简入门教程_](https://github.com/warmheartli/MachineLearningCourse) 10 | 11 | 自己动手做聊天机器人教程 12 | ============== 13 | * [自己动手做聊天机器人 一-涉及知识](http://www.shareditor.com/blogshow/?blogId=63)(2016-06-09) 14 | * [自己动手做聊天机器人 二-初识NLTK库](http://www.shareditor.com/blogshow/?blogId=64)(2016-06-10) 15 | * [自己动手做聊天机器人 三-语料与词汇资源](http://www.shareditor.com/blogshow/?blogId=65)(2016-06-12) 16 | * [自己动手做聊天机器人 四-何须动手?完全自动化对语料做词性标注](http://www.shareditor.com/blogshow/?blogId=67)(2016-06-17) 17 | * [自己动手做聊天机器人 五-自然语言处理中的文本分类](http://www.shareditor.com/blogshow/?blogId=69)(2016-06-21) 18 | * [自己动手做聊天机器人 六-教你怎么从一句话里提取出十句话的信息](http://www.shareditor.com/blogshow/?blogId=70)(2016-06-22) 19 | * [自己动手做聊天机器人 七-文法分析还是基于特征好啊](http://www.shareditor.com/blogshow/?blogId=71)(2016-06-23) 20 | * [自己动手做聊天机器人 八-重温自然语言处理](http://www.shareditor.com/blogshow/?blogId=72)(2016-06-24) 21 | * [自己动手做聊天机器人 九-聊天机器人应该怎么做](http://www.shareditor.com/blogshow/?blogId=73)(2016-06-25) 22 | * [自己动手做聊天机器人 十-半个小时搞定词性标注与关键词提取](http://www.shareditor.com/blogshow/?blogId=74)(2016-06-28) 23 | * [自己动手做聊天机器人 十一-0字节存储海量语料资源](http://www.shareditor.com/blogshow/?blogId=76)(2016-07-01) 24 | * [自己动手做聊天机器人 十二-教你如何利用强大的中文语言技术平台做依存句法和语义依存分析](http://www.shareditor.com/blogshow/?blogId=77)(2016-07-04) 25 | * [自己动手做聊天机器人 十三-把语言模型探究到底](http://www.shareditor.com/blogshow/?blogId=78)(2016-07-05) 26 | * [自己动手做聊天机器人 十四-探究中文分词的艺术](http://www.shareditor.com/blogshow/?blogId=80)(2016-07-06) 27 | * [自己动手做聊天机器人 十五-一篇文章读懂拿了图灵奖和诺贝尔奖的概率图模型](http://www.shareditor.com/blogshow/?blogId=81)(2016-07-09) 28 | * [自己动手做聊天机器人 十六-大话自然语言处理中的囊中取物](http://www.shareditor.com/blogshow/?blogId=82)(2016-07-09) 29 | * [自己动手做聊天机器人 十七-让机器做词性自动标注的具体方法](http://www.shareditor.com/blogshow/?blogId=86)(2016-07-15) 30 | * [自己动手做聊天机器人 十八-神奇算法之句法分析树的生成](http://www.shareditor.com/blogshow/?blogId=87)(2016-07-19) 31 | * [自己动手做聊天机器人 十九-机器人是怎么理解“日后再说”的](http://www.shareditor.com/blogshow/?blogId=88)(2016-07-21) 32 | * [自己动手做聊天机器人 二十-语义角色标注的基本方法](http://www.shareditor.com/blogshow/?blogId=89)(2016-07-22) 33 | * [自己动手做聊天机器人 二十一-比TF-IDF更好的隐含语义索引模型是个什么鬼](http://www.shareditor.com/blogshow/?blogId=90)(2016-07-26) 34 | * [自己动手做聊天机器人 二十二-神奇算法之人工神经网络](http://www.shareditor.com/blogshow/?blogId=92)(2016-08-01) 35 | * [自己动手做聊天机器人 二十三-用CNN做深度学习](http://www.shareditor.com/blogshow/?blogId=97)(2016-08-12) 36 | * [自己动手做聊天机器人 二十四-将深度学习应用到NLP](http://www.shareditor.com/blogshow/?blogId=99)(2016-08-18) 37 | * [自己动手做聊天机器人 二十五-google的文本挖掘深度学习工具word2vec的实现原理](http://www.shareditor.com/blogshow/?blogId=100)(2016-08-20) 38 | * [自己动手做聊天机器人 二十六-图解递归神经网络(RNN)](http://www.shareditor.com/blogshow/?blogId=103)(2016-08-25) 39 | * [自己动手做聊天机器人 二十七-用深度学习来做自动问答的一般方法](http://www.shareditor.com/blogshow/?blogId=104)(2016-08-26) 40 | * [自己动手做聊天机器人 
二十八-脑洞大开:基于美剧字幕的聊天语料库建设方案](http://www.shareditor.com/blogshow/?blogId=105)(2016-08-30) 41 | * [自己动手做聊天机器人 二十九-重磅:近1GB的三千万聊天语料供出](http://www.shareditor.com/blogshow/?blogId=112)(2016-09-18) 42 | * [自己动手做聊天机器人 三十-第一版聊天机器人诞生——吃了字幕长大的小二兔](http://www.shareditor.com/blogshow/?blogId=113)(2016-09-26) 43 | * [自己动手做聊天机器人 三十一-如何把网站流量导向小二兔机器人](http://www.shareditor.com/blogshow/?blogId=114)(2016-09-30) 44 | * [自己动手做聊天机器人 三十二-用三千万影视剧字幕语料库生成词向量](http://www.shareditor.com/blogshow/?blogId=115)(2016-10-10) 45 | * [自己动手做聊天机器人 三十三-两套代码详解LSTM-RNN——有记忆的神经网络](http://www.shareditor.com/blogshow/?blogId=116)(2016-10-13) 46 | * [自己动手做聊天机器人 三十四-最快的深度学习框架torch](http://www.shareditor.com/blogshow/?blogId=117)(2016-10-28) 47 | * [自己动手做聊天机器人 三十五-一个lstm单元让聊天机器人学会甄嬛体](http://www.shareditor.com/blogshow/?blogId=118)(2016-11-23) 48 | * [自己动手做聊天机器人 三十六-深入理解tensorflow的session和graph](http://www.shareditor.com/blogshow/?blogId=119)(2016-12-01) 49 | * [自己动手做聊天机器人 三十七-一张图了解tensorflow中的线性回归工作原理](http://www.shareditor.com/blogshow/?blogId=120)(2016-12-08) 50 | * [自己动手做聊天机器人 三十八-原来聊天机器人是这么做出来的](http://www.shareditor.com/blogshow/?blogId=121)(2017-01-10) 51 | * [自己动手做聊天机器人 三十九-满腔热血:在家里搭建一台GPU云服务共享给人工智能和大数据爱好者](http://www.shareditor.com/blogshow/?blogId=122)(2017-01-16) 52 | * [自己动手做聊天机器人 四十-视频教程之开篇宣言与知识点梳理](http://www.shareditor.com/blogshow/?blogId=124)(2017-03-05) 53 | * [自己动手做聊天机器人 四十一-视频教程之环境搭建与python基础](http://www.shareditor.com/blogshow/?blogId=125)(2017-03-31) 54 | -------------------------------------------------------------------------------- /lstm_code/iamtrask/lstm.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import copy, numpy as np 3 | np.random.seed(0) 4 | # compute sigmoid nonlinearity 5 | def sigmoid(x): 6 | output = 1/(1+np.exp(-x)) 7 | return output 8 | 9 | # convert output of sigmoid function to its derivative 10 | def sigmoid_output_to_derivative(output): 11 | return output*(1-output) 12 | 13 | 14 | # training dataset generation 15 | int2binary = {} 16 | binary_dim = 8 17 | 18 | largest_number = pow(2,binary_dim) 19 | binary = np.unpackbits( 20 | np.array([range(largest_number)],dtype=np.uint8).T,axis=1) 21 | for i in range(largest_number): 22 | int2binary[i] = binary[i] 23 | 24 | 25 | # input variables 26 | alpha = 0.1 27 | input_dim = 2 28 | hidden_dim = 16 29 | output_dim = 1 30 | 31 | 32 | # initialize neural network weights 33 | synapse_0 = 2*np.random.random((input_dim,hidden_dim)) - 1 34 | synapse_1 = 2*np.random.random((hidden_dim,output_dim)) - 1 35 | synapse_h = 2*np.random.random((hidden_dim,hidden_dim)) - 1 36 | 37 | synapse_0_update = np.zeros_like(synapse_0) 38 | synapse_1_update = np.zeros_like(synapse_1) 39 | synapse_h_update = np.zeros_like(synapse_h) 40 | 41 | # training logic 42 | for j in range(10000): 43 | 44 | # generate a simple addition problem (a + b = c) 45 | a_int = np.random.randint(largest_number/2) # int version 46 | a = int2binary[a_int] # binary encoding 47 | 48 | b_int = np.random.randint(largest_number/2) # int version 49 | b = int2binary[b_int] # binary encoding 50 | 51 | # true answer 52 | c_int = a_int + b_int 53 | c = int2binary[c_int] 54 | 55 | # where we'll store our best guess (binary encoded) 56 | d = np.zeros_like(c) 57 | 58 | overallError = 0 59 | 60 | layer_2_deltas = list() 61 | layer_1_values = list() 62 | layer_1_values.append(np.zeros(hidden_dim)) 63 | 64 | # moving along the positions in the binary encoding 65 | for position in range(binary_dim): 66 | 67 | # generate input and output 68 | X = 
np.array([[a[binary_dim - position - 1],b[binary_dim - position - 1]]]) 69 | y = np.array([[c[binary_dim - position - 1]]]).T 70 | 71 | # hidden layer (input ~+ prev_hidden) 72 | layer_1 = sigmoid(np.dot(X,synapse_0) + np.dot(layer_1_values[-1],synapse_h)) 73 | 74 | # output layer (new binary representation) 75 | layer_2 = sigmoid(np.dot(layer_1,synapse_1)) 76 | 77 | # did we miss?... if so by how much? 78 | layer_2_error = y - layer_2 79 | layer_2_deltas.append((layer_2_error)*sigmoid_output_to_derivative(layer_2)) 80 | overallError += np.abs(layer_2_error[0]) 81 | 82 | # decode estimate so we can print it out 83 | d[binary_dim - position - 1] = np.round(layer_2[0][0]) 84 | 85 | # store hidden layer so we can use it in the next timestep 86 | layer_1_values.append(copy.deepcopy(layer_1)) 87 | 88 | future_layer_1_delta = np.zeros(hidden_dim) 89 | 90 | for position in range(binary_dim): 91 | 92 | X = np.array([[a[position],b[position]]]) 93 | layer_1 = layer_1_values[-position-1] 94 | prev_layer_1 = layer_1_values[-position-2] 95 | 96 | # error at output layer 97 | layer_2_delta = layer_2_deltas[-position-1] 98 | # error at hidden layer 99 | layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) + \ 100 | layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1) 101 | # let's update all our weights so we can try again 102 | synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta) 103 | synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta) 104 | synapse_0_update += X.T.dot(layer_1_delta) 105 | 106 | future_layer_1_delta = layer_1_delta 107 | 108 | 109 | synapse_0 += synapse_0_update * alpha 110 | synapse_1 += synapse_1_update * alpha 111 | synapse_h += synapse_h_update * alpha 112 | 113 | synapse_0_update *= 0 114 | synapse_1_update *= 0 115 | synapse_h_update *= 0 116 | 117 | # print out progress 118 | if(j % 1000 == 0): 119 | print "Error:" + str(overallError) 120 | print "Pred:" + str(d) 121 | print "True:" + str(c) 122 | out = 0 123 | for index,x in enumerate(reversed(d)): 124 | out += x*pow(2,index) 125 | print str(a_int) + " + " + str(b_int) + " = " + str(out) 126 | print "------------" 127 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/core/IKArbitrator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Stack; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * IK分词歧义裁决器 32 | */ 33 | class IKArbitrator { 34 | 35 | IKArbitrator(){ 36 | 37 | } 38 | 39 | /** 40 | * 分词歧义处理 41 | * @param orgLexemes 42 | * @param useSmart 43 | */ 44 | void process(AnalyzeContext context , boolean useSmart){ 45 | QuickSortSet orgLexemes = context.getOrgLexemes(); 46 | Lexeme orgLexeme = orgLexemes.pollFirst(); 47 | 48 | LexemePath crossPath = new LexemePath(); 49 | while(orgLexeme != null){ 50 | if(!crossPath.addCrossLexeme(orgLexeme)){ 51 | //找到与crossPath不相交的下一个crossPath 52 | if(crossPath.size() == 1 || !useSmart){ 53 | //crossPath没有歧义 或者 不做歧义处理 54 | //直接输出当前crossPath 55 | context.addLexemePath(crossPath); 56 | }else{ 57 | //对当前的crossPath进行歧义处理 58 | QuickSortSet.Cell headCell = crossPath.getHead(); 59 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 60 | //输出歧义处理结果judgeResult 61 | context.addLexemePath(judgeResult); 62 | } 63 | 64 | //把orgLexeme加入新的crossPath中 65 | crossPath = new LexemePath(); 66 | crossPath.addCrossLexeme(orgLexeme); 67 | } 68 | orgLexeme = orgLexemes.pollFirst(); 69 | } 70 | 71 | 72 | //处理最后的path 73 | if(crossPath.size() == 1 || !useSmart){ 74 | //crossPath没有歧义 或者 不做歧义处理 75 | //直接输出当前crossPath 76 | context.addLexemePath(crossPath); 77 | }else{ 78 | //对当前的crossPath进行歧义处理 79 | QuickSortSet.Cell headCell = crossPath.getHead(); 80 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 81 | //输出歧义处理结果judgeResult 82 | context.addLexemePath(judgeResult); 83 | } 84 | } 85 | 86 | /** 87 | * 歧义识别 88 | * @param lexemeCell 歧义路径链表头 89 | * @param fullTextLength 歧义路径文本长度 90 | * @param option 候选结果路径 91 | * @return 92 | */ 93 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){ 94 | //候选路径集合 95 | TreeSet pathOptions = new TreeSet(); 96 | //候选结果路径 97 | LexemePath option = new LexemePath(); 98 | 99 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 100 | Stack lexemeStack = this.forwardPath(lexemeCell , option); 101 | 102 | //当前词元链并非最理想的,加入候选路径集合 103 | pathOptions.add(option.copy()); 104 | 105 | //存在歧义词,处理 106 | QuickSortSet.Cell c = null; 107 | while(!lexemeStack.isEmpty()){ 108 | c = lexemeStack.pop(); 109 | //回滚词元链 110 | this.backPath(c.getLexeme() , option); 111 | //从歧义词位置开始,递归,生成可选方案 112 | this.forwardPath(c , option); 113 | pathOptions.add(option.copy()); 114 | } 115 | 116 | //返回集合中的最优方案 117 | return pathOptions.first(); 118 | 119 | } 120 | 121 | /** 122 | * 向前遍历,添加词元,构造一个无歧义词元组合 123 | * @param LexemePath path 124 | * @return 125 | */ 126 | private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){ 127 | //发生冲突的Lexeme栈 128 | Stack conflictStack = new Stack(); 129 | QuickSortSet.Cell c = lexemeCell; 130 | //迭代遍历Lexeme链表 131 | while(c != null && c.getLexeme() != null){ 132 | if(!option.addNotCrossLexeme(c.getLexeme())){ 133 | //词元交叉,添加失败则加入lexemeStack栈 134 | conflictStack.push(c); 135 | } 136 | c = c.getNext(); 137 | } 138 | return conflictStack; 139 | } 140 | 141 | /** 142 | * 回滚词元链,直到它能够接受指定的词元 143 | * @param lexeme 144 | * @param l 145 | */ 146 | private void backPath(Lexeme l , LexemePath option){ 147 | while(option.checkCross(l)){ 148 | option.removeTail(); 149 | } 150 | 151 | } 152 | 153 | } 154 | -------------------------------------------------------------------------------- 
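A small driver makes it easier to see how IKSegmenter, the sub-segmenters and IKArbitrator above fit together. The sketch below is illustrative only (the class name and sample sentence are hypothetical, and it is not one of the bundled sample files), but it uses only the IKSegmenter and Lexeme calls defined in this repository:

// Illustrative sketch; SegmenterDriver and the sample sentence are hypothetical.
import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class SegmenterDriver {
    public static void main(String[] args) throws IOException {
        String text = "自己动手做聊天机器人";
        // true = smart mode: merge numerals with quantifiers and let IKArbitrator
        // resolve overlapping segmentations; false = finest-grained output
        IKSegmenter segmenter = new IKSegmenter(new StringReader(text), true);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            System.out.println(lexeme.getLexemeText() + " "
                    + lexeme.getBeginPosition() + "-" + lexeme.getEndPosition());
        }
    }
}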
/chatbotv1/src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.cfg; 27 | 28 | import java.io.IOException; 29 | import java.io.InputStream; 30 | import java.util.ArrayList; 31 | import java.util.InvalidPropertiesFormatException; 32 | import java.util.List; 33 | import java.util.Properties; 34 | 35 | /** 36 | * Configuration 默认实现 37 | * 2012-5-8 38 | * 39 | */ 40 | public class DefaultConfig implements Configuration{ 41 | 42 | /* 43 | * 分词器默认字典路径 44 | */ 45 | private static final String PATH_DIC_MAIN = "main2012.dic"; 46 | private static final String PATH_DIC_QUANTIFIER = "quantifier.dic"; 47 | 48 | /* 49 | * 分词器配置文件路径 50 | */ 51 | private static final String FILE_NAME = "IKAnalyzer.cfg.xml"; 52 | //配置属性——扩展字典 53 | private static final String EXT_DICT = "ext_dict"; 54 | //配置属性——扩展停止词典 55 | private static final String EXT_STOP = "ext_stopwords"; 56 | 57 | private Properties props; 58 | /* 59 | * 是否使用smart方式分词 60 | */ 61 | private boolean useSmart; 62 | 63 | /** 64 | * 返回单例 65 | * @return Configuration单例 66 | */ 67 | public static Configuration getInstance(){ 68 | return new DefaultConfig(); 69 | } 70 | 71 | /* 72 | * 初始化配置文件 73 | */ 74 | private DefaultConfig(){ 75 | props = new Properties(); 76 | 77 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME); 78 | if(input != null){ 79 | try { 80 | props.loadFromXML(input); 81 | } catch (InvalidPropertiesFormatException e) { 82 | e.printStackTrace(); 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | } 86 | } 87 | } 88 | 89 | 90 | /** 91 | * 返回useSmart标志位 92 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 93 | * @return useSmart 94 | */ 95 | public boolean useSmart() { 96 | return useSmart; 97 | } 98 | 99 | /** 100 | * 设置useSmart标志位 101 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 102 | * @param useSmart 103 | */ 104 | public void setUseSmart(boolean useSmart) { 105 | this.useSmart = useSmart; 106 | } 107 | 108 | /** 109 | * 获取主词典路径 110 | * 111 | * @return String 主词典路径 112 | */ 113 | public String getMainDictionary(){ 114 | return PATH_DIC_MAIN; 115 | } 116 | 117 | /** 118 | * 获取量词词典路径 119 | * @return String 量词词典路径 120 | */ 121 | public String getQuantifierDicionary(){ 122 | return PATH_DIC_QUANTIFIER; 123 | } 124 | 125 | /** 126 | * 获取扩展字典配置路径 127 | * @return List 相对类加载器的路径 128 | */ 129 | public List getExtDictionarys(){ 130 | List 
extDictFiles = new ArrayList(2); 131 | String extDictCfg = props.getProperty(EXT_DICT); 132 | if(extDictCfg != null){ 133 | //使用;分割多个扩展字典配置 134 | String[] filePaths = extDictCfg.split(";"); 135 | if(filePaths != null){ 136 | for(String filePath : filePaths){ 137 | if(filePath != null && !"".equals(filePath.trim())){ 138 | extDictFiles.add(filePath.trim()); 139 | } 140 | } 141 | } 142 | } 143 | return extDictFiles; 144 | } 145 | 146 | 147 | /** 148 | * 获取扩展停止词典配置路径 149 | * @return List 相对类加载器的路径 150 | */ 151 | public List getExtStopWordDictionarys(){ 152 | List extStopWordDictFiles = new ArrayList(2); 153 | String extStopWordDictCfg = props.getProperty(EXT_STOP); 154 | if(extStopWordDictCfg != null){ 155 | //使用;分割多个扩展字典配置 156 | String[] filePaths = extStopWordDictCfg.split(";"); 157 | if(filePaths != null){ 158 | for(String filePath : filePaths){ 159 | if(filePath != null && !"".equals(filePath.trim())){ 160 | extStopWordDictFiles.add(filePath.trim()); 161 | } 162 | } 163 | } 164 | } 165 | return extStopWordDictFiles; 166 | } 167 | 168 | 169 | } 170 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/core/IKSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | */ 24 | package org.wltea.analyzer.core; 25 | 26 | import java.io.IOException; 27 | import java.io.Reader; 28 | import java.util.ArrayList; 29 | import java.util.List; 30 | 31 | import org.wltea.analyzer.cfg.Configuration; 32 | import org.wltea.analyzer.cfg.DefaultConfig; 33 | import org.wltea.analyzer.dic.Dictionary; 34 | 35 | /** 36 | * IK分词器主类 37 | * 38 | */ 39 | public final class IKSegmenter { 40 | 41 | //字符窜reader 42 | private Reader input; 43 | //分词器配置项 44 | private Configuration cfg; 45 | //分词器上下文 46 | private AnalyzeContext context; 47 | //分词处理器列表 48 | private List segmenters; 49 | //分词歧义裁决器 50 | private IKArbitrator arbitrator; 51 | 52 | 53 | /** 54 | * IK分词器构造函数 55 | * @param input 56 | * @param useSmart 为true,使用智能分词策略 57 | * 58 | * 非智能分词:细粒度输出所有可能的切分结果 59 | * 智能分词: 合并数词和量词,对分词结果进行歧义判断 60 | */ 61 | public IKSegmenter(Reader input , boolean useSmart){ 62 | this.input = input; 63 | this.cfg = DefaultConfig.getInstance(); 64 | this.cfg.setUseSmart(useSmart); 65 | this.init(); 66 | } 67 | 68 | /** 69 | * IK分词器构造函数 70 | * @param input 71 | * @param cfg 使用自定义的Configuration构造分词器 72 | * 73 | */ 74 | public IKSegmenter(Reader input , Configuration cfg){ 75 | this.input = input; 76 | this.cfg = cfg; 77 | this.init(); 78 | } 79 | 80 | /** 81 | * 初始化 82 | */ 83 | private void init(){ 84 | //初始化词典单例 85 | Dictionary.initial(this.cfg); 86 | //初始化分词上下文 87 | this.context = new AnalyzeContext(this.cfg); 88 | //加载子分词器 89 | this.segmenters = this.loadSegmenters(); 90 | //加载歧义裁决器 91 | this.arbitrator = new IKArbitrator(); 92 | } 93 | 94 | /** 95 | * 初始化词典,加载子分词器实现 96 | * @return List 97 | */ 98 | private List loadSegmenters(){ 99 | List segmenters = new ArrayList(4); 100 | //处理字母的子分词器 101 | segmenters.add(new LetterSegmenter()); 102 | //处理中文数量词的子分词器 103 | segmenters.add(new CN_QuantifierSegmenter()); 104 | //处理中文词的子分词器 105 | segmenters.add(new CJKSegmenter()); 106 | return segmenters; 107 | } 108 | 109 | /** 110 | * 分词,获取下一个词元 111 | * @return Lexeme 词元对象 112 | * @throws IOException 113 | */ 114 | public synchronized Lexeme next()throws IOException{ 115 | Lexeme l = null; 116 | while((l = context.getNextLexeme()) == null ){ 117 | /* 118 | * 从reader中读取数据,填充buffer 119 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理 120 | * 移位处理上次读入的但未处理的数据 121 | */ 122 | int available = context.fillBuffer(this.input); 123 | if(available <= 0){ 124 | //reader已经读完 125 | context.reset(); 126 | return null; 127 | 128 | }else{ 129 | //初始化指针 130 | context.initCursor(); 131 | do{ 132 | //遍历子分词器 133 | for(ISegmenter segmenter : segmenters){ 134 | segmenter.analyze(context); 135 | } 136 | //字符缓冲区接近读完,需要读入新的字符 137 | if(context.needRefillBuffer()){ 138 | break; 139 | } 140 | //向前移动指针 141 | }while(context.moveCursor()); 142 | //重置子分词器,为下轮循环进行初始化 143 | for(ISegmenter segmenter : segmenters){ 144 | segmenter.reset(); 145 | } 146 | } 147 | //对分词进行歧义处理 148 | this.arbitrator.process(context, this.cfg.useSmart()); 149 | //将分词结果输出到结果集,并处理未切分的单个CJK字符 150 | context.outputToResult(); 151 | //记录本次分词的缓冲区位移 152 | context.markBufferOffset(); 153 | } 154 | return l; 155 | } 156 | 157 | /** 158 | * 重置分词器到初始状态 159 | * @param input 160 | */ 161 | public synchronized void reset(Reader input) { 162 | this.input = input; 163 | context.reset(); 164 | for(ISegmenter segmenter : segmenters){ 165 | segmenter.reset(); 166 | } 167 | } 168 | } 169 | 
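// next() in a nutshell: getNextLexeme() drains already-buffered results; when it
// returns null, the segmenter refills the character buffer from the Reader and
// moves the cursor while each sub-segmenter (LetterSegmenter,
// CN_QuantifierSegmenter, CJKSegmenter) emits candidate lexemes, then
// IKArbitrator.process() resolves overlapping candidates (only when useSmart is
// true) before the results are written back for output.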
-------------------------------------------------------------------------------- /word2vec/distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | //#include 19 | #include 20 | 21 | const long long max_size = 2000; // max length of strings 22 | const long long N = 40; // number of closest words that will be shown 23 | const long long max_w = 50; // max length of vocabulary entries 24 | 25 | int main(int argc, char **argv) { 26 | FILE *f; 27 | char st1[max_size]; 28 | char *bestw[N]; 29 | char file_name[max_size], st[100][max_size]; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, cn, bi[100]; 32 | char ch; 33 | float *M; 34 | char *vocab; 35 | if (argc < 2) { 36 | printf("Usage: ./distance \nwhere FILE contains word projections in the BINARY FORMAT\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | f = fopen(file_name, "rb"); 41 | if (f == NULL) { 42 | printf("Input file not found\n"); 43 | return -1; 44 | } 45 | fscanf(f, "%lld", &words); 46 | fscanf(f, "%lld", &size); 47 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 48 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 49 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 50 | if (M == NULL) { 51 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 52 | return -1; 53 | } 54 | for (b = 0; b < words; b++) { 55 | a = 0; 56 | while (1) { 57 | vocab[b * max_w + a] = fgetc(f); 58 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 59 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 60 | } 61 | vocab[b * max_w + a] = 0; 62 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 63 | len = 0; 64 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 65 | len = sqrt(len); 66 | for (a = 0; a < size; a++) M[a + b * size] /= len; 67 | } 68 | fclose(f); 69 | while (1) { 70 | for (a = 0; a < N; a++) bestd[a] = 0; 71 | for (a = 0; a < N; a++) bestw[a][0] = 0; 72 | printf("Enter word or sentence (EXIT to break): "); 73 | a = 0; 74 | while (1) { 75 | st1[a] = fgetc(stdin); 76 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 77 | st1[a] = 0; 78 | break; 79 | } 80 | a++; 81 | } 82 | if (!strcmp(st1, "EXIT")) break; 83 | cn = 0; 84 | b = 0; 85 | c = 0; 86 | while (1) { 87 | st[cn][b] = st1[c]; 88 | b++; 89 | c++; 90 | st[cn][b] = 0; 91 | if (st1[c] == 0) break; 92 | if (st1[c] == ' ') { 93 | cn++; 94 | b = 0; 95 | c++; 96 | } 97 | } 98 | cn++; 99 | for (a = 0; a < cn; a++) { 100 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 101 | if (b == words) b = -1; 102 | bi[a] = b; 103 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 104 | if (b == 
-1) { 105 | printf("Out of dictionary word!\n"); 106 | break; 107 | } 108 | } 109 | if (b == -1) continue; 110 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 111 | for (a = 0; a < size; a++) vec[a] = 0; 112 | for (b = 0; b < cn; b++) { 113 | if (bi[b] == -1) continue; 114 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 115 | } 116 | len = 0; 117 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 118 | len = sqrt(len); 119 | for (a = 0; a < size; a++) vec[a] /= len; 120 | for (a = 0; a < N; a++) bestd[a] = -1; 121 | for (a = 0; a < N; a++) bestw[a][0] = 0; 122 | for (c = 0; c < words; c++) { 123 | a = 0; 124 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 125 | if (a == 1) continue; 126 | dist = 0; 127 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 128 | for (a = 0; a < N; a++) { 129 | if (dist > bestd[a]) { 130 | for (d = N - 1; d > a; d--) { 131 | bestd[d] = bestd[d - 1]; 132 | strcpy(bestw[d], bestw[d - 1]); 133 | } 134 | bestd[a] = dist; 135 | strcpy(bestw[a], &vocab[c * max_w]); 136 | break; 137 | } 138 | } 139 | } 140 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 141 | } 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.query; 26 | 27 | import java.io.IOException; 28 | import java.io.StringReader; 29 | import java.util.ArrayList; 30 | import java.util.List; 31 | 32 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 33 | import org.apache.lucene.queryparser.classic.ParseException; 34 | import org.apache.lucene.queryparser.classic.QueryParser; 35 | import org.apache.lucene.search.Query; 36 | import org.apache.lucene.util.Version; 37 | import org.wltea.analyzer.core.IKSegmenter; 38 | import org.wltea.analyzer.core.Lexeme; 39 | 40 | /** 41 | * Single Word Multi Char Query Builder 42 | * IK分词算法专用 43 | * @author linliangyi 44 | * 45 | */ 46 | public class SWMCQueryBuilder { 47 | 48 | /** 49 | * 生成SWMCQuery 50 | * @param fieldName 51 | * @param keywords 52 | * @param quickMode 53 | * @return Lucene Query 54 | */ 55 | public static Query create(String fieldName ,String keywords , boolean quickMode){ 56 | if(fieldName == null || keywords == null){ 57 | throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null."); 58 | } 59 | //1.对keywords进行分词处理 60 | List lexemes = doAnalyze(keywords); 61 | //2.根据分词结果,生成SWMCQuery 62 | Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode); 63 | return _SWMCQuery; 64 | } 65 | 66 | /** 67 | * 分词切分,并返回结链表 68 | * @param keywords 69 | * @return 70 | */ 71 | private static List doAnalyze(String keywords){ 72 | List lexemes = new ArrayList(); 73 | IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true); 74 | try{ 75 | Lexeme l = null; 76 | while( (l = ikSeg.next()) != null){ 77 | lexemes.add(l); 78 | } 79 | }catch(IOException e){ 80 | e.printStackTrace(); 81 | } 82 | return lexemes; 83 | } 84 | 85 | 86 | /** 87 | * 根据分词结果生成SWMC搜索 88 | * @param fieldName 89 | * @param pathOption 90 | * @param quickMode 91 | * @return 92 | */ 93 | private static Query getSWMCQuery(String fieldName , List lexemes , boolean quickMode){ 94 | //构造SWMC的查询表达式 95 | StringBuffer keywordBuffer = new StringBuffer(); 96 | //精简的SWMC的查询表达式 97 | StringBuffer keywordBuffer_Short = new StringBuffer(); 98 | //记录最后词元长度 99 | int lastLexemeLength = 0; 100 | //记录最后词元结束位置 101 | int lastLexemeEnd = -1; 102 | 103 | int shortCount = 0; 104 | int totalCount = 0; 105 | for(Lexeme l : lexemes){ 106 | totalCount += l.getLength(); 107 | //精简表达式 108 | if(l.getLength() > 1){ 109 | keywordBuffer_Short.append(' ').append(l.getLexemeText()); 110 | shortCount += l.getLength(); 111 | } 112 | 113 | if(lastLexemeLength == 0){ 114 | keywordBuffer.append(l.getLexemeText()); 115 | }else if(lastLexemeLength == 1 && l.getLength() == 1 116 | && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并) 117 | keywordBuffer.append(l.getLexemeText()); 118 | }else{ 119 | keywordBuffer.append(' ').append(l.getLexemeText()); 120 | 121 | } 122 | lastLexemeLength = l.getLength(); 123 | lastLexemeEnd = l.getEndPosition(); 124 | } 125 | 126 | //借助lucene queryparser 生成SWMC Query 127 | QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40)); 128 | qp.setDefaultOperator(QueryParser.AND_OPERATOR); 129 | qp.setAutoGeneratePhraseQueries(true); 130 | 131 | if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){ 132 | try { 133 | //System.out.println(keywordBuffer.toString()); 134 | Query q = qp.parse(keywordBuffer_Short.toString()); 135 | return q; 136 | } catch (ParseException e) { 137 | 
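// A parse failure on the shortened expression is only logged here; execution
// falls through to the final "return null" at the end of getSWMCQuery().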
e.printStackTrace(); 138 | } 139 | 140 | }else{ 141 | if(keywordBuffer.length() > 0){ 142 | try { 143 | //System.out.println(keywordBuffer.toString()); 144 | Query q = qp.parse(keywordBuffer.toString()); 145 | return q; 146 | } catch (ParseException e) { 147 | e.printStackTrace(); 148 | } 149 | } 150 | } 151 | return null; 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.sample; 27 | 28 | import java.io.IOException; 29 | 30 | import org.apache.lucene.analysis.Analyzer; 31 | import org.apache.lucene.document.Document; 32 | import org.apache.lucene.document.Field; 33 | import org.apache.lucene.document.StringField; 34 | import org.apache.lucene.document.TextField; 35 | import org.apache.lucene.index.CorruptIndexException; 36 | import org.apache.lucene.index.DirectoryReader; 37 | import org.apache.lucene.index.IndexReader; 38 | import org.apache.lucene.index.IndexWriter; 39 | import org.apache.lucene.index.IndexWriterConfig; 40 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 41 | import org.apache.lucene.queryparser.classic.ParseException; 42 | import org.apache.lucene.queryparser.classic.QueryParser; 43 | import org.apache.lucene.search.IndexSearcher; 44 | import org.apache.lucene.search.Query; 45 | import org.apache.lucene.search.ScoreDoc; 46 | import org.apache.lucene.search.TopDocs; 47 | import org.apache.lucene.store.Directory; 48 | import org.apache.lucene.store.LockObtainFailedException; 49 | import org.apache.lucene.store.RAMDirectory; 50 | import org.apache.lucene.util.Version; 51 | import org.wltea.analyzer.lucene.IKAnalyzer; 52 | 53 | 54 | 55 | 56 | /** 57 | * 使用IKAnalyzer进行Lucene索引和查询的演示 58 | * 2012-3-2 59 | * 60 | * 以下是结合Lucene4.0 API的写法 61 | * 62 | */ 63 | public class LuceneIndexAndSearchDemo { 64 | 65 | 66 | /** 67 | * 模拟: 68 | * 创建一个单条记录的索引,并对其进行搜索 69 | * @param args 70 | */ 71 | public static void main(String[] args){ 72 | //Lucene Document的域名 73 | String fieldName = "text"; 74 | //检索内容 75 | String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"; 76 | 77 | //实例化IKAnalyzer分词器 78 | Analyzer analyzer = new IKAnalyzer(true); 79 | 80 | Directory directory = null; 81 | IndexWriter iwriter = null; 82 | IndexReader ireader = null; 83 | IndexSearcher isearcher = 
null; 84 | try { 85 | //建立内存索引对象 86 | directory = new RAMDirectory(); 87 | 88 | //配置IndexWriterConfig 89 | IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer); 90 | iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); 91 | iwriter = new IndexWriter(directory , iwConfig); 92 | //写入索引 93 | Document doc = new Document(); 94 | doc.add(new StringField("ID", "10000", Field.Store.YES)); 95 | doc.add(new TextField(fieldName, text, Field.Store.YES)); 96 | iwriter.addDocument(doc); 97 | iwriter.close(); 98 | 99 | 100 | //搜索过程********************************** 101 | //实例化搜索器 102 | ireader = DirectoryReader.open(directory); 103 | isearcher = new IndexSearcher(ireader); 104 | 105 | String keyword = "中文分词工具包"; 106 | //使用QueryParser查询分析器构造Query对象 107 | QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer); 108 | qp.setDefaultOperator(QueryParser.AND_OPERATOR); 109 | Query query = qp.parse(keyword); 110 | System.out.println("Query = " + query); 111 | 112 | //搜索相似度最高的5条记录 113 | TopDocs topDocs = isearcher.search(query , 5); 114 | System.out.println("命中:" + topDocs.totalHits); 115 | //输出结果 116 | ScoreDoc[] scoreDocs = topDocs.scoreDocs; 117 | for (int i = 0; i < topDocs.totalHits; i++){ 118 | Document targetDoc = isearcher.doc(scoreDocs[i].doc); 119 | System.out.println("内容:" + targetDoc.toString()); 120 | } 121 | 122 | } catch (CorruptIndexException e) { 123 | e.printStackTrace(); 124 | } catch (LockObtainFailedException e) { 125 | e.printStackTrace(); 126 | } catch (IOException e) { 127 | e.printStackTrace(); 128 | } catch (ParseException e) { 129 | e.printStackTrace(); 130 | } finally{ 131 | if(ireader != null){ 132 | try { 133 | ireader.close(); 134 | } catch (IOException e) { 135 | e.printStackTrace(); 136 | } 137 | } 138 | if(directory != null){ 139 | try { 140 | directory.close(); 141 | } catch (IOException e) { 142 | e.printStackTrace(); 143 | } 144 | } 145 | } 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /word2vec/word-analogy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | //#include 19 | #include 20 | 21 | const long long max_size = 2000; // max length of strings 22 | const long long N = 40; // number of closest words that will be shown 23 | const long long max_w = 50; // max length of vocabulary entries 24 | 25 | int main(int argc, char **argv) { 26 | FILE *f; 27 | char st1[max_size]; 28 | char bestw[N][max_size]; 29 | char file_name[max_size], st[100][max_size]; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, cn, bi[100]; 32 | char ch; 33 | float *M; 34 | char *vocab; 35 | if (argc < 2) { 36 | printf("Usage: ./word-analogy \nwhere FILE contains word projections in the BINARY FORMAT\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | f = fopen(file_name, "rb"); 41 | if (f == NULL) { 42 | printf("Input file not found\n"); 43 | return -1; 44 | } 45 | fscanf(f, "%lld", &words); 46 | fscanf(f, "%lld", &size); 47 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 49 | if (M == NULL) { 50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 51 | return -1; 52 | } 53 | for (b = 0; b < words; b++) { 54 | a = 0; 55 | while (1) { 56 | vocab[b * max_w + a] = fgetc(f); 57 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 58 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 59 | } 60 | vocab[b * max_w + a] = 0; 61 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 62 | len = 0; 63 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 64 | len = sqrt(len); 65 | for (a = 0; a < size; a++) M[a + b * size] /= len; 66 | } 67 | fclose(f); 68 | while (1) { 69 | for (a = 0; a < N; a++) bestd[a] = 0; 70 | for (a = 0; a < N; a++) bestw[a][0] = 0; 71 | printf("Enter three words (EXIT to break): "); 72 | a = 0; 73 | while (1) { 74 | st1[a] = fgetc(stdin); 75 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 76 | st1[a] = 0; 77 | break; 78 | } 79 | a++; 80 | } 81 | if (!strcmp(st1, "EXIT")) break; 82 | cn = 0; 83 | b = 0; 84 | c = 0; 85 | while (1) { 86 | st[cn][b] = st1[c]; 87 | b++; 88 | c++; 89 | st[cn][b] = 0; 90 | if (st1[c] == 0) break; 91 | if (st1[c] == ' ') { 92 | cn++; 93 | b = 0; 94 | c++; 95 | } 96 | } 97 | cn++; 98 | if (cn < 3) { 99 | printf("Only %lld words were entered.. 
three words are needed at the input to perform the calculation\n", cn); 100 | continue; 101 | } 102 | for (a = 0; a < cn; a++) { 103 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 104 | if (b == words) b = 0; 105 | bi[a] = b; 106 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 107 | if (b == 0) { 108 | printf("Out of dictionary word!\n"); 109 | break; 110 | } 111 | } 112 | if (b == 0) continue; 113 | printf("\n Word Distance\n------------------------------------------------------------------------\n"); 114 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size]; 115 | len = 0; 116 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 117 | len = sqrt(len); 118 | for (a = 0; a < size; a++) vec[a] /= len; 119 | for (a = 0; a < N; a++) bestd[a] = 0; 120 | for (a = 0; a < N; a++) bestw[a][0] = 0; 121 | for (c = 0; c < words; c++) { 122 | if (c == bi[0]) continue; 123 | if (c == bi[1]) continue; 124 | if (c == bi[2]) continue; 125 | a = 0; 126 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 127 | if (a == 1) continue; 128 | dist = 0; 129 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 130 | for (a = 0; a < N; a++) { 131 | if (dist > bestd[a]) { 132 | for (d = N - 1; d > a; d--) { 133 | bestd[d] = bestd[d - 1]; 134 | strcpy(bestw[d], bestw[d - 1]); 135 | } 136 | bestd[a] = dist; 137 | strcpy(bestw[a], &vocab[c * max_w]); 138 | break; 139 | } 140 | } 141 | } 142 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 143 | } 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /word2vec/demo-train-big-model-v1.sh: -------------------------------------------------------------------------------- 1 | ############################################################################################### 2 | # 3 | # Script for training good word and phrase vector model using public corpora, version 1.0. 4 | # The training time will be from several hours to about a day. 5 | # 6 | # Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains 7 | # a 500-dimensional vector model and evaluates it on word and phrase analogy tasks. 8 | # 9 | ############################################################################################### 10 | 11 | # This function will convert text to lowercase and remove special characters 12 | normalize_text() { 13 | awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ 14 | -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/
/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ 15 | -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ 16 | -e 's/«/ /g' | tr 0-9 " " 17 | } 18 | 19 | mkdir word2vec 20 | cd word2vec 21 | 22 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 23 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz 24 | gzip -d news.2012.en.shuffled.gz 25 | gzip -d news.2013.en.shuffled.gz 26 | normalize_text < news.2012.en.shuffled > data.txt 27 | normalize_text < news.2013.en.shuffled >> data.txt 28 | 29 | wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz 30 | tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz 31 | for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do 32 | normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt 33 | done 34 | 35 | wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus 36 | tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt 37 | for i in `ls webbase_all`; do 38 | normalize_text < webbase_all/$i >> data.txt 39 | done 40 | 41 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 42 | bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e ' 43 | # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase 44 | # letters (a-z, converted from A-Z), and spaces (never consecutive)... 45 | # All other characters are converted to spaces. Only text which normally appears. 46 | # in the web browser is displayed. Tables are removed. Image captions are. 47 | # preserved. Links are converted to normal text. Digits are spelled out. 48 | # *** Modified to not spell digits or throw away non-ASCII characters *** 49 | 50 | # Written by Matt Mahoney, June 10, 2006. This program is released to the public domain. 51 | 52 | $/=">"; # input record separator 53 | while (<>) { 54 | if (/ ... 55 | if (/#redirect/i) {$text=0;} # remove #REDIRECT 56 | if ($text) { 57 | 58 | # Remove any text not normally visible 59 | if (/<\/text>/) {$text=0;} 60 | s/<.*>//; # remove xml tags 61 | s/&/&/g; # decode URL encoded chars 62 | s/<//g; 64 | s///g; # remove references ... 
65 | s/<[^>]*>//g; # remove xhtml tags 66 | s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text 67 | s/\|thumb//ig; # remove images links, preserve caption 68 | s/\|left//ig; 69 | s/\|right//ig; 70 | s/\|\d+px//ig; 71 | s/\[\[image:[^\[\]]*\|//ig; 72 | s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup 73 | s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages 74 | s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text 75 | s/{{[^}]*}}//g; # remove {{icons}} and {tables} 76 | s/{[^}]*}//g; 77 | s/\[//g; # remove [ and ] 78 | s/\]//g; 79 | s/&[^;]*;/ /g; # remove URL encoded chars 80 | 81 | $_=" $_ "; 82 | chop; 83 | print $_; 84 | } 85 | } 86 | ' | normalize_text | awk '{if (NF>1) print;}' >> data.txt 87 | 88 | wget http://word2vec.googlecode.com/svn/trunk/word2vec.c 89 | wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c 90 | wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c 91 | wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt 92 | wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt 93 | gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops 94 | gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops 95 | gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops 96 | ./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2 97 | ./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2 98 | ./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10 99 | ./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions 100 | ./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage 101 | -------------------------------------------------------------------------------- /word2vec/compute-accuracy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
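//
// compute-accuracy loads a word2vec model stored in binary format, normalizes every
// word vector to unit length, and answers analogy questions "a is to b as c is to ?"
// read from stdin (e.g. questions-words.txt), printing top-1, semantic and syntactic
// accuracy per section. The optional second argument truncates the vocabulary for a
// faster, approximate evaluation (typical value is 30000). Example run, as used in
// demo-train-big-model-v1.sh above:
//   ./compute-accuracy vectors.bin 400000 < questions-words.txt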
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | //#include 20 | #include 21 | #include 22 | 23 | const long long max_size = 2000; // max length of strings 24 | const long long N = 1; // number of closest words 25 | const long long max_w = 50; // max length of vocabulary entries 26 | 27 | int main(int argc, char **argv) 28 | { 29 | FILE *f; 30 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; 31 | float dist, len, bestd[N], vec[max_size]; 32 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 33 | float *M; 34 | char *vocab; 35 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; 36 | if (argc < 2) { 37 | printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); 38 | return 0; 39 | } 40 | strcpy(file_name, argv[1]); 41 | if (argc > 2) threshold = atoi(argv[2]); 42 | f = fopen(file_name, "rb"); 43 | if (f == NULL) { 44 | printf("Input file not found\n"); 45 | return -1; 46 | } 47 | fscanf(f, "%lld", &words); 48 | if (threshold) if (words > threshold) words = threshold; 49 | fscanf(f, "%lld", &size); 50 | vocab = (char *)malloc(words * max_w * sizeof(char)); 51 | M = (float *)malloc(words * size * sizeof(float)); 52 | if (M == NULL) { 53 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); 54 | return -1; 55 | } 56 | for (b = 0; b < words; b++) { 57 | a = 0; 58 | while (1) { 59 | vocab[b * max_w + a] = fgetc(f); 60 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 61 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 62 | } 63 | vocab[b * max_w + a] = 0; 64 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 65 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 66 | len = 0; 67 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 68 | len = sqrt(len); 69 | for (a = 0; a < size; a++) M[a + b * size] /= len; 70 | } 71 | fclose(f); 72 | TCN = 0; 73 | while (1) { 74 | for (a = 0; a < N; a++) bestd[a] = 0; 75 | for (a = 0; a < N; a++) bestw[a][0] = 0; 76 | scanf("%s", st1); 77 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 78 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 79 | if (TCN == 0) TCN = 1; 80 | if (QID != 0) { 81 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); 82 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); 83 | } 84 | QID++; 85 | scanf("%s", st1); 86 | if (feof(stdin)) break; 87 | printf("%s:\n", st1); 88 | TCN = 0; 89 | CCN = 0; 90 | continue; 91 | } 92 | if (!strcmp(st1, "EXIT")) break; 93 | scanf("%s", st2); 94 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 95 | scanf("%s", st3); 96 | for (a = 0; a bestd[a]) { 123 | for (d = N - 1; d > a; d--) { 124 | bestd[d] = bestd[d - 1]; 125 | strcpy(bestw[d], bestw[d - 1]); 126 | } 127 | bestd[a] = dist; 128 | strcpy(bestw[a], &vocab[c * max_w]); 129 | break; 130 | } 131 | } 132 | } 133 | if (!strcmp(st4, bestw[0])) { 134 | CCN++; 135 | CACN++; 136 | if (QID <= 5) SEAC++; else SYAC++; 137 | } 138 | if (QID <= 5) SECN++; else SYCN++; 139 | TCN++; 140 | TACN++; 141 | } 142 | printf("Questions seen / total: %d %d %.2f %% \n", 
TQS, TQ, TQS/(float)TQ*100); 143 | return 0; 144 | } 145 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/core/QuickSortSet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | /** 28 | * IK分词器专用的Lexem快速排序集合 29 | */ 30 | class QuickSortSet { 31 | //链表头 32 | private Cell head; 33 | //链表尾 34 | private Cell tail; 35 | //链表的实际大小 36 | private int size; 37 | 38 | QuickSortSet(){ 39 | this.size = 0; 40 | } 41 | 42 | /** 43 | * 向链表集合添加词元 44 | * @param lexeme 45 | */ 46 | boolean addLexeme(Lexeme lexeme){ 47 | Cell newCell = new Cell(lexeme); 48 | if(this.size == 0){ 49 | this.head = newCell; 50 | this.tail = newCell; 51 | this.size++; 52 | return true; 53 | 54 | }else{ 55 | if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合 56 | return false; 57 | 58 | }else if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部 59 | this.tail.next = newCell; 60 | newCell.prev = this.tail; 61 | this.tail = newCell; 62 | this.size++; 63 | return true; 64 | 65 | }else if(this.head.compareTo(newCell) > 0){//词元接入链表头部 66 | this.head.prev = newCell; 67 | newCell.next = this.head; 68 | this.head = newCell; 69 | this.size++; 70 | return true; 71 | 72 | }else{ 73 | //从尾部上逆 74 | Cell index = this.tail; 75 | while(index != null && index.compareTo(newCell) > 0){ 76 | index = index.prev; 77 | } 78 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合 79 | return false; 80 | 81 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置 82 | newCell.prev = index; 83 | newCell.next = index.next; 84 | index.next.prev = newCell; 85 | index.next = newCell; 86 | this.size++; 87 | return true; 88 | } 89 | } 90 | } 91 | return false; 92 | } 93 | 94 | /** 95 | * 返回链表头部元素 96 | * @return 97 | */ 98 | Lexeme peekFirst(){ 99 | if(this.head != null){ 100 | return this.head.lexeme; 101 | } 102 | return null; 103 | } 104 | 105 | /** 106 | * 取出链表集合的第一个元素 107 | * @return Lexeme 108 | */ 109 | Lexeme pollFirst(){ 110 | if(this.size == 1){ 111 | Lexeme first = this.head.lexeme; 112 | this.head = null; 113 | this.tail = null; 114 | this.size--; 115 | return first; 116 | }else if(this.size > 1){ 117 | Lexeme first = this.head.lexeme; 118 | this.head = this.head.next; 119 | this.size --; 120 | return first; 121 | }else{ 122 | return null; 123 | } 124 | } 125 | 126 | /** 127 | * 返回链表尾部元素 128 | * @return 129 | */ 130 | Lexeme peekLast(){ 131 | if(this.tail != 
null){ 132 | return this.tail.lexeme; 133 | } 134 | return null; 135 | } 136 | 137 | /** 138 | * 取出链表集合的最后一个元素 139 | * @return Lexeme 140 | */ 141 | Lexeme pollLast(){ 142 | if(this.size == 1){ 143 | Lexeme last = this.head.lexeme; 144 | this.head = null; 145 | this.tail = null; 146 | this.size--; 147 | return last; 148 | 149 | }else if(this.size > 1){ 150 | Lexeme last = this.tail.lexeme; 151 | this.tail = this.tail.prev; 152 | this.size--; 153 | return last; 154 | 155 | }else{ 156 | return null; 157 | } 158 | } 159 | 160 | /** 161 | * 返回集合大小 162 | * @return 163 | */ 164 | int size(){ 165 | return this.size; 166 | } 167 | 168 | /** 169 | * 判断集合是否为空 170 | * @return 171 | */ 172 | boolean isEmpty(){ 173 | return this.size == 0; 174 | } 175 | 176 | /** 177 | * 返回lexeme链的头部 178 | * @return 179 | */ 180 | Cell getHead(){ 181 | return this.head; 182 | } 183 | 184 | /** 185 | * 186 | * IK 中文分词 版本 5.0 187 | * IK Analyzer release 5.0 188 | * 189 | * Licensed to the Apache Software Foundation (ASF) under one or more 190 | * contributor license agreements. See the NOTICE file distributed with 191 | * this work for additional information regarding copyright ownership. 192 | * The ASF licenses this file to You under the Apache License, Version 2.0 193 | * (the "License"); you may not use this file except in compliance with 194 | * the License. You may obtain a copy of the License at 195 | * 196 | * http://www.apache.org/licenses/LICENSE-2.0 197 | * 198 | * Unless required by applicable law or agreed to in writing, software 199 | * distributed under the License is distributed on an "AS IS" BASIS, 200 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | * See the License for the specific language governing permissions and 202 | * limitations under the License. 203 | * 204 | * 源代码由林良益(linliangyi2005@gmail.com)提供 205 | * 版权声明 2012,乌龙茶工作室 206 | * provided by Linliangyi and copyright 2012 by Oolong studio 207 | * 208 | * QuickSortSet集合单元 209 | * 210 | */ 211 | class Cell implements Comparable{ 212 | private Cell prev; 213 | private Cell next; 214 | private Lexeme lexeme; 215 | 216 | Cell(Lexeme lexeme){ 217 | if(lexeme == null){ 218 | throw new IllegalArgumentException("lexeme must not be null"); 219 | } 220 | this.lexeme = lexeme; 221 | } 222 | 223 | public int compareTo(Cell o) { 224 | return this.lexeme.compareTo(o.lexeme); 225 | } 226 | 227 | public Cell getPrev(){ 228 | return this.prev; 229 | } 230 | 231 | public Cell getNext(){ 232 | return this.next; 233 | } 234 | 235 | public Lexeme getLexeme(){ 236 | return this.lexeme; 237 | } 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.HashSet; 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | import java.util.Set; 31 | 32 | import org.wltea.analyzer.dic.Dictionary; 33 | import org.wltea.analyzer.dic.Hit; 34 | 35 | /** 36 | * 37 | * 中文数量词子分词器 38 | */ 39 | class CN_QuantifierSegmenter implements ISegmenter{ 40 | 41 | //子分词器标签 42 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER"; 43 | 44 | //中文数词 45 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum 46 | private static Set ChnNumberChars = new HashSet(); 47 | static{ 48 | char[] ca = Chn_Num.toCharArray(); 49 | for(char nChar : ca){ 50 | ChnNumberChars.add(nChar); 51 | } 52 | } 53 | 54 | /* 55 | * 词元的开始位置, 56 | * 同时作为子分词器状态标识 57 | * 当start > -1 时,标识当前的分词器正在处理字符 58 | */ 59 | private int nStart; 60 | /* 61 | * 记录词元结束位置 62 | * end记录的是在词元中最后一个出现的合理的数词结束 63 | */ 64 | private int nEnd; 65 | 66 | //待处理的量词hit队列 67 | private List countHits; 68 | 69 | 70 | CN_QuantifierSegmenter(){ 71 | nStart = -1; 72 | nEnd = -1; 73 | this.countHits = new LinkedList(); 74 | } 75 | 76 | /** 77 | * 分词 78 | */ 79 | public void analyze(AnalyzeContext context) { 80 | //处理中文数词 81 | this.processCNumber(context); 82 | //处理中文量词 83 | this.processCount(context); 84 | 85 | //判断是否锁定缓冲区 86 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){ 87 | //对缓冲区解锁 88 | context.unlockBuffer(SEGMENTER_NAME); 89 | }else{ 90 | context.lockBuffer(SEGMENTER_NAME); 91 | } 92 | } 93 | 94 | 95 | /** 96 | * 重置子分词器状态 97 | */ 98 | public void reset() { 99 | nStart = -1; 100 | nEnd = -1; 101 | countHits.clear(); 102 | } 103 | 104 | /** 105 | * 处理数词 106 | */ 107 | private void processCNumber(AnalyzeContext context){ 108 | if(nStart == -1 && nEnd == -1){//初始状态 109 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 110 | && ChnNumberChars.contains(context.getCurrentChar())){ 111 | //记录数词的起始、结束位置 112 | nStart = context.getCursor(); 113 | nEnd = context.getCursor(); 114 | } 115 | }else{//正在处理状态 116 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 117 | && ChnNumberChars.contains(context.getCurrentChar())){ 118 | //记录数词的结束位置 119 | nEnd = context.getCursor(); 120 | }else{ 121 | //输出数词 122 | this.outputNumLexeme(context); 123 | //重置头尾指针 124 | nStart = -1; 125 | nEnd = -1; 126 | } 127 | } 128 | 129 | //缓冲区已经用完,还有尚未输出的数词 130 | if(context.isBufferConsumed()){ 131 | if(nStart != -1 && nEnd != -1){ 132 | //输出数词 133 | outputNumLexeme(context); 134 | //重置头尾指针 135 | nStart = -1; 136 | nEnd = -1; 137 | } 138 | } 139 | } 140 | 141 | /** 142 | * 处理中文量词 143 | * @param context 144 | */ 145 | private void processCount(AnalyzeContext context){ 146 | // 判断是否需要启动量词扫描 147 | if(!this.needCountScan(context)){ 148 | return; 149 | } 150 | 151 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){ 152 | 153 | //优先处理countHits中的hit 154 | if(!this.countHits.isEmpty()){ 155 | //处理词段队列 156 | Hit[] tmpArray = 
this.countHits.toArray(new Hit[this.countHits.size()]); 157 | for(Hit hit : tmpArray){ 158 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 159 | if(hit.isMatch()){ 160 | //输出当前的词 161 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT); 162 | context.addLexeme(newLexeme); 163 | 164 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 165 | this.countHits.remove(hit); 166 | } 167 | 168 | }else if(hit.isUnmatch()){ 169 | //hit不是词,移除 170 | this.countHits.remove(hit); 171 | } 172 | } 173 | } 174 | 175 | //********************************* 176 | //对当前指针位置的字符进行单字匹配 177 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); 178 | if(singleCharHit.isMatch()){//首字成量词词 179 | //输出当前的词 180 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT); 181 | context.addLexeme(newLexeme); 182 | 183 | //同时也是词前缀 184 | if(singleCharHit.isPrefix()){ 185 | //前缀匹配则放入hit列表 186 | this.countHits.add(singleCharHit); 187 | } 188 | }else if(singleCharHit.isPrefix()){//首字为量词前缀 189 | //前缀匹配则放入hit列表 190 | this.countHits.add(singleCharHit); 191 | } 192 | 193 | 194 | }else{ 195 | //输入的不是中文字符 196 | //清空未成形的量词 197 | this.countHits.clear(); 198 | } 199 | 200 | //缓冲区数据已经读完,还有尚未输出的量词 201 | if(context.isBufferConsumed()){ 202 | //清空未成形的量词 203 | this.countHits.clear(); 204 | } 205 | } 206 | 207 | /** 208 | * 判断是否需要扫描量词 209 | * @return 210 | */ 211 | private boolean needCountScan(AnalyzeContext context){ 212 | if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){ 213 | //正在处理中文数词,或者正在处理量词 214 | return true; 215 | }else{ 216 | //找到一个相邻的数词 217 | if(!context.getOrgLexemes().isEmpty()){ 218 | Lexeme l = context.getOrgLexemes().peekLast(); 219 | if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){ 220 | if(l.getBegin() + l.getLength() == context.getCursor()){ 221 | return true; 222 | } 223 | } 224 | } 225 | } 226 | return false; 227 | } 228 | 229 | /** 230 | * 添加数词词元到结果集 231 | * @param context 232 | */ 233 | private void outputNumLexeme(AnalyzeContext context){ 234 | if(nStart > -1 && nEnd > -1){ 235 | //输出数词 236 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM); 237 | context.addLexeme(newLexeme); 238 | 239 | } 240 | } 241 | 242 | } 243 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/core/LexemePath.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | 28 | /** 29 | * Lexeme链(路径) 30 | */ 31 | class LexemePath extends QuickSortSet implements Comparable{ 32 | 33 | //起始位置 34 | private int pathBegin; 35 | //结束 36 | private int pathEnd; 37 | //词元链的有效字符长度 38 | private int payloadLength; 39 | 40 | LexemePath(){ 41 | this.pathBegin = -1; 42 | this.pathEnd = -1; 43 | this.payloadLength = 0; 44 | } 45 | 46 | /** 47 | * 向LexemePath追加相交的Lexeme 48 | * @param lexeme 49 | * @return 50 | */ 51 | boolean addCrossLexeme(Lexeme lexeme){ 52 | if(this.isEmpty()){ 53 | this.addLexeme(lexeme); 54 | this.pathBegin = lexeme.getBegin(); 55 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 56 | this.payloadLength += lexeme.getLength(); 57 | return true; 58 | 59 | }else if(this.checkCross(lexeme)){ 60 | this.addLexeme(lexeme); 61 | if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){ 62 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 63 | } 64 | this.payloadLength = this.pathEnd - this.pathBegin; 65 | return true; 66 | 67 | }else{ 68 | return false; 69 | 70 | } 71 | } 72 | 73 | /** 74 | * 向LexemePath追加不相交的Lexeme 75 | * @param lexeme 76 | * @return 77 | */ 78 | boolean addNotCrossLexeme(Lexeme lexeme){ 79 | if(this.isEmpty()){ 80 | this.addLexeme(lexeme); 81 | this.pathBegin = lexeme.getBegin(); 82 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 83 | this.payloadLength += lexeme.getLength(); 84 | return true; 85 | 86 | }else if(this.checkCross(lexeme)){ 87 | return false; 88 | 89 | }else{ 90 | this.addLexeme(lexeme); 91 | this.payloadLength += lexeme.getLength(); 92 | Lexeme head = this.peekFirst(); 93 | this.pathBegin = head.getBegin(); 94 | Lexeme tail = this.peekLast(); 95 | this.pathEnd = tail.getBegin() + tail.getLength(); 96 | return true; 97 | 98 | } 99 | } 100 | 101 | /** 102 | * 移除尾部的Lexeme 103 | * @return 104 | */ 105 | Lexeme removeTail(){ 106 | Lexeme tail = this.pollLast(); 107 | if(this.isEmpty()){ 108 | this.pathBegin = -1; 109 | this.pathEnd = -1; 110 | this.payloadLength = 0; 111 | }else{ 112 | this.payloadLength -= tail.getLength(); 113 | Lexeme newTail = this.peekLast(); 114 | this.pathEnd = newTail.getBegin() + newTail.getLength(); 115 | } 116 | return tail; 117 | } 118 | 119 | /** 120 | * 检测词元位置交叉(有歧义的切分) 121 | * @param lexeme 122 | * @return 123 | */ 124 | boolean checkCross(Lexeme lexeme){ 125 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd) 126 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength()); 127 | } 128 | 129 | int getPathBegin() { 130 | return pathBegin; 131 | } 132 | 133 | int getPathEnd() { 134 | return pathEnd; 135 | } 136 | 137 | /** 138 | * 获取Path的有效词长 139 | * @return 140 | */ 141 | int getPayloadLength(){ 142 | return this.payloadLength; 143 | } 144 | 145 | /** 146 | * 获取LexemePath的路径长度 147 | * @return 148 | */ 149 | int getPathLength(){ 150 | return this.pathEnd - this.pathBegin; 151 | } 152 | 153 | 154 | /** 155 | * X权重(词元长度积) 156 | * @return 157 | */ 158 | int getXWeight(){ 159 | int product = 1; 160 | Cell c = this.getHead(); 161 | while( c != null && c.getLexeme() != null){ 162 | product *= c.getLexeme().getLength(); 163 | c = c.getNext(); 164 | } 165 | return product; 166 | } 167 | 168 | /** 169 | * 
词元位置权重 170 | * @return 171 | */ 172 | int getPWeight(){ 173 | int pWeight = 0; 174 | int p = 0; 175 | Cell c = this.getHead(); 176 | while( c != null && c.getLexeme() != null){ 177 | p++; 178 | pWeight += p * c.getLexeme().getLength() ; 179 | c = c.getNext(); 180 | } 181 | return pWeight; 182 | } 183 | 184 | LexemePath copy(){ 185 | LexemePath theCopy = new LexemePath(); 186 | theCopy.pathBegin = this.pathBegin; 187 | theCopy.pathEnd = this.pathEnd; 188 | theCopy.payloadLength = this.payloadLength; 189 | Cell c = this.getHead(); 190 | while( c != null && c.getLexeme() != null){ 191 | theCopy.addLexeme(c.getLexeme()); 192 | c = c.getNext(); 193 | } 194 | return theCopy; 195 | } 196 | 197 | public int compareTo(LexemePath o) { 198 | //比较有效文本长度 199 | if(this.payloadLength > o.payloadLength){ 200 | return -1; 201 | }else if(this.payloadLength < o.payloadLength){ 202 | return 1; 203 | }else{ 204 | //比较词元个数,越少越好 205 | if(this.size() < o.size()){ 206 | return -1; 207 | }else if (this.size() > o.size()){ 208 | return 1; 209 | }else{ 210 | //路径跨度越大越好 211 | if(this.getPathLength() > o.getPathLength()){ 212 | return -1; 213 | }else if(this.getPathLength() < o.getPathLength()){ 214 | return 1; 215 | }else { 216 | //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先 217 | if(this.pathEnd > o.pathEnd){ 218 | return -1; 219 | }else if(pathEnd < o.pathEnd){ 220 | return 1; 221 | }else{ 222 | //词长越平均越好 223 | if(this.getXWeight() > o.getXWeight()){ 224 | return -1; 225 | }else if(this.getXWeight() < o.getXWeight()){ 226 | return 1; 227 | }else { 228 | //词元位置权重比较 229 | if(this.getPWeight() > o.getPWeight()){ 230 | return -1; 231 | }else if(this.getPWeight() < o.getPWeight()){ 232 | return 1; 233 | } 234 | 235 | } 236 | } 237 | } 238 | } 239 | } 240 | return 0; 241 | } 242 | 243 | public String toString(){ 244 | StringBuffer sb = new StringBuffer(); 245 | sb.append("pathBegin : ").append(pathBegin).append("\r\n"); 246 | sb.append("pathEnd : ").append(pathEnd).append("\r\n"); 247 | sb.append("payloadLength : ").append(payloadLength).append("\r\n"); 248 | Cell head = this.getHead(); 249 | while(head != null){ 250 | sb.append("lexeme : ").append(head.getLexeme()).append("\r\n"); 251 | head = head.getNext(); 252 | } 253 | return sb.toString(); 254 | } 255 | 256 | } 257 | -------------------------------------------------------------------------------- /chatbotv1/src/main/java/org/wltea/analyzer/core/Lexeme.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | /** 28 | * IK词元对象 29 | */ 30 | public class Lexeme implements Comparable{ 31 | //lexemeType常量 32 | //未知 33 | public static final int TYPE_UNKNOWN = 0; 34 | //英文 35 | public static final int TYPE_ENGLISH = 1; 36 | //数字 37 | public static final int TYPE_ARABIC = 2; 38 | //英文数字混合 39 | public static final int TYPE_LETTER = 3; 40 | //中文词元 41 | public static final int TYPE_CNWORD = 4; 42 | //中文单字 43 | public static final int TYPE_CNCHAR = 64; 44 | //日韩文字 45 | public static final int TYPE_OTHER_CJK = 8; 46 | //中文数词 47 | public static final int TYPE_CNUM = 16; 48 | //中文量词 49 | public static final int TYPE_COUNT = 32; 50 | //中文数量词 51 | public static final int TYPE_CQUAN = 48; 52 | 53 | //词元的起始位移 54 | private int offset; 55 | //词元的相对起始位置 56 | private int begin; 57 | //词元的长度 58 | private int length; 59 | //词元文本 60 | private String lexemeText; 61 | //词元类型 62 | private int lexemeType; 63 | 64 | 65 | public Lexeme(int offset , int begin , int length , int lexemeType){ 66 | this.offset = offset; 67 | this.begin = begin; 68 | if(length < 0){ 69 | throw new IllegalArgumentException("length < 0"); 70 | } 71 | this.length = length; 72 | this.lexemeType = lexemeType; 73 | } 74 | 75 | /* 76 | * 判断词元相等算法 77 | * 起始位置偏移、起始位置、终止位置相同 78 | * @see java.lang.Object#equals(Object o) 79 | */ 80 | public boolean equals(Object o){ 81 | if(o == null){ 82 | return false; 83 | } 84 | 85 | if(this == o){ 86 | return true; 87 | } 88 | 89 | if(o instanceof Lexeme){ 90 | Lexeme other = (Lexeme)o; 91 | if(this.offset == other.getOffset() 92 | && this.begin == other.getBegin() 93 | && this.length == other.getLength()){ 94 | return true; 95 | }else{ 96 | return false; 97 | } 98 | }else{ 99 | return false; 100 | } 101 | } 102 | 103 | /* 104 | * 词元哈希编码算法 105 | * @see java.lang.Object#hashCode() 106 | */ 107 | public int hashCode(){ 108 | int absBegin = getBeginPosition(); 109 | int absEnd = getEndPosition(); 110 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11; 111 | } 112 | 113 | /* 114 | * 词元在排序集合中的比较算法 115 | * @see java.lang.Comparable#compareTo(java.lang.Object) 116 | */ 117 | public int compareTo(Lexeme other) { 118 | //起始位置优先 119 | if(this.begin < other.getBegin()){ 120 | return -1; 121 | }else if(this.begin == other.getBegin()){ 122 | //词元长度优先 123 | if(this.length > other.getLength()){ 124 | return -1; 125 | }else if(this.length == other.getLength()){ 126 | return 0; 127 | }else {//this.length < other.getLength() 128 | return 1; 129 | } 130 | 131 | }else{//this.begin > other.getBegin() 132 | return 1; 133 | } 134 | } 135 | 136 | public int getOffset() { 137 | return offset; 138 | } 139 | 140 | public void setOffset(int offset) { 141 | this.offset = offset; 142 | } 143 | 144 | public int getBegin() { 145 | return begin; 146 | } 147 | /** 148 | * 获取词元在文本中的起始位置 149 | * @return int 150 | */ 151 | public int getBeginPosition(){ 152 | return offset + begin; 153 | } 154 | 155 | public void setBegin(int begin) { 156 | this.begin = begin; 157 | } 158 | 159 | /** 160 | * 获取词元在文本中的结束位置 161 | * @return int 162 | */ 163 | public int getEndPosition(){ 164 | return offset + begin + length; 165 | } 166 | 167 | /** 168 | * 获取词元的字符长度 169 | * @return int 170 | */ 171 | public int getLength(){ 172 | return this.length; 173 | } 174 | 175 | public void setLength(int length) { 176 | if(this.length < 0){ 177 | throw 
new IllegalArgumentException("length < 0"); 178 | } 179 | this.length = length; 180 | } 181 | 182 | /** 183 | * 获取词元的文本内容 184 | * @return String 185 | */ 186 | public String getLexemeText() { 187 | if(lexemeText == null){ 188 | return ""; 189 | } 190 | return lexemeText; 191 | } 192 | 193 | public void setLexemeText(String lexemeText) { 194 | if(lexemeText == null){ 195 | this.lexemeText = ""; 196 | this.length = 0; 197 | }else{ 198 | this.lexemeText = lexemeText; 199 | this.length = lexemeText.length(); 200 | } 201 | } 202 | 203 | /** 204 | * 获取词元类型 205 | * @return int 206 | */ 207 | public int getLexemeType() { 208 | return lexemeType; 209 | } 210 | 211 | /** 212 | * 获取词元类型标示字符串 213 | * @return String 214 | */ 215 | public String getLexemeTypeString(){ 216 | switch(lexemeType) { 217 | 218 | case TYPE_ENGLISH : 219 | return "ENGLISH"; 220 | 221 | case TYPE_ARABIC : 222 | return "ARABIC"; 223 | 224 | case TYPE_LETTER : 225 | return "LETTER"; 226 | 227 | case TYPE_CNWORD : 228 | return "CN_WORD"; 229 | 230 | case TYPE_CNCHAR : 231 | return "CN_CHAR"; 232 | 233 | case TYPE_OTHER_CJK : 234 | return "OTHER_CJK"; 235 | 236 | case TYPE_COUNT : 237 | return "COUNT"; 238 | 239 | case TYPE_CNUM : 240 | return "TYPE_CNUM"; 241 | 242 | case TYPE_CQUAN: 243 | return "TYPE_CQUAN"; 244 | 245 | default : 246 | return "UNKONW"; 247 | } 248 | } 249 | 250 | 251 | public void setLexemeType(int lexemeType) { 252 | this.lexemeType = lexemeType; 253 | } 254 | 255 | /** 256 | * 合并两个相邻的词元 257 | * @param l 258 | * @param lexemeType 259 | * @return boolean 词元是否成功合并 260 | */ 261 | public boolean append(Lexeme l , int lexemeType){ 262 | if(l != null && this.getEndPosition() == l.getBeginPosition()){ 263 | this.length += l.getLength(); 264 | this.lexemeType = lexemeType; 265 | return true; 266 | }else { 267 | return false; 268 | } 269 | } 270 | 271 | 272 | /** 273 | * 274 | */ 275 | public String toString(){ 276 | StringBuffer strbuf = new StringBuffer(); 277 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition()); 278 | strbuf.append(" : ").append(this.lexemeText).append(" : \t"); 279 | strbuf.append(this.getLexemeTypeString()); 280 | return strbuf.toString(); 281 | } 282 | 283 | 284 | } 285 | -------------------------------------------------------------------------------- /chatbotv2/my_seq2seq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import math 5 | import tflearn 6 | import tensorflow as tf 7 | from tensorflow.python.ops import rnn_cell 8 | from tensorflow.python.ops import rnn 9 | import chardet 10 | import numpy as np 11 | import struct 12 | 13 | seq = [] 14 | 15 | max_w = 50 16 | float_size = 4 17 | word_vector_dict = {} 18 | word_vec_dim = 200 19 | max_seq_len = 16 20 | 21 | def load_vectors(input): 22 | """从vectors.bin加载词向量,返回一个word_vector_dict的词典,key是词,value是200维的向量 23 | """ 24 | print "begin load vectors" 25 | 26 | input_file = open(input, "rb") 27 | 28 | # 获取词表数目及向量维度 29 | words_and_size = input_file.readline() 30 | words_and_size = words_and_size.strip() 31 | words = long(words_and_size.split(' ')[0]) 32 | size = long(words_and_size.split(' ')[1]) 33 | print "words =", words 34 | print "size =", size 35 | 36 | for b in range(0, words): 37 | a = 0 38 | word = '' 39 | # 读取一个词 40 | while True: 41 | c = input_file.read(1) 42 | word = word + c 43 | if False == c or c == ' ': 44 | break 45 | if a < max_w and c != '\n': 46 | a = a + 1 47 | word = word.strip() 48 | 49 | vector = [] 50 | for 
index in range(0, size): 51 | m = input_file.read(float_size) 52 | (weight,) = struct.unpack('f', m) 53 | vector.append(float(weight)) 54 | 55 | # 将词及其对应的向量存到dict中 56 | #word_vector_dict[word.decode('utf-8')] = vector 57 | word_vector_dict[word.decode('utf-8')] = vector[0:word_vec_dim] 58 | 59 | input_file.close() 60 | 61 | print "load vectors finish" 62 | 63 | def init_seq(): 64 | """读取切好词的文本文件,加载全部词序列 65 | """ 66 | file_object = open('zhenhuanzhuan.segment', 'r') 67 | vocab_dict = {} 68 | while True: 69 | line = file_object.readline() 70 | if line: 71 | for word in line.decode('utf-8').split(' '): 72 | if word_vector_dict.has_key(word): 73 | seq.append(word_vector_dict[word]) 74 | else: 75 | break 76 | file_object.close() 77 | 78 | def vector_sqrtlen(vector): 79 | len = 0 80 | for item in vector: 81 | len += item * item 82 | len = math.sqrt(len) 83 | return len 84 | 85 | def vector_cosine(v1, v2): 86 | if len(v1) != len(v2): 87 | sys.exit(1) 88 | sqrtlen1 = vector_sqrtlen(v1) 89 | sqrtlen2 = vector_sqrtlen(v2) 90 | value = 0 91 | for item1, item2 in zip(v1, v2): 92 | value += item1 * item2 93 | return value / (sqrtlen1*sqrtlen2) 94 | 95 | 96 | def vector2word(vector): 97 | max_cos = -10000 98 | match_word = '' 99 | for word in word_vector_dict: 100 | v = word_vector_dict[word] 101 | cosine = vector_cosine(vector, v) 102 | if cosine > max_cos: 103 | max_cos = cosine 104 | match_word = word 105 | return (match_word, max_cos) 106 | 107 | 108 | class MySeq2Seq(object): 109 | """ 110 | 思路:输入输出序列一起作为input,然后通过slick和unpack切分 111 | 完全按照论文说的编码器解码器来做 112 | 输出的时候把解码器的输出按照词向量的200维展平,这样输出就是(?,seqlen*200) 113 | 这样就可以通过regression来做回归计算了,输入的y也展平,保持一致 114 | """ 115 | def __init__(self, max_seq_len = 16, word_vec_dim = 200): 116 | self.max_seq_len = max_seq_len 117 | self.word_vec_dim = word_vec_dim 118 | 119 | def generate_trainig_data(self): 120 | load_vectors("./vectors.bin") 121 | init_seq() 122 | xy_data = [] 123 | y_data = [] 124 | for i in range(30,40,10): 125 | # 问句、答句都是16字,所以取32个 126 | start = i*self.max_seq_len*2 127 | middle = i*self.max_seq_len*2 + self.max_seq_len 128 | end = (i+1)*self.max_seq_len*2 129 | sequence_xy = seq[start:end] 130 | sequence_y = seq[middle:end] 131 | print "right answer" 132 | for w in sequence_y: 133 | (match_word, max_cos) = vector2word(w) 134 | print match_word 135 | sequence_y = [np.ones(self.word_vec_dim)] + sequence_y 136 | xy_data.append(sequence_xy) 137 | y_data.append(sequence_y) 138 | 139 | return np.array(xy_data), np.array(y_data) 140 | 141 | 142 | def model(self, feed_previous=False): 143 | # 通过输入的XY生成encoder_inputs和带GO头的decoder_inputs 144 | input_data = tflearn.input_data(shape=[None, self.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name = "XY") 145 | encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in") 146 | decoder_inputs_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len-1, self.word_vec_dim], name="dec_in_tmp") 147 | go_inputs = tf.ones_like(decoder_inputs_tmp) 148 | go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim]) 149 | decoder_inputs = tf.concat(1, [go_inputs, decoder_inputs_tmp], name="dec_in") 150 | 151 | # 编码器 152 | # 把encoder_inputs交给编码器,返回一个输出(预测序列的第一个值)和一个状态(传给解码器) 153 | (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, self.word_vec_dim, return_state=True, scope='encoder_lstm') 154 | encoder_output_sequence = tf.pack([encoder_output_tensor], axis=1) 155 | 156 | # 解码器 157 | # 预测过程用前一个时间序的输出作为下一个时间序的输入 158 | # 
先用编码器的最后一个输出作为第一个输入 159 | if feed_previous: 160 | first_dec_input = go_inputs 161 | else: 162 | first_dec_input = tf.slice(decoder_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim]) 163 | decoder_output_tensor = tflearn.lstm(first_dec_input, self.word_vec_dim, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm') 164 | decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1) 165 | decoder_output_sequence_list = [decoder_output_tensor] 166 | # 再用解码器的输出作为下一个时序的输入 167 | for i in range(self.max_seq_len-1): 168 | if feed_previous: 169 | next_dec_input = decoder_output_sequence_single 170 | else: 171 | next_dec_input = tf.slice(decoder_inputs, [0, i+1, 0], [-1, 1, self.word_vec_dim]) 172 | decoder_output_tensor = tflearn.lstm(next_dec_input, self.word_vec_dim, return_seq=False, reuse=True, scope='decoder_lstm') 173 | decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1) 174 | decoder_output_sequence_list.append(decoder_output_tensor) 175 | 176 | decoder_output_sequence = tf.pack(decoder_output_sequence_list, axis=1) 177 | real_output_sequence = tf.concat(1, [encoder_output_sequence, decoder_output_sequence]) 178 | 179 | net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square') 180 | model = tflearn.DNN(net) 181 | return model 182 | 183 | def train(self): 184 | trainXY, trainY = self.generate_trainig_data() 185 | model = self.model(feed_previous=False) 186 | model.fit(trainXY, trainY, n_epoch=1000, snapshot_epoch=False) 187 | model.save('./model/model') 188 | return model 189 | 190 | def load(self): 191 | model = self.model(feed_previous=True) 192 | model.load('./model/model') 193 | return model 194 | 195 | if __name__ == '__main__': 196 | phrase = sys.argv[1] 197 | my_seq2seq = MySeq2Seq(word_vec_dim=word_vec_dim, max_seq_len=max_seq_len) 198 | if phrase == 'train': 199 | my_seq2seq.train() 200 | else: 201 | model = my_seq2seq.load() 202 | trainXY, trainY = my_seq2seq.generate_trainig_data() 203 | predict = model.predict(trainXY) 204 | for sample in predict: 205 | print "predict answer" 206 | for w in sample[1:]: 207 | (match_word, max_cos) = vector2word(w) 208 | print match_word, max_cos 209 | -------------------------------------------------------------------------------- /subtitle/subtitle_crawler/spiders/subtitle_spider.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding( "utf-8" ) 6 | 7 | import scrapy 8 | from w3lib.html import remove_tags 9 | from subtitle_crawler.items import SubtitleCrawlerItem 10 | 11 | class SubTitleSpider(scrapy.Spider): 12 | name = "subtitle" 13 | allowed_domains = ["zimuku.net"] 14 | start_urls = [ 15 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=900", 16 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=901", 17 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=902", 18 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=903", 19 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=904", 20 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=905", 21 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=906", 22 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=907", 23 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=908", 24 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=909", 25 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=910", 26 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=911", 27 | 
"http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=912", 28 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=913", 29 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=914", 30 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=915", 31 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=916", 32 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=917", 33 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=918", 34 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=919", 35 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=920", 36 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=921", 37 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=922", 38 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=923", 39 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=924", 40 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=925", 41 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=926", 42 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=927", 43 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=928", 44 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=929", 45 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=930", 46 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=931", 47 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=932", 48 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=933", 49 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=934", 50 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=935", 51 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=936", 52 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=937", 53 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=938", 54 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=939", 55 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=940", 56 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=941", 57 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=942", 58 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=943", 59 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=944", 60 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=945", 61 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=946", 62 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=947", 63 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=948", 64 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=949", 65 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=950", 66 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=951", 67 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=952", 68 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=953", 69 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=954", 70 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=955", 71 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=956", 72 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=957", 73 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=958", 74 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=959", 75 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=960", 76 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=961", 77 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=962", 78 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=963", 79 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=964", 80 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=965", 81 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=966", 82 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=967", 83 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=968", 84 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=969", 85 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=970", 86 | 
"http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=971", 87 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=972", 88 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=973", 89 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=974", 90 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=975", 91 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=976", 92 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=977", 93 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=978", 94 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=979", 95 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=980", 96 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=981", 97 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=982", 98 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=983", 99 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=984", 100 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=985", 101 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=986", 102 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=987", 103 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=988", 104 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=989", 105 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=990", 106 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=991", 107 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=992", 108 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=993", 109 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=994", 110 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=995", 111 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=996", 112 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=997", 113 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=998", 114 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=999", 115 | ] 116 | 117 | def parse(self, response): 118 | hrefs = response.selector.xpath('//div[contains(@class, "persub")]/h1/a/@href').extract() 119 | for href in hrefs: 120 | url = response.urljoin(href) 121 | request = scrapy.Request(url, callback=self.parse_detail) 122 | yield request 123 | 124 | def parse_detail(self, response): 125 | url = response.selector.xpath('//li[contains(@class, "dlsub")]/div/a/@href').extract()[0] 126 | print "processing: ", url 127 | request = scrapy.Request(url, callback=self.parse_file) 128 | yield request 129 | 130 | def parse_file(self, response): 131 | body = response.body 132 | item = SubtitleCrawlerItem() 133 | item['url'] = response.url 134 | item['body'] = body 135 | return item 136 | --------------------------------------------------------------------------------