├── baidu_search
│   ├── baidu_search
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── baidu_search.py
│   │   ├── pipelines.py
│   │   ├── items.py
│   │   └── settings.py
│   └── scrapy.cfg
├── subtitle
│   ├── subtitle_crawler
│   │   ├── __init__.py
│   │   ├── items.pyc
│   │   ├── __init__.pyc
│   │   ├── settings.pyc
│   │   ├── pipelines.pyc
│   │   ├── spiders
│   │   │   ├── __init__.pyc
│   │   │   ├── subtitle_spider.pyc
│   │   │   ├── __init__.py
│   │   │   └── subtitle_spider.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   └── settings.py
│   ├── preprocess
│   │   ├── conv_big5.sh
│   │   ├── unzip.sh
│   │   ├── conv.sh
│   │   ├── clear_empty_dir.py
│   │   ├── get_file_charset.py
│   │   ├── change_name.py
│   │   ├── del_file.py
│   │   ├── conv2simple.py
│   │   ├── mv_ass.py
│   │   ├── mv_lrc.py
│   │   ├── mv_rar.py
│   │   ├── mv_smi.py
│   │   ├── mv_srt.py
│   │   ├── mv_ssa.py
│   │   ├── mv_str.py
│   │   ├── mv_sup.py
│   │   ├── mv_vtt.py
│   │   ├── mv_zip.py
│   │   ├── get_charset.py
│   │   ├── get_charset_and_conv.py
│   │   ├── extract_sentence_srt.py
│   │   ├── extract_sentence_ass.py
│   │   ├── extract_sentence_ssa.py
│   │   └── filter.py
│   └── scrapy.cfg
├── chatbotv1
│   ├── src
│   │   ├── main
│   │   │   ├── resources
│   │   │   │   ├── ext.dic
│   │   │   │   ├── IKAnalyzer.cfg.xml
│   │   │   │   ├── stopword.dic
│   │   │   │   └── quantifier.dic
│   │   │   ├── .DS_Store
│   │   │   └── java
│   │   │       ├── com
│   │   │       │   └── shareditor
│   │   │       │       └── chatbotv1
│   │   │       │           ├── HttpServerInboundHandler.java
│   │   │       │           ├── Searcher.java
│   │   │       │           ├── NettyHttpServletResponse.java
│   │   │       │           └── Indexer.java
│   │   │       └── org
│   │   │           └── wltea
│   │   │               └── analyzer
│   │   │                   ├── core
│   │   │                   │   ├── ISegmenter.java
│   │   │                   │   ├── CharacterUtil.java
│   │   │                   │   ├── CJKSegmenter.java
│   │   │                   │   ├── IKArbitrator.java
│   │   │                   │   ├── IKSegmenter.java
│   │   │                   │   ├── QuickSortSet.java
│   │   │                   │   ├── CN_QuantifierSegmenter.java
│   │   │                   │   ├── LexemePath.java
│   │   │                   │   └── Lexeme.java
│   │   │                   ├── cfg
│   │   │                   │   ├── Configuration.java
│   │   │                   │   └── DefaultConfig.java
│   │   │                   ├── lucene
│   │   │                   │   ├── IKAnalyzer.java
│   │   │                   │   └── IKTokenizer.java
│   │   │                   ├── dic
│   │   │                   │   ├── quantifier.dic
│   │   │                   │   └── Hit.java
│   │   │                   ├── sample
│   │   │                   │   ├── IKAnalzyerDemo.java
│   │   │                   │   └── LuceneIndexAndSearchDemo.java
│   │   │                   └── query
│   │   │                       └── SWMCQueryBuilder.java
│   │   └── test
│   │       └── java
│   │           └── com
│   │               └── shareditor
│   │                   └── chatbotv1
│   │                       └── AppTest.java
│   └── pom.xml
├── lstm_code
│   ├── tensorflow
│   │   └── test.py
│   ├── nicodjimenez
│   │   ├── test.py
│   │   ├── README.md
│   │   └── test2.py
│   └── iamtrask
│       └── lstm.py
├── learning_tensorflow
│   ├── tmp
│   │   └── events.out.tfevents.1481183189.localhost
│   ├── 1.py
│   ├── 3.py
│   └── 2.py
├── word2vec
│   ├── demo-word.sh
│   ├── demo-classes.sh
│   ├── demo-word-accuracy.sh
│   ├── demo-analogy.sh
│   ├── makefile
│   ├── demo-phrases.sh
│   ├── demo-phrase-accuracy.sh
│   ├── README.txt
│   ├── distance.c
│   ├── word-analogy.c
│   ├── demo-train-big-model-v1.sh
│   └── compute-accuracy.c
├── gensim_word2vec.py
├── chatbotv2
│   ├── readme.txt
│   └── my_seq2seq.py
├── word_segment.py
├── digital_recognition.py
├── word_vectors_loader.py
├── seq2seq
│   ├── tflearn_prj
│   │   ├── my_tflearn_demo.py
│   │   ├── 07_lstm.py
│   │   └── my_lstm_test.py
│   └── hello_sequence.py
├── digital_recognition_cnn.py
├── read_images.c
├── pattern_recognition.lua
└── README.md
/baidu_search/baidu_search/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/resources/ext.dic:
--------------------------------------------------------------------------------
1 | 诛仙
2 | 诛仙2
3 | 梦幻诛仙
4 | 梦幻诛仙2
--------------------------------------------------------------------------------
/chatbotv1/src/main/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/chatbotv1/src/main/.DS_Store
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/items.pyc
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/__init__.pyc
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/settings.pyc
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/pipelines.pyc
--------------------------------------------------------------------------------
/lstm_code/tensorflow/test.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | def main(_):
4 | pass
5 |
6 | if __name__ == "__main__":
7 | tf.app.run()
8 |
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/spiders/__init__.pyc
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/spiders/subtitle_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/subtitle/subtitle_crawler/spiders/subtitle_spider.pyc
--------------------------------------------------------------------------------
/learning_tensorflow/tmp/events.out.tfevents.1481183189.localhost:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0oVicero0/ChatBotCourse/master/learning_tensorflow/tmp/events.out.tfevents.1481183189.localhost
--------------------------------------------------------------------------------
/baidu_search/baidu_search/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/subtitle/preprocess/conv_big5.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while read line
4 | do
5 | file=`echo $line|awk '{print $1}'`; echo $file;
6 | iconv -f big5 -t utf-8 $file > ${file}.2
7 | if [ $? -eq 0 ];then
8 | mv ${file}.2 ${file}
9 | else
10 | rm ${file}.2
11 | fi
12 | done
13 |
--------------------------------------------------------------------------------
/word2vec/demo-word.sh:
--------------------------------------------------------------------------------
1 | make
2 | if [ ! -e text8 ]; then
3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz
4 | gzip -d text8.gz -f
5 | fi
6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
7 | ./distance vectors.bin
8 |
--------------------------------------------------------------------------------
/subtitle/preprocess/unzip.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | i=0; for file in `ls`; do mkdir output/${i}; echo "unzip $file -d output/${i}";unzip -P abc $file -d output/${i} > /dev/null; ((i++)); done
4 | i=0; for file in `ls`; do mkdir output/${i}; echo "${i} unrar x $file output/${i}";unrar x $file output/${i} > /dev/null; ((i++)); done
5 |
--------------------------------------------------------------------------------
/baidu_search/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = baidu_search.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = baidu_search
12 |
--------------------------------------------------------------------------------
/subtitle/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = subtitle_crawler.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = subtitle_crawler
12 |
--------------------------------------------------------------------------------
/subtitle/preprocess/conv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while read line
4 | do
5 | file=`echo $line|awk '{print $1}'`
6 | iconv -f gb2312 -t utf-8 $file > ${file}.2 2>/dev/null
7 | if [ $? -eq 0 ];then
8 | mv ${file}.2 ${file}
9 | echo "mv ${file}.2 ${file}"
10 | else
11 | rm ${file}.2
12 | fi
13 | done
14 |
--------------------------------------------------------------------------------
/learning_tensorflow/1.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | sess = tf.Session()
4 |
5 | a = tf.placeholder("float")
6 | b = tf.placeholder("float")
7 | c = tf.constant(6.0)
8 | d = tf.mul(a, b)
9 | y = tf.mul(d, c)
10 | print sess.run(y, feed_dict={a: 3, b: 3})
11 |
12 | A = [[1.1,2.3],[3.4,4.1]]
13 | Y = tf.matrix_inverse(A)
14 | print sess.run(Y)
15 | sess.close()
16 |
--------------------------------------------------------------------------------
/subtitle/preprocess/clear_empty_dir.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | if 0 == len(files) and len(dirs) == 0:
11 | print root
12 | os.rmdir(root)
13 |
14 | iterfindfiles(r"./input/", "*.srt")
15 |
--------------------------------------------------------------------------------
/baidu_search/baidu_search/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BaiduSearchPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/baidu_search/baidu_search/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BaiduSearchItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class SubtitleCrawlerItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | url = scrapy.Field()
14 | body = scrapy.Field()
15 |
--------------------------------------------------------------------------------
/word2vec/demo-classes.sh:
--------------------------------------------------------------------------------
1 | make
2 | if [ ! -e text8 ]; then
3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz
4 | gzip -d text8.gz -f
5 | fi
6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
7 | sort classes.txt -k 2 -n > classes.sorted.txt
8 | echo The word classes were saved to file classes.sorted.txt
9 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/resources/IKAnalyzer.cfg.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
3 | <properties>
4 |     <comment>IK Analyzer 扩展配置</comment>
5 |     <!-- configure your own extension dictionary here -->
6 |     <entry key="ext_dict">ext.dic;</entry>
7 |
8 |     <!-- configure your own extension stopword dictionary here -->
9 |     <entry key="ext_stopwords">stopword.dic;</entry>
10 |
11 | </properties>
--------------------------------------------------------------------------------
/subtitle/preprocess/get_file_charset.py:
--------------------------------------------------------------------------------
1 | import chardet
2 | import sys
3 | import os
4 |
5 | if __name__ == '__main__':
6 | if len(sys.argv) == 2:
7 | file_path = sys.argv[1]
8 | f = open(file_path,'r')
9 | data = f.read()
10 | encoding = chardet.detect(data)["encoding"]
11 | if encoding not in ("UTF-8-SIG", "UTF-16LE", "utf-8"):
12 | print file_path, encoding
13 | f.close()
14 |
--------------------------------------------------------------------------------
/word2vec/demo-word-accuracy.sh:
--------------------------------------------------------------------------------
1 | make
2 | if [ ! -e text8 ]; then
3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz
4 | gzip -d text8.gz -f
5 | fi
6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt
8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
9 |
--------------------------------------------------------------------------------
/subtitle/preprocess/change_name.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./", "*"):
16 | i=i+1
17 | newfilename = str(i) + ".vtt"
18 | #print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 |
--------------------------------------------------------------------------------
/subtitle/preprocess/del_file.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 |
15 |
16 | for suffix in ("*.mp4", "*.txt", "*.JPG", "*.htm", "*.doc", "*.docx", "*.nfo", "*.sub", "*.idx"):
17 | for filename in iterfindfiles(r"./input/", suffix):
18 | print filename
19 | os.remove(filename)
20 |
--------------------------------------------------------------------------------
/subtitle/preprocess/conv2simple.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from langconv import *
3 | import sys
4 |
5 | def tradition2simple(line):
6 | line = Converter('zh-hans').convert(line.decode('utf-8'))
7 | line = line.encode('utf-8')
8 | return line
9 |
10 | if __name__ == '__main__':
11 | if len(sys.argv) == 2:
12 | f = open(sys.argv[1], "r")
13 | while True:
14 | line = f.readline()
15 | if line:
16 | print tradition2simple(line).strip()
17 | else:
18 | break
19 | f.close()
20 |
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class SubtitleCrawlerPipeline(object):
10 | def process_item(self, item, spider):
11 | url = item['url']
12 | file_name = url.replace('/','_').replace(':','_')
13 | fp = open('result/'+file_name, 'w')
14 | fp.write(item['body'])
15 | fp.close()
16 | return item
17 |
18 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/resources/stopword.dic:
--------------------------------------------------------------------------------
1 | a
2 | an
3 | and
4 | are
5 | as
6 | at
7 | be
8 | but
9 | by
10 | for
11 | if
12 | in
13 | into
14 | is
15 | it
16 | no
17 | not
18 | of
19 | on
20 | or
21 | such
22 | that
23 | the
24 | their
25 | then
26 | there
27 | these
28 | they
29 | this
30 | to
31 | was
32 | will
33 | with
34 | 也
35 | 了
36 | 仍
37 | 从
38 | 以
39 | 使
40 | 则
41 | 却
42 | 又
43 | 及
44 | 对
45 | 就
46 | 并
47 | 很
48 | 或
49 | 把
50 | 是
51 | 的
52 | 着
53 | 给
54 | 而
55 | 被
56 | 让
57 | 在
58 | 还
59 | 比
60 | 等
61 | 当
62 | 与
63 | 于
64 | 但
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_ass.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.ass"):
16 | i=i+1
17 | newfilename = "ass/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_lrc.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.LRC"):
16 | i=i+1
17 | newfilename = "lrc/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_rar.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.RAR"):
16 | i=i+1
17 | newfilename = "rar/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_smi.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.SMI"):
16 | i=i+1
17 | newfilename = "smi/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_srt.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.SRT"):
16 | i=i+1
17 | newfilename = "srt/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_ssa.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.ssa"):
16 | i=i+1
17 | newfilename = "ssa/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_str.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.str"):
16 | i=i+1
17 | newfilename = "str/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_sup.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.sup"):
16 | i=i+1
17 | newfilename = "sup/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_vtt.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.vtt"):
16 | i=i+1
17 | newfilename = "vtt/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/subtitle/preprocess/mv_zip.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import fnmatch
4 | import shutil
5 | import sys
6 |
7 |
8 | def iterfindfiles(path, fnexp):
9 | for root, dirs, files in os.walk(path):
10 | for filename in fnmatch.filter(files, fnexp):
11 | yield os.path.join(root, filename)
12 |
13 |
14 | i=0
15 | for filename in iterfindfiles(r"./input/", "*.ZIP"):
16 | i=i+1
17 | newfilename = "zip/" + str(i) + "_" + os.path.basename(filename)
18 | print filename + " <===> " + newfilename
19 | shutil.move(filename, newfilename)
20 | #sys.exit(-1)
21 |
--------------------------------------------------------------------------------
/gensim_word2vec.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | from gensim.models import word2vec
4 | import logging
5 |
6 | #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
7 | #sentences = word2vec.LineSentence('segment_result_lined')
8 | #model = word2vec.Word2Vec(sentences, size=200, workers=4, iter=20)
9 | #model.save("word_vec_model/model")
10 | model_2 = word2vec.Word2Vec.load("word_vec_model/model")
11 | y = model_2.most_similar(u"学习", topn=10)
12 | for (word, score) in y:
13 | print word
14 | print score
15 | #print model_2.syn0norm[model_2.vocab[u"小兔"].index]
16 |
--------------------------------------------------------------------------------
/subtitle/preprocess/get_charset.py:
--------------------------------------------------------------------------------
1 | import chardet
2 | import sys
3 | import os
4 |
5 | if __name__ == '__main__':
6 | for dir in ("srt", "ass", "lrc", "ssa", "str", "vtt"):
7 | for root, dirs, files in os.walk(dir):
8 | for file in files:
9 | file_path = root + "/" + file
10 | f = open(file_path,'r')
11 | data = f.read()
12 | encoding = chardet.detect(data)["encoding"]
13 | if encoding not in ("UTF-8-SIG", "UTF-16LE", "utf-8"):
14 | print file_path, encoding
15 | f.close()
16 |
--------------------------------------------------------------------------------
/word2vec/demo-analogy.sh:
--------------------------------------------------------------------------------
1 | make
2 | if [ ! -e text8 ]; then
3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz
4 | gzip -d text8.gz -f
5 | fi
6 | echo ---------------------------------------------------------------------------------------------------
7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set
8 | echo Example input: paris france berlin
9 | echo ---------------------------------------------------------------------------------------------------
10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
11 | ./word-analogy vectors.bin
12 |
--------------------------------------------------------------------------------
/chatbotv2/readme.txt:
--------------------------------------------------------------------------------
1 | python ../word_segment.py zhenhuanzhuan.txt zhenhuanzhuan.segment
2 | ../word2vec/word2vec -train ./zhenhuanzhuan.segment -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
3 |
4 |
5 |
6 | head -10000 ../subtitle/raw_subtitles/subtitle.corpus > subtitle.corpus.10000
7 | python ../word_segment.py subtitle.corpus.10000 subtitle.corpus.10000.segment
8 | ../word2vec/word2vec -train ./subtitle.corpus.10000.segment -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
9 | cat subtitle.corpus.10000.segment | awk '{if(last!="")print last"|"$0;last=$0}' | sed 's/| /|/g' > subtitle.corpus.10000.segment.pair
10 |
--------------------------------------------------------------------------------
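The awk one-liner above turns the segmented subtitle corpus into training pairs by joining every line with the line that precedes it ("previous|current"); the sed call only strips the extra space that word_segment.py leaves after the separator. A minimal Python sketch of the same pairing step (illustrative only, not a file in this repo; the file names are the ones used in the commands above):

# coding:utf-8
# Sketch: build "previous_line|current_line" pairs from a segmented corpus,
# mirroring the awk/sed pipeline in chatbotv2/readme.txt.

def make_pairs(segment_file, pair_file):
    last = ""
    with open(segment_file, "r") as fin, open(pair_file, "w") as fout:
        for line in fin:
            line = line.strip()
            if last != "" and line != "":
                fout.write(last + "|" + line + "\n")
            last = line

if __name__ == "__main__":
    make_pairs("subtitle.corpus.10000.segment",
               "subtitle.corpus.10000.segment.pair")

--------------------------------------------------------------------------------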
/word2vec/makefile:
--------------------------------------------------------------------------------
1 | CC = gcc
2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
4 |
5 | all: word2vec word2phrase distance word-analogy compute-accuracy
6 |
7 | word2vec : word2vec.c
8 | $(CC) word2vec.c -o word2vec $(CFLAGS)
9 | word2phrase : word2phrase.c
10 | $(CC) word2phrase.c -o word2phrase $(CFLAGS)
11 | distance : distance.c
12 | $(CC) distance.c -o distance $(CFLAGS)
13 | word-analogy : word-analogy.c
14 | $(CC) word-analogy.c -o word-analogy $(CFLAGS)
15 | compute-accuracy : compute-accuracy.c
16 | $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS)
17 | chmod +x *.sh
18 |
19 | clean:
20 | rm -rf word2vec word2phrase distance word-analogy compute-accuracy
--------------------------------------------------------------------------------
/chatbotv1/src/test/java/com/shareditor/chatbotv1/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.shareditor.chatbotv1;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigourous Test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/word2vec/demo-phrases.sh:
--------------------------------------------------------------------------------
1 | make
2 | if [ ! -e news.2012.en.shuffled ]; then
3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
4 | gzip -d news.2012.en.shuffled.gz -f
5 | fi
6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
11 | ./distance vectors-phrase.bin
12 |
--------------------------------------------------------------------------------
/word_segment.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding( "utf-8" )
6 |
7 | import jieba
8 | from jieba import analyse
9 |
10 | def segment(input, output):
11 | input_file = open(input, "r")
12 | output_file = open(output, "w")
13 | while True:
14 | line = input_file.readline()
15 | if line:
16 | line = line.strip()
17 | seg_list = jieba.cut(line)
18 | segments = ""
19 | for str in seg_list:
20 | segments = segments + " " + str
21 | segments = segments + "\n"
22 | output_file.write(segments)
23 | else:
24 | break
25 | input_file.close()
26 | output_file.close()
27 |
28 | if __name__ == '__main__':
29 | if 3 != len(sys.argv):
30 | print "Usage: ", sys.argv[0], "input output"
31 | sys.exit(-1)
32 | segment(sys.argv[1], sys.argv[2]);
33 |
--------------------------------------------------------------------------------
/subtitle/preprocess/get_charset_and_conv.py:
--------------------------------------------------------------------------------
1 | import chardet
2 | import sys
3 | import os
4 |
5 | if __name__ == '__main__':
6 | if len(sys.argv) == 2:
7 | for root, dirs, files in os.walk(sys.argv[1]):
8 | for file in files:
9 | file_path = root + "/" + file
10 | f = open(file_path,'r')
11 | data = f.read()
12 | f.close()
13 | encoding = chardet.detect(data)["encoding"]
14 | if encoding not in ("UTF-8-SIG", "UTF-16LE", "utf-8", "ascii"):
15 | try:
16 | gb_content = data.decode("gb18030")
17 | gb_content.encode('utf-8')
18 | f = open(file_path, 'w')
19 | f.write(gb_content.encode('utf-8'))
20 | f.close()
21 | except:
22 | print "except:", file_path
23 |
--------------------------------------------------------------------------------
/word2vec/demo-phrase-accuracy.sh:
--------------------------------------------------------------------------------
1 | make
2 | if [ ! -e news.2012.en.shuffled ]; then
3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
4 | gzip -d news.2012.en.shuffled.gz -f
5 | fi
6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt
12 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/com/shareditor/chatbotv1/HttpServerInboundHandler.java:
--------------------------------------------------------------------------------
1 | package com.shareditor.chatbotv1;
2 |
3 | import io.netty.channel.ChannelFuture;
4 | import io.netty.channel.ChannelFutureListener;
5 | import io.netty.channel.ChannelHandlerContext;
6 | import io.netty.channel.SimpleChannelInboundHandler;
7 | import io.netty.handler.codec.http.FullHttpRequest;
8 | import io.netty.handler.codec.http.HttpResponseStatus;
9 | import io.netty.handler.codec.http.HttpVersion;
10 |
11 | public class HttpServerInboundHandler extends SimpleChannelInboundHandler {
12 |
13 | @Override
14 | protected void messageReceived(ChannelHandlerContext ctx, FullHttpRequest msg) throws Exception {
15 | NettyHttpServletResponse res = new NettyHttpServletResponse(HttpVersion.HTTP_1_1, HttpResponseStatus.OK);
16 | Action.doServlet(msg, res);
17 | ChannelFuture future = ctx.channel().writeAndFlush(res);
18 | future.addListener(ChannelFutureListener.CLOSE);
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/digital_recognition.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding( "utf-8" )
6 |
7 | from tensorflow.examples.tutorials.mnist import input_data
8 | import tensorflow as tf
9 |
10 | flags = tf.app.flags
11 | FLAGS = flags.FLAGS
12 | flags.DEFINE_string('data_dir', './', 'Directory for storing data')
13 |
14 | mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
15 |
16 |
17 | x = tf.placeholder(tf.float32, [None, 784])
18 | W = tf.Variable(tf.zeros([784,10]))
19 | b = tf.Variable(tf.zeros([10]))
20 | y = tf.nn.softmax(tf.matmul(x,W) + b)
21 | y_ = tf.placeholder("float", [None,10])
22 | cross_entropy = -tf.reduce_sum(y_*tf.log(y))
23 | train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
24 |
25 | init = tf.initialize_all_variables()
26 | sess = tf.InteractiveSession()
27 | sess.run(init)
28 | for i in range(1000):
29 | batch_xs, batch_ys = mnist.train.next_batch(100)
30 | sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
31 |
32 | correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
33 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
34 | print(accuracy.eval({x: mnist.test.images, y_: mnist.test.labels}))
35 |
--------------------------------------------------------------------------------
/word2vec/README.txt:
--------------------------------------------------------------------------------
1 | Tools for computing distributed representation of words
2 | ------------------------------------------------------
3 |
4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts.
5 |
6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous
7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following:
8 | - desired vector dimensionality
9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model
10 | - training algorithm: hierarchical softmax and / or negative sampling
11 | - threshold for downsampling the frequent words
12 | - number of threads to use
13 | - the format of the output word vector file (text or binary)
14 |
15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets.
16 |
17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training
18 | is finished, the user can interactively explore the similarity of the words.
19 |
20 | More information about the scripts is provided at https://code.google.com/p/word2vec/
21 |
22 |
--------------------------------------------------------------------------------
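The parameters described above correspond closely to the flags used in demo-word.sh and to the gensim API already used in gensim_word2vec.py. A minimal sketch of an equivalent training call with gensim, assuming a pre-4.0 gensim (whose parameter names size and iter match the call in gensim_word2vec.py) and a whitespace-segmented corpus file named segment_result_lined (the name used there):

# coding:utf-8
# Sketch only: a gensim rough equivalent of the ./word2vec call in demo-word.sh.
from gensim.models import word2vec

sentences = word2vec.LineSentence('segment_result_lined')  # one segmented sentence per line
model = word2vec.Word2Vec(
    sentences,
    sg=0,          # CBOW, same as -cbow 1
    size=200,      # vector dimensionality, -size 200
    window=8,      # context window, -window 8
    negative=25,   # negative sampling, -negative 25
    hs=0,          # no hierarchical softmax, -hs 0
    sample=1e-4,   # downsampling of frequent words, -sample 1e-4
    workers=20,    # threads, -threads 20
    iter=15)       # training epochs, -iter 15
model.save("word_vec_model/model")   # reload later as in gensim_word2vec.py

--------------------------------------------------------------------------------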
/subtitle/preprocess/extract_sentence_srt.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import chardet
3 | import os
4 | import re
5 |
6 | cn=ur"([\u4e00-\u9fa5]+)"
7 | pattern_cn = re.compile(cn)
8 | jp1=ur"([\u3040-\u309F]+)"
9 | pattern_jp1 = re.compile(jp1)
10 | jp2=ur"([\u30A0-\u30FF]+)"
11 | pattern_jp2 = re.compile(jp2)
12 |
13 | for root, dirs, files in os.walk("./srt"):
14 | file_count = len(files)
15 | if file_count > 0:
16 | for index, file in enumerate(files):
17 | f = open(root + "/" + file, "r")
18 | content = f.read()
19 | f.close()
20 | encoding = chardet.detect(content)["encoding"]
21 | try:
22 | for sentence in content.decode(encoding).split('\n'):
23 | if len(sentence) > 0:
24 | match_cn = pattern_cn.findall(sentence)
25 | match_jp1 = pattern_jp1.findall(sentence)
26 | match_jp2 = pattern_jp2.findall(sentence)
27 | sentence = sentence.strip()
28 | if len(match_cn)>0 and len(match_jp1)==0 and len(match_jp2) == 0 and len(sentence)>1 and len(sentence.split(' ')) < 10:
29 | print sentence.encode('utf-8')
30 | except:
31 | continue
32 |
--------------------------------------------------------------------------------
/learning_tensorflow/3.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 |
6 | # Randomly generate num_points points scattered around the line y = 0.1x + 0.3
7 | num_points = 4
8 | vectors_set = []
9 | for i in xrange(num_points):
10 | x1 = np.random.normal(0.0, 0.55)
11 | y1 = x1 * 0.1 + 0.3 + np.random.normal(0.0, 0.03)
12 | vectors_set.append([x1, y1])
13 |
14 | # Split the samples into x and y lists
15 | x_data = [v[0] for v in vectors_set]
16 | y_data = [v[1] for v in vectors_set]
17 | print "x_data=", x_data
18 |
19 |
20 | # Create a 1-D W variable with a random value in [-1, 1]
21 | W = tf.Variable(tf.random_uniform([1], -1.0, 1.0), name='W')
22 | # Create a 1-D b variable initialized to 0
23 | b = tf.Variable(tf.zeros([1]), name='b')
24 | # Compute the predicted value y
25 | y = W * x_data + b
26 | print "y=", y
27 |
28 | # Use the mean squared error between the prediction y and the actual y_data as the loss
29 | loss = tf.reduce_mean(tf.square(y - y_data), name='loss')
30 | # Optimize the parameters with gradient descent
31 | optimizer = tf.train.GradientDescentOptimizer(0.5)
32 | # Training is just minimizing this loss
33 | train = optimizer.minimize(loss, name='train')
34 |
35 | sess = tf.Session()
36 | # Print the graph structure
37 | #print sess.graph_def
38 |
39 | init = tf.initialize_all_variables()
40 | sess.run(init)
41 |
42 | # The initial values of W and b
43 | print "W =", sess.run(W), "b =", sess.run(b), "loss =", sess.run(loss)
44 | # Run 20 training steps
45 | for step in xrange(20):
46 | sess.run(train)
47 | # Print the trained W and b
48 | print "W =", sess.run(W), "b =", sess.run(b), "loss =", sess.run(loss)
49 | # Write a summary file for use with TensorBoard
50 | writer = tf.train.SummaryWriter("./tmp", sess.graph)
51 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/core/ISegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 |
28 | /**
29 | *
30 |  * Sub-segmenter interface
31 | */
32 | interface ISegmenter {
33 |
34 | /**
35 |  * Read the next possible lexeme from the analyzer
36 |  * @param context the segmentation algorithm context
37 | */
38 | void analyze(AnalyzeContext context);
39 |
40 |
41 | /**
42 |  * Reset the sub-segmenter state
43 | */
44 | void reset();
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/baidu_search/baidu_search/spiders/baidu_search.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding( "utf-8" )
6 |
7 | import scrapy
8 | from w3lib.html import remove_tags
9 |
10 | class BaiduSearchSpider(scrapy.Spider):
11 | name = "baidu_search"
12 | allowed_domains = ["baidu.com"]
13 | start_urls = [
14 | "https://www.baidu.com/s?wd=机器学习"
15 | ]
16 |
17 | def parse(self, response):
18 | hrefs = response.selector.xpath('//div[contains(@class, "c-container")]/h3/a/@href').extract()
19 | containers = response.selector.xpath('//div[contains(@class, "c-container")]')
20 | for container in containers:
21 | href = container.xpath('h3/a/@href').extract()[0]
22 | title = remove_tags(container.xpath('h3/a').extract()[0])
23 | c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
24 | abstract = ""
25 | if len(c_abstract) > 0:
26 | abstract = remove_tags(c_abstract[0])
27 | request = scrapy.Request(href, callback=self.parse_url)
28 | request.meta['title'] = title
29 | request.meta['abstract'] = abstract
30 | yield request
31 |
32 | def parse_url(self, response):
33 | print "url:", response.url
34 | print "title:", response.meta['title']
35 | print "abstract:", response.meta['abstract']
36 | content = remove_tags(response.selector.xpath('//body').extract()[0])
37 | print "content_len:", len(content)
38 |
--------------------------------------------------------------------------------
/lstm_code/nicodjimenez/test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 |
4 | from lstm import LstmParam, LstmNetwork
5 |
6 | class ToyLossLayer:
7 | """
8 | Computes square loss with first element of hidden layer array.
9 | """
10 | @classmethod
11 | def loss(self, pred, label):
12 | return (pred[0] - label) ** 2
13 |
14 | @classmethod
15 | def bottom_diff(self, pred, label):
16 | diff = np.zeros_like(pred)
17 | diff[0] = 2 * (pred[0] - label)
18 | return diff
19 |
20 | def example_0():
21 | # learns to repeat simple sequence from random inputs
22 | np.random.seed(0)
23 |
24 | # parameters for input data dimension and lstm cell count
25 | mem_cell_ct = 100
26 | x_dim = 50
27 | concat_len = x_dim + mem_cell_ct
28 | lstm_param = LstmParam(mem_cell_ct, x_dim)
29 | lstm_net = LstmNetwork(lstm_param)
30 | y_list = [-0.5,0.2,0.1, -0.5]
31 | input_val_arr = [np.random.random(x_dim) for _ in y_list]
32 |
33 | for cur_iter in range(100):
34 | print "cur iter: ", cur_iter
35 | print "input_val_arr=", input_val_arr
36 | print "y_list=", y_list
37 | for ind in range(len(y_list)):
38 | lstm_net.x_list_add(input_val_arr[ind])
39 | print "y_pred[%d] : %f" % (ind, lstm_net.lstm_node_list[ind].state.h[0])
40 |
41 | loss = lstm_net.y_list_is(y_list, ToyLossLayer)
42 | print "loss: ", loss
43 | lstm_param.apply_diff(lr=0.1)
44 | lstm_net.x_list_clear()
45 |
46 | if __name__ == "__main__":
47 | example_0()
48 |
49 |
--------------------------------------------------------------------------------
/word_vectors_loader.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import sys
4 | import struct
5 | import math
6 | import numpy as np
7 |
8 | reload(sys)
9 | sys.setdefaultencoding( "utf-8" )
10 |
11 | max_w = 50
12 | float_size = 4
13 |
14 | def load_vectors(input):
15 | print "begin load vectors"
16 |
17 | input_file = open(input, "rb")
18 |
19 |     # Read the vocabulary size and the vector dimensionality
20 | words_and_size = input_file.readline()
21 | words_and_size = words_and_size.strip()
22 | words = long(words_and_size.split(' ')[0])
23 | size = long(words_and_size.split(' ')[1])
24 | print "words =", words
25 | print "size =", size
26 |
27 | word_vector = {}
28 |
29 | for b in range(0, words):
30 | a = 0
31 | word = ''
32 |         # Read one word, character by character, up to the separating space
33 | while True:
34 | c = input_file.read(1)
35 | word = word + c
36 |             if c == '' or c == ' ':
37 | break
38 | if a < max_w and c != '\n':
39 | a = a + 1
40 | word = word.strip()
41 |
42 |         # Read the word vector for this word
43 |         vector = np.empty([size])
44 | for index in range(0, size):
45 | m = input_file.read(float_size)
46 | (weight,) = struct.unpack('f', m)
47 | vector[index] = weight
48 |
49 |         # Store the word and its corresponding vector in the dict
50 | word_vector[word.decode('utf-8')] = vector
51 |
52 | input_file.close()
53 |
54 | print "load vectors finish"
55 | return word_vector
56 |
57 | if __name__ == '__main__':
58 | if 2 != len(sys.argv):
59 | print "Usage: ", sys.argv[0], "vectors.bin"
60 | sys.exit(-1)
61 | d = load_vectors(sys.argv[1])
62 | print d[u'真的']
63 |
--------------------------------------------------------------------------------
/subtitle/preprocess/extract_sentence_ass.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import chardet
3 | import os
4 | import re
5 |
6 | cn=ur"([\u4e00-\u9fa5]+)"
7 | pattern_cn = re.compile(cn)
8 | jp1=ur"([\u3040-\u309F]+)"
9 | pattern_jp1 = re.compile(jp1)
10 | jp2=ur"([\u30A0-\u30FF]+)"
11 | pattern_jp2 = re.compile(jp2)
12 |
13 | for root, dirs, files in os.walk("./ass"):
14 | file_count = len(files)
15 | if file_count > 0:
16 | for index, file in enumerate(files):
17 | f = open(root + "/" + file, "r")
18 | content = f.read()
19 | f.close()
20 | encoding = chardet.detect(content)["encoding"]
21 | try:
22 | for line in content.decode(encoding).split('\n'):
23 | if line.find('Dialogue') == 0 and len(line) < 500:
24 | fields = line.split(',')
25 | sentence = fields[len(fields)-1]
26 | tag_fields = sentence.split('}')
27 | if len(tag_fields) > 1:
28 | sentence = tag_fields[len(tag_fields)-1]
29 | match_cn = pattern_cn.findall(sentence)
30 | match_jp1 = pattern_jp1.findall(sentence)
31 | match_jp2 = pattern_jp2.findall(sentence)
32 | sentence = sentence.strip()
33 | if len(match_cn)>0 and len(match_jp1)==0 and len(match_jp2) == 0 and len(sentence)>1 and len(sentence.split(' ')) < 10:
34 | sentence = sentence.replace('\N', '')
35 | print sentence.encode('utf-8')
36 | except:
37 | continue
38 |
--------------------------------------------------------------------------------
/subtitle/preprocess/extract_sentence_ssa.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import chardet
3 | import os
4 | import re
5 |
6 | cn=ur"([\u4e00-\u9fa5]+)"
7 | pattern_cn = re.compile(cn)
8 | jp1=ur"([\u3040-\u309F]+)"
9 | pattern_jp1 = re.compile(jp1)
10 | jp2=ur"([\u30A0-\u30FF]+)"
11 | pattern_jp2 = re.compile(jp2)
12 |
13 | for root, dirs, files in os.walk("./ssa"):
14 | file_count = len(files)
15 | if file_count > 0:
16 | for index, file in enumerate(files):
17 | f = open(root + "/" + file, "r")
18 | content = f.read()
19 | f.close()
20 | encoding = chardet.detect(content)["encoding"]
21 | try:
22 | for line in content.decode(encoding).split('\n'):
23 | if line.find('Dialogue') == 0 and len(line) < 500:
24 | fields = line.split(',')
25 | sentence = fields[len(fields)-1]
26 | tag_fields = sentence.split('}')
27 | if len(tag_fields) > 1:
28 | sentence = tag_fields[len(tag_fields)-1]
29 | match_cn = pattern_cn.findall(sentence)
30 | match_jp1 = pattern_jp1.findall(sentence)
31 | match_jp2 = pattern_jp2.findall(sentence)
32 | sentence = sentence.strip()
33 | if len(match_cn)>0 and len(match_jp1)==0 and len(match_jp2) == 0 and len(sentence)>1 and len(sentence.split(' ')) < 10:
34 | sentence = sentence.replace('\N', '')
35 | print sentence.encode('utf-8')
36 | except:
37 | continue
38 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/com/shareditor/chatbotv1/Searcher.java:
--------------------------------------------------------------------------------
1 | package com.shareditor.chatbotv1;
2 |
3 | import io.netty.bootstrap.ServerBootstrap;
4 | import io.netty.channel.ChannelFuture;
5 | import io.netty.channel.ChannelInitializer;
6 | import io.netty.channel.ChannelOption;
7 | import io.netty.channel.ChannelPipeline;
8 | import io.netty.channel.EventLoopGroup;
9 | import io.netty.channel.nio.NioEventLoopGroup;
10 | import io.netty.channel.socket.SocketChannel;
11 | import io.netty.channel.socket.nio.NioServerSocketChannel;
12 | import io.netty.handler.codec.http.HttpObjectAggregator;
13 | import io.netty.handler.codec.http.HttpRequestDecoder;
14 | import io.netty.handler.codec.http.HttpResponseEncoder;
15 | import io.netty.handler.logging.LogLevel;
16 | import io.netty.handler.logging.LoggingHandler;
17 |
18 | public class Searcher {
19 |
20 |
21 | public static void main(String[] args) throws InterruptedException {
22 | EventLoopGroup bossGroup = new NioEventLoopGroup(1);
23 | EventLoopGroup workerGroup = new NioEventLoopGroup();
24 | ServerBootstrap b = new ServerBootstrap();
25 | b.group(bossGroup, workerGroup)
26 | .channel(NioServerSocketChannel.class)
27 | .option(ChannelOption.SO_BACKLOG, 128)
28 | .handler(new LoggingHandler(LogLevel.TRACE))
29 | .childHandler(new ChannelInitializer() {
30 | @Override
31 | public void initChannel(SocketChannel ch) throws Exception {
32 | ChannelPipeline p = ch.pipeline();
33 | p.addLast("http-decoder", new HttpRequestDecoder());
34 | p.addLast("http-aggregator", new HttpObjectAggregator(65535));
35 | p.addLast("http-encoder", new HttpResponseEncoder());
36 | p.addLast("handler", new HttpServerInboundHandler());
37 | }
38 | });
39 | ChannelFuture f = b.bind("0.0.0.0", 8765).sync();
40 | f.channel().closeFuture().sync();
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/seq2seq/tflearn_prj/my_tflearn_demo.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import numpy as np
4 | import tflearn
5 | import sys
6 |
7 | # Download the Titanic dataset
8 | from tflearn.datasets import titanic
9 | titanic.download_dataset('titanic_dataset.csv')
10 |
11 | # Load CSV file, indicate that the first column represents labels
12 | from tflearn.data_utils import load_csv
13 | data, labels = load_csv('titanic_dataset.csv', target_column=0,
14 | categorical_labels=True, n_classes=2)
15 |
16 | # Preprocessing function
17 | def preprocess(data, columns_to_ignore):
18 | # Sort by descending id and delete columns
19 | for id in sorted(columns_to_ignore, reverse=True):
20 | [r.pop(id) for r in data]
21 | for i in range(len(data)):
22 | # Converting 'sex' field to float (id is 1 after removing labels column)
23 | data[i][1] = 1. if data[i][1] == 'female' else 0.
24 | return np.array(data, dtype=np.float32)
25 |
26 | # Ignore 'name' and 'ticket' columns (id 1 & 6 of data array)
27 | to_ignore=[1, 6]
28 |
29 | # Preprocess data
30 | data = preprocess(data, to_ignore)
31 |
32 | # Build neural network
33 | net = tflearn.input_data(shape=[None, 6])
34 | net = tflearn.fully_connected(net, 32)
35 | net = tflearn.fully_connected(net, 32)
36 | net = tflearn.fully_connected(net, 2, activation='softmax')
37 | net = tflearn.regression(net)
38 |
39 | # Define model
40 | model = tflearn.DNN(net)
41 | # Start training (apply gradient descent algorithm)
42 | model.fit(data, labels, n_epoch=10, batch_size=16, show_metric=True)
43 |
44 | # Let's create some data for DiCaprio and Winslet
45 | dicaprio = [3, 'Jack Dawson', 'male', 19, 0, 0, 'N/A', 5.0000]
46 | winslet = [1, 'Rose DeWitt Bukater', 'female', 17, 1, 2, 'N/A', 100.0000]
47 | # Preprocess data
48 | dicaprio, winslet = preprocess([dicaprio, winslet], to_ignore)
49 | # Predict surviving chances (class 1 results)
50 | pred = model.predict([dicaprio, winslet])
51 | print("DiCaprio Surviving Rate:", pred[0][1])
52 | print("Winslet Surviving Rate:", pred[1][1])
53 |
54 |
--------------------------------------------------------------------------------
/lstm_code/nicodjimenez/README.md:
--------------------------------------------------------------------------------
1 | # lstm
2 | A basic lstm network can be written from scratch in a few hundred lines of python, yet most of us have a hard time figuring out how lstm's actually work. The original Neural Computation [paper](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=3&cad=rja&uact=8&ved=0CDAQFjACahUKEwj1iZLX5efGAhVMpIgKHbv3DiI&url=http%3A%2F%2Fdeeplearning.cs.cmu.edu%2Fpdfs%2FHochreiter97_lstm.pdf&ei=ZuirVfW-GMzIogS777uQAg&usg=AFQjCNGoFvqrva4rDCNIcqNe_SiPL_VPxg&sig2=ZYnsGpdfHjRbK8xdr1thBg&bvm=bv.98197061,d.cGU) is too technical for non experts. Most blogs online on the topic seem to be written by people
3 | who have never implemented lstm's for people who will not implement them either. Other blogs are written by experts (like this [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)) and lack simplified illustrative source code that actually does something. The [Apollo](https://github.com/Russell91/apollo) library built on top of caffe is terrific and features a fast lstm implementation. However, the downside of efficient implementations is that the source code is hard to follow.
4 |
5 | This repo features a minimal lstm implementation for people that are curious about lstms to the point of wanting to know how lstm's might be implemented. The code here follows notational conventions set forth in [this](http://arxiv.org/abs/1506.00019)
6 | well written tutorial introduction. This article should be read before trying to understand this code (at least the part about lstm's). By running `python test.py` you will have a minimal example of an lstm network learning to predict an output sequence of numbers in [-1,1] by using a Euclidean loss on the first element of each node's hidden layer.
7 |
8 | Play with code, add functionality, and try it on different datasets. Pull requests welcome.
9 |
10 | Please read [my blog article](http://nicodjimenez.github.io/2014/08/08/lstm.html) if you want details on the backprop part of the code.
11 |
12 | Also, check out a version of this code written in the D programming language by Mathias Baumann: https://github.com/Marenz/lstm
13 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/cfg/Configuration.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.cfg;
26 |
27 | import java.util.List;
28 |
29 | /**
30 | *
31 |  * Configuration management interface
32 | *
33 | */
34 | public interface Configuration {
35 |
36 |
37 |
38 | /**
39 | 	 * Returns the useSmart flag.
40 | 	 * useSmart = true: the segmenter uses the smart segmentation strategy; false: fine-grained segmentation
41 | * @return useSmart
42 | */
43 | public boolean useSmart();
44 |
45 | /**
46 | 	 * Sets the useSmart flag.
47 | 	 * useSmart = true: the segmenter uses the smart segmentation strategy; false: fine-grained segmentation
48 | * @param useSmart
49 | */
50 | public void setUseSmart(boolean useSmart);
51 |
52 |
53 | /**
54 | 	 * Gets the main dictionary path
55 | 	 *
56 | 	 * @return String main dictionary path
57 | */
58 | public String getMainDictionary();
59 |
60 | /**
61 | 	 * Gets the quantifier dictionary path
62 | 	 * @return String quantifier dictionary path
63 | */
64 | public String getQuantifierDicionary();
65 |
66 | /**
67 | 	 * Gets the extension dictionary configuration paths
68 | 	 * @return List<String> paths relative to the class loader
69 | 	 */
70 | 	public List<String> getExtDictionarys();
71 |
72 |
73 | /**
74 | 	 * Gets the extension stop-word dictionary configuration paths
75 | 	 * @return List<String> paths relative to the class loader
76 | 	 */
77 | 	public List<String> getExtStopWordDictionarys();
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/lstm_code/nicodjimenez/test2.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 |
4 | from lstm import LstmParam, LstmNetwork
5 |
6 | class ToyLossLayer:
7 | """
8 | Computes square loss with first element of hidden layer array.
9 | """
10 | @classmethod
11 | def loss(self, pred, label):
12 | return (pred[0] - label) ** 2
13 |
14 | @classmethod
15 | def bottom_diff(self, pred, label):
16 | diff = np.zeros_like(pred)
17 | diff[0] = 2 * (pred[0] - label)
18 | return diff
19 |
20 | class Primes:
21 | def __init__(self):
22 | self.primes = list()
23 | for i in range(2, 100):
24 | is_prime = True
25 | for j in range(2, i-1):
26 | if i % j == 0:
27 | is_prime = False
28 | if is_prime:
29 | self.primes.append(i)
30 | self.primes_count = len(self.primes)
31 | def get_sample(self, x_dim, y_dim, index):
32 | result = np.zeros((x_dim+y_dim))
33 | for i in range(index, index + x_dim + y_dim):
34 | result[i-index] = self.primes[i%self.primes_count]/100.0
35 | return result
36 |
37 |
38 | def example_0():
39 | mem_cell_ct = 100
40 | x_dim = 50
41 | concat_len = x_dim + mem_cell_ct
42 | lstm_param = LstmParam(mem_cell_ct, x_dim)
43 | lstm_net = LstmNetwork(lstm_param)
44 |
45 | primes = Primes()
46 | x_list = []
47 | y_list = []
48 | for i in range(0, 10):
49 | sample = primes.get_sample(x_dim, 1, i)
50 | x = sample[0:x_dim]
51 | y = sample[x_dim:x_dim+1].tolist()[0]
52 | x_list.append(x)
53 | y_list.append(y)
54 |
55 | for cur_iter in range(10000):
56 | if cur_iter % 1000 == 0:
57 | print "y_list=", y_list
58 | for ind in range(len(y_list)):
59 | lstm_net.x_list_add(x_list[ind])
60 | if cur_iter % 1000 == 0:
61 | print "y_pred[%d] : %f" % (ind, lstm_net.lstm_node_list[ind].state.h[0])
62 |
63 | loss = lstm_net.y_list_is(y_list, ToyLossLayer)
64 | if cur_iter % 1000 == 0:
65 | print "loss: ", loss
66 | lstm_param.apply_diff(lr=0.01)
67 | lstm_net.x_list_clear()
68 |
69 | if __name__ == "__main__":
70 | example_0()
71 |
--------------------------------------------------------------------------------
/learning_tensorflow/2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import tensorflow as tf
4 |
5 | with tf.Graph().as_default() as g:
6 |     with g.name_scope("myscope") as scope: # with this scope, the ops defined below get names prefixed like myscope/Placeholder
7 |         sess = tf.Session(target='', graph = g, config=None) # target is the TF execution engine to connect to
8 |         print "graph version:", g.version # 0
9 |         a = tf.placeholder("float")
10 |         print a.op # prints the whole operation info, the same as what g.get_operations returns below
11 | print "graph version:", g.version # 1
12 | b = tf.placeholder("float")
13 | print "graph version:", g.version # 2
14 | c = tf.placeholder("float")
15 | print "graph version:", g.version # 3
16 |         y1 = tf.mul(a, b) # can also be written as a * b
17 |         print "graph version:", g.version # 4
18 |         y2 = tf.mul(y1, c) # can also be written as y1 * c
19 | print "graph version:", g.version # 5
20 | operations = g.get_operations()
21 | for (i, op) in enumerate(operations):
22 | print "============ operation", i+1, "==========="
23 |             print op # a structure including name, op, attr, input, etc.; it differs from op to op
24 | assert y1.graph is g
25 | assert sess.graph is g
26 | print "================ graph object address ================"
27 | print sess.graph
28 | print "================ graph define ================"
29 | print sess.graph_def
30 | print "================ sess str ================"
31 | print sess.sess_str
32 |         print sess.run(y1, feed_dict={a: 3, b: 3}) # 9.0; feed_dict maps elements of the graph to values
33 |         print sess.run(fetches=[b,y1], feed_dict={a: 3, b: 3}, options=None, run_metadata=None) # the return value has the same shape as the fetches passed in
34 |         print sess.run({'ret_name':y1}, feed_dict={a: 3, b: 3}) # {'ret_name': 9.0} the return value has the same shape as the fetches passed in
35 |
36 | assert tf.get_default_session() is not sess
37 |         with sess.as_default(): # make sess the default session: inside this block tf.get_default_session() is sess, outside it is not
38 | assert tf.get_default_session() is sess
39 |
40 |         h = sess.partial_run_setup([y1, y2], [a, b, c]) # staged execution; the arguments specify the fetches and the feed list
41 |         res = sess.partial_run(h, y1, feed_dict={a: 3, b: 4}) # 12, run the first stage
42 |         res = sess.partial_run(h, y2, feed_dict={c: res}) # 144.0, run the second stage, using the result of the first stage
43 | print "partial_run res:", res
44 | sess.close()
45 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.lucene;
26 |
27 | import java.io.Reader;
28 |
29 | import org.apache.lucene.analysis.Analyzer;
30 | import org.apache.lucene.analysis.Tokenizer;
31 |
32 | /**
33 |  * IK analyzer, an implementation of the Lucene Analyzer interface
34 |  * Compatible with Lucene 4.0
35 | */
36 | public final class IKAnalyzer extends Analyzer{
37 |
38 | private boolean useSmart;
39 |
40 | public boolean useSmart() {
41 | return useSmart;
42 | }
43 |
44 | public void setUseSmart(boolean useSmart) {
45 | this.useSmart = useSmart;
46 | }
47 |
48 | /**
49 | 	 * IK analyzer, Lucene Analyzer interface implementation class
50 | 	 *
51 | 	 * Uses the fine-grained segmentation algorithm by default
52 | */
53 | public IKAnalyzer(){
54 | this(false);
55 | }
56 |
57 | /**
58 | 	 * IK analyzer, Lucene Analyzer interface implementation class
59 | 	 *
60 | 	 * @param useSmart when true, the segmenter performs smart segmentation
61 | */
62 | public IKAnalyzer(boolean useSmart){
63 | super();
64 | this.useSmart = useSmart;
65 | }
66 |
67 | /**
68 | 	 * Overrides the Analyzer interface to build the tokenization components
69 | */
70 | @Override
71 | protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
72 | Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart());
73 | return new TokenStreamComponents(_IKTokenizer);
74 | }
75 |
76 | }
77 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/com/shareditor/chatbotv1/NettyHttpServletResponse.java:
--------------------------------------------------------------------------------
1 | package com.shareditor.chatbotv1;
2 |
3 | import io.netty.buffer.ByteBuf;
4 | import io.netty.handler.codec.http.DefaultHttpResponse;
5 | import io.netty.handler.codec.http.FullHttpResponse;
6 | import io.netty.handler.codec.http.HttpHeaders;
7 | import io.netty.handler.codec.http.HttpResponseStatus;
8 | import io.netty.handler.codec.http.HttpVersion;
9 |
10 | public class NettyHttpServletResponse extends DefaultHttpResponse implements FullHttpResponse {
11 |
12 | private ByteBuf content;
13 |
14 | public NettyHttpServletResponse(HttpVersion version, HttpResponseStatus status) {
15 | super(version, status);
16 | }
17 |
18 | public HttpHeaders trailingHeaders() {
19 | // TODO Auto-generated method stub
20 | return null;
21 | }
22 |
23 | public void setContent(ByteBuf buf) {
24 | this.content = buf;
25 | }
26 |
27 | public ByteBuf content() {
28 | return content;
29 | }
30 |
31 | public int refCnt() {
32 | // TODO Auto-generated method stub
33 | return 0;
34 | }
35 |
36 | public boolean release() {
37 | // TODO Auto-generated method stub
38 | return false;
39 | }
40 |
41 | public boolean release(int decrement) {
42 | // TODO Auto-generated method stub
43 | return false;
44 | }
45 |
46 | public FullHttpResponse copy(ByteBuf newContent) {
47 | // TODO Auto-generated method stub
48 | return null;
49 | }
50 |
51 | public FullHttpResponse copy() {
52 | // TODO Auto-generated method stub
53 | return null;
54 | }
55 |
56 | public FullHttpResponse retain(int increment) {
57 | // TODO Auto-generated method stub
58 | return null;
59 | }
60 |
61 | public FullHttpResponse retain() {
62 | // TODO Auto-generated method stub
63 | return null;
64 | }
65 |
66 | public FullHttpResponse touch() {
67 | // TODO Auto-generated method stub
68 | return null;
69 | }
70 |
71 | public FullHttpResponse touch(Object hint) {
72 | // TODO Auto-generated method stub
73 | return null;
74 | }
75 |
76 | public FullHttpResponse duplicate() {
77 | // TODO Auto-generated method stub
78 | return null;
79 | }
80 |
81 | public FullHttpResponse setProtocolVersion(HttpVersion version) {
82 | // TODO Auto-generated method stub
83 | return null;
84 | }
85 |
86 | public FullHttpResponse setStatus(HttpResponseStatus status) {
87 | // TODO Auto-generated method stub
88 | return null;
89 | }
90 |
91 | }
92 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/resources/quantifier.dic:
--------------------------------------------------------------------------------
1 | 丈
2 | 下
3 | 世
4 | 世纪
5 | 两
6 | 个
7 | 中
8 | 串
9 | 亩
10 | 人
11 | 介
12 | 付
13 | 代
14 | 件
15 | 任
16 | 份
17 | 伏
18 | 伙
19 | 位
20 | 位数
21 | 例
22 | 倍
23 | 像素
24 | 元
25 | 克
26 | 克拉
27 | 公亩
28 | 公克
29 | 公分
30 | 公升
31 | 公尺
32 | 公担
33 | 公斤
34 | 公里
35 | 公顷
36 | 具
37 | 册
38 | 出
39 | 刀
40 | 分
41 | 分钟
42 | 分米
43 | 划
44 | 列
45 | 则
46 | 刻
47 | 剂
48 | 剑
49 | 副
50 | 加仑
51 | 勺
52 | 包
53 | 匙
54 | 匹
55 | 区
56 | 千克
57 | 千米
58 | 升
59 | 卷
60 | 厅
61 | 厘
62 | 厘米
63 | 双
64 | 发
65 | 口
66 | 句
67 | 只
68 | 台
69 | 叶
70 | 号
71 | 名
72 | 吨
73 | 听
74 | 员
75 | 周
76 | 周年
77 | 品
78 | 回
79 | 团
80 | 圆
81 | 圈
82 | 地
83 | 场
84 | 块
85 | 坪
86 | 堆
87 | 声
88 | 壶
89 | 处
90 | 夜
91 | 大
92 | 天
93 | 头
94 | 套
95 | 女
96 | 孔
97 | 字
98 | 宗
99 | 室
100 | 家
101 | 寸
102 | 对
103 | 封
104 | 尊
105 | 小时
106 | 尺
107 | 尾
108 | 局
109 | 层
110 | 届
111 | 岁
112 | 师
113 | 帧
114 | 幅
115 | 幕
116 | 幢
117 | 平方
118 | 平方公尺
119 | 平方公里
120 | 平方分米
121 | 平方厘米
122 | 平方码
123 | 平方米
124 | 平方英寸
125 | 平方英尺
126 | 平方英里
127 | 平米
128 | 年
129 | 年代
130 | 年级
131 | 度
132 | 座
133 | 式
134 | 引
135 | 张
136 | 成
137 | 战
138 | 截
139 | 户
140 | 房
141 | 所
142 | 扇
143 | 手
144 | 打
145 | 批
146 | 把
147 | 折
148 | 担
149 | 拍
150 | 招
151 | 拨
152 | 拳
153 | 指
154 | 掌
155 | 排
156 | 撮
157 | 支
158 | 文
159 | 斗
160 | 斤
161 | 方
162 | 族
163 | 日
164 | 时
165 | 曲
166 | 月
167 | 月份
168 | 期
169 | 本
170 | 朵
171 | 村
172 | 束
173 | 条
174 | 来
175 | 杯
176 | 枚
177 | 枝
178 | 枪
179 | 架
180 | 柄
181 | 柜
182 | 栋
183 | 栏
184 | 株
185 | 样
186 | 根
187 | 格
188 | 案
189 | 桌
190 | 档
191 | 桩
192 | 桶
193 | 梯
194 | 棵
195 | 楼
196 | 次
197 | 款
198 | 步
199 | 段
200 | 毛
201 | 毫
202 | 毫升
203 | 毫米
204 | 毫克
205 | 池
206 | 洲
207 | 派
208 | 海里
209 | 滴
210 | 炮
211 | 点
212 | 点钟
213 | 片
214 | 版
215 | 环
216 | 班
217 | 瓣
218 | 瓶
219 | 生
220 | 男
221 | 画
222 | 界
223 | 盆
224 | 盎司
225 | 盏
226 | 盒
227 | 盘
228 | 相
229 | 眼
230 | 石
231 | 码
232 | 碗
233 | 碟
234 | 磅
235 | 种
236 | 科
237 | 秒
238 | 秒钟
239 | 窝
240 | 立方公尺
241 | 立方分米
242 | 立方厘米
243 | 立方码
244 | 立方米
245 | 立方英寸
246 | 立方英尺
247 | 站
248 | 章
249 | 笔
250 | 等
251 | 筐
252 | 筒
253 | 箱
254 | 篇
255 | 篓
256 | 篮
257 | 簇
258 | 米
259 | 类
260 | 粒
261 | 级
262 | 组
263 | 维
264 | 缕
265 | 缸
266 | 罐
267 | 网
268 | 群
269 | 股
270 | 脚
271 | 船
272 | 艇
273 | 艘
274 | 色
275 | 节
276 | 英亩
277 | 英寸
278 | 英尺
279 | 英里
280 | 行
281 | 袋
282 | 角
283 | 言
284 | 课
285 | 起
286 | 趟
287 | 路
288 | 车
289 | 转
290 | 轮
291 | 辆
292 | 辈
293 | 连
294 | 通
295 | 遍
296 | 部
297 | 里
298 | 重
299 | 针
300 | 钟
301 | 钱
302 | 锅
303 | 门
304 | 间
305 | 队
306 | 阶段
307 | 隅
308 | 集
309 | 页
310 | 顶
311 | 顷
312 | 项
313 | 顿
314 | 颗
315 | 餐
316 | 首
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/dic/quantifier.dic:
--------------------------------------------------------------------------------
1 | 丈
2 | 下
3 | 世
4 | 世纪
5 | 两
6 | 个
7 | 中
8 | 串
9 | 亩
10 | 人
11 | 介
12 | 付
13 | 代
14 | 件
15 | 任
16 | 份
17 | 伏
18 | 伙
19 | 位
20 | 位数
21 | 例
22 | 倍
23 | 像素
24 | 元
25 | 克
26 | 克拉
27 | 公亩
28 | 公克
29 | 公分
30 | 公升
31 | 公尺
32 | 公担
33 | 公斤
34 | 公里
35 | 公顷
36 | 具
37 | 册
38 | 出
39 | 刀
40 | 分
41 | 分钟
42 | 分米
43 | 划
44 | 列
45 | 则
46 | 刻
47 | 剂
48 | 剑
49 | 副
50 | 加仑
51 | 勺
52 | 包
53 | 匙
54 | 匹
55 | 区
56 | 千克
57 | 千米
58 | 升
59 | 卷
60 | 厅
61 | 厘
62 | 厘米
63 | 双
64 | 发
65 | 口
66 | 句
67 | 只
68 | 台
69 | 叶
70 | 号
71 | 名
72 | 吨
73 | 听
74 | 员
75 | 周
76 | 周年
77 | 品
78 | 回
79 | 团
80 | 圆
81 | 圈
82 | 地
83 | 场
84 | 块
85 | 坪
86 | 堆
87 | 声
88 | 壶
89 | 处
90 | 夜
91 | 大
92 | 天
93 | 头
94 | 套
95 | 女
96 | 孔
97 | 字
98 | 宗
99 | 室
100 | 家
101 | 寸
102 | 对
103 | 封
104 | 尊
105 | 小时
106 | 尺
107 | 尾
108 | 局
109 | 层
110 | 届
111 | 岁
112 | 师
113 | 帧
114 | 幅
115 | 幕
116 | 幢
117 | 平方
118 | 平方公尺
119 | 平方公里
120 | 平方分米
121 | 平方厘米
122 | 平方码
123 | 平方米
124 | 平方英寸
125 | 平方英尺
126 | 平方英里
127 | 平米
128 | 年
129 | 年代
130 | 年级
131 | 度
132 | 座
133 | 式
134 | 引
135 | 张
136 | 成
137 | 战
138 | 截
139 | 户
140 | 房
141 | 所
142 | 扇
143 | 手
144 | 打
145 | 批
146 | 把
147 | 折
148 | 担
149 | 拍
150 | 招
151 | 拨
152 | 拳
153 | 指
154 | 掌
155 | 排
156 | 撮
157 | 支
158 | 文
159 | 斗
160 | 斤
161 | 方
162 | 族
163 | 日
164 | 时
165 | 曲
166 | 月
167 | 月份
168 | 期
169 | 本
170 | 朵
171 | 村
172 | 束
173 | 条
174 | 来
175 | 杯
176 | 枚
177 | 枝
178 | 枪
179 | 架
180 | 柄
181 | 柜
182 | 栋
183 | 栏
184 | 株
185 | 样
186 | 根
187 | 格
188 | 案
189 | 桌
190 | 档
191 | 桩
192 | 桶
193 | 梯
194 | 棵
195 | 楼
196 | 次
197 | 款
198 | 步
199 | 段
200 | 毛
201 | 毫
202 | 毫升
203 | 毫米
204 | 毫克
205 | 池
206 | 洲
207 | 派
208 | 海里
209 | 滴
210 | 炮
211 | 点
212 | 点钟
213 | 片
214 | 版
215 | 环
216 | 班
217 | 瓣
218 | 瓶
219 | 生
220 | 男
221 | 画
222 | 界
223 | 盆
224 | 盎司
225 | 盏
226 | 盒
227 | 盘
228 | 相
229 | 眼
230 | 石
231 | 码
232 | 碗
233 | 碟
234 | 磅
235 | 种
236 | 科
237 | 秒
238 | 秒钟
239 | 窝
240 | 立方公尺
241 | 立方分米
242 | 立方厘米
243 | 立方码
244 | 立方米
245 | 立方英寸
246 | 立方英尺
247 | 站
248 | 章
249 | 笔
250 | 等
251 | 筐
252 | 筒
253 | 箱
254 | 篇
255 | 篓
256 | 篮
257 | 簇
258 | 米
259 | 类
260 | 粒
261 | 级
262 | 组
263 | 维
264 | 缕
265 | 缸
266 | 罐
267 | 网
268 | 群
269 | 股
270 | 脚
271 | 船
272 | 艇
273 | 艘
274 | 色
275 | 节
276 | 英亩
277 | 英寸
278 | 英尺
279 | 英里
280 | 行
281 | 袋
282 | 角
283 | 言
284 | 课
285 | 起
286 | 趟
287 | 路
288 | 车
289 | 转
290 | 轮
291 | 辆
292 | 辈
293 | 连
294 | 通
295 | 遍
296 | 部
297 | 里
298 | 重
299 | 针
300 | 钟
301 | 钱
302 | 锅
303 | 门
304 | 间
305 | 队
306 | 阶段
307 | 隅
308 | 集
309 | 页
310 | 顶
311 | 顷
312 | 项
313 | 顿
314 | 颗
315 | 餐
316 | 首
--------------------------------------------------------------------------------
/chatbotv1/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 | 
5 |   <groupId>com.shareditor</groupId>
6 |   <artifactId>chatbotv1</artifactId>
7 |   <version>0.0.1-SNAPSHOT</version>
8 |   <packaging>jar</packaging>
9 | 
10 |   <name>chatbotv1</name>
11 |   <url>http://maven.apache.org</url>
12 | 
13 |   <properties>
14 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15 |   </properties>
16 | 
17 |   <dependencies>
18 |     <dependency>
19 |       <groupId>junit</groupId>
20 |       <artifactId>junit</artifactId>
21 |       <version>3.8.1</version>
22 |       <scope>test</scope>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>org.apache.lucene</groupId>
26 |       <artifactId>lucene-core</artifactId>
27 |       <version>4.9.0</version>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>org.apache.lucene</groupId>
31 |       <artifactId>lucene-queryparser</artifactId>
32 |       <version>4.9.0</version>
33 |     </dependency>
34 |     <dependency>
35 |       <groupId>org.apache.lucene</groupId>
36 |       <artifactId>lucene-analyzers-common</artifactId>
37 |       <version>4.9.0</version>
38 |     </dependency>
39 |     <dependency>
40 |       <groupId>io.netty</groupId>
41 |       <artifactId>netty-all</artifactId>
42 |       <version>5.0.0.Alpha2</version>
43 |     </dependency>
44 |     <dependency>
45 |       <groupId>com.alibaba</groupId>
46 |       <artifactId>fastjson</artifactId>
47 |       <version>1.1.41</version>
48 |     </dependency>
49 |     <dependency>
50 |       <groupId>log4j</groupId>
51 |       <artifactId>log4j</artifactId>
52 |       <version>1.2.14</version>
53 |     </dependency>
54 |   </dependencies>
55 | 
56 |   <build>
57 |     <plugins>
58 |       <plugin>
59 |         <groupId>org.apache.maven.plugins</groupId>
60 |         <artifactId>maven-dependency-plugin</artifactId>
61 |         <executions>
62 |           <execution>
63 |             <id>copy-dependencies</id>
64 |             <phase>prepare-package</phase>
65 |             <goals>
66 |               <goal>copy-dependencies</goal>
67 |             </goals>
68 |             <configuration>
69 |               <outputDirectory>${project.build.directory}/lib</outputDirectory>
70 |               <overWriteReleases>false</overWriteReleases>
71 |               <overWriteSnapshots>false</overWriteSnapshots>
72 |               <overWriteIfNewer>true</overWriteIfNewer>
73 |             </configuration>
74 |           </execution>
75 |         </executions>
76 |       </plugin>
77 |       <plugin>
78 |         <groupId>org.apache.maven.plugins</groupId>
79 |         <artifactId>maven-jar-plugin</artifactId>
80 |         <configuration>
81 |           <archive>
82 |             <manifest>
83 |               <addClasspath>true</addClasspath>
84 |               <classpathPrefix>lib/</classpathPrefix>
85 |               <mainClass>theMainClass</mainClass>
86 |             </manifest>
87 |           </archive>
88 |         </configuration>
89 |       </plugin>
90 |     </plugins>
91 |   </build>
92 | </project>
93 | 
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.sample;
27 |
28 | import java.io.IOException;
29 | import java.io.StringReader;
30 |
31 | import org.apache.lucene.analysis.Analyzer;
32 | import org.apache.lucene.analysis.TokenStream;
33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
36 | import org.wltea.analyzer.lucene.IKAnalyzer;
37 |
38 | /**
39 |  * Demo of tokenization with IKAnalyzer
40 | * 2012-10-22
41 | *
42 | */
43 | public class IKAnalzyerDemo {
44 |
45 | public static void main(String[] args){
46 | 		//Build the IK analyzer, using smart segmentation mode
47 | Analyzer analyzer = new IKAnalyzer(true);
48 |
49 | 		//Obtain the Lucene TokenStream object
50 | TokenStream ts = null;
51 | try {
52 | ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
53 | 			//Get the token offset attribute
54 | 			OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
55 | 			//Get the token text attribute
56 | 			CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
57 | 			//Get the token type attribute
58 | 			TypeAttribute type = ts.addAttribute(TypeAttribute.class);
59 |
60 |
61 | 			//Reset the TokenStream (resets the StringReader)
62 | ts.reset();
63 | 			//Iterate over the tokenization results
64 | while (ts.incrementToken()) {
65 | System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
66 | }
67 | 			//End the TokenStream (the StringReader is closed in the finally block below)
68 | ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
69 |
70 | } catch (IOException e) {
71 | e.printStackTrace();
72 | } finally {
73 | 			//Release all resources held by the TokenStream
74 | if(ts != null){
75 | try {
76 | ts.close();
77 | } catch (IOException e) {
78 | e.printStackTrace();
79 | }
80 | }
81 | }
82 |
83 | }
84 |
85 | }
86 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/dic/Hit.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | /**
29 |  * Represents one dictionary-match hit
30 | */
31 | public class Hit {
32 | 	//Hit: no match
33 | 	private static final int UNMATCH = 0x00000000;
34 | 	//Hit: full match
35 | 	private static final int MATCH = 0x00000001;
36 | 	//Hit: prefix match
37 | private static final int PREFIX = 0x00000010;
38 |
39 |
40 | 	//current state of this hit, unmatched by default
41 | private int hitState = UNMATCH;
42 |
43 | 	//the dictionary branch node currently reached during dictionary matching
44 | private DictSegment matchedDictSegment;
45 | /*
46 | 	 * start position of the matched word segment
47 | */
48 | private int begin;
49 | /*
50 | 	 * end position of the matched word segment
51 | */
52 | private int end;
53 |
54 |
55 | /**
56 | 	 * Whether this hit is a full match
57 | */
58 | public boolean isMatch() {
59 | return (this.hitState & MATCH) > 0;
60 | }
61 | /**
62 | *
63 | */
64 | public void setMatch() {
65 | this.hitState = this.hitState | MATCH;
66 | }
67 |
68 | /**
69 | 	 * Whether this hit is a prefix of a word
70 | */
71 | public boolean isPrefix() {
72 | return (this.hitState & PREFIX) > 0;
73 | }
74 | /**
75 | *
76 | */
77 | public void setPrefix() {
78 | this.hitState = this.hitState | PREFIX;
79 | }
80 | /**
81 | 	 * Whether this hit is a non-match
82 | */
83 | public boolean isUnmatch() {
84 | return this.hitState == UNMATCH ;
85 | }
86 | /**
87 | *
88 | */
89 | public void setUnmatch() {
90 | this.hitState = UNMATCH;
91 | }
92 |
93 | public DictSegment getMatchedDictSegment() {
94 | return matchedDictSegment;
95 | }
96 |
97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) {
98 | this.matchedDictSegment = matchedDictSegment;
99 | }
100 |
101 | public int getBegin() {
102 | return begin;
103 | }
104 |
105 | public void setBegin(int begin) {
106 | this.begin = begin;
107 | }
108 |
109 | public int getEnd() {
110 | return end;
111 | }
112 |
113 | public void setEnd(int end) {
114 | this.end = end;
115 | }
116 |
117 | }
118 |
--------------------------------------------------------------------------------
/digital_recognition_cnn.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding( "utf-8" )
6 |
7 | from tensorflow.examples.tutorials.mnist import input_data
8 | import tensorflow as tf
9 |
10 | flags = tf.app.flags
11 | FLAGS = flags.FLAGS
12 | flags.DEFINE_string('data_dir', './', 'Directory for storing data')
13 |
14 | mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
15 |
16 | # Initialize weights (variables) with random values so that neuron outputs are not stuck at 0
17 | def weight_variable(shape):
18 |     # generate random values from a truncated normal distribution
19 | initial = tf.truncated_normal(shape, stddev=0.1)
20 | return tf.Variable(initial)
21 |
22 | # Initialize the bias terms (constants) so that neuron outputs are not stuck at 0
23 | def bias_variable(shape):
24 | initial = tf.constant(0.1, shape=shape)
25 | return tf.Variable(initial)
26 |
27 | # Convolution with stride 1 and zero padding, so input and output have the same size
28 | def conv2d(x, W):
29 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
30 |
31 | # Max pooling with a 2x2 window
32 | def max_pool_2x2(x):
33 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
34 | strides=[1, 2, 2, 1], padding='SAME')
35 |
36 | # 28*28=784
37 | x = tf.placeholder(tf.float32, [None, 784])
38 | # 10 output classes: 0-9
39 | y_ = tf.placeholder("float", [None,10])
40 |
41 | # First convolution layer weights: 5*5 receptive field, 1 input channel, 32 output channels
42 | W_conv1 = weight_variable([5, 5, 1, 32])
43 | # The first convolution layer has 32 bias terms
44 | b_conv1 = bias_variable([32])
45 |
46 | # Reshape x into a 4-D tensor: dims 2 and 3 are the image size, dim 4 is the single color channel
47 | x_image = tf.reshape(x, [-1,28,28,1])
48 |
49 | h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
50 | h_pool1 = max_pool_2x2(h_conv1)
51 |
52 | # Second convolution layer weights: 5*5 receptive field, 32 input channels, 64 output channels
53 | W_conv2 = weight_variable([5, 5, 32, 64])
54 | # The second convolution layer has 64 bias terms
55 | b_conv2 = bias_variable([64])
56 |
57 | h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
58 | h_pool2 = max_pool_2x2(h_conv2)
59 |
60 | # After the second pooling the feature maps are 7*7; the third layer is fully connected, taking the 64 channels as input and producing 1024 neurons
61 | W_fc1 = weight_variable([7 * 7 * 64, 1024])
62 | # The fully connected layer has 1024 bias terms
63 | b_fc1 = bias_variable([1024])
64 |
65 | h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
66 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
67 |
68 | # Apply dropout (keep probability fed as a float placeholder) to reduce overfitting
69 | keep_prob = tf.placeholder("float")
70 | h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
71 |
72 | # Final softmax layer producing the 10 classes
73 | W_fc2 = weight_variable([1024, 10])
74 | b_fc2 = bias_variable([10])
75 |
76 | y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
77 |
78 | cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv))
79 | # Adam optimizer for the gradient descent step
80 | train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
81 | correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
82 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
83 |
84 | sess = tf.InteractiveSession()
85 | sess.run(tf.initialize_all_variables())
86 |
87 | for i in range(20000):
88 | batch = mnist.train.next_batch(50)
89 | if i%100 == 0:
90 | train_accuracy = accuracy.eval(feed_dict={
91 | x:batch[0], y_: batch[1], keep_prob: 1.0})
92 | print "step %d, training accuracy %g"%(i, train_accuracy)
93 | train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
94 |
95 | print "test accuracy %g"%accuracy.eval(feed_dict={
96 | x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0})
97 |
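A quick check of the 7*7*64 figure used above, as a small sketch that is not part of the original script: each 2x2 max pooling halves the spatial size, so 28 becomes 14 and then 7, and the flattened input to the fully connected layer has 7*7*64 values.

size = 28
for _ in range(2):       # two 2x2 max-pooling layers with stride 2
    size = size // 2     # 28 -> 14 -> 7
print(size)              # 7
print(size * size * 64)  # 3136, the flattened input size of W_fc1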
--------------------------------------------------------------------------------
/subtitle/preprocess/filter.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import sys
3 | import re
4 | import chardet
5 |
6 | if __name__ == '__main__':
7 | #illegal=ur"([\u2000-\u2010]+)"
8 | illegal=ur"([\u0000-\u2010]+)"
9 | pattern_illegals = [re.compile(ur"([\u2000-\u2010]+)"), re.compile(ur"([\u0090-\u0099]+)")]
10 | filters = ["字幕", "时间轴:", "校对:", "翻译:", "后期:", "监制:"]
11 | filters.append("时间轴:")
12 | filters.append("校对:")
13 | filters.append("翻译:")
14 | filters.append("后期:")
15 | filters.append("监制:")
16 | filters.append("禁止用作任何商业盈利行为")
17 | filters.append("http")
18 | htmltagregex = re.compile(r'<[^>]+>',re.S)
19 | brace_regex = re.compile(r'\{.*\}',re.S)
20 | slash_regex = re.compile(r'\\\w',re.S)
21 | repeat_regex = re.compile(r'[-=]{10}',re.S)
22 | f = open("./corpus/all.out", "r")
23 | count=0
24 | while True:
25 | line = f.readline()
26 | if line:
27 | line = line.strip()
28 |
29 |             # Encoding check: filter out lines that are not utf-8
30 | gb_content = ''
31 | try:
32 | gb_content = line.decode("utf-8")
33 | except Exception as e:
34 |                 sys.stderr.write("decode error: %s\n" % line)
35 | continue
36 |
37 |             # Chinese check: filter out lines that are not Chinese
38 | need_continue = False
39 | for pattern_illegal in pattern_illegals:
40 | match_illegal = pattern_illegal.findall(gb_content)
41 | if len(match_illegal) > 0:
42 | sys.stderr.write("match_illegal error: %s\n" % line)
43 | need_continue = True
44 | break
45 | if need_continue:
46 | continue
47 |
48 |             # Keyword filtering
49 | need_continue = False
50 | for filter in filters:
51 | try:
52 | line.index(filter)
53 | sys.stderr.write("filter keyword of %s %s\n" % (filter, line))
54 | need_continue = True
55 | break
56 | except:
57 | pass
58 | if need_continue:
59 | continue
60 |
61 |             # Remove episode information (season/episode/frame markers)
62 |             if re.match('.*第.*季.*', line):
63 |                 sys.stderr.write("filter corpus %s\n" % line)
64 |                 continue
65 |             if re.match('.*第.*集.*', line):
66 |                 sys.stderr.write("filter corpus %s\n" % line)
67 |                 continue
68 |             if re.match('.*第.*帧.*', line):
69 |                 sys.stderr.write("filter corpus %s\n" % line)
70 |                 continue
71 |
72 |             # Strip HTML tags
73 | line = htmltagregex.sub('',line)
74 |
75 |             # Strip curly-brace markup
76 | line = brace_regex.sub('', line)
77 |
78 |             # Strip escape sequences
79 | line = slash_regex.sub('', line)
80 |
81 |             # Drop lines made of repeated separator characters
82 | new_line = repeat_regex.sub('', line)
83 | if len(new_line) != len(line):
84 | continue
85 |
86 |             # Remove special characters
87 | line = line.replace('-', '').strip()
88 |
89 | if len(line) > 0:
90 | sys.stdout.write("%s\n" % line)
91 | count+=1
92 | else:
93 | break
94 | f.close()
95 | pass
96 |
--------------------------------------------------------------------------------
/read_images.c:
--------------------------------------------------------------------------------
1 | /************************
2 | * author: SharEDITor
3 | * date: 2016-08-02
4 | * brief: read MNIST data
5 | ************************/
6 | #include <stdio.h>
7 | #include <stdlib.h>
8 | #include <stdint.h>
9 | #include <assert.h>
10 |
11 | unsigned char *lables = NULL;
12 |
13 | /**
14 | * All the integers in the files are stored in the MSB first (high endian) format
15 | */
16 | void copy_int(uint32_t *target, unsigned char *src)
17 | {
18 | *(((unsigned char*)target)+0) = src[3];
19 | *(((unsigned char*)target)+1) = src[2];
20 | *(((unsigned char*)target)+2) = src[1];
21 | *(((unsigned char*)target)+3) = src[0];
22 | }
23 |
24 | int read_lables()
25 | {
26 | FILE *fp = fopen("./train-labels-idx1-ubyte", "r");
27 | if (NULL == fp)
28 | {
29 | return -1;
30 | }
31 | unsigned char head[8];
32 | fread(head, sizeof(unsigned char), 8, fp);
33 | uint32_t magic_number = 0;
34 | uint32_t item_num = 0;
35 | copy_int(&magic_number, &head[0]);
36 | // magic number check
37 | assert(magic_number == 2049);
38 | copy_int(&item_num, &head[4]);
39 |
40 | uint64_t values_size = sizeof(unsigned char) * item_num;
41 | lables = (unsigned char*)malloc(values_size);
42 | fread(lables, sizeof(unsigned char), values_size, fp);
43 |
44 | fclose(fp);
45 | return 0;
46 | }
47 |
48 | int read_images()
49 | {
50 | FILE *fp = fopen("./train-images-idx3-ubyte", "r");
51 | if (NULL == fp)
52 | {
53 | return -1;
54 | }
55 | unsigned char head[16];
56 | fread(head, sizeof(unsigned char), 16, fp);
57 | uint32_t magic_number = 0;
58 | uint32_t images_num = 0;
59 | uint32_t rows = 0;
60 | uint32_t cols = 0;
61 | copy_int(&magic_number, &head[0]);
62 | // magic number check
63 | assert(magic_number == 2051);
64 | copy_int(&images_num, &head[4]);
65 | copy_int(&rows, &head[8]);
66 | copy_int(&cols, &head[12]);
67 |
68 | printf("rows=%d cols=%d\n", rows, cols);
69 |
70 | uint64_t image_size = rows * cols;
71 | uint64_t values_size = sizeof(unsigned char) * images_num * rows * cols;
72 | unsigned char *values = (unsigned char*)malloc(values_size);
73 | fread(values, sizeof(unsigned char), values_size, fp);
74 |
75 | for (int image_index = 0; image_index < images_num; image_index++)
76 | {
77 | // print the label
78 | printf("========================================= %d ======================================\n", lables[image_index]);
79 | for (int row_index = 0; row_index < rows; row_index++)
80 | {
81 | for (int col_index = 0; col_index < cols; col_index++)
82 | {
83 | // print the pixels of image
84 | printf("%3d", values[image_index*image_size+row_index*cols+col_index]);
85 | }
86 | printf("\n");
87 | }
88 | printf("\n");
89 | }
90 |
91 | free(values);
92 | fclose(fp);
93 | return 0;
94 | }
95 |
96 | int main(int argc, char *argv[])
97 | {
98 | if (-1 == read_lables())
99 | {
100 | return -1;
101 | }
102 | if (-1 == read_images())
103 | {
104 | return -1;
105 | }
106 | return 0;
107 | }
108 |
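For comparison, a rough Python sketch (an addition, not part of the repo) of the same big-endian ("MSB first") header layout that copy_int handles byte by byte:

import struct

# Read the IDX label-file header the same way read_lables() does:
# a big-endian uint32 magic number (2049) followed by the item count.
with open("./train-labels-idx1-ubyte", "rb") as f:
    magic, item_num = struct.unpack(">II", f.read(8))  # ">" = big-endian, "I" = uint32
    assert magic == 2049
    labels = f.read(item_num)                          # one byte per label
print(item_num)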
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/core/CharacterUtil.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 |  * Character type recognition utility
25 | */
26 | package org.wltea.analyzer.core;
27 |
28 | /**
29 | *
30 |  * Character type recognition utility class
31 | */
32 | class CharacterUtil {
33 |
34 | public static final int CHAR_USELESS = 0;
35 |
36 | public static final int CHAR_ARABIC = 0X00000001;
37 |
38 | public static final int CHAR_ENGLISH = 0X00000002;
39 |
40 | public static final int CHAR_CHINESE = 0X00000004;
41 |
42 | public static final int CHAR_OTHER_CJK = 0X00000008;
43 |
44 |
45 | /**
46 | 	 * Identify the type of a character
47 | 	 * @param input
48 | 	 * @return int one of the character type constants defined in CharacterUtil
49 | */
50 | static int identifyCharType(char input){
51 | if(input >= '0' && input <= '9'){
52 | return CHAR_ARABIC;
53 |
54 | }else if((input >= 'a' && input <= 'z')
55 | || (input >= 'A' && input <= 'Z')){
56 | return CHAR_ENGLISH;
57 |
58 | }else {
59 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
60 |
61 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
62 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
63 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
64 | 				//currently known Chinese character Unicode blocks
65 | return CHAR_CHINESE;
66 |
67 | 			}else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //full-width forms (digits) and Japanese/Korean characters
68 | 					//Korean character blocks
69 | 					|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
70 | 					|| ub == Character.UnicodeBlock.HANGUL_JAMO
71 | 					|| ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
72 | 					//Japanese character blocks
73 | 					|| ub == Character.UnicodeBlock.HIRAGANA //hiragana
74 | 					|| ub == Character.UnicodeBlock.KATAKANA //katakana
75 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
76 | return CHAR_OTHER_CJK;
77 |
78 | }
79 | }
80 | 		//all other characters are left unprocessed
81 | return CHAR_USELESS;
82 | }
83 |
84 | /**
85 | 	 * Normalize a character (full-width to half-width, upper case to lower case)
86 | * @param input
87 | * @return char
88 | */
89 | static char regularize(char input){
90 | if (input == 12288) {
91 | input = (char) 32;
92 |
93 | }else if (input > 65280 && input < 65375) {
94 | input = (char) (input - 65248);
95 |
96 | }else if (input >= 'A' && input <= 'Z') {
97 | input += 32;
98 | }
99 |
100 | return input;
101 | }
102 | }
103 |
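A rough Python sketch (not part of the repo) of what regularize does; the magic numbers are Unicode code points: 12288 is the ideographic space U+3000, and the full-width forms U+FF01 to U+FF5E sit exactly 65248 (0xFEE0) above their ASCII counterparts.

def regularize_codepoint(code):
    # mirrors CharacterUtil.regularize, operating on code points
    if code == 12288:                     # U+3000 ideographic space -> ASCII space
        code = 32
    elif 65280 < code < 65375:            # full-width forms U+FF01..U+FF5E -> ASCII (offset 0xFEE0)
        code = code - 65248
    elif ord('A') <= code <= ord('Z'):    # upper case -> lower case
        code = code + 32
    return code

print(regularize_codepoint(65313))        # full-width 'A' (U+FF21) -> 65, i.e. half-width 'A'
print(regularize_codepoint(ord('Z')))     # 'Z' -> 122, i.e. 'z'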
--------------------------------------------------------------------------------
/seq2seq/hello_sequence.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | from __future__ import print_function
4 | import numpy as np
5 | import tensorflow as tf
6 | import sys
7 |
8 | vocab_size=256
9 | learning_rate=0.1
10 | # For now, experiment with a single bucket only
11 | buckets=[(10, 10)]
12 | bucket_id=0
13 | # Padding value
14 | PAD=[0]
15 | sample_size=20
16 | # Number of memory units in the LSTM cell
17 | num_units=100
18 | # Number of LSTM layers
19 | num_layers=2
20 | 
21 | # sample_size samples; each sample has a question, an answer and weights, where question and answer are 10-dimensional vectors
22 | # these sample_size samples are treated as temporally dependent
23 | question_sample_list = [map(ord, "hello?") + PAD * 4] * sample_size
24 | answer_sample_list = [map(ord, "world!") + PAD * 4] * sample_size
25 | init_weights_list = [[1.0]*7 + [0.0]*3] *sample_size # mask padding. todo: redundant --
26 |
27 | with tf.Session() as session:
28 |
29 |     # Initialize the network cell
30 | cell = single_cell = tf.nn.rnn_cell.LSTMCell(num_units)
31 | if num_layers > 1:
32 | cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)
33 |
34 |     # Define the seq2seq function
35 | def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
36 | return tf.nn.seq2seq.embedding_rnn_seq2seq(
37 | encoder_inputs, decoder_inputs, cell,
38 | num_encoder_symbols=vocab_size,
39 | num_decoder_symbols=vocab_size,
40 | embedding_size=num_units,
41 | feed_previous=do_decode)
42 |
43 |     # Create the placeholders used for training; with multiple layers the weights are shared
44 | encoder_inputs = []
45 | decoder_inputs = []
46 | weights = []
47 | for i in xrange(sample_size):
48 | encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
49 | for i in xrange(sample_size):
50 | decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
51 | weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))
52 | targets = [decoder_inputs[i] for i in xrange(len(decoder_inputs))]
53 |
54 |     # Build the model and its loss computation
55 | buckets_outputs, losses = tf.nn.seq2seq.model_with_buckets(
56 | encoder_inputs, decoder_inputs, targets,
57 | weights, buckets,
58 | lambda x, y: seq2seq_f(x, y, False))
59 |
60 |
61 |     # Gradient update ops
62 | updates=[]
63 | for b in xrange(len(buckets)):
64 | updates.append(tf.train.AdamOptimizer(learning_rate).minimize(losses[b]))
65 |
66 |     # Saver for persisting the model
67 | saver = tf.train.Saver(tf.all_variables())
68 |
69 |     # Initialize variables
70 | session.run(tf.initialize_all_variables())
71 |
72 | while True:
73 | encoder_size = len(encoder_inputs)
74 | decoder_size = len(decoder_inputs)
75 |
76 |         # Build the feed_dict
77 | feed_dict = {}
78 | for i in xrange(encoder_size):
79 | feed_dict[encoder_inputs[i].name] = question_sample_list[i]
80 | for i in xrange(decoder_size):
81 | feed_dict[decoder_inputs[i].name] = answer_sample_list[i]
82 | feed_dict[weights[i].name] = init_weights_list[i]
83 |
84 |         # Build fetches, i.e. whatever you want back from the run: updates for the update op, losses for the loss value, buckets_outputs for the outputs
85 | fetches = [updates[bucket_id], losses[bucket_id]]
86 | fetches.append(buckets_outputs[bucket_id][0])
87 |         # This line is only for fetching the outputs; it is not needed for training
88 | for i in xrange(len(buckets_outputs[bucket_id])):
89 | fetches.append(buckets_outputs[bucket_id][i])
90 |
91 |         # What gets passed in is the data plus the computation to run; execution can be carried out on different back ends
92 | fetches_outputs = session.run(fetches, feed_dict)
93 | perplexity = fetches_outputs[1]
94 | outputs = fetches_outputs[2:]
95 | print ("perplexity =", perplexity)
96 | words = np.argmax(outputs, axis=2)
97 | word = "".join(map(chr, words[0])).replace('\x00', '').replace('\n', '')
98 | print("output: %s" % word)
99 |
--------------------------------------------------------------------------------
/baidu_search/baidu_search/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for baidu_search project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'baidu_search'
13 |
14 | SPIDER_MODULES = ['baidu_search.spiders']
15 | NEWSPIDER_MODULE = 'baidu_search.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'baidu_search (+http://www.yourdomain.com)'
20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
21 |
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = False
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | #CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | #DOWNLOAD_DELAY = 3
32 | DOWNLOAD_TIMEOUT = 5
33 | # The download delay setting will honor only one of:
34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
35 | #CONCURRENT_REQUESTS_PER_IP = 16
36 |
37 | # Disable cookies (enabled by default)
38 | #COOKIES_ENABLED = False
39 |
40 | # Disable Telnet Console (enabled by default)
41 | #TELNETCONSOLE_ENABLED = False
42 |
43 | # Override the default request headers:
44 | #DEFAULT_REQUEST_HEADERS = {
45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | # 'Accept-Language': 'en',
47 | #}
48 |
49 | # Enable or disable spider middlewares
50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'baidu_search.middlewares.MyCustomSpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'baidu_search.middlewares.MyCustomDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
69 | #ITEM_PIPELINES = {
70 | # 'baidu_search.pipelines.SomePipeline': 300,
71 | #}
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/seq2seq/tflearn_prj/07_lstm.py:
--------------------------------------------------------------------------------
1 | #Inspired by https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3%20-%20Neural%20Networks/recurrent_network.py
2 | import tensorflow as tf
3 |
4 | import numpy as np
5 | from tensorflow.examples.tutorials.mnist import input_data
6 |
7 | # configuration
8 | # O * W + b -> 10 labels for each image, O[? 28], W[28 10], B[10]
9 | # ^ (O: output 28 vec from 28 vec input)
10 | # |
11 | # +-+ +-+ +--+
12 | # |1|->|2|-> ... |28| time_step_size = 28
13 | # +-+ +-+ +--+
14 | # ^ ^ ... ^
15 | # | | |
16 | # img1:[28] [28] ... [28]
17 | # img2:[28] [28] ... [28]
18 | # img3:[28] [28] ... [28]
19 | # ...
20 | # img128 or img256 (batch_size or test_size 256)
21 | # each input size = input_vec_size=lstm_size=28
22 |
23 | # configuration variables
24 | input_vec_size = lstm_size = 28
25 | time_step_size = 28
26 |
27 | batch_size = 128
28 | test_size = 256
29 |
30 | def init_weights(shape):
31 | return tf.Variable(tf.random_normal(shape, stddev=0.01))
32 |
33 |
34 | def model(X, W, B, lstm_size):
35 | # X, input shape: (batch_size, time_step_size, input_vec_size)
36 | print "X=", X
37 | XT = tf.transpose(X, [1, 0, 2]) # permute time_step_size and batch_size
38 | print "XT=", XT
39 | # XT shape: (time_step_size, batch_size, input_vec_size)
40 | XR = tf.reshape(XT, [-1, lstm_size]) # each row has input for each lstm cell (lstm_size=input_vec_size)
41 | print "XR=", XR
42 | # XR shape: (time_step_size * batch_size, input_vec_size)
43 | X_split = tf.split(0, time_step_size, XR) # split them to time_step_size (28 arrays)
44 | print "X_split=", X_split
45 | # Each array shape: (batch_size, input_vec_size)
46 |
47 | # Make lstm with lstm_size (each input vector size)
48 | lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size, forget_bias=1.0, state_is_tuple=True)
49 |
50 | # Get lstm cell output, time_step_size (28) arrays with lstm_size output: (batch_size, lstm_size)
51 | outputs, _states = tf.nn.rnn(lstm, X_split, dtype=tf.float32)
52 |
53 | # Linear activation
54 | # Get the last output
55 |     return tf.matmul(outputs[-1], W) + B, lstm.state_size # State size to initialize the state
56 |
57 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
58 | trX, trY, teX, teY = mnist.train.images, mnist.train.labels, mnist.test.images, mnist.test.labels
59 | trX = trX.reshape(-1, 28, 28)
60 | teX = teX.reshape(-1, 28, 28)
61 |
62 | X = tf.placeholder("float", [None, 28, 28])
63 | Y = tf.placeholder("float", [None, 10])
64 |
65 | # get lstm_size and output 10 labels
66 | W = init_weights([lstm_size, 10])
67 | B = init_weights([10])
68 |
69 | py_x, state_size = model(X, W, B, lstm_size)
70 |
71 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(py_x, Y))
72 | train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
73 | predict_op = tf.argmax(py_x, 1)
74 |
75 | # Launch the graph in a session
76 | with tf.Session() as sess:
77 | # you need to initialize all variables
78 | tf.initialize_all_variables().run()
79 |
80 | for i in range(100):
81 | for start, end in zip(range(0, len(trX), batch_size), range(batch_size, len(trX)+1, batch_size)):
82 | sess.run(train_op, feed_dict={X: trX[start:end], Y: trY[start:end]})
83 |
84 | test_indices = np.arange(len(teX)) # Get A Test Batch
85 | np.random.shuffle(test_indices)
86 | test_indices = test_indices[0:test_size]
87 |
88 | print(i, np.mean(np.argmax(teY[test_indices], axis=1) ==
89 | sess.run(predict_op, feed_dict={X: teX[test_indices]})))
90 |
--------------------------------------------------------------------------------
/subtitle/subtitle_crawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for subtitle_crawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'subtitle'
13 |
14 | SPIDER_MODULES = ['subtitle_crawler.spiders']
15 | NEWSPIDER_MODULE = 'subtitle_crawler.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'subtitle_crawler (+http://www.yourdomain.com)'
20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
21 |
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = False
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | #CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | #DOWNLOAD_DELAY = 5
32 | DOWNLOAD_TIMEOUT = 60
33 | # The download delay setting will honor only one of:
34 | CONCURRENT_REQUESTS_PER_DOMAIN = 3
35 | CONCURRENT_REQUESTS_PER_IP = 1
36 |
37 | # Disable cookies (enabled by default)
38 | #COOKIES_ENABLED = False
39 |
40 | # Disable Telnet Console (enabled by default)
41 | #TELNETCONSOLE_ENABLED = False
42 |
43 | # Override the default request headers:
44 | #DEFAULT_REQUEST_HEADERS = {
45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | # 'Accept-Language': 'en',
47 | #}
48 |
49 | # Enable or disable spider middlewares
50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'subtitle_crawler.middlewares.MyCustomSpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'subtitle_crawler.middlewares.MyCustomDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
69 | ITEM_PIPELINES = {
70 | 'subtitle_crawler.pipelines.SubtitleCrawlerPipeline': 300,
71 | }
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
94 | LOG_LEVEL = 'INFO'
95 |
--------------------------------------------------------------------------------
/seq2seq/tflearn_prj/my_lstm_test.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 |
4 | import sys
5 | import numpy as np
6 | import tensorflow as tf
7 | from tensorflow.python.ops import seq2seq
8 | from tensorflow.python.ops import rnn_cell
9 | import tflearn
10 |
11 | np.set_printoptions(threshold=np.nan)
12 |
13 | class Primes:
14 | def __init__(self):
15 | self.primes = list()
16 | for i in range(2, 100):
17 | is_prime = True
18 | for j in range(2, i-1):
19 | if i % j == 0:
20 | is_prime = False
21 | if is_prime:
22 | self.primes.append(i)
23 | self.primes_count = len(self.primes)
24 | def get_sample(self, x_dim, y_dim, index):
25 | result = np.zeros((x_dim+y_dim))
26 | for i in range(index, index + x_dim + y_dim):
27 | result[i-index] = self.primes[i%self.primes_count]
28 | return result
29 |
30 |
31 | max_input_len = 10
32 | max_output_len = 10
33 | embedding_size = 20
34 | max_int = 100
35 | GO_VALUE = max_int + 1
36 | learning_rate = 0.01
37 |
38 | network = tflearn.input_data(shape=[None, max_input_len + max_output_len], dtype=tf.int32, name="XY")
39 | encoder_inputs = tf.slice(network, [0, 0], [-1, max_input_len], name="enc_in")
40 | encoder_inputs = tf.unpack(encoder_inputs, axis=1)
41 | decoder_inputs = tf.slice(network, [0, max_input_len], [-1, max_output_len], name="dec_in")
42 | decoder_inputs = tf.unpack(decoder_inputs, axis=1)
43 | go_input = tf.mul( tf.ones_like(decoder_inputs[0], dtype=tf.int32), GO_VALUE )
44 | decoder_inputs = [go_input] + decoder_inputs[: max_output_len-1]
45 | num_encoder_symbols = max_int + 1 # symbols start from 0
46 | num_decoder_symbols = max_int + 2 # including the GO symbol
47 | print encoder_inputs
48 | print decoder_inputs
49 |
50 | cell = rnn_cell.BasicLSTMCell(16, state_is_tuple=True)
51 |
52 | model_outputs, states = seq2seq.embedding_rnn_seq2seq(
53 | encoder_inputs,
54 | decoder_inputs,
55 | cell,
56 | num_encoder_symbols=num_encoder_symbols,
57 | num_decoder_symbols=num_decoder_symbols,
58 | embedding_size=embedding_size,
59 | feed_previous=False)
60 |
61 | network = tf.pack(model_outputs, axis=1)
62 |
63 |
64 |
65 | def sequence_loss(y_pred, y_true):
66 | logits = tf.unpack(y_pred, axis=1)
67 | targets = tf.unpack(y_true, axis=1)
68 | weights = [tf.ones_like(yp, dtype=tf.float32) for yp in targets]
69 | return seq2seq.sequence_loss(logits, targets, weights)
70 |
71 | def accuracy(y_pred, y_true, x_in):
72 | pred_idx = tf.to_int32(tf.argmax(y_pred, 2))
73 | return tf.reduce_mean(tf.cast(tf.equal(pred_idx, y_true), tf.float32), name='acc')
74 |
75 | targetY = tf.placeholder(shape=[None, max_output_len], dtype=tf.int32, name="Y")
76 |
77 | network = tflearn.regression(
78 | network,
79 | placeholder=targetY,
80 | optimizer='adam',
81 | learning_rate=learning_rate,
82 | loss=sequence_loss,
83 | metric=accuracy,
84 | name="Y")
85 |
86 | model = tflearn.DNN(network, tensorboard_verbose=0, checkpoint_path=None)
87 |
88 | primes = Primes()
89 | XY = [ primes.get_sample(10, 10, i)[0:20] for i in range(10) ]
90 | Y = [ primes.get_sample(10, 10, i)[10:20] for i in range(10) ]
91 | model.fit(
92 | XY,
93 | Y,
94 | n_epoch=10,
95 | validation_set=0.01,
96 | batch_size=1,
97 | shuffle=True,
98 | show_metric=True,
99 | snapshot_step=50,
100 | snapshot_epoch=False,
101 | run_id="my_lstm_test")
102 |
103 |
104 | TEST_XY = [XY[0]]
105 | TEST_XY[0][10:20]=0
106 | res = model.predict(TEST_XY)
107 | print TEST_XY
108 | res = np.array(res)
109 | print res.shape
110 | y = res.reshape(max_output_len, num_decoder_symbols)
111 | prediction = np.argmax(y, axis=1)
112 | print prediction
113 |
114 |
115 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/com/shareditor/chatbotv1/Indexer.java:
--------------------------------------------------------------------------------
1 | package com.shareditor.chatbotv1;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.nio.charset.Charset;
9 | import java.security.MessageDigest;
10 | import java.security.NoSuchAlgorithmException;
11 | import java.util.HashSet;
12 |
13 | import org.apache.lucene.analysis.Analyzer;
14 | import org.apache.lucene.document.Document;
15 | import org.apache.lucene.document.Field.Store;
16 | import org.apache.lucene.document.StoredField;
17 | import org.apache.lucene.document.TextField;
18 | import org.apache.lucene.index.IndexWriter;
19 | import org.apache.lucene.index.IndexWriterConfig;
20 | import org.apache.lucene.index.IndexWriterConfig.OpenMode;
21 | import org.apache.lucene.store.FSDirectory;
22 | import org.apache.lucene.util.Version;
23 | import org.wltea.analyzer.lucene.IKAnalyzer;
24 |
25 | public class Indexer
26 | {
27 |
28 | public static final Charset UTF8 = Charset.forName("utf8");
29 |
30 | public static String hexString(byte[] b) {
31 | String ret = "";
32 | for (int i = 0; i < b.length; i++) {
33 | String hex = String.format("%02x", b[i] & 0xff); // format the full byte, zero-padded, so each digest byte maps to two hex chars
34 | ret += hex.toUpperCase();
35 | }
36 | return ret;
37 | }
38 |
39 | public static void main( String[] args ) throws IOException, NoSuchAlgorithmException
40 | {
41 | if (args.length != 2) {
42 | System.err.println("Usage: " + Indexer.class.getSimpleName() + " corpus_path index_path");
43 | System.exit(-1);
44 | }
45 |
46 | String corpusPath = args[0];
47 | String indexPath = args[1];
48 |
49 | Analyzer analyzer = new IKAnalyzer(true);
50 | IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
51 | iwc.setOpenMode(OpenMode.CREATE);
52 | iwc.setUseCompoundFile(true);
53 | IndexWriter indexWriter = new IndexWriter(FSDirectory.open(new File(indexPath)), iwc);
54 |
55 | BufferedReader br = new BufferedReader(new InputStreamReader(
56 | new FileInputStream(corpusPath), "UTF-8"));
57 | String line = "";
58 | String last = "";
59 | long lineNum = 0;
60 | MessageDigest md = MessageDigest.getInstance("MD5");
61 | HashSet<String> mc = new HashSet<String>();
62 | int dupCount = 0;
63 | int totalCount = 0;
64 | long last_t = 0;
65 | while ((line = br.readLine()) != null) {
66 | totalCount++;
67 | if (totalCount % 15000000 == 0) {
68 | System.out.println("clear set");
69 | mc.clear();
70 | }
71 | line = line.trim();
72 |
73 | if (0 == line.length()) {
74 | continue;
75 | }
76 |
77 | if (!last.equals("")) {
78 | String pair = last + line;
79 |
80 | byte[] md5 = md.digest(pair.getBytes(UTF8));
81 | String md5_str = hexString(md5);
82 |
83 | if (mc.contains(md5_str)) {
84 | dupCount++;
85 | continue;
86 | } else {
87 | mc.add(md5_str);
88 | }
89 | Document doc = new Document();
90 | doc.add(new TextField("question", last, Store.YES));
91 | doc.add(new StoredField("answer", line));
92 | indexWriter.addDocument(doc);
93 | }
94 | last = line;
95 | lineNum++;
96 | if (lineNum % 100000 == 0) {
97 | long t = System.currentTimeMillis();
98 | System.out.println("elapse second: " + (t-last_t)/1000 + " add doc " + lineNum + " totalCount:" + totalCount + " dup:" + dupCount);
99 | last_t = t;
100 | }
101 | }
102 | br.close();
103 |
104 | indexWriter.forceMerge(1);
105 | indexWriter.close();
106 | }
107 | }
108 |
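109 | // Usage sketch (classpath and file names below are examples, not taken from the project):
110 | //   java -cp <your-classpath> com.shareditor.chatbotv1.Indexer corpus.txt ./index
111 | // The corpus is read line by line; each non-empty line is indexed as the "answer" to the
112 | // preceding non-empty line (the "question"), with MD5-based de-duplication of repeated pairs.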
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 |
25 | *
26 | */
27 | package org.wltea.analyzer.lucene;
28 |
29 | import java.io.IOException;
30 | import java.io.Reader;
31 |
32 | import org.apache.lucene.analysis.Tokenizer;
33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
36 |
37 | import org.wltea.analyzer.core.IKSegmenter;
38 | import org.wltea.analyzer.core.Lexeme;
39 |
40 | /**
41 | * IK分词器 Lucene Tokenizer适配器类
42 | * 兼容Lucene 4.0版本
43 | */
44 | public final class IKTokenizer extends Tokenizer {
45 |
46 | //IK分词器实现
47 | private IKSegmenter _IKImplement;
48 |
49 | //词元文本属性
50 | private final CharTermAttribute termAtt;
51 | //词元位移属性
52 | private final OffsetAttribute offsetAtt;
53 | //词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
54 | private final TypeAttribute typeAtt;
55 | //记录最后一个词元的结束位置
56 | private int endPosition;
57 |
58 | /**
59 | * Lucene 4.0 Tokenizer适配器类构造函数
60 | * @param in
61 | * @param useSmart
62 | */
63 | public IKTokenizer(Reader in , boolean useSmart){
64 | super(in);
65 | offsetAtt = addAttribute(OffsetAttribute.class);
66 | termAtt = addAttribute(CharTermAttribute.class);
67 | typeAtt = addAttribute(TypeAttribute.class);
68 | _IKImplement = new IKSegmenter(input , useSmart);
69 | }
70 |
71 | /* (non-Javadoc)
72 | * @see org.apache.lucene.analysis.TokenStream#incrementToken()
73 | */
74 | @Override
75 | public boolean incrementToken() throws IOException {
76 | //清除所有的词元属性
77 | clearAttributes();
78 | Lexeme nextLexeme = _IKImplement.next();
79 | if(nextLexeme != null){
80 | //将Lexeme转成Attributes
81 | //设置词元文本
82 | termAtt.append(nextLexeme.getLexemeText());
83 | //设置词元长度
84 | termAtt.setLength(nextLexeme.getLength());
85 | //设置词元位移
86 | offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
87 | //记录分词的最后位置
88 | endPosition = nextLexeme.getEndPosition();
89 | //记录词元分类
90 | typeAtt.setType(nextLexeme.getLexemeTypeString());
91 | //返会true告知还有下个词元
92 | return true;
93 | }
94 | //返会false告知词元输出完毕
95 | return false;
96 | }
97 |
98 | /*
99 | * (non-Javadoc)
100 | * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
101 | */
102 | @Override
103 | public void reset() throws IOException {
104 | super.reset();
105 | _IKImplement.reset(input);
106 | }
107 |
108 | @Override
109 | public final void end() {
110 | // set final offset
111 | int finalOffset = correctOffset(this.endPosition);
112 | offsetAtt.setOffset(finalOffset, finalOffset);
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/pattern_recognition.lua:
--------------------------------------------------------------------------------
1 | require 'nn'
2 | require 'paths'
3 | if (not paths.filep("cifar10torchsmall.zip")) then
4 | os.execute('wget -c https://s3.amazonaws.com/torch7/data/cifar10torchsmall.zip')
5 | os.execute('unzip cifar10torchsmall.zip')
6 | end
7 | trainset = torch.load('cifar10-train.t7')
8 | testset = torch.load('cifar10-test.t7')
9 | classes = {'airplane', 'automobile', 'bird', 'cat',
10 | 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'}
11 | setmetatable(trainset,
12 | {__index = function(t, i)
13 | return {t.data[i], t.label[i]}
14 | end}
15 | );
16 | trainset.data = trainset.data:double() -- convert the data from a ByteTensor to a DoubleTensor.
17 |
18 | function trainset:size()
19 | return self.data:size(1)
20 | end
21 | mean = {} -- store the mean, to normalize the test set in the future
22 | stdv = {} -- store the standard-deviation for the future
23 | for i=1,3 do -- over each image channel
24 | mean[i] = trainset.data[{ {}, {i}, {}, {} }]:mean() -- mean estimation
25 | print('Channel ' .. i .. ', Mean: ' .. mean[i])
26 | trainset.data[{ {}, {i}, {}, {} }]:add(-mean[i]) -- mean subtraction
27 |
28 | stdv[i] = trainset.data[{ {}, {i}, {}, {} }]:std() -- std estimation
29 | print('Channel ' .. i .. ', Standard Deviation: ' .. stdv[i])
30 | trainset.data[{ {}, {i}, {}, {} }]:div(stdv[i]) -- std scaling
31 | end
32 | net = nn.Sequential()
33 | net:add(nn.SpatialConvolution(3, 6, 5, 5)) -- 3 input image channels, 6 output channels, 5x5 convolution kernel
34 | net:add(nn.ReLU()) -- non-linearity
35 | net:add(nn.SpatialMaxPooling(2,2,2,2)) -- A max-pooling operation that looks at 2x2 windows and finds the max.
36 | net:add(nn.SpatialConvolution(6, 16, 5, 5))
37 | net:add(nn.ReLU()) -- non-linearity
38 | net:add(nn.SpatialMaxPooling(2,2,2,2))
39 | net:add(nn.View(16*5*5)) -- reshapes from a 3D tensor of 16x5x5 into 1D tensor of 16*5*5
40 | net:add(nn.Linear(16*5*5, 120)) -- fully connected layer (matrix multiplication between input and weights)
41 | net:add(nn.ReLU()) -- non-linearity
42 | net:add(nn.Linear(120, 84))
43 | net:add(nn.ReLU()) -- non-linearity
44 | net:add(nn.Linear(84, 10)) -- 10 is the number of outputs of the network (in this case, 10 digits)
45 | net:add(nn.LogSoftMax()) -- converts the output to a log-probability. Useful for classification problems
46 | criterion = nn.ClassNLLCriterion()
47 | trainer = nn.StochasticGradient(net, criterion)
48 | trainer.learningRate = 0.001
49 | trainer.maxIteration = 5
50 | trainer:train(trainset)
51 | testset.data = testset.data:double() -- convert from Byte tensor to Double tensor
52 | for i=1,3 do -- over each image channel
53 | testset.data[{ {}, {i}, {}, {} }]:add(-mean[i]) -- mean subtraction
54 | testset.data[{ {}, {i}, {}, {} }]:div(stdv[i]) -- std scaling
55 | end
56 | predicted = net:forward(testset.data[100])
57 | print(classes[testset.label[100]])
58 | print(predicted:exp())
59 | for i=1,predicted:size(1) do
60 | print(classes[i], predicted[i])
61 | end
62 | correct = 0
63 | for i=1,10000 do
64 | local groundtruth = testset.label[i]
65 | local prediction = net:forward(testset.data[i])
66 | local confidences, indices = torch.sort(prediction, true) -- true means sort in descending order
67 | if groundtruth == indices[1] then
68 | correct = correct + 1
69 | end
70 | end
71 |
72 | print(correct, 100*correct/10000 .. ' % ')
73 | class_performance = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
74 | for i=1,10000 do
75 | local groundtruth = testset.label[i]
76 | local prediction = net:forward(testset.data[i])
77 | local confidences, indices = torch.sort(prediction, true) -- true means sort in descending order
78 | if groundtruth == indices[1] then
79 | class_performance[groundtruth] = class_performance[groundtruth] + 1
80 | end
81 | end
82 |
83 | for i=1,#classes do
84 | print(classes[i], 100*class_performance[i]/1000 .. ' %')
85 | end
86 |
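87 | -- Usage note (assuming Torch7 with the 'nn' and 'paths' packages installed):
88 | --   th pattern_recognition.lua
89 | -- On first run it downloads the small CIFAR-10 dataset, trains the LeNet-style network above
90 | -- for 5 epochs, and prints overall and per-class accuracy on the 10000 test images.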
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java:
--------------------------------------------------------------------------------
1 |
2 | /**
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.core;
27 |
28 | import java.util.LinkedList;
29 | import java.util.List;
30 |
31 | import org.wltea.analyzer.dic.Dictionary;
32 | import org.wltea.analyzer.dic.Hit;
33 |
34 |
35 | /**
36 | * 中文-日韩文子分词器
37 | */
38 | class CJKSegmenter implements ISegmenter {
39 |
40 | //子分词器标签
41 | static final String SEGMENTER_NAME = "CJK_SEGMENTER";
42 | //待处理的分词hit队列
43 | private List tmpHits;
44 |
45 |
46 | CJKSegmenter(){
47 | this.tmpHits = new LinkedList();
48 | }
49 |
50 | /* (non-Javadoc)
51 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
52 | */
53 | public void analyze(AnalyzeContext context) {
54 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
55 |
56 | //优先处理tmpHits中的hit
57 | if(!this.tmpHits.isEmpty()){
58 | //处理词段队列
59 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
60 | for(Hit hit : tmpArray){
61 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
62 | if(hit.isMatch()){
63 | //输出当前的词
64 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
65 | context.addLexeme(newLexeme);
66 |
67 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
68 | this.tmpHits.remove(hit);
69 | }
70 |
71 | }else if(hit.isUnmatch()){
72 | //hit不是词,移除
73 | this.tmpHits.remove(hit);
74 | }
75 | }
76 | }
77 |
78 | //*********************************
79 | //再对当前指针位置的字符进行单字匹配
80 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
81 | if(singleCharHit.isMatch()){//首字成词
82 | //输出当前的词
83 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
84 | context.addLexeme(newLexeme);
85 |
86 | //同时也是词前缀
87 | if(singleCharHit.isPrefix()){
88 | //前缀匹配则放入hit列表
89 | this.tmpHits.add(singleCharHit);
90 | }
91 | }else if(singleCharHit.isPrefix()){//首字为词前缀
92 | //前缀匹配则放入hit列表
93 | this.tmpHits.add(singleCharHit);
94 | }
95 |
96 |
97 | }else{
98 | //遇到CHAR_USELESS字符
99 | //清空队列
100 | this.tmpHits.clear();
101 | }
102 |
103 | //判断缓冲区是否已经读完
104 | if(context.isBufferConsumed()){
105 | //清空队列
106 | this.tmpHits.clear();
107 | }
108 |
109 | //判断是否锁定缓冲区
110 | if(this.tmpHits.size() == 0){
111 | context.unlockBuffer(SEGMENTER_NAME);
112 |
113 | }else{
114 | context.lockBuffer(SEGMENTER_NAME);
115 | }
116 | }
117 |
118 | /* (non-Javadoc)
119 | * @see org.wltea.analyzer.core.ISegmenter#reset()
120 | */
121 | public void reset() {
122 | //清空队列
123 | this.tmpHits.clear();
124 | }
125 |
126 | }
127 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ChatBotCourse
2 | ==============
3 | _For more of my original articles, follow my WeChat subscription account_
4 |
5 |
6 |
7 | _You are also welcome to check out my other two GitHub projects_
8 | * [_Becoming a Full-Stack Engineer_](https://github.com/warmheartli/FullStackDeveloperCourse)
9 | * [_A Concise Introduction to Machine Learning_](https://github.com/warmheartli/MachineLearningCourse)
10 |
11 | Build Your Own Chatbot Tutorial Series
12 | ==============
13 | * [Build Your Own Chatbot, Part 1: The Knowledge Involved](http://www.shareditor.com/blogshow/?blogId=63)(2016-06-09)
14 | * [Build Your Own Chatbot, Part 2: A First Look at the NLTK Library](http://www.shareditor.com/blogshow/?blogId=64)(2016-06-10)
15 | * [Build Your Own Chatbot, Part 3: Corpora and Lexical Resources](http://www.shareditor.com/blogshow/?blogId=65)(2016-06-12)
16 | * [Build Your Own Chatbot, Part 4: Why Do It by Hand? Fully Automated POS Tagging of a Corpus](http://www.shareditor.com/blogshow/?blogId=67)(2016-06-17)
17 | * [Build Your Own Chatbot, Part 5: Text Classification in Natural Language Processing](http://www.shareditor.com/blogshow/?blogId=69)(2016-06-21)
18 | * [Build Your Own Chatbot, Part 6: How to Extract Ten Sentences' Worth of Information from One Sentence](http://www.shareditor.com/blogshow/?blogId=70)(2016-06-22)
19 | * [Build Your Own Chatbot, Part 7: Feature-Based Grammar Analysis Is the Way to Go](http://www.shareditor.com/blogshow/?blogId=71)(2016-06-23)
20 | * [Build Your Own Chatbot, Part 8: Revisiting Natural Language Processing](http://www.shareditor.com/blogshow/?blogId=72)(2016-06-24)
21 | * [Build Your Own Chatbot, Part 9: How a Chatbot Should Be Built](http://www.shareditor.com/blogshow/?blogId=73)(2016-06-25)
22 | * [Build Your Own Chatbot, Part 10: POS Tagging and Keyword Extraction in Half an Hour](http://www.shareditor.com/blogshow/?blogId=74)(2016-06-28)
23 | * [Build Your Own Chatbot, Part 11: Storing a Massive Corpus in Zero Bytes](http://www.shareditor.com/blogshow/?blogId=76)(2016-07-01)
24 | * [Build Your Own Chatbot, Part 12: Dependency Syntax and Semantic Dependency Parsing with a Powerful Chinese Language Technology Platform](http://www.shareditor.com/blogshow/?blogId=77)(2016-07-04)
25 | * [Build Your Own Chatbot, Part 13: Getting to the Bottom of Language Models](http://www.shareditor.com/blogshow/?blogId=78)(2016-07-05)
26 | * [Build Your Own Chatbot, Part 14: Exploring the Art of Chinese Word Segmentation](http://www.shareditor.com/blogshow/?blogId=80)(2016-07-06)
27 | * [Build Your Own Chatbot, Part 15: Understanding Probabilistic Graphical Models (of Turing Award and Nobel Prize Fame) in One Article](http://www.shareditor.com/blogshow/?blogId=81)(2016-07-09)
28 | * [Build Your Own Chatbot, Part 16: Easy Pickings in Natural Language Processing](http://www.shareditor.com/blogshow/?blogId=82)(2016-07-09)
29 | * [Build Your Own Chatbot, Part 17: Concrete Methods for Automatic POS Tagging by Machine](http://www.shareditor.com/blogshow/?blogId=86)(2016-07-15)
30 | * [Build Your Own Chatbot, Part 18: Magical Algorithms: Generating Syntactic Parse Trees](http://www.shareditor.com/blogshow/?blogId=87)(2016-07-19)
31 | * [Build Your Own Chatbot, Part 19: How the Bot Understands "Let's Talk About It Later"](http://www.shareditor.com/blogshow/?blogId=88)(2016-07-21)
32 | * [Build Your Own Chatbot, Part 20: Basic Methods of Semantic Role Labeling](http://www.shareditor.com/blogshow/?blogId=89)(2016-07-22)
33 | * [Build Your Own Chatbot, Part 21: What Is Latent Semantic Indexing, and Why Is It Better than TF-IDF?](http://www.shareditor.com/blogshow/?blogId=90)(2016-07-26)
34 | * [Build Your Own Chatbot, Part 22: Magical Algorithms: Artificial Neural Networks](http://www.shareditor.com/blogshow/?blogId=92)(2016-08-01)
35 | * [Build Your Own Chatbot, Part 23: Deep Learning with CNNs](http://www.shareditor.com/blogshow/?blogId=97)(2016-08-12)
36 | * [Build Your Own Chatbot, Part 24: Applying Deep Learning to NLP](http://www.shareditor.com/blogshow/?blogId=99)(2016-08-18)
37 | * [Build Your Own Chatbot, Part 25: How Google's Text-Mining Deep Learning Tool word2vec Works](http://www.shareditor.com/blogshow/?blogId=100)(2016-08-20)
38 | * [Build Your Own Chatbot, Part 26: Recurrent Neural Networks (RNN), Illustrated](http://www.shareditor.com/blogshow/?blogId=103)(2016-08-25)
39 | * [Build Your Own Chatbot, Part 27: General Approaches to Automatic Question Answering with Deep Learning](http://www.shareditor.com/blogshow/?blogId=104)(2016-08-26)
40 | * [Build Your Own Chatbot, Part 28: A Wild Idea: Building a Chat Corpus from American TV Subtitles](http://www.shareditor.com/blogshow/?blogId=105)(2016-08-30)
41 | * [Build Your Own Chatbot, Part 29: Big Release: Nearly 1 GB of 30 Million Lines of Chat Corpus](http://www.shareditor.com/blogshow/?blogId=112)(2016-09-18)
42 | * [Build Your Own Chatbot, Part 30: The First Chatbot Is Born: Xiao'ertu, Raised on Subtitles](http://www.shareditor.com/blogshow/?blogId=113)(2016-09-26)
43 | * [Build Your Own Chatbot, Part 31: Directing Website Traffic to the Xiao'ertu Bot](http://www.shareditor.com/blogshow/?blogId=114)(2016-09-30)
44 | * [Build Your Own Chatbot, Part 32: Generating Word Vectors from a 30-Million-Line Film and TV Subtitle Corpus](http://www.shareditor.com/blogshow/?blogId=115)(2016-10-10)
45 | * [Build Your Own Chatbot, Part 33: LSTM-RNN Explained with Two Code Bases: Neural Networks with Memory](http://www.shareditor.com/blogshow/?blogId=116)(2016-10-13)
46 | * [Build Your Own Chatbot, Part 34: Torch, the Fastest Deep Learning Framework](http://www.shareditor.com/blogshow/?blogId=117)(2016-10-28)
47 | * [Build Your Own Chatbot, Part 35: One LSTM Cell Teaches the Chatbot to Speak in Zhen Huan Style](http://www.shareditor.com/blogshow/?blogId=118)(2016-11-23)
48 | * [Build Your Own Chatbot, Part 36: Understanding TensorFlow Sessions and Graphs in Depth](http://www.shareditor.com/blogshow/?blogId=119)(2016-12-01)
49 | * [Build Your Own Chatbot, Part 37: How Linear Regression Works in TensorFlow, in One Diagram](http://www.shareditor.com/blogshow/?blogId=120)(2016-12-08)
50 | * [Build Your Own Chatbot, Part 38: So That's How a Chatbot Is Made](http://www.shareditor.com/blogshow/?blogId=121)(2017-01-10)
51 | * [Build Your Own Chatbot, Part 39: Full of Passion: Building a Home GPU Cloud Server to Share with AI and Big Data Enthusiasts](http://www.shareditor.com/blogshow/?blogId=122)(2017-01-16)
52 | * [Build Your Own Chatbot, Part 40: Video Tutorials: Opening Statement and Knowledge Overview](http://www.shareditor.com/blogshow/?blogId=124)(2017-03-05)
53 | * [Build Your Own Chatbot, Part 41: Video Tutorials: Environment Setup and Python Basics](http://www.shareditor.com/blogshow/?blogId=125)(2017-03-31)
54 |
--------------------------------------------------------------------------------
/lstm_code/iamtrask/lstm.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import copy, numpy as np
3 | np.random.seed(0)
4 | # compute sigmoid nonlinearity
5 | def sigmoid(x):
6 | output = 1/(1+np.exp(-x))
7 | return output
8 |
9 | # convert output of sigmoid function to its derivative
10 | def sigmoid_output_to_derivative(output):
11 | return output*(1-output)
12 |
13 |
14 | # training dataset generation
15 | int2binary = {}
16 | binary_dim = 8
17 |
18 | largest_number = pow(2,binary_dim)
19 | binary = np.unpackbits(
20 | np.array([range(largest_number)],dtype=np.uint8).T,axis=1)
21 | for i in range(largest_number):
22 | int2binary[i] = binary[i]
23 |
24 |
25 | # input variables
26 | alpha = 0.1
27 | input_dim = 2
28 | hidden_dim = 16
29 | output_dim = 1
30 |
31 |
32 | # initialize neural network weights
33 | synapse_0 = 2*np.random.random((input_dim,hidden_dim)) - 1
34 | synapse_1 = 2*np.random.random((hidden_dim,output_dim)) - 1
35 | synapse_h = 2*np.random.random((hidden_dim,hidden_dim)) - 1
36 |
37 | synapse_0_update = np.zeros_like(synapse_0)
38 | synapse_1_update = np.zeros_like(synapse_1)
39 | synapse_h_update = np.zeros_like(synapse_h)
40 |
41 | # training logic
42 | for j in range(10000):
43 |
44 | # generate a simple addition problem (a + b = c)
45 | a_int = np.random.randint(largest_number/2) # int version
46 | a = int2binary[a_int] # binary encoding
47 |
48 | b_int = np.random.randint(largest_number/2) # int version
49 | b = int2binary[b_int] # binary encoding
50 |
51 | # true answer
52 | c_int = a_int + b_int
53 | c = int2binary[c_int]
54 |
55 | # where we'll store our best guess (binary encoded)
56 | d = np.zeros_like(c)
57 |
58 | overallError = 0
59 |
60 | layer_2_deltas = list()
61 | layer_1_values = list()
62 | layer_1_values.append(np.zeros(hidden_dim))
63 |
64 | # moving along the positions in the binary encoding
65 | for position in range(binary_dim):
66 |
67 | # generate input and output
68 | X = np.array([[a[binary_dim - position - 1],b[binary_dim - position - 1]]])
69 | y = np.array([[c[binary_dim - position - 1]]]).T
70 |
71 | # hidden layer (input ~+ prev_hidden)
72 | layer_1 = sigmoid(np.dot(X,synapse_0) + np.dot(layer_1_values[-1],synapse_h))
73 |
74 | # output layer (new binary representation)
75 | layer_2 = sigmoid(np.dot(layer_1,synapse_1))
76 |
77 | # did we miss?... if so by how much?
78 | layer_2_error = y - layer_2
79 | layer_2_deltas.append((layer_2_error)*sigmoid_output_to_derivative(layer_2))
80 | overallError += np.abs(layer_2_error[0])
81 |
82 | # decode estimate so we can print it out
83 | d[binary_dim - position - 1] = np.round(layer_2[0][0])
84 |
85 | # store hidden layer so we can use it in the next timestep
86 | layer_1_values.append(copy.deepcopy(layer_1))
87 |
88 | future_layer_1_delta = np.zeros(hidden_dim)
89 |
90 | for position in range(binary_dim):
91 |
92 | X = np.array([[a[position],b[position]]])
93 | layer_1 = layer_1_values[-position-1]
94 | prev_layer_1 = layer_1_values[-position-2]
95 |
96 | # error at output layer
97 | layer_2_delta = layer_2_deltas[-position-1]
98 | # error at hidden layer
99 | layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) + \
100 | layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1)
101 | # let's update all our weights so we can try again
102 | synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta)
103 | synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta)
104 | synapse_0_update += X.T.dot(layer_1_delta)
105 |
106 | future_layer_1_delta = layer_1_delta
107 |
108 |
109 | synapse_0 += synapse_0_update * alpha
110 | synapse_1 += synapse_1_update * alpha
111 | synapse_h += synapse_h_update * alpha
112 |
113 | synapse_0_update *= 0
114 | synapse_1_update *= 0
115 | synapse_h_update *= 0
116 |
117 | # print out progress
118 | if(j % 1000 == 0):
119 | print "Error:" + str(overallError)
120 | print "Pred:" + str(d)
121 | print "True:" + str(c)
122 | out = 0
123 | for index,x in enumerate(reversed(d)):
124 | out += x*pow(2,index)
125 | print str(a_int) + " + " + str(b_int) + " = " + str(out)
126 | print "------------"
127 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/core/IKArbitrator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.Stack;
28 | import java.util.TreeSet;
29 |
30 | /**
31 | * IK分词歧义裁决器
32 | */
33 | class IKArbitrator {
34 |
35 | IKArbitrator(){
36 |
37 | }
38 |
39 | /**
40 | * 分词歧义处理
41 | * @param orgLexemes
42 | * @param useSmart
43 | */
44 | void process(AnalyzeContext context , boolean useSmart){
45 | QuickSortSet orgLexemes = context.getOrgLexemes();
46 | Lexeme orgLexeme = orgLexemes.pollFirst();
47 |
48 | LexemePath crossPath = new LexemePath();
49 | while(orgLexeme != null){
50 | if(!crossPath.addCrossLexeme(orgLexeme)){
51 | //找到与crossPath不相交的下一个crossPath
52 | if(crossPath.size() == 1 || !useSmart){
53 | //crossPath没有歧义 或者 不做歧义处理
54 | //直接输出当前crossPath
55 | context.addLexemePath(crossPath);
56 | }else{
57 | //对当前的crossPath进行歧义处理
58 | QuickSortSet.Cell headCell = crossPath.getHead();
59 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
60 | //输出歧义处理结果judgeResult
61 | context.addLexemePath(judgeResult);
62 | }
63 |
64 | //把orgLexeme加入新的crossPath中
65 | crossPath = new LexemePath();
66 | crossPath.addCrossLexeme(orgLexeme);
67 | }
68 | orgLexeme = orgLexemes.pollFirst();
69 | }
70 |
71 |
72 | //处理最后的path
73 | if(crossPath.size() == 1 || !useSmart){
74 | //crossPath没有歧义 或者 不做歧义处理
75 | //直接输出当前crossPath
76 | context.addLexemePath(crossPath);
77 | }else{
78 | //对当前的crossPath进行歧义处理
79 | QuickSortSet.Cell headCell = crossPath.getHead();
80 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
81 | //输出歧义处理结果judgeResult
82 | context.addLexemePath(judgeResult);
83 | }
84 | }
85 |
86 | /**
87 | * 歧义识别
88 | * @param lexemeCell 歧义路径链表头
89 | * @param fullTextLength 歧义路径文本长度
90 | * @param option 候选结果路径
91 | * @return
92 | */
93 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
94 | //候选路径集合
95 | TreeSet pathOptions = new TreeSet();
96 | //候选结果路径
97 | LexemePath option = new LexemePath();
98 |
99 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
100 | Stack lexemeStack = this.forwardPath(lexemeCell , option);
101 |
102 | //当前词元链并非最理想的,加入候选路径集合
103 | pathOptions.add(option.copy());
104 |
105 | //存在歧义词,处理
106 | QuickSortSet.Cell c = null;
107 | while(!lexemeStack.isEmpty()){
108 | c = lexemeStack.pop();
109 | //回滚词元链
110 | this.backPath(c.getLexeme() , option);
111 | //从歧义词位置开始,递归,生成可选方案
112 | this.forwardPath(c , option);
113 | pathOptions.add(option.copy());
114 | }
115 |
116 | //返回集合中的最优方案
117 | return pathOptions.first();
118 |
119 | }
120 |
121 | /**
122 | * 向前遍历,添加词元,构造一个无歧义词元组合
123 | * @param LexemePath path
124 | * @return
125 | */
126 | private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
127 | //发生冲突的Lexeme栈
128 | Stack conflictStack = new Stack();
129 | QuickSortSet.Cell c = lexemeCell;
130 | //迭代遍历Lexeme链表
131 | while(c != null && c.getLexeme() != null){
132 | if(!option.addNotCrossLexeme(c.getLexeme())){
133 | //词元交叉,添加失败则加入lexemeStack栈
134 | conflictStack.push(c);
135 | }
136 | c = c.getNext();
137 | }
138 | return conflictStack;
139 | }
140 |
141 | /**
142 | * 回滚词元链,直到它能够接受指定的词元
143 | * @param lexeme
144 | * @param l
145 | */
146 | private void backPath(Lexeme l , LexemePath option){
147 | while(option.checkCross(l)){
148 | option.removeTail();
149 | }
150 |
151 | }
152 |
153 | }
154 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.cfg;
27 |
28 | import java.io.IOException;
29 | import java.io.InputStream;
30 | import java.util.ArrayList;
31 | import java.util.InvalidPropertiesFormatException;
32 | import java.util.List;
33 | import java.util.Properties;
34 |
35 | /**
36 | * Configuration 默认实现
37 | * 2012-5-8
38 | *
39 | */
40 | public class DefaultConfig implements Configuration{
41 |
42 | /*
43 | * 分词器默认字典路径
44 | */
45 | private static final String PATH_DIC_MAIN = "main2012.dic";
46 | private static final String PATH_DIC_QUANTIFIER = "quantifier.dic";
47 |
48 | /*
49 | * 分词器配置文件路径
50 | */
51 | private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
52 | //配置属性——扩展字典
53 | private static final String EXT_DICT = "ext_dict";
54 | //配置属性——扩展停止词典
55 | private static final String EXT_STOP = "ext_stopwords";
56 |
57 | private Properties props;
58 | /*
59 | * 是否使用smart方式分词
60 | */
61 | private boolean useSmart;
62 |
63 | /**
64 | * 返回单例
65 | * @return Configuration单例
66 | */
67 | public static Configuration getInstance(){
68 | return new DefaultConfig();
69 | }
70 |
71 | /*
72 | * 初始化配置文件
73 | */
74 | private DefaultConfig(){
75 | props = new Properties();
76 |
77 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME);
78 | if(input != null){
79 | try {
80 | props.loadFromXML(input);
81 | } catch (InvalidPropertiesFormatException e) {
82 | e.printStackTrace();
83 | } catch (IOException e) {
84 | e.printStackTrace();
85 | }
86 | }
87 | }
88 |
89 |
90 | /**
91 | * 返回useSmart标志位
92 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
93 | * @return useSmart
94 | */
95 | public boolean useSmart() {
96 | return useSmart;
97 | }
98 |
99 | /**
100 | * 设置useSmart标志位
101 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
102 | * @param useSmart
103 | */
104 | public void setUseSmart(boolean useSmart) {
105 | this.useSmart = useSmart;
106 | }
107 |
108 | /**
109 | * 获取主词典路径
110 | *
111 | * @return String 主词典路径
112 | */
113 | public String getMainDictionary(){
114 | return PATH_DIC_MAIN;
115 | }
116 |
117 | /**
118 | * 获取量词词典路径
119 | * @return String 量词词典路径
120 | */
121 | public String getQuantifierDicionary(){
122 | return PATH_DIC_QUANTIFIER;
123 | }
124 |
125 | /**
126 | * 获取扩展字典配置路径
127 | * @return List 相对类加载器的路径
128 | */
129 | public List getExtDictionarys(){
130 | List extDictFiles = new ArrayList(2);
131 | String extDictCfg = props.getProperty(EXT_DICT);
132 | if(extDictCfg != null){
133 | //使用;分割多个扩展字典配置
134 | String[] filePaths = extDictCfg.split(";");
135 | if(filePaths != null){
136 | for(String filePath : filePaths){
137 | if(filePath != null && !"".equals(filePath.trim())){
138 | extDictFiles.add(filePath.trim());
139 | }
140 | }
141 | }
142 | }
143 | return extDictFiles;
144 | }
145 |
146 |
147 | /**
148 | * 获取扩展停止词典配置路径
149 | * @return List 相对类加载器的路径
150 | */
151 | public List getExtStopWordDictionarys(){
152 | List extStopWordDictFiles = new ArrayList(2);
153 | String extStopWordDictCfg = props.getProperty(EXT_STOP);
154 | if(extStopWordDictCfg != null){
155 | //使用;分割多个扩展字典配置
156 | String[] filePaths = extStopWordDictCfg.split(";");
157 | if(filePaths != null){
158 | for(String filePath : filePaths){
159 | if(filePath != null && !"".equals(filePath.trim())){
160 | extStopWordDictFiles.add(filePath.trim());
161 | }
162 | }
163 | }
164 | }
165 | return extStopWordDictFiles;
166 | }
167 |
168 |
169 | }
170 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/core/IKSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | */
24 | package org.wltea.analyzer.core;
25 |
26 | import java.io.IOException;
27 | import java.io.Reader;
28 | import java.util.ArrayList;
29 | import java.util.List;
30 |
31 | import org.wltea.analyzer.cfg.Configuration;
32 | import org.wltea.analyzer.cfg.DefaultConfig;
33 | import org.wltea.analyzer.dic.Dictionary;
34 |
35 | /**
36 | * IK分词器主类
37 | *
38 | */
39 | public final class IKSegmenter {
40 |
41 | //字符窜reader
42 | private Reader input;
43 | //分词器配置项
44 | private Configuration cfg;
45 | //分词器上下文
46 | private AnalyzeContext context;
47 | //分词处理器列表
48 | private List segmenters;
49 | //分词歧义裁决器
50 | private IKArbitrator arbitrator;
51 |
52 |
53 | /**
54 | * IK分词器构造函数
55 | * @param input
56 | * @param useSmart 为true,使用智能分词策略
57 | *
58 | * 非智能分词:细粒度输出所有可能的切分结果
59 | * 智能分词: 合并数词和量词,对分词结果进行歧义判断
60 | */
61 | public IKSegmenter(Reader input , boolean useSmart){
62 | this.input = input;
63 | this.cfg = DefaultConfig.getInstance();
64 | this.cfg.setUseSmart(useSmart);
65 | this.init();
66 | }
67 |
68 | /**
69 | * IK分词器构造函数
70 | * @param input
71 | * @param cfg 使用自定义的Configuration构造分词器
72 | *
73 | */
74 | public IKSegmenter(Reader input , Configuration cfg){
75 | this.input = input;
76 | this.cfg = cfg;
77 | this.init();
78 | }
79 |
80 | /**
81 | * 初始化
82 | */
83 | private void init(){
84 | //初始化词典单例
85 | Dictionary.initial(this.cfg);
86 | //初始化分词上下文
87 | this.context = new AnalyzeContext(this.cfg);
88 | //加载子分词器
89 | this.segmenters = this.loadSegmenters();
90 | //加载歧义裁决器
91 | this.arbitrator = new IKArbitrator();
92 | }
93 |
94 | /**
95 | * 初始化词典,加载子分词器实现
96 | * @return List
97 | */
98 | private List loadSegmenters(){
99 | List segmenters = new ArrayList(4);
100 | //处理字母的子分词器
101 | segmenters.add(new LetterSegmenter());
102 | //处理中文数量词的子分词器
103 | segmenters.add(new CN_QuantifierSegmenter());
104 | //处理中文词的子分词器
105 | segmenters.add(new CJKSegmenter());
106 | return segmenters;
107 | }
108 |
109 | /**
110 | * 分词,获取下一个词元
111 | * @return Lexeme 词元对象
112 | * @throws IOException
113 | */
114 | public synchronized Lexeme next()throws IOException{
115 | Lexeme l = null;
116 | while((l = context.getNextLexeme()) == null ){
117 | /*
118 | * 从reader中读取数据,填充buffer
119 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理
120 | * 移位处理上次读入的但未处理的数据
121 | */
122 | int available = context.fillBuffer(this.input);
123 | if(available <= 0){
124 | //reader已经读完
125 | context.reset();
126 | return null;
127 |
128 | }else{
129 | //初始化指针
130 | context.initCursor();
131 | do{
132 | //遍历子分词器
133 | for(ISegmenter segmenter : segmenters){
134 | segmenter.analyze(context);
135 | }
136 | //字符缓冲区接近读完,需要读入新的字符
137 | if(context.needRefillBuffer()){
138 | break;
139 | }
140 | //向前移动指针
141 | }while(context.moveCursor());
142 | //重置子分词器,为下轮循环进行初始化
143 | for(ISegmenter segmenter : segmenters){
144 | segmenter.reset();
145 | }
146 | }
147 | //对分词进行歧义处理
148 | this.arbitrator.process(context, this.cfg.useSmart());
149 | //将分词结果输出到结果集,并处理未切分的单个CJK字符
150 | context.outputToResult();
151 | //记录本次分词的缓冲区位移
152 | context.markBufferOffset();
153 | }
154 | return l;
155 | }
156 |
157 | /**
158 | * 重置分词器到初始状态
159 | * @param input
160 | */
161 | public synchronized void reset(Reader input) {
162 | this.input = input;
163 | context.reset();
164 | for(ISegmenter segmenter : segmenters){
165 | segmenter.reset();
166 | }
167 | }
168 | }
169 |
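170 | // Minimal usage sketch (the sample string is illustrative only):
171 | //   IKSegmenter seg = new IKSegmenter(new java.io.StringReader("自己动手做聊天机器人"), true);
172 | //   Lexeme l;
173 | //   while ((l = seg.next()) != null) { System.out.println(l.getLexemeText()); }  // next() throws IOException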
--------------------------------------------------------------------------------
/word2vec/distance.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | //#include <malloc.h>
19 | #include <stdlib.h>
20 |
21 | const long long max_size = 2000; // max length of strings
22 | const long long N = 40; // number of closest words that will be shown
23 | const long long max_w = 50; // max length of vocabulary entries
24 |
25 | int main(int argc, char **argv) {
26 | FILE *f;
27 | char st1[max_size];
28 | char *bestw[N];
29 | char file_name[max_size], st[100][max_size];
30 | float dist, len, bestd[N], vec[max_size];
31 | long long words, size, a, b, c, d, cn, bi[100];
32 | char ch;
33 | float *M;
34 | char *vocab;
35 | if (argc < 2) {
36 | printf("Usage: ./distance \nwhere FILE contains word projections in the BINARY FORMAT\n");
37 | return 0;
38 | }
39 | strcpy(file_name, argv[1]);
40 | f = fopen(file_name, "rb");
41 | if (f == NULL) {
42 | printf("Input file not found\n");
43 | return -1;
44 | }
45 | fscanf(f, "%lld", &words);
46 | fscanf(f, "%lld", &size);
47 | vocab = (char *)malloc((long long)words * max_w * sizeof(char));
48 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
49 | M = (float *)malloc((long long)words * (long long)size * sizeof(float));
50 | if (M == NULL) {
51 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
52 | return -1;
53 | }
54 | for (b = 0; b < words; b++) {
55 | a = 0;
56 | while (1) {
57 | vocab[b * max_w + a] = fgetc(f);
58 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
59 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
60 | }
61 | vocab[b * max_w + a] = 0;
62 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
63 | len = 0;
64 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
65 | len = sqrt(len);
66 | for (a = 0; a < size; a++) M[a + b * size] /= len;
67 | }
68 | fclose(f);
69 | while (1) {
70 | for (a = 0; a < N; a++) bestd[a] = 0;
71 | for (a = 0; a < N; a++) bestw[a][0] = 0;
72 | printf("Enter word or sentence (EXIT to break): ");
73 | a = 0;
74 | while (1) {
75 | st1[a] = fgetc(stdin);
76 | if ((st1[a] == '\n') || (a >= max_size - 1)) {
77 | st1[a] = 0;
78 | break;
79 | }
80 | a++;
81 | }
82 | if (!strcmp(st1, "EXIT")) break;
83 | cn = 0;
84 | b = 0;
85 | c = 0;
86 | while (1) {
87 | st[cn][b] = st1[c];
88 | b++;
89 | c++;
90 | st[cn][b] = 0;
91 | if (st1[c] == 0) break;
92 | if (st1[c] == ' ') {
93 | cn++;
94 | b = 0;
95 | c++;
96 | }
97 | }
98 | cn++;
99 | for (a = 0; a < cn; a++) {
100 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
101 | if (b == words) b = -1;
102 | bi[a] = b;
103 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]);
104 | if (b == -1) {
105 | printf("Out of dictionary word!\n");
106 | break;
107 | }
108 | }
109 | if (b == -1) continue;
110 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n");
111 | for (a = 0; a < size; a++) vec[a] = 0;
112 | for (b = 0; b < cn; b++) {
113 | if (bi[b] == -1) continue;
114 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
115 | }
116 | len = 0;
117 | for (a = 0; a < size; a++) len += vec[a] * vec[a];
118 | len = sqrt(len);
119 | for (a = 0; a < size; a++) vec[a] /= len;
120 | for (a = 0; a < N; a++) bestd[a] = -1;
121 | for (a = 0; a < N; a++) bestw[a][0] = 0;
122 | for (c = 0; c < words; c++) {
123 | a = 0;
124 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
125 | if (a == 1) continue;
126 | dist = 0;
127 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
128 | for (a = 0; a < N; a++) {
129 | if (dist > bestd[a]) {
130 | for (d = N - 1; d > a; d--) {
131 | bestd[d] = bestd[d - 1];
132 | strcpy(bestw[d], bestw[d - 1]);
133 | }
134 | bestd[a] = dist;
135 | strcpy(bestw[a], &vocab[c * max_w]);
136 | break;
137 | }
138 | }
139 | }
140 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
141 | }
142 | return 0;
143 | }
144 |
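145 | // Usage sketch (the file name is an example): ./distance vectors.bin
146 | // Type a word, or several words separated by spaces, at the prompt; the program prints the
147 | // N=40 vocabulary entries with the highest cosine similarity to the summed query vector. "EXIT" quits.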
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.query;
26 |
27 | import java.io.IOException;
28 | import java.io.StringReader;
29 | import java.util.ArrayList;
30 | import java.util.List;
31 |
32 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
33 | import org.apache.lucene.queryparser.classic.ParseException;
34 | import org.apache.lucene.queryparser.classic.QueryParser;
35 | import org.apache.lucene.search.Query;
36 | import org.apache.lucene.util.Version;
37 | import org.wltea.analyzer.core.IKSegmenter;
38 | import org.wltea.analyzer.core.Lexeme;
39 |
40 | /**
41 | * Single Word Multi Char Query Builder
42 | * IK分词算法专用
43 | * @author linliangyi
44 | *
45 | */
46 | public class SWMCQueryBuilder {
47 |
48 | /**
49 | * 生成SWMCQuery
50 | * @param fieldName
51 | * @param keywords
52 | * @param quickMode
53 | * @return Lucene Query
54 | */
55 | public static Query create(String fieldName ,String keywords , boolean quickMode){
56 | if(fieldName == null || keywords == null){
57 | throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
58 | }
59 | //1.对keywords进行分词处理
60 | List lexemes = doAnalyze(keywords);
61 | //2.根据分词结果,生成SWMCQuery
62 | Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
63 | return _SWMCQuery;
64 | }
65 |
66 | /**
67 | * 分词切分,并返回结链表
68 | * @param keywords
69 | * @return
70 | */
71 | private static List doAnalyze(String keywords){
72 | List lexemes = new ArrayList();
73 | IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
74 | try{
75 | Lexeme l = null;
76 | while( (l = ikSeg.next()) != null){
77 | lexemes.add(l);
78 | }
79 | }catch(IOException e){
80 | e.printStackTrace();
81 | }
82 | return lexemes;
83 | }
84 |
85 |
86 | /**
87 | * 根据分词结果生成SWMC搜索
88 | * @param fieldName
89 | * @param pathOption
90 | * @param quickMode
91 | * @return
92 | */
93 | private static Query getSWMCQuery(String fieldName , List lexemes , boolean quickMode){
94 | //构造SWMC的查询表达式
95 | StringBuffer keywordBuffer = new StringBuffer();
96 | //精简的SWMC的查询表达式
97 | StringBuffer keywordBuffer_Short = new StringBuffer();
98 | //记录最后词元长度
99 | int lastLexemeLength = 0;
100 | //记录最后词元结束位置
101 | int lastLexemeEnd = -1;
102 |
103 | int shortCount = 0;
104 | int totalCount = 0;
105 | for(Lexeme l : lexemes){
106 | totalCount += l.getLength();
107 | //精简表达式
108 | if(l.getLength() > 1){
109 | keywordBuffer_Short.append(' ').append(l.getLexemeText());
110 | shortCount += l.getLength();
111 | }
112 |
113 | if(lastLexemeLength == 0){
114 | keywordBuffer.append(l.getLexemeText());
115 | }else if(lastLexemeLength == 1 && l.getLength() == 1
116 | && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
117 | keywordBuffer.append(l.getLexemeText());
118 | }else{
119 | keywordBuffer.append(' ').append(l.getLexemeText());
120 |
121 | }
122 | lastLexemeLength = l.getLength();
123 | lastLexemeEnd = l.getEndPosition();
124 | }
125 |
126 | //借助lucene queryparser 生成SWMC Query
127 | QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
128 | qp.setDefaultOperator(QueryParser.AND_OPERATOR);
129 | qp.setAutoGeneratePhraseQueries(true);
130 |
131 | if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
132 | try {
133 | //System.out.println(keywordBuffer.toString());
134 | Query q = qp.parse(keywordBuffer_Short.toString());
135 | return q;
136 | } catch (ParseException e) {
137 | e.printStackTrace();
138 | }
139 |
140 | }else{
141 | if(keywordBuffer.length() > 0){
142 | try {
143 | //System.out.println(keywordBuffer.toString());
144 | Query q = qp.parse(keywordBuffer.toString());
145 | return q;
146 | } catch (ParseException e) {
147 | e.printStackTrace();
148 | }
149 | }
150 | }
151 | return null;
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.sample;
27 |
28 | import java.io.IOException;
29 |
30 | import org.apache.lucene.analysis.Analyzer;
31 | import org.apache.lucene.document.Document;
32 | import org.apache.lucene.document.Field;
33 | import org.apache.lucene.document.StringField;
34 | import org.apache.lucene.document.TextField;
35 | import org.apache.lucene.index.CorruptIndexException;
36 | import org.apache.lucene.index.DirectoryReader;
37 | import org.apache.lucene.index.IndexReader;
38 | import org.apache.lucene.index.IndexWriter;
39 | import org.apache.lucene.index.IndexWriterConfig;
40 | import org.apache.lucene.index.IndexWriterConfig.OpenMode;
41 | import org.apache.lucene.queryparser.classic.ParseException;
42 | import org.apache.lucene.queryparser.classic.QueryParser;
43 | import org.apache.lucene.search.IndexSearcher;
44 | import org.apache.lucene.search.Query;
45 | import org.apache.lucene.search.ScoreDoc;
46 | import org.apache.lucene.search.TopDocs;
47 | import org.apache.lucene.store.Directory;
48 | import org.apache.lucene.store.LockObtainFailedException;
49 | import org.apache.lucene.store.RAMDirectory;
50 | import org.apache.lucene.util.Version;
51 | import org.wltea.analyzer.lucene.IKAnalyzer;
52 |
53 |
54 |
55 |
56 | /**
57 | * 使用IKAnalyzer进行Lucene索引和查询的演示
58 | * 2012-3-2
59 | *
60 | * 以下是结合Lucene4.0 API的写法
61 | *
62 | */
63 | public class LuceneIndexAndSearchDemo {
64 |
65 |
66 | /**
67 | * 模拟:
68 | * 创建一个单条记录的索引,并对其进行搜索
69 | * @param args
70 | */
71 | public static void main(String[] args){
72 | //Lucene Document的域名
73 | String fieldName = "text";
74 | //检索内容
75 | String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
76 |
77 | //实例化IKAnalyzer分词器
78 | Analyzer analyzer = new IKAnalyzer(true);
79 |
80 | Directory directory = null;
81 | IndexWriter iwriter = null;
82 | IndexReader ireader = null;
83 | IndexSearcher isearcher = null;
84 | try {
85 | //建立内存索引对象
86 | directory = new RAMDirectory();
87 |
88 | //配置IndexWriterConfig
89 | IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
90 | iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
91 | iwriter = new IndexWriter(directory , iwConfig);
92 | //写入索引
93 | Document doc = new Document();
94 | doc.add(new StringField("ID", "10000", Field.Store.YES));
95 | doc.add(new TextField(fieldName, text, Field.Store.YES));
96 | iwriter.addDocument(doc);
97 | iwriter.close();
98 |
99 |
100 | //搜索过程**********************************
101 | //实例化搜索器
102 | ireader = DirectoryReader.open(directory);
103 | isearcher = new IndexSearcher(ireader);
104 |
105 | String keyword = "中文分词工具包";
106 | //使用QueryParser查询分析器构造Query对象
107 | QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
108 | qp.setDefaultOperator(QueryParser.AND_OPERATOR);
109 | Query query = qp.parse(keyword);
110 | System.out.println("Query = " + query);
111 |
112 | //搜索相似度最高的5条记录
113 | TopDocs topDocs = isearcher.search(query , 5);
114 | System.out.println("命中:" + topDocs.totalHits);
115 | //输出结果
116 | ScoreDoc[] scoreDocs = topDocs.scoreDocs;
117 | for (int i = 0; i < topDocs.totalHits; i++){
118 | Document targetDoc = isearcher.doc(scoreDocs[i].doc);
119 | System.out.println("内容:" + targetDoc.toString());
120 | }
121 |
122 | } catch (CorruptIndexException e) {
123 | e.printStackTrace();
124 | } catch (LockObtainFailedException e) {
125 | e.printStackTrace();
126 | } catch (IOException e) {
127 | e.printStackTrace();
128 | } catch (ParseException e) {
129 | e.printStackTrace();
130 | } finally{
131 | if(ireader != null){
132 | try {
133 | ireader.close();
134 | } catch (IOException e) {
135 | e.printStackTrace();
136 | }
137 | }
138 | if(directory != null){
139 | try {
140 | directory.close();
141 | } catch (IOException e) {
142 | e.printStackTrace();
143 | }
144 | }
145 | }
146 | }
147 | }
148 |
--------------------------------------------------------------------------------
/word2vec/word-analogy.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | //#include <malloc.h>
19 | #include <stdlib.h>
20 |
21 | const long long max_size = 2000; // max length of strings
22 | const long long N = 40; // number of closest words that will be shown
23 | const long long max_w = 50; // max length of vocabulary entries
24 |
25 | int main(int argc, char **argv) {
26 | FILE *f;
27 | char st1[max_size];
28 | char bestw[N][max_size];
29 | char file_name[max_size], st[100][max_size];
30 | float dist, len, bestd[N], vec[max_size];
31 | long long words, size, a, b, c, d, cn, bi[100];
32 | char ch;
33 | float *M;
34 | char *vocab;
35 | if (argc < 2) {
36 | printf("Usage: ./word-analogy \nwhere FILE contains word projections in the BINARY FORMAT\n");
37 | return 0;
38 | }
39 | strcpy(file_name, argv[1]);
40 | f = fopen(file_name, "rb");
41 | if (f == NULL) {
42 | printf("Input file not found\n");
43 | return -1;
44 | }
45 | fscanf(f, "%lld", &words);
46 | fscanf(f, "%lld", &size);
47 | vocab = (char *)malloc((long long)words * max_w * sizeof(char));
48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float));
49 | if (M == NULL) {
50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
51 | return -1;
52 | }
53 | for (b = 0; b < words; b++) {
54 | a = 0;
55 | while (1) {
56 | vocab[b * max_w + a] = fgetc(f);
57 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
58 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
59 | }
60 | vocab[b * max_w + a] = 0;
61 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
62 | len = 0;
63 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
64 | len = sqrt(len);
65 | for (a = 0; a < size; a++) M[a + b * size] /= len;
66 | }
67 | fclose(f);
68 | while (1) {
69 | for (a = 0; a < N; a++) bestd[a] = 0;
70 | for (a = 0; a < N; a++) bestw[a][0] = 0;
71 | printf("Enter three words (EXIT to break): ");
72 | a = 0;
73 | while (1) {
74 | st1[a] = fgetc(stdin);
75 | if ((st1[a] == '\n') || (a >= max_size - 1)) {
76 | st1[a] = 0;
77 | break;
78 | }
79 | a++;
80 | }
81 | if (!strcmp(st1, "EXIT")) break;
82 | cn = 0;
83 | b = 0;
84 | c = 0;
85 | while (1) {
86 | st[cn][b] = st1[c];
87 | b++;
88 | c++;
89 | st[cn][b] = 0;
90 | if (st1[c] == 0) break;
91 | if (st1[c] == ' ') {
92 | cn++;
93 | b = 0;
94 | c++;
95 | }
96 | }
97 | cn++;
98 | if (cn < 3) {
99 | printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
100 | continue;
101 | }
102 | for (a = 0; a < cn; a++) {
103 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
104 | if (b == words) b = 0;
105 | bi[a] = b;
106 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]);
107 | if (b == 0) {
108 | printf("Out of dictionary word!\n");
109 | break;
110 | }
111 | }
112 | if (b == 0) continue;
113 | printf("\n Word Distance\n------------------------------------------------------------------------\n");
114 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
115 | len = 0;
116 | for (a = 0; a < size; a++) len += vec[a] * vec[a];
117 | len = sqrt(len);
118 | for (a = 0; a < size; a++) vec[a] /= len;
119 | for (a = 0; a < N; a++) bestd[a] = 0;
120 | for (a = 0; a < N; a++) bestw[a][0] = 0;
121 | for (c = 0; c < words; c++) {
122 | if (c == bi[0]) continue;
123 | if (c == bi[1]) continue;
124 | if (c == bi[2]) continue;
125 | a = 0;
126 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
127 | if (a == 1) continue;
128 | dist = 0;
129 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
130 | for (a = 0; a < N; a++) {
131 | if (dist > bestd[a]) {
132 | for (d = N - 1; d > a; d--) {
133 | bestd[d] = bestd[d - 1];
134 | strcpy(bestw[d], bestw[d - 1]);
135 | }
136 | bestd[a] = dist;
137 | strcpy(bestw[a], &vocab[c * max_w]);
138 | break;
139 | }
140 | }
141 | }
142 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
143 | }
144 | return 0;
145 | }
146 |
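147 | // Usage sketch (the file name is an example): ./word-analogy vectors.bin
148 | // Enter three words A B C at the prompt; the program prints the vocabulary entries closest to
149 | // vec(B) - vec(A) + vec(C), the classic "man king woman -> queen" style analogy query. "EXIT" quits.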
--------------------------------------------------------------------------------
/word2vec/demo-train-big-model-v1.sh:
--------------------------------------------------------------------------------
1 | ###############################################################################################
2 | #
3 | # Script for training good word and phrase vector model using public corpora, version 1.0.
4 | # The training time will be from several hours to about a day.
5 | #
6 | # Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains
7 | # a 500-dimensional vector model and evaluates it on word and phrase analogy tasks.
8 | #
9 | ###############################################################################################
10 |
11 | # This function will convert text to lowercase and remove special characters
12 | normalize_text() {
13 | awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
14 | -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/
/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
15 | -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
16 | -e 's/«/ /g' | tr 0-9 " "
17 | }
18 |
19 | mkdir word2vec
20 | cd word2vec
21 |
22 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
23 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz
24 | gzip -d news.2012.en.shuffled.gz
25 | gzip -d news.2013.en.shuffled.gz
26 | normalize_text < news.2012.en.shuffled > data.txt
27 | normalize_text < news.2013.en.shuffled >> data.txt
28 |
29 | wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
30 | tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
31 | for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do
32 | normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt
33 | done
34 |
35 | wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus
36 | tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt
37 | for i in `ls webbase_all`; do
38 | normalize_text < webbase_all/$i >> data.txt
39 | done
40 |
41 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
42 | bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e '
43 | # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
44 | # letters (a-z, converted from A-Z), and spaces (never consecutive)...
45 | # All other characters are converted to spaces. Only text which normally appears
46 | # in the web browser is displayed. Tables are removed. Image captions are
47 | # preserved. Links are converted to normal text. Digits are spelled out.
48 | # *** Modified to not spell digits or throw away non-ASCII characters ***
49 |
50 | # Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
51 |
52 | $/=">"; # input record separator
53 | while (<>) {
54 | if (/<text /) {$text=1;} # remove all but between <text> ... </text>
55 | if (/#redirect/i) {$text=0;} # remove #REDIRECT
56 | if ($text) {
57 |
58 | # Remove any text not normally visible
59 | if (/<\/text>/) {$text=0;}
60 | s/<.*>//; # remove xml tags
61 | s/&amp;/&/g; # decode URL encoded chars
62 | s/&lt;/</g;
63 | s/&gt;/>/g;
64 | s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref>
65 | s/<[^>]*>//g; # remove xhtml tags
66 | s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text
67 | s/\|thumb//ig; # remove images links, preserve caption
68 | s/\|left//ig;
69 | s/\|right//ig;
70 | s/\|\d+px//ig;
71 | s/\[\[image:[^\[\]]*\|//ig;
72 | s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup
73 | s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages
74 | s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text
75 | s/{{[^}]*}}//g; # remove {{icons}} and {tables}
76 | s/{[^}]*}//g;
77 | s/\[//g; # remove [ and ]
78 | s/\]//g;
79 | s/&[^;]*;/ /g; # remove URL encoded chars
80 |
81 | $_=" $_ ";
82 | chop;
83 | print $_;
84 | }
85 | }
86 | ' | normalize_text | awk '{if (NF>1) print;}' >> data.txt
87 |
88 | wget http://word2vec.googlecode.com/svn/trunk/word2vec.c
89 | wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c
90 | wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c
91 | wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt
92 | wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt
93 | gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops
94 | gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops
95 | gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops
96 | ./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2
97 | ./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2
98 | ./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10
99 | ./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions
100 | ./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage
101 |
--------------------------------------------------------------------------------
/word2vec/compute-accuracy.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include <stdio.h>
16 | #include <stdlib.h>
17 | #include <string.h>
18 | #include <math.h>
19 | //#include <malloc.h>
20 | #include <ctype.h>
21 | #include <strings.h>
22 |
23 | const long long max_size = 2000; // max length of strings
24 | const long long N = 1; // number of closest words
25 | const long long max_w = 50; // max length of vocabulary entries
26 |
27 | int main(int argc, char **argv)
28 | {
29 | FILE *f;
30 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
31 | float dist, len, bestd[N], vec[max_size];
32 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
33 | float *M;
34 | char *vocab;
35 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
36 | if (argc < 2) {
37 | printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
38 | return 0;
39 | }
40 | strcpy(file_name, argv[1]);
41 | if (argc > 2) threshold = atoi(argv[2]);
42 | f = fopen(file_name, "rb");
43 | if (f == NULL) {
44 | printf("Input file not found\n");
45 | return -1;
46 | }
47 | fscanf(f, "%lld", &words);
48 | if (threshold) if (words > threshold) words = threshold;
49 | fscanf(f, "%lld", &size);
50 | vocab = (char *)malloc(words * max_w * sizeof(char));
51 | M = (float *)malloc(words * size * sizeof(float));
52 | if (M == NULL) {
53 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576);
54 | return -1;
55 | }
56 | for (b = 0; b < words; b++) {
57 | a = 0;
58 | while (1) {
59 | vocab[b * max_w + a] = fgetc(f);
60 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
61 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
62 | }
63 | vocab[b * max_w + a] = 0;
64 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]);
65 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
66 | len = 0;
67 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
68 | len = sqrt(len);
69 | for (a = 0; a < size; a++) M[a + b * size] /= len;
70 | }
71 | fclose(f);
72 | TCN = 0;
73 | while (1) {
74 | for (a = 0; a < N; a++) bestd[a] = 0;
75 | for (a = 0; a < N; a++) bestw[a][0] = 0;
76 | scanf("%s", st1);
77 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]);
78 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) {
79 | if (TCN == 0) TCN = 1;
80 | if (QID != 0) {
81 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
82 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
83 | }
84 | QID++;
85 | scanf("%s", st1);
86 | if (feof(stdin)) break;
87 | printf("%s:\n", st1);
88 | TCN = 0;
89 | CCN = 0;
90 | continue;
91 | }
92 | if (!strcmp(st1, "EXIT")) break;
93 | scanf("%s", st2);
94 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]);
95 | scanf("%s", st3);
96 | for (a = 0; a < strlen(st3); a++) st3[a] = toupper(st3[a]);
97 | scanf("%s", st4);
98 | for (a = 0; a < strlen(st4); a++) st4[a] = toupper(st4[a]);
99 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st1)) break;
100 | b1 = b;
101 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st2)) break;
102 | b2 = b;
103 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st3)) break;
104 | b3 = b;
105 | for (a = 0; a < N; a++) bestd[a] = 0;
106 | for (a = 0; a < N; a++) bestw[a][0] = 0;
107 | TQ++;
108 | if (b1 == words) continue;
109 | if (b2 == words) continue;
110 | if (b3 == words) continue;
111 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st4)) break;
112 | if (b == words) continue;
113 | for (a = 0; a < size; a++) vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size];
114 | TQS++;
115 | for (c = 0; c < words; c++) {
116 | if (c == b1) continue;
117 | if (c == b2) continue;
118 | if (c == b3) continue;
119 | dist = 0;
120 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
121 | for (a = 0; a < N; a++) {
122 | if (dist > bestd[a]) {
123 | for (d = N - 1; d > a; d--) {
124 | bestd[d] = bestd[d - 1];
125 | strcpy(bestw[d], bestw[d - 1]);
126 | }
127 | bestd[a] = dist;
128 | strcpy(bestw[a], &vocab[c * max_w]);
129 | break;
130 | }
131 | }
132 | }
133 | if (!strcmp(st4, bestw[0])) {
134 | CCN++;
135 | CACN++;
136 | if (QID <= 5) SEAC++; else SYAC++;
137 | }
138 | if (QID <= 5) SECN++; else SYCN++;
139 | TCN++;
140 | TACN++;
141 | }
142 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100);
143 | return 0;
144 | }
145 |
--------------------------------------------------------------------------------
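
compute-accuracy.c above answers each question "a b c d" from questions-words.txt by forming vec = M[b2] - M[b1] + M[b3], ranking every other vocabulary word by its dot product with vec (the rows are unit length, so this is the cosine ranking), and counting a hit when the top word equals d; questions whose words fall outside the (optionally thresholded) vocabulary are skipped, and the first five sections are tallied as semantic, the rest as syntactic. A rough numpy equivalent for a single question is sketched below, reusing the hypothetical load_word2vec_bin reader from the earlier note; it is an illustration, not part of the repository.

import numpy as np

def analogy_top1(vocab, vectors, a, b, c):
    # Best answer for "a : b :: c : ?", mirroring the ranking loop in compute-accuracy.c.
    index = {w: i for i, w in enumerate(vocab)}  # built per call only to keep the sketch short
    ia, ib, ic = index[a], index[b], index[c]
    target = vectors[ib] - vectors[ia] + vectors[ic]
    scores = vectors @ target            # rows are unit length, so this ranks by cosine
    scores[[ia, ib, ic]] = -np.inf       # the C code also skips the three question words
    return vocab[int(np.argmax(scores))]
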
/chatbotv1/src/main/java/org/wltea/analyzer/core/QuickSortSet.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * IK分词器专用的Lexem快速排序集合
29 | */
30 | class QuickSortSet {
31 | //链表头
32 | private Cell head;
33 | //链表尾
34 | private Cell tail;
35 | //链表的实际大小
36 | private int size;
37 |
38 | QuickSortSet(){
39 | this.size = 0;
40 | }
41 |
42 | /**
43 | * 向链表集合添加词元
44 | * @param lexeme
45 | */
46 | boolean addLexeme(Lexeme lexeme){
47 | Cell newCell = new Cell(lexeme);
48 | if(this.size == 0){
49 | this.head = newCell;
50 | this.tail = newCell;
51 | this.size++;
52 | return true;
53 |
54 | }else{
55 | if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合
56 | return false;
57 |
58 | }else if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部
59 | this.tail.next = newCell;
60 | newCell.prev = this.tail;
61 | this.tail = newCell;
62 | this.size++;
63 | return true;
64 |
65 | }else if(this.head.compareTo(newCell) > 0){//词元接入链表头部
66 | this.head.prev = newCell;
67 | newCell.next = this.head;
68 | this.head = newCell;
69 | this.size++;
70 | return true;
71 |
72 | }else{
73 | //从尾部上逆
74 | Cell index = this.tail;
75 | while(index != null && index.compareTo(newCell) > 0){
76 | index = index.prev;
77 | }
78 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合
79 | return false;
80 |
81 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置
82 | newCell.prev = index;
83 | newCell.next = index.next;
84 | index.next.prev = newCell;
85 | index.next = newCell;
86 | this.size++;
87 | return true;
88 | }
89 | }
90 | }
91 | return false;
92 | }
93 |
94 | /**
95 | * 返回链表头部元素
96 | * @return
97 | */
98 | Lexeme peekFirst(){
99 | if(this.head != null){
100 | return this.head.lexeme;
101 | }
102 | return null;
103 | }
104 |
105 | /**
106 | * 取出链表集合的第一个元素
107 | * @return Lexeme
108 | */
109 | Lexeme pollFirst(){
110 | if(this.size == 1){
111 | Lexeme first = this.head.lexeme;
112 | this.head = null;
113 | this.tail = null;
114 | this.size--;
115 | return first;
116 | }else if(this.size > 1){
117 | Lexeme first = this.head.lexeme;
118 | this.head = this.head.next;
119 | this.size --;
120 | return first;
121 | }else{
122 | return null;
123 | }
124 | }
125 |
126 | /**
127 | * 返回链表尾部元素
128 | * @return
129 | */
130 | Lexeme peekLast(){
131 | if(this.tail != null){
132 | return this.tail.lexeme;
133 | }
134 | return null;
135 | }
136 |
137 | /**
138 | * 取出链表集合的最后一个元素
139 | * @return Lexeme
140 | */
141 | Lexeme pollLast(){
142 | if(this.size == 1){
143 | Lexeme last = this.head.lexeme;
144 | this.head = null;
145 | this.tail = null;
146 | this.size--;
147 | return last;
148 |
149 | }else if(this.size > 1){
150 | Lexeme last = this.tail.lexeme;
151 | this.tail = this.tail.prev;
152 | this.size--;
153 | return last;
154 |
155 | }else{
156 | return null;
157 | }
158 | }
159 |
160 | /**
161 | * 返回集合大小
162 | * @return
163 | */
164 | int size(){
165 | return this.size;
166 | }
167 |
168 | /**
169 | * 判断集合是否为空
170 | * @return
171 | */
172 | boolean isEmpty(){
173 | return this.size == 0;
174 | }
175 |
176 | /**
177 | * 返回lexeme链的头部
178 | * @return
179 | */
180 | Cell getHead(){
181 | return this.head;
182 | }
183 |
184 | /**
185 | *
186 | * IK 中文分词 版本 5.0
187 | * IK Analyzer release 5.0
188 | *
189 | * Licensed to the Apache Software Foundation (ASF) under one or more
190 | * contributor license agreements. See the NOTICE file distributed with
191 | * this work for additional information regarding copyright ownership.
192 | * The ASF licenses this file to You under the Apache License, Version 2.0
193 | * (the "License"); you may not use this file except in compliance with
194 | * the License. You may obtain a copy of the License at
195 | *
196 | * http://www.apache.org/licenses/LICENSE-2.0
197 | *
198 | * Unless required by applicable law or agreed to in writing, software
199 | * distributed under the License is distributed on an "AS IS" BASIS,
200 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | * See the License for the specific language governing permissions and
202 | * limitations under the License.
203 | *
204 | * 源代码由林良益(linliangyi2005@gmail.com)提供
205 | * 版权声明 2012,乌龙茶工作室
206 | * provided by Linliangyi and copyright 2012 by Oolong studio
207 | *
208 | * QuickSortSet集合单元
209 | *
210 | */
211 | class Cell implements Comparable<Cell>{
212 | private Cell prev;
213 | private Cell next;
214 | private Lexeme lexeme;
215 |
216 | Cell(Lexeme lexeme){
217 | if(lexeme == null){
218 | throw new IllegalArgumentException("lexeme must not be null");
219 | }
220 | this.lexeme = lexeme;
221 | }
222 |
223 | public int compareTo(Cell o) {
224 | return this.lexeme.compareTo(o.lexeme);
225 | }
226 |
227 | public Cell getPrev(){
228 | return this.prev;
229 | }
230 |
231 | public Cell getNext(){
232 | return this.next;
233 | }
234 |
235 | public Lexeme getLexeme(){
236 | return this.lexeme;
237 | }
238 | }
239 | }
240 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.HashSet;
28 | import java.util.LinkedList;
29 | import java.util.List;
30 | import java.util.Set;
31 |
32 | import org.wltea.analyzer.dic.Dictionary;
33 | import org.wltea.analyzer.dic.Hit;
34 |
35 | /**
36 | *
37 | * 中文数量词子分词器
38 | */
39 | class CN_QuantifierSegmenter implements ISegmenter{
40 |
41 | //子分词器标签
42 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
43 |
44 | //中文数词
45 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum
46 | private static Set<Character> ChnNumberChars = new HashSet<Character>();
47 | static{
48 | char[] ca = Chn_Num.toCharArray();
49 | for(char nChar : ca){
50 | ChnNumberChars.add(nChar);
51 | }
52 | }
53 |
54 | /*
55 | * 词元的开始位置,
56 | * 同时作为子分词器状态标识
57 | * 当start > -1 时,标识当前的分词器正在处理字符
58 | */
59 | private int nStart;
60 | /*
61 | * 记录词元结束位置
62 | * end记录的是在词元中最后一个出现的合理的数词结束
63 | */
64 | private int nEnd;
65 |
66 | //待处理的量词hit队列
67 | private List<Hit> countHits;
68 |
69 |
70 | CN_QuantifierSegmenter(){
71 | nStart = -1;
72 | nEnd = -1;
73 | this.countHits = new LinkedList<Hit>();
74 | }
75 |
76 | /**
77 | * 分词
78 | */
79 | public void analyze(AnalyzeContext context) {
80 | //处理中文数词
81 | this.processCNumber(context);
82 | //处理中文量词
83 | this.processCount(context);
84 |
85 | //判断是否锁定缓冲区
86 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
87 | //对缓冲区解锁
88 | context.unlockBuffer(SEGMENTER_NAME);
89 | }else{
90 | context.lockBuffer(SEGMENTER_NAME);
91 | }
92 | }
93 |
94 |
95 | /**
96 | * 重置子分词器状态
97 | */
98 | public void reset() {
99 | nStart = -1;
100 | nEnd = -1;
101 | countHits.clear();
102 | }
103 |
104 | /**
105 | * 处理数词
106 | */
107 | private void processCNumber(AnalyzeContext context){
108 | if(nStart == -1 && nEnd == -1){//初始状态
109 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
110 | && ChnNumberChars.contains(context.getCurrentChar())){
111 | //记录数词的起始、结束位置
112 | nStart = context.getCursor();
113 | nEnd = context.getCursor();
114 | }
115 | }else{//正在处理状态
116 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
117 | && ChnNumberChars.contains(context.getCurrentChar())){
118 | //记录数词的结束位置
119 | nEnd = context.getCursor();
120 | }else{
121 | //输出数词
122 | this.outputNumLexeme(context);
123 | //重置头尾指针
124 | nStart = -1;
125 | nEnd = -1;
126 | }
127 | }
128 |
129 | //缓冲区已经用完,还有尚未输出的数词
130 | if(context.isBufferConsumed()){
131 | if(nStart != -1 && nEnd != -1){
132 | //输出数词
133 | outputNumLexeme(context);
134 | //重置头尾指针
135 | nStart = -1;
136 | nEnd = -1;
137 | }
138 | }
139 | }
140 |
141 | /**
142 | * 处理中文量词
143 | * @param context
144 | */
145 | private void processCount(AnalyzeContext context){
146 | // 判断是否需要启动量词扫描
147 | if(!this.needCountScan(context)){
148 | return;
149 | }
150 |
151 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
152 |
153 | //优先处理countHits中的hit
154 | if(!this.countHits.isEmpty()){
155 | //处理词段队列
156 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
157 | for(Hit hit : tmpArray){
158 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
159 | if(hit.isMatch()){
160 | //输出当前的词
161 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
162 | context.addLexeme(newLexeme);
163 |
164 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
165 | this.countHits.remove(hit);
166 | }
167 |
168 | }else if(hit.isUnmatch()){
169 | //hit不是词,移除
170 | this.countHits.remove(hit);
171 | }
172 | }
173 | }
174 |
175 | //*********************************
176 | //对当前指针位置的字符进行单字匹配
177 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
178 | if(singleCharHit.isMatch()){//首字成量词词
179 | //输出当前的词
180 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
181 | context.addLexeme(newLexeme);
182 |
183 | //同时也是词前缀
184 | if(singleCharHit.isPrefix()){
185 | //前缀匹配则放入hit列表
186 | this.countHits.add(singleCharHit);
187 | }
188 | }else if(singleCharHit.isPrefix()){//首字为量词前缀
189 | //前缀匹配则放入hit列表
190 | this.countHits.add(singleCharHit);
191 | }
192 |
193 |
194 | }else{
195 | //输入的不是中文字符
196 | //清空未成形的量词
197 | this.countHits.clear();
198 | }
199 |
200 | //缓冲区数据已经读完,还有尚未输出的量词
201 | if(context.isBufferConsumed()){
202 | //清空未成形的量词
203 | this.countHits.clear();
204 | }
205 | }
206 |
207 | /**
208 | * 判断是否需要扫描量词
209 | * @return
210 | */
211 | private boolean needCountScan(AnalyzeContext context){
212 | if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
213 | //正在处理中文数词,或者正在处理量词
214 | return true;
215 | }else{
216 | //找到一个相邻的数词
217 | if(!context.getOrgLexemes().isEmpty()){
218 | Lexeme l = context.getOrgLexemes().peekLast();
219 | if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){
220 | if(l.getBegin() + l.getLength() == context.getCursor()){
221 | return true;
222 | }
223 | }
224 | }
225 | }
226 | return false;
227 | }
228 |
229 | /**
230 | * 添加数词词元到结果集
231 | * @param context
232 | */
233 | private void outputNumLexeme(AnalyzeContext context){
234 | if(nStart > -1 && nEnd > -1){
235 | //输出数词
236 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
237 | context.addLexeme(newLexeme);
238 |
239 | }
240 | }
241 |
242 | }
243 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/core/LexemePath.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 |
28 | /**
29 | * Lexeme链(路径)
30 | */
31 | class LexemePath extends QuickSortSet implements Comparable<LexemePath>{
32 |
33 | //起始位置
34 | private int pathBegin;
35 | //结束
36 | private int pathEnd;
37 | //词元链的有效字符长度
38 | private int payloadLength;
39 |
40 | LexemePath(){
41 | this.pathBegin = -1;
42 | this.pathEnd = -1;
43 | this.payloadLength = 0;
44 | }
45 |
46 | /**
47 | * 向LexemePath追加相交的Lexeme
48 | * @param lexeme
49 | * @return
50 | */
51 | boolean addCrossLexeme(Lexeme lexeme){
52 | if(this.isEmpty()){
53 | this.addLexeme(lexeme);
54 | this.pathBegin = lexeme.getBegin();
55 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
56 | this.payloadLength += lexeme.getLength();
57 | return true;
58 |
59 | }else if(this.checkCross(lexeme)){
60 | this.addLexeme(lexeme);
61 | if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){
62 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
63 | }
64 | this.payloadLength = this.pathEnd - this.pathBegin;
65 | return true;
66 |
67 | }else{
68 | return false;
69 |
70 | }
71 | }
72 |
73 | /**
74 | * 向LexemePath追加不相交的Lexeme
75 | * @param lexeme
76 | * @return
77 | */
78 | boolean addNotCrossLexeme(Lexeme lexeme){
79 | if(this.isEmpty()){
80 | this.addLexeme(lexeme);
81 | this.pathBegin = lexeme.getBegin();
82 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
83 | this.payloadLength += lexeme.getLength();
84 | return true;
85 |
86 | }else if(this.checkCross(lexeme)){
87 | return false;
88 |
89 | }else{
90 | this.addLexeme(lexeme);
91 | this.payloadLength += lexeme.getLength();
92 | Lexeme head = this.peekFirst();
93 | this.pathBegin = head.getBegin();
94 | Lexeme tail = this.peekLast();
95 | this.pathEnd = tail.getBegin() + tail.getLength();
96 | return true;
97 |
98 | }
99 | }
100 |
101 | /**
102 | * 移除尾部的Lexeme
103 | * @return
104 | */
105 | Lexeme removeTail(){
106 | Lexeme tail = this.pollLast();
107 | if(this.isEmpty()){
108 | this.pathBegin = -1;
109 | this.pathEnd = -1;
110 | this.payloadLength = 0;
111 | }else{
112 | this.payloadLength -= tail.getLength();
113 | Lexeme newTail = this.peekLast();
114 | this.pathEnd = newTail.getBegin() + newTail.getLength();
115 | }
116 | return tail;
117 | }
118 |
119 | /**
120 | * 检测词元位置交叉(有歧义的切分)
121 | * @param lexeme
122 | * @return
123 | */
124 | boolean checkCross(Lexeme lexeme){
125 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
126 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength());
127 | }
128 |
129 | int getPathBegin() {
130 | return pathBegin;
131 | }
132 |
133 | int getPathEnd() {
134 | return pathEnd;
135 | }
136 |
137 | /**
138 | * 获取Path的有效词长
139 | * @return
140 | */
141 | int getPayloadLength(){
142 | return this.payloadLength;
143 | }
144 |
145 | /**
146 | * 获取LexemePath的路径长度
147 | * @return
148 | */
149 | int getPathLength(){
150 | return this.pathEnd - this.pathBegin;
151 | }
152 |
153 |
154 | /**
155 | * X权重(词元长度积)
156 | * @return
157 | */
158 | int getXWeight(){
159 | int product = 1;
160 | Cell c = this.getHead();
161 | while( c != null && c.getLexeme() != null){
162 | product *= c.getLexeme().getLength();
163 | c = c.getNext();
164 | }
165 | return product;
166 | }
167 |
168 | /**
169 | * 词元位置权重
170 | * @return
171 | */
172 | int getPWeight(){
173 | int pWeight = 0;
174 | int p = 0;
175 | Cell c = this.getHead();
176 | while( c != null && c.getLexeme() != null){
177 | p++;
178 | pWeight += p * c.getLexeme().getLength() ;
179 | c = c.getNext();
180 | }
181 | return pWeight;
182 | }
183 |
184 | LexemePath copy(){
185 | LexemePath theCopy = new LexemePath();
186 | theCopy.pathBegin = this.pathBegin;
187 | theCopy.pathEnd = this.pathEnd;
188 | theCopy.payloadLength = this.payloadLength;
189 | Cell c = this.getHead();
190 | while( c != null && c.getLexeme() != null){
191 | theCopy.addLexeme(c.getLexeme());
192 | c = c.getNext();
193 | }
194 | return theCopy;
195 | }
196 |
197 | public int compareTo(LexemePath o) {
198 | //比较有效文本长度
199 | if(this.payloadLength > o.payloadLength){
200 | return -1;
201 | }else if(this.payloadLength < o.payloadLength){
202 | return 1;
203 | }else{
204 | //比较词元个数,越少越好
205 | if(this.size() < o.size()){
206 | return -1;
207 | }else if (this.size() > o.size()){
208 | return 1;
209 | }else{
210 | //路径跨度越大越好
211 | if(this.getPathLength() > o.getPathLength()){
212 | return -1;
213 | }else if(this.getPathLength() < o.getPathLength()){
214 | return 1;
215 | }else {
216 | //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先
217 | if(this.pathEnd > o.pathEnd){
218 | return -1;
219 | }else if(pathEnd < o.pathEnd){
220 | return 1;
221 | }else{
222 | //词长越平均越好
223 | if(this.getXWeight() > o.getXWeight()){
224 | return -1;
225 | }else if(this.getXWeight() < o.getXWeight()){
226 | return 1;
227 | }else {
228 | //词元位置权重比较
229 | if(this.getPWeight() > o.getPWeight()){
230 | return -1;
231 | }else if(this.getPWeight() < o.getPWeight()){
232 | return 1;
233 | }
234 |
235 | }
236 | }
237 | }
238 | }
239 | }
240 | return 0;
241 | }
242 |
243 | public String toString(){
244 | StringBuffer sb = new StringBuffer();
245 | sb.append("pathBegin : ").append(pathBegin).append("\r\n");
246 | sb.append("pathEnd : ").append(pathEnd).append("\r\n");
247 | sb.append("payloadLength : ").append(payloadLength).append("\r\n");
248 | Cell head = this.getHead();
249 | while(head != null){
250 | sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
251 | head = head.getNext();
252 | }
253 | return sb.toString();
254 | }
255 |
256 | }
257 |
--------------------------------------------------------------------------------
/chatbotv1/src/main/java/org/wltea/analyzer/core/Lexeme.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * IK词元对象
29 | */
30 | public class Lexeme implements Comparable<Lexeme>{
31 | //lexemeType常量
32 | //未知
33 | public static final int TYPE_UNKNOWN = 0;
34 | //英文
35 | public static final int TYPE_ENGLISH = 1;
36 | //数字
37 | public static final int TYPE_ARABIC = 2;
38 | //英文数字混合
39 | public static final int TYPE_LETTER = 3;
40 | //中文词元
41 | public static final int TYPE_CNWORD = 4;
42 | //中文单字
43 | public static final int TYPE_CNCHAR = 64;
44 | //日韩文字
45 | public static final int TYPE_OTHER_CJK = 8;
46 | //中文数词
47 | public static final int TYPE_CNUM = 16;
48 | //中文量词
49 | public static final int TYPE_COUNT = 32;
50 | //中文数量词
51 | public static final int TYPE_CQUAN = 48;
52 |
53 | //词元的起始位移
54 | private int offset;
55 | //词元的相对起始位置
56 | private int begin;
57 | //词元的长度
58 | private int length;
59 | //词元文本
60 | private String lexemeText;
61 | //词元类型
62 | private int lexemeType;
63 |
64 |
65 | public Lexeme(int offset , int begin , int length , int lexemeType){
66 | this.offset = offset;
67 | this.begin = begin;
68 | if(length < 0){
69 | throw new IllegalArgumentException("length < 0");
70 | }
71 | this.length = length;
72 | this.lexemeType = lexemeType;
73 | }
74 |
75 | /*
76 | * 判断词元相等算法
77 | * 起始位置偏移、起始位置、终止位置相同
78 | * @see java.lang.Object#equals(Object o)
79 | */
80 | public boolean equals(Object o){
81 | if(o == null){
82 | return false;
83 | }
84 |
85 | if(this == o){
86 | return true;
87 | }
88 |
89 | if(o instanceof Lexeme){
90 | Lexeme other = (Lexeme)o;
91 | if(this.offset == other.getOffset()
92 | && this.begin == other.getBegin()
93 | && this.length == other.getLength()){
94 | return true;
95 | }else{
96 | return false;
97 | }
98 | }else{
99 | return false;
100 | }
101 | }
102 |
103 | /*
104 | * 词元哈希编码算法
105 | * @see java.lang.Object#hashCode()
106 | */
107 | public int hashCode(){
108 | int absBegin = getBeginPosition();
109 | int absEnd = getEndPosition();
110 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
111 | }
112 |
113 | /*
114 | * 词元在排序集合中的比较算法
115 | * @see java.lang.Comparable#compareTo(java.lang.Object)
116 | */
117 | public int compareTo(Lexeme other) {
118 | //起始位置优先
119 | if(this.begin < other.getBegin()){
120 | return -1;
121 | }else if(this.begin == other.getBegin()){
122 | //词元长度优先
123 | if(this.length > other.getLength()){
124 | return -1;
125 | }else if(this.length == other.getLength()){
126 | return 0;
127 | }else {//this.length < other.getLength()
128 | return 1;
129 | }
130 |
131 | }else{//this.begin > other.getBegin()
132 | return 1;
133 | }
134 | }
135 |
136 | public int getOffset() {
137 | return offset;
138 | }
139 |
140 | public void setOffset(int offset) {
141 | this.offset = offset;
142 | }
143 |
144 | public int getBegin() {
145 | return begin;
146 | }
147 | /**
148 | * 获取词元在文本中的起始位置
149 | * @return int
150 | */
151 | public int getBeginPosition(){
152 | return offset + begin;
153 | }
154 |
155 | public void setBegin(int begin) {
156 | this.begin = begin;
157 | }
158 |
159 | /**
160 | * 获取词元在文本中的结束位置
161 | * @return int
162 | */
163 | public int getEndPosition(){
164 | return offset + begin + length;
165 | }
166 |
167 | /**
168 | * 获取词元的字符长度
169 | * @return int
170 | */
171 | public int getLength(){
172 | return this.length;
173 | }
174 |
175 | public void setLength(int length) {
176 | if(this.length < 0){
177 | throw new IllegalArgumentException("length < 0");
178 | }
179 | this.length = length;
180 | }
181 |
182 | /**
183 | * 获取词元的文本内容
184 | * @return String
185 | */
186 | public String getLexemeText() {
187 | if(lexemeText == null){
188 | return "";
189 | }
190 | return lexemeText;
191 | }
192 |
193 | public void setLexemeText(String lexemeText) {
194 | if(lexemeText == null){
195 | this.lexemeText = "";
196 | this.length = 0;
197 | }else{
198 | this.lexemeText = lexemeText;
199 | this.length = lexemeText.length();
200 | }
201 | }
202 |
203 | /**
204 | * 获取词元类型
205 | * @return int
206 | */
207 | public int getLexemeType() {
208 | return lexemeType;
209 | }
210 |
211 | /**
212 | * 获取词元类型标示字符串
213 | * @return String
214 | */
215 | public String getLexemeTypeString(){
216 | switch(lexemeType) {
217 |
218 | case TYPE_ENGLISH :
219 | return "ENGLISH";
220 |
221 | case TYPE_ARABIC :
222 | return "ARABIC";
223 |
224 | case TYPE_LETTER :
225 | return "LETTER";
226 |
227 | case TYPE_CNWORD :
228 | return "CN_WORD";
229 |
230 | case TYPE_CNCHAR :
231 | return "CN_CHAR";
232 |
233 | case TYPE_OTHER_CJK :
234 | return "OTHER_CJK";
235 |
236 | case TYPE_COUNT :
237 | return "COUNT";
238 |
239 | case TYPE_CNUM :
240 | return "TYPE_CNUM";
241 |
242 | case TYPE_CQUAN:
243 | return "TYPE_CQUAN";
244 |
245 | default :
246 | return "UNKONW";
247 | }
248 | }
249 |
250 |
251 | public void setLexemeType(int lexemeType) {
252 | this.lexemeType = lexemeType;
253 | }
254 |
255 | /**
256 | * 合并两个相邻的词元
257 | * @param l
258 | * @param lexemeType
259 | * @return boolean 词元是否成功合并
260 | */
261 | public boolean append(Lexeme l , int lexemeType){
262 | if(l != null && this.getEndPosition() == l.getBeginPosition()){
263 | this.length += l.getLength();
264 | this.lexemeType = lexemeType;
265 | return true;
266 | }else {
267 | return false;
268 | }
269 | }
270 |
271 |
272 | /**
273 | *
274 | */
275 | public String toString(){
276 | StringBuffer strbuf = new StringBuffer();
277 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
278 | strbuf.append(" : ").append(this.lexemeText).append(" : \t");
279 | strbuf.append(this.getLexemeTypeString());
280 | return strbuf.toString();
281 | }
282 |
283 |
284 | }
285 |
--------------------------------------------------------------------------------
/chatbotv2/my_seq2seq.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys
4 | import math
5 | import tflearn
6 | import tensorflow as tf
7 | from tensorflow.python.ops import rnn_cell
8 | from tensorflow.python.ops import rnn
9 | import chardet
10 | import numpy as np
11 | import struct
12 |
13 | seq = []
14 |
15 | max_w = 50
16 | float_size = 4
17 | word_vector_dict = {}
18 | word_vec_dim = 200
19 | max_seq_len = 16
20 |
21 | def load_vectors(input):
22 | """从vectors.bin加载词向量,返回一个word_vector_dict的词典,key是词,value是200维的向量
23 | """
24 | print "begin load vectors"
25 |
26 | input_file = open(input, "rb")
27 |
28 | # 获取词表数目及向量维度
29 | words_and_size = input_file.readline()
30 | words_and_size = words_and_size.strip()
31 | words = long(words_and_size.split(' ')[0])
32 | size = long(words_and_size.split(' ')[1])
33 | print "words =", words
34 | print "size =", size
35 |
36 | for b in range(0, words):
37 | a = 0
38 | word = ''
39 | # 读取一个词
40 | while True:
41 | c = input_file.read(1)
42 | word = word + c
43 | if False == c or c == ' ':
44 | break
45 | if a < max_w and c != '\n':
46 | a = a + 1
47 | word = word.strip()
48 |
49 | vector = []
50 | for index in range(0, size):
51 | m = input_file.read(float_size)
52 | (weight,) = struct.unpack('f', m)
53 | vector.append(float(weight))
54 |
55 | # 将词及其对应的向量存到dict中
56 | #word_vector_dict[word.decode('utf-8')] = vector
57 | word_vector_dict[word.decode('utf-8')] = vector[0:word_vec_dim]
58 |
59 | input_file.close()
60 |
61 | print "load vectors finish"
62 |
63 | def init_seq():
64 | """读取切好词的文本文件,加载全部词序列
65 | """
66 | file_object = open('zhenhuanzhuan.segment', 'r')
67 | vocab_dict = {}
68 | while True:
69 | line = file_object.readline()
70 | if line:
71 | for word in line.decode('utf-8').split(' '):
72 | if word_vector_dict.has_key(word):
73 | seq.append(word_vector_dict[word])
74 | else:
75 | break
76 | file_object.close()
77 |
78 | def vector_sqrtlen(vector):
79 | len = 0
80 | for item in vector:
81 | len += item * item
82 | len = math.sqrt(len)
83 | return len
84 |
85 | def vector_cosine(v1, v2):
86 | if len(v1) != len(v2):
87 | sys.exit(1)
88 | sqrtlen1 = vector_sqrtlen(v1)
89 | sqrtlen2 = vector_sqrtlen(v2)
90 | value = 0
91 | for item1, item2 in zip(v1, v2):
92 | value += item1 * item2
93 | return value / (sqrtlen1*sqrtlen2)
94 |
95 |
96 | def vector2word(vector):
97 | max_cos = -10000
98 | match_word = ''
99 | for word in word_vector_dict:
100 | v = word_vector_dict[word]
101 | cosine = vector_cosine(vector, v)
102 | if cosine > max_cos:
103 | max_cos = cosine
104 | match_word = word
105 | return (match_word, max_cos)
106 |
107 |
108 | class MySeq2Seq(object):
109 | """
110 | 思路:输入输出序列一起作为input,然后通过slice和unpack切分
111 | 完全按照论文说的编码器解码器来做
112 | 输出的时候把解码器的输出按照词向量的200维展平,这样输出就是(?,seqlen*200)
113 | 这样就可以通过regression来做回归计算了,输入的y也展平,保持一致
114 | """
115 | def __init__(self, max_seq_len = 16, word_vec_dim = 200):
116 | self.max_seq_len = max_seq_len
117 | self.word_vec_dim = word_vec_dim
118 |
119 | def generate_trainig_data(self):
120 | load_vectors("./vectors.bin")
121 | init_seq()
122 | xy_data = []
123 | y_data = []
124 | for i in range(30,40,10):
125 | # 问句、答句都是16字,所以取32个
126 | start = i*self.max_seq_len*2
127 | middle = i*self.max_seq_len*2 + self.max_seq_len
128 | end = (i+1)*self.max_seq_len*2
129 | sequence_xy = seq[start:end]
130 | sequence_y = seq[middle:end]
131 | print "right answer"
132 | for w in sequence_y:
133 | (match_word, max_cos) = vector2word(w)
134 | print match_word
135 | sequence_y = [np.ones(self.word_vec_dim)] + sequence_y
136 | xy_data.append(sequence_xy)
137 | y_data.append(sequence_y)
138 |
139 | return np.array(xy_data), np.array(y_data)
140 |
141 |
142 | def model(self, feed_previous=False):
143 | # 通过输入的XY生成encoder_inputs和带GO头的decoder_inputs
144 | input_data = tflearn.input_data(shape=[None, self.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name = "XY")
145 | encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in")
146 | decoder_inputs_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len-1, self.word_vec_dim], name="dec_in_tmp")
147 | go_inputs = tf.ones_like(decoder_inputs_tmp)
148 | go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
149 | decoder_inputs = tf.concat(1, [go_inputs, decoder_inputs_tmp], name="dec_in")
150 |
151 | # 编码器
152 | # 把encoder_inputs交给编码器,返回一个输出(预测序列的第一个值)和一个状态(传给解码器)
153 | (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, self.word_vec_dim, return_state=True, scope='encoder_lstm')
154 | encoder_output_sequence = tf.pack([encoder_output_tensor], axis=1)
155 |
156 | # 解码器
157 | # 预测过程用前一个时间序的输出作为下一个时间序的输入
158 | # 先用编码器的最后一个输出作为第一个输入
159 | if feed_previous:
160 | first_dec_input = go_inputs
161 | else:
162 | first_dec_input = tf.slice(decoder_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
163 | decoder_output_tensor = tflearn.lstm(first_dec_input, self.word_vec_dim, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm')
164 | decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
165 | decoder_output_sequence_list = [decoder_output_tensor]
166 | # 再用解码器的输出作为下一个时序的输入
167 | for i in range(self.max_seq_len-1):
168 | if feed_previous:
169 | next_dec_input = decoder_output_sequence_single
170 | else:
171 | next_dec_input = tf.slice(decoder_inputs, [0, i+1, 0], [-1, 1, self.word_vec_dim])
172 | decoder_output_tensor = tflearn.lstm(next_dec_input, self.word_vec_dim, return_seq=False, reuse=True, scope='decoder_lstm')
173 | decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
174 | decoder_output_sequence_list.append(decoder_output_tensor)
175 |
176 | decoder_output_sequence = tf.pack(decoder_output_sequence_list, axis=1)
177 | real_output_sequence = tf.concat(1, [encoder_output_sequence, decoder_output_sequence])
178 |
179 | net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
180 | model = tflearn.DNN(net)
181 | return model
182 |
183 | def train(self):
184 | trainXY, trainY = self.generate_trainig_data()
185 | model = self.model(feed_previous=False)
186 | model.fit(trainXY, trainY, n_epoch=1000, snapshot_epoch=False)
187 | model.save('./model/model')
188 | return model
189 |
190 | def load(self):
191 | model = self.model(feed_previous=True)
192 | model.load('./model/model')
193 | return model
194 |
195 | if __name__ == '__main__':
196 | phrase = sys.argv[1]
197 | my_seq2seq = MySeq2Seq(word_vec_dim=word_vec_dim, max_seq_len=max_seq_len)
198 | if phrase == 'train':
199 | my_seq2seq.train()
200 | else:
201 | model = my_seq2seq.load()
202 | trainXY, trainY = my_seq2seq.generate_trainig_data()
203 | predict = model.predict(trainXY)
204 | for sample in predict:
205 | print "predict answer"
206 | for w in sample[1:]:
207 | (match_word, max_cos) = vector2word(w)
208 | print match_word, max_cos
209 |
--------------------------------------------------------------------------------
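
A note on decoding in my_seq2seq.py above: vector2word() finds the output word by scanning word_vector_dict and recomputing two vector norms per candidate, which dominates prediction time once the vocabulary is large. Stacking the vocabulary into a matrix turns the same cosine search into one matrix-vector product; the sketch below uses numpy and hypothetical helper names (build_matrix, vector2word_fast) that are not part of the original script.

import numpy as np

def build_matrix(word_vector_dict):
    # Stack the dict used by my_seq2seq.py into a word list and a unit-normalized matrix.
    words = list(word_vector_dict.keys())
    mat = np.array([word_vector_dict[w] for w in words], dtype=np.float32)
    mat /= np.linalg.norm(mat, axis=1, keepdims=True)
    return words, mat

def vector2word_fast(vector, words, mat):
    # Same result as vector2word(): the word with the highest cosine similarity, plus that score.
    v = np.asarray(vector, dtype=np.float32)
    v = v / np.linalg.norm(v)
    scores = mat @ v
    best = int(np.argmax(scores))
    return words[best], float(scores[best])
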
/subtitle/subtitle_crawler/spiders/subtitle_spider.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding( "utf-8" )
6 |
7 | import scrapy
8 | from w3lib.html import remove_tags
9 | from subtitle_crawler.items import SubtitleCrawlerItem
10 |
11 | class SubTitleSpider(scrapy.Spider):
12 | name = "subtitle"
13 | allowed_domains = ["zimuku.net"]
14 | start_urls = [
15 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=900",
16 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=901",
17 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=902",
18 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=903",
19 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=904",
20 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=905",
21 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=906",
22 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=907",
23 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=908",
24 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=909",
25 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=910",
26 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=911",
27 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=912",
28 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=913",
29 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=914",
30 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=915",
31 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=916",
32 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=917",
33 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=918",
34 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=919",
35 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=920",
36 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=921",
37 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=922",
38 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=923",
39 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=924",
40 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=925",
41 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=926",
42 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=927",
43 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=928",
44 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=929",
45 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=930",
46 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=931",
47 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=932",
48 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=933",
49 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=934",
50 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=935",
51 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=936",
52 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=937",
53 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=938",
54 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=939",
55 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=940",
56 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=941",
57 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=942",
58 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=943",
59 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=944",
60 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=945",
61 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=946",
62 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=947",
63 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=948",
64 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=949",
65 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=950",
66 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=951",
67 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=952",
68 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=953",
69 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=954",
70 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=955",
71 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=956",
72 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=957",
73 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=958",
74 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=959",
75 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=960",
76 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=961",
77 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=962",
78 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=963",
79 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=964",
80 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=965",
81 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=966",
82 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=967",
83 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=968",
84 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=969",
85 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=970",
86 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=971",
87 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=972",
88 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=973",
89 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=974",
90 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=975",
91 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=976",
92 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=977",
93 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=978",
94 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=979",
95 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=980",
96 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=981",
97 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=982",
98 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=983",
99 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=984",
100 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=985",
101 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=986",
102 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=987",
103 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=988",
104 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=989",
105 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=990",
106 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=991",
107 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=992",
108 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=993",
109 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=994",
110 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=995",
111 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=996",
112 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=997",
113 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=998",
114 | "http://www.zimuku.net/search?q=&t=onlyst&ad=1&p=999",
115 | ]
116 |
117 | def parse(self, response):
118 | hrefs = response.selector.xpath('//div[contains(@class, "persub")]/h1/a/@href').extract()
119 | for href in hrefs:
120 | url = response.urljoin(href)
121 | request = scrapy.Request(url, callback=self.parse_detail)
122 | yield request
123 |
124 | def parse_detail(self, response):
125 | url = response.selector.xpath('//li[contains(@class, "dlsub")]/div/a/@href').extract()[0]
126 | print "processing: ", url
127 | request = scrapy.Request(url, callback=self.parse_file)
128 | yield request
129 |
130 | def parse_file(self, response):
131 | body = response.body
132 | item = SubtitleCrawlerItem()
133 | item['url'] = response.url
134 | item['body'] = body
135 | return item
136 |
--------------------------------------------------------------------------------