├── .gitignore ├── offline ├── Makefile ├── conf │ ├── log4cpp.conf │ └── offline.conf ├── data │ ├── stopWords.utf8 │ └── stop_words.utf8 ├── include │ ├── Configure.h │ ├── DirScanner.h │ ├── MyLogger.h │ ├── Page.h │ ├── PageLib.h │ ├── RssReader.h │ ├── Simhasher.hpp │ ├── WordSegmentation.h │ ├── cppjieba │ │ ├── DictTrie.hpp │ │ ├── FullSegment.hpp │ │ ├── HMMModel.hpp │ │ ├── HMMSegment.hpp │ │ ├── Jieba.hpp │ │ ├── KeywordExtractor.hpp │ │ ├── MPSegment.hpp │ │ ├── MixSegment.hpp │ │ ├── PosTagger.hpp │ │ ├── PreFilter.hpp │ │ ├── QuerySegment.hpp │ │ ├── SegmentBase.hpp │ │ ├── SegmentTagged.hpp │ │ ├── TextRankExtractor.hpp │ │ ├── Trie.hpp │ │ ├── Unicode.hpp │ │ └── limonp │ │ │ ├── ArgvContext.hpp │ │ │ ├── BlockingQueue.hpp │ │ │ ├── BoundedBlockingQueue.hpp │ │ │ ├── BoundedQueue.hpp │ │ │ ├── Closure.hpp │ │ │ ├── Colors.hpp │ │ │ ├── Condition.hpp │ │ │ ├── Config.hpp │ │ │ ├── FileLock.hpp │ │ │ ├── ForcePublic.hpp │ │ │ ├── LocalVector.hpp │ │ │ ├── Logging.hpp │ │ │ ├── Md5.hpp │ │ │ ├── MutexLock.hpp │ │ │ ├── NonCopyable.hpp │ │ │ ├── StdExtension.hpp │ │ │ ├── StringUtil.hpp │ │ │ ├── Thread.hpp │ │ │ └── ThreadPool.hpp │ ├── jenkins.h │ └── tinyxml2.h ├── src │ ├── Configure.cc │ ├── DirScanner.cc │ ├── MyLogger.cc │ ├── Page.cc │ ├── PageLib.cc │ ├── RssReader.cc │ ├── main.cc │ └── tinyxml2.cc └── tinyse_offline ├── online ├── Makefile ├── conf │ ├── log4cpp.conf │ └── online.conf ├── data │ └── stop_words.utf8 ├── include │ ├── Acceptor.h │ ├── Buffer.h │ ├── Callbacks.h │ ├── Channel.h │ ├── Condition.h │ ├── Configure.h │ ├── CurrentThread.h │ ├── Epoller.h │ ├── EventLoop.h │ ├── EventLoopThread.h │ ├── InetAddress.h │ ├── MutexLock.h │ ├── MyLogger.h │ ├── Page.h │ ├── Poller.hpp │ ├── Simhasher.hpp │ ├── Socket.h │ ├── SocketsOps.h │ ├── TcpConnection.h │ ├── TcpServer.h │ ├── Thread.h │ ├── ThreadPool.h │ ├── TimerQueue.h │ ├── Timestamp.h │ ├── Uncopyable.h │ ├── WordQuery.h │ ├── WordSegmentation.h │ └── cppjieba │ │ ├── 
DictTrie.hpp │ │ ├── FullSegment.hpp │ │ ├── HMMModel.hpp │ │ ├── HMMSegment.hpp │ │ ├── Jieba.hpp │ │ ├── KeywordExtractor.hpp │ │ ├── MPSegment.hpp │ │ ├── MixSegment.hpp │ │ ├── PosTagger.hpp │ │ ├── PreFilter.hpp │ │ ├── QuerySegment.hpp │ │ ├── SegmentBase.hpp │ │ ├── SegmentTagged.hpp │ │ ├── TextRankExtractor.hpp │ │ ├── Trie.hpp │ │ ├── Unicode.hpp │ │ └── limonp │ │ ├── ArgvContext.hpp │ │ ├── BlockingQueue.hpp │ │ ├── BoundedBlockingQueue.hpp │ │ ├── BoundedQueue.hpp │ │ ├── Closure.hpp │ │ ├── Colors.hpp │ │ ├── Condition.hpp │ │ ├── Config.hpp │ │ ├── FileLock.hpp │ │ ├── ForcePublic.hpp │ │ ├── LocalVector.hpp │ │ ├── Logging.hpp │ │ ├── Md5.hpp │ │ ├── MutexLock.hpp │ │ ├── NonCopyable.hpp │ │ ├── StdExtension.hpp │ │ ├── StringUtil.hpp │ │ ├── Thread.hpp │ │ └── ThreadPool.hpp ├── log │ ├── tiny_se.log.01 │ └── tiny_se.log.02 ├── src │ ├── Acceptor.cc │ ├── Buffer.cc │ ├── Channel.cc │ ├── Configure.cc │ ├── Epoller.cc │ ├── EventLoop.cc │ ├── EventLoopThread.cc │ ├── Makefile │ ├── MyLogger.cc │ ├── Page.cc │ ├── TcpConnection.cc │ ├── TcpServer.cc │ ├── Thread.cc │ ├── ThreadPool.cc │ ├── TimerQueue.cc │ ├── Timestamp.cc │ ├── TinySearchEngine.cc │ ├── WordQuery.cc │ └── test_file │ │ ├── Poller.cpp │ │ ├── test1 │ │ ├── test1.cpp │ │ ├── test10 │ │ ├── test10.cpp │ │ ├── test11 │ │ ├── test11.cpp │ │ ├── test2 │ │ ├── test2.cpp │ │ ├── test3 │ │ ├── test3.cpp │ │ ├── test4 │ │ ├── test4.cpp │ │ ├── test5 │ │ ├── test5.cpp │ │ ├── test6 │ │ ├── test6.cpp │ │ ├── test7 │ │ ├── test7.cpp │ │ ├── test8 │ │ ├── test8.cpp │ │ └── test9.cpp ├── test │ ├── test.cc │ ├── test1.cc │ ├── test2.cc │ ├── test3.cc │ └── test4.cc └── tinyse_online └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | ## 以下这些文件不会被git跟踪 ## 2 | *.o 3 | *.txt 4 | *.log 5 | *.lib 6 | *.a 7 | *.so 8 | *.dat 9 | *.out 10 | *.xml 11 | -------------------------------------------------------------------------------- 
/offline/Makefile: -------------------------------------------------------------------------------- 1 | SRCS:=$(wildcard ./src/*.cc) 2 | OBJS:=$(patsubst %.cc,%.o,$(SRCS)) 3 | ELF:= tinyse_offline 4 | CC:=g++ 5 | CXXFLAGS:=-std=c++11 -g -Wall 6 | $(ELF):$(OBJS) 7 | g++ $^ -o $@ -lpthread -llog4cpp $(CXXFLAGS) 8 | .PHONY:clean 9 | clean: 10 | rm -rf $(ELF) $(OBJS) 11 | 12 | -------------------------------------------------------------------------------- /offline/conf/log4cpp.conf: -------------------------------------------------------------------------------- 1 | log4cpp.rootCategory=DEBUG, console, rollAppender 2 | 3 | log4cpp.appender.console=ConsoleAppender 4 | log4cpp.appender.console.layout=PatternLayout 5 | log4cpp.appender.console.layout.ConversionPattern=%d [%p] %m%n 6 | 7 | log4cpp.appender.rollAppender=RollingFileAppender 8 | log4cpp.appender.rollAppender.fileName=../log/tiny_se.log 9 | log4cpp.appender.rollAppender.maxFileSize=1048576 #1MB 10 | log4cpp.appender.rollAppender.maxBackupIndex=10 11 | log4cpp.appender.rollAppender.layout=PatternLayout 12 | log4cpp.appender.rollAppender.layout.ConversionPattern=%d [%p] %m%n 13 | 14 | -------------------------------------------------------------------------------- /offline/conf/offline.conf: -------------------------------------------------------------------------------- 1 | corpus data/rss/ 2 | pagelib data/pagelib.dat 3 | offsetlib data/offsetlib.dat 4 | invertedindexlib data/invertedindexlib.dat 5 | stopwords data/stop_words.utf8 6 | -------------------------------------------------------------------------------- /offline/data/stopWords.utf8: -------------------------------------------------------------------------------- 1 | 了 2 | 呢 3 | 吗 4 | 的 5 | 得 6 | 嗯 7 | 之 8 | -------------------------------------------------------------------------------- /offline/include/Configure.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 
| * file: Configure.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-18 18:08:35 5 | **********************************************/ 6 | #ifndef __CONFIGURE_H__ 7 | #define __CONFIGURE_H__ 8 | #include 9 | #include 10 | #include 11 | #include 12 | using std::string; using std::map; using std::ifstream; 13 | using std::set; 14 | 15 | namespace tinyse { 16 | 17 | /* 读取配置文件 */ 18 | class Configure { 19 | public: 20 | Configure(const string &filepath); 21 | ~Configure() { } 22 | 23 | map getConfigMap(); //获取存放配置信息的map 24 | set& getStopWords(); //获取停用词集 25 | void print() const; //for debug; 26 | 27 | private: 28 | void defaultConfig(); //默认配置 29 | void loadConfig(ifstream &configs); //加载配置信息 30 | void readStopWords(); //读取停用词集 31 | 32 | private: 33 | map m_configMap; //数据文件-->存储路径 34 | set m_stopWords; //停用词集 35 | }; 36 | 37 | } //end of namespace tinyse 38 | 39 | #endif /* __CONFIGURE_H__ */ 40 | -------------------------------------------------------------------------------- /offline/include/DirScanner.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: DirScanner.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-18 21:49:36 5 | **********************************************/ 6 | #ifndef __DIRSCANNER_H__ 7 | #define __DIRSCANNER_H__ 8 | #include 9 | #include 10 | using std::vector; using std::string; 11 | 12 | namespace tinyse { 13 | 14 | class Configure; 15 | 16 | class DirScanner { 17 | public: 18 | DirScanner(Configure &config); 19 | 20 | const vector& getCorpusPages() const; 21 | void print() const; //for debug 22 | 23 | private: 24 | void traverse(const string &corpusDir); 25 | 26 | private: 27 | vector m_vecCorpus; 28 | Configure &m_config; 29 | }; 30 | 31 | } //end of namespace tinyse 32 | 33 | #endif /* __DIRSCANNER_H__ */ 34 | -------------------------------------------------------------------------------- /offline/include/MyLogger.h: 
-------------------------------------------------------------------------------- 1 | #ifndef __MYLOGGER_H__ 2 | #define __MYLOGGER_H__ 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define prefix(msg) (std::string("[")+__FILE__+":"+__FUNCTION__+":"+std::to_string(__LINE__)+"] "+msg).c_str() 9 | #define LogError(msg, ...) MyLogger::getInstance()->error(prefix(msg), ##__VA_ARGS__) 10 | #define LogWarn(msg, ...) MyLogger::getInstance()->warn(prefix(msg), ##__VA_ARGS__) 11 | #define LogInfo(msg, ...) MyLogger::getInstance()->info(prefix(msg), ##__VA_ARGS__) 12 | #define LogDebug(msg, ...) MyLogger::getInstance()->debug(prefix(msg), ##__VA_ARGS__) 13 | 14 | class MyLogger { 15 | public: 16 | static MyLogger* getInstance() { 17 | if(nullptr == m_pInstance) { 18 | pthread_once(&m_once_control, init); //确保线程安全 19 | } 20 | return m_pInstance; 21 | } 22 | 23 | void error(const char *msg); 24 | 25 | template 26 | void error(Args ... args) { m_logger.error(args...); } 27 | 28 | void warn(const char *msg); 29 | 30 | template 31 | void warn(Args ... args) { m_logger.warn(args...); } 32 | 33 | void info(const char *msg); 34 | 35 | template 36 | void info(Args ... args) { m_logger.info(args...); } 37 | 38 | void debug(const char *msg); 39 | 40 | template 41 | void debug(Args ... 
args) { m_logger.debug(args...); } 42 | 43 | private: 44 | MyLogger(); 45 | ~MyLogger(); 46 | 47 | static void destroy() { 48 | if(nullptr != m_pInstance) { 49 | delete m_pInstance; 50 | m_pInstance = nullptr; 51 | } 52 | } 53 | 54 | static void init() { 55 | m_pInstance = new MyLogger(); 56 | atexit(destroy); 57 | } 58 | 59 | private: 60 | static MyLogger *m_pInstance; 61 | static pthread_once_t m_once_control; 62 | log4cpp::Category & m_logger; 63 | }; 64 | 65 | 66 | #endif /* __MYLOGGER_H__ */ 67 | -------------------------------------------------------------------------------- /offline/include/Page.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Page.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-19 18:15:15 5 | **********************************************/ 6 | #ifndef __PAGE_H__ 7 | #define __PAGE_H__ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "Simhasher.hpp" 13 | using std::string; using std::vector; using std::map; 14 | using std::ostream; 15 | 16 | 17 | namespace tinyse { 18 | 19 | class Configure; 20 | class WordSegmentation; 21 | 22 | class Page { 23 | friend ostream& operator<<(ostream &os, const Page &page); 24 | friend bool operator==(const Page &lhs, const Page &rhs); 25 | public: 26 | Page() { } 27 | 28 | void setDocID(const size_t &docid); 29 | void setTitle(const string &title); 30 | void setLink(const string &url); 31 | void setContent(const string &content); 32 | void parse(Configure &config, WordSegmentation &jieba, simhash::Simhasher &simhasher); 33 | map& getWordsMap(); 34 | size_t getDocID() const; 35 | string getTitle() const; 36 | string getContent() const; 37 | void clear(); 38 | void operator=(const Page &rhs); 39 | 40 | private: 41 | size_t m_docid; 42 | string m_title; 43 | string m_link; 44 | string m_content; 45 | 46 | map m_wordsMap; //分词(去停用词)之后的词:词频 47 | uint64_t m_simhash; //该Page的simhash值 48 | 
}; 49 | 50 | } //end of namespace tinyse 51 | 52 | #endif /* __PAGE_H__ */ 53 | -------------------------------------------------------------------------------- /offline/include/PageLib.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: PageLib.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-18 21:45:55 5 | **********************************************/ 6 | #ifndef __PAGELIB_H__ 7 | #define __PAGELIB_H__ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "Page.h" 13 | using std::vector; using std::string; using std::map; 14 | using std::pair; using std::unordered_map; 15 | 16 | namespace tinyse { 17 | 18 | class Configure; 19 | 20 | class PageLib { 21 | public: 22 | PageLib(Configure &config); 23 | void doit(); 24 | 25 | private: 26 | void load(); //加载并处理原始网页 27 | void dedup(); //页面去重 28 | void buildInvertedIndex(); //建立倒排索引 29 | void store(); //存储网页库/偏移库/倒排索引库 30 | 31 | private: 32 | Configure &m_config; //配置 33 | vector m_vecPages; //存放格式化之后的网页 34 | unordered_map>> m_invertedIndex; //倒排索引 35 | }; 36 | 37 | } //end of namespace tinyse 38 | 39 | #endif /* __PAGELIB_H__ */ 40 | -------------------------------------------------------------------------------- /offline/include/RssReader.h: -------------------------------------------------------------------------------- 1 | #ifndef __RSSREADER_H__ 2 | #define __RSSREADER_H__ 3 | #include 4 | #include 5 | #include "Simhasher.hpp" 6 | using std::string; using std::vector; 7 | 8 | namespace tinyse { 9 | 10 | class Page; 11 | class Configure; 12 | class WordSegmentation; 13 | 14 | class RssReader{ 15 | public: 16 | RssReader(vector &vecPages) : m_vecPages(vecPages) { } 17 | 18 | bool parseRss(const string &xmlPath, Configure &config, WordSegmentation &jieba, simhash::Simhasher &simhasher); //解析XML, 并将结果存入m_rss 19 | 20 | private: 21 | vector &m_vecPages; //存放解析结果 22 | }; 23 | 24 | } //end of namespace 
tinyse 25 | 26 | #endif /* __RSSREADER_H__ */ 27 | -------------------------------------------------------------------------------- /offline/include/WordSegmentation.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: WordSegmentation.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-19 21:48:01 5 | **********************************************/ 6 | #ifndef __WORDSEGMENTATION_H__ 7 | #define __WORDSEGMENTATION_H__ 8 | #include "cppjieba/Jieba.hpp" 9 | #include "MyLogger.h" 10 | #include 11 | #include 12 | using std::vector; using std::string; 13 | 14 | const char* const DICT_PATH = "/home/wzjj1314/github/cppjieba/dict/jieba.dict.utf8"; 15 | const char* const HMM_PATH = "/home/wzjj1314/github/cppjieba/dict/hmm_model.utf8"; 16 | const char* const USER_DICT_PATH = "/home/wzjj1314/github/cppjieba/dict/user.dict.utf8"; 17 | const char* const IDF_PATH = "/home/wzjj1314/github/cppjieba/dict/idf.utf8"; 18 | const char* const STOP_WORD_PATH = "/home/wzjj1314/github/cppjieba/dict/stop_words.utf8"; 19 | 20 | namespace tinyse { 21 | 22 | class WordSegmentation { 23 | public: 24 | WordSegmentation() : m_jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH) { 25 | LogInfo("cppjieba init!"); 26 | } 27 | 28 | vector operator()(const string &src) { 29 | vector words; 30 | m_jieba.CutAll(src, words); 31 | return words; 32 | } 33 | 34 | private: 35 | cppjieba::Jieba m_jieba; 36 | }; 37 | 38 | } //end of namespace tinyse 39 | 40 | #endif /* __WORDSEGMENTATION_H__ */ 41 | -------------------------------------------------------------------------------- /offline/include/cppjieba/FullSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_FULLSEGMENT_H 2 | #define CPPJIEBA_FULLSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include 
"SegmentBase.hpp" 10 | #include "Unicode.hpp" 11 | 12 | namespace cppjieba { 13 | class FullSegment: public SegmentBase { 14 | public: 15 | FullSegment(const string& dictPath) { 16 | dictTrie_ = new DictTrie(dictPath); 17 | isNeedDestroy_ = true; 18 | } 19 | FullSegment(const DictTrie* dictTrie) 20 | : dictTrie_(dictTrie), isNeedDestroy_(false) { 21 | assert(dictTrie_); 22 | } 23 | ~FullSegment() { 24 | if (isNeedDestroy_) { 25 | delete dictTrie_; 26 | } 27 | } 28 | void Cut(const string& sentence, 29 | vector& words) const { 30 | vector tmp; 31 | Cut(sentence, tmp); 32 | GetStringsFromWords(tmp, words); 33 | } 34 | void Cut(const string& sentence, 35 | vector& words) const { 36 | PreFilter pre_filter(symbols_, sentence); 37 | PreFilter::Range range; 38 | vector wrs; 39 | wrs.reserve(sentence.size()/2); 40 | while (pre_filter.HasNext()) { 41 | range = pre_filter.Next(); 42 | Cut(range.begin, range.end, wrs); 43 | } 44 | words.clear(); 45 | words.reserve(wrs.size()); 46 | GetWordsFromWordRanges(sentence, wrs, words); 47 | } 48 | void Cut(RuneStrArray::const_iterator begin, 49 | RuneStrArray::const_iterator end, 50 | vector& res) const { 51 | // resut of searching in trie tree 52 | LocalVector > tRes; 53 | 54 | // max index of res's words 55 | size_t maxIdx = 0; 56 | 57 | // always equals to (uItr - begin) 58 | size_t uIdx = 0; 59 | 60 | // tmp variables 61 | size_t wordLen = 0; 62 | assert(dictTrie_); 63 | vector dags; 64 | dictTrie_->Find(begin, end, dags); 65 | for (size_t i = 0; i < dags.size(); i++) { 66 | for (size_t j = 0; j < dags[i].nexts.size(); j++) { 67 | size_t nextoffset = dags[i].nexts[j].first; 68 | assert(nextoffset < dags.size()); 69 | const DictUnit* du = dags[i].nexts[j].second; 70 | if (du == NULL) { 71 | if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { 72 | WordRange wr(begin + i, begin + nextoffset); 73 | res.push_back(wr); 74 | } 75 | } else { 76 | wordLen = du->word.size(); 77 | if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= 
uIdx)) { 78 | WordRange wr(begin + i, begin + nextoffset); 79 | res.push_back(wr); 80 | } 81 | } 82 | maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx; 83 | } 84 | uIdx++; 85 | } 86 | } 87 | private: 88 | const DictTrie* dictTrie_; 89 | bool isNeedDestroy_; 90 | }; 91 | } 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /offline/include/cppjieba/HMMModel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_HMMMODEL_H 2 | #define CPPJIEBA_HMMMODEL_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "Trie.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | using namespace limonp; 10 | typedef unordered_map EmitProbMap; 11 | 12 | struct HMMModel { 13 | /* 14 | * STATUS: 15 | * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S 16 | * */ 17 | enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; 18 | 19 | HMMModel(const string& modelPath) { 20 | memset(startProb, 0, sizeof(startProb)); 21 | memset(transProb, 0, sizeof(transProb)); 22 | statMap[0] = 'B'; 23 | statMap[1] = 'E'; 24 | statMap[2] = 'M'; 25 | statMap[3] = 'S'; 26 | emitProbVec.push_back(&emitProbB); 27 | emitProbVec.push_back(&emitProbE); 28 | emitProbVec.push_back(&emitProbM); 29 | emitProbVec.push_back(&emitProbS); 30 | LoadModel(modelPath); 31 | } 32 | ~HMMModel() { 33 | } 34 | void LoadModel(const string& filePath) { 35 | ifstream ifile(filePath.c_str()); 36 | XCHECK(ifile.is_open()) << "open " << filePath << " failed"; 37 | string line; 38 | vector tmp; 39 | vector tmp2; 40 | //Load startProb 41 | XCHECK(GetLine(ifile, line)); 42 | Split(line, tmp, " "); 43 | XCHECK(tmp.size() == STATUS_SUM); 44 | for (size_t j = 0; j< tmp.size(); j++) { 45 | startProb[j] = atof(tmp[j].c_str()); 46 | } 47 | 48 | //Load transProb 49 | for (size_t i = 0; i < STATUS_SUM; i++) { 50 | XCHECK(GetLine(ifile, line)); 51 | Split(line, tmp, " "); 52 | XCHECK(tmp.size() == STATUS_SUM); 53 | for (size_t j =0; j < 
STATUS_SUM; j++) { 54 | transProb[i][j] = atof(tmp[j].c_str()); 55 | } 56 | } 57 | 58 | //Load emitProbB 59 | XCHECK(GetLine(ifile, line)); 60 | XCHECK(LoadEmitProb(line, emitProbB)); 61 | 62 | //Load emitProbE 63 | XCHECK(GetLine(ifile, line)); 64 | XCHECK(LoadEmitProb(line, emitProbE)); 65 | 66 | //Load emitProbM 67 | XCHECK(GetLine(ifile, line)); 68 | XCHECK(LoadEmitProb(line, emitProbM)); 69 | 70 | //Load emitProbS 71 | XCHECK(GetLine(ifile, line)); 72 | XCHECK(LoadEmitProb(line, emitProbS)); 73 | } 74 | double GetEmitProb(const EmitProbMap* ptMp, Rune key, 75 | double defVal)const { 76 | EmitProbMap::const_iterator cit = ptMp->find(key); 77 | if (cit == ptMp->end()) { 78 | return defVal; 79 | } 80 | return cit->second; 81 | } 82 | bool GetLine(ifstream& ifile, string& line) { 83 | while (getline(ifile, line)) { 84 | Trim(line); 85 | if (line.empty()) { 86 | continue; 87 | } 88 | if (StartsWith(line, "#")) { 89 | continue; 90 | } 91 | return true; 92 | } 93 | return false; 94 | } 95 | bool LoadEmitProb(const string& line, EmitProbMap& mp) { 96 | if (line.empty()) { 97 | return false; 98 | } 99 | vector tmp, tmp2; 100 | Unicode unicode; 101 | Split(line, tmp, ","); 102 | for (size_t i = 0; i < tmp.size(); i++) { 103 | Split(tmp[i], tmp2, ":"); 104 | if (2 != tmp2.size()) { 105 | XLOG(ERROR) << "emitProb illegal."; 106 | return false; 107 | } 108 | if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { 109 | XLOG(ERROR) << "TransCode failed."; 110 | return false; 111 | } 112 | mp[unicode[0]] = atof(tmp2[1].c_str()); 113 | } 114 | return true; 115 | } 116 | 117 | char statMap[STATUS_SUM]; 118 | double startProb[STATUS_SUM]; 119 | double transProb[STATUS_SUM][STATUS_SUM]; 120 | EmitProbMap emitProbB; 121 | EmitProbMap emitProbE; 122 | EmitProbMap emitProbM; 123 | EmitProbMap emitProbS; 124 | vector emitProbVec; 125 | }; // struct HMMModel 126 | 127 | } // namespace cppjieba 128 | 129 | #endif 130 | 
-------------------------------------------------------------------------------- /offline/include/cppjieba/Jieba.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEAB_JIEBA_H 2 | #define CPPJIEAB_JIEBA_H 3 | 4 | #include "QuerySegment.hpp" 5 | #include "KeywordExtractor.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | class Jieba { 10 | public: 11 | Jieba(const string& dict_path, 12 | const string& model_path, 13 | const string& user_dict_path, 14 | const string& idfPath, 15 | const string& stopWordPath) 16 | : dict_trie_(dict_path, user_dict_path), 17 | model_(model_path), 18 | mp_seg_(&dict_trie_), 19 | hmm_seg_(&model_), 20 | mix_seg_(&dict_trie_, &model_), 21 | full_seg_(&dict_trie_), 22 | query_seg_(&dict_trie_, &model_), 23 | extractor(&dict_trie_, &model_, idfPath, stopWordPath) { 24 | } 25 | ~Jieba() { 26 | } 27 | 28 | struct LocWord { 29 | string word; 30 | size_t begin; 31 | size_t end; 32 | }; // struct LocWord 33 | 34 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 35 | mix_seg_.Cut(sentence, words, hmm); 36 | } 37 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 38 | mix_seg_.Cut(sentence, words, hmm); 39 | } 40 | void CutAll(const string& sentence, vector& words) const { 41 | full_seg_.Cut(sentence, words); 42 | } 43 | void CutAll(const string& sentence, vector& words) const { 44 | full_seg_.Cut(sentence, words); 45 | } 46 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 47 | query_seg_.Cut(sentence, words, hmm); 48 | } 49 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 50 | query_seg_.Cut(sentence, words, hmm); 51 | } 52 | void CutHMM(const string& sentence, vector& words) const { 53 | hmm_seg_.Cut(sentence, words); 54 | } 55 | void CutHMM(const string& sentence, vector& words) const { 56 | hmm_seg_.Cut(sentence, words); 57 | } 58 | void CutSmall(const string& sentence, vector& 
words, size_t max_word_len) const { 59 | mp_seg_.Cut(sentence, words, max_word_len); 60 | } 61 | void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { 62 | mp_seg_.Cut(sentence, words, max_word_len); 63 | } 64 | 65 | void Tag(const string& sentence, vector >& words) const { 66 | mix_seg_.Tag(sentence, words); 67 | } 68 | string LookupTag(const string &str) const { 69 | return mix_seg_.LookupTag(str); 70 | } 71 | bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { 72 | return dict_trie_.InsertUserWord(word, tag); 73 | } 74 | 75 | bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { 76 | return dict_trie_.InsertUserWord(word,freq, tag); 77 | } 78 | 79 | bool Find(const string& word) 80 | { 81 | return dict_trie_.Find(word); 82 | } 83 | 84 | void ResetSeparators(const string& s) { 85 | //TODO 86 | mp_seg_.ResetSeparators(s); 87 | hmm_seg_.ResetSeparators(s); 88 | mix_seg_.ResetSeparators(s); 89 | full_seg_.ResetSeparators(s); 90 | query_seg_.ResetSeparators(s); 91 | } 92 | 93 | const DictTrie* GetDictTrie() const { 94 | return &dict_trie_; 95 | } 96 | 97 | const HMMModel* GetHMMModel() const { 98 | return &model_; 99 | } 100 | 101 | void LoadUserDict(const vector& buf) { 102 | dict_trie_.LoadUserDict(buf); 103 | } 104 | 105 | void LoadUserDict(const set& buf) { 106 | dict_trie_.LoadUserDict(buf); 107 | } 108 | 109 | void LoadUserDict(const string& path) { 110 | dict_trie_.LoadUserDict(path); 111 | } 112 | 113 | private: 114 | DictTrie dict_trie_; 115 | HMMModel model_; 116 | 117 | // They share the same dict trie and model 118 | MPSegment mp_seg_; 119 | HMMSegment hmm_seg_; 120 | MixSegment mix_seg_; 121 | FullSegment full_seg_; 122 | QuerySegment query_seg_; 123 | 124 | public: 125 | KeywordExtractor extractor; 126 | }; // class Jieba 127 | 128 | } // namespace cppjieba 129 | 130 | #endif // CPPJIEAB_JIEBA_H 131 | 
-------------------------------------------------------------------------------- /offline/include/cppjieba/MPSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MPSEGMENT_H 2 | #define CPPJIEBA_MPSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentTagged.hpp" 10 | #include "PosTagger.hpp" 11 | 12 | namespace cppjieba { 13 | 14 | class MPSegment: public SegmentTagged { 15 | public: 16 | MPSegment(const string& dictPath, const string& userDictPath = "") 17 | : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) { 18 | } 19 | MPSegment(const DictTrie* dictTrie) 20 | : dictTrie_(dictTrie), isNeedDestroy_(false) { 21 | assert(dictTrie_); 22 | } 23 | ~MPSegment() { 24 | if (isNeedDestroy_) { 25 | delete dictTrie_; 26 | } 27 | } 28 | 29 | void Cut(const string& sentence, vector& words) const { 30 | Cut(sentence, words, MAX_WORD_LENGTH); 31 | } 32 | 33 | void Cut(const string& sentence, 34 | vector& words, 35 | size_t max_word_len) const { 36 | vector tmp; 37 | Cut(sentence, tmp, max_word_len); 38 | GetStringsFromWords(tmp, words); 39 | } 40 | void Cut(const string& sentence, 41 | vector& words, 42 | size_t max_word_len = MAX_WORD_LENGTH) const { 43 | PreFilter pre_filter(symbols_, sentence); 44 | PreFilter::Range range; 45 | vector wrs; 46 | wrs.reserve(sentence.size()/2); 47 | while (pre_filter.HasNext()) { 48 | range = pre_filter.Next(); 49 | Cut(range.begin, range.end, wrs, max_word_len); 50 | } 51 | words.clear(); 52 | words.reserve(wrs.size()); 53 | GetWordsFromWordRanges(sentence, wrs, words); 54 | } 55 | void Cut(RuneStrArray::const_iterator begin, 56 | RuneStrArray::const_iterator end, 57 | vector& words, 58 | size_t max_word_len = MAX_WORD_LENGTH) const { 59 | vector dags; 60 | dictTrie_->Find(begin, 61 | end, 62 | dags, 63 | max_word_len); 64 | CalcDP(dags); 65 | CutByDag(begin, end, dags, words); 66 
| } 67 | 68 | const DictTrie* GetDictTrie() const { 69 | return dictTrie_; 70 | } 71 | 72 | bool Tag(const string& src, vector >& res) const { 73 | return tagger_.Tag(src, res, *this); 74 | } 75 | 76 | bool IsUserDictSingleChineseWord(const Rune& value) const { 77 | return dictTrie_->IsUserDictSingleChineseWord(value); 78 | } 79 | private: 80 | void CalcDP(vector& dags) const { 81 | size_t nextPos; 82 | const DictUnit* p; 83 | double val; 84 | 85 | for (vector::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) { 86 | rit->pInfo = NULL; 87 | rit->weight = MIN_DOUBLE; 88 | assert(!rit->nexts.empty()); 89 | for (LocalVector >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) { 90 | nextPos = it->first; 91 | p = it->second; 92 | val = 0.0; 93 | if (nextPos + 1 < dags.size()) { 94 | val += dags[nextPos + 1].weight; 95 | } 96 | 97 | if (p) { 98 | val += p->weight; 99 | } else { 100 | val += dictTrie_->GetMinWeight(); 101 | } 102 | if (val > rit->weight) { 103 | rit->pInfo = p; 104 | rit->weight = val; 105 | } 106 | } 107 | } 108 | } 109 | void CutByDag(RuneStrArray::const_iterator begin, 110 | RuneStrArray::const_iterator end, 111 | const vector& dags, 112 | vector& words) const { 113 | size_t i = 0; 114 | while (i < dags.size()) { 115 | const DictUnit* p = dags[i].pInfo; 116 | if (p) { 117 | assert(p->word.size() >= 1); 118 | WordRange wr(begin + i, begin + i + p->word.size() - 1); 119 | words.push_back(wr); 120 | i += p->word.size(); 121 | } else { //single chinese word 122 | WordRange wr(begin + i, begin + i); 123 | words.push_back(wr); 124 | i++; 125 | } 126 | } 127 | } 128 | 129 | const DictTrie* dictTrie_; 130 | bool isNeedDestroy_; 131 | PosTagger tagger_; 132 | 133 | }; // class MPSegment 134 | 135 | } // namespace cppjieba 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /offline/include/cppjieba/MixSegment.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MIXSEGMENT_H 2 | #define CPPJIEBA_MIXSEGMENT_H 3 | 4 | #include 5 | #include "MPSegment.hpp" 6 | #include "HMMSegment.hpp" 7 | #include "limonp/StringUtil.hpp" 8 | #include "PosTagger.hpp" 9 | 10 | namespace cppjieba { 11 | class MixSegment: public SegmentTagged { 12 | public: 13 | MixSegment(const string& mpSegDict, const string& hmmSegDict, 14 | const string& userDict = "") 15 | : mpSeg_(mpSegDict, userDict), 16 | hmmSeg_(hmmSegDict) { 17 | } 18 | MixSegment(const DictTrie* dictTrie, const HMMModel* model) 19 | : mpSeg_(dictTrie), hmmSeg_(model) { 20 | } 21 | ~MixSegment() { 22 | } 23 | 24 | void Cut(const string& sentence, vector& words) const { 25 | Cut(sentence, words, true); 26 | } 27 | void Cut(const string& sentence, vector& words, bool hmm) const { 28 | vector tmp; 29 | Cut(sentence, tmp, hmm); 30 | GetStringsFromWords(tmp, words); 31 | } 32 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 33 | PreFilter pre_filter(symbols_, sentence); 34 | PreFilter::Range range; 35 | vector wrs; 36 | wrs.reserve(sentence.size() / 2); 37 | while (pre_filter.HasNext()) { 38 | range = pre_filter.Next(); 39 | Cut(range.begin, range.end, wrs, hmm); 40 | } 41 | words.clear(); 42 | words.reserve(wrs.size()); 43 | GetWordsFromWordRanges(sentence, wrs, words); 44 | } 45 | 46 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 47 | if (!hmm) { 48 | mpSeg_.Cut(begin, end, res); 49 | return; 50 | } 51 | vector words; 52 | assert(end >= begin); 53 | words.reserve(end - begin); 54 | mpSeg_.Cut(begin, end, words); 55 | 56 | vector hmmRes; 57 | hmmRes.reserve(end - begin); 58 | for (size_t i = 0; i < words.size(); i++) { 59 | //if mp Get a word, it's ok, put it into result 60 | if (words[i].left != words[i].right || (words[i].left == words[i].right && 
mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { 61 | res.push_back(words[i]); 62 | continue; 63 | } 64 | 65 | // if mp Get a single one and it is not in userdict, collect it in sequence 66 | size_t j = i; 67 | while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { 68 | j++; 69 | } 70 | 71 | // Cut the sequence with hmm 72 | assert(j - 1 >= i); 73 | // TODO 74 | hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes); 75 | //put hmm result to result 76 | for (size_t k = 0; k < hmmRes.size(); k++) { 77 | res.push_back(hmmRes[k]); 78 | } 79 | 80 | //clear tmp vars 81 | hmmRes.clear(); 82 | 83 | //let i jump over this piece 84 | i = j - 1; 85 | } 86 | } 87 | 88 | const DictTrie* GetDictTrie() const { 89 | return mpSeg_.GetDictTrie(); 90 | } 91 | 92 | bool Tag(const string& src, vector >& res) const { 93 | return tagger_.Tag(src, res, *this); 94 | } 95 | 96 | string LookupTag(const string &str) const { 97 | return tagger_.LookupTag(str, *this); 98 | } 99 | 100 | private: 101 | MPSegment mpSeg_; 102 | HMMSegment hmmSeg_; 103 | PosTagger tagger_; 104 | 105 | }; // class MixSegment 106 | 107 | } // namespace cppjieba 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /offline/include/cppjieba/PosTagger.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_POS_TAGGING_H 2 | #define CPPJIEBA_POS_TAGGING_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "SegmentTagged.hpp" 6 | #include "DictTrie.hpp" 7 | 8 | namespace cppjieba { 9 | using namespace limonp; 10 | 11 | static const char* const POS_M = "m"; 12 | static const char* const POS_ENG = "eng"; 13 | static const char* const POS_X = "x"; 14 | 15 | class PosTagger { 16 | public: 17 | PosTagger() { 18 | } 19 | ~PosTagger() { 20 | } 21 | 22 | bool Tag(const string& src, vector >& res, const SegmentTagged& segment) const { 23 | vector 
CutRes; 24 | segment.Cut(src, CutRes); 25 | 26 | for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { 27 | res.push_back(make_pair(*itr, LookupTag(*itr, segment))); 28 | } 29 | return !res.empty(); 30 | } 31 | 32 | string LookupTag(const string &str, const SegmentTagged& segment) const { 33 | const DictUnit *tmp = NULL; 34 | RuneStrArray runes; 35 | const DictTrie * dict = segment.GetDictTrie(); 36 | assert(dict != NULL); 37 | if (!DecodeRunesInString(str, runes)) { 38 | XLOG(ERROR) << "Decode failed."; 39 | return POS_X; 40 | } 41 | tmp = dict->Find(runes.begin(), runes.end()); 42 | if (tmp == NULL || tmp->tag.empty()) { 43 | return SpecialRule(runes); 44 | } else { 45 | return tmp->tag; 46 | } 47 | } 48 | 49 | private: 50 | const char* SpecialRule(const RuneStrArray& unicode) const { 51 | size_t m = 0; 52 | size_t eng = 0; 53 | for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { 54 | if (unicode[i].rune < 0x80) { 55 | eng ++; 56 | if ('0' <= unicode[i].rune && unicode[i].rune <= '9') { 57 | m++; 58 | } 59 | } 60 | } 61 | // ascii char is not found 62 | if (eng == 0) { 63 | return POS_X; 64 | } 65 | // all the ascii is number char 66 | if (m == eng) { 67 | return POS_M; 68 | } 69 | // the ascii chars contain english letter 70 | return POS_ENG; 71 | } 72 | 73 | }; // class PosTagger 74 | 75 | } // namespace cppjieba 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /offline/include/cppjieba/PreFilter.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_PRE_FILTER_H 2 | #define CPPJIEBA_PRE_FILTER_H 3 | 4 | #include "Trie.hpp" 5 | #include "limonp/Logging.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | class PreFilter { 10 | public: 11 | //TODO use WordRange instead of Range 12 | struct Range { 13 | RuneStrArray::const_iterator begin; 14 | RuneStrArray::const_iterator end; 15 | }; // struct Range 16 | 17 | PreFilter(const 
unordered_set& symbols, 18 | const string& sentence) 19 | : symbols_(symbols) { 20 | if (!DecodeRunesInString(sentence, sentence_)) { 21 | XLOG(ERROR) << "decode failed. "; 22 | } 23 | cursor_ = sentence_.begin(); 24 | } 25 | ~PreFilter() { 26 | } 27 | bool HasNext() const { 28 | return cursor_ != sentence_.end(); 29 | } 30 | Range Next() { 31 | Range range; 32 | range.begin = cursor_; 33 | while (cursor_ != sentence_.end()) { 34 | if (IsIn(symbols_, cursor_->rune)) { 35 | if (range.begin == cursor_) { 36 | cursor_ ++; 37 | } 38 | range.end = cursor_; 39 | return range; 40 | } 41 | cursor_ ++; 42 | } 43 | range.end = sentence_.end(); 44 | return range; 45 | } 46 | private: 47 | RuneStrArray::const_iterator cursor_; 48 | RuneStrArray sentence_; 49 | const unordered_set& symbols_; 50 | }; // class PreFilter 51 | 52 | } // namespace cppjieba 53 | 54 | #endif // CPPJIEBA_PRE_FILTER_H 55 | -------------------------------------------------------------------------------- /offline/include/cppjieba/QuerySegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_QUERYSEGMENT_H 2 | #define CPPJIEBA_QUERYSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentBase.hpp" 10 | #include "FullSegment.hpp" 11 | #include "MixSegment.hpp" 12 | #include "Unicode.hpp" 13 | #include "DictTrie.hpp" 14 | 15 | namespace cppjieba { 16 | class QuerySegment: public SegmentBase { 17 | public: 18 | QuerySegment(const string& dict, const string& model, const string& userDict = "") 19 | : mixSeg_(dict, model, userDict), 20 | trie_(mixSeg_.GetDictTrie()) { 21 | } 22 | QuerySegment(const DictTrie* dictTrie, const HMMModel* model) 23 | : mixSeg_(dictTrie, model), trie_(dictTrie) { 24 | } 25 | ~QuerySegment() { 26 | } 27 | 28 | void Cut(const string& sentence, vector& words) const { 29 | Cut(sentence, words, true); 30 | } 31 | void Cut(const string& sentence, vector& 
words, bool hmm) const { 32 | vector tmp; 33 | Cut(sentence, tmp, hmm); 34 | GetStringsFromWords(tmp, words); 35 | } 36 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 37 | PreFilter pre_filter(symbols_, sentence); 38 | PreFilter::Range range; 39 | vector wrs; 40 | wrs.reserve(sentence.size()/2); 41 | while (pre_filter.HasNext()) { 42 | range = pre_filter.Next(); 43 | Cut(range.begin, range.end, wrs, hmm); 44 | } 45 | words.clear(); 46 | words.reserve(wrs.size()); 47 | GetWordsFromWordRanges(sentence, wrs, words); 48 | } 49 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 50 | //use mix Cut first 51 | vector mixRes; 52 | mixSeg_.Cut(begin, end, mixRes, hmm); 53 | 54 | vector fullRes; 55 | for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { 56 | if (mixResItr->Length() > 2) { 57 | for (size_t i = 0; i + 1 < mixResItr->Length(); i++) { 58 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 1); 59 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 60 | res.push_back(wr); 61 | } 62 | } 63 | } 64 | if (mixResItr->Length() > 3) { 65 | for (size_t i = 0; i + 2 < mixResItr->Length(); i++) { 66 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 2); 67 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 68 | res.push_back(wr); 69 | } 70 | } 71 | } 72 | res.push_back(*mixResItr); 73 | } 74 | } 75 | private: 76 | bool IsAllAscii(const Unicode& s) const { 77 | for(size_t i = 0; i < s.size(); i++) { 78 | if (s[i] >= 0x80) { 79 | return false; 80 | } 81 | } 82 | return true; 83 | } 84 | MixSegment mixSeg_; 85 | const DictTrie* trie_; 86 | }; // QuerySegment 87 | 88 | } // namespace cppjieba 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /offline/include/cppjieba/SegmentBase.hpp: -------------------------------------------------------------------------------- 1 | #ifndef 
CPPJIEBA_SEGMENTBASE_H 2 | #define CPPJIEBA_SEGMENTBASE_H 3 | 4 | #include "limonp/Logging.hpp" 5 | #include "PreFilter.hpp" 6 | #include 7 | 8 | 9 | namespace cppjieba { 10 | 11 | const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82"; 12 | 13 | using namespace limonp; 14 | 15 | class SegmentBase { 16 | public: 17 | SegmentBase() { 18 | XCHECK(ResetSeparators(SPECIAL_SEPARATORS)); 19 | } 20 | virtual ~SegmentBase() { 21 | } 22 | 23 | virtual void Cut(const string& sentence, vector& words) const = 0; 24 | 25 | bool ResetSeparators(const string& s) { 26 | symbols_.clear(); 27 | RuneStrArray runes; 28 | if (!DecodeRunesInString(s, runes)) { 29 | XLOG(ERROR) << "decode " << s << " failed"; 30 | return false; 31 | } 32 | for (size_t i = 0; i < runes.size(); i++) { 33 | if (!symbols_.insert(runes[i].rune).second) { 34 | XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists"; 35 | return false; 36 | } 37 | } 38 | return true; 39 | } 40 | protected: 41 | unordered_set symbols_; 42 | }; // class SegmentBase 43 | 44 | } // cppjieba 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /offline/include/cppjieba/SegmentTagged.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTTAGGED_H 2 | #define CPPJIEBA_SEGMENTTAGGED_H 3 | 4 | #include "SegmentBase.hpp" 5 | 6 | namespace cppjieba { 7 | 8 | class SegmentTagged : public SegmentBase{ 9 | public: 10 | SegmentTagged() { 11 | } 12 | virtual ~SegmentTagged() { 13 | } 14 | 15 | virtual bool Tag(const string& src, vector >& res) const = 0; 16 | 17 | virtual const DictTrie* GetDictTrie() const = 0; 18 | 19 | }; // class SegmentTagged 20 | 21 | } // cppjieba 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/ArgvContext.hpp: -------------------------------------------------------------------------------- 
1 | /************************************ 2 | * file enc : ascii 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | 6 | #ifndef LIMONP_ARGV_FUNCTS_H 7 | #define LIMONP_ARGV_FUNCTS_H 8 | 9 | #include 10 | #include 11 | #include "StringUtil.hpp" 12 | 13 | namespace limonp { 14 | 15 | using namespace std; 16 | 17 | class ArgvContext { 18 | public : 19 | ArgvContext(int argc, const char* const * argv) { 20 | for(int i = 0; i < argc; i++) { 21 | if(StartsWith(argv[i], "-")) { 22 | if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) { 23 | mpss_[argv[i]] = argv[i+1]; 24 | i++; 25 | } else { 26 | sset_.insert(argv[i]); 27 | } 28 | } else { 29 | args_.push_back(argv[i]); 30 | } 31 | } 32 | } 33 | ~ArgvContext() { 34 | } 35 | 36 | friend ostream& operator << (ostream& os, const ArgvContext& args); 37 | string operator [](size_t i) const { 38 | if(i < args_.size()) { 39 | return args_[i]; 40 | } 41 | return ""; 42 | } 43 | string operator [](const string& key) const { 44 | map::const_iterator it = mpss_.find(key); 45 | if(it != mpss_.end()) { 46 | return it->second; 47 | } 48 | return ""; 49 | } 50 | 51 | bool HasKey(const string& key) const { 52 | if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) { 53 | return true; 54 | } 55 | return false; 56 | } 57 | 58 | private: 59 | vector args_; 60 | map mpss_; 61 | set sset_; 62 | }; // class ArgvContext 63 | 64 | inline ostream& operator << (ostream& os, const ArgvContext& args) { 65 | return os< 5 | #include "Condition.hpp" 6 | 7 | namespace limonp { 8 | template 9 | class BlockingQueue: NonCopyable { 10 | public: 11 | BlockingQueue() 12 | : mutex_(), notEmpty_(mutex_), queue_() { 13 | } 14 | 15 | void Push(const T& x) { 16 | MutexLockGuard lock(mutex_); 17 | queue_.push(x); 18 | notEmpty_.Notify(); // Wait morphing saves us 19 | } 20 | 21 | T Pop() { 22 | MutexLockGuard lock(mutex_); 23 | // always use a while-loop, due to spurious wakeup 24 | while (queue_.empty()) { 25 | 
notEmpty_.Wait(); 26 | } 27 | assert(!queue_.empty()); 28 | T front(queue_.front()); 29 | queue_.pop(); 30 | return front; 31 | } 32 | 33 | size_t Size() const { 34 | MutexLockGuard lock(mutex_); 35 | return queue_.size(); 36 | } 37 | bool Empty() const { 38 | return Size() == 0; 39 | } 40 | 41 | private: 42 | mutable MutexLock mutex_; 43 | Condition notEmpty_; 44 | std::queue queue_; 45 | }; // class BlockingQueue 46 | 47 | } // namespace limonp 48 | 49 | #endif // LIMONP_BLOCKINGQUEUE_HPP 50 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/BoundedBlockingQueue.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 2 | #define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 3 | 4 | #include "BoundedQueue.hpp" 5 | 6 | namespace limonp { 7 | 8 | template 9 | class BoundedBlockingQueue : NonCopyable { 10 | public: 11 | explicit BoundedBlockingQueue(size_t maxSize) 12 | : mutex_(), 13 | notEmpty_(mutex_), 14 | notFull_(mutex_), 15 | queue_(maxSize) { 16 | } 17 | 18 | void Push(const T& x) { 19 | MutexLockGuard lock(mutex_); 20 | while (queue_.Full()) { 21 | notFull_.Wait(); 22 | } 23 | assert(!queue_.Full()); 24 | queue_.Push(x); 25 | notEmpty_.Notify(); 26 | } 27 | 28 | T Pop() { 29 | MutexLockGuard lock(mutex_); 30 | while (queue_.Empty()) { 31 | notEmpty_.Wait(); 32 | } 33 | assert(!queue_.Empty()); 34 | T res = queue_.Pop(); 35 | notFull_.Notify(); 36 | return res; 37 | } 38 | 39 | bool Empty() const { 40 | MutexLockGuard lock(mutex_); 41 | return queue_.Empty(); 42 | } 43 | 44 | bool Full() const { 45 | MutexLockGuard lock(mutex_); 46 | return queue_.Full(); 47 | } 48 | 49 | size_t size() const { 50 | MutexLockGuard lock(mutex_); 51 | return queue_.size(); 52 | } 53 | 54 | size_t capacity() const { 55 | return queue_.capacity(); 56 | } 57 | 58 | private: 59 | mutable MutexLock mutex_; 60 | Condition notEmpty_; 61 | Condition notFull_; 
62 | BoundedQueue queue_; 63 | }; // class BoundedBlockingQueue 64 | 65 | } // namespace limonp 66 | 67 | #endif // LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 68 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/BoundedQueue.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BOUNDED_QUEUE_HPP 2 | #define LIMONP_BOUNDED_QUEUE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace limonp { 9 | using namespace std; 10 | template 11 | class BoundedQueue { 12 | public: 13 | explicit BoundedQueue(size_t capacity): capacity_(capacity), circular_buffer_(capacity) { 14 | head_ = 0; 15 | tail_ = 0; 16 | size_ = 0; 17 | assert(capacity_); 18 | } 19 | ~BoundedQueue() { 20 | } 21 | 22 | void Clear() { 23 | head_ = 0; 24 | tail_ = 0; 25 | size_ = 0; 26 | } 27 | bool Empty() const { 28 | return !size_; 29 | } 30 | bool Full() const { 31 | return capacity_ == size_; 32 | } 33 | size_t Size() const { 34 | return size_; 35 | } 36 | size_t Capacity() const { 37 | return capacity_; 38 | } 39 | 40 | void Push(const T& t) { 41 | assert(!Full()); 42 | circular_buffer_[tail_] = t; 43 | tail_ = (tail_ + 1) % capacity_; 44 | size_ ++; 45 | } 46 | 47 | T Pop() { 48 | assert(!Empty()); 49 | size_t oldPos = head_; 50 | head_ = (head_ + 1) % capacity_; 51 | size_ --; 52 | return circular_buffer_[oldPos]; 53 | } 54 | 55 | private: 56 | size_t head_; 57 | size_t tail_; 58 | size_t size_; 59 | const size_t capacity_; 60 | vector circular_buffer_; 61 | 62 | }; // class BoundedQueue 63 | } // namespace limonp 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/Colors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_COLOR_PRINT_HPP 2 | #define LIMONP_COLOR_PRINT_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace limonp { 8 | 9 | using 
std::string; 10 | 11 | enum Color { 12 | BLACK = 30, 13 | RED, 14 | GREEN, 15 | YELLOW, 16 | BLUE, 17 | PURPLE 18 | }; // enum Color 19 | 20 | static void ColorPrintln(enum Color color, const char * fmt, ...) { 21 | va_list ap; 22 | printf("\033[0;%dm", color); 23 | va_start(ap, fmt); 24 | vprintf(fmt, ap); 25 | va_end(ap); 26 | printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly 27 | } 28 | 29 | } // namespace limonp 30 | 31 | #endif // LIMONP_COLOR_PRINT_HPP 32 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/Condition.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_CONDITION_HPP 2 | #define LIMONP_CONDITION_HPP 3 | 4 | #include "MutexLock.hpp" 5 | 6 | namespace limonp { 7 | 8 | class Condition : NonCopyable { 9 | public: 10 | explicit Condition(MutexLock& mutex) 11 | : mutex_(mutex) { 12 | XCHECK(!pthread_cond_init(&pcond_, NULL)); 13 | } 14 | 15 | ~Condition() { 16 | XCHECK(!pthread_cond_destroy(&pcond_)); 17 | } 18 | 19 | void Wait() { 20 | XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex())); 21 | } 22 | 23 | void Notify() { 24 | XCHECK(!pthread_cond_signal(&pcond_)); 25 | } 26 | 27 | void NotifyAll() { 28 | XCHECK(!pthread_cond_broadcast(&pcond_)); 29 | } 30 | 31 | private: 32 | MutexLock& mutex_; 33 | pthread_cond_t pcond_; 34 | }; // class Condition 35 | 36 | } // namespace limonp 37 | 38 | #endif // LIMONP_CONDITION_HPP 39 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/Config.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : utf8 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef LIMONP_CONFIG_H 6 | #define LIMONP_CONFIG_H 7 | 8 | #include 9 | #include 10 | #include 11 
| #include 12 | #include "StringUtil.hpp" 13 | 14 | namespace limonp { 15 | 16 | using namespace std; 17 | 18 | class Config { 19 | public: 20 | explicit Config(const string& filePath) { 21 | LoadFile(filePath); 22 | } 23 | 24 | operator bool () { 25 | return !map_.empty(); 26 | } 27 | 28 | string Get(const string& key, const string& defaultvalue) const { 29 | map::const_iterator it = map_.find(key); 30 | if(map_.end() != it) { 31 | return it->second; 32 | } 33 | return defaultvalue; 34 | } 35 | int Get(const string& key, int defaultvalue) const { 36 | string str = Get(key, ""); 37 | if("" == str) { 38 | return defaultvalue; 39 | } 40 | return atoi(str.c_str()); 41 | } 42 | const char* operator [] (const char* key) const { 43 | if(NULL == key) { 44 | return NULL; 45 | } 46 | map::const_iterator it = map_.find(key); 47 | if(map_.end() != it) { 48 | return it->second.c_str(); 49 | } 50 | return NULL; 51 | } 52 | 53 | string GetConfigInfo() const { 54 | string res; 55 | res << *this; 56 | return res; 57 | } 58 | 59 | private: 60 | void LoadFile(const string& filePath) { 61 | ifstream ifs(filePath.c_str()); 62 | assert(ifs); 63 | string line; 64 | vector vecBuf; 65 | size_t lineno = 0; 66 | while(getline(ifs, line)) { 67 | lineno ++; 68 | Trim(line); 69 | if(line.empty() || StartsWith(line, "#")) { 70 | continue; 71 | } 72 | vecBuf.clear(); 73 | Split(line, vecBuf, "="); 74 | if(2 != vecBuf.size()) { 75 | fprintf(stderr, "line[%s] illegal.\n", line.c_str()); 76 | assert(false); 77 | continue; 78 | } 79 | string& key = vecBuf[0]; 80 | string& value = vecBuf[1]; 81 | Trim(key); 82 | Trim(value); 83 | if(!map_.insert(make_pair(key, value)).second) { 84 | fprintf(stderr, "key[%s] already exits.\n", key.c_str()); 85 | assert(false); 86 | continue; 87 | } 88 | } 89 | ifs.close(); 90 | } 91 | 92 | friend ostream& operator << (ostream& os, const Config& config); 93 | 94 | map map_; 95 | }; // class Config 96 | 97 | inline ostream& operator << (ostream& os, const Config& 
config) { 98 | return os << config.map_; 99 | } 100 | 101 | } // namespace limonp 102 | 103 | #endif // LIMONP_CONFIG_H 104 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/FileLock.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_FILELOCK_HPP 2 | #define LIMONP_FILELOCK_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace limonp { 14 | 15 | using std::string; 16 | 17 | class FileLock { 18 | public: 19 | FileLock() : fd_(-1), ok_(true) { 20 | } 21 | ~FileLock() { 22 | if(fd_ > 0) { 23 | Close(); 24 | } 25 | } 26 | void Open(const string& fname) { 27 | assert(fd_ == -1); 28 | fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644); 29 | if(fd_ < 0) { 30 | ok_ = false; 31 | err_ = strerror(errno); 32 | } 33 | } 34 | void Close() { 35 | ::close(fd_); 36 | } 37 | void Lock() { 38 | if(LockOrUnlock(fd_, true) < 0) { 39 | ok_ = false; 40 | err_ = strerror(errno); 41 | } 42 | } 43 | void UnLock() { 44 | if(LockOrUnlock(fd_, false) < 0) { 45 | ok_ = false; 46 | err_ = strerror(errno); 47 | } 48 | } 49 | bool Ok() const { 50 | return ok_; 51 | } 52 | string Error() const { 53 | return err_; 54 | } 55 | private: 56 | static int LockOrUnlock(int fd, bool lock) { 57 | errno = 0; 58 | struct flock f; 59 | memset(&f, 0, sizeof(f)); 60 | f.l_type = (lock ? 
F_WRLCK : F_UNLCK); 61 | f.l_whence = SEEK_SET; 62 | f.l_start = 0; 63 | f.l_len = 0; // Lock/unlock entire file 64 | return fcntl(fd, F_SETLK, &f); 65 | } 66 | 67 | int fd_; 68 | bool ok_; 69 | string err_; 70 | }; // class FileLock 71 | 72 | }// namespace limonp 73 | 74 | #endif // LIMONP_FILELOCK_HPP 75 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/ForcePublic.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_FORCE_PUBLIC_H 2 | #define LIMONP_FORCE_PUBLIC_H 3 | 4 | #define private public 5 | #define protected public 6 | 7 | #endif // LIMONP_FORCE_PUBLIC_H 8 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/LocalVector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_LOCAL_VECTOR_HPP 2 | #define LIMONP_LOCAL_VECTOR_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace limonp { 10 | using namespace std; 11 | /* 12 | * LocalVector : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector may be dangerous.. 13 | * LocalVector is simple and not well-tested. 
14 | */ 15 | const size_t LOCAL_VECTOR_BUFFER_SIZE = 16; 16 | template 17 | class LocalVector { 18 | public: 19 | typedef const T* const_iterator ; 20 | typedef T value_type; 21 | typedef size_t size_type; 22 | private: 23 | T buffer_[LOCAL_VECTOR_BUFFER_SIZE]; 24 | T * ptr_; 25 | size_t size_; 26 | size_t capacity_; 27 | public: 28 | LocalVector() { 29 | init_(); 30 | }; 31 | LocalVector(const LocalVector& vec) { 32 | init_(); 33 | *this = vec; 34 | } 35 | LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster 36 | init_(); 37 | while(begin != end) { 38 | push_back(*begin++); 39 | } 40 | } 41 | LocalVector(size_t size, const T& t) { // TODO: make it faster 42 | init_(); 43 | while(size--) { 44 | push_back(t); 45 | } 46 | } 47 | ~LocalVector() { 48 | if(ptr_ != buffer_) { 49 | free(ptr_); 50 | } 51 | }; 52 | public: 53 | LocalVector& operator = (const LocalVector& vec) { 54 | clear(); 55 | size_ = vec.size(); 56 | capacity_ = vec.capacity(); 57 | if(vec.buffer_ == vec.ptr_) { 58 | memcpy(buffer_, vec.buffer_, sizeof(T) * size_); 59 | ptr_ = buffer_; 60 | } else { 61 | ptr_ = (T*) malloc(vec.capacity() * sizeof(T)); 62 | assert(ptr_); 63 | memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T)); 64 | } 65 | return *this; 66 | } 67 | private: 68 | void init_() { 69 | ptr_ = buffer_; 70 | size_ = 0; 71 | capacity_ = LOCAL_VECTOR_BUFFER_SIZE; 72 | } 73 | public: 74 | T& operator [] (size_t i) { 75 | return ptr_[i]; 76 | } 77 | const T& operator [] (size_t i) const { 78 | return ptr_[i]; 79 | } 80 | void push_back(const T& t) { 81 | if(size_ == capacity_) { 82 | assert(capacity_); 83 | reserve(capacity_ * 2); 84 | } 85 | ptr_[size_ ++ ] = t; 86 | } 87 | void reserve(size_t size) { 88 | if(size <= capacity_) { 89 | return; 90 | } 91 | T * next = (T*)malloc(sizeof(T) * size); 92 | assert(next); 93 | T * old = ptr_; 94 | ptr_ = next; 95 | memcpy(ptr_, old, sizeof(T) * capacity_); 96 | capacity_ = size; 97 | if(old != buffer_) { 98 | free(old); 99 | } 100 | 
} 101 | bool empty() const { 102 | return 0 == size(); 103 | } 104 | size_t size() const { 105 | return size_; 106 | } 107 | size_t capacity() const { 108 | return capacity_; 109 | } 110 | const_iterator begin() const { 111 | return ptr_; 112 | } 113 | const_iterator end() const { 114 | return ptr_ + size_; 115 | } 116 | void clear() { 117 | if(ptr_ != buffer_) { 118 | free(ptr_); 119 | } 120 | init_(); 121 | } 122 | }; 123 | 124 | template 125 | ostream & operator << (ostream& os, const LocalVector& vec) { 126 | if(vec.empty()) { 127 | return os << "[]"; 128 | } 129 | os<<"[\""< 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifdef XLOG 11 | #error "XLOG has been defined already" 12 | #endif // XLOG 13 | #ifdef XCHECK 14 | #error "XCHECK has been defined already" 15 | #endif // XCHECK 16 | 17 | #define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream() 18 | #define XCHECK(exp) if(!(exp)) XLOG(FATAL) << "exp: ["#exp << "] false. " 19 | 20 | namespace limonp { 21 | 22 | enum { 23 | LL_DEBUG = 0, 24 | LL_INFO = 1, 25 | LL_WARNING = 2, 26 | LL_ERROR = 3, 27 | LL_FATAL = 4, 28 | }; // enum 29 | 30 | static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"}; 31 | static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"; 32 | 33 | class Logger { 34 | public: 35 | Logger(size_t level, const char* filename, int lineno) 36 | : level_(level) { 37 | #ifdef LOGGING_LEVEL 38 | if (level_ < LOGGING_LEVEL) { 39 | return; 40 | } 41 | #endif 42 | assert(level_ <= sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY)); 43 | char buf[32]; 44 | time_t now; 45 | time(&now); 46 | strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&now)); 47 | stream_ << buf 48 | << " " << filename 49 | << ":" << lineno 50 | << " " << LOG_LEVEL_ARRAY[level_] 51 | << " "; 52 | } 53 | ~Logger() { 54 | #ifdef LOGGING_LEVEL 55 | if (level_ < LOGGING_LEVEL) { 56 | return; 57 | } 58 | #endif 59 | std::cerr << stream_.str() << std::endl; 60 | if 
(level_ == LL_FATAL) { 61 | abort(); 62 | } 63 | } 64 | 65 | std::ostream& Stream() { 66 | return stream_; 67 | } 68 | 69 | private: 70 | std::ostringstream stream_; 71 | size_t level_; 72 | }; // class Logger 73 | 74 | } // namespace limonp 75 | 76 | #endif // LIMONP_LOGGING_HPP 77 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/MutexLock.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_MUTEX_LOCK_HPP 2 | #define LIMONP_MUTEX_LOCK_HPP 3 | 4 | #include 5 | #include "NonCopyable.hpp" 6 | #include "Logging.hpp" 7 | 8 | namespace limonp { 9 | 10 | class MutexLock: NonCopyable { 11 | public: 12 | MutexLock() { 13 | XCHECK(!pthread_mutex_init(&mutex_, NULL)); 14 | } 15 | ~MutexLock() { 16 | XCHECK(!pthread_mutex_destroy(&mutex_)); 17 | } 18 | pthread_mutex_t* GetPthreadMutex() { 19 | return &mutex_; 20 | } 21 | 22 | private: 23 | void Lock() { 24 | XCHECK(!pthread_mutex_lock(&mutex_)); 25 | } 26 | void Unlock() { 27 | XCHECK(!pthread_mutex_unlock(&mutex_)); 28 | } 29 | friend class MutexLockGuard; 30 | 31 | pthread_mutex_t mutex_; 32 | }; // class MutexLock 33 | 34 | class MutexLockGuard: NonCopyable { 35 | public: 36 | explicit MutexLockGuard(MutexLock & mutex) 37 | : mutex_(mutex) { 38 | mutex_.Lock(); 39 | } 40 | ~MutexLockGuard() { 41 | mutex_.Unlock(); 42 | } 43 | private: 44 | MutexLock & mutex_; 45 | }; // class MutexLockGuard 46 | 47 | #define MutexLockGuard(x) XCHECK(false); 48 | 49 | } // namespace limonp 50 | 51 | #endif // LIMONP_MUTEX_LOCK_HPP 52 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/NonCopyable.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | ************************************/ 3 | #ifndef LIMONP_NONCOPYABLE_H 4 | #define LIMONP_NONCOPYABLE_H 5 | 6 | namespace limonp { 7 | 8 
| class NonCopyable { 9 | protected: 10 | NonCopyable() { 11 | } 12 | ~NonCopyable() { 13 | } 14 | private: 15 | NonCopyable(const NonCopyable& ); 16 | const NonCopyable& operator=(const NonCopyable& ); 17 | }; // class NonCopyable 18 | 19 | } // namespace limonp 20 | 21 | #endif // LIMONP_NONCOPYABLE_H 22 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/StdExtension.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_STD_EXTEMSION_HPP 2 | #define LIMONP_STD_EXTEMSION_HPP 3 | 4 | #include 5 | 6 | #ifdef __APPLE__ 7 | #include 8 | #include 9 | #elif(__cplusplus >= 201103L) 10 | #include 11 | #include 12 | #elif defined _MSC_VER 13 | #include 14 | #include 15 | #else 16 | #include 17 | #include 18 | namespace std { 19 | using std::tr1::unordered_map; 20 | using std::tr1::unordered_set; 21 | } 22 | 23 | #endif 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | 33 | namespace std { 34 | 35 | template 36 | ostream& operator << (ostream& os, const vector& v) { 37 | if(v.empty()) { 38 | return os << "[]"; 39 | } 40 | os<<"["< 49 | inline ostream& operator << (ostream& os, const vector& v) { 50 | if(v.empty()) { 51 | return os << "[]"; 52 | } 53 | os<<"[\""< 62 | ostream& operator << (ostream& os, const deque& dq) { 63 | if(dq.empty()) { 64 | return os << "[]"; 65 | } 66 | os<<"[\""< 76 | ostream& operator << (ostream& os, const pair& pr) { 77 | os << pr.first << ":" << pr.second ; 78 | return os; 79 | } 80 | 81 | 82 | template 83 | string& operator << (string& str, const T& obj) { 84 | stringstream ss; 85 | ss << obj; // call ostream& operator << (ostream& os, 86 | return str = ss.str(); 87 | } 88 | 89 | template 90 | ostream& operator << (ostream& os, const map& mp) { 91 | if(mp.empty()) { 92 | os<<"{}"; 93 | return os; 94 | } 95 | os<<'{'; 96 | typename map::const_iterator it = mp.begin(); 97 | os<<*it; 
98 | it++; 99 | while(it != mp.end()) { 100 | os<<", "<<*it; 101 | it++; 102 | } 103 | os<<'}'; 104 | return os; 105 | } 106 | template 107 | ostream& operator << (ostream& os, const std::unordered_map& mp) { 108 | if(mp.empty()) { 109 | return os << "{}"; 110 | } 111 | os<<'{'; 112 | typename std::unordered_map::const_iterator it = mp.begin(); 113 | os<<*it; 114 | it++; 115 | while(it != mp.end()) { 116 | os<<", "<<*it++; 117 | } 118 | return os<<'}'; 119 | } 120 | 121 | template 122 | ostream& operator << (ostream& os, const set& st) { 123 | if(st.empty()) { 124 | os << "{}"; 125 | return os; 126 | } 127 | os<<'{'; 128 | typename set::const_iterator it = st.begin(); 129 | os<<*it; 130 | it++; 131 | while(it != st.end()) { 132 | os<<", "<<*it; 133 | it++; 134 | } 135 | os<<'}'; 136 | return os; 137 | } 138 | 139 | template 140 | bool IsIn(const ContainType& contain, const KeyType& key) { 141 | return contain.end() != contain.find(key); 142 | } 143 | 144 | template 145 | basic_string & operator << (basic_string & s, ifstream & ifs) { 146 | return s.assign((istreambuf_iterator(ifs)), istreambuf_iterator()); 147 | } 148 | 149 | template 150 | ofstream & operator << (ofstream & ofs, const basic_string& s) { 151 | ostreambuf_iterator itr (ofs); 152 | copy(s.begin(), s.end(), itr); 153 | return ofs; 154 | } 155 | 156 | } // namespace std 157 | 158 | #endif 159 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/Thread.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_HPP 2 | #define LIMONP_THREAD_HPP 3 | 4 | #include "Logging.hpp" 5 | #include "NonCopyable.hpp" 6 | 7 | namespace limonp { 8 | 9 | class IThread: NonCopyable { 10 | public: 11 | IThread(): isStarted(false), isJoined(false) { 12 | } 13 | virtual ~IThread() { 14 | if(isStarted && !isJoined) { 15 | XCHECK(!pthread_detach(thread_)); 16 | } 17 | }; 18 | 19 | virtual void Run() = 0; 20 | 
void Start() { 21 | XCHECK(!isStarted); 22 | XCHECK(!pthread_create(&thread_, NULL, Worker, this)); 23 | isStarted = true; 24 | } 25 | void Join() { 26 | XCHECK(!isJoined); 27 | XCHECK(!pthread_join(thread_, NULL)); 28 | isJoined = true; 29 | } 30 | private: 31 | static void * Worker(void * data) { 32 | IThread * ptr = (IThread* ) data; 33 | ptr->Run(); 34 | return NULL; 35 | } 36 | 37 | pthread_t thread_; 38 | bool isStarted; 39 | bool isJoined; 40 | }; // class IThread 41 | 42 | } // namespace limonp 43 | 44 | #endif // LIMONP_THREAD_HPP 45 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/ThreadPool.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_POOL_HPP 2 | #define LIMONP_THREAD_POOL_HPP 3 | 4 | #include "Thread.hpp" 5 | #include "BlockingQueue.hpp" 6 | #include "BoundedBlockingQueue.hpp" 7 | #include "Closure.hpp" 8 | 9 | namespace limonp { 10 | 11 | using namespace std; 12 | 13 | //class ThreadPool; 14 | class ThreadPool: NonCopyable { 15 | public: 16 | class Worker: public IThread { 17 | public: 18 | Worker(ThreadPool* pool): ptThreadPool_(pool) { 19 | assert(ptThreadPool_); 20 | } 21 | virtual ~Worker() { 22 | } 23 | 24 | virtual void Run() { 25 | while (true) { 26 | ClosureInterface* closure = ptThreadPool_->queue_.Pop(); 27 | if (closure == NULL) { 28 | break; 29 | } 30 | try { 31 | closure->Run(); 32 | } catch(std::exception& e) { 33 | XLOG(ERROR) << e.what(); 34 | } catch(...) 
{ 35 | XLOG(ERROR) << " unknown exception."; 36 | } 37 | delete closure; 38 | } 39 | } 40 | private: 41 | ThreadPool * ptThreadPool_; 42 | }; // class Worker 43 | 44 | ThreadPool(size_t thread_num) 45 | : threads_(thread_num), 46 | queue_(thread_num) { 47 | assert(thread_num); 48 | for(size_t i = 0; i < threads_.size(); i ++) { 49 | threads_[i] = new Worker(this); 50 | } 51 | } 52 | ~ThreadPool() { 53 | Stop(); 54 | } 55 | 56 | void Start() { 57 | for(size_t i = 0; i < threads_.size(); i++) { 58 | threads_[i]->Start(); 59 | } 60 | } 61 | void Stop() { 62 | for(size_t i = 0; i < threads_.size(); i ++) { 63 | queue_.Push(NULL); 64 | } 65 | for(size_t i = 0; i < threads_.size(); i ++) { 66 | threads_[i]->Join(); 67 | delete threads_[i]; 68 | } 69 | threads_.clear(); 70 | } 71 | 72 | void Add(ClosureInterface* task) { 73 | assert(task); 74 | queue_.Push(task); 75 | } 76 | 77 | private: 78 | friend class Worker; 79 | 80 | vector threads_; 81 | BoundedBlockingQueue queue_; 82 | }; // class ThreadPool 83 | 84 | } // namespace limonp 85 | 86 | #endif // LIMONP_THREAD_POOL_HPP 87 | -------------------------------------------------------------------------------- /offline/src/Configure.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Configure.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-18 18:25:43 5 | **********************************************/ 6 | #include "../include/Configure.h" 7 | #include "../include/MyLogger.h" 8 | #include 9 | #include 10 | #include //for debug 11 | using namespace tinyse; 12 | using std::istringstream; using std::cout; using std::endl; 13 | 14 | 15 | Configure::Configure(const string &filepath) { 16 | defaultConfig(); //初始化默认配置 17 | 18 | ifstream configs(filepath); 19 | if(configs) { 20 | LogInfo("Load Configuration file: %s", filepath.c_str()); 21 | loadConfig(configs); //加载配置 22 | } 23 | else { //打开配置文件失败 24 | LogWarn("Couldn't 
open configure file: %s", filepath.c_str()); 25 | //使用默认配置 26 | } 27 | 28 | readStopWords(); 29 | } 30 | 31 | 32 | /* 配置文件中没有指定路径时, 使用默认路径 */ 33 | void Configure::defaultConfig() { 34 | m_configMap["corpus"] = "../rss/"; //语料库 35 | m_configMap["pagelib"] = "../data/pagelib.dat"; //去重之后的网页库 36 | m_configMap["offsetlib"] = "../data/offsetlib.dat"; //去重之后的网页偏移库 37 | m_configMap["invertedindexlib"] = "../data/invertedindexlib.dat"; //倒排索引 38 | m_configMap["stopwords"] = "../data/stop_words.utf8"; //停用词 39 | } 40 | 41 | 42 | void Configure::loadConfig(ifstream &configs) { 43 | string line, item, path; 44 | while(getline(configs, line)) { 45 | istringstream record(line); 46 | record >> item >> path; 47 | if(m_configMap.find(item) != m_configMap.end()) { //使用find可以防止向map中添加新项 48 | if(0 == path.size()) { //简单检查一下是否是空路径 49 | LogWarn("Empty Path of %s", item.c_str()); 50 | //直接使用默认配置 51 | } 52 | else { //非空路径(但并不保证路径合法) 53 | m_configMap[item] = path; 54 | } 55 | } 56 | else { //非法配置项 57 | LogWarn("Invalid configuration item: %s", item.c_str()); 58 | //忽略之 59 | } 60 | //防止污染下一次读取 61 | item.clear(); 62 | path.clear(); 63 | } 64 | } 65 | 66 | void Configure::readStopWords() { 67 | ifstream stopWords(getConfigMap()["stopwords"]); 68 | if(stopWords) { 69 | LogInfo("Read stop words"); 70 | string word; 71 | while(stopWords >> word) { 72 | m_stopWords.insert(word); 73 | } 74 | } 75 | else { 76 | LogError("Couldn't open stopwords file"); 77 | exit(-1); 78 | } 79 | } 80 | 81 | 82 | map Configure::getConfigMap() { 83 | return m_configMap; 84 | } 85 | 86 | set& Configure::getStopWords() { 87 | return m_stopWords; 88 | } 89 | 90 | void Configure::print() const { //for debug 91 | cout << endl << "Print config info:" << endl; 92 | for(auto &it : m_configMap) { 93 | cout << it.first << " -> " << it.second << endl; 94 | } 95 | cout << endl << "Print stop words:" << endl; 96 | for(auto &word : m_stopWords) { 97 | cout << word << " "; 98 | } 99 | cout << endl; 100 | } 101 | 
-------------------------------------------------------------------------------- /offline/src/DirScanner.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: DirScanner.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-18 22:32:12 5 | **********************************************/ 6 | #include "../include/DirScanner.h" 7 | #include "../include/MyLogger.h" 8 | #include "../include/Configure.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include //for debug 14 | using namespace tinyse; 15 | using std::cout; using std::endl; 16 | 17 | DirScanner::DirScanner(Configure &config) : m_config(config) { 18 | traverse(m_config.getConfigMap()["corpus"]); //获取语料库中所有语料的绝对路径 19 | } 20 | 21 | 22 | /* 遍历语料库目录下的所有文件, 获取其绝对路径 */ 23 | inline void DirScanner::traverse(const string &corpusDir) { 24 | DIR *dir = ::opendir(corpusDir.c_str()); 25 | if(nullptr == dir) { //语料库目录打开失败: 直接终止程序 26 | LogError("Can't open dir %s", corpusDir.c_str()); 27 | exit(-1); 28 | } 29 | 30 | string oldPwd = ::getcwd(NULL, 0); //保存当前路径 31 | ::chdir(corpusDir.c_str()); 32 | string pwd = ::getcwd(NULL, 0); //获取语料库目录的绝对路径 33 | 34 | struct dirent *entry; //目录下的一项(可能是普通文件, 也可能是目录) 35 | struct stat st; 36 | while((entry = ::readdir(dir))) { 37 | ::stat(entry->d_name, &st); //获取该项的信息 38 | 39 | if(S_ISDIR(st.st_mode)) { //是目录 40 | if(strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) { 41 | continue; //任何目录下都有此两项, 忽略之 42 | } 43 | else { //普通目录, 则递归获取其中的语料信息 44 | traverse(entry->d_name); 45 | } 46 | } 47 | else { //非目录, 则记录其绝对路径 48 | m_vecCorpus.push_back(pwd + "/" + entry->d_name); 49 | } 50 | } 51 | 52 | ::chdir(oldPwd.c_str()); //还原当前路径 53 | ::closedir(dir); 54 | } 55 | 56 | const vector& DirScanner::getCorpusPages() const { 57 | return m_vecCorpus; 58 | } 59 | 60 | void DirScanner::print() const { 61 | cout << endl << "Print corpus:" << endl; 62 | for(auto &it : m_vecCorpus) 
{ 63 | cout << it << endl; 64 | } 65 | cout << endl; 66 | } 67 | -------------------------------------------------------------------------------- /offline/src/MyLogger.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../include/MyLogger.h" 3 | #include 4 | using namespace log4cpp; 5 | 6 | /* 初始化静态成员 */ 7 | MyLogger *MyLogger::m_pInstance = nullptr; 8 | pthread_once_t MyLogger::m_once_control = PTHREAD_ONCE_INIT; 9 | 10 | MyLogger::MyLogger() : m_logger(Category::getRoot().getInstance("logger")) { 11 | try { 12 | PropertyConfigurator::configure("conf/log4cpp.conf"); 13 | } 14 | catch(ConfigureFailure &err) { 15 | std::cerr << "Configure failure: " << err.what() << std::endl; 16 | exit(1); 17 | } 18 | } 19 | 20 | MyLogger::~MyLogger() { 21 | Category::shutdown(); 22 | } 23 | 24 | void MyLogger::error(const char *msg) { 25 | m_logger.error(msg); 26 | } 27 | 28 | void MyLogger::warn(const char *msg) { 29 | m_logger.warn(msg); 30 | } 31 | 32 | void MyLogger::info(const char *msg) { 33 | m_logger.info(msg); 34 | } 35 | 36 | void MyLogger::debug(const char *msg) { 37 | m_logger.debug(msg); 38 | } 39 | 40 | -------------------------------------------------------------------------------- /offline/src/Page.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Page.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-20 00:00:23 5 | **********************************************/ 6 | #include "../include/Page.h" 7 | #include "../include/Configure.h" 8 | #include "../include/WordSegmentation.h" 9 | #include "../include/MyLogger.h" 10 | #include 11 | using std::cout; using std::endl; 12 | 13 | namespace tinyse { 14 | 15 | void Page::setDocID(const size_t &docid) { 16 | m_docid = docid; 17 | } 18 | 19 | void Page::setTitle(const string &title) { 20 | m_title = title; 21 | } 22 | 23 | void Page::setLink(const string 
&link) { 24 | m_link = link; 25 | } 26 | 27 | void Page::setContent(const string &content) { 28 | m_content = content; 29 | } 30 | 31 | void Page::parse(Configure &config, WordSegmentation &jieba, simhash::Simhasher &simhasher) { 32 | /* 对content进行分词 */ 33 | vector vecWords = jieba(m_content); 34 | 35 | /* 统计词频 --> 去掉单字词和停用词 */ 36 | set &stopWords = config.getStopWords(); 37 | for(auto &word : vecWords) { 38 | if(word.size() > 1 && stopWords.find(word) == stopWords.end()) { 39 | ++m_wordsMap[word]; 40 | } 41 | } 42 | 43 | /* 求该Page的simhash */ 44 | size_t topN = 20; //默认为10 45 | simhasher.make(m_content, topN, m_simhash); //计算simhash 46 | /* 47 | vector> res; 48 | simhasher.extract(m_content, res, topN); 49 | for(auto &it : res) { 50 | cout << "[" << it.first << ":" << it.second << "] "; 51 | } 52 | cout << endl; 53 | */ 54 | } 55 | 56 | map& Page::getWordsMap() { 57 | return m_wordsMap; 58 | } 59 | 60 | size_t Page::getDocID() const { 61 | return m_docid; 62 | } 63 | 64 | string Page::getTitle() const { 65 | return m_title; 66 | } 67 | 68 | string Page::getContent() const { 69 | return m_content; 70 | } 71 | 72 | void Page::clear() { 73 | m_docid = 0; 74 | m_title.clear(); 75 | m_link.clear(); 76 | m_content.clear(); 77 | m_wordsMap.clear(); 78 | m_simhash = 0; 79 | } 80 | 81 | /* 赋值 --> 仅修改内容, 不改动docid */ 82 | void Page::operator=(const Page &rhs) { 83 | m_title = rhs.m_title; 84 | m_link = rhs.m_link; 85 | m_content = rhs.m_content; 86 | m_wordsMap = rhs.m_wordsMap; 87 | m_simhash = rhs.m_simhash; 88 | } 89 | ostream& operator<<(ostream &os, const Page &page) { 90 | os << "\n" 91 | << " " << page.m_docid << "\n" 92 | << " " << page.m_title << "\n" 93 | << " " << page.m_link << "\n" 94 | << " " << page.m_content << "\n" 95 | << "\n"; 96 | return os; 97 | } 98 | 99 | /* 判断两个Page是否相似 */ 100 | bool operator==(const Page &lhs, const Page &rhs) { 101 | return simhash::Simhasher::isEqual(lhs.m_simhash, rhs.m_simhash); //海明距离阈值默认值为3 102 | 103 | return true; 104 | } 105 | 
106 | } //end of namespace tinyse 107 | -------------------------------------------------------------------------------- /offline/src/RssReader.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../include/RssReader.h" 3 | #include "../include/tinyxml2.h" 4 | #include "../include/MyLogger.h" 5 | #include "../include/Page.h" 6 | #include "../include/Configure.h" 7 | #include "../include/WordSegmentation.h" 8 | using namespace tinyxml2; 9 | using namespace tinyse; 10 | using std::to_string; 11 | 12 | 13 | /* 将src中所有的<...>过滤掉 */ 14 | string regexFilter(const string &src) { 15 | string pattern = "<.*?>"; //.*?的目的是采用最短匹配 16 | std::regex reg(pattern); 17 | return regex_replace(src, reg, ""); 18 | } 19 | 20 | 21 | bool RssReader::parseRss(const string &xmlPath, Configure &config, WordSegmentation &jieba, simhash::Simhasher &simhasher) { 22 | //加载待解析xml文件 23 | XMLDocument doc; 24 | if(XML_SUCCESS != doc.LoadFile(xmlPath.c_str())) { 25 | LogWarn("Load rss file failed!"); 26 | return false; 27 | } 28 | 29 | XMLElement *rssRoot = doc.RootElement(); //根节点 30 | 31 | XMLElement *channel = rssRoot->FirstChildElement("channel"); // 32 | 33 | XMLElement *item = channel->FirstChildElement("item"); // 34 | 35 | Page tmpPage; //临时存放结点中的数据 36 | XMLElement *tmpNode; //指向的子结点 37 | 38 | while(item != nullptr) { //获取指定结点的内容, 并存入m_rss中 39 | tmpNode = item->FirstChildElement("title"); 40 | if(nullptr != tmpNode) { 41 | tmpPage.setTitle(tmpNode->GetText()); 42 | } 43 | else { //忽略不完整Page 44 | continue; 45 | } 46 | 47 | tmpNode = item->FirstChildElement("link"); 48 | if(nullptr != tmpNode) { 49 | tmpPage.setLink(tmpNode->GetText()); 50 | } 51 | else { //忽略不完整Page 52 | continue; 53 | } 54 | 55 | tmpNode = item->FirstChildElement("content:encoded"); 56 | if(nullptr == tmpNode) { //若没有content, 则用description代替之 57 | tmpNode = item->FirstChildElement("description"); 58 | } 59 | if(nullptr != tmpNode) { 60 | //使用正则表达式过滤无用的数据 61 | 
tmpPage.setContent(regexFilter(tmpNode->GetText())); 62 | } 63 | else { //忽略不完整Page 64 | continue; 65 | } 66 | 67 | tmpPage.parse(config, jieba, simhasher); //该Page已格式化, 进行进一步处理 68 | 69 | m_vecPages.push_back(tmpPage); 70 | tmpPage.clear(); 71 | 72 | item = item->NextSiblingElement("item"); //继续解析下一个结点 73 | } 74 | 75 | return true; 76 | } 77 | -------------------------------------------------------------------------------- /offline/src/main.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: main.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-18 19:00:31 5 | **********************************************/ 6 | #include 7 | #include "../include/Configure.h" 8 | #include "../include/DirScanner.h" 9 | #include "../include/PageLib.h" 10 | using namespace std; 11 | 12 | int main() { 13 | tinyse::Configure config("conf/offline.conf"); //加载配置文件 14 | 15 | tinyse::PageLib pageLib(config); //存储格式化后的网页和偏移库 16 | pageLib.doit(); 17 | 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /offline/tinyse_offline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/offline/tinyse_offline -------------------------------------------------------------------------------- /online/Makefile: -------------------------------------------------------------------------------- 1 | SRCS:=$(wildcard ./src/*.cc) 2 | OBJS:=$(patsubst %.cc,%.o,$(SRCS)) 3 | ELF:= tinyse_online 4 | CC:=g++ 5 | CXXFLAGS:=-std=c++11 -g -Wall 6 | $(ELF):$(OBJS) 7 | g++ $^ -o $@ -lpthread -llog4cpp -ljson $(CXXFLAGS) 8 | .PHONY:clean 9 | clean: 10 | rm -rf $(ELF) $(OBJS) 11 | 12 | -------------------------------------------------------------------------------- /online/conf/log4cpp.conf: 
-------------------------------------------------------------------------------- 1 | log4cpp.rootCategory=DEBUG, console, rollAppender 2 | 3 | log4cpp.appender.console=ConsoleAppender 4 | log4cpp.appender.console.layout=PatternLayout 5 | log4cpp.appender.console.layout.ConversionPattern=%d [%p] %m%n 6 | 7 | log4cpp.appender.rollAppender=RollingFileAppender 8 | log4cpp.appender.rollAppender.fileName=../log/tiny_se.log 9 | log4cpp.appender.rollAppender.maxFileSize=1048576 #1MB 10 | log4cpp.appender.rollAppender.maxBackupIndex=10 11 | log4cpp.appender.rollAppender.layout=PatternLayout 12 | log4cpp.appender.rollAppender.layout.ConversionPattern=%d [%p] %m%n 13 | 14 | -------------------------------------------------------------------------------- /online/conf/online.conf: -------------------------------------------------------------------------------- 1 | pagelib ../data/pagelib.dat 2 | offsetlib ../data/offsetlib.dat 3 | invertedindexlib ../data/invertedindexlib.dat 4 | stopwords ../data/stop_words.utf8 5 | -------------------------------------------------------------------------------- /online/include/Acceptor.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Acceptor.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-13 11:17:51 5 | **********************************************/ 6 | #ifndef __ACCEPTOR_H__ 7 | #define __ACCEPTOR_H__ 8 | #include 9 | #include "Socket.h" 10 | #include "Channel.h" 11 | #include "Uncopyable.h" 12 | 13 | namespace tinyse { 14 | 15 | class InetAddress; 16 | class EventLoop; 17 | 18 | /* 封装accept, 用于服务端接受新连接, 并通过回调通知使用者 */ 19 | class Acceptor : Uncopyable { // 内部类, 仅供TcpServer使用 20 | using NewConnectionCallback = std::function; 21 | public: 22 | Acceptor(EventLoop *loop, const InetAddress &listenAddr, bool reuseport = true); 23 | 24 | void setNewConnectionCallback(const NewConnectionCallback &cb) { 25 | m_newConnectionCallback = cb; 26 | 
} 27 | 28 | bool listenning() const { 29 | return m_listenning; 30 | } 31 | 32 | void listen(); 33 | 34 | private: 35 | void handleRead(); 36 | 37 | private: 38 | bool m_listenning; 39 | EventLoop *m_loop; 40 | Socket m_socket; //listen套接字 41 | Channel m_channel; //用于观察此socket上的可读事件 42 | NewConnectionCallback m_newConnectionCallback; 43 | }; 44 | 45 | } //end of namespace tinyse 46 | 47 | #endif /* __ACCEPTOR_H__ */ 48 | -------------------------------------------------------------------------------- /online/include/Callbacks.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Callbacks.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-03 18:17:08 5 | **********************************************/ 6 | #ifndef __CALLBACKS_H__ 7 | #define __CALLBACKS_H__ 8 | #include 9 | #include 10 | #include 11 | #include "Timestamp.h" 12 | using std::function; using std::shared_ptr; using std::string; 13 | 14 | namespace tinyse { 15 | 16 | class TcpConnection; 17 | class Buffer; 18 | 19 | using TcpConnectionPtr = shared_ptr; 20 | using ConnectionCallback = function; 21 | using MessageCallback = function; 22 | using CloseCallback = function; 23 | using TimerCallback = function; 24 | 25 | } //end of namespace tinyse 26 | 27 | #endif /* __CALLBACKS_H__ */ 28 | -------------------------------------------------------------------------------- /online/include/Channel.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Channel.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-14 15:36:47 5 | **********************************************/ 6 | #ifndef __CHANNEL_H__ 7 | #define __CHANNEL_H__ 8 | #include 9 | #include 10 | #include "Uncopyable.h" 11 | using namespace std; 12 | 13 | namespace tinyse { 14 | 15 | class EventLoop; 16 | 17 | /* Channel负责将一个文件描述符的I/O事件的分发为不同的回调 */ 18 | class 
Channel : Uncopyable { 19 | using EventCallback = std::function; 20 | public: 21 | Channel(EventLoop *loop, int fd) 22 | : m_loop(loop) 23 | , m_fd(fd) { } 24 | 25 | ~Channel(); 26 | 27 | void setReadCallback(const EventCallback &cb) { 28 | m_readCallback = cb; 29 | } 30 | 31 | void setWriteCallback(const EventCallback &cb) { 32 | m_writeCallback = cb; 33 | } 34 | 35 | void setCloseCallback(const EventCallback &cb) { //用于关闭Tcp连接 36 | m_closeCallback = cb; 37 | } 38 | 39 | void setErrorCallback(const EventCallback &cb) { 40 | m_errorCallback = cb; 41 | } 42 | 43 | void handleEvent(); 44 | 45 | void enableReading() { 46 | m_events |= kReadEvent; 47 | update(); 48 | } 49 | 50 | void enableWriting() { 51 | m_events |= kWriteEvent; 52 | update(); 53 | } 54 | 55 | void disableReading() { 56 | m_events &= ~kReadEvent; 57 | update(); 58 | } 59 | 60 | void disableWriting() { 61 | m_events &= ~kWriteEvent; 62 | update(); 63 | } 64 | 65 | bool isWriting() const { 66 | return m_events & kWriteEvent; 67 | } 68 | 69 | void disableAll() { 70 | m_events = kNoneEvent; 71 | update(); 72 | } 73 | 74 | int fd() const { 75 | return m_fd; 76 | } 77 | 78 | int events() const { 79 | return m_events; 80 | } 81 | 82 | bool isNoneEvent() const { 83 | return m_events == kNoneEvent; 84 | } 85 | 86 | void setRevents(const int revent) { 87 | m_revents = revent; 88 | } 89 | 90 | EventLoop* ownerLoop() const { 91 | return m_loop; 92 | } 93 | 94 | //void remove(); 95 | 96 | //在Poller中使用 97 | int index() const { //Channel对应的fd在m_pollfds数组中的下标 98 | return m_index; 99 | } 100 | 101 | void setIndex(const int idx) { //设置在m_pollfds数组中的下标 102 | m_index = idx; 103 | } 104 | 105 | private: 106 | void update(); //将当前Channel更新到所属EventLoop 107 | 108 | private: 109 | static const int kNoneEvent; 110 | static const int kReadEvent; 111 | static const int kWriteEvent; 112 | 113 | EventCallback m_readCallback; 114 | EventCallback m_writeCallback; 115 | EventCallback m_closeCallback; 116 | EventCallback 
m_errorCallback; 117 | 118 | EventLoop *m_loop; 119 | const int m_fd; //负责该文件描述符的I/O事件分发 120 | int m_events = 0; //关注的I/O事件 121 | int m_revents = 0; //当前活动的事件 122 | int m_index = -1; //该Channel在poolfds中的下标 123 | 124 | bool m_eventHandling = false; 125 | }; 126 | 127 | } //end of namespace tinyse 128 | 129 | #endif /* __CHANNEL_H__ */ 130 | -------------------------------------------------------------------------------- /online/include/Condition.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Condition.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-09 13:01:56 5 | **********************************************/ 6 | #ifndef __CONDITION_H__ 7 | #define __CONDITION_H__ 8 | #include 9 | #include "Uncopyable.h" 10 | #include "MutexLock.h" 11 | #include 12 | 13 | namespace tinyse { 14 | 15 | class Condition : tinyse::Uncopyable { 16 | public: 17 | explicit Condition(MutexLock &mutex) : m_mutex(mutex) { 18 | if(pthread_cond_init(&m_cond, nullptr)) { 19 | perror("Condition.h: pthread_cond_init"); 20 | } 21 | } 22 | 23 | ~Condition() { 24 | if(pthread_cond_destroy(&m_cond)) { 25 | perror("Condition.h: pthread_cond_destroy"); 26 | } 27 | } 28 | 29 | void signal() { 30 | if(pthread_cond_signal(&m_cond)) { 31 | perror("Condition.h: pthread_cond_signal"); 32 | } 33 | } 34 | 35 | void broadcast() { 36 | if(pthread_cond_broadcast(&m_cond)) { 37 | perror("Condition.h: pthread_cond_broadcast"); 38 | } 39 | } 40 | 41 | void wait() { 42 | if(pthread_cond_wait(&m_cond, m_mutex.getMutex())) { 43 | perror("Condition.h: pthread_cond_wait"); 44 | } 45 | } 46 | 47 | private: 48 | MutexLock &m_mutex; 49 | pthread_cond_t m_cond; 50 | }; 51 | 52 | } //end of namespace tinyse 53 | 54 | #endif /* __CONDITION_H__ */ 55 | -------------------------------------------------------------------------------- /online/include/Configure.h: 
-------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Configure.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-18 18:08:35 5 | **********************************************/ 6 | #ifndef __CONFIGURE_H__ 7 | #define __CONFIGURE_H__ 8 | #include 9 | #include 10 | #include 11 | #include 12 | using std::string; using std::map; using std::ifstream; 13 | using std::set; 14 | 15 | namespace tinyse { 16 | 17 | /* 读取配置文件 */ 18 | class Configure { 19 | public: 20 | Configure(const string &filepath); 21 | ~Configure() { } 22 | 23 | map getConfigMap(); //获取存放配置信息的map 24 | set& getStopWords(); //获取停用词集 25 | void print() const; //for debug; 26 | 27 | private: 28 | void defaultConfig(); //默认配置 29 | void loadConfig(ifstream &configs); //加载配置信息 30 | void readStopWords(); //读取停用词集 31 | 32 | private: 33 | map m_configMap; //数据文件-->存储路径 34 | set m_stopWords; //停用词集 35 | }; 36 | 37 | } //end of namespace tinyse 38 | 39 | #endif /* __CONFIGURE_H__ */ 40 | -------------------------------------------------------------------------------- /online/include/CurrentThread.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: CurrentThread.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-14 16:57:00 5 | **********************************************/ 6 | #ifndef __CURRENTTHREAD_H__ 7 | #define __CURRENTTHREAD_H__ 8 | #include "../include/Config.h" 9 | #include 10 | #include 11 | #include 12 | 13 | BEGIN_NAMESPACE_TINYSE 14 | 15 | pid_t gettid() { 16 | return syscall(SYS_gettid); 17 | } 18 | 19 | 20 | END_NAMESPACE_TINYSE 21 | 22 | #endif /* __CURRENTTHREAD_H__ */ 23 | -------------------------------------------------------------------------------- /online/include/Epoller.h: -------------------------------------------------------------------------------- 1 | 
/********************************************* 2 | * file: Epoller.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-15 10:01:39 5 | **********************************************/ 6 | #ifndef __EPOLLER_H__ 7 | #define __EPOLLER_H__ 8 | #include "Uncopyable.h" 9 | #include "../include/Timestamp.h" 10 | #include 11 | #include 12 | using std::vector; using std::map; 13 | 14 | struct epoll_event; 15 | 16 | namespace tinyse { 17 | 18 | class Channel; 19 | class EventLoop; 20 | 21 | /* 封装I/O多路复用epoll */ 22 | class Epoller : Uncopyable { 23 | using ChannelList = vector; 24 | using EventList = vector; 25 | using ChannelMap = map; 26 | public: 27 | Epoller(EventLoop *loop); 28 | ~Epoller(); 29 | 30 | Timestamp poll(int timeoutMs, ChannelList *activeChannels); 31 | void updateChannel(Channel *channel); 32 | void removeChannel(Channel *channel); 33 | 34 | void assertInLoopThread(); 35 | 36 | private: 37 | static const int kInitEventListSize = 16; 38 | void fillActiveChannels(int numEvents, ChannelList *activeChannels) const; 39 | void update(int operation, Channel *channel); 40 | 41 | private: 42 | EventLoop *m_ownerLoop; //该Poller的所有者EventLoop 43 | int m_epfd; 44 | EventList m_events; 45 | ChannelMap m_channels; //fd到Channel*的映射 46 | }; 47 | 48 | } //end of namespace tinyse 49 | 50 | #endif /* __EPOLLER_H__ */ 51 | -------------------------------------------------------------------------------- /online/include/EventLoop.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: EventLoop.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-14 16:37:17 5 | **********************************************/ 6 | #ifndef __EVENTLOOP_H__ 7 | #define __EVENTLOOP_H__ 8 | #include "Uncopyable.h" 9 | #include "TimerQueue.h" 10 | #include "../include/MutexLock.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace tinyse { 17 | 18 | class Channel; 19 | class 
Epoller; 20 | 21 | class EventLoop { 22 | using ChannelList = std::vector; 23 | using Functor = std::function; 24 | public: 25 | EventLoop(); 26 | ~EventLoop(); 27 | 28 | void loop(); 29 | void assertInLoopThread() const; 30 | bool isInLoopThread() const; 31 | static EventLoop* getEventLoopOfCurrentThread(); 32 | void updateChannel(Channel *channel); 33 | void removeChannel(Channel *channel); 34 | void quit(); 35 | 36 | void runAt(const Timestamp &time, const TimerCallback &cb) { 37 | m_timerQueue->addTimer(cb, time, 0); 38 | } 39 | 40 | void runAfter(double delay, const TimerCallback &cb) { 41 | Timestamp time(addTime(Timestamp::now(), delay)); 42 | runAt(time, cb); 43 | } 44 | 45 | void runEvery(double interval, const TimerCallback &cb) { 46 | Timestamp time(addTime(Timestamp::now(), interval)); 47 | m_timerQueue->addTimer(cb, time, interval); 48 | } 49 | 50 | 51 | void wakeup() { 52 | uint64_t one = 1; 53 | ssize_t n = ::write(m_wakeupFd, &one, sizeof(one)); 54 | if(n != sizeof(one)) { 55 | perror("EventLoop:write"); 56 | } 57 | } 58 | 59 | void queueInLoop(const Functor &cb) { 60 | { 61 | MutexLockGuard lock(m_mutex); 62 | m_pendingFunctors.push_back(cb); 63 | } 64 | if(!isInLoopThread() || m_callingPendingFuntors) { 65 | wakeup(); 66 | } 67 | } 68 | 69 | void runInLoop(const Functor &cb) { 70 | if(isInLoopThread()) { 71 | cb(); 72 | } 73 | else { 74 | queueInLoop(cb); 75 | } 76 | } 77 | 78 | private: 79 | void abortNotInLoopThread() const; 80 | void handleRead(); //for wakeup 81 | void doPendingFunctors(); 82 | 83 | private: 84 | bool m_looping; 85 | bool m_quit; 86 | bool m_callingPendingFuntors; 87 | const pthread_t m_threadID; 88 | Timestamp m_pollRuntime; 89 | std::unique_ptr m_poller; //间接持有poller 90 | std::unique_ptr m_timerQueue; 91 | ChannelList m_activeChannels; 92 | int m_wakeupFd; 93 | unique_ptr m_wakeupChannel; 94 | MutexLock m_mutex; 95 | std::vector m_pendingFunctors; 96 | }; 97 | 98 | } //end of namespace tinyse 99 | 100 | #endif /* 
__EVENTLOOP_H__ */ 101 | -------------------------------------------------------------------------------- /online/include/EventLoopThread.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: EventLoopThread.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-02 09:38:31 5 | **********************************************/ 6 | #ifndef __EVENTLOOPTHREAD_H__ 7 | #define __EVENTLOOPTHREAD_H__ 8 | #include 9 | #include "MutexLock.h" 10 | #include "Condition.h" 11 | #include "Thread.h" 12 | #include "Uncopyable.h" 13 | using std::string; 14 | 15 | namespace tinyse { 16 | 17 | class EventLoop; 18 | 19 | class EventLoopThread : Uncopyable { 20 | using ThreadInitCallback = std::function; 21 | public: 22 | EventLoopThread(const ThreadInitCallback &cb = ThreadInitCallback()); 23 | ~EventLoopThread(); 24 | EventLoop* startLoop(); 25 | 26 | private: 27 | void threadFunc(); 28 | 29 | private: 30 | EventLoop *m_loop; 31 | MutexLock m_mutex; 32 | Condition m_cond; 33 | Thread m_thread; 34 | bool m_exit; 35 | ThreadInitCallback m_cb; 36 | }; 37 | 38 | } //end of namespace tinyse 39 | 40 | #endif /* __EVENTLOOPTHREAD_H__ */ 41 | -------------------------------------------------------------------------------- /online/include/InetAddress.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: InetAddress.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-13 10:10:37 5 | **********************************************/ 6 | #ifndef __INETADDRESS_H__ 7 | #define __INETADDRESS_H__ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | using std::string; 16 | 17 | namespace tinyse { 18 | 19 | /* 对struct sockaddr_in的简单封装, 能自动转换字节序 */ 20 | class InetAddress { 21 | public: 22 | InetAddress(uint16_t port = 0) { 23 | bzero(&m_addr, sizeof(m_addr)); 24 | 
m_addr.sin_family = AF_INET; 25 | m_addr.sin_port = htons(port); 26 | m_addr.sin_addr.s_addr = INADDR_ANY; //使用本机IP 27 | } 28 | 29 | InetAddress(const char *ip, uint16_t port) { 30 | bzero(&m_addr, sizeof(m_addr)); 31 | m_addr.sin_family = AF_INET; 32 | m_addr.sin_port = htons(port); 33 | m_addr.sin_addr.s_addr = inet_addr(ip); 34 | } 35 | 36 | InetAddress(const struct sockaddr_in &addr) 37 | : m_addr(addr) { } 38 | 39 | const struct sockaddr* getSockAddrPtr() const { 40 | return static_cast((void*)(&m_addr)); 41 | } 42 | void setSockAddr(const sockaddr_in &addr) { 43 | m_addr = addr; 44 | } 45 | 46 | string ip() const { 47 | return inet_ntoa(m_addr.sin_addr); 48 | } 49 | 50 | uint16_t port() const { 51 | return ntohs(m_addr.sin_port); 52 | } 53 | 54 | private: 55 | struct sockaddr_in m_addr; 56 | }; 57 | 58 | } //end of namespace tinyse 59 | 60 | #endif /* __INETADDRESS_H__ */ 61 | -------------------------------------------------------------------------------- /online/include/MutexLock.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: MutexLock.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-09 11:46:31 5 | **********************************************/ 6 | #ifndef __MUTEXLOCK_H__ 7 | #define __MUTEXLOCK_H__ 8 | #include "Uncopyable.h" 9 | #include 10 | #include 11 | 12 | namespace tinyse { 13 | 14 | /* 封装互斥锁mutex */ 15 | class MutexLock : Uncopyable { 16 | friend class MutexLockGuard; 17 | public: 18 | MutexLock() { 19 | if(pthread_mutex_init(&m_mutex, nullptr)) { 20 | perror("MutexLock.h: pthread_mutex_init"); 21 | } 22 | } 23 | 24 | ~MutexLock() { 25 | if(pthread_mutex_destroy(&m_mutex)) { 26 | perror("MutexLock.h: pthread_mutex_destroy"); 27 | } 28 | } 29 | 30 | pthread_mutex_t *getMutex() { 31 | return &m_mutex; 32 | } 33 | 34 | protected: 35 | /* 只能由MutexLockGuard调用, 防止用户代码调用 */ 36 | void lock() { 37 | if(pthread_mutex_lock(&m_mutex)) { 38 | 
perror("MutexLock.h: pthread_mutex_lock"); 39 | } 40 | } 41 | 42 | void unlock() { 43 | if(pthread_mutex_unlock(&m_mutex)) { 44 | perror("MutexLock.h: pthread_mutex_unlock"); 45 | } 46 | } 47 | 48 | private: 49 | pthread_mutex_t m_mutex; 50 | }; 51 | 52 | /* 通过对象生命周期自动加/解锁 */ 53 | class MutexLockGuard : Uncopyable { 54 | public: 55 | explicit MutexLockGuard(MutexLock &mutex) : m_mutex(mutex) { 56 | m_mutex.lock(); 57 | } 58 | 59 | ~MutexLockGuard() { 60 | m_mutex.unlock(); 61 | } 62 | 63 | private: 64 | MutexLock &m_mutex; 65 | }; 66 | 67 | } //end of namespace tinyse 68 | 69 | #endif /* __MUTEXLOCK_H__ */ 70 | -------------------------------------------------------------------------------- /online/include/MyLogger.h: -------------------------------------------------------------------------------- 1 | #ifndef __MYLOGGER_H__ 2 | #define __MYLOGGER_H__ 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define prefix(msg) (std::string("[")+__FILE__+":"+__FUNCTION__+":"+std::to_string(__LINE__)+"] "+msg).c_str() 9 | #define LogError(msg, ...) MyLogger::getInstance()->error(prefix(msg), ##__VA_ARGS__) 10 | #define LogWarn(msg, ...) MyLogger::getInstance()->warn(prefix(msg), ##__VA_ARGS__) 11 | #define LogInfo(msg, ...) MyLogger::getInstance()->info(prefix(msg), ##__VA_ARGS__) 12 | #define LogDebug(msg, ...) MyLogger::getInstance()->debug(prefix(msg), ##__VA_ARGS__) 13 | 14 | class MyLogger { 15 | public: 16 | static MyLogger* getInstance() { 17 | if(nullptr == m_pInstance) { 18 | pthread_once(&m_once_control, init); //确保线程安全 19 | } 20 | return m_pInstance; 21 | } 22 | 23 | void error(const char *msg); 24 | 25 | template 26 | void error(Args ... args) { m_logger.error(args...); } 27 | 28 | void warn(const char *msg); 29 | 30 | template 31 | void warn(Args ... args) { m_logger.warn(args...); } 32 | 33 | void info(const char *msg); 34 | 35 | template 36 | void info(Args ... 
class Configure;
class WordSegmentation;

/* One document of the page library: id, title, link, content, and the word
 * statistics kept for it. */
class Page {
public:
    /* Empty page. All strings are empty and docid() is 0, so the accessors
     * are safe to call on a default-constructed object. */
    Page() = default;

    /* Builds a page from the raw document text `doc`.
     * The constructor is defined out of line; presumably it forwards to
     * parseDoc() -- confirm in Page.cc. */
    Page(const std::string &doc, Configure &conf, WordSegmentation &jieba);

    int docid() const { return m_docid; }
    std::string title() const { return m_title; }
    std::string link() const { return m_link; }

    /* Summary text for the given query words (defined out of line). */
    std::string summary(const std::vector<std::string> &queryWords);

private:
    void parseDoc(const std::string &doc, Configure &conf, WordSegmentation &jieba);
    void calcTopK(std::vector<std::string> &wordVec, size_t K, std::set<std::string> &stopWords);
    const static size_t topK = 20;

private:
    int m_docid = 0; //initialized so a default-constructed Page never exposes an indeterminate id
    std::string m_title;
    std::string m_link;
    std::string m_content;
    std::string m_summary;

    std::vector<std::string> m_topWords; //top-K words
    std::map<std::string, int> m_wordsMap; //<word, frequency>
};
namespacei tinyse 47 | 48 | #endif /* __PAGE_H__ */ 49 | -------------------------------------------------------------------------------- /online/include/Poller.hpp: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Poller.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-15 10:01:39 5 | **********************************************/ 6 | #ifndef __POLLER_H__ 7 | #define __POLLER_H__ 8 | #include "Uncopyable.h" 9 | #include "../include/Timestamp.h" 10 | #include 11 | #include 12 | 13 | struct pollfd; 14 | 15 | namespace tinyse { 16 | 17 | class Channel; 18 | class EventLoop; 19 | 20 | /* 封装I/O多路复用epoll */ 21 | class Poller : Uncopyable { 22 | using ChannelList = std::vector; 23 | using PollFdList = std::vector; 24 | using ChannelMap = std::map; 25 | public: 26 | Poller(EventLoop *loop); 27 | ~Poller(); 28 | 29 | Timestamp poll(int timeoutMs, ChannelList *activeChannels); 30 | void updateChannel(Channel *channel); 31 | void removeChannel(Channel *channel); 32 | 33 | void assertInLoopThread(); 34 | 35 | private: 36 | void fillActiveChannels(int numEvents, ChannelList *activeChannels); 37 | 38 | private: 39 | EventLoop *m_ownerLoop; //该Poller的所有者EventLoop 40 | PollFdList m_pollfds; 41 | ChannelMap m_channels; //fd到Channel*的映射 42 | }; 43 | 44 | } //end of namespace tinyse 45 | 46 | #endif /* __POLLER_H__ */ 47 | -------------------------------------------------------------------------------- /online/include/Socket.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Socket.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-13 11:14:32 5 | **********************************************/ 6 | #ifndef __SOCKET_T__ 7 | #define __SOCKET_T__ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "InetAddress.h" 14 | 15 | namespace tinyse { 16 | 17 | 
/* 封装socket描述符, 负责创建/关闭套接字 */ 18 | class Socket { 19 | public: 20 | Socket(int fd) : m_fd(fd) { } 21 | 22 | ~Socket() { 23 | ::close(m_fd); 24 | } 25 | 26 | int fd() { 27 | return m_fd; 28 | } 29 | 30 | void bind(const InetAddress &addr) { 31 | if(::bind(m_fd, addr.getSockAddrPtr(), sizeof(addr)) < 0) { 32 | perror("Socket: bind"); 33 | exit(-1); 34 | } 35 | } 36 | 37 | void listen() { 38 | if(::listen(m_fd, SOMAXCONN) < 0) { 39 | perror("Socket: listen"); 40 | exit(-1); 41 | } 42 | } 43 | 44 | int accept(InetAddress &peerAddr) { 45 | struct sockaddr_in addr; 46 | bzero(&addr, sizeof(addr)); 47 | socklen_t addrlen = sizeof(addr); 48 | int connfd = ::accept4(m_fd, static_cast((void*)(&addr)), &addrlen, SOCK_NONBLOCK | SOCK_CLOEXEC); //使connfd上发生的I/O操作非阻塞 49 | if(connfd == -1) { 50 | perror("Socket: accept"); 51 | exit(-1); 52 | } 53 | peerAddr.setSockAddr(addr); 54 | return connfd; 55 | } 56 | 57 | void shutdownWrite() { 58 | if(shutdown(m_fd, SHUT_WR) < 0) { 59 | perror("Socket: shutdown"); 60 | exit(-1); 61 | } 62 | } 63 | 64 | void setReuseAddr(bool flag) { 65 | int reuse = flag ? 1 : 0; 66 | if(setsockopt(m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(int)) < 0) { 67 | perror("Socket: setsockopt"); 68 | exit(-1); 69 | } 70 | } 71 | 72 | void setReusePort(bool flag) { 73 | int reuse = flag ? 
/* Returns the local address the socket `sockfd` is bound to.
 *
 * If getsockname() fails, the error is printed with perror() and the
 * zero-filled structure is returned (sin_family == 0).
 *
 * Declared inline because the definition lives in a header: without it,
 * every translation unit that includes the header emits its own external
 * definition and the program fails to link. */
inline struct sockaddr_in getLocalAddr(int sockfd) {
    struct sockaddr_in localaddr;
    bzero(&localaddr, sizeof(localaddr));
    socklen_t addrlen = static_cast<socklen_t>(sizeof(localaddr));

    if(::getsockname(sockfd, reinterpret_cast<struct sockaddr*>(&localaddr), &addrlen) < 0) {
        perror("ScoketsOps::getLocalAddr");
    }

    return localaddr;
}
Channel; 19 | 20 | /* */ 21 | class TcpConnection : Uncopyable 22 | , public std::enable_shared_from_this { 23 | using State = enum { kConnecting, kConnected, kDisconnecting, kDisconnected}; 24 | public: 25 | TcpConnection(EventLoop *loop, const string &name, int sockfd, const InetAddress &localAddr, const InetAddress &peerAddr); 26 | ~TcpConnection(); 27 | 28 | void setConnectionCallback(const ConnectionCallback &cb) { 29 | m_connectionCallback = cb; 30 | } 31 | 32 | void setMessageCallback(const MessageCallback &cb) { 33 | m_messageCallback = cb; 34 | } 35 | 36 | void setCloseCallback(const CloseCallback &cb) { 37 | m_closeCallback = cb; 38 | } 39 | 40 | void connectEstablished(); 41 | void connectDestroyed(); //当TcpServer将本对象移出map时调用 42 | 43 | bool connected() const { 44 | return m_state == kConnected; 45 | } 46 | 47 | const string& name() const { 48 | return m_name; 49 | } 50 | 51 | const InetAddress& localAddr() const { 52 | return m_localAddr; 53 | } 54 | 55 | const InetAddress& peerAddr() const { 56 | return m_peerAddr; 57 | } 58 | 59 | void send(const string &message); 60 | void shutdown(); 61 | 62 | private: 63 | void setState(State st) { 64 | m_state = st; 65 | } 66 | const char* stateToString() const; 67 | 68 | void handleRead(); 69 | void handleWrite(); 70 | void handleClose(); 71 | void sendInLoop(const string &message); 72 | void shutdownInLoop(); 73 | 74 | private: 75 | EventLoop *m_loop; 76 | string m_name; 77 | State m_state = kConnecting; 78 | unique_ptr m_socket; 79 | unique_ptr m_channel; 80 | InetAddress m_localAddr; 81 | InetAddress m_peerAddr; 82 | ConnectionCallback m_connectionCallback; 83 | MessageCallback m_messageCallback; 84 | CloseCallback m_closeCallback; //此回调绑定到TcpServer::removeConnection() 85 | Buffer m_inputBuffer; 86 | Buffer m_outputBuffer; 87 | }; 88 | 89 | } //end of namespace tinyse 90 | 91 | #endif /* __TCPCONNECTION_H__ */ 92 | -------------------------------------------------------------------------------- 
/online/include/TcpServer.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: TcpServer.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-03 17:11:13 5 | **********************************************/ 6 | #ifndef __TCPSERVER_H__ 7 | #define __TCPSERVER_H__ 8 | #include "Uncopyable.h" 9 | #include "Callbacks.h" 10 | #include 11 | using std::map; 12 | 13 | namespace tinyse { 14 | 15 | class EventLoop; 16 | class Acceptor; 17 | class InetAddress; 18 | 19 | /* 管理accept获得的TcpConnection */ 20 | class TcpServer : Uncopyable { 21 | using ConnectionMap = std::map; 22 | public: 23 | TcpServer(EventLoop *loop, const InetAddress &listenAddr, const string &name, bool reuseport = true); 24 | ~TcpServer(); 25 | 26 | void start(); 27 | 28 | void setConnectionCallback(const ConnectionCallback &cb) { 29 | m_connectionCallback = cb; 30 | } 31 | 32 | void setMessageCallback(const MessageCallback &cb) { 33 | m_messageCallback = cb; 34 | } 35 | 36 | private: 37 | void newConnection(int sockfd, const InetAddress &peerAddr); 38 | void removeConnection(const TcpConnectionPtr &conn); 39 | 40 | private: 41 | EventLoop *m_loop; 42 | const string m_name; //Tcp Server的名字 43 | std::unique_ptr m_acceptor; //通过acceptor获取新连接的fd 44 | ConnectionCallback m_connectionCallback; 45 | MessageCallback m_messageCallback; 46 | bool m_started = false; 47 | int m_nextConnectionID = 1; // 48 | ConnectionMap m_connections; // 49 | }; 50 | 51 | } //end of namespace tinyse 52 | 53 | #endif /* __TCPSERVER_H__ */ 54 | -------------------------------------------------------------------------------- /online/include/Thread.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Thread.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-09 21:55:19 5 | **********************************************/ 6 | #ifndef 
/* Wrapper around a POSIX thread (pthread). */
class Thread {
    using ThreadFunc = std::function<void()>;
public:
    /* func: callback the thread is meant to run; it is moved into the object.
     * name: optional label for the thread (used for logging). */
    Thread(ThreadFunc &&func, const std::string &name = std::string())
    : m_started(false)
    , m_pthid(0)
    , m_tid(0)
    , m_func(std::move(func)) //qualified: plain move() only resolved through ADL
    , m_name(name) { }

    virtual ~Thread();

    void start();
    void join();
    pid_t tid() const { return m_tid; }
    const std::string& name() const { return m_name; }
    bool started() const { return m_started; }

private:
    static void* threadFunc(void*);

private:
    bool m_started; //whether the thread has started running
    pthread_t m_pthid;
    pid_t m_tid;
    ThreadFunc m_func; //callback that performs the task
    std::string m_name; //name of this thread (for logging)
};
m_threads; //存放线程的容器 39 | queue m_taskQue; //任务队列 40 | MutexLock m_mutex; 41 | Condition m_cond; 42 | bool m_running = false; 43 | string m_name; 44 | }; 45 | 46 | } //end of namespace tinyse 47 | 48 | #endif /* __THREADPOOL_H__ */ 49 | -------------------------------------------------------------------------------- /online/include/TimerQueue.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: TimerQueue.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-15 19:01:29 5 | **********************************************/ 6 | #ifndef __TIMERQUEUE_H__ 7 | #define __TIMERQUEUE_H__ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "Channel.h" 13 | #include "Uncopyable.h" 14 | #include "../include/Timestamp.h" 15 | #include 16 | 17 | namespace tinyse { 18 | 19 | using TimerCallback = std::function; 20 | 21 | class EventLoop; 22 | 23 | class Timer { 24 | public: 25 | Timer(const TimerCallback &cb, Timestamp when, double interval) 26 | : m_timerCallback(cb) 27 | , m_expiration(when) 28 | , m_interval(interval) 29 | , m_repeat(interval > 0) { } 30 | 31 | ~Timer() { } 32 | 33 | void run() const { 34 | m_timerCallback(); 35 | } 36 | 37 | Timestamp expiration() const { 38 | return m_expiration; 39 | } 40 | 41 | bool repeat() const { 42 | return m_repeat; 43 | } 44 | 45 | void restart(Timestamp now) { 46 | if(m_repeat) { 47 | m_expiration = addTime(now, m_interval); 48 | } 49 | else { 50 | m_expiration = Timestamp::invalid(); 51 | } 52 | } 53 | 54 | private: 55 | TimerCallback m_timerCallback; 56 | Timestamp m_expiration; 57 | const double m_interval; 58 | const bool m_repeat; 59 | }; 60 | 61 | 62 | /* 63 | class TimerID { 64 | public: 65 | TimerID(Timer *timer) : m_timer(timer) { } 66 | ~TimerID() { } 67 | 68 | private: 69 | Timer *m_timer; 70 | }; 71 | */ 72 | 73 | 74 | class TimerQueue : Uncopyable { 75 | using Entry = std::pair; 76 | using TimerList = 
std::set; 77 | public: 78 | TimerQueue(EventLoop *loop); 79 | ~TimerQueue(); 80 | 81 | void addTimer(const TimerCallback &cb, Timestamp when, double interval); 82 | void addTimerInLoop(Timer *timer); 83 | //void cancel(TimerID timerID); 84 | 85 | private: 86 | void handleRead(); 87 | std::vector getExpired(Timestamp now); 88 | void reset(const std::vector &expird, Timestamp now); 89 | bool insert(Timer *timer); 90 | 91 | private: 92 | EventLoop *m_loop; 93 | const int m_timerfd; 94 | TimerList m_timers; 95 | Channel m_timerfdChannel; 96 | }; 97 | 98 | } //end of namespace tinyse 99 | 100 | #endif /* __TIMERQUEUE_H__ */ 101 | -------------------------------------------------------------------------------- /online/include/Timestamp.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: TimeStamp.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-15 19:26:26 5 | **********************************************/ 6 | #ifndef __TIMESTAMP_H__ 7 | #define __TIMESTAMP_H__ 8 | #include //for int64_t 9 | #include 10 | #include 11 | using std::cout; using std::endl; 12 | 13 | namespace tinyse { 14 | 15 | class Timestamp { 16 | public: 17 | Timestamp(); 18 | Timestamp(int64_t microSecondsSinceEpoch); 19 | ~Timestamp() { } 20 | 21 | void swap(Timestamp &rhs) { 22 | std::swap(m_microSecondsSinceEpoch, rhs.m_microSecondsSinceEpoch); 23 | } 24 | 25 | bool valid() const { 26 | return m_microSecondsSinceEpoch > 0; 27 | } 28 | 29 | static Timestamp invalid() { //获取一个非法时间 30 | return Timestamp(); 31 | } 32 | 33 | int64_t microSecondsSinceEpoch() const { 34 | return m_microSecondsSinceEpoch; 35 | } 36 | 37 | std::string toString() const; 38 | static Timestamp now(); //获取当前时间 39 | 40 | static const int kMicroSecondsPerSecond = 1000000; 41 | 42 | private: 43 | int64_t m_microSecondsSinceEpoch; 44 | }; 45 | 46 | inline bool operator<(const Timestamp &lhs, const Timestamp &rhs) { 47 | return 
/* Base class that removes copy construction and copy assignment from every
 * class deriving from it. */
class Uncopyable {
public:
    /* copying is not allowed */
    Uncopyable(const Uncopyable &) = delete;
    Uncopyable &operator=(const Uncopyable &) = delete;

protected:
    /* Construction and destruction stay available to derived objects only.
     * Defaulted rather than given empty bodies, so this base does not by
     * itself make a derived class non-trivially constructible/destructible. */
    Uncopyable() = default;
    ~Uncopyable() = default;
};
**********************************************/ 6 | #ifndef __WORDQUERY_H__ 7 | #define __WORDQUERY_H__ 8 | #include "WordSegmentation.h" 9 | #include "Page.h" 10 | #include 11 | using std::unordered_map; using std::pair; 12 | 13 | namespace tinyse { 14 | 15 | class Configure; 16 | 17 | /* */ 18 | class WordQuery { 19 | public: 20 | WordQuery(Configure &conf); 21 | string query(const string &str); 22 | 23 | private: 24 | void load(); 25 | vector getQueryWordsWeightVec(const vector &queryWords); 26 | bool executeQuery(const vector &queryWords, vector>> &result); 27 | string createJson(const vector &docidVec, const vector &queryWords); 28 | string noResult(const string &str); 29 | 30 | private: 31 | Configure &m_conf; 32 | WordSegmentation m_jieba; //分词器 33 | unordered_map m_pageLib; //网页库 34 | unordered_map> m_offsetLib; //> 35 | unordered_map>> m_invertedIndex; //倒排索引库 36 | }; 37 | 38 | } //end of namespace tinyse 39 | 40 | #endif /* __WORDQUERY_H__ */ 41 | -------------------------------------------------------------------------------- /online/include/WordSegmentation.h: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: WordSegmentation.h 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-19 21:48:01 5 | **********************************************/ 6 | #ifndef __WORDSEGMENTATION_H__ 7 | #define __WORDSEGMENTATION_H__ 8 | #include "cppjieba/Jieba.hpp" 9 | #include "MyLogger.h" 10 | #include 11 | #include 12 | using std::vector; using std::string; 13 | 14 | const char* const DICT_PATH = "/home/wzjj1314/github/cppjieba/dict/jieba.dict.utf8"; 15 | const char* const HMM_PATH = "/home/wzjj1314/github/cppjieba/dict/hmm_model.utf8"; 16 | const char* const USER_DICT_PATH = "/home/wzjj1314/github/cppjieba/dict/user.dict.utf8"; 17 | const char* const IDF_PATH = "/home/wzjj1314/github/cppjieba/dict/idf.utf8"; 18 | const char* const STOP_WORD_PATH = 
"/home/wzjj1314/github/cppjieba/dict/stop_words.utf8"; 19 | 20 | namespace tinyse { 21 | 22 | class WordSegmentation { 23 | public: 24 | WordSegmentation() : m_jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH) { 25 | LogInfo("cppjieba init!"); 26 | } 27 | 28 | vector operator()(const string &src) { 29 | vector words; 30 | m_jieba.CutAll(src, words); 31 | return words; 32 | } 33 | 34 | private: 35 | cppjieba::Jieba m_jieba; 36 | }; 37 | 38 | } //end of namespace tinyse 39 | 40 | #endif /* __WORDSEGMENTATION_H__ */ 41 | -------------------------------------------------------------------------------- /online/include/cppjieba/FullSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_FULLSEGMENT_H 2 | #define CPPJIEBA_FULLSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentBase.hpp" 10 | #include "Unicode.hpp" 11 | 12 | namespace cppjieba { 13 | class FullSegment: public SegmentBase { 14 | public: 15 | FullSegment(const string& dictPath) { 16 | dictTrie_ = new DictTrie(dictPath); 17 | isNeedDestroy_ = true; 18 | } 19 | FullSegment(const DictTrie* dictTrie) 20 | : dictTrie_(dictTrie), isNeedDestroy_(false) { 21 | assert(dictTrie_); 22 | } 23 | ~FullSegment() { 24 | if (isNeedDestroy_) { 25 | delete dictTrie_; 26 | } 27 | } 28 | void Cut(const string& sentence, 29 | vector& words) const { 30 | vector tmp; 31 | Cut(sentence, tmp); 32 | GetStringsFromWords(tmp, words); 33 | } 34 | void Cut(const string& sentence, 35 | vector& words) const { 36 | PreFilter pre_filter(symbols_, sentence); 37 | PreFilter::Range range; 38 | vector wrs; 39 | wrs.reserve(sentence.size()/2); 40 | while (pre_filter.HasNext()) { 41 | range = pre_filter.Next(); 42 | Cut(range.begin, range.end, wrs); 43 | } 44 | words.clear(); 45 | words.reserve(wrs.size()); 46 | GetWordsFromWordRanges(sentence, wrs, words); 47 | } 48 | void 
Cut(RuneStrArray::const_iterator begin, 49 | RuneStrArray::const_iterator end, 50 | vector& res) const { 51 | // resut of searching in trie tree 52 | LocalVector > tRes; 53 | 54 | // max index of res's words 55 | size_t maxIdx = 0; 56 | 57 | // always equals to (uItr - begin) 58 | size_t uIdx = 0; 59 | 60 | // tmp variables 61 | size_t wordLen = 0; 62 | assert(dictTrie_); 63 | vector dags; 64 | dictTrie_->Find(begin, end, dags); 65 | for (size_t i = 0; i < dags.size(); i++) { 66 | for (size_t j = 0; j < dags[i].nexts.size(); j++) { 67 | size_t nextoffset = dags[i].nexts[j].first; 68 | assert(nextoffset < dags.size()); 69 | const DictUnit* du = dags[i].nexts[j].second; 70 | if (du == NULL) { 71 | if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { 72 | WordRange wr(begin + i, begin + nextoffset); 73 | res.push_back(wr); 74 | } 75 | } else { 76 | wordLen = du->word.size(); 77 | if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { 78 | WordRange wr(begin + i, begin + nextoffset); 79 | res.push_back(wr); 80 | } 81 | } 82 | maxIdx = uIdx + wordLen > maxIdx ? 
uIdx + wordLen : maxIdx; 83 | } 84 | uIdx++; 85 | } 86 | } 87 | private: 88 | const DictTrie* dictTrie_; 89 | bool isNeedDestroy_; 90 | }; 91 | } 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /online/include/cppjieba/HMMModel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_HMMMODEL_H 2 | #define CPPJIEBA_HMMMODEL_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "Trie.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | using namespace limonp; 10 | typedef unordered_map EmitProbMap; 11 | 12 | struct HMMModel { 13 | /* 14 | * STATUS: 15 | * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S 16 | * */ 17 | enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; 18 | 19 | HMMModel(const string& modelPath) { 20 | memset(startProb, 0, sizeof(startProb)); 21 | memset(transProb, 0, sizeof(transProb)); 22 | statMap[0] = 'B'; 23 | statMap[1] = 'E'; 24 | statMap[2] = 'M'; 25 | statMap[3] = 'S'; 26 | emitProbVec.push_back(&emitProbB); 27 | emitProbVec.push_back(&emitProbE); 28 | emitProbVec.push_back(&emitProbM); 29 | emitProbVec.push_back(&emitProbS); 30 | LoadModel(modelPath); 31 | } 32 | ~HMMModel() { 33 | } 34 | void LoadModel(const string& filePath) { 35 | ifstream ifile(filePath.c_str()); 36 | XCHECK(ifile.is_open()) << "open " << filePath << " failed"; 37 | string line; 38 | vector tmp; 39 | vector tmp2; 40 | //Load startProb 41 | XCHECK(GetLine(ifile, line)); 42 | Split(line, tmp, " "); 43 | XCHECK(tmp.size() == STATUS_SUM); 44 | for (size_t j = 0; j< tmp.size(); j++) { 45 | startProb[j] = atof(tmp[j].c_str()); 46 | } 47 | 48 | //Load transProb 49 | for (size_t i = 0; i < STATUS_SUM; i++) { 50 | XCHECK(GetLine(ifile, line)); 51 | Split(line, tmp, " "); 52 | XCHECK(tmp.size() == STATUS_SUM); 53 | for (size_t j =0; j < STATUS_SUM; j++) { 54 | transProb[i][j] = atof(tmp[j].c_str()); 55 | } 56 | } 57 | 58 | //Load emitProbB 59 | XCHECK(GetLine(ifile, 
line)); 60 | XCHECK(LoadEmitProb(line, emitProbB)); 61 | 62 | //Load emitProbE 63 | XCHECK(GetLine(ifile, line)); 64 | XCHECK(LoadEmitProb(line, emitProbE)); 65 | 66 | //Load emitProbM 67 | XCHECK(GetLine(ifile, line)); 68 | XCHECK(LoadEmitProb(line, emitProbM)); 69 | 70 | //Load emitProbS 71 | XCHECK(GetLine(ifile, line)); 72 | XCHECK(LoadEmitProb(line, emitProbS)); 73 | } 74 | double GetEmitProb(const EmitProbMap* ptMp, Rune key, 75 | double defVal)const { 76 | EmitProbMap::const_iterator cit = ptMp->find(key); 77 | if (cit == ptMp->end()) { 78 | return defVal; 79 | } 80 | return cit->second; 81 | } 82 | bool GetLine(ifstream& ifile, string& line) { 83 | while (getline(ifile, line)) { 84 | Trim(line); 85 | if (line.empty()) { 86 | continue; 87 | } 88 | if (StartsWith(line, "#")) { 89 | continue; 90 | } 91 | return true; 92 | } 93 | return false; 94 | } 95 | bool LoadEmitProb(const string& line, EmitProbMap& mp) { 96 | if (line.empty()) { 97 | return false; 98 | } 99 | vector tmp, tmp2; 100 | Unicode unicode; 101 | Split(line, tmp, ","); 102 | for (size_t i = 0; i < tmp.size(); i++) { 103 | Split(tmp[i], tmp2, ":"); 104 | if (2 != tmp2.size()) { 105 | XLOG(ERROR) << "emitProb illegal."; 106 | return false; 107 | } 108 | if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { 109 | XLOG(ERROR) << "TransCode failed."; 110 | return false; 111 | } 112 | mp[unicode[0]] = atof(tmp2[1].c_str()); 113 | } 114 | return true; 115 | } 116 | 117 | char statMap[STATUS_SUM]; 118 | double startProb[STATUS_SUM]; 119 | double transProb[STATUS_SUM][STATUS_SUM]; 120 | EmitProbMap emitProbB; 121 | EmitProbMap emitProbE; 122 | EmitProbMap emitProbM; 123 | EmitProbMap emitProbS; 124 | vector emitProbVec; 125 | }; // struct HMMModel 126 | 127 | } // namespace cppjieba 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /online/include/cppjieba/Jieba.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef CPPJIEAB_JIEBA_H 2 | #define CPPJIEAB_JIEBA_H 3 | 4 | #include "QuerySegment.hpp" 5 | #include "KeywordExtractor.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | class Jieba { 10 | public: 11 | Jieba(const string& dict_path, 12 | const string& model_path, 13 | const string& user_dict_path, 14 | const string& idfPath, 15 | const string& stopWordPath) 16 | : dict_trie_(dict_path, user_dict_path), 17 | model_(model_path), 18 | mp_seg_(&dict_trie_), 19 | hmm_seg_(&model_), 20 | mix_seg_(&dict_trie_, &model_), 21 | full_seg_(&dict_trie_), 22 | query_seg_(&dict_trie_, &model_), 23 | extractor(&dict_trie_, &model_, idfPath, stopWordPath) { 24 | } 25 | ~Jieba() { 26 | } 27 | 28 | struct LocWord { 29 | string word; 30 | size_t begin; 31 | size_t end; 32 | }; // struct LocWord 33 | 34 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 35 | mix_seg_.Cut(sentence, words, hmm); 36 | } 37 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 38 | mix_seg_.Cut(sentence, words, hmm); 39 | } 40 | void CutAll(const string& sentence, vector& words) const { 41 | full_seg_.Cut(sentence, words); 42 | } 43 | void CutAll(const string& sentence, vector& words) const { 44 | full_seg_.Cut(sentence, words); 45 | } 46 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 47 | query_seg_.Cut(sentence, words, hmm); 48 | } 49 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 50 | query_seg_.Cut(sentence, words, hmm); 51 | } 52 | void CutHMM(const string& sentence, vector& words) const { 53 | hmm_seg_.Cut(sentence, words); 54 | } 55 | void CutHMM(const string& sentence, vector& words) const { 56 | hmm_seg_.Cut(sentence, words); 57 | } 58 | void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { 59 | mp_seg_.Cut(sentence, words, max_word_len); 60 | } 61 | void CutSmall(const 
string& sentence, vector& words, size_t max_word_len) const { 62 | mp_seg_.Cut(sentence, words, max_word_len); 63 | } 64 | 65 | void Tag(const string& sentence, vector >& words) const { 66 | mix_seg_.Tag(sentence, words); 67 | } 68 | string LookupTag(const string &str) const { 69 | return mix_seg_.LookupTag(str); 70 | } 71 | bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { 72 | return dict_trie_.InsertUserWord(word, tag); 73 | } 74 | 75 | bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { 76 | return dict_trie_.InsertUserWord(word,freq, tag); 77 | } 78 | 79 | bool Find(const string& word) 80 | { 81 | return dict_trie_.Find(word); 82 | } 83 | 84 | void ResetSeparators(const string& s) { 85 | //TODO 86 | mp_seg_.ResetSeparators(s); 87 | hmm_seg_.ResetSeparators(s); 88 | mix_seg_.ResetSeparators(s); 89 | full_seg_.ResetSeparators(s); 90 | query_seg_.ResetSeparators(s); 91 | } 92 | 93 | const DictTrie* GetDictTrie() const { 94 | return &dict_trie_; 95 | } 96 | 97 | const HMMModel* GetHMMModel() const { 98 | return &model_; 99 | } 100 | 101 | void LoadUserDict(const vector& buf) { 102 | dict_trie_.LoadUserDict(buf); 103 | } 104 | 105 | void LoadUserDict(const set& buf) { 106 | dict_trie_.LoadUserDict(buf); 107 | } 108 | 109 | void LoadUserDict(const string& path) { 110 | dict_trie_.LoadUserDict(path); 111 | } 112 | 113 | private: 114 | DictTrie dict_trie_; 115 | HMMModel model_; 116 | 117 | // They share the same dict trie and model 118 | MPSegment mp_seg_; 119 | HMMSegment hmm_seg_; 120 | MixSegment mix_seg_; 121 | FullSegment full_seg_; 122 | QuerySegment query_seg_; 123 | 124 | public: 125 | KeywordExtractor extractor; 126 | }; // class Jieba 127 | 128 | } // namespace cppjieba 129 | 130 | #endif // CPPJIEAB_JIEBA_H 131 | -------------------------------------------------------------------------------- /online/include/cppjieba/MixSegment.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MIXSEGMENT_H 2 | #define CPPJIEBA_MIXSEGMENT_H 3 | 4 | #include 5 | #include "MPSegment.hpp" 6 | #include "HMMSegment.hpp" 7 | #include "limonp/StringUtil.hpp" 8 | #include "PosTagger.hpp" 9 | 10 | namespace cppjieba { 11 | class MixSegment: public SegmentTagged { 12 | public: 13 | MixSegment(const string& mpSegDict, const string& hmmSegDict, 14 | const string& userDict = "") 15 | : mpSeg_(mpSegDict, userDict), 16 | hmmSeg_(hmmSegDict) { 17 | } 18 | MixSegment(const DictTrie* dictTrie, const HMMModel* model) 19 | : mpSeg_(dictTrie), hmmSeg_(model) { 20 | } 21 | ~MixSegment() { 22 | } 23 | 24 | void Cut(const string& sentence, vector& words) const { 25 | Cut(sentence, words, true); 26 | } 27 | void Cut(const string& sentence, vector& words, bool hmm) const { 28 | vector tmp; 29 | Cut(sentence, tmp, hmm); 30 | GetStringsFromWords(tmp, words); 31 | } 32 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 33 | PreFilter pre_filter(symbols_, sentence); 34 | PreFilter::Range range; 35 | vector wrs; 36 | wrs.reserve(sentence.size() / 2); 37 | while (pre_filter.HasNext()) { 38 | range = pre_filter.Next(); 39 | Cut(range.begin, range.end, wrs, hmm); 40 | } 41 | words.clear(); 42 | words.reserve(wrs.size()); 43 | GetWordsFromWordRanges(sentence, wrs, words); 44 | } 45 | 46 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 47 | if (!hmm) { 48 | mpSeg_.Cut(begin, end, res); 49 | return; 50 | } 51 | vector words; 52 | assert(end >= begin); 53 | words.reserve(end - begin); 54 | mpSeg_.Cut(begin, end, words); 55 | 56 | vector hmmRes; 57 | hmmRes.reserve(end - begin); 58 | for (size_t i = 0; i < words.size(); i++) { 59 | //if mp Get a word, it's ok, put it into result 60 | if (words[i].left != words[i].right || (words[i].left == words[i].right && 
mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { 61 | res.push_back(words[i]); 62 | continue; 63 | } 64 | 65 | // if mp Get a single one and it is not in userdict, collect it in sequence 66 | size_t j = i; 67 | while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { 68 | j++; 69 | } 70 | 71 | // Cut the sequence with hmm 72 | assert(j - 1 >= i); 73 | // TODO 74 | hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes); 75 | //put hmm result to result 76 | for (size_t k = 0; k < hmmRes.size(); k++) { 77 | res.push_back(hmmRes[k]); 78 | } 79 | 80 | //clear tmp vars 81 | hmmRes.clear(); 82 | 83 | //let i jump over this piece 84 | i = j - 1; 85 | } 86 | } 87 | 88 | const DictTrie* GetDictTrie() const { 89 | return mpSeg_.GetDictTrie(); 90 | } 91 | 92 | bool Tag(const string& src, vector >& res) const { 93 | return tagger_.Tag(src, res, *this); 94 | } 95 | 96 | string LookupTag(const string &str) const { 97 | return tagger_.LookupTag(str, *this); 98 | } 99 | 100 | private: 101 | MPSegment mpSeg_; 102 | HMMSegment hmmSeg_; 103 | PosTagger tagger_; 104 | 105 | }; // class MixSegment 106 | 107 | } // namespace cppjieba 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /online/include/cppjieba/PosTagger.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_POS_TAGGING_H 2 | #define CPPJIEBA_POS_TAGGING_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "SegmentTagged.hpp" 6 | #include "DictTrie.hpp" 7 | 8 | namespace cppjieba { 9 | using namespace limonp; 10 | 11 | static const char* const POS_M = "m"; 12 | static const char* const POS_ENG = "eng"; 13 | static const char* const POS_X = "x"; 14 | 15 | class PosTagger { 16 | public: 17 | PosTagger() { 18 | } 19 | ~PosTagger() { 20 | } 21 | 22 | bool Tag(const string& src, vector >& res, const SegmentTagged& segment) const { 23 | vector 
CutRes; 24 | segment.Cut(src, CutRes); 25 | 26 | for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { 27 | res.push_back(make_pair(*itr, LookupTag(*itr, segment))); 28 | } 29 | return !res.empty(); 30 | } 31 | 32 | string LookupTag(const string &str, const SegmentTagged& segment) const { 33 | const DictUnit *tmp = NULL; 34 | RuneStrArray runes; 35 | const DictTrie * dict = segment.GetDictTrie(); 36 | assert(dict != NULL); 37 | if (!DecodeRunesInString(str, runes)) { 38 | XLOG(ERROR) << "Decode failed."; 39 | return POS_X; 40 | } 41 | tmp = dict->Find(runes.begin(), runes.end()); 42 | if (tmp == NULL || tmp->tag.empty()) { 43 | return SpecialRule(runes); 44 | } else { 45 | return tmp->tag; 46 | } 47 | } 48 | 49 | private: 50 | const char* SpecialRule(const RuneStrArray& unicode) const { 51 | size_t m = 0; 52 | size_t eng = 0; 53 | for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { 54 | if (unicode[i].rune < 0x80) { 55 | eng ++; 56 | if ('0' <= unicode[i].rune && unicode[i].rune <= '9') { 57 | m++; 58 | } 59 | } 60 | } 61 | // ascii char is not found 62 | if (eng == 0) { 63 | return POS_X; 64 | } 65 | // all the ascii is number char 66 | if (m == eng) { 67 | return POS_M; 68 | } 69 | // the ascii chars contain english letter 70 | return POS_ENG; 71 | } 72 | 73 | }; // class PosTagger 74 | 75 | } // namespace cppjieba 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /online/include/cppjieba/PreFilter.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_PRE_FILTER_H 2 | #define CPPJIEBA_PRE_FILTER_H 3 | 4 | #include "Trie.hpp" 5 | #include "limonp/Logging.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | class PreFilter { 10 | public: 11 | //TODO use WordRange instead of Range 12 | struct Range { 13 | RuneStrArray::const_iterator begin; 14 | RuneStrArray::const_iterator end; 15 | }; // struct Range 16 | 17 | PreFilter(const 
unordered_set& symbols, 18 | const string& sentence) 19 | : symbols_(symbols) { 20 | if (!DecodeRunesInString(sentence, sentence_)) { 21 | XLOG(ERROR) << "decode failed. "; 22 | } 23 | cursor_ = sentence_.begin(); 24 | } 25 | ~PreFilter() { 26 | } 27 | bool HasNext() const { 28 | return cursor_ != sentence_.end(); 29 | } 30 | Range Next() { 31 | Range range; 32 | range.begin = cursor_; 33 | while (cursor_ != sentence_.end()) { 34 | if (IsIn(symbols_, cursor_->rune)) { 35 | if (range.begin == cursor_) { 36 | cursor_ ++; 37 | } 38 | range.end = cursor_; 39 | return range; 40 | } 41 | cursor_ ++; 42 | } 43 | range.end = sentence_.end(); 44 | return range; 45 | } 46 | private: 47 | RuneStrArray::const_iterator cursor_; 48 | RuneStrArray sentence_; 49 | const unordered_set& symbols_; 50 | }; // class PreFilter 51 | 52 | } // namespace cppjieba 53 | 54 | #endif // CPPJIEBA_PRE_FILTER_H 55 | -------------------------------------------------------------------------------- /online/include/cppjieba/QuerySegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_QUERYSEGMENT_H 2 | #define CPPJIEBA_QUERYSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentBase.hpp" 10 | #include "FullSegment.hpp" 11 | #include "MixSegment.hpp" 12 | #include "Unicode.hpp" 13 | #include "DictTrie.hpp" 14 | 15 | namespace cppjieba { 16 | class QuerySegment: public SegmentBase { 17 | public: 18 | QuerySegment(const string& dict, const string& model, const string& userDict = "") 19 | : mixSeg_(dict, model, userDict), 20 | trie_(mixSeg_.GetDictTrie()) { 21 | } 22 | QuerySegment(const DictTrie* dictTrie, const HMMModel* model) 23 | : mixSeg_(dictTrie, model), trie_(dictTrie) { 24 | } 25 | ~QuerySegment() { 26 | } 27 | 28 | void Cut(const string& sentence, vector& words) const { 29 | Cut(sentence, words, true); 30 | } 31 | void Cut(const string& sentence, vector& 
words, bool hmm) const { 32 | vector tmp; 33 | Cut(sentence, tmp, hmm); 34 | GetStringsFromWords(tmp, words); 35 | } 36 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 37 | PreFilter pre_filter(symbols_, sentence); 38 | PreFilter::Range range; 39 | vector wrs; 40 | wrs.reserve(sentence.size()/2); 41 | while (pre_filter.HasNext()) { 42 | range = pre_filter.Next(); 43 | Cut(range.begin, range.end, wrs, hmm); 44 | } 45 | words.clear(); 46 | words.reserve(wrs.size()); 47 | GetWordsFromWordRanges(sentence, wrs, words); 48 | } 49 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 50 | //use mix Cut first 51 | vector mixRes; 52 | mixSeg_.Cut(begin, end, mixRes, hmm); 53 | 54 | vector fullRes; 55 | for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { 56 | if (mixResItr->Length() > 2) { 57 | for (size_t i = 0; i + 1 < mixResItr->Length(); i++) { 58 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 1); 59 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 60 | res.push_back(wr); 61 | } 62 | } 63 | } 64 | if (mixResItr->Length() > 3) { 65 | for (size_t i = 0; i + 2 < mixResItr->Length(); i++) { 66 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 2); 67 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 68 | res.push_back(wr); 69 | } 70 | } 71 | } 72 | res.push_back(*mixResItr); 73 | } 74 | } 75 | private: 76 | bool IsAllAscii(const Unicode& s) const { 77 | for(size_t i = 0; i < s.size(); i++) { 78 | if (s[i] >= 0x80) { 79 | return false; 80 | } 81 | } 82 | return true; 83 | } 84 | MixSegment mixSeg_; 85 | const DictTrie* trie_; 86 | }; // QuerySegment 87 | 88 | } // namespace cppjieba 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /online/include/cppjieba/SegmentBase.hpp: -------------------------------------------------------------------------------- 1 | #ifndef 
CPPJIEBA_SEGMENTBASE_H 2 | #define CPPJIEBA_SEGMENTBASE_H 3 | 4 | #include "limonp/Logging.hpp" 5 | #include "PreFilter.hpp" 6 | #include 7 | 8 | 9 | namespace cppjieba { 10 | 11 | const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82"; 12 | 13 | using namespace limonp; 14 | 15 | class SegmentBase { 16 | public: 17 | SegmentBase() { 18 | XCHECK(ResetSeparators(SPECIAL_SEPARATORS)); 19 | } 20 | virtual ~SegmentBase() { 21 | } 22 | 23 | virtual void Cut(const string& sentence, vector& words) const = 0; 24 | 25 | bool ResetSeparators(const string& s) { 26 | symbols_.clear(); 27 | RuneStrArray runes; 28 | if (!DecodeRunesInString(s, runes)) { 29 | XLOG(ERROR) << "decode " << s << " failed"; 30 | return false; 31 | } 32 | for (size_t i = 0; i < runes.size(); i++) { 33 | if (!symbols_.insert(runes[i].rune).second) { 34 | XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists"; 35 | return false; 36 | } 37 | } 38 | return true; 39 | } 40 | protected: 41 | unordered_set symbols_; 42 | }; // class SegmentBase 43 | 44 | } // cppjieba 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /online/include/cppjieba/SegmentTagged.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTTAGGED_H 2 | #define CPPJIEBA_SEGMENTTAGGED_H 3 | 4 | #include "SegmentBase.hpp" 5 | 6 | namespace cppjieba { 7 | 8 | class SegmentTagged : public SegmentBase{ 9 | public: 10 | SegmentTagged() { 11 | } 12 | virtual ~SegmentTagged() { 13 | } 14 | 15 | virtual bool Tag(const string& src, vector >& res) const = 0; 16 | 17 | virtual const DictTrie* GetDictTrie() const = 0; 18 | 19 | }; // class SegmentTagged 20 | 21 | } // cppjieba 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/ArgvContext.hpp: -------------------------------------------------------------------------------- 1 
| /************************************ 2 | * file enc : ascii 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | 6 | #ifndef LIMONP_ARGV_FUNCTS_H 7 | #define LIMONP_ARGV_FUNCTS_H 8 | 9 | #include 10 | #include 11 | #include "StringUtil.hpp" 12 | 13 | namespace limonp { 14 | 15 | using namespace std; 16 | 17 | class ArgvContext { 18 | public : 19 | ArgvContext(int argc, const char* const * argv) { 20 | for(int i = 0; i < argc; i++) { 21 | if(StartsWith(argv[i], "-")) { 22 | if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) { 23 | mpss_[argv[i]] = argv[i+1]; 24 | i++; 25 | } else { 26 | sset_.insert(argv[i]); 27 | } 28 | } else { 29 | args_.push_back(argv[i]); 30 | } 31 | } 32 | } 33 | ~ArgvContext() { 34 | } 35 | 36 | friend ostream& operator << (ostream& os, const ArgvContext& args); 37 | string operator [](size_t i) const { 38 | if(i < args_.size()) { 39 | return args_[i]; 40 | } 41 | return ""; 42 | } 43 | string operator [](const string& key) const { 44 | map::const_iterator it = mpss_.find(key); 45 | if(it != mpss_.end()) { 46 | return it->second; 47 | } 48 | return ""; 49 | } 50 | 51 | bool HasKey(const string& key) const { 52 | if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) { 53 | return true; 54 | } 55 | return false; 56 | } 57 | 58 | private: 59 | vector args_; 60 | map mpss_; 61 | set sset_; 62 | }; // class ArgvContext 63 | 64 | inline ostream& operator << (ostream& os, const ArgvContext& args) { 65 | return os< 5 | #include "Condition.hpp" 6 | 7 | namespace limonp { 8 | template 9 | class BlockingQueue: NonCopyable { 10 | public: 11 | BlockingQueue() 12 | : mutex_(), notEmpty_(mutex_), queue_() { 13 | } 14 | 15 | void Push(const T& x) { 16 | MutexLockGuard lock(mutex_); 17 | queue_.push(x); 18 | notEmpty_.Notify(); // Wait morphing saves us 19 | } 20 | 21 | T Pop() { 22 | MutexLockGuard lock(mutex_); 23 | // always use a while-loop, due to spurious wakeup 24 | while (queue_.empty()) { 25 | 
notEmpty_.Wait(); 26 | } 27 | assert(!queue_.empty()); 28 | T front(queue_.front()); 29 | queue_.pop(); 30 | return front; 31 | } 32 | 33 | size_t Size() const { 34 | MutexLockGuard lock(mutex_); 35 | return queue_.size(); 36 | } 37 | bool Empty() const { 38 | return Size() == 0; 39 | } 40 | 41 | private: 42 | mutable MutexLock mutex_; 43 | Condition notEmpty_; 44 | std::queue queue_; 45 | }; // class BlockingQueue 46 | 47 | } // namespace limonp 48 | 49 | #endif // LIMONP_BLOCKINGQUEUE_HPP 50 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/BoundedBlockingQueue.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 2 | #define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 3 | 4 | #include "BoundedQueue.hpp" 5 | 6 | namespace limonp { 7 | 8 | template 9 | class BoundedBlockingQueue : NonCopyable { 10 | public: 11 | explicit BoundedBlockingQueue(size_t maxSize) 12 | : mutex_(), 13 | notEmpty_(mutex_), 14 | notFull_(mutex_), 15 | queue_(maxSize) { 16 | } 17 | 18 | void Push(const T& x) { 19 | MutexLockGuard lock(mutex_); 20 | while (queue_.Full()) { 21 | notFull_.Wait(); 22 | } 23 | assert(!queue_.Full()); 24 | queue_.Push(x); 25 | notEmpty_.Notify(); 26 | } 27 | 28 | T Pop() { 29 | MutexLockGuard lock(mutex_); 30 | while (queue_.Empty()) { 31 | notEmpty_.Wait(); 32 | } 33 | assert(!queue_.Empty()); 34 | T res = queue_.Pop(); 35 | notFull_.Notify(); 36 | return res; 37 | } 38 | 39 | bool Empty() const { 40 | MutexLockGuard lock(mutex_); 41 | return queue_.Empty(); 42 | } 43 | 44 | bool Full() const { 45 | MutexLockGuard lock(mutex_); 46 | return queue_.Full(); 47 | } 48 | 49 | size_t size() const { 50 | MutexLockGuard lock(mutex_); 51 | return queue_.size(); 52 | } 53 | 54 | size_t capacity() const { 55 | return queue_.capacity(); 56 | } 57 | 58 | private: 59 | mutable MutexLock mutex_; 60 | Condition notEmpty_; 61 | Condition notFull_; 
62 | BoundedQueue queue_; 63 | }; // class BoundedBlockingQueue 64 | 65 | } // namespace limonp 66 | 67 | #endif // LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 68 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/BoundedQueue.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BOUNDED_QUEUE_HPP 2 | #define LIMONP_BOUNDED_QUEUE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace limonp { 9 | using namespace std; 10 | template 11 | class BoundedQueue { 12 | public: 13 | explicit BoundedQueue(size_t capacity): capacity_(capacity), circular_buffer_(capacity) { 14 | head_ = 0; 15 | tail_ = 0; 16 | size_ = 0; 17 | assert(capacity_); 18 | } 19 | ~BoundedQueue() { 20 | } 21 | 22 | void Clear() { 23 | head_ = 0; 24 | tail_ = 0; 25 | size_ = 0; 26 | } 27 | bool Empty() const { 28 | return !size_; 29 | } 30 | bool Full() const { 31 | return capacity_ == size_; 32 | } 33 | size_t Size() const { 34 | return size_; 35 | } 36 | size_t Capacity() const { 37 | return capacity_; 38 | } 39 | 40 | void Push(const T& t) { 41 | assert(!Full()); 42 | circular_buffer_[tail_] = t; 43 | tail_ = (tail_ + 1) % capacity_; 44 | size_ ++; 45 | } 46 | 47 | T Pop() { 48 | assert(!Empty()); 49 | size_t oldPos = head_; 50 | head_ = (head_ + 1) % capacity_; 51 | size_ --; 52 | return circular_buffer_[oldPos]; 53 | } 54 | 55 | private: 56 | size_t head_; 57 | size_t tail_; 58 | size_t size_; 59 | const size_t capacity_; 60 | vector circular_buffer_; 61 | 62 | }; // class BoundedQueue 63 | } // namespace limonp 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/Colors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_COLOR_PRINT_HPP 2 | #define LIMONP_COLOR_PRINT_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace limonp { 8 | 9 | using 
std::string; 10 | 11 | enum Color { 12 | BLACK = 30, 13 | RED, 14 | GREEN, 15 | YELLOW, 16 | BLUE, 17 | PURPLE 18 | }; // enum Color 19 | 20 | static void ColorPrintln(enum Color color, const char * fmt, ...) { 21 | va_list ap; 22 | printf("\033[0;%dm", color); 23 | va_start(ap, fmt); 24 | vprintf(fmt, ap); 25 | va_end(ap); 26 | printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly 27 | } 28 | 29 | } // namespace limonp 30 | 31 | #endif // LIMONP_COLOR_PRINT_HPP 32 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/Condition.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_CONDITION_HPP 2 | #define LIMONP_CONDITION_HPP 3 | 4 | #include "MutexLock.hpp" 5 | 6 | namespace limonp { 7 | 8 | class Condition : NonCopyable { 9 | public: 10 | explicit Condition(MutexLock& mutex) 11 | : mutex_(mutex) { 12 | XCHECK(!pthread_cond_init(&pcond_, NULL)); 13 | } 14 | 15 | ~Condition() { 16 | XCHECK(!pthread_cond_destroy(&pcond_)); 17 | } 18 | 19 | void Wait() { 20 | XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex())); 21 | } 22 | 23 | void Notify() { 24 | XCHECK(!pthread_cond_signal(&pcond_)); 25 | } 26 | 27 | void NotifyAll() { 28 | XCHECK(!pthread_cond_broadcast(&pcond_)); 29 | } 30 | 31 | private: 32 | MutexLock& mutex_; 33 | pthread_cond_t pcond_; 34 | }; // class Condition 35 | 36 | } // namespace limonp 37 | 38 | #endif // LIMONP_CONDITION_HPP 39 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/Config.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : utf8 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef LIMONP_CONFIG_H 6 | #define LIMONP_CONFIG_H 7 | 8 | #include 9 | #include 10 | #include 11 | 
#include 12 | #include "StringUtil.hpp" 13 | 14 | namespace limonp { 15 | 16 | using namespace std; 17 | 18 | class Config { 19 | public: 20 | explicit Config(const string& filePath) { 21 | LoadFile(filePath); 22 | } 23 | 24 | operator bool () { 25 | return !map_.empty(); 26 | } 27 | 28 | string Get(const string& key, const string& defaultvalue) const { 29 | map::const_iterator it = map_.find(key); 30 | if(map_.end() != it) { 31 | return it->second; 32 | } 33 | return defaultvalue; 34 | } 35 | int Get(const string& key, int defaultvalue) const { 36 | string str = Get(key, ""); 37 | if("" == str) { 38 | return defaultvalue; 39 | } 40 | return atoi(str.c_str()); 41 | } 42 | const char* operator [] (const char* key) const { 43 | if(NULL == key) { 44 | return NULL; 45 | } 46 | map::const_iterator it = map_.find(key); 47 | if(map_.end() != it) { 48 | return it->second.c_str(); 49 | } 50 | return NULL; 51 | } 52 | 53 | string GetConfigInfo() const { 54 | string res; 55 | res << *this; 56 | return res; 57 | } 58 | 59 | private: 60 | void LoadFile(const string& filePath) { 61 | ifstream ifs(filePath.c_str()); 62 | assert(ifs); 63 | string line; 64 | vector vecBuf; 65 | size_t lineno = 0; 66 | while(getline(ifs, line)) { 67 | lineno ++; 68 | Trim(line); 69 | if(line.empty() || StartsWith(line, "#")) { 70 | continue; 71 | } 72 | vecBuf.clear(); 73 | Split(line, vecBuf, "="); 74 | if(2 != vecBuf.size()) { 75 | fprintf(stderr, "line[%s] illegal.\n", line.c_str()); 76 | assert(false); 77 | continue; 78 | } 79 | string& key = vecBuf[0]; 80 | string& value = vecBuf[1]; 81 | Trim(key); 82 | Trim(value); 83 | if(!map_.insert(make_pair(key, value)).second) { 84 | fprintf(stderr, "key[%s] already exits.\n", key.c_str()); 85 | assert(false); 86 | continue; 87 | } 88 | } 89 | ifs.close(); 90 | } 91 | 92 | friend ostream& operator << (ostream& os, const Config& config); 93 | 94 | map map_; 95 | }; // class Config 96 | 97 | inline ostream& operator << (ostream& os, const Config& config) 
{ 98 | return os << config.map_; 99 | } 100 | 101 | } // namespace limonp 102 | 103 | #endif // LIMONP_CONFIG_H 104 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/FileLock.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_FILELOCK_HPP 2 | #define LIMONP_FILELOCK_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace limonp { 14 | 15 | using std::string; 16 | 17 | class FileLock { 18 | public: 19 | FileLock() : fd_(-1), ok_(true) { 20 | } 21 | ~FileLock() { 22 | if(fd_ > 0) { 23 | Close(); 24 | } 25 | } 26 | void Open(const string& fname) { 27 | assert(fd_ == -1); 28 | fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644); 29 | if(fd_ < 0) { 30 | ok_ = false; 31 | err_ = strerror(errno); 32 | } 33 | } 34 | void Close() { 35 | ::close(fd_); 36 | } 37 | void Lock() { 38 | if(LockOrUnlock(fd_, true) < 0) { 39 | ok_ = false; 40 | err_ = strerror(errno); 41 | } 42 | } 43 | void UnLock() { 44 | if(LockOrUnlock(fd_, false) < 0) { 45 | ok_ = false; 46 | err_ = strerror(errno); 47 | } 48 | } 49 | bool Ok() const { 50 | return ok_; 51 | } 52 | string Error() const { 53 | return err_; 54 | } 55 | private: 56 | static int LockOrUnlock(int fd, bool lock) { 57 | errno = 0; 58 | struct flock f; 59 | memset(&f, 0, sizeof(f)); 60 | f.l_type = (lock ? 
F_WRLCK : F_UNLCK); 61 | f.l_whence = SEEK_SET; 62 | f.l_start = 0; 63 | f.l_len = 0; // Lock/unlock entire file 64 | return fcntl(fd, F_SETLK, &f); 65 | } 66 | 67 | int fd_; 68 | bool ok_; 69 | string err_; 70 | }; // class FileLock 71 | 72 | }// namespace limonp 73 | 74 | #endif // LIMONP_FILELOCK_HPP 75 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/ForcePublic.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_FORCE_PUBLIC_H 2 | #define LIMONP_FORCE_PUBLIC_H 3 | 4 | #define private public 5 | #define protected public 6 | 7 | #endif // LIMONP_FORCE_PUBLIC_H 8 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/LocalVector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_LOCAL_VECTOR_HPP 2 | #define LIMONP_LOCAL_VECTOR_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace limonp { 10 | using namespace std; 11 | /* 12 | * LocalVector : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector may be dangerous.. 13 | * LocalVector is simple and not well-tested. 
14 | */ 15 | const size_t LOCAL_VECTOR_BUFFER_SIZE = 16; 16 | template 17 | class LocalVector { 18 | public: 19 | typedef const T* const_iterator ; 20 | typedef T value_type; 21 | typedef size_t size_type; 22 | private: 23 | T buffer_[LOCAL_VECTOR_BUFFER_SIZE]; 24 | T * ptr_; 25 | size_t size_; 26 | size_t capacity_; 27 | public: 28 | LocalVector() { 29 | init_(); 30 | }; 31 | LocalVector(const LocalVector& vec) { 32 | init_(); 33 | *this = vec; 34 | } 35 | LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster 36 | init_(); 37 | while(begin != end) { 38 | push_back(*begin++); 39 | } 40 | } 41 | LocalVector(size_t size, const T& t) { // TODO: make it faster 42 | init_(); 43 | while(size--) { 44 | push_back(t); 45 | } 46 | } 47 | ~LocalVector() { 48 | if(ptr_ != buffer_) { 49 | free(ptr_); 50 | } 51 | }; 52 | public: 53 | LocalVector& operator = (const LocalVector& vec) { 54 | clear(); 55 | size_ = vec.size(); 56 | capacity_ = vec.capacity(); 57 | if(vec.buffer_ == vec.ptr_) { 58 | memcpy(buffer_, vec.buffer_, sizeof(T) * size_); 59 | ptr_ = buffer_; 60 | } else { 61 | ptr_ = (T*) malloc(vec.capacity() * sizeof(T)); 62 | assert(ptr_); 63 | memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T)); 64 | } 65 | return *this; 66 | } 67 | private: 68 | void init_() { 69 | ptr_ = buffer_; 70 | size_ = 0; 71 | capacity_ = LOCAL_VECTOR_BUFFER_SIZE; 72 | } 73 | public: 74 | T& operator [] (size_t i) { 75 | return ptr_[i]; 76 | } 77 | const T& operator [] (size_t i) const { 78 | return ptr_[i]; 79 | } 80 | void push_back(const T& t) { 81 | if(size_ == capacity_) { 82 | assert(capacity_); 83 | reserve(capacity_ * 2); 84 | } 85 | ptr_[size_ ++ ] = t; 86 | } 87 | void reserve(size_t size) { 88 | if(size <= capacity_) { 89 | return; 90 | } 91 | T * next = (T*)malloc(sizeof(T) * size); 92 | assert(next); 93 | T * old = ptr_; 94 | ptr_ = next; 95 | memcpy(ptr_, old, sizeof(T) * capacity_); 96 | capacity_ = size; 97 | if(old != buffer_) { 98 | free(old); 99 | } 100 | 
} 101 | bool empty() const { 102 | return 0 == size(); 103 | } 104 | size_t size() const { 105 | return size_; 106 | } 107 | size_t capacity() const { 108 | return capacity_; 109 | } 110 | const_iterator begin() const { 111 | return ptr_; 112 | } 113 | const_iterator end() const { 114 | return ptr_ + size_; 115 | } 116 | void clear() { 117 | if(ptr_ != buffer_) { 118 | free(ptr_); 119 | } 120 | init_(); 121 | } 122 | }; 123 | 124 | template 125 | ostream & operator << (ostream& os, const LocalVector& vec) { 126 | if(vec.empty()) { 127 | return os << "[]"; 128 | } 129 | os<<"[\""< 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifdef XLOG 11 | #error "XLOG has been defined already" 12 | #endif // XLOG 13 | #ifdef XCHECK 14 | #error "XCHECK has been defined already" 15 | #endif // XCHECK 16 | 17 | #define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream() 18 | #define XCHECK(exp) if(!(exp)) XLOG(FATAL) << "exp: ["#exp << "] false. " 19 | 20 | namespace limonp { 21 | 22 | enum { 23 | LL_DEBUG = 0, 24 | LL_INFO = 1, 25 | LL_WARNING = 2, 26 | LL_ERROR = 3, 27 | LL_FATAL = 4, 28 | }; // enum 29 | 30 | static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"}; 31 | static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"; 32 | 33 | class Logger { 34 | public: 35 | Logger(size_t level, const char* filename, int lineno) 36 | : level_(level) { 37 | #ifdef LOGGING_LEVEL 38 | if (level_ < LOGGING_LEVEL) { 39 | return; 40 | } 41 | #endif 42 | assert(level_ <= sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY)); 43 | char buf[32]; 44 | time_t now; 45 | time(&now); 46 | strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&now)); 47 | stream_ << buf 48 | << " " << filename 49 | << ":" << lineno 50 | << " " << LOG_LEVEL_ARRAY[level_] 51 | << " "; 52 | } 53 | ~Logger() { 54 | #ifdef LOGGING_LEVEL 55 | if (level_ < LOGGING_LEVEL) { 56 | return; 57 | } 58 | #endif 59 | std::cerr << stream_.str() << std::endl; 60 | if 
(level_ == LL_FATAL) { 61 | abort(); 62 | } 63 | } 64 | 65 | std::ostream& Stream() { 66 | return stream_; 67 | } 68 | 69 | private: 70 | std::ostringstream stream_; 71 | size_t level_; 72 | }; // class Logger 73 | 74 | } // namespace limonp 75 | 76 | #endif // LIMONP_LOGGING_HPP 77 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/MutexLock.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_MUTEX_LOCK_HPP 2 | #define LIMONP_MUTEX_LOCK_HPP 3 | 4 | #include 5 | #include "NonCopyable.hpp" 6 | #include "Logging.hpp" 7 | 8 | namespace limonp { 9 | 10 | class MutexLock: NonCopyable { 11 | public: 12 | MutexLock() { 13 | XCHECK(!pthread_mutex_init(&mutex_, NULL)); 14 | } 15 | ~MutexLock() { 16 | XCHECK(!pthread_mutex_destroy(&mutex_)); 17 | } 18 | pthread_mutex_t* GetPthreadMutex() { 19 | return &mutex_; 20 | } 21 | 22 | private: 23 | void Lock() { 24 | XCHECK(!pthread_mutex_lock(&mutex_)); 25 | } 26 | void Unlock() { 27 | XCHECK(!pthread_mutex_unlock(&mutex_)); 28 | } 29 | friend class MutexLockGuard; 30 | 31 | pthread_mutex_t mutex_; 32 | }; // class MutexLock 33 | 34 | class MutexLockGuard: NonCopyable { 35 | public: 36 | explicit MutexLockGuard(MutexLock & mutex) 37 | : mutex_(mutex) { 38 | mutex_.Lock(); 39 | } 40 | ~MutexLockGuard() { 41 | mutex_.Unlock(); 42 | } 43 | private: 44 | MutexLock & mutex_; 45 | }; // class MutexLockGuard 46 | 47 | #define MutexLockGuard(x) XCHECK(false); 48 | 49 | } // namespace limonp 50 | 51 | #endif // LIMONP_MUTEX_LOCK_HPP 52 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/NonCopyable.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | ************************************/ 3 | #ifndef LIMONP_NONCOPYABLE_H 4 | #define LIMONP_NONCOPYABLE_H 5 | 6 | namespace limonp { 7 | 8 | 
class NonCopyable { 9 | protected: 10 | NonCopyable() { 11 | } 12 | ~NonCopyable() { 13 | } 14 | private: 15 | NonCopyable(const NonCopyable& ); 16 | const NonCopyable& operator=(const NonCopyable& ); 17 | }; // class NonCopyable 18 | 19 | } // namespace limonp 20 | 21 | #endif // LIMONP_NONCOPYABLE_H 22 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/StdExtension.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_STD_EXTEMSION_HPP 2 | #define LIMONP_STD_EXTEMSION_HPP 3 | 4 | #include 5 | 6 | #ifdef __APPLE__ 7 | #include 8 | #include 9 | #elif(__cplusplus >= 201103L) 10 | #include 11 | #include 12 | #elif defined _MSC_VER 13 | #include 14 | #include 15 | #else 16 | #include 17 | #include 18 | namespace std { 19 | using std::tr1::unordered_map; 20 | using std::tr1::unordered_set; 21 | } 22 | 23 | #endif 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | 33 | namespace std { 34 | 35 | template 36 | ostream& operator << (ostream& os, const vector& v) { 37 | if(v.empty()) { 38 | return os << "[]"; 39 | } 40 | os<<"["< 49 | inline ostream& operator << (ostream& os, const vector& v) { 50 | if(v.empty()) { 51 | return os << "[]"; 52 | } 53 | os<<"[\""< 62 | ostream& operator << (ostream& os, const deque& dq) { 63 | if(dq.empty()) { 64 | return os << "[]"; 65 | } 66 | os<<"[\""< 76 | ostream& operator << (ostream& os, const pair& pr) { 77 | os << pr.first << ":" << pr.second ; 78 | return os; 79 | } 80 | 81 | 82 | template 83 | string& operator << (string& str, const T& obj) { 84 | stringstream ss; 85 | ss << obj; // call ostream& operator << (ostream& os, 86 | return str = ss.str(); 87 | } 88 | 89 | template 90 | ostream& operator << (ostream& os, const map& mp) { 91 | if(mp.empty()) { 92 | os<<"{}"; 93 | return os; 94 | } 95 | os<<'{'; 96 | typename map::const_iterator it = mp.begin(); 97 | os<<*it; 98 | 
it++; 99 | while(it != mp.end()) { 100 | os<<", "<<*it; 101 | it++; 102 | } 103 | os<<'}'; 104 | return os; 105 | } 106 | template 107 | ostream& operator << (ostream& os, const std::unordered_map& mp) { 108 | if(mp.empty()) { 109 | return os << "{}"; 110 | } 111 | os<<'{'; 112 | typename std::unordered_map::const_iterator it = mp.begin(); 113 | os<<*it; 114 | it++; 115 | while(it != mp.end()) { 116 | os<<", "<<*it++; 117 | } 118 | return os<<'}'; 119 | } 120 | 121 | template 122 | ostream& operator << (ostream& os, const set& st) { 123 | if(st.empty()) { 124 | os << "{}"; 125 | return os; 126 | } 127 | os<<'{'; 128 | typename set::const_iterator it = st.begin(); 129 | os<<*it; 130 | it++; 131 | while(it != st.end()) { 132 | os<<", "<<*it; 133 | it++; 134 | } 135 | os<<'}'; 136 | return os; 137 | } 138 | 139 | template 140 | bool IsIn(const ContainType& contain, const KeyType& key) { 141 | return contain.end() != contain.find(key); 142 | } 143 | 144 | template 145 | basic_string & operator << (basic_string & s, ifstream & ifs) { 146 | return s.assign((istreambuf_iterator(ifs)), istreambuf_iterator()); 147 | } 148 | 149 | template 150 | ofstream & operator << (ofstream & ofs, const basic_string& s) { 151 | ostreambuf_iterator itr (ofs); 152 | copy(s.begin(), s.end(), itr); 153 | return ofs; 154 | } 155 | 156 | } // namespace std 157 | 158 | #endif 159 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/Thread.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_HPP 2 | #define LIMONP_THREAD_HPP 3 | 4 | #include "Logging.hpp" 5 | #include "NonCopyable.hpp" 6 | 7 | namespace limonp { 8 | 9 | class IThread: NonCopyable { 10 | public: 11 | IThread(): isStarted(false), isJoined(false) { 12 | } 13 | virtual ~IThread() { 14 | if(isStarted && !isJoined) { 15 | XCHECK(!pthread_detach(thread_)); 16 | } 17 | }; 18 | 19 | virtual void Run() = 0; 20 | void 
Start() { 21 | XCHECK(!isStarted); 22 | XCHECK(!pthread_create(&thread_, NULL, Worker, this)); 23 | isStarted = true; 24 | } 25 | void Join() { 26 | XCHECK(!isJoined); 27 | XCHECK(!pthread_join(thread_, NULL)); 28 | isJoined = true; 29 | } 30 | private: 31 | static void * Worker(void * data) { 32 | IThread * ptr = (IThread* ) data; 33 | ptr->Run(); 34 | return NULL; 35 | } 36 | 37 | pthread_t thread_; 38 | bool isStarted; 39 | bool isJoined; 40 | }; // class IThread 41 | 42 | } // namespace limonp 43 | 44 | #endif // LIMONP_THREAD_HPP 45 | -------------------------------------------------------------------------------- /online/include/cppjieba/limonp/ThreadPool.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_POOL_HPP 2 | #define LIMONP_THREAD_POOL_HPP 3 | 4 | #include "Thread.hpp" 5 | #include "BlockingQueue.hpp" 6 | #include "BoundedBlockingQueue.hpp" 7 | #include "Closure.hpp" 8 | 9 | namespace limonp { 10 | 11 | using namespace std; 12 | 13 | //class ThreadPool; 14 | class ThreadPool: NonCopyable { 15 | public: 16 | class Worker: public IThread { 17 | public: 18 | Worker(ThreadPool* pool): ptThreadPool_(pool) { 19 | assert(ptThreadPool_); 20 | } 21 | virtual ~Worker() { 22 | } 23 | 24 | virtual void Run() { 25 | while (true) { 26 | ClosureInterface* closure = ptThreadPool_->queue_.Pop(); 27 | if (closure == NULL) { 28 | break; 29 | } 30 | try { 31 | closure->Run(); 32 | } catch(std::exception& e) { 33 | XLOG(ERROR) << e.what(); 34 | } catch(...) 
{ 35 | XLOG(ERROR) << " unknown exception."; 36 | } 37 | delete closure; 38 | } 39 | } 40 | private: 41 | ThreadPool * ptThreadPool_; 42 | }; // class Worker 43 | 44 | ThreadPool(size_t thread_num) 45 | : threads_(thread_num), 46 | queue_(thread_num) { 47 | assert(thread_num); 48 | for(size_t i = 0; i < threads_.size(); i ++) { 49 | threads_[i] = new Worker(this); 50 | } 51 | } 52 | ~ThreadPool() { 53 | Stop(); 54 | } 55 | 56 | void Start() { 57 | for(size_t i = 0; i < threads_.size(); i++) { 58 | threads_[i]->Start(); 59 | } 60 | } 61 | void Stop() { 62 | for(size_t i = 0; i < threads_.size(); i ++) { 63 | queue_.Push(NULL); 64 | } 65 | for(size_t i = 0; i < threads_.size(); i ++) { 66 | threads_[i]->Join(); 67 | delete threads_[i]; 68 | } 69 | threads_.clear(); 70 | } 71 | 72 | void Add(ClosureInterface* task) { 73 | assert(task); 74 | queue_.Push(task); 75 | } 76 | 77 | private: 78 | friend class Worker; 79 | 80 | vector threads_; 81 | BoundedBlockingQueue queue_; 82 | }; // class ThreadPool 83 | 84 | } // namespace limonp 85 | 86 | #endif // LIMONP_THREAD_POOL_HPP 87 | -------------------------------------------------------------------------------- /online/log/tiny_se.log.02: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/log/tiny_se.log.02 -------------------------------------------------------------------------------- /online/src/Acceptor.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Acceptor.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-27 08:32:56 5 | **********************************************/ 6 | #include "../include/Acceptor.h" 7 | #include "../include/EventLoop.h" 8 | #include "../include/InetAddress.h" 9 | #include 10 | using namespace tinyse; 11 | 12 | int createNonblockSocket() { //创建非阻塞socket 
13 | int sockfd =::socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0); 14 | if(sockfd < 0) { 15 | perror("Acceptor: socket"); 16 | } 17 | 18 | return sockfd; 19 | } 20 | 21 | Acceptor::Acceptor(EventLoop *loop, const InetAddress &listenAddr, bool reuseport) 22 | : m_listenning(false) 23 | , m_loop(loop) 24 | , m_socket(createNonblockSocket()) 25 | , m_channel(loop, m_socket.fd()) { 26 | 27 | m_socket.setReuseAddr(true); 28 | m_socket.setReusePort(reuseport); 29 | m_socket.bind(listenAddr); 30 | m_channel.setReadCallback(std::bind(&Acceptor::handleRead, this)); 31 | } 32 | 33 | void Acceptor::listen() { 34 | m_loop->assertInLoopThread(); 35 | m_listenning = true; 36 | m_socket.listen(); 37 | m_channel.enableReading(); 38 | } 39 | 40 | void Acceptor::handleRead() { 41 | m_loop->assertInLoopThread(); 42 | InetAddress peerAddr; 43 | int connfd = m_socket.accept(peerAddr); 44 | 45 | if(connfd >= 0) { 46 | if(m_newConnectionCallback) { 47 | m_newConnectionCallback(connfd, peerAddr); 48 | } 49 | else { 50 | ::close(connfd); 51 | } 52 | } 53 | } 54 | 55 | -------------------------------------------------------------------------------- /online/src/Buffer.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Buffer.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-04 12:24:58 5 | **********************************************/ 6 | #include 7 | #include 8 | #include "../include/Buffer.h" 9 | using namespace tinyse; 10 | 11 | ssize_t Buffer::readFd(int fd) { 12 | char extrabuf[65536]; 13 | const size_t writable = writableBytes(); 14 | 15 | struct iovec vec[2]; 16 | vec[0].iov_base = begin() + m_writerIndex; 17 | vec[0].iov_len = writable; 18 | vec[1].iov_base = extrabuf; 19 | vec[1].iov_len = sizeof(extrabuf); 20 | 21 | const ssize_t n = readv(fd, vec, 2); //由于epoll采用水平触发, 所以若未读完也不会丢失数据 22 | if(n < 0) { 23 | perror("Buffer::readFd::readv"); 24 | } 25 | else 
if(static_cast(n) <= writable){ //buffer未装满 26 | m_writerIndex += n; 27 | } 28 | else { 29 | m_writerIndex = m_buffer.size(); 30 | append(extrabuf, n - writable); 31 | } 32 | return n; 33 | } 34 | -------------------------------------------------------------------------------- /online/src/Channel.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Channel.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-14 16:27:07 5 | **********************************************/ 6 | #include 7 | #include "../include/Channel.h" 8 | #include "../include/EventLoop.h" 9 | #include "../include/MyLogger.h" 10 | #include 11 | #include 12 | using namespace std; 13 | using namespace tinyse; 14 | 15 | const int Channel::kNoneEvent = 0; 16 | const int Channel::kReadEvent = POLLIN | POLLPRI; 17 | const int Channel::kWriteEvent = POLLOUT; 18 | 19 | Channel::~Channel() { 20 | assert(!m_eventHandling); 21 | } 22 | 23 | void Channel::update() { 24 | m_loop->updateChannel(this); 25 | } 26 | 27 | /* 28 | void Channel::remove() { 29 | assert(isNoneEvent()); 30 | } 31 | */ 32 | 33 | void Channel::handleEvent() { 34 | m_eventHandling = true; 35 | 36 | if(m_revents & POLLNVAL) { 37 | LogWarn("Channel::handleEvent() POLLNVAL"); 38 | } 39 | 40 | if((m_revents & POLLHUP) && !(m_revents & POLLIN)) { 41 | if(m_closeCallback) { 42 | m_closeCallback(); 43 | } 44 | } 45 | 46 | if(m_revents & (POLLERR | POLLNVAL)) { 47 | if(m_errorCallback) { 48 | m_errorCallback(); 49 | } 50 | } 51 | 52 | if(m_revents & (POLLIN | POLLPRI | POLLRDHUP)) { 53 | if(m_readCallback) { 54 | m_readCallback(); 55 | } 56 | } 57 | 58 | if(m_revents & POLLOUT) { 59 | if(m_writeCallback) { 60 | m_writeCallback(); 61 | } 62 | } 63 | 64 | m_eventHandling = false; 65 | } 66 | -------------------------------------------------------------------------------- /online/src/Configure.cc: 
-------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Configure.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-18 18:25:43 5 | **********************************************/ 6 | #include "../include/Configure.h" 7 | #include "../include/MyLogger.h" 8 | #include 9 | #include 10 | #include 11 | #include //for debug 12 | using namespace tinyse; 13 | using std::istringstream; using std::cout; using std::endl; 14 | 15 | 16 | Configure::Configure(const string &filepath) { 17 | defaultConfig(); //初始化默认配置 18 | 19 | ifstream configs(filepath); 20 | if(configs) { 21 | LogInfo("Load Configuration file: %s", filepath.c_str()); 22 | loadConfig(configs); //加载配置 23 | } 24 | else { //打开配置文件失败 25 | LogWarn("Couldn't open configure file: %s", filepath.c_str()); 26 | //使用默认配置 27 | } 28 | 29 | readStopWords(); 30 | } 31 | 32 | 33 | /* 配置文件中没有指定路径时, 使用默认路径 */ 34 | void Configure::defaultConfig() { 35 | m_configMap["pagelib"] = "../data/pagelib.dat"; //去重之后的网页库 36 | m_configMap["offsetlib"] = "../data/offsetlib.dat"; //去重之后的网页偏移库 37 | m_configMap["invertedindexlib"] = "../data/invertedindexlib.dat"; //倒排索引 38 | m_configMap["stopwords"] = "../data/stop_words.utf8"; //停用词 39 | } 40 | 41 | 42 | void Configure::loadConfig(ifstream &configs) { 43 | string line, item, path; 44 | while(getline(configs, line)) { 45 | istringstream record(line); 46 | record >> item >> path; 47 | if(m_configMap.find(item) != m_configMap.end()) { //使用find可以防止向map中添加新项 48 | if(0 == path.size()) { //简单检查一下是否是空路径 49 | LogWarn("Empty Path of %s", item.c_str()); 50 | //直接使用默认配置 51 | } 52 | else { //非空路径(但并不保证路径合法) 53 | m_configMap[item] = path; 54 | } 55 | } 56 | else { //非法配置项 57 | LogWarn("Invalid configuration item: %s", item.c_str()); 58 | //忽略之 59 | } 60 | //防止污染下一次读取 61 | item.clear(); 62 | path.clear(); 63 | } 64 | } 65 | 66 | void Configure::readStopWords() { 67 | ifstream 
stopWords(getConfigMap()["stopwords"]); 68 | if(stopWords) { 69 | LogInfo("Read stop words"); 70 | string word; 71 | while(stopWords >> word) { 72 | m_stopWords.insert(word); 73 | } 74 | } 75 | else { 76 | LogError("Couldn't open stopwords file"); 77 | exit(-1); 78 | } 79 | } 80 | 81 | 82 | map Configure::getConfigMap() { 83 | return m_configMap; 84 | } 85 | 86 | set& Configure::getStopWords() { 87 | return m_stopWords; 88 | } 89 | 90 | void Configure::print() const { //for debug 91 | cout << endl << "Print config info:" << endl; 92 | for(auto &it : m_configMap) { 93 | cout << it.first << " -> " << it.second << endl; 94 | } 95 | cout << endl << "Print stop words:" << endl; 96 | for(auto &word : m_stopWords) { 97 | cout << word << " "; 98 | } 99 | cout << endl; 100 | } 101 | -------------------------------------------------------------------------------- /online/src/EventLoopThread.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: EventLoopThread.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-02 09:40:04 5 | **********************************************/ 6 | #include "../include/EventLoopThread.h" 7 | #include "../include/EventLoop.h" 8 | #include 9 | using namespace tinyse; 10 | 11 | EventLoopThread::EventLoopThread(const ThreadInitCallback &cb) 12 | : m_loop(nullptr) 13 | , m_mutex() 14 | , m_cond(m_mutex) 15 | , m_thread(std::bind(&EventLoopThread::threadFunc, this)) 16 | , m_exit(false) 17 | , m_cb(cb) { } 18 | 19 | EventLoopThread::~EventLoopThread() { 20 | m_exit = true; 21 | if(m_loop != nullptr) { 22 | m_loop->quit(); 23 | m_thread.join(); 24 | } 25 | } 26 | 27 | /* EventLoopThread启动自己的线程, 并在其中运行loop() */ 28 | EventLoop* EventLoopThread::startLoop() { 29 | assert(!m_thread.started()); 30 | m_thread.start(); 31 | 32 | { 33 | MutexLockGuard lock(m_mutex); 34 | while(m_loop == nullptr) { //防止虚假唤醒 35 | m_cond.wait(); //等待线程的创建与运行 36 | } 37 | } 38 | 39 | 
return m_loop; //返回新线程中的EventLoop对象 40 | } 41 | 42 | void EventLoopThread::threadFunc() { 43 | EventLoop loop; //在栈上定义EventLoop对象 44 | 45 | { 46 | MutexLockGuard lock(m_mutex); 47 | m_loop = &loop; 48 | m_cond.signal(); //唤醒startLoop() 49 | } 50 | 51 | loop.loop(); 52 | 53 | { 54 | MutexLockGuard lock(m_mutex); 55 | m_loop = nullptr; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /online/src/Makefile: -------------------------------------------------------------------------------- 1 | SRCS:=$(wildcard *.cc) 2 | OBJS:=$(patsubst %.cc,%.o,$(SRCS)) 3 | ELF:=a.out 4 | CC:=g++ 5 | CXXFLAGS:=-std=c++11 -g -Wall 6 | $(ELF):$(OBJS) 7 | g++ $^ -o $@ -lpthread -llog4cpp -ljson $(CXXFLAGS) 8 | .PHONY:clean 9 | clean: 10 | rm -rf $(ELF) $(OBJS) 11 | -------------------------------------------------------------------------------- /online/src/MyLogger.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../include/MyLogger.h" 3 | #include 4 | using namespace log4cpp; 5 | 6 | /* 初始化静态成员 */ 7 | MyLogger *MyLogger::m_pInstance = nullptr; 8 | pthread_once_t MyLogger::m_once_control = PTHREAD_ONCE_INIT; 9 | 10 | MyLogger::MyLogger() : m_logger(Category::getRoot().getInstance("logger")) { 11 | try { 12 | PropertyConfigurator::configure("../conf/log4cpp.conf"); 13 | } 14 | catch(ConfigureFailure &err) { 15 | std::cerr << "Configure failure: " << err.what() << std::endl; 16 | exit(1); 17 | } 18 | } 19 | 20 | MyLogger::~MyLogger() { 21 | Category::shutdown(); 22 | } 23 | 24 | void MyLogger::error(const char *msg) { 25 | m_logger.error(msg); 26 | } 27 | 28 | void MyLogger::warn(const char *msg) { 29 | m_logger.warn(msg); 30 | } 31 | 32 | void MyLogger::info(const char *msg) { 33 | m_logger.info(msg); 34 | } 35 | 36 | void MyLogger::debug(const char *msg) { 37 | m_logger.debug(msg); 38 | } 39 | 40 | -------------------------------------------------------------------------------- 
/online/src/Page.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Page.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-06 15:20:40 5 | **********************************************/ 6 | #include "../include/Page.h" 7 | #include "../include/Configure.h" 8 | #include "../include/WordSegmentation.h" 9 | #include 10 | #include 11 | using namespace tinyse; 12 | using std::priority_queue; using std::pair; 13 | using std::istringstream; 14 | 15 | Page::Page(const string &doc, Configure &conf, WordSegmentation &jieba) { 16 | parseDoc(doc, conf, jieba); 17 | } 18 | 19 | void Page::parseDoc(const string &doc, //包含标记的完整结点 20 | Configure &conf, //获取配置信息 21 | WordSegmentation &jieba) { //jieba分词器 22 | int beg = 0, end = 0; 23 | size_t len = 0; 24 | 25 | //获取docid 26 | beg = doc.find(""); 27 | end = doc.find(""); 28 | len = strlen(""); 29 | string docid = doc.substr(beg+len, end-beg-len); 30 | m_docid = atoi(docid.c_str()); 31 | 32 | //获取title 33 | beg = doc.find(""); 34 | end = doc.find(""); 35 | len = strlen(""); 36 | m_title = doc.substr(beg+len, end-beg-len); 37 | 38 | //获取link 39 | beg = doc.find("<link>"); 40 | end = doc.find("</link>"); 41 | len = strlen("<link>"); 42 | m_link = doc.substr(beg+len, end-beg-len); 43 | 44 | //获取content 45 | beg = doc.find("<content>"); 46 | end = doc.find("</content>"); 47 | len = strlen("<content>"); 48 | m_content = doc.substr(beg+len, end-beg-len); 49 | 50 | //分词 51 | vector<string> wordVec = jieba(m_content); 52 | set<string> &stopWords = conf.getStopWords(); 53 | 54 | //计算topK 55 | calcTopK(wordVec, topK, stopWords); 56 | } 57 | 58 | class WordFreqCompare { 59 | public: 60 | bool operator()(const pair<string, int> &lhs, const pair<string, int> &rhs) { 61 | if(lhs.second < rhs.second) { 62 | return true; 63 | } 64 | else if(lhs.second == rhs.second && lhs.first < rhs.first){ 65 | return true; 66 | } 67 | else { 68 | return 
false; 69 | } 70 | } 71 | }; 72 | /* Count every non-stop word of wordVec into m_wordsMap, then append words to m_topWords, most frequent first (order defined by WordFreqCompare), until m_topWords holds topK entries or the heap is empty. */ 73 | void Page::calcTopK(vector<string> &wordVec, size_t topK, set<string> &stopWords) { 74 | //drop stop words & accumulate word frequencies 75 | for(const auto &it : wordVec) { //bind by reference: no per-word string copy 76 | if(stopWords.find(it) == stopWords.end()) { 77 | ++m_wordsMap[it]; //not a stop word: bump its frequency 78 | } 79 | } 80 | 81 | //select the topK words 82 | priority_queue<pair<string, int>, vector<pair<string, int>>, WordFreqCompare> heap(m_wordsMap.begin(), m_wordsMap.end()); 83 | string topWord; 84 | while(!heap.empty() && m_topWords.size() < topK) { //pop the K most frequent words 85 | topWord = heap.top().first; 86 | heap.pop(); 87 | m_topWords.push_back(topWord); 88 | } 89 | } 90 | 91 | 92 | /* Build the summary for a query: the first (at most three) lines of m_content that contain at least one of queryWords, concatenated in document order with no separator. Returns an empty string when nothing matches. */ 93 | string Page::summary(const vector<string> &queryWords) { 94 | vector<string> summaryVec; 95 | istringstream is(m_content); 96 | string line; 97 | while(summaryVec.size() < 3 && std::getline(is, line)) { //stop scanning once three lines are collected 98 | for(auto &word : queryWords) { 99 | if(line.find(word) != string::npos) { 100 | summaryVec.push_back(line); 101 | break; //take each line once, even when several query words occur in it 102 | } 103 | } 104 | } 105 | string summary; 106 | for(auto &line : summaryVec) { 107 | summary.append(line); 108 | } 109 | 110 | return summary; 111 | } 112 | 113 | 114 | bool operator<(const Page &lhs, const Page &rhs) { 115 | return lhs.docid() < rhs.docid(); 116 | } 117 | -------------------------------------------------------------------------------- /online/src/TcpServer.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: TcpServer.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-03 17:57:35 5 | **********************************************/ 6 | #include "../include/TcpServer.h" 7 | #include "../include/EventLoop.h" 8 | #include "../include/Acceptor.h" 9 | #include "../include/MyLogger.h" 10 | #include "../include/SocketsOps.h" 11 | #include "../include/TcpConnection.h" 12 | using namespace tinyse; 13 | using namespace std::placeholders; 14 | 15 | TcpServer::TcpServer(EventLoop *loop, const InetAddress 
&listenAddr, const string &name, bool option) 16 | : m_loop(loop) 17 | , m_name(name) 18 | , m_acceptor(new Acceptor(loop, listenAddr, option)) { 19 | 20 | m_acceptor->setNewConnectionCallback(std::bind(&TcpServer::newConnection, this, _1, _2)); 21 | } 22 | 23 | TcpServer::~TcpServer() { 24 | 25 | } 26 | 27 | void TcpServer::start() { 28 | if(!m_started) { 29 | m_started = true; 30 | } 31 | 32 | if(!m_acceptor->listenning()) { 33 | m_loop->runInLoop(bind(&Acceptor::listen, m_acceptor.get())); 34 | } 35 | } 36 | 37 | /* 新连接到达时, acceptor会回调此函数 */ 38 | void TcpServer::newConnection(int sockfd, const InetAddress &peerAddr) { 39 | m_loop->assertInLoopThread(); 40 | char buf[32]; 41 | snprintf(buf, sizeof(buf), "#%d", m_nextConnectionID++); 42 | string connName = m_name + buf; //每个TcpConnection对象一个唯一的名字 43 | 44 | LogInfo("new connection [%s] from %s:%d", connName.c_str(), peerAddr.ip().c_str(), peerAddr.port()); 45 | InetAddress localAddr(sockets::getLocalAddr(sockfd)); 46 | TcpConnectionPtr conn(make_shared<TcpConnection>(m_loop, connName, sockfd, localAddr, peerAddr)); 47 | m_connections[connName] = conn; 48 | conn->setConnectionCallback(m_connectionCallback); 49 | conn->setMessageCallback(m_messageCallback); 50 | conn->setCloseCallback(bind(&TcpServer::removeConnection, this, _1)); 51 | conn->connectEstablished(); 52 | } 53 | 54 | void TcpServer::removeConnection(const TcpConnectionPtr &conn) { 55 | m_loop->assertInLoopThread(); 56 | LogInfo("[%s] - connection %s", m_name.c_str(), conn->name().c_str()); 57 | size_t n = m_connections.erase(conn->name()); 58 | assert(n == 1); (void)n; 59 | m_loop->queueInLoop(bind(&TcpConnection::connectDestroyed, conn)); 60 | } 61 | -------------------------------------------------------------------------------- /online/src/Thread.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: Thread.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 
2019-06-10 20:20:26 5 | **********************************************/ 6 | #include "../include/Thread.h" 7 | #include <cstdio> 8 | #include <unistd.h> 9 | #include <syscall.h> 10 | using namespace tinyse; 11 | 12 | namespace tinyse { 13 | /* A thread that was started and not yet joined (m_started still true) is detached on destruction. */ 14 | Thread::~Thread() { 15 | if(m_started) { 16 | if(pthread_detach(m_pthid)) { 17 | perror("Thread.cc: pthread_detach"); 18 | } 19 | } 20 | } 21 | /* Create the pthread running threadFunc(this); m_started is set only when creation succeeds. */ 22 | void Thread::start() { 23 | if(pthread_create(&m_pthid, nullptr, threadFunc, this)) { 24 | perror("Thread.cc: pthread_create"); 25 | return; 26 | } 27 | m_started = true; 28 | } 29 | /* Join the thread if it was started; on success m_started is cleared so the destructor will not detach it. */ 30 | void Thread::join() { 31 | if(m_started) { 32 | if(pthread_join(m_pthid, nullptr)) { 33 | perror("Thread: pthread_join"); 34 | return; 35 | } 36 | m_started = false; 37 | } 38 | } 39 | /* pthread entry point. obj is the Thread handed to pthread_create() by start(). Records the thread id, runs m_func, and returns the Thread pointer; returns nullptr without touching anything when obj is null. */ 40 | void* Thread::threadFunc(void *obj) { 41 | Thread *thd = static_cast<Thread*>(obj); 42 | if(nullptr == thd) { //the null check has to precede the m_tid write below 43 | return nullptr; 44 | } 45 | thd->m_tid = pthread_self(); //record the id of the running thread 46 | thd->m_func(); 47 | return thd; 48 | } 49 | 50 | } //end of namespace tinyse 51 | -------------------------------------------------------------------------------- /online/src/ThreadPool.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: ThreadPool.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-05 16:41:40 5 | **********************************************/ 6 | #include <iostream> 7 | #include <cassert> 8 | #include "../include/ThreadPool.h" 9 | #include "../include/Thread.h" 10 | #include "../include/MyLogger.h" 11 | using namespace tinyse; 12 | using std::cout; using std::endl; 13 | using std::bind; 14 | 15 | ThreadPool::ThreadPool(const string &name) 16 | : m_mutex() 17 | , m_cond(m_mutex) 18 | , m_name(name) { 19 | 20 | LogInfo("create new thread pool: %s", name.c_str()); 21 | } 22 | 23 | ThreadPool::~ThreadPool() { 24 | if(m_running) { 25 | stop(); 26 | } 27 | } 28 | 29 | void ThreadPool::start(int threadNums) { 
30 | assert(m_threads.empty()); 31 | m_running = true; 32 | m_threads.reserve(threadNums); //if threadNums<0, reserve will throw 33 | //create the pool's worker threads 34 | for(int idx = 0; idx < threadNums; ++idx) { 35 | char id[32] = {0}; 36 | snprintf(id, sizeof(id), "#%d", idx+1); 37 | Thread *thread = new Thread(bind(&ThreadPool::threadFunc, this), m_name + id); 38 | m_threads.push_back(thread); 39 | LogInfo("create new thread: %s", thread->name().c_str()); 40 | thread->start(); 41 | } 42 | } 43 | /* Stop the pool: clear m_running, wake every worker blocked in getTask(), then join and delete all worker threads. Tasks still queued at that point are not run. */ 44 | void ThreadPool::stop() { 45 | if(m_running) { 46 | { MutexLockGuard lock(m_mutex); m_running = false; } //flip the flag under the mutex: a worker is then either before its predicate check (and sees false) or already inside wait(), so the broadcast below cannot be missed 47 | m_cond.broadcast(); 48 | //reclaim the worker threads 49 | for(auto it : m_threads) { 50 | it->join(); 51 | delete it; 52 | } 53 | m_threads.clear(); 54 | } 55 | LogInfo("stop thread pool!"); 56 | } 57 | /* Queue a task for the workers and wake one of them; with no worker threads the task runs synchronously in the caller. */ 58 | void ThreadPool::addTask(const Task &task) { 59 | if(m_threads.empty()) { //no threads in the pool: run directly in the calling thread 60 | task(); 61 | } 62 | else { 63 | MutexLockGuard lock(m_mutex); 64 | m_taskQue.push(task); 65 | m_cond.signal(); 66 | } 67 | } 68 | /* Block until a task is queued or the pool stops. Returns the front task, or an empty Task when the queue is empty. */ 69 | ThreadPool::Task ThreadPool::getTask() { 70 | MutexLockGuard lock(m_mutex); 71 | while(m_taskQue.empty() && m_running) { 72 | m_cond.wait(); 73 | } 74 | 75 | Task task; 76 | if(!m_taskQue.empty()) { 77 | task = m_taskQue.front(); 78 | m_taskQue.pop(); 79 | } 80 | return task; 81 | } 82 | /* Worker loop: fetch and run tasks while the pool is running. NOTE(review): m_running is read here without the mutex - confirm its declared type makes that safe. */ 83 | void ThreadPool::threadFunc() { 84 | while(m_running) { 85 | Task task = getTask(); 86 | if(task) { 87 | task(); 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /online/src/Timestamp.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: TimeStamp.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-15 21:03:38 5 | **********************************************/ 6 | #include "../include/Timestamp.h" 7 | #include <cstdio> 8 | #include <sys/time.h> 9 | using namespace tinyse; 10 | 11 | Timestamp::Timestamp() : m_microSecondsSinceEpoch(0) { } 12 | 13 | 
Timestamp::Timestamp(int64_t microSecondsSinceEpoch) 14 | : m_microSecondsSinceEpoch(microSecondsSinceEpoch) { } 15 | /* Format the timestamp as "<seconds>.<microseconds, zero-padded to 6 digits>". */ 16 | std::string Timestamp::toString() const { 17 | char buf[32] = {0}; 18 | int64_t seconds = m_microSecondsSinceEpoch / kMicroSecondsPerSecond; 19 | int64_t microseconds = m_microSecondsSinceEpoch % kMicroSecondsPerSecond; 20 | snprintf(buf, sizeof(buf)-1, "%lld.%06lld", static_cast<long long>(seconds), static_cast<long long>(microseconds)); //int64_t is not `long` on every ABI, so print through long long 21 | return buf; 22 | } 23 | /* Current wall-clock time from gettimeofday(), in microseconds since the epoch. */ 24 | Timestamp Timestamp::now() { 25 | struct timeval tv; 26 | gettimeofday(&tv, nullptr); //fills tv with seconds + microseconds since the epoch 27 | return Timestamp(static_cast<int64_t>(tv.tv_sec) * kMicroSecondsPerSecond + tv.tv_usec); //widen before multiplying so a narrower time_t/constant cannot overflow 28 | } 29 | 30 | -------------------------------------------------------------------------------- /online/src/TinySearchEngine.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: TinySearchEngine.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-07 17:52:23 5 | **********************************************/ 6 | #include "../include/InetAddress.h" 7 | #include "../include/TcpConnection.h" 8 | #include "../include/EventLoop.h" 9 | #include "../include/TcpServer.h" 10 | #include "../include/ThreadPool.h" 11 | #include "../include/WordQuery.h" 12 | #include "../include/Configure.h" 13 | #include "../include/Callbacks.h" 14 | #include <iostream> 15 | using namespace tinyse; 16 | using namespace::placeholders; 17 | 18 | class TinySearchEngine : Uncopyable { 19 | public: 20 | TinySearchEngine(const string &confiureFile, EventLoop &loop, InetAddress &listenAddr); 21 | void start(); 22 | 23 | private: 24 | void onConnection(const TcpConnectionPtr &conn); 25 | void onMessage(const TcpConnectionPtr &conn, Buffer *buf); 26 | void doTask(const TcpConnectionPtr &conn, const string &msg); 27 | //NOTE(review): members are destroyed in reverse declaration order, so m_wordQuery and m_conf are gone before m_pool joins its workers - confirm no queued doTask can still be running at shutdown 28 | private: 29 | TcpServer m_server; 30 | ThreadPool m_pool; 31 | Configure m_conf; 32 | WordQuery m_wordQuery; 33 | }; 34 | 35 | 
TinySearchEngine::TinySearchEngine(const string &confiureFile, EventLoop &loop, InetAddress &listenAddr) 36 | : m_server(&loop, listenAddr, "tinyse_server") 37 | , m_pool("worker") 38 | , m_conf(confiureFile) 39 | , m_wordQuery(m_conf) { 40 | 41 | LogInfo("new tinyse_server"); 42 | m_server.setConnectionCallback(bind(&TinySearchEngine::onConnection, this, _1)); 43 | m_server.setMessageCallback(bind(&TinySearchEngine::onMessage, this, _1, _2)); 44 | } 45 | 46 | /* 启动服务器 */ 47 | void TinySearchEngine::start() { 48 | m_pool.start(8); 49 | m_server.start(); 50 | } 51 | 52 | void TinySearchEngine::onConnection(const TcpConnectionPtr &conn) { 53 | //do nothing 54 | (void)conn; //防止warn 55 | } 56 | 57 | void TinySearchEngine::onMessage(const TcpConnectionPtr &conn, Buffer *buf) { 58 | string msg(buf->retrieveAsString()); 59 | size_t pos = msg.find('\n'); 60 | msg = msg.substr(0, pos); 61 | //cout << msg << endl; 62 | LogInfo("%s", msg.c_str()); 63 | m_pool.addTask(bind(&TinySearchEngine::doTask, this, conn, msg)); 64 | } 65 | 66 | void TinySearchEngine::doTask(const TcpConnectionPtr &conn, const string &msg) { 67 | string ret = m_wordQuery.query(msg); 68 | 69 | int sz = ret.size(); 70 | string message(to_string(sz)); 71 | message.append("\n").append(ret); 72 | 73 | conn->send(message); 74 | } 75 | 76 | int main() { 77 | InetAddress listenAddr(8888); 78 | EventLoop loop; 79 | TinySearchEngine tinyse_server("../conf/online.conf", loop, listenAddr); 80 | 81 | tinyse_server.start(); 82 | loop.loop(); 83 | 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /online/src/test_file/test1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test1 -------------------------------------------------------------------------------- /online/src/test_file/test1.cpp: 
-------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test1.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-28 18:21:03 5 | **********************************************/ 6 | #include <iostream> 7 | #include <unistd.h> 8 | #include <sys/types.h> 9 | #include "../include/EventLoop.h" 10 | #include "../include/Thread.h" 11 | using namespace std; 12 | using namespace tinyse; 13 | 14 | void threadFunc() { 15 | printf("threadFunc(): pid = %d, tid = %ld\n", getpid(), pthread_self()); 16 | EventLoop loop; 17 | loop.loop(); 18 | } 19 | 20 | int main() { 21 | printf("main(): pid = %d, tid = %ld\n", getpid(), pthread_self()); 22 | 23 | EventLoop loop; 24 | 25 | Thread thread(threadFunc); 26 | thread.start(); 27 | 28 | loop.loop(); 29 | pthread_exit(NULL); 30 | 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /online/src/test_file/test10: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test10 -------------------------------------------------------------------------------- /online/src/test_file/test10.cpp: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test8.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-04 08:19:02 5 | **********************************************/ 6 | #include <iostream> 7 | #include "../include/TcpConnection.h" 8 | #include "../include/TcpServer.h" 9 | #include "../include/EventLoop.h" 10 | #include "../include/Buffer.h" 11 | #include <cstdio> 12 | #include <unistd.h> 13 | using namespace std; 14 | using namespace tinyse; 15 | std::string message1; 16 | std::string message2; 17 | 18 | void onConnection(const TcpConnectionPtr& conn) 19 | { 
20 | if (conn->connected()) 21 | { 22 | sleep(3); 23 | conn->send(message1); 24 | conn->send(message2); 25 | conn->shutdown(); 26 | } 27 | else 28 | { 29 | printf("onConnection(): connection [%s] is down\n", 30 | conn->name().c_str()); 31 | } 32 | } 33 | 34 | void onMessage(const TcpConnectionPtr& conn, Buffer* buf) 35 | { 36 | printf("onMessage(): received %zd bytes from connection [%s]\n", 37 | buf->readableBytes(), 38 | conn->name().c_str()); 39 | 40 | buf->retrieveAll(); 41 | } 42 | 43 | int main(int argc, char* argv[]) 44 | { 45 | printf("main(): pid = %d\n", getpid()); 46 | 47 | int len1 = 500; 48 | int len2 = 300; 49 | 50 | if (argc > 2) 51 | { 52 | len1 = atoi(argv[1]); 53 | len2 = atoi(argv[2]); 54 | } 55 | 56 | message1.resize(len1); 57 | message2.resize(len2); 58 | std::fill(message1.begin(), message1.end(), 'A'); 59 | std::fill(message2.begin(), message2.end(), 'B'); 60 | 61 | InetAddress listenAddr(8888); 62 | EventLoop loop; 63 | 64 | TcpServer server(&loop, listenAddr, "server"); 65 | server.setConnectionCallback(onConnection); 66 | server.setMessageCallback(onMessage); 67 | server.start(); 68 | 69 | loop.loop(); 70 | } 71 | 72 | -------------------------------------------------------------------------------- /online/src/test_file/test11: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test11 -------------------------------------------------------------------------------- /online/src/test_file/test11.cpp: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test11.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-05 21:39:42 5 | **********************************************/ 6 | #include <iostream> 7 | #include <unistd.h> 8 | #include "../include/ThreadPool.h" 9 | using namespace std; 10 | using 
namespace tinyse; 11 | 12 | void threadFunc() { 13 | cout << "hello, world!" << endl; 14 | } 15 | 16 | int main() { 17 | ThreadPool pool("worker"); 18 | pool.start(10); 19 | pool.addTask(threadFunc); 20 | 21 | return 0; 22 | } 23 | -------------------------------------------------------------------------------- /online/src/test_file/test2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test2 -------------------------------------------------------------------------------- /online/src/test_file/test2.cpp: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test1.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-28 18:21:03 5 | **********************************************/ 6 | #include <iostream> 7 | #include <unistd.h> 8 | #include <sys/types.h> 9 | #include "../include/EventLoop.h" 10 | #include "../include/Thread.h" 11 | using namespace std; 12 | using namespace tinyse; 13 | 14 | EventLoop *g_loop; 15 | 16 | void threadFunc() { 17 | printf("threadFunc(): pid = %d, tid = %ld\n", getpid(), pthread_self()); 18 | g_loop->loop(); 19 | } 20 | 21 | int main() { 22 | printf("main(): pid = %d, tid = %ld\n", getpid(), pthread_self()); 23 | 24 | EventLoop loop; 25 | g_loop = &loop; 26 | 27 | Thread thread(threadFunc); 28 | thread.start(); 29 | thread.join(); 30 | 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /online/src/test_file/test3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test3 -------------------------------------------------------------------------------- /online/src/test_file/test3.cpp: 
-------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test3.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-01 22:40:04 5 | **********************************************/ 6 | #include <iostream> 7 | #include <sys/timerfd.h> 8 | #include <cstring> 9 | #include "../include/EventLoop.h" 10 | using namespace std; 11 | using namespace tinyse; 12 | 13 | EventLoop *g_loop; 14 | void timeout() { 15 | cout << "main: Timeout!" << endl; 16 | g_loop->quit(); 17 | } 18 | 19 | int main() { 20 | EventLoop loop; 21 | g_loop = &loop; 22 | 23 | int timerfd = ::timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); 24 | Channel channel(&loop, timerfd); 25 | channel.setReadCallback(timeout); 26 | channel.enableReading(); 27 | 28 | struct itimerspec howlong; 29 | bzero(&howlong, sizeof(howlong)); 30 | howlong.it_value.tv_sec = 5; 31 | ::timerfd_settime(timerfd, 0, &howlong, nullptr); 32 | 33 | loop.loop(); 34 | 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /online/src/test_file/test4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test4 -------------------------------------------------------------------------------- /online/src/test_file/test4.cpp: -------------------------------------------------------------------------------- 1 | //copied from muduo/net/tests/TimerQueue_unittest.cc 2 | #include "../include/EventLoop.h" 3 | #include <functional> 4 | #include <stdio.h> 5 | #include <pthread.h> 6 | using namespace tinyse; 7 | 8 | int cnt = 0; 9 | EventLoop* g_loop; 10 | 11 | void printTid() { 12 | printf("pid = %d, tid = %ld\n", getpid(), pthread_self()); 13 | printf("now %s\n", Timestamp::now().toString().c_str()); 14 | } 15 | 16 | void print(const char* msg) { 17 | 
printf("msg %s %s\n", Timestamp::now().toString().c_str(), msg); 18 | if (++cnt == 10) { 19 | g_loop->quit(); 20 | } 21 | } 22 | 23 | int main() { 24 | printTid(); 25 | EventLoop loop; 26 | g_loop = &loop; 27 | 28 | print("main"); 29 | loop.runAfter(1, std::bind(print, "once1")); 30 | loop.runAfter(1.5, std::bind(print, "once1.5")); 31 | loop.runAfter(2.5, std::bind(print, "once2.5")); 32 | loop.runAfter(3.5, std::bind(print, "once3.5")); 33 | loop.runEvery(2, std::bind(print, "every2")); 34 | loop.runEvery(3, std::bind(print, "every3")); 35 | 36 | loop.loop(); 37 | print("main loop exits"); 38 | sleep(1); 39 | } 40 | -------------------------------------------------------------------------------- /online/src/test_file/test5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test5 -------------------------------------------------------------------------------- /online/src/test_file/test5.cpp: -------------------------------------------------------------------------------- 1 | #include "../include/EventLoop.h" 2 | #include <stdio.h> 3 | using namespace tinyse; 4 | 5 | EventLoop* g_loop; 6 | int g_flag = 0; 7 | 8 | void run4() { 9 | printf("run4(): pid = %d, flag = %d\n", getpid(), g_flag); 10 | g_loop->quit(); 11 | } 12 | 13 | void run3() { 14 | printf("run3(): pid = %d, flag = %d\n", getpid(), g_flag); 15 | g_loop->runAfter(3, run4); 16 | g_flag = 3; 17 | } 18 | 19 | void run2() { 20 | printf("run2(): pid = %d, flag = %d\n", getpid(), g_flag); 21 | g_loop->queueInLoop(run3); 22 | } 23 | 24 | void run1() { 25 | g_flag = 1; 26 | printf("run1(): pid = %d, flag = %d\n", getpid(), g_flag); 27 | g_loop->runInLoop(run2); 28 | g_flag = 2; 29 | } 30 | 31 | int main() { 32 | printf("main(): pid = %d, flag = %d\n", getpid(), g_flag); 33 | 34 | EventLoop loop; 35 | g_loop = &loop; 36 | 37 | loop.runAfter(2, run1); 38 | 
loop.loop(); 39 | printf("main(): pid = %d, flag = %d\n", getpid(), g_flag); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /online/src/test_file/test6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test6 -------------------------------------------------------------------------------- /online/src/test_file/test6.cpp: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test6.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-02 10:35:09 5 | **********************************************/ 6 | #include "../include/EventLoop.h" 7 | #include "../include/EventLoopThread.h" 8 | #include <stdio.h> 9 | #include <iostream> 10 | using namespace tinyse; 11 | using namespace std; 12 | 13 | void runInThread() { 14 | printf("runInThread(): pid = %d, tid = %ld\n", getpid(), pthread_self()); 15 | } 16 | 17 | int main() { 18 | printf("main(): pid = %d, tid = %ld\n", getpid(), pthread_self()); 19 | 20 | EventLoopThread loopThread; 21 | EventLoop* loop = loopThread.startLoop(); 22 | loop->runInLoop(runInThread); 23 | sleep(1); 24 | 25 | loop->runAfter(2, runInThread); 26 | sleep(3); 27 | 28 | loop->quit(); 29 | printf("exit main().\n"); 30 | } 31 | 32 | -------------------------------------------------------------------------------- /online/src/test_file/test7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test7 -------------------------------------------------------------------------------- /online/src/test_file/test7.cpp: -------------------------------------------------------------------------------- 1 | 
/********************************************* 2 | * file: test7.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-02 15:10:09 5 | **********************************************/ 6 | #include <iostream> 7 | #include <cstdio> 8 | #include "../include/Acceptor.h" 9 | #include "../include/EventLoop.h" 10 | #include "../include/InetAddress.h" 11 | using namespace std; 12 | using namespace tinyse; 13 | 14 | void newConnection(int sockfd, const InetAddress &peerAddr) { 15 | printf("newConnection(): accepted a new connection from %s\n", peerAddr.ip().c_str()); 16 | ::write(sockfd, "How are you?\n", 13); 17 | ::close(sockfd); 18 | } 19 | 20 | void newConnection2(int sockfd, const InetAddress &peerAddr) { 21 | printf("newConnection(): accepted a new connection from %s\n", peerAddr.ip().c_str()); 22 | ::write(sockfd, "I am fine!\n", 11); 23 | ::close(sockfd); 24 | } 25 | 26 | int main() { 27 | printf("main(): pid = %d\n", getpid()); 28 | 29 | EventLoop loop; 30 | 31 | InetAddress listenAddr(8888); 32 | Acceptor acceptor(&loop, listenAddr); 33 | acceptor.setNewConnectionCallback(newConnection); 34 | acceptor.listen(); 35 | 36 | InetAddress listenAddr2(8889); 37 | Acceptor acceptor2(&loop, listenAddr2); 38 | acceptor2.setNewConnectionCallback(newConnection2); 39 | acceptor2.listen(); 40 | 41 | loop.loop(); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /online/src/test_file/test8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/src/test_file/test8 -------------------------------------------------------------------------------- /online/src/test_file/test8.cpp: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test8.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 
2019-07-04 08:19:02 5 | **********************************************/ 6 | #include <iostream> 7 | #include "../include/TcpConnection.h" 8 | #include "../include/TcpServer.h" 9 | #include "../include/EventLoop.h" 10 | #include "../include/Buffer.h" 11 | #include <cstdio> 12 | #include <unistd.h> 13 | using namespace std; 14 | using namespace tinyse; 15 | 16 | void onConnection(const TcpConnectionPtr &conn) { 17 | if(conn->connected()) { 18 | printf("onConnection(): new connection [%s] from %s:%d\n", conn->name().c_str(), conn->peerAddr().ip().c_str(), conn->peerAddr().port()); 19 | } 20 | else { 21 | printf("onConnection(): connection [%s] is down\n", conn->name().c_str()); 22 | } 23 | } 24 | 25 | void onMessage(const TcpConnectionPtr &conn, Buffer *data) { 26 | printf("onMessage(): received %lu bytes from connection [%s]\n", data->readableBytes(), conn->name().c_str()); 27 | printf("onMessage(): [%s]\n", data->retrieveAsString().c_str()); 28 | } 29 | 30 | 31 | int main() { 32 | printf("main(): pid = %d\n", getpid()); 33 | InetAddress listenAddr(8888); 34 | EventLoop loop; 35 | 36 | TcpServer server(&loop, listenAddr, "server"); 37 | cout << "1111111111111" << endl; 38 | server.setConnectionCallback(onConnection); 39 | cout << "2222222222222" << endl; 40 | server.setMessageCallback(onMessage); 41 | cout << "3333333333333" << endl; 42 | server.start(); 43 | cout << "4444444444444" << endl; 44 | 45 | loop.loop(); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /online/src/test_file/test9.cpp: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test8.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-07-04 08:19:02 5 | **********************************************/ 6 | #include <iostream> 7 | #include "../include/TcpConnection.h" 8 | #include "../include/TcpServer.h" 9 | #include "../include/EventLoop.h" 10 
| #include "../include/Buffer.h" 11 | #include <cstdio> 12 | #include <unistd.h> 13 | using namespace std; 14 | using namespace tinyse; 15 | 16 | void onConnection(const TcpConnectionPtr &conn) { 17 | if(conn->connected()) { 18 | printf("onConnection(): new connection [%s] from %s:%d\n", conn->name().c_str(), conn->peerAddr().ip().c_str(), conn->peerAddr().port()); 19 | } 20 | else { 21 | printf("onConnection(): connection [%s] is down\n", conn->name().c_str()); 22 | } 23 | } 24 | 25 | void onMessage(const TcpConnectionPtr &conn, Buffer *data) { 26 | printf("onMessage(): received %lu bytes from connection [%s]\n", data->readableBytes(), conn->name().c_str()); 27 | conn->send(data->retrieveAsString()); 28 | } 29 | 30 | 31 | int main() { 32 | printf("main(): pid = %d\n", getpid()); 33 | InetAddress listenAddr(8888); 34 | EventLoop loop; 35 | 36 | TcpServer server(&loop, listenAddr, "server"); 37 | server.setConnectionCallback(onConnection); 38 | server.setMessageCallback(onMessage); 39 | server.start(); 40 | 41 | loop.loop(); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /online/test/test.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-09 12:36:36 5 | **********************************************/ 6 | #include <iostream> 7 | #include "../include/MutexLock.h" 8 | #include "../include/Condition.h" 9 | using namespace std; 10 | using namespace tinyse; 11 | 12 | 13 | MutexLock mutex; 14 | Condition cond(mutex); 15 | bool flag = false; 16 | int g_int = 1; 17 | 18 | void* threadFunc(void *) { 19 | cout << "start" << endl; 20 | while(false == flag) { 21 | MutexLockGuard lock(mutex); 22 | cond.wait(); 23 | } 24 | cout << g_int++ << endl; 25 | 26 | return nullptr; 27 | } 28 | 29 | int main() { 30 | pthread_t tid1; 31 | pthread_t tid2; 32 | 
pthread_create(&tid1, nullptr, threadFunc, nullptr); 33 | pthread_create(&tid2, nullptr, threadFunc, nullptr); 34 | 35 | { 36 | MutexLockGuard lock(mutex); 37 | flag = true; 38 | cond.signal(); 39 | } 40 | cout << "--------------" << endl; 41 | { 42 | MutexLockGuard lock(mutex); 43 | flag = true; 44 | cond.signal(); 45 | } 46 | 47 | pthread_join(tid1, nullptr); 48 | pthread_join(tid2, nullptr); 49 | 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /online/test/test1.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test1.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-14 23:58:56 5 | **********************************************/ 6 | #include <iostream> 7 | #include "EventLoop.h" 8 | #include <sys/types.h> 9 | #include <unistd.h> 10 | #include "../include/Thread.h" 11 | using namespace std; 12 | 13 | void threadFunc() { 14 | cout << "threadFunc(): pid = " << getpid() 15 | << ", tid = " << pthread_self() << endl; 16 | tinyse::net::EventLoop loop; 17 | loop.loop(); 18 | } 19 | 20 | int main() { 21 | cout << "threadFunc(): pid = " << getpid() 22 | << ", tid = " << pthread_self() << endl; 23 | tinyse::net::EventLoop loop; 24 | tinyse::Thread thread(threadFunc); 25 | thread.start(); 26 | 27 | loop.loop(); 28 | pthread_exit(NULL); 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /online/test/test2.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test1.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-14 23:58:56 5 | **********************************************/ 6 | #include <iostream> 7 | #include "EventLoop.h" 8 | #include <sys/types.h> 9 | #include <unistd.h> 10 | #include "../include/Thread.h" 11 | using namespace std; 12 | 13 | 
tinyse::net::EventLoop *g_loop; 14 | 15 | void threadFunc() { 16 | g_loop->loop(); 17 | } 18 | 19 | int main() { 20 | cout << "threadFunc(): pid = " << getpid() 21 | << ", tid = " << pthread_self() << endl; 22 | tinyse::net::EventLoop loop; 23 | g_loop = &loop; 24 | tinyse::Thread thread(threadFunc); 25 | thread.start(); 26 | thread.join(); 27 | 28 | return 0; 29 | } 30 | -------------------------------------------------------------------------------- /online/test/test3.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test3.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-15 16:42:39 5 | **********************************************/ 6 | #include <iostream> 7 | #include <sys/timerfd.h> 8 | #include <cstring> 9 | #include <unistd.h> 10 | #include "EventLoop.h" 11 | #include "Channel.h" 12 | using namespace std; 13 | 14 | tinyse::net::EventLoop *g_loop; 15 | 16 | void timeout() { 17 | cout << "Timeout!" 
<< endl; 18 | g_loop->quit(); 19 | } 20 | 21 | int main() { 22 | tinyse::net::EventLoop loop; 23 | g_loop = &loop; 24 | int timerfd = ::timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); 25 | cout << "1111111111" << endl; 26 | tinyse::net::Channel channel(&loop, timerfd); 27 | channel.setReadCallback(timeout); 28 | cout << "1111155555" << endl; 29 | channel.enableReading(); 30 | cout << "2222222222" << endl; 31 | 32 | struct itimerspec howlong; 33 | bzero(&howlong, sizeof(howlong)); 34 | howlong.it_value.tv_sec = 5; 35 | ::timerfd_settime(timerfd, 0, &howlong, NULL); 36 | cout << "3333333333" << endl; 37 | 38 | loop.loop(); 39 | cout << "4444444444" << endl; 40 | ::close(timerfd); 41 | cout << "5555555555" << endl; 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /online/test/test4.cc: -------------------------------------------------------------------------------- 1 | /********************************************* 2 | * file: test4.cc 3 | * author: AIWWZ(wzj1524@qq.com) 4 | * date: 2019-06-15 23:28:11 5 | **********************************************/ 6 | #include <iostream> 7 | #include "EventLoop.h" 8 | #include <stdio.h> 9 | #include <sys/types.h> 10 | #include <unistd.h> 11 | #include "../include/CurrentThread.h" 12 | #include "../include/Timestamp.h" 13 | using namespace std; 14 | 15 | int cnt = 0; 16 | tinyse::net::EventLoop *g_loop; 17 | void printTid() { 18 | printf("pid = %d, tid = %d\n", getpid(), tinyse::gettid()); 19 | printf("now %s\n", tinyse::Timestamp::now().toString().c_str()); 20 | } 21 | 22 | void print(const char* msg) { 23 | printf("%d -- msg %s %s\n", cnt, tinyse::Timestamp::now().toString().c_str(), msg); 24 | if (++cnt == 20) { 25 | g_loop->quit(); 26 | } 27 | } 28 | 29 | int main() { 30 | printTid(); 31 | tinyse::net::EventLoop loop; 32 | g_loop = &loop; 33 | 34 | print("main"); 35 | loop.runAfter(3, std::bind(print, "once1")); 36 | loop.runAfter(1.5, 
std::bind(print, "once1.5")); 37 | loop.runAfter(2.5, std::bind(print, "once2.5")); 38 | loop.runAfter(3.5, std::bind(print, "once3.5")); 39 | loop.runEvery(2, std::bind(print, "every2")); 40 | loop.runEvery(3, std::bind(print, "every3")); 41 | 42 | loop.loop(); 43 | print("main loop exits"); 44 | sleep(1); 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /online/tinyse_online: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiwwz/tiny-search-engine/0ed91aeb17e5d75f982669c3432959dc7c27473d/online/tinyse_online -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | tiny-search-engine for study. 2 | Let's go! 3 | --------------------------------------------------------------------------------