├── data └── dict │ └── user.dict.utf8 ├── offline ├── data │ └── dict │ │ └── user.dict.utf8 ├── offline.jpg ├── include │ ├── cppjieba │ │ ├── limonp │ │ │ ├── ForcePublic.hpp │ │ │ ├── NonCopyable.hpp │ │ │ ├── Colors.hpp │ │ │ ├── Condition.hpp │ │ │ ├── Thread.hpp │ │ │ ├── MutexLock.hpp │ │ │ ├── BlockingQueue.hpp │ │ │ ├── BoundedQueue.hpp │ │ │ ├── BoundedBlockingQueue.hpp │ │ │ ├── FileLock.hpp │ │ │ ├── ArgvContext.hpp │ │ │ ├── Logging.hpp │ │ │ ├── ThreadPool.hpp │ │ │ ├── Config.hpp │ │ │ ├── LocalVector.hpp │ │ │ └── StdExtension.hpp │ │ ├── SegmentTagged.hpp │ │ ├── SegmentBase.hpp │ │ ├── PreFilter.hpp │ │ ├── PosTagger.hpp │ │ ├── FullSegment.hpp │ │ ├── QuerySegment.hpp │ │ ├── MixSegment.hpp │ │ ├── HMMModel.hpp │ │ ├── Jieba.hpp │ │ ├── MPSegment.hpp │ │ ├── KeywordExtractor.hpp │ │ └── Trie.hpp │ ├── PageLib.h │ ├── DirScanner.h │ ├── PageLibPreprocessor.h │ ├── RssReader.h │ ├── WebPage.h │ ├── WordSegmentation.h │ └── Configuration.h ├── Makefile ├── src │ ├── TestOffline.cc │ ├── PageLib.cc │ ├── DirScanner.cc │ ├── Configuration.cc │ ├── WebPage.cc │ ├── RssReader.cc │ └── PageLibPreprocessor.cc └── conf │ └── offline.conf ├── SearchEngine.jpg ├── .assets ├── Screenshot from 2019-09-02 21-08-50.png ├── Screenshot from 2019-09-02 21-21-17.png └── Screenshot from 2019-09-02 21-23-23.png ├── include ├── cppjieba │ ├── limonp │ │ ├── ForcePublic.hpp │ │ ├── NonCopyable.hpp │ │ ├── Colors.hpp │ │ ├── Condition.hpp │ │ ├── Thread.hpp │ │ ├── MutexLock.hpp │ │ ├── BlockingQueue.hpp │ │ ├── BoundedQueue.hpp │ │ ├── BoundedBlockingQueue.hpp │ │ ├── FileLock.hpp │ │ ├── ArgvContext.hpp │ │ ├── Logging.hpp │ │ ├── ThreadPool.hpp │ │ ├── Config.hpp │ │ ├── LocalVector.hpp │ │ ├── StdExtension.hpp │ │ └── Closure.hpp │ ├── SegmentTagged.hpp │ ├── SegmentBase.hpp │ ├── PreFilter.hpp │ ├── PosTagger.hpp │ ├── FullSegment.hpp │ ├── QuerySegment.hpp │ ├── MixSegment.hpp │ ├── HMMModel.hpp │ ├── Jieba.hpp │ ├── MPSegment.hpp │ ├── KeywordExtractor.hpp │ └── 
Trie.hpp ├── net │ ├── Socket.h │ ├── SocketIO.h │ ├── TCPServer.h │ ├── InetAddress.h │ ├── Acceptor.h │ ├── TCPConnection.h │ └── EventLoop.h ├── Nocopyble.h ├── threadpool │ ├── Condition.h │ ├── TaskQueue.h │ ├── MutexLock.h │ ├── Thread.h │ └── Threadpool.h ├── WordQueryServer.h ├── WordQuery.h ├── WebPage.h ├── WordSegmentation.h ├── Redis.h ├── mylogger.h └── Configuration.h ├── .gitignore ├── Makefile ├── src ├── TestOnline.cc ├── net │ ├── Socket.cc │ ├── TCPServer.cc │ ├── InetAddress.cc │ ├── Acceptor.cc │ ├── SocketIO.cc │ ├── TCPConnection.cc │ └── EventLoop.cc ├── threadpool │ ├── MutexLock.cc │ ├── Thread.cc │ ├── Condition.cc │ ├── TaskQueue.cc │ └── Threadpool.cc ├── mylogger.cc ├── WordQueryServer.cc ├── Configuration.cc └── WebPage.cc ├── conf └── online.conf ├── php_TCP ├── site.css ├── php_client.php └── index.html ├── LICENSE └── README.md /data/dict/user.dict.utf8: -------------------------------------------------------------------------------- 1 | 云计算 2 | 韩玉鉴赏 3 | 蓝翔 nz 4 | 区块链 10 nz 5 | -------------------------------------------------------------------------------- /offline/data/dict/user.dict.utf8: -------------------------------------------------------------------------------- 1 | 云计算 2 | 韩玉鉴赏 3 | 蓝翔 nz 4 | 区块链 10 nz 5 | -------------------------------------------------------------------------------- /SearchEngine.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iovz/SearchEnigine/HEAD/SearchEngine.jpg -------------------------------------------------------------------------------- /offline/offline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iovz/SearchEnigine/HEAD/offline/offline.jpg -------------------------------------------------------------------------------- /.assets/Screenshot from 2019-09-02 21-08-50.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iovz/SearchEnigine/HEAD/.assets/Screenshot from 2019-09-02 21-08-50.png -------------------------------------------------------------------------------- /.assets/Screenshot from 2019-09-02 21-21-17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iovz/SearchEnigine/HEAD/.assets/Screenshot from 2019-09-02 21-21-17.png -------------------------------------------------------------------------------- /.assets/Screenshot from 2019-09-02 21-23-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iovz/SearchEnigine/HEAD/.assets/Screenshot from 2019-09-02 21-23-23.png -------------------------------------------------------------------------------- /include/cppjieba/limonp/ForcePublic.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_FORCE_PUBLIC_H 2 | #define LIMONP_FORCE_PUBLIC_H 3 | 4 | #define private public 5 | #define protected public 6 | 7 | #endif // LIMONP_FORCE_PUBLIC_H 8 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/ForcePublic.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_FORCE_PUBLIC_H 2 | #define LIMONP_FORCE_PUBLIC_H 3 | 4 | #define private public 5 | #define protected public 6 | 7 | #endif // LIMONP_FORCE_PUBLIC_H 8 | -------------------------------------------------------------------------------- /include/net/Socket.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace wd { 4 | class Socket { 5 | public: 6 | Socket(); 7 | explicit Socket(int fd); 8 | ~Socket(); 9 | 10 | int fd() const; 11 | void shutdownonWrite(); 12 | 13 | private: 14 | int _fd; 15 | }; 16 | 17 | } // namespace wd 18 | 
-------------------------------------------------------------------------------- /include/Nocopyble.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | using std::cout; 4 | using std::endl; 5 | 6 | class Nocopyble { 7 | protected: 8 | Nocopyble() {} 9 | ~Nocopyble() {} 10 | Nocopyble(const Nocopyble&) = delete ; 11 | Nocopyble& operator=(const Nocopyble&) = delete ; 12 | }; 13 | 14 | -------------------------------------------------------------------------------- /offline/Makefile: -------------------------------------------------------------------------------- 1 | INC_DIR:= include/ 2 | SRC_DIR:= src/ 3 | SRCS:=$(wildcard src/*.cc) 4 | OBJS:= $(patsubst %.cc, %.o, $(SRCS)) 5 | LIBS:=-lpthread -lboost_regex 6 | 7 | CXX:=g++ 8 | 9 | CXXFLAGS:= -w -g $(addprefix -I , $(INC_DIR)) $(LIBS) 10 | 11 | EXE:=bin/offline.exe 12 | 13 | $(EXE):$(OBJS) 14 | $(CXX) -o $(EXE) $(OBJS) $(CXXFLAGS) 15 | 16 | clean: 17 | rm -rf $(EXE) 18 | rm -rf $(OBJS) 19 | -------------------------------------------------------------------------------- /include/net/SocketIO.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace wd { 4 | class SocketIO { 5 | public: 6 | explicit SocketIO(int fd); 7 | 8 | int readn(char* buf, int len); 9 | int readLine(char* buf, int maxLen); 10 | int writen(const char* buf, int len); 11 | 12 | private: 13 | int recvPeek(char* buf, int len); 14 | 15 | private: 16 | int _fd; 17 | }; 18 | 19 | } // namespace wd 20 | -------------------------------------------------------------------------------- /include/threadpool/Condition.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Nocopyble.h" 3 | #include 4 | 5 | namespace wd { 6 | class MutexLock; 7 | 8 | class Condition : Nocopyble { 9 | public: 10 | Condition(MutexLock& mutex); 11 | ~Condition(); 12 | void wait(); 13 | void 
notify(); 14 | void notifyall(); 15 | private: 16 | pthread_cond_t _cond; 17 | MutexLock& _mutex; 18 | }; 19 | } 20 | -------------------------------------------------------------------------------- /offline/include/PageLib.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "DirScanner.h" 5 | using std::string; 6 | using std::vector; 7 | 8 | namespace wd { 9 | class PageLib { 10 | public: 11 | PageLib(DirScanner& scanner); 12 | void create(); 13 | //void store(); 14 | 15 | private: 16 | DirScanner& _scanner; 17 | vector _pages; 18 | }; 19 | 20 | } // namespace wd 21 | -------------------------------------------------------------------------------- /offline/include/DirScanner.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | using std::string; 5 | using std::vector; 6 | 7 | namespace wd { 8 | class DirScanner { 9 | public: 10 | DirScanner(); 11 | 12 | void operator()(); 13 | vector& getFiles(); 14 | void traverse(const string& dirPath); 15 | private: 16 | vector _files;//存放每个xml文件的绝对路径 17 | }; 18 | 19 | } // namespace wd 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | *.log 4 | 5 | # Compiled Object files 6 | *.slo 7 | *.lo 8 | *.o 9 | *.obj 10 | 11 | # Precompiled Headers 12 | *.gch 13 | *.pch 14 | 15 | # Compiled Dynamic libraries 16 | *.so 17 | *.dylib 18 | *.dll 19 | 20 | # Fortran module files 21 | *.mod 22 | *.smod 23 | 24 | # Compiled Static libraries 25 | *.lai 26 | *.la 27 | *.a 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | .vscode/ 35 | test/ 36 | temp/ 37 | -------------------------------------------------------------------------------- /Makefile: 
-------------------------------------------------------------------------------- 1 | INC_DIR:= include/ 2 | SRC_DIR:= src/ 3 | SRCS:=$(wildcard src/*.cc) $(wildcard src/net/*.cc) $(wildcard src/threadpool/*.cc) 4 | OBJS:= $(patsubst %.cc, %.o, $(SRCS)) 5 | LIBS:= -llog4cpp -lpthread -lhiredis 6 | 7 | CXX:=g++ 8 | 9 | CXXFLAGS:= -w -g $(addprefix -I , $(INC_DIR)) $(LIBS) 10 | 11 | EXE:=bin/SearchEngine.exe 12 | 13 | $(EXE):$(OBJS) 14 | $(CXX) -o $(EXE) $(OBJS) $(CXXFLAGS) 15 | 16 | clean: 17 | rm -rf $(EXE) 18 | rm -rf $(OBJS) 19 | -------------------------------------------------------------------------------- /src/TestOnline.cc: -------------------------------------------------------------------------------- 1 | #include "WordQuery.h" 2 | #include "WordQueryServer.h" 3 | #include "mylogger.h" 4 | using std::stoi; 5 | using namespace wd; 6 | 7 | int main() { 8 | Configuration::getInstance( 9 | "/home/whb/project/RssSearchEngine/conf/online.conf"); 10 | 11 | WordQueryServer server(stoi(CONFIG["threadNum"]), stoi(CONFIG["queSize"]), 12 | CONFIG["ip"], stoi(CONFIG["port"])); 13 | 14 | server.start(); 15 | 16 | Mylogger::destroy(); 17 | return 0; 18 | } -------------------------------------------------------------------------------- /include/cppjieba/limonp/NonCopyable.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | ************************************/ 3 | #ifndef LIMONP_NONCOPYABLE_H 4 | #define LIMONP_NONCOPYABLE_H 5 | 6 | namespace limonp { 7 | 8 | class NonCopyable { 9 | protected: 10 | NonCopyable() { 11 | } 12 | ~NonCopyable() { 13 | } 14 | private: 15 | NonCopyable(const NonCopyable& ); 16 | const NonCopyable& operator=(const NonCopyable& ); 17 | }; // class NonCopyable 18 | 19 | } // namespace limonp 20 | 21 | #endif // LIMONP_NONCOPYABLE_H 22 | -------------------------------------------------------------------------------- /offline/src/TestOffline.cc: 
-------------------------------------------------------------------------------- 1 | #include "DirScanner.h" 2 | #include "PageLib.h" 3 | #include "PageLibPreprocessor.h" 4 | #include "tinyxml2.h" 5 | using namespace wd; 6 | using namespace tinyxml2; 7 | 8 | int main() { 9 | Configuration::getInstance( 10 | "/home/whb/project/RssSearchEngine/offline/conf/offline.conf"); 11 | DirScanner scanner; 12 | scanner(); 13 | PageLib pagelib(scanner); 14 | pagelib.create(); 15 | 16 | PageLibPreprocessor processer; 17 | processer.doProcess(); 18 | 19 | return 0; 20 | } -------------------------------------------------------------------------------- /include/cppjieba/SegmentTagged.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTTAGGED_H 2 | #define CPPJIEBA_SEGMENTTAGGED_H 3 | 4 | #include "SegmentBase.hpp" 5 | 6 | namespace cppjieba { 7 | 8 | class SegmentTagged : public SegmentBase{ 9 | public: 10 | SegmentTagged() { 11 | } 12 | virtual ~SegmentTagged() { 13 | } 14 | 15 | virtual bool Tag(const string& src, vector >& res) const = 0; 16 | 17 | virtual const DictTrie* GetDictTrie() const = 0; 18 | 19 | }; // class SegmentTagged 20 | 21 | } // cppjieba 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /include/net/TCPServer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Acceptor.h" 3 | #include "EventLoop.h" 4 | #include "TCPConnection.h" 5 | 6 | namespace wd { 7 | class TCPServer { 8 | public: 9 | TCPServer(const string& ip, unsigned short port); 10 | void start(); 11 | 12 | void setConnectionCallback(TCPConnectionCallback&& cb); 13 | void setMessageCallback(TCPConnectionCallback&& cb); 14 | void setCloseCallback(TCPConnectionCallback&& cb); 15 | 16 | private: 17 | Acceptor _acceptor; 18 | EventLoop _loop; 19 | }; 20 | 21 | } // namespace wd 22 | 
-------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/NonCopyable.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | ************************************/ 3 | #ifndef LIMONP_NONCOPYABLE_H 4 | #define LIMONP_NONCOPYABLE_H 5 | 6 | namespace limonp { 7 | 8 | class NonCopyable { 9 | protected: 10 | NonCopyable() { 11 | } 12 | ~NonCopyable() { 13 | } 14 | private: 15 | NonCopyable(const NonCopyable& ); 16 | const NonCopyable& operator=(const NonCopyable& ); 17 | }; // class NonCopyable 18 | 19 | } // namespace limonp 20 | 21 | #endif // LIMONP_NONCOPYABLE_H 22 | -------------------------------------------------------------------------------- /src/net/Socket.cc: -------------------------------------------------------------------------------- 1 | #include "net/Socket.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace wd { 9 | Socket::Socket() { 10 | _fd = socket(AF_INET, SOCK_STREAM, 0); 11 | if (_fd == -1) { 12 | perror("socket"); 13 | } 14 | } 15 | 16 | Socket::Socket(int fd) : _fd(fd) {} 17 | 18 | int Socket::fd() const { return _fd; } 19 | 20 | void Socket::shutdownonWrite() { ::shutdown(_fd, SHUT_WR); } 21 | 22 | Socket::~Socket() { ::close(_fd); } 23 | } // namespace wd 24 | -------------------------------------------------------------------------------- /offline/include/cppjieba/SegmentTagged.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTTAGGED_H 2 | #define CPPJIEBA_SEGMENTTAGGED_H 3 | 4 | #include "SegmentBase.hpp" 5 | 6 | namespace cppjieba { 7 | 8 | class SegmentTagged : public SegmentBase{ 9 | public: 10 | SegmentTagged() { 11 | } 12 | virtual ~SegmentTagged() { 13 | } 14 | 15 | virtual bool Tag(const string& src, vector >& res) const = 0; 16 | 17 | virtual const DictTrie* GetDictTrie() const = 0; 18 | 19 | }; // class 
SegmentTagged 20 | 21 | } // cppjieba 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /include/net/InetAddress.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | using std::string; 5 | 6 | namespace wd { 7 | class InetAddress { 8 | public: 9 | explicit InetAddress(unsigned short port); 10 | InetAddress(const string& ip, unsigned short port); 11 | InetAddress(const struct sockaddr_in& addr); 12 | 13 | string ip() const; 14 | unsigned short port() const; 15 | struct sockaddr_in* getInetAddressPtr() { 16 | return &_addr; 17 | } 18 | 19 | private: 20 | struct sockaddr_in _addr; 21 | }; 22 | } // namespace wd 23 | -------------------------------------------------------------------------------- /include/net/Acceptor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "InetAddress.h" 3 | #include "Socket.h" 4 | 5 | namespace wd { 6 | class Acceptor { 7 | public: 8 | Acceptor(unsigned short port); 9 | Acceptor(const string& ip, unsigned short port); 10 | 11 | void ready(); 12 | int accept(); 13 | int fd() const { return _listensock.fd(); } 14 | 15 | private: 16 | void setReuseAddr(bool on); 17 | void setReusePort(bool on); 18 | void bind(); 19 | void listen(); 20 | 21 | private: 22 | InetAddress _addr; 23 | Socket _listensock; 24 | }; 25 | 26 | } // namespace wd 27 | -------------------------------------------------------------------------------- /offline/include/PageLibPreprocessor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "WebPage.h" 3 | #include 4 | using std::vector; 5 | using std::pair; 6 | 7 | namespace wd { 8 | class PageLibPreprocessor { 9 | public: 10 | PageLibPreprocessor(); 11 | 12 | void doProcess(); 13 | void readPageFromFile(); 14 | void cutRedundantPages(); 15 | void buildInvertIndex(); 16 | 
void store(); 17 | 18 | private: 19 | WordSegmentation _jieba; 20 | vector _pageLib; 21 | unordered_map>> _invertIndexTable; 22 | }; 23 | 24 | } // namespace wd 25 | -------------------------------------------------------------------------------- /include/threadpool/TaskQueue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "Condition.h" 4 | #include "MutexLock.h" 5 | #include 6 | using std::queue; 7 | 8 | namespace wd { 9 | using ElemType = std::function; 10 | class TaskQueue { 11 | public: 12 | TaskQueue(size_t quesize); 13 | 14 | bool empty() const; 15 | bool full() const; 16 | void push(ElemType); 17 | ElemType pop(); 18 | void wakeup(); 19 | 20 | private: 21 | size_t _queSize; 22 | queue _que; 23 | MutexLock _mutex; 24 | Condition _notFull; 25 | Condition _notEmpty; 26 | bool _flag; 27 | }; 28 | } // namespace wd 29 | -------------------------------------------------------------------------------- /include/threadpool/MutexLock.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Nocopyble.h" 3 | #include 4 | 5 | namespace wd { 6 | class MutexLock : Nocopyble { 7 | public: 8 | MutexLock(); 9 | ~MutexLock(); 10 | void lock(); 11 | void unlock(); 12 | pthread_mutex_t* getMutexPtr() { return &_mutex; } 13 | private: 14 | pthread_mutex_t _mutex; 15 | }; 16 | 17 | class MutexGuard 18 | { 19 | public: 20 | MutexGuard(MutexLock& mutexlock) 21 | : _mutexlock(mutexlock) { 22 | _mutexlock.lock(); 23 | } 24 | ~MutexGuard() { 25 | _mutexlock.unlock(); 26 | } 27 | private: 28 | MutexLock& _mutexlock; 29 | }; 30 | 31 | } 32 | -------------------------------------------------------------------------------- /offline/conf/offline.conf: -------------------------------------------------------------------------------- 1 | xmlDir /home/whb/project/RssSearchEngine/offline/data/xml 2 | ripepageLib 
/home/whb/project/RssSearchEngine/offline/data/ripepage.lib 3 | newRipepageLib /home/whb/project/RssSearchEngine/data/newRipepage.lib 4 | invertIndex /home/whb/project/RssSearchEngine/data/invertIndex.lib 5 | 6 | dict /home/whb/project/RssSearchEngine/data/dict/jieba.dict.utf8 7 | hmm_model /home/whb/project/RssSearchEngine/data/dict/hmm_model.utf8 8 | idf /home/whb/project/RssSearchEngine/data/dict/idf.utf8 9 | stop_words /home/whb/project/RssSearchEngine/data/dict/stop_words.utf8 10 | user_dict /home/whb/project/RssSearchEngine/data/dict/user.dict.utf8 -------------------------------------------------------------------------------- /src/net/TCPServer.cc: -------------------------------------------------------------------------------- 1 | #include "net/TCPServer.h" 2 | 3 | namespace wd { 4 | TCPServer::TCPServer(const string& ip, unsigned short port) 5 | : _acceptor(ip, port) 6 | , _loop(_acceptor) {} 7 | 8 | void TCPServer::start() { 9 | _acceptor.ready(); 10 | _loop.loop(); 11 | } 12 | 13 | void TCPServer::setConnectionCallback(TCPConnectionCallback&& cb) { 14 | _loop.setConnectionCallback(std::move(cb)); 15 | } 16 | 17 | void TCPServer::setMessageCallback(TCPConnectionCallback&& cb) { 18 | _loop.setMessageCallback(std::move(cb)); 19 | } 20 | 21 | void TCPServer::setCloseCallback(TCPConnectionCallback&& cb) { 22 | _loop.setCloseCallback(std::move(cb)); 23 | } 24 | 25 | } // namespace wd 26 | -------------------------------------------------------------------------------- /include/threadpool/Thread.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "Nocopyble.h" 7 | using std::cout; 8 | using std::endl; 9 | using std::string; 10 | 11 | namespace wd { 12 | 13 | using ThreadCallback = std::function; 14 | 15 | class Thread : Nocopyble { 16 | public: 17 | Thread(ThreadCallback&& cb) 18 | : _pthid(0), _isRunning(false), _cb(std::move(cb)) {} 19 | 20 | 
~Thread(); 21 | void start(); 22 | void join(); 23 | static void* threadFunc(void*); 24 | 25 | private: 26 | pthread_t _pthid; 27 | bool _isRunning; 28 | ThreadCallback _cb; 29 | }; 30 | } // namespace wd 31 | -------------------------------------------------------------------------------- /src/threadpool/MutexLock.cc: -------------------------------------------------------------------------------- 1 | #include "threadpool/MutexLock.h" 2 | #include 3 | #include 4 | 5 | namespace wd { 6 | MutexLock::MutexLock() { 7 | if (pthread_mutex_init(&_mutex, nullptr)) { 8 | perror("pthread_mutex_init"); 9 | } 10 | } 11 | 12 | MutexLock::~MutexLock() { 13 | if (pthread_mutex_destroy(&_mutex)) { 14 | perror("pthread_mutex_destroy"); 15 | } 16 | } 17 | 18 | void MutexLock::lock() { 19 | if (pthread_mutex_lock(&_mutex)) { 20 | perror("pthread_mutex_lock"); 21 | } 22 | } 23 | 24 | void MutexLock::unlock() { 25 | if (pthread_mutex_unlock(&_mutex)) { 26 | perror("pthread_mutex_unlock"); 27 | } 28 | } 29 | } // namespace wd -------------------------------------------------------------------------------- /conf/online.conf: -------------------------------------------------------------------------------- 1 | ip 127.0.0.1 2 | port 5080 3 | threadNum 10 4 | queSize 10 5 | newRipepageLib /home/whb/project/RssSearchEngine/data/newRipepage.lib 6 | invertIndex /home/whb/project/RssSearchEngine/data/invertIndex.lib 7 | Cache /home/whb/project/RssSearchEngine/data/cache.dat 8 | CacheSize 20 9 | initTime 10 10 | periodicTime 20 11 | 12 | dict /home/whb/project/RssSearchEngine/data/dict/jieba.dict.utf8 13 | hmm_model /home/whb/project/RssSearchEngine/data/dict/hmm_model.utf8 14 | idf /home/whb/project/RssSearchEngine/data/dict/idf.utf8 15 | stop_words /home/whb/project/RssSearchEngine/data/dict/stop_words.utf8 16 | user_dict /home/whb/project/RssSearchEngine/data/dict/user.dict.utf8 17 | -------------------------------------------------------------------------------- /include/WordQueryServer.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "threadpool/Threadpool.h" 3 | #include "net/TCPServer.h" 4 | #include "WordQuery.h" 5 | 6 | namespace wd { 7 | class WordQueryServer { 8 | public: 9 | WordQueryServer(int threadNum, int queSize, const string& ip, 10 | unsigned short port); 11 | 12 | void start(); 13 | 14 | void onConnection(const TCPConnectionPtr& conn); 15 | void onMessage(const TCPConnectionPtr& conn); 16 | void onClose(const TCPConnectionPtr& conn); 17 | 18 | void process(const TCPConnectionPtr& conn, const string& msg); 19 | 20 | private: 21 | Threadpool _threadpool; 22 | TCPServer _server; 23 | WordQuery _wordquery; 24 | }; 25 | 26 | } // namespace wd 27 | -------------------------------------------------------------------------------- /include/cppjieba/limonp/Colors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_COLOR_PRINT_HPP 2 | #define LIMONP_COLOR_PRINT_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace limonp { 8 | 9 | using std::string; 10 | 11 | enum Color { 12 | BLACK = 30, 13 | RED, 14 | GREEN, 15 | YELLOW, 16 | BLUE, 17 | PURPLE 18 | }; // enum Color 19 | 20 | static void ColorPrintln(enum Color color, const char * fmt, ...) 
{ 21 | va_list ap; 22 | printf("\033[0;%dm", color); 23 | va_start(ap, fmt); 24 | vprintf(fmt, ap); 25 | va_end(ap); 26 | printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly 27 | } 28 | 29 | } // namespace limonp 30 | 31 | #endif // LIMONP_COLOR_PRINT_HPP 32 | -------------------------------------------------------------------------------- /include/threadpool/Threadpool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Thread.h" 3 | #include "TaskQueue.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | using std::vector; 9 | using std::unique_ptr; 10 | using std::cout; 11 | using std::endl; 12 | 13 | namespace wd{ 14 | using Task = std::function; 15 | class Threadpool { 16 | public: 17 | Threadpool(size_t, size_t); 18 | ~Threadpool(); 19 | void start(); 20 | void stop(); 21 | void addTask(Task&& task); 22 | private: 23 | void threadFunc(); //子线程执行的任务 24 | Task getTask(); 25 | private: 26 | size_t _threadNum; 27 | size_t _queSize; 28 | vector> _threads; 29 | TaskQueue _taskque; 30 | bool _isExit; 31 | }; 32 | } 33 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/Colors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_COLOR_PRINT_HPP 2 | #define LIMONP_COLOR_PRINT_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace limonp { 8 | 9 | using std::string; 10 | 11 | enum Color { 12 | BLACK = 30, 13 | RED, 14 | GREEN, 15 | YELLOW, 16 | BLUE, 17 | PURPLE 18 | }; // enum Color 19 | 20 | static void ColorPrintln(enum Color color, const char * fmt, ...) 
{ 21 | va_list ap; 22 | printf("\033[0;%dm", color); 23 | va_start(ap, fmt); 24 | vprintf(fmt, ap); 25 | va_end(ap); 26 | printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly 27 | } 28 | 29 | } // namespace limonp 30 | 31 | #endif // LIMONP_COLOR_PRINT_HPP 32 | -------------------------------------------------------------------------------- /offline/include/RssReader.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "tinyxml2.h" 5 | using std::string; 6 | using std::vector; 7 | using namespace tinyxml2; 8 | 9 | namespace wd { 10 | 11 | struct RssItem { 12 | string title; 13 | string link; 14 | string description; 15 | string content; 16 | }; 17 | 18 | class RssReader { 19 | public: 20 | RssReader(vector& files); 21 | ~RssReader(); 22 | void loadFiles(); 23 | void makePages(vector& pages); 24 | void createXML(); 25 | 26 | private: 27 | void parseRss(XMLDocument& doc); 28 | void loadXML(const string& xmlPath); 29 | 30 | private: 31 | vector _rssItems; 32 | vector _files;//所有xml文件 33 | }; 34 | 35 | } // namespace wd -------------------------------------------------------------------------------- /src/net/InetAddress.cc: -------------------------------------------------------------------------------- 1 | #include "net/InetAddress.h" 2 | #include 3 | 4 | namespace wd { 5 | InetAddress::InetAddress(unsigned short port) { 6 | ::memset(&_addr, 0, sizeof(struct sockaddr_in)); 7 | _addr.sin_family = AF_INET; 8 | _addr.sin_port = htons(port); 9 | _addr.sin_addr.s_addr = INADDR_ANY; 10 | } 11 | 12 | InetAddress::InetAddress(const string& ip, unsigned short port) { 13 | ::memset(&_addr, 0, sizeof(struct sockaddr_in)); 14 | _addr.sin_family = AF_INET; 15 | _addr.sin_port = htons(port); 16 | _addr.sin_addr.s_addr = inet_addr(ip.c_str()); 17 | } 18 | 19 | InetAddress::InetAddress(const struct sockaddr_in& addr) : _addr(addr) {} 20 | 21 
| string InetAddress::ip() const { return string(::inet_ntoa(_addr.sin_addr)); } 22 | 23 | unsigned short InetAddress::port() const { return ntohs(_addr.sin_port); } 24 | } // namespace wd 25 | -------------------------------------------------------------------------------- /include/cppjieba/limonp/Condition.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_CONDITION_HPP 2 | #define LIMONP_CONDITION_HPP 3 | 4 | #include "MutexLock.hpp" 5 | 6 | namespace limonp { 7 | 8 | class Condition : NonCopyable { 9 | public: 10 | explicit Condition(MutexLock& mutex) 11 | : mutex_(mutex) { 12 | XCHECK(!pthread_cond_init(&pcond_, NULL)); 13 | } 14 | 15 | ~Condition() { 16 | XCHECK(!pthread_cond_destroy(&pcond_)); 17 | } 18 | 19 | void Wait() { 20 | XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex())); 21 | } 22 | 23 | void Notify() { 24 | XCHECK(!pthread_cond_signal(&pcond_)); 25 | } 26 | 27 | void NotifyAll() { 28 | XCHECK(!pthread_cond_broadcast(&pcond_)); 29 | } 30 | 31 | private: 32 | MutexLock& mutex_; 33 | pthread_cond_t pcond_; 34 | }; // class Condition 35 | 36 | } // namespace limonp 37 | 38 | #endif // LIMONP_CONDITION_HPP 39 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/Condition.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_CONDITION_HPP 2 | #define LIMONP_CONDITION_HPP 3 | 4 | #include "MutexLock.hpp" 5 | 6 | namespace limonp { 7 | 8 | class Condition : NonCopyable { 9 | public: 10 | explicit Condition(MutexLock& mutex) 11 | : mutex_(mutex) { 12 | XCHECK(!pthread_cond_init(&pcond_, NULL)); 13 | } 14 | 15 | ~Condition() { 16 | XCHECK(!pthread_cond_destroy(&pcond_)); 17 | } 18 | 19 | void Wait() { 20 | XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex())); 21 | } 22 | 23 | void Notify() { 24 | XCHECK(!pthread_cond_signal(&pcond_)); 25 | } 26 | 27 | void NotifyAll() { 28 
#pragma once  // was misspelled "ocne", which left the header without an include guard
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
using std::cout;
using std::endl;
using std::string;
using std::unordered_map;
using std::vector;

namespace wd {
/// One document of the page library: an id, a title, a link and the content.
class WebPage {
 public:
  /// Builds a page from its id, title, link and content (defined out of line).
  WebPage(int id, const string& title, const string& link,
          const string& content);

  /// Creates an empty page. _docid is set to 0 so that reading it from a
  /// default-constructed page is well defined.
  WebPage() : _docid(0) {}

  // Accessors are const so they can be called through const references.
  int getDocId() const { return _docid; }
  string getTitle() const { return _title; }
  string getContent() const { return _content; }
  string getUrl() const { return _link; }

  /// Produces a summary of this page for the given query words
  /// (defined out of line).
  string summary(const vector<string>& queryWords);

 private:
  // Helpers defined out of line; their names suggest byte-length handling of
  // multi-byte characters. NOTE(review): confirm against WebPage.cc.
  size_t getBytes(const char ch);
  size_t length(const std::string& str);

 private:
  int _docid;
  string _title;
  string _link;
  string _content;
};

}  // namespace wd
Thread::~Thread() { 37 | if (_isRunning) { 38 | pthread_detach(_pthid); 39 | } 40 | } 41 | 42 | } // namespace wd 43 | -------------------------------------------------------------------------------- /offline/src/PageLib.cc: -------------------------------------------------------------------------------- 1 | #include "Configuration.h" 2 | #include "PageLib.h" 3 | #include 4 | #include "RssReader.h" 5 | using std::ofstream; 6 | 7 | namespace wd { 8 | PageLib::PageLib(DirScanner& scanner) : _scanner(scanner) {} 9 | 10 | void PageLib::create() { 11 | vector& files = _scanner.getFiles(); 12 | RssReader reader(files); 13 | reader.loadFiles(); 14 | //reader.makePages(_pages); 15 | reader.createXML(); 16 | } 17 | 18 | #if 0 19 | void PageLib::store() { 20 | ofstream ofsPage(CONFIG[RIPEPAGE_PATH]); 21 | ofstream ofsOffset(CONFIG[OFFSET_PATH]); 22 | int i = 0; 23 | for (auto& page : _pages) { 24 | ofstream::pos_type offset = ofsPage.tellp(); 25 | size_t length = page.size(); 26 | ofsPage << page; 27 | ofsOffset << ++i << '\t' << offset << '\t' << length << '\n'; 28 | } 29 | cout << ">> store ripepage and offset success" << endl; 30 | ofsPage.close(); 31 | ofsOffset.close(); 32 | } 33 | #endif 34 | } // namespace wd 35 | -------------------------------------------------------------------------------- /src/threadpool/Condition.cc: -------------------------------------------------------------------------------- 1 | #include "threadpool/Condition.h" 2 | #include "threadpool/MutexLock.h" 3 | #include 4 | #include 5 | 6 | namespace wd { 7 | Condition::Condition(MutexLock& mutex) 8 | : _mutex(mutex) { 9 | if(pthread_cond_init(&_cond, nullptr)) { 10 | perror("pthread_cond_init"); 11 | } 12 | } 13 | 14 | Condition::~Condition() { 15 | if(pthread_cond_destroy(&_cond)) { 16 | perror("pthread_cond_destroy"); 17 | } 18 | } 19 | 20 | void Condition::wait() { 21 | if(pthread_cond_wait(&_cond, _mutex.getMutexPtr())) { 22 | perror("pthread_cond_wait"); 23 | } 24 | } 25 | 26 | void 
Condition::notify() { 27 | if(pthread_cond_signal(&_cond)) { 28 | perror("pthread_cond_signal"); 29 | } 30 | } 31 | 32 | void Condition::notifyall() { 33 | if(pthread_cond_broadcast(&_cond)) { 34 | perror("pthread_cond_broadcast"); 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /include/cppjieba/limonp/Thread.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_HPP 2 | #define LIMONP_THREAD_HPP 3 | 4 | #include "Logging.hpp" 5 | #include "NonCopyable.hpp" 6 | 7 | namespace limonp { 8 | 9 | class IThread: NonCopyable { 10 | public: 11 | IThread(): isStarted(false), isJoined(false) { 12 | } 13 | virtual ~IThread() { 14 | if(isStarted && !isJoined) { 15 | XCHECK(!pthread_detach(thread_)); 16 | } 17 | }; 18 | 19 | virtual void Run() = 0; 20 | void Start() { 21 | XCHECK(!isStarted); 22 | XCHECK(!pthread_create(&thread_, NULL, Worker, this)); 23 | isStarted = true; 24 | } 25 | void Join() { 26 | XCHECK(!isJoined); 27 | XCHECK(!pthread_join(thread_, NULL)); 28 | isJoined = true; 29 | } 30 | private: 31 | static void * Worker(void * data) { 32 | IThread * ptr = (IThread* ) data; 33 | ptr->Run(); 34 | return NULL; 35 | } 36 | 37 | pthread_t thread_; 38 | bool isStarted; 39 | bool isJoined; 40 | }; // class IThread 41 | 42 | } // namespace limonp 43 | 44 | #endif // LIMONP_THREAD_HPP 45 | -------------------------------------------------------------------------------- /offline/include/WebPage.h: -------------------------------------------------------------------------------- 1 | #pragma ocne 2 | #include 3 | #include 4 | #include "WordSegmentation.h" 5 | #include "tinyxml2.h" 6 | using namespace tinyxml2; 7 | using std::unordered_map; 8 | using std::string; 9 | 10 | namespace wd { 11 | class WebPage { 12 | public: 13 | WebPage(int id, const string& title, const string& link, const string& content); 14 | 15 | int getDocId() { return _docid; } 16 | uint64_t 
getSimhash() { return _simhashVal; } 17 | void insertDoc(XMLDocument& pageLib); 18 | //string getDoc(); 19 | unordered_map& getWordsMap() { return _wordsMap; } 20 | 21 | void generateSimhash(WordSegmentation& simhasher); 22 | void buildWordsMap(WordSegmentation& jieba); 23 | bool operator<(const WebPage& rhs); 24 | bool operator==(const WebPage& rhs); 25 | 26 | private: 27 | int _docid; 28 | string _title; 29 | string _link; 30 | string _content; 31 | uint64_t _simhashVal; 32 | unordered_map _wordsMap; 33 | }; 34 | 35 | } // namespace wd 36 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/Thread.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_HPP 2 | #define LIMONP_THREAD_HPP 3 | 4 | #include "Logging.hpp" 5 | #include "NonCopyable.hpp" 6 | 7 | namespace limonp { 8 | 9 | class IThread: NonCopyable { 10 | public: 11 | IThread(): isStarted(false), isJoined(false) { 12 | } 13 | virtual ~IThread() { 14 | if(isStarted && !isJoined) { 15 | XCHECK(!pthread_detach(thread_)); 16 | } 17 | }; 18 | 19 | virtual void Run() = 0; 20 | void Start() { 21 | XCHECK(!isStarted); 22 | XCHECK(!pthread_create(&thread_, NULL, Worker, this)); 23 | isStarted = true; 24 | } 25 | void Join() { 26 | XCHECK(!isJoined); 27 | XCHECK(!pthread_join(thread_, NULL)); 28 | isJoined = true; 29 | } 30 | private: 31 | static void * Worker(void * data) { 32 | IThread * ptr = (IThread* ) data; 33 | ptr->Run(); 34 | return NULL; 35 | } 36 | 37 | pthread_t thread_; 38 | bool isStarted; 39 | bool isJoined; 40 | }; // class IThread 41 | 42 | } // namespace limonp 43 | 44 | #endif // LIMONP_THREAD_HPP 45 | -------------------------------------------------------------------------------- /include/WordSegmentation.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "Configuration.h" 6 | 
#include "cppjieba/Jieba.hpp" 7 | using std::cout; 8 | using std::endl; 9 | using std::string; 10 | using std::vector; 11 | 12 | namespace wd { 13 | class WordSegmentation { 14 | public: 15 | WordSegmentation() 16 | : _jieba(CONFIG[DICT_PATH], CONFIG[HMM_PATH], CONFIG[USER_DICT_PATH], 17 | CONFIG[IDF_PATH], CONFIG[STOP_WORD_PATH]) { 18 | cout << ">> jieba init" << endl; 19 | } 20 | 21 | vector operator()(const string& str) { 22 | vector words; 23 | _jieba.Cut(str, words, true); 24 | auto stopWords = Configuration::getInstance()->getStopWords(); 25 | for (auto it = words.begin(); it != words.end(); ++it) { 26 | auto uit = stopWords.find(*it); 27 | if (uit != stopWords.end()) { 28 | words.erase(it); 29 | } 30 | } 31 | return words; 32 | } 33 | 34 | private: 35 | cppjieba::Jieba _jieba; 36 | }; 37 | 38 | } // namespace wd 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Hongbo Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// Thin owner of a pthread_mutex_t. The mutex is initialized in the
// constructor and destroyed in the destructor; every pthread call is wrapped
// in XCHECK (macro from Logging.hpp, definition not shown here).
class MutexLock: NonCopyable {
 public:
  MutexLock() {
    XCHECK(!pthread_mutex_init(&mutex_, NULL));
  }
  ~MutexLock() {
    XCHECK(!pthread_mutex_destroy(&mutex_));
  }
  // Exposes the raw mutex, e.g. for pthread_cond_wait in Condition.
  pthread_mutex_t* GetPthreadMutex() {
    return &mutex_;
  }

 private:
  // Lock/Unlock are private: only MutexLockGuard (a friend) may call them,
  // so locking always goes through a scoped guard.
  void Lock() {
    XCHECK(!pthread_mutex_lock(&mutex_));
  }
  void Unlock() {
    XCHECK(!pthread_mutex_unlock(&mutex_));
  }
  friend class MutexLockGuard;

  pthread_mutex_t mutex_;
}; // class MutexLock

// Scoped lock: locks the given MutexLock on construction and unlocks it on
// destruction. Holds a reference, so the MutexLock must outlive the guard.
class MutexLockGuard: NonCopyable {
 public:
  explicit MutexLockGuard(MutexLock & mutex)
    : mutex_(mutex) {
    mutex_.Lock();
  }
  ~MutexLockGuard() {
    mutex_.Unlock();
  }
 private:
  MutexLock & mutex_;
}; // class MutexLockGuard

// Function-like macro defined AFTER the class: from here on, writing
// `MutexLockGuard(m);` (an unnamed temporary that would unlock immediately)
// expands to XCHECK(false). A named guard `MutexLockGuard lock(m);` is not
// affected, because the class name is not directly followed by '('.
#define MutexLockGuard(x) XCHECK(false);
10 | public: 11 | BlockingQueue() 12 | : mutex_(), notEmpty_(mutex_), queue_() { 13 | } 14 | 15 | void Push(const T& x) { 16 | MutexLockGuard lock(mutex_); 17 | queue_.push(x); 18 | notEmpty_.Notify(); // Wait morphing saves us 19 | } 20 | 21 | T Pop() { 22 | MutexLockGuard lock(mutex_); 23 | // always use a while-loop, due to spurious wakeup 24 | while (queue_.empty()) { 25 | notEmpty_.Wait(); 26 | } 27 | assert(!queue_.empty()); 28 | T front(queue_.front()); 29 | queue_.pop(); 30 | return front; 31 | } 32 | 33 | size_t Size() const { 34 | MutexLockGuard lock(mutex_); 35 | return queue_.size(); 36 | } 37 | bool Empty() const { 38 | return Size() == 0; 39 | } 40 | 41 | private: 42 | mutable MutexLock mutex_; 43 | Condition notEmpty_; 44 | std::queue queue_; 45 | }; // class BlockingQueue 46 | 47 | } // namespace limonp 48 | 49 | #endif // LIMONP_BLOCKINGQUEUE_HPP 50 | -------------------------------------------------------------------------------- /include/cppjieba/SegmentBase.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTBASE_H 2 | #define CPPJIEBA_SEGMENTBASE_H 3 | 4 | #include "limonp/Logging.hpp" 5 | #include "PreFilter.hpp" 6 | #include 7 | 8 | 9 | namespace cppjieba { 10 | 11 | const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82"; 12 | 13 | using namespace limonp; 14 | 15 | class SegmentBase { 16 | public: 17 | SegmentBase() { 18 | XCHECK(ResetSeparators(SPECIAL_SEPARATORS)); 19 | } 20 | virtual ~SegmentBase() { 21 | } 22 | 23 | virtual void Cut(const string& sentence, vector& words) const = 0; 24 | 25 | bool ResetSeparators(const string& s) { 26 | symbols_.clear(); 27 | RuneStrArray runes; 28 | if (!DecodeRunesInString(s, runes)) { 29 | XLOG(ERROR) << "decode " << s << " failed"; 30 | return false; 31 | } 32 | for (size_t i = 0; i < runes.size(); i++) { 33 | if (!symbols_.insert(runes[i].rune).second) { 34 | XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " 
already exists"; 35 | return false; 36 | } 37 | } 38 | return true; 39 | } 40 | protected: 41 | unordered_set symbols_; 42 | }; // class SegmentBase 43 | 44 | } // cppjieba 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/MutexLock.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_MUTEX_LOCK_HPP 2 | #define LIMONP_MUTEX_LOCK_HPP 3 | 4 | #include 5 | #include "NonCopyable.hpp" 6 | #include "Logging.hpp" 7 | 8 | namespace limonp { 9 | 10 | class MutexLock: NonCopyable { 11 | public: 12 | MutexLock() { 13 | XCHECK(!pthread_mutex_init(&mutex_, NULL)); 14 | } 15 | ~MutexLock() { 16 | XCHECK(!pthread_mutex_destroy(&mutex_)); 17 | } 18 | pthread_mutex_t* GetPthreadMutex() { 19 | return &mutex_; 20 | } 21 | 22 | private: 23 | void Lock() { 24 | XCHECK(!pthread_mutex_lock(&mutex_)); 25 | } 26 | void Unlock() { 27 | XCHECK(!pthread_mutex_unlock(&mutex_)); 28 | } 29 | friend class MutexLockGuard; 30 | 31 | pthread_mutex_t mutex_; 32 | }; // class MutexLock 33 | 34 | class MutexLockGuard: NonCopyable { 35 | public: 36 | explicit MutexLockGuard(MutexLock & mutex) 37 | : mutex_(mutex) { 38 | mutex_.Lock(); 39 | } 40 | ~MutexLockGuard() { 41 | mutex_.Unlock(); 42 | } 43 | private: 44 | MutexLock & mutex_; 45 | }; // class MutexLockGuard 46 | 47 | #define MutexLockGuard(x) XCHECK(false); 48 | 49 | } // namespace limonp 50 | 51 | #endif // LIMONP_MUTEX_LOCK_HPP 52 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/BlockingQueue.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BLOCKINGQUEUE_HPP 2 | #define LIMONP_BLOCKINGQUEUE_HPP 3 | 4 | #include 5 | #include "Condition.hpp" 6 | 7 | namespace limonp { 8 | template 9 | class BlockingQueue: NonCopyable { 10 | public: 11 | BlockingQueue() 12 | : mutex_(), notEmpty_(mutex_), 
queue_() { 13 | } 14 | 15 | void Push(const T& x) { 16 | MutexLockGuard lock(mutex_); 17 | queue_.push(x); 18 | notEmpty_.Notify(); // Wait morphing saves us 19 | } 20 | 21 | T Pop() { 22 | MutexLockGuard lock(mutex_); 23 | // always use a while-loop, due to spurious wakeup 24 | while (queue_.empty()) { 25 | notEmpty_.Wait(); 26 | } 27 | assert(!queue_.empty()); 28 | T front(queue_.front()); 29 | queue_.pop(); 30 | return front; 31 | } 32 | 33 | size_t Size() const { 34 | MutexLockGuard lock(mutex_); 35 | return queue_.size(); 36 | } 37 | bool Empty() const { 38 | return Size() == 0; 39 | } 40 | 41 | private: 42 | mutable MutexLock mutex_; 43 | Condition notEmpty_; 44 | std::queue queue_; 45 | }; // class BlockingQueue 46 | 47 | } // namespace limonp 48 | 49 | #endif // LIMONP_BLOCKINGQUEUE_HPP 50 | -------------------------------------------------------------------------------- /offline/include/cppjieba/SegmentBase.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_SEGMENTBASE_H 2 | #define CPPJIEBA_SEGMENTBASE_H 3 | 4 | #include "limonp/Logging.hpp" 5 | #include "PreFilter.hpp" 6 | #include 7 | 8 | 9 | namespace cppjieba { 10 | 11 | const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82"; 12 | 13 | using namespace limonp; 14 | 15 | class SegmentBase { 16 | public: 17 | SegmentBase() { 18 | XCHECK(ResetSeparators(SPECIAL_SEPARATORS)); 19 | } 20 | virtual ~SegmentBase() { 21 | } 22 | 23 | virtual void Cut(const string& sentence, vector& words) const = 0; 24 | 25 | bool ResetSeparators(const string& s) { 26 | symbols_.clear(); 27 | RuneStrArray runes; 28 | if (!DecodeRunesInString(s, runes)) { 29 | XLOG(ERROR) << "decode " << s << " failed"; 30 | return false; 31 | } 32 | for (size_t i = 0; i < runes.size(); i++) { 33 | if (!symbols_.insert(runes[i].rune).second) { 34 | XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists"; 35 | return false; 36 | } 37 | } 38 | return true; 
39 | } 40 | protected: 41 | unordered_set symbols_; 42 | }; // class SegmentBase 43 | 44 | } // cppjieba 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/threadpool/TaskQueue.cc: -------------------------------------------------------------------------------- 1 | #include "threadpool/TaskQueue.h" 2 | 3 | namespace wd { 4 | TaskQueue::TaskQueue(size_t queSize) 5 | : _queSize(queSize), 6 | _mutex(), 7 | _notFull(_mutex), 8 | _notEmpty(_mutex), 9 | _flag(true) {} 10 | 11 | bool TaskQueue::empty() const { return _que.size() == 0; } 12 | 13 | bool TaskQueue::full() const { return _que.size() == _queSize; } 14 | 15 | void TaskQueue::push(ElemType elem) { 16 | MutexGuard autolock(_mutex); 17 | while (full()) { //使用while是为了防止虚假(异常)唤醒 18 | _notFull.wait(); 19 | } 20 | /*如果使用if,如果有N线程等待在_notfull条件变量上,执行_notFull.notify()会唤醒多个线程, 21 | * 唤醒的过程中会加锁,有一个线程会拿到锁,继续往下执行,其他N-1个线程在等待加锁,拿到锁的线程执行结束后, 22 | * 锁被释放,此时full()还是true,因为push()了任务又被pop()了,其他N-1个线程会有一个线程拿到锁,加锁后继续往下执行, 23 | * 根本不会判断full()是否为true,而此时队列是满的,所以要使用while 24 | */ 25 | _que.push(elem); 26 | _notEmpty.notify(); 27 | } 28 | 29 | ElemType TaskQueue::pop() { 30 | MutexGuard autolock(_mutex); 31 | while (_flag && empty()) { 32 | _notEmpty.wait(); 33 | } 34 | if (_flag) { 35 | ElemType elem = _que.front(); 36 | _que.pop(); 37 | _notFull.notify(); 38 | return elem; 39 | } else { 40 | return nullptr; 41 | } 42 | } 43 | 44 | void TaskQueue::wakeup() { 45 | _flag = false; 46 | _notEmpty.notifyall(); 47 | } 48 | } // namespace wd 49 | -------------------------------------------------------------------------------- /php_TCP/php_client.php: -------------------------------------------------------------------------------- 1 | 55 | 56 | -------------------------------------------------------------------------------- /include/cppjieba/limonp/BoundedQueue.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BOUNDED_QUEUE_HPP 2 | 
/// Fixed-capacity FIFO stored in a circular buffer. Not thread-safe on its
/// own; BoundedBlockingQueue adds the locking.
///
/// The template parameter list and the buffer's element type are spelled out
/// explicitly (they were missing, which made the class ill-formed).
template <class T>
class BoundedQueue {
 public:
  /// Allocates `capacity` default-constructed slots. `capacity` must be
  /// non-zero (checked by assert) because it is used as a modulus.
  explicit BoundedQueue(size_t capacity)
    : head_(0), tail_(0), size_(0),
      capacity_(capacity), circular_buffer_(capacity) {
    assert(capacity_);
  }
  ~BoundedQueue() {
  }

  /// Forgets all elements; the slots themselves are not reset.
  void Clear() {
    head_ = 0;
    tail_ = 0;
    size_ = 0;
  }
  bool Empty() const {
    return !size_;
  }
  bool Full() const {
    return capacity_ == size_;
  }
  size_t Size() const {
    return size_;
  }
  size_t Capacity() const {
    return capacity_;
  }

  /// Appends `t` at the tail. The queue must not be full (assert).
  void Push(const T& t) {
    assert(!Full());
    circular_buffer_[tail_] = t;
    tail_ = (tail_ + 1) % capacity_;  // wrap around the end of the buffer
    size_ ++;
  }

  /// Removes and returns the oldest element. The queue must not be empty
  /// (assert).
  T Pop() {
    assert(!Empty());
    size_t oldPos = head_;
    head_ = (head_ + 1) % capacity_;
    size_ --;
    return circular_buffer_[oldPos];
  }

 private:
  size_t head_;  // index of the oldest element
  size_t tail_;  // index of the next free slot
  size_t size_;  // number of stored elements
  const size_t capacity_;
  std::vector<T> circular_buffer_;

}; // class BoundedQueue
"; 22 | } 23 | cursor_ = sentence_.begin(); 24 | } 25 | ~PreFilter() { 26 | } 27 | bool HasNext() const { 28 | return cursor_ != sentence_.end(); 29 | } 30 | Range Next() { 31 | Range range; 32 | range.begin = cursor_; 33 | while (cursor_ != sentence_.end()) { 34 | if (IsIn(symbols_, cursor_->rune)) { 35 | if (range.begin == cursor_) { 36 | cursor_ ++; 37 | } 38 | range.end = cursor_; 39 | return range; 40 | } 41 | cursor_ ++; 42 | } 43 | range.end = sentence_.end(); 44 | return range; 45 | } 46 | private: 47 | RuneStrArray::const_iterator cursor_; 48 | RuneStrArray sentence_; 49 | const unordered_set& symbols_; 50 | }; // class PreFilter 51 | 52 | } // namespace cppjieba 53 | 54 | #endif // CPPJIEBA_PRE_FILTER_H 55 | -------------------------------------------------------------------------------- /src/net/Acceptor.cc: -------------------------------------------------------------------------------- 1 | #include "net/Acceptor.h" 2 | 3 | namespace wd { 4 | Acceptor::Acceptor(unsigned short port) : _addr(port), _listensock() {} 5 | 6 | Acceptor::Acceptor(const string& ip, unsigned short port) 7 | : _addr(ip, port), _listensock() {} 8 | 9 | void Acceptor::ready() { 10 | setReuseAddr(true); 11 | setReusePort(true); 12 | bind(); 13 | listen(); 14 | } 15 | 16 | void Acceptor::setReuseAddr(bool on) { 17 | int one = on; 18 | if (setsockopt(_listensock.fd(), SOL_SOCKET, SO_REUSEADDR, &one, 19 | sizeof(one)) < 0) { 20 | perror("setsockopt"); 21 | } 22 | } 23 | 24 | void Acceptor::setReusePort(bool on) { 25 | int one = on; 26 | if (setsockopt(_listensock.fd(), SOL_SOCKET, SO_REUSEPORT, &one, 27 | sizeof(one)) < 0) { 28 | perror("setsockopt"); 29 | } 30 | } 31 | 32 | void Acceptor::bind() { 33 | int ret = 34 | ::bind(_listensock.fd(), (struct sockaddr*)_addr.getInetAddressPtr(), 35 | sizeof(struct sockaddr)); 36 | if (ret == -1) { 37 | perror("bind"); 38 | } 39 | } 40 | 41 | void Acceptor::listen() { 42 | int ret = ::listen(_listensock.fd(), 10); 43 | if (ret == -1) { 44 | 
perror("listen"); 45 | } 46 | } 47 | 48 | int Acceptor::accept() { 49 | int peerfd = ::accept(_listensock.fd(), NULL, NULL); 50 | if (peerfd == -1) { 51 | perror("accept"); 52 | } 53 | return peerfd; 54 | } 55 | } // namespace wd 56 | -------------------------------------------------------------------------------- /offline/include/cppjieba/PreFilter.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_PRE_FILTER_H 2 | #define CPPJIEBA_PRE_FILTER_H 3 | 4 | #include "Trie.hpp" 5 | #include "limonp/Logging.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | class PreFilter { 10 | public: 11 | //TODO use WordRange instead of Range 12 | struct Range { 13 | RuneStrArray::const_iterator begin; 14 | RuneStrArray::const_iterator end; 15 | }; // struct Range 16 | 17 | PreFilter(const unordered_set& symbols, 18 | const string& sentence) 19 | : symbols_(symbols) { 20 | if (!DecodeRunesInString(sentence, sentence_)) { 21 | XLOG(ERROR) << "decode failed. "; 22 | } 23 | cursor_ = sentence_.begin(); 24 | } 25 | ~PreFilter() { 26 | } 27 | bool HasNext() const { 28 | return cursor_ != sentence_.end(); 29 | } 30 | Range Next() { 31 | Range range; 32 | range.begin = cursor_; 33 | while (cursor_ != sentence_.end()) { 34 | if (IsIn(symbols_, cursor_->rune)) { 35 | if (range.begin == cursor_) { 36 | cursor_ ++; 37 | } 38 | range.end = cursor_; 39 | return range; 40 | } 41 | cursor_ ++; 42 | } 43 | range.end = sentence_.end(); 44 | return range; 45 | } 46 | private: 47 | RuneStrArray::const_iterator cursor_; 48 | RuneStrArray sentence_; 49 | const unordered_set& symbols_; 50 | }; // class PreFilter 51 | 52 | } // namespace cppjieba 53 | 54 | #endif // CPPJIEBA_PRE_FILTER_H 55 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/BoundedQueue.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BOUNDED_QUEUE_HPP 2 | #define 
/// Fixed-capacity FIFO stored in a circular buffer. Not thread-safe on its
/// own; BoundedBlockingQueue adds the locking.
///
/// The template parameter list and the buffer's element type are spelled out
/// explicitly (they were missing, which made the class ill-formed).
template <class T>
class BoundedQueue {
 public:
  /// Allocates `capacity` default-constructed slots. `capacity` must be
  /// non-zero (checked by assert) because it is used as a modulus.
  explicit BoundedQueue(size_t capacity)
    : head_(0), tail_(0), size_(0),
      capacity_(capacity), circular_buffer_(capacity) {
    assert(capacity_);
  }
  ~BoundedQueue() {
  }

  /// Forgets all elements; the slots themselves are not reset.
  void Clear() {
    head_ = 0;
    tail_ = 0;
    size_ = 0;
  }
  bool Empty() const {
    return !size_;
  }
  bool Full() const {
    return capacity_ == size_;
  }
  size_t Size() const {
    return size_;
  }
  size_t Capacity() const {
    return capacity_;
  }

  /// Appends `t` at the tail. The queue must not be full (assert).
  void Push(const T& t) {
    assert(!Full());
    circular_buffer_[tail_] = t;
    tail_ = (tail_ + 1) % capacity_;  // wrap around the end of the buffer
    size_ ++;
  }

  /// Removes and returns the oldest element. The queue must not be empty
  /// (assert).
  T Pop() {
    assert(!Empty());
    size_t oldPos = head_;
    head_ = (head_ + 1) % capacity_;
    size_ --;
    return circular_buffer_[oldPos];
  }

 private:
  size_t head_;  // index of the oldest element
  size_t tail_;  // index of the next free slot
  size_t size_;  // number of stored elements
  const size_t capacity_;
  std::vector<T> circular_buffer_;

}; // class BoundedQueue
<< ">> jieba init" << endl << ">> simhasher init" << endl; 22 | } 23 | 24 | vector cutWords(const string& str) { 25 | //using namespace cppjieba; 26 | 27 | vector words; 28 | _jieba.Cut(str, words, true); 29 | return words; 30 | } 31 | 32 | uint64_t makeSimhash(const string& content, size_t topN) { 33 | using namespace simhash; 34 | uint64_t hash; 35 | // vector> res; 36 | // _simhasher.extract(content, res, topN); 37 | // cout << "key words: " << res << endl; 38 | 39 | _simhasher.make(content, topN, hash); 40 | //cout << "simhash: " << hash << endl; 41 | return hash; 42 | } 43 | 44 | private: 45 | cppjieba::Jieba _jieba; 46 | simhash::Simhasher _simhasher; 47 | }; 48 | 49 | } // namespace wd 50 | -------------------------------------------------------------------------------- /offline/src/DirScanner.cc: -------------------------------------------------------------------------------- 1 | #include "DirScanner.h" 2 | #include 3 | #include 4 | #include 5 | #include "Configuration.h" 6 | #include "dirent.h" 7 | #include "unistd.h" 8 | 9 | namespace wd { 10 | DirScanner::DirScanner() { _files.reserve(20); } 11 | 12 | void DirScanner::operator()() { 13 | traverse(CONFIG[XML_PATH]); 14 | cout << ">> import xml files" << endl; 15 | for (auto& file : _files) { 16 | cout << file << endl; 17 | } 18 | } 19 | 20 | void DirScanner::traverse(const string& dirPath) { 21 | DIR* pdir = ::opendir(dirPath.c_str()); 22 | if (!pdir) { 23 | perror("opendir"); 24 | return; 25 | } 26 | ::chdir(dirPath.c_str()); 27 | struct dirent* filedirent; 28 | struct stat filestat; 29 | while ((filedirent = ::readdir(pdir)) != nullptr) { 30 | ::stat(filedirent->d_name, &filestat); 31 | if (S_ISDIR(filestat.st_mode)) { //是文件夹 32 | if (strcmp(filedirent->d_name, ".") == 0 || 33 | strcmp(filedirent->d_name, "..") == 0) { 34 | continue; 35 | } else { 36 | traverse(filedirent->d_name); //递归遍历 37 | } 38 | } else { 39 | string filePath; 40 | filePath.append(::getcwd(NULL, 0)) 41 | .append("/") 42 | 
.append(filedirent->d_name); 43 | _files.push_back(filePath); 44 | } 45 | } 46 | 47 | ::chdir(".."); 48 | ::closedir(pdir); 49 | } 50 | 51 | vector& DirScanner::getFiles() { return _files; } 52 | 53 | } // namespace wd 54 | -------------------------------------------------------------------------------- /src/threadpool/Threadpool.cc: -------------------------------------------------------------------------------- 1 | #include "threadpool/Threadpool.h" 2 | #include 3 | #include 4 | #include 5 | #include "threadpool/Thread.h" 6 | using std::string; 7 | 8 | namespace wd { 9 | Threadpool::Threadpool(size_t threadNum, size_t queSize) 10 | : _threadNum(threadNum), 11 | _queSize(queSize), 12 | _taskque(_queSize), 13 | _isExit(false) { 14 | _threads.reserve(_threadNum); 15 | } 16 | 17 | Threadpool::~Threadpool() { 18 | cout << "~Threadpool()" << endl; 19 | if (!_isExit) { 20 | stop(); 21 | } 22 | } 23 | 24 | void Threadpool::addTask(Task&& task) { _taskque.push(std::move(task)); } 25 | 26 | Task Threadpool::getTask() { return _taskque.pop(); } 27 | 28 | void Threadpool::start() { 29 | for (size_t idx = 0; idx != _threadNum; ++idx) { 30 | unique_ptr thread( 31 | new Thread(std::bind(&Threadpool::threadFunc, this))); 32 | _threads.push_back(std::move(thread)); 33 | } 34 | 35 | for (auto& thread : _threads) { 36 | thread->start(); 37 | } 38 | } 39 | 40 | //每一个子线程都要执行的任务 41 | void Threadpool::threadFunc() { 42 | while (!_isExit) { 43 | Task task = getTask(); 44 | if (task) { 45 | task(); //执行任务 46 | } 47 | } 48 | } 49 | 50 | void Threadpool::stop() { 51 | if (!_isExit) { 52 | while (!_taskque.empty()) { 53 | ::usleep(1); 54 | } 55 | _isExit = true; 56 | _taskque.wakeup(); 57 | for (auto& thread : _threads) { 58 | thread->join(); 59 | } 60 | } 61 | } 62 | } // namespace wd 63 | -------------------------------------------------------------------------------- /include/net/TCPConnection.h: -------------------------------------------------------------------------------- 1 | #pragma 
once 2 | #include 3 | #include 4 | #include 5 | #include "InetAddress.h" 6 | #include "Nocopyble.h" 7 | #include "Socket.h" 8 | #include "SocketIO.h" 9 | using std::string; 10 | using std::function; 11 | 12 | namespace wd { 13 | class TCPConnection; 14 | using TCPConnectionPtr = std::shared_ptr; 15 | using TCPConnectionCallback = function; 16 | 17 | class EventLoop; 18 | 19 | class TCPConnection 20 | : Nocopyble 21 | , public std::enable_shared_from_this { 22 | public: 23 | TCPConnection(int fd, EventLoop* loop); 24 | ~TCPConnection(); 25 | 26 | string receive(); 27 | void send(const string& msg); 28 | void sendInLoop(const string& msg); 29 | 30 | string toString() const; 31 | void shutdown(); 32 | 33 | void setConnectionCallback(const TCPConnectionCallback& cb); 34 | void setMessageCallback(const TCPConnectionCallback& cb); 35 | void setCloseCallback(const TCPConnectionCallback& cb); 36 | 37 | void handleConnectionCallback(); 38 | void handleMessageCallback(); 39 | void handleCloseCallback(); 40 | 41 | private: 42 | InetAddress getLocalAddr(); 43 | InetAddress getPeerAddr(); 44 | 45 | private: 46 | Socket _sock; 47 | SocketIO _socketIO; 48 | InetAddress _localAddr; 49 | InetAddress _peerAddr; 50 | bool _isShutdownonWrite; 51 | EventLoop* _loop; 52 | 53 | TCPConnectionCallback _onConnection; 54 | TCPConnectionCallback _onMessage; 55 | TCPConnectionCallback _onClose; 56 | }; 57 | 58 | } // namespace wd 59 | -------------------------------------------------------------------------------- /include/cppjieba/limonp/BoundedBlockingQueue.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 2 | #define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 3 | 4 | #include "BoundedQueue.hpp" 5 | 6 | namespace limonp { 7 | 8 | template 9 | class BoundedBlockingQueue : NonCopyable { 10 | public: 11 | explicit BoundedBlockingQueue(size_t maxSize) 12 | : mutex_(), 13 | notEmpty_(mutex_), 14 | notFull_(mutex_), 15 | 
queue_(maxSize) { 16 | } 17 | 18 | void Push(const T& x) { 19 | MutexLockGuard lock(mutex_); 20 | while (queue_.Full()) { 21 | notFull_.Wait(); 22 | } 23 | assert(!queue_.Full()); 24 | queue_.Push(x); 25 | notEmpty_.Notify(); 26 | } 27 | 28 | T Pop() { 29 | MutexLockGuard lock(mutex_); 30 | while (queue_.Empty()) { 31 | notEmpty_.Wait(); 32 | } 33 | assert(!queue_.Empty()); 34 | T res = queue_.Pop(); 35 | notFull_.Notify(); 36 | return res; 37 | } 38 | 39 | bool Empty() const { 40 | MutexLockGuard lock(mutex_); 41 | return queue_.Empty(); 42 | } 43 | 44 | bool Full() const { 45 | MutexLockGuard lock(mutex_); 46 | return queue_.Full(); 47 | } 48 | 49 | size_t size() const { 50 | MutexLockGuard lock(mutex_); 51 | return queue_.size(); 52 | } 53 | 54 | size_t capacity() const { 55 | return queue_.capacity(); 56 | } 57 | 58 | private: 59 | mutable MutexLock mutex_; 60 | Condition notEmpty_; 61 | Condition notFull_; 62 | BoundedQueue queue_; 63 | }; // class BoundedBlockingQueue 64 | 65 | } // namespace limonp 66 | 67 | #endif // LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 68 | -------------------------------------------------------------------------------- /include/Redis.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "mylogger.h" 3 | #include 4 | #include 5 | using std::string; 6 | 7 | namespace wd { 8 | class Redis { 9 | public: 10 | Redis() : _connect(NULL), _reply(NULL) {} 11 | 12 | ~Redis() { 13 | if (_connect) { 14 | redisFree(_connect); 15 | } 16 | _connect = NULL; 17 | _reply = NULL; 18 | } 19 | 20 | bool connect(const string& ip, int port) { 21 | _connect = redisConnect(ip.c_str(), port); 22 | if (_connect == NULL || _connect->err) { 23 | if (_connect) { 24 | LogError("Error: %s", _connect->errstr); 25 | } else { 26 | LogError("Can't allocate redis context"); 27 | } 28 | return false; 29 | } 30 | LogDebug("Connect to Redis server success"); 31 | return true; 32 | } 33 | 34 | string get(const string& key) 
{ 35 | _reply = (redisReply*)redisCommand(_connect, "GET %s", key.c_str()); 36 | LogDebug("Succeed to execute command: GET %s", key.c_str()); 37 | 38 | if(_reply->type == REDIS_REPLY_NIL) { 39 | return string("-1"); 40 | } 41 | string value = _reply->str; 42 | freeReplyObject(_reply); 43 | return value; 44 | } 45 | 46 | void set(const string& key, const string& value) { 47 | redisCommand(_connect, "SET %s %s", key.c_str(), value.c_str()); 48 | LogDebug("Succeed to execute command: SET %s %s", key.c_str(), value.c_str()); 49 | } 50 | 51 | private: 52 | redisContext* _connect; 53 | redisReply* _reply; 54 | }; 55 | 56 | } // namespace wd 57 | -------------------------------------------------------------------------------- /include/cppjieba/limonp/FileLock.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_FILELOCK_HPP 2 | #define LIMONP_FILELOCK_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace limonp { 14 | 15 | using std::string; 16 | 17 | class FileLock { 18 | public: 19 | FileLock() : fd_(-1), ok_(true) { 20 | } 21 | ~FileLock() { 22 | if(fd_ > 0) { 23 | Close(); 24 | } 25 | } 26 | void Open(const string& fname) { 27 | assert(fd_ == -1); 28 | fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644); 29 | if(fd_ < 0) { 30 | ok_ = false; 31 | err_ = strerror(errno); 32 | } 33 | } 34 | void Close() { 35 | ::close(fd_); 36 | } 37 | void Lock() { 38 | if(LockOrUnlock(fd_, true) < 0) { 39 | ok_ = false; 40 | err_ = strerror(errno); 41 | } 42 | } 43 | void UnLock() { 44 | if(LockOrUnlock(fd_, false) < 0) { 45 | ok_ = false; 46 | err_ = strerror(errno); 47 | } 48 | } 49 | bool Ok() const { 50 | return ok_; 51 | } 52 | string Error() const { 53 | return err_; 54 | } 55 | private: 56 | static int LockOrUnlock(int fd, bool lock) { 57 | errno = 0; 58 | struct flock f; 59 | memset(&f, 0, sizeof(f)); 60 | f.l_type = (lock ? 
F_WRLCK : F_UNLCK); 61 | f.l_whence = SEEK_SET; 62 | f.l_start = 0; 63 | f.l_len = 0; // Lock/unlock entire file 64 | return fcntl(fd, F_SETLK, &f); 65 | } 66 | 67 | int fd_; 68 | bool ok_; 69 | string err_; 70 | }; // class FileLock 71 | 72 | }// namespace limonp 73 | 74 | #endif // LIMONP_FILELOCK_HPP 75 | -------------------------------------------------------------------------------- /src/mylogger.cc: -------------------------------------------------------------------------------- 1 | #include "mylogger.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using std::cout; 9 | using std::endl; 10 | 11 | namespace wd{ 12 | 13 | Mylogger* Mylogger::getInstance() { 14 | pthread_once(&_once, init); 15 | return _pInstance; 16 | } 17 | 18 | void Mylogger::init() { 19 | ::atexit(destroy); 20 | _pInstance = new Mylogger(); 21 | } 22 | 23 | void Mylogger::destroy() { 24 | if (_pInstance) { 25 | delete _pInstance; 26 | _pInstance = nullptr; 27 | } 28 | } 29 | 30 | Mylogger::Mylogger() 31 | : _mycategory(Category::getRoot().getInstance("mycategory")) { 32 | PatternLayout* ptnLayout1 = (new PatternLayout()); 33 | PatternLayout* ptnLayout2 = (new PatternLayout()); 34 | OstreamAppender* ostreamAppender = 35 | (new OstreamAppender("OstreamAppender", &cout)); 36 | FileAppender* fileAppender = (new FileAppender("FileAppender", _filename)); 37 | ptnLayout1->setConversionPattern("%d [%p] %m%n"); 38 | ptnLayout2->setConversionPattern("%d [%p] %m%n"); 39 | ostreamAppender->setLayout(ptnLayout1); 40 | fileAppender->setLayout(ptnLayout2); 41 | _mycategory.setPriority(Priority::DEBUG); 42 | _mycategory.addAppender(ostreamAppender); 43 | _mycategory.addAppender(fileAppender); 44 | } 45 | 46 | Mylogger::~Mylogger() { Category::shutdown(); } 47 | 48 | Mylogger* Mylogger::_pInstance = nullptr; 49 | pthread_once_t Mylogger::_once = PTHREAD_ONCE_INIT; 50 | string Mylogger::_filename = "/home/whb/project/RssSearchEngine/log/search.log"; 51 | 52 | } 
-------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/BoundedBlockingQueue.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 2 | #define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 3 | 4 | #include "BoundedQueue.hpp" 5 | 6 | namespace limonp { 7 | 8 | template 9 | class BoundedBlockingQueue : NonCopyable { 10 | public: 11 | explicit BoundedBlockingQueue(size_t maxSize) 12 | : mutex_(), 13 | notEmpty_(mutex_), 14 | notFull_(mutex_), 15 | queue_(maxSize) { 16 | } 17 | 18 | void Push(const T& x) { 19 | MutexLockGuard lock(mutex_); 20 | while (queue_.Full()) { 21 | notFull_.Wait(); 22 | } 23 | assert(!queue_.Full()); 24 | queue_.Push(x); 25 | notEmpty_.Notify(); 26 | } 27 | 28 | T Pop() { 29 | MutexLockGuard lock(mutex_); 30 | while (queue_.Empty()) { 31 | notEmpty_.Wait(); 32 | } 33 | assert(!queue_.Empty()); 34 | T res = queue_.Pop(); 35 | notFull_.Notify(); 36 | return res; 37 | } 38 | 39 | bool Empty() const { 40 | MutexLockGuard lock(mutex_); 41 | return queue_.Empty(); 42 | } 43 | 44 | bool Full() const { 45 | MutexLockGuard lock(mutex_); 46 | return queue_.Full(); 47 | } 48 | 49 | size_t size() const { 50 | MutexLockGuard lock(mutex_); 51 | return queue_.size(); 52 | } 53 | 54 | size_t capacity() const { 55 | return queue_.capacity(); 56 | } 57 | 58 | private: 59 | mutable MutexLock mutex_; 60 | Condition notEmpty_; 61 | Condition notFull_; 62 | BoundedQueue queue_; 63 | }; // class BoundedBlockingQueue 64 | 65 | } // namespace limonp 66 | 67 | #endif // LIMONP_BOUNDED_BLOCKING_QUEUE_HPP 68 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/FileLock.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_FILELOCK_HPP 2 | #define LIMONP_FILELOCK_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 
#include 10 | #include 11 | #include 12 | 13 | namespace limonp { 14 | 15 | using std::string; 16 | 17 | class FileLock { 18 | public: 19 | FileLock() : fd_(-1), ok_(true) { 20 | } 21 | ~FileLock() { 22 | if(fd_ > 0) { 23 | Close(); 24 | } 25 | } 26 | void Open(const string& fname) { 27 | assert(fd_ == -1); 28 | fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644); 29 | if(fd_ < 0) { 30 | ok_ = false; 31 | err_ = strerror(errno); 32 | } 33 | } 34 | void Close() { 35 | ::close(fd_); 36 | } 37 | void Lock() { 38 | if(LockOrUnlock(fd_, true) < 0) { 39 | ok_ = false; 40 | err_ = strerror(errno); 41 | } 42 | } 43 | void UnLock() { 44 | if(LockOrUnlock(fd_, false) < 0) { 45 | ok_ = false; 46 | err_ = strerror(errno); 47 | } 48 | } 49 | bool Ok() const { 50 | return ok_; 51 | } 52 | string Error() const { 53 | return err_; 54 | } 55 | private: 56 | static int LockOrUnlock(int fd, bool lock) { 57 | errno = 0; 58 | struct flock f; 59 | memset(&f, 0, sizeof(f)); 60 | f.l_type = (lock ? F_WRLCK : F_UNLCK); 61 | f.l_whence = SEEK_SET; 62 | f.l_start = 0; 63 | f.l_len = 0; // Lock/unlock entire file 64 | return fcntl(fd, F_SETLK, &f); 65 | } 66 | 67 | int fd_; 68 | bool ok_; 69 | string err_; 70 | }; // class FileLock 71 | 72 | }// namespace limonp 73 | 74 | #endif // LIMONP_FILELOCK_HPP 75 | -------------------------------------------------------------------------------- /src/WordQueryServer.cc: -------------------------------------------------------------------------------- 1 | #include "WordQueryServer.h" 2 | #include "mylogger.h" 3 | using std::to_string; 4 | using std::placeholders::_1; 5 | 6 | namespace wd { 7 | WordQueryServer::WordQueryServer(int threadNum, int queSize, const string& ip, 8 | unsigned short port) 9 | : _threadpool(threadNum, queSize), _server(ip, port), _wordquery() {} 10 | 11 | void WordQueryServer::start() { 12 | _threadpool.start(); 13 | _server.setConnectionCallback( 14 | std::bind(&WordQueryServer::onConnection, this, _1)); 15 | 
_server.setMessageCallback( 16 | std::bind(&WordQueryServer::onMessage, this, _1)); 17 | _server.setCloseCallback(std::bind(&WordQueryServer::onClose, this, _1)); 18 | _server.start(); 19 | } 20 | 21 | void WordQueryServer::onConnection(const TCPConnectionPtr& conn) { 22 | LogInfo("%s connected", conn->toString().c_str()); 23 | } 24 | 25 | void WordQueryServer::onMessage(const TCPConnectionPtr& conn) { 26 | string msg = conn->receive(); 27 | if (msg.back() == '\n') { 28 | msg.erase(msg.size() - 1, 1); 29 | } 30 | LogInfo("receive from client: %s",msg.c_str()); 31 | //向任务队列中添加任务 32 | _threadpool.addTask(std::bind(&WordQueryServer::process, this, conn, msg)); 33 | } 34 | 35 | void WordQueryServer::onClose(const TCPConnectionPtr& conn) { 36 | LogInfo("%s disconnected", conn->toString().c_str()); 37 | } 38 | 39 | //运行在线程池的某一个线程 40 | void WordQueryServer::process(const TCPConnectionPtr& conn, const string& msg) { 41 | string ret = _wordquery.doQuery(msg); 42 | int sz = ret.size(); 43 | 44 | string message(to_string(sz)); 45 | message.append("\n").append(ret); 46 | cout << message << endl; 47 | 48 | conn->sendInLoop(message); 49 | } 50 | 51 | } // namespace wd 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Search Engine 2 | 3 | ### 简介 4 | 5 | 基于文本检索的轻量级搜索引擎 6 | 7 | ### 特性 8 | 9 | - 使用[针对中文文档的simhash算法库](https://github.com/yanyiwu/simhash)进行网页去重 10 | - 使用[cppjieba](https://github.com/yanyiwu/cppjieba)进行中文分词 11 | - 使用[tinyxml2](https://github.com/leethomason/tinyxml2)解析和保存网页库 12 | - 通过TF-IDF算法建立网页库的倒排索引 13 | - 服务端框架采用了Reactor + 线程池的网络库 14 | - 对查找到的网页进行排序采用了余弦相似度算法 15 | - 将查询结果封装成`json`字符串发送给前端 16 | - 使用`Redis`作为缓存系统 17 | - 使用`log4cpp`作为日志系统 18 | 19 | ### 依赖 20 | 21 | - [针对中文文档的simhash算法库](https://github.com/yanyiwu/simhash) 22 | 23 | - [CppJieba](https://github.com/yanyiwu/cppjieba) 24 | 25 | - 
[TinyXML-2](https://github.com/leethomason/tinyxml2) 26 | 27 | - [JsonCpp](https://github.com/open-source-parsers/jsoncpp) (version = 1.8.3 is recommended) 28 | 29 | - [log4cpp](http://log4cpp.sourceforge.net/) 30 | 31 | - Redis 32 | 33 | ```shell 34 | sudo apt install redis-server 35 | git clone git@github.com:redis/hiredis.git 36 | cd hiredis 37 | make 38 | ./test.sh 39 | sudo make install 40 | ``` 41 | 42 | ### 用法 43 | 44 | - ##### 离线部分 45 | 46 | ```shell 47 | cd offline 48 | mkdir bin 49 | make 50 | ./bin/offline.exe 51 | ``` 52 | 53 | - ##### 在线部分 54 | 55 | ```shell 56 | cd .. 57 | mkdir bin 58 | make 59 | ./bin/SearchEngine.exe 60 | ``` 61 | 62 | - ##### 前端搭建 63 | 64 | ```shell 65 | sudo apt-get update 66 | sudo apt-get upgrade 67 | sudo apt-get install apache2 68 | sudo apt-get install php7.2 libapache2-mod-php7.2 69 | sudo cp php_TCP /var/www/html/ 70 | ``` 71 | 72 | 在浏览器中输入http://127.0.0.1/php_TCP/index.html 73 | 74 | ### 演示 75 | 76 | 离线部分 77 | 78 | ![](.assets/Screenshot%20from%202019-09-02%2021-08-50.png) 79 | 80 | 在线部分 81 | 82 | ![](.assets/Screenshot%20from%202019-09-02%2021-23-23.png) 83 | 84 | ![](.assets/Screenshot%20from%202019-09-02%2021-21-17.png) 85 | 86 | -------------------------------------------------------------------------------- /include/cppjieba/limonp/ArgvContext.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : ascii 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | 6 | #ifndef LIMONP_ARGV_FUNCTS_H 7 | #define LIMONP_ARGV_FUNCTS_H 8 | 9 | #include 10 | #include 11 | #include "StringUtil.hpp" 12 | 13 | namespace limonp { 14 | 15 | using namespace std; 16 | 17 | class ArgvContext { 18 | public : 19 | ArgvContext(int argc, const char* const * argv) { 20 | for(int i = 0; i < argc; i++) { 21 | if(StartsWith(argv[i], "-")) { 22 | if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) { 23 | mpss_[argv[i]] = argv[i+1]; 24 | 
i++; 25 | } else { 26 | sset_.insert(argv[i]); 27 | } 28 | } else { 29 | args_.push_back(argv[i]); 30 | } 31 | } 32 | } 33 | ~ArgvContext() { 34 | } 35 | 36 | friend ostream& operator << (ostream& os, const ArgvContext& args); 37 | string operator [](size_t i) const { 38 | if(i < args_.size()) { 39 | return args_[i]; 40 | } 41 | return ""; 42 | } 43 | string operator [](const string& key) const { 44 | map::const_iterator it = mpss_.find(key); 45 | if(it != mpss_.end()) { 46 | return it->second; 47 | } 48 | return ""; 49 | } 50 | 51 | bool HasKey(const string& key) const { 52 | if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) { 53 | return true; 54 | } 55 | return false; 56 | } 57 | 58 | private: 59 | vector args_; 60 | map mpss_; 61 | set sset_; 62 | }; // class ArgvContext 63 | 64 | inline ostream& operator << (ostream& os, const ArgvContext& args) { 65 | return os< 3 | #include 4 | #include 5 | using std::cout; 6 | using std::endl; 7 | using std::ifstream; 8 | using std::istringstream; 9 | using std::make_pair; 10 | 11 | namespace wd { 12 | Configuration* Configuration::getInstance() { 13 | if (_pInstance) 14 | return _pInstance; 15 | else 16 | return nullptr; 17 | } 18 | 19 | Configuration* Configuration::getInstance(const string& filepath) { 20 | if (_pInstance == nullptr) { 21 | ::atexit(destroy); 22 | _pInstance = new Configuration(filepath); 23 | } 24 | return _pInstance; 25 | } 26 | 27 | void Configuration::destroy() { 28 | if (_pInstance) delete _pInstance; 29 | } 30 | 31 | Configuration::Configuration(const string& filepath) : _filepath(filepath) { 32 | readFile(filepath); 33 | cout << "Configuration(const string&)" << endl; 34 | } 35 | 36 | void Configuration::readFile(const string& filename) { 37 | ifstream ifs(filename); 38 | if (!ifs) { 39 | perror("fopen"); 40 | return; 41 | } 42 | string line, key, value; 43 | while (getline(ifs, line)) { 44 | istringstream iss(line); 45 | iss >> key >> value; 46 | 
_configMap.insert(make_pair(key, value)); 47 | } 48 | cout << ">> read config file success" << endl; 49 | } 50 | 51 | unordered_set& Configuration::getStopWords() { 52 | if (_stopWords.size() > 0) { 53 | return _stopWords; 54 | } 55 | 56 | ifstream ifs(_configMap[STOP_WORD_PATH]); 57 | if (!ifs) { 58 | perror("fopen"); 59 | } 60 | 61 | string line; 62 | while (getline(ifs, line)) { 63 | _stopWords.insert(line); 64 | } 65 | return _stopWords; 66 | } 67 | 68 | Configuration* Configuration::_pInstance = nullptr; 69 | } // namespace wd 70 | -------------------------------------------------------------------------------- /src/Configuration.cc: -------------------------------------------------------------------------------- 1 | #include "Configuration.h" 2 | #include "mylogger.h" 3 | #include 4 | #include 5 | #include 6 | using std::cout; 7 | using std::endl; 8 | using std::ifstream; 9 | using std::istringstream; 10 | using std::make_pair; 11 | 12 | namespace wd { 13 | Configuration* Configuration::getInstance() { 14 | if (_pInstance) 15 | return _pInstance; 16 | else 17 | return nullptr; 18 | } 19 | 20 | Configuration* Configuration::getInstance(const string& filepath) { 21 | if (_pInstance == nullptr) { 22 | ::atexit(destroy); 23 | _pInstance = new Configuration(filepath); 24 | } 25 | return _pInstance; 26 | } 27 | 28 | void Configuration::destroy() { 29 | if (_pInstance) delete _pInstance; 30 | } 31 | 32 | Configuration::Configuration(const string& filepath) : _filepath(filepath) { 33 | readFile(filepath); 34 | cout << "Configuration(const string&)" << endl; 35 | } 36 | 37 | void Configuration::readFile(const string& filename) { 38 | ifstream ifs(filename); 39 | if (!ifs) { 40 | perror("fopen"); 41 | return; 42 | } 43 | string line, key, value; 44 | while (getline(ifs, line)) { 45 | istringstream iss(line); 46 | iss >> key >> value; 47 | _configMap.insert(make_pair(key, value)); 48 | } 49 | LogDebug("read config file success"); 50 | } 51 | 52 | unordered_set& 
Configuration::getStopWords() { 53 | if (_stopWords.size() > 0) { 54 | return _stopWords; 55 | } 56 | 57 | ifstream ifs(_configMap[STOP_WORD_PATH]); 58 | if (!ifs) { 59 | perror("fopen"); 60 | } 61 | 62 | string line; 63 | while (getline(ifs, line)) { 64 | _stopWords.insert(line); 65 | } 66 | return _stopWords; 67 | } 68 | 69 | Configuration* Configuration::_pInstance = nullptr; 70 | } // namespace wd 71 | -------------------------------------------------------------------------------- /include/net/EventLoop.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "threadpool/MutexLock.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using std::function; 9 | using std::map; 10 | using std::shared_ptr; 11 | using std::vector; 12 | 13 | namespace wd { 14 | class Acceptor; 15 | class TCPConnection; 16 | 17 | class EventLoop { 18 | public: 19 | using TCPConnectionPtr = shared_ptr; 20 | using TCPConnectionCallback = function; 21 | using Functor = function; 22 | 23 | EventLoop(Acceptor& acceptor); 24 | void loop(); 25 | void unloop(); 26 | void runInloop(Functor&& cb); 27 | void setConnectionCallback(const TCPConnectionCallback&& cb) { 28 | _onConnection = std::move(cb); 29 | } 30 | 31 | void setMessageCallback(const TCPConnectionCallback&& cb) { 32 | _onMessage = std::move(cb); 33 | } 34 | 35 | void setCloseCallback(const TCPConnectionCallback&& cb) { 36 | _onClose = std::move(cb); 37 | } 38 | 39 | private: 40 | void waitEpollFd(); 41 | void handleNewConnection(); 42 | void handleMessage(int fd); 43 | int createEpollFd(); 44 | 45 | void addEpollFdRead(int fd); 46 | void delEpollFdRead(int fd); 47 | bool isConnectionClosed(int fd); 48 | 49 | int createEventFd(); 50 | void handleRead(); 51 | void wakeup(); 52 | void doPendingFunctors(); 53 | 54 | private: 55 | int _efd; 56 | int _eventfd; 57 | Acceptor& _acceptor; 58 | vector _eventList; 59 | map _conns; 60 | bool _isLooping; 61 | MutexLock 
_mutex; 62 | vector _pendingFunctors; 63 | 64 | TCPConnectionCallback _onConnection; 65 | TCPConnectionCallback _onMessage; 66 | TCPConnectionCallback _onClose; 67 | }; 68 | 69 | } // namespace wd 70 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/ArgvContext.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : ascii 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | 6 | #ifndef LIMONP_ARGV_FUNCTS_H 7 | #define LIMONP_ARGV_FUNCTS_H 8 | 9 | #include 10 | #include 11 | #include "StringUtil.hpp" 12 | 13 | namespace limonp { 14 | 15 | using namespace std; 16 | 17 | class ArgvContext { 18 | public : 19 | ArgvContext(int argc, const char* const * argv) { 20 | for(int i = 0; i < argc; i++) { 21 | if(StartsWith(argv[i], "-")) { 22 | if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) { 23 | mpss_[argv[i]] = argv[i+1]; 24 | i++; 25 | } else { 26 | sset_.insert(argv[i]); 27 | } 28 | } else { 29 | args_.push_back(argv[i]); 30 | } 31 | } 32 | } 33 | ~ArgvContext() { 34 | } 35 | 36 | friend ostream& operator << (ostream& os, const ArgvContext& args); 37 | string operator [](size_t i) const { 38 | if(i < args_.size()) { 39 | return args_[i]; 40 | } 41 | return ""; 42 | } 43 | string operator [](const string& key) const { 44 | map::const_iterator it = mpss_.find(key); 45 | if(it != mpss_.end()) { 46 | return it->second; 47 | } 48 | return ""; 49 | } 50 | 51 | bool HasKey(const string& key) const { 52 | if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) { 53 | return true; 54 | } 55 | return false; 56 | } 57 | 58 | private: 59 | vector args_; 60 | map mpss_; 61 | set sset_; 62 | }; // class ArgvContext 63 | 64 | inline ostream& operator << (ostream& os, const ArgvContext& args) { 65 | return os< 2 | 3 | 4 | 5 | search engine 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 16 | 17 |
18 |
19 | 20 |
21 | 22 |
/*
23 | 52 | 53 | 54 |
--------------------------------------------------------------------------------
/include/cppjieba/limonp/Logging.hpp:
--------------------------------------------------------------------------------
*/
#ifndef LIMONP_LOGGING_HPP
#define LIMONP_LOGGING_HPP

// NOTE(review): the five include targets were stripped from this listing. The
// headers below are the ones the code in this file actually uses
// (ostringstream, cerr, assert, abort, time/strftime) -- confirm against the
// original header.
#include <sstream>
#include <iostream>
#include <cassert>
#include <cstdlib>
#include <ctime>

#ifdef XLOG
#error "XLOG has been defined already"
#endif // XLOG
#ifdef XCHECK
#error "XCHECK has been defined already"
#endif // XCHECK

// XLOG(level) creates a temporary Logger and yields its stream; the message is
// written out by ~Logger when the temporary dies at the end of the statement.
#define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream()
// XCHECK(exp) logs at FATAL level (which aborts in ~Logger) when exp is false.
#define XCHECK(exp) if(!(exp)) XLOG(FATAL) << "exp: ["#exp << "] false. "

namespace limonp {

// Severity levels; each value is also the index of its name in LOG_LEVEL_ARRAY.
enum {
  LL_DEBUG = 0,
  LL_INFO = 1,
  LL_WARNING = 2,
  LL_ERROR = 3,
  LL_FATAL = 4,
}; // enum

static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"};
static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S";

/// One-shot log record: the constructor writes the prefix
/// "<time> <file>:<line> <LEVEL> " into an internal buffer, callers append the
/// message through Stream(), and the destructor prints the buffer to std::cerr.
/// A record at LL_FATAL calls abort() after printing.
/// When LOGGING_LEVEL is defined, records below it get no prefix and are not
/// printed.
class Logger {
 public:
  /// @param level    one of the LL_* values; must be a valid index into
  ///                 LOG_LEVEL_ARRAY
  /// @param filename source file name placed in the prefix
  /// @param lineno   source line number placed in the prefix
  Logger(size_t level, const char* filename, int lineno)
    : level_(level) {
#ifdef LOGGING_LEVEL
    if (level_ < LOGGING_LEVEL) {
      return;
    }
#endif
    // Strict bound: level_ is used as an index just below, so a value equal to
    // the element count is already out of range (the previous `<=` let it pass).
    assert(level_ < sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY));
    char buf[32];
    time_t now;
    time(&now);
    // NOTE(review): localtime() returns a pointer to shared static storage, so
    // concurrent Logger construction may race -- confirm whether that matters
    // for the callers of this header.
    strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&now));
    stream_ << buf
      << " " << filename
      << ":" << lineno
      << " " << LOG_LEVEL_ARRAY[level_]
      << " ";
  }
  ~Logger() {
#ifdef LOGGING_LEVEL
    if (level_ < LOGGING_LEVEL) {
      return;
    }
#endif
    std::cerr << stream_.str() << std::endl;
    if (level_ == LL_FATAL) {
      abort();
    }
  }

  /// Stream that receives the message text for this record.
  std::ostream& Stream() {
    return stream_;
  }

 private:
  std::ostringstream stream_;  // prefix plus message, flushed by the destructor
  size_t level_;               // LL_* value given at construction
}; // class Logger

} // namespace limonp

#endif // LIMONP_LOGGING_HPP
/*
--------------------------------------------------------------------------------
/offline/include/cppjieba/limonp/Logging.hpp:
--------------------------------------------------------------------------------
*/
#ifndef LIMONP_LOGGING_HPP
#define LIMONP_LOGGING_HPP

// NOTE(review): the five include targets were stripped from this listing. The
// headers below are the ones the code in this file actually uses
// (ostringstream, cerr, assert, abort, time/strftime) -- confirm against the
// original header.
#include <sstream>
#include <iostream>
#include <cassert>
#include <cstdlib>
#include <ctime>

#ifdef XLOG
#error "XLOG has been defined already"
#endif // XLOG
#ifdef XCHECK
#error "XCHECK has been defined already"
#endif // XCHECK

// XLOG(level) creates a temporary Logger and yields its stream; the message is
// written out by ~Logger when the temporary dies at the end of the statement.
#define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream()
// XCHECK(exp) logs at FATAL level (which aborts in ~Logger) when exp is false.
#define XCHECK(exp) if(!(exp)) XLOG(FATAL) << "exp: ["#exp << "] false. "

namespace limonp {

// Severity levels; each value is also the index of its name in LOG_LEVEL_ARRAY.
enum {
  LL_DEBUG = 0,
  LL_INFO = 1,
  LL_WARNING = 2,
  LL_ERROR = 3,
  LL_FATAL = 4,
}; // enum

static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"};
static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S";

/// One-shot log record: the constructor writes the prefix
/// "<time> <file>:<line> <LEVEL> " into an internal buffer, callers append the
/// message through Stream(), and the destructor prints the buffer to std::cerr.
/// A record at LL_FATAL calls abort() after printing.
/// When LOGGING_LEVEL is defined, records below it get no prefix and are not
/// printed.
class Logger {
 public:
  /// @param level    one of the LL_* values; must be a valid index into
  ///                 LOG_LEVEL_ARRAY
  /// @param filename source file name placed in the prefix
  /// @param lineno   source line number placed in the prefix
  Logger(size_t level, const char* filename, int lineno)
    : level_(level) {
#ifdef LOGGING_LEVEL
    if (level_ < LOGGING_LEVEL) {
      return;
    }
#endif
    // Strict bound: level_ is used as an index just below, so a value equal to
    // the element count is already out of range (the previous `<=` let it pass).
    assert(level_ < sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY));
    char buf[32];
    time_t now;
    time(&now);
    // NOTE(review): localtime() returns a pointer to shared static storage, so
    // concurrent Logger construction may race -- confirm whether that matters
    // for the callers of this header.
    strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&now));
    stream_ << buf
      << " " << filename
      << ":" << lineno
      << " " << LOG_LEVEL_ARRAY[level_]
      << " ";
  }
  ~Logger() {
#ifdef LOGGING_LEVEL
    if (level_ < LOGGING_LEVEL) {
      return;
    }
#endif
    std::cerr << stream_.str() << std::endl;
    if (level_ == LL_FATAL) {
      abort();
    }
  }

  /// Stream that receives the message text for this record.
  std::ostream& Stream() {
    return stream_;
  }

 private:
  std::ostringstream stream_;  // prefix plus message, flushed by the destructor
  size_t level_;               // LL_* value given at construction
}; // class Logger

} // namespace limonp

#endif // LIMONP_LOGGING_HPP
-------------------------------------------------------------------------------- /include/mylogger.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | using std::string; 7 | using std::to_string; 8 | using namespace log4cpp; 9 | 10 | namespace wd{ 11 | 12 | #define suffix(msg) \ 13 | std::string(msg) \ 14 | .append(" [") \ 15 | .append(__FILE__) \ 16 | .append(" : ") \ 17 | .append(__func__) \ 18 | .append(":") \ 19 | .append(to_string(__LINE__)) \ 20 | .append("]") \ 21 | .c_str() 22 | 23 | class Mylogger { 24 | public: 25 | static Mylogger* getInstance(); 26 | static void destroy(); 27 | static void init(); 28 | 29 | static void setFilename(const string& filename) { 30 | _filename = filename; 31 | } 32 | 33 | template 34 | void warn(Args... args) { 35 | _mycategory.warn(args...); 36 | } 37 | 38 | template 39 | void error(Args... args) { 40 | _mycategory.error(args...); 41 | } 42 | 43 | template 44 | void debug(Args... args) { 45 | _mycategory.debug(args...); 46 | } 47 | 48 | template 49 | void info(Args... args) { 50 | _mycategory.info(args...); 51 | } 52 | 53 | private: 54 | Mylogger(); 55 | ~Mylogger(); 56 | 57 | private: 58 | static string _filename; 59 | Category& _mycategory; 60 | static Mylogger* _pInstance; 61 | static pthread_once_t _once; 62 | }; 63 | 64 | #define LogError(msg, ...) \ 65 | Mylogger::getInstance()->error(suffix(msg), ##__VA_ARGS__) 66 | #define LogWarn(msg, ...) \ 67 | Mylogger::getInstance()->warn(suffix(msg), ##__VA_ARGS__) 68 | #define LogInfo(msg, ...) \ 69 | Mylogger::getInstance()->info(suffix(msg), ##__VA_ARGS__) 70 | #define LogDebug(msg, ...) 
\ 71 | Mylogger::getInstance()->debug(suffix(msg), ##__VA_ARGS__) 72 | 73 | } -------------------------------------------------------------------------------- /include/Configuration.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | using std::cout; 8 | using std::endl; 9 | using std::string; 10 | using std::unordered_map; 11 | using std::unordered_set; 12 | 13 | namespace wd { 14 | 15 | #define XML_PATH "xmlDir" 16 | #define RIPEPAGE_PATH "ripepageLib" 17 | #define OFFSET_PATH "offsetLib" 18 | #define NEW_RIPEPAGE_PATH "newRipepageLib" 19 | #define NEW_OFFSET_PATH "newOffsetLib" 20 | #define INDEX_PATH "invertIndex" 21 | #define DICT_PATH "dict" 22 | #define HMM_PATH "hmm_model" 23 | #define IDF_PATH "idf" 24 | #define STOP_WORD_PATH "stop_words" 25 | #define USER_DICT_PATH "user_dict" 26 | 27 | class Configuration { 28 | public: 29 | static Configuration* getInstance(); 30 | static Configuration* getInstance(const string& filepath); 31 | static void destroy(); 32 | 33 | std::unordered_map& getConfigMap() { 34 | return _configMap; 35 | } 36 | 37 | unordered_set& getStopWords(); 38 | 39 | private: 40 | Configuration(const string& filepath); 41 | ~Configuration() { cout << "~Configuration()" << endl; } 42 | void readFile(const string& filename); 43 | 44 | private: 45 | static Configuration* _pInstance; 46 | string _filepath; 47 | std::unordered_map _configMap; 48 | unordered_set _stopWords; 49 | }; 50 | 51 | #define CONFIG Configuration::getInstance()->getConfigMap() 52 | 53 | template 54 | void display(const Container& c) { 55 | typename Container::const_iterator cit = c.begin(); 56 | for (; cit != c.end(); ++cit) { 57 | cout << cit->first << " ---> " << cit->second << endl; 58 | } 59 | } 60 | 61 | template 62 | void displayS(const Container& c) { 63 | typename Container::const_iterator cit = c.begin(); 64 | while (cit != c.end()) { 65 | cout << 
*cit << endl;; 66 | ++cit; 67 | } 68 | //cout << endl; 69 | } 70 | 71 | } // namespace wd 72 | -------------------------------------------------------------------------------- /offline/include/Configuration.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | using std::cout; 8 | using std::endl; 9 | using std::string; 10 | using std::unordered_map; 11 | using std::unordered_set; 12 | 13 | namespace wd { 14 | 15 | #define XML_PATH "xmlDir" 16 | #define RIPEPAGE_PATH "ripepageLib" 17 | #define OFFSET_PATH "offsetLib" 18 | #define NEW_RIPEPAGE_PATH "newRipepageLib" 19 | #define NEW_OFFSET_PATH "newOffsetLib" 20 | #define INDEX_PATH "invertIndex" 21 | #define DICT_PATH "dict" 22 | #define HMM_PATH "hmm_model" 23 | #define IDF_PATH "idf" 24 | #define STOP_WORD_PATH "stop_words" 25 | #define USER_DICT_PATH "user_dict" 26 | 27 | class Configuration { 28 | public: 29 | static Configuration* getInstance(); 30 | static Configuration* getInstance(const string& filepath); 31 | static void destroy(); 32 | 33 | std::unordered_map& getConfigMap() { 34 | return _configMap; 35 | } 36 | 37 | unordered_set& getStopWords(); 38 | 39 | private: 40 | Configuration(const string& filepath); 41 | ~Configuration() { cout << "~Configuration()" << endl; } 42 | void readFile(const string& filename); 43 | 44 | private: 45 | static Configuration* _pInstance; 46 | string _filepath; 47 | std::unordered_map _configMap; 48 | unordered_set _stopWords; 49 | }; 50 | 51 | #define CONFIG Configuration::getInstance()->getConfigMap() 52 | 53 | template 54 | void display(const Container& c) { 55 | typename Container::const_iterator cit = c.begin(); 56 | for (; cit != c.end(); ++cit) { 57 | cout << cit->first << " ---> " << cit->second << endl; 58 | } 59 | } 60 | 61 | template 62 | void displayS(const Container& c) { 63 | typename Container::const_iterator cit = c.begin(); 64 | while (cit != 
c.end()) { 65 | cout << *cit << endl;; 66 | ++cit; 67 | } 68 | //cout << endl; 69 | } 70 | 71 | } // namespace wd 72 | -------------------------------------------------------------------------------- /include/cppjieba/limonp/ThreadPool.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_POOL_HPP 2 | #define LIMONP_THREAD_POOL_HPP 3 | 4 | #include "Thread.hpp" 5 | #include "BlockingQueue.hpp" 6 | #include "BoundedBlockingQueue.hpp" 7 | #include "Closure.hpp" 8 | 9 | namespace limonp { 10 | 11 | using namespace std; 12 | 13 | //class ThreadPool; 14 | class ThreadPool: NonCopyable { 15 | public: 16 | class Worker: public IThread { 17 | public: 18 | Worker(ThreadPool* pool): ptThreadPool_(pool) { 19 | assert(ptThreadPool_); 20 | } 21 | virtual ~Worker() { 22 | } 23 | 24 | virtual void Run() { 25 | while (true) { 26 | ClosureInterface* closure = ptThreadPool_->queue_.Pop(); 27 | if (closure == NULL) { 28 | break; 29 | } 30 | try { 31 | closure->Run(); 32 | } catch(std::exception& e) { 33 | XLOG(ERROR) << e.what(); 34 | } catch(...) 
{ 35 | XLOG(ERROR) << " unknown exception."; 36 | } 37 | delete closure; 38 | } 39 | } 40 | private: 41 | ThreadPool * ptThreadPool_; 42 | }; // class Worker 43 | 44 | ThreadPool(size_t thread_num) 45 | : threads_(thread_num), 46 | queue_(thread_num) { 47 | assert(thread_num); 48 | for(size_t i = 0; i < threads_.size(); i ++) { 49 | threads_[i] = new Worker(this); 50 | } 51 | } 52 | ~ThreadPool() { 53 | Stop(); 54 | } 55 | 56 | void Start() { 57 | for(size_t i = 0; i < threads_.size(); i++) { 58 | threads_[i]->Start(); 59 | } 60 | } 61 | void Stop() { 62 | for(size_t i = 0; i < threads_.size(); i ++) { 63 | queue_.Push(NULL); 64 | } 65 | for(size_t i = 0; i < threads_.size(); i ++) { 66 | threads_[i]->Join(); 67 | delete threads_[i]; 68 | } 69 | threads_.clear(); 70 | } 71 | 72 | void Add(ClosureInterface* task) { 73 | assert(task); 74 | queue_.Push(task); 75 | } 76 | 77 | private: 78 | friend class Worker; 79 | 80 | vector threads_; 81 | BoundedBlockingQueue queue_; 82 | }; // class ThreadPool 83 | 84 | } // namespace limonp 85 | 86 | #endif // LIMONP_THREAD_POOL_HPP 87 | -------------------------------------------------------------------------------- /include/cppjieba/PosTagger.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_POS_TAGGING_H 2 | #define CPPJIEBA_POS_TAGGING_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "SegmentTagged.hpp" 6 | #include "DictTrie.hpp" 7 | 8 | namespace cppjieba { 9 | using namespace limonp; 10 | 11 | static const char* const POS_M = "m"; 12 | static const char* const POS_ENG = "eng"; 13 | static const char* const POS_X = "x"; 14 | 15 | class PosTagger { 16 | public: 17 | PosTagger() { 18 | } 19 | ~PosTagger() { 20 | } 21 | 22 | bool Tag(const string& src, vector >& res, const SegmentTagged& segment) const { 23 | vector CutRes; 24 | segment.Cut(src, CutRes); 25 | 26 | for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { 27 | 
res.push_back(make_pair(*itr, LookupTag(*itr, segment))); 28 | } 29 | return !res.empty(); 30 | } 31 | 32 | string LookupTag(const string &str, const SegmentTagged& segment) const { 33 | const DictUnit *tmp = NULL; 34 | RuneStrArray runes; 35 | const DictTrie * dict = segment.GetDictTrie(); 36 | assert(dict != NULL); 37 | if (!DecodeRunesInString(str, runes)) { 38 | XLOG(ERROR) << "Decode failed."; 39 | return POS_X; 40 | } 41 | tmp = dict->Find(runes.begin(), runes.end()); 42 | if (tmp == NULL || tmp->tag.empty()) { 43 | return SpecialRule(runes); 44 | } else { 45 | return tmp->tag; 46 | } 47 | } 48 | 49 | private: 50 | const char* SpecialRule(const RuneStrArray& unicode) const { 51 | size_t m = 0; 52 | size_t eng = 0; 53 | for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { 54 | if (unicode[i].rune < 0x80) { 55 | eng ++; 56 | if ('0' <= unicode[i].rune && unicode[i].rune <= '9') { 57 | m++; 58 | } 59 | } 60 | } 61 | // ascii char is not found 62 | if (eng == 0) { 63 | return POS_X; 64 | } 65 | // all the ascii is number char 66 | if (m == eng) { 67 | return POS_M; 68 | } 69 | // the ascii chars contain english letter 70 | return POS_ENG; 71 | } 72 | 73 | }; // class PosTagger 74 | 75 | } // namespace cppjieba 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/ThreadPool.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_THREAD_POOL_HPP 2 | #define LIMONP_THREAD_POOL_HPP 3 | 4 | #include "Thread.hpp" 5 | #include "BlockingQueue.hpp" 6 | #include "BoundedBlockingQueue.hpp" 7 | #include "Closure.hpp" 8 | 9 | namespace limonp { 10 | 11 | using namespace std; 12 | 13 | //class ThreadPool; 14 | class ThreadPool: NonCopyable { 15 | public: 16 | class Worker: public IThread { 17 | public: 18 | Worker(ThreadPool* pool): ptThreadPool_(pool) { 19 | assert(ptThreadPool_); 20 | } 21 | virtual ~Worker() { 22 | } 23 | 
24 | virtual void Run() { 25 | while (true) { 26 | ClosureInterface* closure = ptThreadPool_->queue_.Pop(); 27 | if (closure == NULL) { 28 | break; 29 | } 30 | try { 31 | closure->Run(); 32 | } catch(std::exception& e) { 33 | XLOG(ERROR) << e.what(); 34 | } catch(...) { 35 | XLOG(ERROR) << " unknown exception."; 36 | } 37 | delete closure; 38 | } 39 | } 40 | private: 41 | ThreadPool * ptThreadPool_; 42 | }; // class Worker 43 | 44 | ThreadPool(size_t thread_num) 45 | : threads_(thread_num), 46 | queue_(thread_num) { 47 | assert(thread_num); 48 | for(size_t i = 0; i < threads_.size(); i ++) { 49 | threads_[i] = new Worker(this); 50 | } 51 | } 52 | ~ThreadPool() { 53 | Stop(); 54 | } 55 | 56 | void Start() { 57 | for(size_t i = 0; i < threads_.size(); i++) { 58 | threads_[i]->Start(); 59 | } 60 | } 61 | void Stop() { 62 | for(size_t i = 0; i < threads_.size(); i ++) { 63 | queue_.Push(NULL); 64 | } 65 | for(size_t i = 0; i < threads_.size(); i ++) { 66 | threads_[i]->Join(); 67 | delete threads_[i]; 68 | } 69 | threads_.clear(); 70 | } 71 | 72 | void Add(ClosureInterface* task) { 73 | assert(task); 74 | queue_.Push(task); 75 | } 76 | 77 | private: 78 | friend class Worker; 79 | 80 | vector threads_; 81 | BoundedBlockingQueue queue_; 82 | }; // class ThreadPool 83 | 84 | } // namespace limonp 85 | 86 | #endif // LIMONP_THREAD_POOL_HPP 87 | -------------------------------------------------------------------------------- /offline/include/cppjieba/PosTagger.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_POS_TAGGING_H 2 | #define CPPJIEBA_POS_TAGGING_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "SegmentTagged.hpp" 6 | #include "DictTrie.hpp" 7 | 8 | namespace cppjieba { 9 | using namespace limonp; 10 | 11 | static const char* const POS_M = "m"; 12 | static const char* const POS_ENG = "eng"; 13 | static const char* const POS_X = "x"; 14 | 15 | class PosTagger { 16 | public: 17 | PosTagger() { 18 | } 19 
| ~PosTagger() { 20 | } 21 | 22 | bool Tag(const string& src, vector >& res, const SegmentTagged& segment) const { 23 | vector CutRes; 24 | segment.Cut(src, CutRes); 25 | 26 | for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { 27 | res.push_back(make_pair(*itr, LookupTag(*itr, segment))); 28 | } 29 | return !res.empty(); 30 | } 31 | 32 | string LookupTag(const string &str, const SegmentTagged& segment) const { 33 | const DictUnit *tmp = NULL; 34 | RuneStrArray runes; 35 | const DictTrie * dict = segment.GetDictTrie(); 36 | assert(dict != NULL); 37 | if (!DecodeRunesInString(str, runes)) { 38 | XLOG(ERROR) << "Decode failed."; 39 | return POS_X; 40 | } 41 | tmp = dict->Find(runes.begin(), runes.end()); 42 | if (tmp == NULL || tmp->tag.empty()) { 43 | return SpecialRule(runes); 44 | } else { 45 | return tmp->tag; 46 | } 47 | } 48 | 49 | private: 50 | const char* SpecialRule(const RuneStrArray& unicode) const { 51 | size_t m = 0; 52 | size_t eng = 0; 53 | for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { 54 | if (unicode[i].rune < 0x80) { 55 | eng ++; 56 | if ('0' <= unicode[i].rune && unicode[i].rune <= '9') { 57 | m++; 58 | } 59 | } 60 | } 61 | // ascii char is not found 62 | if (eng == 0) { 63 | return POS_X; 64 | } 65 | // all the ascii is number char 66 | if (m == eng) { 67 | return POS_M; 68 | } 69 | // the ascii chars contain english letter 70 | return POS_ENG; 71 | } 72 | 73 | }; // class PosTagger 74 | 75 | } // namespace cppjieba 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /src/net/SocketIO.cc: -------------------------------------------------------------------------------- 1 | #include "net/SocketIO.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace wd { 10 | SocketIO::SocketIO(int fd) : _fd(fd) {} 11 | 12 | int SocketIO::readn(char* buf, int len) { 13 | int left = len; 14 | char* p = buf; 15 | while (left 
> 0) { 16 | int ret = ::read(_fd, p, left); 17 | if (ret == -1 && errno == EINTR) { 18 | continue; 19 | } else if (ret == -1) { 20 | perror("read"); 21 | return len - left; 22 | } else if (ret == 0) { 23 | return len - left; 24 | } else { 25 | left -= ret; 26 | p += ret; 27 | } 28 | } 29 | return len - left; 30 | } 31 | /* Reads one line into buf: peeks at pending data, consumes up to and including the first '\n', and NUL-terminates. At most maxLen - 1 bytes are stored. Returns the number of bytes stored; stops early when recvPeek reports peer close (0) or an error (-1). */ 32 | // Read one line of data 33 | int SocketIO::readLine(char* buf, int maxLen) { if (maxLen <= 0) { return 0; } /* no room even for the terminating NUL */ 34 | int left = maxLen - 1; 35 | char* p = buf; 36 | int ret; 37 | int total = 0; 38 | while (left > 0) { 39 | ret = recvPeek(p, left); if (ret <= 0) { break; } /* 0: peer closed, -1: recv failed; without this check the loop never ends (0) or scans past the buffer (-1) */ 40 | // look for '\n' 41 | for (int idx = 0; idx != ret; ++idx) { 42 | if (p[idx] == '\n') { 43 | int sz = idx + 1; 44 | readn(p, sz); 45 | total += sz; 46 | p += sz; 47 | *p = '\0'; 48 | return total; 49 | } 50 | } 51 | // no '\n' in the peeked data: consume it and keep reading 52 | readn(p, ret); 53 | left -= ret; 54 | p += ret; 55 | total += ret; 56 | } 57 | // no '\n' was found before the buffer filled up or the stream ended 58 | *p = '\0'; 59 | return total; 60 | } 61 | 62 | int SocketIO::recvPeek(char* buf, int len) { 63 | int ret; 64 | do { 65 | ret = ::recv(_fd, buf, len, MSG_PEEK); 66 | } while (ret == -1 && errno == EINTR); 67 | return ret; 68 | } 69 | 70 | int SocketIO::writen(const char* buf, int len) { 71 | int left = len; 72 | const char* p = buf; 73 | while (left > 0) { 74 | int ret = ::write(_fd, p, left); 75 | if (ret == -1 && errno == EINTR) { 76 | continue; 77 | } else if (ret == -1) { 78 | perror("write"); 79 | return len - left; 80 | } else { 81 | left -= ret; 82 | p += ret; 83 | } 84 | } 85 | 86 | printf(">> writen finish\n"); 87 | return len - left; 88 | } 89 | } // namespace wd 90 | -------------------------------------------------------------------------------- /offline/src/WebPage.cc: -------------------------------------------------------------------------------- 1 | #include "WebPage.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | using std::cout; 7 | using std::endl; 8 | using std::ostringstream; 9 | using std::vector; 10 | using std::to_string; 11 | 12 | // this define can avoid some logs which you don't need to care 
about. 13 | #define LOGGER_LEVEL LL_WARN 14 | 15 | namespace wd { 16 | WebPage::WebPage(int id, const string& title, const string& link, 17 | const string& content) 18 | : _docid(id), 19 | _title(title), 20 | _link(link), 21 | _content(content), 22 | _simhashVal(0) {} 23 | 24 | void WebPage::generateSimhash(WordSegmentation& simhasher) { 25 | size_t topN = 6; 26 | _simhashVal = simhasher.makeSimhash(_content, topN); 27 | } 28 | 29 | void WebPage::buildWordsMap(WordSegmentation& jieba) { 30 | unordered_set& stopWords = 31 | Configuration::getInstance()->getStopWords(); 32 | 33 | vector words = jieba.cutWords(_content); 34 | 35 | for (auto& word : words) { 36 | if (stopWords.count(word) == 0) { 37 | ++_wordsMap[word]; 38 | } 39 | } 40 | } 41 | 42 | void WebPage::insertDoc(XMLDocument& pageLib) { 43 | XMLElement* doc = pageLib.NewElement("doc"); 44 | pageLib.InsertEndChild(doc); 45 | XMLElement* docid = pageLib.NewElement("docid"); 46 | XMLText* docidText = pageLib.NewText(to_string(_docid).c_str()); 47 | docid->InsertEndChild(docidText); 48 | doc->InsertEndChild(docid); 49 | XMLElement* title = pageLib.NewElement("title"); 50 | title->InsertEndChild(pageLib.NewText(_title.c_str())); 51 | doc->InsertEndChild(title); 52 | XMLElement* link = pageLib.NewElement("link"); 53 | link->InsertEndChild(pageLib.NewText(_link.c_str())); 54 | doc->InsertEndChild(link); 55 | XMLElement* content = pageLib.NewElement("content"); 56 | content->InsertEndChild(pageLib.NewText(_content.c_str())); 57 | doc->InsertEndChild(content); 58 | } 59 | 60 | #if 0 61 | string WebPage::getDoc() { 62 | ostringstream oss; 63 | oss << "" << '\n' 64 | << '\t' << "" << _docid << "" << '\n' 65 | << '\t' << "" << _title << "" << '\n' 66 | << '\t' << "" << _link << "" << '\n' 67 | << '\t' << "" << _content << "" << '\n' 68 | << "" << '\n'; 69 | return oss.str(); 70 | } 71 | #endif 72 | 73 | bool WebPage::operator<(const WebPage& rhs) { 74 | return _simhashVal < rhs._simhashVal; 75 | } 76 | 77 | bool 
WebPage::operator==(const WebPage& rhs) { 78 | return simhash::Simhasher::isEqual(_simhashVal, rhs._simhashVal); 79 | } 80 | 81 | } // namespace wd 82 | -------------------------------------------------------------------------------- /include/cppjieba/limonp/Config.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : utf8 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef LIMONP_CONFIG_H 6 | #define LIMONP_CONFIG_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "StringUtil.hpp" 13 | 14 | namespace limonp { 15 | 16 | using namespace std; 17 | 18 | class Config { 19 | public: 20 | explicit Config(const string& filePath) { 21 | LoadFile(filePath); 22 | } 23 | 24 | operator bool () { 25 | return !map_.empty(); 26 | } 27 | 28 | string Get(const string& key, const string& defaultvalue) const { 29 | map::const_iterator it = map_.find(key); 30 | if(map_.end() != it) { 31 | return it->second; 32 | } 33 | return defaultvalue; 34 | } 35 | int Get(const string& key, int defaultvalue) const { 36 | string str = Get(key, ""); 37 | if("" == str) { 38 | return defaultvalue; 39 | } 40 | return atoi(str.c_str()); 41 | } 42 | const char* operator [] (const char* key) const { 43 | if(NULL == key) { 44 | return NULL; 45 | } 46 | map::const_iterator it = map_.find(key); 47 | if(map_.end() != it) { 48 | return it->second.c_str(); 49 | } 50 | return NULL; 51 | } 52 | 53 | string GetConfigInfo() const { 54 | string res; 55 | res << *this; 56 | return res; 57 | } 58 | 59 | private: 60 | void LoadFile(const string& filePath) { 61 | ifstream ifs(filePath.c_str()); 62 | assert(ifs); 63 | string line; 64 | vector vecBuf; 65 | size_t lineno = 0; 66 | while(getline(ifs, line)) { 67 | lineno ++; 68 | Trim(line); 69 | if(line.empty() || StartsWith(line, "#")) { 70 | continue; 71 | } 72 | vecBuf.clear(); 73 | Split(line, vecBuf, "="); 74 | if(2 != 
vecBuf.size()) { 75 | fprintf(stderr, "line[%s] illegal.\n", line.c_str()); 76 | assert(false); 77 | continue; 78 | } 79 | string& key = vecBuf[0]; 80 | string& value = vecBuf[1]; 81 | Trim(key); 82 | Trim(value); 83 | if(!map_.insert(make_pair(key, value)).second) { 84 | fprintf(stderr, "key[%s] already exits.\n", key.c_str()); 85 | assert(false); 86 | continue; 87 | } 88 | } 89 | ifs.close(); 90 | } 91 | 92 | friend ostream& operator << (ostream& os, const Config& config); 93 | 94 | map map_; 95 | }; // class Config 96 | 97 | inline ostream& operator << (ostream& os, const Config& config) { 98 | return os << config.map_; 99 | } 100 | 101 | } // namespace limonp 102 | 103 | #endif // LIMONP_CONFIG_H 104 | -------------------------------------------------------------------------------- /src/net/TCPConnection.cc: -------------------------------------------------------------------------------- 1 | #include "net/TCPConnection.h" 2 | #include "net/EventLoop.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace wd { 9 | TCPConnection::TCPConnection(int fd, EventLoop* loop) 10 | : _sock(fd), 11 | _socketIO(fd), 12 | _localAddr(getLocalAddr()), 13 | _peerAddr(getPeerAddr()), 14 | _isShutdownonWrite(false), 15 | _loop(loop) {} 16 | 17 | TCPConnection::~TCPConnection() { 18 | if (!_isShutdownonWrite) { 19 | shutdown(); 20 | } 21 | } 22 | 23 | string TCPConnection::receive() { 24 | char buf[65536] = {0}; 25 | _socketIO.readLine(buf, sizeof(buf)); 26 | return string(buf); 27 | } 28 | 29 | void TCPConnection::send(const string& msg) { 30 | _socketIO.writen(msg.c_str(), msg.size()); 31 | } 32 | 33 | void TCPConnection::sendInLoop(const string& msg) { 34 | if(_loop) { 35 | _loop->runInloop(std::bind(&TCPConnection::send, this, msg)); 36 | } 37 | } 38 | 39 | void TCPConnection::shutdown() { 40 | if (!_isShutdownonWrite) { 41 | _isShutdownonWrite = true; 42 | _sock.shutdownonWrite(); 43 | } 44 | } 45 | 46 | string TCPConnection::toString() const { 47 | 
std::ostringstream oss; 48 | oss << _localAddr.ip() << ":" << _localAddr.port() << " --> " 49 | << _peerAddr.ip() << ":" << _peerAddr.port(); 50 | return oss.str(); 51 | } 52 | 53 | InetAddress TCPConnection::getLocalAddr() { 54 | struct sockaddr_in addr; 55 | socklen_t len = sizeof(struct sockaddr); 56 | if (getsockname(_sock.fd(), (struct sockaddr*)&addr, &len) == -1) { 57 | perror("getsockname"); 58 | } 59 | return InetAddress(addr); 60 | } 61 | /* Returns the address of the remote end of the connection. Uses getpeername(); getsockname() reports the local end. A failure is logged with perror and the (unfilled) address is still returned. */ 62 | InetAddress TCPConnection::getPeerAddr() { 63 | struct sockaddr_in addr; 64 | socklen_t len = sizeof(struct sockaddr); 65 | if (getpeername(_sock.fd(), (struct sockaddr*)&addr, &len) == -1) { 66 | perror("getpeername"); 67 | } 68 | return InetAddress(addr); 69 | } 70 | 71 | void TCPConnection::setConnectionCallback(const TCPConnectionCallback& cb) { 72 | _onConnection = cb; 73 | } 74 | 75 | void TCPConnection::setMessageCallback(const TCPConnectionCallback& cb) { 76 | _onMessage = cb; 77 | } 78 | 79 | void TCPConnection::setCloseCallback(const TCPConnectionCallback& cb) { 80 | _onClose = cb; 81 | } 82 | 83 | void TCPConnection::handleConnectionCallback() { 84 | if(_onConnection) { 85 | _onConnection(shared_from_this()); 86 | } 87 | } 88 | 89 | void TCPConnection::handleMessageCallback() { 90 | if(_onMessage) { 91 | _onMessage(shared_from_this()); 92 | } 93 | } 94 | 95 | void TCPConnection::handleCloseCallback() { 96 | if(_onClose) { 97 | _onClose(shared_from_this()); 98 | } 99 | } 100 | 101 | } // namespace wd 102 | -------------------------------------------------------------------------------- /include/cppjieba/FullSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_FULLSEGMENT_H 2 | #define CPPJIEBA_FULLSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentBase.hpp" 10 | #include "Unicode.hpp" 11 | 12 | namespace cppjieba { 13 | class FullSegment: public SegmentBase { 14 
| public: 15 | FullSegment(const string& dictPath) { 16 | dictTrie_ = new DictTrie(dictPath); 17 | isNeedDestroy_ = true; 18 | } 19 | FullSegment(const DictTrie* dictTrie) 20 | : dictTrie_(dictTrie), isNeedDestroy_(false) { 21 | assert(dictTrie_); 22 | } 23 | ~FullSegment() { 24 | if (isNeedDestroy_) { 25 | delete dictTrie_; 26 | } 27 | } 28 | void Cut(const string& sentence, 29 | vector& words) const { 30 | vector tmp; 31 | Cut(sentence, tmp); 32 | GetStringsFromWords(tmp, words); 33 | } 34 | void Cut(const string& sentence, 35 | vector& words) const { 36 | PreFilter pre_filter(symbols_, sentence); 37 | PreFilter::Range range; 38 | vector wrs; 39 | wrs.reserve(sentence.size()/2); 40 | while (pre_filter.HasNext()) { 41 | range = pre_filter.Next(); 42 | Cut(range.begin, range.end, wrs); 43 | } 44 | words.clear(); 45 | words.reserve(wrs.size()); 46 | GetWordsFromWordRanges(sentence, wrs, words); 47 | } 48 | void Cut(RuneStrArray::const_iterator begin, 49 | RuneStrArray::const_iterator end, 50 | vector& res) const { 51 | // resut of searching in trie tree 52 | LocalVector > tRes; 53 | 54 | // max index of res's words 55 | size_t maxIdx = 0; 56 | 57 | // always equals to (uItr - begin) 58 | size_t uIdx = 0; 59 | 60 | // tmp variables 61 | size_t wordLen = 0; 62 | assert(dictTrie_); 63 | vector dags; 64 | dictTrie_->Find(begin, end, dags); 65 | for (size_t i = 0; i < dags.size(); i++) { 66 | for (size_t j = 0; j < dags[i].nexts.size(); j++) { 67 | size_t nextoffset = dags[i].nexts[j].first; 68 | assert(nextoffset < dags.size()); 69 | const DictUnit* du = dags[i].nexts[j].second; 70 | if (du == NULL) { 71 | if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { 72 | WordRange wr(begin + i, begin + nextoffset); 73 | res.push_back(wr); 74 | } 75 | } else { 76 | wordLen = du->word.size(); 77 | if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { 78 | WordRange wr(begin + i, begin + nextoffset); 79 | res.push_back(wr); 80 | } 81 | } 82 | maxIdx = uIdx + wordLen > 
maxIdx ? uIdx + wordLen : maxIdx; 83 | } 84 | uIdx++; 85 | } 86 | } 87 | private: 88 | const DictTrie* dictTrie_; 89 | bool isNeedDestroy_; 90 | }; 91 | } 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/Config.hpp: -------------------------------------------------------------------------------- 1 | /************************************ 2 | * file enc : utf8 3 | * author : wuyanyi09@gmail.com 4 | ************************************/ 5 | #ifndef LIMONP_CONFIG_H 6 | #define LIMONP_CONFIG_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "StringUtil.hpp" 13 | 14 | namespace limonp { 15 | 16 | using namespace std; 17 | 18 | class Config { 19 | public: 20 | explicit Config(const string& filePath) { 21 | LoadFile(filePath); 22 | } 23 | 24 | operator bool () { 25 | return !map_.empty(); 26 | } 27 | 28 | string Get(const string& key, const string& defaultvalue) const { 29 | map::const_iterator it = map_.find(key); 30 | if(map_.end() != it) { 31 | return it->second; 32 | } 33 | return defaultvalue; 34 | } 35 | int Get(const string& key, int defaultvalue) const { 36 | string str = Get(key, ""); 37 | if("" == str) { 38 | return defaultvalue; 39 | } 40 | return atoi(str.c_str()); 41 | } 42 | const char* operator [] (const char* key) const { 43 | if(NULL == key) { 44 | return NULL; 45 | } 46 | map::const_iterator it = map_.find(key); 47 | if(map_.end() != it) { 48 | return it->second.c_str(); 49 | } 50 | return NULL; 51 | } 52 | 53 | string GetConfigInfo() const { 54 | string res; 55 | res << *this; 56 | return res; 57 | } 58 | 59 | private: 60 | void LoadFile(const string& filePath) { 61 | ifstream ifs(filePath.c_str()); 62 | assert(ifs); 63 | string line; 64 | vector vecBuf; 65 | size_t lineno = 0; 66 | while(getline(ifs, line)) { 67 | lineno ++; 68 | Trim(line); 69 | if(line.empty() || StartsWith(line, "#")) { 70 | continue; 71 | } 72 | vecBuf.clear(); 73 | 
Split(line, vecBuf, "="); 74 | if(2 != vecBuf.size()) { 75 | fprintf(stderr, "line[%s] illegal.\n", line.c_str()); 76 | assert(false); 77 | continue; 78 | } 79 | string& key = vecBuf[0]; 80 | string& value = vecBuf[1]; 81 | Trim(key); 82 | Trim(value); 83 | if(!map_.insert(make_pair(key, value)).second) { 84 | fprintf(stderr, "key[%s] already exits.\n", key.c_str()); 85 | assert(false); 86 | continue; 87 | } 88 | } 89 | ifs.close(); 90 | } 91 | 92 | friend ostream& operator << (ostream& os, const Config& config); 93 | 94 | map map_; 95 | }; // class Config 96 | 97 | inline ostream& operator << (ostream& os, const Config& config) { 98 | return os << config.map_; 99 | } 100 | 101 | } // namespace limonp 102 | 103 | #endif // LIMONP_CONFIG_H 104 | -------------------------------------------------------------------------------- /offline/include/cppjieba/FullSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_FULLSEGMENT_H 2 | #define CPPJIEBA_FULLSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentBase.hpp" 10 | #include "Unicode.hpp" 11 | 12 | namespace cppjieba { 13 | class FullSegment: public SegmentBase { 14 | public: 15 | FullSegment(const string& dictPath) { 16 | dictTrie_ = new DictTrie(dictPath); 17 | isNeedDestroy_ = true; 18 | } 19 | FullSegment(const DictTrie* dictTrie) 20 | : dictTrie_(dictTrie), isNeedDestroy_(false) { 21 | assert(dictTrie_); 22 | } 23 | ~FullSegment() { 24 | if (isNeedDestroy_) { 25 | delete dictTrie_; 26 | } 27 | } 28 | void Cut(const string& sentence, 29 | vector& words) const { 30 | vector tmp; 31 | Cut(sentence, tmp); 32 | GetStringsFromWords(tmp, words); 33 | } 34 | void Cut(const string& sentence, 35 | vector& words) const { 36 | PreFilter pre_filter(symbols_, sentence); 37 | PreFilter::Range range; 38 | vector wrs; 39 | wrs.reserve(sentence.size()/2); 40 | while (pre_filter.HasNext()) { 
41 | range = pre_filter.Next(); 42 | Cut(range.begin, range.end, wrs); 43 | } 44 | words.clear(); 45 | words.reserve(wrs.size()); 46 | GetWordsFromWordRanges(sentence, wrs, words); 47 | } 48 | void Cut(RuneStrArray::const_iterator begin, 49 | RuneStrArray::const_iterator end, 50 | vector& res) const { 51 | // resut of searching in trie tree 52 | LocalVector > tRes; 53 | 54 | // max index of res's words 55 | size_t maxIdx = 0; 56 | 57 | // always equals to (uItr - begin) 58 | size_t uIdx = 0; 59 | 60 | // tmp variables 61 | size_t wordLen = 0; 62 | assert(dictTrie_); 63 | vector dags; 64 | dictTrie_->Find(begin, end, dags); 65 | for (size_t i = 0; i < dags.size(); i++) { 66 | for (size_t j = 0; j < dags[i].nexts.size(); j++) { 67 | size_t nextoffset = dags[i].nexts[j].first; 68 | assert(nextoffset < dags.size()); 69 | const DictUnit* du = dags[i].nexts[j].second; 70 | if (du == NULL) { 71 | if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { 72 | WordRange wr(begin + i, begin + nextoffset); 73 | res.push_back(wr); 74 | } 75 | } else { 76 | wordLen = du->word.size(); 77 | if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { 78 | WordRange wr(begin + i, begin + nextoffset); 79 | res.push_back(wr); 80 | } 81 | } 82 | maxIdx = uIdx + wordLen > maxIdx ? 
uIdx + wordLen : maxIdx; 83 | } 84 | uIdx++; 85 | } 86 | } 87 | private: 88 | const DictTrie* dictTrie_; 89 | bool isNeedDestroy_; 90 | }; 91 | } 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /include/cppjieba/QuerySegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_QUERYSEGMENT_H 2 | #define CPPJIEBA_QUERYSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentBase.hpp" 10 | #include "FullSegment.hpp" 11 | #include "MixSegment.hpp" 12 | #include "Unicode.hpp" 13 | #include "DictTrie.hpp" 14 | 15 | namespace cppjieba { 16 | class QuerySegment: public SegmentBase { 17 | public: 18 | QuerySegment(const string& dict, const string& model, const string& userDict = "") 19 | : mixSeg_(dict, model, userDict), 20 | trie_(mixSeg_.GetDictTrie()) { 21 | } 22 | QuerySegment(const DictTrie* dictTrie, const HMMModel* model) 23 | : mixSeg_(dictTrie, model), trie_(dictTrie) { 24 | } 25 | ~QuerySegment() { 26 | } 27 | 28 | void Cut(const string& sentence, vector& words) const { 29 | Cut(sentence, words, true); 30 | } 31 | void Cut(const string& sentence, vector& words, bool hmm) const { 32 | vector tmp; 33 | Cut(sentence, tmp, hmm); 34 | GetStringsFromWords(tmp, words); 35 | } 36 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 37 | PreFilter pre_filter(symbols_, sentence); 38 | PreFilter::Range range; 39 | vector wrs; 40 | wrs.reserve(sentence.size()/2); 41 | while (pre_filter.HasNext()) { 42 | range = pre_filter.Next(); 43 | Cut(range.begin, range.end, wrs, hmm); 44 | } 45 | words.clear(); 46 | words.reserve(wrs.size()); 47 | GetWordsFromWordRanges(sentence, wrs, words); 48 | } 49 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 50 | //use mix Cut first 51 | vector mixRes; 52 | mixSeg_.Cut(begin, 
end, mixRes, hmm); 53 | 54 | vector fullRes; 55 | for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { 56 | if (mixResItr->Length() > 2) { 57 | for (size_t i = 0; i + 1 < mixResItr->Length(); i++) { 58 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 1); 59 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 60 | res.push_back(wr); 61 | } 62 | } 63 | } 64 | if (mixResItr->Length() > 3) { 65 | for (size_t i = 0; i + 2 < mixResItr->Length(); i++) { 66 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 2); 67 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 68 | res.push_back(wr); 69 | } 70 | } 71 | } 72 | res.push_back(*mixResItr); 73 | } 74 | } 75 | private: 76 | bool IsAllAscii(const Unicode& s) const { 77 | for(size_t i = 0; i < s.size(); i++) { 78 | if (s[i] >= 0x80) { 79 | return false; 80 | } 81 | } 82 | return true; 83 | } 84 | MixSegment mixSeg_; 85 | const DictTrie* trie_; 86 | }; // QuerySegment 87 | 88 | } // namespace cppjieba 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /offline/include/cppjieba/QuerySegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_QUERYSEGMENT_H 2 | #define CPPJIEBA_QUERYSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentBase.hpp" 10 | #include "FullSegment.hpp" 11 | #include "MixSegment.hpp" 12 | #include "Unicode.hpp" 13 | #include "DictTrie.hpp" 14 | 15 | namespace cppjieba { 16 | class QuerySegment: public SegmentBase { 17 | public: 18 | QuerySegment(const string& dict, const string& model, const string& userDict = "") 19 | : mixSeg_(dict, model, userDict), 20 | trie_(mixSeg_.GetDictTrie()) { 21 | } 22 | QuerySegment(const DictTrie* dictTrie, const HMMModel* model) 23 | : mixSeg_(dictTrie, model), trie_(dictTrie) { 24 | } 25 | ~QuerySegment() { 26 | } 27 | 28 | 
void Cut(const string& sentence, vector& words) const { 29 | Cut(sentence, words, true); 30 | } 31 | void Cut(const string& sentence, vector& words, bool hmm) const { 32 | vector tmp; 33 | Cut(sentence, tmp, hmm); 34 | GetStringsFromWords(tmp, words); 35 | } 36 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 37 | PreFilter pre_filter(symbols_, sentence); 38 | PreFilter::Range range; 39 | vector wrs; 40 | wrs.reserve(sentence.size()/2); 41 | while (pre_filter.HasNext()) { 42 | range = pre_filter.Next(); 43 | Cut(range.begin, range.end, wrs, hmm); 44 | } 45 | words.clear(); 46 | words.reserve(wrs.size()); 47 | GetWordsFromWordRanges(sentence, wrs, words); 48 | } 49 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 50 | //use mix Cut first 51 | vector mixRes; 52 | mixSeg_.Cut(begin, end, mixRes, hmm); 53 | 54 | vector fullRes; 55 | for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { 56 | if (mixResItr->Length() > 2) { 57 | for (size_t i = 0; i + 1 < mixResItr->Length(); i++) { 58 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 1); 59 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 60 | res.push_back(wr); 61 | } 62 | } 63 | } 64 | if (mixResItr->Length() > 3) { 65 | for (size_t i = 0; i + 2 < mixResItr->Length(); i++) { 66 | WordRange wr(mixResItr->left + i, mixResItr->left + i + 2); 67 | if (trie_->Find(wr.left, wr.right + 1) != NULL) { 68 | res.push_back(wr); 69 | } 70 | } 71 | } 72 | res.push_back(*mixResItr); 73 | } 74 | } 75 | private: 76 | bool IsAllAscii(const Unicode& s) const { 77 | for(size_t i = 0; i < s.size(); i++) { 78 | if (s[i] >= 0x80) { 79 | return false; 80 | } 81 | } 82 | return true; 83 | } 84 | MixSegment mixSeg_; 85 | const DictTrie* trie_; 86 | }; // QuerySegment 87 | 88 | } // namespace cppjieba 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- 
/src/WebPage.cc: -------------------------------------------------------------------------------- 1 | #include "WebPage.h" 2 | #include 3 | #include 4 | //#include "Configuration.h" 5 | using std::cout; 6 | using std::endl; 7 | using std::istringstream; 8 | 9 | namespace wd { 10 | WebPage::WebPage(int id, const string& title, const string& link, 11 | const string& content) 12 | : _docid(id), _title(title), _link(link), _content(content) {} 13 | 14 | string WebPage::summary(const vector& queryWords) { 15 | vector summaryVec; 16 | string period = "。"; 17 | istringstream iss(_content); 18 | string line; 19 | while (iss >> line) { 20 | for (auto word : queryWords) { 21 | string result; 22 | size_t pos = line.find(word); 23 | if (pos != string::npos) { 24 | if (length(line) >= 100) { 25 | //找到word之前的距离word最近的第一个。 26 | size_t p1 = line.rfind(period, pos); 27 | size_t p2 = line.find(period, pos); 28 | 29 | if (p1 != string::npos && 30 | p2 != string::npos) { // word前后都有句号 31 | p1 += getBytes(line[p1]); 32 | p2 -= getBytes(line[p2]); 33 | 34 | result = line.substr(p1, p2 - p1); 35 | } else if (p1 != string::npos && 36 | p2 == string::npos) { // word之后没有句号 37 | p1 += getBytes(line[p1]); 38 | //取100个字长 39 | string temp = line.substr(p1); 40 | size_t len = length(temp); 41 | if (len > 100) { 42 | len = 100; 43 | } 44 | p2 = p1; 45 | for (size_t ilen = 0; ilen < len; ++ilen, ++p2) { 46 | size_t bytes = getBytes(temp[p1]); 47 | p2 += (bytes - 1); 48 | } 49 | result = line.substr(p1, p2 - p1); 50 | } else if (p1 == string::npos && 51 | p2 != string::npos) { // word之前没有句号 52 | p2 -= getBytes(line[p2]); 53 | 54 | result = line.substr(0, p2); 55 | } 56 | } else { 57 | result = line; 58 | } 59 | result.append("..."); 60 | summaryVec.push_back(result); 61 | break; 62 | } 63 | } 64 | 65 | if (summaryVec.size() >= 2) { 66 | break; 67 | } 68 | } 69 | string summary; 70 | for (auto s : summaryVec) { 71 | summary.append(s).append("\n"); 72 | } 73 | return summary; 74 | } 75 | 76 | size_t 
WebPage::getBytes(const char ch) { 77 | if (ch & (1 << 7)) { 78 | int nBytes = 1; 79 | for (int idx = 0; idx != 6; ++idx) { 80 | if (ch & (1 << (6 - idx))) { 81 | ++nBytes; 82 | } else 83 | break; 84 | } 85 | return nBytes; 86 | } 87 | return 1; 88 | } 89 | 90 | size_t WebPage::length(const std::string& str) { 91 | std::size_t ilen = 0; 92 | for (std::size_t idx = 0; idx != str.size(); ++idx) { 93 | int nBytes = getBytes(str[idx]); 94 | idx += (nBytes - 1); 95 | ++ilen; 96 | } 97 | return ilen; 98 | } 99 | 100 | } // namespace wd -------------------------------------------------------------------------------- /include/cppjieba/MixSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MIXSEGMENT_H 2 | #define CPPJIEBA_MIXSEGMENT_H 3 | 4 | #include 5 | #include "MPSegment.hpp" 6 | #include "HMMSegment.hpp" 7 | #include "limonp/StringUtil.hpp" 8 | #include "PosTagger.hpp" 9 | 10 | namespace cppjieba { 11 | class MixSegment: public SegmentTagged { 12 | public: 13 | MixSegment(const string& mpSegDict, const string& hmmSegDict, 14 | const string& userDict = "") 15 | : mpSeg_(mpSegDict, userDict), 16 | hmmSeg_(hmmSegDict) { 17 | } 18 | MixSegment(const DictTrie* dictTrie, const HMMModel* model) 19 | : mpSeg_(dictTrie), hmmSeg_(model) { 20 | } 21 | ~MixSegment() { 22 | } 23 | 24 | void Cut(const string& sentence, vector& words) const { 25 | Cut(sentence, words, true); 26 | } 27 | void Cut(const string& sentence, vector& words, bool hmm) const { 28 | vector tmp; 29 | Cut(sentence, tmp, hmm); 30 | GetStringsFromWords(tmp, words); 31 | } 32 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 33 | PreFilter pre_filter(symbols_, sentence); 34 | PreFilter::Range range; 35 | vector wrs; 36 | wrs.reserve(sentence.size() / 2); 37 | while (pre_filter.HasNext()) { 38 | range = pre_filter.Next(); 39 | Cut(range.begin, range.end, wrs, hmm); 40 | } 41 | words.clear(); 42 | words.reserve(wrs.size()); 43 | 
GetWordsFromWordRanges(sentence, wrs, words); 44 | } 45 | 46 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 47 | if (!hmm) { 48 | mpSeg_.Cut(begin, end, res); 49 | return; 50 | } 51 | vector words; 52 | assert(end >= begin); 53 | words.reserve(end - begin); 54 | mpSeg_.Cut(begin, end, words); 55 | 56 | vector hmmRes; 57 | hmmRes.reserve(end - begin); 58 | for (size_t i = 0; i < words.size(); i++) { 59 | //if mp Get a word, it's ok, put it into result 60 | if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { 61 | res.push_back(words[i]); 62 | continue; 63 | } 64 | 65 | // if mp Get a single one and it is not in userdict, collect it in sequence 66 | size_t j = i; 67 | while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { 68 | j++; 69 | } 70 | 71 | // Cut the sequence with hmm 72 | assert(j - 1 >= i); 73 | // TODO 74 | hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes); 75 | //put hmm result to result 76 | for (size_t k = 0; k < hmmRes.size(); k++) { 77 | res.push_back(hmmRes[k]); 78 | } 79 | 80 | //clear tmp vars 81 | hmmRes.clear(); 82 | 83 | //let i jump over this piece 84 | i = j - 1; 85 | } 86 | } 87 | 88 | const DictTrie* GetDictTrie() const { 89 | return mpSeg_.GetDictTrie(); 90 | } 91 | 92 | bool Tag(const string& src, vector >& res) const { 93 | return tagger_.Tag(src, res, *this); 94 | } 95 | 96 | string LookupTag(const string &str) const { 97 | return tagger_.LookupTag(str, *this); 98 | } 99 | 100 | private: 101 | MPSegment mpSeg_; 102 | HMMSegment hmmSeg_; 103 | PosTagger tagger_; 104 | 105 | }; // class MixSegment 106 | 107 | } // namespace cppjieba 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /include/cppjieba/limonp/LocalVector.hpp: 
/*
 * LocalVector : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector may be dangerous..
 * LocalVector is simple and not well-tested.
 *
 * Elements live in an inline buffer of LOCAL_VECTOR_BUFFER_SIZE slots until
 * that is full; after that they live in a malloc'ed block that doubles on
 * demand. Elements are moved with memcpy, hence the restriction on T.
 */
const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
template <class T>
class LocalVector {
 public:
  typedef const T* const_iterator ;
  typedef T value_type;
  typedef size_t size_type;
 private:
  T buffer_[LOCAL_VECTOR_BUFFER_SIZE];  // inline storage
  T * ptr_;                             // buffer_ or a malloc'ed block
  size_t size_;
  size_t capacity_;
 public:
  LocalVector() {
    init_();
  }
  LocalVector(const LocalVector<T>& vec) {
    init_();
    *this = vec;
  }
  LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster
    init_();
    while(begin != end) {
      push_back(*begin++);
    }
  }
  LocalVector(size_t size, const T& t) { // TODO: make it faster
    init_();
    while(size--) {
      push_back(t);
    }
  }
  ~LocalVector() {
    if(ptr_ != buffer_) {
      free(ptr_);
    }
  }
 public:
  /// Replaces the contents with a copy of `vec`.
  LocalVector<T>& operator = (const LocalVector<T>& vec) {
    // Self-assignment must be a no-op: clear() below would otherwise release
    // the very storage that is about to be copied from.
    if(this == &vec) {
      return *this;
    }
    clear();
    size_ = vec.size();
    capacity_ = vec.capacity();
    if(vec.buffer_ == vec.ptr_) {
      memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
      ptr_ = buffer_;
    } else {
      ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
      assert(ptr_);
      memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
    }
    return *this;
  }
 private:
  /// Resets to the empty state backed by the inline buffer (frees nothing).
  void init_() {
    ptr_ = buffer_;
    size_ = 0;
    capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
  }
 public:
  /// Unchecked element access.
  T& operator [] (size_t i) {
    return ptr_[i];
  }
  const T& operator [] (size_t i) const {
    return ptr_[i];
  }
  /// Appends `t`, doubling the capacity when the storage is full.
  void push_back(const T& t) {
    if(size_ == capacity_) {
      assert(capacity_);
      // `t` may refer to an element of this vector; copy it before reserve()
      // releases the storage it lives in.
      const T value = t;
      reserve(capacity_ * 2);
      ptr_[size_ ++ ] = value;
      return;
    }
    ptr_[size_ ++ ] = t;
  }
  /// Grows the capacity to at least `size`; never shrinks.
  void reserve(size_t size) {
    if(size <= capacity_) {
      return;
    }
    T * next = (T*)malloc(sizeof(T) * size);
    assert(next);
    T * old = ptr_;
    ptr_ = next;
    memcpy(ptr_, old, sizeof(T) * capacity_);
    capacity_ = size;
    if(old != buffer_) {
      free(old);
    }
  }
  bool empty() const {
    return 0 == size();
  }
  size_t size() const {
    return size_;
  }
  size_t capacity() const {
    return capacity_;
  }
  const_iterator begin() const {
    return ptr_;
  }
  const_iterator end() const {
    return ptr_ + size_;
  }
  /// Drops all elements and returns to the inline buffer.
  void clear() {
    if(ptr_ != buffer_) {
      free(ptr_);
    }
    init_();
  }
};
vector wrs; 36 | wrs.reserve(sentence.size() / 2); 37 | while (pre_filter.HasNext()) { 38 | range = pre_filter.Next(); 39 | Cut(range.begin, range.end, wrs, hmm); 40 | } 41 | words.clear(); 42 | words.reserve(wrs.size()); 43 | GetWordsFromWordRanges(sentence, wrs, words); 44 | } 45 | 46 | void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { 47 | if (!hmm) { 48 | mpSeg_.Cut(begin, end, res); 49 | return; 50 | } 51 | vector words; 52 | assert(end >= begin); 53 | words.reserve(end - begin); 54 | mpSeg_.Cut(begin, end, words); 55 | 56 | vector hmmRes; 57 | hmmRes.reserve(end - begin); 58 | for (size_t i = 0; i < words.size(); i++) { 59 | //if mp Get a word, it's ok, put it into result 60 | if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { 61 | res.push_back(words[i]); 62 | continue; 63 | } 64 | 65 | // if mp Get a single one and it is not in userdict, collect it in sequence 66 | size_t j = i; 67 | while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { 68 | j++; 69 | } 70 | 71 | // Cut the sequence with hmm 72 | assert(j - 1 >= i); 73 | // TODO 74 | hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes); 75 | //put hmm result to result 76 | for (size_t k = 0; k < hmmRes.size(); k++) { 77 | res.push_back(hmmRes[k]); 78 | } 79 | 80 | //clear tmp vars 81 | hmmRes.clear(); 82 | 83 | //let i jump over this piece 84 | i = j - 1; 85 | } 86 | } 87 | 88 | const DictTrie* GetDictTrie() const { 89 | return mpSeg_.GetDictTrie(); 90 | } 91 | 92 | bool Tag(const string& src, vector >& res) const { 93 | return tagger_.Tag(src, res, *this); 94 | } 95 | 96 | string LookupTag(const string &str) const { 97 | return tagger_.LookupTag(str, *this); 98 | } 99 | 100 | private: 101 | MPSegment mpSeg_; 102 | HMMSegment hmmSeg_; 103 | PosTagger tagger_; 104 | 105 | }; // class MixSegment 
106 | 107 | } // namespace cppjieba 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/LocalVector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_LOCAL_VECTOR_HPP 2 | #define LIMONP_LOCAL_VECTOR_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace limonp { 10 | using namespace std; 11 | /* 12 | * LocalVector : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector may be dangerous.. 13 | * LocalVector is simple and not well-tested. 14 | */ 15 | const size_t LOCAL_VECTOR_BUFFER_SIZE = 16; 16 | template 17 | class LocalVector { 18 | public: 19 | typedef const T* const_iterator ; 20 | typedef T value_type; 21 | typedef size_t size_type; 22 | private: 23 | T buffer_[LOCAL_VECTOR_BUFFER_SIZE]; 24 | T * ptr_; 25 | size_t size_; 26 | size_t capacity_; 27 | public: 28 | LocalVector() { 29 | init_(); 30 | }; 31 | LocalVector(const LocalVector& vec) { 32 | init_(); 33 | *this = vec; 34 | } 35 | LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster 36 | init_(); 37 | while(begin != end) { 38 | push_back(*begin++); 39 | } 40 | } 41 | LocalVector(size_t size, const T& t) { // TODO: make it faster 42 | init_(); 43 | while(size--) { 44 | push_back(t); 45 | } 46 | } 47 | ~LocalVector() { 48 | if(ptr_ != buffer_) { 49 | free(ptr_); 50 | } 51 | }; 52 | public: 53 | LocalVector& operator = (const LocalVector& vec) { 54 | clear(); 55 | size_ = vec.size(); 56 | capacity_ = vec.capacity(); 57 | if(vec.buffer_ == vec.ptr_) { 58 | memcpy(buffer_, vec.buffer_, sizeof(T) * size_); 59 | ptr_ = buffer_; 60 | } else { 61 | ptr_ = (T*) malloc(vec.capacity() * sizeof(T)); 62 | assert(ptr_); 63 | memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T)); 64 | } 65 | return *this; 66 | } 67 | private: 68 | void init_() { 69 | ptr_ = buffer_; 70 | size_ = 0; 71 | capacity_ = 
LOCAL_VECTOR_BUFFER_SIZE; 72 | } 73 | public: 74 | T& operator [] (size_t i) { 75 | return ptr_[i]; 76 | } 77 | const T& operator [] (size_t i) const { 78 | return ptr_[i]; 79 | } 80 | void push_back(const T& t) { 81 | if(size_ == capacity_) { 82 | assert(capacity_); 83 | reserve(capacity_ * 2); 84 | } 85 | ptr_[size_ ++ ] = t; 86 | } 87 | void reserve(size_t size) { 88 | if(size <= capacity_) { 89 | return; 90 | } 91 | T * next = (T*)malloc(sizeof(T) * size); 92 | assert(next); 93 | T * old = ptr_; 94 | ptr_ = next; 95 | memcpy(ptr_, old, sizeof(T) * capacity_); 96 | capacity_ = size; 97 | if(old != buffer_) { 98 | free(old); 99 | } 100 | } 101 | bool empty() const { 102 | return 0 == size(); 103 | } 104 | size_t size() const { 105 | return size_; 106 | } 107 | size_t capacity() const { 108 | return capacity_; 109 | } 110 | const_iterator begin() const { 111 | return ptr_; 112 | } 113 | const_iterator end() const { 114 | return ptr_ + size_; 115 | } 116 | void clear() { 117 | if(ptr_ != buffer_) { 118 | free(ptr_); 119 | } 120 | init_(); 121 | } 122 | }; 123 | 124 | template 125 | ostream & operator << (ostream& os, const LocalVector& vec) { 126 | if(vec.empty()) { 127 | return os << "[]"; 128 | } 129 | os<<"[\""< 5 | 6 | #ifdef __APPLE__ 7 | #include 8 | #include 9 | #elif(__cplusplus >= 201103L) 10 | #include 11 | #include 12 | #elif defined _MSC_VER 13 | #include 14 | #include 15 | #else 16 | #include 17 | #include 18 | namespace std { 19 | using std::tr1::unordered_map; 20 | using std::tr1::unordered_set; 21 | } 22 | 23 | #endif 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | 33 | namespace std { 34 | 35 | template 36 | ostream& operator << (ostream& os, const vector& v) { 37 | if(v.empty()) { 38 | return os << "[]"; 39 | } 40 | os<<"["< 49 | inline ostream& operator << (ostream& os, const vector& v) { 50 | if(v.empty()) { 51 | return os << "[]"; 52 | } 53 | os<<"[\""< 62 | ostream& operator << (ostream& os, 
const deque& dq) { 63 | if(dq.empty()) { 64 | return os << "[]"; 65 | } 66 | os<<"[\""< 76 | ostream& operator << (ostream& os, const pair& pr) { 77 | os << pr.first << ":" << pr.second ; 78 | return os; 79 | } 80 | 81 | 82 | template 83 | string& operator << (string& str, const T& obj) { 84 | stringstream ss; 85 | ss << obj; // call ostream& operator << (ostream& os, 86 | return str = ss.str(); 87 | } 88 | 89 | template 90 | ostream& operator << (ostream& os, const map& mp) { 91 | if(mp.empty()) { 92 | os<<"{}"; 93 | return os; 94 | } 95 | os<<'{'; 96 | typename map::const_iterator it = mp.begin(); 97 | os<<*it; 98 | it++; 99 | while(it != mp.end()) { 100 | os<<", "<<*it; 101 | it++; 102 | } 103 | os<<'}'; 104 | return os; 105 | } 106 | template 107 | ostream& operator << (ostream& os, const std::unordered_map& mp) { 108 | if(mp.empty()) { 109 | return os << "{}"; 110 | } 111 | os<<'{'; 112 | typename std::unordered_map::const_iterator it = mp.begin(); 113 | os<<*it; 114 | it++; 115 | while(it != mp.end()) { 116 | os<<", "<<*it++; 117 | } 118 | return os<<'}'; 119 | } 120 | 121 | template 122 | ostream& operator << (ostream& os, const set& st) { 123 | if(st.empty()) { 124 | os << "{}"; 125 | return os; 126 | } 127 | os<<'{'; 128 | typename set::const_iterator it = st.begin(); 129 | os<<*it; 130 | it++; 131 | while(it != st.end()) { 132 | os<<", "<<*it; 133 | it++; 134 | } 135 | os<<'}'; 136 | return os; 137 | } 138 | 139 | template 140 | bool IsIn(const ContainType& contain, const KeyType& key) { 141 | return contain.end() != contain.find(key); 142 | } 143 | 144 | template 145 | basic_string & operator << (basic_string & s, ifstream & ifs) { 146 | return s.assign((istreambuf_iterator(ifs)), istreambuf_iterator()); 147 | } 148 | 149 | template 150 | ofstream & operator << (ofstream & ofs, const basic_string& s) { 151 | ostreambuf_iterator itr (ofs); 152 | copy(s.begin(), s.end(), itr); 153 | return ofs; 154 | } 155 | 156 | } // namespace std 157 | 158 | #endif 159 
| -------------------------------------------------------------------------------- /include/cppjieba/HMMModel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_HMMMODEL_H 2 | #define CPPJIEBA_HMMMODEL_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "Trie.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | using namespace limonp; 10 | typedef unordered_map EmitProbMap; 11 | 12 | struct HMMModel { 13 | /* 14 | * STATUS: 15 | * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S 16 | * */ 17 | enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; 18 | 19 | HMMModel(const string& modelPath) { 20 | memset(startProb, 0, sizeof(startProb)); 21 | memset(transProb, 0, sizeof(transProb)); 22 | statMap[0] = 'B'; 23 | statMap[1] = 'E'; 24 | statMap[2] = 'M'; 25 | statMap[3] = 'S'; 26 | emitProbVec.push_back(&emitProbB); 27 | emitProbVec.push_back(&emitProbE); 28 | emitProbVec.push_back(&emitProbM); 29 | emitProbVec.push_back(&emitProbS); 30 | LoadModel(modelPath); 31 | } 32 | ~HMMModel() { 33 | } 34 | void LoadModel(const string& filePath) { 35 | ifstream ifile(filePath.c_str()); 36 | XCHECK(ifile.is_open()) << "open " << filePath << " failed"; 37 | string line; 38 | vector tmp; 39 | vector tmp2; 40 | //Load startProb 41 | XCHECK(GetLine(ifile, line)); 42 | Split(line, tmp, " "); 43 | XCHECK(tmp.size() == STATUS_SUM); 44 | for (size_t j = 0; j< tmp.size(); j++) { 45 | startProb[j] = atof(tmp[j].c_str()); 46 | } 47 | 48 | //Load transProb 49 | for (size_t i = 0; i < STATUS_SUM; i++) { 50 | XCHECK(GetLine(ifile, line)); 51 | Split(line, tmp, " "); 52 | XCHECK(tmp.size() == STATUS_SUM); 53 | for (size_t j =0; j < STATUS_SUM; j++) { 54 | transProb[i][j] = atof(tmp[j].c_str()); 55 | } 56 | } 57 | 58 | //Load emitProbB 59 | XCHECK(GetLine(ifile, line)); 60 | XCHECK(LoadEmitProb(line, emitProbB)); 61 | 62 | //Load emitProbE 63 | XCHECK(GetLine(ifile, line)); 64 | XCHECK(LoadEmitProb(line, emitProbE)); 65 | 66 | //Load 
emitProbM 67 | XCHECK(GetLine(ifile, line)); 68 | XCHECK(LoadEmitProb(line, emitProbM)); 69 | 70 | //Load emitProbS 71 | XCHECK(GetLine(ifile, line)); 72 | XCHECK(LoadEmitProb(line, emitProbS)); 73 | } 74 | double GetEmitProb(const EmitProbMap* ptMp, Rune key, 75 | double defVal)const { 76 | EmitProbMap::const_iterator cit = ptMp->find(key); 77 | if (cit == ptMp->end()) { 78 | return defVal; 79 | } 80 | return cit->second; 81 | } 82 | bool GetLine(ifstream& ifile, string& line) { 83 | while (getline(ifile, line)) { 84 | Trim(line); 85 | if (line.empty()) { 86 | continue; 87 | } 88 | if (StartsWith(line, "#")) { 89 | continue; 90 | } 91 | return true; 92 | } 93 | return false; 94 | } 95 | bool LoadEmitProb(const string& line, EmitProbMap& mp) { 96 | if (line.empty()) { 97 | return false; 98 | } 99 | vector tmp, tmp2; 100 | Unicode unicode; 101 | Split(line, tmp, ","); 102 | for (size_t i = 0; i < tmp.size(); i++) { 103 | Split(tmp[i], tmp2, ":"); 104 | if (2 != tmp2.size()) { 105 | XLOG(ERROR) << "emitProb illegal."; 106 | return false; 107 | } 108 | if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { 109 | XLOG(ERROR) << "TransCode failed."; 110 | return false; 111 | } 112 | mp[unicode[0]] = atof(tmp2[1].c_str()); 113 | } 114 | return true; 115 | } 116 | 117 | char statMap[STATUS_SUM]; 118 | double startProb[STATUS_SUM]; 119 | double transProb[STATUS_SUM][STATUS_SUM]; 120 | EmitProbMap emitProbB; 121 | EmitProbMap emitProbE; 122 | EmitProbMap emitProbM; 123 | EmitProbMap emitProbS; 124 | vector emitProbVec; 125 | }; // struct HMMModel 126 | 127 | } // namespace cppjieba 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /offline/src/RssReader.cc: -------------------------------------------------------------------------------- 1 | #include "Configuration.h" 2 | #include "RssReader.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | using std::cout; 8 | using std::endl; 9 | using 
std::istringstream; 10 | using std::ostringstream; 11 | using std::to_string; 12 | 13 | namespace wd { 14 | 15 | RssReader::RssReader(vector& files) : _files(files) { 16 | cout << "RssReader()" << endl; 17 | } 18 | 19 | RssReader::~RssReader() { cout << " ~RssReader()" << endl; } 20 | 21 | void RssReader::loadFiles() { 22 | for (auto& path : _files) { 23 | loadXML(path); 24 | } 25 | cout << ">> parse xml files success" << endl; 26 | } 27 | 28 | void RssReader::loadXML(const string& xmlPath) { 29 | XMLDocument doc; 30 | doc.LoadFile(xmlPath.c_str()); 31 | if (doc.ErrorID()) { 32 | cout << ">> tinyxml load file error!" << endl; 33 | return; 34 | } 35 | parseRss(doc); 36 | } 37 | 38 | void RssReader::parseRss(XMLDocument& doc) { 39 | XMLElement* root = doc.RootElement(); 40 | XMLElement* channel = root->FirstChildElement("channel"); 41 | XMLElement* item = channel->FirstChildElement("item"); 42 | 43 | do { 44 | RssItem tmp; 45 | XMLElement* pNode = item->FirstChildElement("title"); 46 | tmp.title = pNode->GetText(); 47 | pNode = item->FirstChildElement("link"); 48 | tmp.link = pNode->GetText(); 49 | pNode = item->FirstChildElement("description"); 50 | string descrip = pNode->GetText(); 51 | pNode = item->FirstChildElement("content:encoded"); 52 | 53 | if (pNode) { 54 | tmp.content = pNode->GetText(); 55 | } else { 56 | tmp.content = descrip; 57 | } 58 | 59 | // boost::regex reg("<[^>]*>"); 60 | boost::regex reg("<.*?>"); //效果一样 61 | tmp.content = boost::regex_replace(tmp.content, reg, ""); 62 | //去掉多余的空格和换行 63 | // boost::regex reg2("[\\s]+"); 64 | boost::regex reg2(R"(( )+|(\n)+)"); 65 | tmp.content = boost::regex_replace(tmp.content, reg2, "$1$2"); 66 | 67 | _rssItems.push_back(tmp); 68 | } while (item = item->NextSiblingElement()); 69 | } 70 | 71 | void RssReader::createXML() { 72 | string declaration = ""; 73 | XMLDocument pageLib; 74 | pageLib.Parse(declaration.c_str()); 75 | 76 | int i = 0; 77 | for (auto& item : _rssItems) { 78 | XMLElement* doc = 
pageLib.NewElement("doc"); 79 | pageLib.InsertEndChild(doc); 80 | XMLElement* docid = pageLib.NewElement("docid"); 81 | XMLText* docidText = pageLib.NewText(to_string(++i).c_str()); 82 | docid->InsertEndChild(docidText); 83 | doc->InsertEndChild(docid); 84 | XMLElement* title = pageLib.NewElement("title"); 85 | title->InsertEndChild(pageLib.NewText(item.title.c_str())); 86 | doc->InsertEndChild(title); 87 | XMLElement* link = pageLib.NewElement("link"); 88 | link->InsertEndChild(pageLib.NewText(item.link.c_str())); 89 | doc->InsertEndChild(link); 90 | XMLElement* content = pageLib.NewElement("content"); 91 | content->InsertEndChild(pageLib.NewText(item.content.c_str())); 92 | doc->InsertEndChild(content); 93 | } 94 | 95 | pageLib.SaveFile(CONFIG[RIPEPAGE_PATH].c_str()); 96 | } 97 | 98 | void RssReader::makePages(vector& pages) { 99 | int i = 0; 100 | for (auto& item : _rssItems) { 101 | ostringstream oss; 102 | oss << "" << '\n' 103 | << '\t' << "" << ++i << "" << '\n' 104 | << '\t' << "" << item.title << "" << '\n' 105 | << '\t' << "" << item.link << "" << '\n' 106 | << '\t' << "" << item.content << "" << '\n' 107 | << "" << '\n'; 108 | pages.push_back(oss.str()); 109 | } 110 | } 111 | 112 | } // namespace wd -------------------------------------------------------------------------------- /offline/include/cppjieba/HMMModel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_HMMMODEL_H 2 | #define CPPJIEBA_HMMMODEL_H 3 | 4 | #include "limonp/StringUtil.hpp" 5 | #include "Trie.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | using namespace limonp; 10 | typedef unordered_map EmitProbMap; 11 | 12 | struct HMMModel { 13 | /* 14 | * STATUS: 15 | * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S 16 | * */ 17 | enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; 18 | 19 | HMMModel(const string& modelPath) { 20 | memset(startProb, 0, sizeof(startProb)); 21 | memset(transProb, 0, sizeof(transProb)); 22 | 
statMap[0] = 'B'; 23 | statMap[1] = 'E'; 24 | statMap[2] = 'M'; 25 | statMap[3] = 'S'; 26 | emitProbVec.push_back(&emitProbB); 27 | emitProbVec.push_back(&emitProbE); 28 | emitProbVec.push_back(&emitProbM); 29 | emitProbVec.push_back(&emitProbS); 30 | LoadModel(modelPath); 31 | } 32 | ~HMMModel() { 33 | } 34 | void LoadModel(const string& filePath) { 35 | ifstream ifile(filePath.c_str()); 36 | XCHECK(ifile.is_open()) << "open " << filePath << " failed"; 37 | string line; 38 | vector tmp; 39 | vector tmp2; 40 | //Load startProb 41 | XCHECK(GetLine(ifile, line)); 42 | Split(line, tmp, " "); 43 | XCHECK(tmp.size() == STATUS_SUM); 44 | for (size_t j = 0; j< tmp.size(); j++) { 45 | startProb[j] = atof(tmp[j].c_str()); 46 | } 47 | 48 | //Load transProb 49 | for (size_t i = 0; i < STATUS_SUM; i++) { 50 | XCHECK(GetLine(ifile, line)); 51 | Split(line, tmp, " "); 52 | XCHECK(tmp.size() == STATUS_SUM); 53 | for (size_t j =0; j < STATUS_SUM; j++) { 54 | transProb[i][j] = atof(tmp[j].c_str()); 55 | } 56 | } 57 | 58 | //Load emitProbB 59 | XCHECK(GetLine(ifile, line)); 60 | XCHECK(LoadEmitProb(line, emitProbB)); 61 | 62 | //Load emitProbE 63 | XCHECK(GetLine(ifile, line)); 64 | XCHECK(LoadEmitProb(line, emitProbE)); 65 | 66 | //Load emitProbM 67 | XCHECK(GetLine(ifile, line)); 68 | XCHECK(LoadEmitProb(line, emitProbM)); 69 | 70 | //Load emitProbS 71 | XCHECK(GetLine(ifile, line)); 72 | XCHECK(LoadEmitProb(line, emitProbS)); 73 | } 74 | double GetEmitProb(const EmitProbMap* ptMp, Rune key, 75 | double defVal)const { 76 | EmitProbMap::const_iterator cit = ptMp->find(key); 77 | if (cit == ptMp->end()) { 78 | return defVal; 79 | } 80 | return cit->second; 81 | } 82 | bool GetLine(ifstream& ifile, string& line) { 83 | while (getline(ifile, line)) { 84 | Trim(line); 85 | if (line.empty()) { 86 | continue; 87 | } 88 | if (StartsWith(line, "#")) { 89 | continue; 90 | } 91 | return true; 92 | } 93 | return false; 94 | } 95 | bool LoadEmitProb(const string& line, EmitProbMap& mp) { 96 | 
if (line.empty()) { 97 | return false; 98 | } 99 | vector tmp, tmp2; 100 | Unicode unicode; 101 | Split(line, tmp, ","); 102 | for (size_t i = 0; i < tmp.size(); i++) { 103 | Split(tmp[i], tmp2, ":"); 104 | if (2 != tmp2.size()) { 105 | XLOG(ERROR) << "emitProb illegal."; 106 | return false; 107 | } 108 | if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { 109 | XLOG(ERROR) << "TransCode failed."; 110 | return false; 111 | } 112 | mp[unicode[0]] = atof(tmp2[1].c_str()); 113 | } 114 | return true; 115 | } 116 | 117 | char statMap[STATUS_SUM]; 118 | double startProb[STATUS_SUM]; 119 | double transProb[STATUS_SUM][STATUS_SUM]; 120 | EmitProbMap emitProbB; 121 | EmitProbMap emitProbE; 122 | EmitProbMap emitProbM; 123 | EmitProbMap emitProbS; 124 | vector emitProbVec; 125 | }; // struct HMMModel 126 | 127 | } // namespace cppjieba 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /offline/include/cppjieba/limonp/StdExtension.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_STD_EXTEMSION_HPP 2 | #define LIMONP_STD_EXTEMSION_HPP 3 | 4 | #include 5 | 6 | #ifdef __APPLE__ 7 | #include 8 | #include 9 | #elif(__cplusplus >= 201103L) 10 | #include 11 | #include 12 | #elif defined _MSC_VER 13 | #include 14 | #include 15 | #else 16 | #include 17 | #include 18 | namespace std { 19 | using std::tr1::unordered_map; 20 | using std::tr1::unordered_set; 21 | } 22 | 23 | #endif 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | 33 | namespace std { 34 | 35 | template 36 | ostream& operator << (ostream& os, const vector& v) { 37 | if(v.empty()) { 38 | return os << "[]"; 39 | } 40 | os<<"["< 49 | inline ostream& operator << (ostream& os, const vector& v) { 50 | if(v.empty()) { 51 | return os << "[]"; 52 | } 53 | os<<"[\""< 62 | ostream& operator << (ostream& os, const deque& dq) { 63 | if(dq.empty()) { 64 | return 
os << "[]"; 65 | } 66 | os<<"[\""< 76 | ostream& operator << (ostream& os, const pair& pr) { 77 | os << pr.first << ":" << pr.second ; 78 | return os; 79 | } 80 | 81 | 82 | template 83 | string& operator << (string& str, const T& obj) { 84 | stringstream ss; 85 | ss << obj; // call ostream& operator << (ostream& os, 86 | return str = ss.str(); 87 | } 88 | 89 | template 90 | ostream& operator << (ostream& os, const map& mp) { 91 | if(mp.empty()) { 92 | os<<"{}"; 93 | return os; 94 | } 95 | os<<'{'; 96 | typename map::const_iterator it = mp.begin(); 97 | os<<*it; 98 | it++; 99 | while(it != mp.end()) { 100 | os<<", "<<*it; 101 | it++; 102 | } 103 | os<<'}'; 104 | return os; 105 | } 106 | template 107 | ostream& operator << (ostream& os, const std::unordered_map& mp) { 108 | if(mp.empty()) { 109 | return os << "{}"; 110 | } 111 | os<<'{'; 112 | typename std::unordered_map::const_iterator it = mp.begin(); 113 | os<<*it; 114 | it++; 115 | while(it != mp.end()) { 116 | os<<", "<<*it++; 117 | } 118 | return os<<'}'; 119 | } 120 | 121 | template 122 | ostream& operator << (ostream& os, const set& st) { 123 | if(st.empty()) { 124 | os << "{}"; 125 | return os; 126 | } 127 | os<<'{'; 128 | typename set::const_iterator it = st.begin(); 129 | os<<*it; 130 | it++; 131 | while(it != st.end()) { 132 | os<<", "<<*it; 133 | it++; 134 | } 135 | os<<'}'; 136 | return os; 137 | } 138 | 139 | template 140 | bool IsIn(const ContainType& contain, const KeyType& key) { 141 | return contain.end() != contain.find(key); 142 | } 143 | 144 | template 145 | basic_string & operator << (basic_string & s, ifstream & ifs) { 146 | return s.assign((istreambuf_iterator(ifs)), istreambuf_iterator()); 147 | } 148 | 149 | template 150 | ofstream & operator << (ofstream & ofs, const basic_string& s) { 151 | ostreambuf_iterator itr (ofs); 152 | copy(s.begin(), s.end(), itr); 153 | return ofs; 154 | } 155 | 156 | } // namespace std 157 | 158 | #endif 159 | 
-------------------------------------------------------------------------------- /include/cppjieba/Jieba.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEAB_JIEBA_H 2 | #define CPPJIEAB_JIEBA_H 3 | 4 | #include "QuerySegment.hpp" 5 | #include "KeywordExtractor.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | class Jieba { 10 | public: 11 | Jieba(const string& dict_path, 12 | const string& model_path, 13 | const string& user_dict_path, 14 | const string& idfPath, 15 | const string& stopWordPath) 16 | : dict_trie_(dict_path, user_dict_path), 17 | model_(model_path), 18 | mp_seg_(&dict_trie_), 19 | hmm_seg_(&model_), 20 | mix_seg_(&dict_trie_, &model_), 21 | full_seg_(&dict_trie_), 22 | query_seg_(&dict_trie_, &model_), 23 | extractor(&dict_trie_, &model_, idfPath, stopWordPath) { 24 | } 25 | ~Jieba() { 26 | } 27 | 28 | struct LocWord { 29 | string word; 30 | size_t begin; 31 | size_t end; 32 | }; // struct LocWord 33 | 34 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 35 | mix_seg_.Cut(sentence, words, hmm); 36 | } 37 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 38 | mix_seg_.Cut(sentence, words, hmm); 39 | } 40 | void CutAll(const string& sentence, vector& words) const { 41 | full_seg_.Cut(sentence, words); 42 | } 43 | void CutAll(const string& sentence, vector& words) const { 44 | full_seg_.Cut(sentence, words); 45 | } 46 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 47 | query_seg_.Cut(sentence, words, hmm); 48 | } 49 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 50 | query_seg_.Cut(sentence, words, hmm); 51 | } 52 | void CutHMM(const string& sentence, vector& words) const { 53 | hmm_seg_.Cut(sentence, words); 54 | } 55 | void CutHMM(const string& sentence, vector& words) const { 56 | hmm_seg_.Cut(sentence, words); 57 | } 58 | void CutSmall(const string& sentence, vector& words, size_t 
max_word_len) const { 59 | mp_seg_.Cut(sentence, words, max_word_len); 60 | } 61 | void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { 62 | mp_seg_.Cut(sentence, words, max_word_len); 63 | } 64 | 65 | void Tag(const string& sentence, vector >& words) const { 66 | mix_seg_.Tag(sentence, words); 67 | } 68 | string LookupTag(const string &str) const { 69 | return mix_seg_.LookupTag(str); 70 | } 71 | bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { 72 | return dict_trie_.InsertUserWord(word, tag); 73 | } 74 | 75 | bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { 76 | return dict_trie_.InsertUserWord(word,freq, tag); 77 | } 78 | 79 | bool Find(const string& word) 80 | { 81 | return dict_trie_.Find(word); 82 | } 83 | 84 | void ResetSeparators(const string& s) { 85 | //TODO 86 | mp_seg_.ResetSeparators(s); 87 | hmm_seg_.ResetSeparators(s); 88 | mix_seg_.ResetSeparators(s); 89 | full_seg_.ResetSeparators(s); 90 | query_seg_.ResetSeparators(s); 91 | } 92 | 93 | const DictTrie* GetDictTrie() const { 94 | return &dict_trie_; 95 | } 96 | 97 | const HMMModel* GetHMMModel() const { 98 | return &model_; 99 | } 100 | 101 | void LoadUserDict(const vector& buf) { 102 | dict_trie_.LoadUserDict(buf); 103 | } 104 | 105 | void LoadUserDict(const set& buf) { 106 | dict_trie_.LoadUserDict(buf); 107 | } 108 | 109 | void LoadUserDict(const string& path) { 110 | dict_trie_.LoadUserDict(path); 111 | } 112 | 113 | private: 114 | DictTrie dict_trie_; 115 | HMMModel model_; 116 | 117 | // They share the same dict trie and model 118 | MPSegment mp_seg_; 119 | HMMSegment hmm_seg_; 120 | MixSegment mix_seg_; 121 | FullSegment full_seg_; 122 | QuerySegment query_seg_; 123 | 124 | public: 125 | KeywordExtractor extractor; 126 | }; // class Jieba 127 | 128 | } // namespace cppjieba 129 | 130 | #endif // CPPJIEAB_JIEBA_H 131 | -------------------------------------------------------------------------------- 
/offline/include/cppjieba/Jieba.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEAB_JIEBA_H 2 | #define CPPJIEAB_JIEBA_H 3 | 4 | #include "QuerySegment.hpp" 5 | #include "KeywordExtractor.hpp" 6 | 7 | namespace cppjieba { 8 | 9 | class Jieba { 10 | public: 11 | Jieba(const string& dict_path, 12 | const string& model_path, 13 | const string& user_dict_path, 14 | const string& idfPath, 15 | const string& stopWordPath) 16 | : dict_trie_(dict_path, user_dict_path), 17 | model_(model_path), 18 | mp_seg_(&dict_trie_), 19 | hmm_seg_(&model_), 20 | mix_seg_(&dict_trie_, &model_), 21 | full_seg_(&dict_trie_), 22 | query_seg_(&dict_trie_, &model_), 23 | extractor(&dict_trie_, &model_, idfPath, stopWordPath) { 24 | } 25 | ~Jieba() { 26 | } 27 | 28 | struct LocWord { 29 | string word; 30 | size_t begin; 31 | size_t end; 32 | }; // struct LocWord 33 | 34 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 35 | mix_seg_.Cut(sentence, words, hmm); 36 | } 37 | void Cut(const string& sentence, vector& words, bool hmm = true) const { 38 | mix_seg_.Cut(sentence, words, hmm); 39 | } 40 | void CutAll(const string& sentence, vector& words) const { 41 | full_seg_.Cut(sentence, words); 42 | } 43 | void CutAll(const string& sentence, vector& words) const { 44 | full_seg_.Cut(sentence, words); 45 | } 46 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 47 | query_seg_.Cut(sentence, words, hmm); 48 | } 49 | void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { 50 | query_seg_.Cut(sentence, words, hmm); 51 | } 52 | void CutHMM(const string& sentence, vector& words) const { 53 | hmm_seg_.Cut(sentence, words); 54 | } 55 | void CutHMM(const string& sentence, vector& words) const { 56 | hmm_seg_.Cut(sentence, words); 57 | } 58 | void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { 59 | mp_seg_.Cut(sentence, words, max_word_len); 
60 | } 61 | void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { 62 | mp_seg_.Cut(sentence, words, max_word_len); 63 | } 64 | 65 | void Tag(const string& sentence, vector >& words) const { 66 | mix_seg_.Tag(sentence, words); 67 | } 68 | string LookupTag(const string &str) const { 69 | return mix_seg_.LookupTag(str); 70 | } 71 | bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { 72 | return dict_trie_.InsertUserWord(word, tag); 73 | } 74 | 75 | bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { 76 | return dict_trie_.InsertUserWord(word,freq, tag); 77 | } 78 | 79 | bool Find(const string& word) 80 | { 81 | return dict_trie_.Find(word); 82 | } 83 | 84 | void ResetSeparators(const string& s) { 85 | //TODO 86 | mp_seg_.ResetSeparators(s); 87 | hmm_seg_.ResetSeparators(s); 88 | mix_seg_.ResetSeparators(s); 89 | full_seg_.ResetSeparators(s); 90 | query_seg_.ResetSeparators(s); 91 | } 92 | 93 | const DictTrie* GetDictTrie() const { 94 | return &dict_trie_; 95 | } 96 | 97 | const HMMModel* GetHMMModel() const { 98 | return &model_; 99 | } 100 | 101 | void LoadUserDict(const vector& buf) { 102 | dict_trie_.LoadUserDict(buf); 103 | } 104 | 105 | void LoadUserDict(const set& buf) { 106 | dict_trie_.LoadUserDict(buf); 107 | } 108 | 109 | void LoadUserDict(const string& path) { 110 | dict_trie_.LoadUserDict(path); 111 | } 112 | 113 | private: 114 | DictTrie dict_trie_; 115 | HMMModel model_; 116 | 117 | // They share the same dict trie and model 118 | MPSegment mp_seg_; 119 | HMMSegment hmm_seg_; 120 | MixSegment mix_seg_; 121 | FullSegment full_seg_; 122 | QuerySegment query_seg_; 123 | 124 | public: 125 | KeywordExtractor extractor; 126 | }; // class Jieba 127 | 128 | } // namespace cppjieba 129 | 130 | #endif // CPPJIEAB_JIEBA_H 131 | -------------------------------------------------------------------------------- /include/cppjieba/MPSegment.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MPSEGMENT_H 2 | #define CPPJIEBA_MPSEGMENT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentTagged.hpp" 10 | #include "PosTagger.hpp" 11 | 12 | namespace cppjieba { 13 | 14 | class MPSegment: public SegmentTagged { 15 | public: 16 | MPSegment(const string& dictPath, const string& userDictPath = "") 17 | : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) { 18 | } 19 | MPSegment(const DictTrie* dictTrie) 20 | : dictTrie_(dictTrie), isNeedDestroy_(false) { 21 | assert(dictTrie_); 22 | } 23 | ~MPSegment() { 24 | if (isNeedDestroy_) { 25 | delete dictTrie_; 26 | } 27 | } 28 | 29 | void Cut(const string& sentence, vector& words) const { 30 | Cut(sentence, words, MAX_WORD_LENGTH); 31 | } 32 | 33 | void Cut(const string& sentence, 34 | vector& words, 35 | size_t max_word_len) const { 36 | vector tmp; 37 | Cut(sentence, tmp, max_word_len); 38 | GetStringsFromWords(tmp, words); 39 | } 40 | void Cut(const string& sentence, 41 | vector& words, 42 | size_t max_word_len = MAX_WORD_LENGTH) const { 43 | PreFilter pre_filter(symbols_, sentence); 44 | PreFilter::Range range; 45 | vector wrs; 46 | wrs.reserve(sentence.size()/2); 47 | while (pre_filter.HasNext()) { 48 | range = pre_filter.Next(); 49 | Cut(range.begin, range.end, wrs, max_word_len); 50 | } 51 | words.clear(); 52 | words.reserve(wrs.size()); 53 | GetWordsFromWordRanges(sentence, wrs, words); 54 | } 55 | void Cut(RuneStrArray::const_iterator begin, 56 | RuneStrArray::const_iterator end, 57 | vector& words, 58 | size_t max_word_len = MAX_WORD_LENGTH) const { 59 | vector dags; 60 | dictTrie_->Find(begin, 61 | end, 62 | dags, 63 | max_word_len); 64 | CalcDP(dags); 65 | CutByDag(begin, end, dags, words); 66 | } 67 | 68 | const DictTrie* GetDictTrie() const { 69 | return dictTrie_; 70 | } 71 | 72 | bool Tag(const string& src, 
vector >& res) const { 73 | return tagger_.Tag(src, res, *this); 74 | } 75 | 76 | bool IsUserDictSingleChineseWord(const Rune& value) const { 77 | return dictTrie_->IsUserDictSingleChineseWord(value); 78 | } 79 | private: 80 | void CalcDP(vector& dags) const { 81 | size_t nextPos; 82 | const DictUnit* p; 83 | double val; 84 | 85 | for (vector::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) { 86 | rit->pInfo = NULL; 87 | rit->weight = MIN_DOUBLE; 88 | assert(!rit->nexts.empty()); 89 | for (LocalVector >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) { 90 | nextPos = it->first; 91 | p = it->second; 92 | val = 0.0; 93 | if (nextPos + 1 < dags.size()) { 94 | val += dags[nextPos + 1].weight; 95 | } 96 | 97 | if (p) { 98 | val += p->weight; 99 | } else { 100 | val += dictTrie_->GetMinWeight(); 101 | } 102 | if (val > rit->weight) { 103 | rit->pInfo = p; 104 | rit->weight = val; 105 | } 106 | } 107 | } 108 | } 109 | void CutByDag(RuneStrArray::const_iterator begin, 110 | RuneStrArray::const_iterator end, 111 | const vector& dags, 112 | vector& words) const { 113 | size_t i = 0; 114 | while (i < dags.size()) { 115 | const DictUnit* p = dags[i].pInfo; 116 | if (p) { 117 | assert(p->word.size() >= 1); 118 | WordRange wr(begin + i, begin + i + p->word.size() - 1); 119 | words.push_back(wr); 120 | i += p->word.size(); 121 | } else { //single chinese word 122 | WordRange wr(begin + i, begin + i); 123 | words.push_back(wr); 124 | i++; 125 | } 126 | } 127 | } 128 | 129 | const DictTrie* dictTrie_; 130 | bool isNeedDestroy_; 131 | PosTagger tagger_; 132 | 133 | }; // class MPSegment 134 | 135 | } // namespace cppjieba 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /offline/include/cppjieba/MPSegment.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_MPSEGMENT_H 2 | #define CPPJIEBA_MPSEGMENT_H 3 | 4 | #include 5 | #include 6 | 
#include 7 | #include "limonp/Logging.hpp" 8 | #include "DictTrie.hpp" 9 | #include "SegmentTagged.hpp" 10 | #include "PosTagger.hpp" 11 | 12 | namespace cppjieba { 13 | 14 | class MPSegment: public SegmentTagged { 15 | public: 16 | MPSegment(const string& dictPath, const string& userDictPath = "") 17 | : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) { 18 | } 19 | MPSegment(const DictTrie* dictTrie) 20 | : dictTrie_(dictTrie), isNeedDestroy_(false) { 21 | assert(dictTrie_); 22 | } 23 | ~MPSegment() { 24 | if (isNeedDestroy_) { 25 | delete dictTrie_; 26 | } 27 | } 28 | 29 | void Cut(const string& sentence, vector& words) const { 30 | Cut(sentence, words, MAX_WORD_LENGTH); 31 | } 32 | 33 | void Cut(const string& sentence, 34 | vector& words, 35 | size_t max_word_len) const { 36 | vector tmp; 37 | Cut(sentence, tmp, max_word_len); 38 | GetStringsFromWords(tmp, words); 39 | } 40 | void Cut(const string& sentence, 41 | vector& words, 42 | size_t max_word_len = MAX_WORD_LENGTH) const { 43 | PreFilter pre_filter(symbols_, sentence); 44 | PreFilter::Range range; 45 | vector wrs; 46 | wrs.reserve(sentence.size()/2); 47 | while (pre_filter.HasNext()) { 48 | range = pre_filter.Next(); 49 | Cut(range.begin, range.end, wrs, max_word_len); 50 | } 51 | words.clear(); 52 | words.reserve(wrs.size()); 53 | GetWordsFromWordRanges(sentence, wrs, words); 54 | } 55 | void Cut(RuneStrArray::const_iterator begin, 56 | RuneStrArray::const_iterator end, 57 | vector& words, 58 | size_t max_word_len = MAX_WORD_LENGTH) const { 59 | vector dags; 60 | dictTrie_->Find(begin, 61 | end, 62 | dags, 63 | max_word_len); 64 | CalcDP(dags); 65 | CutByDag(begin, end, dags, words); 66 | } 67 | 68 | const DictTrie* GetDictTrie() const { 69 | return dictTrie_; 70 | } 71 | 72 | bool Tag(const string& src, vector >& res) const { 73 | return tagger_.Tag(src, res, *this); 74 | } 75 | 76 | bool IsUserDictSingleChineseWord(const Rune& value) const { 77 | return 
dictTrie_->IsUserDictSingleChineseWord(value); 78 | } 79 | private: 80 | void CalcDP(vector& dags) const { 81 | size_t nextPos; 82 | const DictUnit* p; 83 | double val; 84 | 85 | for (vector::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) { 86 | rit->pInfo = NULL; 87 | rit->weight = MIN_DOUBLE; 88 | assert(!rit->nexts.empty()); 89 | for (LocalVector >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) { 90 | nextPos = it->first; 91 | p = it->second; 92 | val = 0.0; 93 | if (nextPos + 1 < dags.size()) { 94 | val += dags[nextPos + 1].weight; 95 | } 96 | 97 | if (p) { 98 | val += p->weight; 99 | } else { 100 | val += dictTrie_->GetMinWeight(); 101 | } 102 | if (val > rit->weight) { 103 | rit->pInfo = p; 104 | rit->weight = val; 105 | } 106 | } 107 | } 108 | } 109 | void CutByDag(RuneStrArray::const_iterator begin, 110 | RuneStrArray::const_iterator end, 111 | const vector& dags, 112 | vector& words) const { 113 | size_t i = 0; 114 | while (i < dags.size()) { 115 | const DictUnit* p = dags[i].pInfo; 116 | if (p) { 117 | assert(p->word.size() >= 1); 118 | WordRange wr(begin + i, begin + i + p->word.size() - 1); 119 | words.push_back(wr); 120 | i += p->word.size(); 121 | } else { //single chinese word 122 | WordRange wr(begin + i, begin + i); 123 | words.push_back(wr); 124 | i++; 125 | } 126 | } 127 | } 128 | 129 | const DictTrie* dictTrie_; 130 | bool isNeedDestroy_; 131 | PosTagger tagger_; 132 | 133 | }; // class MPSegment 134 | 135 | } // namespace cppjieba 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /offline/src/PageLibPreprocessor.cc: -------------------------------------------------------------------------------- 1 | #include "PageLibPreprocessor.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | using std::cout; 8 | using std::endl; 9 | using std::make_pair; 10 | using std::ofstream; 11 | 12 | namespace wd { 13 | 
PageLibPreprocessor::PageLibPreprocessor() { _pageLib.reserve(300); } 14 | 15 | void PageLibPreprocessor::doProcess() { 16 | readPageFromFile(); 17 | 18 | auto start = std::chrono::system_clock::now(); 19 | cutRedundantPages(); 20 | auto end = std::chrono::system_clock::now(); 21 | std::chrono::duration elapsed_seconds = end - start; 22 | cout << ">> cut redundant pages success, use time: " 23 | << elapsed_seconds.count() << " sec" << endl; 24 | 25 | buildInvertIndex(); 26 | auto end2 = std::chrono::system_clock::now(); 27 | elapsed_seconds = end2 - end; 28 | cout << ">> build invert index success, use time: " 29 | << elapsed_seconds.count() << " sec" << endl; 30 | 31 | elapsed_seconds = end2-start; 32 | cout << ">> total time cost: " << elapsed_seconds.count() << " sec" << endl; 33 | store(); 34 | } 35 | 36 | void PageLibPreprocessor::readPageFromFile() { 37 | XMLDocument doc; 38 | doc.LoadFile(CONFIG[RIPEPAGE_PATH].c_str()); 39 | XMLElement* page = doc.FirstChildElement("doc"); 40 | 41 | do { 42 | string docid = page->FirstChildElement("docid")->GetText(); 43 | string title = page->FirstChildElement("title")->GetText(); 44 | string link = page->FirstChildElement("link")->GetText(); 45 | string content = page->FirstChildElement("content")->GetText(); 46 | 47 | _pageLib.emplace_back(std::stoi(docid), title, link, content); 48 | } while (page = page->NextSiblingElement()); 49 | } 50 | 51 | void PageLibPreprocessor::cutRedundantPages() { 52 | simhash::Simhasher simhasher(CONFIG[DICT_PATH], CONFIG[HMM_PATH], CONFIG[IDF_PATH], 53 | CONFIG[STOP_WORD_PATH]); 54 | 55 | cout << ">> before cut: " << _pageLib.size() << endl; 56 | for (auto& page : _pageLib) { 57 | page.generateSimhash(_jieba); 58 | } 59 | std::sort(_pageLib.begin(), _pageLib.end()); 60 | 61 | /* ofstream 62 | ofsSort("/home/whb/project/RssSearchEngine/offline/data/sorted.dat"); for 63 | (auto& page : _pageLib) { string temp = page.getDoc(); ofsSort << 64 | page.getSimhash() << '\n' << temp; 65 | } 66 | cout 
<< ">> store sorted ripepage success" << endl; 67 | ofsSort.close(); */ 68 | 69 | auto it = std::unique(_pageLib.begin(), _pageLib.end()); 70 | 71 | _pageLib.erase(it, _pageLib.end()); 72 | cout << ">> after cut: " << _pageLib.size() << endl; 73 | } 74 | 75 | void PageLibPreprocessor::buildInvertIndex() { 76 | for (auto& page : _pageLib) { 77 | page.buildWordsMap(_jieba); 78 | } 79 | 80 | for (auto& page : _pageLib) { 81 | unordered_map wordsMap = page.getWordsMap(); 82 | for (auto& wordFreq : wordsMap) { 83 | _invertIndexTable[wordFreq.first].push_back( 84 | make_pair(page.getDocId(), wordFreq.second)); 85 | } 86 | } 87 | //保存每篇文档中所有词的权重的平方和, int为docid 88 | unordered_map weightSum; 89 | 90 | int totalPageNum = _pageLib.size(); 91 | for (auto& elem : _invertIndexTable) { 92 | int df = elem.second.size(); //关键词在所有文章中出现的次数 93 | double idf = log2( 94 | static_cast(totalPageNum / (df + 1))); //关键词的逆文档频率 95 | 96 | for (auto& item : elem.second) { 97 | double weight = item.second * idf; 98 | item.second = weight; 99 | //计算每篇文档中词语的权重 100 | weightSum[item.first] += pow(weight, 2); 101 | } 102 | } 103 | 104 | for (auto& elem : _invertIndexTable) { 105 | for (auto& item : elem.second) { 106 | //归一化处理 107 | item.second = item.second / sqrt(weightSum[item.first]); 108 | } 109 | } 110 | } 111 | 112 | void PageLibPreprocessor::store() { 113 | string declaration = ""; 114 | XMLDocument xmlpages; 115 | xmlpages.Parse(declaration.c_str()); 116 | 117 | for (auto &page : _pageLib) 118 | { 119 | page.insertDoc(xmlpages); 120 | } 121 | 122 | xmlpages.SaveFile(CONFIG[NEW_RIPEPAGE_PATH].c_str()); 123 | cout << ">> store new ripepage success" << endl; 124 | 125 | ofstream ofsIndex(CONFIG[INDEX_PATH]); 126 | 127 | for (auto& elem : _invertIndexTable) { 128 | ofsIndex << elem.first << '\t'; 129 | for (auto& item : elem.second) { 130 | ofsIndex << item.first << '\t' << item.second << '\t'; 131 | } 132 | ofsIndex << '\n'; 133 | } 134 | 135 | ofsIndex.close(); 136 | cout << ">> store invert 
index success" << endl; 137 | } 138 | 139 | } // namespace wd 140 | -------------------------------------------------------------------------------- /include/cppjieba/KeywordExtractor.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H 2 | #define CPPJIEBA_KEYWORD_EXTRACTOR_H 3 | 4 | #include 5 | #include 6 | #include "MixSegment.hpp" 7 | 8 | namespace cppjieba { 9 | 10 | using namespace limonp; 11 | using namespace std; 12 | 13 | /*utf8*/ 14 | class KeywordExtractor { 15 | public: 16 | struct Word { 17 | string word; 18 | vector offsets; 19 | double weight; 20 | }; // struct Word 21 | 22 | KeywordExtractor(const string& dictPath, 23 | const string& hmmFilePath, 24 | const string& idfPath, 25 | const string& stopWordPath, 26 | const string& userDict = "") 27 | : segment_(dictPath, hmmFilePath, userDict) { 28 | LoadIdfDict(idfPath); 29 | LoadStopWordDict(stopWordPath); 30 | } 31 | KeywordExtractor(const DictTrie* dictTrie, 32 | const HMMModel* model, 33 | const string& idfPath, 34 | const string& stopWordPath) 35 | : segment_(dictTrie, model) { 36 | LoadIdfDict(idfPath); 37 | LoadStopWordDict(stopWordPath); 38 | } 39 | ~KeywordExtractor() { 40 | } 41 | 42 | void Extract(const string& sentence, vector& keywords, size_t topN) const { 43 | vector topWords; 44 | Extract(sentence, topWords, topN); 45 | for (size_t i = 0; i < topWords.size(); i++) { 46 | keywords.push_back(topWords[i].word); 47 | } 48 | } 49 | 50 | void Extract(const string& sentence, vector >& keywords, size_t topN) const { 51 | vector topWords; 52 | Extract(sentence, topWords, topN); 53 | for (size_t i = 0; i < topWords.size(); i++) { 54 | keywords.push_back(pair(topWords[i].word, topWords[i].weight)); 55 | } 56 | } 57 | 58 | void Extract(const string& sentence, vector& keywords, size_t topN) const { 59 | vector words; 60 | segment_.Cut(sentence, words); 61 | 62 | map wordmap; 63 | size_t offset = 0; 64 | for (size_t i = 0; i < 
words.size(); ++i) { 65 | size_t t = offset; 66 | offset += words[i].size(); 67 | if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { 68 | continue; 69 | } 70 | wordmap[words[i]].offsets.push_back(t); 71 | wordmap[words[i]].weight += 1.0; 72 | } 73 | if (offset != sentence.size()) { 74 | XLOG(ERROR) << "words illegal"; 75 | return; 76 | } 77 | 78 | keywords.clear(); 79 | keywords.reserve(wordmap.size()); 80 | for (map::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { 81 | unordered_map::const_iterator cit = idfMap_.find(itr->first); 82 | if (cit != idfMap_.end()) { 83 | itr->second.weight *= cit->second; 84 | } else { 85 | itr->second.weight *= idfAverage_; 86 | } 87 | itr->second.word = itr->first; 88 | keywords.push_back(itr->second); 89 | } 90 | topN = min(topN, keywords.size()); 91 | partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); 92 | keywords.resize(topN); 93 | } 94 | private: 95 | void LoadIdfDict(const string& idfPath) { 96 | ifstream ifs(idfPath.c_str()); 97 | XCHECK(ifs.is_open()) << "open " << idfPath << " failed"; 98 | string line ; 99 | vector buf; 100 | double idf = 0.0; 101 | double idfSum = 0.0; 102 | size_t lineno = 0; 103 | for (; getline(ifs, line); lineno++) { 104 | buf.clear(); 105 | if (line.empty()) { 106 | XLOG(ERROR) << "lineno: " << lineno << " empty. skipped."; 107 | continue; 108 | } 109 | Split(line, buf, " "); 110 | if (buf.size() != 2) { 111 | XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. 
skipped."; 112 | continue; 113 | } 114 | idf = atof(buf[1].c_str()); 115 | idfMap_[buf[0]] = idf; 116 | idfSum += idf; 117 | 118 | } 119 | 120 | assert(lineno); 121 | idfAverage_ = idfSum / lineno; 122 | assert(idfAverage_ > 0.0); 123 | } 124 | void LoadStopWordDict(const string& filePath) { 125 | ifstream ifs(filePath.c_str()); 126 | XCHECK(ifs.is_open()) << "open " << filePath << " failed"; 127 | string line ; 128 | while (getline(ifs, line)) { 129 | stopWords_.insert(line); 130 | } 131 | assert(stopWords_.size()); 132 | } 133 | 134 | static bool Compare(const Word& lhs, const Word& rhs) { 135 | return lhs.weight > rhs.weight; 136 | } 137 | 138 | MixSegment segment_; 139 | unordered_map idfMap_; 140 | double idfAverage_; 141 | 142 | unordered_set stopWords_; 143 | }; // class KeywordExtractor 144 | 145 | inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) { 146 | return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; 147 | } 148 | 149 | } // namespace cppjieba 150 | 151 | #endif 152 | 153 | 154 | -------------------------------------------------------------------------------- /src/net/EventLoop.cc: -------------------------------------------------------------------------------- 1 | #include "net/EventLoop.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "net/Acceptor.h" 7 | #include "net/TCPConnection.h" 8 | using std::cout; 9 | using std::endl; 10 | 11 | namespace wd { 12 | EventLoop::EventLoop(Acceptor& acceptor) 13 | : _efd(createEpollFd()), 14 | _eventfd(createEventFd()), 15 | _acceptor(acceptor), 16 | _eventList(1024), 17 | _isLooping(false) { 18 | addEpollFdRead(_acceptor.fd()); 19 | addEpollFdRead(_eventfd); 20 | } 21 | 22 | void EventLoop::loop() { 23 | _isLooping = true; 24 | while (_isLooping) { 25 | waitEpollFd(); 26 | } 27 | } 28 | 29 | //运行在另一个线程 30 | void EventLoop::unloop() { 31 | if (_isLooping) { 32 | _isLooping = false; 33 | } 34 | } 
35 | 36 | void EventLoop::runInloop(Functor&& cb) { 37 | { 38 | MutexGuard autolock(_mutex); 39 | _pendingFunctors.push_back(std::move(cb)); 40 | } 41 | wakeup(); 42 | } 43 | 44 | void EventLoop::waitEpollFd() { 45 | int nready; 46 | do { 47 | nready = 48 | epoll_wait(_efd, &*_eventList.begin(), _eventList.size(), 5000); 49 | } while (nready == -1 && errno == EINTR); 50 | 51 | if (nready == -1) { 52 | perror("epoll_wait"); 53 | return; 54 | } else if (nready == 0) { 55 | cout << ">> epoll_wait timeout!" << endl; 56 | } else { 57 | if (nready == _eventList.size()) { 58 | _eventList.resize(2 * nready); 59 | } 60 | 61 | for (int idx = 0; idx != nready; ++idx) { 62 | int fd = _eventList[idx].data.fd; 63 | if (fd == _acceptor.fd()) { 64 | //处理新连接 65 | if (_eventList[idx].events & EPOLLIN) { 66 | handleNewConnection(); 67 | } 68 | } else if (fd == _eventfd) { 69 | if (_eventList[idx].events & EPOLLIN) { 70 | handleRead(); 71 | cout << ">> do pending functors" << endl; 72 | doPendingFunctors(); 73 | cout << ">> finish do pending functors" << endl; 74 | } 75 | } else { 76 | //处理消息 77 | if (_eventList[idx].events & EPOLLIN) { 78 | handleMessage(fd); 79 | } 80 | } 81 | } 82 | } 83 | } 84 | 85 | void EventLoop::handleNewConnection() { 86 | int peerFd = _acceptor.accept(); 87 | addEpollFdRead(peerFd); 88 | TCPConnectionPtr conn(new TCPConnection(peerFd, this)); 89 | conn->setConnectionCallback(_onConnection); 90 | conn->setMessageCallback(_onMessage); 91 | conn->setCloseCallback(_onClose); 92 | _conns.insert(std::make_pair(peerFd, conn)); 93 | conn->handleConnectionCallback(); 94 | } 95 | 96 | void EventLoop::handleMessage(int fd) { 97 | bool isClosed = isConnectionClosed(fd); 98 | auto it = _conns.find(fd); 99 | assert(it != _conns.end()); 100 | if (!isClosed) { 101 | it->second->handleMessageCallback(); 102 | } else { 103 | delEpollFdRead(fd); 104 | it->second->handleCloseCallback(); 105 | _conns.erase(it); 106 | } 107 | } 108 | 109 | bool EventLoop::isConnectionClosed(int 
fd) { 110 | int ret; 111 | do { 112 | char buf[1024]; 113 | ret = recv(fd, buf, sizeof(buf), MSG_PEEK); 114 | } while (ret == -1 && errno == EINTR); 115 | return (ret == 0 || ret == -1); 116 | } 117 | 118 | int EventLoop::createEpollFd() { 119 | int ret = ::epoll_create1(0); 120 | if (ret == -1) { 121 | perror("epoll_create1"); 122 | } 123 | return ret; 124 | } 125 | 126 | void EventLoop::addEpollFdRead(int fd) { 127 | struct epoll_event evt; 128 | evt.data.fd = fd; 129 | evt.events = EPOLLIN; 130 | int ret = epoll_ctl(_efd, EPOLL_CTL_ADD, fd, &evt); 131 | if (ret == -1) { 132 | perror("epoll_ctl"); 133 | } 134 | } 135 | 136 | void EventLoop::delEpollFdRead(int fd) { 137 | struct epoll_event evt; 138 | evt.data.fd = fd; 139 | int ret = epoll_ctl(_efd, EPOLL_CTL_DEL, fd, &evt); 140 | if (ret == -1) { 141 | perror("epoll_ctl"); 142 | } 143 | } 144 | 145 | int EventLoop::createEventFd() { 146 | int fd = ::eventfd(0, 0); 147 | if (fd == -1) { 148 | perror("eventfd"); 149 | } 150 | return fd; 151 | } 152 | 153 | void EventLoop::handleRead() { 154 | uint64_t count; 155 | int ret = ::read(_eventfd, &count, sizeof(count)); 156 | if (ret == -1) { 157 | perror("read"); 158 | } 159 | } 160 | 161 | void EventLoop::wakeup() { 162 | uint64_t one = 1; 163 | int ret = ::write(_eventfd, &one, sizeof(one)); 164 | if (ret != sizeof(one)) { 165 | perror("write"); 166 | } 167 | } 168 | 169 | void EventLoop::doPendingFunctors() { 170 | vector temp; 171 | { 172 | MutexGuard autolock(_mutex); 173 | _pendingFunctors.swap(temp); 174 | } 175 | 176 | for (auto& functor : temp) { 177 | functor(); 178 | } 179 | } 180 | 181 | } // namespace wd 182 | -------------------------------------------------------------------------------- /offline/include/cppjieba/KeywordExtractor.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H 2 | #define CPPJIEBA_KEYWORD_EXTRACTOR_H 3 | 4 | #include 5 | #include 6 | #include "MixSegment.hpp" 
7 | 8 | namespace cppjieba { 9 | 10 | using namespace limonp; 11 | using namespace std; 12 | 13 | /*utf8*/ 14 | class KeywordExtractor { 15 | public: 16 | struct Word { 17 | string word; 18 | vector offsets; 19 | double weight; 20 | }; // struct Word 21 | 22 | KeywordExtractor(const string& dictPath, 23 | const string& hmmFilePath, 24 | const string& idfPath, 25 | const string& stopWordPath, 26 | const string& userDict = "") 27 | : segment_(dictPath, hmmFilePath, userDict) { 28 | LoadIdfDict(idfPath); 29 | LoadStopWordDict(stopWordPath); 30 | } 31 | KeywordExtractor(const DictTrie* dictTrie, 32 | const HMMModel* model, 33 | const string& idfPath, 34 | const string& stopWordPath) 35 | : segment_(dictTrie, model) { 36 | LoadIdfDict(idfPath); 37 | LoadStopWordDict(stopWordPath); 38 | } 39 | ~KeywordExtractor() { 40 | } 41 | 42 | void Extract(const string& sentence, vector& keywords, size_t topN) const { 43 | vector topWords; 44 | Extract(sentence, topWords, topN); 45 | for (size_t i = 0; i < topWords.size(); i++) { 46 | keywords.push_back(topWords[i].word); 47 | } 48 | } 49 | 50 | void Extract(const string& sentence, vector >& keywords, size_t topN) const { 51 | vector topWords; 52 | Extract(sentence, topWords, topN); 53 | for (size_t i = 0; i < topWords.size(); i++) { 54 | keywords.push_back(pair(topWords[i].word, topWords[i].weight)); 55 | } 56 | } 57 | 58 | void Extract(const string& sentence, vector& keywords, size_t topN) const { 59 | vector words; 60 | segment_.Cut(sentence, words); 61 | 62 | map wordmap; 63 | size_t offset = 0; 64 | for (size_t i = 0; i < words.size(); ++i) { 65 | size_t t = offset; 66 | offset += words[i].size(); 67 | if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { 68 | continue; 69 | } 70 | wordmap[words[i]].offsets.push_back(t); 71 | wordmap[words[i]].weight += 1.0; 72 | } 73 | if (offset != sentence.size()) { 74 | XLOG(ERROR) << "words illegal"; 75 | return; 76 | } 77 | 78 | keywords.clear(); 79 | 
keywords.reserve(wordmap.size()); 80 | for (map::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { 81 | unordered_map::const_iterator cit = idfMap_.find(itr->first); 82 | if (cit != idfMap_.end()) { 83 | itr->second.weight *= cit->second; 84 | } else { 85 | itr->second.weight *= idfAverage_; 86 | } 87 | itr->second.word = itr->first; 88 | keywords.push_back(itr->second); 89 | } 90 | topN = min(topN, keywords.size()); 91 | partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); 92 | keywords.resize(topN); 93 | } 94 | private: 95 | void LoadIdfDict(const string& idfPath) { 96 | ifstream ifs(idfPath.c_str()); 97 | XCHECK(ifs.is_open()) << "open " << idfPath << " failed"; 98 | string line ; 99 | vector buf; 100 | double idf = 0.0; 101 | double idfSum = 0.0; 102 | size_t lineno = 0; 103 | for (; getline(ifs, line); lineno++) { 104 | buf.clear(); 105 | if (line.empty()) { 106 | XLOG(ERROR) << "lineno: " << lineno << " empty. skipped."; 107 | continue; 108 | } 109 | Split(line, buf, " "); 110 | if (buf.size() != 2) { 111 | XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. 
skipped."; 112 | continue; 113 | } 114 | idf = atof(buf[1].c_str()); 115 | idfMap_[buf[0]] = idf; 116 | idfSum += idf; 117 | 118 | } 119 | 120 | assert(lineno); 121 | idfAverage_ = idfSum / lineno; 122 | assert(idfAverage_ > 0.0); 123 | } 124 | void LoadStopWordDict(const string& filePath) { 125 | ifstream ifs(filePath.c_str()); 126 | XCHECK(ifs.is_open()) << "open " << filePath << " failed"; 127 | string line ; 128 | while (getline(ifs, line)) { 129 | stopWords_.insert(line); 130 | } 131 | assert(stopWords_.size()); 132 | } 133 | 134 | static bool Compare(const Word& lhs, const Word& rhs) { 135 | return lhs.weight > rhs.weight; 136 | } 137 | 138 | MixSegment segment_; 139 | unordered_map idfMap_; 140 | double idfAverage_; 141 | 142 | unordered_set stopWords_; 143 | }; // class KeywordExtractor 144 | 145 | inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) { 146 | return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; 147 | } 148 | 149 | } // namespace cppjieba 150 | 151 | #endif 152 | 153 | 154 | -------------------------------------------------------------------------------- /include/cppjieba/Trie.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_TRIE_HPP 2 | #define CPPJIEBA_TRIE_HPP 3 | 4 | #include 5 | #include 6 | #include "limonp/StdExtension.hpp" 7 | #include "Unicode.hpp" 8 | 9 | namespace cppjieba { 10 | 11 | using namespace std; 12 | 13 | const size_t MAX_WORD_LENGTH = 512; 14 | 15 | struct DictUnit { 16 | Unicode word; 17 | double weight; 18 | string tag; 19 | }; // struct DictUnit 20 | 21 | // for debugging 22 | // inline ostream & operator << (ostream& os, const DictUnit& unit) { 23 | // string s; 24 | // s << unit.word; 25 | // return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); 26 | // } 27 | 28 | struct Dag { 29 | RuneStr runestr; 30 | // [offset, nexts.first] 31 | 
limonp::LocalVector > nexts; 32 | const DictUnit * pInfo; 33 | double weight; 34 | size_t nextPos; // TODO 35 | Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) { 36 | } 37 | }; // struct Dag 38 | 39 | typedef Rune TrieKey; 40 | 41 | class TrieNode { 42 | public : 43 | TrieNode(): next(NULL), ptValue(NULL) { 44 | } 45 | public: 46 | typedef unordered_map NextMap; 47 | NextMap *next; 48 | const DictUnit *ptValue; 49 | }; 50 | 51 | class Trie { 52 | public: 53 | Trie(const vector& keys, const vector& valuePointers) 54 | : root_(new TrieNode) { 55 | CreateTrie(keys, valuePointers); 56 | } 57 | ~Trie() { 58 | DeleteNode(root_); 59 | } 60 | 61 | const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { 62 | if (begin == end) { 63 | return NULL; 64 | } 65 | 66 | const TrieNode* ptNode = root_; 67 | TrieNode::NextMap::const_iterator citer; 68 | for (RuneStrArray::const_iterator it = begin; it != end; it++) { 69 | if (NULL == ptNode->next) { 70 | return NULL; 71 | } 72 | citer = ptNode->next->find(it->rune); 73 | if (ptNode->next->end() == citer) { 74 | return NULL; 75 | } 76 | ptNode = citer->second; 77 | } 78 | return ptNode->ptValue; 79 | } 80 | 81 | void Find(RuneStrArray::const_iterator begin, 82 | RuneStrArray::const_iterator end, 83 | vector&res, 84 | size_t max_word_len = MAX_WORD_LENGTH) const { 85 | assert(root_ != NULL); 86 | res.resize(end - begin); 87 | 88 | const TrieNode *ptNode = NULL; 89 | TrieNode::NextMap::const_iterator citer; 90 | for (size_t i = 0; i < size_t(end - begin); i++) { 91 | res[i].runestr = *(begin + i); 92 | 93 | if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) { 94 | ptNode = citer->second; 95 | } else { 96 | ptNode = NULL; 97 | } 98 | if (ptNode != NULL) { 99 | res[i].nexts.push_back(pair(i, ptNode->ptValue)); 100 | } else { 101 | res[i].nexts.push_back(pair(i, static_cast(NULL))); 102 | } 103 | 104 | for (size_t j = i + 1; j < size_t(end - 
begin) && (j - i + 1) <= max_word_len; j++) { 105 | if (ptNode == NULL || ptNode->next == NULL) { 106 | break; 107 | } 108 | citer = ptNode->next->find((begin + j)->rune); 109 | if (ptNode->next->end() == citer) { 110 | break; 111 | } 112 | ptNode = citer->second; 113 | if (NULL != ptNode->ptValue) { 114 | res[i].nexts.push_back(pair(j, ptNode->ptValue)); 115 | } 116 | } 117 | } 118 | } 119 | 120 | void InsertNode(const Unicode& key, const DictUnit* ptValue) { 121 | if (key.begin() == key.end()) { 122 | return; 123 | } 124 | 125 | TrieNode::NextMap::const_iterator kmIter; 126 | TrieNode *ptNode = root_; 127 | for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) { 128 | if (NULL == ptNode->next) { 129 | ptNode->next = new TrieNode::NextMap; 130 | } 131 | kmIter = ptNode->next->find(*citer); 132 | if (ptNode->next->end() == kmIter) { 133 | TrieNode *nextNode = new TrieNode; 134 | 135 | ptNode->next->insert(make_pair(*citer, nextNode)); 136 | ptNode = nextNode; 137 | } else { 138 | ptNode = kmIter->second; 139 | } 140 | } 141 | assert(ptNode != NULL); 142 | ptNode->ptValue = ptValue; 143 | } 144 | 145 | private: 146 | void CreateTrie(const vector& keys, const vector& valuePointers) { 147 | if (valuePointers.empty() || keys.empty()) { 148 | return; 149 | } 150 | assert(keys.size() == valuePointers.size()); 151 | 152 | for (size_t i = 0; i < keys.size(); i++) { 153 | InsertNode(keys[i], valuePointers[i]); 154 | } 155 | } 156 | 157 | void DeleteNode(TrieNode* node) { 158 | if (NULL == node) { 159 | return; 160 | } 161 | if (NULL != node->next) { 162 | for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) { 163 | DeleteNode(it->second); 164 | } 165 | delete node->next; 166 | } 167 | delete node; 168 | } 169 | 170 | TrieNode* root_; 171 | }; // class Trie 172 | } // namespace cppjieba 173 | 174 | #endif // CPPJIEBA_TRIE_HPP 175 | -------------------------------------------------------------------------------- 
/include/cppjieba/limonp/Closure.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIMONP_CLOSURE_HPP 2 | #define LIMONP_CLOSURE_HPP 3 | 4 | namespace limonp { 5 | 6 | class ClosureInterface { 7 | public: 8 | virtual ~ClosureInterface() { 9 | } 10 | virtual void Run() = 0; 11 | }; 12 | 13 | template 14 | class Closure0: public ClosureInterface { 15 | public: 16 | Closure0(Funct fun) { 17 | fun_ = fun; 18 | } 19 | virtual ~Closure0() { 20 | } 21 | virtual void Run() { 22 | (*fun_)(); 23 | } 24 | private: 25 | Funct fun_; 26 | }; 27 | 28 | template 29 | class Closure1: public ClosureInterface { 30 | public: 31 | Closure1(Funct fun, Arg1 arg1) { 32 | fun_ = fun; 33 | arg1_ = arg1; 34 | } 35 | virtual ~Closure1() { 36 | } 37 | virtual void Run() { 38 | (*fun_)(arg1_); 39 | } 40 | private: 41 | Funct fun_; 42 | Arg1 arg1_; 43 | }; 44 | 45 | template 46 | class Closure2: public ClosureInterface { 47 | public: 48 | Closure2(Funct fun, Arg1 arg1, Arg2 arg2) { 49 | fun_ = fun; 50 | arg1_ = arg1; 51 | arg2_ = arg2; 52 | } 53 | virtual ~Closure2() { 54 | } 55 | virtual void Run() { 56 | (*fun_)(arg1_, arg2_); 57 | } 58 | private: 59 | Funct fun_; 60 | Arg1 arg1_; 61 | Arg2 arg2_; 62 | }; 63 | 64 | template 65 | class Closure3: public ClosureInterface { 66 | public: 67 | Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) { 68 | fun_ = fun; 69 | arg1_ = arg1; 70 | arg2_ = arg2; 71 | arg3_ = arg3; 72 | } 73 | virtual ~Closure3() { 74 | } 75 | virtual void Run() { 76 | (*fun_)(arg1_, arg2_, arg3_); 77 | } 78 | private: 79 | Funct fun_; 80 | Arg1 arg1_; 81 | Arg2 arg2_; 82 | Arg3 arg3_; 83 | }; 84 | 85 | template 86 | class ObjClosure0: public ClosureInterface { 87 | public: 88 | ObjClosure0(Obj* p, Funct fun) { 89 | p_ = p; 90 | fun_ = fun; 91 | } 92 | virtual ~ObjClosure0() { 93 | } 94 | virtual void Run() { 95 | (p_->*fun_)(); 96 | } 97 | private: 98 | Obj* p_; 99 | Funct fun_; 100 | }; 101 | 102 | template 103 | class 
ObjClosure1: public ClosureInterface { 104 | public: 105 | ObjClosure1(Obj* p, Funct fun, Arg1 arg1) { 106 | p_ = p; 107 | fun_ = fun; 108 | arg1_ = arg1; 109 | } 110 | virtual ~ObjClosure1() { 111 | } 112 | virtual void Run() { 113 | (p_->*fun_)(arg1_); 114 | } 115 | private: 116 | Obj* p_; 117 | Funct fun_; 118 | Arg1 arg1_; 119 | }; 120 | 121 | template 122 | class ObjClosure2: public ClosureInterface { 123 | public: 124 | ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) { 125 | p_ = p; 126 | fun_ = fun; 127 | arg1_ = arg1; 128 | arg2_ = arg2; 129 | } 130 | virtual ~ObjClosure2() { 131 | } 132 | virtual void Run() { 133 | (p_->*fun_)(arg1_, arg2_); 134 | } 135 | private: 136 | Obj* p_; 137 | Funct fun_; 138 | Arg1 arg1_; 139 | Arg2 arg2_; 140 | }; 141 | template 142 | class ObjClosure3: public ClosureInterface { 143 | public: 144 | ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) { 145 | p_ = p; 146 | fun_ = fun; 147 | arg1_ = arg1; 148 | arg2_ = arg2; 149 | arg3_ = arg3; 150 | } 151 | virtual ~ObjClosure3() { 152 | } 153 | virtual void Run() { 154 | (p_->*fun_)(arg1_, arg2_, arg3_); 155 | } 156 | private: 157 | Obj* p_; 158 | Funct fun_; 159 | Arg1 arg1_; 160 | Arg2 arg2_; 161 | Arg3 arg3_; 162 | }; 163 | 164 | template 165 | ClosureInterface* NewClosure(R (*fun)()) { 166 | return new Closure0(fun); 167 | } 168 | 169 | template 170 | ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) { 171 | return new Closure1(fun, arg1); 172 | } 173 | 174 | template 175 | ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) { 176 | return new Closure2(fun, arg1, arg2); 177 | } 178 | 179 | template 180 | ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) { 181 | return new Closure3(fun, arg1, arg2, arg3); 182 | } 183 | 184 | template 185 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) { 186 | return new ObjClosure0(obj, fun); 187 | } 188 | 189 | template 190 | ClosureInterface* 
NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) { 191 | return new ObjClosure1(obj, fun, arg1); 192 | } 193 | 194 | template 195 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) { 196 | return new ObjClosure2(obj, fun, arg1, arg2); 197 | } 198 | 199 | template 200 | ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) { 201 | return new ObjClosure3(obj, fun, arg1, arg2, arg3); 202 | } 203 | 204 | } // namespace limonp 205 | 206 | #endif // LIMONP_CLOSURE_HPP 207 | -------------------------------------------------------------------------------- /offline/include/cppjieba/Trie.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPPJIEBA_TRIE_HPP 2 | #define CPPJIEBA_TRIE_HPP 3 | 4 | #include 5 | #include 6 | #include "limonp/StdExtension.hpp" 7 | #include "Unicode.hpp" 8 | 9 | namespace cppjieba { 10 | 11 | using namespace std; 12 | 13 | const size_t MAX_WORD_LENGTH = 512; 14 | 15 | struct DictUnit { 16 | Unicode word; 17 | double weight; 18 | string tag; 19 | }; // struct DictUnit 20 | 21 | // for debugging 22 | // inline ostream & operator << (ostream& os, const DictUnit& unit) { 23 | // string s; 24 | // s << unit.word; 25 | // return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); 26 | // } 27 | 28 | struct Dag { 29 | RuneStr runestr; 30 | // [offset, nexts.first] 31 | limonp::LocalVector > nexts; 32 | const DictUnit * pInfo; 33 | double weight; 34 | size_t nextPos; // TODO 35 | Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) { 36 | } 37 | }; // struct Dag 38 | 39 | typedef Rune TrieKey; 40 | 41 | class TrieNode { 42 | public : 43 | TrieNode(): next(NULL), ptValue(NULL) { 44 | } 45 | public: 46 | typedef unordered_map NextMap; 47 | NextMap *next; 48 | const DictUnit *ptValue; 49 | }; 50 | 51 | class Trie { 52 | public: 53 | Trie(const vector& keys, const vector& valuePointers) 54 | : 
root_(new TrieNode) { 55 | CreateTrie(keys, valuePointers); 56 | } 57 | ~Trie() { 58 | DeleteNode(root_); 59 | } 60 | 61 | const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { 62 | if (begin == end) { 63 | return NULL; 64 | } 65 | 66 | const TrieNode* ptNode = root_; 67 | TrieNode::NextMap::const_iterator citer; 68 | for (RuneStrArray::const_iterator it = begin; it != end; it++) { 69 | if (NULL == ptNode->next) { 70 | return NULL; 71 | } 72 | citer = ptNode->next->find(it->rune); 73 | if (ptNode->next->end() == citer) { 74 | return NULL; 75 | } 76 | ptNode = citer->second; 77 | } 78 | return ptNode->ptValue; 79 | } 80 | 81 | void Find(RuneStrArray::const_iterator begin, 82 | RuneStrArray::const_iterator end, 83 | vector&res, 84 | size_t max_word_len = MAX_WORD_LENGTH) const { 85 | assert(root_ != NULL); 86 | res.resize(end - begin); 87 | 88 | const TrieNode *ptNode = NULL; 89 | TrieNode::NextMap::const_iterator citer; 90 | for (size_t i = 0; i < size_t(end - begin); i++) { 91 | res[i].runestr = *(begin + i); 92 | 93 | if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) { 94 | ptNode = citer->second; 95 | } else { 96 | ptNode = NULL; 97 | } 98 | if (ptNode != NULL) { 99 | res[i].nexts.push_back(pair(i, ptNode->ptValue)); 100 | } else { 101 | res[i].nexts.push_back(pair(i, static_cast(NULL))); 102 | } 103 | 104 | for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) { 105 | if (ptNode == NULL || ptNode->next == NULL) { 106 | break; 107 | } 108 | citer = ptNode->next->find((begin + j)->rune); 109 | if (ptNode->next->end() == citer) { 110 | break; 111 | } 112 | ptNode = citer->second; 113 | if (NULL != ptNode->ptValue) { 114 | res[i].nexts.push_back(pair(j, ptNode->ptValue)); 115 | } 116 | } 117 | } 118 | } 119 | 120 | void InsertNode(const Unicode& key, const DictUnit* ptValue) { 121 | if (key.begin() == key.end()) { 122 | return; 123 | } 124 | 
125 | TrieNode::NextMap::const_iterator kmIter; 126 | TrieNode *ptNode = root_; 127 | for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) { 128 | if (NULL == ptNode->next) { 129 | ptNode->next = new TrieNode::NextMap; 130 | } 131 | kmIter = ptNode->next->find(*citer); 132 | if (ptNode->next->end() == kmIter) { 133 | TrieNode *nextNode = new TrieNode; 134 | 135 | ptNode->next->insert(make_pair(*citer, nextNode)); 136 | ptNode = nextNode; 137 | } else { 138 | ptNode = kmIter->second; 139 | } 140 | } 141 | assert(ptNode != NULL); 142 | ptNode->ptValue = ptValue; 143 | } 144 | 145 | private: 146 | void CreateTrie(const vector& keys, const vector& valuePointers) { 147 | if (valuePointers.empty() || keys.empty()) { 148 | return; 149 | } 150 | assert(keys.size() == valuePointers.size()); 151 | 152 | for (size_t i = 0; i < keys.size(); i++) { 153 | InsertNode(keys[i], valuePointers[i]); 154 | } 155 | } 156 | 157 | void DeleteNode(TrieNode* node) { 158 | if (NULL == node) { 159 | return; 160 | } 161 | if (NULL != node->next) { 162 | for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) { 163 | DeleteNode(it->second); 164 | } 165 | delete node->next; 166 | } 167 | delete node; 168 | } 169 | 170 | TrieNode* root_; 171 | }; // class Trie 172 | } // namespace cppjieba 173 | 174 | #endif // CPPJIEBA_TRIE_HPP 175 | --------------------------------------------------------------------------------