├── tools ├── dummy └── bazel.rc.template ├── third_party ├── boost │ ├── BUILD │ └── boost.bzl ├── crow │ ├── .gitmodules │ ├── tests │ │ ├── template │ │ │ ├── README.template_test │ │ │ ├── Makefile │ │ │ ├── mustachetest.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── test.py │ │ │ ├── comments.json │ │ │ ├── partials.json │ │ │ ├── comments.yml │ │ │ ├── delimiters.json │ │ │ ├── partials.yml │ │ │ └── delimiters.yml │ │ └── CMakeLists.txt │ ├── examples │ │ ├── helloworld.cpp │ │ ├── example.py │ │ ├── ssl │ │ │ └── example_ssl.cpp │ │ ├── websocket │ │ │ ├── templates │ │ │ │ └── ws.html │ │ │ └── example_ws.cpp │ │ ├── example_chat.html │ │ ├── example_test.py │ │ ├── CMakeLists.txt │ │ ├── example_chat.cpp │ │ ├── example_with_all.cpp │ │ └── example_vs.cpp │ ├── .gitignore │ ├── BUILD │ ├── include │ │ ├── crow.h │ │ └── crow │ │ │ ├── settings.h │ │ │ ├── ci_map.h │ │ │ ├── http_request.h │ │ │ ├── dumb_timer_queue.h │ │ │ ├── socket_adaptors.h │ │ │ ├── middleware_context.h │ │ │ ├── http_response.h │ │ │ ├── common.h │ │ │ ├── logging.h │ │ │ └── parser.h │ ├── .travis.yml │ ├── cmake │ │ └── FindTcmalloc.cmake │ ├── LICENSE │ ├── amalgamate │ │ └── merge_all.py │ ├── CMakeLists.txt │ └── README.md ├── gflags │ ├── empty.cc │ ├── gflags-2.0.tar.gz │ └── BUILD ├── glog │ ├── empty.cc │ ├── glog-0.3.4.tar.gz │ └── BUILD ├── setuptools │ ├── setuptools.egg-info │ │ ├── zip-safe │ │ ├── top_level.txt │ │ ├── requires.txt │ │ ├── dependency_links.txt │ │ └── entry_points.txt │ ├── README │ └── BUILD ├── python │ ├── pylint │ │ ├── main.py │ │ └── BUILD │ ├── cpplint │ │ └── BUILD │ └── semver │ │ ├── README.md │ │ ├── setup.py │ │ └── PKG-INFO ├── pybind11 │ ├── BUILD │ ├── complex.h │ ├── typeid.h │ └── functional.h └── word2vec │ ├── demo-word.sh │ ├── demo-classes.sh │ ├── BUILD │ ├── demo-word-accuracy.sh │ ├── demo-analogy.sh │ ├── makefile │ ├── demo-phrases.sh │ ├── demo-phrase-accuracy.sh │ └── README.txt ├── docs ├── dl.jpeg └── qrcode_dzgz.jpg ├── kcws ├── 
models │ ├── word_vocab.txt │ ├── seg_model.pbtxt │ └── pos_vocab.txt ├── cc │ ├── dump_vocab.py │ ├── prepare_test_file.py │ ├── viterbi_decode.h │ ├── sentence_breaker.h │ ├── test_breaker.cc │ ├── test_ac_scanner.cc │ ├── pos_tagger.h │ ├── tf_seg_model.h │ ├── demo.html │ ├── viterbi_decode.cc │ ├── sentence_breaker.cc │ ├── gen_seg_eval.cc │ ├── BUILD │ ├── seg_backend_api.cc │ └── test_seg.cc └── train │ ├── BUILD │ ├── replace_unk.py │ ├── sentence.py │ ├── filter_sentence.py │ ├── merge_vec.py │ ├── sampling_for_train.py │ ├── generate_char_embedding.py │ ├── generate_train_free.py │ ├── process_icwb.py │ ├── process_people.py │ ├── bilstm.py │ ├── prepare_pos.py │ ├── process_anno_file.py │ ├── stats_pos.py │ ├── idcnn.py │ └── generate_training.py ├── .gitignore ├── base ├── base.cc ├── BUILD └── base.h ├── tfmodel ├── BUILD ├── tfmodel.h └── tfmodel.cc ├── utils ├── vocab.h ├── basic_vocab.h ├── py_word2vec_vob.cc ├── BUILD ├── word2vec_vob.h ├── basic_vocab.cc └── json_util.h ├── util └── python │ ├── BUILD │ └── python_config.sh ├── BUILD.tf_dist ├── configure ├── BUILD.boost ├── WORKSPACE ├── pos_train.md └── README.md /tools/dummy: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/boost/BUILD: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/crow/.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/gflags/empty.cc: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/glog/empty.cc: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/setuptools/setuptools.egg-info/zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/dl.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koth/kcws/HEAD/docs/dl.jpeg -------------------------------------------------------------------------------- /docs/qrcode_dzgz.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koth/kcws/HEAD/docs/qrcode_dzgz.jpg -------------------------------------------------------------------------------- /kcws/models/word_vocab.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koth/kcws/HEAD/kcws/models/word_vocab.txt -------------------------------------------------------------------------------- /kcws/models/seg_model.pbtxt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koth/kcws/HEAD/kcws/models/seg_model.pbtxt -------------------------------------------------------------------------------- /third_party/glog/glog-0.3.4.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koth/kcws/HEAD/third_party/glog/glog-0.3.4.tar.gz -------------------------------------------------------------------------------- /third_party/crow/tests/template/README.template_test: -------------------------------------------------------------------------------- 1 | spec json/yml files from https://github.com/mustache/spec 2 | -------------------------------------------------------------------------------- 
/third_party/gflags/gflags-2.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koth/kcws/HEAD/third_party/gflags/gflags-2.0.tar.gz -------------------------------------------------------------------------------- /third_party/python/pylint/main.py: -------------------------------------------------------------------------------- 1 | import pylint 2 | 3 | if __name__ == '__main__': 4 | pylint.run_pylint() 5 | -------------------------------------------------------------------------------- /third_party/setuptools/setuptools.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | easy_install 3 | _markerlib 4 | pkg_resources 5 | -------------------------------------------------------------------------------- /third_party/setuptools/setuptools.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | [ssl:sys_platform=='win32'] 4 | wincertstore==0.2 5 | 6 | [certs] 7 | certifi==1.0.1 -------------------------------------------------------------------------------- /third_party/crow/tests/template/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | $(CXX) -Wall -std=c++11 -g -o mustachetest mustachetest.cc 3 | .PHONY: clean 4 | clean: 5 | rm -f mustachetest *.o 6 | -------------------------------------------------------------------------------- /third_party/setuptools/README: -------------------------------------------------------------------------------- 1 | Project URL: https://pypi.python.org/packages/source/s/setuptools/setuptools-3.6.tar.gz#md5=8f3a1dcdc14313c8334eb6af4f66ea0a 2 | Version: 3.6 3 | License: PSF or ZPL 4 | Local modifications: none 5 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .idea 2 | .DS_* 3 | datas 4 | logs 5 | bazel-* 6 | ner/address_logs 7 | tools/bazel.rc 8 | tools/python_bin_path.sh 9 | util/python/python_include 10 | util/python/python_lib 11 | u_company.txt 12 | resume_extractor/config.json 13 | -------------------------------------------------------------------------------- /third_party/python/cpplint/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | py_binary( 6 | name = "cpplint", 7 | srcs = ["cpplint.py"], 8 | main = "cpplint.py", 9 | stamp = 1, 10 | ) 11 | -------------------------------------------------------------------------------- /third_party/crow/examples/helloworld.cpp: -------------------------------------------------------------------------------- 1 | #include "crow.h" 2 | 3 | int main() 4 | { 5 | crow::SimpleApp app; 6 | 7 | CROW_ROUTE(app, "/") 8 | ([]() { 9 | return "Hello world!"; 10 | }); 11 | 12 | app.port(18080).run(); 13 | } 14 | -------------------------------------------------------------------------------- /third_party/setuptools/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | py_library( 6 | name = "pkg_resources", 7 | srcs = [ 8 | "pkg_resources.py", 9 | ], 10 | ) 11 | 12 | exports_files(["pkg_resources.py"]) 13 | -------------------------------------------------------------------------------- /third_party/setuptools/setuptools.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | https://pypi.python.org/packages/source/c/certifi/certifi-1.0.1.tar.gz#md5=45f5cb94b8af9e1df0f9450a8f61b790 2 | 
https://pypi.python.org/packages/source/w/wincertstore/wincertstore-0.2.zip#md5=ae728f2f007185648d0c7a8679b361e2 3 | -------------------------------------------------------------------------------- /third_party/pybind11/BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//visibility:public"]) 2 | 3 | licenses(["notice"]) 4 | 5 | cc_library( 6 | name = "pybind11", 7 | srcs = glob(["*.h"]), 8 | linkstatic = 1, 9 | deps = [ 10 | "//util/python:python_headers", 11 | ], 12 | ) 13 | -------------------------------------------------------------------------------- /third_party/word2vec/demo-word.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./distance vectors.bin 8 | -------------------------------------------------------------------------------- /base/base.cc: -------------------------------------------------------------------------------- 1 | // Copyright Koth 2016 2 | 3 | #include "base/base.h" 4 | 5 | namespace base { 6 | 7 | void Init(int argc, char** argv) { 8 | // google::InstallFailureSignalHandler(); 9 | google::ParseCommandLineFlags(&argc, &argv, true); 10 | google::InitGoogleLogging(argv[0]); 11 | } 12 | 13 | } // namesapace 14 | -------------------------------------------------------------------------------- /tfmodel/BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//visibility:public"]) 2 | 3 | cc_library( 4 | name="tfmodel", 5 | srcs=[ 6 | "tfmodel.cc" 7 | ], 8 | hdrs=[ 9 | "tfmodel.h" 10 | ], 11 | linkstatic=1, 12 | deps=[ 13 | '//base:base', 14 | '//utils:basic_string_util', 15 | '@tf//:tensorflow', 16 | ] 
17 | ) 18 | -------------------------------------------------------------------------------- /base/BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//visibility:public"]) 2 | 3 | cc_library( 4 | name = "base", 5 | srcs = ["base.cc"], 6 | hdrs = ["base.h"], 7 | # linkstatic = 1, 8 | deps = [ 9 | "//third_party/gflags:gflags-cxx", 10 | "//third_party/glog:glog-cxx", 11 | ], 12 | # alwayslink = 1, 13 | ) 14 | -------------------------------------------------------------------------------- /third_party/python/pylint/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) # apache 2.0 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | load("/tools/rules/pex_rules", "pex_library", "pex_binary") 6 | 7 | pex_binary( 8 | name = "pylint", 9 | srcs = ["main.py"], 10 | main = "main.py", 11 | reqs = ["pylint==1.5.5"], 12 | ) 13 | -------------------------------------------------------------------------------- /utils/vocab.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef UTILS_VOCAB_H_ 3 | #define UTILS_VOCAB_H_ 4 | #include 5 | namespace utils { 6 | class Vocab { 7 | public: 8 | virtual ~Vocab() {}; 9 | virtual bool Load(const std::string& path) = 0; 10 | virtual int GetWordIndex(const std::string& word) = 0; 11 | virtual int GetTotalWord() = 0; 12 | }; 13 | } // namespace utils 14 | #endif // UTILS_VOCAB_H_ 15 | -------------------------------------------------------------------------------- /third_party/word2vec/demo-classes.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /third_party/crow/.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Compiled Dynamic libraries 8 | *.so 9 | *.dylib 10 | *.dll 11 | 12 | # Compiled Static libraries 13 | *.lai 14 | *.la 15 | *.a 16 | *.lib 17 | 18 | # Executables 19 | *.exe 20 | *.out 21 | *.app 22 | 23 | example 24 | unittest 25 | 26 | *.swp 27 | *.gcov 28 | 29 | *.gcda 30 | *.gcno 31 | 32 | 33 | .directory 34 | -------------------------------------------------------------------------------- /third_party/crow/examples/example.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | app = Flask(__name__) 3 | 4 | @app.route("/") 5 | def hello(): 6 | return "Hello World!" 
7 | 8 | @app.route("/about//hello") 9 | def hello1(path): 10 | return "about1" 11 | 12 | @app.route("/about") 13 | def hello2(): 14 | return "about2" 15 | 16 | print app.url_map 17 | 18 | if __name__ == "__main__": 19 | app.run(host="0.0.0.0", port=8888) 20 | -------------------------------------------------------------------------------- /third_party/word2vec/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | cc_binary( 6 | name = "word2vec", 7 | srcs = [ 8 | "word2vec.c", 9 | ], 10 | linkopts = [ 11 | "-pthread", 12 | ], 13 | ) 14 | 15 | cc_binary( 16 | name = "distance", 17 | srcs = [ 18 | "distance.cc", 19 | ], 20 | deps = [ 21 | "//utils:basic_string_util", 22 | ], 23 | ) 24 | -------------------------------------------------------------------------------- /third_party/word2vec/demo-word-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /base/base.h: -------------------------------------------------------------------------------- 1 | // Copyright Koth 2016 2 | 3 | #ifndef BASE_BASE_H_ 4 | #define BASE_BASE_H_ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "third_party/gflags/include/gflags/gflags.h" 15 | #include "third_party/glog/include/glog/logging.h" 16 | 17 | 18 | 19 | 20 | namespace base { 21 | 22 | void Init(int argc, char** argv); 23 | 24 | } // namesapace base 25 | 26 | #endif // BASE_BASE_H_ 27 | -------------------------------------------------------------------------------- /kcws/cc/dump_vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2016-11-20 15:04:18 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2016-11-20 15:07:51 6 | import sys 7 | import os 8 | import w2v 9 | 10 | 11 | def main(argc, argv): 12 | if argc < 3: 13 | print("Usage:%s " % (argv[0])) 14 | sys.exit(1) 15 | vob = w2v.Word2vecVocab() 16 | vob.Load(argv[1]) 17 | vob.DumpBasicVocab(argv[2]) 18 | 19 | 20 | if __name__ == '__main__': 21 | main(len(sys.argv), sys.argv) -------------------------------------------------------------------------------- /third_party/crow/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | 
cc_library( 6 | name = "crow", 7 | hdrs = glob([ 8 | "include/crow.h", 9 | "include/crow/*.h", 10 | "include/crow/*.hpp", 11 | ]), 12 | linkopts = [ 13 | "-pthread", 14 | ], 15 | visibility = ["//visibility:public"], 16 | deps = [ 17 | "@boost//:system", 18 | ], 19 | ) 20 | 21 | cc_binary( 22 | name = "crow_ex", 23 | srcs = [ 24 | "examples/example.cpp", 25 | ], 26 | deps = [ 27 | ":crow", 28 | ], 29 | ) 30 | -------------------------------------------------------------------------------- /util/python/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["restricted"]) 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | cc_library( 6 | name = "python_headers", 7 | hdrs = glob([ 8 | "python_include/**/*.h", 9 | ]), 10 | data = [":python_checked"], 11 | includes = ["python_include"], 12 | ) 13 | 14 | genrule( 15 | name = "python_check", 16 | srcs = [ 17 | "python_config.sh", 18 | ], 19 | outs = [ 20 | "python_checked", 21 | ], 22 | cmd = "OUTPUTDIR=\"$(@D)/\"; $(location :python_config.sh) --check && touch $$OUTPUTDIR/python_checked", 23 | local = 1, 24 | ) 25 | -------------------------------------------------------------------------------- /utils/basic_vocab.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef UTILS_BASIC_VOCAB_H_ 3 | #define UTILS_BASIC_VOCAB_H_ 4 | #include 5 | #include 6 | #include 7 | 8 | #include "vocab.h" 9 | 10 | namespace utils { 11 | class BasicVocab: public Vocab { 12 | public: 13 | BasicVocab() {use_map_ = false;} 14 | BasicVocab(bool useMap): use_map_(useMap) {} 15 | bool Load(const std::string& path) override; 16 | int GetWordIndex(const std::string& word) override; 17 | int GetTotalWord() override; 18 | private: 19 | std::unordered_map w_map_; 20 | bool use_map_; 21 | }; 22 | } // namespace utils 23 | #endif // UTILS_BASIC_VOCAB_H_ 24 | -------------------------------------------------------------------------------- 
/third_party/crow/examples/ssl/example_ssl.cpp: -------------------------------------------------------------------------------- 1 | #define CROW_ENABLE_SSL 2 | #include "crow.h" 3 | 4 | int main() 5 | { 6 | crow::SimpleApp app; 7 | 8 | CROW_ROUTE(app, "/") 9 | ([]() { 10 | return "Hello world!"; 11 | }); 12 | 13 | app.port(18080).ssl_file("test.crt", "test.key").run(); 14 | 15 | // Use .pem file 16 | //app.port(18080).ssl_file("test.pem").run(); 17 | 18 | // Use custom context; see boost::asio::ssl::context 19 | /* 20 | * crow::ssl_context_t ctx; 21 | * ctx.set_verify_mode(...) 22 | * 23 | * ... configuring ctx 24 | * 25 | * app.port(18080).ssl(ctx).run(); 26 | */ 27 | } 28 | -------------------------------------------------------------------------------- /third_party/word2vec/demo-analogy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /kcws/models/pos_vocab.txt: -------------------------------------------------------------------------------- 1 | j 1 2 | vd 2 3 | ad 3 4 | vf 4 5 | vg 5 6 | cc 6 7 | vi 7 8 | rr 8 9 | al 9 10 | vn 10 11 | an 11 12 | gg 12 13 | vs 13 14 | gc 14 15 | nf 15 16 | vx 16 17 | vy 17 18 | gm 18 19 | u 19 20 | gi 20 21 | nh 21 22 | ni 22 23 | ag 23 24 | nn 24 
25 | ul 25 26 | na 26 27 | nb 27 28 | z 28 29 | ry 29 30 | rz 30 31 | ng 31 32 | pb 32 33 | nz 33 34 | dg 34 35 | tg 35 36 | nr 36 37 | ns 37 38 | nt 38 39 | dl 39 40 | bl 40 41 | vl 41 42 | gp 42 43 | o 43 44 | x 44 45 | e 45 46 | qt 46 47 | a 47 48 | nm 48 49 | c 49 50 | b 50 51 | uy 51 52 | d 52 53 | f 53 54 | i 54 55 | k 55 56 | uz 56 57 | m 57 58 | l 58 59 | us 59 60 | n 60 61 | q 61 62 | p 62 63 | s 63 64 | r 64 65 | mq 65 66 | t 66 67 | w 67 68 | v 68 69 | y 69 70 | ud 70 71 | ug 71 72 | qv 72 73 | gb 73 74 | 75 | -------------------------------------------------------------------------------- /third_party/crow/include/crow.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "crow/query_string.h" 3 | #include "crow/http_parser_merged.h" 4 | #include "crow/ci_map.h" 5 | #include "crow/TinySHA1.hpp" 6 | #include "crow/settings.h" 7 | #include "crow/socket_adaptors.h" 8 | #include "crow/json.h" 9 | #include "crow/mustache.h" 10 | #include "crow/logging.h" 11 | #include "crow/dumb_timer_queue.h" 12 | #include "crow/utility.h" 13 | #include "crow/common.h" 14 | #include "crow/http_request.h" 15 | #include "crow/websocket.h" 16 | #include "crow/parser.h" 17 | #include "crow/http_response.h" 18 | #include "crow/middleware.h" 19 | #include "crow/routing.h" 20 | #include "crow/middleware_context.h" 21 | #include "crow/http_connection.h" 22 | #include "crow/http_server.h" 23 | #include "crow/app.h" 24 | -------------------------------------------------------------------------------- /third_party/word2vec/makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: word2vec word2phrase distance word-analogy compute-accuracy 6 | 7 | word2vec : word2vec.c 8 | $(CC) 
word2vec.c -o word2vec $(CFLAGS) 9 | word2phrase : word2phrase.c 10 | $(CC) word2phrase.c -o word2phrase $(CFLAGS) 11 | distance : distance.c 12 | $(CC) distance.c -o distance $(CFLAGS) 13 | word-analogy : word-analogy.c 14 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 15 | compute-accuracy : compute-accuracy.c 16 | $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS) 17 | chmod +x *.sh 18 | 19 | clean: 20 | rm -rf word2vec word2phrase distance word-analogy compute-accuracy -------------------------------------------------------------------------------- /third_party/crow/include/crow/settings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // settings for crow 3 | // TODO - replace with runtime config. libucl? 4 | 5 | /* #ifdef - enables debug mode */ 6 | #define CROW_ENABLE_DEBUG 7 | 8 | /* #ifdef - enables logging */ 9 | #define CROW_ENABLE_LOGGING 10 | 11 | /* #ifdef - enables ssl */ 12 | //#define CROW_ENABLE_SSL 13 | 14 | /* #define - specifies log level */ 15 | /* 16 | Debug = 0 17 | Info = 1 18 | Warning = 2 19 | Error = 3 20 | Critical = 4 21 | 22 | default to INFO 23 | */ 24 | #define CROW_LOG_LEVEL 1 25 | 26 | 27 | // compiler flags 28 | #if __cplusplus >= 201402L 29 | #define CROW_CAN_USE_CPP14 30 | #endif 31 | 32 | #if defined(_MSC_VER) 33 | #if _MSC_VER < 1900 34 | #define CROW_MSVC_WORKAROUND 35 | #define constexpr const 36 | #define noexcept throw() 37 | #endif 38 | #endif 39 | -------------------------------------------------------------------------------- /BUILD.tf_dist: -------------------------------------------------------------------------------- 1 | # Bazel build file for binary tf 2 | licenses(["notice"]) 3 | 4 | 5 | config_setting( 6 | name = "darwin", 7 | values = {"cpu": "darwin"}, 8 | visibility = ["//visibility:public"], 9 | ) 10 | 11 | filegroup( 12 | name="tf_unix_lib", 13 | srcs=glob( 14 | ["lib/unix/*.o"] 15 | ) 16 | ) 17 | filegroup( 18 | name="tf_mac_lib", 19 | 
srcs=glob( 20 | ["lib/mac/*.o"], 21 | exclude = ["lib/mac/__.SYMDEF_*.o"] 22 | ) 23 | ) 24 | cc_library( 25 | name="tensorflow", 26 | hdrs = glob(["tensorflow/*","google/*"]), 27 | includes = [ 28 | ".", 29 | ], 30 | alwayslink=1, 31 | visibility = ["//visibility:public"], 32 | deps=[ 33 | '@protobuf//:protobuf' 34 | ], 35 | srcs=select({ 36 | ":darwin": [ 37 | ":tf_mac_lib", 38 | ], 39 | "//conditions:default": [ 40 | ":tf_unix_lib", 41 | ], 42 | }), 43 | ) -------------------------------------------------------------------------------- /third_party/crow/include/crow/ci_map.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace crow 8 | { 9 | struct ci_hash 10 | { 11 | size_t operator()(const std::string& key) const 12 | { 13 | std::size_t seed = 0; 14 | std::locale locale; 15 | 16 | for(auto c : key) 17 | { 18 | boost::hash_combine(seed, std::toupper(c, locale)); 19 | } 20 | 21 | return seed; 22 | } 23 | }; 24 | 25 | struct ci_key_eq 26 | { 27 | bool operator()(const std::string& l, const std::string& r) const 28 | { 29 | return boost::iequals(l, r); 30 | } 31 | }; 32 | 33 | using ci_map = std::unordered_multimap; 34 | } 35 | -------------------------------------------------------------------------------- /third_party/word2vec/demo-phrases.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /third_party/crow/.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | 3 | sudo: false 4 | 5 | notifications: 6 | irc: "chat.freenode.net##crow" 7 | 8 | compiler: 9 | - gcc 10 | 11 | env: 12 | matrix: 13 | - COMPILER=g++-4.8 CCOMPILER=gcc-4.8 PUSH_COVERAGE=ON 14 | 15 | addons: 16 | apt: 17 | sources: 18 | - ubuntu-toolchain-r-test 19 | - boost-latest 20 | packages: 21 | - g++-4.8 22 | - libboost1.55-all-dev 23 | - python-pip 24 | 25 | install: 26 | - if [ "$PUSH_COVERAGE" == "ON" ]; then pip install --user git+git://github.com/eddyxu/cpp-coveralls.git; fi 27 | 28 | before_script: 29 | - export CXX=$COMPILER CC=$CCOMPILER 30 | - mkdir build 31 | - cd build 32 | - cmake --version 33 | - cmake .. 34 | 35 | script: make && ctest 36 | 37 | after_success: 38 | - cd .. 
39 | - if [ "PUSH_COVERAGE" == "ON" ]; then coveralls --gcov gcov-4.8 -i include --gcov-options '\-lp'; fi 40 | -------------------------------------------------------------------------------- /third_party/crow/tests/template/mustachetest.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "crow/mustache.h" 6 | #include "crow/json.h" 7 | using namespace std; 8 | using namespace crow; 9 | using namespace crow::mustache; 10 | 11 | string read_all(const string& filename) 12 | { 13 | ifstream is(filename); 14 | return {istreambuf_iterator(is), istreambuf_iterator()}; 15 | } 16 | 17 | int main() 18 | { 19 | auto data = json::load(read_all("data")); 20 | auto templ = compile(read_all("template")); 21 | auto partials = json::load(read_all("partials")); 22 | set_loader([&](std::string name)->std::string 23 | { 24 | if (partials.count(name)) 25 | { 26 | return partials[name].s(); 27 | } 28 | return ""; 29 | }); 30 | context ctx(data); 31 | cout << templ.render(ctx); 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /third_party/word2vec/demo-phrase-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /kcws/cc/prepare_test_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2016-11-22 21:20:59 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2016-11-22 21:39:22 6 | 7 | import sys 8 | import os 9 | 10 | 11 | def main(argc, argv): 12 | if argc < 3: 13 | print("Usage:%s " % (argv[0])) 14 | sys.exit(1) 15 | inp = open(argv[1], "r") 16 | oup = open(argv[2], "w") 17 | totalLine = 0 18 | while True: 19 | line = inp.readline() 20 | if not line: 21 | break 22 | line = line.strip() 23 | if not line or len(line) == 0: 24 | continue 25 | ustr = unicode(line.decode("utf8")) 26 | if len(ustr) >= 80 or len(ustr) < 10: 27 | continue 28 | oup.write("%s\n" % (line)) 29 | totalLine += 1 30 | print("totalLine:%d" % (totalLine)) 31 | 32 | 33 | if __name__ == '__main__': 34 | main(len(sys.argv), sys.argv) 35 | -------------------------------------------------------------------------------- 
/third_party/crow/tests/template/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project (template_test) 3 | 4 | 5 | set(PROJECT_INCLUDE_DIR 6 | ${PROJECT_SOURCE_DIR}/include 7 | ) 8 | 9 | set(TEST_SRCS 10 | mustachetest.cpp 11 | ) 12 | 13 | add_executable(mustachetest ${TEST_SRCS}) 14 | #target_link_libraries(unittest crow) 15 | #target_link_libraries(unittest ${Boost_LIBRARIES} ) 16 | set_target_properties(mustachetest PROPERTIES COMPILE_FLAGS "-Wall -std=c++1y") 17 | 18 | #message(${PROJECT_SOURCE_DIR}) 19 | #message(${CMAKE_CURRENT_BINARY_DIR}) 20 | file(COPY DIRECTORY . DESTINATION ${CMAKE_CURRENT_BINARY_DIR} 21 | FILES_MATCHING 22 | PATTERN "*.json") 23 | 24 | add_custom_command(OUTPUT test.py 25 | COMMAND ${CMAKE_COMMAND} -E 26 | copy ${PROJECT_SOURCE_DIR}/test.py ${CMAKE_CURRENT_BINARY_DIR}/test.py 27 | DEPENDS ${PROJECT_SOURCE_DIR}/test.py 28 | ) 29 | add_custom_target(template_test_copy ALL DEPENDS test.py) 30 | -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DO_NOT_SUBMIT_WARNING="Unofficial setting. DO NOT SUBMIT!!!" 4 | 5 | ## Set up python-related environment settings 6 | while true; do 7 | fromuser="" 8 | if [ -z "$PYTHON_BIN_PATH" ]; then 9 | default_python_bin_path=$(which python) 10 | read -p "Please specify the location of python. [Default is $default_python_bin_path]: " PYTHON_BIN_PATH 11 | fromuser="1" 12 | if [ -z "$PYTHON_BIN_PATH" ]; then 13 | PYTHON_BIN_PATH=$default_python_bin_path 14 | fi 15 | fi 16 | if [ -e "$PYTHON_BIN_PATH" ]; then 17 | break 18 | fi 19 | echo "Invalid python path. 
${PYTHON_BIN_PATH} cannot be found" 1>&2 20 | if [ -z "$fromuser" ]; then 21 | exit 1 22 | fi 23 | PYTHON_BIN_PATH="" 24 | # Retry 25 | done 26 | 27 | 28 | # Invoke python_config and set up symlinks to python includes 29 | (./util/python/python_config.sh --setup "$PYTHON_BIN_PATH";) || exit -1 30 | 31 | 32 | echo "Configuration finished" 33 | -------------------------------------------------------------------------------- /tools/bazel.rc.template: -------------------------------------------------------------------------------- 1 | build:cuda --crosstool_top=//third_party/gpus/crosstool 2 | build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true 3 | 4 | build --force_python=py$PYTHON_MAJOR_VERSION 5 | build --python$PYTHON_MAJOR_VERSION_path=$PYTHON_BINARY 6 | build --define=use_fast_cpp_protos=true 7 | build --define=allow_oversize_protos=true 8 | 9 | build --define PYTHON_BIN_PATH=$PYTHON_BINARY 10 | test --define PYTHON_BIN_PATH=$PYTHON_BINARY 11 | test --force_python=py$PYTHON_MAJOR_VERSION 12 | test --host_force_python=py$PYTHON_MAJOR_VERSION 13 | run --define PYTHON_BIN_PATH=$PYTHON_BINARY 14 | 15 | build --spawn_strategy=standalone 16 | test --spawn_strategy=standalone 17 | run --spawn_strategy=standalone 18 | 19 | build --copt="-D_GLIBCXX_USE_CXX11_ABI=0" 20 | test --copt="-D_GLIBCXX_USE_CXX11_ABI=0" 21 | run --copt="-D_GLIBCXX_USE_CXX11_ABI=0" 22 | 23 | build --genrule_strategy=standalone 24 | test --genrule_strategy=standalone 25 | run --genrule_strategy=standalone 26 | -------------------------------------------------------------------------------- /third_party/crow/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project (crow_test) 3 | 4 | 5 | set(TEST_SRCS 6 | unittest.cpp 7 | ) 8 | 9 | add_executable(unittest ${TEST_SRCS}) 10 | #target_link_libraries(unittest crow) 11 | target_link_libraries(unittest ${Boost_LIBRARIES}) 12 | 
target_link_libraries(unittest ${CMAKE_THREAD_LIBS_INIT}) 13 | 14 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 15 | # using Clang 16 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 17 | # using GCC 18 | set_target_properties(unittest PROPERTIES COMPILE_FLAGS "--coverage -fprofile-arcs -ftest-coverage") 19 | target_link_libraries(unittest gcov) 20 | endif() 21 | 22 | add_subdirectory(template) 23 | #CXXFLAGS="-g -O0 -Wall -W -Wshadow -Wunused-variable \ 24 | #Wunused-parameter -Wunused-function -Wunused -Wno-system-headers \ 25 | #-Wno-deprecated -Woverloaded-virtual -Wwrite-strings -fprofile-arcs -ftest-coverage" 26 | #CFLAGS="-g -O0 -Wall -W -fprofile-arcs -ftest-coverage" 27 | #LDFLAGS="-fprofile-arcs -ftest-coverage" 28 | -------------------------------------------------------------------------------- /third_party/crow/examples/websocket/templates/ws.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
11 | 13 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /tfmodel/tfmodel.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: tfmodel.h 5 | * Author: Koth 6 | * Create Time: 2017-02-01 13:34:04 7 | * Description: 8 | * 9 | */ 10 | #ifndef TF_TFMODEL_H_ 11 | #define TF_TFMODEL_H_ 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "tensorflow/core/framework/types.pb.h" 17 | #include "tensorflow/core/public/session.h" 18 | 19 | namespace tf { 20 | class TfModel { 21 | public: 22 | virtual ~TfModel(); 23 | virtual bool Load(const std::string& path); 24 | bool Eval(const std::vector >& inputTensors, 25 | const std::vector& outputNames, 26 | std::vector& outputTensors); 27 | 28 | protected: 29 | std::unique_ptr session_; 30 | }; 31 | 32 | } // namespace tf 33 | #endif // TF_TFMODEL_H_ 34 | 35 | 36 | -------------------------------------------------------------------------------- /kcws/train/BUILD: -------------------------------------------------------------------------------- 1 | py_binary( 2 | name = "generate_training", 3 | srcs = ["generate_training.py"], 4 | data = ["//utils:w2v.so"], 5 | imports = ["../../utils"], 6 | ) 7 | 8 | py_binary( 9 | name = "process_icwb", 10 | srcs = ["process_icwb.py"], 11 | data = ["//utils:w2v.so"], 12 | imports = ["../../utils"], 13 | ) 14 | 15 | py_binary( 16 | name = "process_people", 17 | srcs = ["process_people.py"], 18 | data = ["//utils:w2v.so"], 19 | imports = ["../../utils"], 20 | ) 21 | 22 | py_binary( 23 | name = "generate_char_embedding", 24 | srcs = ["generate_char_embedding.py"], 25 | data = ["//utils:w2v.so"], 26 | imports = ["../../utils"], 27 | ) 28 | 29 | py_binary( 30 | name = "generate_pos_train", 31 | srcs = ["generate_pos_train.py"], 32 | 
data=['//utils:w2v.so'], 33 | imports=['../../utils'] 34 | ) 35 | 36 | py_binary( 37 | name = "generate_train_free", 38 | srcs = ["generate_train_free.py"], 39 | data = ["//utils:w2v.so"], 40 | imports = ["../../utils"], 41 | ) -------------------------------------------------------------------------------- /kcws/train/replace_unk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2016-12-09 19:37:43 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2016-12-09 19:49:37 6 | import sys 7 | 8 | 9 | def main(argc, argv): 10 | if argc < 4: 11 | print("Usage:%s " % (argv[0])) 12 | sys.exit(1) 13 | vp = open(argv[1], "r") 14 | inp = open(argv[2], "r") 15 | oup = open(argv[3], "w") 16 | vobsMap = {} 17 | for line in vp: 18 | line = line.strip() 19 | ss = line.split(" ") 20 | vobsMap[ss[0]] = 1 21 | while True: 22 | line = inp.readline() 23 | if not line: 24 | break 25 | line = line.strip() 26 | if not line: 27 | continue 28 | ss = line.split(" ") 29 | tokens = [] 30 | for s in ss: 31 | if s in vobsMap: 32 | tokens.append(s) 33 | else: 34 | tokens.append("") 35 | oup.write("%s\n" % (" ".join(tokens))) 36 | oup.close() 37 | inp.close() 38 | vp.close() 39 | 40 | 41 | if __name__ == '__main__': 42 | main(len(sys.argv), sys.argv) -------------------------------------------------------------------------------- /kcws/cc/viterbi_decode.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 
3 | * ===================================================================================== 4 | * Filename: viterbi_decode.h 5 | * Author: Koth 6 | * Create Time: 2017-02-01 13:43:51 7 | * Description: 8 | * 9 | */ 10 | #ifndef KCWS_CC_VITERBI_DECODE_H_ 11 | #define KCWS_CC_VITERBI_DECODE_H_ 12 | #include 13 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 14 | namespace kcws { 15 | void get_best_path( 16 | const Eigen::TensorMap, Eigen::Aligned>& predictions, 17 | int sentenceIdx, 18 | int nn, 19 | const std::vector>& trans, 20 | int** bp, 21 | float** scores, 22 | std::vector& resultTags, 23 | int ntags); 24 | 25 | int viterbi_decode( 26 | const Eigen::TensorMap, Eigen::Aligned>& predictions, 27 | int sentenceIdx, 28 | int nn, 29 | const std::vector>& trans, 30 | int** bp, 31 | float** scores, 32 | int ntags); 33 | 34 | } // namespace kcws 35 | #endif // KCWS_CC_VITERBI_DECODE_H_ 36 | -------------------------------------------------------------------------------- /third_party/crow/cmake/FindTcmalloc.cmake: -------------------------------------------------------------------------------- 1 | # - Find Tcmalloc 2 | # Find the native Tcmalloc library 3 | # 4 | # Tcmalloc_LIBRARIES - List of libraries when using Tcmalloc. 5 | # Tcmalloc_FOUND - True if Tcmalloc found. 
6 | 7 | if (USE_TCMALLOC) 8 | set(Tcmalloc_NAMES tcmalloc) 9 | else () 10 | set(Tcmalloc_NAMES tcmalloc_minimal tcmalloc) 11 | endif () 12 | 13 | find_library(Tcmalloc_LIBRARY NO_DEFAULT_PATH 14 | NAMES ${Tcmalloc_NAMES} 15 | PATHS ${HT_DEPENDENCY_LIB_DIR} /lib /usr/lib /usr/local/lib /opt/local/lib 16 | ) 17 | 18 | if (Tcmalloc_LIBRARY) 19 | set(Tcmalloc_FOUND TRUE) 20 | set( Tcmalloc_LIBRARIES ${Tcmalloc_LIBRARY} ) 21 | else () 22 | set(Tcmalloc_FOUND FALSE) 23 | set( Tcmalloc_LIBRARIES ) 24 | endif () 25 | 26 | if (Tcmalloc_FOUND) 27 | message(STATUS "Found Tcmalloc: ${Tcmalloc_LIBRARY}") 28 | else () 29 | message(STATUS "Not Found Tcmalloc: ${Tcmalloc_LIBRARY}") 30 | if (Tcmalloc_FIND_REQUIRED) 31 | message(STATUS "Looked for Tcmalloc libraries named ${Tcmalloc_NAMES}.") 32 | message(FATAL_ERROR "Could NOT find Tcmalloc library") 33 | endif () 34 | endif () 35 | 36 | mark_as_advanced( 37 | Tcmalloc_LIBRARY 38 | ) 39 | -------------------------------------------------------------------------------- /utils/py_word2vec_vob.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 
3 | * ===================================================================================== 4 | * Filename: py_word2vec_vob.cc 5 | * Author: Koth Chen 6 | * Create Time: 2016-07-25 18:46:27 7 | * Description: 8 | * 9 | */ 10 | #include "third_party/pybind11/pybind11.h" 11 | #include "third_party/pybind11/stl.h" 12 | #include "word2vec_vob.h" 13 | namespace py = pybind11; 14 | 15 | PYBIND11_PLUGIN(w2v) { 16 | py::module m("w2v", "python binding for word2vec vocab"); 17 | py::class_(m, "Word2vecVocab", "python class Word2vecVocab") 18 | .def(py::init()) 19 | .def("Load", &utils::Word2vecVocab::Load, "load word2vec from text file") 20 | .def("SetMapword", &utils::Word2vecVocab::SetMapword, "set whether map to word") 21 | .def("GetFeature", &utils::Word2vecVocab::GetFeatureOrEmpty, "get word embedding or empty if not exist") 22 | .def("GetTotalWord", &utils::Word2vecVocab::GetTotalWord, "get total words") 23 | .def("GetWordIndex", &utils::Word2vecVocab::GetWordIndex, "get word idx") 24 | .def("DumpBasicVocab", &utils::Word2vecVocab::DumpBasicVocab, "dump the word2vec vocab into basic mode"); 25 | return m.ptr(); 26 | } -------------------------------------------------------------------------------- /third_party/crow/tests/template/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import glob 4 | import json 5 | import os 6 | import subprocess 7 | for testfile in glob.glob("*.json"): 8 | testdoc = json.load(open(testfile)) 9 | for test in testdoc["tests"]: 10 | if "lambda" in test["data"]: 11 | continue 12 | open('data', 'w').write(json.dumps(test["data"])) 13 | open('template', 'w').write(test["template"]) 14 | if "partials" in test: 15 | open('partials', 'w').write(json.dumps(test["partials"])) 16 | else: 17 | open('partials', 'w').write("{}") 18 | ret = subprocess.check_output("./mustachetest").decode('utf8') 19 | print(testfile, test["name"]) 20 | if ret 
!= test["expected"]: 21 | if 'partials' in test: 22 | print('partials:', json.dumps(test["partials"])) 23 | print(json.dumps(test["data"])) 24 | print(test["template"]) 25 | print('Expected:',repr(test["expected"])) 26 | print('Actual:',repr(ret)) 27 | assert ret == test["expected"] 28 | os.unlink('data') 29 | os.unlink('template') 30 | os.unlink('partials') 31 | -------------------------------------------------------------------------------- /kcws/cc/sentence_breaker.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: sentence_breaker.h 5 | * Author: Koth 6 | * Create Time: 2016-11-23 21:54:41 7 | * Description: 8 | * 9 | */ 10 | #ifndef KCWS_SENTENCE_BREAKER_H_ 11 | #define KCWS_SENTENCE_BREAKER_H_ 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "utils/basic_string_util.h" 17 | namespace kcws { 18 | 19 | class SentenceBreaker { 20 | public: 21 | explicit SentenceBreaker(int maxLen); 22 | virtual ~SentenceBreaker(); 23 | bool breakSentences(const UnicodeStr& text, 24 | std::vector* lines); 25 | 26 | private: 27 | static char* kInlineMarks[]; 28 | static char* kBreakMarks[]; 29 | 30 | bool is_inline_mark(UnicodeCharT uch) ; 31 | bool is_break_mark(UnicodeCharT uch) ; 32 | 33 | std::unordered_map inline_marks_; 34 | std::unordered_set break_marks_; 35 | std::unordered_set inline_marks_set_; 36 | int max_len_; 37 | }; 38 | } // namespace kcws 39 | 40 | #endif // KCWS_SENTENCE_BREAKER_H_ 41 | -------------------------------------------------------------------------------- /third_party/python/semver/README.md: -------------------------------------------------------------------------------- 1 | Semver -- python module for semantic versioning 2 | =============================================== 3 | 4 | ![Travis 
CI](https://travis-ci.org/k-bx/python-semver.svg?branch=master) 5 | 6 | Simple module for comparing versions as noted at [semver.org](http://semver.org/). 7 | 8 | This module provides just a couple of functions, the main ones being: 9 | 10 | ```python 11 | >>> import semver 12 | >>> semver.compare("1.0.0", "2.0.0") 13 | -1 14 | >>> semver.compare("2.0.0", "1.0.0") 15 | 1 16 | >>> semver.compare("2.0.0", "2.0.0") 17 | 0 18 | >>> semver.match("2.0.0", ">=1.0.0") 19 | True 20 | >>> semver.match("1.0.0", ">1.0.0") 21 | False 22 | >>> semver.format_version(3, 4, 5, 'pre.2', 'build.4') 23 | '3.4.5-pre.2+build.4' 24 | >>> semver.bump_major("3.4.5") 25 | '4.0.0' 26 | >>> semver.bump_minor("3.4.5") 27 | '3.5.0' 28 | >>> semver.bump_patch("3.4.5") 29 | '3.4.6' 30 | >>> semver.max_ver("1.0.0", "2.0.0") 31 | '2.0.0' 32 | >>> semver.min_ver("1.0.0", "2.0.0") 33 | '1.0.0' 34 | ``` 35 | 36 | Installation 37 | ------------ 38 | 39 | For Python 2: 40 | 41 | ``` 42 | pip install semver 43 | ``` 44 | 45 | For Python 3: 46 | 47 | ``` 48 | pip3 install semver 49 | ``` 50 | 51 | Homepage at PyPi: https://pypi.python.org/pypi/semver 52 | -------------------------------------------------------------------------------- /kcws/train/sentence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | # File: sentence.py 5 | # Project: /e/code/kcws 6 | # Created: Thu Jul 27 2017 7 | # Author: Koth Chen 8 | # Copyright (c) 2017 Koth 9 | # 10 | # <> 11 | 12 | 13 | class Sentence: 14 | def __init__(self): 15 | self.tokens = [] 16 | self.chars = 0 17 | 18 | def addToken(self, t): 19 | self.chars += len(t) 20 | self.tokens.append(t) 21 | 22 | def clear(self): 23 | self.tokens = [] 24 | self.chars = 0 25 | 26 | # label -1, unknown 27 | # 0-> 'S' 28 | # 1-> 'B' 29 | # 2-> 'M' 30 | # 3-> 'E' 31 | def generate_tr_line(self, x, y, vob): 32 | for t in self.tokens: 33 | if len(t) == 1: 34 | 
x.append(vob.GetWordIndex(str(t[0].encode("utf8")))) 35 | y.append(0) 36 | else: 37 | nn = len(t) 38 | for i in range(nn): 39 | x.append(vob.GetWordIndex(str(t[i].encode("utf8")))) 40 | if i == 0: 41 | y.append(1) 42 | elif i == (nn - 1): 43 | y.append(3) 44 | else: 45 | y.append(2) 46 | -------------------------------------------------------------------------------- /third_party/word2vec/README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representation of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts. 5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and / or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 
19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /kcws/cc/test_breaker.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: test_breaker.cc 5 | * Author: Koth 6 | * Create Time: 2016-11-24 19:40:33 7 | * Description: 8 | * 9 | */ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "base/base.h" 19 | #include "utils/basic_string_util.h" 20 | 21 | #include "sentence_breaker.h" //NOLINT 22 | 23 | DEFINE_string(test_str, "", "the test string"); 24 | 25 | int main(int argc, char *argv[]) { 26 | FLAGS_v = 0; 27 | FLAGS_logtostderr = 1; 28 | base::Init(argc, argv); 29 | kcws::SentenceBreaker breaker(80); 30 | CHECK(!FLAGS_test_str.empty()) << "test string should be set"; 31 | UnicodeStr ustr; 32 | CHECK(BasicStringUtil::u8tou16(FLAGS_test_str.c_str(), FLAGS_test_str.size(), ustr)); 33 | std::vector results; 34 | CHECK(breaker.breakSentences(ustr, &results)) << "break error"; 35 | VLOG(0) << "results is :"; 36 | for (auto u : results) { 37 | std::string todo; 38 | CHECK(BasicStringUtil::u16tou8(u.c_str(), u.size(), todo)); 39 | VLOG(0) << todo; 40 | } 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /third_party/boost/boost.bzl: -------------------------------------------------------------------------------- 1 | include_pattern = "boost/%s/" 2 | hdrs_patterns = [ 3 | "boost/%s.h", 4 | "boost/%s.hpp", 5 | "boost/%s/**/*.hpp", 6 | "boost/%s/**/*.ipp", 7 | "boost/%s/**/*.h", 8 | "libs/%s/src/*.ipp", 9 | ] 10 | srcs_patterns = [ 11 | "libs/%s/src/*.cpp", 12 | "libs/%s/src/*.hpp", 13 | ] 14 | 15 | def 
srcs_list(library_name): 16 | return native.glob([p % (library_name,) for p in srcs_patterns]) 17 | 18 | def includes_list(library_name): 19 | return [".", include_pattern % library_name] 20 | 21 | def hdr_list(library_name): 22 | return native.glob([p % (library_name,) for p in hdrs_patterns]) 23 | 24 | def boost_library(name, defines=None, includes=None, hdrs=None, srcs=None, deps=None, copts=None): 25 | if defines == None: 26 | defines = [] 27 | 28 | if includes == None: 29 | includes = [] 30 | 31 | if hdrs == None: 32 | hdrs = [] 33 | 34 | if srcs == None: 35 | srcs = [] 36 | 37 | if deps == None: 38 | deps = [] 39 | 40 | if copts == None: 41 | copts = [] 42 | 43 | return native.cc_library( 44 | name = name, 45 | visibility = ["//visibility:public"], 46 | defines = defines, 47 | includes = includes_list(name) + includes, 48 | hdrs = hdr_list(name) + hdrs, 49 | srcs = srcs_list(name) + srcs, 50 | deps = deps, 51 | copts = copts, 52 | licenses = ["notice"], 53 | ) 54 | 55 | -------------------------------------------------------------------------------- /BUILD.boost: -------------------------------------------------------------------------------- 1 | # Description: 2 | # The Boost library collection (http://www.boost.org) 3 | # 4 | # Most Boost libraries are header-only, in which case you only need to depend 5 | # on :boost. If you need one of the libraries that has a separately-compiled 6 | # implementation, depend on the appropriate libs rule. 7 | 8 | package(default_visibility = ["//visibility:public"]) 9 | 10 | licenses(["notice"]) # Boost software license 11 | 12 | 13 | cc_library( 14 | name = "boost", 15 | hdrs = glob([ 16 | "boost/**/*.hpp", 17 | "boost/**/*.h", 18 | "boost/**/*.ipp", 19 | ]), 20 | includes = [ 21 | "." 
22 | ], 23 | ) 24 | 25 | cc_library( 26 | name = "filesystem", 27 | srcs = glob([ "libs/filesystem/src/*.cpp"]), 28 | deps = [ 29 | ":boost", 30 | ":system", 31 | ], 32 | ) 33 | 34 | cc_library( 35 | name = "iostreams", 36 | srcs = glob(["libs/iostreams/src/*.cpp"]), 37 | deps = [ 38 | ":boost", 39 | "@bzip2_archive//:bz2lib", 40 | "@zlib_archive//:zlib", 41 | ], 42 | ) 43 | 44 | cc_library( 45 | name = "program_options", 46 | srcs = glob([ "libs/program_options/src/*.cpp"]), 47 | deps = [ 48 | ":boost", 49 | ], 50 | ) 51 | 52 | cc_library( 53 | name = "system", 54 | srcs = glob(["libs/system/src/*.cpp"]), 55 | deps = [ 56 | ":boost", 57 | ], 58 | ) 59 | -------------------------------------------------------------------------------- /third_party/python/semver/setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from distutils.core import setup 4 | 5 | with open('README.md') as f: 6 | LONG_DESCRIPTION = f.read() 7 | 8 | setup( 9 | name='semver', 10 | version='2.4.1', 11 | description='Python package to work with Semantic Versioning (http://semver.org/)', 12 | long_description=LONG_DESCRIPTION, 13 | author='Konstantine Rybnikov', 14 | author_email='k-bx@k-bx.com', 15 | url='https://github.com/k-bx/python-semver', 16 | download_url='https://github.com/k-bx/python-semver/downloads', 17 | py_modules=['semver'], 18 | include_package_data=True, 19 | license='BSD', 20 | classifiers=[ 21 | 'Environment :: Web Environment', 22 | 'Framework :: Django', 23 | 'Intended Audience :: Developers', 24 | 'License :: OSI Approved :: BSD License', 25 | 'Operating System :: OS Independent', 26 | 'Programming Language :: Python', 27 | 'Programming Language :: Python :: 2', 28 | 'Programming Language :: Python :: 2.6', 29 | 'Programming Language :: Python :: 2.7', 30 | 'Programming Language :: Python :: 3', 31 | 'Programming Language :: Python :: 3.2', 32 | 'Programming Language :: Python :: 3.3', 33 | 'Programming 
Language :: Python :: 3.4', 34 | 'Topic :: Software Development :: Libraries :: Python Modules', 35 | ], 36 | ) 37 | -------------------------------------------------------------------------------- /third_party/crow/examples/example_chat.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 |
10 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- 1 | # Uncomment and update the paths in these entries to build the Android demo. 2 | #since support libraries are not published in Maven Central or jCenter, we'll have a local copy 3 | 4 | 5 | new_http_archive( 6 | name = "boost", 7 | urls = [ 8 | #"https://sourceforge.net/projects/boost/files/boost/1.61.0/boost_1_61_0.tar.bz2/download", 9 | "https://dl.bintray.com/boostorg/release/1.64.0/source/boost_1_64_0.tar.bz2", 10 | ], 11 | build_file = "BUILD.boost", 12 | type = "tar.bz2", 13 | strip_prefix = "boost_1_64_0/", 14 | sha256 = "7bcc5caace97baa948931d712ea5f37038dbb1c5d89b43ad4def4ed7cb683332", 15 | ) 16 | 17 | 18 | new_http_archive( 19 | name="tf", 20 | url = "https://gitlab.com/yovnchine/tfrelates/raw/master/tf_dist_1.2.0_rc1_0604.zip", 21 | strip_prefix = "tf_dist/", 22 | sha256 = "269115820a2ea4b7260f2ff131ed47860809e3ff05da763704a004724cea9775", 23 | build_file="BUILD.tf_dist", 24 | ) 25 | 26 | 27 | #new_local_repository( 28 | # name="tf", 29 | # path = "/e/code/tf_dist", 30 | # build_file="BUILD.tf_dist", 31 | #) 32 | 33 | 34 | http_archive( 35 | name = "protobuf", 36 | urls = [ 37 | "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz", 38 | ], 39 | sha256 = "94789497712726816f154f8441ed4319573c78c3f8cc6398bb00f464ffd82bd2", 40 | strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a", 41 | ) 42 | -------------------------------------------------------------------------------- /third_party/pybind11/complex.h: -------------------------------------------------------------------------------- 1 | /* 2 | pybind11/complex.h: Complex number support 3 | 4 | Copyright (c) 2016 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a 7 | BSD-style license that can be found in the LICENSE file. 
8 | */ 9 | 10 | #pragma once 11 | 12 | #include "pybind11.h" 13 | #include 14 | 15 | /// glibc defines I as a macro which breaks things, e.g., boost template names 16 | #ifdef I 17 | # undef I 18 | #endif 19 | 20 | NAMESPACE_BEGIN(pybind11) 21 | 22 | PYBIND11_DECL_FMT(std::complex, "Zf"); 23 | PYBIND11_DECL_FMT(std::complex, "Zd"); 24 | 25 | NAMESPACE_BEGIN(detail) 26 | template class type_caster> { 27 | public: 28 | bool load(handle src, bool) { 29 | if (!src) 30 | return false; 31 | Py_complex result = PyComplex_AsCComplex(src.ptr()); 32 | if (result.real == -1.0 && PyErr_Occurred()) { 33 | PyErr_Clear(); 34 | return false; 35 | } 36 | value = std::complex((T) result.real, (T) result.imag); 37 | return true; 38 | } 39 | 40 | static handle cast(const std::complex &src, return_value_policy /* policy */, handle /* parent */) { 41 | return PyComplex_FromDoubles((double) src.real(), (double) src.imag()); 42 | } 43 | 44 | PYBIND11_TYPE_CASTER(std::complex, _("complex")); 45 | }; 46 | NAMESPACE_END(detail) 47 | NAMESPACE_END(pybind11) 48 | -------------------------------------------------------------------------------- /third_party/crow/examples/websocket/example_ws.cpp: -------------------------------------------------------------------------------- 1 | #include "crow.h" 2 | #include 3 | #include 4 | 5 | 6 | int main() 7 | { 8 | crow::SimpleApp app; 9 | 10 | std::mutex mtx;; 11 | std::unordered_set users; 12 | 13 | CROW_ROUTE(app, "/ws") 14 | .websocket() 15 | .onopen([&](crow::websocket::connection& conn){ 16 | CROW_LOG_INFO << "new websocket connection"; 17 | std::lock_guard _(mtx); 18 | users.insert(&conn); 19 | }) 20 | .onclose([&](crow::websocket::connection& conn, const std::string& reason){ 21 | CROW_LOG_INFO << "websocket connection closed: " << reason; 22 | std::lock_guard _(mtx); 23 | users.erase(&conn); 24 | }) 25 | .onmessage([&](crow::websocket::connection& /*conn*/, const std::string& data, bool is_binary){ 26 | std::lock_guard _(mtx); 27 | for(auto 
u:users) 28 | if (is_binary) 29 | u->send_binary(data); 30 | else 31 | u->send_text(data); 32 | }); 33 | 34 | CROW_ROUTE(app, "/") 35 | ([]{ 36 | auto page = crow::mustache::load("ws.html"); 37 | return page.render(); 38 | }); 39 | 40 | app.port(40080) 41 | .multithreaded() 42 | .run(); 43 | } 44 | -------------------------------------------------------------------------------- /kcws/train/filter_sentence.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2016-11-16 22:46:50 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2016-11-21 22:40:47 6 | import sys 7 | import random 8 | 9 | 10 | def main(argc, argv): 11 | if argc < 2: 12 | print("Usage:%s " % (argv[0])) 13 | sys.exit(1) 14 | SENTENCE_LEN = 80 15 | fp = open(argv[1], "r") 16 | nl = 0 17 | bad = 0 18 | test = 0 19 | tr_p = open("train.txt", "w") 20 | te_p = open("test.txt", "w") 21 | while True: 22 | line = fp.readline() 23 | if not line: 24 | break 25 | line = line.strip() 26 | if not line: 27 | continue 28 | ss = line.split(' ') 29 | 30 | if len(ss) != (2 * SENTENCE_LEN): 31 | print("len is:%d" % (len(ss))) 32 | continue 33 | numV = 0 34 | for i in range(SENTENCE_LEN): 35 | if int(ss[i]) != 0: 36 | numV += 1 37 | if numV > 2: 38 | break 39 | if numV <= 2: 40 | bad += 1 41 | else: 42 | r = random.random() 43 | if r <= 0.02 and test < 8000: 44 | te_p.write("%s\n" % (line)) 45 | test += 1 46 | else: 47 | tr_p.write("%s\n" % (line)) 48 | nl += 1 49 | fp.close() 50 | print("got bad:%d" % (bad)) 51 | 52 | 53 | if __name__ == '__main__': 54 | main(len(sys.argv), sys.argv) 55 | -------------------------------------------------------------------------------- /third_party/crow/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, ipkn 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the author nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /third_party/crow/examples/example_test.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | assert "Hello World!" == urllib.urlopen('http://localhost:18080').read() 3 | assert "About Crow example." 
== urllib.urlopen('http://localhost:18080/about').read() 4 | assert 404 == urllib.urlopen('http://localhost:18080/list').getcode() 5 | assert "3 bottles of beer!" == urllib.urlopen('http://localhost:18080/hello/3').read() 6 | assert "100 bottles of beer!" == urllib.urlopen('http://localhost:18080/hello/100').read() 7 | assert 400 == urllib.urlopen('http://localhost:18080/hello/500').getcode() 8 | assert "3" == urllib.urlopen('http://localhost:18080/add_json', data='{"a":1,"b":2}').read() 9 | assert "3" == urllib.urlopen('http://localhost:18080/add/1/2').read() 10 | 11 | # test persistent connection 12 | import socket 13 | import time 14 | s = socket.socket() 15 | s.connect(('localhost', 18080)) 16 | for i in xrange(10): 17 | s.send('''GET / HTTP/1.1 18 | Host: localhost\r\n\r\n'''); 19 | assert 'Hello World!' in s.recv(1024) 20 | 21 | # test large 22 | s = socket.socket() 23 | s.connect(('localhost', 18080)) 24 | s.send('''GET /large HTTP/1.1 25 | Host: localhost\r\nConnection: close\r\n\r\n''') 26 | r = '' 27 | while True: 28 | d = s.recv(1024*1024) 29 | if not d: 30 | break; 31 | r += d 32 | print len(r), len(d) 33 | print len(r), r[:100] 34 | assert len(r) > 512*1024 35 | 36 | # test timeout 37 | s = socket.socket() 38 | s.connect(('localhost', 18080)) 39 | # invalid request, connection will be closed after timeout 40 | s.send('''GET / HTTP/1.1 41 | hHhHHefhwjkefhklwejfklwejf 42 | ''') 43 | print s.recv(1024) 44 | 45 | -------------------------------------------------------------------------------- /kcws/train/merge_vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2016-12-02 13:02:30 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2016-12-02 13:35:42 6 | import sys 7 | 8 | 9 | def main(argc, argv): 10 | if argc < 3: 11 | print("Usage:%s " % (argv[0])) 12 | sys.exit(1) 13 | inwp = open(argv[1], "r") 14 | ingp = open(argv[2], "r") 15 | oup = 
def main(argc, argv):
    """Hold out a fixed-size random sample per class as the test set.

    argv[1] is a text file with one sample per line, each made of exactly six
    space-separated fields; the sixth field is an integer label. Samples with
    label 0 and samples with any other label are sampled independently.

    For each class up to sampleNum lines (default 5000, overridable through
    argv[2]) are kept in a reservoir. Once a reservoir is full, every further
    line of that class either evicts a random reservoir entry (the evicted
    line goes to train.txt) or goes to train.txt itself. When the input is
    exhausted the reservoirs are written to test.txt, non-zero-label class
    first.

    Exits with status 1 after printing the usage line when no input path is
    given. Raises AssertionError on a line that does not have six fields.
    """
    if argc < 2:
        print("Usage: %s " % (argv[0]))
        sys.exit(1)
    # Parse the optional size before touching any file so that a bad value
    # does not truncate existing train.txt / test.txt.
    sampleNum = 5000
    if argc > 2:
        sampleNum = int(argv[2])

    def offer(pool, seen, line, trp):
        """Add `line` to `pool`; `seen` counts lines of this class so far."""
        if len(pool) < sampleNum:
            pool.append(line)
            return
        k = random.randint(0, seen - 1)
        if k < sampleNum:
            # The newcomer replaces a random resident, which moves to train.
            trp.write("%s\n" % (pool[k]))
            pool[k] = line
        else:
            trp.write("%s\n" % (line))

    allf = []  # reservoir for label == 0
    allp = []  # reservoir for label != 0
    nf = 0
    n_pos = 0
    # Context managers make sure all three files are closed and flushed.
    with open(argv[1], "r") as inp, \
            open("train.txt", "w") as trp, \
            open("test.txt", "w") as tep:
        for line in inp:
            line = line.strip()
            if not line:
                continue
            ss = line.split(" ")
            assert (len(ss) == 6)
            if int(ss[5]) == 0:
                nf += 1
                offer(allf, nf, line, trp)
            else:
                n_pos += 1
                offer(allp, n_pos, line, trp)
        for s in allp:
            tep.write("%s\n" % (s))
        for s in allf:
            tep.write("%s\n" % (s))
8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | 15 | #if defined(__GNUG__) 16 | #include 17 | #endif 18 | 19 | NAMESPACE_BEGIN(pybind11) 20 | NAMESPACE_BEGIN(detail) 21 | /// Erase all occurrences of a substring 22 | inline void erase_all(std::string &string, const std::string &search) { 23 | for (size_t pos = 0;;) { 24 | pos = string.find(search, pos); 25 | if (pos == std::string::npos) break; 26 | string.erase(pos, search.length()); 27 | } 28 | } 29 | 30 | PYBIND11_NOINLINE inline void clean_type_id(std::string &name) { 31 | #if defined(__GNUG__) 32 | int status = 0; 33 | std::unique_ptr res { 34 | abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), std::free }; 35 | if (status == 0) 36 | name = res.get(); 37 | #else 38 | detail::erase_all(name, "class "); 39 | detail::erase_all(name, "struct "); 40 | detail::erase_all(name, "enum "); 41 | #endif 42 | detail::erase_all(name, "pybind11::"); 43 | } 44 | NAMESPACE_END(detail) 45 | 46 | /// Return a string representation of a C++ type 47 | template static std::string type_id() { 48 | std::string name(typeid(T).name()); 49 | detail::clean_type_id(name); 50 | return name; 51 | } 52 | 53 | NAMESPACE_END(pybind11) 54 | -------------------------------------------------------------------------------- /third_party/crow/amalgamate/merge_all.py: -------------------------------------------------------------------------------- 1 | """Merges all the header files.""" 2 | from glob import glob 3 | from os import path as pt 4 | import re 5 | from collections import defaultdict 6 | import sys 7 | 8 | header_path = "../include" 9 | if len(sys.argv) > 1: 10 | header_path = sys.argv[1] 11 | 12 | OUTPUT = 'crow_all.h' 13 | re_depends = re.compile('^#include "(.*)"', re.MULTILINE) 14 | headers = [x.rsplit('/', 1)[-1] for x in glob(pt.join(header_path, '*.h*'))] 15 | headers += ['crow/' + x.rsplit('/', 1)[-1] for x in glob(pt.join(header_path, 'crow/*.h*'))] 16 | print(headers) 17 | edges = defaultdict(list) 
def dfs(x):
    """Post-order walk from header `x` over the `edges` graph.

    Marks `x` in the module-level `visited` table, recurses into every
    not-yet-visited header listed in `edges[x]`, and finally appends `x` to
    the module-level `order` list.
    """
    visited[x] = True
    # Lazy on purpose: each dependent is re-checked against `visited` right
    # before it would be entered, after earlier recursive calls have finished.
    unvisited = (dependent for dependent in edges[x] if not visited[dependent])
    for dependent in unvisited:
        dfs(dependent)
    order.append(x)
| "CPPFLAGS=\"-D_GLIBCXX_USE_CXX11_ABI=0\" ./configure --prefix=$$INSTALL_DIR --with-pic=yes --enable-shared=no", 35 | "make install", 36 | "rm -rf $$TMP_DIR", 37 | ]), 38 | ) 39 | 40 | cc_library( 41 | name = "gflags-cxx", 42 | srcs = [ 43 | "empty.cc", 44 | "include/gflags/gflags_declare.h", 45 | "lib/libgflags.a", 46 | ], 47 | hdrs = [ 48 | "include/gflags/gflags.h", 49 | ], 50 | includes = [ 51 | "include", 52 | ], 53 | # linkstatic = 1, 54 | ) 55 | 56 | filegroup( 57 | name = "gflags", 58 | srcs = [ 59 | ":gflags-cxx", 60 | ], 61 | ) 62 | -------------------------------------------------------------------------------- /kcws/cc/test_ac_scanner.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: test_ac_scanner.cc 5 | * Author: Koth 6 | * Create Time: 2016-12-09 17:02:56 7 | * Description: 8 | * 9 | */ 10 | #include 11 | 12 | #include "base/base.h" 13 | #include "kcws/cc/ac_scanner.h" 14 | #include "utils/basic_string_util.h" 15 | 16 | DEFINE_string(test_string, "挑战中共创辉煌国际", "the test string"); 17 | class TestScanReporter: public ScanReporter { 18 | public: 19 | bool callback(uint32_t pos, uint32_t& data, size_t len) override { 20 | VLOG(0) << "got data:" << data << ",at pos:" << pos << ",len:" << len; 21 | return false; 22 | } 23 | }; 24 | int main(int argc, char* argv[]) { 25 | FLAGS_v = 0; 26 | FLAGS_logtostderr = true; 27 | base::Init(argc, argv); 28 | AcScanner ac_scanner; 29 | const char* dicts[] = { 30 | "中共", 31 | "共创", 32 | "挑战", 33 | "辉煌", 34 | "辉煌国际" 35 | }; 36 | for (size_t i = 0; i < sizeof(dicts) / sizeof(char*); i++) { 37 | UnicodeStr ustr; 38 | BasicStringUtil::u8tou16(dicts[i], strlen(dicts[i]), ustr); 39 | ac_scanner.pushNode(ustr, i); 40 | } 41 | ac_scanner.buildFailNode(); 42 | VLOG(0) << "total node:" << ac_scanner.NumItem(); 43 | UnicodeStr 
testu; 44 | TestScanReporter reporter; 45 | BasicStringUtil::u8tou16(FLAGS_test_string.c_str(), FLAGS_test_string.size(), testu); 46 | VLOG(0) << "test string len:" << testu.size(); 47 | bool ret = ac_scanner.doScan(testu, &reporter); 48 | VLOG(0) << "scan return:" << ret; 49 | return 0; 50 | } -------------------------------------------------------------------------------- /kcws/cc/pos_tagger.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: pos_tagger.h 5 | * Author: Koth 6 | * Create Time: 2017-02-01 14:02:35 7 | * Description: 8 | * 9 | */ 10 | #ifndef KCWS_CC_POS_TAGGER_H_ 11 | #define KCWS_CC_POS_TAGGER_H_ 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "utils/basic_string_util.h" 17 | #include "utils/basic_vocab.h" 18 | namespace tf { 19 | class TfModel; 20 | } // namespace tf 21 | namespace kcws { 22 | struct WordInfo { 23 | UnicodeCharT chars[5]; 24 | int idx; 25 | }; 26 | class PosTagger { 27 | public: 28 | PosTagger(); 29 | virtual ~PosTagger(); 30 | 31 | bool LoadModel(const std::string& modelPath, 32 | const std::string& wordVocabPath, 33 | const std::string& charVocabPath, 34 | const std::string& tagVocabPath, 35 | int maxSentenceLen); 36 | bool Tag(const std::vector>& sentences, 37 | std::vector>& tags); 38 | void BuildWordInfo(const std::string& str, WordInfo& word); 39 | private: 40 | std::unique_ptr model_; 41 | std::unordered_map char_vocab_; 42 | std::unordered_map word_vocab_; 43 | std::unordered_map tag_vocab_; 44 | int max_sentence_len_; 45 | int num_tags_; 46 | std::vector> transitions_; 47 | int** bp_; 48 | float** scores_; 49 | }; 50 | 51 | } // namespace kcws 52 | #endif // KCWS_CC_POS_TAGGER_H_ 53 | -------------------------------------------------------------------------------- /third_party/glog/BUILD: 
-------------------------------------------------------------------------------- 1 | licenses(["notice"]) 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | package_version = "0.3.4" 6 | 7 | package_file = "glog-" + package_version + ".tar.gz" 8 | 9 | package_dir = "glog-" + package_version 10 | 11 | genrule( 12 | name = "glog-srcs", 13 | srcs = [ 14 | package_file, 15 | ], 16 | outs = [ 17 | "include/glog/log_severity.h", 18 | "include/glog/logging.h", 19 | "include/glog/raw_logging.h", 20 | "include/glog/stl_logging.h", 21 | "include/glog/vlog_is_on.h", 22 | "lib/libglog.a", 23 | ], 24 | cmd = "\n".join([ 25 | "export INSTALL_DIR=$$(pwd)/$(@D)", 26 | "export TMP_DIR=$$(mktemp -d -t glog.XXXXX)", 27 | "mkdir -p $$TMP_DIR", 28 | "cp -R $(SRCS) $$TMP_DIR", 29 | "cd $$TMP_DIR", 30 | "tar xfz " + package_file, 31 | "cd " + package_dir, 32 | "CPPFLAGS=\"-D_GLIBCXX_USE_CXX11_ABI=0\" ./configure --prefix=$$INSTALL_DIR --enable-shared=no --with-pic=yes", 33 | "make install", 34 | "rm -rf $$TMP_DIR", 35 | ]), 36 | ) 37 | 38 | cc_library( 39 | name = "glog-cxx", 40 | srcs = [ 41 | "empty.cc", 42 | "include/glog/log_severity.h", 43 | "include/glog/raw_logging.h", 44 | "include/glog/stl_logging.h", 45 | "include/glog/vlog_is_on.h", 46 | "lib/libglog.a", 47 | ], 48 | hdrs = [ 49 | "include/glog/logging.h", 50 | ], 51 | includes = [ 52 | "include", 53 | ], 54 | # linkstatic = 1, 55 | deps = [ 56 | "//third_party/gflags:gflags-cxx", 57 | ], 58 | ) 59 | 60 | filegroup( 61 | name = "glog", 62 | srcs = [ 63 | ":glog-cxx", 64 | ], 65 | ) 66 | -------------------------------------------------------------------------------- /third_party/pybind11/functional.h: -------------------------------------------------------------------------------- 1 | /* 2 | pybind11/functional.h: std::function<> support 3 | 4 | Copyright (c) 2016 Wenzel Jakob 5 | 6 | All rights reserved. Use of this source code is governed by a 7 | BSD-style license that can be found in the LICENSE file. 
8 | */ 9 | 10 | #pragma once 11 | 12 | #include "pybind11.h" 13 | #include 14 | 15 | NAMESPACE_BEGIN(pybind11) 16 | NAMESPACE_BEGIN(detail) 17 | 18 | template struct type_caster> { 19 | typedef std::function type; 20 | typedef typename std::conditional::value, void_type, Return>::type retval_type; 21 | public: 22 | bool load(handle src_, bool) { 23 | src_ = detail::get_function(src_); 24 | if (!src_ || !PyCallable_Check(src_.ptr())) 25 | return false; 26 | object src(src_, true); 27 | value = [src](Args... args) -> Return { 28 | gil_scoped_acquire acq; 29 | object retval(src(std::move(args)...)); 30 | /* Visual studio 2015 parser issue: need parentheses around this expression */ 31 | return (retval.template cast()); 32 | }; 33 | return true; 34 | } 35 | 36 | template 37 | static handle cast(Func &&f_, return_value_policy policy, handle /* parent */) { 38 | return cpp_function(std::forward(f_), policy).release(); 39 | } 40 | 41 | PYBIND11_TYPE_CASTER(type, _("function<") + 42 | type_caster>::name() + _(" -> ") + 43 | type_caster::name() + 44 | _(">")); 45 | }; 46 | 47 | NAMESPACE_END(detail) 48 | NAMESPACE_END(pybind11) 49 | -------------------------------------------------------------------------------- /utils/BUILD: -------------------------------------------------------------------------------- 1 | cc_library( 2 | name = "basic_string_util", 3 | srcs = [ 4 | "basic_string_util.h", 5 | ], 6 | visibility = ["//visibility:public"], 7 | ) 8 | 9 | cc_library( 10 | name = "jsonxx", 11 | srcs = [ 12 | "json_util.h", 13 | "jsonxx.cc", 14 | "jsonxx.h", 15 | ], 16 | hdrs = [ 17 | "json_util.h", 18 | "jsonxx.h", 19 | ], 20 | linkstatic = 1, 21 | visibility = ["//visibility:public"], 22 | alwayslink = 1, 23 | ) 24 | 25 | cc_library( 26 | name = "word2vec_vob", 27 | srcs = [ 28 | "word2vec_vob.cc", 29 | ], 30 | hdrs = [ 31 | "vocab.h", 32 | "word2vec_vob.h", 33 | ], 34 | copts = [ 35 | "-g", 36 | "-O3", 37 | "-std=c++11", 38 | ], 39 | visibility = ["//visibility:public"], 40 
| deps = [ 41 | ":basic_string_util", 42 | "//base", 43 | # '@re2//:re2', 44 | ], 45 | ) 46 | 47 | cc_binary( 48 | name = "w2v.so", 49 | srcs = glob([ 50 | "py_word2vec_vob.cc", 51 | ]), 52 | copts = [ 53 | "-std=c++11", 54 | "-fPIC", 55 | ], 56 | linkshared = 1, 57 | visibility = ["//visibility:public"], 58 | deps = [ 59 | ":word2vec_vob", 60 | "//base", 61 | "//third_party/pybind11", 62 | ], 63 | ) 64 | 65 | cc_library( 66 | name = "basic_vocab", 67 | srcs = [ 68 | "basic_vocab.cc", 69 | ], 70 | hdrs = [ 71 | "basic_vocab.h", 72 | "vocab.h", 73 | ], 74 | copts = [ 75 | "-g", 76 | "-O3", 77 | "-std=c++11", 78 | ], 79 | visibility = ["//visibility:public"], 80 | deps = [ 81 | ":basic_string_util", 82 | "//base", 83 | ], 84 | ) 85 | -------------------------------------------------------------------------------- /third_party/crow/include/crow/http_request.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "third_party/crow/include/crow/common.h" 6 | #include "third_party/crow/include/crow/ci_map.h" 7 | #include "third_party/crow/include/crow/query_string.h" 8 | 9 | namespace crow { 10 | template 11 | inline const std::string& get_header_value(const T& headers, const std::string& key) { 12 | if (headers.count(key)) { 13 | return headers.find(key)->second; 14 | } 15 | static std::string empty; 16 | return empty; 17 | } 18 | 19 | struct DetachHelper; 20 | 21 | struct request { 22 | HTTPMethod method; 23 | std::string raw_url; 24 | std::string url; 25 | query_string url_params; 26 | ci_map headers; 27 | std::string body; 28 | 29 | void* middleware_context{}; 30 | boost::asio::io_service* io_service{}; 31 | 32 | request() 33 | : method(HTTPMethod::Get) { 34 | } 35 | 36 | request(HTTPMethod method, std::string raw_url, std::string url, query_string url_params, ci_map headers, std::string body) 37 | : method(method), raw_url(std::move(raw_url)), url(std::move(url)), 
url_params(std::move(url_params)), headers(std::move(headers)), body(std::move(body)) { 38 | } 39 | 40 | void add_header(std::string key, std::string value) { 41 | headers.emplace(std::move(key), std::move(value)); 42 | } 43 | 44 | const std::string& get_header_value(const std::string& key) const { 45 | return crow::get_header_value(headers, key); 46 | } 47 | 48 | template 49 | void post(CompletionHandler handler) { 50 | io_service->post(handler); 51 | } 52 | 53 | template 54 | void dispatch(CompletionHandler handler) { 55 | io_service->dispatch(handler); 56 | } 57 | 58 | }; 59 | } 60 | -------------------------------------------------------------------------------- /utils/word2vec_vob.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Rongall.com. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: word2vec_vob.h 5 | * Description: description 6 | * Author: Koth(Yaowen Chen) 7 | * 8 | */ 9 | #ifndef UTILS_WORD2VEC_VOB_H_ 10 | #define UTILS_WORD2VEC_VOB_H_ 11 | #include 12 | #include 13 | #include 14 | 15 | #include "utils/vocab.h" 16 | 17 | namespace utils { 18 | struct WV; 19 | class Word2vecVocab: public Vocab { 20 | public: 21 | enum OOV_OPT { 22 | USE_BLANK = 0, 23 | USE_OOV = 1, 24 | USE_RANDOM = 2, 25 | USE_ONE_RANDOM = 3, 26 | }; 27 | Word2vecVocab(): f_dim_(0), avg_vals_(NULL), std_vals_(NULL), map_word_(false) {} 28 | virtual ~Word2vecVocab() { 29 | if (avg_vals_) { 30 | delete[] avg_vals_; 31 | } 32 | if (std_vals_) { 33 | delete[] std_vals_; 34 | } 35 | avg_vals_ = std_vals_ = NULL; 36 | } 37 | bool Load(const std::string& path) override; 38 | int GetVectorDim()const { 39 | return f_dim_; 40 | } 41 | void SetMapword(bool mapword); 42 | bool GetMapword(); 43 | bool GetVector(const std::string& word, std::vector** vec, OOV_OPT opt = USE_BLANK); 44 | std::vector GetFeatureOrEmpty(const std::string& word); 45 | int 
GetWordIndex(const std::string& word) override; 46 | int GetTotalWord() override; 47 | bool DumpBasicVocab(const std::string& path); 48 | 49 | private: 50 | struct WV { 51 | std::vector vect; 52 | int idx; 53 | }; 54 | std::unordered_map f_map_; 55 | int f_dim_; 56 | float* avg_vals_; 57 | float* std_vals_; 58 | std::vector oov_feature_; 59 | bool map_word_; 60 | }; 61 | } // namespace utils 62 | #endif // UTILS_WORD2VEC_VOB_H_ 63 | -------------------------------------------------------------------------------- /kcws/train/generate_char_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2016-11-30 19:59:15 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2016-11-30 20:58:29 6 | import sys 7 | import w2v 8 | 9 | SEQ_LEN = 5 10 | 11 | 12 | def processFile(inp, oup, vob): 13 | global SEQ_LEN 14 | while True: 15 | line = inp.readline() 16 | if not line: 17 | break 18 | line = line.strip() 19 | if not line: 20 | continue 21 | ss = line.split(" ") 22 | x = [] 23 | y = [] 24 | for s in ss: 25 | ustr = unicode(s.decode("utf-8")) 26 | if len(ustr) < 1: 27 | continue 28 | nn = len(ustr) 29 | for i in range(nn): 30 | theStr = str(ustr[i].encode("utf8")) 31 | x.append(str(vob.GetWordIndex(theStr))) 32 | if i == (nn - 1): 33 | y.append(1) 34 | else: 35 | y.append(0) 36 | nn = len(x) 37 | for i in range(nn): 38 | seqLen = SEQ_LEN 39 | if y[i] == 1: 40 | seqLen = 2 41 | hasStop = (y[i] == 1) 42 | for j in range(1, seqLen): 43 | if (i + j + 1) > nn: 44 | continue 45 | newX = x[i:i + j + 1] 46 | for k in range(j + 1, SEQ_LEN): 47 | newX.append("0") 48 | newY = 0 49 | if y[i + j] == 1: 50 | if not hasStop: 51 | newY = 1 52 | hasStop = True 53 | line = " ".join(newX) 54 | line += " " + str(newY) 55 | oup.write("%s\n" % (line)) 56 | 57 | 58 | def main(argc, argv): 59 | if argc < 4: 60 | print("Usage: %s " % (argv[0])) 61 | sys.exit(1) 62 | vob = w2v.Word2vecVocab() 63 | 
vob.Load(argv[3]) 64 | inp = open(argv[1], "r") 65 | oup = open(argv[2], "w") 66 | processFile(inp, oup, vob) 67 | 68 | 69 | if __name__ == '__main__': 70 | main(len(sys.argv), sys.argv) -------------------------------------------------------------------------------- /pos_train.md: -------------------------------------------------------------------------------- 1 | ### 词性标注训练过程 2 | 3 | 4 | - 1)准备单词word2vec训练样本 5 | 6 | 7 | ``` 8 |   python kcws/train/prepare_pos.py /e/data/people_2014 pos_lines.txt 9 | ``` 10 | 11 | 12 | 13 | - 2)使用word2vec导出即将使用的词词典 14 | 15 | ``` 16 | bazel build -c opt third_party/word2vec:word2vec 17 | bazel-bin/third_party/word2vec/word2vec -train pos_lines.txt -min-count 5 -save-vocab pre_word_vec.txt 18 | ``` 19 | - 3)替换单词中的UNK 20 | 21 | 22 | ``` 23 | python kcws/train/replace_unk.py pre_word_vec.txt pos_lines.txt pos_lines_with_unk.txt 24 | ``` 25 | 26 | - 4)训练词向量 27 | 28 | ``` 29 | bazel-bin/third_party/word2vec/word2vec -train pos_lines_with_unk.txt -output word_vec.txt -size 150 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 0 -iter 3 -min-count 5 -hs 1 30 | ``` 31 | 32 | - 5)统计词性tag出现频次,生成词性tag集合 33 | 34 | ``` 35 | python kcws/train/stats_pos.py /e/data/people_2014 pos_vocab.txt lines_withpos.txt 36 | ``` 37 | 38 | - 6)生成训练样本 39 | 40 | ``` 41 | bazel build -c opt kcws/train:generate_pos_train 42 | ``` 43 | 44 | 45 | ``` 46 | bazel-bin/kcws/train/generate_pos_train word_vec.txt char_vec.txt pos_vocab.txt /e/data/people_2014 pos_train.txt 47 | ``` 48 | 49 | 以上char_vec.txt可使用分词中相同的文件 50 | 51 | 52 | 53 | - 7)去重,乱序,分开训练集,测试集 54 | 55 | 56 | 57 | ``` 58 | sort -u pos_train.txt>pos_train.u 59 | shuf pos_train.u >pos_train.txt 60 | head -n 230000 pos_train.txt >train.txt 61 | tail -n 51362 pos_train.txt >test.txt 62 | ``` 63 | 64 | - 8)训练 65 | 66 | ``` 67 | python kcws/train/train_pos.py --train_data_path train.txt --test_data_path test.txt --log_dir pos_logs --word_word2vec_path word_vec.txt --char_word2vec_path char_vec.txt 68 | ``` 69 
| 70 | 71 | - 9)模型导出 72 | 73 | ``` 74 | python tools/freeze_graph.py --input_graph pos_logs/graph.pbtxt --input_checkpoint pos_logs/model.ckpt --output_node_names "transitions,Reshape_9" --output_graph kcws/models/pos_model.pbtxt 75 | ``` 76 | -------------------------------------------------------------------------------- /kcws/cc/tf_seg_model.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: tf_seg_model.h 5 | * Author: Koth 6 | * Create Time: 2016-11-20 10:31:03 7 | * Description: 8 | * 9 | */ 10 | #ifndef KCWS_TF_SEG_MODEL_H_ 11 | #define KCWS_TF_SEG_MODEL_H_ 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "utils/basic_string_util.h" 17 | #include "utils/basic_vocab.h" 18 | #include "kcws/cc/ac_scanner.h" 19 | namespace tf { 20 | class TfModel; 21 | } // namespace tf 22 | namespace kcws { 23 | typedef std::pair SegTok; 24 | class SentenceBreaker; 25 | class PosTagger; 26 | class TfSegModel { 27 | public: 28 | TfSegModel(); 29 | virtual ~TfSegModel(); 30 | 31 | bool LoadModel(const std::string& modelPath, 32 | const std::string& vocabPath, 33 | int maxSentenceLen, 34 | const std::string& userDictPath = std::string()); 35 | bool Segment(const std::string& sentence, 36 | std::vector* pTopResult, 37 | std::vector* posTaggs = nullptr); 38 | bool Segment(const std::vector& sentences, 39 | std::vector>* pTopKResults); 40 | void SetPosTagger(PosTagger* tagger); 41 | private: 42 | bool loadUserDict(const std::string& userDictPath); 43 | std::unique_ptr model_; 44 | std::unique_ptr tagger_; 45 | std::unordered_map vocab_; 46 | std::unique_ptr breaker_; 47 | int max_sentence_len_; 48 | int num_words_; 49 | int num_tags_; 50 | std::vector> transitions_; 51 | int** bp_; 52 | float** scores_; 53 | AcScanner scanner_; 54 | }; 55 | 56 | } // namespace kcws 
57 | 58 | #endif // KCWS_TF_SEG_MODEL_H_ -------------------------------------------------------------------------------- /third_party/crow/include/crow/dumb_timer_queue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "third_party/crow/include/crow/logging.h" 10 | 11 | namespace crow { 12 | namespace detail { 13 | // fast timer queue for fixed tick value. 14 | class dumb_timer_queue { 15 | public: 16 | using key = std::pair; 17 | 18 | void cancel(key& k) { 19 | auto self = k.first; 20 | k.first = nullptr; 21 | if (!self) 22 | return; 23 | 24 | unsigned int index = (unsigned int)(k.second - self->step_); 25 | if (index < self->dq_.size()) 26 | self->dq_[index].second = nullptr; 27 | } 28 | 29 | key add(std::function f) { 30 | dq_.emplace_back(std::chrono::steady_clock::now(), std::move(f)); 31 | int ret = step_ + dq_.size() - 1; 32 | 33 | CROW_LOG_DEBUG << "timer add inside: " << this << ' ' << ret ; 34 | return {this, ret}; 35 | } 36 | 37 | void process() { 38 | if (!io_service_) 39 | return; 40 | 41 | auto now = std::chrono::steady_clock::now(); 42 | while (!dq_.empty()) { 43 | auto& x = dq_.front(); 44 | if (now - x.first < std::chrono::seconds(tick)) 45 | break; 46 | if (x.second) { 47 | CROW_LOG_DEBUG << "timer call: " << this << ' ' << step_; 48 | // we know that timer handlers are very simple currenty; call here 49 | x.second(); 50 | } 51 | dq_.pop_front(); 52 | step_++; 53 | } 54 | } 55 | 56 | void set_io_service(boost::asio::io_service& io_service) { 57 | io_service_ = &io_service; 58 | } 59 | 60 | dumb_timer_queue() noexcept { 61 | } 62 | 63 | private: 64 | 65 | int tick{5}; 66 | boost::asio::io_service* io_service_{}; 67 | std::deque>> dq_; 68 | int step_{}; 69 | }; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /kcws/cc/demo.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | DEMO 6 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 55 | 61 | 62 | 63 | 64 | 65 | 68 | 69 | 70 |
53 | 样例数据(JSON): 54 |

66 | API返回结果: 67 |
71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /utils/basic_vocab.cc: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include "base/base.h" 6 | #include "basic_string_util.h" 7 | #include "basic_vocab.h" 8 | namespace utils { 9 | 10 | namespace { 11 | static std::string map_word(const std::string& word) { 12 | return word; 13 | } 14 | } // namespace 15 | bool BasicVocab::Load(const std::string& path) { 16 | FILE *fp = fopen(path.c_str(), "r"); 17 | if (fp == NULL) { 18 | fprintf(stderr, "open file error:%s\n", path.c_str()); 19 | return false; 20 | } 21 | char line[4096] = {0}; 22 | int tn = 0; 23 | while (fgets(line, sizeof(line) - 1, fp)) { 24 | int nn = strlen(line); 25 | while (nn && (line[nn - 1] == '\n' || line[nn - 1] == '\r')) { 26 | nn -= 1; 27 | } 28 | if (nn <= 0) { 29 | continue; 30 | } 31 | std::vector terms; 32 | BasicStringUtil::SplitString(line, nn, '\t', &terms); 33 | nn = terms.size(); 34 | if (nn != 2) { 35 | fprintf(stderr, "line len not comformed to dimension:%s:%d\n", line, nn); 36 | return false; 37 | } 38 | const std::string& word = terms[0]; 39 | if (w_map_.find(word) != w_map_.end()) { 40 | fprintf(stderr, "duplicate word:%s\n", word.c_str()); 41 | return false; 42 | } 43 | int idx = atoi(terms[1].c_str()); 44 | w_map_[word] = idx; 45 | tn += 1; 46 | } 47 | fclose(fp); 48 | return true; 49 | } 50 | int BasicVocab::GetWordIndex(const std::string& word) { 51 | auto it = w_map_.find(word); 52 | if ( it != w_map_.end()) { 53 | return it->second; 54 | } else { 55 | if (!use_map_)return 0; 56 | std::string mword = map_word(word); 57 | it = w_map_.find(mword); 58 | if (it != w_map_.end()) { 59 | return it->second; 60 | } else { 61 | VLOG(0) << "not found map word:" << mword; 62 | return 0; 63 | } 64 | } 65 | } 66 | 67 | 68 | int BasicVocab::GetTotalWord() { 69 | return w_map_.size(); 70 | } 71 | 72 
| 73 | } // namespace utils 74 | -------------------------------------------------------------------------------- /third_party/crow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project (crow_all) 3 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") 4 | find_package(Tcmalloc) 5 | find_package(Threads) 6 | find_package(OpenSSL) 7 | if(OPENSSL_FOUND) 8 | include_directories(${OPENSSL_INCLUDE_DIR}) 9 | endif() 10 | 11 | if (NOT CMAKE_BUILD_TYPE) 12 | message(STATUS "No build type selected, default to Release") 13 | set(CMAKE_BUILD_TYPE "Release") 14 | endif() 15 | 16 | 17 | if (MSVC) 18 | set(Boost_USE_STATIC_LIBS "On") 19 | find_package( Boost 1.52 COMPONENTS system thread regex REQUIRED ) 20 | else() 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++1y -pedantic -Wextra") 22 | find_package( Boost 1.52 COMPONENTS system thread REQUIRED ) 23 | endif() 24 | 25 | include_directories( ${Boost_INCLUDE_DIR} ) 26 | 27 | set(PROJECT_INCLUDE_DIR 28 | ${PROJECT_SOURCE_DIR}/include 29 | ) 30 | 31 | include_directories("${PROJECT_INCLUDE_DIR}") 32 | include_directories("${PROJECT_SOURCE_DIR}") 33 | 34 | #add_subdirectory(src) 35 | add_subdirectory(examples) 36 | if (MSVC) 37 | else() 38 | add_subdirectory(tests) 39 | 40 | enable_testing() 41 | add_test(NAME crow_test COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tests/unittest) 42 | add_test(NAME template_test COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tests/template/test.py WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests/template) 43 | 44 | file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/amalgamate) 45 | 46 | add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/amalgamate/crow_all.h 47 | COMMAND python ${PROJECT_SOURCE_DIR}/amalgamate/merge_all.py ${PROJECT_SOURCE_DIR}/include 48 | COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/amalgamate/crow_all.h ${PROJECT_SOURCE_DIR}/amalgamate 49 | 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/amalgamate 50 | DEPENDS ${PROJECT_SOURCE_DIR}/include/*.h 51 | ) 52 | 53 | add_custom_target(amalgamation ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/amalgamate/crow_all.h) 54 | endif() 55 | -------------------------------------------------------------------------------- /kcws/cc/viterbi_decode.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: viterbi_decode.cc 5 | * Author: Koth 6 | * Create Time: 2017-02-01 13:47:48 7 | * Description: 8 | * 9 | */ 10 | #include "kcws/cc/viterbi_decode.h" 11 | 12 | namespace kcws { 13 | 14 | int viterbi_decode( 15 | const Eigen::TensorMap, Eigen::Aligned>& predictions, 16 | int sentenceIdx, 17 | int nn, 18 | const std::vector>& trans, 19 | int** bp, 20 | float** scores, 21 | int ntags) { 22 | for (int i = 0; i < ntags; i++) { 23 | scores[0][i] = predictions(sentenceIdx, 0, i); 24 | } 25 | for (int i = 1; i < nn; i++) { 26 | for (int t = 0; t < ntags; t++) { 27 | float maxScore = -1e7; 28 | float emission = predictions(sentenceIdx, i, t); 29 | for (int prev = 0; prev < ntags; prev++) { 30 | float score = scores[(i - 1) % 2][prev] + trans[prev][t] + emission; 31 | if (score > maxScore) { 32 | maxScore = score; 33 | bp[i - 1][t] = prev; 34 | } 35 | } 36 | scores[i % 2][t] = maxScore; 37 | } 38 | } 39 | float maxScore = scores[(nn - 1) % 2][0]; 40 | int ret = 0; 41 | for (int i = 1; i < ntags; i++) { 42 | if (scores[(nn - 1) % 2][i] > maxScore) { 43 | ret = i; 44 | maxScore = scores[(nn - 1) % 2][i]; 45 | } 46 | } 47 | return ret; 48 | } 49 | void get_best_path( 50 | const Eigen::TensorMap, Eigen::Aligned>& predictions, 51 | int sentenceIdx, 52 | int nn, 53 | const std::vector>& trans, 54 | int** bp, 55 | float** scores, 56 | std::vector& resultTags, 57 | int ntags) { 58 | int lastTag = 
viterbi_decode(predictions, sentenceIdx, nn, trans, bp, scores, ntags); 59 | resultTags.push_back(lastTag); 60 | for (int i = nn - 2; i >= 0; i--) { 61 | int bpTag = bp[i][lastTag]; 62 | resultTags.push_back(bpTag); 63 | lastTag = bpTag; 64 | } 65 | } 66 | 67 | } // namespace kcws 68 | -------------------------------------------------------------------------------- /kcws/train/generate_train_free.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | # File: generate_train_free.py 5 | # Project: /e/code/kcws 6 | # Created: Thu Jul 27 2017 7 | # Author: Koth Chen 8 | # Copyright (c) 2017 Koth 9 | # 10 | # <> 11 | 12 | 13 | import sys 14 | import os 15 | import w2v 16 | import fire 17 | from sentence import Sentence 18 | 19 | totalLine = 0 20 | longLine = 0 21 | 22 | MAX_LEN = 80 23 | totalChars = 0 24 | 25 | 26 | def processLine(line, vob, out): 27 | global totalLine 28 | global longLine 29 | global totalChars 30 | ss = line.split("\t") 31 | 32 | sentence = Sentence() 33 | nn = len(ss) 34 | for i in range(nn): 35 | ts = ss[i].split(" ") 36 | ustr = unicode(ts[0].decode('utf8')) 37 | sentence.addToken(ustr) 38 | if sentence.chars > MAX_LEN: 39 | longLine += 1 40 | else: 41 | x = [] 42 | y = [] 43 | totalChars += sentence.chars 44 | sentence.generate_tr_line(x, y, vob) 45 | nn = len(x) 46 | assert (nn == len(y)) 47 | for j in range(nn, MAX_LEN): 48 | x.append(0) 49 | y.append(0) 50 | line = '' 51 | for i in range(MAX_LEN): 52 | if i > 0: 53 | line += " " 54 | line += str(x[i]) 55 | for j in range(MAX_LEN): 56 | line += " " + str(y[j]) 57 | out.write("%s\n" % (line)) 58 | totalLine += 1 59 | 60 | 61 | def doGen(inputPath, outputPath, vocabPath): 62 | global totalLine 63 | global longLine 64 | global totalChars 65 | vob = w2v.Word2vecVocab() 66 | vob.Load(vocabPath) 67 | with open(inputPath, "r") as inp: 68 | with open(outputPath, "w") as out: 69 | for line in inp.readlines(): 70 
| line = line.strip() 71 | if not line: 72 | continue 73 | processLine(line, vob, out) 74 | print("total:%d, long lines:%d, chars:%d" % 75 | (totalLine, longLine, totalChars)) 76 | 77 | 78 | def main(): 79 | fire.Fire() 80 | 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /kcws/train/process_icwb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2016-11-27 12:01:18 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2016-11-27 20:26:31 6 | import sys 7 | import w2v 8 | 9 | SEQ_LEN = 80 10 | 11 | 12 | def processToken(x, y, tok, vob): 13 | if len(tok) == 1: 14 | x.append(vob.GetWordIndex(str(tok[0].encode("utf8")))) 15 | y.append(0) 16 | else: 17 | nn = len(tok) 18 | for i in range(nn): 19 | x.append(vob.GetWordIndex(str(tok[i].encode("utf8")))) 20 | if i == 0: 21 | y.append(1) 22 | elif i == (nn - 1): 23 | y.append(3) 24 | else: 25 | y.append(2) 26 | 27 | 28 | def processFile(inp, oup, mode, vob): 29 | global SEQ_LEN 30 | while True: 31 | line = inp.readline() 32 | if not line: 33 | break 34 | line = line.strip() 35 | if not line: 36 | continue 37 | ss = line.split(" ") 38 | oline = "" 39 | x = [] 40 | y = [] 41 | for s in ss: 42 | ustr = unicode(s.decode("utf-8")) 43 | if len(ustr) < 1: 44 | continue 45 | if mode == 0: 46 | for i in range(len(ustr)): 47 | oline += str(ustr[i].encode("utf8")) 48 | oline += " " 49 | else: 50 | processToken(x, y, ustr, vob) 51 | if mode != 0: 52 | nn = len(x) 53 | for i in range(nn, SEQ_LEN): 54 | x.append(0) 55 | y.append(0) 56 | for i in range(SEQ_LEN): 57 | oline += str(x[i]) + " " 58 | for i in range(SEQ_LEN): 59 | oline += str(y[i]) + " " 60 | olen = len(oline) 61 | oline = oline[:olen - 1] 62 | oup.write("%s\n" % (oline)) 63 | 64 | 65 | def main(argc, argv): 66 | if argc < 3: 67 | print( 68 | "Usage: %s [model | 0 for w2v , 1 for training] 
[vec_path | if mode if not 0]" 69 | % (argv[0])) 70 | sys.exit(1) 71 | mode = 0 72 | vob = None 73 | if argc > 4: 74 | mode = int(argv[3]) 75 | vob = w2v.Word2vecVocab() 76 | vob.Load(argv[4]) 77 | inp = open(argv[1], "r") 78 | oup = open(argv[2], "w") 79 | processFile(inp, oup, mode, vob) 80 | 81 | 82 | if __name__ == '__main__': 83 | main(len(sys.argv), sys.argv) -------------------------------------------------------------------------------- /third_party/crow/include/crow/socket_adaptors.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #ifdef CROW_ENABLE_SSL 4 | #include 5 | #endif 6 | #include "third_party/crow/include/crow/settings.h" 7 | namespace crow { 8 | using namespace boost; 9 | using tcp = asio::ip::tcp; 10 | 11 | struct SocketAdaptor { 12 | using context = void; 13 | SocketAdaptor(boost::asio::io_service& io_service, context*) 14 | : socket_(io_service) { 15 | } 16 | 17 | boost::asio::io_service& get_io_service() { 18 | return socket_.get_io_service(); 19 | } 20 | 21 | tcp::socket& raw_socket() { 22 | return socket_; 23 | } 24 | 25 | tcp::socket& socket() { 26 | return socket_; 27 | } 28 | 29 | tcp::endpoint remote_endpoint() { 30 | return socket_.remote_endpoint(); 31 | } 32 | 33 | bool is_open() { 34 | return socket_.is_open(); 35 | } 36 | 37 | void close() { 38 | socket_.close(); 39 | } 40 | 41 | template 42 | void start(F f) { 43 | f(boost::system::error_code()); 44 | } 45 | 46 | tcp::socket socket_; 47 | }; 48 | 49 | #ifdef CROW_ENABLE_SSL 50 | struct SSLAdaptor { 51 | using context = boost::asio::ssl::context; 52 | using ssl_socket_t = boost::asio::ssl::stream; 53 | SSLAdaptor(boost::asio::io_service& io_service, context* ctx) 54 | : ssl_socket_(new ssl_socket_t(io_service, *ctx)) { 55 | } 56 | 57 | boost::asio::ssl::stream& socket() { 58 | return *ssl_socket_; 59 | } 60 | 61 | tcp::socket::lowest_layer_type& 62 | raw_socket() { 63 | return ssl_socket_->lowest_layer(); 64 | } 
65 | 66 | tcp::endpoint remote_endpoint() { 67 | return raw_socket().remote_endpoint(); 68 | } 69 | 70 | bool is_open() { 71 | return raw_socket().is_open(); 72 | } 73 | 74 | void close() { 75 | raw_socket().close(); 76 | } 77 | 78 | boost::asio::io_service& get_io_service() { 79 | return raw_socket().get_io_service(); 80 | } 81 | 82 | template 83 | void start(F f) { 84 | ssl_socket_->async_handshake(boost::asio::ssl::stream_base::server, 85 | [f](const boost::system::error_code & ec) { 86 | f(ec); 87 | }); 88 | } 89 | 90 | std::unique_ptr> ssl_socket_; 91 | }; 92 | #endif 93 | } 94 | -------------------------------------------------------------------------------- /third_party/crow/tests/template/comments.json: -------------------------------------------------------------------------------- 1 | {"__ATTN__":"Do not edit this file; changes belong in the appropriate YAML file.","overview":"Comment tags represent content that should never appear in the resulting\noutput.\n\nThe tag's content may contain any substring (including newlines) EXCEPT the\nclosing delimiter.\n\nComment tags SHOULD be treated as standalone when appropriate.\n","tests":[{"name":"Inline","data":{},"expected":"1234567890","template":"12345{{! Comment Block! }}67890","desc":"Comment blocks should be removed from the template."},{"name":"Multiline","data":{},"expected":"1234567890\n","template":"12345{{!\n This is a\n multi-line comment...\n}}67890\n","desc":"Multiline comments should be permitted."},{"name":"Standalone","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n{{! Comment Block! }}\nEnd.\n","desc":"All standalone comment lines should be removed."},{"name":"Indented Standalone","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n {{! Indented Comment Block! }}\nEnd.\n","desc":"All standalone comment lines should be removed."},{"name":"Standalone Line Endings","data":{},"expected":"|\r\n|","template":"|\r\n{{! 
Standalone Comment }}\r\n|","desc":"\"\\r\\n\" should be considered a newline for standalone tags."},{"name":"Standalone Without Previous Line","data":{},"expected":"!","template":" {{! I'm Still Standalone }}\n!","desc":"Standalone tags should not require a newline to precede them."},{"name":"Standalone Without Newline","data":{},"expected":"!\n","template":"!\n {{! I'm Still Standalone }}","desc":"Standalone tags should not require a newline to follow them."},{"name":"Multiline Standalone","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n{{!\nSomething's going on here...\n}}\nEnd.\n","desc":"All standalone comment lines should be removed."},{"name":"Indented Multiline Standalone","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n {{!\n Something's going on here...\n }}\nEnd.\n","desc":"All standalone comment lines should be removed."},{"name":"Indented Inline","data":{},"expected":" 12 \n","template":" 12 {{! 34 }}\n","desc":"Inline comments should not strip whitespace"},{"name":"Surrounding Whitespace","data":{},"expected":"12345 67890","template":"12345 {{! Comment Block! 
}} 67890","desc":"Comment removal should preserve surrounding whitespace."}]} -------------------------------------------------------------------------------- /kcws/train/process_people.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2016-11-29 09:20:36 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2016-11-29 15:58:30 6 | 7 | import sys 8 | import w2v 9 | 10 | SEQ_LEN = 80 11 | 12 | 13 | def processToken(x, y, tok, vob): 14 | if len(tok) == 1: 15 | x.append(vob.GetWordIndex(str(tok[0].encode("utf8")))) 16 | y.append(0) 17 | else: 18 | nn = len(tok) 19 | for i in range(nn): 20 | x.append(vob.GetWordIndex(str(tok[i].encode("utf8")))) 21 | if i == 0: 22 | y.append(1) 23 | elif i == (nn - 1): 24 | y.append(3) 25 | else: 26 | y.append(2) 27 | 28 | 29 | def processFile(inp, oup, mode, vob): 30 | global SEQ_LEN 31 | while True: 32 | line = inp.readline() 33 | if not line: 34 | break 35 | line = line.strip() 36 | if not line: 37 | continue 38 | ss = line.split(" ") 39 | oline = "" 40 | x = [] 41 | y = [] 42 | for s in ss: 43 | pos = s.find("/") 44 | if not pos: 45 | print("fatal error '/' not found") 46 | sys.exit(0) 47 | s = s[:pos] 48 | ustr = unicode(s.decode("utf-8")) 49 | if len(ustr) < 1: 50 | continue 51 | if mode == 0: 52 | for i in range(len(ustr)): 53 | oline += str(ustr[i].encode("utf8")) 54 | oline += " " 55 | else: 56 | processToken(x, y, ustr, vob) 57 | if mode != 0: 58 | nn = len(x) 59 | for i in range(nn, SEQ_LEN): 60 | x.append(0) 61 | y.append(0) 62 | for i in range(SEQ_LEN): 63 | oline += str(x[i]) + " " 64 | for i in range(SEQ_LEN): 65 | oline += str(y[i]) + " " 66 | olen = len(oline) 67 | oline = oline[:olen - 1] 68 | oup.write("%s\n" % (oline)) 69 | 70 | 71 | def main(argc, argv): 72 | if argc < 3: 73 | print( 74 | "Usage: %s [model | 0 for w2v , 1 for training] [vec_path | if mode if not 0]" 75 | % (argv[0])) 76 | sys.exit(1) 77 | mode = 0 78 
| vob = None 79 | if argc > 4: 80 | mode = int(argv[3]) 81 | vob = w2v.Word2vecVocab() 82 | vob.Load(argv[4]) 83 | inp = open(argv[1], "r") 84 | oup = open(argv[2], "w") 85 | processFile(inp, oup, mode, vob) 86 | 87 | 88 | if __name__ == '__main__': 89 | main(len(sys.argv), sys.argv) -------------------------------------------------------------------------------- /third_party/crow/include/crow/middleware_context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "third_party/crow/include/crow/utility.h" 4 | #include "third_party/crow/include/crow/http_request.h" 5 | #include "third_party/crow/include/crow/http_response.h" 6 | 7 | namespace crow { 8 | namespace detail { 9 | template 10 | struct partial_context 11 | : public black_magic::pop_back::template rebind 12 | , public black_magic::last_element_type::type::context { 13 | using parent_context = typename black_magic::pop_back::template rebind<::crow::detail::partial_context>; 14 | template 15 | using partial = typename std::conditional < N == sizeof...(Middlewares) - 1, partial_context, typename parent_context::template partial>::type; 16 | 17 | template 18 | typename T::context& get() { 19 | return static_cast(*this); 20 | } 21 | }; 22 | 23 | template <> 24 | struct partial_context<> { 25 | template 26 | using partial = partial_context; 27 | }; 28 | 29 | template 30 | bool middleware_call_helper(Container& middlewares, request& req, response& res, Context& ctx); 31 | 32 | template 33 | struct context : private partial_context 34 | //struct context : private Middlewares::context... 
// simple but less type-safe 35 | { 36 | template 37 | friend typename std::enable_if<(N == 0)>::type after_handlers_call_helper(Container& middlewares, Context& ctx, request& req, response& res); 38 | template 39 | friend typename std::enable_if < (N > 0) >::type after_handlers_call_helper(Container& middlewares, Context& ctx, request& req, response& res); 40 | 41 | template 42 | friend bool middleware_call_helper(Container& middlewares, request& req, response& res, Context& ctx); 43 | 44 | template 45 | typename T::context& get() { 46 | return static_cast(*this); 47 | } 48 | 49 | template 50 | using partial = typename partial_context::template partial; 51 | }; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /third_party/crow/examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project (crow_examples) 3 | 4 | if (MSVC) 5 | add_executable(example_vs example_vs.cpp) 6 | target_link_libraries(example_vs ${Boost_LIBRARIES}) 7 | target_link_libraries(example_vs ${CMAKE_THREAD_LIBS_INIT}) 8 | else () 9 | 10 | add_executable(helloworld helloworld.cpp) 11 | target_link_libraries(helloworld ${Boost_LIBRARIES}) 12 | target_link_libraries(helloworld ${CMAKE_THREAD_LIBS_INIT}) 13 | 14 | if (OPENSSL_FOUND) 15 | add_executable(example_ssl ssl/example_ssl.cpp) 16 | target_link_libraries(example_ssl ${Boost_LIBRARIES}) 17 | target_link_libraries(example_ssl ${CMAKE_THREAD_LIBS_INIT} ${OPENSSL_LIBRARIES}) 18 | endif() 19 | 20 | add_executable(example_websocket websocket/example_ws.cpp) 21 | target_link_libraries(example_websocket ${Boost_LIBRARIES}) 22 | target_link_libraries(example_websocket ${CMAKE_THREAD_LIBS_INIT} ssl crypto) 23 | 24 | add_executable(example example.cpp) 25 | #target_link_libraries(example crow) 26 | target_link_libraries(example ${Boost_LIBRARIES}) 27 | target_link_libraries(example ${CMAKE_THREAD_LIBS_INIT}) 28 
| 29 | if (Tcmalloc_FOUND) 30 | target_link_libraries(example ${Tcmalloc_LIBRARIES}) 31 | endif(Tcmalloc_FOUND) 32 | 33 | add_executable(example_with_all example_with_all.cpp) 34 | #target_link_libraries(example crow) 35 | target_link_libraries(example_with_all ${Boost_LIBRARIES}) 36 | target_link_libraries(example_with_all ${CMAKE_THREAD_LIBS_INIT}) 37 | 38 | add_custom_command(OUTPUT example_test.py 39 | COMMAND ${CMAKE_COMMAND} -E 40 | copy ${PROJECT_SOURCE_DIR}/example_test.py ${CMAKE_CURRENT_BINARY_DIR}/example_test.py 41 | DEPENDS ${PROJECT_SOURCE_DIR}/example_test.py 42 | ) 43 | add_custom_target(example_copy ALL DEPENDS example_test.py) 44 | 45 | add_executable(example_chat example_chat.cpp) 46 | #target_link_libraries(example_chat crow) 47 | target_link_libraries(example_chat ${Boost_LIBRARIES}) 48 | target_link_libraries(example_chat ${CMAKE_THREAD_LIBS_INIT}) 49 | add_custom_command(OUTPUT example_chat.html 50 | COMMAND ${CMAKE_COMMAND} -E 51 | copy ${PROJECT_SOURCE_DIR}/example_chat.html ${CMAKE_CURRENT_BINARY_DIR}/example_chat.html 52 | DEPENDS ${PROJECT_SOURCE_DIR}/example_chat.html 53 | ) 54 | add_custom_target(example_chat_copy ALL DEPENDS example_chat.html) 55 | 56 | #SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -pg" ) 57 | #SET( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -g -pg" ) 58 | endif() 59 | -------------------------------------------------------------------------------- /kcws/train/bilstm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | # File: bilstm.py 5 | # Project: /e/code/kcws 6 | # Created: Thu Aug 03 2017 7 | # Author: Koth Chen 8 | # Copyright (c) 2017 Koth 9 | # 10 | # <> 11 | 12 | import tensorflow as tf 13 | 14 | 15 | class Model: 16 | def __init__(self, 17 | numHidden, 18 | maxSeqLen, 19 | numTags): 20 | self.num_hidden = numHidden 21 | self.num_tags = numTags 22 | self.max_seq_len = maxSeqLen 23 | self.W = tf.get_variable( 24 
| shape=[numHidden * 2, numTags], 25 | initializer=tf.contrib.layers.xavier_initializer(), 26 | name="weights", 27 | regularizer=tf.contrib.layers.l2_regularizer(0.001)) 28 | self.b = tf.Variable(tf.zeros([numTags], name="bias")) 29 | 30 | def inference(self, X, length, reuse=False): 31 | length_64 = tf.cast(length, tf.int64) 32 | with tf.variable_scope("bilstm", reuse=reuse): 33 | forward_output, _ = tf.nn.dynamic_rnn( 34 | tf.contrib.rnn.LSTMCell(self.num_hidden, 35 | reuse=reuse), 36 | X, 37 | dtype=tf.float32, 38 | sequence_length=length, 39 | scope="RNN_forward") 40 | backward_output_, _ = tf.nn.dynamic_rnn( 41 | tf.contrib.rnn.LSTMCell(self.num_hidden, 42 | reuse=reuse), 43 | inputs=tf.reverse_sequence(X, 44 | length_64, 45 | seq_dim=1), 46 | dtype=tf.float32, 47 | sequence_length=length, 48 | scope="RNN_backword") 49 | 50 | backward_output = tf.reverse_sequence(backward_output_, 51 | length_64, 52 | seq_dim=1) 53 | 54 | output = tf.concat([forward_output, backward_output], 2) 55 | output = tf.reshape(output, [-1, self.num_hidden * 2]) 56 | if reuse is None or not reuse: 57 | output = tf.nn.dropout(output, 0.5) 58 | 59 | matricized_unary_scores = tf.matmul(output, self.W) + self.b 60 | unary_scores = tf.reshape( 61 | matricized_unary_scores, 62 | [-1, self.max_seq_len, self.num_tags], 63 | name="Reshape_7" if reuse else None) 64 | return unary_scores 65 | -------------------------------------------------------------------------------- /utils/json_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 
3 | * ===================================================================================== 4 | * Filename: json_util.h 5 | * Description: description 6 | * Author: Koth(Yaowen Chen) 7 | * 8 | */ 9 | #ifndef UTILS_JSON_UTIL_H_ 10 | #define UTILS_JSON_UTIL_H_ 11 | #include 12 | #include "jsonxx.h" 13 | 14 | namespace json_util { 15 | 16 | template 17 | T FromJsonValue(const jsonxx::Value& jval) { 18 | return jval.get(); 19 | } 20 | 21 | template 22 | bool ReadFromJson(const std::string& name, const jsonxx::Object& obj, T& val) { 23 | const std::map& kvs=obj.kv_map(); 24 | auto it=kvs.find(name); 25 | if(it==kvs.end()){ 26 | return false; 27 | } 28 | val= FromJsonValue(*(it->second)); 29 | return true; 30 | } 31 | 32 | template 33 | bool ReadArray(std::string name, const jsonxx::Object& obj, std::vector& rets) { 34 | if (!obj.has(name)) return false; 35 | jsonxx::Array arr = obj.get(name); 36 | const std::vector& values=arr.values(); 37 | for(size_t i=0;i(val)); 40 | } 41 | return true; 42 | } 43 | 44 | template 45 | jsonxx::Value ToJsonValue(const T& val) { 46 | return jsonxx::Value(val); 47 | } 48 | 49 | template 50 | void WriteToJson(const std::string& name,jsonxx::Object& obj, const T& val) { 51 | obj<(val); 52 | } 53 | 54 | template 55 | bool WriteArray(const std::string& name, jsonxx::Object& obj, const std::vector& rets) { 56 | jsonxx::Array arr; 57 | int nn=rets.size(); 58 | for(int i=0;i(rets[i]); 60 | arr< 67 | inline float FromJsonValue(const jsonxx::Value& jval) { 68 | return static_cast(jval.get()); 69 | } 70 | template<> 71 | inline double FromJsonValue(const jsonxx::Value& jval) { 72 | return static_cast(jval.get()); 73 | } 74 | template<> 75 | inline int32_t FromJsonValue(const jsonxx::Value& jval) { 76 | return static_cast(jval.get()); 77 | } 78 | template<> 79 | inline int64_t FromJsonValue(const jsonxx::Value& jval) { 80 | return static_cast(jval.get()); 81 | } 82 | 83 | } 84 | 85 | #endif // UTILS_JSON_UTIL_H_ 86 | 
-------------------------------------------------------------------------------- /third_party/python/semver/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: semver 3 | Version: 2.4.1 4 | Summary: Python package to work with Semantic Versioning (http://semver.org/) 5 | Home-page: https://github.com/k-bx/python-semver 6 | Author: Konstantine Rybnikov 7 | Author-email: k-bx@k-bx.com 8 | License: BSD 9 | Download-URL: https://github.com/k-bx/python-semver/downloads 10 | Description: Semver -- python module for semantic versioning 11 | =============================================== 12 | 13 | ![Travis CI](https://travis-ci.org/k-bx/python-semver.svg?branch=master) 14 | 15 | Simple module for comparing versions as noted at [semver.org](http://semver.org/). 16 | 17 | This module provides just couple of functions, main of which are: 18 | 19 | ```python 20 | >>> import semver 21 | >>> semver.compare("1.0.0", "2.0.0") 22 | -1 23 | >>> semver.compare("2.0.0", "1.0.0") 24 | 1 25 | >>> semver.compare("2.0.0", "2.0.0") 26 | 0 27 | >>> semver.match("2.0.0", ">=1.0.0") 28 | True 29 | >>> semver.match("1.0.0", ">1.0.0") 30 | False 31 | >>> semver.format_version(3, 4, 5, 'pre.2', 'build.4') 32 | '3.4.5-pre.2+build.4' 33 | >>> semver.bump_major("3.4.5") 34 | '4.0.0' 35 | >>> semver.bump_minor("3.4.5") 36 | '3.5.0' 37 | >>> semver.bump_patch("3.4.5") 38 | '3.4.6' 39 | >>> semver.max_ver("1.0.0", "2.0.0") 40 | '2.0.0' 41 | >>> semver.min_ver("1.0.0", "2.0.0") 42 | '1.0.0' 43 | ``` 44 | 45 | Installation 46 | ------------ 47 | 48 | For Python 2: 49 | 50 | ``` 51 | pip install semver 52 | ``` 53 | 54 | For Python 3: 55 | 56 | ``` 57 | pip3 install semver 58 | ``` 59 | 60 | Homepage at PyPi: https://pypi.python.org/pypi/semver 61 | 62 | Platform: UNKNOWN 63 | Classifier: Environment :: Web Environment 64 | Classifier: Framework :: Django 65 | Classifier: Intended Audience :: Developers 66 | Classifier: License 
:: OSI Approved :: BSD License 67 | Classifier: Operating System :: OS Independent 68 | Classifier: Programming Language :: Python 69 | Classifier: Programming Language :: Python :: 2 70 | Classifier: Programming Language :: Python :: 2.6 71 | Classifier: Programming Language :: Python :: 2.7 72 | Classifier: Programming Language :: Python :: 3 73 | Classifier: Programming Language :: Python :: 3.2 74 | Classifier: Programming Language :: Python :: 3.3 75 | Classifier: Programming Language :: Python :: 3.4 76 | Classifier: Topic :: Software Development :: Libraries :: Python Modules 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### 引用  3 | 4 |   5 | 本项目模型BiLSTM+CRF参考论文:http://www.aclweb.org/anthology/N16-1030 ,IDCNN+CRF参考论文:https://arxiv.org/abs/1702.02098 6 | 7 | 8 | ### 构建 9 | 10 | 1. 安装好bazel代码构建工具,安装好tensorflow(目前本项目需要tf 1.0.0alpha版本以上) 11 | 2. 切换到本项目代码目录,运行./configure 12 | 3. 编译后台服务 13 | 14 | > bazel build //kcws/cc:seg_backend_api 15 | 16 | 17 | ### 训练 18 | 19 | 1. 关注待字闺中公众号 回复 kcws 获取语料下载地址: 20 | 21 | ![logo](https://github.com/koth/kcws/blob/master/docs/qrcode_dzgz.jpg?raw=true "待字闺中") 22 | 23 | 24 | 2. 解压语料到一个目录 25 | 26 | 3. 
切换到代码目录,运行: 27 | > python kcws/train/process_anno_file.py <语料目录> pre_chars_for_w2v.txt 28 | 29 | > bazel build third_party/word2vec:word2vec 30 | 31 | > 先得到初步词表 32 | 33 | > ./bazel-bin/third_party/word2vec/word2vec -train pre_chars_for_w2v.txt -save-vocab pre_vocab.txt -min-count 3 34 | 35 | > 处理低频词 36 |   37 | > python kcws/train/replace_unk.py pre_vocab.txt pre_chars_for_w2v.txt chars_for_w2v.txt 38 | > 39 | > 训练word2vec 40 | > 41 | > ./bazel-bin/third_party/word2vec/word2vec -train chars_for_w2v.txt -output vec.txt -size 50 -sample 1e-4 -negative 5 -hs 1 -binary 0 -iter 5 42 | > 43 | > 构建训练语料工具 44 | > 45 | > bazel build kcws/train:generate_training 46 | > 47 | > 生成语料 48 | > 49 | > ./bazel-bin/kcws/train/generate_training vec.txt <语料目录> all.txt 50 | > 51 | > 得到train.txt , test.txt文件 52 | > 53 | > python kcws/train/filter_sentence.py all.txt 54 | 55 | 4. 安装好tensorflow,切换到kcws代码目录,运行: 56 | 57 | > python kcws/train/train_cws.py --word2vec_path vec.txt --train_data_path <绝对路径到train.txt> --test_data_path test.txt --max_sentence_len 80 --learning_rate 0.001 58 |  (默认使用IDCNN模型,可设置参数”--use_idcnn False“来切换BiLSTM模型) 59 | 60 | 5. 生成vocab 61 | > bazel build kcws/cc:dump_vocab 62 | 63 | > ./bazel-bin/kcws/cc/dump_vocab vec.txt kcws/models/basic_vocab.txt 64 | 65 | 6. 导出训练好的模型 66 | > python tools/freeze_graph.py --input_graph logs/graph.pbtxt --input_checkpoint logs/model.ckpt --output_node_names "transitions,Reshape_7" --output_graph kcws/models/seg_model.pbtxt 67 | 68 | 7. 词性标注模型下载 (临时方案,后续文档给出词性标注模型训练,导出等) 69 | 70 | > 从 https://pan.baidu.com/s/1bYmABk 下载pos_model.pbtxt到kcws/models/目录下 71 | 72 | 8. 
运行web service 73 | > ./bazel-bin/kcws/cc/seg_backend_api --model_path=kcws/models/seg_model.pbtxt(绝对路径到seg_model.pbtxt>) --vocab_path=kcws/models/basic_vocab.txt --max_sentence_len=80 74 | 75 | ### 词性标注的训练说明: 76 | 77 | https://github.com/koth/kcws/blob/master/pos_train.md 78 | 79 | ### 自定义词典 80 | 目前支持自定义词典是在解码阶段,参考具体使用方式请参考kcws/cc/test_seg.cc 81 | 字典为文本格式,每一行格式如下: 82 | ><自定义词条>\t<权重> 83 | 84 | 比如: 85 | >蓝瘦香菇 4 86 | 87 | 权重为一个正整数,一般4以上,越大越重要 88 | 89 | ### demo 90 | http://45.32.100.248:9090/ 91 | 92 | 附: 使用相同模型训练的公司名识别demo: 93 | 94 | http://45.32.100.248:18080 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /third_party/crow/examples/example_chat.cpp: -------------------------------------------------------------------------------- 1 | #include "crow.h" 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | vector msgs; 9 | vector> ress; 10 | 11 | void broadcast(const string& msg) 12 | { 13 | msgs.push_back(msg); 14 | crow::json::wvalue x; 15 | x["msgs"][0] = msgs.back(); 16 | x["last"] = msgs.size(); 17 | string body = crow::json::dump(x); 18 | for(auto p:ress) 19 | { 20 | auto* res = p.first; 21 | CROW_LOG_DEBUG << res << " replied: " << body; 22 | res->end(body); 23 | } 24 | ress.clear(); 25 | } 26 | // To see how it works go on {ip}:40080 but I just got it working with external build (not directly in IDE, I guess a problem with dependency) 27 | int main() 28 | { 29 | crow::SimpleApp app; 30 | crow::mustache::set_base("."); 31 | 32 | CROW_ROUTE(app, "/") 33 | ([]{ 34 | crow::mustache::context ctx; 35 | return crow::mustache::load("example_chat.html").render(); 36 | }); 37 | 38 | CROW_ROUTE(app, "/logs") 39 | ([]{ 40 | CROW_LOG_INFO << "logs requested"; 41 | crow::json::wvalue x; 42 | int start = max(0, (int)msgs.size()-100); 43 | for(int i = start; i < (int)msgs.size(); i++) 44 | x["msgs"][i-start] = msgs[i]; 45 | x["last"] = msgs.size(); 46 | CROW_LOG_INFO << "logs completed"; 47 | 
return x; 48 | }); 49 | 50 | CROW_ROUTE(app, "/logs/") 51 | ([](const crow::request& /*req*/, crow::response& res, int after){ 52 | CROW_LOG_INFO << "logs with last " << after; 53 | if (after < (int)msgs.size()) 54 | { 55 | crow::json::wvalue x; 56 | for(int i = after; i < (int)msgs.size(); i ++) 57 | x["msgs"][i-after] = msgs[i]; 58 | x["last"] = msgs.size(); 59 | 60 | res.write(crow::json::dump(x)); 61 | res.end(); 62 | } 63 | else 64 | { 65 | vector> filtered; 66 | for(auto p : ress) 67 | { 68 | if (p.first->is_alive() && chrono::steady_clock::now() - p.second < chrono::seconds(30)) 69 | filtered.push_back(p); 70 | else 71 | p.first->end(); 72 | } 73 | ress.swap(filtered); 74 | ress.push_back({&res, chrono::steady_clock::now()}); 75 | CROW_LOG_DEBUG << &res << " stored " << ress.size(); 76 | } 77 | }); 78 | 79 | CROW_ROUTE(app, "/send") 80 | .methods("GET"_method, "POST"_method) 81 | ([](const crow::request& req) 82 | { 83 | CROW_LOG_INFO << "msg from client: " << req.body; 84 | broadcast(req.body); 85 | return ""; 86 | }); 87 | 88 | app.port(40080) 89 | //.multithreaded() 90 | .run(); 91 | } 92 | -------------------------------------------------------------------------------- /kcws/train/prepare_pos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2017-01-25 11:46:37 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2017-01-25 12:05:16 6 | 7 | import sys 8 | import os 9 | 10 | totalLine = 0 11 | longLine = 0 12 | maxLen = 80 13 | 14 | 15 | def processToken(token, collect, out, end): 16 | global totalLine 17 | global longLine 18 | global maxLen 19 | nn = len(token) 20 | #print token 21 | while nn > 0 and token[nn - 1] != '/': 22 | nn = nn - 1 23 | 24 | token = token[:nn - 1].strip() 25 | if not token: 26 | return 27 | out.write("%s " % (token)) 28 | if end: 29 | out.write("\n") 30 | 31 | 32 | def processLine(line, out): 33 | line = line.strip() 34 | nn = 
len(line) 35 | seeLeftB = False 36 | start = 0 37 | collect = [] 38 | try: 39 | for i in range(nn): 40 | if line[i] == ' ': 41 | if not seeLeftB: 42 | token = line[start:i] 43 | if token.startswith('['): 44 | tokenLen = len(token) 45 | while tokenLen > 0 and token[tokenLen - 1] != ']': 46 | tokenLen = tokenLen - 1 47 | token = token[1:tokenLen - 1] 48 | ss = token.split(' ') 49 | for s in ss: 50 | processToken(s, collect, out, False) 51 | else: 52 | processToken(token, collect, out, False) 53 | start = i + 1 54 | elif line[i] == '[': 55 | seeLeftB = True 56 | elif line[i] == ']': 57 | seeLeftB = False 58 | if start < nn: 59 | token = line[start:] 60 | if token.startswith('['): 61 | tokenLen = len(token) 62 | while tokenLen > 0 and token[tokenLen - 1] != ']': 63 | tokenLen = tokenLen - 1 64 | token = token[1:tokenLen - 1] 65 | ss = token.split(' ') 66 | ns = len(ss) 67 | for i in range(ns - 1): 68 | processToken(ss[i], collect, out, False) 69 | processToken(ss[-1], collect, out, True) 70 | else: 71 | processToken(token, collect, out, True) 72 | except Exception as e: 73 | pass 74 | 75 | 76 | def main(argc, argv): 77 | global totalLine 78 | global longLine 79 | if argc < 3: 80 | print("Usage:%s " % (argv[0])) 81 | sys.exit(1) 82 | rootDir = argv[1] 83 | out = open(argv[2], "w") 84 | for dirName, subdirList, fileList in os.walk(rootDir): 85 | curDir = os.path.join(rootDir, dirName) 86 | for file in fileList: 87 | if file.endswith(".txt"): 88 | curFile = os.path.join(curDir, file) 89 | # print("processing:%s" % (curFile)) 90 | fp = open(curFile, "r") 91 | for line in fp.readlines(): 92 | line = line.strip() 93 | processLine(line, out) 94 | fp.close() 95 | out.close() 96 | print("total:%d, long lines:%d" % (totalLine, longLine)) 97 | 98 | 99 | if __name__ == '__main__': 100 | main(len(sys.argv), sys.argv) 101 | -------------------------------------------------------------------------------- /third_party/crow/examples/example_with_all.cpp: 
-------------------------------------------------------------------------------- 1 | #include "../amalgamate/crow_all.h" 2 | 3 | #include 4 | 5 | class ExampleLogHandler : public crow::ILogHandler { 6 | public: 7 | void log(std::string /*message*/, crow::LogLevel /*level*/) override { 8 | // cerr << "ExampleLogHandler -> " << message; 9 | } 10 | }; 11 | 12 | int main() 13 | { 14 | crow::SimpleApp app; 15 | 16 | CROW_ROUTE(app, "/") 17 | .name("hello") 18 | ([]{ 19 | return "Hello World!"; 20 | }); 21 | 22 | CROW_ROUTE(app, "/about") 23 | ([](){ 24 | return "About Crow example."; 25 | }); 26 | 27 | // simple json response 28 | CROW_ROUTE(app, "/json") 29 | ([]{ 30 | crow::json::wvalue x; 31 | x["message"] = "Hello, World!"; 32 | return x; 33 | }); 34 | 35 | CROW_ROUTE(app,"/hello/") 36 | ([](int count){ 37 | if (count > 100) 38 | return crow::response(400); 39 | std::ostringstream os; 40 | os << count << " bottles of beer!"; 41 | return crow::response(os.str()); 42 | }); 43 | 44 | CROW_ROUTE(app,"/add//") 45 | ([](const crow::request& /*req*/, crow::response& res, int a, int b){ 46 | std::ostringstream os; 47 | os << a+b; 48 | res.write(os.str()); 49 | res.end(); 50 | }); 51 | 52 | // Compile error with message "Handler type is mismatched with URL paramters" 53 | //CROW_ROUTE(app,"/another/") 54 | //([](int a, int b){ 55 | //return crow::response(500); 56 | //}); 57 | 58 | // more json example 59 | CROW_ROUTE(app, "/add_json") 60 | ([](const crow::request& req){ 61 | auto x = crow::json::load(req.body); 62 | if (!x) 63 | return crow::response(400); 64 | int sum = x["a"].i()+x["b"].i(); 65 | std::ostringstream os; 66 | os << sum; 67 | return crow::response{os.str()}; 68 | }); 69 | 70 | CROW_ROUTE(app, "/params") 71 | ([](const crow::request& req){ 72 | std::ostringstream os; 73 | os << "Params: " << req.url_params << "\n\n"; 74 | os << "The key 'foo' was " << (req.url_params.get("foo") == nullptr ? 
"not " : "") << "found.\n"; 75 | if(req.url_params.get("pew") != nullptr) { 76 | double countD = boost::lexical_cast(req.url_params.get("pew")); 77 | os << "The value of 'pew' is " << countD << '\n'; 78 | } 79 | auto count = req.url_params.get_list("count"); 80 | os << "The key 'count' contains " << count.size() << " value(s).\n"; 81 | for(const auto& countVal : count) { 82 | os << " - " << countVal << '\n'; 83 | } 84 | return crow::response{os.str()}; 85 | }); 86 | 87 | // ignore all log 88 | crow::logger::setLogLevel(crow::LogLevel::Debug); 89 | //crow::logger::setHandler(std::make_shared()); 90 | 91 | app.port(18080) 92 | .multithreaded() 93 | .run(); 94 | } 95 | -------------------------------------------------------------------------------- /third_party/crow/tests/template/partials.json: -------------------------------------------------------------------------------- 1 | {"__ATTN__":"Do not edit this file; changes belong in the appropriate YAML file.","overview":"Partial tags are used to expand an external template into the current\ntemplate.\n\nThe tag's content MUST be a non-whitespace character sequence NOT containing\nthe current closing delimiter.\n\nThis tag's content names the partial to inject. Set Delimiter tags MUST NOT\naffect the parsing of a partial. The partial MUST be rendered against the\ncontext stack local to the tag. If the named partial cannot be found, the\nempty string SHOULD be used instead, as in interpolations.\n\nPartial tags SHOULD be treated as standalone when appropriate. 
If this tag\nis used standalone, any whitespace preceding the tag should treated as\nindentation, and prepended to each line of the partial before rendering.\n","tests":[{"name":"Basic Behavior","data":{},"expected":"\"from partial\"","template":"\"{{>text}}\"","desc":"The greater-than operator should expand to the named partial.","partials":{"text":"from partial"}},{"name":"Failed Lookup","data":{},"expected":"\"\"","template":"\"{{>text}}\"","desc":"The empty string should be used when the named partial is not found.","partials":{}},{"name":"Context","data":{"text":"content"},"expected":"\"*content*\"","template":"\"{{>partial}}\"","desc":"The greater-than operator should operate within the current context.","partials":{"partial":"*{{text}}*"}},{"name":"Recursion","data":{"content":"X","nodes":[{"content":"Y","nodes":[]}]},"expected":"X>","template":"{{>node}}","desc":"The greater-than operator should properly recurse.","partials":{"node":"{{content}}<{{#nodes}}{{>node}}{{/nodes}}>"}},{"name":"Surrounding Whitespace","data":{},"expected":"| \t|\t |","template":"| {{>partial}} |","desc":"The greater-than operator should not alter surrounding whitespace.","partials":{"partial":"\t|\t"}},{"name":"Inline Indentation","data":{"data":"|"},"expected":" | >\n>\n","template":" {{data}} {{> partial}}\n","desc":"Whitespace should be left untouched.","partials":{"partial":">\n>"}},{"name":"Standalone Line Endings","data":{},"expected":"|\r\n>|","template":"|\r\n{{>partial}}\r\n|","desc":"\"\\r\\n\" should be considered a newline for standalone tags.","partials":{"partial":">"}},{"name":"Standalone Without Previous Line","data":{},"expected":" >\n >>","template":" {{>partial}}\n>","desc":"Standalone tags should not require a newline to precede them.","partials":{"partial":">\n>"}},{"name":"Standalone Without Newline","data":{},"expected":">\n >\n >","template":">\n {{>partial}}","desc":"Standalone tags should not require a newline to follow 
them.","partials":{"partial":">\n>"}},{"name":"Standalone Indentation","data":{"content":"<\n->"},"expected":"\\\n |\n <\n->\n |\n/\n","template":"\\\n {{>partial}}\n/\n","desc":"Each line of the partial should be indented before rendering.","partials":{"partial":"|\n{{{content}}}\n|\n"}},{"name":"Padding Whitespace","data":{"boolean":true},"expected":"|[]|","template":"|{{> partial }}|","desc":"Superfluous in-tag whitespace should be ignored.","partials":{"partial":"[]"}}]} -------------------------------------------------------------------------------- /third_party/crow/tests/template/comments.yml: -------------------------------------------------------------------------------- 1 | overview: | 2 | Comment tags represent content that should never appear in the resulting 3 | output. 4 | 5 | The tag's content may contain any substring (including newlines) EXCEPT the 6 | closing delimiter. 7 | 8 | Comment tags SHOULD be treated as standalone when appropriate. 9 | tests: 10 | - name: Inline 11 | desc: Comment blocks should be removed from the template. 12 | data: { } 13 | template: '12345{{! Comment Block! }}67890' 14 | expected: '1234567890' 15 | 16 | - name: Multiline 17 | desc: Multiline comments should be permitted. 18 | data: { } 19 | template: | 20 | 12345{{! 21 | This is a 22 | multi-line comment... 23 | }}67890 24 | expected: | 25 | 1234567890 26 | 27 | - name: Standalone 28 | desc: All standalone comment lines should be removed. 29 | data: { } 30 | template: | 31 | Begin. 32 | {{! Comment Block! }} 33 | End. 34 | expected: | 35 | Begin. 36 | End. 37 | 38 | - name: Indented Standalone 39 | desc: All standalone comment lines should be removed. 40 | data: { } 41 | template: | 42 | Begin. 43 | {{! Indented Comment Block! }} 44 | End. 45 | expected: | 46 | Begin. 47 | End. 48 | 49 | - name: Standalone Line Endings 50 | desc: '"\r\n" should be considered a newline for standalone tags.' 51 | data: { } 52 | template: "|\r\n{{! 
Standalone Comment }}\r\n|" 53 | expected: "|\r\n|" 54 | 55 | - name: Standalone Without Previous Line 56 | desc: Standalone tags should not require a newline to precede them. 57 | data: { } 58 | template: " {{! I'm Still Standalone }}\n!" 59 | expected: "!" 60 | 61 | - name: Standalone Without Newline 62 | desc: Standalone tags should not require a newline to follow them. 63 | data: { } 64 | template: "!\n {{! I'm Still Standalone }}" 65 | expected: "!\n" 66 | 67 | - name: Multiline Standalone 68 | desc: All standalone comment lines should be removed. 69 | data: { } 70 | template: | 71 | Begin. 72 | {{! 73 | Something's going on here... 74 | }} 75 | End. 76 | expected: | 77 | Begin. 78 | End. 79 | 80 | - name: Indented Multiline Standalone 81 | desc: All standalone comment lines should be removed. 82 | data: { } 83 | template: | 84 | Begin. 85 | {{! 86 | Something's going on here... 87 | }} 88 | End. 89 | expected: | 90 | Begin. 91 | End. 92 | 93 | - name: Indented Inline 94 | desc: Inline comments should not strip whitespace 95 | data: { } 96 | template: " 12 {{! 34 }}\n" 97 | expected: " 12 \n" 98 | 99 | - name: Surrounding Whitespace 100 | desc: Comment removal should preserve surrounding whitespace. 101 | data: { } 102 | template: '12345 {{! Comment Block! 
}} 67890' 103 | expected: '12345 67890' 104 | -------------------------------------------------------------------------------- /third_party/setuptools/setuptools.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | easy_install = setuptools.command.easy_install:main 3 | easy_install-3.4 = setuptools.command.easy_install:main 4 | 5 | [distutils.commands] 6 | alias = setuptools.command.alias:alias 7 | bdist_egg = setuptools.command.bdist_egg:bdist_egg 8 | bdist_rpm = setuptools.command.bdist_rpm:bdist_rpm 9 | bdist_wininst = setuptools.command.bdist_wininst:bdist_wininst 10 | build_ext = setuptools.command.build_ext:build_ext 11 | build_py = setuptools.command.build_py:build_py 12 | develop = setuptools.command.develop:develop 13 | easy_install = setuptools.command.easy_install:easy_install 14 | egg_info = setuptools.command.egg_info:egg_info 15 | install = setuptools.command.install:install 16 | install_egg_info = setuptools.command.install_egg_info:install_egg_info 17 | install_lib = setuptools.command.install_lib:install_lib 18 | install_scripts = setuptools.command.install_scripts:install_scripts 19 | register = setuptools.command.register:register 20 | rotate = setuptools.command.rotate:rotate 21 | saveopts = setuptools.command.saveopts:saveopts 22 | sdist = setuptools.command.sdist:sdist 23 | setopt = setuptools.command.setopt:setopt 24 | test = setuptools.command.test:test 25 | upload_docs = setuptools.command.upload_docs:upload_docs 26 | 27 | [distutils.setup_keywords] 28 | convert_2to3_doctests = setuptools.dist:assert_string_list 29 | dependency_links = setuptools.dist:assert_string_list 30 | eager_resources = setuptools.dist:assert_string_list 31 | entry_points = setuptools.dist:check_entry_points 32 | exclude_package_data = setuptools.dist:check_package_data 33 | extras_require = setuptools.dist:check_extras 34 | include_package_data = setuptools.dist:assert_bool 35 | 
install_requires = setuptools.dist:check_requirements 36 | namespace_packages = setuptools.dist:check_nsp 37 | package_data = setuptools.dist:check_package_data 38 | packages = setuptools.dist:check_packages 39 | test_loader = setuptools.dist:check_importable 40 | test_runner = setuptools.dist:check_importable 41 | test_suite = setuptools.dist:check_test_suite 42 | tests_require = setuptools.dist:check_requirements 43 | use_2to3 = setuptools.dist:assert_bool 44 | use_2to3_exclude_fixers = setuptools.dist:assert_string_list 45 | use_2to3_fixers = setuptools.dist:assert_string_list 46 | zip_safe = setuptools.dist:assert_bool 47 | 48 | [egg_info.writers] 49 | PKG-INFO = setuptools.command.egg_info:write_pkg_info 50 | dependency_links.txt = setuptools.command.egg_info:overwrite_arg 51 | depends.txt = setuptools.command.egg_info:warn_depends_obsolete 52 | eager_resources.txt = setuptools.command.egg_info:overwrite_arg 53 | entry_points.txt = setuptools.command.egg_info:write_entries 54 | namespace_packages.txt = setuptools.command.egg_info:overwrite_arg 55 | requires.txt = setuptools.command.egg_info:write_requirements 56 | top_level.txt = setuptools.command.egg_info:write_toplevel_names 57 | 58 | [setuptools.file_finders] 59 | svn_cvs = setuptools.command.sdist:_default_revctrl 60 | 61 | [setuptools.installation] 62 | eggsecutable = setuptools.command.easy_install:bootstrap 63 | 64 | -------------------------------------------------------------------------------- /kcws/cc/sentence_breaker.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 
3 | * ===================================================================================== 4 | * Filename: sentence_breaker.cc 5 | * Author: Koth 6 | * Create Time: 2016-11-23 22:02:40 7 | * Description: 8 | * 9 | */ 10 | #include "sentence_breaker.h" //NOLINT 11 | #include "base/base.h" 12 | 13 | 14 | namespace kcws { 15 | char* SentenceBreaker::kInlineMarks[] = { 16 | "(", ")", "(", ")", "[", "]", "【", "】", "《", "》", "“", "”" 17 | }; 18 | char* SentenceBreaker::kBreakMarks[] = { 19 | "。", ",", ",", " ", "\t", "?", "?", "!", "!", ";", ";" 20 | }; 21 | SentenceBreaker::SentenceBreaker(int maxLen) { 22 | for (size_t i = 0; i < sizeof(kInlineMarks) / sizeof(char*); i += 2) { 23 | UnicodeStr ustr1; 24 | UnicodeStr ustr2; 25 | BasicStringUtil::u8tou16(kInlineMarks[i], strlen(kInlineMarks[i]), ustr1); 26 | BasicStringUtil::u8tou16(kInlineMarks[i + 1], strlen(kInlineMarks[i + 1]), ustr2); 27 | inline_marks_.insert(std::make_pair(ustr1[0], ustr2[0])); 28 | inline_marks_set_.insert(ustr1[0]); 29 | inline_marks_set_.insert(ustr2[0]); 30 | } 31 | for (size_t i = 0; i < sizeof(kBreakMarks) / sizeof(char*); i++) { 32 | UnicodeStr ustr; 33 | BasicStringUtil::u8tou16(kBreakMarks[i], strlen(kBreakMarks[i]), ustr); 34 | break_marks_.insert(ustr[0]); 35 | } 36 | max_len_ = maxLen; 37 | } 38 | bool SentenceBreaker::is_inline_mark(UnicodeCharT uch) { 39 | return inline_marks_.find(uch) != inline_marks_.end(); 40 | } 41 | bool SentenceBreaker::is_break_mark(UnicodeCharT uch) { 42 | return break_marks_.find(uch) != break_marks_.end(); 43 | } 44 | SentenceBreaker::~SentenceBreaker() = default; 45 | 46 | bool SentenceBreaker::breakSentences(const UnicodeStr& text, 47 | std::vector* lines) { 48 | UnicodeCharT markChar = 0; 49 | size_t nn = text.size(); 50 | if (nn == 0) { 51 | return true; 52 | } 53 | size_t markPos = 0; 54 | for (size_t i = 0; i < nn; i++) { 55 | if (is_inline_mark(text[i])) { 56 | if (markChar == text[i]) { 57 | lines->push_back(text.substr(markPos, i - markPos + 1)); 
58 | markPos = i + 1; 59 | markChar = 0; 60 | } else { 61 | if (markPos != i) { 62 | lines->push_back(text.substr(markPos, i - markPos )); 63 | markPos = i; 64 | } 65 | markChar = inline_marks_[text[i]]; 66 | } 67 | } else if (markChar == 0) { 68 | if (is_break_mark(text[i]) || 69 | (i - markPos + 1) >= static_cast(max_len_)) { 70 | // Oops, too long 71 | lines->push_back(text.substr(markPos, i - markPos + 1)); 72 | markPos = i + 1; 73 | } 74 | } else if ((i - markPos + 1) >= static_cast(max_len_) ) { 75 | // Oops, too long 76 | lines->push_back(text.substr(markPos, i - markPos + 1)); 77 | markPos = i + 1; 78 | markChar = 0; 79 | } 80 | } 81 | if (markPos < nn) { 82 | lines->push_back(text.substr(markPos, nn - markPos)); 83 | } 84 | return true; 85 | } 86 | 87 | } // namespace kcws 88 | -------------------------------------------------------------------------------- /third_party/crow/include/crow/http_response.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #include "third_party/crow/include/crow/json.h" 6 | #include "third_party/crow/include/crow/http_request.h" 7 | #include "third_party/crow/include/crow/ci_map.h" 8 | 9 | namespace crow { 10 | template 11 | class Connection; 12 | struct response { 13 | template 14 | friend class crow::Connection; 15 | 16 | int code{200}; 17 | std::string body; 18 | json::wvalue json_value; 19 | 20 | // `headers' stores HTTP headers. 
21 | ci_map headers; 22 | 23 | void set_header(std::string key, std::string value) { 24 | headers.erase(key); 25 | headers.emplace(std::move(key), std::move(value)); 26 | } 27 | void add_header(std::string key, std::string value) { 28 | headers.emplace(std::move(key), std::move(value)); 29 | } 30 | 31 | const std::string& get_header_value(const std::string& key) { 32 | return crow::get_header_value(headers, key); 33 | } 34 | 35 | 36 | response() {} 37 | explicit response(int code) : code(code) {} 38 | response(std::string body) : body(std::move(body)) {} 39 | response(json::wvalue&& json_value) : json_value(std::move(json_value)) { 40 | json_mode(); 41 | } 42 | response(int code, std::string body) : code(code), body(std::move(body)) {} 43 | response(const json::wvalue& json_value) : body(json::dump(json_value)) { 44 | json_mode(); 45 | } 46 | response(int code, const json::wvalue& json_value) : code(code), body(json::dump(json_value)) { 47 | json_mode(); 48 | } 49 | 50 | response(response&& r) { 51 | *this = std::move(r); 52 | } 53 | 54 | response& operator = (const response& r) = delete; 55 | 56 | response& operator = (response&& r) noexcept { 57 | body = std::move(r.body); 58 | json_value = std::move(r.json_value); 59 | code = r.code; 60 | headers = std::move(r.headers); 61 | completed_ = r.completed_; 62 | return *this; 63 | } 64 | 65 | bool is_completed() const noexcept { 66 | return completed_; 67 | } 68 | 69 | void clear() { 70 | body.clear(); 71 | json_value.clear(); 72 | code = 200; 73 | headers.clear(); 74 | completed_ = false; 75 | } 76 | 77 | void write(const std::string& body_part) { 78 | body += body_part; 79 | } 80 | 81 | void end() { 82 | if (!completed_) { 83 | completed_ = true; 84 | 85 | if (complete_request_handler_) { 86 | complete_request_handler_(); 87 | } 88 | } 89 | } 90 | 91 | void end(const std::string& body_part) { 92 | body += body_part; 93 | end(); 94 | } 95 | 96 | bool is_alive() { 97 | return is_alive_helper_ && is_alive_helper_(); 98 
| } 99 | 100 | private: 101 | bool completed_{}; 102 | std::function complete_request_handler_; 103 | std::function is_alive_helper_; 104 | 105 | //In case of a JSON object, set the Content-Type header 106 | void json_mode() { 107 | set_header("Content-Type", "application/json"); 108 | } 109 | }; 110 | } 111 | -------------------------------------------------------------------------------- /kcws/train/process_anno_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth Chen 3 | # @Date: 2016-10-15 14:49:40 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2016-12-09 20:33:30 6 | import sys 7 | import os 8 | 9 | totalLine = 0 10 | longLine = 0 11 | maxLen = 80 12 | 13 | 14 | def processToken(token, collect, out, end): 15 | global totalLine 16 | global longLine 17 | global maxLen 18 | nn = len(token) 19 | #print token 20 | while nn > 0 and token[nn - 1] != '/': 21 | nn = nn - 1 22 | 23 | token = token[:nn - 1].strip() 24 | ustr = unicode(token.decode('utf8')) 25 | for u in ustr: 26 | collect.append(u) 27 | uline = u'' 28 | if token == '。' or end: 29 | if len(collect) > maxLen: 30 | longLine += 1 31 | totalLine += 1 32 | for s in collect: 33 | if uline: 34 | uline = uline + u" " + s 35 | else: 36 | uline = s 37 | out.write("%s\n" % (str(uline.encode('utf8')))) 38 | del collect[:] 39 | 40 | 41 | def processLine(line, out): 42 | line = line.strip() 43 | nn = len(line) 44 | seeLeftB = False 45 | start = 0 46 | collect = [] 47 | try: 48 | for i in range(nn): 49 | if line[i] == ' ': 50 | if not seeLeftB: 51 | token = line[start:i] 52 | if token.startswith('['): 53 | tokenLen = len(token) 54 | while tokenLen > 0 and token[tokenLen - 1] != ']': 55 | tokenLen = tokenLen - 1 56 | token = token[1:tokenLen - 1] 57 | ss = token.split(' ') 58 | for s in ss: 59 | processToken(s, collect, out, False) 60 | else: 61 | processToken(token, collect, out, False) 62 | start = i + 1 63 | elif line[i] == 
def main(argc, argv):
    """Convert every ``.txt`` file under a directory tree into one output file.

    Args:
        argc: Number of entries in ``argv``.
        argv: ``argv[1]`` is the root directory to walk, ``argv[2]`` is the
            output file, which is opened for writing (truncating it).

    Each line of each ``.txt`` file is stripped and passed to ``processLine``
    together with the output stream; afterwards the module-level line
    counters are printed. Exits with status 1 when fewer than two arguments
    are supplied.
    """
    if argc < 3:
        print("Usage:%s " % (argv[0]))
        sys.exit(1)
    rootDir = argv[1]
    with open(argv[2], "w") as out:
        for dirName, subdirList, fileList in os.walk(rootDir):
            for file in fileList:
                if not file.endswith(".txt"):
                    continue
                # os.walk() already yields dirName prefixed with rootDir;
                # joining rootDir onto it again produced "root/root/..."
                # for relative roots.
                curFile = os.path.join(dirName, file)
                # print("processing:%s" % (curFile))
                with open(curFile, "r") as fp:
                    for line in fp:
                        processLine(line.strip(), out)
    print("total:%d, long lines:%d" % (totalLine, longLine))
Behavior","data":{"text":"Hey!"},"expected":"(Hey!)","template":"{{=<% %>=}}(<%text%>)","desc":"The equals sign (used on both sides) should permit delimiter changes."},{"name":"Special Characters","data":{"text":"It worked!"},"expected":"(It worked!)","template":"({{=[ ]=}}[text])","desc":"Characters with special meaning regexen should be valid delimiters."},{"name":"Sections","data":{"section":true,"data":"I got interpolated."},"expected":"[\n I got interpolated.\n |data|\n\n {{data}}\n I got interpolated.\n]\n","template":"[\n{{#section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= | | =}}\n|#section|\n {{data}}\n |data|\n|/section|\n]\n","desc":"Delimiters set outside sections should persist."},{"name":"Inverted Sections","data":{"section":false,"data":"I got interpolated."},"expected":"[\n I got interpolated.\n |data|\n\n {{data}}\n I got interpolated.\n]\n","template":"[\n{{^section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= | | =}}\n|^section|\n {{data}}\n |data|\n|/section|\n]\n","desc":"Delimiters set outside inverted sections should persist."},{"name":"Partial Inheritence","data":{"value":"yes"},"expected":"[ .yes. ]\n[ .yes. ]\n","template":"[ {{>include}} ]\n{{= | | =}}\n[ |>include| ]\n","desc":"Delimiters set in a parent template should not affect a partial.","partials":{"include":".{{value}}."}},{"name":"Post-Partial Behavior","data":{"value":"yes"},"expected":"[ .yes. .yes. ]\n[ .yes. .|value|. ]\n","template":"[ {{>include}} ]\n[ .{{value}}. .|value|. ]\n","desc":"Delimiters set in a partial should not affect the parent template.","partials":{"include":".{{value}}. 
{{= | | =}} .|value|."}},{"name":"Surrounding Whitespace","data":{},"expected":"| |","template":"| {{=@ @=}} |","desc":"Surrounding whitespace should be left untouched."},{"name":"Outlying Whitespace (Inline)","data":{},"expected":" | \n","template":" | {{=@ @=}}\n","desc":"Whitespace should be left untouched."},{"name":"Standalone Tag","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n{{=@ @=}}\nEnd.\n","desc":"Standalone lines should be removed from the template."},{"name":"Indented Standalone Tag","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n {{=@ @=}}\nEnd.\n","desc":"Indented standalone lines should be removed from the template."},{"name":"Standalone Line Endings","data":{},"expected":"|\r\n|","template":"|\r\n{{= @ @ =}}\r\n|","desc":"\"\\r\\n\" should be considered a newline for standalone tags."},{"name":"Standalone Without Previous Line","data":{},"expected":"=","template":" {{=@ @=}}\n=","desc":"Standalone tags should not require a newline to precede them."},{"name":"Standalone Without Newline","data":{},"expected":"=\n","template":"=\n {{=@ @=}}","desc":"Standalone tags should not require a newline to follow them."},{"name":"Pair with Padding","data":{},"expected":"||","template":"|{{= @ @ =}}|","desc":"Superfluous in-tag whitespace should be ignored."}]} -------------------------------------------------------------------------------- /tfmodel/tfmodel.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 
3 | * ===================================================================================== 4 | * Filename: tfmodel.cc 5 | * Author: Koth 6 | * Create Time: 2017-02-01 13:28:34 7 | * Description: 8 | * 9 | */ 10 | #include "tfmodel/tfmodel.h" 11 | 12 | #include 13 | 14 | #include "base/base.h" 15 | #include "utils/basic_string_util.h" 16 | 17 | #include "google/protobuf/io/coded_stream.h" 18 | #include "google/protobuf/io/zero_copy_stream_impl.h" 19 | #include "google/protobuf/io/zero_copy_stream_impl_lite.h" 20 | #include "google/protobuf/message_lite.h" 21 | 22 | class IfstreamInputStream : public ::google::protobuf::io::CopyingInputStream { 23 | public: 24 | explicit IfstreamInputStream(const std::string& file_name) 25 | : ifs_(file_name.c_str(), std::ios::in | std::ios::binary) {} 26 | ~IfstreamInputStream() { ifs_.close(); } 27 | 28 | int Read(void* buffer, int size) { 29 | if (!ifs_) { 30 | return -1; 31 | } 32 | ifs_.read(static_cast(buffer), size); 33 | return ifs_.gcount(); 34 | } 35 | 36 | private: 37 | std::ifstream ifs_; 38 | }; 39 | 40 | bool PortableReadFileToProto(const std::string& file_name, 41 | ::google::protobuf::MessageLite* proto) { 42 | ::google::protobuf::io::CopyingInputStreamAdaptor stream( 43 | new IfstreamInputStream(file_name)); 44 | stream.SetOwnsCopyingStream(true); 45 | // TODO(jiayq): the following coded stream is for debugging purposes to allow 46 | // one to parse arbitrarily large messages for MessageLite. One most likely 47 | // doesn't want to put protobufs larger than 64MB on Android, so we should 48 | // eventually remove this and quit loud when a large protobuf is passed in. 49 | ::google::protobuf::io::CodedInputStream coded_stream(&stream); 50 | // Total bytes hard limit / warning limit are set to 1GB and 512MB 51 | // respectively. 
52 | coded_stream.SetTotalBytesLimit(1024LL << 20, 512LL << 20); 53 | return proto->ParseFromCodedStream(&coded_stream); 54 | } 55 | 56 | namespace tf { 57 | TfModel::~TfModel() = default; 58 | bool TfModel::Load(const std::string& path) { 59 | tensorflow::SessionOptions options; 60 | tensorflow::ConfigProto& config = options.config; 61 | 62 | session_.reset(tensorflow::NewSession(options)); 63 | tensorflow::GraphDef tensorflow_graph; 64 | VLOG(0) << "Reading file to proto: " << path; 65 | if (!PortableReadFileToProto(path.c_str(), &tensorflow_graph)) { 66 | VLOG(0) << "Load model error from:" << path; 67 | return false; 68 | } 69 | VLOG(0) << "Creating session."; 70 | tensorflow::Status s = session_->Create(tensorflow_graph); 71 | if (!s.ok()) { 72 | VLOG(0) << "Could not create Tensorflow Graph: " << s; 73 | return false; 74 | } 75 | // Clear the proto to save memory space. 76 | tensorflow_graph.Clear(); 77 | VLOG(0) << "Tensorflow graph loaded from: " << path; 78 | return true; 79 | } 80 | bool TfModel::Eval( 81 | const std::vector >& 82 | inputTensors, 83 | const std::vector& outputNames, 84 | std::vector& outputTensors) { 85 | tensorflow::Status s = 86 | session_->Run(inputTensors, outputNames, {}, &outputTensors); 87 | if (!s.ok()) { 88 | LOG(ERROR) << "Error during inference: " << s; 89 | return false; 90 | } 91 | return true; 92 | } 93 | 94 | } // namespace tf 95 | -------------------------------------------------------------------------------- /third_party/crow/include/crow/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "third_party/crow/include/crow/utility.h" 8 | 9 | namespace crow { 10 | enum class HTTPMethod { 11 | #ifndef DELETE 12 | DELETE = 0, 13 | GET, 14 | HEAD, 15 | POST, 16 | PUT, 17 | CONNECT, 18 | OPTIONS, 19 | TRACE, 20 | #endif 21 | 22 | Delete = 0, 23 | Get, 24 | Head, 25 | Post, 26 | Put, 27 | Connect, 28 | 
// Positional values captured from a URL, stored in one list per value type.
// Each get<T>(index) specialization below reads from the list that holds T.
struct routing_params {
  std::vector<int64_t> int_params;
  std::vector<uint64_t> uint_params;
  std::vector<double> double_params;
  std::vector<std::string> string_params;

  // Dumps all four lists to std::cerr, one list per line, every element
  // followed by ", ".
  void debug_print() const {
    std::cerr << "routing_params" << '\n';
    for (const auto& value : int_params) std::cerr << value << ", ";
    std::cerr << '\n';
    for (const auto& value : uint_params) std::cerr << value << ", ";
    std::cerr << '\n';
    for (const auto& value : double_params) std::cerr << value << ", ";
    std::cerr << '\n';
    for (const auto& value : string_params) std::cerr << value << ", ";
    std::cerr << '\n';
  }

  // Returns the `index`-th stored value of type T. Only the specializations
  // below are defined. The index is not range-checked.
  template <typename T>
  T get(unsigned) const;
};

template <>
inline int64_t routing_params::get<int64_t>(unsigned index) const {
  return int_params[index];
}

template <>
inline uint64_t routing_params::get<uint64_t>(unsigned index) const {
  return uint_params[index];
}

template <>
inline double routing_params::get<double>(unsigned index) const {
  return double_params[index];
}

template <>
inline std::string routing_params::get<std::string>(unsigned index) const {
  return string_params[index];
}
// Severity of a log record, ordered from least to most severe.
enum class LogLevel {
  Debug = 0,
  Info,
  Warning,
  Error,
  Critical,
};

// Sink interface for formatted log records.
class ILogHandler {
 public:
  // Handlers are used through ILogHandler pointers, so destruction through
  // the base type must reach the derived destructor.
  virtual ~ILogHandler() = default;

  // Receives one complete log message together with its severity.
  virtual void log(std::string message, LogLevel level) = 0;
};

// Handler that writes every message to std::cerr unchanged, ignoring the
// level.
class CerrLogHandler : public ILogHandler {
 public:
  void log(std::string message, LogLevel /*level*/) override {
    std::cerr << message;
  }
};
CROW_ENABLE_LOGGING 58 | stringstream_ << "(" << timestamp() << ") [" << prefix << "] "; 59 | #endif 60 | 61 | } 62 | ~logger() { 63 | #ifdef CROW_ENABLE_LOGGING 64 | if (level_ >= get_current_log_level()) { 65 | stringstream_ << std::endl; 66 | get_handler_ref()->log(stringstream_.str(), level_); 67 | } 68 | #endif 69 | } 70 | 71 | // 72 | template 73 | logger& operator<<(T const &value) { 74 | 75 | #ifdef CROW_ENABLE_LOGGING 76 | if (level_ >= get_current_log_level()) { 77 | stringstream_ << value; 78 | } 79 | #endif 80 | return *this; 81 | } 82 | 83 | // 84 | static void setLogLevel(LogLevel level) { 85 | get_log_level_ref() = level; 86 | } 87 | 88 | static void setHandler(ILogHandler* handler) { 89 | get_handler_ref() = handler; 90 | } 91 | 92 | static LogLevel get_current_log_level() { 93 | return get_log_level_ref(); 94 | } 95 | 96 | private: 97 | // 98 | static LogLevel& get_log_level_ref() { 99 | static LogLevel current_level = (LogLevel)CROW_LOG_LEVEL; 100 | return current_level; 101 | } 102 | static ILogHandler*& get_handler_ref() { 103 | static CerrLogHandler default_handler; 104 | static ILogHandler* current_handler = &default_handler; 105 | return current_handler; 106 | } 107 | 108 | // 109 | std::ostringstream stringstream_; 110 | LogLevel level_; 111 | }; 112 | } 113 | 114 | #define CROW_LOG_CRITICAL \ 115 | if (crow::logger::get_current_log_level() <= crow::LogLevel::Critical) \ 116 | crow::logger("CRITICAL", crow::LogLevel::Critical) 117 | #define CROW_LOG_ERROR \ 118 | if (crow::logger::get_current_log_level() <= crow::LogLevel::Error) \ 119 | crow::logger("ERROR ", crow::LogLevel::Error) 120 | #define CROW_LOG_WARNING \ 121 | if (crow::logger::get_current_log_level() <= crow::LogLevel::Warning) \ 122 | crow::logger("WARNING ", crow::LogLevel::Warning) 123 | #define CROW_LOG_INFO \ 124 | if (crow::logger::get_current_log_level() <= crow::LogLevel::Info) \ 125 | crow::logger("INFO ", crow::LogLevel::Info) 126 | #define CROW_LOG_DEBUG \ 127 | if 
(crow::logger::get_current_log_level() <= crow::LogLevel::Debug) \ 128 | crow::logger("DEBUG ", crow::LogLevel::Debug) 129 | 130 | -------------------------------------------------------------------------------- /kcws/cc/gen_seg_eval.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: gen_seg_eval.cc 5 | * Author: Koth 6 | * Create Time: 2016-11-29 09:26:39 7 | * Description: 8 | * 9 | */ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "base/base.h" 19 | #include "utils/basic_string_util.h" 20 | 21 | 22 | #include "tf_seg_model.h" //NOLINT 23 | #include "sentence_breaker.h" // NOLINT 24 | #include "tensorflow/core/platform/init_main.h" 25 | 26 | DEFINE_string(test_file, "", "the test file"); 27 | DEFINE_string(model_path, "", "the model path"); 28 | DEFINE_string(vocab_path, "", "vocab path"); 29 | 30 | DEFINE_int32(max_setence_len, 80, "max sentence len"); 31 | 32 | const int BATCH_SIZE = 2000; 33 | int load_test_file(const std::string& path, 34 | std::vector* pstrs) { 35 | FILE *fp = fopen(path.c_str(), "r"); 36 | if (fp == NULL) { 37 | VLOG(0) << "open file error:" << path; 38 | return 0; 39 | } 40 | char line[4096] = {0}; 41 | int tn = 0; 42 | while (fgets(line, sizeof(line) - 1, fp)) { 43 | int nn = strlen(line); 44 | while (nn && (line[nn - 1] == '\n' || line[nn - 1] == '\r')) { 45 | nn -= 1; 46 | } 47 | if (nn <= 0) { 48 | continue; 49 | } 50 | pstrs->push_back(std::string(line, nn)); 51 | tn += 1; 52 | } 53 | fclose(fp); 54 | return tn; 55 | } 56 | int main(int argc, char *argv[]) { 57 | tensorflow::port::InitMain(argv[0], &argc, &argv); 58 | google::ParseCommandLineFlags(&argc, &argv, true); 59 | if (FLAGS_vocab_path.empty()) { 60 | VLOG(0) << "basic bocab path is not set"; 61 | return 1; 
62 | } 63 | if (FLAGS_model_path.empty()) { 64 | VLOG(0) << " model path is not set"; 65 | return 1; 66 | } 67 | if (FLAGS_test_file.empty()) { 68 | VLOG(0) << " test_file path is not set"; 69 | return 1; 70 | } 71 | FILE* outfp = fopen("out_eval.txt", "w"); 72 | CHECK(outfp != nullptr) << "open file 'out_eval.txt' error"; 73 | kcws::TfSegModel sm; 74 | CHECK(sm.LoadModel(FLAGS_model_path, 75 | FLAGS_vocab_path, 76 | FLAGS_max_setence_len)) 77 | << "Load model error"; 78 | 79 | std::vector teststrs; 80 | int ns = load_test_file(FLAGS_test_file, &teststrs); 81 | std::string todo; 82 | for (int i = 0; i < ns; i++) { 83 | todo.append(teststrs[i]); 84 | } 85 | VLOG(0) << "loaded :" << FLAGS_test_file << " ,got " << ns << " lines"; 86 | 87 | auto start = std::chrono::steady_clock::now(); 88 | for (int i = 0; i < ns; i++) { 89 | // VLOG(0) << "do line:" << i; 90 | if (teststrs[i].empty()) { 91 | VLOG(0) << "empty line , continue"; 92 | continue; 93 | } 94 | std::vector results; 95 | CHECK(sm.Segment(teststrs[i], &results)) << "segment error"; 96 | int nr = results.size(); 97 | CHECK_NE(nr, 0); 98 | fprintf(outfp, "%s", results[0].c_str()); 99 | for (int i = 1; i < nr; i++) { 100 | fprintf(outfp, " %s", results[i].c_str()); 101 | } 102 | fprintf(outfp, "\n"); 103 | } 104 | auto duration = std::chrono::duration_cast 105 | (std::chrono::steady_clock::now() - start); 106 | VLOG(0) << "spend " << duration.count() << " milliseconds for file:" << FLAGS_test_file; 107 | 108 | return 0; 109 | } 110 | -------------------------------------------------------------------------------- /third_party/crow/tests/template/partials.yml: -------------------------------------------------------------------------------- 1 | overview: | 2 | Partial tags are used to expand an external template into the current 3 | template. 4 | 5 | The tag's content MUST be a non-whitespace character sequence NOT containing 6 | the current closing delimiter. 7 | 8 | This tag's content names the partial to inject. 
  Set Delimiter tags MUST NOT
  affect the parsing of a partial.  The partial MUST be rendered against the
  context stack local to the tag.  If the named partial cannot be found, the
  empty string SHOULD be used instead, as in interpolations.

  Partial tags SHOULD be treated as standalone when appropriate.  If this tag
  is used standalone, any whitespace preceding the tag should be treated as
  indentation, and prepended to each line of the partial before rendering.
56 | data: { data: '|' } 57 | template: " {{data}} {{> partial}}\n" 58 | partials: { partial: ">\n>" } 59 | expected: " | >\n>\n" 60 | 61 | - name: Standalone Line Endings 62 | desc: '"\r\n" should be considered a newline for standalone tags.' 63 | data: { } 64 | template: "|\r\n{{>partial}}\r\n|" 65 | partials: { partial: ">" } 66 | expected: "|\r\n>|" 67 | 68 | - name: Standalone Without Previous Line 69 | desc: Standalone tags should not require a newline to precede them. 70 | data: { } 71 | template: " {{>partial}}\n>" 72 | partials: { partial: ">\n>"} 73 | expected: " >\n >>" 74 | 75 | - name: Standalone Without Newline 76 | desc: Standalone tags should not require a newline to follow them. 77 | data: { } 78 | template: ">\n {{>partial}}" 79 | partials: { partial: ">\n>" } 80 | expected: ">\n >\n >" 81 | 82 | - name: Standalone Indentation 83 | desc: Each line of the partial should be indented before rendering. 84 | data: { content: "<\n->" } 85 | template: | 86 | \ 87 | {{>partial}} 88 | / 89 | partials: 90 | partial: | 91 | | 92 | {{{content}}} 93 | | 94 | expected: | 95 | \ 96 | | 97 | < 98 | -> 99 | | 100 | / 101 | 102 | # Whitespace Insensitivity 103 | 104 | - name: Padding Whitespace 105 | desc: Superfluous in-tag whitespace should be ignored. 
106 | data: { boolean: true } 107 | template: "|{{> partial }}|" 108 | partials: { partial: "[]" } 109 | expected: '|[]|' 110 | -------------------------------------------------------------------------------- /kcws/cc/BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//visibility:public"]) 2 | 3 | cc_library( 4 | name="tf_seg_model", 5 | srcs=[ 6 | "tf_seg_model.cc" 7 | ], 8 | hdrs=[ 9 | "tf_seg_model.h" 10 | ], 11 | deps=[ 12 | '//utils:basic_string_util', 13 | '//utils:basic_vocab', 14 | ':pos_tagger', 15 | ':sentence_breaker', 16 | ':ac_scanner', 17 | '@tf//:tensorflow', 18 | '@protobuf//:protobuf', 19 | ] 20 | ) 21 | 22 | cc_library( 23 | name="pos_tagger", 24 | srcs=[ 25 | "pos_tagger.cc" 26 | ], 27 | hdrs=[ 28 | "pos_tagger.h" 29 | ], 30 | deps=[ 31 | '//utils:basic_string_util', 32 | '//utils:basic_vocab', 33 | '//tfmodel:tfmodel', 34 | ':viterbi_decode', 35 | '@tf//:tensorflow', 36 | '@protobuf//:protobuf', 37 | ] 38 | ) 39 | 40 | cc_library( 41 | name="viterbi_decode", 42 | srcs=[ 43 | "viterbi_decode.cc" 44 | ], 45 | hdrs=[ 46 | "viterbi_decode.h" 47 | ], 48 | deps=[ 49 | '//utils:basic_string_util', 50 | '@tf//:tensorflow', 51 | ] 52 | ) 53 | 54 | cc_library( 55 | name="sentence_breaker", 56 | srcs=[ 57 | "sentence_breaker.cc" 58 | ], 59 | hdrs=[ 60 | "sentence_breaker.h" 61 | ], 62 | copts=[ 63 | "-Wno-writable-strings" 64 | ], 65 | deps=[ 66 | '//base:base', 67 | '//utils:basic_string_util', 68 | ], 69 | linkstatic=1, 70 | ) 71 | 72 | cc_binary( 73 | name = "test_breaker", 74 | srcs = [ 75 | "test_breaker.cc", 76 | ], 77 | copts = [ 78 | "-g", 79 | "-std=c++11", 80 | ], 81 | linkopts = [ 82 | "-ldl", 83 | "-lpthread", 84 | ], 85 | deps = [ 86 | ":sentence_breaker", 87 | "//base", 88 | ], 89 | ) 90 | 91 | py_binary( 92 | name = "dump_vocab", 93 | srcs = ["dump_vocab.py"], 94 | data = ["//utils:w2v.so"], 95 | imports = ["../../utils"], 96 | ) 97 | 98 | cc_binary( 99 | name = 
"test_seg", 100 | srcs = [ 101 | "test_seg.cc", 102 | ], 103 | copts = [ 104 | "-g", 105 | "-std=c++11", 106 | ], 107 | linkopts = [ 108 | "-ldl", 109 | "-lpthread", 110 | ], 111 | deps = [ 112 | ":tf_seg_model", 113 | "//base", 114 | ], 115 | ) 116 | 117 | cc_binary( 118 | name = "gen_seg_eval", 119 | srcs = [ 120 | "gen_seg_eval.cc", 121 | ], 122 | copts = [ 123 | "-g", 124 | "-std=c++11", 125 | ], 126 | linkopts = [ 127 | "-ldl", 128 | "-lpthread", 129 | ], 130 | deps = [ 131 | ":tf_seg_model", 132 | "//base", 133 | ], 134 | ) 135 | 136 | genrule( 137 | name = "demo_html_gen", 138 | srcs = ["demo.html"], 139 | outs = ["demo_html.h"], 140 | cmd = "xxd -i \"$<\" >\"$@\"", 141 | ) 142 | 143 | cc_library( 144 | name = "demo_html", 145 | srcs = ["demo_html.h"], 146 | data = [ 147 | ":demo_html_gen", 148 | ], 149 | ) 150 | 151 | cc_binary( 152 | name = "seg_backend_api", 153 | srcs = [ 154 | "seg_backend_api.cc", 155 | ], 156 | linkopts = ["-ldl"], 157 | deps = [ 158 | ":demo_html", 159 | ":tf_seg_model", 160 | "//base", 161 | "//third_party/crow", 162 | "//utils:jsonxx", 163 | ], 164 | ) 165 | 166 | cc_library( 167 | name = "ac_scanner", 168 | srcs = [ 169 | "ac_scanner.h", 170 | ], 171 | linkstatic = 1, 172 | deps = [ 173 | "//base", 174 | "//utils:basic_string_util", 175 | ], 176 | ) 177 | 178 | cc_binary( 179 | name = "test_ac_scanner", 180 | srcs = [ 181 | "test_ac_scanner.cc", 182 | ], 183 | copts = [ 184 | "-Wno-writable-strings", 185 | ], 186 | linkopts = ["-ldl","-pthread"], 187 | deps = [ 188 | ":ac_scanner", 189 | ], 190 | ) 191 | -------------------------------------------------------------------------------- /kcws/train/stats_pos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth 3 | # @Date: 2017-01-25 14:55:00 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2017-04-07 22:12:33 6 | 7 | import sys 8 | import os 9 | 10 | totalLine = 0 11 | longLine = 0 12 | maxLen = 
def main(argc, argv):
    """Walk a corpus directory and dump tokens plus a POS-tag vocabulary.

    argv[1] is the corpus root; every ``*.txt`` file below it is fed line by
    line to ``processLine``.  argv[3] receives the token/tag output written
    by ``processLine``; argv[2] receives one ``tag<TAB>index`` line per
    distinct tag in ``posMap``, with indices starting at 1.

    Exits with status 1 when fewer than three arguments are given.
    """
    if argc < 4:
        print("Usage:%s <corpus_dir> <tag_vocab_output> <token_output>" %
              (argv[0]))
        sys.exit(1)
    rootDir = argv[1]
    # Both outputs are opened up front so an unwritable path fails before
    # any corpus file is processed; "with" closes them even on error.
    with open(argv[3], "w") as out, open(argv[2], "w") as tagvobFp:
        for dirName, subdirList, fileList in os.walk(rootDir):
            for file in fileList:
                if not file.endswith(".txt"):
                    continue
                # os.walk already yields dirName prefixed with rootDir, so
                # joining rootDir again would duplicate a relative root.
                curFile = os.path.join(dirName, file)
                with open(curFile, "r") as fp:
                    for line in fp:
                        processLine(line.strip(), out)
        print("total:%d, long lines:%d" % (totalLine, longLine))
        print("total pos tags:%d" % (len(posMap)))
        # Iterating the dict directly works on both Python 2 and Python 3
        # (dict.iteritems() exists only on Python 2).
        for idx, k in enumerate(posMap):
            tagvobFp.write("%s\t%d\n" % (k, idx + 1))
58 | app.route_dynamic("/path/") 59 | ([](){ 60 | return "Trailing slash test case.."; 61 | }); 62 | 63 | // simple json response 64 | app.route_dynamic("/json") 65 | ([]{ 66 | crow::json::wvalue x; 67 | x["message"] = "Hello, World!"; 68 | return x; 69 | }); 70 | 71 | app.route_dynamic("/hello/") 72 | ([](int count){ 73 | if (count > 100) 74 | return crow::response(400); 75 | std::ostringstream os; 76 | os << count << " bottles of beer!"; 77 | return crow::response(os.str()); 78 | }); 79 | 80 | app.route_dynamic("/add//") 81 | ([](const crow::request& req, crow::response& res, int a, int b){ 82 | std::ostringstream os; 83 | os << a+b; 84 | res.write(os.str()); 85 | res.end(); 86 | }); 87 | 88 | // Compile error with message "Handler type is mismatched with URL paramters" 89 | //CROW_ROUTE(app,"/another/") 90 | //([](int a, int b){ 91 | //return crow::response(500); 92 | //}); 93 | 94 | // more json example 95 | app.route_dynamic("/add_json") 96 | .methods(crow::HTTPMethod::POST) 97 | ([](const crow::request& req){ 98 | auto x = crow::json::load(req.body); 99 | if (!x) 100 | return crow::response(400); 101 | auto sum = x["a"].i()+x["b"].i(); 102 | std::ostringstream os; 103 | os << sum; 104 | return crow::response{os.str()}; 105 | }); 106 | 107 | app.route_dynamic("/params") 108 | ([](const crow::request& req){ 109 | std::ostringstream os; 110 | os << "Params: " << req.url_params << "\n\n"; 111 | os << "The key 'foo' was " << (req.url_params.get("foo") == nullptr ? 
class Model:
    """Graph builder for an iterated dilated CNN tagger.

    Stores the hyper-parameters in ``__init__``; ``inference`` builds the
    TensorFlow ops that map an embedded input to per-position tag scores.
    """

    def __init__(self,
                 layers,
                 filterWidth,
                 numFilter,
                 embeddingDim,
                 maxSeqLen,
                 numTags,
                 repeatTimes=4):
        # layers: sequence of dicts; inference reads layers[i]['dilation'].
        self.layers = layers
        self.filter_width = filterWidth
        self.num_filter = numFilter
        self.embedding_dim = embeddingDim
        # Number of times the whole stack of dilated layers is applied.
        self.repeat_times = repeatTimes
        self.num_tags = numTags
        self.max_seq_len = maxSeqLen

    def inference(self, X, reuse=False):
        """Build the scoring graph for input ``X`` and return the scores.

        X is fed to conv2d with a [1, filter_width, embedding_dim,
        num_filter] filter and dimension 1 is squeezed later, so it is
        presumably [batch, 1, max_seq_len, embedding_dim] -- TODO confirm
        against the caller.

        reuse is passed to the variable scopes; it also selects the dropout
        keep probability (1.0 when truthy, 0.5 otherwise) and gives the
        final reshape the fixed name "Reshape_7".

        Returns a tensor reshaped to [-1, max_seq_len, num_tags].
        """
        with tf.variable_scope("idcnn", reuse=reuse):
            filter_weights = tf.get_variable(
                "idcnn_filter",
                shape=[1, self.filter_width, self.embedding_dim,
                       self.num_filter],
                initializer=tf.contrib.layers.xavier_initializer())
            # Initial, non-dilated convolution over the embedded input.
            layerInput = tf.nn.conv2d(X,
                                      filter_weights,
                                      strides=[1, 1, 1, 1],
                                      padding="SAME",
                                      name="init_layer")
            finalOutFromLayers = []
            totalWidthForLastDim = 0
            for j in range(self.repeat_times):
                for i in range(len(self.layers)):
                    dilation = self.layers[i]['dilation']
                    isLast = True if i == (len(self.layers) - 1) else False
                    # The scope name depends only on i, and reuse is forced
                    # for j > 0, so every repeat shares the same filterW and
                    # filterB variables for layer i.
                    with tf.variable_scope("atrous-conv-layer-%d" % i,
                                           reuse=True
                                           if (reuse or j > 0) else False):
                        w = tf.get_variable(
                            "filterW",
                            shape=[1, self.filter_width, self.num_filter,
                                   self.num_filter],
                            initializer=tf.contrib.layers.xavier_initializer())
                        b = tf.get_variable("filterB", shape=[self.num_filter])
                        conv = tf.nn.atrous_conv2d(layerInput,
                                                   w,
                                                   rate=dilation,
                                                   padding="SAME")
                        conv = tf.nn.bias_add(conv, b)
                        conv = tf.nn.relu(conv)
                        # Only the output of the last layer of each repeat
                        # is kept for the final concatenation.
                        if isLast:
                            finalOutFromLayers.append(conv)
                            totalWidthForLastDim += self.num_filter
                        layerInput = conv
            # Concatenate the kept outputs along the channel axis.
            finalOut = tf.concat(axis=3, values=finalOutFromLayers)
            keepProb = 1.0 if reuse else 0.5
            finalOut = tf.nn.dropout(finalOut, keepProb)

            finalOut = tf.squeeze(finalOut, [1])
            # Flatten to one row per position for the linear projection.
            finalOut = tf.reshape(finalOut, [-1, totalWidthForLastDim])

            finalW = tf.get_variable(
                "finalW",
                shape=[totalWidthForLastDim, self.num_tags],
                initializer=tf.contrib.layers.xavier_initializer())

            finalB = tf.get_variable("finalB",
                                     initializer=tf.constant(
                                         0.001, shape=[self.num_tags]))

            scores = tf.nn.xw_plus_b(finalOut, finalW, finalB, name="scores")
        # NOTE(review): the explicit op name "Reshape_7" looks like a name a
        # downstream consumer of the exported graph depends on -- confirm
        # before renaming.
        if reuse:
            scores = tf.reshape(scores, [-1, self.max_seq_len, self.num_tags],
                                name="Reshape_7")
        else:
            scores = tf.reshape(scores, [-1, self.max_seq_len, self.num_tags],
                                name=None)
        return scores
3 | * ===================================================================================== 4 | * Filename: seg_backend_api.cc 5 | * Author: Koth 6 | * Create Time: 2016-11-20 20:43:26 7 | * Description: 8 | * 9 | */ 10 | #include 11 | #include 12 | #include 13 | 14 | #include "base/base.h" 15 | #include "utils/jsonxx.h" 16 | #include "utils/basic_string_util.h" 17 | #include "kcws/cc/demo_html.h" 18 | #include "kcws/cc/tf_seg_model.h" 19 | #include "kcws/cc/pos_tagger.h" 20 | #include "third_party/crow/include/crow.h" 21 | #include "tensorflow/core/platform/init_main.h" 22 | 23 | DEFINE_int32(port, 9090, "the api serving binding port"); 24 | DEFINE_string(model_path, "kcws/models/seg_model.pbtxt", "the model path"); 25 | DEFINE_string(vocab_path, "kcws/models/basic_vocab.txt", "char vocab path"); 26 | DEFINE_string(pos_model_path, "kcws/models/pos_model.pbtxt", "the pos tagging model path"); 27 | DEFINE_string(word_vocab_path, "kcws/models/word_vocab.txt", "word vocab path"); 28 | DEFINE_string(pos_vocab_path, "kcws/models/pos_vocab.txt", "pos vocab path"); 29 | DEFINE_int32(max_sentence_len, 80, "max sentence len "); 30 | DEFINE_string(user_dict_path, "", "user dict path"); 31 | DEFINE_int32(max_word_num, 50, "max num of word per sentence "); 32 | class SegMiddleware { 33 | public: 34 | struct context {}; 35 | SegMiddleware() {} 36 | ~SegMiddleware() {} 37 | void before_handle(crow::request& req, crow::response& res, context& ctx) {} 38 | void after_handle(crow::request& req, crow::response& res, context& ctx) {} 39 | private: 40 | }; 41 | int main(int argc, char* argv[]) { 42 | tensorflow::port::InitMain(argv[0], &argc, &argv); 43 | google::ParseCommandLineFlags(&argc, &argv, true); 44 | crow::App app; 45 | kcws::TfSegModel model; 46 | CHECK(model.LoadModel(FLAGS_model_path, 47 | FLAGS_vocab_path, 48 | FLAGS_max_sentence_len, 49 | FLAGS_user_dict_path)) 50 | << "Load model error"; 51 | if (!FLAGS_pos_model_path.empty()) { 52 | kcws::PosTagger* tagger = new 
kcws::PosTagger; 53 | CHECK(tagger->LoadModel(FLAGS_pos_model_path, 54 | FLAGS_word_vocab_path, 55 | FLAGS_vocab_path, 56 | FLAGS_pos_vocab_path, 57 | FLAGS_max_word_num)) << "load pos model error"; 58 | model.SetPosTagger(tagger); 59 | } 60 | CROW_ROUTE(app, "/tf_seg/api").methods("POST"_method) 61 | ([&model](const crow::request & req) { 62 | jsonxx::Object obj; 63 | int status = -1; 64 | std::string desc = "OK"; 65 | std::string gotReqBody = req.body; 66 | VLOG(0) << "got body:"; 67 | fprintf(stderr, "%s\n", gotReqBody.c_str()); 68 | jsonxx::Object toRet; 69 | if (obj.parse(gotReqBody) && obj.has("sentence")) { 70 | std::string sentence = obj.get("sentence"); 71 | std::vector result; 72 | std::vector tags; 73 | if (model.Segment(sentence, &result, &tags)) { 74 | status = 0; 75 | jsonxx::Array rarr; 76 | if (result.size() == tags.size()) { 77 | int nl = result.size(); 78 | for (int i = 0; i < nl; i++) { 79 | jsonxx::Object obj; 80 | obj << "tok" << result[i]; 81 | obj << "pos" << tags[i]; 82 | rarr << obj; 83 | } 84 | } else { 85 | for (std::string str : result) { 86 | rarr << str; 87 | } 88 | } 89 | toRet << "segments" << rarr; 90 | } 91 | } else { 92 | desc = "Parse request error"; 93 | } 94 | toRet << "status" << status; 95 | toRet << "msg" << desc; 96 | return crow::response(toRet.json()); 97 | }); 98 | CROW_ROUTE(app, "/")([](const crow::request & req) { 99 | return crow::response(std::string(reinterpret_cast(&kcws_cc_demo_html[0]), kcws_cc_demo_html_len)); 100 | }); 101 | app.port(FLAGS_port).multithreaded().run(); 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /util/python/python_config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
# setup_python PYTHON_BIN_PATH
#
# Validates the given python binary, symlinks its include and site-packages
# directories to util/python/python_include and util/python/python_lib, then
# generates tools/bazel.rc (from tools/bazel.rc.template) and
# tools/python_bin_path.sh. Exits with status 1 and a message on stderr when
# the binary is missing, not executable, or cannot report its configuration.
function setup_python {
  PYTHON_BIN_PATH="$1";

  if [ -z "$PYTHON_BIN_PATH" ]; then
    echo "PYTHON_BIN_PATH was not provided. Did you run configure?" 1>&2
    exit 1
  fi
  if [ ! -x "$PYTHON_BIN_PATH" ] || [ -d "$PYTHON_BIN_PATH" ]; then
    echo "PYTHON_BIN_PATH is not executable. Is it the python binary?" 1>&2
    exit 1
  fi

  # "local var=$(cmd)" deliberately hides the command's exit status from
  # "set -e" so that the friendlier messages below are printed instead.
  local python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);')
  if [ "$python_major_version" == "" ]; then
    echo -e "\n\nERROR: Problem getting python version. Is $PYTHON_BIN_PATH the correct python binary?" 1>&2
    exit 1
  fi

  local python_include=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; from distutils import sysconfig; print(sysconfig.get_python_inc());')
  if [ "$python_include" == "" ]; then
    echo -e "\n\nERROR: Problem getting python include path. Is distutils installed?" 1>&2
    exit 1
  fi
  local python_lib=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; from distutils import sysconfig; print(sysconfig.get_python_lib());')
  if [ "$python_lib" == "" ]; then
    echo -e "\n\nERROR: Problem getting python lib path. Is distutils installed?" 1>&2
    exit 1
  fi

  # Remove links from a previous run; otherwise "ln -sf" would create the new
  # link inside the directory the old link points to.
  for x in $EXPECTED_PATHS; do
    if [ -e "$x" ]; then
      rm "$x"
    fi
  done

  ln -sf "${python_include}" util/python/python_include
  ln -sf "${python_lib}" util/python/python_lib

  # Write tools/bazel.rc
  echo "# Autogenerated by configure: DO NOT EDIT" > tools/bazel.rc
  sed -e "s/\$PYTHON_MAJOR_VERSION/$python_major_version/g" \
      -e "s[\$PYTHON_BINARY[$PYTHON_BIN_PATH[g" \
      tools/bazel.rc.template >> tools/bazel.rc
  # Write tools/python_bin_path.sh; the value is quoted so that a path
  # containing spaces survives being sourced.
  echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh
}
-d "${true_path}" ]; then 107 | echo -e "\n\nERROR: '${x}' does not refer to an existing directory: ${true_path}. Do you need to rerun configure?\n\n" 1>&2 108 | exit 1 109 | fi 110 | done 111 | } 112 | 113 | main "$@" 114 | -------------------------------------------------------------------------------- /kcws/cc/test_seg.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016- 2018 Koth. All Rights Reserved. 3 | * ===================================================================================== 4 | * Filename: test_seg.cc 5 | * Author: Koth 6 | * Create Time: 2016-11-20 12:13:21 7 | * Description: 8 | * 9 | */ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "base/base.h" 19 | #include "utils/basic_string_util.h" 20 | 21 | 22 | #include "tf_seg_model.h" //NOLINT 23 | #include "sentence_breaker.h" // NOLINT 24 | #include "tensorflow/core/platform/init_main.h" 25 | 26 | DEFINE_string(test_sentence, "", "the test string"); 27 | DEFINE_string(test_file, "", "the test file"); 28 | DEFINE_string(model_path, "", "the model path"); 29 | DEFINE_string(vocab_path, "", "vocab path"); 30 | DEFINE_string(user_dict_path, "", "user dict path"); 31 | DEFINE_int32(max_setence_len, 80, "max sentence len"); 32 | 33 | const int BATCH_SIZE = 2000; 34 | int load_test_file(const std::string& path, 35 | std::vector* pstrs) { 36 | FILE *fp = fopen(path.c_str(), "r"); 37 | if (fp == NULL) { 38 | VLOG(0) << "open file error:" << path; 39 | return 0; 40 | } 41 | char line[4096] = {0}; 42 | int tn = 0; 43 | while (fgets(line, sizeof(line) - 1, fp)) { 44 | int nn = strlen(line); 45 | while (nn && (line[nn - 1] == '\n' || line[nn - 1] == '\r')) { 46 | nn -= 1; 47 | } 48 | if (nn <= 0) { 49 | continue; 50 | } 51 | pstrs->push_back(std::string(line, nn)); 52 | tn += 1; 53 | } 54 | fclose(fp); 55 | return tn; 56 | } 57 | int main(int argc, char *argv[]) { 58 | 
tensorflow::port::InitMain(argv[0], &argc, &argv); 59 | google::ParseCommandLineFlags(&argc, &argv, true); 60 | if (FLAGS_vocab_path.empty()) { 61 | VLOG(0) << "basic bocab path is not set"; 62 | return 1; 63 | } 64 | if (FLAGS_model_path.empty()) { 65 | VLOG(0) << " model path is not set"; 66 | return 1; 67 | } 68 | kcws::TfSegModel sm; 69 | CHECK(sm.LoadModel(FLAGS_model_path, 70 | FLAGS_vocab_path, 71 | FLAGS_max_setence_len, 72 | FLAGS_user_dict_path)) 73 | << "Load model error"; 74 | if (!FLAGS_test_sentence.empty()) { 75 | std::vector results; 76 | CHECK(sm.Segment(FLAGS_test_sentence, &results)) << "segment error"; 77 | VLOG(0) << "results is :"; 78 | for (auto str : results) { 79 | VLOG(0) << str; 80 | } 81 | } else if (!FLAGS_test_file.empty()) { 82 | kcws::SentenceBreaker breaker(FLAGS_max_setence_len); 83 | std::vector teststrs; 84 | int ns = load_test_file(FLAGS_test_file, &teststrs); 85 | std::string todo; 86 | for (int i = 0; i < ns; i++) { 87 | todo.append(teststrs[i]); 88 | } 89 | UnicodeStr utodo; 90 | BasicStringUtil::u8tou16(todo.c_str(), todo.size(), utodo); 91 | std::vector sentences; 92 | breaker.breakSentences(utodo, &sentences); 93 | 94 | VLOG(0) << "loaded :" << FLAGS_test_file << " ,got " << ns << " lines," 95 | << sentences.size() << " sentences, " << utodo.size() << " characters"; 96 | int batch = (sentences.size() - 1) / BATCH_SIZE + 1; 97 | 98 | auto start = std::chrono::steady_clock::now(); 99 | for (int i = 0; i < batch; i++) { 100 | // VLOG(0) << "seg batch:" << i; 101 | int end = BATCH_SIZE * (i + 1); 102 | if (end > static_cast(sentences.size())) { 103 | end = sentences.size(); 104 | } 105 | std::vector> results; 106 | std::vector todoSentences(sentences.begin() + (BATCH_SIZE * i), sentences.begin() + end); 107 | CHECK(sm.Segment(todoSentences, &results)) << "segment error"; 108 | } 109 | auto duration = std::chrono::duration_cast 110 | (std::chrono::steady_clock::now() - start); 111 | VLOG(0) << "spend " << duration.count() << " 
milliseconds for file:" << FLAGS_test_file; 112 | } else { 113 | VLOG(0) << "either test sentence or test file should be set"; 114 | return 1; 115 | } 116 | 117 | return 0; 118 | } 119 | -------------------------------------------------------------------------------- /third_party/crow/README.md: -------------------------------------------------------------------------------- 1 | ![Crow logo](http://i.imgur.com/wqivvjK.jpg) 2 | 3 | Crow is a C++ microframework for the web. (inspired by Python Flask) 4 | 5 | [![Travis Build](https://travis-ci.org/ipkn/crow.svg?branch=master)](https://travis-ci.org/ipkn/crow) 6 | [![Coverage Status](https://coveralls.io/repos/ipkn/crow/badge.svg?branch=master)](https://coveralls.io/r/ipkn/crow?branch=master) 7 | 8 | ```c++ 9 | #include "crow.h" 10 | 11 | int main() 12 | { 13 | crow::SimpleApp app; 14 | 15 | CROW_ROUTE(app, "/")([](){ 16 | return "Hello world"; 17 | }); 18 | 19 | app.port(18080).multithreaded().run(); 20 | } 21 | ``` 22 | 23 | ## Features 24 | 25 | - Easy routing 26 | - Similar to Flask 27 | - Type-safe Handlers (see Example) 28 | - Very Fast 29 | - ![Benchmark Result in one chart](https://docs.google.com/spreadsheets/d/1KidO9XpuwCRZ2p_JRDJj2aep61H8Sh_KDOhApizv4LE/pubchart?oid=2041467789&format=image) 30 | - More data on [crow-benchmark](https://github.com/ipkn/crow-benchmark) 31 | - Fast built-in JSON parser (crow::json) 32 | - [Mustache](http://mustache.github.io/) based templating library (crow::mustache) 33 | - Header only 34 | - Provides an amalgamated header file `crow_all.h` with all features 35 | - Middleware support 36 | 37 | ## Still in development 38 | - ~~Built-in ORM~~ 39 | - Check [sqlpp11](https://github.com/rbock/sqlpp11) if you want one. 
40 | 41 | ## Examples 42 | 43 | #### JSON Response 44 | ```c++ 45 | CROW_ROUTE(app, "/json") 46 | ([]{ 47 | crow::json::wvalue x; 48 | x["message"] = "Hello, World!"; 49 | return x; 50 | }); 51 | ``` 52 | 53 | #### Arguments 54 | ```c++ 55 | CROW_ROUTE(app,"/hello/<int>") 56 | ([](int count){ 57 | if (count > 100) 58 | return crow::response(400); 59 | std::ostringstream os; 60 | os << count << " bottles of beer!"; 61 | return crow::response(os.str()); 62 | }); 63 | ``` 64 | Handler argument types are checked at compile time 65 | ```c++ 66 | // Compile error with message "Handler type is mismatched with URL parameters" 67 | CROW_ROUTE(app,"/another/<int>") 68 | ([](int a, int b){ 69 | return crow::response(500); 70 | }); 71 | ``` 72 | 73 | #### Handling JSON Requests 74 | ```c++ 75 | CROW_ROUTE(app, "/add_json") 76 | .methods("POST"_method) 77 | ([](const crow::request& req){ 78 | auto x = crow::json::load(req.body); 79 | if (!x) 80 | return crow::response(400); 81 | int sum = x["a"].i()+x["b"].i(); 82 | std::ostringstream os; 83 | os << sum; 84 | return crow::response{os.str()}; 85 | }); 86 | ``` 87 | 88 | ## How to Build 89 | 90 | If you just want to use crow, copy amalgamate/crow_all.h and include it. 91 | 92 | ### Requirements 93 | 94 | - C++ compiler with good C++11 support (tested with g++>=4.8) 95 | - boost library 96 | - CMake for building the examples 97 | - Linking with tcmalloc/jemalloc is recommended for speed. 98 | 99 | - Now supporting VS2013 with limited functionality (only run-time check for url is available.) 100 | 101 | ### Building (Tests, Examples) 102 | 103 | Out-of-source build with CMake is recommended. 104 | 105 | ``` 106 | mkdir build 107 | cd build 108 | cmake .. 
109 | make 110 | ``` 111 | 112 | You can run the tests with the following command: 113 | ``` 114 | ctest 115 | ``` 116 | 117 | 118 | ### Installing missing dependencies 119 | 120 | #### Ubuntu 121 | sudo apt-get install build-essential libtcmalloc-minimal4 && sudo ln -s /usr/lib/libtcmalloc_minimal.so.4 /usr/lib/libtcmalloc_minimal.so 122 | 123 | #### OSX 124 | brew install boost google-perftools 125 | 126 | ### Attributions 127 | 128 | Crow uses the following libraries. 129 | 130 | qs_parse 131 | 132 | https://github.com/bartgrantham/qs_parse 133 | 134 | Copyright (c) 2010 Bart Grantham 135 | Permission is hereby granted, free of charge, to any person obtaining a copy 136 | of this software and associated documentation files (the "Software"), to deal 137 | in the Software without restriction, including without limitation the rights 138 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 139 | copies of the Software, and to permit persons to whom the Software is 140 | furnished to do so, subject to the following conditions: 141 | The above copyright notice and this permission notice shall be included in 142 | all copies or substantial portions of the Software. 
143 | 144 | -------------------------------------------------------------------------------- /kcws/train/generate_training.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Koth Chen 3 | # @Date: 2016-10-21 16:17:53 4 | # @Last Modified by: Koth 5 | # @Last Modified time: 2017-01-25 16:54:11 6 | 7 | import sys 8 | import os 9 | import w2v 10 | from sentence import Sentence 11 | 12 | totalLine = 0 13 | longLine = 0 14 | 15 | MAX_LEN = 80 16 | totalChars = 0 17 | 18 | 19 | def processToken(token, sentence, out, end, vob): 20 | global totalLine 21 | global longLine 22 | global totalChars 23 | global MAX_LEN 24 | nn = len(token) 25 | while nn > 0 and token[nn - 1] != '/': 26 | nn = nn - 1 27 | 28 | token = token[:nn - 1].strip() 29 | if token != '。': 30 | ustr = unicode(token.decode('utf8')) 31 | sentence.addToken(ustr) 32 | uline = u'' 33 | if token == '。' or end: 34 | if sentence.chars > MAX_LEN: 35 | longLine += 1 36 | else: 37 | x = [] 38 | y = [] 39 | totalChars += sentence.chars 40 | sentence.generate_tr_line(x, y, vob) 41 | nn = len(x) 42 | assert (nn == len(y)) 43 | for j in range(nn, MAX_LEN): 44 | x.append(0) 45 | y.append(0) 46 | line = '' 47 | for i in range(MAX_LEN): 48 | if i > 0: 49 | line += " " 50 | line += str(x[i]) 51 | for j in range(MAX_LEN): 52 | line += " " + str(y[j]) 53 | out.write("%s\n" % (line)) 54 | totalLine += 1 55 | sentence.clear() 56 | 57 | 58 | def processLine(line, out, vob): 59 | line = line.strip() 60 | nn = len(line) 61 | seeLeftB = False 62 | start = 0 63 | sentence = Sentence() 64 | try: 65 | for i in range(nn): 66 | if line[i] == ' ': 67 | if not seeLeftB: 68 | token = line[start:i] 69 | if token.startswith('['): 70 | tokenLen = len(token) 71 | while tokenLen > 0 and token[tokenLen - 1] != ']': 72 | tokenLen = tokenLen - 1 73 | token = token[1:tokenLen - 1] 74 | ss = token.split(' ') 75 | for s in ss: 76 | processToken(s, sentence, out, False, vob) 77 
| else: 78 | processToken(token, sentence, out, False, vob) 79 | start = i + 1 80 | elif line[i] == '[': 81 | seeLeftB = True 82 | elif line[i] == ']': 83 | seeLeftB = False 84 | if start < nn: 85 | token = line[start:] 86 | if token.startswith('['): 87 | tokenLen = len(token) 88 | while tokenLen > 0 and token[tokenLen - 1] != ']': 89 | tokenLen = tokenLen - 1 90 | token = token[1:tokenLen - 1] 91 | ss = token.split(' ') 92 | ns = len(ss) 93 | for i in range(ns - 1): 94 | processToken(ss[i], sentence, out, False, vob) 95 | processToken(ss[-1], sentence, out, True, vob) 96 | else: 97 | processToken(token, sentence, out, True, vob) 98 | except Exception as e: 99 | pass 100 | 101 | 102 | def main(argc, argv): 103 | global totalLine 104 | global longLine 105 | global totalChars 106 | if argc < 4: 107 | print("Usage:%s " % (argv[0])) 108 | sys.exit(1) 109 | vobPath = argv[1] 110 | rootDir = argv[2] 111 | vob = w2v.Word2vecVocab() 112 | vob.Load(vobPath) 113 | out = open(argv[3], "w") 114 | for dirName, subdirList, fileList in os.walk(rootDir): 115 | curDir = os.path.join(rootDir, dirName) 116 | for file in fileList: 117 | if file.endswith(".txt"): 118 | curFile = os.path.join(curDir, file) 119 | #print("processing:%s" % (curFile)) 120 | fp = open(curFile, "r") 121 | for line in fp.readlines(): 122 | line = line.strip() 123 | processLine(line, out, vob) 124 | fp.close() 125 | out.close() 126 | print("total:%d, long lines:%d, chars:%d" % 127 | (totalLine, longLine, totalChars)) 128 | 129 | 130 | if __name__ == '__main__': 131 | main(len(sys.argv), sys.argv) 132 | -------------------------------------------------------------------------------- /third_party/crow/tests/template/delimiters.yml: -------------------------------------------------------------------------------- 1 | overview: | 2 | Set Delimiter tags are used to change the tag delimiters for all content 3 | following the tag in the current compilation unit. 
4 | 5 | The tag's content MUST be any two non-whitespace sequences (separated by 6 | whitespace) EXCEPT an equals sign ('=') followed by the current closing 7 | delimiter. 8 | 9 | Set Delimiter tags SHOULD be treated as standalone when appropriate. 10 | tests: 11 | - name: Pair Behavior 12 | desc: The equals sign (used on both sides) should permit delimiter changes. 13 | data: { text: 'Hey!' } 14 | template: '{{=<% %>=}}(<%text%>)' 15 | expected: '(Hey!)' 16 | 17 | - name: Special Characters 18 | desc: Characters with special meaning regexen should be valid delimiters. 19 | data: { text: 'It worked!' } 20 | template: '({{=[ ]=}}[text])' 21 | expected: '(It worked!)' 22 | 23 | - name: Sections 24 | desc: Delimiters set outside sections should persist. 25 | data: { section: true, data: 'I got interpolated.' } 26 | template: | 27 | [ 28 | {{#section}} 29 | {{data}} 30 | |data| 31 | {{/section}} 32 | 33 | {{= | | =}} 34 | |#section| 35 | {{data}} 36 | |data| 37 | |/section| 38 | ] 39 | expected: | 40 | [ 41 | I got interpolated. 42 | |data| 43 | 44 | {{data}} 45 | I got interpolated. 46 | ] 47 | 48 | - name: Inverted Sections 49 | desc: Delimiters set outside inverted sections should persist. 50 | data: { section: false, data: 'I got interpolated.' } 51 | template: | 52 | [ 53 | {{^section}} 54 | {{data}} 55 | |data| 56 | {{/section}} 57 | 58 | {{= | | =}} 59 | |^section| 60 | {{data}} 61 | |data| 62 | |/section| 63 | ] 64 | expected: | 65 | [ 66 | I got interpolated. 67 | |data| 68 | 69 | {{data}} 70 | I got interpolated. 71 | ] 72 | 73 | - name: Partial Inheritence 74 | desc: Delimiters set in a parent template should not affect a partial. 75 | data: { value: 'yes' } 76 | partials: 77 | include: '.{{value}}.' 78 | template: | 79 | [ {{>include}} ] 80 | {{= | | =}} 81 | [ |>include| ] 82 | expected: | 83 | [ .yes. ] 84 | [ .yes. ] 85 | 86 | - name: Post-Partial Behavior 87 | desc: Delimiters set in a partial should not affect the parent template. 
88 | data: { value: 'yes' } 89 | partials: 90 | include: '.{{value}}. {{= | | =}} .|value|.' 91 | template: | 92 | [ {{>include}} ] 93 | [ .{{value}}. .|value|. ] 94 | expected: | 95 | [ .yes. .yes. ] 96 | [ .yes. .|value|. ] 97 | 98 | # Whitespace Sensitivity 99 | 100 | - name: Surrounding Whitespace 101 | desc: Surrounding whitespace should be left untouched. 102 | data: { } 103 | template: '| {{=@ @=}} |' 104 | expected: '| |' 105 | 106 | - name: Outlying Whitespace (Inline) 107 | desc: Whitespace should be left untouched. 108 | data: { } 109 | template: " | {{=@ @=}}\n" 110 | expected: " | \n" 111 | 112 | - name: Standalone Tag 113 | desc: Standalone lines should be removed from the template. 114 | data: { } 115 | template: | 116 | Begin. 117 | {{=@ @=}} 118 | End. 119 | expected: | 120 | Begin. 121 | End. 122 | 123 | - name: Indented Standalone Tag 124 | desc: Indented standalone lines should be removed from the template. 125 | data: { } 126 | template: | 127 | Begin. 128 | {{=@ @=}} 129 | End. 130 | expected: | 131 | Begin. 132 | End. 133 | 134 | - name: Standalone Line Endings 135 | desc: '"\r\n" should be considered a newline for standalone tags.' 136 | data: { } 137 | template: "|\r\n{{= @ @ =}}\r\n|" 138 | expected: "|\r\n|" 139 | 140 | - name: Standalone Without Previous Line 141 | desc: Standalone tags should not require a newline to precede them. 142 | data: { } 143 | template: " {{=@ @=}}\n=" 144 | expected: "=" 145 | 146 | - name: Standalone Without Newline 147 | desc: Standalone tags should not require a newline to follow them. 148 | data: { } 149 | template: "=\n {{=@ @=}}" 150 | expected: "=\n" 151 | 152 | # Whitespace Insensitivity 153 | 154 | - name: Pair with Padding 155 | desc: Superfluous in-tag whitespace should be ignored. 
156 | data: { } 157 | template: '|{{= @ @ =}}|' 158 | expected: '||' 159 | -------------------------------------------------------------------------------- /third_party/crow/include/crow/parser.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "third_party/crow/include/crow/http_parser_merged.h" 10 | #include "third_party/crow/include/crow/http_request.h" 11 | 12 | namespace crow { 13 | template 14 | struct HTTPParser : public http_parser { 15 | static int on_message_begin(http_parser* self_) { 16 | HTTPParser* self = static_cast(self_); 17 | self->clear(); 18 | return 0; 19 | } 20 | static int on_url(http_parser* self_, const char* at, size_t length) { 21 | HTTPParser* self = static_cast(self_); 22 | self->raw_url.insert(self->raw_url.end(), at, at + length); 23 | return 0; 24 | } 25 | static int on_header_field(http_parser* self_, const char* at, size_t length) { 26 | HTTPParser* self = static_cast(self_); 27 | switch (self->header_building_state) { 28 | case 0: 29 | if (!self->header_value.empty()) { 30 | self->headers.emplace(std::move(self->header_field), std::move(self->header_value)); 31 | } 32 | self->header_field.assign(at, at + length); 33 | self->header_building_state = 1; 34 | break; 35 | case 1: 36 | self->header_field.insert(self->header_field.end(), at, at + length); 37 | break; 38 | } 39 | return 0; 40 | } 41 | static int on_header_value(http_parser* self_, const char* at, size_t length) { 42 | HTTPParser* self = static_cast(self_); 43 | switch (self->header_building_state) { 44 | case 0: 45 | self->header_value.insert(self->header_value.end(), at, at + length); 46 | break; 47 | case 1: 48 | self->header_building_state = 0; 49 | self->header_value.assign(at, at + length); 50 | break; 51 | } 52 | return 0; 53 | } 54 | static int on_headers_complete(http_parser* self_) { 55 | HTTPParser* self = static_cast(self_); 56 
| if (!self->header_field.empty()) { 57 | self->headers.emplace(std::move(self->header_field), std::move(self->header_value)); 58 | } 59 | self->process_header(); 60 | return 0; 61 | } 62 | static int on_body(http_parser* self_, const char* at, size_t length) { 63 | HTTPParser* self = static_cast(self_); 64 | self->body.insert(self->body.end(), at, at + length); 65 | return 0; 66 | } 67 | static int on_message_complete(http_parser* self_) { 68 | HTTPParser* self = static_cast(self_); 69 | 70 | // url params 71 | self->url = self->raw_url.substr(0, self->raw_url.find("?")); 72 | self->url_params = query_string(self->raw_url); 73 | 74 | self->process_message(); 75 | return 0; 76 | } 77 | HTTPParser(Handler* handler) : 78 | handler_(handler) { 79 | http_parser_init(this, HTTP_REQUEST); 80 | } 81 | 82 | // return false on error 83 | bool feed(const char* buffer, int length) { 84 | const static http_parser_settings settings_{ 85 | on_message_begin, 86 | on_url, 87 | nullptr, 88 | on_header_field, 89 | on_header_value, 90 | on_headers_complete, 91 | on_body, 92 | on_message_complete, 93 | }; 94 | 95 | int nparsed = http_parser_execute(this, &settings_, buffer, length); 96 | return nparsed == length; 97 | } 98 | 99 | bool done() { 100 | return feed(nullptr, 0); 101 | } 102 | 103 | void clear() { 104 | url.clear(); 105 | raw_url.clear(); 106 | header_building_state = 0; 107 | header_field.clear(); 108 | header_value.clear(); 109 | headers.clear(); 110 | url_params.clear(); 111 | body.clear(); 112 | } 113 | 114 | void process_header() { 115 | handler_->handle_header(); 116 | } 117 | 118 | void process_message() { 119 | handler_->handle(); 120 | } 121 | 122 | request to_request() const { 123 | return request{(HTTPMethod)method, std::move(raw_url), std::move(url), std::move(url_params), std::move(headers), std::move(body)}; 124 | } 125 | 126 | bool is_upgrade() const { 127 | return upgrade; 128 | } 129 | 130 | bool check_version(int major, int minor) const { 131 | return 
http_major == major && http_minor == minor; 132 | } 133 | 134 | std::string raw_url; 135 | std::string url; 136 | 137 | int header_building_state = 0; 138 | std::string header_field; 139 | std::string header_value; 140 | ci_map headers; 141 | query_string url_params; 142 | std::string body; 143 | 144 | Handler* handler_; 145 | }; 146 | } 147 | --------------------------------------------------------------------------------