├── tools
    ├── dummy
    └── bazel.rc.template
├── third_party
    ├── boost
    │   ├── BUILD
    │   └── boost.bzl
    ├── crow
    │   ├── .gitmodules
    │   ├── tests
    │   │   ├── template
    │   │   │   ├── README.template_test
    │   │   │   ├── Makefile
    │   │   │   ├── mustachetest.cpp
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── test.py
    │   │   │   ├── comments.json
    │   │   │   ├── partials.json
    │   │   │   ├── comments.yml
    │   │   │   ├── delimiters.json
    │   │   │   ├── partials.yml
    │   │   │   └── delimiters.yml
    │   │   └── CMakeLists.txt
    │   ├── examples
    │   │   ├── helloworld.cpp
    │   │   ├── example.py
    │   │   ├── ssl
    │   │   │   └── example_ssl.cpp
    │   │   ├── websocket
    │   │   │   ├── templates
    │   │   │   │   └── ws.html
    │   │   │   └── example_ws.cpp
    │   │   ├── example_chat.html
    │   │   ├── example_test.py
    │   │   ├── CMakeLists.txt
    │   │   ├── example_chat.cpp
    │   │   ├── example_with_all.cpp
    │   │   └── example_vs.cpp
    │   ├── .gitignore
    │   ├── BUILD
    │   ├── include
    │   │   ├── crow.h
    │   │   └── crow
    │   │   │   ├── settings.h
    │   │   │   ├── ci_map.h
    │   │   │   ├── http_request.h
    │   │   │   ├── dumb_timer_queue.h
    │   │   │   ├── socket_adaptors.h
    │   │   │   ├── middleware_context.h
    │   │   │   ├── http_response.h
    │   │   │   ├── common.h
    │   │   │   ├── logging.h
    │   │   │   └── parser.h
    │   ├── .travis.yml
    │   ├── cmake
    │   │   └── FindTcmalloc.cmake
    │   ├── LICENSE
    │   ├── amalgamate
    │   │   └── merge_all.py
    │   ├── CMakeLists.txt
    │   └── README.md
    ├── gflags
    │   ├── empty.cc
    │   ├── gflags-2.0.tar.gz
    │   └── BUILD
    ├── glog
    │   ├── empty.cc
    │   ├── glog-0.3.4.tar.gz
    │   └── BUILD
    ├── setuptools
    │   ├── setuptools.egg-info
    │   │   ├── zip-safe
    │   │   ├── top_level.txt
    │   │   ├── requires.txt
    │   │   ├── dependency_links.txt
    │   │   └── entry_points.txt
    │   ├── README
    │   └── BUILD
    ├── python
    │   ├── pylint
    │   │   ├── main.py
    │   │   └── BUILD
    │   ├── cpplint
    │   │   └── BUILD
    │   └── semver
    │   │   ├── README.md
    │   │   ├── setup.py
    │   │   └── PKG-INFO
    ├── pybind11
    │   ├── BUILD
    │   ├── complex.h
    │   ├── typeid.h
    │   └── functional.h
    └── word2vec
    │   ├── demo-word.sh
    │   ├── demo-classes.sh
    │   ├── BUILD
    │   ├── demo-word-accuracy.sh
    │   ├── demo-analogy.sh
    │   ├── makefile
    │   ├── demo-phrases.sh
    │   ├── demo-phrase-accuracy.sh
    │   └── README.txt
├── docs
    ├── dl.jpeg
    └── qrcode_dzgz.jpg
├── kcws
    ├── models
    │   ├── word_vocab.txt
    │   ├── seg_model.pbtxt
    │   └── pos_vocab.txt
    ├── cc
    │   ├── dump_vocab.py
    │   ├── prepare_test_file.py
    │   ├── viterbi_decode.h
    │   ├── sentence_breaker.h
    │   ├── test_breaker.cc
    │   ├── test_ac_scanner.cc
    │   ├── pos_tagger.h
    │   ├── tf_seg_model.h
    │   ├── demo.html
    │   ├── viterbi_decode.cc
    │   ├── sentence_breaker.cc
    │   ├── gen_seg_eval.cc
    │   ├── BUILD
    │   ├── seg_backend_api.cc
    │   └── test_seg.cc
    └── train
    │   ├── BUILD
    │   ├── replace_unk.py
    │   ├── sentence.py
    │   ├── filter_sentence.py
    │   ├── merge_vec.py
    │   ├── sampling_for_train.py
    │   ├── generate_char_embedding.py
    │   ├── generate_train_free.py
    │   ├── process_icwb.py
    │   ├── process_people.py
    │   ├── bilstm.py
    │   ├── prepare_pos.py
    │   ├── process_anno_file.py
    │   ├── stats_pos.py
    │   ├── idcnn.py
    │   └── generate_training.py
├── .gitignore
├── base
    ├── base.cc
    ├── BUILD
    └── base.h
├── tfmodel
    ├── BUILD
    ├── tfmodel.h
    └── tfmodel.cc
├── utils
    ├── vocab.h
    ├── basic_vocab.h
    ├── py_word2vec_vob.cc
    ├── BUILD
    ├── word2vec_vob.h
    ├── basic_vocab.cc
    └── json_util.h
├── util
    └── python
    │   ├── BUILD
    │   └── python_config.sh
├── BUILD.tf_dist
├── configure
├── BUILD.boost
├── WORKSPACE
├── pos_train.md
└── README.md


/tools/dummy:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/third_party/boost/BUILD:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/third_party/crow/.gitmodules:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/third_party/gflags/empty.cc:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/third_party/glog/empty.cc:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/third_party/setuptools/setuptools.egg-info/zip-safe:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/docs/dl.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koth/kcws/HEAD/docs/dl.jpeg


--------------------------------------------------------------------------------
/docs/qrcode_dzgz.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koth/kcws/HEAD/docs/qrcode_dzgz.jpg


--------------------------------------------------------------------------------
/kcws/models/word_vocab.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koth/kcws/HEAD/kcws/models/word_vocab.txt


--------------------------------------------------------------------------------
/kcws/models/seg_model.pbtxt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koth/kcws/HEAD/kcws/models/seg_model.pbtxt


--------------------------------------------------------------------------------
/third_party/glog/glog-0.3.4.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koth/kcws/HEAD/third_party/glog/glog-0.3.4.tar.gz


--------------------------------------------------------------------------------
/third_party/crow/tests/template/README.template_test:
--------------------------------------------------------------------------------
1 | spec json/yml files from https://github.com/mustache/spec
2 | 


--------------------------------------------------------------------------------
/third_party/gflags/gflags-2.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koth/kcws/HEAD/third_party/gflags/gflags-2.0.tar.gz


--------------------------------------------------------------------------------
/third_party/python/pylint/main.py:
--------------------------------------------------------------------------------
1 | import pylint
2 | 
3 | if __name__ == '__main__':
4 |     pylint.run_pylint()
5 | 


--------------------------------------------------------------------------------
/third_party/setuptools/setuptools.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | easy_install
3 | _markerlib
4 | pkg_resources
5 | 


--------------------------------------------------------------------------------
/third_party/setuptools/setuptools.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | [ssl:sys_platform=='win32']
4 | wincertstore==0.2
5 | 
6 | [certs]
7 | certifi==1.0.1


--------------------------------------------------------------------------------
/third_party/crow/tests/template/Makefile:
--------------------------------------------------------------------------------
1 | all:  # build the mustache spec-test driver from mustachetest.cpp (the source file in this directory)
2 | 	$(CXX) -Wall -std=c++11 -g -o mustachetest mustachetest.cpp
3 | .PHONY: all clean
4 | clean:
5 | 	rm -f mustachetest *.o
6 | 


--------------------------------------------------------------------------------
/third_party/setuptools/README:
--------------------------------------------------------------------------------
1 | Project URL: https://pypi.python.org/packages/source/s/setuptools/setuptools-3.6.tar.gz#md5=8f3a1dcdc14313c8334eb6af4f66ea0a
2 | Version: 3.6
3 | License: PSF or ZPL
4 | Local modifications: none
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | .DS_*
 3 | datas
 4 | logs
 5 | bazel-*
 6 | ner/address_logs
 7 | tools/bazel.rc
 8 | tools/python_bin_path.sh
 9 | util/python/python_include
10 | util/python/python_lib
11 | u_company.txt
12 | resume_extractor/config.json
13 | 


--------------------------------------------------------------------------------
/third_party/python/cpplint/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])
 2 | 
 3 | package(default_visibility = ["//visibility:public"])
 4 | 
 5 | py_binary(
 6 |     name = "cpplint",
 7 |     srcs = ["cpplint.py"],
 8 |     main = "cpplint.py",
 9 |     stamp = 1,
10 | )
11 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/helloworld.cpp:
--------------------------------------------------------------------------------
 1 | #include "crow.h"
 2 | 
 3 | int main()
 4 | {
 5 |     crow::SimpleApp app;
 6 | 
 7 |     CROW_ROUTE(app, "/")
 8 |     ([]() {
 9 |         return "Hello world!";
10 |     });
11 | 
12 |     app.port(18080).run();
13 | }
14 | 


--------------------------------------------------------------------------------
/third_party/setuptools/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])
 2 | 
 3 | package(default_visibility = ["//visibility:public"])
 4 | 
 5 | py_library(
 6 |     name = "pkg_resources",
 7 |     srcs = [
 8 |         "pkg_resources.py",
 9 |     ],
10 | )
11 | 
12 | exports_files(["pkg_resources.py"])
13 | 


--------------------------------------------------------------------------------
/third_party/setuptools/setuptools.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | https://pypi.python.org/packages/source/c/certifi/certifi-1.0.1.tar.gz#md5=45f5cb94b8af9e1df0f9450a8f61b790
2 | https://pypi.python.org/packages/source/w/wincertstore/wincertstore-0.2.zip#md5=ae728f2f007185648d0c7a8679b361e2
3 | 


--------------------------------------------------------------------------------
/third_party/pybind11/BUILD:
--------------------------------------------------------------------------------
 1 | package(default_visibility = ["//visibility:public"])
 2 | 
 3 | licenses(["notice"])
 4 | 
 5 | cc_library(
 6 |     name = "pybind11",
 7 |     srcs = glob(["*.h"]),
 8 |     linkstatic = 1,
 9 |     deps = [
10 |         "//util/python:python_headers",
11 |     ],
12 | )
13 | 


--------------------------------------------------------------------------------
/third_party/word2vec/demo-word.sh:
--------------------------------------------------------------------------------
1 | make
2 | if [ ! -e text8 ]; then
3 |   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
4 |   gzip -d text8.gz -f
5 | fi
6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
7 | ./distance vectors.bin
8 | 


--------------------------------------------------------------------------------
/base/base.cc:
--------------------------------------------------------------------------------
 1 | // Copyright Koth 2016
 2 | 
 3 | #include "base/base.h"
 4 | 
 5 | namespace base {
 6 | 
 7 | void Init(int argc, char** argv) {  // Start-up helper: parses command-line flags, then initializes logging.
 8 |   // google::InstallFailureSignalHandler();  // (call kept commented out, so it is not executed)
 9 |   google::ParseCommandLineFlags(&argc, &argv, true);  // gets &argc/&argv, so both may be rewritten; per gflags docs `true` removes parsed flags
10 |   google::InitGoogleLogging(argv[0]);  // runs after flag parsing; argv[0] is handed over as the program name
11 | }
12 | 
13 | }  // namespace base
14 | 


--------------------------------------------------------------------------------
/tfmodel/BUILD:
--------------------------------------------------------------------------------
 1 | package(default_visibility = ["//visibility:public"])
 2 | 
 3 | cc_library(
 4 |   name="tfmodel",
 5 |   srcs=[
 6 |     "tfmodel.cc"
 7 |   ],
 8 |   hdrs=[
 9 |     "tfmodel.h"
10 |   ],
11 |   linkstatic=1,
12 |   deps=[
13 |    '//base:base',
14 |    '//utils:basic_string_util',
15 |    '@tf//:tensorflow',
16 |   ]
17 | )
18 | 


--------------------------------------------------------------------------------
/base/BUILD:
--------------------------------------------------------------------------------
 1 | package(default_visibility = ["//visibility:public"])
 2 | 
 3 | cc_library(
 4 |     name = "base",
 5 |     srcs = ["base.cc"],
 6 |     hdrs = ["base.h"],
 7 |     # linkstatic = 1,
 8 |     deps = [
 9 |         "//third_party/gflags:gflags-cxx",
10 |         "//third_party/glog:glog-cxx",
11 |     ],
12 |     # alwayslink = 1,
13 | )
14 | 


--------------------------------------------------------------------------------
/third_party/python/pylint/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # apache 2.0
 2 | 
 3 | package(default_visibility = ["//visibility:public"])
 4 | 
 5 | load("/tools/rules/pex_rules", "pex_library", "pex_binary")
 6 | 
 7 | pex_binary(
 8 |     name = "pylint",
 9 |     srcs = ["main.py"],
10 |     main = "main.py",
11 |     reqs = ["pylint==1.5.5"],
12 | )
13 | 


--------------------------------------------------------------------------------
/utils/vocab.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef UTILS_VOCAB_H_
 3 | #define UTILS_VOCAB_H_
 4 | #include <string>
 5 | namespace utils {
 6 | class Vocab {  // Abstract vocabulary interface: load from a path, look up word indices, report size.
 7 |  public:
 8 |   virtual ~Vocab() = default;  // virtual so implementations can be destroyed through a Vocab pointer
 9 |   virtual bool Load(const std::string& path) = 0;  // presumably returns true on success — confirm in implementations
10 |   virtual int GetWordIndex(const std::string& word) = 0;  // result for an unknown word is implementation-defined here
11 |   virtual int GetTotalWord() = 0;  // number of words, as reported by the implementation
12 | };
13 | }  // namespace utils
14 | #endif  // UTILS_VOCAB_H_
15 | 


--------------------------------------------------------------------------------
/third_party/word2vec/demo-classes.sh:
--------------------------------------------------------------------------------
1 | make
2 | if [ ! -e text8 ]; then
3 |   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
4 |   gzip -d text8.gz -f
5 | fi
6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
7 | sort classes.txt -k 2 -n > classes.sorted.txt
8 | echo The word classes were saved to file classes.sorted.txt
9 | 


--------------------------------------------------------------------------------
/third_party/crow/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files
 2 | *.slo
 3 | *.lo
 4 | *.o
 5 | *.obj
 6 | 
 7 | # Compiled Dynamic libraries
 8 | *.so
 9 | *.dylib
10 | *.dll
11 | 
12 | # Compiled Static libraries
13 | *.lai
14 | *.la
15 | *.a
16 | *.lib
17 | 
18 | # Executables
19 | *.exe
20 | *.out
21 | *.app
22 | 
23 | example
24 | unittest
25 | 
26 | *.swp
27 | *.gcov
28 | 
29 | *.gcda
30 | *.gcno
31 | 
32 | 
33 | .directory
34 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/example.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask
 2 | app = Flask(__name__)
 3 | 
 4 | @app.route("/")
 5 | def hello():
 6 |     return "Hello World!"
 7 | 
 8 | @app.route("/about/<path:path>/hello")
 9 | def hello1(path):
10 |     return "about1"
11 | 
12 | @app.route("/about")
13 | def hello2():
14 |     return "about2"
15 | 
16 | print app.url_map
17 | 
18 | if __name__ == "__main__":
19 |     app.run(host="0.0.0.0", port=8888)
20 | 


--------------------------------------------------------------------------------
/third_party/word2vec/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])
 2 | 
 3 | package(default_visibility = ["//visibility:public"])
 4 | 
 5 | cc_binary(
 6 |     name = "word2vec",
 7 |     srcs = [
 8 |         "word2vec.c",
 9 |     ],
10 |     linkopts = [
11 |         "-pthread",
12 |     ],
13 | )
14 | 
15 | cc_binary(
16 |     name = "distance",
17 |     srcs = [
18 |         "distance.cc",
19 |     ],
20 |     deps = [
21 |         "//utils:basic_string_util",
22 |     ],
23 | )
24 | 


--------------------------------------------------------------------------------
/third_party/word2vec/demo-word-accuracy.sh:
--------------------------------------------------------------------------------
1 | make
2 | if [ ! -e text8 ]; then
3 |   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
4 |   gzip -d text8.gz -f
5 | fi
6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt
8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
9 | 


--------------------------------------------------------------------------------
/base/base.h:
--------------------------------------------------------------------------------
 1 | // Copyright Koth 2016
 2 | 
 3 | #ifndef BASE_BASE_H_
 4 | #define BASE_BASE_H_
 5 | 
 6 | #include <stdint.h>
 7 | #include <algorithm>
 8 | #include <map>
 9 | #include <memory>
10 | #include <set>
11 | #include <string>
12 | #include <vector>
13 | 
14 | #include "third_party/gflags/include/gflags/gflags.h"
15 | #include "third_party/glog/include/glog/logging.h"
16 | 
17 | 
18 | 
19 | 
20 | namespace base {
21 | 
22 | void Init(int argc, char** argv);
23 | 
24 | }  // namespace base
25 | 
26 | #endif  // BASE_BASE_H_
27 | 


--------------------------------------------------------------------------------
/kcws/cc/dump_vocab.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Koth
 3 | # @Date:   2016-11-20 15:04:18
 4 | # @Last Modified by:   Koth
 5 | # @Last Modified time: 2016-11-20 15:07:51
 6 | import sys
 7 | import os
 8 | import w2v
 9 | 
10 | 
11 | def main(argc, argv):  # argc/argv mirror len(sys.argv)/sys.argv as passed by the __main__ guard
12 |   if argc < 3:  # both <word2vec_vocab_path> and <output_path> are required
13 |     print("Usage:%s <word2vec_vocab_path> <output_path>" % (argv[0]))
14 |     sys.exit(1)  # non-zero exit status on bad usage
15 |   vob = w2v.Word2vecVocab()  # vocabulary object from the project's w2v module
16 |   vob.Load(argv[1])  # NOTE(review): any return value of Load is ignored — confirm whether it reports failure
17 |   vob.DumpBasicVocab(argv[2])  # presumably writes the vocab in basic-vocab format to argv[2] — confirm in w2v
18 | 
19 | 
20 | if __name__ == '__main__':
21 |   main(len(sys.argv), sys.argv)


--------------------------------------------------------------------------------
/third_party/crow/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])
 2 | 
 3 | package(default_visibility = ["//visibility:public"])
 4 | 
 5 | cc_library(
 6 |     name = "crow",
 7 |     hdrs = glob([
 8 |         "include/crow.h",
 9 |         "include/crow/*.h",
10 |         "include/crow/*.hpp",
11 |     ]),
12 |     linkopts = [
13 |         "-pthread",
14 |     ],
15 |     visibility = ["//visibility:public"],
16 |     deps = [
17 |         "@boost//:system",
18 |     ],
19 | )
20 | 
21 | cc_binary(
22 |     name = "crow_ex",
23 |     srcs = [
24 |         "examples/example.cpp",
25 |     ],
26 |     deps = [
27 |         ":crow",
28 |     ],
29 | )
30 | 


--------------------------------------------------------------------------------
/util/python/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["restricted"])
 2 | 
 3 | package(default_visibility = ["//visibility:public"])
 4 | 
 5 | cc_library(
 6 |     name = "python_headers",
 7 |     hdrs = glob([
 8 |         "python_include/**/*.h",
 9 |     ]),
10 |     data = [":python_checked"],
11 |     includes = ["python_include"],
12 | )
13 | 
14 | genrule(
15 |     name = "python_check",
16 |     srcs = [
17 |         "python_config.sh",
18 |     ],
19 |     outs = [
20 |         "python_checked",
21 |     ],
22 |     cmd = "OUTPUTDIR=\"$(@D)/\"; $(location :python_config.sh) --check && touch $$OUTPUTDIR/python_checked",
23 |     local = 1,
24 | )
25 | 


--------------------------------------------------------------------------------
/utils/basic_vocab.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef UTILS_BASIC_VOCAB_H_
 3 | #define UTILS_BASIC_VOCAB_H_
 4 | #include <vector>
 5 | #include <string>
 6 | #include <unordered_map>
 7 | 
 8 | #include "vocab.h"
 9 | 
10 | namespace utils {
11 | class BasicVocab: public Vocab {  // Vocab implementation declared here; member functions are defined out of line.
12 |  public:
13 |   BasicVocab(): use_map_(false) {}  // default: use_map_ off (initializer list, matching the constructor below)
14 |   BasicVocab(bool useMap): use_map_(useMap) {}  // NOTE(review): not explicit, so a bool converts implicitly
15 |   bool Load(const std::string& path) override;
16 |   int GetWordIndex(const std::string& word) override;
17 |   int GetTotalWord() override;
18 |  private:
19 |   std::unordered_map<std::string, int> w_map_;  // string -> int table; presumably word -> index — confirm in the .cc
20 |   bool use_map_;  // set only by the constructors in this header; how it is consumed is not visible here
21 | };
22 | }  // namespace utils
23 | #endif  // UTILS_BASIC_VOCAB_H_
24 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/ssl/example_ssl.cpp:
--------------------------------------------------------------------------------
 1 | #define CROW_ENABLE_SSL
 2 | #include "crow.h"
 3 | 
 4 | int main()
 5 | {
 6 |     crow::SimpleApp app;
 7 | 
 8 |     CROW_ROUTE(app, "/")
 9 |     ([]() {
10 |         return "Hello world!";
11 |     });
12 | 
13 |     app.port(18080).ssl_file("test.crt", "test.key").run();
14 | 
15 |     // Use .pem file
16 |     //app.port(18080).ssl_file("test.pem").run();
17 |     
18 |     // Use custom context; see boost::asio::ssl::context
19 |     /*
20 |      * crow::ssl_context_t ctx;
21 |      * ctx.set_verify_mode(...)
22 |      *
23 |      *   ... configuring ctx
24 |      *
25 |      *   app.port(18080).ssl(ctx).run();
26 |      */
27 | }
28 | 


--------------------------------------------------------------------------------
/third_party/word2vec/demo-analogy.sh:
--------------------------------------------------------------------------------
 1 | make
 2 | if [ ! -e text8 ]; then
 3 |   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
 4 |   gzip -d text8.gz -f
 5 | fi
 6 | echo ---------------------------------------------------------------------------------------------------
 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set
 8 | echo Example input: paris france berlin
 9 | echo ---------------------------------------------------------------------------------------------------
10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
11 | ./word-analogy vectors.bin
12 | 


--------------------------------------------------------------------------------
/kcws/models/pos_vocab.txt:
--------------------------------------------------------------------------------
 1 | j	1
 2 | vd	2
 3 | ad	3
 4 | vf	4
 5 | vg	5
 6 | cc	6
 7 | vi	7
 8 | rr	8
 9 | al	9
10 | vn	10
11 | an	11
12 | gg	12
13 | vs	13
14 | gc	14
15 | nf	15
16 | vx	16
17 | vy	17
18 | gm	18
19 | u	19
20 | gi	20
21 | nh	21
22 | ni	22
23 | ag	23
24 | nn	24
25 | ul	25
26 | na	26
27 | nb	27
28 | z	28
29 | ry	29
30 | rz	30
31 | ng	31
32 | pb	32
33 | nz	33
34 | dg	34
35 | tg	35
36 | nr	36
37 | ns	37
38 | nt	38
39 | dl	39
40 | bl	40
41 | vl	41
42 | gp	42
43 | o	43
44 | x	44
45 | e	45
46 | qt	46
47 | a	47
48 | nm	48
49 | c	49
50 | b	50
51 | uy	51
52 | d	52
53 | f	53
54 | i	54
55 | k	55
56 | uz	56
57 | m	57
58 | l	58
59 | us	59
60 | n	60
61 | q	61
62 | p	62
63 | s	63
64 | r	64
65 | mq	65
66 | t	66
67 | w	67
68 | v	68
69 | y	69
70 | ud	70
71 | ug	71
72 | qv	72
73 | gb	73
74 | 
75 | 


--------------------------------------------------------------------------------
/third_party/crow/include/crow.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "crow/query_string.h"
 3 | #include "crow/http_parser_merged.h"
 4 | #include "crow/ci_map.h"
 5 | #include "crow/TinySHA1.hpp"
 6 | #include "crow/settings.h"
 7 | #include "crow/socket_adaptors.h"
 8 | #include "crow/json.h"
 9 | #include "crow/mustache.h"
10 | #include "crow/logging.h"
11 | #include "crow/dumb_timer_queue.h"
12 | #include "crow/utility.h"
13 | #include "crow/common.h"
14 | #include "crow/http_request.h"
15 | #include "crow/websocket.h"
16 | #include "crow/parser.h"
17 | #include "crow/http_response.h"
18 | #include "crow/middleware.h"
19 | #include "crow/routing.h"
20 | #include "crow/middleware_context.h"
21 | #include "crow/http_connection.h"
22 | #include "crow/http_server.h"
23 | #include "crow/app.h"
24 | 


--------------------------------------------------------------------------------
/third_party/word2vec/makefile:
--------------------------------------------------------------------------------
 1 | CC = gcc
 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
 4 | 
 5 | all: word2vec word2phrase distance word-analogy compute-accuracy
 6 | 
 7 | word2vec : word2vec.c
 8 | 	$(CC) word2vec.c -o word2vec $(CFLAGS)
 9 | word2phrase : word2phrase.c
10 | 	$(CC) word2phrase.c -o word2phrase $(CFLAGS)
11 | distance : distance.c
12 | 	$(CC) distance.c -o distance $(CFLAGS)
13 | word-analogy : word-analogy.c
14 | 	$(CC) word-analogy.c -o word-analogy $(CFLAGS)
15 | compute-accuracy : compute-accuracy.c
16 | 	$(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS)
17 | 	chmod +x *.sh
18 | 
19 | clean:
20 | 	rm -rf word2vec word2phrase distance word-analogy compute-accuracy


--------------------------------------------------------------------------------
/third_party/crow/include/crow/settings.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | // settings for crow
 3 | // TODO - replace with runtime config. libucl?
 4 | 
 5 | /* #ifdef - enables debug mode */
 6 | #define CROW_ENABLE_DEBUG
 7 | 
 8 | /* #ifdef - enables logging */
 9 | #define CROW_ENABLE_LOGGING
10 | 
11 | /* #ifdef - enables ssl */
12 | //#define CROW_ENABLE_SSL
13 | 
14 | /* #define - specifies log level */
15 | /*
16 |     Debug       = 0
17 |     Info        = 1
18 |     Warning     = 2
19 |     Error       = 3
20 |     Critical    = 4
21 | 
22 |     default to INFO
23 | */
24 | #define CROW_LOG_LEVEL 1
25 | 
26 | 
27 | // compiler flags
28 | #if __cplusplus >= 201402L
29 | #define CROW_CAN_USE_CPP14
30 | #endif
31 | 
32 | #if defined(_MSC_VER)
33 | #if _MSC_VER < 1900
34 | #define CROW_MSVC_WORKAROUND
35 | #define constexpr const
36 | #define noexcept throw()
37 | #endif
38 | #endif
39 | 


--------------------------------------------------------------------------------
/BUILD.tf_dist:
--------------------------------------------------------------------------------
 1 | # Bazel build file for binary tf
 2 | licenses(["notice"])
 3 | 
 4 | 
 5 | config_setting(
 6 |     name = "darwin",
 7 |     values = {"cpu": "darwin"},
 8 |     visibility = ["//visibility:public"],
 9 | )
10 | 
11 | filegroup(
12 |    name="tf_unix_lib",
13 |    srcs=glob(
14 |    ["lib/unix/*.o"]
15 |    )
16 | )
17 | filegroup(
18 |    name="tf_mac_lib",
19 |    srcs=glob(
20 |    ["lib/mac/*.o"],
21 |    exclude = ["lib/mac/__.SYMDEF_*.o"]
22 |    )
23 | )
24 | cc_library(
25 |   name="tensorflow",
26 |   hdrs = glob(["tensorflow/*","google/*"]),
27 |   includes = [
28 |     ".",
29 |   ],
30 |   alwayslink=1,
31 |   visibility = ["//visibility:public"],
32 |   deps=[
33 |   '@protobuf//:protobuf'
34 |   ],
35 |   srcs=select({
36 |         ":darwin": [
37 |             ":tf_mac_lib",
38 |         ],
39 |         "//conditions:default": [
40 |            ":tf_unix_lib",
41 |         ],
42 |     }),
43 | )


--------------------------------------------------------------------------------
/third_party/crow/include/crow/ci_map.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <boost/algorithm/string/predicate.hpp>
 4 | #include <boost/functional/hash.hpp>
 5 | #include <unordered_map>
 6 | 
 7 | namespace crow
 8 | {
 9 |     struct ci_hash  // Hash functor for ci_map: every character is upper-cased before being hashed.
10 |     {
11 |         size_t operator()(const std::string& key) const
12 |         {
13 |             std::size_t seed = 0;
14 |             std::locale locale;  // default-constructed locale, used by std::toupper below
15 | 
16 |             for(auto c : key)
17 |             {
18 |                 boost::hash_combine(seed, std::toupper(c, locale));  // fold the upper-cased character into the running hash
19 |             }
20 | 
21 |             return seed;
22 |         }
23 |     };
24 | 
25 |     struct ci_key_eq  // Equality functor for ci_map: delegates to boost::iequals.
26 |     {
27 |         bool operator()(const std::string& l, const std::string& r) const
28 |         {
29 |             return boost::iequals(l, r);
30 |         }
31 |     };
32 | 
33 |     using ci_map = std::unordered_multimap<std::string, std::string, ci_hash, ci_key_eq>;  // string multimap keyed through the two functors above
34 | }
35 | 


--------------------------------------------------------------------------------
/third_party/word2vec/demo-phrases.sh:
--------------------------------------------------------------------------------
 1 | make
 2 | if [ ! -e news.2012.en.shuffled ]; then
 3 |   wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
 4 |   gzip -d news.2012.en.shuffled.gz -f
 5 | fi
 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
11 | ./distance vectors-phrase.bin
12 | 


--------------------------------------------------------------------------------
/third_party/crow/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: cpp
 2 | 
 3 | sudo: false
 4 | 
 5 | notifications:
 6 |   irc: "chat.freenode.net##crow"
 7 | 
 8 | compiler:
 9 |   - gcc
10 | 
11 | env:
12 |   matrix:
13 |     - COMPILER=g++-4.8 CCOMPILER=gcc-4.8 PUSH_COVERAGE=ON
14 | 
15 | addons:
16 |   apt:
17 |     sources:
18 |       - ubuntu-toolchain-r-test
19 |       - boost-latest
20 |     packages:
21 |       - g++-4.8
22 |       - libboost1.55-all-dev
23 |       - python-pip
24 | 
25 | install:
26 |   - if [ "$PUSH_COVERAGE" == "ON" ]; then pip install --user git+git://github.com/eddyxu/cpp-coveralls.git; fi
27 | 
28 | before_script:
29 |   - export CXX=$COMPILER CC=$CCOMPILER
30 |   - mkdir build
31 |   - cd build
32 |   - cmake --version
33 |   - cmake ..
34 | 
35 | script: make && ctest
36 | 
37 | after_success:  # uploads coverage when PUSH_COVERAGE is ON (same check as the install step)
38 |   - cd ..
39 |   - if [ "$PUSH_COVERAGE" == "ON" ]; then coveralls --gcov gcov-4.8 -i include --gcov-options '\-lp'; fi
40 | 


--------------------------------------------------------------------------------
/third_party/crow/tests/template/mustachetest.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <string>
 4 | #include <iterator>
 5 | #include "crow/mustache.h"
 6 | #include "crow/json.h"
 7 | using namespace std;
 8 | using namespace crow;
 9 | using namespace crow::mustache;
10 | 
11 | string read_all(const string& filename)  // whole file as one string; empty if it cannot be read
12 | {
13 |     ifstream input(filename);
14 |     return string(istreambuf_iterator<char>(input), istreambuf_iterator<char>());
15 | }
16 | 
17 | int main()
18 | {
19 |     // Inputs are the ./data, ./template and ./partials files that test.py writes.
20 |     auto doc = json::load(read_all("data"));
21 |     auto tpl = compile(read_all("template"));
22 |     auto partial_map = json::load(read_all("partials"));
23 |     // Partials are looked up by name in the parsed partials object.
24 |     set_loader([&](std::string name)->std::string
25 |     {
26 |         if (!partial_map.count(name))
27 |             return "";  // unknown partial name -> empty text
28 |         return partial_map[name].s();
29 |     });
30 |     context render_ctx(doc);
31 |     cout << tpl.render(render_ctx);
32 |     return 0;
33 | }
34 | 


--------------------------------------------------------------------------------
/third_party/word2vec/demo-phrase-accuracy.sh:
--------------------------------------------------------------------------------
 1 | make  # build the tools invoked below
 2 | if [ ! -e news.2012.en.shuffled ]; then  # download the corpus only when it is not already present
 3 |   wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
 4 |   gzip -d news.2012.en.shuffled.gz -f
 5 | fi
 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0  # normalize apostrophes, then blank every character except letters, ' _ space and newline
 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2  # first phrase pass (threshold 200)
 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2  # second pass over the first pass's output (threshold 100)
 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1  # lowercase the phrase corpus
10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15  # train vectors on the lowercased phrase corpus
11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt  # score the vectors against questions-phrases.txt
12 | 


--------------------------------------------------------------------------------
/kcws/cc/prepare_test_file.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Koth
 3 | # @Date:   2016-11-22 21:20:59
 4 | # @Last Modified by:   Koth
 5 | # @Last Modified time: 2016-11-22 21:39:22
 6 | 
 7 | import sys
 8 | import os
 9 | 
10 | 
11 | def main(argc, argv):
12 |     if argc < 3:
13 |         print("Usage:%s <input> <output>" % (argv[0]))
14 |         sys.exit(1)
15 |     inp = open(argv[1], "r")
16 |     oup = open(argv[2], "w")
17 |     totalLine = 0
18 |     while True:
19 |         line = inp.readline()
20 |         if not line:
21 |             break
22 |         line = line.strip()
23 |         if not line or len(line) == 0:
24 |             continue
25 |         ustr = unicode(line.decode("utf8"))
26 |         if len(ustr) >= 80 or len(ustr) < 10:
27 |             continue
28 |         oup.write("%s\n" % (line))
29 |         totalLine += 1
30 |     print("totalLine:%d" % (totalLine))
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     main(len(sys.argv), sys.argv)
35 | 


--------------------------------------------------------------------------------
/third_party/crow/tests/template/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | project (template_test)
 3 |  
 4 | 
 5 | set(PROJECT_INCLUDE_DIR 
 6 | ${PROJECT_SOURCE_DIR}/include
 7 | )
 8 | 
 9 | set(TEST_SRCS
10 | mustachetest.cpp
11 | )
12 | 
13 | add_executable(mustachetest ${TEST_SRCS})
14 | #target_link_libraries(unittest crow)
15 | #target_link_libraries(unittest ${Boost_LIBRARIES} )
16 | set_target_properties(mustachetest PROPERTIES COMPILE_FLAGS "-Wall -std=c++1y")
17 | 
18 | #message(${PROJECT_SOURCE_DIR})
19 | #message(${CMAKE_CURRENT_BINARY_DIR})
20 | file(COPY DIRECTORY . DESTINATION ${CMAKE_CURRENT_BINARY_DIR}
21 |     FILES_MATCHING 
22 |     PATTERN "*.json")
23 | 
24 | add_custom_command(OUTPUT test.py
25 |         COMMAND ${CMAKE_COMMAND} -E
26 |         copy ${PROJECT_SOURCE_DIR}/test.py ${CMAKE_CURRENT_BINARY_DIR}/test.py
27 |         DEPENDS ${PROJECT_SOURCE_DIR}/test.py
28 |         )
29 | add_custom_target(template_test_copy ALL DEPENDS test.py)
30 | 


--------------------------------------------------------------------------------
/configure:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | DO_NOT_SUBMIT_WARNING="Unofficial setting. DO NOT SUBMIT!!!"
 4 | 
 5 | ## Set up python-related environment settings
 6 | while true; do  # repeat until PYTHON_BIN_PATH names an existing path
 7 |   fromuser=""
 8 |   if [ -z "$PYTHON_BIN_PATH" ]; then  # not preset in the environment: ask interactively
 9 |     default_python_bin_path=$(which python)
10 |     read -p "Please specify the location of python. [Default is $default_python_bin_path]: " PYTHON_BIN_PATH
11 |     fromuser="1"  # remember the value came from the prompt so a bad answer can be retried
12 |     if [ -z "$PYTHON_BIN_PATH" ]; then  # empty answer: fall back to the detected default
13 |       PYTHON_BIN_PATH=$default_python_bin_path
14 |     fi
15 |   fi
16 |   if [ -e "$PYTHON_BIN_PATH" ]; then
17 |     break
18 |   fi
19 |   echo "Invalid python path. ${PYTHON_BIN_PATH} cannot be found" 1>&2
20 |   if [ -z "$fromuser" ]; then  # the bad path was preset, not typed: fail instead of prompting
21 |     exit 1
22 |   fi
23 |   PYTHON_BIN_PATH=""
24 |   # Retry
25 | done
26 | 
27 | 
28 | # Invoke python_config and set up symlinks to python includes
29 | (./util/python/python_config.sh --setup "$PYTHON_BIN_PATH";) || exit -1
30 | 
31 | 
32 | echo "Configuration finished"
33 | 


--------------------------------------------------------------------------------
/tools/bazel.rc.template:
--------------------------------------------------------------------------------
 1 | build:cuda --crosstool_top=//third_party/gpus/crosstool
 2 | build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
 3 | 
 4 | build --force_python=py$PYTHON_MAJOR_VERSION
 5 | build --python$PYTHON_MAJOR_VERSION_path=$PYTHON_BINARY
 6 | build --define=use_fast_cpp_protos=true
 7 | build --define=allow_oversize_protos=true
 8 | 
 9 | build --define PYTHON_BIN_PATH=$PYTHON_BINARY
10 | test --define PYTHON_BIN_PATH=$PYTHON_BINARY
11 | test --force_python=py$PYTHON_MAJOR_VERSION
12 | test --host_force_python=py$PYTHON_MAJOR_VERSION
13 | run --define PYTHON_BIN_PATH=$PYTHON_BINARY
14 | 
15 | build --spawn_strategy=standalone
16 | test --spawn_strategy=standalone
17 | run --spawn_strategy=standalone
18 | 
19 | build --copt="-D_GLIBCXX_USE_CXX11_ABI=0"
20 | test --copt="-D_GLIBCXX_USE_CXX11_ABI=0"
21 | run --copt="-D_GLIBCXX_USE_CXX11_ABI=0"
22 | 
23 | build --genrule_strategy=standalone
24 | test --genrule_strategy=standalone
25 | run --genrule_strategy=standalone
26 | 


--------------------------------------------------------------------------------
/third_party/crow/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | project (crow_test)
 3 |  
 4 | 
 5 | set(TEST_SRCS
 6 | unittest.cpp
 7 | )
 8 | 
 9 | add_executable(unittest ${TEST_SRCS})
10 | #target_link_libraries(unittest crow)
11 | target_link_libraries(unittest ${Boost_LIBRARIES})
12 | target_link_libraries(unittest ${CMAKE_THREAD_LIBS_INIT})
13 | 
14 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
15 | # using Clang
16 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
17 | # using GCC
18 | set_target_properties(unittest PROPERTIES COMPILE_FLAGS "--coverage -fprofile-arcs -ftest-coverage")
19 | target_link_libraries(unittest gcov)
20 | endif()
21 | 
22 | add_subdirectory(template)
23 | #CXXFLAGS="-g -O0 -Wall -W -Wshadow -Wunused-variable \
24 | #Wunused-parameter -Wunused-function -Wunused -Wno-system-headers \
25 | #-Wno-deprecated -Woverloaded-virtual -Wwrite-strings -fprofile-arcs -ftest-coverage"
26 | #CFLAGS="-g -O0 -Wall -W -fprofile-arcs -ftest-coverage"
27 | #LDFLAGS="-fprofile-arcs -ftest-coverage"
28 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/websocket/templates/ws.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 |     <script src="https://code.jquery.com/jquery-3.1.0.min.js"></script>
 5 | </head>
 6 | <body>
 7 |     <input id="msg" type="text"></input>
 8 |     <button id="send">
 9 |         Send
10 |     </button><BR>
11 |     <textarea id="log" cols=100 rows=50>
12 |     </textarea>
13 |     <script>
14 | var sock = new WebSocket("ws://i.ipkn.me:40080/ws");
15 | sock.onopen = ()=>{
16 |     console.log('open')
17 | }
18 | sock.onerror = (e)=>{
19 |     console.log('error',e)
20 | }
21 | sock.onclose = ()=>{
22 |     console.log('close')
23 | }
24 | sock.onmessage = (e)=>{
25 |     $("#log").val(
26 |             e.data +"\n" + $("#log").val());
27 | }
28 | $("#msg").keypress(function(e){
29 |     if (e.which == 13)
30 |     {
31 |     sock.send($("#msg").val());
32 |     $("#msg").val("");
33 |     }
34 | });
35 | $("#send").click(()=>{
36 |     sock.send($("#msg").val());
37 |     $("#msg").val("");
38 | });
39 |     </script>
40 | </body>
41 | </html>
42 | 


--------------------------------------------------------------------------------
/tfmodel/tfmodel.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  tfmodel.h
 5 |  * Author:  Koth
 6 |  * Create Time: 2017-02-01 13:34:04
 7 |  * Description:
 8 |  *
 9 |  */
10 | #ifndef TF_TFMODEL_H_
11 | #define TF_TFMODEL_H_
12 | #include <memory>
13 | #include <string>
14 | #include <vector>
15 | #include <utility>
16 | #include "tensorflow/core/framework/types.pb.h"
17 | #include "tensorflow/core/public/session.h"
18 | 
19 | namespace tf {
20 | class TfModel {  // holds a tensorflow::Session; all member functions are defined outside this header
21 |  public:
22 |   virtual ~TfModel();
23 |   virtual bool Load(const std::string& path);  // NOTE(review): presumably loads a model from `path` into session_ — confirm in the .cc
24 |   bool Eval(const std::vector<std::pair<std::string, tensorflow::Tensor> >& inputTensors,  // (name, tensor) input pairs
25 |             const std::vector<std::string>& outputNames,
26 |             std::vector<tensorflow::Tensor>& outputTensors);  // out-parameter (non-const reference)
27 | 
28 |  protected:
29 |   std::unique_ptr<tensorflow::Session> session_;  // uniquely owned; accessible to subclasses
30 | };
31 | 
32 | }  // namespace tf
33 | #endif  // TF_TFMODEL_H_
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/kcws/train/BUILD:
--------------------------------------------------------------------------------
 1 | py_binary(
 2 |     name = "generate_training",
 3 |     srcs = ["generate_training.py"],
 4 |     data = ["//utils:w2v.so"],
 5 |     imports = ["../../utils"],
 6 | )
 7 | 
 8 | py_binary(
 9 |     name = "process_icwb",
10 |     srcs = ["process_icwb.py"],
11 |     data = ["//utils:w2v.so"],
12 |     imports = ["../../utils"],
13 | )
14 | 
15 | py_binary(
16 |     name = "process_people",
17 |     srcs = ["process_people.py"],
18 |     data = ["//utils:w2v.so"],
19 |     imports = ["../../utils"],
20 | )
21 | 
22 | py_binary(
23 |     name = "generate_char_embedding",
24 |     srcs = ["generate_char_embedding.py"],
25 |     data = ["//utils:w2v.so"],
26 |     imports = ["../../utils"],
27 | )
28 | 
29 | py_binary(
30 |     name = "generate_pos_train",
31 |     srcs = ["generate_pos_train.py"],
32 |     data=['//utils:w2v.so'],
33 |     imports=['../../utils']
34 | )
35 | 
36 | py_binary(
37 |     name = "generate_train_free",
38 |     srcs = ["generate_train_free.py"],
39 |     data = ["//utils:w2v.so"],
40 |     imports = ["../../utils"],
41 | )


--------------------------------------------------------------------------------
/kcws/train/replace_unk.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Koth
 3 | # @Date:   2016-12-09 19:37:43
 4 | # @Last Modified by:   Koth
 5 | # @Last Modified time: 2016-12-09 19:49:37
 6 | import sys
 7 | 
 8 | 
 9 | def main(argc, argv):
10 |   if argc < 4:
11 |     print("Usage:%s <vob> <input> <output>" % (argv[0]))
12 |     sys.exit(1)
13 |   vp = open(argv[1], "r")
14 |   inp = open(argv[2], "r")
15 |   oup = open(argv[3], "w")
16 |   vobsMap = {}
17 |   for line in vp:
18 |     line = line.strip()
19 |     ss = line.split(" ")
20 |     vobsMap[ss[0]] = 1
21 |   while True:
22 |     line = inp.readline()
23 |     if not line:
24 |       break
25 |     line = line.strip()
26 |     if not line:
27 |       continue
28 |     ss = line.split(" ")
29 |     tokens = []
30 |     for s in ss:
31 |       if s in vobsMap:
32 |         tokens.append(s)
33 |       else:
34 |         tokens.append("<UNK>")
35 |     oup.write("%s\n" % (" ".join(tokens)))
36 |   oup.close()
37 |   inp.close()
38 |   vp.close()
39 | 
40 | 
41 | if __name__ == '__main__':
42 |   main(len(sys.argv), sys.argv)


--------------------------------------------------------------------------------
/kcws/cc/viterbi_decode.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  viterbi_decode.h
 5 |  * Author:  Koth
 6 |  * Create Time: 2017-02-01 13:43:51
 7 |  * Description:
 8 |  *
 9 |  */
10 | #ifndef KCWS_CC_VITERBI_DECODE_H_
11 | #define KCWS_CC_VITERBI_DECODE_H_
12 | #include <vector>
13 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
14 | namespace kcws {
15 | void get_best_path(
16 |   const Eigen::TensorMap<Eigen::Tensor<float, 3, Eigen::RowMajor>, Eigen::Aligned>& predictions,
17 |   int sentenceIdx,
18 |   int nn,
19 |   const std::vector<std::vector<float>>& trans,
20 |   int** bp,
21 |   float** scores,
22 |   std::vector<int>& resultTags,
23 |   int ntags);
24 | 
25 | int viterbi_decode(
26 |   const Eigen::TensorMap<Eigen::Tensor<float, 3, Eigen::RowMajor>, Eigen::Aligned>& predictions,
27 |   int sentenceIdx,
28 |   int nn,
29 |   const std::vector<std::vector<float>>& trans,
30 |   int** bp,
31 |   float** scores,
32 |   int ntags);
33 | 
34 | }  // namespace kcws
35 | #endif  // KCWS_CC_VITERBI_DECODE_H_
36 | 


--------------------------------------------------------------------------------
/third_party/crow/cmake/FindTcmalloc.cmake:
--------------------------------------------------------------------------------
 1 | # - Find Tcmalloc
 2 | # Find the native Tcmalloc library
 3 | #
 4 | #  Tcmalloc_LIBRARIES   - List of libraries when using Tcmalloc.
 5 | #  Tcmalloc_FOUND       - True if Tcmalloc found.
 6 | 
 7 | if (USE_TCMALLOC)
 8 |   set(Tcmalloc_NAMES tcmalloc)
 9 | else ()
10 |   set(Tcmalloc_NAMES tcmalloc_minimal tcmalloc)
11 | endif ()
12 | 
13 | find_library(Tcmalloc_LIBRARY NO_DEFAULT_PATH
14 |   NAMES ${Tcmalloc_NAMES}
15 |   PATHS ${HT_DEPENDENCY_LIB_DIR} /lib /usr/lib /usr/local/lib /opt/local/lib
16 | )
17 | 
18 | if (Tcmalloc_LIBRARY)
19 |   set(Tcmalloc_FOUND TRUE)
20 |   set( Tcmalloc_LIBRARIES ${Tcmalloc_LIBRARY} )
21 | else ()
22 |   set(Tcmalloc_FOUND FALSE)
23 |   set( Tcmalloc_LIBRARIES )
24 | endif ()
25 | 
26 | if (Tcmalloc_FOUND)
27 |   message(STATUS "Found Tcmalloc: ${Tcmalloc_LIBRARY}")
28 | else ()
29 |   message(STATUS "Not Found Tcmalloc: ${Tcmalloc_LIBRARY}")
30 |   if (Tcmalloc_FIND_REQUIRED)
31 |     message(STATUS "Looked for Tcmalloc libraries named ${Tcmalloc_NAMES}.")
32 |     message(FATAL_ERROR "Could NOT find Tcmalloc library")
33 |   endif ()
34 | endif ()
35 | 
36 | mark_as_advanced(
37 |   Tcmalloc_LIBRARY
38 |   )
39 | 


--------------------------------------------------------------------------------
/utils/py_word2vec_vob.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  py_word2vec_vob.cc
 5 |  * Author:  Koth Chen
 6 |  * Create Time: 2016-07-25 18:46:27
 7 |  * Description:
 8 |  *
 9 |  */
10 | #include "third_party/pybind11/pybind11.h"
11 | #include "third_party/pybind11/stl.h"
12 | #include "word2vec_vob.h"
13 | namespace py = pybind11;
14 | 
15 | PYBIND11_PLUGIN(w2v) {  // defines the Python extension module "w2v"
16 |   py::module m("w2v", "python binding for  word2vec vocab");
17 |   py::class_<utils::Word2vecVocab>(m, "Word2vecVocab", "python class Word2vecVocab")  // exposes utils::Word2vecVocab under the same class name
18 |   .def(py::init())  // default constructor
19 |   .def("Load", &utils::Word2vecVocab::Load, "load word2vec from text file")
20 |   .def("SetMapword", &utils::Word2vecVocab::SetMapword, "set whether map to word")
21 |   .def("GetFeature", &utils::Word2vecVocab::GetFeatureOrEmpty, "get word embedding or empty if not exist")  // Python name differs from the bound C++ method
22 |   .def("GetTotalWord", &utils::Word2vecVocab::GetTotalWord, "get total words")
23 |   .def("GetWordIndex", &utils::Word2vecVocab::GetWordIndex, "get word idx")
24 |   .def("DumpBasicVocab", &utils::Word2vecVocab::DumpBasicVocab, "dump the word2vec vocab into basic mode");
25 |   return m.ptr();  // hand the module object back to the plugin entry point
26 | }


--------------------------------------------------------------------------------
/third_party/crow/tests/template/test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from __future__ import print_function
 3 | import glob
 4 | import json
 5 | import os
 6 | import subprocess
 7 | for testfile in glob.glob("*.json"):
 8 |     testdoc = json.load(open(testfile))
 9 |     for test in testdoc["tests"]:
10 |         if "lambda" in test["data"]:
11 |             continue
12 |         open('data', 'w').write(json.dumps(test["data"]))
13 |         open('template', 'w').write(test["template"])
14 |         if "partials" in test:
15 |             open('partials', 'w').write(json.dumps(test["partials"]))
16 |         else:
17 |             open('partials', 'w').write("{}")
18 |         ret = subprocess.check_output("./mustachetest").decode('utf8')
19 |         print(testfile, test["name"])
20 |         if ret != test["expected"]:
21 |             if 'partials' in test:
22 |                 print('partials:', json.dumps(test["partials"]))
23 |             print(json.dumps(test["data"]))
24 |             print(test["template"])
25 |             print('Expected:',repr(test["expected"]))
26 |             print('Actual:',repr(ret))
27 |         assert ret == test["expected"]
28 |         os.unlink('data')
29 |         os.unlink('template')
30 |         os.unlink('partials')
31 | 


--------------------------------------------------------------------------------
/kcws/cc/sentence_breaker.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  sentence_breaker.h
 5 |  * Author:  Koth
 6 |  * Create Time: 2016-11-23 21:54:41
 7 |  * Description:
 8 |  *
 9 |  */
10 | #ifndef KCWS_SENTENCE_BREAKER_H_
11 | #define KCWS_SENTENCE_BREAKER_H_
12 | #include <vector>
13 | #include <string>
14 | #include <unordered_map>
15 | #include <unordered_set>
16 | #include "utils/basic_string_util.h"
17 | namespace kcws {
18 | 
19 | class SentenceBreaker {  // declaration only; all members are defined outside this header
20 |  public:
21 |   explicit SentenceBreaker(int maxLen);  // NOTE(review): maxLen presumably initializes max_len_ — confirm in the .cc
22 |   virtual ~SentenceBreaker();
23 |   bool breakSentences(const UnicodeStr& text,  // input text to split
24 |                       std::vector<UnicodeStr>* lines);  // pointer out-parameter for the resulting sentences
25 | 
26 |  private:
27 |   static char*  kInlineMarks[];  // array sizes are fixed by the out-of-line definitions
28 |   static char* kBreakMarks[];
29 | 
30 |   bool is_inline_mark(UnicodeCharT uch) ;
31 |   bool is_break_mark(UnicodeCharT uch) ;
32 | 
33 |   std::unordered_map<UnicodeCharT, UnicodeCharT> inline_marks_;  // NOTE(review): looks like a char-to-char pairing of inline marks — confirm
34 |   std::unordered_set<UnicodeCharT> break_marks_;
35 |   std::unordered_set<UnicodeCharT> inline_marks_set_;
36 |   int max_len_;
37 | };
38 | }  // namespace kcws
39 | 
40 | #endif  // KCWS_SENTENCE_BREAKER_H_
41 | 


--------------------------------------------------------------------------------
/third_party/python/semver/README.md:
--------------------------------------------------------------------------------
 1 | Semver -- python module for semantic versioning
 2 | ===============================================
 3 | 
 4 | ![Travis CI](https://travis-ci.org/k-bx/python-semver.svg?branch=master)
 5 | 
 6 | Simple module for comparing versions as noted at [semver.org](http://semver.org/).
 7 | 
 8 | This module provides just a couple of functions, the main ones being:
 9 | 
10 | ```python
11 | >>> import semver
12 | >>> semver.compare("1.0.0", "2.0.0")
13 | -1
14 | >>> semver.compare("2.0.0", "1.0.0")
15 | 1
16 | >>> semver.compare("2.0.0", "2.0.0")
17 | 0
18 | >>> semver.match("2.0.0", ">=1.0.0")
19 | True
20 | >>> semver.match("1.0.0", ">1.0.0")
21 | False
22 | >>> semver.format_version(3, 4, 5, 'pre.2', 'build.4')
23 | '3.4.5-pre.2+build.4'
24 | >>> semver.bump_major("3.4.5")
25 | '4.0.0'
26 | >>> semver.bump_minor("3.4.5")
27 | '3.5.0'
28 | >>> semver.bump_patch("3.4.5")
29 | '3.4.6'
30 | >>> semver.max_ver("1.0.0", "2.0.0")
31 | '2.0.0'
32 | >>> semver.min_ver("1.0.0", "2.0.0")
33 | '1.0.0'
34 | ```
35 | 
36 | Installation
37 | ------------
38 | 
39 | For Python 2:
40 | 
41 | ```
42 | pip install semver
43 | ```
44 | 
45 | For Python 3:
46 | 
47 | ```
48 | pip3 install semver
49 | ```
50 | 
51 | Homepage at PyPi: https://pypi.python.org/pypi/semver
52 | 


--------------------------------------------------------------------------------
/kcws/train/sentence.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding:utf-8 -*-
 3 | 
 4 | # File: sentence.py
 5 | # Project: /e/code/kcws
 6 | # Created: Thu Jul 27 2017
 7 | # Author: Koth Chen
 8 | # Copyright (c) 2017 Koth
 9 | #
10 | # <<licensetext>>
11 | 
12 | 
13 | class Sentence:
14 |     def __init__(self):
15 |         self.tokens = []
16 |         self.chars = 0
17 | 
18 |     def addToken(self, t):
19 |         self.chars += len(t)
20 |         self.tokens.append(t)
21 | 
22 |     def clear(self):
23 |         self.tokens = []
24 |         self.chars = 0
25 | 
26 |     # label -1, unknown
27 |     # 0-> 'S'
28 |     # 1-> 'B'
29 |     # 2-> 'M'
30 |     # 3-> 'E'
31 |     def generate_tr_line(self, x, y, vob):
32 |         for t in self.tokens:
33 |             if len(t) == 1:
34 |                 x.append(vob.GetWordIndex(str(t[0].encode("utf8"))))
35 |                 y.append(0)
36 |             else:
37 |                 nn = len(t)
38 |                 for i in range(nn):
39 |                     x.append(vob.GetWordIndex(str(t[i].encode("utf8"))))
40 |                     if i == 0:
41 |                         y.append(1)
42 |                     elif i == (nn - 1):
43 |                         y.append(3)
44 |                     else:
45 |                         y.append(2)
46 | 


--------------------------------------------------------------------------------
/third_party/word2vec/README.txt:
--------------------------------------------------------------------------------
 1 | Tools for computing distributed representation of words
 2 | -------------------------------------------------------
 3 | 
 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts.
 5 | 
 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous
 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following:
 8 |  - desired vector dimensionality
 9 |  - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model
10 |  - training algorithm: hierarchical softmax and / or negative sampling
11 |  - threshold for downsampling the frequent words 
12 |  - number of threads to use
13 |  - the format of the output word vector file (text or binary)
14 | 
15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 
16 | 
17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training
18 | is finished, the user can interactively explore the similarity of the words.
19 | 
20 | More information about the scripts is provided at https://code.google.com/p/word2vec/
21 | 
22 | 


--------------------------------------------------------------------------------
/kcws/cc/test_breaker.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  test_breaker.cc
 5 |  * Author:  Koth
 6 |  * Create Time: 2016-11-24 19:40:33
 7 |  * Description:
 8 |  *
 9 |  */
10 | #include <stdio.h>
11 | #include <string.h>
12 | #include <iostream>
13 | #include <fstream>
14 | #include <string>
15 | #include <sstream>
16 | #include <chrono>
17 | 
18 | #include "base/base.h"
19 | #include "utils/basic_string_util.h"
20 | 
21 | #include "sentence_breaker.h"  //NOLINT
22 | 
23 | DEFINE_string(test_str, "", "the test string");
24 | 
25 | int main(int argc, char *argv[]) {  // breaks --test_str into sentences and logs each one as UTF-8
26 |   FLAGS_v = 0;
27 |   FLAGS_logtostderr = 1;
28 |   base::Init(argc, argv);
29 |   kcws::SentenceBreaker breaker(80);
30 |   CHECK(!FLAGS_test_str.empty()) << "test string should be set";
31 |   UnicodeStr ustr;
32 |   CHECK(BasicStringUtil::u8tou16(FLAGS_test_str.c_str(), FLAGS_test_str.size(), ustr));
33 |   std::vector<UnicodeStr> results;
34 |   CHECK(breaker.breakSentences(ustr, &results)) << "break error";
35 |   VLOG(0) << "results is :";
36 |   for (const auto& u : results) {  // bind by reference: avoids copying every sentence
37 |     std::string todo;
38 |     CHECK(BasicStringUtil::u16tou8(u.c_str(), u.size(), todo));
39 |     VLOG(0) << todo;
40 |   }
41 |   return 0;
42 | }
43 | 


--------------------------------------------------------------------------------
/third_party/boost/boost.bzl:
--------------------------------------------------------------------------------
 1 | include_pattern = "boost/%s/"
 2 | hdrs_patterns = [
 3 |   "boost/%s.h",
 4 |   "boost/%s.hpp",
 5 |   "boost/%s/**/*.hpp",
 6 |   "boost/%s/**/*.ipp",
 7 |   "boost/%s/**/*.h",
 8 |   "libs/%s/src/*.ipp",
 9 | ]
10 | srcs_patterns = [
11 |   "libs/%s/src/*.cpp",
12 |   "libs/%s/src/*.hpp",
13 | ]
14 | 
15 | def srcs_list(library_name):
16 |   return native.glob([p % (library_name,) for p in srcs_patterns])
17 | 
18 | def includes_list(library_name):
19 |   return [".", include_pattern % library_name]
20 | 
21 | def hdr_list(library_name):
22 |   return native.glob([p % (library_name,) for p in hdrs_patterns])
23 | 
24 | def boost_library(name, defines=None, includes=None, hdrs=None, srcs=None, deps=None, copts=None):
25 |   if defines == None:
26 |     defines = []
27 | 
28 |   if includes == None:
29 |     includes = []
30 | 
31 |   if hdrs == None:
32 |     hdrs = []
33 | 
34 |   if srcs == None:
35 |     srcs = []
36 | 
37 |   if deps == None:
38 |     deps = []
39 | 
40 |   if copts == None:
41 |     copts = []
42 | 
43 |   return native.cc_library(
44 |     name = name,
45 |     visibility = ["//visibility:public"],
46 |     defines = defines,
47 |     includes = includes_list(name) + includes,
48 |     hdrs = hdr_list(name) + hdrs,
49 |     srcs = srcs_list(name) + srcs,
50 |     deps = deps,
51 |     copts = copts,
52 |     licenses = ["notice"],
53 |   )
54 | 
55 | 


--------------------------------------------------------------------------------
/BUILD.boost:
--------------------------------------------------------------------------------
 1 | # Description:
 2 | #   The Boost library collection (http://www.boost.org)
 3 | #
 4 | # Most Boost libraries are header-only, in which case you only need to depend
 5 | # on :boost. If you need one of the libraries that has a separately-compiled
 6 | # implementation, depend on the appropriate libs rule.
 7 | 
 8 | package(default_visibility = ["//visibility:public"])
 9 | 
10 | licenses(["notice"])  # Boost software license
11 | 
12 | 
13 | cc_library(
14 |     name = "boost",
15 |     hdrs = glob([
16 |         "boost/**/*.hpp",
17 |         "boost/**/*.h",
18 |         "boost/**/*.ipp",
19 |     ]),
20 |     includes = [
21 |     "."
22 |     ],
23 | )
24 | 
25 | cc_library(
26 |     name = "filesystem",
27 |     srcs = glob([ "libs/filesystem/src/*.cpp"]),
28 |     deps = [
29 |         ":boost",
30 |         ":system",
31 |     ],
32 | )
33 | 
34 | cc_library(
35 |     name = "iostreams",
36 |     srcs = glob(["libs/iostreams/src/*.cpp"]),
37 |     deps = [
38 |         ":boost",
39 |         "@bzip2_archive//:bz2lib",
40 |         "@zlib_archive//:zlib",
41 |     ],
42 | )
43 | 
44 | cc_library(
45 |     name = "program_options",
46 |     srcs = glob([ "libs/program_options/src/*.cpp"]),
47 |     deps = [
48 |         ":boost",
49 |     ],
50 | )
51 | 
52 | cc_library(
53 |     name = "system",
54 |     srcs = glob(["libs/system/src/*.cpp"]),
55 |     deps = [
56 |         ":boost",
57 |     ],
58 | )
59 | 


--------------------------------------------------------------------------------
/third_party/python/semver/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from distutils.core import setup
 4 | 
 5 | with open('README.md') as f:
 6 |     LONG_DESCRIPTION = f.read()
 7 | 
 8 | setup(
 9 |     name='semver',
10 |     version='2.4.1',
11 |     description='Python package to work with Semantic Versioning (http://semver.org/)',
12 |     long_description=LONG_DESCRIPTION,
13 |     author='Konstantine Rybnikov',
14 |     author_email='k-bx@k-bx.com',
15 |     url='https://github.com/k-bx/python-semver',
16 |     download_url='https://github.com/k-bx/python-semver/downloads',
17 |     py_modules=['semver'],
18 |     include_package_data=True,
19 |     license='BSD',
20 |     classifiers=[
21 |         'Environment :: Web Environment',
22 |         'Framework :: Django',
23 |         'Intended Audience :: Developers',
24 |         'License :: OSI Approved :: BSD License',
25 |         'Operating System :: OS Independent',
26 |         'Programming Language :: Python',
27 |         'Programming Language :: Python :: 2',
28 |         'Programming Language :: Python :: 2.6',
29 |         'Programming Language :: Python :: 2.7',
30 |         'Programming Language :: Python :: 3',
31 |         'Programming Language :: Python :: 3.2',
32 |         'Programming Language :: Python :: 3.3',
33 |         'Programming Language :: Python :: 3.4',
34 |         'Topic :: Software Development :: Libraries :: Python Modules',
35 |     ],
36 | )
37 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/example_chat.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | <head>
 3 | <script src="//code.jquery.com/jquery-1.11.0.min.js"></script>
 4 | </head>
 5 | <body>
 6 | <input id="msg" type="text">
 7 | <button id="send">Send</button>
 8 | <div id="logs">
 9 | </div>
10 | <script>
$(document).ready(function(){
	// Escape a chat message so it is always rendered as literal text.
	// Every "&" is escaped (not only "bare" ones): otherwise a message that
	// contains entity text such as "&lt;" would be displayed as "<" instead
	// of what the user actually typed. Quotes are escaped as well so the
	// result stays safe if it is ever placed inside an attribute value.
	function htmlEncode(text)
	{
		return String(text)
			.replace(/&/g, "&amp;")
			.replace(/</g, "&lt;")
			.replace(/>/g, "&gt;")
			.replace(/"/g, "&quot;")
			.replace(/'/g, "&#39;");
	}

	// Send the current input box content to the server, then clear the box.
	$("#send").click(function(){
		var msg = $("#msg").val();
		console.log(msg);
		if (msg.length > 0)
			$.post("/send", msg);
		$("#msg").val("");
	});
	// Pressing Enter (keyCode 13) in the input box behaves like clicking "Send".
	$("#msg").keyup(function(event){
		if(event.keyCode == 13){
			$("#send").click();
		}
	});
	// Handles one /logs response: renders the new messages and immediately
	// issues the next request starting from the returned log position.
	var updateLog;
	updateLog = function(data)
	{
		console.log("recv ");
		console.log(data);
		// data.last may arrive as a string; "*1" coerces it to a number.
		var lastLog = data.last*1;
		console.log("lastLog: " + lastLog);
		var s = "";
		// Index loop instead of for...in: only real array elements are visited.
		var msgs = data.msgs || [];
		for(var i = 0; i < msgs.length; i++)
		{
			// Each message is put in front of the ones already processed.
			s = htmlEncode(msgs[i]) + "<BR>" + s;
		}
		$("#logs").html(s+$("#logs").html());
		// On failure the same request is simply issued again.
		var failFunction;
		failFunction = function(){
			$.getJSON("/logs/"+lastLog, updateLog).fail(failFunction);
		};
		$.getJSON("/logs/"+lastLog, updateLog).fail(failFunction);
	}
	// Initial fetch; updateLog keeps the polling chain going afterwards.
	$.getJSON("/logs", updateLog);
});
52 | </script>
53 | </body>
54 | </html>
55 | 


--------------------------------------------------------------------------------
/WORKSPACE:
--------------------------------------------------------------------------------
 1 | # Uncomment and update the paths in these entries to build the Android demo.
# Since support libraries are not published in Maven Central or jCenter, we'll have a local copy.
 3 | 
 4 | 
# Boost 1.64.0 sources, downloaded as a .tar.bz2 archive and built with the
# local BUILD.boost file. The sha256 pins the exact archive contents.
# NOTE(review): confirm the download URL below is still served; the older
# SourceForge URL is kept commented out for reference.
new_http_archive(
    name = "boost",
    urls = [
            #"https://sourceforge.net/projects/boost/files/boost/1.61.0/boost_1_61_0.tar.bz2/download",
            "https://dl.bintray.com/boostorg/release/1.64.0/source/boost_1_64_0.tar.bz2",
    ],
    build_file = "BUILD.boost",
    type = "tar.bz2",
    strip_prefix = "boost_1_64_0/",
    sha256 = "7bcc5caace97baa948931d712ea5f37038dbb1c5d89b43ad4def4ed7cb683332",
)


# Zip archive exposed as the "tf" repository, built with the local
# BUILD.tf_dist file; the top-level tf_dist/ directory is stripped.
# NOTE(review): judging by the file name this is a TensorFlow 1.2.0-rc1
# distribution — confirm.
new_http_archive(
   name="tf",
   url = "https://gitlab.com/yovnchine/tfrelates/raw/master/tf_dist_1.2.0_rc1_0604.zip",
   strip_prefix = "tf_dist/",
   sha256 = "269115820a2ea4b7260f2ff131ed47860809e3ff05da763704a004724cea9775",
   build_file="BUILD.tf_dist",
)


# Alternative definition of the "tf" repository that points at a local
# checkout instead of the download above (kept disabled).
#new_local_repository(
#   name="tf",
#   path = "/e/code/tf_dist",
#   build_file="BUILD.tf_dist",
#)


# Protobuf pinned to a specific upstream commit; no build_file is given, so
# the archive's own BUILD files are used.
http_archive(
    name = "protobuf",
    urls = [
          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
    ],
    sha256 = "94789497712726816f154f8441ed4319573c78c3f8cc6398bb00f464ffd82bd2",
    strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
)
42 | 


--------------------------------------------------------------------------------
/third_party/pybind11/complex.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     pybind11/complex.h: Complex number support
 3 | 
 4 |     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
 5 | 
 6 |     All rights reserved. Use of this source code is governed by a
 7 |     BSD-style license that can be found in the LICENSE file.
 8 | */
 9 | 
10 | #pragma once
11 | 
12 | #include "pybind11.h"
13 | #include <complex>
14 | 
15 | /// glibc defines I as a macro which breaks things, e.g., boost template names
16 | #ifdef I
17 | #  undef I
18 | #endif
19 | 
20 | NAMESPACE_BEGIN(pybind11)
21 | 
22 | PYBIND11_DECL_FMT(std::complex<float>, "Zf");
23 | PYBIND11_DECL_FMT(std::complex<double>, "Zd");
24 | 
25 | NAMESPACE_BEGIN(detail)
26 | template <typename T> class type_caster<std::complex<T>> {
27 | public:
28 |     bool load(handle src, bool) {
29 |         if (!src)
30 |             return false;
31 |         Py_complex result = PyComplex_AsCComplex(src.ptr());
32 |         if (result.real == -1.0 && PyErr_Occurred()) {
33 |             PyErr_Clear();
34 |             return false;
35 |         }
36 |         value = std::complex<T>((T) result.real, (T) result.imag);
37 |         return true;
38 |     }
39 | 
40 |     static handle cast(const std::complex<T> &src, return_value_policy /* policy */, handle /* parent */) {
41 |         return PyComplex_FromDoubles((double) src.real(), (double) src.imag());
42 |     }
43 | 
44 |     PYBIND11_TYPE_CASTER(std::complex<T>, _("complex"));
45 | };
46 | NAMESPACE_END(detail)
47 | NAMESPACE_END(pybind11)
48 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/websocket/example_ws.cpp:
--------------------------------------------------------------------------------
 1 | #include "crow.h"
 2 | #include <unordered_set>
 3 | #include <mutex>
 4 | 
 5 | 
 6 | int main()
 7 | {
 8 |     crow::SimpleApp app;
 9 | 
10 |     std::mutex mtx;;
11 |     std::unordered_set<crow::websocket::connection*> users;
12 | 
13 |     CROW_ROUTE(app, "/ws")
14 |         .websocket()
15 |         .onopen([&](crow::websocket::connection& conn){
16 |                 CROW_LOG_INFO << "new websocket connection";
17 |                 std::lock_guard<std::mutex> _(mtx);
18 |                 users.insert(&conn);
19 |                 })
20 |         .onclose([&](crow::websocket::connection& conn, const std::string& reason){
21 |                 CROW_LOG_INFO << "websocket connection closed: " << reason;
22 |                 std::lock_guard<std::mutex> _(mtx);
23 |                 users.erase(&conn);
24 |                 })
25 |         .onmessage([&](crow::websocket::connection& /*conn*/, const std::string& data, bool is_binary){
26 |                 std::lock_guard<std::mutex> _(mtx);
27 |                 for(auto u:users)
28 |                     if (is_binary)
29 |                         u->send_binary(data);
30 |                     else
31 |                         u->send_text(data);
32 |                 });
33 | 
34 |     CROW_ROUTE(app, "/")
35 |     ([]{
36 |         auto page = crow::mustache::load("ws.html");
37 |         return page.render();
38 |      });
39 | 
40 |     app.port(40080)
41 |         .multithreaded()
42 |         .run();
43 | }
44 | 


--------------------------------------------------------------------------------
/kcws/train/filter_sentence.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Koth
 3 | # @Date:   2016-11-16 22:46:50
 4 | # @Last Modified by:   Koth
 5 | # @Last Modified time: 2016-11-21 22:40:47
 6 | import sys
 7 | import random
 8 | 
 9 | 
def main(argc, argv):
    """Split a sentence file into train.txt / test.txt, dropping near-empty rows.

    Each non-blank input line must contain exactly 2 * SENTENCE_LEN
    space-separated integer tokens; lines of any other length are reported
    and skipped. A line counts as "bad" (and is written nowhere) when at most
    two of its first SENTENCE_LEN tokens are non-zero. Every other line goes
    to test.txt with probability 0.02 (capped at 8000 lines) and to train.txt
    otherwise.

    Args:
        argc: number of entries in argv.
        argv: command line; argv[1] is the input file path.

    Raises:
        ValueError: if one of the inspected tokens is not an integer.
    """
    if argc < 2:
        print("Usage:%s <input>" % (argv[0]))
        sys.exit(1)
    SENTENCE_LEN = 80
    bad = 0
    test = 0
    # Context managers guarantee that all three files are flushed and closed,
    # even if a malformed token raises halfway through.
    with open(argv[1], "r") as fp, \
            open("train.txt", "w") as tr_p, \
            open("test.txt", "w") as te_p:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            ss = line.split(' ')

            if len(ss) != (2 * SENTENCE_LEN):
                print("len is:%d" % (len(ss)))
                continue
            # Count non-zero tokens in the first half; stop once more than
            # two have been seen, since only "<= 2" vs "> 2" matters.
            numV = 0
            for token in ss[:SENTENCE_LEN]:
                if int(token) != 0:
                    numV += 1
                    if numV > 2:
                        break
            if numV <= 2:
                bad += 1
            elif random.random() <= 0.02 and test < 8000:
                te_p.write("%s\n" % (line))
                test += 1
            else:
                tr_p.write("%s\n" % (line))
    print("got bad:%d" % (bad))
51 | 
52 | 
53 | if __name__ == '__main__':
54 |     main(len(sys.argv), sys.argv)
55 | 


--------------------------------------------------------------------------------
/third_party/crow/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014, ipkn
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | * Neither the name of the author nor the names of its
15 |   contributors may be used to endorse or promote products derived from
16 |   this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/example_test.py:
--------------------------------------------------------------------------------
# Smoke test for the Crow example server; expects it to be listening on
# localhost:18080.
# NOTE(review): Python 2 only as written (urllib.urlopen, xrange and the
# print statement do not exist in Python 3).
import urllib
# Plain GET routes: body text and status codes.
assert "Hello World!" ==  urllib.urlopen('http://localhost:18080').read()
assert "About Crow example." ==  urllib.urlopen('http://localhost:18080/about').read()
assert 404 == urllib.urlopen('http://localhost:18080/list').getcode()
assert "3 bottles of beer!" == urllib.urlopen('http://localhost:18080/hello/3').read()
assert "100 bottles of beer!" == urllib.urlopen('http://localhost:18080/hello/100').read()
assert 400 == urllib.urlopen('http://localhost:18080/hello/500').getcode()
# Passing data= makes urlopen send the JSON document as a POST body.
assert "3" == urllib.urlopen('http://localhost:18080/add_json', data='{"a":1,"b":2}').read()
assert "3" == urllib.urlopen('http://localhost:18080/add/1/2').read()

# test persistent connection
import socket
import time
s = socket.socket()
s.connect(('localhost', 18080))
# Ten requests are sent over the same socket; each must be answered.
for i in xrange(10):
    s.send('''GET / HTTP/1.1
Host: localhost\r\n\r\n''');
    assert 'Hello World!' in s.recv(1024)

# test large
s = socket.socket()
s.connect(('localhost', 18080))
s.send('''GET /large HTTP/1.1
Host: localhost\r\nConnection: close\r\n\r\n''')
r = ''
# "Connection: close" was requested, so read until the peer closes the
# socket (recv returns an empty string).
while True:
     d = s.recv(1024*1024)
     if not d:
         break;
     r += d
     print len(r), len(d)
print len(r), r[:100]
assert len(r) > 512*1024

# test timeout
s = socket.socket()
s.connect(('localhost', 18080))
# invalid request, connection will be closed after timeout
s.send('''GET / HTTP/1.1
hHhHHefhwjkefhklwejfklwejf
''')
# Blocks until the server answers or closes the connection.
print s.recv(1024)
44 | 
45 | 


--------------------------------------------------------------------------------
/kcws/train/merge_vec.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Koth
 3 | # @Date:   2016-12-02 13:02:30
 4 | # @Last Modified by:   Koth
 5 | # @Last Modified time: 2016-12-02 13:35:42
 6 | import sys
 7 | 
 8 | 
 9 | def main(argc, argv):
10 |   if argc < 3:
11 |     print("Usage:%s <w2v> <glove>" % (argv[0]))
12 |     sys.exit(1)
13 |   inwp = open(argv[1], "r")
14 |   ingp = open(argv[2], "r")
15 |   oup = open("merged_vec.txt", "w")
16 |   inwp.readline()
17 |   fmap = {}
18 |   n1 = 0
19 |   n2 = 0
20 |   k1 = -1
21 |   k2 = -1
22 |   while True:
23 |     line = inwp.readline()
24 |     if not line:
25 |       break
26 |     n1 += 1
27 |     line = line.strip()
28 |     ss = line.split(' ')
29 |     nn = len(ss)
30 |     if k1 == -1:
31 |       k1 = nn - 1
32 |     else:
33 |       assert (k1 == (nn - 1))
34 |     if ss[0] == '</s>':
35 |       ss[0] = '<unk>'
36 |     fv = " ".join(ss[1:])
37 |     fmap[ss[0]] = fv
38 |   while True:
39 |     line = ingp.readline()
40 |     if not line:
41 |       break
42 |     n2 += 1
43 |     line = line.strip()
44 |     ss = line.split(' ')
45 |     nn = len(ss)
46 |     if k2 == -1:
47 |       k2 = nn - 1
48 |     else:
49 |       assert (k2 == (nn - 1))
50 |     assert (ss[0] in fmap)
51 |     fv = " ".join(ss[1:])
52 |     fmap[ss[0]] += " " + fv
53 |   assert (n1 == n2)
54 |   oup.write("%d %d\n" % (n1, k1 + k2))
55 |   fv = fmap["<unk>"]
56 |   oup.write("<unk> %s\n" % (fv))
57 |   for k, v in fmap.iteritems():
58 |     if k == '<unk>':
59 |       continue
60 |     oup.write("%s %s\n" % (k, v))
61 |   oup.close()
62 | 
63 | 
64 | if __name__ == '__main__':
65 |   main(len(sys.argv), sys.argv)
66 | 


--------------------------------------------------------------------------------
/kcws/train/sampling_for_train.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Koth
 3 | # @Date:   2016-12-01 09:30:11
 4 | # @Last Modified by:   Koth
 5 | # @Last Modified time: 2016-12-01 10:19:15
 6 | import sys
 7 | import random
 8 | 
 9 | 
def _reservoir_offer(pool, seen, capacity, line, spill):
  """Offer `line`, the `seen`-th line of its class, to the reservoir `pool`.

  While the pool holds fewer than `capacity` lines the line is simply kept.
  Afterwards a random slot in [0, seen - 1] is drawn: if it falls inside the
  pool, the line replaces that entry and the evicted entry is written to
  `spill`; otherwise the line itself is written to `spill`.
  """
  if len(pool) < capacity:
    pool.append(line)
    return
  k = random.randint(0, seen - 1)
  if k < capacity:
    spill.write("%s\n" % (pool[k]))
    pool[k] = line
  else:
    spill.write("%s\n" % (line))


def main(argc, argv):
  """Hold out a random sample of each class as test.txt; the rest is train.txt.

  Every non-blank input line must have exactly 6 space-separated fields, the
  last being an integer label. Lines with label 0 and lines with any other
  label are sampled independently, up to sampleNum lines each. test.txt gets
  the non-zero-label sample first, then the zero-label sample; all remaining
  lines are written to train.txt.

  Args:
    argc: number of entries in argv.
    argv: command line; argv[1] is the input path and the optional argv[2]
      is the per-class sample size (default 5000).

  Raises:
    AssertionError: if a line does not have exactly 6 fields.
    ValueError: if argv[2] or a label field is not an integer.
  """
  if argc < 2:
    print("Usage: %s <input>" % (argv[0]))
    sys.exit(1)
  # Parse the sample size before the output files are opened, so a bad
  # argument cannot truncate existing train.txt / test.txt.
  sampleNum = 5000
  if argc > 2:
    sampleNum = int(argv[2])
  zero_pool = []     # sampled lines whose label is 0
  nonzero_pool = []  # sampled lines whose label is not 0
  n_zero = 0
  n_nonzero = 0
  # Context managers make sure all three files are flushed and closed.
  with open(argv[1], "r") as inp, \
      open("train.txt", "w") as trp, \
      open("test.txt", "w") as tep:
    for line in inp:
      line = line.strip()
      if not line:
        continue
      ss = line.split(" ")
      assert (len(ss) == 6)
      if int(ss[5]) == 0:
        n_zero += 1
        _reservoir_offer(zero_pool, n_zero, sampleNum, line, trp)
      else:
        n_nonzero += 1
        _reservoir_offer(nonzero_pool, n_nonzero, sampleNum, line, trp)
    for s in nonzero_pool:
      tep.write("%s\n" % (s))
    for s in zero_pool:
      tep.write("%s\n" % (s))
59 | 
60 | 
61 | if __name__ == '__main__':
62 |   main(len(sys.argv), sys.argv)


--------------------------------------------------------------------------------
/third_party/pybind11/typeid.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     pybind11/typeid.h: Compiler-independent access to type identifiers
 3 | 
 4 |     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
 5 | 
 6 |     All rights reserved. Use of this source code is governed by a
 7 |     BSD-style license that can be found in the LICENSE file.
 8 | */
 9 | 
10 | #pragma once
11 | 
12 | #include <cstdio>
13 | #include <cstdlib>
14 | 
15 | #if defined(__GNUG__)
16 | #include <cxxabi.h>
17 | #endif
18 | 
19 | NAMESPACE_BEGIN(pybind11)
20 | NAMESPACE_BEGIN(detail)
/// Erase all occurrences of the substring `search` from `string`, in place.
/// An empty `search` is treated as a no-op: find() would otherwise match at
/// position 0 forever while erase() removes nothing, i.e. an endless loop.
inline void erase_all(std::string &string, const std::string &search) {
    if (search.empty())
        return;
    // After an erase the scan resumes at the same position, so occurrences
    // formed by the text closing up around the removed part are erased too.
    for (size_t pos = string.find(search); pos != std::string::npos;
         pos = string.find(search, pos)) {
        string.erase(pos, search.length());
    }
}
29 | 
30 | PYBIND11_NOINLINE inline void clean_type_id(std::string &name) {
31 | #if defined(__GNUG__)
32 |     int status = 0;
33 |     std::unique_ptr<char, void (*)(void *)> res {
34 |         abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), std::free };
35 |     if (status == 0)
36 |         name = res.get();
37 | #else
38 |     detail::erase_all(name, "class ");
39 |     detail::erase_all(name, "struct ");
40 |     detail::erase_all(name, "enum ");
41 | #endif
42 |     detail::erase_all(name, "pybind11::");
43 | }
44 | NAMESPACE_END(detail)
45 | 
46 | /// Return a string representation of a C++ type
47 | template <typename T> static std::string type_id() {
48 |     std::string name(typeid(T).name());
49 |     detail::clean_type_id(name);
50 |     return name;
51 | }
52 | 
53 | NAMESPACE_END(pybind11)
54 | 


--------------------------------------------------------------------------------
/third_party/crow/amalgamate/merge_all.py:
--------------------------------------------------------------------------------
 1 | """Merges all the header files."""
 2 | from glob import glob
 3 | from os import path as pt
 4 | import re
 5 | from collections import defaultdict
 6 | import sys
 7 | 
 8 | header_path = "../include"
 9 | if len(sys.argv) > 1:
10 |     header_path = sys.argv[1]
11 | 
12 | OUTPUT = 'crow_all.h'
13 | re_depends = re.compile('^#include "(.*)"', re.MULTILINE)
14 | headers = [x.rsplit('/', 1)[-1] for x in glob(pt.join(header_path, '*.h*'))]
15 | headers += ['crow/' + x.rsplit('/', 1)[-1] for x in glob(pt.join(header_path, 'crow/*.h*'))]
16 | print(headers)
17 | edges = defaultdict(list)
18 | for header in headers:
19 |     d = open(pt.join(header_path, header)).read()
20 |     match = re_depends.findall(d)
21 |     for m in match:
22 |         # m should included before header
23 |         edges[m].append(header)
24 | 
25 | visited = defaultdict(bool)
26 | order = []
27 | 
28 | 
29 | def dfs(x):
30 |     """Ensure all header files are visited."""
31 |     visited[x] = True
32 |     for y in edges[x]:
33 |         if not visited[y]:
34 |             dfs(y)
35 |     order.append(x)
36 | 
37 | for header in headers:
38 |     if not visited[header]:
39 |         dfs(header)
40 | 
41 | order = order[::-1]
42 | for x in edges:
43 |     print(x, edges[x])
44 | for x in edges:
45 |     for y in edges[x]:
46 |         assert order.index(x) < order.index(y), 'cyclic include detected'
47 | 
48 | print(order)
49 | build = []
50 | for header in order:
51 |     d = open(pt.join(header_path, header)).read()
52 |     build.append(re_depends.sub(lambda x: '\n', d))
53 |     build.append('\n')
54 | 
55 | open(OUTPUT, 'w').write('\n'.join(build))
56 | 


--------------------------------------------------------------------------------
/third_party/gflags/BUILD:
--------------------------------------------------------------------------------
# Builds gflags from the bundled source tarball with its own configure/make
# and exposes the result as a cc_library.
licenses(["notice"])

package(default_visibility = ["//visibility:public"])


# Version of the bundled tarball; the file and directory names derive from it.
package_version = "2.0"

package_file =  "gflags-" + package_version + ".tar.gz"

package_dir = "gflags-" + package_version

# Unpacks the tarball into a temporary directory, runs configure + make
# install with the genrule output directory as install prefix, then removes
# the temporary directory. Only the files listed in `outs` are kept.
genrule(
    name = "gflags-srcs",
    srcs = [
        package_file,
    ],
    outs = [
        "include/gflags/gflags.h",
        "include/gflags/gflags_completions.h",
        "include/gflags/gflags_declare.h",
        "include/google/gflags.h",
        "include/google/gflags_completions.h",
        "lib/libgflags.a",
        "lib/libgflags_nothreads.a",
    ],
    cmd = "\n".join([
        # $(@D) is the output directory of this rule; make it absolute because
        # the script changes directory below.
        "export INSTALL_DIR=$$(pwd)/$(@D)",
        "export TMP_DIR=$$(mktemp -d -t gflags.XXXXX)",
        "mkdir -p $$TMP_DIR",
        "cp -R $(SRCS) $$TMP_DIR",
        "cd $$TMP_DIR",
        "tar xfz " + package_file,
        "cd " + package_dir,
        # Static, position-independent library built against the old
        # (pre-C++11) libstdc++ ABI.
        "CPPFLAGS=\"-D_GLIBCXX_USE_CXX11_ABI=0\" ./configure --prefix=$$INSTALL_DIR  --with-pic=yes --enable-shared=no",
        "make install",
        "rm -rf $$TMP_DIR",
    ]),
)

# Wraps the prebuilt static library and headers produced by :gflags-srcs.
# NOTE(review): empty.cc is presumably only there to give the rule a source
# file to compile — confirm.
cc_library(
    name = "gflags-cxx",
    srcs = [
        "empty.cc",
        "include/gflags/gflags_declare.h",
        "lib/libgflags.a",
    ],
    hdrs = [
        "include/gflags/gflags.h",
    ],
    includes = [
        "include",
    ],
    # linkstatic = 1,
)

# Convenience target that groups the library under the plain package name.
filegroup(
    name = "gflags",
    srcs = [
        ":gflags-cxx",
    ],
)
62 | 


--------------------------------------------------------------------------------
/kcws/cc/test_ac_scanner.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  test_ac_scanner.cc
 5 |  * Author:  Koth
 6 |  * Create Time: 2016-12-09 17:02:56
 7 |  * Description:
 8 |  *
 9 |  */
10 | #include <cstring>
11 | 
12 | #include "base/base.h"
13 | #include "kcws/cc/ac_scanner.h"
14 | #include "utils/basic_string_util.h"
15 | 
16 | DEFINE_string(test_string, "挑战中共创辉煌国际", "the test string");
// Reporter used by this test binary: logs every reported match (payload,
// position and length) and returns false.
// NOTE(review): what a false return means to the scanner (continue vs. stop)
// is defined in ac_scanner.h — confirm there.
class TestScanReporter: public ScanReporter<uint32_t> {
 public:
  bool callback(uint32_t pos, uint32_t& data, size_t len) override {
    VLOG(0) << "got data:" << data << ",at pos:" << pos << ",len:" << len;
    return false;
  }
};
24 | int main(int argc,  char* argv[]) {
25 |   FLAGS_v = 0;
26 |   FLAGS_logtostderr = true;
27 |   base::Init(argc, argv);
28 |   AcScanner<UnicodeStr, uint32_t> ac_scanner;
29 |   const char* dicts[] = {
30 |     "中共",
31 |     "共创",
32 |     "挑战",
33 |     "辉煌",
34 |     "辉煌国际"
35 |   };
36 |   for (size_t i = 0; i < sizeof(dicts) / sizeof(char*); i++) {
37 |     UnicodeStr ustr;
38 |     BasicStringUtil::u8tou16(dicts[i], strlen(dicts[i]), ustr);
39 |     ac_scanner.pushNode(ustr, i);
40 |   }
41 |   ac_scanner.buildFailNode();
42 |   VLOG(0) << "total node:" << ac_scanner.NumItem();
43 |   UnicodeStr testu;
44 |   TestScanReporter reporter;
45 |   BasicStringUtil::u8tou16(FLAGS_test_string.c_str(), FLAGS_test_string.size(), testu);
46 |   VLOG(0) << "test string len:" << testu.size();
47 |   bool ret = ac_scanner.doScan(testu, &reporter);
48 |   VLOG(0) << "scan return:" << ret;
49 |   return 0;
50 | }


--------------------------------------------------------------------------------
/kcws/cc/pos_tagger.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  pos_tagger.h
 5 |  * Author:  Koth
 6 |  * Create Time: 2017-02-01 14:02:35
 7 |  * Description:
 8 |  *
 9 |  */
10 | #ifndef KCWS_CC_POS_TAGGER_H_
11 | #define KCWS_CC_POS_TAGGER_H_
12 | #include <string>
13 | #include <vector>
14 | #include <memory>
15 | #include <unordered_map>
16 | #include "utils/basic_string_util.h"
17 | #include "utils/basic_vocab.h"
18 | namespace tf {
19 | class TfModel;
20 | }  // namespace tf
namespace kcws {
// Value type of the word vocabulary (see PosTagger::word_vocab_).
// NOTE(review): the meaning of `chars` (fixed buffer of 5 UnicodeCharT) and
// `idx` is not visible in this header — confirm against pos_tagger.cc.
struct WordInfo {
  UnicodeCharT chars[5];
  int idx;
};
// Part-of-speech tagger backed by a tf::TfModel.
class PosTagger {
 public:
  PosTagger();
  virtual  ~PosTagger();

  // Loads the model and the word / char / tag vocabularies from the given
  // paths and records the maximum sentence length.
  // NOTE(review): presumably returns false on failure — confirm.
  bool LoadModel(const std::string& modelPath,
                 const std::string& wordVocabPath,
                 const std::string& charVocabPath,
                 const std::string& tagVocabPath,
                 int maxSentenceLen);
  // Tags a batch of sentences (each a list of words); results are written to
  // the `tags` out-parameter.
  // NOTE(review): presumably one tag list per input sentence — confirm.
  bool Tag(const std::vector<std::vector<std::string>>& sentences,
           std::vector<std::vector<std::string>>& tags);
  // Fills `word` from the string `str`.
  void BuildWordInfo(const std::string& str, WordInfo& word);
 private:
  std::unique_ptr<tf::TfModel> model_;
  std::unordered_map<UnicodeCharT, int> char_vocab_;
  std::unordered_map<std::string, WordInfo> word_vocab_;
  std::unordered_map<int, std::string> tag_vocab_;
  int max_sentence_len_;
  int num_tags_;
  std::vector<std::vector<float>> transitions_;
  // NOTE(review): raw 2-D buffers; allocation and release are not visible in
  // this header — presumably Viterbi back-pointers and scores owned by this
  // object, confirm in pos_tagger.cc.
  int** bp_;
  float** scores_;
};

}  // namespace kcws
52 | #endif  // KCWS_CC_POS_TAGGER_H_
53 | 


--------------------------------------------------------------------------------
/third_party/glog/BUILD:
--------------------------------------------------------------------------------
# Builds glog from the bundled source tarball with its own configure/make
# and exposes the result as a cc_library.
licenses(["notice"])

package(default_visibility = ["//visibility:public"])

# Version of the bundled tarball; the file and directory names derive from it.
package_version = "0.3.4"

package_file = "glog-" + package_version + ".tar.gz"

package_dir = "glog-" + package_version

# Unpacks the tarball into a temporary directory, runs configure + make
# install with the genrule output directory as install prefix, then removes
# the temporary directory. Only the files listed in `outs` are kept.
genrule(
    name = "glog-srcs",
    srcs = [
        package_file,
    ],
    outs = [
        "include/glog/log_severity.h",
        "include/glog/logging.h",
        "include/glog/raw_logging.h",
        "include/glog/stl_logging.h",
        "include/glog/vlog_is_on.h",
        "lib/libglog.a",
    ],
    cmd = "\n".join([
        # $(@D) is the output directory of this rule; make it absolute because
        # the script changes directory below.
        "export INSTALL_DIR=$$(pwd)/$(@D)",
        "export TMP_DIR=$$(mktemp -d -t glog.XXXXX)",
        "mkdir -p $$TMP_DIR",
        "cp -R $(SRCS) $$TMP_DIR",
        "cd $$TMP_DIR",
        "tar xfz " + package_file,
        "cd " + package_dir,
        # Static, position-independent library built against the old
        # (pre-C++11) libstdc++ ABI.
        "CPPFLAGS=\"-D_GLIBCXX_USE_CXX11_ABI=0\" ./configure --prefix=$$INSTALL_DIR --enable-shared=no --with-pic=yes",
        "make install",
        "rm -rf $$TMP_DIR",
    ]),
)

# Wraps the prebuilt static library and headers produced by :glog-srcs;
# logging.h is the only published header, the others are listed in srcs.
cc_library(
    name = "glog-cxx",
    srcs = [
        "empty.cc",
        "include/glog/log_severity.h",
        "include/glog/raw_logging.h",
        "include/glog/stl_logging.h",
        "include/glog/vlog_is_on.h",
        "lib/libglog.a",
    ],
    hdrs = [
        "include/glog/logging.h",
    ],
    includes = [
        "include",
    ],
    # linkstatic = 1,
    deps = [
        "//third_party/gflags:gflags-cxx",
    ],
)

# Convenience target that groups the library under the plain package name.
filegroup(
    name = "glog",
    srcs = [
        ":glog-cxx",
    ],
)
66 | 


--------------------------------------------------------------------------------
/third_party/pybind11/functional.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     pybind11/functional.h: std::function<> support
 3 | 
 4 |     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
 5 | 
 6 |     All rights reserved. Use of this source code is governed by a
 7 |     BSD-style license that can be found in the LICENSE file.
 8 | */
 9 | 
10 | #pragma once
11 | 
12 | #include "pybind11.h"
13 | #include <functional>
14 | 
15 | NAMESPACE_BEGIN(pybind11)
16 | NAMESPACE_BEGIN(detail)
17 | 
/// Caster between Python callables and std::function<Return(Args...)>.
template <typename Return, typename... Args> struct type_caster<std::function<Return(Args...)>> {
    typedef std::function<Return(Args...)> type;
    /// `void` cannot be named as a caster type, so void_type stands in for it
    /// when building the signature description below.
    typedef typename std::conditional<std::is_same<Return, void>::value, void_type, Return>::type retval_type;
public:
    /// Python -> C++: accepts `src_` only if, after detail::get_function, it
    /// is non-null and callable. `value` becomes a wrapper that holds its own
    /// reference to the Python object.
    bool load(handle src_, bool) {
        src_ = detail::get_function(src_);
        if (!src_ || !PyCallable_Check(src_.ptr()))
            return false;
        object src(src_, true);
        value = [src](Args... args) -> Return {
            // The wrapper may be invoked from arbitrary C++ code, so the GIL
            // is acquired before the Python object is called.
            gil_scoped_acquire acq;
            object retval(src(std::move(args)...));
            /* Visual studio 2015 parser issue: need parentheses around this expression */
            return (retval.template cast<Return>());
        };
        return true;
    }

    /// C++ -> Python: wraps the callable in a cpp_function and hands the
    /// resulting reference over to the caller.
    template <typename Func>
    static handle cast(Func &&f_, return_value_policy policy, handle /* parent */) {
        return cpp_function(std::forward<Func>(f_), policy).release();
    }

    /// Descriptor of the form "function<(args) -> ret>".
    PYBIND11_TYPE_CASTER(type, _("function<") +
            type_caster<std::tuple<Args...>>::name() + _(" -> ") +
            type_caster<retval_type>::name() +
            _(">"));
};
46 | 
47 | NAMESPACE_END(detail)
48 | NAMESPACE_END(pybind11)
49 | 


--------------------------------------------------------------------------------
/utils/BUILD:
--------------------------------------------------------------------------------
 1 | cc_library(
 2 |     name = "basic_string_util",
 3 |     srcs = [
 4 |         "basic_string_util.h",
 5 |     ],
 6 |     visibility = ["//visibility:public"],
 7 | )
 8 | 
# JSON helpers (jsonxx plus json_util.h). alwayslink = 1 keeps all object
# code of this library in any binary that depends on it.
# NOTE(review): both headers appear in srcs and in hdrs; the hdrs entries
# alone would do.
cc_library(
    name = "jsonxx",
    srcs = [
        "json_util.h",
        "jsonxx.cc",
        "jsonxx.h",
    ],
    hdrs = [
        "json_util.h",
        "jsonxx.h",
    ],
    linkstatic = 1,
    visibility = ["//visibility:public"],
    alwayslink = 1,
)

# word2vec-based vocabulary (word2vec_vob.h) together with the Vocab
# interface header it builds on.
cc_library(
    name = "word2vec_vob",
    srcs = [
        "word2vec_vob.cc",
    ],
    hdrs = [
        "vocab.h",
        "word2vec_vob.h",
    ],
    copts = [
        "-g",
        "-O3",
        "-std=c++11",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":basic_string_util",
        "//base",
        # '@re2//:re2',
    ],
)

# Shared library (linkshared = 1) exposing word2vec_vob through pybind11.
# NOTE(review): glob() over a single literal file name behaves like listing
# the file directly, except that a missing file does not fail the load.
cc_binary(
    name = "w2v.so",
    srcs = glob([
        "py_word2vec_vob.cc",
    ]),
    copts = [
        "-std=c++11",
        "-fPIC",
    ],
    linkshared = 1,
    visibility = ["//visibility:public"],
    deps = [
        ":word2vec_vob",
        "//base",
        "//third_party/pybind11",
    ],
)

# Basic vocabulary implementation (basic_vocab.h) together with the Vocab
# interface header.
cc_library(
    name = "basic_vocab",
    srcs = [
        "basic_vocab.cc",
    ],
    hdrs = [
        "basic_vocab.h",
        "vocab.h",
    ],
    copts = [
        "-g",
        "-O3",
        "-std=c++11",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":basic_string_util",
        "//base",
    ],
)
85 | 


--------------------------------------------------------------------------------
/third_party/crow/include/crow/http_request.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <boost/asio.hpp>
 4 | 
 5 | #include "third_party/crow/include/crow/common.h"
 6 | #include "third_party/crow/include/crow/ci_map.h"
 7 | #include "third_party/crow/include/crow/query_string.h"
 8 | 
 9 | namespace crow {
/// Return the value stored under `key` in `headers`, or a reference to a
/// shared empty string when the key is absent. `T` must provide find()/end()
/// over pairs whose `second` is a std::string.
/// The container is searched once (find) rather than twice (count + find).
template <typename T>
inline const std::string& get_header_value(const T& headers, const std::string& key) {
  // Fallback returned by reference; const so that no caller can modify it.
  static const std::string empty;
  const auto it = headers.find(key);
  return it != headers.end() ? it->second : empty;
}
18 | 
19 | struct DetachHelper;
20 | 
/// An HTTP request: method, URL (raw and processed), query parameters,
/// headers and body, plus hooks into the server that is handling it.
struct request {
  HTTPMethod method;
  std::string raw_url;
  std::string url;
  query_string url_params;
  ci_map headers;
  std::string body;

  // Both pointers are value-initialized to null; nothing in this struct
  // assigns them.
  // NOTE(review): presumably filled in by the connection handling code — confirm.
  void* middleware_context{};
  boost::asio::io_service* io_service{};

  /// Default request: GET with empty URL, headers and body.
  request()
    : method(HTTPMethod::Get) {
  }

  /// Builds a request from its parts; all arguments are taken by value and
  /// moved into the members.
  request(HTTPMethod method, std::string raw_url, std::string url, query_string url_params, ci_map headers, std::string body)
    : method(method), raw_url(std::move(raw_url)), url(std::move(url)), url_params(std::move(url_params)), headers(std::move(headers)), body(std::move(body)) {
  }

  /// Adds a header entry via emplace on `headers`.
  void add_header(std::string key, std::string value) {
    headers.emplace(std::move(key), std::move(value));
  }

  /// Returns the value stored for `key`, or an empty string if absent.
  const std::string& get_header_value(const std::string& key) const {
    return crow::get_header_value(headers, key);
  }

  /// Forwards `handler` to io_service->post(). `io_service` is dereferenced
  /// without a check, so it must be non-null.
  template<typename CompletionHandler>
  void post(CompletionHandler handler) {
    io_service->post(handler);
  }

  /// Forwards `handler` to io_service->dispatch(). `io_service` is
  /// dereferenced without a check, so it must be non-null.
  template<typename CompletionHandler>
  void dispatch(CompletionHandler handler) {
    io_service->dispatch(handler);
  }

};
59 | }
60 | 


--------------------------------------------------------------------------------
/utils/word2vec_vob.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2015 Rongall.com. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  word2vec_vob.h
 5 |  * Description:   description
 6 |  * Author: Koth(Yaowen Chen)
 7 |  *
 8 |  */
 9 | #ifndef UTILS_WORD2VEC_VOB_H_
10 | #define UTILS_WORD2VEC_VOB_H_
11 | #include <vector>
12 | #include <string>
13 | #include <unordered_map>
14 | 
15 | #include "utils/vocab.h"
16 | 
17 | namespace utils {
18 | struct WV;
19 | class Word2vecVocab: public Vocab {
20 |  public:
21 |   enum OOV_OPT {
22 |     USE_BLANK = 0,
23 |     USE_OOV = 1,
24 |     USE_RANDOM = 2,
25 |     USE_ONE_RANDOM = 3,
26 |   };
27 |   Word2vecVocab(): f_dim_(0), avg_vals_(NULL), std_vals_(NULL), map_word_(false) {}
28 |   virtual ~Word2vecVocab() {
29 |     if (avg_vals_) {
30 |       delete[] avg_vals_;
31 |     }
32 |     if (std_vals_) {
33 |       delete[] std_vals_;
34 |     }
35 |     avg_vals_ = std_vals_ = NULL;
36 |   }
37 |   bool Load(const std::string& path) override;
38 |   int GetVectorDim()const {
39 |     return f_dim_;
40 |   }
41 |   void SetMapword(bool mapword);
42 |   bool GetMapword();
43 |   bool GetVector(const std::string& word, std::vector<float>** vec, OOV_OPT opt = USE_BLANK);
44 |   std::vector<float> GetFeatureOrEmpty(const std::string& word);
45 |   int GetWordIndex(const std::string& word) override;
46 |   int GetTotalWord() override;
47 |   bool DumpBasicVocab(const std::string& path);
48 | 
49 |  private:
50 |   struct WV {
51 |     std::vector<float> vect;
52 |     int idx;
53 |   };
54 |   std::unordered_map<std::string, WV> f_map_;
55 |   int f_dim_;
56 |   float*  avg_vals_;
57 |   float* std_vals_;
58 |   std::vector<float> oov_feature_;
59 |   bool map_word_;
60 | };
61 | }  // namespace utils
62 | #endif  // UTILS_WORD2VEC_VOB_H_
63 | 


--------------------------------------------------------------------------------
/kcws/train/generate_char_embedding.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Koth
 3 | # @Date:   2016-11-30 19:59:15
 4 | # @Last Modified by:   Koth
 5 | # @Last Modified time: 2016-11-30 20:58:29
 6 | import sys
 7 | import w2v
 8 | 
 9 | SEQ_LEN = 5
10 | 
11 | 
def processFile(inp, oup, vob):
  """Turn segmented text into fixed-width training rows.

  Each non-blank line of `inp` is split on double spaces into words. Every
  character is mapped to `vob.GetWordIndex(...)` (kept as a string) and tagged
  1 when it is the last character of its word, else 0. For every start
  position, windows of growing length are written to `oup`, right-padded
  with "0" to SEQ_LEN ids and followed by a 0/1 label.
  """
  while True:
    line = inp.readline()
    if not line:
      break
    line = line.strip()
    if not line:
      continue
    ids = []
    ends = []
    for word in line.split("  "):
      chars = unicode(word.decode("utf-8"))
      if len(chars) < 1:
        continue
      last = len(chars) - 1
      for pos, ch in enumerate(chars):
        ids.append(str(vob.GetWordIndex(str(ch.encode("utf8")))))
        ends.append(1 if pos == last else 0)
    total = len(ids)
    for start in range(total):
      starts_on_end = (ends[start] == 1)
      # A window that starts on a word-final character is only extended once.
      span = 2 if starts_on_end else SEQ_LEN
      seen_stop = starts_on_end
      for extra in range(1, span):
        stop = start + extra + 1
        if stop > total:
          continue
        window = ids[start:stop] + ["0"] * (SEQ_LEN - (extra + 1))
        label = 0
        if ends[start + extra] == 1:
          # Only the first word end reached inside a window is labelled 1.
          if not seen_stop:
            label = 1
          seen_stop = True
        oup.write("%s\n" % (" ".join(window) + " " + str(label)))
56 | 
57 | 
def main(argc, argv):
  """Command-line entry point: <input> <output> <vec>.

  Loads the vocabulary from argv[3], then converts argv[1] into argv[2].
  Exits with status 1 when fewer than three arguments are given.
  """
  if argc < 4:
    print("Usage: %s <input>  <output> <vec>" % (argv[0]))
    sys.exit(1)
  vob = w2v.Word2vecVocab()
  vob.Load(argv[3])
  # Context managers close (and flush) both files even if processing raises.
  with open(argv[1], "r") as inp, open(argv[2], "w") as oup:
    processFile(inp, oup, vob)
67 | 
68 | 
69 | if __name__ == '__main__':
70 |   main(len(sys.argv), sys.argv)


--------------------------------------------------------------------------------
/pos_train.md:
--------------------------------------------------------------------------------
 1 | ### 词性标注训练过程
 2 | 
 3 | 
 4 | - 1)准备单词word2vec训练样本
 5 | 
 6 | 
 7 | ``` 
 8 |     python kcws/train/prepare_pos.py  /e/data/people_2014  pos_lines.txt
 9 | ```
10 | 
11 | 
12 | 
13 | - 2)使用word2vec导出即将使用的词词典
14 | 
15 | ``` 
16 |     bazel build -c opt third_party/word2vec:word2vec
    bazel-bin/third_party/word2vec/word2vec -train pos_lines.txt -min-count 5 -save-vocab pre_word_vec.txt
18 | ```
19 | - 3)替换单词中的UNK
20 | 
21 | 
22 | ``` 
23 |  python kcws/train/replace_unk.py  pre_word_vec.txt pos_lines.txt pos_lines_with_unk.txt
24 | ```
25 | 
26 | - 4)训练词向量
27 | 
28 | ``` 
29 | bazel-bin/third_party/word2vec/word2vec  -train pos_lines_with_unk.txt -output word_vec.txt -size 150 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0  -cbow 0 -iter 3 -min-count 5 -hs 1
30 | ```
31 | 
32 | - 5)统计词性tag出现频次，生成词性tag集合
33 |   
34 | ``` 
35 | python kcws/train/stats_pos.py  /e/data/people_2014 pos_vocab.txt  lines_withpos.txt
36 | ```
37 | 
38 | - 6)生成训练样本
39 |   
40 | ``` 
41 |  bazel build -c opt kcws/train:generate_pos_train
42 | ```
43 | 
44 | 
45 | ``` 
46 |  bazel-bin/kcws/train/generate_pos_train word_vec.txt char_vec.txt  pos_vocab.txt  /e/data/people_2014  pos_train.txt
47 | ```
48 | 
49 | 以上char_vec.txt可使用分词中相同的文件
50 | 
51 | 
52 |  
53 | - 7)去重，乱序，分开训练集，测试集
54 | 
55 |    
56 | 
57 | ``` 
58 | sort -u pos_train.txt>pos_train.u
59 | shuf pos_train.u >pos_train.txt
60 | head -n 230000 pos_train.txt >train.txt
61 | tail -n 51362 pos_train.txt >test.txt
62 | ``` 
63 | 
64 | - 8)训练
65 | 
66 | ``` 
67 | python kcws/train/train_pos.py --train_data_path train.txt --test_data_path test.txt --log_dir pos_logs --word_word2vec_path word_vec.txt --char_word2vec_path char_vec.txt 
68 | ```
69 | 
70 | 
71 | - 9)模型导出
72 | 
73 | ```
74 | python tools/freeze_graph.py --input_graph pos_logs/graph.pbtxt --input_checkpoint pos_logs/model.ckpt --output_node_names "transitions,Reshape_9" --output_graph kcws/models/pos_model.pbtxt
75 | ```
76 | 


--------------------------------------------------------------------------------
/kcws/cc/tf_seg_model.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  tf_seg_model.h
 5 |  * Author:  Koth
 6 |  * Create Time: 2016-11-20 10:31:03
 7 |  * Description:
 8 |  *
 9 |  */
10 | #ifndef KCWS_TF_SEG_MODEL_H_
11 | #define KCWS_TF_SEG_MODEL_H_
12 | #include <string>
13 | #include <vector>
14 | #include <memory>
15 | #include <unordered_map>
16 | #include "utils/basic_string_util.h"
17 | #include "utils/basic_vocab.h"
18 | #include "kcws/cc/ac_scanner.h"
19 | namespace tf {
20 | class TfModel;
21 | }  // namespace tf
22 | namespace kcws {
23 | typedef std::pair<size_t, size_t> SegTok;
24 | class SentenceBreaker;
25 | class PosTagger;
26 | class TfSegModel {
27 |  public:
28 |   TfSegModel();
29 |   virtual  ~TfSegModel();
30 | 
31 |   bool LoadModel(const std::string& modelPath,
32 |                  const std::string& vocabPath,
33 |                  int maxSentenceLen,
34 |                  const std::string& userDictPath = std::string());
35 |   bool Segment(const std::string& sentence,
36 |                std::vector<std::string>* pTopResult,
37 |                std::vector<std::string>* posTaggs = nullptr);
38 |   bool Segment(const std::vector<UnicodeStr>& sentences,
39 |                std::vector<std::vector<SegTok>>* pTopKResults);
40 |   void SetPosTagger(PosTagger* tagger);
41 |  private:
42 |   bool loadUserDict(const std::string& userDictPath);
43 |   std::unique_ptr<tf::TfModel> model_;
44 |   std::unique_ptr<PosTagger> tagger_;
45 |   std::unordered_map<UnicodeCharT, int> vocab_;
46 |   std::unique_ptr<SentenceBreaker> breaker_;
47 |   int max_sentence_len_;
48 |   int num_words_;
49 |   int num_tags_;
50 |   std::vector<std::vector<float>> transitions_;
51 |   int** bp_;
52 |   float** scores_;
53 |   AcScanner<UnicodeStr, int> scanner_;
54 | };
55 | 
56 | }  // namespace kcws
57 | 
58 | #endif  // KCWS_TF_SEG_MODEL_H_


--------------------------------------------------------------------------------
/third_party/crow/include/crow/dumb_timer_queue.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <boost/asio.hpp>
 4 | #include <deque>
 5 | #include <functional>
 6 | #include <chrono>
 7 | #include <thread>
 8 | 
 9 | #include "third_party/crow/include/crow/logging.h"
10 | 
11 | namespace crow {
12 | namespace detail {
13 | // fast timer queue for fixed tick value.
14 | class dumb_timer_queue {
15 |  public:
16 |   using key = std::pair<dumb_timer_queue*, int>;
17 | 
18 |   void cancel(key& k) {
19 |     auto self = k.first;
20 |     k.first = nullptr;
21 |     if (!self)
22 |       return;
23 | 
24 |     unsigned int index = (unsigned int)(k.second - self->step_);
25 |     if (index < self->dq_.size())
26 |       self->dq_[index].second = nullptr;
27 |   }
28 | 
29 |   key add(std::function<void()> f) {
30 |     dq_.emplace_back(std::chrono::steady_clock::now(), std::move(f));
31 |     int ret = step_ + dq_.size() - 1;
32 | 
33 |     CROW_LOG_DEBUG << "timer add inside: " << this << ' ' << ret ;
34 |     return {this, ret};
35 |   }
36 | 
37 |   void process() {
38 |     if (!io_service_)
39 |       return;
40 | 
41 |     auto now = std::chrono::steady_clock::now();
42 |     while (!dq_.empty()) {
43 |       auto& x = dq_.front();
44 |       if (now - x.first < std::chrono::seconds(tick))
45 |         break;
46 |       if (x.second) {
47 |         CROW_LOG_DEBUG << "timer call: " << this << ' ' << step_;
48 |         // we know that timer handlers are very simple currenty; call here
49 |         x.second();
50 |       }
51 |       dq_.pop_front();
52 |       step_++;
53 |     }
54 |   }
55 | 
56 |   void set_io_service(boost::asio::io_service& io_service) {
57 |     io_service_ = &io_service;
58 |   }
59 | 
60 |   dumb_timer_queue() noexcept {
61 |   }
62 | 
63 |  private:
64 | 
65 |   int tick{5};
66 |   boost::asio::io_service* io_service_{};
67 |   std::deque<std::pair<decltype(std::chrono::steady_clock::now()), std::function<void()>>> dq_;
68 |   int step_{};
69 | };
70 | }
71 | }
72 | 


--------------------------------------------------------------------------------
/kcws/cc/demo.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | <head>
 3 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
 4 | 
 5 | <title>DEMO</title>
 6 | <script language="Javascript">
 7 | function createXMLHttpRequest() {
 8 | 	var xmlHttp;
 9 | 	if (window.XMLHttpRequest) {
10 | 		xmlHttp = new XMLHttpRequest();
11 | 		if (xmlHttp.overrideMimeType)
12 | 			xmlHttp.overrideMimeType('text/xml');
13 | 	} else if (window.ActiveXObject) {
14 | 		try {
15 | 			xmlHttp = new ActiveXObject("Msxml2.XMLHTTP");
16 | 		} catch (e) {
17 | 			try {
18 | 				xmlHttp = new ActiveXObject("Microsoft.XMLHTTP");
19 | 			} catch (e) {
20 | 			}
21 | 		}
22 | 	}
23 | 	return xmlHttp;
24 | }
// POSTs the text of the #content textarea to /tf_seg/api and, on an HTTP 200
// reply, copies the response text into the #result textarea.
function submitContent(){
	var url = "/tf_seg/api";
	var content = document.getElementById('content').value;
	// Declared with `var`: it used to be assigned undeclared, which created an
	// implicit global (and throws in strict mode).
	var xmlHttp = createXMLHttpRequest();
	xmlHttp.open("POST", url, true);
	xmlHttp.setRequestHeader("Content-Type", "application/json;");
	xmlHttp.onreadystatechange = function(){
		// readyState 4 means the request has completed.
		if(xmlHttp.readyState == 4 && xmlHttp.status == 200) {
			var result = document.getElementById('result');
			result.value = xmlHttp.responseText;
		}
	};

	xmlHttp.send(content);
}
41 | 
42 | </script>
43 | </head>
44 | 
45 | <body>
46 | 
47 | 
48 | <table align="center">
49 | <tbody>
50 | <!--<form id="form1" name="form1" method="post" action="" onsubmit="return submitContent();" target="_blank">-->
51 | <tr>
52 | <td>
53 | 样例数据（JSON）:
54 | </td>
55 | <td><textarea id="content" cols="80" rows="17">
56 | {
57 | 	"sentence" : "赵雅淇洒泪道歉 和林丹没有任何经济关系"
58 | }
59 | 
60 | </textarea></td></tr>
<tr><td></td><td><p align="center"><input type="submit" value="点击提交" onclick="submitContent();"></p></td></tr>
62 | <tr><td></td><td></td></tr>
63 | 
64 | <tr>
65 | <td>
66 | API返回结果：
67 | </td>
68 | <td><textarea id="result" cols="80" rows="17" readonly="yes"></textarea></td></tr>
69 | 
70 | </tbody></table>
<!--</form>-->
72 | 
73 | 
74 | 
75 | 
76 | </body></html>
77 | 


--------------------------------------------------------------------------------
/utils/basic_vocab.cc:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <ctime>
 3 | #include <random>
 4 | #include <cmath>
 5 | #include "base/base.h"
 6 | #include "basic_string_util.h"
 7 | #include "basic_vocab.h"
 8 | namespace utils {
 9 | 
namespace {
// Normalisation hook used for the fallback lookup in GetWordIndex.
// Currently the identity: the word is returned unchanged.
std::string map_word(const std::string& word) {
  return std::string(word);
}
}  // namespace
15 | bool BasicVocab::Load(const std::string& path) {
16 |   FILE *fp = fopen(path.c_str(), "r");
17 |   if (fp == NULL) {
18 |     fprintf(stderr, "open file error:%s\n", path.c_str());
19 |     return false;
20 |   }
21 |   char line[4096] = {0};
22 |   int tn = 0;
23 |   while (fgets(line, sizeof(line) - 1, fp)) {
24 |     int nn = strlen(line);
25 |     while (nn && (line[nn - 1] == '\n' || line[nn - 1] == '\r')) {
26 |       nn -= 1;
27 |     }
28 |     if (nn <= 0) {
29 |       continue;
30 |     }
31 |     std::vector<std::string> terms;
32 |     BasicStringUtil::SplitString(line, nn, '\t', &terms);
33 |     nn = terms.size();
34 |     if (nn != 2) {
35 |       fprintf(stderr, "line len not comformed to dimension:%s:%d\n", line, nn);
36 |       return false;
37 |     }
38 |     const std::string& word = terms[0];
39 |     if (w_map_.find(word) != w_map_.end()) {
40 |       fprintf(stderr, "duplicate word:%s\n", word.c_str());
41 |       return false;
42 |     }
43 |     int idx = atoi(terms[1].c_str());
44 |     w_map_[word] = idx;
45 |     tn += 1;
46 |   }
47 |   fclose(fp);
48 |   return true;
49 | }
50 | int BasicVocab::GetWordIndex(const std::string& word) {
51 |   auto it = w_map_.find(word);
52 |   if ( it != w_map_.end()) {
53 |     return it->second;
54 |   } else {
55 |     if (!use_map_)return 0;
56 |     std::string mword = map_word(word);
57 |     it = w_map_.find(mword);
58 |     if (it != w_map_.end()) {
59 |       return it->second;
60 |     } else {
61 |       VLOG(0) << "not found map word:" << mword;
62 |       return 0;
63 |     }
64 |   }
65 | }
66 | 
67 | 
68 | int BasicVocab::GetTotalWord() {
69 |   return w_map_.size();
70 | }
71 | 
72 | 
73 | }  //  namespace utils
74 | 


--------------------------------------------------------------------------------
/third_party/crow/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | project (crow_all)
 3 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
 4 | find_package(Tcmalloc)
 5 | find_package(Threads)
 6 | find_package(OpenSSL)
 7 | if(OPENSSL_FOUND)
 8 |   include_directories(${OPENSSL_INCLUDE_DIR})
 9 | endif()
10 | 
11 | if (NOT CMAKE_BUILD_TYPE)
12 |     message(STATUS "No build type selected, default to Release")
13 |     set(CMAKE_BUILD_TYPE "Release")
14 | endif()
15 | 
16 | 
17 | if (MSVC)
18 | set(Boost_USE_STATIC_LIBS "On")
19 | find_package( Boost 1.52 COMPONENTS system thread regex REQUIRED )
20 | else()
21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++1y -pedantic -Wextra")
22 | find_package( Boost 1.52 COMPONENTS system thread REQUIRED )
23 | endif()
24 | 
25 | include_directories( ${Boost_INCLUDE_DIR} )
26 | 
27 | set(PROJECT_INCLUDE_DIR 
28 | ${PROJECT_SOURCE_DIR}/include
29 | )
30 | 
31 | include_directories("${PROJECT_INCLUDE_DIR}")
32 | include_directories("${PROJECT_SOURCE_DIR}")
33 |  
34 | #add_subdirectory(src)
35 | add_subdirectory(examples)
36 | if (MSVC)
37 | else()
38 | add_subdirectory(tests)
39 | 
40 | enable_testing()
41 | add_test(NAME crow_test COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tests/unittest)
42 | add_test(NAME template_test COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tests/template/test.py WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests/template)
43 | 
44 | file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/amalgamate)
45 | 
46 | add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/amalgamate/crow_all.h
47 |     COMMAND python ${PROJECT_SOURCE_DIR}/amalgamate/merge_all.py ${PROJECT_SOURCE_DIR}/include
48 |     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/amalgamate/crow_all.h ${PROJECT_SOURCE_DIR}/amalgamate
49 |     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/amalgamate
50 |     DEPENDS ${PROJECT_SOURCE_DIR}/include/*.h
51 |     )
52 | 
53 | add_custom_target(amalgamation ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/amalgamate/crow_all.h)
54 | endif()
55 | 


--------------------------------------------------------------------------------
/kcws/cc/viterbi_decode.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  viterbi_decode.cc
 5 |  * Author:  Koth
 6 |  * Create Time: 2017-02-01 13:47:48
 7 |  * Description:
 8 |  *
 9 |  */
10 | #include "kcws/cc/viterbi_decode.h"
11 | 
12 | namespace kcws {
13 | 
14 | int viterbi_decode(
15 |   const Eigen::TensorMap<Eigen::Tensor<float, 3, Eigen::RowMajor>, Eigen::Aligned>& predictions,
16 |   int sentenceIdx,
17 |   int nn,
18 |   const std::vector<std::vector<float>>& trans,
19 |   int** bp,
20 |   float** scores,
21 |   int ntags) {
22 |   for (int i = 0; i < ntags; i++) {
23 |     scores[0][i] = predictions(sentenceIdx, 0, i);
24 |   }
25 |   for (int i = 1; i < nn; i++) {
26 |     for (int  t = 0; t < ntags; t++) {
27 |       float maxScore = -1e7;
28 |       float emission = predictions(sentenceIdx, i, t);
29 |       for (int prev = 0; prev < ntags; prev++) {
30 |         float score = scores[(i - 1) % 2][prev] + trans[prev][t] + emission;
31 |         if (score > maxScore) {
32 |           maxScore = score;
33 |           bp[i - 1][t] = prev;
34 |         }
35 |       }
36 |       scores[i % 2][t] = maxScore;
37 |     }
38 |   }
39 |   float maxScore = scores[(nn - 1) % 2][0];
40 |   int ret = 0;
41 |   for (int i = 1; i < ntags; i++) {
42 |     if (scores[(nn - 1) % 2][i] > maxScore) {
43 |       ret = i;
44 |       maxScore = scores[(nn - 1) % 2][i];
45 |     }
46 |   }
47 |   return ret;
48 | }
49 | void get_best_path(
50 |   const Eigen::TensorMap<Eigen::Tensor<float, 3, Eigen::RowMajor>, Eigen::Aligned>& predictions,
51 |   int sentenceIdx,
52 |   int nn,
53 |   const std::vector<std::vector<float>>& trans,
54 |   int** bp,
55 |   float** scores,
56 |   std::vector<int>& resultTags,
57 |   int ntags) {
58 |   int lastTag = viterbi_decode(predictions, sentenceIdx, nn, trans, bp, scores, ntags);
59 |   resultTags.push_back(lastTag);
60 |   for (int i = nn - 2; i >= 0; i--) {
61 |     int bpTag = bp[i][lastTag];
62 |     resultTags.push_back(bpTag);
63 |     lastTag = bpTag;
64 |   }
65 | }
66 | 
67 | }  // namespace kcws
68 | 


--------------------------------------------------------------------------------
/kcws/train/generate_train_free.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding:utf-8 -*-
 3 | 
 4 | # File: generate_train_free.py
 5 | # Project: /e/code/kcws
 6 | # Created: Thu Jul 27 2017
 7 | # Author: Koth Chen
 8 | # Copyright (c) 2017 Koth
 9 | #
10 | # <<licensetext>>
11 | 
12 | 
13 | import sys
14 | import os
15 | import w2v
16 | import fire
17 | from sentence import Sentence
18 | 
19 | totalLine = 0
20 | longLine = 0
21 | 
22 | MAX_LEN = 80
23 | totalChars = 0
24 | 
25 | 
def processLine(line, vob, out):
    """Convert one tab-separated line into a fixed-width training row.

    The first space-separated piece of every tab field is added as a token to
    a Sentence. Sentences longer than MAX_LEN characters are only counted in
    `longLine`. Otherwise the ids and labels produced by
    `Sentence.generate_tr_line` are zero-padded to MAX_LEN each and written to
    `out` as 2 * MAX_LEN space-separated values. Updates the module counters
    totalLine, longLine and totalChars.
    """
    global totalLine
    global longLine
    global totalChars
    ss = line.split("\t")

    sentence = Sentence()
    for field in ss:
        ts = field.split(" ")
        ustr = unicode(ts[0].decode('utf8'))
        sentence.addToken(ustr)
    if sentence.chars > MAX_LEN:
        longLine += 1
    else:
        x = []
        y = []
        totalChars += sentence.chars
        sentence.generate_tr_line(x, y, vob)
        nn = len(x)
        assert (nn == len(y))
        padding = [0] * (MAX_LEN - nn)
        x.extend(padding)
        y.extend(padding)
        # The row is built from scratch. The old code reset its buffer inside
        # the padding loop, so a sentence needing no padding got the raw input
        # line prepended to its output.
        fields = [str(v) for v in x[:MAX_LEN]] + [str(v) for v in y[:MAX_LEN]]
        out.write("%s\n" % (" ".join(fields)))
    totalLine += 1
59 | 
60 | 
def doGen(inputPath, outputPath, vocabPath):
    """Generate the training file `outputPath` from the corpus `inputPath`.

    Loads the vocabulary from `vocabPath`, feeds every non-blank input line to
    processLine, then prints the line/character statistics.
    """
    global totalLine
    global longLine
    global totalChars
    vob = w2v.Word2vecVocab()
    vob.Load(vocabPath)
    with open(inputPath, "r") as inp:
        with open(outputPath, "w") as out:
            # Iterate the file lazily; readlines() held the whole corpus in
            # memory at once.
            for line in inp:
                line = line.strip()
                if not line:
                    continue
                processLine(line, vob, out)
    print("total:%d, long lines:%d, chars:%d" %
          (totalLine, longLine, totalChars))
76 | 
77 | 
def main():
    """Hand the command line over to python-fire."""
    fire.Fire()
80 | 
81 | 
82 | if __name__ == '__main__':
83 |     main()
84 | 


--------------------------------------------------------------------------------
/kcws/train/process_icwb.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Koth
 3 | # @Date:   2016-11-27 12:01:18
 4 | # @Last Modified by:   Koth
 5 | # @Last Modified time: 2016-11-27 20:26:31
 6 | import sys
 7 | import w2v
 8 | 
 9 | SEQ_LEN = 80
10 | 
11 | 
def processToken(x, y, tok, vob):
  """Append the ids and position labels of one word to `x` and `y`.

  Every character of `tok` adds `vob.GetWordIndex(<utf-8 bytes>)` to `x`.
  Labels added to `y`: 0 for a single-character word, otherwise 1 for the
  first character, 3 for the last and 2 for those in between.
  """
  size = len(tok)
  for pos, ch in enumerate(tok):
    x.append(vob.GetWordIndex(str(ch.encode("utf8"))))
    if size == 1:
      label = 0
    elif pos == 0:
      label = 1
    elif pos == size - 1:
      label = 3
    else:
      label = 2
    y.append(label)
26 | 
27 | 
def processFile(inp, oup, mode, vob):
  """Convert a corpus segmented by double spaces, one output line per input line.

  mode == 0: write the characters of the line separated by single spaces.
  otherwise: write SEQ_LEN ids followed by SEQ_LEN labels (see processToken),
  zero-padded; anything beyond SEQ_LEN is dropped. Blank lines are skipped.
  """
  while True:
    line = inp.readline()
    if not line:
      break
    line = line.strip()
    if not line:
      continue
    fields = []
    x = []
    y = []
    for s in line.split("  "):
      ustr = unicode(s.decode("utf-8"))
      if len(ustr) < 1:
        continue
      if mode == 0:
        for ch in ustr:
          fields.append(str(ch.encode("utf8")))
      else:
        processToken(x, y, ustr, vob)
    if mode != 0:
      missing = SEQ_LEN - len(x)
      x.extend([0] * missing)
      y.extend([0] * missing)
      fields.extend(str(v) for v in x[:SEQ_LEN])
      fields.extend(str(v) for v in y[:SEQ_LEN])
    oup.write("%s\n" % (" ".join(fields)))
63 | 
64 | 
def main(argc, argv):
  """Command-line entry point: <input> <output> [mode] [vec_path].

  The mode and the vocabulary are only read when both optional arguments are
  present; otherwise mode 0 is used and no vocabulary is loaded.
  """
  if argc < 3:
    print(
        "Usage: %s <input>  <output> [model | 0 for w2v , 1 for training]  [vec_path | if mode if not 0]"
        % (argv[0]))
    sys.exit(1)
  mode = 0
  vob = None
  if argc > 4:
    mode = int(argv[3])
    vob = w2v.Word2vecVocab()
    vob.Load(argv[4])
  # Context managers close (and flush) both files even if processing raises.
  with open(argv[1], "r") as inp, open(argv[2], "w") as oup:
    processFile(inp, oup, mode, vob)
80 | 
81 | 
82 | if __name__ == '__main__':
83 |   main(len(sys.argv), sys.argv)


--------------------------------------------------------------------------------
/third_party/crow/include/crow/socket_adaptors.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <boost/asio.hpp>
 3 | #ifdef CROW_ENABLE_SSL
 4 | #include <boost/asio/ssl.hpp>
 5 | #endif
 6 | #include "third_party/crow/include/crow/settings.h"
 7 | namespace crow {
 8 | using namespace boost;
 9 | using tcp = asio::ip::tcp;
10 | 
11 | struct SocketAdaptor {
12 |   using context = void;
13 |   SocketAdaptor(boost::asio::io_service& io_service, context*)
14 |     : socket_(io_service) {
15 |   }
16 | 
17 |   boost::asio::io_service& get_io_service() {
18 |     return socket_.get_io_service();
19 |   }
20 | 
21 |   tcp::socket& raw_socket() {
22 |     return socket_;
23 |   }
24 | 
25 |   tcp::socket& socket() {
26 |     return socket_;
27 |   }
28 | 
29 |   tcp::endpoint remote_endpoint() {
30 |     return socket_.remote_endpoint();
31 |   }
32 | 
33 |   bool is_open() {
34 |     return socket_.is_open();
35 |   }
36 | 
37 |   void close() {
38 |     socket_.close();
39 |   }
40 | 
41 |   template <typename F>
42 |   void start(F f) {
43 |     f(boost::system::error_code());
44 |   }
45 | 
46 |   tcp::socket socket_;
47 | };
48 | 
49 | #ifdef CROW_ENABLE_SSL
50 | struct SSLAdaptor {
51 |   using context = boost::asio::ssl::context;
52 |   using ssl_socket_t = boost::asio::ssl::stream<tcp::socket>;
53 |   SSLAdaptor(boost::asio::io_service& io_service, context* ctx)
54 |     : ssl_socket_(new ssl_socket_t(io_service, *ctx)) {
55 |   }
56 | 
57 |   boost::asio::ssl::stream<tcp::socket>& socket() {
58 |     return *ssl_socket_;
59 |   }
60 | 
61 |   tcp::socket::lowest_layer_type&
62 |   raw_socket() {
63 |     return ssl_socket_->lowest_layer();
64 |   }
65 | 
66 |   tcp::endpoint remote_endpoint() {
67 |     return raw_socket().remote_endpoint();
68 |   }
69 | 
70 |   bool is_open() {
71 |     return raw_socket().is_open();
72 |   }
73 | 
74 |   void close() {
75 |     raw_socket().close();
76 |   }
77 | 
78 |   boost::asio::io_service& get_io_service() {
79 |     return raw_socket().get_io_service();
80 |   }
81 | 
82 |   template <typename F>
83 |   void start(F f) {
84 |     ssl_socket_->async_handshake(boost::asio::ssl::stream_base::server,
85 |     [f](const boost::system::error_code & ec) {
86 |       f(ec);
87 |     });
88 |   }
89 | 
90 |   std::unique_ptr<boost::asio::ssl::stream<tcp::socket>> ssl_socket_;
91 | };
92 | #endif
93 | }
94 | 


--------------------------------------------------------------------------------
/third_party/crow/tests/template/comments.json:
--------------------------------------------------------------------------------
1 | {"__ATTN__":"Do not edit this file; changes belong in the appropriate YAML file.","overview":"Comment tags represent content that should never appear in the resulting\noutput.\n\nThe tag's content may contain any substring (including newlines) EXCEPT the\nclosing delimiter.\n\nComment tags SHOULD be treated as standalone when appropriate.\n","tests":[{"name":"Inline","data":{},"expected":"1234567890","template":"12345{{! Comment Block! }}67890","desc":"Comment blocks should be removed from the template."},{"name":"Multiline","data":{},"expected":"1234567890\n","template":"12345{{!\n  This is a\n  multi-line comment...\n}}67890\n","desc":"Multiline comments should be permitted."},{"name":"Standalone","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n{{! Comment Block! }}\nEnd.\n","desc":"All standalone comment lines should be removed."},{"name":"Indented Standalone","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n  {{! Indented Comment Block! }}\nEnd.\n","desc":"All standalone comment lines should be removed."},{"name":"Standalone Line Endings","data":{},"expected":"|\r\n|","template":"|\r\n{{! Standalone Comment }}\r\n|","desc":"\"\\r\\n\" should be considered a newline for standalone tags."},{"name":"Standalone Without Previous Line","data":{},"expected":"!","template":"  {{! I'm Still Standalone }}\n!","desc":"Standalone tags should not require a newline to precede them."},{"name":"Standalone Without Newline","data":{},"expected":"!\n","template":"!\n  {{! 
I'm Still Standalone }}","desc":"Standalone tags should not require a newline to follow them."},{"name":"Multiline Standalone","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n{{!\nSomething's going on here...\n}}\nEnd.\n","desc":"All standalone comment lines should be removed."},{"name":"Indented Multiline Standalone","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n  {{!\n    Something's going on here...\n  }}\nEnd.\n","desc":"All standalone comment lines should be removed."},{"name":"Indented Inline","data":{},"expected":"  12 \n","template":"  12 {{! 34 }}\n","desc":"Inline comments should not strip whitespace"},{"name":"Surrounding Whitespace","data":{},"expected":"12345  67890","template":"12345 {{! Comment Block! }} 67890","desc":"Comment removal should preserve surrounding whitespace."}]}


--------------------------------------------------------------------------------
/kcws/train/process_people.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Koth
 3 | # @Date:   2016-11-29 09:20:36
 4 | # @Last Modified by:   Koth
 5 | # @Last Modified time: 2016-11-29 15:58:30
 6 | 
 7 | import sys
 8 | import w2v
 9 | 
10 | SEQ_LEN = 80
11 | 
12 | 
def processToken(x, y, tok, vob):
  """Append the ids and position labels of one word to `x` and `y`.

  Every character of `tok` adds `vob.GetWordIndex(<utf-8 bytes>)` to `x`.
  Labels added to `y`: 0 for a single-character word, otherwise 1 for the
  first character, 3 for the last and 2 for those in between.
  """
  size = len(tok)
  for pos, ch in enumerate(tok):
    x.append(vob.GetWordIndex(str(ch.encode("utf8"))))
    if size == 1:
      label = 0
    elif pos == 0:
      label = 1
    elif pos == size - 1:
      label = 3
    else:
      label = 2
    y.append(label)
27 | 
28 | 
def processFile(inp, oup, mode, vob):
  """Convert a "word/tag" corpus separated by double spaces, line by line.

  The part of every token before its first "/" is the word. A token without
  "/" aborts the run with exit status 1.
  mode == 0: write the characters of the line separated by single spaces.
  otherwise: write SEQ_LEN ids followed by SEQ_LEN labels (see processToken),
  zero-padded; anything beyond SEQ_LEN is dropped. Blank lines are skipped.
  """
  global SEQ_LEN
  while True:
    line = inp.readline()
    if not line:
      break
    line = line.strip()
    if not line:
      continue
    ss = line.split("  ")
    oline = ""
    x = []
    y = []
    for s in ss:
      pos = s.find("/")
      # str.find returns -1 when the separator is missing. The old test
      # `if not pos` only fired for pos == 0 and let -1 through, which
      # silently cut the last character off the token.
      if pos < 0:
        print("fatal error '/' not found")
        sys.exit(1)
      s = s[:pos]
      ustr = unicode(s.decode("utf-8"))
      if len(ustr) < 1:
        continue
      if mode == 0:
        for i in range(len(ustr)):
          oline += str(ustr[i].encode("utf8"))
          oline += " "
      else:
        processToken(x, y, ustr, vob)
    if mode != 0:
      nn = len(x)
      for i in range(nn, SEQ_LEN):
        x.append(0)
        y.append(0)
      for i in range(SEQ_LEN):
        oline += str(x[i]) + " "
      for i in range(SEQ_LEN):
        oline += str(y[i]) + " "
    # Drop the trailing separator.
    olen = len(oline)
    oline = oline[:olen - 1]
    oup.write("%s\n" % (oline))
69 | 
70 | 
def main(argc, argv):
  """Command-line entry point.

  Args:
    argc: number of entries in argv.
    argv: [prog, input_path, output_path, mode, vec_path]; mode and vec_path
      are optional and only honoured when both are present (argc > 4).
      Without them the file is processed in mode 0 with no vocabulary.
  """
  if argc < 3:
    print(
        "Usage: %s <input>  <output> [mode | 0 for w2v , 1 for training]  [vec_path | if mode is not 0]"
        % (argv[0]))
    sys.exit(1)
  mode = 0
  vob = None
  if argc > 4:
    mode = int(argv[3])
    vob = w2v.Word2vecVocab()
    vob.Load(argv[4])
  # Context managers make sure both files are flushed and closed even when
  # processFile exits early.
  with open(argv[1], "r") as inp, open(argv[2], "w") as oup:
    processFile(inp, oup, mode, vob)


if __name__ == '__main__':
  main(len(sys.argv), sys.argv)


--------------------------------------------------------------------------------
/third_party/crow/include/crow/middleware_context.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "third_party/crow/include/crow/utility.h"
 4 | #include "third_party/crow/include/crow/http_request.h"
 5 | #include "third_party/crow/include/crow/http_response.h"
 6 | 
 7 | namespace crow {
 8 | namespace detail {
// Context object for a list of middlewares, built by recursive inheritance.
// It derives from the type produced by
// black_magic::pop_back<Middlewares...>::rebind<partial_context> and from the
// nested `context` type of black_magic::last_element_type<Middlewares...>.
// NOTE(review): black_magic is defined elsewhere; presumably the first base is
// partial_context of all middlewares except the last -- confirm in utility.h.
template <typename ... Middlewares>
struct partial_context
  : public black_magic::pop_back<Middlewares...>::template rebind<partial_context>
, public black_magic::last_element_type<Middlewares...>::type::context {
  // The same pop_back/rebind type that is used as the first base class.
  using parent_context = typename black_magic::pop_back<Middlewares...>::template rebind<::crow::detail::partial_context>;
  // partial<N> is this type when N is the index of the last middleware,
  // otherwise whatever parent_context::partial<N> names.
  template <int N>
  using partial = typename std::conditional < N == sizeof...(Middlewares) - 1, partial_context, typename parent_context::template partial<N >>::type;

  // Returns this object viewed as middleware T's context (static_cast to
  // T::context&).
  template <typename T>
typename T::context& get() {
  return static_cast<typename T::context&>(*this);
}
           };
22 | 
// Specialization for an empty middleware list: it has no bases, and
// partial<N> names this same empty type for every N.
template <>
struct partial_context<> {
  template <int>
  using partial = partial_context;
};
28 | 
// Declaration only (no definition in this header); it is needed here so that
// `context` below can name this function template as a friend.
template <int N, typename Context, typename Container, typename CurrentMW, typename ... Middlewares>
bool middleware_call_helper(Container& middlewares, request& req, response& res, Context& ctx);
31 | 
// Full middleware context. It inherits partial_context<Middlewares...>
// privately, so outside code reaches individual middleware contexts through
// get<T>() or through the befriended helper function templates.
template <typename ... Middlewares>
struct context : private partial_context<Middlewares...>
//struct context : private Middlewares::context... // simple but less type-safe
{
  // Friend declarations for the helper templates: both enable_if overloads of
  // after_handlers_call_helper (N == 0 and N > 0) and middleware_call_helper.
  template <int N, typename Context, typename Container>
  friend typename std::enable_if<(N == 0)>::type after_handlers_call_helper(Container& middlewares, Context& ctx, request& req, response& res);
  template <int N, typename Context, typename Container>
  friend typename std::enable_if < (N > 0) >::type after_handlers_call_helper(Container& middlewares, Context& ctx, request& req, response& res);

  template <int N, typename Context, typename Container, typename CurrentMW, typename ... Middlewares2>
  friend bool middleware_call_helper(Container& middlewares, request& req, response& res, Context& ctx);

  // Returns this object viewed as middleware T's context (static_cast to
  // T::context&).
  template <typename T>
  typename T::context& get() {
    return static_cast<typename T::context&>(*this);
  }

  // Forwards to partial_context<Middlewares...>::partial<N>.
  template <int N>
  using partial = typename partial_context<Middlewares...>::template partial<N>;
};
52 | }
53 | }
54 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build script for the Crow example programs.
cmake_minimum_required(VERSION 2.8)
project (crow_examples)

# MSVC builds only example_vs; every other toolchain builds the remaining
# examples below.
if (MSVC)
add_executable(example_vs example_vs.cpp)
target_link_libraries(example_vs ${Boost_LIBRARIES})
target_link_libraries(example_vs ${CMAKE_THREAD_LIBS_INIT})
else ()

add_executable(helloworld helloworld.cpp)
target_link_libraries(helloworld ${Boost_LIBRARIES})
target_link_libraries(helloworld ${CMAKE_THREAD_LIBS_INIT})

# The SSL example is only built when OPENSSL_FOUND is set.
if (OPENSSL_FOUND)
add_executable(example_ssl ssl/example_ssl.cpp)
target_link_libraries(example_ssl ${Boost_LIBRARIES})
target_link_libraries(example_ssl ${CMAKE_THREAD_LIBS_INIT} ${OPENSSL_LIBRARIES})
endif()

# NOTE(review): this target links ssl and crypto by bare name and is not
# guarded by OPENSSL_FOUND, unlike example_ssl above -- confirm that is
# intended.
add_executable(example_websocket websocket/example_ws.cpp)
target_link_libraries(example_websocket ${Boost_LIBRARIES})
target_link_libraries(example_websocket ${CMAKE_THREAD_LIBS_INIT} ssl crypto)

add_executable(example example.cpp)
#target_link_libraries(example crow)
target_link_libraries(example ${Boost_LIBRARIES})
target_link_libraries(example ${CMAKE_THREAD_LIBS_INIT})

# Link tcmalloc into `example` only when it was found.
if (Tcmalloc_FOUND)
target_link_libraries(example ${Tcmalloc_LIBRARIES})
endif(Tcmalloc_FOUND)

add_executable(example_with_all example_with_all.cpp)
#target_link_libraries(example crow)
target_link_libraries(example_with_all ${Boost_LIBRARIES})
target_link_libraries(example_with_all ${CMAKE_THREAD_LIBS_INIT})

# Copy example_test.py into the build directory as part of the default build.
add_custom_command(OUTPUT example_test.py
        COMMAND ${CMAKE_COMMAND} -E
        copy ${PROJECT_SOURCE_DIR}/example_test.py ${CMAKE_CURRENT_BINARY_DIR}/example_test.py
        DEPENDS ${PROJECT_SOURCE_DIR}/example_test.py
        )
add_custom_target(example_copy ALL DEPENDS example_test.py)

add_executable(example_chat example_chat.cpp)
#target_link_libraries(example_chat crow)
target_link_libraries(example_chat ${Boost_LIBRARIES})
target_link_libraries(example_chat ${CMAKE_THREAD_LIBS_INIT})
# Copy the chat page into the build directory; example_chat.cpp loads
# "example_chat.html" relative to the mustache base ".".
add_custom_command(OUTPUT example_chat.html
        COMMAND ${CMAKE_COMMAND} -E
        copy ${PROJECT_SOURCE_DIR}/example_chat.html ${CMAKE_CURRENT_BINARY_DIR}/example_chat.html
        DEPENDS ${PROJECT_SOURCE_DIR}/example_chat.html
        )
add_custom_target(example_chat_copy ALL DEPENDS example_chat.html)

#SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -g -pg" )
#SET( CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} -g -pg" )
endif()
59 | 


--------------------------------------------------------------------------------
/kcws/train/bilstm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding:utf-8 -*-
 3 | 
 4 | # File: bilstm.py
 5 | # Project: /e/code/kcws
 6 | # Created: Thu Aug 03 2017
 7 | # Author: Koth Chen
 8 | # Copyright (c) 2017 Koth
 9 | #
10 | # <<licensetext>>
11 | 
12 | import tensorflow as tf
13 | 
14 | 
class Model:
    """Bidirectional LSTM that produces per-position unary tag scores.

    The class owns the output projection (`W`, `b`); the LSTM cells are
    created inside `inference`.
    """

    def __init__(self,
                 numHidden,
                 maxSeqLen,
                 numTags):
        """Store the model sizes and create the projection variables.

        Args:
            numHidden: unit count passed to each LSTMCell.
            maxSeqLen: second dimension of the tensor returned by inference.
            numTags: number of tag scores per position.
        """
        self.num_hidden = numHidden
        self.num_tags = numTags
        self.max_seq_len = maxSeqLen
        # Projection from the concatenated forward/backward outputs
        # (hence numHidden * 2) to tag scores, with an L2 regularizer.
        self.W = tf.get_variable(
            shape=[numHidden * 2, numTags],
            initializer=tf.contrib.layers.xavier_initializer(),
            name="weights",
            regularizer=tf.contrib.layers.l2_regularizer(0.001))
        # NOTE(review): name="bias" is passed to tf.zeros, not to tf.Variable,
        # so the variable itself gets no explicit name.
        self.b = tf.Variable(tf.zeros([numTags], name="bias"))

    def inference(self, X, length, reuse=False):
        """Build the scoring graph and return the unary scores.

        Args:
            X: input tensor; assumes (batch, time, features) given
                seq_dim=1 and the concat on axis 2 -- TODO confirm.
            length: per-example sequence lengths, passed to dynamic_rnn and
                (cast to int64) to reverse_sequence.
            reuse: forwarded to the variable scope and both LSTM cells; a
                falsy value also enables dropout, a truthy value names the
                final reshape "Reshape_7".

        Returns:
            Tensor reshaped to [-1, max_seq_len, num_tags].
        """
        length_64 = tf.cast(length, tf.int64)
        with tf.variable_scope("bilstm", reuse=reuse):
            forward_output, _ = tf.nn.dynamic_rnn(
                tf.contrib.rnn.LSTMCell(self.num_hidden,
                                        reuse=reuse),
                X,
                dtype=tf.float32,
                sequence_length=length,
                scope="RNN_forward")
            # The backward pass runs a second LSTM over the input reversed
            # within each sequence's own length.
            backward_output_, _ = tf.nn.dynamic_rnn(
                tf.contrib.rnn.LSTMCell(self.num_hidden,
                                        reuse=reuse),
                inputs=tf.reverse_sequence(X,
                                           length_64,
                                           seq_dim=1),
                dtype=tf.float32,
                sequence_length=length,
                scope="RNN_backword")

        # Reverse again so backward outputs line up with forward positions.
        backward_output = tf.reverse_sequence(backward_output_,
                                              length_64,
                                              seq_dim=1)

        output = tf.concat([forward_output, backward_output], 2)
        # Flatten to rows of 2 * num_hidden features for the matmul below.
        output = tf.reshape(output, [-1, self.num_hidden * 2])
        # Dropout is applied only when reuse is falsy.
        # NOTE(review): presumably that is the training graph and 0.5 is the
        # TF 1.x keep_prob -- confirm.
        if reuse is None or not reuse:
            output = tf.nn.dropout(output, 0.5)

        matricized_unary_scores = tf.matmul(output, self.W) + self.b
        # The README's freeze_graph command lists "Reshape_7" as an output
        # node, so this explicit name (used when reuse is truthy) matters.
        unary_scores = tf.reshape(
            matricized_unary_scores,
            [-1, self.max_seq_len, self.num_tags],
            name="Reshape_7" if reuse else None)
        return unary_scores
65 | 


--------------------------------------------------------------------------------
/utils/json_util.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  json_util.h
 5 |  * Description:   description
 6 |  * Author:	Koth(Yaowen Chen)
 7 |  *
 8 |  */
 9 | #ifndef UTILS_JSON_UTIL_H_
10 | #define UTILS_JSON_UTIL_H_
11 | #include <stdint.h>
12 | #include "jsonxx.h"
13 | 
14 | namespace json_util {
15 | 
// Generic conversion: returns jval.get<T>(). Explicit specializations for
// float, double, int32_t and int64_t are defined further down in this header.
template <typename T>
T FromJsonValue(const jsonxx::Value& jval) {
    return jval.get<T>();
}
20 | 
21 | template <typename T>
22 | bool ReadFromJson(const std::string& name, const jsonxx::Object& obj, T& val) {
23 |     const std::map<std::string, jsonxx::Value*>& kvs=obj.kv_map();
24 |     auto it=kvs.find(name);
25 |     if(it==kvs.end()){
26 |       return false;
27 |     }
28 |     val= FromJsonValue<T>(*(it->second));
29 |     return true;
30 | }
31 | 
32 | template <typename T>
33 | bool ReadArray(std::string name, const jsonxx::Object& obj, std::vector<T>& rets) {
34 |   if (!obj.has<jsonxx::Array>(name)) return false;
35 |   jsonxx::Array arr = obj.get<jsonxx::Array>(name);
36 |   const std::vector<jsonxx::Value*>& values=arr.values();
37 |   for(size_t i=0;i<values.size();i++){
38 |     jsonxx::Value& val=*values[i];
39 |     rets.push_back(FromJsonValue<T>(val));
40 |   }
41 |   return true;
42 | }
43 | 
// Wraps `val` in a jsonxx::Value by calling the jsonxx::Value constructor
// with it.
template <typename T>
jsonxx::Value ToJsonValue(const T& val) {
  return jsonxx::Value(val);
}
48 | 
// Streams the key `name` followed by ToJsonValue<T>(val) into `obj`.
template <typename T>
void WriteToJson(const std::string& name,jsonxx::Object& obj,  const T& val) {
   obj<<name<<ToJsonValue<T>(val);
}
53 | 
54 | template <typename T>
55 | bool WriteArray(const std::string& name, jsonxx::Object& obj, const std::vector<T>& rets) {
56 |   jsonxx::Array arr;
57 |   int nn=rets.size();
58 |   for(int i=0;i<nn;i++){
59 |       jsonxx::Value val=ToJsonValue<T>(rets[i]);
60 |       arr<<val;
61 |   }
62 |   obj<<name<<arr;
63 |   return true;
64 | }
65 | 
// Numeric specializations: each reads the value as jsonxx::Number and
// static_casts it to the requested type.
template<>
inline float FromJsonValue(const jsonxx::Value& jval) {
    return static_cast<float>(jval.get<jsonxx::Number>());
}
template<>
inline double FromJsonValue(const jsonxx::Value& jval) {
    return static_cast<double>(jval.get<jsonxx::Number>());
}
// NOTE(review): the integer casts below truncate any fractional part; if
// jsonxx::Number is a floating-point type, large int64_t values may also lose
// precision -- confirm against jsonxx.h.
template<>
inline int32_t FromJsonValue(const jsonxx::Value& jval) {
    return static_cast<int32_t>(jval.get<jsonxx::Number>());
}
template<>
inline int64_t FromJsonValue(const jsonxx::Value& jval) {
    return static_cast<int64_t>(jval.get<jsonxx::Number>());
}
82 | 
83 | }
84 | 
85 | #endif  // UTILS_JSON_UTIL_H_
86 | 


--------------------------------------------------------------------------------
/third_party/python/semver/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: semver
 3 | Version: 2.4.1
 4 | Summary: Python package to work with Semantic Versioning (http://semver.org/)
 5 | Home-page: https://github.com/k-bx/python-semver
 6 | Author: Konstantine Rybnikov
 7 | Author-email: k-bx@k-bx.com
 8 | License: BSD
 9 | Download-URL: https://github.com/k-bx/python-semver/downloads
10 | Description: Semver -- python module for semantic versioning
11 |         ===============================================
12 |         
13 |         ![Travis CI](https://travis-ci.org/k-bx/python-semver.svg?branch=master)
14 |         
15 |         Simple module for comparing versions as noted at [semver.org](http://semver.org/).
16 |         
17 |         This module provides just a couple of functions, the main ones being:
18 |         
19 |         ```python
20 |         >>> import semver
21 |         >>> semver.compare("1.0.0", "2.0.0")
22 |         -1
23 |         >>> semver.compare("2.0.0", "1.0.0")
24 |         1
25 |         >>> semver.compare("2.0.0", "2.0.0")
26 |         0
27 |         >>> semver.match("2.0.0", ">=1.0.0")
28 |         True
29 |         >>> semver.match("1.0.0", ">1.0.0")
30 |         False
31 |         >>> semver.format_version(3, 4, 5, 'pre.2', 'build.4')
32 |         '3.4.5-pre.2+build.4'
33 |         >>> semver.bump_major("3.4.5")
34 |         '4.0.0'
35 |         >>> semver.bump_minor("3.4.5")
36 |         '3.5.0'
37 |         >>> semver.bump_patch("3.4.5")
38 |         '3.4.6'
39 |         >>> semver.max_ver("1.0.0", "2.0.0")
40 |         '2.0.0'
41 |         >>> semver.min_ver("1.0.0", "2.0.0")
42 |         '1.0.0'
43 |         ```
44 |         
45 |         Installation
46 |         ------------
47 |         
48 |         For Python 2:
49 |         
50 |         ```
51 |         pip install semver
52 |         ```
53 |         
54 |         For Python 3:
55 |         
56 |         ```
57 |         pip3 install semver
58 |         ```
59 |         
60 |         Homepage at PyPi: https://pypi.python.org/pypi/semver
61 |         
62 | Platform: UNKNOWN
63 | Classifier: Environment :: Web Environment
64 | Classifier: Framework :: Django
65 | Classifier: Intended Audience :: Developers
66 | Classifier: License :: OSI Approved :: BSD License
67 | Classifier: Operating System :: OS Independent
68 | Classifier: Programming Language :: Python
69 | Classifier: Programming Language :: Python :: 2
70 | Classifier: Programming Language :: Python :: 2.6
71 | Classifier: Programming Language :: Python :: 2.7
72 | Classifier: Programming Language :: Python :: 3
73 | Classifier: Programming Language :: Python :: 3.2
74 | Classifier: Programming Language :: Python :: 3.3
75 | Classifier: Programming Language :: Python :: 3.4
76 | Classifier: Topic :: Software Development :: Libraries :: Python Modules
77 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ### 引用 
 3 | 
 4 |  
 5 | 本项目模型BiLSTM+CRF参考论文：http://www.aclweb.org/anthology/N16-1030 ,IDCNN+CRF参考论文：https://arxiv.org/abs/1702.02098
 6 | 
 7 | 
 8 | ### 构建
 9 | 
10 | 1. 安装好bazel代码构建工具，安装好tensorflow（目前本项目需要tf 1.0.0alpha版本以上）
11 | 2. 切换到本项目代码目录，运行./configure
12 | 3. 编译后台服务 
13 | 
14 |    > bazel build //kcws/cc:seg_backend_api
15 | 
16 | 
17 | ### 训练
18 | 
19 | 1. 关注待字闺中公众号 回复 kcws 获取语料下载地址：
20 |    
21 |    ![logo](https://github.com/koth/kcws/blob/master/docs/qrcode_dzgz.jpg?raw=true "待字闺中")
22 |    
23 |    
24 | 2. 解压语料到一个目录
25 | 
26 | 3. 切换到代码目录，运行:
27 |   > python kcws/train/process_anno_file.py <语料目录> pre_chars_for_w2v.txt
28 |   
29 |   > bazel build third_party/word2vec:word2vec
30 |   
31 |   > 先得到初步词表
32 |   
33 |   > ./bazel-bin/third_party/word2vec/word2vec -train pre_chars_for_w2v.txt -save-vocab pre_vocab.txt -min-count 3
34 |   
35 |   > 处理低频词
36 |   
37 |   > python kcws/train/replace_unk.py pre_vocab.txt pre_chars_for_w2v.txt chars_for_w2v.txt
38 |   > 
39 |   > 训练word2vec
40 |   > 
41 |   > ./bazel-bin/third_party/word2vec/word2vec -train chars_for_w2v.txt -output vec.txt -size 50 -sample 1e-4 -negative 5 -hs 1 -binary 0 -iter 5
42 |   > 
43 |   > 构建训练语料工具
44 |   > 
45 |   > bazel build kcws/train:generate_training
46 |   > 
47 |   > 生成语料
48 |   > 
49 |   > ./bazel-bin/kcws/train/generate_training vec.txt <语料目录> all.txt
50 |   > 
51 |   > 得到train.txt , test.txt文件
52 |   > 
53 |   > python kcws/train/filter_sentence.py all.txt
54 |   
55 | 4. 安装好tensorflow,切换到kcws代码目录，运行:
56 | 
57 |   > python kcws/train/train_cws.py --word2vec_path vec.txt --train_data_path <绝对路径到train.txt> --test_data_path test.txt --max_sentence_len 80 --learning_rate 0.001
58 |   （默认使用IDCNN模型，可设置参数“--use_idcnn False”来切换BiLSTM模型）
59 |   
60 | 5. 生成vocab
61 |   > bazel  build kcws/cc:dump_vocab
62 |   
63 |   > ./bazel-bin/kcws/cc/dump_vocab vec.txt kcws/models/basic_vocab.txt
64 |   
65 | 6. 导出训练好的模型
66 |  >  python tools/freeze_graph.py --input_graph logs/graph.pbtxt  --input_checkpoint logs/model.ckpt --output_node_names  "transitions,Reshape_7"   --output_graph kcws/models/seg_model.pbtxt
67 | 
68 | 7. 词性标注模型下载  (临时方案，后续文档给出词性标注模型训练，导出等）
69 | 
70 |    >  从 https://pan.baidu.com/s/1bYmABk 下载pos_model.pbtxt到kcws/models/目录下
71 | 
72 | 8. 运行web service
73 |  >  ./bazel-bin/kcws/cc/seg_backend_api --model_path=kcws/models/seg_model.pbtxt(<绝对路径到seg_model.pbtxt>)   --vocab_path=kcws/models/basic_vocab.txt   --max_sentence_len=80
74 | 
75 | ### 词性标注的训练说明：
76 | 
77 | https://github.com/koth/kcws/blob/master/pos_train.md
78 | 
79 | ### 自定义词典
80 | 目前自定义词典在解码阶段生效，具体使用方式请参考kcws/cc/test_seg.cc
81 | 字典为文本格式，每一行格式如下:
82 | ><自定义词条>\t<权重>
83 | 
84 | 比如：
85 | >蓝瘦香菇	4
86 | 
87 | 权重为一个正整数，一般4以上，越大越重要
88 |  
89 | ### demo
90 | http://45.32.100.248:9090/
91 | 
92 | 附： 使用相同模型训练的公司名识别demo:
93 | 
94 | http://45.32.100.248:18080
95 | 
96 | 
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/example_chat.cpp:
--------------------------------------------------------------------------------
 1 | #include "crow.h"
 2 | #include <string>
 3 | #include <vector>
 4 | #include <chrono>
 5 | 
 6 | using namespace std;
 7 | 
// Every message passed to broadcast() so far, in arrival order.
vector<string> msgs;
// Responses parked by the "/logs/<int>" handler until the next broadcast,
// each paired with the steady_clock time at which it was stored.
vector<pair<crow::response*, decltype(chrono::steady_clock::now())>> ress;
10 | 
11 | void broadcast(const string& msg)
12 | {
13 |     msgs.push_back(msg);
14 |     crow::json::wvalue x;
15 |     x["msgs"][0] = msgs.back();
16 |     x["last"] = msgs.size();
17 |     string body = crow::json::dump(x);
18 |     for(auto p:ress)
19 |     {
20 |         auto* res = p.first;
21 |         CROW_LOG_DEBUG << res << " replied: " << body;
22 |         res->end(body);
23 |     }
24 |     ress.clear();
25 | }
26 | // To see how it works go on {ip}:40080 but I just got it working with external build (not directly in IDE, I guess a problem with dependency)
27 | int main()
28 | {
29 |     crow::SimpleApp app;
30 |     crow::mustache::set_base(".");
31 | 
32 |     CROW_ROUTE(app, "/")
33 |     ([]{
34 |         crow::mustache::context ctx;
35 |         return crow::mustache::load("example_chat.html").render();
36 |     });
37 | 
38 |     CROW_ROUTE(app, "/logs")
39 |     ([]{
40 |         CROW_LOG_INFO << "logs requested";
41 |         crow::json::wvalue x;
42 |         int start = max(0, (int)msgs.size()-100);
43 |         for(int i = start; i < (int)msgs.size(); i++)
44 |             x["msgs"][i-start] = msgs[i];
45 |         x["last"] = msgs.size();
46 |         CROW_LOG_INFO << "logs completed";
47 |         return x;
48 |     });
49 | 
50 |     CROW_ROUTE(app, "/logs/<int>")
51 |     ([](const crow::request& /*req*/, crow::response& res, int after){
52 |         CROW_LOG_INFO << "logs with last " << after;
53 |         if (after < (int)msgs.size())
54 |         {
55 |             crow::json::wvalue x;
56 |             for(int i = after; i < (int)msgs.size(); i ++)
57 |                 x["msgs"][i-after] = msgs[i];
58 |             x["last"] = msgs.size();
59 | 
60 |             res.write(crow::json::dump(x));
61 |             res.end();
62 |         }
63 |         else
64 |         {
65 |             vector<pair<crow::response*, decltype(chrono::steady_clock::now())>> filtered;
66 |             for(auto p : ress)
67 |             {
68 |                 if (p.first->is_alive() && chrono::steady_clock::now() - p.second < chrono::seconds(30))
69 |                     filtered.push_back(p);
70 |                 else
71 |                     p.first->end();
72 |             }
73 |             ress.swap(filtered);
74 |             ress.push_back({&res, chrono::steady_clock::now()});
75 |             CROW_LOG_DEBUG << &res << " stored " << ress.size();
76 |         }
77 |     });
78 | 
79 |     CROW_ROUTE(app, "/send")
80 |         .methods("GET"_method, "POST"_method)
81 |     ([](const crow::request& req)
82 |     {
83 |         CROW_LOG_INFO << "msg from client: " << req.body;
84 |         broadcast(req.body);
85 |         return "";
86 |     });
87 | 
88 |     app.port(40080)
89 |         //.multithreaded()
90 |         .run();
91 | }
92 | 


--------------------------------------------------------------------------------
/kcws/train/prepare_pos.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Author: Koth
  3 | # @Date:   2017-01-25 11:46:37
  4 | # @Last Modified by:   Koth
  5 | # @Last Modified time: 2017-01-25 12:05:16
  6 | 
  7 | import sys
  8 | import os
  9 | 
# Counters reported by main() when it finishes.
# NOTE(review): nothing in this file assigns to totalLine or longLine, and
# maxLen is never read, so the final report always prints zeros -- confirm
# whether the counting was removed on purpose.
totalLine = 0
longLine = 0
maxLen = 80
 13 | 
 14 | 
 15 | def processToken(token, collect, out, end):
 16 |   global totalLine
 17 |   global longLine
 18 |   global maxLen
 19 |   nn = len(token)
 20 |   #print token
 21 |   while nn > 0 and token[nn - 1] != '/':
 22 |     nn = nn - 1
 23 | 
 24 |   token = token[:nn - 1].strip()
 25 |   if not token:
 26 |     return
 27 |   out.write("%s " % (token))
 28 |   if end:
 29 |     out.write("\n")
 30 | 
 31 | 
 32 | def processLine(line, out):
 33 |   line = line.strip()
 34 |   nn = len(line)
 35 |   seeLeftB = False
 36 |   start = 0
 37 |   collect = []
 38 |   try:
 39 |     for i in range(nn):
 40 |       if line[i] == ' ':
 41 |         if not seeLeftB:
 42 |           token = line[start:i]
 43 |           if token.startswith('['):
 44 |             tokenLen = len(token)
 45 |             while tokenLen > 0 and token[tokenLen - 1] != ']':
 46 |               tokenLen = tokenLen - 1
 47 |             token = token[1:tokenLen - 1]
 48 |             ss = token.split(' ')
 49 |             for s in ss:
 50 |               processToken(s, collect, out, False)
 51 |           else:
 52 |             processToken(token, collect, out, False)
 53 |           start = i + 1
 54 |       elif line[i] == '[':
 55 |         seeLeftB = True
 56 |       elif line[i] == ']':
 57 |         seeLeftB = False
 58 |     if start < nn:
 59 |       token = line[start:]
 60 |       if token.startswith('['):
 61 |         tokenLen = len(token)
 62 |         while tokenLen > 0 and token[tokenLen - 1] != ']':
 63 |           tokenLen = tokenLen - 1
 64 |         token = token[1:tokenLen - 1]
 65 |         ss = token.split(' ')
 66 |         ns = len(ss)
 67 |         for i in range(ns - 1):
 68 |           processToken(ss[i], collect, out, False)
 69 |         processToken(ss[-1], collect, out, True)
 70 |       else:
 71 |         processToken(token, collect, out, True)
 72 |   except Exception as e:
 73 |     pass
 74 | 
 75 | 
 76 | def main(argc, argv):
 77 |   global totalLine
 78 |   global longLine
 79 |   if argc < 3:
 80 |     print("Usage:%s <dir> <output>" % (argv[0]))
 81 |     sys.exit(1)
 82 |   rootDir = argv[1]
 83 |   out = open(argv[2], "w")
 84 |   for dirName, subdirList, fileList in os.walk(rootDir):
 85 |     curDir = os.path.join(rootDir, dirName)
 86 |     for file in fileList:
 87 |       if file.endswith(".txt"):
 88 |         curFile = os.path.join(curDir, file)
 89 |         # print("processing:%s" % (curFile))
 90 |         fp = open(curFile, "r")
 91 |         for line in fp.readlines():
 92 |           line = line.strip()
 93 |           processLine(line, out)
 94 |         fp.close()
 95 |   out.close()
 96 |   print("total:%d, long lines:%d" % (totalLine, longLine))
 97 | 
 98 | 
 99 | if __name__ == '__main__':
100 |   main(len(sys.argv), sys.argv)
101 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/example_with_all.cpp:
--------------------------------------------------------------------------------
 1 | #include "../amalgamate/crow_all.h"
 2 | 
 3 | #include <sstream>
 4 | 
// Log handler whose log() body is empty (the output statement is commented
// out), so every message passed to it is dropped. main() only installs it if
// the setHandler line there is uncommented.
class ExampleLogHandler : public crow::ILogHandler {
    public:
        void log(std::string /*message*/, crow::LogLevel /*level*/) override {
//            cerr << "ExampleLogHandler -> " << message;
        }
};
11 | 
12 | int main()
13 | {
14 |     crow::SimpleApp app;
15 | 
16 |     CROW_ROUTE(app, "/")
17 |         .name("hello")
18 |     ([]{
19 |         return "Hello World!";
20 |     });
21 | 
22 |     CROW_ROUTE(app, "/about")
23 |     ([](){
24 |         return "About Crow example.";
25 |     });
26 | 
27 |     // simple json response
28 |     CROW_ROUTE(app, "/json")
29 |     ([]{
30 |         crow::json::wvalue x;
31 |         x["message"] = "Hello, World!";
32 |         return x;
33 |     });
34 | 
35 |     CROW_ROUTE(app,"/hello/<int>")
36 |     ([](int count){
37 |         if (count > 100)
38 |             return crow::response(400);
39 |         std::ostringstream os;
40 |         os << count << " bottles of beer!";
41 |         return crow::response(os.str());
42 |     });
43 | 
44 |     CROW_ROUTE(app,"/add/<int>/<int>")
45 |     ([](const crow::request& /*req*/, crow::response& res, int a, int b){
46 |         std::ostringstream os;
47 |         os << a+b;
48 |         res.write(os.str());
49 |         res.end();
50 |     });
51 | 
52 |     // Compile error with message "Handler type is mismatched with URL paramters"
53 |     //CROW_ROUTE(app,"/another/<int>")
54 |     //([](int a, int b){
55 |         //return crow::response(500);
56 |     //});
57 | 
58 |     // more json example
59 |     CROW_ROUTE(app, "/add_json")
60 |     ([](const crow::request& req){
61 |         auto x = crow::json::load(req.body);
62 |         if (!x)
63 |             return crow::response(400);
64 |         int sum = x["a"].i()+x["b"].i();
65 |         std::ostringstream os;
66 |         os << sum;
67 |         return crow::response{os.str()};
68 |     });
69 | 
70 |     CROW_ROUTE(app, "/params")
71 |     ([](const crow::request& req){
72 |         std::ostringstream os;
73 |         os << "Params: " << req.url_params << "\n\n"; 
74 |         os << "The key 'foo' was " << (req.url_params.get("foo") == nullptr ? "not " : "") << "found.\n";
75 |         if(req.url_params.get("pew") != nullptr) {
76 |             double countD = boost::lexical_cast<double>(req.url_params.get("pew"));
77 |             os << "The value of 'pew' is " <<  countD << '\n';
78 |         }
79 |         auto count = req.url_params.get_list("count");
80 |         os << "The key 'count' contains " << count.size() << " value(s).\n";
81 |         for(const auto& countVal : count) {
82 |             os << " - " << countVal << '\n';
83 |         }
84 |         return crow::response{os.str()};
85 |     });    
86 | 
87 |     // ignore all log
88 |     crow::logger::setLogLevel(crow::LogLevel::Debug);
89 |     //crow::logger::setHandler(std::make_shared<ExampleLogHandler>());
90 | 
91 |     app.port(18080)
92 |         .multithreaded()
93 |         .run();
94 | }
95 | 


--------------------------------------------------------------------------------
/third_party/crow/tests/template/partials.json:
--------------------------------------------------------------------------------
1 | {"__ATTN__":"Do not edit this file; changes belong in the appropriate YAML file.","overview":"Partial tags are used to expand an external template into the current\ntemplate.\n\nThe tag's content MUST be a non-whitespace character sequence NOT containing\nthe current closing delimiter.\n\nThis tag's content names the partial to inject.  Set Delimiter tags MUST NOT\naffect the parsing of a partial.  The partial MUST be rendered against the\ncontext stack local to the tag.  If the named partial cannot be found, the\nempty string SHOULD be used instead, as in interpolations.\n\nPartial tags SHOULD be treated as standalone when appropriate.  If this tag\nis used standalone, any whitespace preceding the tag should treated as\nindentation, and prepended to each line of the partial before rendering.\n","tests":[{"name":"Basic Behavior","data":{},"expected":"\"from partial\"","template":"\"{{>text}}\"","desc":"The greater-than operator should expand to the named partial.","partials":{"text":"from partial"}},{"name":"Failed Lookup","data":{},"expected":"\"\"","template":"\"{{>text}}\"","desc":"The empty string should be used when the named partial is not found.","partials":{}},{"name":"Context","data":{"text":"content"},"expected":"\"*content*\"","template":"\"{{>partial}}\"","desc":"The greater-than operator should operate within the current context.","partials":{"partial":"*{{text}}*"}},{"name":"Recursion","data":{"content":"X","nodes":[{"content":"Y","nodes":[]}]},"expected":"X<Y<>>","template":"{{>node}}","desc":"The greater-than operator should properly recurse.","partials":{"node":"{{content}}<{{#nodes}}{{>node}}{{/nodes}}>"}},{"name":"Surrounding Whitespace","data":{},"expected":"| \t|\t |","template":"| {{>partial}} |","desc":"The greater-than operator should not alter surrounding whitespace.","partials":{"partial":"\t|\t"}},{"name":"Inline Indentation","data":{"data":"|"},"expected":"  |  >\n>\n","template":"  {{data}}  {{> partial}}\n","desc":"Whitespace 
should be left untouched.","partials":{"partial":">\n>"}},{"name":"Standalone Line Endings","data":{},"expected":"|\r\n>|","template":"|\r\n{{>partial}}\r\n|","desc":"\"\\r\\n\" should be considered a newline for standalone tags.","partials":{"partial":">"}},{"name":"Standalone Without Previous Line","data":{},"expected":"  >\n  >>","template":"  {{>partial}}\n>","desc":"Standalone tags should not require a newline to precede them.","partials":{"partial":">\n>"}},{"name":"Standalone Without Newline","data":{},"expected":">\n  >\n  >","template":">\n  {{>partial}}","desc":"Standalone tags should not require a newline to follow them.","partials":{"partial":">\n>"}},{"name":"Standalone Indentation","data":{"content":"<\n->"},"expected":"\\\n |\n <\n->\n |\n/\n","template":"\\\n {{>partial}}\n/\n","desc":"Each line of the partial should be indented before rendering.","partials":{"partial":"|\n{{{content}}}\n|\n"}},{"name":"Padding Whitespace","data":{"boolean":true},"expected":"|[]|","template":"|{{> partial }}|","desc":"Superfluous in-tag whitespace should be ignored.","partials":{"partial":"[]"}}]}


--------------------------------------------------------------------------------
/third_party/crow/tests/template/comments.yml:
--------------------------------------------------------------------------------
  1 | overview: |
  2 |   Comment tags represent content that should never appear in the resulting
  3 |   output.
  4 | 
  5 |   The tag's content may contain any substring (including newlines) EXCEPT the
  6 |   closing delimiter.
  7 | 
  8 |   Comment tags SHOULD be treated as standalone when appropriate.
  9 | tests:
 10 |   - name: Inline
 11 |     desc: Comment blocks should be removed from the template.
 12 |     data: { }
 13 |     template: '12345{{! Comment Block! }}67890'
 14 |     expected: '1234567890'
 15 | 
 16 |   - name: Multiline
 17 |     desc: Multiline comments should be permitted.
 18 |     data: { }
 19 |     template: |
 20 |       12345{{!
 21 |         This is a
 22 |         multi-line comment...
 23 |       }}67890
 24 |     expected: |
 25 |       1234567890
 26 | 
 27 |   - name: Standalone
 28 |     desc: All standalone comment lines should be removed.
 29 |     data: { }
 30 |     template: |
 31 |       Begin.
 32 |       {{! Comment Block! }}
 33 |       End.
 34 |     expected: |
 35 |       Begin.
 36 |       End.
 37 | 
 38 |   - name: Indented Standalone
 39 |     desc: All standalone comment lines should be removed.
 40 |     data: { }
 41 |     template: |
 42 |       Begin.
 43 |         {{! Indented Comment Block! }}
 44 |       End.
 45 |     expected: |
 46 |       Begin.
 47 |       End.
 48 | 
 49 |   - name: Standalone Line Endings
 50 |     desc: '"\r\n" should be considered a newline for standalone tags.'
 51 |     data: { }
 52 |     template: "|\r\n{{! Standalone Comment }}\r\n|"
 53 |     expected: "|\r\n|"
 54 | 
 55 |   - name: Standalone Without Previous Line
 56 |     desc: Standalone tags should not require a newline to precede them.
 57 |     data: { }
 58 |     template: "  {{! I'm Still Standalone }}\n!"
 59 |     expected: "!"
 60 | 
 61 |   - name: Standalone Without Newline
 62 |     desc: Standalone tags should not require a newline to follow them.
 63 |     data: { }
 64 |     template: "!\n  {{! I'm Still Standalone }}"
 65 |     expected: "!\n"
 66 | 
 67 |   - name: Multiline Standalone
 68 |     desc: All standalone comment lines should be removed.
 69 |     data: { }
 70 |     template: |
 71 |       Begin.
 72 |       {{!
 73 |       Something's going on here...
 74 |       }}
 75 |       End.
 76 |     expected: |
 77 |       Begin.
 78 |       End.
 79 | 
 80 |   - name: Indented Multiline Standalone
 81 |     desc: All standalone comment lines should be removed.
 82 |     data: { }
 83 |     template: |
 84 |       Begin.
 85 |         {{!
 86 |           Something's going on here...
 87 |         }}
 88 |       End.
 89 |     expected: |
 90 |       Begin.
 91 |       End.
 92 | 
 93 |   - name: Indented Inline
 94 |     desc: Inline comments should not strip whitespace
 95 |     data: { }
 96 |     template: "  12 {{! 34 }}\n"
 97 |     expected: "  12 \n"
 98 | 
 99 |   - name: Surrounding Whitespace
100 |     desc: Comment removal should preserve surrounding whitespace.
101 |     data: { }
102 |     template: '12345 {{! Comment Block! }} 67890'
103 |     expected: '12345  67890'
104 | 


--------------------------------------------------------------------------------
/third_party/setuptools/setuptools.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
 1 | [console_scripts]
 2 | easy_install = setuptools.command.easy_install:main
 3 | easy_install-3.4 = setuptools.command.easy_install:main
 4 | 
 5 | [distutils.commands]
 6 | alias = setuptools.command.alias:alias
 7 | bdist_egg = setuptools.command.bdist_egg:bdist_egg
 8 | bdist_rpm = setuptools.command.bdist_rpm:bdist_rpm
 9 | bdist_wininst = setuptools.command.bdist_wininst:bdist_wininst
10 | build_ext = setuptools.command.build_ext:build_ext
11 | build_py = setuptools.command.build_py:build_py
12 | develop = setuptools.command.develop:develop
13 | easy_install = setuptools.command.easy_install:easy_install
14 | egg_info = setuptools.command.egg_info:egg_info
15 | install = setuptools.command.install:install
16 | install_egg_info = setuptools.command.install_egg_info:install_egg_info
17 | install_lib = setuptools.command.install_lib:install_lib
18 | install_scripts = setuptools.command.install_scripts:install_scripts
19 | register = setuptools.command.register:register
20 | rotate = setuptools.command.rotate:rotate
21 | saveopts = setuptools.command.saveopts:saveopts
22 | sdist = setuptools.command.sdist:sdist
23 | setopt = setuptools.command.setopt:setopt
24 | test = setuptools.command.test:test
25 | upload_docs = setuptools.command.upload_docs:upload_docs
26 | 
27 | [distutils.setup_keywords]
28 | convert_2to3_doctests = setuptools.dist:assert_string_list
29 | dependency_links = setuptools.dist:assert_string_list
30 | eager_resources = setuptools.dist:assert_string_list
31 | entry_points = setuptools.dist:check_entry_points
32 | exclude_package_data = setuptools.dist:check_package_data
33 | extras_require = setuptools.dist:check_extras
34 | include_package_data = setuptools.dist:assert_bool
35 | install_requires = setuptools.dist:check_requirements
36 | namespace_packages = setuptools.dist:check_nsp
37 | package_data = setuptools.dist:check_package_data
38 | packages = setuptools.dist:check_packages
39 | test_loader = setuptools.dist:check_importable
40 | test_runner = setuptools.dist:check_importable
41 | test_suite = setuptools.dist:check_test_suite
42 | tests_require = setuptools.dist:check_requirements
43 | use_2to3 = setuptools.dist:assert_bool
44 | use_2to3_exclude_fixers = setuptools.dist:assert_string_list
45 | use_2to3_fixers = setuptools.dist:assert_string_list
46 | zip_safe = setuptools.dist:assert_bool
47 | 
48 | [egg_info.writers]
49 | PKG-INFO = setuptools.command.egg_info:write_pkg_info
50 | dependency_links.txt = setuptools.command.egg_info:overwrite_arg
51 | depends.txt = setuptools.command.egg_info:warn_depends_obsolete
52 | eager_resources.txt = setuptools.command.egg_info:overwrite_arg
53 | entry_points.txt = setuptools.command.egg_info:write_entries
54 | namespace_packages.txt = setuptools.command.egg_info:overwrite_arg
55 | requires.txt = setuptools.command.egg_info:write_requirements
56 | top_level.txt = setuptools.command.egg_info:write_toplevel_names
57 | 
58 | [setuptools.file_finders]
59 | svn_cvs = setuptools.command.sdist:_default_revctrl
60 | 
61 | [setuptools.installation]
62 | eggsecutable = setuptools.command.easy_install:bootstrap
63 | 
64 | 


--------------------------------------------------------------------------------
/kcws/cc/sentence_breaker.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  sentence_breaker.cc
 5 |  * Author:  Koth
 6 |  * Create Time: 2016-11-23 22:02:40
 7 |  * Description:
 8 |  *
 9 |  */
10 | #include "sentence_breaker.h"  //NOLINT
11 | #include "base/base.h"
12 | 
13 | 
14 | namespace kcws {
// Paired inline marks laid out as {open, close, open, close, ...}; the
// constructor walks this table two entries at a time.
char* SentenceBreaker::kInlineMarks[] = {
  "（", "）", "(", ")", "[", "]", "【", "】", "《", "》", "“", "”"
};
// Marks recognised by is_break_mark(); breakSentences() ends a line right
// after one of these while no inline pair is open.
char* SentenceBreaker::kBreakMarks[] = {
  "。", ",", "，", " ", "\t", "?", "？", "!", "！", ";", "；"
};
21 | SentenceBreaker::SentenceBreaker(int maxLen) {
22 |   for (size_t i = 0; i < sizeof(kInlineMarks) / sizeof(char*); i += 2) {
23 |     UnicodeStr ustr1;
24 |     UnicodeStr ustr2;
25 |     BasicStringUtil::u8tou16(kInlineMarks[i], strlen(kInlineMarks[i]), ustr1);
26 |     BasicStringUtil::u8tou16(kInlineMarks[i + 1], strlen(kInlineMarks[i + 1]), ustr2);
27 |     inline_marks_.insert(std::make_pair(ustr1[0], ustr2[0]));
28 |     inline_marks_set_.insert(ustr1[0]);
29 |     inline_marks_set_.insert(ustr2[0]);
30 |   }
31 |   for (size_t i = 0; i < sizeof(kBreakMarks) / sizeof(char*); i++) {
32 |     UnicodeStr ustr;
33 |     BasicStringUtil::u8tou16(kBreakMarks[i], strlen(kBreakMarks[i]), ustr);
34 |     break_marks_.insert(ustr[0]);
35 |   }
36 |   max_len_ = maxLen;
37 | }
38 | bool SentenceBreaker::is_inline_mark(UnicodeCharT uch) {
39 |   return inline_marks_.find(uch) != inline_marks_.end();
40 | }
41 | bool SentenceBreaker::is_break_mark(UnicodeCharT uch) {
42 |   return break_marks_.find(uch) != break_marks_.end();
43 | }
// Nothing to release by hand; the compiler-generated destructor is used.
SentenceBreaker::~SentenceBreaker() = default;
45 | 
46 | bool SentenceBreaker::breakSentences(const UnicodeStr& text,
47 |                                      std::vector<UnicodeStr>* lines) {
48 |   UnicodeCharT markChar = 0;
49 |   size_t nn = text.size();
50 |   if (nn == 0) {
51 |     return true;
52 |   }
53 |   size_t markPos = 0;
54 |   for (size_t i = 0; i < nn; i++) {
55 |     if (is_inline_mark(text[i])) {
56 |       if (markChar == text[i]) {
57 |         lines->push_back(text.substr(markPos, i - markPos + 1));
58 |         markPos = i + 1;
59 |         markChar = 0;
60 |       } else {
61 |         if (markPos != i) {
62 |           lines->push_back(text.substr(markPos, i - markPos ));
63 |           markPos = i;
64 |         }
65 |         markChar = inline_marks_[text[i]];
66 |       }
67 |     } else  if (markChar == 0) {
68 |       if (is_break_mark(text[i]) ||
69 |           (i - markPos + 1) >= static_cast<size_t>(max_len_)) {
70 |         // Oops, too long
71 |         lines->push_back(text.substr(markPos, i - markPos + 1));
72 |         markPos = i + 1;
73 |       }
74 |     } else  if ((i - markPos + 1) >= static_cast<size_t>(max_len_) ) {
75 |       // Oops, too long
76 |       lines->push_back(text.substr(markPos, i - markPos + 1));
77 |       markPos = i + 1;
78 |       markChar = 0;
79 |     }
80 |   }
81 |   if (markPos < nn) {
82 |     lines->push_back(text.substr(markPos, nn - markPos));
83 |   }
84 |   return true;
85 | }
86 | 
87 | }  // namespace kcws
88 | 


--------------------------------------------------------------------------------
/third_party/crow/include/crow/http_response.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <string>
  3 | #include <unordered_map>
  4 | 
  5 | #include "third_party/crow/include/crow/json.h"
  6 | #include "third_party/crow/include/crow/http_request.h"
  7 | #include "third_party/crow/include/crow/ci_map.h"
  8 | 
  9 | namespace crow {
 10 | template <typename Adaptor, typename Handler, typename ... Middlewares>
 11 | class Connection;
 12 | struct response {
 13 |   template <typename Adaptor, typename Handler, typename ... Middlewares>
 14 |   friend class crow::Connection;
 15 | 
 16 |   int code{200};
 17 |   std::string body;
 18 |   json::wvalue json_value;
 19 | 
 20 |   // `headers' stores HTTP headers.
 21 |   ci_map headers;
 22 | 
 23 |   void set_header(std::string key, std::string value) {
 24 |     headers.erase(key);
 25 |     headers.emplace(std::move(key), std::move(value));
 26 |   }
 27 |   void add_header(std::string key, std::string value) {
 28 |     headers.emplace(std::move(key), std::move(value));
 29 |   }
 30 | 
 31 |   const std::string& get_header_value(const std::string& key) {
 32 |     return crow::get_header_value(headers, key);
 33 |   }
 34 | 
 35 | 
 36 |   response() {}
 37 |   explicit response(int code) : code(code) {}
 38 |   response(std::string body) : body(std::move(body)) {}
 39 |   response(json::wvalue&& json_value) : json_value(std::move(json_value)) {
 40 |     json_mode();
 41 |   }
 42 |   response(int code, std::string body) : code(code), body(std::move(body)) {}
 43 |   response(const json::wvalue& json_value) : body(json::dump(json_value)) {
 44 |     json_mode();
 45 |   }
 46 |   response(int code, const json::wvalue& json_value) : code(code), body(json::dump(json_value)) {
 47 |     json_mode();
 48 |   }
 49 | 
 50 |   response(response&& r) {
 51 |     *this = std::move(r);
 52 |   }
 53 | 
 54 |   response& operator = (const response& r) = delete;
 55 | 
 56 |   response& operator = (response&& r) noexcept {
 57 |     body = std::move(r.body);
 58 |     json_value = std::move(r.json_value);
 59 |     code = r.code;
 60 |     headers = std::move(r.headers);
 61 |     completed_ = r.completed_;
 62 |     return *this;
 63 |   }
 64 | 
 65 |   bool is_completed() const noexcept {
 66 |     return completed_;
 67 |   }
 68 | 
 69 |   void clear() {
 70 |     body.clear();
 71 |     json_value.clear();
 72 |     code = 200;
 73 |     headers.clear();
 74 |     completed_ = false;
 75 |   }
 76 | 
 77 |   void write(const std::string& body_part) {
 78 |     body += body_part;
 79 |   }
 80 | 
 81 |   void end() {
 82 |     if (!completed_) {
 83 |       completed_ = true;
 84 | 
 85 |       if (complete_request_handler_) {
 86 |         complete_request_handler_();
 87 |       }
 88 |     }
 89 |   }
 90 | 
 91 |   void end(const std::string& body_part) {
 92 |     body += body_part;
 93 |     end();
 94 |   }
 95 | 
 96 |   bool is_alive() {
 97 |     return is_alive_helper_ && is_alive_helper_();
 98 |   }
 99 | 
100 |  private:
101 |   bool completed_{};
102 |   std::function<void()> complete_request_handler_;
103 |   std::function<bool()> is_alive_helper_;
104 | 
105 |   //In case of a JSON object, set the Content-Type header
106 |   void json_mode() {
107 |     set_header("Content-Type", "application/json");
108 |   }
109 | };
110 | }
111 | 


--------------------------------------------------------------------------------
/kcws/train/process_anno_file.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Author: Koth Chen
  3 | # @Date:   2016-10-15 14:49:40
  4 | # @Last Modified by:   Koth
  5 | # @Last Modified time: 2016-12-09 20:33:30
  6 | import sys
  7 | import os
  8 | 
# Counters updated by processToken(): how many sentences were written, and
# how many of those were longer than maxLen characters.
totalLine = 0
longLine = 0
# Length (in characters) above which a sentence is counted as "long".
maxLen = 80
 12 | 
 13 | 
 14 | def processToken(token, collect, out, end):
 15 |   global totalLine
 16 |   global longLine
 17 |   global maxLen
 18 |   nn = len(token)
 19 |   #print token
 20 |   while nn > 0 and token[nn - 1] != '/':
 21 |     nn = nn - 1
 22 | 
 23 |   token = token[:nn - 1].strip()
 24 |   ustr = unicode(token.decode('utf8'))
 25 |   for u in ustr:
 26 |     collect.append(u)
 27 |   uline = u''
 28 |   if token == '。' or end:
 29 |     if len(collect) > maxLen:
 30 |       longLine += 1
 31 |     totalLine += 1
 32 |     for s in collect:
 33 |       if uline:
 34 |         uline = uline + u" " + s
 35 |       else:
 36 |         uline = s
 37 |     out.write("%s\n" % (str(uline.encode('utf8'))))
 38 |     del collect[:]
 39 | 
 40 | 
 41 | def processLine(line, out):
 42 |   line = line.strip()
 43 |   nn = len(line)
 44 |   seeLeftB = False
 45 |   start = 0
 46 |   collect = []
 47 |   try:
 48 |     for i in range(nn):
 49 |       if line[i] == ' ':
 50 |         if not seeLeftB:
 51 |           token = line[start:i]
 52 |           if token.startswith('['):
 53 |             tokenLen = len(token)
 54 |             while tokenLen > 0 and token[tokenLen - 1] != ']':
 55 |               tokenLen = tokenLen - 1
 56 |             token = token[1:tokenLen - 1]
 57 |             ss = token.split(' ')
 58 |             for s in ss:
 59 |               processToken(s, collect, out, False)
 60 |           else:
 61 |             processToken(token, collect, out, False)
 62 |           start = i + 1
 63 |       elif line[i] == '[':
 64 |         seeLeftB = True
 65 |       elif line[i] == ']':
 66 |         seeLeftB = False
 67 |     if start < nn:
 68 |       token = line[start:]
 69 |       if token.startswith('['):
 70 |         tokenLen = len(token)
 71 |         while tokenLen > 0 and token[tokenLen - 1] != ']':
 72 |           tokenLen = tokenLen - 1
 73 |         token = token[1:tokenLen - 1]
 74 |         ss = token.split(' ')
 75 |         ns = len(ss)
 76 |         for i in range(ns - 1):
 77 |           processToken(ss[i], collect, out, False)
 78 |         processToken(ss[-1], collect, out, True)
 79 |       else:
 80 |         processToken(token, collect, out, True)
 81 |   except Exception as e:
 82 |     pass
 83 | 
 84 | 
 85 | def main(argc, argv):
 86 |   global totalLine
 87 |   global longLine
 88 |   if argc < 3:
 89 |     print("Usage:%s <dir> <output>" % (argv[0]))
 90 |     sys.exit(1)
 91 |   rootDir = argv[1]
 92 |   out = open(argv[2], "w")
 93 |   for dirName, subdirList, fileList in os.walk(rootDir):
 94 |     curDir = os.path.join(rootDir, dirName)
 95 |     for file in fileList:
 96 |       if file.endswith(".txt"):
 97 |         curFile = os.path.join(curDir, file)
 98 |         # print("processing:%s" % (curFile))
 99 |         fp = open(curFile, "r")
100 |         for line in fp.readlines():
101 |           line = line.strip()
102 |           processLine(line, out)
103 |         fp.close()
104 |   out.close()
105 |   print("total:%d, long lines:%d" % (totalLine, longLine))
106 | 
107 | 
# Script entry point: hand the raw argument count and list to main().
if __name__ == '__main__':
  main(len(sys.argv), sys.argv)


--------------------------------------------------------------------------------
/third_party/crow/tests/template/delimiters.json:
--------------------------------------------------------------------------------
1 | {"__ATTN__":"Do not edit this file; changes belong in the appropriate YAML file.","overview":"Set Delimiter tags are used to change the tag delimiters for all content\nfollowing the tag in the current compilation unit.\n\nThe tag's content MUST be any two non-whitespace sequences (separated by\nwhitespace) EXCEPT an equals sign ('=') followed by the current closing\ndelimiter.\n\nSet Delimiter tags SHOULD be treated as standalone when appropriate.\n","tests":[{"name":"Pair Behavior","data":{"text":"Hey!"},"expected":"(Hey!)","template":"{{=<% %>=}}(<%text%>)","desc":"The equals sign (used on both sides) should permit delimiter changes."},{"name":"Special Characters","data":{"text":"It worked!"},"expected":"(It worked!)","template":"({{=[ ]=}}[text])","desc":"Characters with special meaning regexen should be valid delimiters."},{"name":"Sections","data":{"section":true,"data":"I got interpolated."},"expected":"[\n  I got interpolated.\n  |data|\n\n  {{data}}\n  I got interpolated.\n]\n","template":"[\n{{#section}}\n  {{data}}\n  |data|\n{{/section}}\n\n{{= | | =}}\n|#section|\n  {{data}}\n  |data|\n|/section|\n]\n","desc":"Delimiters set outside sections should persist."},{"name":"Inverted Sections","data":{"section":false,"data":"I got interpolated."},"expected":"[\n  I got interpolated.\n  |data|\n\n  {{data}}\n  I got interpolated.\n]\n","template":"[\n{{^section}}\n  {{data}}\n  |data|\n{{/section}}\n\n{{= | | =}}\n|^section|\n  {{data}}\n  |data|\n|/section|\n]\n","desc":"Delimiters set outside inverted sections should persist."},{"name":"Partial Inheritence","data":{"value":"yes"},"expected":"[ .yes. ]\n[ .yes. ]\n","template":"[ {{>include}} ]\n{{= | | =}}\n[ |>include| ]\n","desc":"Delimiters set in a parent template should not affect a partial.","partials":{"include":".{{value}}."}},{"name":"Post-Partial Behavior","data":{"value":"yes"},"expected":"[ .yes.  .yes. ]\n[ .yes.  .|value|. ]\n","template":"[ {{>include}} ]\n[ .{{value}}.  .|value|. 
]\n","desc":"Delimiters set in a partial should not affect the parent template.","partials":{"include":".{{value}}. {{= | | =}} .|value|."}},{"name":"Surrounding Whitespace","data":{},"expected":"|  |","template":"| {{=@ @=}} |","desc":"Surrounding whitespace should be left untouched."},{"name":"Outlying Whitespace (Inline)","data":{},"expected":" | \n","template":" | {{=@ @=}}\n","desc":"Whitespace should be left untouched."},{"name":"Standalone Tag","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n{{=@ @=}}\nEnd.\n","desc":"Standalone lines should be removed from the template."},{"name":"Indented Standalone Tag","data":{},"expected":"Begin.\nEnd.\n","template":"Begin.\n  {{=@ @=}}\nEnd.\n","desc":"Indented standalone lines should be removed from the template."},{"name":"Standalone Line Endings","data":{},"expected":"|\r\n|","template":"|\r\n{{= @ @ =}}\r\n|","desc":"\"\\r\\n\" should be considered a newline for standalone tags."},{"name":"Standalone Without Previous Line","data":{},"expected":"=","template":"  {{=@ @=}}\n=","desc":"Standalone tags should not require a newline to precede them."},{"name":"Standalone Without Newline","data":{},"expected":"=\n","template":"=\n  {{=@ @=}}","desc":"Standalone tags should not require a newline to follow them."},{"name":"Pair with Padding","data":{},"expected":"||","template":"|{{= @   @ =}}|","desc":"Superfluous in-tag whitespace should be ignored."}]}


--------------------------------------------------------------------------------
/tfmodel/tfmodel.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
 3 |  * =====================================================================================
 4 |  * Filename:  tfmodel.cc
 5 |  * Author:  Koth
 6 |  * Create Time: 2017-02-01 13:28:34
 7 |  * Description:
 8 |  *
 9 |  */
10 | #include "tfmodel/tfmodel.h"
11 | 
12 | #include <fstream>
13 | 
14 | #include "base/base.h"
15 | #include "utils/basic_string_util.h"
16 | 
17 | #include "google/protobuf/io/coded_stream.h"
18 | #include "google/protobuf/io/zero_copy_stream_impl.h"
19 | #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
20 | #include "google/protobuf/message_lite.h"
21 | 
22 | class IfstreamInputStream : public ::google::protobuf::io::CopyingInputStream {
23 |  public:
24 |   explicit IfstreamInputStream(const std::string& file_name)
25 |       : ifs_(file_name.c_str(), std::ios::in | std::ios::binary) {}
26 |   ~IfstreamInputStream() { ifs_.close(); }
27 | 
28 |   int Read(void* buffer, int size) {
29 |     if (!ifs_) {
30 |       return -1;
31 |     }
32 |     ifs_.read(static_cast<char*>(buffer), size);
33 |     return ifs_.gcount();
34 |   }
35 | 
36 |  private:
37 |   std::ifstream ifs_;
38 | };
39 | 
40 | bool PortableReadFileToProto(const std::string& file_name,
41 |                              ::google::protobuf::MessageLite* proto) {
42 |   ::google::protobuf::io::CopyingInputStreamAdaptor stream(
43 |       new IfstreamInputStream(file_name));
44 |   stream.SetOwnsCopyingStream(true);
45 |   // TODO(jiayq): the following coded stream is for debugging purposes to allow
46 |   // one to parse arbitrarily large messages for MessageLite. One most likely
47 |   // doesn't want to put protobufs larger than 64MB on Android, so we should
48 |   // eventually remove this and quit loud when a large protobuf is passed in.
49 |   ::google::protobuf::io::CodedInputStream coded_stream(&stream);
50 |   // Total bytes hard limit / warning limit are set to 1GB and 512MB
51 |   // respectively.
52 |   coded_stream.SetTotalBytesLimit(1024LL << 20, 512LL << 20);
53 |   return proto->ParseFromCodedStream(&coded_stream);
54 | }
55 | 
56 | namespace tf {
// Compiler-generated destructor; members clean up after themselves.
TfModel::~TfModel() = default;
58 | bool TfModel::Load(const std::string& path) {
59 |   tensorflow::SessionOptions options;
60 |   tensorflow::ConfigProto& config = options.config;
61 | 
62 |   session_.reset(tensorflow::NewSession(options));
63 |   tensorflow::GraphDef tensorflow_graph;
64 |   VLOG(0) << "Reading file to proto: " << path;
65 |   if (!PortableReadFileToProto(path.c_str(), &tensorflow_graph)) {
66 |     VLOG(0) << "Load model error from:" << path;
67 |     return false;
68 |   }
69 |   VLOG(0) << "Creating session.";
70 |   tensorflow::Status s = session_->Create(tensorflow_graph);
71 |   if (!s.ok()) {
72 |     VLOG(0) << "Could not create Tensorflow Graph: " << s;
73 |     return false;
74 |   }
75 |   // Clear the proto to save memory space.
76 |   tensorflow_graph.Clear();
77 |   VLOG(0) << "Tensorflow graph loaded from: " << path;
78 |   return true;
79 | }
80 | bool TfModel::Eval(
81 |     const std::vector<std::pair<std::string, tensorflow::Tensor> >&
82 |         inputTensors,
83 |     const std::vector<std::string>& outputNames,
84 |     std::vector<tensorflow::Tensor>& outputTensors) {
85 |   tensorflow::Status s =
86 |       session_->Run(inputTensors, outputNames, {}, &outputTensors);
87 |   if (!s.ok()) {
88 |     LOG(ERROR) << "Error during inference: " << s;
89 |     return false;
90 |   }
91 |   return true;
92 | }
93 | 
94 | }  // namespace tf
95 | 


--------------------------------------------------------------------------------
/third_party/crow/include/crow/common.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <vector>
  4 | #include <string>
  5 | #include <stdexcept>
  6 | #include <iostream>
  7 | #include "third_party/crow/include/crow/utility.h"
  8 | 
  9 | namespace crow {
// HTTP request methods. Each method is available under a mixed-case name
// and, when possible, an upper-case name with the same value.
enum class HTTPMethod {
#ifndef DELETE
  // Upper-case spellings; only declared when no DELETE macro is defined
  // (presumably to coexist with platform headers that define one -- confirm).
  DELETE = 0,
  GET,
  HEAD,
  POST,
  PUT,
  CONNECT,
  OPTIONS,
  TRACE,
#endif

  // Mixed-case spellings, numbered identically to the list above.
  Delete = 0,
  Get,
  Head,
  Post,
  Put,
  Connect,
  Options,
  Trace,
};
 31 | 
 32 | inline std::string method_name(HTTPMethod method) {
 33 |   switch (method) {
 34 |   case HTTPMethod::Delete:
 35 |     return "DELETE";
 36 |   case HTTPMethod::Get:
 37 |     return "GET";
 38 |   case HTTPMethod::Head:
 39 |     return "HEAD";
 40 |   case HTTPMethod::Post:
 41 |     return "POST";
 42 |   case HTTPMethod::Put:
 43 |     return "PUT";
 44 |   case HTTPMethod::Connect:
 45 |     return "CONNECT";
 46 |   case HTTPMethod::Options:
 47 |     return "OPTIONS";
 48 |   case HTTPMethod::Trace:
 49 |     return "TRACE";
 50 |   }
 51 |   return "invalid";
 52 | }
 53 | 
// Kinds of route parameters.
enum class ParamType {
  INT,
  UINT,
  DOUBLE,
  STRING,
  PATH,

  // Last enumerator; its value equals the number of kinds listed above.
  MAX
};
 63 | 
 64 | struct routing_params {
 65 |   std::vector<int64_t> int_params;
 66 |   std::vector<uint64_t> uint_params;
 67 |   std::vector<double> double_params;
 68 |   std::vector<std::string> string_params;
 69 | 
 70 |   void debug_print() const {
 71 |     std::cerr << "routing_params" << std::endl;
 72 |     for (auto i : int_params)
 73 |       std::cerr << i << ", " ;
 74 |     std::cerr << std::endl;
 75 |     for (auto i : uint_params)
 76 |       std::cerr << i << ", " ;
 77 |     std::cerr << std::endl;
 78 |     for (auto i : double_params)
 79 |       std::cerr << i << ", " ;
 80 |     std::cerr << std::endl;
 81 |     for (auto& i : string_params)
 82 |       std::cerr << i << ", " ;
 83 |     std::cerr << std::endl;
 84 |   }
 85 | 
 86 |   template <typename T>
 87 |   T get(unsigned) const;
 88 | 
 89 | };
 90 | 
// Returns int_params[index]; the index is not range-checked.
template<>
inline int64_t routing_params::get<int64_t>(unsigned index) const {
  return int_params[index];
}

// Returns uint_params[index]; the index is not range-checked.
template<>
inline uint64_t routing_params::get<uint64_t>(unsigned index) const {
  return uint_params[index];
}

// Returns double_params[index]; the index is not range-checked.
template<>
inline double routing_params::get<double>(unsigned index) const {
  return double_params[index];
}

// Returns a copy of string_params[index]; the index is not range-checked.
template<>
inline std::string routing_params::get<std::string>(unsigned index) const {
  return string_params[index];
}
110 | }
111 | 
112 | #ifndef CROW_MSVC_WORKAROUND
113 | constexpr crow::HTTPMethod operator "" _method(const char* str, size_t /*len*/) {
114 |   return
115 |     crow::black_magic::is_equ_p(str, "GET", 3) ? crow::HTTPMethod::Get :
116 |     crow::black_magic::is_equ_p(str, "DELETE", 6) ? crow::HTTPMethod::Delete :
117 |     crow::black_magic::is_equ_p(str, "HEAD", 4) ? crow::HTTPMethod::Head :
118 |     crow::black_magic::is_equ_p(str, "POST", 4) ? crow::HTTPMethod::Post :
119 |     crow::black_magic::is_equ_p(str, "PUT", 3) ? crow::HTTPMethod::Put :
120 |     crow::black_magic::is_equ_p(str, "OPTIONS", 7) ? crow::HTTPMethod::Options :
121 |     crow::black_magic::is_equ_p(str, "CONNECT", 7) ? crow::HTTPMethod::Connect :
122 |     crow::black_magic::is_equ_p(str, "TRACE", 5) ? crow::HTTPMethod::Trace :
123 |     throw std::runtime_error("invalid http method");
124 | }
125 | #endif
126 | 


--------------------------------------------------------------------------------
/third_party/crow/include/crow/logging.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <string>
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <ctime>
  7 | #include <iostream>
  8 | #include <sstream>
  9 | 
 10 | #include "third_party/crow/include/crow/settings.h"
 11 | 
 12 | namespace crow {
// Log severities in ascending order; the logger compares them numerically
// against the current level.
enum class LogLevel {
  Debug = 0,
  Info,
  Warning,
  Error,
  Critical,
};
 20 | 
 21 | class ILogHandler {
 22 |  public:
 23 |   virtual void log(std::string message, LogLevel level) = 0;
 24 | };
 25 | 
// Log handler that writes every message to std::cerr, whatever its level.
class CerrLogHandler : public ILogHandler {
 public:
  void log(std::string message, LogLevel /*level*/) override {
    std::cerr << message;
  }
};
 32 | 
 33 | class logger {
 34 | 
 35 |  private:
 36 |   //
 37 |   static std::string timestamp() {
 38 |     char date[32];
 39 |     time_t t = time(0);
 40 | 
 41 |     tm my_tm;
 42 | 
 43 | #ifdef _MSC_VER
 44 |     gmtime_s(&my_tm, &t);
 45 | #else
 46 |     gmtime_r(&t, &my_tm);
 47 | #endif
 48 | 
 49 |     size_t sz = strftime(date, sizeof(date), "%Y-%m-%d %H:%M:%S", &my_tm);
 50 |     return std::string(date, date + sz);
 51 |   }
 52 | 
 53 |  public:
 54 | 
 55 | 
 56 |   logger(std::string prefix, LogLevel level) : level_(level) {
 57 | #ifdef CROW_ENABLE_LOGGING
 58 |     stringstream_ << "(" << timestamp() << ") [" << prefix << "] ";
 59 | #endif
 60 | 
 61 |   }
 62 |   ~logger() {
 63 | #ifdef CROW_ENABLE_LOGGING
 64 |     if (level_ >= get_current_log_level()) {
 65 |       stringstream_ << std::endl;
 66 |       get_handler_ref()->log(stringstream_.str(), level_);
 67 |     }
 68 | #endif
 69 |   }
 70 | 
 71 |   //
 72 |   template <typename T>
 73 |   logger& operator<<(T const &value) {
 74 | 
 75 | #ifdef CROW_ENABLE_LOGGING
 76 |     if (level_ >= get_current_log_level()) {
 77 |       stringstream_ << value;
 78 |     }
 79 | #endif
 80 |     return *this;
 81 |   }
 82 | 
 83 |   //
 84 |   static void setLogLevel(LogLevel level) {
 85 |     get_log_level_ref() = level;
 86 |   }
 87 | 
 88 |   static void setHandler(ILogHandler* handler) {
 89 |     get_handler_ref() = handler;
 90 |   }
 91 | 
 92 |   static LogLevel get_current_log_level() {
 93 |     return get_log_level_ref();
 94 |   }
 95 | 
 96 |  private:
 97 |   //
 98 |   static LogLevel& get_log_level_ref() {
 99 |     static LogLevel current_level = (LogLevel)CROW_LOG_LEVEL;
100 |     return current_level;
101 |   }
102 |   static ILogHandler*& get_handler_ref() {
103 |     static CerrLogHandler default_handler;
104 |     static ILogHandler* current_handler = &default_handler;
105 |     return current_handler;
106 |   }
107 | 
108 |   //
109 |   std::ostringstream stringstream_;
110 |   LogLevel level_;
111 | };
112 | }
113 | 
// Logging entry points, used as `CROW_LOG_INFO << "text";`.
// Each macro expands to `if (level too low) {} else logger(...)`. Giving the
// macro's own `if` an `else` keeps a following user-written `else` attached
// to the user's `if` rather than to the one hidden inside the macro.
#define CROW_LOG_CRITICAL   \
        if (crow::logger::get_current_log_level() > crow::LogLevel::Critical) {} else \
            crow::logger("CRITICAL", crow::LogLevel::Critical)
#define CROW_LOG_ERROR      \
        if (crow::logger::get_current_log_level() > crow::LogLevel::Error) {} else \
            crow::logger("ERROR   ", crow::LogLevel::Error)
#define CROW_LOG_WARNING    \
        if (crow::logger::get_current_log_level() > crow::LogLevel::Warning) {} else \
            crow::logger("WARNING ", crow::LogLevel::Warning)
#define CROW_LOG_INFO       \
        if (crow::logger::get_current_log_level() > crow::LogLevel::Info) {} else \
            crow::logger("INFO    ", crow::LogLevel::Info)
#define CROW_LOG_DEBUG      \
        if (crow::logger::get_current_log_level() > crow::LogLevel::Debug) {} else \
            crow::logger("DEBUG   ", crow::LogLevel::Debug)
129 | 
130 | 


--------------------------------------------------------------------------------
/kcws/cc/gen_seg_eval.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
  3 |  * =====================================================================================
  4 |  * Filename:  gen_seg_eval.cc
  5 |  * Author:  Koth
  6 |  * Create Time: 2016-11-29 09:26:39
  7 |  * Description:
  8 |  *
  9 |  */
 10 | #include <stdio.h>
 11 | #include <string.h>
 12 | #include <iostream>
 13 | #include <fstream>
 14 | #include <string>
 15 | #include <sstream>
 16 | #include <chrono>
 17 | 
 18 | #include "base/base.h"
 19 | #include "utils/basic_string_util.h"
 20 | 
 21 | 
 22 | #include "tf_seg_model.h"  //NOLINT
 23 | #include "sentence_breaker.h"  // NOLINT
 24 | #include "tensorflow/core/platform/init_main.h"
 25 | 
 26 | DEFINE_string(test_file, "", "the test file");
 27 | DEFINE_string(model_path, "", "the model path");
 28 | DEFINE_string(vocab_path, "", "vocab path");
 29 | 
 30 | DEFINE_int32(max_setence_len, 80, "max sentence len");
 31 | 
 32 | const int BATCH_SIZE = 2000;
 33 | int load_test_file(const std::string& path,
 34 |                    std::vector<std::string>* pstrs) {
 35 |   FILE *fp = fopen(path.c_str(), "r");
 36 |   if (fp == NULL) {
 37 |     VLOG(0) << "open file error:" << path;
 38 |     return 0;
 39 |   }
 40 |   char line[4096] = {0};
 41 |   int tn = 0;
 42 |   while (fgets(line, sizeof(line) - 1, fp)) {
 43 |     int nn = strlen(line);
 44 |     while (nn && (line[nn - 1] == '\n' || line[nn - 1] == '\r')) {
 45 |       nn -= 1;
 46 |     }
 47 |     if (nn <= 0) {
 48 |       continue;
 49 |     }
 50 |     pstrs->push_back(std::string(line, nn));
 51 |     tn += 1;
 52 |   }
 53 |   fclose(fp);
 54 |   return tn;
 55 | }
 56 | int main(int argc, char *argv[]) {
 57 |   tensorflow::port::InitMain(argv[0], &argc, &argv);
 58 |   google::ParseCommandLineFlags(&argc, &argv, true);
 59 |   if (FLAGS_vocab_path.empty()) {
 60 |     VLOG(0) << "basic bocab path is not set";
 61 |     return 1;
 62 |   }
 63 |   if (FLAGS_model_path.empty()) {
 64 |     VLOG(0) << " model path is not set";
 65 |     return 1;
 66 |   }
 67 |   if (FLAGS_test_file.empty()) {
 68 |     VLOG(0) << " test_file path is not set";
 69 |     return 1;
 70 |   }
 71 |   FILE* outfp = fopen("out_eval.txt", "w");
 72 |   CHECK(outfp != nullptr) << "open file 'out_eval.txt' error";
 73 |   kcws::TfSegModel sm;
 74 |   CHECK(sm.LoadModel(FLAGS_model_path,
 75 |                      FLAGS_vocab_path,
 76 |                      FLAGS_max_setence_len))
 77 |       << "Load model error";
 78 | 
 79 |   std::vector<std::string> teststrs;
 80 |   int ns = load_test_file(FLAGS_test_file, &teststrs);
 81 |   std::string todo;
 82 |   for (int i = 0; i < ns; i++) {
 83 |     todo.append(teststrs[i]);
 84 |   }
 85 |   VLOG(0) << "loaded :" << FLAGS_test_file << " ,got " << ns << " lines";
 86 | 
 87 |   auto start = std::chrono::steady_clock::now();
 88 |   for (int i = 0; i < ns; i++) {
 89 |     // VLOG(0) << "do line:" << i;
 90 |     if (teststrs[i].empty()) {
 91 |       VLOG(0) << "empty line , continue";
 92 |       continue;
 93 |     }
 94 |     std::vector<std::string> results;
 95 |     CHECK(sm.Segment(teststrs[i], &results)) << "segment error";
 96 |     int nr = results.size();
 97 |     CHECK_NE(nr, 0);
 98 |     fprintf(outfp, "%s", results[0].c_str());
 99 |     for (int i = 1; i < nr; i++) {
100 |       fprintf(outfp, " %s", results[i].c_str());
101 |     }
102 |     fprintf(outfp, "\n");
103 |   }
104 |   auto duration = std::chrono::duration_cast<std::chrono::milliseconds>
105 |                   (std::chrono::steady_clock::now() - start);
106 |   VLOG(0) << "spend " << duration.count() << " milliseconds for file:" << FLAGS_test_file;
107 | 
108 |   return 0;
109 | }
110 | 


--------------------------------------------------------------------------------
/third_party/crow/tests/template/partials.yml:
--------------------------------------------------------------------------------
  1 | overview: |
  2 |   Partial tags are used to expand an external template into the current
  3 |   template.
  4 | 
  5 |   The tag's content MUST be a non-whitespace character sequence NOT containing
  6 |   the current closing delimiter.
  7 | 
  8 |   This tag's content names the partial to inject.  Set Delimiter tags MUST NOT
  9 |   affect the parsing of a partial.  The partial MUST be rendered against the
 10 |   context stack local to the tag.  If the named partial cannot be found, the
 11 |   empty string SHOULD be used instead, as in interpolations.
 12 | 
 13 |   Partial tags SHOULD be treated as standalone when appropriate.  If this tag
 14 |   is used standalone, any whitespace preceding the tag should be treated as
 15 |   indentation, and prepended to each line of the partial before rendering.
 16 | tests:
 17 |   - name: Basic Behavior
 18 |     desc: The greater-than operator should expand to the named partial.
 19 |     data: { }
 20 |     template: '"{{>text}}"'
 21 |     partials: { text: 'from partial' }
 22 |     expected: '"from partial"'
 23 | 
 24 |   - name: Failed Lookup
 25 |     desc: The empty string should be used when the named partial is not found.
 26 |     data: { }
 27 |     template: '"{{>text}}"'
 28 |     partials: { }
 29 |     expected: '""'
 30 | 
 31 |   - name: Context
 32 |     desc: The greater-than operator should operate within the current context.
 33 |     data: { text: 'content' }
 34 |     template: '"{{>partial}}"'
 35 |     partials: { partial: '*{{text}}*' }
 36 |     expected: '"*content*"'
 37 | 
 38 |   - name: Recursion
 39 |     desc: The greater-than operator should properly recurse.
 40 |     data: { content: "X", nodes: [ { content: "Y", nodes: [] } ] }
 41 |     template: '{{>node}}'
 42 |     partials: { node: '{{content}}<{{#nodes}}{{>node}}{{/nodes}}>' }
 43 |     expected: 'X<Y<>>'
 44 | 
 45 |   # Whitespace Sensitivity
 46 | 
 47 |   - name: Surrounding Whitespace
 48 |     desc: The greater-than operator should not alter surrounding whitespace.
 49 |     data: { }
 50 |     template: '| {{>partial}} |'
 51 |     partials: { partial: "\t|\t" }
 52 |     expected: "| \t|\t |"
 53 | 
 54 |   - name: Inline Indentation
 55 |     desc: Whitespace should be left untouched.
 56 |     data: { data: '|' }
 57 |     template: "  {{data}}  {{> partial}}\n"
 58 |     partials: { partial: ">\n>" }
 59 |     expected: "  |  >\n>\n"
 60 | 
 61 |   - name: Standalone Line Endings
 62 |     desc: '"\r\n" should be considered a newline for standalone tags.'
 63 |     data: { }
 64 |     template: "|\r\n{{>partial}}\r\n|"
 65 |     partials: { partial: ">" }
 66 |     expected: "|\r\n>|"
 67 | 
 68 |   - name: Standalone Without Previous Line
 69 |     desc: Standalone tags should not require a newline to precede them.
 70 |     data: { }
 71 |     template: "  {{>partial}}\n>"
 72 |     partials: { partial: ">\n>"}
 73 |     expected: "  >\n  >>"
 74 | 
 75 |   - name: Standalone Without Newline
 76 |     desc: Standalone tags should not require a newline to follow them.
 77 |     data: { }
 78 |     template: ">\n  {{>partial}}"
 79 |     partials: { partial: ">\n>" }
 80 |     expected: ">\n  >\n  >"
 81 | 
 82 |   - name: Standalone Indentation
 83 |     desc: Each line of the partial should be indented before rendering.
 84 |     data: { content: "<\n->" }
 85 |     template: |
 86 |       \
 87 |        {{>partial}}
 88 |       /
 89 |     partials:
 90 |       partial: |
 91 |         |
 92 |         {{{content}}}
 93 |         |
 94 |     expected: |
 95 |       \
 96 |        |
 97 |        <
 98 |       ->
 99 |        |
100 |       /
101 | 
102 |   # Whitespace Insensitivity
103 | 
104 |   - name: Padding Whitespace
105 |     desc: Superfluous in-tag whitespace should be ignored.
106 |     data: { boolean: true }
107 |     template: "|{{> partial }}|"
108 |     partials: { partial: "[]" }
109 |     expected: '|[]|'
110 | 


--------------------------------------------------------------------------------
/kcws/cc/BUILD:
--------------------------------------------------------------------------------
  1 | package(default_visibility = ["//visibility:public"])
  2 | 
  3 | cc_library(
  4 |   name="tf_seg_model",
  5 |   srcs=[
  6 |     "tf_seg_model.cc"
  7 |   ],
  8 |   hdrs=[
  9 |     "tf_seg_model.h"
 10 |   ],
 11 |   deps=[
 12 |    '//utils:basic_string_util',
 13 |    '//utils:basic_vocab',
 14 |    ':pos_tagger',
 15 |    ':sentence_breaker',
 16 |    ':ac_scanner',
 17 |    '@tf//:tensorflow',
 18 |    '@protobuf//:protobuf',
 19 |   ]
 20 | )
 21 | 
 22 | cc_library(
 23 |   name="pos_tagger",
 24 |   srcs=[
 25 |     "pos_tagger.cc"
 26 |   ],
 27 |   hdrs=[
 28 |     "pos_tagger.h"
 29 |   ],
 30 |   deps=[
 31 |    '//utils:basic_string_util',
 32 |    '//utils:basic_vocab',
 33 |    '//tfmodel:tfmodel',
 34 |    ':viterbi_decode',
 35 |    '@tf//:tensorflow',
 36 |    '@protobuf//:protobuf',
 37 |   ]
 38 | )
 39 | 
 40 | cc_library(
 41 |   name="viterbi_decode",
 42 |   srcs=[
 43 |     "viterbi_decode.cc"
 44 |   ],
 45 |   hdrs=[
 46 |     "viterbi_decode.h"
 47 |   ],
 48 |   deps=[
 49 |    '//utils:basic_string_util',
 50 |    '@tf//:tensorflow',
 51 |   ]
 52 | )
 53 | 
 54 | cc_library(
 55 |   name="sentence_breaker",
 56 |   srcs=[
 57 |     "sentence_breaker.cc"
 58 |   ],
 59 |   hdrs=[
 60 |     "sentence_breaker.h"
 61 |   ],
 62 |   copts=[
 63 |   "-Wno-writable-strings"
 64 |   ],
 65 |   deps=[
 66 |   '//base:base',
 67 |    '//utils:basic_string_util',
 68 |   ],
 69 |   linkstatic=1,
 70 | )
 71 | 
# Stand-alone binary built from test_breaker.cc; links :sentence_breaker and //base.
cc_binary(
    name = "test_breaker",
    srcs = [
        "test_breaker.cc",
    ],
    copts = [
        "-g",
        "-std=c++11",
    ],
    linkopts = [
        "-ldl",
        "-lpthread",
    ],
    deps = [
        ":sentence_breaker",
        "//base",
    ],
)
 90 | 
# Python tool dump_vocab.py; ships //utils:w2v.so as runtime data and adds
# ../../utils to the Python import path.
py_binary(
    name = "dump_vocab",
    srcs = ["dump_vocab.py"],
    data = ["//utils:w2v.so"],
    imports = ["../../utils"],
)
 97 | 
# Stand-alone binary built from test_seg.cc; links :tf_seg_model and //base.
cc_binary(
    name = "test_seg",
    srcs = [
        "test_seg.cc",
    ],
    copts = [
        "-g",
        "-std=c++11",
    ],
    linkopts = [
        "-ldl",
        "-lpthread",
    ],
    deps = [
        ":tf_seg_model",
        "//base",
    ],
)
116 | 
# Evaluation driver built from gen_seg_eval.cc.
# Every header the source includes directly (tf_seg_model.h,
# sentence_breaker.h, utils/basic_string_util.h, base/base.h) is listed as an
# explicit dependency instead of being reached only transitively.
cc_binary(
    name = "gen_seg_eval",
    srcs = [
        "gen_seg_eval.cc",
    ],
    copts = [
        "-g",
        "-std=c++11",
    ],
    linkopts = [
        "-ldl",
        "-lpthread",
    ],
    deps = [
        ":sentence_breaker",
        ":tf_seg_model",
        "//base",
        "//utils:basic_string_util",
    ],
)
135 | 
# Runs `xxd -i` on demo.html and writes the resulting C source text to
# demo_html.h, so the page can be compiled into a binary.
genrule(
    name = "demo_html_gen",
    srcs = ["demo.html"],
    outs = ["demo_html.h"],
    cmd = "xxd -i \"$<\"  >\"$@\"",
)
142 | 
# Header-only library wrapping the generated demo_html.h.
# The header is published through `hdrs` so that dependent rules may include
# it; referencing the generated file already makes this rule depend on
# :demo_html_gen, so no separate `data` edge is needed.
cc_library(
    name = "demo_html",
    hdrs = ["demo_html.h"],
)
150 | 
# HTTP segmentation service built from seg_backend_api.cc.
# Every header the source includes directly (pos_tagger.h and
# utils/basic_string_util.h among them) is listed as an explicit dependency
# instead of being reached only through :tf_seg_model.
cc_binary(
    name = "seg_backend_api",
    srcs = [
        "seg_backend_api.cc",
    ],
    linkopts = ["-ldl"],
    deps = [
        ":demo_html",
        ":pos_tagger",
        ":tf_seg_model",
        "//base",
        "//third_party/crow",
        "//utils:basic_string_util",
        "//utils:jsonxx",
    ],
)
165 | 
# Header-only library for ac_scanner.h.
# The header lives in `hdrs` (not `srcs`) because other rules such as
# :tf_seg_model and :test_ac_scanner depend on this target in order to
# include it; headers in `srcs` are private to the library itself.
cc_library(
    name = "ac_scanner",
    hdrs = [
        "ac_scanner.h",
    ],
    linkstatic = 1,
    deps = [
        "//base",
        "//utils:basic_string_util",
    ],
)
177 | 
# Stand-alone binary built from test_ac_scanner.cc; links :ac_scanner.
cc_binary(
    name = "test_ac_scanner",
    srcs = [
        "test_ac_scanner.cc",
    ],
    copts = [
        "-Wno-writable-strings",
    ],
    linkopts = ["-ldl","-pthread"],
    deps = [
        ":ac_scanner",
    ],
)
191 | 


--------------------------------------------------------------------------------
/kcws/train/stats_pos.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Author: Koth
  3 | # @Date:   2017-01-25 14:55:00
  4 | # @Last Modified by:   Koth
  5 | # @Last Modified time: 2017-04-07 22:12:33
  6 | 
  7 | import sys
  8 | import os
  9 | 
 10 | totalLine = 0
 11 | longLine = 0
 12 | maxLen = 80
 13 | posMap = {}
 14 | 
 15 | 
 16 | def processToken(token, collect, out, end):
 17 |     global totalLine
 18 |     global longLine
 19 |     global maxLen
 20 |     global posMap
 21 |     nn = len(token)
 22 |     oline = token
 23 |     while nn > 0 and token[nn - 1] != '/':
 24 |         nn = nn - 1
 25 |     pos = token[nn:]
 26 |     token = token[:nn - 1].strip()
 27 |     if not token:
 28 |         return
 29 |     if (not pos[0:1].isalpha()) or pos[0:1].isupper():
 30 |         return
 31 |     if len(pos) > 2:
 32 |         pos = pos[:2]
 33 |     posMap.setdefault(pos, 0)
 34 |     posMap[pos] += 1
 35 |     out.write("%s %s\t" % (token, pos))
 36 |     if end:
 37 |         out.write("\n")
 38 | 
 39 | 
 40 | def processLine(line, out):
 41 |     line = line.strip()
 42 |     nn = len(line)
 43 |     seeLeftB = False
 44 |     start = 0
 45 |     collect = []
 46 |     try:
 47 |         for i in range(nn):
 48 |             if line[i] == ' ':
 49 |                 if not seeLeftB:
 50 |                     token = line[start:i]
 51 |                     if token.startswith('['):
 52 |                         tokenLen = len(token)
 53 |                         while tokenLen > 0 and token[tokenLen - 1] != ']':
 54 |                             tokenLen = tokenLen - 1
 55 |                         token = token[1:tokenLen - 1]
 56 |                         ss = token.split(' ')
 57 |                         for s in ss:
 58 |                             processToken(s, collect, out, False)
 59 |                     else:
 60 |                         processToken(token, collect, out, False)
 61 |                     start = i + 1
 62 |             elif line[i] == '[':
 63 |                 seeLeftB = True
 64 |             elif line[i] == ']':
 65 |                 seeLeftB = False
 66 |         if start < nn:
 67 |             token = line[start:]
 68 |             if token.startswith('['):
 69 |                 tokenLen = len(token)
 70 |                 while tokenLen > 0 and token[tokenLen - 1] != ']':
 71 |                     tokenLen = tokenLen - 1
 72 |                 token = token[1:tokenLen - 1]
 73 |                 ss = token.split(' ')
 74 |                 ns = len(ss)
 75 |                 for i in range(ns - 1):
 76 |                     processToken(ss[i], collect, out, False)
 77 |                 processToken(ss[-1], collect, out, True)
 78 |             else:
 79 |                 processToken(token, collect, out, True)
 80 |     except Exception as e:
 81 |         pass
 82 | 
 83 | 
 84 | def main(argc, argv):
 85 |     global totalLine
 86 |     global longLine
 87 |     global posMap
 88 |     if argc < 4:
 89 |         print("Usage:%s <dir> <pos_vob_out> <for_train_out>" % (argv[0]))
 90 |         sys.exit(1)
 91 |     rootDir = argv[1]
 92 |     out = open(argv[3], "w")
 93 |     tagvobFp = open(argv[2], "w")
 94 |     for dirName, subdirList, fileList in os.walk(rootDir):
 95 |         curDir = os.path.join(rootDir, dirName)
 96 |         for file in fileList:
 97 |             if file.endswith(".txt"):
 98 |                 curFile = os.path.join(curDir, file)
 99 |                 fp = open(curFile, "r")
100 |                 for line in fp.readlines():
101 |                     line = line.strip()
102 |                     processLine(line, out)
103 |                 fp.close()
104 |     out.close()
105 |     print("total:%d, long lines:%d" % (totalLine, longLine))
106 |     print("total pos tags:%d" % (len(posMap)))
107 |     idx = 0
108 |     for k, v in posMap.iteritems():
109 |         tagvobFp.write("%s\t%d\n" % (k, idx + 1))
110 |         idx += 1
111 | 
112 | 
113 | if __name__ == '__main__':
114 |     main(len(sys.argv), sys.argv)
115 | 


--------------------------------------------------------------------------------
/third_party/crow/examples/example_vs.cpp:
--------------------------------------------------------------------------------
  1 | #include "crow.h"
  2 | 
  3 | #include <sstream>
  4 | 
  5 | class ExampleLogHandler : public crow::ILogHandler {
  6 |     public:
  7 |         void log(std::string message, crow::LogLevel level) override {
  8 | //            cerr << "ExampleLogHandler -> " << message;
  9 |         }
 10 | };
 11 | 
 12 | struct ExampleMiddleware 
 13 | {
 14 |     std::string message;
 15 | 
 16 |     ExampleMiddleware() 
 17 |     {
 18 |         message = "foo";
 19 |     }
 20 | 
 21 |     void setMessage(std::string newMsg)
 22 |     {
 23 |         message = newMsg;
 24 |     }
 25 | 
 26 |     struct context
 27 |     {
 28 |     };
 29 | 
 30 |     void before_handle(crow::request& req, crow::response& res, context& ctx)
 31 |     {
 32 |         CROW_LOG_DEBUG << " - MESSAGE: " << message;
 33 |     }
 34 | 
 35 |     void after_handle(crow::request& req, crow::response& res, context& ctx)
 36 |     {
 37 |         // no-op
 38 |     }
 39 | };
 40 | 
 41 | int main()
 42 | {
 43 |     crow::App<ExampleMiddleware> app;
 44 | 
 45 |     app.get_middleware<ExampleMiddleware>().setMessage("hello");
 46 | 
 47 |     app.route_dynamic("/")
 48 |     ([]{
 49 |         return "Hello World!";
 50 |     });
 51 | 
 52 |     app.route_dynamic("/about")
 53 |     ([](){
 54 |         return "About Crow example.";
 55 |     });
 56 | 
 57 |     // a request to /path should be forwarded to /path/
 58 |     app.route_dynamic("/path/")
 59 |     ([](){
 60 |         return "Trailing slash test case..";
 61 |     });
 62 | 
 63 |     // simple json response
 64 |     app.route_dynamic("/json")
 65 |     ([]{
 66 |         crow::json::wvalue x;
 67 |         x["message"] = "Hello, World!";
 68 |         return x;
 69 |     });
 70 | 
 71 |     app.route_dynamic("/hello/<int>")
 72 |     ([](int count){
 73 |         if (count > 100)
 74 |             return crow::response(400);
 75 |         std::ostringstream os;
 76 |         os << count << " bottles of beer!";
 77 |         return crow::response(os.str());
 78 |     });
 79 | 
 80 |     app.route_dynamic("/add/<int>/<int>")
 81 |     ([](const crow::request& req, crow::response& res, int a, int b){
 82 |         std::ostringstream os;
 83 |         os << a+b;
 84 |         res.write(os.str());
 85 |         res.end();
 86 |     });
 87 | 
 88 |     // Compile error with message "Handler type is mismatched with URL paramters"
 89 |     //CROW_ROUTE(app,"/another/<int>")
 90 |     //([](int a, int b){
 91 |         //return crow::response(500);
 92 |     //});
 93 | 
 94 |     // more json example
 95 |     app.route_dynamic("/add_json")
 96 |         .methods(crow::HTTPMethod::POST)
 97 |     ([](const crow::request& req){
 98 |         auto x = crow::json::load(req.body);
 99 |         if (!x)
100 |             return crow::response(400);
101 |         auto sum = x["a"].i()+x["b"].i();
102 |         std::ostringstream os;
103 |         os << sum;
104 |         return crow::response{os.str()};
105 |     });
106 | 
107 |     app.route_dynamic("/params")
108 |     ([](const crow::request& req){
109 |         std::ostringstream os;
110 |         os << "Params: " << req.url_params << "\n\n"; 
111 |         os << "The key 'foo' was " << (req.url_params.get("foo") == nullptr ? "not " : "") << "found.\n";
112 |         if(req.url_params.get("pew") != nullptr) {
113 |             double countD = boost::lexical_cast<double>(req.url_params.get("pew"));
114 |             os << "The value of 'pew' is " <<  countD << '\n';
115 |         }
116 |         auto count = req.url_params.get_list("count");
117 |         os << "The key 'count' contains " << count.size() << " value(s).\n";
118 |         for(const auto& countVal : count) {
119 |             os << " - " << countVal << '\n';
120 |         }
121 |         return crow::response{os.str()};
122 |     });    
123 | 
124 |     // ignore all log
125 |     crow::logger::setLogLevel(crow::LogLevel::DEBUG);
126 |     //crow::logger::setHandler(std::make_shared<ExampleLogHandler>());
127 | 
128 |     app.port(18080)
129 |         .multithreaded()
130 |         .run();
131 | }
132 | 


--------------------------------------------------------------------------------
/kcws/train/idcnn.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding:utf-8 -*-
 3 | 
 4 | # File: idcnn.py
 5 | # Project: /Users/tech/code/kcws
 6 | # Created: Mon Jul 31 2017
 7 | # Author: Koth Chen
 8 | # Copyright (c) 2017 Koth
 9 | #
10 | # <<licensetext>>
11 | 
12 | import tensorflow as tf
13 | 
14 | 
class Model:
    """Stack of dilated (atrous) convolutions that scores tags per position.

    The dilated layers described by `layers` are applied `repeatTimes` times
    with shared weights; the output of the last layer of every repetition is
    concatenated and projected to `numTags` scores.
    """

    def __init__(self,
                 layers,
                 filterWidth,
                 numFilter,
                 embeddingDim,
                 maxSeqLen,
                 numTags,
                 repeatTimes=4):
        # `layers` is a sequence of dicts, each providing a 'dilation' entry.
        self.layers = layers
        self.filter_width = filterWidth
        self.num_filter = numFilter
        self.embedding_dim = embeddingDim
        self.max_seq_len = maxSeqLen
        self.num_tags = numTags
        self.repeat_times = repeatTimes

    def inference(self, X, reuse=False):
        """Build the scoring graph for input X and return the score tensor.

        The result is reshaped to [-1, max_seq_len, num_tags]. With a truthy
        `reuse` the existing variables are reused, dropout keeps everything
        (keep probability 1.0) and the final reshape op is named "Reshape_7";
        otherwise variables are created and the keep probability is 0.5.
        """
        lastLayer = len(self.layers) - 1
        with tf.variable_scope("idcnn", reuse=reuse):
            initFilter = tf.get_variable(
                "idcnn_filter",
                shape=[1, self.filter_width, self.embedding_dim,
                       self.num_filter],
                initializer=tf.contrib.layers.xavier_initializer())
            layerInput = tf.nn.conv2d(X,
                                      initFilter,
                                      strides=[1, 1, 1, 1],
                                      padding="SAME",
                                      name="init_layer")
            collected = []
            totalWidth = 0
            for rep in range(self.repeat_times):
                # Layer variables are created in the first repetition only
                # (unless the caller asked for reuse) and shared afterwards.
                shareVars = bool(reuse or rep > 0)
                for idx, layer in enumerate(self.layers):
                    dilation = layer['dilation']
                    with tf.variable_scope("atrous-conv-layer-%d" % idx,
                                           reuse=shareVars):
                        kernel = tf.get_variable(
                            "filterW",
                            shape=[1, self.filter_width, self.num_filter,
                                   self.num_filter],
                            initializer=tf.contrib.layers.xavier_initializer())
                        bias = tf.get_variable("filterB",
                                               shape=[self.num_filter])
                        activated = tf.nn.atrous_conv2d(layerInput,
                                                        kernel,
                                                        rate=dilation,
                                                        padding="SAME")
                        activated = tf.nn.bias_add(activated, bias)
                        activated = tf.nn.relu(activated)
                        if idx == lastLayer:
                            collected.append(activated)
                            totalWidth += self.num_filter
                        layerInput = activated
            merged = tf.concat(axis=3, values=collected)
            if reuse:
                keepProb = 1.0
            else:
                keepProb = 0.5
            merged = tf.nn.dropout(merged, keepProb)

            merged = tf.squeeze(merged, [1])
            merged = tf.reshape(merged, [-1, totalWidth])

            finalW = tf.get_variable(
                "finalW",
                shape=[totalWidth, self.num_tags],
                initializer=tf.contrib.layers.xavier_initializer())

            finalB = tf.get_variable("finalB",
                                     initializer=tf.constant(
                                         0.001, shape=[self.num_tags]))

            scores = tf.nn.xw_plus_b(merged, finalW, finalB, name="scores")
        outName = "Reshape_7" if reuse else None
        return tf.reshape(scores, [-1, self.max_seq_len, self.num_tags],
                          name=outName)
93 | 


--------------------------------------------------------------------------------
/kcws/cc/seg_backend_api.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
  3 |  * =====================================================================================
  4 |  * Filename:  seg_backend_api.cc
  5 |  * Author:  Koth
  6 |  * Create Time: 2016-11-20 20:43:26
  7 |  * Description:
  8 |  *
  9 |  */
 10 | #include <string>
 11 | #include <thread>
 12 | #include <memory>
 13 | 
 14 | #include "base/base.h"
 15 | #include "utils/jsonxx.h"
 16 | #include "utils/basic_string_util.h"
 17 | #include "kcws/cc/demo_html.h"
 18 | #include "kcws/cc/tf_seg_model.h"
 19 | #include "kcws/cc/pos_tagger.h"
 20 | #include "third_party/crow/include/crow.h"
 21 | #include "tensorflow/core/platform/init_main.h"
 22 | 
 23 | DEFINE_int32(port, 9090, "the  api serving binding port");
 24 | DEFINE_string(model_path, "kcws/models/seg_model.pbtxt", "the model path");
 25 | DEFINE_string(vocab_path, "kcws/models/basic_vocab.txt", "char vocab path");
 26 | DEFINE_string(pos_model_path, "kcws/models/pos_model.pbtxt", "the pos tagging model path");
 27 | DEFINE_string(word_vocab_path, "kcws/models/word_vocab.txt", "word vocab path");
 28 | DEFINE_string(pos_vocab_path, "kcws/models/pos_vocab.txt", "pos vocab path");
 29 | DEFINE_int32(max_sentence_len, 80, "max sentence len ");
 30 | DEFINE_string(user_dict_path, "", "user dict path");
 31 | DEFINE_int32(max_word_num, 50, "max num of word per sentence ");
 32 | class SegMiddleware {
 33 |  public:
 34 |   struct context {};
 35 |   SegMiddleware() {}
 36 |   ~SegMiddleware() {}
 37 |   void before_handle(crow::request& req, crow::response& res, context& ctx) {}
 38 |   void after_handle(crow::request& req, crow::response& res, context& ctx) {}
 39 |  private:
 40 | };
 41 | int main(int argc, char* argv[]) {
 42 |   tensorflow::port::InitMain(argv[0], &argc, &argv);
 43 |   google::ParseCommandLineFlags(&argc, &argv, true);
 44 |   crow::App<SegMiddleware> app;
 45 |   kcws::TfSegModel model;
 46 |   CHECK(model.LoadModel(FLAGS_model_path,
 47 |                         FLAGS_vocab_path,
 48 |                         FLAGS_max_sentence_len,
 49 |                         FLAGS_user_dict_path))
 50 |       << "Load model error";
 51 |   if (!FLAGS_pos_model_path.empty()) {
 52 |     kcws::PosTagger* tagger = new kcws::PosTagger;
 53 |     CHECK(tagger->LoadModel(FLAGS_pos_model_path,
 54 |                             FLAGS_word_vocab_path,
 55 |                             FLAGS_vocab_path,
 56 |                             FLAGS_pos_vocab_path,
 57 |                             FLAGS_max_word_num)) << "load pos model error";
 58 |     model.SetPosTagger(tagger);
 59 |   }
 60 |   CROW_ROUTE(app, "/tf_seg/api").methods("POST"_method)
 61 |   ([&model](const crow::request & req) {
 62 |     jsonxx::Object obj;
 63 |     int status = -1;
 64 |     std::string desc = "OK";
 65 |     std::string gotReqBody = req.body;
 66 |     VLOG(0) << "got body:";
 67 |     fprintf(stderr, "%s\n", gotReqBody.c_str());
 68 |     jsonxx::Object toRet;
 69 |     if (obj.parse(gotReqBody) && obj.has<std::string>("sentence")) {
 70 |       std::string sentence = obj.get<std::string>("sentence");
 71 |       std::vector<std::string> result;
 72 |       std::vector<std::string> tags;
 73 |       if (model.Segment(sentence, &result, &tags)) {
 74 |         status = 0;
 75 |         jsonxx::Array rarr;
 76 |         if (result.size() == tags.size()) {
 77 |           int nl = result.size();
 78 |           for (int i = 0; i < nl; i++) {
 79 |             jsonxx::Object obj;
 80 |             obj << "tok" << result[i];
 81 |             obj << "pos" << tags[i];
 82 |             rarr << obj;
 83 |           }
 84 |         } else {
 85 |           for (std::string str : result) {
 86 |             rarr << str;
 87 |           }
 88 |         }
 89 |         toRet << "segments" << rarr;
 90 |       }
 91 |     } else {
 92 |       desc = "Parse request error";
 93 |     }
 94 |     toRet << "status" << status;
 95 |     toRet << "msg" << desc;
 96 |     return crow::response(toRet.json());
 97 |   });
 98 |   CROW_ROUTE(app, "/")([](const crow::request & req) {
 99 |     return crow::response(std::string(reinterpret_cast<char*>(&kcws_cc_demo_html[0]), kcws_cc_demo_html_len));
100 |   });
101 |   app.port(FLAGS_port).multithreaded().run();
102 |   return 0;
103 | }
104 | 


--------------------------------------------------------------------------------
/util/python/python_config.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | # ==============================================================================
 16 | 
 17 | set -e -o errexit
 18 | 
 19 | if [ -d "../org_tensorflow" ]; then
 20 |   script_path="../org_tensorflow"
 21 | else
 22 |   # Prefix expected paths with ./ locally and external/reponame/ for remote repos.
 23 |   # TODO(kchodorow): remove once runfiles paths are fixed, see
 24 |   # https://github.com/bazelbuild/bazel/issues/848.
 25 |   script_path=$(dirname $(dirname $(dirname "$0")))
 26 |   script_path=${script_path:-.}
 27 | fi
 28 | 
 29 | EXPECTED_PATHS="$script_path/util/python/python_include"\
 30 | " $script_path/util/python/python_lib"
 31 | 
 32 | function main {
 33 |   argument="$1"
 34 |   shift
 35 |   case $argument in
 36 |     --check)
 37 |       check_python
 38 |       exit 0
 39 |       ;;
 40 |     --setup)
 41 |       setup_python "$1"
 42 |       exit 0
 43 |       ;;
 44 |   esac
 45 | }
 46 | 
 47 | function setup_python {
 48 |   PYTHON_BIN_PATH="$1";
 49 | 
 50 |   if [ -z "$PYTHON_BIN_PATH" ]; then
 51 |     echo "PYTHON_BIN_PATH was not provided.  Did you run configure?"
 52 |     exit 1
 53 |   fi
 54 |   if [ ! -x "$PYTHON_BIN_PATH" ]  || [ -d "$PYTHON_BIN_PATH" ]; then
 55 |     echo "PYTHON_BIN_PATH is not executable.  Is it the python binary?"
 56 |     exit 1
 57 |   fi
 58 | 
 59 |   local python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);')
 60 |   if [ "$python_major_version" == "" ]; then
 61 |     echo -e "\n\nERROR: Problem getting python version.  Is $PYTHON_BIN_PATH the correct python binary?"
 62 |     exit 1
 63 |   fi
 64 | 
 65 |   local python_include=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; from distutils import sysconfig; print(sysconfig.get_python_inc());')
 66 |   if [ "$python_include" == "" ]; then
 67 |     echo -e "\n\nERROR: Problem getting python include path.  Is distutils installed?"
 68 |     exit 1
 69 |   fi
 70 |   local python_lib=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; from distutils import sysconfig; print(sysconfig.get_python_lib());')
 71 |   if [ "$python_lib" == "" ]; then
 72 |     echo -e "\n\nERROR: Problem getting python lib path.  Is distutils installed?"
 73 |     exit 1
 74 |   fi
 75 |   
 76 | 
 77 |   for x in $EXPECTED_PATHS; do
 78 |     if [ -e "$x" ]; then
 79 |       rm "$x"
 80 |     fi
 81 |   done
 82 | 
 83 |   ln -sf "${python_include}" util/python/python_include
 84 |   ln -sf "${python_lib}" util/python/python_lib
 85 | 
 86 |   # Write tools/bazel.rc
 87 |   echo "# Autogenerated by configure: DO NOT EDIT" > tools/bazel.rc
 88 |   sed -e "s/\$PYTHON_MAJOR_VERSION/$python_major_version/g" \
 89 |       -e "s[\$PYTHON_BINARY[$PYTHON_BIN_PATH[g" \
 90 |       tools/bazel.rc.template >> tools/bazel.rc
 91 |   # Write tools/python_bin_path.sh
 92 |   echo "export PYTHON_BIN_PATH=$PYTHON_BIN_PATH" > tools/python_bin_path.sh
 93 | }
 94 | 
 95 | function check_python {
 96 |   for x in $EXPECTED_PATHS; do
 97 |     if [ ! -e "$x" ]; then
 98 |       echo -e "\n\nERROR: Cannot find '${x}'.  Did you run configure?\n\n" 1>&2
 99 |       exit 1
100 |     fi
101 |     if [ ! -L "${x}" ]; then
102 |       echo -e "\n\nERROR: '${x}' is not a symbolic link.  Internal error.\n\n" 1>&2
103 |       exit 1
104 |     fi
105 |     true_path=$(readlink "${x}")
106 |     if [ ! -d "${true_path}" ]; then
107 |       echo -e "\n\nERROR: '${x}' does not refer to an existing directory: ${true_path}.  Do you need to rerun configure?\n\n" 1>&2
108 |       exit 1
109 |     fi
110 |   done
111 | }
112 | 
113 | main "$@"
114 | 


--------------------------------------------------------------------------------
/kcws/cc/test_seg.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2016- 2018 Koth. All Rights Reserved.
  3 |  * =====================================================================================
  4 |  * Filename:  test_seg.cc
  5 |  * Author:  Koth
  6 |  * Create Time: 2016-11-20 12:13:21
  7 |  * Description:
  8 |  *
  9 |  */
 10 | #include <stdio.h>
 11 | #include <string.h>
 12 | #include <iostream>
 13 | #include <fstream>
 14 | #include <string>
 15 | #include <sstream>
 16 | #include <chrono>
 17 | 
 18 | #include "base/base.h"
 19 | #include "utils/basic_string_util.h"
 20 | 
 21 | 
 22 | #include "tf_seg_model.h"  //NOLINT
 23 | #include "sentence_breaker.h"  // NOLINT
 24 | #include "tensorflow/core/platform/init_main.h"
 25 | 
 26 | DEFINE_string(test_sentence, "", "the test string");
 27 | DEFINE_string(test_file, "", "the test file");
 28 | DEFINE_string(model_path, "", "the model path");
 29 | DEFINE_string(vocab_path, "", "vocab path");
 30 | DEFINE_string(user_dict_path, "", "user dict path");
 31 | DEFINE_int32(max_setence_len, 80, "max sentence len");
 32 | 
 33 | const int BATCH_SIZE = 2000;
 34 | int load_test_file(const std::string& path,
 35 |                    std::vector<std::string>* pstrs) {
 36 |   FILE *fp = fopen(path.c_str(), "r");
 37 |   if (fp == NULL) {
 38 |     VLOG(0) << "open file error:" << path;
 39 |     return 0;
 40 |   }
 41 |   char line[4096] = {0};
 42 |   int tn = 0;
 43 |   while (fgets(line, sizeof(line) - 1, fp)) {
 44 |     int nn = strlen(line);
 45 |     while (nn && (line[nn - 1] == '\n' || line[nn - 1] == '\r')) {
 46 |       nn -= 1;
 47 |     }
 48 |     if (nn <= 0) {
 49 |       continue;
 50 |     }
 51 |     pstrs->push_back(std::string(line, nn));
 52 |     tn += 1;
 53 |   }
 54 |   fclose(fp);
 55 |   return tn;
 56 | }
 57 | int main(int argc, char *argv[]) {
 58 |   tensorflow::port::InitMain(argv[0], &argc, &argv);
 59 |   google::ParseCommandLineFlags(&argc, &argv, true);
 60 |   if (FLAGS_vocab_path.empty()) {
 61 |     VLOG(0) << "basic bocab path is not set";
 62 |     return 1;
 63 |   }
 64 |   if (FLAGS_model_path.empty()) {
 65 |     VLOG(0) << " model path is not set";
 66 |     return 1;
 67 |   }
 68 |   kcws::TfSegModel sm;
 69 |   CHECK(sm.LoadModel(FLAGS_model_path,
 70 |                      FLAGS_vocab_path,
 71 |                      FLAGS_max_setence_len,
 72 |                      FLAGS_user_dict_path))
 73 |       << "Load model error";
 74 |   if (!FLAGS_test_sentence.empty()) {
 75 |     std::vector<std::string> results;
 76 |     CHECK(sm.Segment(FLAGS_test_sentence, &results)) << "segment error";
 77 |     VLOG(0) << "results is :";
 78 |     for (auto str : results) {
 79 |       VLOG(0) << str;
 80 |     }
 81 |   } else if (!FLAGS_test_file.empty()) {
 82 |     kcws::SentenceBreaker breaker(FLAGS_max_setence_len);
 83 |     std::vector<std::string> teststrs;
 84 |     int ns = load_test_file(FLAGS_test_file, &teststrs);
 85 |     std::string todo;
 86 |     for (int i = 0; i < ns; i++) {
 87 |       todo.append(teststrs[i]);
 88 |     }
 89 |     UnicodeStr utodo;
 90 |     BasicStringUtil::u8tou16(todo.c_str(), todo.size(), utodo);
 91 |     std::vector<UnicodeStr> sentences;
 92 |     breaker.breakSentences(utodo, &sentences);
 93 | 
 94 |     VLOG(0) << "loaded :" << FLAGS_test_file << " ,got " << ns << " lines,"
 95 |             << sentences.size() << " sentences, " << utodo.size() << " characters";
 96 |     int batch = (sentences.size() - 1) / BATCH_SIZE + 1;
 97 | 
 98 |     auto start = std::chrono::steady_clock::now();
 99 |     for (int i = 0; i < batch; i++) {
100 |       // VLOG(0) << "seg batch:" << i;
101 |       int end = BATCH_SIZE * (i + 1);
102 |       if (end > static_cast<int>(sentences.size())) {
103 |         end = sentences.size();
104 |       }
105 |       std::vector<std::vector<kcws::SegTok>> results;
106 |       std::vector<UnicodeStr>  todoSentences(sentences.begin() + (BATCH_SIZE * i), sentences.begin() + end);
107 |       CHECK(sm.Segment(todoSentences, &results)) << "segment error";
108 |     }
109 |     auto duration = std::chrono::duration_cast<std::chrono::milliseconds>
110 |                     (std::chrono::steady_clock::now() - start);
111 |     VLOG(0) << "spend " << duration.count() << " milliseconds for file:" << FLAGS_test_file;
112 |   } else {
113 |     VLOG(0) << "either test sentence or test file  should be set";
114 |     return 1;
115 |   }
116 | 
117 |   return 0;
118 | }
119 | 


--------------------------------------------------------------------------------
/third_party/crow/README.md:
--------------------------------------------------------------------------------
  1 | ![Crow logo](http://i.imgur.com/wqivvjK.jpg)
  2 | 
  3 | Crow is a C++ microframework for the web (inspired by Python Flask).
  4 | 
  5 | [![Travis Build](https://travis-ci.org/ipkn/crow.svg?branch=master)](https://travis-ci.org/ipkn/crow)
  6 | [![Coverage Status](https://coveralls.io/repos/ipkn/crow/badge.svg?branch=master)](https://coveralls.io/r/ipkn/crow?branch=master)
  7 | 
  8 | ```c++
  9 | #include "crow.h"
 10 | 
 11 | int main()
 12 | {
 13 |     crow::SimpleApp app;
 14 | 
 15 |     CROW_ROUTE(app, "/")([](){
 16 |         return "Hello world";
 17 |     });
 18 | 
 19 |     app.port(18080).multithreaded().run();
 20 | }
 21 | ```
 22 | 
 23 | ## Features
 24 | 
 25 |  - Easy routing
 26 |    - Similar to Flask
 27 |    - Type-safe Handlers (see Example)
 28 |  - Very Fast
 29 |    - ![Benchmark Result in one chart](https://docs.google.com/spreadsheets/d/1KidO9XpuwCRZ2p_JRDJj2aep61H8Sh_KDOhApizv4LE/pubchart?oid=2041467789&format=image)
 30 |    - More data on [crow-benchmark](https://github.com/ipkn/crow-benchmark)
 31 |  - Fast built-in JSON parser (crow::json)
 32 |  - [Mustache](http://mustache.github.io/) based templating library (crow::mustache)
 33 |  - Header only
 34 |  - Provides an amalgamated header file `crow_all.h` with every feature
 35 |  - Middleware support
 36 | 
 37 | ## Still in development
 38 |  - ~~Built-in ORM~~
 39 |    - Check [sqlpp11](https://github.com/rbock/sqlpp11) if you want one.
 40 | 
 41 | ## Examples
 42 | 
 43 | #### JSON Response
 44 | ```c++
 45 | CROW_ROUTE(app, "/json")
 46 | ([]{
 47 |     crow::json::wvalue x;
 48 |     x["message"] = "Hello, World!";
 49 |     return x;
 50 | });
 51 | ```
 52 | 
 53 | #### Arguments
 54 | ```c++
 55 | CROW_ROUTE(app,"/hello/<int>")
 56 | ([](int count){
 57 |     if (count > 100)
 58 |         return crow::response(400);
 59 |     std::ostringstream os;
 60 |     os << count << " bottles of beer!";
 61 |     return crow::response(os.str());
 62 | });
 63 | ```
 64 | Handler arguments type check at compile time
 65 | ```c++
 66 | // Compile error with message "Handler type is mismatched with URL paramters"
 67 | CROW_ROUTE(app,"/another/<int>")
 68 | ([](int a, int b){
 69 |     return crow::response(500);
 70 | });
 71 | ```
 72 | 
 73 | #### Handling JSON Requests
 74 | ```c++
 75 | CROW_ROUTE(app, "/add_json")
 76 | .methods("POST"_method)
 77 | ([](const crow::request& req){
 78 |     auto x = crow::json::load(req.body);
 79 |     if (!x)
 80 |         return crow::response(400);
 81 |     int sum = x["a"].i()+x["b"].i();
 82 |     std::ostringstream os;
 83 |     os << sum;
 84 |     return crow::response{os.str()};
 85 | });
 86 | ```
 87 | 
 88 | ## How to Build
 89 | 
 90 | If you just want to use crow, copy amalgamate/crow_all.h and include it.
 91 | 
 92 | ### Requirements
 93 | 
 94 |  - C++ compiler with good C++11 support (tested with g++>=4.8)
 95 |  - boost library
 96 |  - CMake for building examples
 97 |  - Linking with tcmalloc/jemalloc is recommended for speed.
 98 | 
 99 |  - Now supporting VS2013 with limited functionality (only run-time check for url is available.)
100 | 
101 | ### Building (Tests, Examples)
102 | 
103 | Out-of-source build with CMake is recommended.
104 | 
105 | ```
106 | mkdir build
107 | cd build
108 | cmake ..
109 | make
110 | ```
111 | 
112 | You can run the tests with the following command:
113 | ```
114 | ctest
115 | ```
116 | 
117 | 
118 | ### Installing missing dependencies
119 | 
120 | #### Ubuntu
121 |     sudo apt-get install build-essential libtcmalloc-minimal4 && sudo ln -s /usr/lib/libtcmalloc_minimal.so.4 /usr/lib/libtcmalloc_minimal.so
122 | 
123 | #### OSX
124 |     brew install boost google-perftools
125 | 
126 | ### Attributions
127 | 
128 | Crow uses the following libraries.
129 | 
130 |     qs_parse
131 | 
132 |     https://github.com/bartgrantham/qs_parse
133 | 
134 |     Copyright (c) 2010 Bart Grantham
135 |     Permission is hereby granted, free of charge, to any person obtaining a copy
136 |     of this software and associated documentation files (the "Software"), to deal
137 |     in the Software without restriction, including without limitation the rights
138 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
139 |     copies of the Software, and to permit persons to whom the Software is
140 |     furnished to do so, subject to the following conditions:
141 |     The above copyright notice and this permission notice shall be included in
142 |     all copies or substantial portions of the Software.
143 | 
144 | 


--------------------------------------------------------------------------------
/kcws/train/generate_training.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Author: Koth Chen
  3 | # @Date:   2016-10-21 16:17:53
  4 | # @Last Modified by:   Koth
  5 | # @Last Modified time: 2017-01-25 16:54:11
  6 | 
  7 | import sys
  8 | import os
  9 | import w2v
 10 | from sentence import Sentence
 11 | 
 12 | totalLine = 0
 13 | longLine = 0
 14 | 
 15 | MAX_LEN = 80
 16 | totalChars = 0
 17 | 
 18 | 
 19 | def processToken(token, sentence, out, end, vob):
 20 |     global totalLine
 21 |     global longLine
 22 |     global totalChars
 23 |     global MAX_LEN
 24 |     nn = len(token)
 25 |     while nn > 0 and token[nn - 1] != '/':
 26 |         nn = nn - 1
 27 | 
 28 |     token = token[:nn - 1].strip()
 29 |     if token != '。':
 30 |         ustr = unicode(token.decode('utf8'))
 31 |         sentence.addToken(ustr)
 32 |     uline = u''
 33 |     if token == '。' or end:
 34 |         if sentence.chars > MAX_LEN:
 35 |             longLine += 1
 36 |         else:
 37 |             x = []
 38 |             y = []
 39 |             totalChars += sentence.chars
 40 |             sentence.generate_tr_line(x, y, vob)
 41 |             nn = len(x)
 42 |             assert (nn == len(y))
 43 |             for j in range(nn, MAX_LEN):
 44 |                 x.append(0)
 45 |                 y.append(0)
 46 |             line = ''
 47 |             for i in range(MAX_LEN):
 48 |                 if i > 0:
 49 |                     line += " "
 50 |                 line += str(x[i])
 51 |             for j in range(MAX_LEN):
 52 |                 line += " " + str(y[j])
 53 |             out.write("%s\n" % (line))
 54 |         totalLine += 1
 55 |         sentence.clear()
 56 | 
 57 | 
 58 | def processLine(line, out, vob):
 59 |     line = line.strip()
 60 |     nn = len(line)
 61 |     seeLeftB = False
 62 |     start = 0
 63 |     sentence = Sentence()
 64 |     try:
 65 |         for i in range(nn):
 66 |             if line[i] == ' ':
 67 |                 if not seeLeftB:
 68 |                     token = line[start:i]
 69 |                     if token.startswith('['):
 70 |                         tokenLen = len(token)
 71 |                         while tokenLen > 0 and token[tokenLen - 1] != ']':
 72 |                             tokenLen = tokenLen - 1
 73 |                         token = token[1:tokenLen - 1]
 74 |                         ss = token.split(' ')
 75 |                         for s in ss:
 76 |                             processToken(s, sentence, out, False, vob)
 77 |                     else:
 78 |                         processToken(token, sentence, out, False, vob)
 79 |                     start = i + 1
 80 |             elif line[i] == '[':
 81 |                 seeLeftB = True
 82 |             elif line[i] == ']':
 83 |                 seeLeftB = False
 84 |         if start < nn:
 85 |             token = line[start:]
 86 |             if token.startswith('['):
 87 |                 tokenLen = len(token)
 88 |                 while tokenLen > 0 and token[tokenLen - 1] != ']':
 89 |                     tokenLen = tokenLen - 1
 90 |                 token = token[1:tokenLen - 1]
 91 |                 ss = token.split(' ')
 92 |                 ns = len(ss)
 93 |                 for i in range(ns - 1):
 94 |                     processToken(ss[i], sentence, out, False, vob)
 95 |                 processToken(ss[-1], sentence, out, True, vob)
 96 |             else:
 97 |                 processToken(token, sentence, out, True, vob)
 98 |     except Exception as e:
 99 |         pass
100 | 
101 | 
def main(argc, argv):
    """Convert every *.txt file under a directory into one training file.

    argv[1] is the vocabulary file loaded into a w2v.Word2vecVocab, argv[2]
    the directory that is walked recursively, argv[3] the output file. Every
    line of every ".txt" file is stripped and handed to processLine. A
    summary of the global counters is printed at the end. Prints a usage
    message and exits with status 1 when fewer than three arguments are given.
    """
    if argc < 4:
        print("Usage:%s <vob> <dir> <output>" % (argv[0]))
        sys.exit(1)
    vobPath = argv[1]
    rootDir = argv[2]
    vob = w2v.Word2vecVocab()
    vob.Load(vobPath)
    # 'with' closes the files even if processing raises.
    with open(argv[3], "w") as out:
        for dirName, subdirList, fileList in os.walk(rootDir):
            for fileName in fileList:
                if not fileName.endswith(".txt"):
                    continue
                # os.walk yields dirName already prefixed with rootDir, so it
                # must not be joined with rootDir again (that would produce
                # "root/root/..." for a relative root).
                curFile = os.path.join(dirName, fileName)
                with open(curFile, "r") as fp:
                    for line in fp:
                        processLine(line.strip(), out, vob)
    print("total:%d, long lines:%d, chars:%d" %
          (totalLine, longLine, totalChars))
128 | 
129 | 
130 | if __name__ == '__main__':
131 |     main(len(sys.argv), sys.argv)
132 | 


--------------------------------------------------------------------------------
/third_party/crow/tests/template/delimiters.yml:
--------------------------------------------------------------------------------
  1 | overview: |
  2 |   Set Delimiter tags are used to change the tag delimiters for all content
  3 |   following the tag in the current compilation unit.
  4 | 
  5 |   The tag's content MUST be any two non-whitespace sequences (separated by
  6 |   whitespace) EXCEPT an equals sign ('=') followed by the current closing
  7 |   delimiter.
  8 | 
  9 |   Set Delimiter tags SHOULD be treated as standalone when appropriate.
 10 | tests:
 11 |   - name: Pair Behavior
 12 |     desc: The equals sign (used on both sides) should permit delimiter changes.
 13 |     data: { text: 'Hey!' }
 14 |     template: '{{=<% %>=}}(<%text%>)'
 15 |     expected: '(Hey!)'
 16 | 
 17 |   - name: Special Characters
 18 |     desc: Characters with special meaning regexen should be valid delimiters.
 19 |     data: { text: 'It worked!' }
 20 |     template: '({{=[ ]=}}[text])'
 21 |     expected: '(It worked!)'
 22 | 
 23 |   - name: Sections
 24 |     desc: Delimiters set outside sections should persist.
 25 |     data: { section: true, data: 'I got interpolated.' }
 26 |     template: |
 27 |       [
 28 |       {{#section}}
 29 |         {{data}}
 30 |         |data|
 31 |       {{/section}}
 32 | 
 33 |       {{= | | =}}
 34 |       |#section|
 35 |         {{data}}
 36 |         |data|
 37 |       |/section|
 38 |       ]
 39 |     expected: |
 40 |       [
 41 |         I got interpolated.
 42 |         |data|
 43 | 
 44 |         {{data}}
 45 |         I got interpolated.
 46 |       ]
 47 | 
 48 |   - name: Inverted Sections
 49 |     desc: Delimiters set outside inverted sections should persist.
 50 |     data: { section: false, data: 'I got interpolated.' }
 51 |     template: |
 52 |       [
 53 |       {{^section}}
 54 |         {{data}}
 55 |         |data|
 56 |       {{/section}}
 57 | 
 58 |       {{= | | =}}
 59 |       |^section|
 60 |         {{data}}
 61 |         |data|
 62 |       |/section|
 63 |       ]
 64 |     expected: |
 65 |       [
 66 |         I got interpolated.
 67 |         |data|
 68 | 
 69 |         {{data}}
 70 |         I got interpolated.
 71 |       ]
 72 | 
 73 |   - name: Partial Inheritence
 74 |     desc: Delimiters set in a parent template should not affect a partial.
 75 |     data: { value: 'yes' }
 76 |     partials:
 77 |       include: '.{{value}}.'
 78 |     template: |
 79 |       [ {{>include}} ]
 80 |       {{= | | =}}
 81 |       [ |>include| ]
 82 |     expected: |
 83 |       [ .yes. ]
 84 |       [ .yes. ]
 85 | 
 86 |   - name: Post-Partial Behavior
 87 |     desc: Delimiters set in a partial should not affect the parent template.
 88 |     data: { value: 'yes' }
 89 |     partials:
 90 |       include: '.{{value}}. {{= | | =}} .|value|.'
 91 |     template: |
 92 |       [ {{>include}} ]
 93 |       [ .{{value}}.  .|value|. ]
 94 |     expected: |
 95 |       [ .yes.  .yes. ]
 96 |       [ .yes.  .|value|. ]
 97 | 
 98 |   # Whitespace Sensitivity
 99 | 
100 |   - name: Surrounding Whitespace
101 |     desc: Surrounding whitespace should be left untouched.
102 |     data: { }
103 |     template: '| {{=@ @=}} |'
104 |     expected: '|  |'
105 | 
106 |   - name: Outlying Whitespace (Inline)
107 |     desc: Whitespace should be left untouched.
108 |     data: { }
109 |     template: " | {{=@ @=}}\n"
110 |     expected: " | \n"
111 | 
112 |   - name: Standalone Tag
113 |     desc: Standalone lines should be removed from the template.
114 |     data: { }
115 |     template: |
116 |       Begin.
117 |       {{=@ @=}}
118 |       End.
119 |     expected: |
120 |       Begin.
121 |       End.
122 | 
123 |   - name: Indented Standalone Tag
124 |     desc: Indented standalone lines should be removed from the template.
125 |     data: { }
126 |     template: |
127 |       Begin.
128 |         {{=@ @=}}
129 |       End.
130 |     expected: |
131 |       Begin.
132 |       End.
133 | 
134 |   - name: Standalone Line Endings
135 |     desc: '"\r\n" should be considered a newline for standalone tags.'
136 |     data: { }
137 |     template: "|\r\n{{= @ @ =}}\r\n|"
138 |     expected: "|\r\n|"
139 | 
140 |   - name: Standalone Without Previous Line
141 |     desc: Standalone tags should not require a newline to precede them.
142 |     data: { }
143 |     template: "  {{=@ @=}}\n="
144 |     expected: "="
145 | 
146 |   - name: Standalone Without Newline
147 |     desc: Standalone tags should not require a newline to follow them.
148 |     data: { }
149 |     template: "=\n  {{=@ @=}}"
150 |     expected: "=\n"
151 | 
152 |   # Whitespace Insensitivity
153 | 
154 |   - name: Pair with Padding
155 |     desc: Superfluous in-tag whitespace should be ignored.
156 |     data: { }
157 |     template: '|{{= @   @ =}}|'
158 |     expected: '||'
159 | 


--------------------------------------------------------------------------------
/third_party/crow/include/crow/parser.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <string>
  4 | #include <unordered_map>
  5 | #include <boost/algorithm/string.hpp>
  6 | #include <boost/tokenizer.hpp>
  7 | #include <algorithm>
  8 | 
  9 | #include "third_party/crow/include/crow/http_parser_merged.h"
 10 | #include "third_party/crow/include/crow/http_request.h"
 11 | 
 12 | namespace crow {
 13 | template <typename Handler>
 14 | struct HTTPParser : public http_parser {
 15 |   static int on_message_begin(http_parser* self_) {
 16 |     HTTPParser* self = static_cast<HTTPParser*>(self_);
 17 |     self->clear();
 18 |     return 0;
 19 |   }
 20 |   static int on_url(http_parser* self_, const char* at, size_t length) {
 21 |     HTTPParser* self = static_cast<HTTPParser*>(self_);
 22 |     self->raw_url.insert(self->raw_url.end(), at, at + length);
 23 |     return 0;
 24 |   }
 25 |   static int on_header_field(http_parser* self_, const char* at, size_t length) {
 26 |     HTTPParser* self = static_cast<HTTPParser*>(self_);
 27 |     switch (self->header_building_state) {
 28 |     case 0:
 29 |       if (!self->header_value.empty()) {
 30 |         self->headers.emplace(std::move(self->header_field), std::move(self->header_value));
 31 |       }
 32 |       self->header_field.assign(at, at + length);
 33 |       self->header_building_state = 1;
 34 |       break;
 35 |     case 1:
 36 |       self->header_field.insert(self->header_field.end(), at, at + length);
 37 |       break;
 38 |     }
 39 |     return 0;
 40 |   }
 41 |   static int on_header_value(http_parser* self_, const char* at, size_t length) {
 42 |     HTTPParser* self = static_cast<HTTPParser*>(self_);
 43 |     switch (self->header_building_state) {
 44 |     case 0:
 45 |       self->header_value.insert(self->header_value.end(), at, at + length);
 46 |       break;
 47 |     case 1:
 48 |       self->header_building_state = 0;
 49 |       self->header_value.assign(at, at + length);
 50 |       break;
 51 |     }
 52 |     return 0;
 53 |   }
 54 |   static int on_headers_complete(http_parser* self_) {
 55 |     HTTPParser* self = static_cast<HTTPParser*>(self_);
 56 |     if (!self->header_field.empty()) {
 57 |       self->headers.emplace(std::move(self->header_field), std::move(self->header_value));
 58 |     }
 59 |     self->process_header();
 60 |     return 0;
 61 |   }
 62 |   static int on_body(http_parser* self_, const char* at, size_t length) {
 63 |     HTTPParser* self = static_cast<HTTPParser*>(self_);
 64 |     self->body.insert(self->body.end(), at, at + length);
 65 |     return 0;
 66 |   }
 67 |   static int on_message_complete(http_parser* self_) {
 68 |     HTTPParser* self = static_cast<HTTPParser*>(self_);
 69 | 
 70 |     // url params
 71 |     self->url = self->raw_url.substr(0, self->raw_url.find("?"));
 72 |     self->url_params = query_string(self->raw_url);
 73 | 
 74 |     self->process_message();
 75 |     return 0;
 76 |   }
 77 |   HTTPParser(Handler* handler) :
 78 |     handler_(handler) {
 79 |     http_parser_init(this, HTTP_REQUEST);
 80 |   }
 81 | 
 82 |   // return false on error
 83 |   bool feed(const char* buffer, int length) {
 84 |     const static http_parser_settings settings_{
 85 |       on_message_begin,
 86 |       on_url,
 87 |       nullptr,
 88 |       on_header_field,
 89 |       on_header_value,
 90 |       on_headers_complete,
 91 |       on_body,
 92 |       on_message_complete,
 93 |     };
 94 | 
 95 |     int nparsed = http_parser_execute(this, &settings_, buffer, length);
 96 |     return nparsed == length;
 97 |   }
 98 | 
 99 |   bool done() {
100 |     return feed(nullptr, 0);
101 |   }
102 | 
103 |   void clear() {
104 |     url.clear();
105 |     raw_url.clear();
106 |     header_building_state = 0;
107 |     header_field.clear();
108 |     header_value.clear();
109 |     headers.clear();
110 |     url_params.clear();
111 |     body.clear();
112 |   }
113 | 
114 |   void process_header() {
115 |     handler_->handle_header();
116 |   }
117 | 
118 |   void process_message() {
119 |     handler_->handle();
120 |   }
121 | 
122 |   request to_request() const {
123 |     return request{(HTTPMethod)method, std::move(raw_url), std::move(url), std::move(url_params), std::move(headers), std::move(body)};
124 |   }
125 | 
126 |   bool is_upgrade() const {
127 |     return upgrade;
128 |   }
129 | 
130 |   bool check_version(int major, int minor) const {
131 |     return http_major == major && http_minor == minor;
132 |   }
133 | 
134 |   std::string raw_url;
135 |   std::string url;
136 | 
137 |   int header_building_state = 0;
138 |   std::string header_field;
139 |   std::string header_value;
140 |   ci_map headers;
141 |   query_string url_params;
142 |   std::string body;
143 | 
144 |   Handler* handler_;
145 | };
146 | }
147 | 


--------------------------------------------------------------------------------