├── data
    ├── swathdic.abm
    ├── .cvsignore
    ├── tdict-slang.txt
    ├── update-dict.sh
    ├── tdict-collection.txt
    ├── tdict-history.txt
    ├── tdict-lang-ethnic.txt
    ├── tdict-geo.txt
    ├── Makefile.am
    ├── tdict-spell.txt
    ├── tdict-district.txt
    ├── tdict-country.txt
    ├── tdict-city.txt
    ├── tdict-science.txt
    ├── tdict-ict.txt
    ├── tdict-proper.txt
    └── tdict-common.txt
├── conv
    ├── .cvsignore
    ├── conv.h
    ├── unichar.h
    ├── tischar.h
    ├── Makefile.am
    ├── convfact.h
    ├── tis620.h
    ├── utf8.h
    ├── convfact.cxx
    ├── tis620.cxx
    ├── conv.cxx
    ├── convkit.h
    └── utf8.cxx
├── src
    ├── .cvsignore
    ├── longwordseg.h
    ├── utils.h
    ├── filefilter.h
    ├── maxwordseg.h
    ├── filterhtml.h
    ├── filterlambda.h
    ├── filterlatex.h
    ├── convutil.h
    ├── Makefile.am
    ├── filefilter.cpp
    ├── wordstack.h
    ├── filterrtf.h
    ├── dict.cpp
    ├── filterx.h
    ├── abswordseg.h
    ├── worddef.h
    ├── dict.h
    ├── longwordseg.cpp
    ├── filterhtml.cpp
    ├── convutil.cpp
    ├── swath.1
    ├── filterlatex.cpp
    ├── maxwordseg.cpp
    ├── abswordseg.cpp
    ├── wordseg.cpp
    └── filterrtf.cpp
├── lib
    ├── Makefile.am
    └── wcpcpy.c
├── .gitignore
├── Makefile.am
├── .cvsignore
├── tests
    ├── test-rtf.sh
    ├── mixed.txt
    ├── mixed-wseg.txt
    ├── test-long.sh
    ├── test-mixed.sh
    ├── test-latex.sh
    ├── test-simple.sh
    ├── test-utf8-wbr.sh
    ├── thai-latex.tex
    ├── Makefile.am
    ├── thai-latex-wseg.tex
    ├── long.txt
    └── long-wseg.txt
├── AUTHORS
├── autogen.sh
├── INSTALL
├── README
├── configure.ac
├── NEWS
└── COPYING


/data/swathdic.abm:
--------------------------------------------------------------------------------
1 | [0x002d,0x002e]
2 | [0x0e01,0x0e5b]
3 | 


--------------------------------------------------------------------------------
/conv/.cvsignore:
--------------------------------------------------------------------------------
1 | .deps
2 | .libs
3 | Makefile
4 | Makefile.in
5 | *.lo
6 | *.la
7 | 


--------------------------------------------------------------------------------
/data/.cvsignore:
--------------------------------------------------------------------------------
1 | .deps
2 | .libs
3 | Makefile
4 | Makefile.in
5 | *.lo
6 | *.la
7 | 


--------------------------------------------------------------------------------
/src/.cvsignore:
--------------------------------------------------------------------------------
1 | .deps
2 | .libs
3 | Makefile
4 | Makefile.in
5 | *.lo
6 | *.la
7 | 


--------------------------------------------------------------------------------
/lib/Makefile.am:
--------------------------------------------------------------------------------
1 | noinst_LTLIBRARIES = libcompat.la
2 | libcompat_la_SOURCES =
3 | libcompat_la_LIBADD = $(LTLIBOBJS)
4 | 


--------------------------------------------------------------------------------
/data/tdict-slang.txt:
--------------------------------------------------------------------------------
 1 | กึ๊ก
 2 | คิกขุ
 3 | เงิบ
 4 | จ๊าบ
 5 | จุงเบย
 6 | ไลฟ์บอย
 7 | แว้น
 8 | สตรอง
 9 | อาโนเนะ
10 | แอ๊บแบ๊ว
11 | 


--------------------------------------------------------------------------------
/lib/wcpcpy.c:
--------------------------------------------------------------------------------
 1 | #include <wchar.h>
 2 | 
 3 | /*
 4 |  * wcpcpy(): copy the wide string src, including its terminating L'\0',
 5 |  * into dest and return a pointer to the terminator written in dest.
 6 |  * The caller must supply a dest large enough to hold the copy.
 7 |  */
 8 | wchar_t*
 9 | wcpcpy (wchar_t* dest, const wchar_t* src)
10 | {
11 |   /* Copy one element at a time; the loop ends right after the terminator
12 |    * has been stored, leaving dest pointing at it. */
13 |   while ((*dest = *src) != L'\0')
14 |     {
15 |       ++dest;
16 |       ++src;
17 |     }
18 |   return dest;
19 | }
20 | 
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | autom4te.cache/
 2 | aclocal.m4
 3 | compile
 4 | config.guess
 5 | config.sub
 6 | configure
 7 | depcomp
 8 | install-sh
 9 | ltmain.sh
10 | m4/
11 | Makefile
12 | Makefile.in
13 | missing
14 | test-driver
15 | tests/Makefile.in
16 | 


--------------------------------------------------------------------------------
/conv/conv.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CONV_H
 2 | #define __CONV_H
 3 | 
 4 | // inFormat, outFormat: 't' = TIS-620; 'u' = UTF-8
 5 | int conv(char inFormat, char outFormat, const char *inText,
 6 |          char *outText, int outLen);
 7 | 
 8 | #endif  // __CONV_H
 9 | 
10 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
 1 | ACLOCAL_AMFLAGS = -I m4
 2 | 
 3 | SUBDIRS = lib conv src data tests
 4 | 
 5 | EXTRA_DIST = \
 6 | 	build-aux/git-version-gen	\
 7 | 	$(NULL)
 8 | 
 9 | MAINTAINERCLEANFILES = Makefile.in
10 | 
11 | dist-hook:
12 | 	echo $(VERSION) > $(distdir)/VERSION
13 | 
14 | 


--------------------------------------------------------------------------------
/.cvsignore:
--------------------------------------------------------------------------------
 1 | aclocal.m4
 2 | autom4te.cache
 3 | m4
 4 | depcomp
 5 | configure
 6 | INSTALL
 7 | install-sh
 8 | Makefile.in
 9 | missing
10 | mkinstalldirs
11 | config.cache
12 | config.log
13 | config.status
14 | config.guess
15 | config.sub
16 | ltmain.sh
17 | Makefile
18 | *.spec
19 | 
20 | 


--------------------------------------------------------------------------------
/tests/test-rtf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | 
 5 | echo -n "Testing RTF... "
 6 | 
 7 | ${SWATH} -d ${DICTDIR} -f rtf < ${srcdir}/petavatthu1.rtf > petavatthu1-out.rtf
 8 | 
 9 | if ! cmp petavatthu1-out.rtf ${srcdir}/petavatthu1-wseg.rtf >/dev/null; then
10 |   echo "FAIL!"
11 |   exit 1
12 | fi
13 | echo "OK"
14 | 
15 | 


--------------------------------------------------------------------------------
/conv/unichar.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // unichar.h - Unicode char set definitions
 3 | // Created: 19 Jan 1999
 4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
 5 | //
 6 | 
 7 | #ifndef __UNICHAR_H
 8 | #define __UNICHAR_H
 9 | 
10 | typedef unsigned unichar;
11 | 
12 | const unichar BAD_WCHAR = 0x80;
13 | 
14 | #endif  // __UNICHAR_H
15 | 
16 | 


--------------------------------------------------------------------------------
/conv/tischar.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // tischar.h - TIS-620 char set definitions
 3 | // Created: 19 Jan 1999
 4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
 5 | //
 6 | 
 7 | #ifndef __TISCHAR_H
 8 | #define __TISCHAR_H
 9 | 
10 | typedef unsigned char tischar;
11 | 
12 | const tischar BAD_TISCHAR = 0x80;
13 | 
14 | #endif  // __TISCHAR_H
15 | 
16 | 


--------------------------------------------------------------------------------
/conv/Makefile.am:
--------------------------------------------------------------------------------
 1 | MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
 2 | 
 3 | EXTRA_DIST = \
 4 | 	convfact.h \
 5 | 	conv.h \
 6 | 	convkit.h \
 7 | 	tis620.h \
 8 | 	tischar.h \
 9 | 	unichar.h \
10 | 	utf8.h
11 | 
12 | noinst_LTLIBRARIES = libconv.la
13 | libconv_la_SOURCES = \
14 | 	conv.cxx \
15 | 	convfact.cxx \
16 | 	tis620.cxx \
17 | 	utf8.cxx
18 | 
19 | 


--------------------------------------------------------------------------------
/tests/mixed.txt:
--------------------------------------------------------------------------------
1 | เมื่อวานนี้ (๘ พ.ย. ๒๕๖๐) การจัดประกวดข้าวดีเด่นโลก หรือ The World's Best Rice 2017 ครั้งที่ 9 ซึ่งจัดโดยองค์กร The Rice Trader องค์กรให้บริการข้อมูลข่าวสารเชิงลึกเกี่ยวกับอุตสาหกรรมข้าวโลก โดยข้าวหอมมะลิไทย ๑๐๕ สามารถคว้าแชมป์ข้าวที่รสชาติดีที่สุดในโลกมาครองอีก ๑ สมัย ขณะที่ข้าวหอมของประเทศกัมพูชา ได้อันดับที่ ๒ และข้าว ST24 ของประเทศเวียดนาม ได้อันดับที่ ๓
2 | 


--------------------------------------------------------------------------------
/src/longwordseg.h:
--------------------------------------------------------------------------------
 1 | // longwordseg.h: interface for the LongWordSeg class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __LONGWORDSEG_H
 6 | #define __LONGWORDSEG_H
 7 | 
 8 | #include "abswordseg.h"
 9 | 
10 | class LongWordSeg : public AbsWordSeg
11 | {
12 | protected:
13 |   virtual int CreateSentence ();
14 | };
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
 1 | Original creator:
 2 | - Phaisarn Charoenpornsawat <phaisarn@nectec.or.th>
 3 | 
 4 | Current maintainer:
 5 | - Theppitak Karoonboonyanan <theppitak@gmail.com>
 6 | 
 7 | Word list:
 8 | - Taken from ThaiLaTeX project [1], which in turn is taken from LibThai
 9 |   project [2].
10 |     [1] https://linux.thai.net/projects/thailatex
11 |     [2] https://linux.thai.net/projects/libthai
12 | 
13 | 


--------------------------------------------------------------------------------
/data/update-dict.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Update dictionary from thailatex hyphenation source.
 4 | # Usage: update-dict.sh {thailatex-hyphenation-src-dir}
 5 | #
 6 | # Every tdict*.txt found in the given directory is written into the current
 7 | # directory under the same base name, with all '-' characters removed.
 8 | 
 9 | if [ $# -ne 1 ]; then
10 |   echo "Usage: update-dict.sh {thailatex-hyphenation-src-dir}"
11 |   exit 1
12 | fi
13 | 
14 | TL_HYPH_SRC_DIR=$1
15 | 
16 | # Expansions are quoted so that paths containing spaces stay in one piece.
17 | for f in "${TL_HYPH_SRC_DIR}"/tdict*.txt; do
18 |   # An unmatched glob is left as the literal pattern; skip it instead of
19 |   # running sed on a file that does not exist.
20 |   [ -f "$f" ] || continue
21 |   sed 's/-//g' "$f" > "$(basename "$f")"
22 | done
23 | 
24 | 


--------------------------------------------------------------------------------
/tests/mixed-wseg.txt:
--------------------------------------------------------------------------------
1 | เมื่อ|วาน|นี้ (๘ พ.ย. ๒๕๖๐) การ|จัด|ประกวด|ข้าว|ดี|เด่น|โลก หรือ The World's Best Rice 2017 ครั้ง|ที่ 9 ซึ่ง|จัด|โดย|องค์กร The Rice Trader องค์กร|ให้|บริการ|ข้อมูล|ข่าวสาร|เชิง|ลึก|เกี่ยว|กับ|อุตสาหกรรม|ข้าว|โลก โดย|ข้าว|หอม|มะลิ|ไทย ๑๐๕ สามารถ|คว้า|แชมป์|ข้าว|ที่|รสชาติ|ดี|ที่สุด|ใน|โลก|มา|ครอง|อีก ๑ สมัย ขณะ|ที่|ข้าว|หอม|ของ|ประเทศ|กัมพูชา ได้|อันดับ|ที่ ๒ และ|ข้าว ST24 ของ|ประเทศ|เวียดนาม ได้|อันดับ|ที่ ๓
2 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
 1 | // utils.h: utility functions.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __UTILS_H
 6 | #define __UTILS_H
 7 | 
 8 | #define N_ELM(a)  (sizeof (a)/sizeof (a)[0])
 9 | 
10 | template <class T>
11 | inline T min (T a, T b) { return (a < b) ? a : b; }
12 | 
13 | template <class T>
14 | inline T max (T a, T b) { return (a > b) ? a : b; }
15 | 
16 | #endif // __UTILS_H
17 | 


--------------------------------------------------------------------------------
/src/filefilter.h:
--------------------------------------------------------------------------------
 1 | // FileFilter.h: interface for the FileFilter class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __FILEFILTER_H
 6 | #define __FILEFILTER_H
 7 | 
 8 | #include "filterx.h"
 9 | 
10 | class FileFilter
11 | {
12 | public:
13 |   static FilterX* CreateFilter (FILE* filein, FILE* fileout,
14 |                                 bool isUniIn, bool isUniOut,
15 |                                 const char *fileformat);
16 | };
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/src/maxwordseg.h:
--------------------------------------------------------------------------------
 1 | // maxwordseg.h: interface for the MaxWordSeg class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __MAXWORDSEG_H
 6 | #define __MAXWORDSEG_H
 7 | 
 8 | #include "abswordseg.h"
 9 | 
10 | class MaxWordSeg : public AbsWordSeg
11 | {
12 | protected:
13 |   virtual int CreateSentence ();
14 | 
15 | private:
16 |   int saveSegment (int sepIdx, int idxSen, int lastPoint);
17 |   int WordSegArea (int stSeg, int enSeg);
18 | };
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/conv/convfact.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // convfact.h - Text Agent Factory
 3 | // Created: 19 Jan 1999
 4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
 5 | //
 6 | 
 7 | #ifndef __CONVFACT_H
 8 | #define __CONVFACT_H
 9 | 
10 | #include "convkit.h"
11 | 
12 | enum ETextFormat {
13 |     TIS620,
14 |     UTF8,
15 | };
16 | 
17 | TextReader* CreateTextReader (ETextFormat format, const char* inText);
18 | TextWriter* CreateTextWriter (ETextFormat format, char* outText, int outLen);
19 | 
20 | #endif // __CONVFACT_H
21 | 
22 | 


--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Regenerate the build system: libtoolize, aclocal, automake, autoconf.
 4 | 
 5 | # Trace every command (-x) and stop at the first failing step (-e), so a
 6 | # broken bootstrap is not masked by a later tool succeeding.
 7 | set -ex
 8 | 
 9 | # Detect [g]libtoolize
10 | if libtoolize --version > /dev/null 2>&1; then
11 |   LIBTOOLIZE=libtoolize
12 | elif glibtoolize --version > /dev/null 2>&1; then
13 |   LIBTOOLIZE=glibtoolize
14 | else
15 |   echo "**Error**: Missing 'libtoolize'"
16 |   exit 1
17 | fi
18 | 
19 | ${LIBTOOLIZE} --force
20 | aclocal
21 | automake --add-missing
22 | # Use -f so git-version-gen does refresh
23 | autoconf -f
24 | 
25 | 


--------------------------------------------------------------------------------
/INSTALL:
--------------------------------------------------------------------------------
 1 | Basic Installation
 2 | ==================
 3 | 
 4 | # Build requirements:
 5 | # - libtoolize (from package libtool)
 6 | # - libdatrie development package (or https://github.com/tlwg/libdatrie)
 7 | 
 8 | # When downloaded from the GitHub repo (skip for released tarballs)
 9 | ./autogen.sh
10 | 
11 | ./configure
12 | # Or to install elsewhere: ./configure --prefix=...
13 | 
14 | make
15 | 
16 | # Recommended
17 | make check
18 | 
19 | # With elevated permissions if required
20 | make install
21 | 
22 | # In order to uninstall
23 | make uninstall
24 | 
25 | 


--------------------------------------------------------------------------------
/src/filterhtml.h:
--------------------------------------------------------------------------------
 1 | // FilterHtml.h: interface for the FilterHtml class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __FILTERHTML_H
 6 | #define __FILTERHTML_H
 7 | 
 8 | #include "filterx.h"
 9 | 
10 | class FilterHtml : public FilterX
11 | {
12 | public:
13 |   FilterHtml (FILE* filein, FILE* fileout, bool isUniIn, bool isUniOut);
14 | 
15 |   bool GetNextToken (wchar_t* token, int tokenSz, bool* thaiFlag);
16 |   void Print (const wchar_t* token, bool thaiFlag);
17 | 
18 | private:
19 |   wchar_t chbuff;
20 | };
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/src/filterlambda.h:
--------------------------------------------------------------------------------
 1 | // filterlambda.h: interface for the filterlambda class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __FILTERLAMBDA_H
 6 | #define __FILTERLAMBDA_H
 7 | 
 8 | #include "filterlatex.h"
 9 | 
10 | class FilterLambda : public FilterLatex
11 | {
12 | public:
13 |   FilterLambda (FILE* filein, FILE* fileout, bool isUniIn);
14 | };
15 | 
16 | inline
17 | FilterLambda::FilterLambda (FILE* filein, FILE* fileout, bool isUniIn)
18 |   : FilterLatex (filein, fileout, isUniIn, true, L"^^^^200b")
19 | {
20 | }
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/tests/test-long.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | 
 5 | 
 6 | echo -n "Testing maximal matching scheme... "
 7 | 
 8 | ${SWATH} -m max -d ${DICTDIR} -u u,u < ${srcdir}/long.txt > long-out.txt
 9 | 
10 | if ! cmp long-out.txt ${srcdir}/long-wseg.txt >/dev/null; then
11 |   echo "FAIL!"
12 |   exit 1
13 | fi
14 | echo "OK"
15 | 
16 | 
17 | echo -n "Testing longest matching scheme... "
18 | 
19 | ${SWATH} -m long -d ${DICTDIR} -u u,u < ${srcdir}/long.txt > long-out.txt
20 | 
21 | if ! cmp long-out.txt ${srcdir}/long-wseg.txt >/dev/null; then
22 |   echo "FAIL!"
23 |   exit 1
24 | fi
25 | echo "OK"
26 | 
27 | 


--------------------------------------------------------------------------------
/data/tdict-collection.txt:
--------------------------------------------------------------------------------
 1 | มกรา
 2 | กุมภา
 3 | มีนา
 4 | เมษา
 5 | พฤษภา
 6 | มิถุนา
 7 | กรกฎา
 8 | สิงหา
 9 | กันยา
10 | ตุลา
11 | พฤศจิกา
12 | ธันวา
13 | เอ
14 | บี
15 | ซี
16 | ดี
17 | อี
18 | เอฟ
19 | จี
20 | เอช
21 | ไอ
22 | เจ
23 | เค
24 | แอล
25 | เอ็ม
26 | เอ็น
27 | โอ
28 | พี
29 | คิว
30 | อาร์
31 | เอส
32 | ที
33 | ยู
34 | วี
35 | ดับเบิล
36 | ดับบลิว
37 | เอ็กซ์
38 | เอ๊กซ์
39 | วาย
40 | แซด
41 | แอลฟา
42 | แอลฟ่า
43 | เบตา
44 | เบต้า
45 | แกมมา
46 | แกมม่า
47 | เดลตา
48 | เดลต้า
49 | โอเมกา
50 | โอเมก้า
51 | เมกะ
52 | กิกะ
53 | นาโน
54 | ไมโคร
55 | โด
56 | เร
57 | มี
58 | ฟา
59 | ซอล
60 | ลา
61 | ที
62 | 


--------------------------------------------------------------------------------
/tests/test-mixed.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | 
 5 | 
 6 | echo -n "Testing maximal matching scheme... "
 7 | 
 8 | ${SWATH} -m max -d ${DICTDIR} -u u,u < ${srcdir}/mixed.txt > mixed-out.txt
 9 | 
10 | if ! cmp mixed-out.txt ${srcdir}/mixed-wseg.txt >/dev/null; then
11 |   echo "FAIL!"
12 |   exit 1
13 | fi
14 | echo "OK"
15 | 
16 | 
17 | echo -n "Testing longest matching scheme... "
18 | 
19 | ${SWATH} -m long -d ${DICTDIR} -u u,u < ${srcdir}/mixed.txt > mixed-out.txt
20 | 
21 | if ! cmp mixed-out.txt ${srcdir}/mixed-wseg.txt >/dev/null; then
22 |   echo "FAIL!"
23 |   exit 1
24 | fi
25 | echo "OK"
26 | 
27 | 


--------------------------------------------------------------------------------
/conv/tis620.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // tis620.h - TIS-620 Agents
 3 | // Created: 19 Jan 1999
 4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
 5 | //
 6 | 
 7 | #ifndef __TIS620_H
 8 | #define __TIS620_H
 9 | 
10 | #include "convkit.h"
11 | 
12 | class TIS620Reader : public TextReader {
13 | public:
14 |     TIS620Reader(const char*inText)
15 |         : TextReader (inText) {}
16 | 
17 |     bool Read(unichar& c);
18 | };
19 | 
20 | class TIS620Writer : public TextWriter {
21 | public:
22 |     TIS620Writer(char* outText, int outLen)
23 |         : TextWriter (outText, outLen) {}
24 | 
25 |     bool Write(unichar c);
26 | };
27 | 
28 | #endif // __TIS620_H
29 | 
30 | 


--------------------------------------------------------------------------------
/tests/test-latex.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | 
 5 | echo -n "Testing maximal matching scheme... "
 6 | 
 7 | ${SWATH} -m max -d ${DICTDIR} -f latex -u u,u < ${srcdir}/thai-latex.tex \
 8 |   > thai-latex-out.tex
 9 | 
10 | if ! cmp thai-latex-out.tex ${srcdir}/thai-latex-wseg.tex >/dev/null; then
11 |   echo "FAIL!"
12 |   exit 1
13 | fi
14 | echo "OK"
15 | 
16 | 
17 | echo -n "Testing longest matching scheme... "
18 | 
19 | ${SWATH} -m long -d ${DICTDIR} -f latex -u u,u < ${srcdir}/thai-latex.tex \
20 |   > thai-latex-out.tex
21 | 
22 | if ! cmp thai-latex-out.tex ${srcdir}/thai-latex-wseg.tex >/dev/null; then
23 |   echo "FAIL!"
24 |   exit 1
25 | fi
26 | echo "OK"
27 | 
28 | 


--------------------------------------------------------------------------------
/tests/test-simple.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | 
 5 | # test_wseg SCHEME INPUT EXPECT
 6 | #
 7 | # Feed INPUT (without a trailing newline) to ${SWATH} using matching scheme
 8 | # SCHEME and the dictionary directory ${DICTDIR}, then compare what it prints
 9 | # with EXPECT.  Prints "OK" and returns 0 on a match, prints "FAIL!" and
10 | # returns 1 otherwise.
11 | test_wseg()
12 | {
13 |   SCHEME=$1
14 |   INPUT=$2
15 |   EXPECT=$3
16 | 
17 |   # printf instead of "echo -n": POSIX leaves echo's handling of -n
18 |   # implementation-defined, so some shells would emit a literal "-n",
19 |   # which would also end up in the text piped to ${SWATH} below.
20 |   printf 'Testing %s scheme (%s)... ' "${SCHEME}" "${INPUT}"
21 | 
22 |   RES=$(printf '%s' "${INPUT}" | ${SWATH} -m "${SCHEME}" -d "${DICTDIR}" -u u,u)
23 | 
24 |   if test "x${RES}" != "x${EXPECT}"; then
25 |     echo "FAIL!"
26 |     return 1
27 |   fi
28 |   echo "OK"
29 | 
30 |   return 0
31 | }
32 | 
33 | INPUT="ทดสอบการตัดคำอย่างง่าย"
34 | EXPECT="ทดสอบ|การ|ตัด|คำ|อย่าง|ง่าย"
35 | 
36 | # Quote the expansions so each value is passed as exactly one argument.
37 | test_wseg long "${INPUT}" "${EXPECT}"
38 | test_wseg max "${INPUT}" "${EXPECT}"
39 | 
40 | test_wseg long "ไปหามเหสี" "ไป|หาม|เห|สี"
41 | test_wseg max "ไปหามเหสี" "ไป|หา|มเหสี"
32 | 


--------------------------------------------------------------------------------
/conv/utf8.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // utf8.h - UTF-8 Agents
 3 | // Created: 19 Jan 1999
 4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
 5 | //
 6 | 
 7 | #ifndef __UTF8_H
 8 | #define __UTF8_H
 9 | 
10 | #include "convkit.h"
11 | 
12 | class UTF8Reader : public TextReader {
13 | public:
14 |     UTF8Reader(const char* inText)
15 |         : TextReader (inText) {}
16 | 
17 |     bool Read(unichar& c);
18 | };
19 | 
20 | class UTF8Writer : public TextWriter {
21 | public:
22 |     UTF8Writer(char* outText, int outLen)
23 |         : TextWriter (outText, outLen) {}
24 | 
25 |     bool Write(unichar c);
26 | };
27 | 
28 | int UTF8Bytes (unichar c);
29 | 
30 | #endif  // __UTF8_H
31 | 
32 | 


--------------------------------------------------------------------------------
/data/tdict-history.txt:
--------------------------------------------------------------------------------
 1 | กบิลพัสดุ์
 2 | กาวิละ
 3 | กุสินารา
 4 | โกลิยะ
 5 | โกสัมพี
 6 | คิชฌกูฏ
 7 | โคตรบอง
 8 | โคตรบูร
 9 | โคตรบูรณ์
10 | เจนละ
11 | เจินละ
12 | โชฎึก
13 | ดองซอน
14 | ตองอู
15 | ทรอย
16 | ทวารวดี
17 | ทวาราวดี
18 | ทุเรียง
19 | เทวทหะ
20 | ไทรบุรี
21 | น่านเจ้า
22 | นาลันทา
23 | บาบิโลน
24 | บาบิโลเนีย
25 | ไบแซนไทน์
26 | ปโตเลมี
27 | ปรัสเซีย
28 | ปัลลวะ
29 | ปาฏลีบุตร
30 | พุทธคยา
31 | ฟูนัน
32 | มถุรา
33 | มายัน
34 | มิถิลา
35 | โมกุล
36 | ราชคฤห์
37 | ล้านช้าง
38 | ล้านนา
39 | ลุมพินี
40 | ลูแบร์
41 | วัชชี
42 | เวสาลี
43 | ศรีวิชัย
44 | สาเกด
45 | สาเกต
46 | สารนาถ
47 | สาวัตถี
48 | สุเมเรียน
49 | หริภุญชัย
50 | อโยธยา
51 | ออตโตมัน
52 | อังวะ
53 | อินทปัตถ์
54 | อุชเชนี
55 | 


--------------------------------------------------------------------------------
/src/filterlatex.h:
--------------------------------------------------------------------------------
 1 | // FilterLatex.h: interface for the FilterLatex class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __FILTERLATEX_H
 6 | #define __FILTERLATEX_H
 7 | 
 8 | #include "worddef.h"
 9 | #include "filterx.h"
10 | 
11 | 
12 | class FilterLatex : public FilterX
13 | {
14 | public:
15 |   FilterLatex (FILE* filein, FILE* fileout, bool isUniIn, bool isUniOut,
16 |                const wchar_t* wordBreakStr = L"{\\wbr}");
17 | 
18 |   bool GetNextToken (wchar_t* token, int tokenSz, bool* thaiFlag);
19 |   void Print (const wchar_t* token, bool thaiFlag);
20 | 
21 | private:
22 |   wchar_t buffer[MAXLEN];
23 |   bool verbatim;
24 | };
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/tests/test-utf8-wbr.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | #
 4 | # Test of UTF-8 word delimiter code
 5 | # Ref: https://github.com/tlwg/swath/issues/1
 6 | #
 7 | 
 8 | set -e
 9 | 
10 | INPUT="ผมมีแฟนแล้วนะครับ"
11 | EXPECT="ผม​มี​แฟน​แล้ว​นะ​ครับ"
12 | 
13 | echo -n "Testing maximal matching scheme... "
14 | 
15 | RES=$(echo -n "${INPUT}" | ${SWATH} -m max -d ${DICTDIR} -u u,u -b "​")
16 | 
17 | if test "x${RES}" != "x${EXPECT}"; then
18 |   echo "FAIL!"
19 |   exit 1
20 | fi
21 | echo "OK"
22 | 
23 | 
24 | echo -n "Testing longest matching scheme... "
25 | 
26 | RES=$(echo -n "${INPUT}" | ${SWATH} -m long -d ${DICTDIR} -u u,u -b "​")
27 | 
28 | if test "x${RES}" != "x${EXPECT}"; then
29 |   echo "FAIL!"
30 |   exit 1
31 | fi
32 | echo "OK"
33 | 
34 | 


--------------------------------------------------------------------------------
/tests/thai-latex.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[a4paper]{article}
 2 | \usepackage[utf8x]{inputenc}
 3 | \usepackage[thai]{babel}
 4 | \usepackage{fonts-tlwg}
 5 | 
 6 | \title{เอกสาร \LaTeX{} ภาษาไทย}
 7 | \author{เพื่อการทดสอบ}
 8 | \date{}
 9 | 
10 | \begin{document}
11 | \maketitle
12 | 
13 | กาลครั้งหนึ่งนานมาแล้ว ยังมีกระต่ายตัวหนึ่งวิ่งเล่นอยู่ในป่า ไปพบเต่าตัวหนึ่งเดินต้วมเตี้ยมอยู่
14 | รู้สึกรำคาญในความเชื่องช้าของเต่า จึงท้าเต่าวิ่งแข่งกัน โดยต่อให้เต่าซึ่งเชื่องช้าวิ่งนำหน้าไปก่อน
15 | 
16 | กระต่ายออกวิ่งทีหลังก็แซงหน้าเต่าอย่างรวดเร็ว มองกลับหลังมาดูก็ไม่เห็นเต่าวิ่งตามมา
17 | จึงหยุดพักใต้ต้นไม้ใหญ่ต้นหนึ่ง ลมโชยพัดเอื่อย~ๆ ทำให้กระต่ายผล็อยหลับไป
18 | จนกระทั่งพลบค่ำจึงรู้สึกตัวตื่นขึ้นมาพบว่าเต่าเข้าเส้นชัยไปนานแล้ว
19 | \end{document}
20 | 


--------------------------------------------------------------------------------
/data/tdict-lang-ethnic.txt:
--------------------------------------------------------------------------------
 1 | กรีก
 2 | กัณณาฑ
 3 | กัศมีรี
 4 | กุรข่า
 5 | กูรข่า
 6 | คันจิ
 7 | คาตาคานะ
 8 | คุชราตี
 9 | คุรุมุขี
10 | เคิร์ด
11 | ซาไก
12 | ซีริลลิก
13 | ซูลู
14 | เซลติก
15 | เซิร์บ
16 | แซกซอน
17 | ตากาล็อก
18 | เตลุคู
19 | เติร์ก
20 | ทราวิฑ
21 | นอร์ส
22 | บาสก์
23 | เบงกาลี
24 | ปัญจาบี
25 | ปาตานี
26 | พินอิน
27 | มลยาฬัม
28 | มองโกล
29 | มาคธี
30 | มาราฐี
31 | มาเลย์
32 | เม็กซิกัน
33 | แมนจู
34 | แมนดาริน
35 | ไมถิลี
36 | เยอรมัน
37 | รัสเซียน
38 | โรฮิงญา
39 | โรฮิงยา
40 | โรฮีนจา
41 | สก็อต
42 | สวาฮิลี
43 | สวิส
44 | สินธี
45 | อูรดู
46 | อะบอริจิน
47 | อัฟกัน
48 | อัสสมี
49 | อาข่า
50 | อารบิก
51 | อิตาเลียน
52 | อุยกูร์
53 | เอเชียน
54 | แอฟริกัน
55 | โอริยา
56 | ไอยคุปต์
57 | ไอริช
58 | ฮันกึล
59 | ฮากกา
60 | ฮินดี
61 | ฮิรางานะ
62 | ฮีบรู
63 | 


--------------------------------------------------------------------------------
/src/convutil.h:
--------------------------------------------------------------------------------
 1 | // convutil.h - Utility functions for character encoding conversion
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __CONVUTIL_H
 6 | #define __CONVUTIL_H
 7 | 
 8 | #include <stdio.h>
 9 | #include <wchar.h>
10 | 
11 | bool ConvGetS (wchar_t* buffer, int buffSz, FILE* fpin, bool isUniIn);
12 | int  ConvGetC (FILE* fpin, bool isUniIn);
13 | 
14 | bool ConvPrint (FILE* fpout, const wchar_t* wcs, bool isUniOut);
15 | wchar_t* ConvStrDup (const char* s, bool isUniIn);
16 | 
17 | wchar_t* Ascii2WcsCopy (wchar_t* dst, const char* src);
18 | wchar_t* Ascii2WcsCat (wchar_t* dst, const char* src);
19 | wchar_t* Ascii2WcsNCopy (wchar_t* dst, const char* src, int n);
20 | wchar_t* Ascii2WcsNCat (wchar_t* dst, const char* src, int n);
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/conv/convfact.cxx:
--------------------------------------------------------------------------------
 1 | //
 2 | // convfact.cxx - Text Agent Factory
 3 | // Created: 19 Jan 1999
 4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
 5 | //
 6 | 
 7 | #include "convfact.h"
 8 | #include "utf8.h"
 9 | #include "tis620.h"
10 | 
11 | // Build a reader for `format` over inText; returns 0 for an unknown format.
12 | TextReader* CreateTextReader (ETextFormat format, const char* inText)
13 | {
14 |     if (format == TIS620)
15 |         return new TIS620Reader (inText);
16 |     if (format == UTF8)
17 |         return new UTF8Reader (inText);
18 |     return 0;
19 | }
20 | 
21 | // Build a writer for `format` into outText/outLen; returns 0 if unknown.
22 | TextWriter* CreateTextWriter (ETextFormat format, char* outText, int outLen)
23 | {
24 |     if (format == TIS620)
25 |         return new TIS620Writer (outText, outLen);
26 |     if (format == UTF8)
27 |         return new UTF8Writer (outText, outLen);
28 |     return 0;
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/tests/Makefile.am:
--------------------------------------------------------------------------------
 1 | # Scripts exercised by "make check".
 2 | check_SCRIPTS = \
 3 | 	test-simple.sh \
 4 | 	test-long.sh \
 5 | 	test-mixed.sh \
 6 | 	test-utf8-wbr.sh \
 7 | 	test-latex.sh \
 8 | 	test-rtf.sh \
 9 | 	$(NULL)
10 | 
11 | # Every check script is also a test case.
12 | TESTS = $(check_SCRIPTS)
13 | 
14 | # The test scripts read the binary and dictionary locations from these
15 | # exported variables.
16 | AM_TESTS_ENVIRONMENT = \
17 | 	DICTDIR=$(top_builddir)/data; \
18 | 	SWATH=$(top_builddir)/src/swath; \
19 | 	export DICTDIR SWATH; \
20 | 	$(NULL)
21 | 
22 | # Scripts plus their input and expected-output files, shipped in the tarball.
23 | EXTRA_DIST = \
24 | 	$(check_SCRIPTS) \
25 | 	long.txt \
26 | 	long-wseg.txt \
27 | 	mixed.txt \
28 | 	mixed-wseg.txt \
29 | 	thai-latex.tex \
30 | 	thai-latex-wseg.tex \
31 | 	petavatthu1.rtf \
32 | 	petavatthu1-wseg.rtf \
33 | 	$(NULL)
34 | 
35 | # Output files written by the test scripts.
36 | CLEANFILES = \
37 | 	long-out.txt \
38 | 	mixed-out.txt \
39 | 	thai-latex-out.tex \
40 | 	petavatthu1-out.rtf \
41 | 	$(NULL)
42 | 


--------------------------------------------------------------------------------
/data/tdict-geo.txt:
--------------------------------------------------------------------------------
 1 | คอเคซัส
 2 | คิลิมันจาโร
 3 | คิวชู
 4 | แคริบเบียน
 5 | แคสเปียน
 6 | ไครเมีย
 7 | ซาฮารา
 8 | ซาฮาร่า
 9 | ดานูบ
10 | ตะนาวศรี
11 | ตังเกี๋ย
12 | ไทกริส
13 | นอร์วีเจียน
14 | นิโคบาร์
15 | เนรัญชรา
16 | ไนล์
17 | บอร์เนียว
18 | บอลติก
19 | เบงกอล
20 | ปิง
21 | แปซิฟิก
22 | แปซิฟิค
23 | ฟอล์คแลนด์
24 | มะละกา
25 | มินดาเนา
26 | มิสซิสซิปปี
27 | เมดิเตอร์เรเนียน
28 | เมดิเตอเรเนียน
29 | เมโสโปเตเมีย
30 | ยมุนา
31 | ยุโรป
32 | ยูเฟรติส
33 | ยูเรเชีย
34 | ยูเรเซีย
35 | แยงซี
36 | แยงซีเกียง
37 | ลังกาวี
38 | สแกนดิเนเวีย
39 | สะโตง
40 | สาละวิน
41 | สุมาตรา
42 | สุเอซ
43 | อะเมซอน
44 | อันดามัน
45 | อัลไต
46 | อาร์กติก
47 | อาหรับ
48 | อินโดจีน
49 | อิรวดี
50 | อิระวดี
51 | อีเจียน
52 | อุษาคเนย์
53 | อูราล
54 | เอเชีย
55 | เอเดรียติก
56 | เอเวอเรสต์
57 | แอตแลนติก
58 | แอตแลนติส
59 | แอนตาร์กติก
60 | แอนตาร์กติกา
61 | แอฟริกา
62 | แอลป์
63 | โอเชียเนีย
64 | โอลิมปัส
65 | ไอโอเนียน
66 | ฮวงโห
67 | ฮอกไกโด
68 | ฮอนชู
69 | ฮอร์มุซ
70 | 


--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
 1 | MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
 2 | 
 3 | AUTOMAKE_OPTIONS = subdir-objects
 4 | 
 5 | AM_CPPFLAGS = -I$(top_srcdir) $(DATRIE_CFLAGS)
 6 | DEFS = -DWORDSEGDATA_DIR=\"${pkgdatadir}\" -DVERSION=\"$(VERSION)\"
 7 | 
 8 | EXTRA_DIST = \
 9 | 	abswordseg.h \
10 | 	convutil.h \
11 | 	dict.h \
12 | 	filefilter.h \
13 | 	filterhtml.h \
14 | 	filterlambda.h \
15 | 	filterlatex.h \
16 | 	filterrtf.h \
17 | 	filterx.h \
18 | 	longwordseg.h \
19 | 	maxwordseg.h \
20 | 	utils.h \
21 | 	worddef.h \
22 | 	wordstack.h \
23 | 	swath.1
24 | 
25 | bin_PROGRAMS = swath
26 | 
27 | swath_SOURCES = \
28 | 	dict.cpp \
29 | 	abswordseg.cpp \
30 | 	convutil.cpp \
31 | 	filefilter.cpp \
32 | 	filterhtml.cpp \
33 | 	filterlatex.cpp \
34 | 	filterrtf.cpp \
35 | 	longwordseg.cpp \
36 | 	maxwordseg.cpp \
37 | 	wordseg.cpp
38 | 
39 | man_MANS = swath.1
40 | 
41 | swath_LDADD = \
42 | 	$(top_builddir)/conv/libconv.la \
43 | 	$(DATRIE_LIBS) \
44 | 	$(LTLIBOBJS)
45 | 
46 | 


--------------------------------------------------------------------------------
/data/Makefile.am:
--------------------------------------------------------------------------------
 1 | MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
 2 | CLEANFILES = swathdic.lst swathdic.tri
 3 | 
 4 | # Word-list sources that are merged into swathdic.lst.
 5 | DICT_SRC =  \
 6 | 	tdict-city.txt \
 7 | 	tdict-collection.txt \
 8 | 	tdict-common.txt \
 9 | 	tdict-country.txt \
10 | 	tdict-district.txt \
11 | 	tdict-geo.txt \
12 | 	tdict-history.txt \
13 | 	tdict-ict.txt \
14 | 	tdict-lang-ethnic.txt \
15 | 	tdict-proper.txt \
16 | 	tdict-science.txt \
17 | 	tdict-slang.txt \
18 | 	tdict-spell.txt \
19 | 	tdict-std-compound.txt \
20 | 	tdict-std.txt
21 | 
22 | EXTRA_DIST = \
23 | 	update-dict.sh \
24 | 	swathdic.abm \
25 | 	$(DICT_SRC)
26 | 
27 | # The compiled trie is built and installed only when configure was run
28 | # with dictionary generation enabled (the default).
29 | if ENABLE_DICT
30 | pkgdata_DATA = \
31 | 	swathdic.tri
32 | else
33 | pkgdata_DATA =
34 | endif
35 | 
36 | # Merge all word lists into one sorted, duplicate-free list.  sort reads
37 | # the files itself instead of a "cat |" pipeline, so an unreadable source
38 | # fails the rule (a pipeline reports only sort's exit status) and a list
39 | # without a final newline cannot glue its last word onto the first word
40 | # of the next list.
41 | swathdic.lst: ${DICT_SRC}
42 | 	LC_ALL=C sort -u $^ > $@
43 | 
44 | # Compile the word list into swathdic.tri.  swathdic.abm is first made
45 | # available in the build directory when it is not already there (builds
46 | # outside the source tree) -- presumably because trietool looks for
47 | # <name>.abm in the working directory; confirm against trietool's manual.
48 | # $(LN_S) is the link command found by AC_PROG_LN_S in configure.ac.
49 | # NOTE(review): the linked swathdic.abm is not listed in CLEANFILES.
50 | swathdic.tri: swathdic.lst
51 | 	if test ! -e swathdic.abm; then \
52 | 	  $(LN_S) $(srcdir)/swathdic.abm . ; \
53 | 	fi
54 | 	rm -f swathdic.tri
55 | 	$(TRIETOOL) swathdic add-list -e utf-8 swathdic.lst
56 | 


--------------------------------------------------------------------------------
/src/filefilter.cpp:
--------------------------------------------------------------------------------
 1 | // FileFilter.cpp: implementation of the FileFilter class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #include <string.h>
 6 | #include "filefilter.h"
 7 | 
 8 | #include "filterhtml.h"
 9 | #include "filterrtf.h"
10 | #include "filterlatex.h"
11 | #include "filterlambda.h"
12 | 
13 | // Create the filter that handles the requested document format.
14 | //
15 | // fileformat is compared case-sensitively with "html", "latex", "lambda"
16 | // and "rtf".  Returns a filter allocated with new, or NULL when
17 | // fileformat is NULL or names none of those formats.
18 | // FilterLambda is constructed without isUniOut; the others get both flags.
19 | FilterX*
20 | FileFilter::CreateFilter (FILE* filein, FILE* fileout,
21 |                           bool isUniIn, bool isUniOut,
22 |                           const char* fileformat)
23 | {
24 |   // strcmp() on a null pointer is undefined behaviour; treat a missing
25 |   // format name like an unknown one.
26 |   if (fileformat == NULL)
27 |     return NULL;
28 | 
29 |   if (strcmp (fileformat, "html") == 0)
30 |     return new FilterHtml (filein, fileout, isUniIn, isUniOut);
31 |   else if (strcmp (fileformat, "latex") == 0)
32 |     return new FilterLatex (filein, fileout, isUniIn, isUniOut);
33 |   else if (strcmp (fileformat, "lambda") == 0)
34 |     return new FilterLambda (filein, fileout, isUniIn);
35 |   else if (strcmp (fileformat, "rtf") == 0)
36 |     return new FilterRTF (filein, fileout, isUniIn, isUniOut);
37 |   else
38 |     return NULL;
39 | }
29 | 


--------------------------------------------------------------------------------
/src/wordstack.h:
--------------------------------------------------------------------------------
 1 | #ifndef __WORDSTACK_H
 2 | #define __WORDSTACK_H
 3 | 
 4 | #include "worddef.h"
 5 | 
 6 | #define STACKSIZE 500
 7 | 
 8 | // Fixed-capacity LIFO of WordState records (at most STACKSIZE entries).
 9 | class WordStack
10 | {
11 | public:
12 |   WordStack ();
13 | 
14 |   bool Push (WordState w);    // false when the stack is already full
15 |   bool Pop ();                // false when there is nothing to remove
16 |   WordState Top () const;     // WordState (-1, 0) when the stack is empty
17 |   bool Empty () const;
18 | 
19 | private:
20 |   WordState stack[STACKSIZE];
21 |   short int top_idx;          // index of the top entry, -1 when empty
22 | };
23 | 
24 | inline
25 | WordStack::WordStack ()
26 |   : top_idx (-1)
27 | {
28 | }
29 | 
30 | inline bool
31 | WordStack::Push (WordState w)
32 | {
33 |   if (top_idx >= STACKSIZE - 1)
34 |     return false;
35 | 
36 |   ++top_idx;
37 |   stack[top_idx] = w;
38 |   return true;
39 | }
40 | 
41 | inline bool
42 | WordStack::Pop ()
43 | {
44 |   if (top_idx < 0)
45 |     return false;
46 | 
47 |   --top_idx;
48 |   return true;
49 | }
50 | 
51 | inline bool
52 | WordStack::Empty () const
53 | {
54 |   return !(top_idx >= 0);
55 | }
56 | 
57 | inline WordState
58 | WordStack::Top () const
59 | {
60 |   if (top_idx < 0)
61 |     return WordState (-1, 0);
62 |   return stack[top_idx];
63 | }
62 | 
63 | #endif
64 | 


--------------------------------------------------------------------------------
/src/filterrtf.h:
--------------------------------------------------------------------------------
 1 | // FilterRTF.h: interface for the FilterRTF class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __FILTERRTF_H
 6 | #define __FILTERRTF_H
 7 | 
 8 | #include "filterx.h"
 9 | 
10 | // Private implementation class
11 | class RTFToken;
12 | 
13 | class FilterRTF : public FilterX   // FilterX implementation for RTF documents
14 | {
15 | public:
16 |   FilterRTF (FILE* filein, FILE* fileout, bool isUniIn, bool isUniOut);
17 |   // Implementations of the FilterX token interface.
18 |   bool GetNextToken (wchar_t* token, int tokenSz, bool* thaiFlag);
19 |   void Print (const wchar_t* token, bool thaiFlag);
20 | 
21 | private:
22 |   enum ECharState   // states taken and returned by chgCharState()
23 |   {
24 |     CS_START,
25 |     CS_ESCAPE,
26 |     CS_CH_BYTE,
27 |     CS_UNI,
28 |     CS_UNI_COUNT,
29 |     CS_UNI_CODE
30 |   };
31 | 
32 | private:
33 |   static ECharState chgCharState (ECharState state, char charIn,   // returns the next state for charIn
34 |                                   bool* charConsumed, RTFToken* rtfToken);
35 | 
36 | private:
37 |   void  printNonThai (wchar_t wc);
38 | 
39 | private:
40 |   ECharState psState;
41 |   wchar_t    strbuff[40];
42 |   int  curUTFReadBytes;    // NOTE(review): meaning not visible in this header -- see filterrtf.cpp
43 |   int  curUTFWriteBytes;   // NOTE(review): meaning not visible in this header -- see filterrtf.cpp
44 | };
45 | 
46 | #endif
47 | 


--------------------------------------------------------------------------------
/conv/tis620.cxx:
--------------------------------------------------------------------------------
 1 | //
 2 | // tis620.cc - TIS-620 Agents
 3 | // Created: 19 Jan 1999
 4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
 5 | //
 6 | 
 7 | #include "tis620.h"
 8 | #include "tischar.h"
 9 | 
10 | // TIS-620 byte -> Unicode; BAD_WCHAR outside ASCII and the two Thai ranges.
11 | static unichar tis2unicode(tischar c)
12 | {
13 |     if (c < 0x80)
14 |         return c;
15 | 
16 |     const bool inFirstRange  = (c >= 0xA1 && c <= 0xDA);
17 |     const bool inSecondRange = (c >= 0xDF && c <= 0xFB);
18 |     if (!(inFirstRange || inSecondRange))
19 |         return BAD_WCHAR;
20 | 
21 |     return 0x0E00 + c - 0xA0;
22 | }
23 | 
24 | // Unicode -> TIS-620 byte; BAD_TISCHAR outside ASCII and the two Thai ranges.
25 | static tischar unicode2tis(unichar u)
26 | {
27 |     if (u < 0x0080)
28 |         return u;
29 | 
30 |     const bool inFirstRange  = (u >= 0x0E01 && u <= 0x0E3A);
31 |     const bool inSecondRange = (u >= 0x0E3F && u <= 0x0E5B);
32 |     if (!(inFirstRange || inSecondRange))
33 |         return BAD_TISCHAR;
34 | 
35 |     return 0xA0 + u - 0x0E00;
36 | }
37 | 
38 | // Fetch the next byte and decode it into c; a zero byte ends the input.
39 | bool TIS620Reader::Read(unichar& c)
40 | {
41 |     const tischar t = tischar (getChar());
42 |     if (t)
43 |         c = tis2unicode (t);
44 |     return t ? true : false;
45 | }
45 | 
46 | // Encode c with unicode2tis() and hand the byte to writeChar().
47 | bool TIS620Writer::Write(unichar c)
48 | {
49 |     const tischar encoded = unicode2tis (c);
50 |     return writeChar (encoded);
51 | }
50 | 
51 | 


--------------------------------------------------------------------------------
/src/dict.cpp:
--------------------------------------------------------------------------------
 1 | //
 2 | // dict.cpp - dictionary class
 3 | // Created: 2017-11-08
 4 | // Author: Theppitak Karoonboonyanan
 5 | //
 6 | 
 7 | #include "dict.h"
 8 | 
 9 | #include <stdio.h>
10 | #include <string.h>
11 | #include <sys/stat.h>
12 | 
13 | #ifdef WIN32
14 | #define PATH_SEP "\\"
15 | #else
16 | #define PATH_SEP "/"
17 | #endif
18 | 
19 | #define DICT_FILENAME "swathdic.tri"
20 | 
21 | //
22 | // class Dict
23 | //
24 | 
25 | // Load the trie from dictPath, which may be a directory (DICT_FILENAME
26 | // inside it is used) or a regular file.  Reports the problem on stderr
27 | // and returns false when the path cannot be stat()ed or is of another
28 | // kind; otherwise returns whether trie_new_from_file() produced a trie.
29 | bool
30 | Dict::open (const char* dictPath)
31 | {
32 |   struct stat info;
33 | 
34 |   if (stat (dictPath, &info) == -1)
35 |     {
36 |       perror (dictPath);
37 |       return false;
38 |     }
39 | 
40 |   if (S_ISREG (info.st_mode))
41 |     {
42 |       dict = trie_new_from_file (dictPath);
43 |       return dict != 0;
44 |     }
45 | 
46 |   if (!S_ISDIR (info.st_mode))
47 |     {
48 |       fprintf (stderr, "%s is not a directory or regular file\n", dictPath);
49 |       return false;
50 |     }
51 | 
52 |   // Directory: build "<dictPath><PATH_SEP><DICT_FILENAME>".
53 |   char* joined = new char[strlen (dictPath) + 2 + sizeof (DICT_FILENAME)];
54 |   sprintf (joined, "%s" PATH_SEP DICT_FILENAME, dictPath);
55 |   dict = trie_new_from_file (joined);
56 |   delete[] joined;
57 | 
58 |   return dict != 0;
59 | }
63 | 
64 | 


--------------------------------------------------------------------------------
/tests/thai-latex-wseg.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[a4paper]{article}
 2 | \usepackage[utf8x]{inputenc}
 3 | \usepackage[thai]{babel}
 4 | \usepackage{fonts-tlwg}
 5 | 
 6 | \title{เอกสาร \LaTeX{} ภาษา{\wbr}ไทย}
 7 | \author{เพื่อ{\wbr}การ{\wbr}ทดสอบ}
 8 | \date{}
 9 | 
10 | \begin{document}
11 | \maketitle
12 | 
13 | กาล{\wbr}ครั้ง{\wbr}หนึ่ง{\wbr}นาน{\wbr}มา{\wbr}แล้ว ยัง{\wbr}มี{\wbr}กระต่าย{\wbr}ตัว{\wbr}หนึ่ง{\wbr}วิ่ง{\wbr}เล่น{\wbr}อยู่{\wbr}ใน{\wbr}ป่า ไป{\wbr}พบ{\wbr}เต่า{\wbr}ตัว{\wbr}หนึ่ง{\wbr}เดิน{\wbr}ต้วมเตี้ยม{\wbr}อยู่{\wbr}
14 | รู้สึก{\wbr}รำคาญ{\wbr}ใน{\wbr}ความ{\wbr}เชื่องช้า{\wbr}ของ{\wbr}เต่า จึง{\wbr}ท้า{\wbr}เต่า{\wbr}วิ่ง{\wbr}แข่ง{\wbr}กัน โดย{\wbr}ต่อ{\wbr}ให้{\wbr}เต่า{\wbr}ซึ่ง{\wbr}เชื่องช้า{\wbr}วิ่ง{\wbr}นำ{\wbr}หน้า{\wbr}ไป{\wbr}ก่อน{\wbr}
15 | 
16 | กระต่าย{\wbr}ออก{\wbr}วิ่ง{\wbr}ทีหลัง{\wbr}ก็{\wbr}แซง{\wbr}หน้า{\wbr}เต่า{\wbr}อย่าง{\wbr}รวดเร็ว มอง{\wbr}กลับ{\wbr}หลัง{\wbr}มา{\wbr}ดู{\wbr}ก็{\wbr}ไม่{\wbr}เห็น{\wbr}เต่า{\wbr}วิ่ง{\wbr}ตาม{\wbr}มา{\wbr}
17 | จึง{\wbr}หยุด{\wbr}พัก{\wbr}ใต้{\wbr}ต้นไม้{\wbr}ใหญ่{\wbr}ต้น{\wbr}หนึ่ง ลม{\wbr}โชย{\wbr}พัด{\wbr}เอื่อย~{\wbr}ๆ ทำให้{\wbr}กระต่าย{\wbr}ผล็อย{\wbr}หลับ{\wbr}ไป{\wbr}
18 | จน{\wbr}กระทั่ง{\wbr}พลบค่ำ{\wbr}จึง{\wbr}รู้สึก{\wbr}ตัว{\wbr}ตื่น{\wbr}ขึ้น{\wbr}มา{\wbr}พบ{\wbr}ว่า{\wbr}เต่า{\wbr}เข้า{\wbr}เส้นชัย{\wbr}ไป{\wbr}นาน{\wbr}แล้ว{\wbr}
19 | \end{document}
20 | 


--------------------------------------------------------------------------------
/src/filterx.h:
--------------------------------------------------------------------------------
 1 | // FilterX.h: interface for the FilterX class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __FILTER_X_H
 6 | #define __FILTER_X_H
 7 | 
 8 | #include <stdio.h>
 9 | #include <wchar.h>
10 | 
11 | class FilterX   // abstract base of the per-format input/output filters
12 | {
13 | public:
14 |   FilterX (FILE* filein, FILE* fileout, bool isUniIn, bool isUniOut,
15 |            const wchar_t* wordBreakStr);
16 |   virtual ~FilterX ();
17 |   // Word-break string given to the constructor (the pointer is stored, not copied).
18 |   const wchar_t* GetWordBreak ();
19 |   // Token interface every concrete filter has to implement.
20 |   virtual bool GetNextToken (wchar_t* token, int tokenSz, bool* thaiFlag) = 0;
21 |   virtual void Print (const wchar_t* token, bool thaiFlag) = 0;
22 | 
23 | protected:
24 |   FILE* fpin;      // input stream; stdin when the constructor got a null filein
25 |   FILE* fpout;     // output stream; stdout when the constructor got a null fileout
26 |   bool  isUniIn;
27 |   bool  isUniOut;
28 | 
29 | private:
30 |   const wchar_t* wordBreakStr;
31 | };
32 | // Stores the arguments, substituting stdin/stdout for null streams.
33 | inline
34 | FilterX::FilterX (FILE* filein, FILE* fileout, bool isUniIn, bool isUniOut,
35 |                   const wchar_t* wordBreakStr)
36 |   : fpin (filein ? filein : stdin),
37 |     fpout (fileout ? fileout : stdout),
38 |     isUniIn (isUniIn),
39 |     isUniOut (isUniOut),
40 |     wordBreakStr (wordBreakStr)
41 | {
42 | }
43 | // Flushes the output stream; neither stream is closed here.
44 | inline
45 | FilterX::~FilterX ()
46 | {
47 |   fflush (fpout);
48 | }
49 | 
50 | inline const wchar_t*
51 | FilterX::GetWordBreak ()
52 | {
53 |   return wordBreakStr;
54 | }
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------
/src/abswordseg.h:
--------------------------------------------------------------------------------
 1 | // AbsWordSeg.h: interface for the AbsWordSeg class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #ifndef __ABSWORDSEG_H
 6 | #define __ABSWORDSEG_H
 7 | 
 8 | #include <wchar.h>
 9 | #include "worddef.h"
10 | #include "dict.h"
11 | 
12 | class AbsWordSeg   // abstract base of the word segmenters; subclasses supply CreateSentence()
13 | {
14 | public:
15 |   virtual ~AbsWordSeg ();
16 |   // NOTE(review): defined in abswordseg.cpp; presumably segments senstr with dict and fills outSeps (capacity outSepsSz) -- confirm there.
17 |   int WordSeg (const Dict* dict, const wchar_t* senstr,
18 |                short int* outSeps, int outSepsSz);
19 | 
20 | protected:
21 |   unsigned short int copySepData (short int sourceIdxSen,
22 |                                   short int targetIdxSen,
23 |                                   short int sepPoint);
24 | 
25 | protected:
26 |   SepType SepData[3];              // candidate segmentations; LongWordSeg::CreateSentence uses slots 0 and 1
27 |   unsigned short int textLen;
28 |   wchar_t text[MAXLEN];
29 |   short int LinkSep[3 * MAXLEN];   // read by LongWordSeg as LinkSep[IdxSep[i] + branch]; -1 ends a branch list
30 |   short int IdxSep[MAXLEN];        // negative when no word starts at that text index (see LongWordSeg)
31 | 
32 | private:
33 |   //============Functions for Wordseg====================
34 |   void CreateWordList (const Dict* dict);
35 |   virtual int CreateSentence () = 0;    //return idx of best sen
36 |   int GetBestSen (int bestidx, short int* outSeps, int outSepsSz);
37 | 
38 |   //============Check Character Type Function============
39 |   static bool IsLeadChar (wchar_t wc);
40 |   static bool IsLastChar (wchar_t wc);
41 | 
42 |   //============Check Karan rule=========================
43 |   static bool HasKaran (const wchar_t* sen_ptr);
44 | };
45 | 
46 | #endif
47 | 


--------------------------------------------------------------------------------
/conv/conv.cxx:
--------------------------------------------------------------------------------
 1 | //
 2 | // conv.cc - Main Program for text converter program
 3 | // Created: 19 Jan 1999
 4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
 5 | //
 6 | 
 7 | #include "convfact.h"
 8 | 
 9 | //
10 | // SYNOPSIS
11 | //    conv ([-t|-u], [source], [target]);
12 | // DESCRIPTION
13 | //    'conv' converts text data from the file specified by 'source' and
14 | //    writes the result to 'target' file. If either 'source' or 'target'
15 | //    is omitted, standard input and standard output will be assumed.
16 | // OPTIONS
17 | //    -t  Converts UTF-8 to TIS-620
18 | //    -u  Converts TIS-620 to UTF-8 (default)
19 | //
20 | 
21 | // 't' selects TIS-620; every other character selects UTF-8.
22 | static inline ETextFormat
23 | TextFormatFromChar (char f)
24 | {
25 |   if (f != 't')
26 |     return UTF8;
27 |   return TIS620;
28 | }
26 | 
27 | // Convert the NUL-terminated text in inText from inFormat to outFormat
28 | // ('t' = TIS-620, any other character = UTF-8), writing at most outLen
29 | // bytes to outText, then write a terminating zero through the writer.
30 | // The results of Write() are not checked.  Always returns 0.
31 | int
32 | conv (char inFormat, char outFormat, const char *inText,
33 |       char *outText, int outLen)
34 | {
35 |     ETextFormat inputFormat = TextFormatFromChar (inFormat);
36 |     ETextFormat outputFormat = TextFormatFromChar (outFormat);
37 | 
38 |     TextReader* pReader = CreateTextReader (inputFormat, inText);
39 |     TextWriter* pWriter = CreateTextWriter (outputFormat, outText, outLen);
40 | 
41 |     if (pWriter) {
42 |         if (pReader) {
43 |             unichar c;
44 |             while (pReader->Read (c)) {
45 |                 pWriter->Write (c);
46 |             }
47 |         }
48 |         // Terminate the output even when no reader could be created.
49 |         // This stays inside the null check: the terminator used to be
50 |         // written unconditionally, dereferencing a null writer.
51 |         pWriter->Write (0);
52 |     }
53 | 
54 |     delete pReader;
55 |     delete pWriter;
56 | 
57 |     return 0;
58 | }
50 | 
51 | 


--------------------------------------------------------------------------------
/conv/convkit.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // convkit.h - Text Converter Kit
 3 | // Created: 19 Jan 1999
 4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
 5 | //
 6 | 
 7 | #ifndef __CONVKIT_H
 8 | #define __CONVKIT_H
 9 | 
10 | #include "unichar.h"
11 | 
12 | // Base class of the character readers: walks a NUL-terminated byte string
13 | // and leaves decoding to Read() in the subclasses.
14 | class TextReader {
15 | public:
16 |     TextReader (const char* inText)
17 |         : inText (inText) {}
18 |     virtual ~TextReader () {}
19 | 
20 |     // Decode the next character into c.
21 |     virtual bool Read(unichar& c) = 0;
22 | 
23 |     // Position of the next unread byte.
24 |     const char* curPos () const;
25 | 
26 | protected:
27 |     // Next byte of the input; at the terminating NUL it returns 0 and
28 |     // does not advance.
29 |     char getChar ();
30 | 
31 | private:
32 |     const char* inText;
33 | };
34 | 
35 | inline char
36 | TextReader::getChar ()
37 | {
38 |     if (*inText == '\0')
39 |         return '\0';
40 |     return *inText++;
41 | }
42 | 
43 | inline const char*
44 | TextReader::curPos () const
45 | {
46 |   return inText;
47 | }
45 | 
46 | // Base class of the character writers: appends bytes to a caller-supplied
47 | // buffer of outLen bytes and leaves encoding to Write() in the subclasses.
48 | class TextWriter {
49 | public:
50 |     TextWriter (char *outText, int outLen)
51 |         : outText (outText), outLen (outLen) {}
52 |     virtual ~TextWriter () {}
53 | 
54 |     // Encode c into the buffer.
55 |     virtual bool Write(unichar c) = 0;
56 | 
57 | protected:
58 |     bool writeChar (char c);     // false, writing nothing, when the buffer is full
59 |     int  spaceLeft () const;     // bytes still free in the buffer
60 | 
61 | private:
62 |     char* outText;
63 |     int   outLen;
64 | };
65 | 
66 | inline bool
67 | TextWriter::writeChar (char c)
68 | {
69 |     if (outLen <= 0)
70 |         return false;
71 | 
72 |     *outText = c;
73 |     ++outText;
74 |     --outLen;
75 |     return true;
76 | }
77 | 
78 | inline int
79 | TextWriter::spaceLeft () const
80 | {
81 |     return outLen;
82 | }
81 | 
82 | #endif  // __CONVKIT_H
83 | 
84 | 


--------------------------------------------------------------------------------
/src/worddef.h:
--------------------------------------------------------------------------------
 1 | #ifndef __WORDDEF_H
 2 | #define __WORDDEF_H
 3 | 
 4 | #define MAXLEN 6000             // maximum characters in a sentence.
 5 | #define MAXSEP 1500             // maximum separation points
 6 |                                 //   (maximum words in a sentence)
 7 | // Entry of WordStack: a back-state index paired with a branch number.
 8 | struct WordState
 9 | {
10 |   int backState;
11 |   unsigned char branchState;
12 | 
13 |   WordState () {}               // leaves both members uninitialised
14 |   WordState (int backState, unsigned char branchState)
15 |     : backState (backState), branchState (branchState) {}
16 | };
17 | // NOTE(review): wt_detail_type, wt_data_type and CDAreaType are not referenced by the code reviewed alongside this header.
18 | struct wt_detail_type
19 | {
20 |   char tag_no;
21 |   float prob;
22 | };
23 | 
24 | struct wt_data_type
25 | {
26 |   char no_pos;
27 |   wt_detail_type detail[15];
28 | };
29 | 
30 | struct CDAreaType
31 | {
32 |   int st;
33 |   int en;
34 | };
35 | // One candidate segmentation: a list of separator positions.
36 | struct SepType
37 | {
38 |   int Score;
39 |   short int Sep[MAXSEP];        // LongWordSeg terminates the list with -1
40 | };
41 | 
42 | // True for byte values above 0xa0 (the Thai half of TIS-620).
43 | inline bool
44 | isThai (unsigned char ch)
45 | {
46 |   return ch > 0xa0;
47 | }
48 | 
49 | // True for the TIS-620 bytes 0xf0..0xf9.
50 | inline bool
51 | isThaiDigit (unsigned char ch)
52 | {
53 |   return !(ch < 0xf0 || ch > 0xf9);
54 | }
55 | 
56 | // True for the TIS-620 bytes 0xbb, 0xbd and 0xbf.
57 | inline bool
58 | isThaiLongTailChar (unsigned char ch)
59 | {
60 |   switch (ch)
61 |     {
62 |     case 0xbb:
63 |     case 0xbd:
64 |     case 0xbf:
65 |       return true;
66 |     default:
67 |       return false;
68 |     }
69 | }
70 | 
71 | // True for code points U+0E01..U+0E5B.
72 | inline bool
73 | isThaiUni (wchar_t uc)
74 | {
75 |   return uc >= 0x0e01 && uc <= 0x0e5b;
76 | }
77 | 
78 | // True for code points U+0E50..U+0E59.
79 | inline bool
80 | isThaiUniDigit (wchar_t wc)
81 | {
82 |   return !(wc < 0x0e50 || wc > 0x0e59);
83 | }
84 | 
85 | // TIS-620 byte -> Unicode: bytes above 0xa0 are shifted into the
86 | // U+0E00 block, all others are returned unchanged.
87 | inline wchar_t
88 | tis2uni (unsigned char ch)
89 | {
90 |   if (ch <= 0xa0)
91 |     return ch;
92 |   return ch - 0xa0 + 0x0e00;
93 | }
94 | 
95 | // Unicode -> TIS-620 byte: code points accepted by isThaiUni() are
96 | // shifted to 0xa1..0xfb, all others are narrowed to one byte.
97 | inline unsigned char
98 | uni2tis (wchar_t uc)
99 | {
100 |   if (!isThaiUni (uc))
101 |     return uc;
102 |   return uc - 0x0e00 + 0x0a0;
103 | }
83 | 
84 | #endif
85 | 


--------------------------------------------------------------------------------
/data/tdict-spell.txt:
--------------------------------------------------------------------------------
  1 | กงเต็ก
  2 | กฎุมพี
  3 | กฏ
  4 | กฏหมาย
  5 | กบฎ
  6 | กระทันหัน
  7 | กระเทาะ
  8 | กราไฟต์
  9 | ก๋วยจั๊บ
 10 | ก็อบปี้
 11 | ก็อปปี้
 12 | ก๊อปปี้
 13 | กะทะ
 14 | กังวาล
 15 | การุณ
 16 | กาละแม
 17 | กุฎฐัง
 18 | กุฏุมพี
 19 | ฃวด
 20 | คฑา
 21 | คลินิค
 22 | คลีนิก
 23 | คลีนิค
 24 | คาทอลิค
 25 | คึ่นฉ่าย
 26 | แคตตาล็อก
 27 | แคลอรี่
 28 | โควต้า
 29 | ฅน
 30 | จุมพฎ
 31 | ช็อคโกแลต
 32 | ซิลิกอน
 33 | เซ็ต
 34 | แซ่ด
 35 | แซ่บ
 36 | ดัทช์
 37 | ดั๊มพ์
 38 | เดนมาร์ค
 39 | เตนท์
 40 | เต๊นท์
 41 | ทนง
 42 | ทรมาณ
 43 | ทราก
 44 | ทะแยง
 45 | ทิฏฐิ
 46 | นฤมิตร
 47 | บล็อค
 48 | บ๊องแบ๊ว
 49 | บัลเล่ต์
 50 | เบรค
 51 | แบงค์
 52 | ปรากฎ
 53 | ปอเปี๊ยะ
 54 | ปัคคหะ
 55 | ปาฏิโมกข์
 56 | ปิคนิค
 57 | ปิติ
 58 | ปิรามิด
 59 | โปแตช
 60 | โปแตสเซียม
 61 | โปรเตสแตนท์
 62 | พนิช
 63 | พยักเพยิด
 64 | พริ้ว
 65 | พลูโตเนียม
 66 | พากษ์
 67 | แพลตินัม
 68 | แพลตินั่ม
 69 | แพลตตินัม
 70 | แพลตตินั่ม
 71 | ฟอร์มาลีน
 72 | ฟาสซิสท์
 73 | เฟิร์น
 74 | ยากี้
 75 | เยภุยยสิกา
 76 | รุสเซีย
 77 | รูเลตต์
 78 | ฤาษี
 79 | ล็อค
 80 | ล็อคเกอร์
 81 | ลิเธียม
 82 | ลิฟท์
 83 | วันทยาหัตถ์
 84 | วานิช
 85 | วิญญาน
 86 | วิศิษฐ์
 87 | ศรีษะ
 88 | สเต็ก
 89 | สถิตย์
 90 | สปาเกตตี้
 91 | สปาเก็ตตี้
 92 | สเปกโทรสโคป
 93 | สมพงษ์
 94 | สรรเพชญ์
 95 | สรรเพ็ชญ
 96 | สรรเพ็ชญ์
 97 | สฤษฎ์
 98 | สลิ่ม
 99 | สัตตสดก
100 | สันตปาปา
101 | สัมนา
102 | สาธร
103 | สาราณียากร
104 | สุกี้
105 | สุกี้ยากี้
106 | สูญญากาศ
107 | หยอมแหยม
108 | หยอย
109 | หล่ะ
110 | อลูมินัม
111 | อลูมินา
112 | อลูมิเนียม
113 | อะดรีนาลีน
114 | อะหลั่ย
115 | อัตคัต
116 | อัฟริกา
117 | อานิสงค์
118 | อาฟริกา
119 | อิ่มแปร้
120 | อิริยาบท
121 | อิเลคโทรนิคส์
122 | อิสาน
123 | อีรุงตุงนัง
124 | อุตรายัน
125 | อุลตรา
126 | อุลตร้า
127 | โอกาศ
128 | โอลิมปิค
129 | ไอศครีม
130 | ฮีโมโกลบิน
131 | 


--------------------------------------------------------------------------------
/data/tdict-district.txt:
--------------------------------------------------------------------------------
  1 | กระบี่
  2 | กรุงเทพ
  3 | กาญจนบุรี
  4 | กาฬสินธุ์
  5 | กำแพงเพชร
  6 | ขอนแก่น
  7 | จันทบุรี
  8 | ฉะเชิงเทรา
  9 | ชลบุรี
 10 | ชัยนาท
 11 | ชัยภูมิ
 12 | ชุมพร
 13 | เชียงราย
 14 | เชียงใหม่
 15 | ตรัง
 16 | ตราด
 17 | ตาก
 18 | นครนายก
 19 | นครปฐม
 20 | นครพนม
 21 | นครราชสีมา
 22 | นครศรีธรรมราช
 23 | นครสวรรค์
 24 | นนทบุรี
 25 | นราธิวาส
 26 | น่าน
 27 | บึงกาฬ
 28 | บุรีรัมย์
 29 | ปทุมธานี
 30 | ประจวบคีรีขันธ์
 31 | ปราจีนบุรี
 32 | ปัตตานี
 33 | พะเยา
 34 | พังงา
 35 | พัทลุง
 36 | พิจิตร
 37 | พิษณุโลก
 38 | เพชรบุรี
 39 | เพชรบูรณ์
 40 | แพร่
 41 | ภูเก็ต
 42 | มหาสารคาม
 43 | มุกดาหาร
 44 | แม่ฮ่องสอน
 45 | ยโสธร
 46 | ยะลา
 47 | ร้อยเอ็ด
 48 | ระนอง
 49 | ระยอง
 50 | ราชบุรี
 51 | ลพบุรี
 52 | ลำปาง
 53 | ลำพูน
 54 | เลย
 55 | ศรีสะเกษ
 56 | สกลนคร
 57 | สงขลา
 58 | สตูล
 59 | สมุทรปราการ
 60 | สมุทรสงคราม
 61 | สมุทรสาคร
 62 | สระแก้ว
 63 | สระบุรี
 64 | สิงห์บุรี
 65 | สุโขทัย
 66 | สุพรรณบุรี
 67 | สุราษฎร์
 68 | สุราษฎร์ธานี
 69 | สุรินทร์
 70 | หนองคาย
 71 | หนองบัวลำภู
 72 | อยุธยา
 73 | อ่างทอง
 74 | อำนาจเจริญ
 75 | อุดรธานี
 76 | อุตรดิตถ์
 77 | อุทัยธานี
 78 | อุบลราชธานี
 79 | กระนวน
 80 | กรือเซะ
 81 | กันทรลักษ์
 82 | กุยบุรี
 83 | ขนอม
 84 | งาว
 85 | จตุจักร
 86 | ไชยา
 87 | ซีคอน
 88 | ดอนเมือง
 89 | ตะรุเตา
 90 | ถลาง
 91 | ไทรโยค
 92 | ธนบุรี
 93 | ธัญบุรี
 94 | บันนังสตา
 95 | บางกอก
 96 | บางปะกง
 97 | บางระจัน
 98 | แบงค็อก
 99 | แบริ่ง
100 | ปะทิว
101 | ปาย
102 | พญาไท
103 | พหลโยธิน
104 | พัฒน์พงษ์
105 | พัทยา
106 | พารากอน
107 | พุนพิน
108 | เพชรเกษม
109 | ภูมิซรอล
110 | รังสิต
111 | รัชโยธิน
112 | รัตนาธิเบศร์
113 | ละมุง
114 | ลันตา
115 | ลาดพร้าว
116 | ลิบง
117 | วโรรส
118 | วิภาวดี
119 | สตึก
120 | สมุย
121 | สัตหีบ
122 | สิมิลัน
123 | สุขุมวิท
124 | สุทธิสาร
125 | สุวินทวงศ์
126 | สุไหง
127 | เสลภูมิ
128 | หล่มสัก
129 | เหม่งจ๋าย
130 | อังรีดูนังต์
131 | อัษฎางค์
132 | อ่างขาง
133 | อินทนนท์
134 | อินทามระ
135 | เอ็มโพเรียม
136 | 


--------------------------------------------------------------------------------
/src/dict.h:
--------------------------------------------------------------------------------
  1 | //
  2 | // dict.h - dictionary class
  3 | // Created: 2017-11-08
  4 | // Author: Theppitak Karoonboonyanan
  5 | //
  6 | 
  7 | #ifndef __DICT_H
  8 | #define __DICT_H
  9 | 
 10 | #include <datrie/trie.h>
 11 | #include <wchar.h>
 12 | 
 13 | class Dict   // word-list dictionary backed by a libdatrie Trie
 14 | {
 15 | public:
 16 |   class State   // wrapper around a TrieState; created only by Dict (private constructor, friend)
 17 |   {
 18 |     friend class Dict;
 19 | 
 20 |   public:
 21 |     ~State ();                 // frees the wrapped TrieState
 22 | 
 23 |     bool isTerminal () const;  // forwards to trie_state_is_terminal()
 24 |     void rewind ();            // forwards to trie_state_rewind()
 25 |     bool walk (wchar_t c);     // forwards to trie_state_walk()
 26 | 
 27 |   private:
 28 |     State (TrieState* s);
 29 | 
 30 |   private:
 31 |     TrieState* state;
 32 |   };
 33 | 
 34 | public:
 35 |   Dict ();                               // no trie loaded
 36 |   explicit Dict (const char* dictPath);  // calls open (dictPath); the result is not reported
 37 |   ~Dict ();
 38 | 
 39 |   bool   open (const char* dictPath);    // defined in dict.cpp
 40 |   State* root () const;                  // allocates a new State with new
 41 | 
 42 | private:
 43 |   Trie*  dict;
 44 | };
 45 | 
 46 | 
 47 | //
 48 | // class Dict::State - each member forwards to the libdatrie call of the same purpose
 49 | //
 50 | // Stores the given TrieState pointer.
 51 | inline
 52 | Dict::State::State (TrieState *s)
 53 |   : state (s) {}
 54 | // Frees the stored TrieState.
 55 | inline
 56 | Dict::State::~State ()
 57 | {
 58 |   trie_state_free (state);
 59 | }
 60 | // Result of trie_state_is_terminal() for the current state.
 61 | inline bool
 62 | Dict::State::isTerminal() const
 63 | {
 64 |   return trie_state_is_terminal (state);
 65 | }
 66 | // Calls trie_state_rewind() on the stored state.
 67 | inline void
 68 | Dict::State::rewind ()
 69 | {
 70 |   trie_state_rewind (state);
 71 | }
 72 | // Result of trie_state_walk() for character c.
 73 | inline bool
 74 | Dict::State::walk (wchar_t c)
 75 | {
 76 |   return trie_state_walk (state, c);
 77 | }
 78 | 
 79 | 
 80 | //
 81 | // class Dict
 82 | //
 83 | 
 84 | // Creates a dictionary with no trie loaded.
 85 | inline
 86 | Dict::Dict ()
 87 |   : dict (0) {}
 88 | 
 89 | // Creates a dictionary and tries to load it from dictPath via open().
 90 | // dict is nulled first: open() returns early without assigning it when
 91 | // the path cannot be used, and ~Dict() must never see an indeterminate
 92 | // pointer.
 93 | inline
 94 | Dict::Dict (const char* dictPath)
 95 |   : dict (0)
 96 | {
 97 |   open (dictPath);
 98 | }
 99 | 
100 | // Frees the trie if one was loaded.
101 | inline
102 | Dict::~Dict ()
103 | {
104 |   if (dict)
105 |     {
106 |       trie_free (dict);
107 |     }
108 | }
109 | 
110 | // Returns a State allocated with new that wraps trie_root (dict).
111 | // dict is passed on unchecked, so a trie has to be loaded beforehand.
112 | inline Dict::State*
113 | Dict::root () const
114 | {
115 |   return new State (trie_root (dict));
116 | }
108 | 
109 | #endif // __DICT_H
110 | 
111 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | SWATH (Smart Word Analysis for THai)
 2 | ====================================
 3 | 
 4 | Thai script has no word delimiter. While it's trivial for human readers to
 5 | recognize word boundaries while reading, it requires some knowledge for the
 6 | machine to do the same when wrapping lines or moving the cursor word-wise, etc.
 7 | Normally, applications need such a feature to support Thai text processing.
 8 | 
 9 | Swath is a general-purpose utility to work around the lack of such capability
10 | in applications. It analyzes the given Thai text by consulting a Thai word
11 | list for word boundaries, before outputting the same text with the predefined
12 | word delimiters inserted.
13 | 
14 | It can read many kinds of input, including plain text and structured documents
15 | like HTML, RTF, LaTeX and Lambda (Unicode version of LaTeX with Omega
16 | typesetter kernel). [See -f option.]
17 | 
18 | For the known documents, it inserts the common word delimiters used in the
19 | corresponding formats, and pipes (|) for plain text. But the user can always
20 | override this with a preferred delimiter. [See -b option.]
21 | 
22 | Swath can also be configured to use different algorithms for the analysis.
23 | Currently, it supports two schemes: longest (greedy) matching and maximal
24 | (least words) matching. [See -m option.]
25 | 
26 | EXAMPLES
27 | ========
28 | 
29 | - For LaTeX (to be used with babel-thai package):
30 | 
31 |     $ swath -f latex < mydoc.tex > mydoc.ttex
32 |     $ latex mydoc.ttex
33 | 
34 |   Or if you composed your LaTeX source in UTF-8:
35 | 
36 |     $ swath -f latex -u u,t mydoc.tex > mydoc.ttex
37 |     $ latex mydoc.ttex
38 | 
39 |   This is equivalent to filtering with iconv(1):
40 | 
41 |     $ iconv -f UTF-8 -t TIS-620 mydoc.tex | swath -f latex > mydoc.ttex
42 |     $ latex mydoc.ttex
43 | 
44 | - For HTML (to provide web pages to web browsers that cannot wrap Thai lines
45 |   properly, but support the <wbr> tag):
46 | 
47 |     $ swath -f html < mydoc.html > mydoc-wbr.html
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
 1 | dnl Process this file with autoconf to produce a configure script.
 2 | 
 3 | AC_PREREQ([2.71])
 4 | 
 5 | AC_INIT([swath],
 6 |         [m4_esyscmd(build-aux/git-version-gen)],
 7 |         [https://github.com/tlwg/swath/issues])
 8 | AC_CONFIG_SRCDIR([src/wordseg.cpp])
 9 | AC_CONFIG_MACRO_DIR([m4])
10 | AC_CONFIG_LIBOBJ_DIR([lib])
11 | 
12 | AM_INIT_AUTOMAKE(dist-xz no-dist-gzip)
13 | 
14 | dnl Checks for programs (C++ compiler, install, ln -s, make, libtool).
15 | AC_PROG_CXX
16 | AC_PROG_INSTALL
17 | AC_PROG_LN_S
18 | AC_PROG_MAKE_SET
19 | LT_INIT
20 | 
21 | dnl Checks for libraries: libdatrie, looked up through pkg-config as datrie-0.2.
22 | PKG_CHECK_MODULES(DATRIE,datrie-0.2)
23 | 
24 | dnl Checks for header files.
25 | AC_CHECK_HEADERS(limits.h)
26 | 
27 | dnl Checks for functions.
28 | 
29 | dnl NetBSD is reported to lack wcpcpy(); a replacement is taken from lib/ (AC_CONFIG_LIBOBJ_DIR).
30 | AC_REPLACE_FUNCS([wcpcpy])
31 | 
32 | dnl Checks for typedefs, structures, and compiler characteristics.
33 | AC_C_CONST
34 | AC_TYPE_SIZE_T
35 | 
36 | dnl Check command-line options: -DNDEBUG is added unless --enable-debug is given.
37 | AC_ARG_ENABLE(debug,
38 |               [AS_HELP_STRING([--enable-debug],
39 |                               [enable assertion checks])],
40 |               , enable_debug="no")
41 | if test "x$enable_debug" = "xno"; then
42 |   CPPFLAGS="$CPPFLAGS -DNDEBUG"
43 | fi
44 | 
45 | AC_ARG_ENABLE(catthai,
46 |               [AS_HELP_STRING([--enable-catthai],
47 |                               [enable catenation of Thai lines])],
48 |               , enable_catthai="no")
49 | if test "x$enable_catthai" = "xyes"; then
50 |   CPPFLAGS="$CPPFLAGS -DCAT_THAI_LINES"
51 | fi
52 | 
53 | dnl Dictionary data generation: on by default and requires trietool; see the ENABLE_DICT conditional below.
54 | AC_ARG_ENABLE(dict,
55 |               [AS_HELP_STRING([--disable-dict],
56 |                               [disable dictionary data generation])],
57 |               , enable_dict="yes")
58 | 
59 | if test "x$enable_dict" = "xyes"; then
60 |   AC_CHECK_PROGS(TRIETOOL,[trietool-0.2 trietool],no)
61 |   if test "x$TRIETOOL" = "xno"; then
62 |     AC_MSG_ERROR([You need trietool[[-0.2]] (from libdatrie package) to generate dict, or just use --disable-dict to skip])
63 |   fi
64 | fi
65 | 
66 | AM_CONDITIONAL(ENABLE_DICT,test "x$enable_dict" = "xyes")
67 | 
68 | AC_CONFIG_FILES([Makefile
69 |   lib/Makefile
70 |   conv/Makefile
71 |   src/Makefile
72 |   data/Makefile
73 |   tests/Makefile])
74 | AC_OUTPUT
75 | 


--------------------------------------------------------------------------------
/src/longwordseg.cpp:
--------------------------------------------------------------------------------
 1 | // longwordseg.cpp: implementation of the LongWordSeg class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #include "wordstack.h"
 6 | #include "longwordseg.h"
 7 | #include <limits.h>
 8 | // Backtracking search for the segmentation with the fewest unknown-word runs; returns the SepData[] index that holds it.
 9 | int
10 | LongWordSeg::CreateSentence ()
11 | {
12 |   WordStack BackTrackStack;   // pending (text index, branch number) alternatives
13 |   int penalty[MAXSEP];     // aggregated penalty at Sep[] positions
14 | 
15 |   const int senIdx = 0;       // SepData slot of the candidate being built
16 |   const int bestSenIdx = 1;   // SepData slot of the best candidate found so far
17 | 
18 |   BackTrackStack.Push (WordState (0, 0));
19 |   int bestPenalty = INT_MAX;
20 | 
21 |   while (!BackTrackStack.Empty())
22 |     {
23 |       WordState wState = BackTrackStack.Top ();
24 |       BackTrackStack.Pop ();
25 | 
26 |       int Idx = wState.backState;
27 |       int branchIdx = wState.branchState;
28 |       int nextSepIdx = (Idx == 0) ? 0 : copySepData (senIdx, senIdx, Idx);   // presumably the Sep[] slot after separator Idx -- confirm in abswordseg.cpp
29 |       penalty[nextSepIdx] = (nextSepIdx == 0) ? 0 : penalty[nextSepIdx - 1];
30 | 
31 |       while (Idx < textLen)
32 |         {
33 |           if (IdxSep[Idx] >= 0)
34 |             {
35 |               // some words start at Idx
36 |               if (LinkSep[IdxSep[Idx] + branchIdx + 1] != -1)
37 |                 {
38 |                   BackTrackStack.Push (WordState (Idx, branchIdx + 1));   // result ignored: a full stack drops this alternative
39 |                 }
40 |               Idx = LinkSep[IdxSep[Idx] + branchIdx];
41 |               branchIdx = 0;
42 |             }
43 |           else
44 |             {
45 |               // skip unknown words and add penalty
46 |               while (Idx < textLen && IdxSep[Idx] < 0)
47 |                 Idx++;
48 |               ++penalty[nextSepIdx];
49 |             }
50 |           SepData[senIdx].Sep[nextSepIdx++] = Idx;   // NOTE(review): nextSepIdx is not checked against MAXSEP here
51 |           penalty[nextSepIdx] = penalty[nextSepIdx - 1];
52 |         }
53 | 
54 |       if (SepData[senIdx].Sep[nextSepIdx - 1] != textLen)   // NOTE(review): reads Sep[-1] if this runs with textLen == 0 -- confirm callers
55 |         {
56 |           // unknown words at the end
57 |           SepData[senIdx].Sep[nextSepIdx++] = textLen;
58 |           penalty[nextSepIdx] = penalty[nextSepIdx - 1] + 1;
59 |         }
60 |       SepData[senIdx].Sep[nextSepIdx] = -1;   // -1 terminates the separator list
61 | 
62 |       // clean solution found, return it immediately
63 |       if (penalty[nextSepIdx] == 0)
64 |         return senIdx;
65 | 
66 |       // otherwise, try to minimize unknown words in best sentence
67 |       if (penalty[nextSepIdx] < bestPenalty)
68 |         {
69 |           copySepData (senIdx, bestSenIdx, SepData[senIdx].Sep[nextSepIdx - 1]);
70 |           bestPenalty = penalty[nextSepIdx];
71 |         }
72 |     }
73 | 
74 |   return bestSenIdx;
75 | }
76 | 


--------------------------------------------------------------------------------
/src/filterhtml.cpp:
--------------------------------------------------------------------------------
 1 | // FilterHtml.cpp: implementation of the FilterHtml class.
 2 | //
 3 | //////////////////////////////////////////////////////////////////////
 4 | 
 5 | #include <string.h>
 6 | #include <wchar.h>
 7 | #include <wctype.h>
 8 | #include "filterhtml.h"
 9 | #include "worddef.h"
10 | #include "convutil.h"
11 | 
12 | //////////////////////////////////////////////////////////////////////
13 | // Construction/Destruction
14 | //////////////////////////////////////////////////////////////////////
15 | 
16 | FilterHtml::FilterHtml (FILE* filein, FILE* fileout,
17 |                         bool isUniIn, bool isUniOut)
18 |   : FilterX (filein, fileout, isUniIn, isUniOut, L"<wbr>"),  // presumably the word separator -- confirm in FilterX
19 |     chbuff (0)  // 0 = no pushed-back character pending
20 | {
21 | }
22 | 
23 | // Fetch the next token from fpin: a run of characters that are either all
24 | // Thai or all non-Thai, ended by whitespace, '.' or a change of script.
25 | //
26 | //   token    - receives the characters and a terminating 0
27 | //   tokenSz  - capacity of token in wide characters (must be at least 2)
28 | //   thaiFlag - receives true if the token is made of Thai characters
29 | //
30 | // Returns false if fpin is NULL or exhausted, or if tokenSz is too small.
31 | // The character that ended the token is kept in chbuff and becomes the
32 | // first character of the next token.
33 | bool
34 | FilterHtml::GetNextToken (wchar_t* token, int tokenSz, bool* thaiFlag)
35 | {
36 |   if ((fpin == NULL) || (feof (fpin) != 0) || tokenSz < 2)
37 |     return false;
38 | 
39 |   // First character: the one pushed back by the previous call, if any,
40 |   // otherwise a fresh read.  It is held in an int, as ConvGetC() returns,
41 |   // so that the EOF test does not depend on the range of wchar_t.
42 |   int firstChar = chbuff;
43 |   chbuff = 0;
44 |   if (0 == firstChar)
45 |     {
46 |       firstChar = ConvGetC (fpin, isUniIn);
47 |       if (EOF == firstChar)
48 |         return false;
49 |     }
50 |   *token = firstChar;
51 |   --tokenSz;
52 |   *thaiFlag = isThaiUni (*token);
53 | 
54 |   // find a token containing only Thai characters or non-Thai+space characters
55 |   for (;;)
56 |     {
57 |       int nextChar = ConvGetC (fpin, isUniIn);
58 |       if (EOF == nextChar)
59 |         break;
60 | 
61 |       // does nextChar end the token?
62 |       bool isBoundary = iswspace (nextChar) || L'.' == nextChar
63 |                         || isThaiUni (*token) != isThaiUni (nextChar);
64 |       if (*thaiFlag && '\n' == nextChar)
65 |         {
66 |           // A line break inside Thai text does not end the token: skip it
67 |           // and decide on the character after it.
68 |           // NOTE(review): when that character is not Thai the '\n' is lost,
69 |           // as only one character can be pushed back -- confirm intended.
70 |           nextChar = ConvGetC (fpin, isUniIn);
71 |           if (EOF == nextChar)
72 |             break;
73 |           isBoundary = !isThaiUni (nextChar);
74 |         }
75 | 
76 |       if (isBoundary || tokenSz < 2)
77 |         {
78 |           // token ends (or is full) -> put back next char & stop
79 |           chbuff = nextChar;
80 |           break;
81 |         }
82 |       *++token = nextChar;
83 |       --tokenSz;
84 |     }
85 |   *++token = 0;
86 | 
87 |   return true;
88 | }
89 | 
90 | void
91 | FilterHtml::Print (const wchar_t* token, bool /* unused */)
92 | {
93 |   ConvPrint (fpout, token, isUniOut);  // UTF-8 if isUniOut, else TIS-620
94 | }
95 | 


--------------------------------------------------------------------------------
/data/tdict-country.txt:
--------------------------------------------------------------------------------
  1 | กรีซ
  2 | กัมพูชา
  3 | กัวเตมาลา
  4 | กาตาร์
  5 | กานา
  6 | กาบอง
  7 | กายอานา
  8 | กินี
  9 | เกรนาดีนส์
 10 | เกรเนดา
 11 | เกาหลี
 12 | แกมเบีย
 13 | โกตดิวัวร์
 14 | คองโก
 15 | คอโมโรส
 16 | คอสตาริกา
 17 | คาซัคสถาน
 18 | คิตส์
 19 | คิริบาตี
 20 | คิริบาส
 21 | คิวบา
 22 | คีร์กีซสถาน
 23 | คูเวต
 24 | เคนยา
 25 | เคปเวิร์ด
 26 | เคย์แมน
 27 | แคนาดา
 28 | แคเมอรูน
 29 | โครเอเชีย
 30 | โคลอมเบีย
 31 | โคลัมเบีย
 32 | จอร์เจีย
 33 | จอร์แดน
 34 | จาเมกา
 35 | จิบูตี
 36 | จีน
 37 | ชาด
 38 | ชิลี
 39 | เช็ก
 40 | ซามัว
 41 | ซาอุ
 42 | ซิมบับเว
 43 | ซีเรีย
 44 | ซูดาน
 45 | ซูรินาเม
 46 | เซนต์
 47 | เซเนกัล
 48 | เซอร์เบีย
 49 | เซาตูเม
 50 | เซียร์รา
 51 | แซมเบีย
 52 | โซมาเลีย
 53 | โซเวียต
 54 | ไซปรัส
 55 | ญี่ปุ่น
 56 | ดารุสซาลาม
 57 | เดนมาร์ก
 58 | โดมินิกัน
 59 | โดมินิกา
 60 | ตรินิแดด
 61 | ตองกา
 62 | ติมอร์
 63 | ตุรกี
 64 | ตูนิเซีย
 65 | เติร์กเมนิสถาน
 66 | โตโก
 67 | โตเบโก
 68 | ไต้หวัน
 69 | ทาจิกิสถาน
 70 | แทนซาเนีย
 71 | นอร์เวย์
 72 | นามิเบีย
 73 | นาอูรู
 74 | นิการากัว
 75 | นิวซีแลนด์
 76 | เนเธอร์แลนด์
 77 | เนปาล
 78 | เนวิส
 79 | ไนจีเรีย
 80 | ไนเจอร์
 81 | บราซิล
 82 | บริติช
 83 | บริเตน
 84 | บรูไน
 85 | บอตสวานา
 86 | บอสเนีย
 87 | บังกลาเทศ
 88 | บังคลาเทศ
 89 | บัลกาเรีย
 90 | บัลแกเรีย
 91 | บาร์บูดา
 92 | บาร์เบโดส
 93 | บาห์เรน
 94 | บาฮามาส
 95 | บิสเซา
 96 | บุรุนดี
 97 | บูร์กินาฟาโซ
 98 | เบนิน
 99 | เบลเยียม
100 | เบลารุส
101 | เบลีซ
102 | เบอร์มิวดา
103 | โบลิเวีย
104 | ปรินซิปี
105 | ปากีสถาน
106 | ปานามา
107 | ปาปัวนิวกินี
108 | ปารากวัย
109 | ปาเลสไตน์
110 | ปาเลา
111 | เปรู
112 | เปอร์เซีย
113 | เปอร์โตริโก
114 | โปรตุเกส
115 | โปแลนด์
116 | ฝรั่งเศส
117 | พม่า
118 | ฟิจิ
119 | ฟินแลนด์
120 | ฟิลิปปินส์
121 | เฟรนช์
122 | ภูฏาน
123 | ภูฐาน
124 | มองโกเลีย
125 | มอนเตเนโกร
126 | มอนแทนา
127 | มอริเชียส
128 | มอริเตเนีย
129 | มอลโดวา
130 | มอลตา
131 | มัลดีฟส์
132 | มาเก๊า
133 | มาซิโดเนีย
134 | มาดากัสการ์
135 | มาร์แชลล์
136 | มาลาวี
137 | มาลี
138 | มาเลเซีย
139 | เม็กซิโก
140 | เมียนมา
141 | เมียนมาร์
142 | โมซัมบิก
143 | โมนาโก
144 | โมนาโค
145 | โมร็อกโก
146 | ไมโครนีเซีย
147 | ยูกันดา
148 | ยูโกสลาเวีย
149 | ยูเครน
150 | เยเมน
151 | เยอรมนี
152 | รวันดา
153 | รัสเซีย
154 | โรมาเนีย
155 | ลักเซมเบิร์ก
156 | ลัตเวีย
157 | ลาว
158 | ลิกเตนสไตน์
159 | ลิทัวเนีย
160 | ลิเบีย
161 | ลีโอน
162 | ลูเซีย
163 | เลโซโท
164 | เลบานอน
165 | เลสเต
166 | ไลบีเรีย
167 | วาติกัน
168 | วานูอาตู
169 | วินเซนต์
170 | เวเนซุเอลา
171 | เวียดนาม
172 | ศรีลังกา
173 | สเปน
174 | สโลวะเกีย
175 | สโลวัก
176 | สโลวีเนีย
177 | สวาซิแลนด์
178 | สวิตเซอร์แลนด์
179 | สวีเดน
180 | สหรัฐ
181 | สหราชอาณาจักร
182 | สิกขิม
183 | สิงคโปร์
184 | อเมริกา
185 | ออสเตรเลีย
186 | ออสเตรีย
187 | อันดอร์รา
188 | อัฟกานิสถาน
189 | อาเซอร์ไบจาน
190 | อาร์เจนตินา
191 | อาร์เมเนีย
192 | อาระเบีย
193 | อิเควทอเรียล
194 | อิตาลี
195 | อินเดีย
196 | อินโดนีเซีย
197 | อิรัก
198 | อิสราเอล
199 | อิหร่าน
200 | อียิปต์
201 | อุซเบกิสถาน
202 | อุรุกวัย
203 | เอกวาดอร์
204 | เอธิโอเปีย
205 | เอมิเรตส์
206 | เอริเทรีย
207 | เอลซัลวาดอร์
208 | เอสโตเนีย
209 | แองโกลา
210 | แอนติกา
211 | แอลจีเรีย
212 | แอลเบเนีย
213 | โอมาน
214 | ไอซ์แลนด์
215 | ไอร์แลนด์
216 | ฮ่องกง
217 | ฮอนดูรัส
218 | ฮังการี
219 | เฮติ
220 | เฮอร์เซโกวีนา
221 | 


--------------------------------------------------------------------------------
/src/convutil.cpp:
--------------------------------------------------------------------------------
  1 | // convutil.cpp - Utility functions for character encoding conversion
  2 | //
  3 | //////////////////////////////////////////////////////////////////////
  4 | 
  5 | #include "convutil.h"
  6 | #include "conv/utf8.h"
  7 | #include "conv/tis620.h"
  8 | #include "worddef.h"
  9 | #include <string.h>
 10 | 
 11 | // Read one line (or as much of it as fits) from fpin, decode it as UTF-8
 12 | // (isUniIn) or TIS-620, and store it NUL-terminated in buffer, which holds
 13 | // buffSz wide characters.  Returns false if nothing could be read.
 14 | bool
 15 | ConvGetS (wchar_t* buffer, int buffSz, FILE* fpin, bool isUniIn)
 16 | {
 17 |   // UTF-8 requires up to 5 bytes per char
 18 |   const int   rawSz = isUniIn ? buffSz * 5 : buffSz;
 19 |   char* const raw = new char[rawSz];
 20 | 
 21 |   const bool gotLine = (fgets (raw, rawSz, fpin) != NULL);
 22 |   if (gotLine)
 23 |     {
 24 |       TextReader* const reader
 25 |         = isUniIn ? static_cast<TextReader*> (new UTF8Reader (raw))
 26 |                   : static_cast<TextReader*> (new TIS620Reader (raw));
 27 |       wchar_t* out = buffer;
 28 |       unichar  uc;
 29 |       // stop one short of buffSz to leave room for the terminator
 30 |       for (int room = buffSz; room > 1 && reader->Read (uc); --room)
 31 |         {
 32 |           *out++ = uc;
 33 |         }
 34 |       *out = 0;
 35 |       delete reader;
 36 |     }
 37 | 
 38 |   delete[] raw;
 39 |   return gotLine;
 40 | }
 41 | 
 42 | // Read one character from fpin: the next byte as is, or, with isUniIn, the
 43 | // decoded UTF-8 sequence.  Returns EOF at end of input and U+FFFD for a
 44 | // malformed sequence.
 45 | // NOTE(review): without isUniIn the byte is not converted from TIS-620 to
 46 | // Unicode the way ConvGetS() does it -- confirm this is intended.
 47 | int
 48 | ConvGetC (FILE* fpin, bool isUniIn)
 49 | {
 50 |   int c = fgetc (fpin);
 51 |   if (EOF == c || !isUniIn || 0 == (c & 0x80))
 52 |     return c;  // end of input, raw byte, or plain ASCII
 53 | 
 54 |   // each 1-bit after the top bit of the lead byte announces one more byte
 55 |   char uniBuff[10];
 56 |   uniBuff[0] = char (c);
 57 |   int  len = 1;
 58 |   for (c <<= 1; c & 0x80; c <<= 1)
 59 |     {
 60 |       int inp = fgetc (fpin);
 61 |       if (EOF == inp)
 62 |         return EOF;
 63 |       uniBuff[len++] = char (inp);
 64 |     }
 65 |   uniBuff[len] = '\0';
 66 | 
 67 |   // Read() may fail (stray or bad continuation byte, embedded NUL) and
 68 |   // leave uc unset or half-built; never return such a value.
 69 |   UTF8Reader ur (uniBuff);
 70 |   unichar uc;
 71 |   return ur.Read (uc) ? int (uc) : 0xFFFD;
 72 | }
 73 | 
 74 | // Convert wcs to UTF-8 (isUniOut) or TIS-620 and write it to fpout.
 75 | // Returns false, printing nothing, if the writer rejects any character.
 76 | bool
 77 | ConvPrint (FILE* fpout, const wchar_t* wcs, bool isUniOut)
 78 | {
 79 |   char output[MAXLEN * 2 * 5];
 80 |   TextWriter* writer;
 81 |   if (isUniOut)
 82 |     writer = new UTF8Writer (output, sizeof output);
 83 |   else
 84 |     writer = new TIS620Writer (output, sizeof output);
 85 | 
 86 |   // Convert everything, including the terminating 0 the "%s" below relies
 87 |   // on; stop at the first character the writer rejects.
 88 |   bool isOk = true;
 89 |   for (; isOk && *wcs; ++wcs)
 90 |     isOk = writer->Write (*wcs);
 91 |   isOk = isOk && writer->Write (0);
 92 |   delete writer;
 93 | 
 94 |   if (isOk)
 95 |     fprintf (fpout, "%s", output);
 96 |   return isOk;
 97 | }
 98 | 
 99 | // Return a newly allocated (new[]) wide-character copy of s, decoded as
100 | // UTF-8 if isUniIn, else as TIS-620.  The caller owns the result.
101 | wchar_t*
102 | ConvStrDup (const char* s, bool isUniIn)
103 | {
104 |   // sized for one wide char per input byte, plus the terminator
105 |   wchar_t* const result = new wchar_t[strlen (s) + 1];
106 |   if (!result)
107 |     return NULL;
108 | 
109 |   TextReader* const reader
110 |     = isUniIn ? static_cast<TextReader*> (new UTF8Reader (s))
111 |               : static_cast<TextReader*> (new TIS620Reader (s));
112 | 
113 |   wchar_t* tail = result;
114 |   unichar  uc;
115 |   while (reader->Read (uc))
116 |     *tail++ = uc;
117 |   *tail = 0;
118 | 
119 |   delete reader;
120 |   return result;
121 | }
122 | 
123 | // Copy src to dst as wide chars; returns a pointer to dst's terminating 0.
124 | wchar_t*
125 | Ascii2WcsCopy (wchar_t* dst, const char* src)
126 | {
127 |   // go through unsigned char so that bytes >= 0x80 are not sign-extended
128 |   for (; *src; ++src)
129 |     *dst++ = static_cast<unsigned char> (*src);
130 |   *dst = 0;
131 |   return dst;
132 | }
133 | 
134 | // Append src to the wide string dst; returns dst's new terminating 0.
135 | wchar_t*
136 | Ascii2WcsCat (wchar_t* dst, const char* src)
137 | {
138 |   wchar_t* end = dst;
139 |   for (; *end; ++end)
140 |     ;  // just locate the current terminator
141 |   return Ascii2WcsCopy (end, src);
142 | }
143 | 
144 | // Copy at most n characters of src to dst as wide characters.  As with
145 | // strncpy(), dst gets a terminating 0 only if src is shorter than n.
146 | // Returns a pointer just past the last character copied.
147 | wchar_t*
148 | Ascii2WcsNCopy (wchar_t* dst, const char* src, int n)
149 | {
150 |   // go through unsigned char so that bytes >= 0x80 are not sign-extended
151 |   for (; *src && n > 0; ++src, --n)
152 |     *dst++ = static_cast<unsigned char> (*src);
153 | 
154 |   if (n > 0)
155 |     *dst = 0;
156 |   return dst;
157 | }
158 | 
159 | // Append at most n chars of src to dst; result as for Ascii2WcsNCopy().
160 | wchar_t*
161 | Ascii2WcsNCat (wchar_t* dst, const char* src, int n)
162 | {
163 |   wchar_t* end = dst;
164 |   for (; *end; ++end)
165 |     ;  // just locate the current terminator
166 |   return Ascii2WcsNCopy (end, src, n);
167 | }
168 | 
169 | 


--------------------------------------------------------------------------------
/conv/utf8.cxx:
--------------------------------------------------------------------------------
  1 | //
  2 | // utf8.cc - UTF-8 Text Agents
  3 | // Created: 19 Jan 1999
  4 | // Author:  Theppitak Karoonboonyanan <theppitak@gmail.com>
  5 | //
  6 | 
  7 | #include "utf8.h"
  8 | #include "unichar.h"
  9 | 
 10 | // Append the UTF-8 encoding of uniChar (1 to 6 bytes, covering values up
 11 | // to 0x7FFFFFFF) to the output buffer.  Returns false, writing nothing,
 12 | // if the value is out of range or the buffer lacks room for the sequence.
 13 | bool
 14 | UTF8Writer::Write (unichar uniChar)
 15 | {
 16 |     // number of continuation bytes following the lead byte
 17 |     int nTrail;
 18 |     if (uniChar <= 0x007F) {
 19 |         nTrail = 0;
 20 |     } else if (uniChar <= 0x07FF) {
 21 |         nTrail = 1;
 22 |     } else if (uniChar <= 0xFFFF) {
 23 |         nTrail = 2;
 24 |     } else if (uniChar <= 0x1FFFFF) {
 25 |         nTrail = 3;
 26 |     } else if (uniChar <= 0x3FFFFFF) {
 27 |         nTrail = 4;
 28 |     } else if (uniChar <= 0x7FFFFFFF) {
 29 |         nTrail = 5;
 30 |     } else {
 31 |         return false;  // not encodable
 32 |     }
 33 | 
 34 |     // Check the room for every length, the single-byte case included, so
 35 |     // that a full buffer is always reported to the caller.
 36 |     if (spaceLeft() < nTrail + 1)  return false;
 37 | 
 38 |     if (nTrail == 0) {
 39 |         writeChar ((unsigned char)(uniChar & 0x7F));
 40 |         return true;
 41 |     }
 42 | 
 43 |     // lead byte: nTrail + 1 one-bits, a zero bit, then the top data bits
 44 |     writeChar ((unsigned char)((0xFF00 >> (nTrail + 1))
 45 |                                | (uniChar >> (6 * nTrail))));
 46 |     // continuation bytes: 10xxxxxx, six data bits each, high bits first
 47 |     for (int shift = 6 * (nTrail - 1); shift >= 0; shift -= 6)
 48 |         writeChar ((unsigned char)(0x80 | ((uniChar >> shift) & 0x3F)));
 49 |     return true;
 50 | }
 51 | 
 52 | // Decode the next UTF-8 sequence from the input into uniChar.  Returns
 53 | // false at the end of input (a 0 byte) or on a malformed sequence.
 54 | bool
 55 | UTF8Reader::Read (unichar& uniChar)
 56 | {
 57 |     unsigned char lead = getChar();
 58 |     if (lead == 0)  return false;
 59 | 
 60 |     // single byte (0xxx xxxx): the value is the byte itself
 61 |     if (lead < 0x80) {
 62 |         uniChar = lead;
 63 |         return true;
 64 |     }
 65 | 
 66 |     // every 1-bit following the top bit announces one continuation byte
 67 |     int nTrail = 0;
 68 |     for (unsigned char sig = (lead << 1); sig & 0x80; sig <<= 1)
 69 |         ++nTrail;
 70 |     if (nTrail == 0)  return false;  // 10xx xxxx cannot start a sequence
 71 | 
 72 |     // the lead byte keeps 6 - nTrail data bits in its low end
 73 |     uniChar = (lead & (0x3F >> nTrail));
 74 | 
 75 |     // each continuation byte must look like 10xx xxxx and adds six bits
 76 |     for (; nTrail > 0; --nTrail) {
 77 |         unsigned char c = getChar();
 78 |         if (!c)  return false;
 79 |         if ((c & 0xC0) != 0x80)  return false;
 80 |         uniChar = (uniChar << 6) | (c & 0x3F);
 81 |     }
 82 |     return true;
 83 | }
 84 | 
 85 | // Length in bytes of the UTF-8 sequence for uc, using the same value
 86 | // ranges as UTF8Writer::Write(); -1 if uc fits none of them.
 87 | int
 88 | UTF8Bytes (unichar uc)
 89 | {
 90 |     // ranges are tested in ascending order, so the first match wins
 91 |     if (uc <= 0x007F)
 92 |         return 1;
 93 |     if (uc <= 0x07FF)
 94 |         return 2;
 95 |     if (uc <= 0xFFFF)
 96 |         return 3;
 97 |     if (uc <= 0x1FFFFF)
 98 |         return 4;
 99 |     if (uc <= 0x3FFFFFF)
100 |         return 5;
101 |     if (uc <= 0x7FFFFFFF)
102 |         return 6;
103 |     return -1;  // error: not encodable
104 | }
105 | 
106 | 


--------------------------------------------------------------------------------
/tests/long.txt:
--------------------------------------------------------------------------------
1 | เหมือนเป็นสัญญาณจากทางคณะรักษาความสงบแห่งชาติและฝ่ายที่อยู่ตรงข้ามเข้าใจตรงกันว่า สนามการลงคะแนนประชามติของร่างรัฐธรรมนูญมีความหมายต่อความชอบธรรมในการบริหารราชการแผ่นดินของรัฐบาลที่มาจากการควบคุมอำนาจเป็นอย่างยิ่ง เพราะผลที่ออกมานั้นชี้วัดถึงทัศนคติของคนในสังคมที่จะเลือกว่ารัฐบาลทหารเดินมาถูกทางและถูกใจคนส่วนใหญ่หรือไม่ เอาเข้าจริงคะแนนเสียงที่ออกมากลับปฏิเสธอำนาจทหารด้วยเงื่อนไขบางประการที่ผูกเป็นเงื่อนปมในรัฐธรรมนูญ หรือสังคมกลับกลัวการกลับมาของเผด็จการรัฐสภา และการผูกขาดอำนาจของทุนการเมืองที่เกิดขึ้นมาในอดีต จึงเห็นชอบกับร่างรัฐธรรมนูญนี้ เมื่อวันก่อน นายณัฐวุฒิ ใสยเกื้อ แกนนำแนวร่วมประชาธิปไตยต่อต้านเผด็จการแห่งชาติ พูดถึงการลงประชามติคือสนามการต่อสู้แล้ว ยิ่งทำให้เห็นว่าระหว่างทางที่จะไปถึงวันนั้น สถานการณ์น่าจะเข้มข้นมากขึ้น เนื่องจากผลลัพธ์ที่ออกมามีเดิมพันสูงมาก การงัดกลยุทธ์ การทำลายจุดอ่อนของฝ่ายตรงข้ามย่อมมีความจำเป็น ขณะที่ฝ่ายคณะรักษาความมั่นคงแห่งชาติและรัฐบาล คือฝ่ายที่ถืออำนาจรัฐ พร้อมทั้งได้สร้างกฎเกณฑ์ กฎเหล็ก ขึ้นมามากมายเพื่อป้องกันการใช้กลไกในโลกของโซเชียลมีเดีย ขยายจุดอ่อนของร่างรัฐธรรมนูญ จนมีผลที่ทำให้คนที่อยู่ระหว่างกลางที่เป็นฝ่ายเสรีนิยมเกิดปฏิกิริยาต่อต้าน แน่นอนว่า ด้วยพลังอำนาจอันมหาศาลจากการเป็นรัฏฐาธิปัตย์ การมีกองกำลังรักษาความสงบเรียบร้อย จากการใช้ทหารในประจำการเข้ามาทำหน้าที่ดูแลสถานการณ์ การออกคำสั่ง ประกาศ การแก้ไขกฎหมายเพิ่มโทษของผู้กระทำผิดที่เกิดจากความมั่นคงเริ่มมีมากขึ้น โดยเฉพาะการแก้ไขพระราชบัญญัติคอมพิวเตอร์ ฝ่ายคณะรักษาความสงบแห่งชาติย่อมคาดหวังต่อเครื่องมือเหล่านั้น ไม่ต้องการให้กลุ่มต่อต้านรัฐธรรมนูญทำอะไรไม่ได้สะดวกใจ จุดประสงค์คือผลักดันให้ร่างรัฐธรรมนูญได้รับความเห็นชอบจากประชาชนอย่างท่วมท้นโดยไม่มีขบวนการใดมาขัดขวาง ทั้งที่รัฐธรรมนูญไม่ผ่านประชามติ ก็ยังมีอำนาจในการดำเนินการยำเนื้อหาร่างรัฐธรรมนูญใหม่ได้ นั่นเพราะคำว่าความชอบธรรมในการดำรงอยู่ของคณะรักษาความสงบแห่งชาติเป็นเรื่องสำคัญที่สุด จากนี้ไปอีกปีกว่า คณะรักษาความสงบแห่งชาติจะต้องเจอกับแรงปะทะอีกมากมาย เพราะการลงสนามแข่งขันใหญ่ในการเลือกตั้งนั้นไม่มีชิ้นปลามัน หรืออำนาจสิทธิ์ขาดให้กับนักเลือกตั้งได้เข้ามาคุมอำนาจได้อย่างเบ็ดเสร็จหลังการเลือกตั้ง 
ซึ่งนั่นก็เป็นเหตุผลสำคัญหนึ่งที่เกิดการเคลื่อนไหวอย่างหนักในการคัดค้านร่างรัฐธรรมนูญที่มีคำถามพ่วง ที่ถูกวิจารณ์ว่าเป็นการสืบทอดอำนาจทหาร และจะกลายเป็นประเด็นที่กลายเป็นเป้าใหญ่ให้ฝ่ายที่ไม่เห็นด้วยนำไปโจมตี เพราะเป็นที่รู้กันว่าทหารต้องการเป็นเงาคู่ขนานผ่านอำนาจของสมาชิกวุฒิสภา และเรื่องของนายกรัฐมนตรีคนนอกไปอีกหลายปีหลังการเลือกตั้ง การเดินหน้าเอาผิดกับกลุ่ม หรือขบวนการที่ใช้โซเชียลมีเดียในการดิสเครดิตรัฐบาลและคณะรักษาความสงบแห่งชาติในขณะนี้ แม้รัฐจะไม่ได้มีเป้าหมายให้ได้รับโทษหนักเท่าไหร่นัก แต่ในแง่ของคณะรักษาความสงบแห่งชาติก็หวังผลในเชิงจิตวิทยา ให้คนที่เคลื่อนไหวในระดับย่อยระมัดระวังและเกรงกลัวที่จะโพสต์ข้อความเรื่องรัฐธรรมนูญ โดยงัดเอาพระราชบัญญัติว่าด้วยการออกเสียงประชามติมาเป็นอาวุธ ประกอบกับการที่สภานิติบัญญัติแห่งชาติแก้ไขพระราชบัญญัติคอมพิวเตอร์เพิ่มโทษ ก็ยิ่งทำให้เครื่องมือในการห้ามเข้มข้นขึ้น แต่ในอีกแง่หนึ่งก็ต้องเสี่ยงกับการถูกมองภาพจากคนที่มีเจตนาดีในการวิพากษ์วิจารณ์ ว่ารัฐปิดกั้นในการแสดงความคิดเห็นมากเกินไปจนกระดิกตัวไม่ได้ กระทบต่อแนวร่วมของคณะรักษาความสงบแห่งชาติที่พร้อมหนุน ดังนั้นเจ้าหน้าที่รัฐเองก็ต้องแยกแยะ ดำเนินการ และชี้แจงอย่างเป็นระบบ ในการแบ่งให้เห็นถึงคนที่ปฏิบัติและคนที่อยู่เบื้องหลัง รวมไปถึงคนที่อยากแสดงความคิดเห็นต่อร่างรัฐธรรมนูญอย่างบริสุทธิ์ใจ และพร้อมที่จะรับได้กับคำว่าสืบทอดอำนาจ ในขณะเดียวกัน การจัดทำผังล้มคณะรักษาความสงบแห่งชาติที่เจ้าหน้าที่ได้แจกจ่ายนั้น ก็เหมือนเป็นการยืนยันในหลักฐานในชั้นพนักงานสอบสวนว่ามีความเชื่อมโยง การใช้ไม้แข็งโดยใช้กฎหมายอย่างเต็มรูปแบบกับปลาซิวปลาสร้อยแทนการปรับทัศนคติในช่วงก่อนหน้านี้ ก็ต้องมีการประเมินให้ดีว่าจะเป็นผลดีมากกว่าผลเสีย หรือแท้ที่จริงแล้วยิ่งสร้างแนวร่วมแห่งความเกลียดชัง เราจึงหวังว่า ชัยชนะในประชามติร่างรัฐธรรมนูญที่จะมีขึ้นในอีกสองเดือนข้างหน้า ที่สองฝ่ายมองว่านั่นคือสนามการแข่งขันของจริง โดยมีเสียงของประชาชนเป็นเดิมพันนั้น ไม่ใช่บทสรุปทั้งหมด กล่าวคือ ถ้าฝ่ายหนึ่งได้คะแนนเสียงมากกว่า คือชยชนะแห่งหลักการประชาธิปไตย หรืออีกฝ่ายหนึ่งได้มากกว่า คือความชอบธรรมของอำนาจทหารในการปกครองประเทศ เราเชื่อว่านั่นไม่ใช่ชัยชนะของประชาชนอย่างแท้จริง 
และกว่าจะถึงวันนั้นก็ยังไม่รู้ว่าจะมีผู้ถูกจับกุมดำเนินคดีอีกเท่าไหร่ จะเกิดปรากฏการณ์ทางการเมืองที่วุ่นวายจนในที่สุดต้องปิดสนามการแข่งขันลงหรือไม่
2 | 


--------------------------------------------------------------------------------
/data/tdict-city.txt:
--------------------------------------------------------------------------------
  1 | กรีนิช
  2 | กลันตัน
  3 | กังนัม
  4 | กัลกัตตา
  5 | กัวลาลัมเปอร์
  6 | กัศมีร์
  7 | กาซา
  8 | กาฐมาณฑุ
  9 | กุ้ยโจว
 10 | เกรละ
 11 | เกาลูน
 12 | เกียวโต
 13 | โกเบ
 14 | โกลกาตา
 15 | ควิเบก
 16 | คอนเนตทิคัต
 17 | คังนัม
 18 | คานาเวอรัล
 19 | คาบูล
 20 | คาร์ทูม
 21 | คาสเซิล
 22 | คุชราต
 23 | คุนหมิง
 24 | เคนตักกี
 25 | เคนทักกี
 26 | เคปทาวน์
 27 | เคมบริดจ์
 28 | เคียฟ
 29 | แคชเมียร์
 30 | แคนซัส
 31 | แคนเบอร์รา
 32 | แคโรไลนา
 33 | แคลิฟอร์เนีย
 34 | โคเปนเฮเกน
 35 | โคลัมโบ
 36 | โคโลราโด
 37 | ไครสต์เชิร์ช
 38 | ไคโร
 39 | จาการ์ตา
 40 | จำปาศักดิ์
 41 | จิตตะกอง
 42 | เจนไน
 43 | เจนีวา
 44 | เจ้อเจียง
 45 | ฉงชิ่ง
 46 | เฉิงตู
 47 | ชานตง
 48 | ชิคาโก
 49 | เชนไน
 50 | เชอร์โนบิล
 51 | แชงกรีลา
 52 | แชงกรีล่า
 53 | ซัปโปโร
 54 | ซานมารีโน
 55 | ซาบาห์
 56 | ซาราเยโว
 57 | ซาราวัก
 58 | ซิดนีย์
 59 | ซินเกียง
 60 | ซินเจียง
 61 | ซีอาน
 62 | ซีแอตเทิล
 63 | ซูริก
 64 | ซูริค
 65 | เซเชลส์
 66 | เซนได
 67 | เซนโตซ่า
 68 | เซ็นโตซ่า
 69 | เซนโทซ่า
 70 | เซ็นโทซ่า
 71 | เซี่ยงไฮ้
 72 | โซเฟีย
 73 | โซล
 74 | โซโลมอน
 75 | ไซ่ง่อน
 76 | ไซบีเรีย
 77 | ดัลลัส
 78 | ดาโคตา
 79 | ดานัง
 80 | ดามัสกัส
 81 | ดีทรอยต์
 82 | ดูไบ
 83 | เดนเวอร์
 84 | เดลาแวร์
 85 | เดียนเบียนฟู
 86 | โดเวอร์
 87 | โดฮา
 88 | ไดฟุกุ
 89 | ไดฟูกุ
 90 | ตรังกานู
 91 | ตริโปลี
 92 | ตูวาลู
 93 | เตหะราน
 94 | โตเกียว
 95 | โตรอนโต
 96 | ทมิฬนาฑู
 97 | ทริโปลี
 98 | ทิเบต
 99 | เทกซัส
100 | เท็กซัส
101 | เทนเนสซี
102 | เทลอาวีฟ
103 | แทสเมเนีย
104 | โทรอนโต
105 | ไทเป
106 | ธากา
107 | นาโกยา
108 | นางาซากิ
109 | นาโงยา
110 | นาริตะ
111 | นิวเจอร์ซีย์
112 | นิวเดลี
113 | นิวยอร์ก
114 | นิวยอร์ค
115 | นิวแฮมป์เชียร์
116 | เนบราสกา
117 | เนแบรสกา
118 | เนปิดอว์
119 | เนปีดอ
120 | เนวาดา
121 | บรัสเซลส์
122 | บราซิเลีย
123 | บอมเบย์
124 | บอสตัน
125 | บังกาลอร์
126 | บังคาลอร์
127 | บัลติมอร์
128 | บาร์เซโลนา
129 | บูคาเรสต์
130 | บูดาเปสต์
131 | เบงกาซี
132 | เบนกาซี
133 | เบรุต
134 | เบลเกรด
135 | เบอร์ลิน
136 | แบกแดด
137 | ปอยเปต
138 | ปะลิส
139 | ปะหัง
140 | ปักกิ่ง
141 | ปัญจาบ
142 | ปัฏนา
143 | ปารีส
144 | ปีนัง
145 | ปุตราจายา
146 | เประ
147 | เปียงยาง
148 | พนมเปญ
149 | พระตะบอง
150 | พะโค
151 | พะสิม
152 | พาราณสี
153 | พิหารี
154 | เพนซิลวาเนีย
155 | เพนซิลเวเนีย
156 | ฟรานซ์
157 | ฟลอริดา
158 | ฟิลาเดลเฟีย
159 | ฟุกุชิมะ
160 | ฟุกุชิมา
161 | ฟุกุโอกะ
162 | ฟูกูโอกะ
163 | แฟรงก์เฟิร์ต
164 | มอสโก
165 | มะนิลา
166 | มะละแหม่ง
167 | มัณฑะเลย์
168 | มัทราส
169 | มาดริด
170 | มิชิแกน
171 | มินนิโซตา
172 | มิยางิ
173 | มิลาน
174 | มิวนิก
175 | มิสซูรี
176 | มุมไบ
177 | เมกกะ
178 | เมดินา
179 | เมน
180 | เมลเบิร์น
181 | เมาะตะมะ
182 | เมาะลำเลิง
183 | เมียวดี
184 | แมนจูเรีย
185 | แมนเชสเตอร์
186 | แมนฮัตตัน
187 | แมริแลนด์
188 | แมรีแลนด์
189 | แมสซาชูเซตส์
190 | ไมอามี
191 | ยะไข่
192 | ย่างกุ้ง
193 | ยูทาห์
194 | ยูนนาน
195 | เยรูซาเลม
196 | เยรูซาเล็ม
197 | โยโกฮามา
198 | ริยาด
199 | รีโอเดจาเนโร
200 | โรดไอแลนด์
201 | ลอนดอน
202 | ลอสแองเจลิส
203 | ลากอส
204 | ลาปาซ
205 | ลาสเวกัส
206 | ลิสบอน
207 | ลุยเซียนา
208 | โลซาน
209 | โลซานน์
210 | วอชิงตัน
211 | วอร์ซอ
212 | วิสคอนซิน
213 | เวนิส
214 | เวลส์
215 | เวอร์จิเนีย
216 | เวอร์มอนต์
217 | เวียงจันทน์
218 | เวียนนา
219 | แวนคูเวอร์
220 | ไวโอมิง
221 | สกอตแลนด์
222 | สก็อตแลนด์
223 | สตอกโฮล์ม
224 | สต็อกโฮล์ม
225 | สลังงอร์
226 | เสฉวน
227 | เสิ่นเจิ้น
228 | เสียมราฐ
229 | เสียมเรียบ
230 | หงสา
231 | หงสาวดี
232 | หนานไห่
233 | หลวงพระบาง
234 | หูเป่ย
235 | หูเป่ย์
236 | หูหนาน
237 | เหอเป่ย
238 | เหอเป่ย์
239 | เหอหนาน
240 | อชันตา
241 | อลาสกา
242 | อวันตี
243 | ออตตาวา
244 | ออริกอน
245 | ออสโล
246 | อะแลสกา
247 | อังกอร์
248 | อัตตะปือ
249 | อัมมาน
250 | อัมสเตอร์ดัม
251 | อัสวาน
252 | อัสสัม
253 | อาบูดาบี
254 | อาร์คันซอ
255 | อาระกัน
256 | อินชอน
257 | อินเดียนา
258 | อิบารากิ
259 | อิลลินอยส์
260 | อิสตันบูล
261 | อิสลามาบัด
262 | อุรุมชี
263 | อูลานบาตอร์
264 | เอดินบะระ
265 | เอเธนส์
266 | แอตแลนตา
267 | แอริโซนา
268 | แอลเจียร์
269 | โอคลาโฮมา
270 | โอค็อตสค์
271 | โอกินาวา
272 | โอซากา
273 | โอริสสา
274 | โอเรกอน
275 | โอไฮโอ
276 | ไอดาโฮ
277 | ไอโอวา
278 | ฮอนโนลูลู
279 | ฮานอย
280 | ฮาเนดะ
281 | ฮาราเร
282 | ฮาวาย
283 | ฮิโรชิมา
284 | ฮุสตัน
285 | เฮลซิงกิ
286 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
  1 | 0.6.1 (2018-08-20)
  2 | =====
  3 | - Updated word break dictionary.
  4 | - Fix a defect in RTF parsing, so RTF gets more complete word break positions.
  5 | - Compiler warning fixes.
  6 | - Minor code cleanups.
  7 | - Useful installation instructions in INSTALL file.
  8 |   (Thanks @pepa65 for the pull request.)
  9 | 
 10 | 0.6.0 (2017-11-28)
 11 | =====
 12 | - Updated word break dictionary.
 13 | - Drop undocumented option '-l'.
 14 | - Revamped internal word break engine.
 15 | - Updated manpage.
 16 | 
 17 | 0.5.5 (2016-12-25)
 18 | =====
 19 | - Updated word break dictionary.
 20 | 
 21 | 0.5.4 (2016-07-08)
 22 | =====
 23 | - Updated word break dictionary.
 24 | - Fix segfault on extremely long input lines.
 25 | - Support longer input lines.
 26 |   (Bug report by Santi Romeyen)
 27 | - Support non-ASCII word break string.
 28 |   https://github.com/tlwg/swath/issues/1
 29 | - Some source code clean-ups.
 30 | - Add test suite.
 31 | 
 32 | 0.5.3 (2014-09-01)
 33 | =====
 34 | - Updated word break dictionary.
 35 | - Fix premature output ending on long UTF-8 input line.
 36 |   (Bug report by Sorawee Porncharoenwase)
 37 | - Fix excessive break positions in plain text mode.
 38 |   (Bug report by Sorawee Porncharoenwase)
 39 | - Remove dead code, resulting in a slightly smaller binary.
 40 | 
 41 | 0.5.2 (2013-12-23)
 42 | =====
 43 | - Fix infinite loops in LaTeX filter.
 44 |   (Bug report and patch by Neutron Soutmun)
 45 | - Fix off-by-one character loss in long HTML tokens.
 46 |   (Bug report and analysis by Nicolas Brouard)
 47 | 
 48 | 0.5.1 (2013-10-30)
 49 | =====
 50 | - Correct word break code for Lambda.
 51 | - Updated word break dictionary.
 52 | - Adjust file filters to prevent potential buffer overflow.
 53 | 
 54 | 0.5.0 (2013-02-11)
 55 | =====
 56 | - Character encoding conversion is now done on the fly, with no more
 57 |   buffering via a temporary file.
 58 | - Rewritten RTF filter. It's now tested to work with real RTF document.
 59 | - Process characters as Unicode internally, so that characters not present
 60 |   in TIS-620 are not lost in output.
 61 | - Fix potential buffer overflow vulnerability in Mule mode.
 62 | - Updated word break dictionary.
 63 | - Significant source clean-ups.
 64 | - Switch to XZ tarball compression.
 65 | 
 66 | 0.4.3 (2012-06-13)
 67 | =====
 68 | - Replace word break dictionary with the word lists used to prepare
 69 |   hyphenation patterns for the upcoming ThaiLaTeX, for best results.
 70 | 
 71 | 0.4.2 (2012-02-08)
 72 | =====
 73 | - Fix wrong word break within punctuation sequences like LaTeX `` and ''.
 74 | - Reformat source for readability.
 75 | - Significant source clean-ups.
 76 | 
 77 | 0.4.1 (2011-03-20)
 78 | =====
 79 | - Fixed bugs which caused wrong word break before PHINTHU, NIKHAHIT, YAMAKKAN
 80 |   and LAKKHANGYAO.
 81 | - Enhanced custom dictionary options ('-d' on command line and SWATHDICT
 82 |   environment [formerly WORDSEGDATA]) to cover any datrie file, not just
 83 |   directory, for more convenient use of custom dictionary.
 84 | - Various more code clean-ups.
 85 | - Long-outdated and unmaintained RPM spec is now removed.
 86 | 
 87 | 0.4.0 (2009-04-08)
 88 | =====
 89 | - New dictionary engine with libdatrie 0.2.
 90 | - Fix memory leaks.
 91 | - Various clean-ups, including Valgrind warnings.
 92 | 
 93 | 0.3.4 (2008-04-06)
 94 | =====
 95 | - More secure temporary file usage.
 96 | 
 97 | 0.3.3 (2008-03-20)
 98 | =====
 99 | - Fix regression introduced during portability fix in 0.3.2.
100 |   (Bug report by Pisut Tempatarachoke)
101 | - Fix bug that prevented '-u u,u' from working.
102 |   (Bug report by Neutron Soutmun)
103 | - Minor code and doc improvements.
104 | 
105 | 0.3.2 (2008-02-02)
106 | =====
107 | - Fix char signedness portability issues.
108 | - Improved messages and documentation.
109 | 
110 | 0.3.1 (2006-03-28)
111 | =====
112 | Includes all changes in CVS during the last 4 years
113 | - More configure options (debug, thai line catenation control)
114 | - C++ transition
115 | - Miscellaneous bug fixes
116 | 
117 | 0.3.0 (2001-12-21)
118 | =====
119 | - First autotoolized version.
120 | 
121 | 


--------------------------------------------------------------------------------
/data/tdict-science.txt:
--------------------------------------------------------------------------------
  1 | กลีเซอรีน
  2 | กลีเซอไรด์
  3 | กลูเตน
  4 | กลูเต็น
  5 | กัมมันต์
  6 | กาแลกโตส
  7 | กาแล็กโตส
  8 | กาแลกโทส
  9 | กาแล็กโทส
 10 | กำทอน
 11 | แกนีมีด
 12 | ครอส
 13 | คริสตัล
 14 | คลอโรพลาสต์
 15 | คลอโรฟิลล์
 16 | คลอไรด์
 17 | ควอนตัม
 18 | ควาร์ก
 19 | คอนดักเตอร์
 20 | คอปเปอร์
 21 | คอร์ติซอล
 22 | คอลลอยด์
 23 | คอลลาเจน
 24 | คอเลสเตอรอล
 25 | คอสมิก
 26 | คาร์บอนิก
 27 | คาไลโดสโคป
 28 | คิวมูลัส
 29 | คิวมูโลนิมบัส
 30 | คีโตเจนิก
 31 | คีโตน
 32 | คูลอมบ์
 33 | เคราติน
 34 | เคอราติน
 35 | แคโรทีน
 36 | แคโรทีนอยด์
 37 | แคสสินี
 38 | โครมาติน
 39 | โครมาทิน
 40 | โครมาโทกราฟี
 41 | โคริโอลิส
 42 | โคเลสเตอรอล
 43 | ไคโตซาน
 44 | จีโนม
 45 | จุลชีววิทยา
 46 | ชิคุนกุนยา
 47 | ซัลฟิวริก
 48 | ซัลเฟต
 49 | ซัลเฟอร์
 50 | ซัลไฟด์
 51 | ซิงค์
 52 | ซิริอุส
 53 | ซิลิกา
 54 | ซิลิเกต
 55 | ซิลิโคน
 56 | ซีแซนทีน
 57 | ซีเทน
 58 | ซีร์รัส
 59 | ซีร์โรคิวมูลัส
 60 | ซีร์โรสเตรตัส
 61 | ซีรีบรัม
 62 | ซีโรโทนิน
 63 | ซีเวิร์ต
 64 | เซกเตอร์
 65 | เซ็กเตอร์
 66 | เซกเมนต์
 67 | เซ็กเมนต์
 68 | เซนทริโอล
 69 | เซมิ
 70 | เซลโลเฟน
 71 | เซอร์รัส
 72 | โซนาร์
 73 | ไซโกต
 74 | ไซโตไคน์
 75 | ไซโทพลาสซึม
 76 | ไซแนปส์
 77 | ไซบอร์ก
 78 | ดอปเปลอร์
 79 | ดอปเพลอร์
 80 | ดาราจักร
 81 | โดพามีน
 82 | ไดนามิก
 83 | ไดนามิกส์
 84 | ไดนามิค
 85 | ไดนามิคส์
 86 | ไดออกไซด์
 87 | ทวินาม
 88 | ทรานส์
 89 | ทามิฟลู
 90 | ทาลัสซีเมีย
 91 | ทินดอลล์
 92 | เทฟลอน
 93 | เทเลพอร์ต
 94 | เทสโทสเตอโรน
 95 | เทอร์มอมิเตอร์
 96 | เทอร์โม
 97 | เทอร์โมสเฟียร์
 98 | แทนซาไนต์
 99 | แทนซาไนท์
100 | แทนนิน
101 | โทรโพสเฟียร์
102 | ไททัน
103 | ไทฟอยด์
104 | ไทรอยด์
105 | ธาลัสซีเมีย
106 | นาโนทิวบ์
107 | นิมโบสเตรตัส
108 | นิวคลีโอลัส
109 | นิวตริโน
110 | นิวทริโน
111 | นิวโมเนีย
112 | เนกาตีฟ
113 | โนวา
114 | บรรพชีวิน
115 | บอแรกซ์
116 | เบคเคอเรล
117 | โบซอน
118 | โบทอกซ์
119 | โบท็อกซ์
120 | ไบโนเมียล
121 | ไบโอติน
122 | ปฏิยานุพันธ์
123 | โปรเจสเตอโรน
124 | โปรติสตา
125 | โปรไบโอติก
126 | พลาสโมลิซิส
127 | พหุนาม
128 | พอลิเมอร์
129 | พันธุศาสตร์
130 | พัลซาร์
131 | พาร์กินสัน
132 | พาราเซตามอล
133 | พาราโบลา
134 | พาลินโดรม
135 | เพนนิซิลลิน
136 | เพนนิซิลิน
137 | เพอร์ออกไซด์
138 | โพแทช
139 | โพรเจสเทอโรน
140 | โพรทิสตา
141 | โพรโทพลาสซึม
142 | โพลาร์
143 | โพลาไรซ์
144 | โพลิโนเมียล
145 | โพลิเมอร์
146 | โพลีเอทิลีน
147 | ไพรเมต
148 | ฟลาโวนอยด์
149 | ฟลูออเรสเซนซ์
150 | ฟลูออไรด์
151 | ฟอสซิล
152 | ฟิชชัน
153 | ฟิโบนักซี
154 | ฟิวชัน
155 | ฟีโรโมน
156 | เฟอร์มิออน
157 | โฟตอน
158 | โฟเบีย
159 | โฟลิก
160 | โฟเลท
161 | โฟเลต
162 | ไฟเบอร์
163 | มอนอกไซด์
164 | มิราจ
165 | มิวออน
166 | มีโซสเฟียร์
167 | เมตริกซ์
168 | เมตาบอลิซึม
169 | เมตาโบลิซึม
170 | เมทริกซ์
171 | เมมเบรน
172 | เมลาโทนิน
173 | เมลานิน
174 | เมลามีน
175 | เมลิออยด์
176 | แมกนิจูด
177 | โมเมนตัม
178 | ไมโตคอนเดรีย
179 | ไมโทคอนเดรีย
180 | โมโนเมอร์
181 | โมโนเมียล
182 | ยีสต์
183 | ยูริก
184 | ยูเรีย
185 | รีเวิร์ส
186 | รูมาตอยด์
187 | ไรโบโซม
188 | ลูทีน
189 | เลปตอน
190 | แลกโตส
191 | แล็กโตส
192 | แลกโทส
193 | แล็กโทส
194 | ไลโซโซม
195 | วีก้า
196 | วาร์ป
197 | เวกเตอร์
198 | เวก้า
199 | เวสิเคิล
200 | แวคิวโอล
201 | แวนเดอร์วาลส์
202 | โวลต์
203 | สเกล
204 | สเกลาร์
205 | สตราตัส
206 | สตราโตสเฟียร์
207 | สเต็ม
208 | สเตรตัส
209 | สเตรโตคิวมูลัส
210 | สเตียรอยด์
211 | สไตรีน
212 | สปีชีส์
213 | สเปิร์ม
214 | สเฟียร์
215 | สัมพัทธภาพ
216 | สุริยจักรวาล
217 | ออกเทน
218 | ออร์แกเนลล์
219 | ออร์โธปิดิกส์
220 | ออโรรา
221 | ออโรร่า
222 | ออวุล
223 | ออสโมซิส
224 | อะคริลิก
225 | อะครีลิก
226 | อะซีติก
227 | อะซีโตน
228 | อะนิโมมิเตอร์
229 | อะมิโน
230 | อะลูมินัม
231 | อะลูมินา
232 | อันโดรเมดา
233 | อัลคาไลน์
234 | อัลตราซาวด์
235 | อัลตราซาวนด์
236 | อัลติมิเตอร์
237 | อัลโตคิวมูลัส
238 | อัลโตสเตรตัส
239 | อัลลอยด์
240 | อาร์คัส
241 | อินทิกรัล
242 | อินทิเกรต
243 | อีโบลา
244 | อีโบล่า
245 | อุณหพลศาสตร์
246 | เอ็กซ์โพเนนเชียล
247 | เอกนาม
248 | เอทานอล
249 | เอทิลีน
250 | เอนดอร์ฟิน
251 | เอนโดพลาสซึม
252 | เอนโดสเปิร์ม
253 | เอนโทรปี
254 | เอสเตอร์
255 | เอสโตรเจน
256 | เอสโทรเจน
257 | แอนดรอยด์
258 | แอนแทร็กซ์
259 | แอนิรอยด์
260 | แอมพลิจูด
261 | แอมเฟตามีน
262 | แอมโมเนียม
263 | แอลกอฮอลิซึม
264 | แอสพาร์แตม
265 | โอเซลทามิเวียร์
266 | ไอโซเมตริก
267 | ไอรอน
268 | ไอโอโนสเฟียร์
269 | ฮับเบิล
270 | ฮาร์โมนิก
271 | ฮาโลเจน
272 | ฮิกส์
273 | ฮิวมัส
274 | ฮีสตามีน
275 | โฮโลแกรม
276 | ไฮกรอมิเตอร์
277 | ไฮดรอกไซด์
278 | ไฮดรอลิก
279 | ไฮโดรโปนิกส์
280 | ไฮโดรลิก
281 | ไฮไดรด์
282 | ไฮเพอร์โบลา
283 | 


--------------------------------------------------------------------------------
/data/tdict-ict.txt:
--------------------------------------------------------------------------------
  1 | กราฟิก
  2 | กราฟิกส์
  3 | กราฟิค
  4 | กริด
  5 | กิกะไบต์
  6 | กูเกิล
  7 | กูเกิ้ล
  8 | เกตเวย์
  9 | แกดเจ็ต
 10 | แกร็บ
 11 | โกลบอล
 12 | คลัสเตอร์
 13 | คลาวด์
 14 | คลาส
 15 | คลิก
 16 | คลิปอาร์ต
 17 | คอนโซล
 18 | คอนเทนต์
 19 | คอนเทนท์
 20 | คอนฟิก
 21 | คอมพิวติ้ง
 22 | คอมไพล์
 23 | คอมไพเลอร์
 24 | คอมมูนิเคชั่น
 25 | คอร์
 26 | คิวบิต
 27 | คีย์
 28 | คีย์บอร์ด
 29 | เครือข่าย
 30 | เคอร์เซอร์
 31 | เคอร์เนล
 32 | แคช
 33 | แคมฟรอก
 34 | แคมฟร็อก
 35 | แคร็ก
 36 | โค้ด
 37 | ไคลเอนต์
 38 | จอยสติ๊ก
 39 | จาวา
 40 | จีพีเอส
 41 | ชิป
 42 | ชิพ
 43 | เชลล์
 44 | แชต
 45 | แช็ต
 46 | แชนเนล
 47 | แชนแนล
 48 | ซ็อกเก็ต
 49 | ซอฟต์แวร์
 50 | ซอฟท์แวร์
 51 | ซอร์ส
 52 | ชัตดาวน์
 53 | ซัพพอร์ต
 54 | ซัพพอร์ท
 55 | ซีดี
 56 | ซีดีรอม
 57 | ซีเนอร์
 58 | เซลฟี่
 59 | เซิร์ฟเวอร์
 60 | โซลูชัน
 61 | โซลูชั่น
 62 | ไซต์
 63 | ไซเบอร์
 64 | ทรานแซกชัน
 65 | ทรานแซกชั่น
 66 | ทรานแซ็กชัน
 67 | ทรานแซ็กชั่น
 68 | ทรานแซคชัน
 69 | ทรานแซคชั่น
 70 | ทรานแซ็คชัน
 71 | ทรานแซ็คชั่น
 72 | ทวิตเตอร์
 73 | ทวีต
 74 | ทัชแพด
 75 | ทัชสกรีน
 76 | เทมเพลต
 77 | เทอร์มินัล
 78 | แท็ก
 79 | แท็บ
 80 | แทบเล็ต
 81 | แท็บเล็ต
 82 | โทรจัน
 83 | เน็ต
 84 | เน็ตบุ๊ก
 85 | เน็ตบุค
 86 | เน็ตบุ๊ค
 87 | เน็ตเวิร์ก
 88 | เน็ตเวิร์ค
 89 | เนิร์ด
 90 | โน้ตบุ๊ก
 91 | โน้ตบุค
 92 | โน้ตบุ๊ค
 93 | ดอทคอม
 94 | ดอส
 95 | ดาวน์เกรด
 96 | ดาวน์โหลด
 97 | ดิจิตอล
 98 | ดิจิทัล
 99 | ดิสก์
100 | ดิสโทร
101 | ดีบั๊ก
102 | ดีวีดี
103 | ดีไวซ์
104 | เดเบียน
105 | เดลไฟ
106 | เดสก์ท็อป
107 | โดเมน
108 | โดรน
109 | ไดรฟ์
110 | ไดรว์
111 | ไดรเวอร์
112 | ไดเรกทอรี
113 | ไดโอด
114 | เทเลคอม
115 | บล็อกเกอร์
116 | บรอดแบนด์
117 | บราวเซอร์
118 | บลูทูท
119 | บลูทูธ
120 | บลูเรย์
121 | บั๊ก
122 | บัฟเฟอร์
123 | บิต
124 | บิตคอยน์
125 | บิท
126 | บิทคอยน์
127 | บูต
128 | เบราว์เซอร์
129 | แบ็กอัป
130 | แบ็กอัพ
131 | แบคอัพ
132 | แบ็คอัพ
133 | แบนด์วิดท์
134 | ไบต์
135 | ไบนารี
136 | ปรินต์
137 | ปรินท์
138 | ปรินเตอร์
139 | โปรแกรมเมอร์
140 | โปรเซส
141 | โปรเซสเซอร์
142 | โปรโตคอล
143 | โปรไฟล์
144 | พร็อกซี
145 | พรินต์
146 | พรินท์
147 | พรินเตอร์
148 | พอร์ต
149 | พอร์ท
150 | พาเนล
151 | พาร์ทิชัน
152 | พารามิเตอร์
153 | พาสเวิร์ด
154 | พิกเซล
155 | เพจ
156 | เพจเจอร์
157 | แพกเก็ต
158 | แพตช์
159 | แพลตฟอร์ม
160 | โพรเซส
161 | โพรเซสเซอร์
162 | โพรโทคอล
163 | โพรไฟล์
164 | ไพทอน
165 | ไพธอน
166 | ฟล็อปส์
167 | ฟอนต์
168 | ฟอนท์
169 | ฟอร์แมต
170 | ฟอร์เวิร์ด
171 | ฟอรัม
172 | ฟอลโลว์
173 | ฟอลโลเวอร์
174 | ฟีเจอร์
175 | เฟซบุ๊ก
176 | เฟิร์มแวร์
177 | แฟล็ก
178 | โฟลเดอร์
179 | ไฟร์ฟอกซ์
180 | ไฟร์วอลล์
181 | ไฟล์
182 | มอดูล
183 | มอนิเตอร์
184 | มัลติ
185 | มัลติทัช
186 | มัลติเพล็กซ์
187 | มัลแวร์
188 | มาสเตอร์
189 | มิลลิวินาที
190 | มีเดีย
191 | มีม
192 | เมนู
193 | เมมโมรี
194 | เมล
195 | เมาส์
196 | แมค
197 | โมดูล
198 | โมเด็ม
199 | โมบาย
200 | โมบายล์
201 | โมไบล์
202 | ไมโครซอฟท์
203 | ยูนิกซ์
204 | ยูนิโคด
205 | ยูนิโค้ด
206 | ยูสเซอร์
207 | ริงโทน
208 | รีเฟรช
209 | รีเลย์
210 | เรนเดอร์
211 | เราเตอร์
212 | เรียลไทม์
213 | ลิงก์
214 | ลินุกซ์
215 | ลีนุกซ์
216 | ลูป
217 | เลเยอร์
218 | แล็ปท็อป
219 | โลเกชัน
220 | โลเกชั่น
221 | โลเคชัน
222 | โลเคชั่น
223 | ไลก์
224 | ไลค์
225 | ไลเซนส์
226 | ไลบรารี
227 | วิกิ
228 | วิกิพีเดีย
229 | วิดเจ็ต
230 | วินโดวส์
231 | วินโดว์ส
232 | เว็บ
233 | เวอร์ชวล
234 | เวอร์ชัน
235 | เวอร์ชั่น
236 | เวิร์กสเตชัน
237 | เวิร์กสเตชั่น
238 | เวิร์คสเตชัน
239 | เวิร์คสเตชั่น
240 | เวิร์ด
241 | เวิร์ม
242 | ไวแมกซ์
243 | สกรีน
244 | สกิมเมอร์
245 | สแกน
246 | สแกนเนอร์
247 | สไกป์
248 | สตรีม
249 | สเตตัส
250 | สแต็ก
251 | สนิฟเฟอร์
252 | สปายแวร์
253 | สเปซ
254 | สแปม
255 | สมาร์ต
256 | สมาร์ท
257 | สล็อต
258 | เสิร์ช
259 | โหนด
260 | โหลด
261 | อนาล็อก
262 | ออนไลน์
263 | ออบเจกต์
264 | ออบเจ็กต์
265 | อ็อบเจกต์
266 | อ็อบเจ็กต์
267 | ออปติก
268 | ออปติคอล
269 | ออปติคัล
270 | ออปติไมซ์
271 | ออพติก
272 | ออพติคอล
273 | ออพติคัล
274 | ออพติไมซ์
275 | ออฟไลน์
276 | ออราเคิล
277 | อัพเกรด
278 | อัพเดต
279 | อัพโหลด
280 | อัปเกรด
281 | อัปเดต
282 | อัปโหลด
283 | อัลกอริทึม
284 | อาร์กิวเมนต์
285 | อินเตอร์เน็ต
286 | อินทิเกรเตอร์
287 | อินเทอร์เน็ต
288 | อินเทอร์เฟซ
289 | อินเทล
290 | อินพุต
291 | อินพุท
292 | อินสตาแกรม
293 | อิมเมจ
294 | อีเมล
295 | อีเมล์
296 | อีโมจิ
297 | อูบันตู
298 | อูบุนตู
299 | อูเบอร์
300 | เอนจิน
301 | เอ็นจิน
302 | เอาต์พุต
303 | เอาต์พุท
304 | เอาท์พุต
305 | เอาท์พุท
306 | แอนะล็อก
307 | แอนิเมชัน
308 | แอนิเมชั่น
309 | แอดเดรส
310 | แอดมิน
311 | แอป
312 | แอปพลิเคชัน
313 | แอปพลิเคชั่น
314 | แอพ
315 | แอพพลิเคชัน
316 | แอพพลิเคชั่น
317 | แอสเซมบลี
318 | แอสเซมเบลอร์
319 | โอเพน
320 | ไอคอน
321 | ไอซี
322 | ไอเทม
323 | ไอพอด
324 | ไอพ็อด
325 | ไอแพด
326 | ไอโฟน
327 | ฮับ
328 | ฮาร์ดดิสก์
329 | ฮาร์ดแวร์
330 | แฮ็ก
331 | แฮกเกอร์
332 | แฮ็กเกอร์
333 | แฮ็ค
334 | แฮคเกอร์
335 | แฮ็คเกอร์
336 | แฮงเอาท์
337 | แฮงเอาต์
338 | แฮชแท็ก
339 | แฮนด์เฮลด์
340 | โฮสต์
341 | ไฮเอนด์
342 | 


--------------------------------------------------------------------------------
/tests/long-wseg.txt:
--------------------------------------------------------------------------------
1 | เหมือน|เป็น|สัญญาณ|จาก|ทาง|คณะ|รักษา|ความ|สงบ|แห่ง|ชาติ|และ|ฝ่าย|ที่|อยู่|ตรง|ข้าม|เข้าใจ|ตรง|กัน|ว่า สนาม|การ|ลง|คะแนน|ประชามติ|ของ|ร่าง|รัฐธรรมนูญ|มี|ความหมาย|ต่อ|ความ|ชอบธรรม|ใน|การ|บริหาร|ราชการ|แผ่นดิน|ของ|รัฐบาล|ที่|มา|จาก|การ|ควบคุม|อำนาจ|เป็น|อย่าง|ยิ่ง เพราะ|ผล|ที่|ออก|มา|นั้น|ชี้|วัด|ถึง|ทัศนคติ|ของ|คน|ใน|สังคม|ที่|จะ|เลือก|ว่า|รัฐบาล|ทหาร|เดิน|มา|ถูก|ทาง|และ|ถูกใจ|คน|ส่วน|ใหญ่|หรือ|ไม่ เอา|เข้า|จริง|คะแนน|เสียง|ที่|ออก|มาก|ลับ|ปฏิเสธ|อำนาจ|ทหาร|ด้วย|เงื่อนไข|บาง|ประการ|ที่|ผูก|เป็น|เงื่อน|ปม|ใน|รัฐธรรมนูญ หรือ|สังคม|กลับ|กลัว|การ|กลับ|มา|ของ|เผด็จการ|รัฐสภา และ|การ|ผูกขาด|อำนาจ|ของ|ทุน|การเมือง|ที่|เกิด|ขึ้น|มา|ใน|อดีต จึง|เห็น|ชอบ|กับ|ร่าง|รัฐธรรมนูญ|นี้ เมื่อ|วัน|ก่อน นาย|ณัฐวุฒิ ใส|ย|เกื้อ แกน|นำ|แนวร่วม|ประชาธิปไตย|ต่อต้าน|เผด็จการ|แห่ง|ชาติ พูด|ถึง|การ|ลง|ประชามติ|คือ|สนาม|การ|ต่อสู้|แล้ว ยิ่ง|ทำให้|เห็น|ว่า|ระหว่าง|ทาง|ที่|จะ|ไป|ถึง|วัน|นั้น สถานการณ์|น่า|จะ|เข้มข้น|มาก|ขึ้น เนื่อง|จาก|ผลลัพธ์|ที่|ออก|มา|มี|เดิมพัน|สูง|มาก การ|งัด|กลยุทธ์ การ|ทำลาย|จุด|อ่อน|ของ|ฝ่าย|ตรง|ข้าม|ย่อม|มี|ความ|จำเป็น ขณะ|ที่|ฝ่าย|คณะ|รักษา|ความ|มั่นคง|แห่ง|ชาติ|และ|รัฐบาล คือ|ฝ่าย|ที่|ถือ|อำนาจ|รัฐ พร้อม|ทั้ง|ได้|สร้าง|กฎเกณฑ์ กฎ|เหล็ก ขึ้น|มา|มากมาย|เพื่อ|ป้องกัน|การ|ใช้|กลไก|ใน|โลก|ของ|โซเชียล|มีเดีย ขยาย|จุด|อ่อน|ของ|ร่าง|รัฐธรรมนูญ จน|มี|ผล|ที่|ทำให้|คน|ที่|อยู่|ระหว่าง|กลาง|ที่|เป็น|ฝ่าย|เสรีนิยม|เกิด|ปฏิกิริยา|ต่อต้าน แน่นอน|ว่า ด้วย|พลัง|อำนาจ|อัน|มหาศาล|จาก|การ|เป็น|รัฏฐาธิปัตย์ การ|มี|กอง|กำลัง|รักษา|ความ|สงบ|เรียบร้อย จาก|การ|ใช้|ทหาร|ใน|ประจำการ|เข้า|มา|ทำ|หน้าที่|ดูแล|สถานการณ์ การ|ออก|คำ|สั่ง ประกาศ การ|แก้ไข|กฎหมาย|เพิ่ม|โทษ|ของ|ผู้|กระทำ|ผิด|ที่|เกิด|จาก|ความ|มั่นคง|เริ่ม|มี|มาก|ขึ้น โดย|เฉพาะ|การ|แก้ไข|พระ|ราช|บัญญัติ|คอมพิวเตอร์ ฝ่าย|คณะ|รักษา|ความ|สงบ|แห่ง|ชาติ|ย่อม|คาด|หวัง|ต่อ|เครื่องมือ|เหล่า|นั้น ไม่|ต้องการ|ให้|กลุ่ม|ต่อต้าน|รัฐธรรมนูญ|ทำ|อะไร|ไม่|ได้|สะดวก|ใจ จุด|ประสงค์|คือ|ผลักดัน|ให้|ร่าง|รัฐธรรมนูญ|ได้|รับ|ความเห็น|ชอบ|จาก|ประชาชน|อย่าง|ท่วมท้น|โดย|ไม่|มี|ขบวนการ|ใด|มา|ขัด|ขวาง ทั้ง|ที่|รัฐธรรมนูญ|ไม่|ผ่าน|ประชามติ 
ก็|ยัง|มี|อำนาจ|ใน|การ|ดำเนิน|การ|ยำ|เนื้อหา|ร่าง|รัฐธรรมนูญ|ใหม่|ได้ นั่น|เพราะ|คำ|ว่า|ความ|ชอบธรรม|ใน|การ|ดำรง|อยู่|ของ|คณะ|รักษา|ความ|สงบ|แห่ง|ชาติ|เป็น|เรื่อง|สำคัญ|ที่สุด จาก|นี้|ไป|อีก|ปีก|ว่า คณะ|รักษา|ความ|สงบ|แห่ง|ชาติ|จะ|ต้อง|เจอ|กับ|แรง|ปะทะ|อีก|มากมาย เพราะ|การ|ลง|สนาม|แข่งขัน|ใหญ่|ใน|การ|เลือกตั้ง|นั้น|ไม่|มี|ชิ้น|ปลา|มัน หรือ|อำนาจ|สิทธิ์ขาด|ให้|กับ|นัก|เลือกตั้ง|ได้|เข้า|มา|คุม|อำนาจ|ได้|อย่าง|เบ็ดเสร็จ|หลัง|การ|เลือกตั้ง ซึ่ง|นั่น|ก็|เป็น|เหตุผล|สำคัญ|หนึ่ง|ที่|เกิด|การ|เคลื่อนไหว|อย่าง|หนัก|ใน|การ|คัดค้าน|ร่าง|รัฐธรรมนูญ|ที่|มี|คำ|ถาม|พ่วง ที่|ถูก|วิจารณ์|ว่า|เป็น|การ|สืบทอด|อำนาจ|ทหาร และ|จะ|กลาย|เป็น|ประเด็น|ที่|กลาย|เป็น|เป้า|ใหญ่|ให้|ฝ่าย|ที่|ไม่|เห็นด้วย|นำ|ไป|โจมตี เพราะ|เป็น|ที่|รู้|กัน|ว่า|ทหาร|ต้องการ|เป็น|เงา|คู่|ขนาน|ผ่าน|อำนาจ|ของ|สมาชิก|วุฒิสภา และ|เรื่อง|ของ|นายก|รัฐมนตรี|คน|นอก|ไป|อีก|หลาย|ปี|หลัง|การ|เลือกตั้ง การ|เดิน|หน้า|เอา|ผิด|กับ|กลุ่ม หรือ|ขบวนการ|ที่|ใช้|โซเชียล|มีเดีย|ใน|การ|ดิสเครดิต|รัฐบาล|และ|คณะ|รักษา|ความ|สงบ|แห่ง|ชาติ|ใน|ขณะ|นี้ แม้|รัฐ|จะ|ไม่|ได้|มี|เป้าหมาย|ให้|ได้|รับ|โทษ|หนัก|เท่าไหร่|นัก แต่|ใน|แง่|ของ|คณะ|รักษา|ความ|สงบ|แห่ง|ชาติ|ก็|หวัง|ผล|ใน|เชิง|จิตวิทยา ให้|คน|ที่|เคลื่อนไหว|ใน|ระดับ|ย่อย|ระมัดระวัง|และ|เกรงกลัว|ที่|จะ|โพสต์|ข้อความ|เรื่อง|รัฐธรรมนูญ โดย|งัด|เอา|พระ|ราช|บัญญัติ|ว่าด้วย|การ|ออก|เสียง|ประชามติ|มา|เป็น|อาวุธ ประกอบ|กับ|การ|ที่|สภา|นิติบัญญัติ|แห่ง|ชาติ|แก้ไข|พระ|ราช|บัญญัติ|คอมพิวเตอร์|เพิ่ม|โทษ ก็|ยิ่ง|ทำให้|เครื่องมือ|ใน|การ|ห้าม|เข้มข้น|ขึ้น แต่|ใน|อีก|แง่|หนึ่ง|ก็|ต้อง|เสี่ยง|กับ|การ|ถูก|มอง|ภาพ|จาก|คน|ที่|มี|เจตนา|ดี|ใน|การ|วิพากษ์|วิจารณ์ ว่า|รัฐ|ปิด|กั้น|ใน|การ|แสดง|ความคิด|เห็น|มาก|เกิน|ไป|จน|กระดิก|ตัว|ไม่|ได้ กระทบ|ต่อ|แนวร่วม|ของ|คณะ|รักษา|ความ|สงบ|แห่ง|ชาติ|ที่|พร้อม|หนุน ดังนั้น|เจ้าหน้าที่|รัฐ|เอง|ก็|ต้อง|แยกแยะ ดำเนิน|การ และ|ชี้แจง|อย่าง|เป็น|ระบบ ใน|การ|แบ่ง|ให้|เห็น|ถึง|คน|ที่|ปฏิบัติ|และ|คน|ที่|อยู่|เบื้องหลัง รวม|ไป|ถึง|คน|ที่|อยาก|แสดง|ความคิด|เห็น|ต่อ|ร่าง|รัฐธรรมนูญ|อย่าง|บริสุทธิ์|ใจ และ|พร้อม|ที่|จะ|รับ|ได้|กับ|คำ|ว่า|สืบทอด|อำนาจ ใน|ขณะ|เดียวกัน 
การ|จัด|ทำ|ผัง|ล้ม|คณะ|รักษา|ความ|สงบ|แห่ง|ชาติ|ที่|เจ้าหน้าที่|ได้|แจกจ่าย|นั้น ก็|เหมือน|เป็น|การ|ยืนยัน|ใน|หลักฐาน|ใน|ชั้น|พนักงาน|สอบสวน|ว่า|มี|ความ|เชื่อม|โยง การ|ใช้|ไม้|แข็ง|โดย|ใช้|กฎหมาย|อย่าง|เต็ม|รูปแบบ|กับ|ปลา|ซิว|ปลา|สร้อย|แทน|การ|ปรับ|ทัศนคติ|ใน|ช่วง|ก่อนหน้า|นี้ ก็|ต้อง|มี|การ|ประเมิน|ให้|ดี|ว่า|จะ|เป็น|ผล|ดี|มาก|กว่า|ผล|เสีย หรือ|แท้|ที่|จริง|แล้ว|ยิ่ง|สร้าง|แนวร่วม|แห่ง|ความ|เกลียด|ชัง เรา|จึง|หวัง|ว่า ชัย|ชนะ|ใน|ประชามติ|ร่าง|รัฐธรรมนูญ|ที่|จะ|มี|ขึ้น|ใน|อีก|สอง|เดือน|ข้าง|หน้า ที่|สอง|ฝ่าย|มอง|ว่า|นั่น|คือ|สนาม|การ|แข่งขัน|ของ|จริง โดย|มี|เสียง|ของ|ประชาชน|เป็น|เดิมพัน|นั้น ไม่|ใช่|บท|สรุป|ทั้งหมด กล่าว|คือ ถ้า|ฝ่าย|หนึ่ง|ได้|คะแนน|เสียง|มาก|กว่า คือ|ชย|ชนะ|แห่ง|หลักการ|ประชาธิปไตย หรือ|อีก|ฝ่าย|หนึ่ง|ได้|มาก|กว่า คือ|ความ|ชอบธรรม|ของ|อำนาจ|ทหาร|ใน|การ|ปกครอง|ประเทศ เรา|เชื่อ|ว่า|นั่น|ไม่|ใช่|ชัย|ชนะ|ของ|ประชาชน|อย่าง|แท้|จริง และ|กว่า|จะ|ถึง|วัน|นั้น|ก็|ยัง|ไม่|รู้|ว่า|จะ|มี|ผู้|ถูก|จับกุม|ดำเนิน|คดี|อีก|เท่าไหร่ จะ|เกิด|ปรากฏการณ์|ทางการ|เมือง|ที่|วุ่นวาย|จน|ใน|ที่สุด|ต้อง|ปิด|สนาม|การ|แข่งขัน|ลง|หรือ|ไม่
2 | 


--------------------------------------------------------------------------------
/src/swath.1:
--------------------------------------------------------------------------------
  1 | .\"                                      Hey, EMACS: -*- nroff -*-
  2 | .\" First parameter, NAME, should be all caps
  3 | .\" Second parameter, SECTION, should be 1-8, maybe w/ subsection
  4 | .\" other parameters are allowed: see man(7), man(1)
  5 | .TH SWATH 1 "January 2008"
  6 | .\" Please adjust this date whenever revising the manpage.
  7 | .\"
  8 | .\" Some roff macros, for reference:
  9 | .\" .nh        disable hyphenation
 10 | .\" .hy        enable hyphenation
 11 | .\" .ad l      left justify
 12 | .\" .ad b      justify to both left and right margins
 13 | .\" .nf        disable filling
 14 | .\" .fi        enable filling
 15 | .\" .br        insert line break
 16 | .\" .sp <n>    insert n+1 empty lines
 17 | .\" for manpage-specific macros, see man(7)
 18 | .SH NAME
 19 | swath \- General-purpose Thai word segmentation utility
 20 | .SH SYNOPSIS
 21 | .B swath
 22 | [options] \<\ \fIinfile\fP\ \>\ \fIoutfile\fP
 23 | .br
 24 | .SH DESCRIPTION
 25 | Thai script has no word delimiter.  Applications need to recognize word
 26 | boundaries before they can do useful things with Thai text, such as line
 27 | wrapping.
 28 | .sp
 29 | Swath provides a word analysis filter to insert word delimiters into a given
 30 | text stream.  It reads text from standard input, analyzes it for word
 31 | boundaries by consulting a Thai word list, and outputs to standard output the
 32 | same text with the predefined word delimiters inserted.
 33 | .sp
 34 | Currently, it can read plain text, HTML, RTF, LaTeX and Lambda (Unicode version
 35 | of LaTeX with Omega typesetter kernel) documents and insert common word
 36 | delimiters for each format (pipe `|' for plain text).  However, the user can
 37 | always override this with a preferred delimiter.
 38 | .SH OPTIONS
 39 | .TP
 40 | .B \-b [\fIdelimiter\fP]
 41 | Define the string to be inserted as word delimiter in the output text.
 42 | .TP
 43 | .B \-d [\fIdict-path\fP]
 44 | Specify alternative dictionary location.  \fIdict-path\fP must be either a
 45 | directory containing the swath dictionary file `swathdic.tri', or a path
 46 | to the dictionary file itself.  The dictionary file must be a trie file
 47 | prepared using the \fBtrietool\fP(1) utility from the libdatrie package.
 48 | .sp
 49 | If this option is given, \fBswath\fP will override normal dictionary search
 50 | and will exit if the given dictionary cannot be found.  Otherwise, if
 51 | \fBSWATHDICT\fP environment variable is set, it will try to open the dictionary from the
 52 | location specified by its value.  Otherwise, it will try the current working
 53 | directory, and finally the usual installed location.
 54 | .TP
 55 | .B \-f [\fIformat\fP]
 56 | Specify format of the input.  Possible formats are: html, rtf, latex, lambda.
 57 | .TP
 58 | .B \-m [\fIscheme\fP]
 59 | Choose word matching scheme when analyzing word boundaries.  Possible schemes
 60 | are `long' (for longest or greedy matching) and `max' (for maximal matching,
 61 | with least words preferred).  Maximal matching is the default value.
 62 | .TP
 63 | .B \-u \fIinput-enc\fP,\fIoutput-enc\fP
 64 | Specify encodings of the input and the output.  \fIinput-enc\fP and
 65 | \fIoutput-enc\fP can be one of 'u' (for UTF-8 encoding) and 't' (for TIS-620
 66 | encoding).  Swath will convert the character encoding as necessary.
 67 | If this option is omitted, TIS-620 encodings on both input and output are
 68 | assumed.
 69 | .TP
 70 | .B \-v, \-\-verbose
 71 | Turn on verbose mode.
 72 | .TP
 73 | .B \-help, \-\-help
 74 | Show help.
 75 | .SH ENVIRONMENT VARIABLES
 76 | .IP SWATHDICT
 77 | If specified, \fBswath\fP will search for the dictionary in this location before
 78 | the usual places (current working directory and usual installed directory,
 79 | respectively).  This value is overridden by \fB\-d\fP option.
 80 | .SH EXAMPLES
 81 | For LaTeX (to be used with babel-thai package):
 82 | .sp
 83 | $ \fBswath\fP \-f latex < thaifile.tex > thaifile.ttex
 84 | .br
 85 | $ \fBlatex\fP thaifile.ttex
 86 | .sp
 87 | For HTML (to provide web pages to web browsers that cannot wrap Thai lines
 88 | properly, but support the <wbr> tag):
 89 | .sp
 90 | $ \fBswath\fP \-f html < myweb.html > myweb-wbr.html
 91 | .sp
 92 | To preprocess a Thai UTF-8 encoded LaTeX file for babel-thai with tis620
 93 | inputenc:
 94 | .sp
 95 | $ \fBswath\fP \-f latex \-u u,t < thaifile.tex > thaifile.ttex
 96 | .br
 97 | $ \fBlatex\fP thaifile.ttex
 98 | .sp
 99 | This is equivalent to filtering with \fBiconv(1)\fP:
100 | .sp
101 | $ \fBiconv\fP \-f UTF-8 \-t TIS-620 thaifile.tex | \fBswath\fP \-f latex > 
102 | thaifile.ttex
103 | .br
104 | $ \fBlatex\fP thaifile.ttex
105 | .sp
106 | To use the longest matching scheme with a LaTeX document:
107 | .sp
108 | $ \fBswath\fP \-f latex \-m long < thaifile.tex > thaifile.ttex
109 | .br
110 | $ \fBlatex\fP thaifile.ttex
111 | .sp
112 | To use an alternative dictionary from libthai:
113 | .sp
114 | $ \fBswath\fP \-f latex \-d /usr/share/libthai/thbrk.tri < thaifile.tex >
115 | thaifile.ttex
116 | .SH AUTHOR
117 | This manual page was written by Theppitak Karoonboonyanan <theppitak@gmail.com>.
118 | 


--------------------------------------------------------------------------------
/src/filterlatex.cpp:
--------------------------------------------------------------------------------
  1 | // FilterLatex.cpp: implementation of the FilterLatex class.
  2 | //
  3 | //////////////////////////////////////////////////////////////////////
  4 | 
  5 | #include <string.h>
  6 | #include <wctype.h>
  7 | #include "filterlatex.h"
  8 | #include "worddef.h"
  9 | #include "convutil.h"
 10 | #include "utils.h"
 11 | //////////////////////////////////////////////////////////////////////
 12 | // Construction/Destruction
 13 | //////////////////////////////////////////////////////////////////////
 14 | 
 15 | FilterLatex::FilterLatex (FILE* filein, FILE* fileout,
 16 |                           bool isUniIn, bool isUniOut,
 17 |                           const wchar_t* wordBreakStr)
 18 |   : FilterX (filein, fileout, isUniIn, isUniOut, wordBreakStr),
 19 |     verbatim (false)  // not inside a verbatim environment at start
 20 | {
 21 |   buffer[0] = 0;  // empty line buffer: the first GetNextToken() reads input
 22 | }
 23 | 
 24 | static int
 25 | consumeToken (wchar_t* token, int tokenSz, wchar_t* buffer, int nChars)
 26 | {
 27 |   // Take nChars chars, capped so that the terminator still fits in token.
 28 |   const int room = tokenSz - 1;
 29 |   const int taken = (nChars < room) ? nChars : room;
 30 |   *(wcsncpy (token, buffer, taken) + taken) = 0;
 31 |   return taken;  // number of chars actually copied
 32 | }
 33 | 
 34 | bool
 35 | FilterLatex::GetNextToken (wchar_t* token, int tokenSz, bool* thaiFlag)
 36 | {  // Fills token (at most tokenSz - 1 chars) from the line buffer
 37 |   if (0 == buffer[0] && !ConvGetS (buffer, N_ELM (buffer), fpin, isUniIn))
 38 |     return false;  // buffer is empty and ConvGetS() returned false
 39 | 
 40 |   *token = 0;
 41 |   if (verbatim)  // inside verbatim: text is passed through as non-Thai
 42 |     {
 43 |       *thaiFlag = false;
 44 |       wchar_t* curPtr = buffer;
 45 |       wchar_t* stVer = wcsstr (buffer, L"\\end{verbatim}");
 46 |       if (!stVer)  // no end tag in the buffer: emit as much of it as fits
 47 |         {
 48 |           curPtr += consumeToken (token, tokenSz, buffer, wcslen (buffer));
 49 |           wmemmove (buffer, curPtr, wcslen (curPtr) + 1);  // drop consumed part
 50 |         }
 51 |       else
 52 |         {
 53 |           int verbLen = wcslen (L"\\end{verbatim}");
 54 |           if (stVer - buffer + verbLen < tokenSz)  // end tag fits in token too
 55 |             {
 56 |               stVer += verbLen;
 57 |             }
 58 |           curPtr += consumeToken (token, tokenSz, buffer, stVer - buffer);
 59 |           wmemmove (buffer, curPtr, wcslen (curPtr) + 1);
 60 |           verbatim = false;  // leave verbatim mode
 61 |         }
 62 |       return true;
 63 |     }
 64 | 
 65 |   wchar_t* beginPtr = buffer;
 66 |   wchar_t* curPtr = buffer;
 67 |   wchar_t* lastAllowed = buffer + tokenSz - 1;  // scan limit: token capacity
 68 |   wchar_t* pToken = token;
 69 | 
 70 |   while (curPtr < lastAllowed && *curPtr && iswpunct (*curPtr))
 71 |     {
 72 |       curPtr++;  // leading punctuation stays in front of the token
 73 |     }
 74 |   if (curPtr == lastAllowed || 0 == *curPtr)
 75 |     goto get_cur_token_return;  // NOTE(review): *thaiFlag is not assigned on this path
 76 | 
 77 |   *thaiFlag = isThaiUni (*curPtr);  // classify by the first non-punctuation char
 78 |   if (*thaiFlag)
 79 |     {
 80 |       // Thai run; may continue over several lines if CAT_THAI_LINES is defined
 81 |       for (;;)
 82 |         {
 83 |           while (curPtr < lastAllowed && *curPtr && isThaiUni (*curPtr))
 84 |             {
 85 |               curPtr++;
 86 |             }
 87 |           if (curPtr == lastAllowed || 0 == *curPtr)
 88 |             goto get_cur_token_return;  // token full or buffer exhausted
 89 | 
 90 |           if (*curPtr != 0 && *curPtr != L'\n')
 91 |             {
 92 |               // non-Thai char is found
 93 |               beginPtr += consumeToken (pToken, tokenSz,
 94 |                                         beginPtr, curPtr - beginPtr);
 95 |               wmemmove (buffer, beginPtr, wcslen (beginPtr) + 1);
 96 |               return true;
 97 |             }
 98 | 
 99 |           // EOL is found
100 |           if (*curPtr == L'\n')
101 |             {
102 |               *curPtr = 0;  // strip the newline; re-appended below if it fits
103 |             }
104 |           int tkLen = consumeToken (pToken, tokenSz,
105 |                                     beginPtr, curPtr - beginPtr);
106 |           pToken += tkLen;
107 |           tokenSz -= tkLen;
108 |           lastAllowed -= tkLen;
109 |           // With CAT_THAI_LINES, a next line starting with Thai text is
110 |           // concatenated to the current token; otherwise the token ends here
111 |           if (!ConvGetS (buffer, N_ELM (buffer), fpin, isUniIn))
112 |             {
113 |               if (tokenSz > 1)
114 |                 {
115 |                   wcscpy (pToken, L"\n");
116 |                 }
117 |               return true;  // the next GetNextToken() is expected to return false
118 |             }
119 |           beginPtr = curPtr = buffer;
120 | #ifdef CAT_THAI_LINES
121 |           if (!isThaiUni (*curPtr))    // next line does not start with Thai
122 | #endif
123 |             {
124 |               if (tokenSz > 1)
125 |                 {
126 |                   wcscpy (pToken, L"\n");
127 |                 }
128 |               return true;
129 |             }
130 |         }
131 |     }
132 |   else
133 |     {
134 |       while (curPtr < lastAllowed && *curPtr && !isThaiUni (*curPtr))
135 |         {
136 |           curPtr++;
137 |         }
138 | 
139 |       beginPtr += consumeToken (pToken, tokenSz, beginPtr, curPtr - beginPtr);
140 |       wmemmove (buffer, beginPtr, wcslen (beginPtr) + 1);
141 | 
142 |       if (wcsstr (token, L"\\begin{verbatim}") != NULL)
143 |         {
144 |           verbatim = true;      // enter verbatim mode for the following calls
145 |         }
146 |     }
147 | 
148 |   return true;
149 | 
150 | get_cur_token_return:
151 |   beginPtr += consumeToken (pToken, tokenSz, beginPtr, curPtr - beginPtr);
152 |   wmemmove (buffer, beginPtr, wcslen (beginPtr) + 1);  // keep the unread rest
153 |   return true;
154 | }
155 | 
156 | void
157 | FilterLatex::Print (const wchar_t* token, bool thaiFlag)
158 | {  // thaiFlag is not used here: every token is written the same way
159 |   ConvPrint (fpout, token, isUniOut);
160 |   fflush (fpout);  // flush the output stream after each token
161 | }
162 | 
163 | 


--------------------------------------------------------------------------------
/src/maxwordseg.cpp:
--------------------------------------------------------------------------------
  1 | // maxwordseg.cpp: implementation of the MaxWordSeg class.
  2 | //
  3 | //////////////////////////////////////////////////////////////////////
  4 | 
  5 | #include "maxwordseg.h"
  6 | #include "wordstack.h"
  7 | 
  8 | // Build the final break list in SepData[2]; returns that slot index (2).
  9 | int
 10 | MaxWordSeg::CreateSentence ()
 11 | {
 12 |   int stWord, enAmb;
 13 |   if (IdxSep[0] >= 0)  // a candidate list exists for position 0
 14 |     {
 15 |       stWord = 1;
 16 |       enAmb = LinkSep[IdxSep[0]];  // first end point listed for position 0
 17 |     }
 18 |   else
 19 |     {
 20 |       int i;
 21 |       for (i = 1; i < textLen && IdxSep[i] < 0; i++)
 22 |         ;  // skip positions that have no candidate list
 23 | 
 24 |       if (i == textLen)
 25 |         {
 26 |           stWord = enAmb = textLen;  // no candidate anywhere in the text
 27 |         }
 28 |       else
 29 |         {
 30 |           stWord = i + 1;
 31 |           enAmb = LinkSep[IdxSep[i]];
 32 |         }
 33 |     }
 34 | 
 35 |   int sepIdx = 0;  // next free slot in SepData[2].Sep
 36 |   int stAmb = 0;   // start of the area currently being resolved
 37 |   while (enAmb <= textLen)
 38 |     {
 39 |       bool singleWord = true;
 40 |       for (; stWord < enAmb; stWord++)
 41 |         {
 42 |           if (IdxSep[stWord] >= 0 && LinkSep[IdxSep[stWord]] > enAmb)
 43 |             {
 44 |               singleWord = false;  // a candidate inside reaches past the area
 45 |               enAmb = LinkSep[IdxSep[stWord]];  // so grow the area to its end
 46 |             }
 47 |         }
 48 |       if (IdxSep[enAmb] < 0)
 49 |         {
 50 |           // No candidate list at enAmb: absorb the following unknown stretch
 51 |           while (enAmb < textLen && IdxSep[enAmb] < 0)
 52 |             enAmb++;
 53 |           singleWord = false;
 54 |         }
 55 |       if (singleWord)
 56 |         {
 57 |           SepData[2].Sep[sepIdx++] = enAmb;  // unambiguous: record the end directly
 58 |         }
 59 |       else
 60 |         {
 61 |           int idxSen = WordSegArea (stAmb, enAmb);  // best sentence for the area
 62 |           sepIdx = saveSegment (sepIdx, idxSen, enAmb);
 63 |         }
 64 |       stAmb = enAmb;
 65 |       if (stAmb >= textLen)
 66 |         break;
 67 |       enAmb = LinkSep[IdxSep[stAmb]];
 68 |     }
 69 | 
 70 |   return 2;  // index of the SepData slot filled above
 71 | }
 72 | 
 73 | int
 74 | MaxWordSeg::saveSegment (int sepIdx, int idxSen, int lastPoint)
 75 | {
 76 |   // Append break points of sentence idxSen to the result list SepData[2],
 77 |   // stopping at the -1 terminator or at the first point past lastPoint.
 78 |   const short int *from = SepData[idxSen].Sep;
 79 |   for (int k = 0; from[k] != -1 && from[k] <= lastPoint; k++)
 80 |     {
 81 |       SepData[2].Sep[sepIdx++] = from[k];
 82 |     }
 83 | 
 84 |   return sepIdx;
 85 | }
 86 | 
 87 | int
 88 | MaxWordSeg::WordSegArea (int stSeg, int enSeg)
 89 | {  // Returns the SepData index of the lowest-score sentence for [stSeg, enSeg)
 90 |   WordStack BackTrackStack;
 91 | 
 92 |   short int senIdx = 0, bestSenIdx = 0, prevSenIdx = 0;
 93 |   short int sepIdx = 0;
 94 |   short int scoreidx;
 95 |   int bestScore = 0;
 96 |   int score[MAXSEP];  // running score after each break point of the sentence
 97 | 
 98 |   // ========================================
 99 |   // This loop builds the first sentence
100 |   //      and records the backtrack points.
101 |   // ========================================
102 |   int Idx = stSeg;
103 | 
104 |   scoreidx = -1;
105 |   SepData[0].Score = 10000;  // base score; a lower total wins (see below)
106 |   SepData[1].Score = 10000;
107 |   while (Idx < enSeg) //(LinkSep[IdxSep[Idx]]!=enSeg)
108 |     {
109 |       // found some words that start with Idx character
110 |       if (IdxSep[Idx] >= 0)
111 |         {
112 |           if (LinkSep[IdxSep[Idx] + 1] != -1)  // more than one candidate here
113 |             {
114 |               BackTrackStack.Push (WordState (Idx, 0));
115 |             }
116 |           SepData[senIdx].Sep[sepIdx++] = LinkSep[IdxSep[Idx]];
117 |           score[++scoreidx] = ++SepData[senIdx].Score;  // each word costs 1
118 |           Idx = LinkSep[IdxSep[Idx]];
119 |         }
120 |       else
121 |         {
122 |           // at Idx there is no word in the dictionary
123 |           while (Idx < enSeg && IdxSep[Idx] < 0)
124 |             Idx++;
125 |           SepData[senIdx].Sep[sepIdx++] = Idx;
126 |           SepData[senIdx].Score += 5;  // penalty for an unknown stretch
127 |           score[++scoreidx] = SepData[senIdx].Score;
128 |         }
129 |     }
130 |   SepData[senIdx].Sep[sepIdx] = -1;  // terminate the break list
131 |   bestScore = SepData[senIdx].Score;
132 |   bestSenIdx = senIdx++;
133 |   //================================================
134 |   // Create all possible sentences by backtracking
135 |   // (uses an explicit stack instead of recursion)
136 |   //================================================
137 |   int looptime = 0;
138 |   while (!BackTrackStack.Empty () && looptime++ <= 200)  // bounded search
139 |     {
140 |       bool stopCreate = false;
141 |       WordState wState = BackTrackStack.Top ();
142 |       BackTrackStack.Pop ();
143 | 
144 |       wState.branchState++;  // try the next candidate at this position
145 |       int curState = LinkSep[IdxSep[wState.backState] + wState.branchState];
146 |       if (curState == -1)  // no more candidates at this position
147 |         continue;
148 | 
149 |       BackTrackStack.Push (wState);
150 |       // create the new sentence from the previous one and restore its score
151 |       // (1st parameter of copySepData is prevSenIdx, not bestSenIdx)
152 |       short int nextSepIdx = copySepData (prevSenIdx, senIdx, wState.backState);
153 |       scoreidx = nextSepIdx - 1;
154 |       SepData[senIdx].Score = (scoreidx < 0) ? 10000 : score[scoreidx];
155 | 
156 |       // loop for filling the remaining break points of the new sentence
157 |       while (curState != enSeg)
158 |         {
159 |           SepData[senIdx].Sep[nextSepIdx++] = curState;
160 |           SepData[senIdx].Score++;
161 |           if (IdxSep[curState] < 0)
162 |             {
163 |               //unknown string, find its boundary and add penalty
164 |               while (curState < enSeg && IdxSep[curState] < 0)
165 |                 curState++;
166 |               SepData[senIdx].Score += 5;
167 |             }
168 |           else
169 |             {
170 |               if (LinkSep[IdxSep[curState] + 1] != -1)
171 |                 {
172 |                   //another branch exists, push backtrack state
173 |                   BackTrackStack.Push (WordState (curState, 0));
174 |                 }
175 |               curState = LinkSep[IdxSep[curState]];
176 |             }
177 |           score[++scoreidx] = SepData[senIdx].Score;
178 |           if (SepData[senIdx].Score >= bestScore - 1)  // 1 more is added at the end
179 |             {
180 |               prevSenIdx = senIdx;
181 |               stopCreate = true;  // cannot beat bestScore: abandon this sentence
182 |               break;
183 |             }
184 |         }
185 |       if (stopCreate)
186 |         continue;
187 |       SepData[senIdx].Sep[nextSepIdx++] = enSeg;
188 |       SepData[senIdx].Score++;
189 |       SepData[senIdx].Sep[nextSepIdx] = -1;
190 |       prevSenIdx = senIdx;
191 |       if (SepData[senIdx].Score < bestScore)
192 |         {
193 |           bestScore = SepData[senIdx].Score;
194 |           short int tmpidx = bestSenIdx;  // swap: old best slot becomes scratch
195 |           bestSenIdx = senIdx;
196 |           senIdx = tmpidx;
197 |         }
198 |     }
199 | 
200 |   return bestSenIdx;
201 | }
202 | 


--------------------------------------------------------------------------------
/src/abswordseg.cpp:
--------------------------------------------------------------------------------
  1 | // abswordseg.cpp: implementation of the AbsWordSeg class.
  2 | //
  3 | //////////////////////////////////////////////////////////////////////
  4 | 
  5 | #include "abswordseg.h"
  6 | #include <wctype.h>
  7 | 
  8 | //////////////////////////////////////////////////////////////////////
  9 | // Construction/Destruction
 10 | //////////////////////////////////////////////////////////////////////
 11 | 
 12 | AbsWordSeg::~AbsWordSeg ()
 13 | {  // empty body: nothing is released here
 14 | }
 15 | 
 16 | void
 17 | AbsWordSeg::CreateWordList (const Dict* dict)
 18 | {  // Fills IdxSep[] / LinkSep[] with candidate break positions for text
 19 |   int cntLink = 0;  // next free slot in LinkSep
 20 |   Dict::State* curState = dict->root();
 21 | 
 22 |   int i = 0;
 23 |   while (i < textLen)
 24 |     {
 25 |       wchar_t lead_ch = text[i];
 26 | 
 27 |       if (!IsLeadChar (lead_ch) || (i > 0 && !IsLastChar (text[i - 1])))
 28 |         {
 29 |           // Unleadable chars
 30 |           IdxSep[i++] = -2;      // cannot lead an unknown word
 31 |         }
 32 |       else if (iswpunct (lead_ch))
 33 |         {
 34 |           // Chunk of punctuation marks
 35 |           IdxSep[i] = cntLink;
 36 |           while (++i < textLen && iswpunct (text[i]))
 37 |             {
 38 |               IdxSep[i] = -1;  // inside the chunk: no candidate starts here
 39 |             }
 40 |           LinkSep[cntLink++] = i;   // the chunk's single end point
 41 |           LinkSep[cntLink++] = -1;  // terminate this position's list
 42 |         }
 43 |       else if (iswdigit (lead_ch) || isThaiUniDigit (lead_ch))
 44 |         {
 45 |           // Chunk of numbers
 46 |           IdxSep[i] = cntLink;
 47 |           while (++i < textLen)
 48 |             {
 49 |               wchar_t wc = text[i];
 50 |               if (!iswdigit (wc) && !isThaiUniDigit (wc)
 51 |                   && wc != L'.' && wc != L',')  // '.' and ',' stay in the number
 52 |                 break;
 53 | 
 54 |               IdxSep[i] = -1;
 55 |             }
 56 |           LinkSep[cntLink++] = i;
 57 |           LinkSep[cntLink++] = -1;
 58 |         }
 59 |       else if (!isThaiUni (lead_ch))
 60 |         {
 61 |           // Chunk of non-Thai characters
 62 |           IdxSep[i] = cntLink;
 63 |           while (++i < textLen && !isThaiUni (text[i]))
 64 |             {
 65 |               IdxSep[i] = -1;
 66 |             }
 67 |           LinkSep[cntLink++] = i;
 68 |           LinkSep[cntLink++] = -1;
 69 |         }
 70 |       else
 71 |         {
 72 |           // Thai character: find breakabilities starting at text[i]
 73 |           IdxSep[i] = -1;  // stays -1 if no dictionary word is accepted here
 74 |           bool isWordFound = false;
 75 |           curState->rewind ();  // presumably restarts the walk at the root -- TODO confirm
 76 |           for (int j = i; j < textLen; j++)
 77 |             {
 78 |               if (text[j] == 0x0e46 && isWordFound)
 79 |                 {
 80 |                   // Mai-Ya-Mok after a found word: move the latest break past it
 81 |                   LinkSep[cntLink - 1] = j + 1;
 82 |                   break;
 83 |                 }
 84 |               if (!curState->walk (text[j]))
 85 |                 break;
 86 |               if (curState->isTerminal ())
 87 |                 {
 88 |                   // found a word in the dictionary;
 89 |                   // check whether a break is acceptable after text[j]
 90 |                   if (IsLeadChar (text[j + 1]) && !HasKaran (&text[j]))
 91 |                     {
 92 |                       if (!isWordFound)
 93 |                         {
 94 |                           IdxSep[i] = cntLink;  // first entry of this position's list
 95 |                           isWordFound = true;
 96 |                         }
 97 |                       LinkSep[cntLink++] = j + 1;
 98 |                     }
 99 |                 }
100 |             }
101 |           if (isWordFound)
102 |             {
103 |               // reverse the list so that the longest word's end comes first
104 |               for (int lower = IdxSep[i], upper = cntLink - 1;
105 |                    lower < upper;
106 |                    ++lower, --upper)
107 |                 {
108 |                   short int tmp = LinkSep[lower];
109 |                   LinkSep[lower] = LinkSep[upper];
110 |                   LinkSep[upper] = tmp;
111 |                 }
112 |               LinkSep[cntLink++] = -1;  // terminate this position's list
113 |             }
114 |           ++i;
115 |         }
116 |     }
117 |   delete curState;  // the state obtained from dict->root() is freed here
118 | }
119 | 
120 | bool
121 | AbsWordSeg::IsLeadChar (wchar_t wc)
122 | {
123 |   if (wc < 0x007f) return true;                  // code points below DEL
124 |   if (wc >= 0x0e01 && wc <= 0x0e2e) return true; // KO KAI .. HO NOKHUK
125 |   if (wc >= 0x0e3f && wc <= 0x0e44) return true; // BAHT .. MAIMALAI
126 |   return wc >= 0x0e4f && wc <= 0x0e5b;           // FONGMAN .. KHOMUT
127 | }
128 | 
129 | bool
130 | AbsWordSeg::IsLastChar (wchar_t wc)
131 | {
132 |   return (wc < 0x007f)                      // All ASCII
133 |          || (0x0e01 <= wc && wc <= 0x0e3f)  // KO KAI .. BAHT
134 |          || (0x0e45 <= wc && wc <= 0x0e5b); // LAKKHANGYAO .. KHOMUT
135 | }
136 | 
137 | bool
138 | AbsWordSeg::HasKaran (const wchar_t* sen_ptr)
139 | {
140 |   for (int i = 1; i <= 3 && sen_ptr[i] != 0; i++)
141 |     {
142 |       if (sen_ptr[i] == 0x0e4c) // THANTHAKHAT
143 |         return true;
144 |     }
145 |   return false;
146 | }
147 | 
148 | int
149 | AbsWordSeg::WordSeg (const Dict* dict, const wchar_t* senstr,
150 |                      short int* outSeps, int outSepsSz)
151 | {
152 |   int bestidx;
153 | 
154 |   wcscpy (text, senstr);
155 |   textLen = wcslen (senstr);
156 |   CreateWordList (dict);
157 |   bestidx = CreateSentence ();
158 |   return GetBestSen (bestidx, outSeps, outSepsSz);
159 | }
160 | 
161 | // =======================================================
162 | // function that copy previous seperation point from
163 | // idxSen-1 to idxSen (copy from idx=0 to idx that
164 | // has value= sepPoint).
165 | // Return index of next seperation point that will be fill.
166 | // ========================================================
167 | unsigned short int
168 | AbsWordSeg::copySepData (short int sourceIdxSen, short int targetIdxSen,
169 |                          short int sepPoint)
170 | {
171 |   short int i = 0;
172 |   if (sourceIdxSen == targetIdxSen)
173 |     {
174 |       while (SepData[sourceIdxSen].Sep[i] <= sepPoint)
175 |         {
176 |           i++;
177 |           if (SepData[sourceIdxSen].Sep[i - 1] == sepPoint)
178 |             break;
179 |         }
180 |     }
181 |   else
182 |     {
183 |       while (SepData[sourceIdxSen].Sep[i] <= sepPoint)
184 |         {
185 |           SepData[targetIdxSen].Sep[i] = SepData[sourceIdxSen].Sep[i];
186 |           i++;
187 |           if (SepData[sourceIdxSen].Sep[i - 1] == sepPoint)
188 |             break;
189 |         }
190 |     }
191 |   return i;
192 | }
193 | 
194 | int
195 | AbsWordSeg::GetBestSen (int bestidx, short int* outSeps, int outSepsSz)
196 | {
197 |   int t;
198 | 
199 |   for (t = 0; t < outSepsSz && SepData[bestidx].Sep[t] != textLen; t++)
200 |     {
201 |       outSeps[t] = SepData[bestidx].Sep[t];
202 |     }
203 | 
204 |   return t;
205 | }
206 | 


--------------------------------------------------------------------------------
/data/tdict-proper.txt:
--------------------------------------------------------------------------------
  1 | กรีนพีซ
  2 | กฤษณะ
  3 | กวนอิม
  4 | กวนอู
  5 | กัดดาฟี
  6 | กันตนา
  7 | กัลยาณวัตร
  8 | กัสสปะ
  9 | กามนิต
 10 | การ์เร็ต
 11 | กาลิเลโอ
 12 | กินเนส
 13 | กินเนสส์
 14 | กิฟฟารีน
 15 | กุบไล
 16 | กุมภกรรณ
 17 | กูเตนเบิร์ก
 18 | เกตส์
 19 | เกรซ
 20 | เกษมณี
 21 | เการพ
 22 | แกรมมี่
 23 | โกณฑัญญะ
 24 | โกโบริ
 25 | โกเศศ
 26 | โกษา
 27 | โกษาธิบดี
 28 | ขงเบ้ง
 29 | คริสตินา
 30 | คริสโตเฟอร์
 31 | คลินตัน
 32 | คลีโอพัตรา
 33 | คองคอร์ด
 34 | คอนสแตนติน
 35 | คอร์เนลล์
 36 | คอลเกต
 37 | คาเธ่ย์
 38 | คานธี
 39 | คาเบรียล
 40 | คาร์ฟูร์
 41 | คาร์สัน
 42 | คาราบาว
 43 | คาสิโอ
 44 | คิริน
 45 | คุนลุ้น
 46 | คูโบต้า
 47 | เคปเลอร์
 48 | เครมลิน
 49 | แคทรีนา
 50 | แคนนอน
 51 | โคดม
 52 | โคตมะ
 53 | โคตมี
 54 | โคลัมบัส
 55 | ไคฟง
 56 | ไครสเลอร์
 57 | ง้อไบ๊
 58 | จตุพร
 59 | จ็อบส์
 60 | จอย
 61 | จอร์จ
 62 | จอห์น
 63 | จันทรา
 64 | จิ้น
 65 | จิ๋นซี
 66 | จิม
 67 | จิ๋ม
 68 | จิว
 69 | จุฬาภรณ์
 70 | จุฬาลงกรณ์
 71 | จูปิเตอร์
 72 | จูเลียต
 73 | จูเลียส
 74 | เจงกิส
 75 | เจงกีส
 76 | เจตริน
 77 | เจนี่
 78 | เจฟฟรีย์
 79 | เจมส์
 80 | เจียไต๋
 81 | แจ็กสัน
 82 | โจเซฟ
 83 | โจว
 84 | โจอี้
 85 | ฉิม
 86 | ชมัยมรุเชฐ
 87 | ชมัยมรุเชษฐ์
 88 | ชเวดากอง
 89 | ชัชชาติ
 90 | ชาร์ลส์
 91 | ชาร์ลี
 92 | ชินราช
 93 | ชินวัตร
 94 | ชุนชิว
 95 | เช็ง
 96 | เชตวัน
 97 | เชฟรอน
 98 | เชฟโรเลต
 99 | เชลซี
100 | ไชยานุชิต
101 | ซ่ง
102 | ซังฮี้
103 | ซัดดัม
104 | ซันซิล
105 | ซัมซุง
106 | ซัวเจ๋ง
107 | ซัสโก้
108 | ซาราห์
109 | ซินหัว
110 | ซีซาร์
111 | ซีแพค
112 | ซีม่า
113 | ซูจี
114 | ซูซาน
115 | ซูซูกิ
116 | ซูบารุ
117 | ซูฮาร์โต
118 | เซ็นทารา
119 | เซบาสเตียน
120 | เซโรงัง
121 | เซเวน
122 | เซเว่น
123 | โซฟิเทล
124 | โซฟี
125 | โซยุซ
126 | โซยูซ
127 | ญาญ่า
128 | ณเดชน์
129 | ณัฐวุฒิ
130 | ดักลาส
131 | ดัชชี่
132 | ดัชมิลล์
133 | ดาร์ลี่
134 | ดาวโจนส์
135 | ดิสนีย์
136 | ดีแทค
137 | ดูปองท์
138 | เดโมแครต
139 | เดลล์
140 | เดลินิวส์
141 | เดวิด
142 | แดวู
143 | โดนัลด์
144 | โดราเอมอน
145 | โดเรมอน
146 | ไดอานา
147 | ต๋อง
148 | ตะเบ็งชะเวตี้
149 | ตั๊กม้อ
150 | ตากสิน
151 | ตาเมือน
152 | ตาลีบัน
153 | ตูน
154 | เตมีย์
155 | เตมูจิน
156 | โต๋
157 | โตชิบา
158 | โตโน่
159 | โตโยต้า
160 | โตราห์
161 | ถังซัมจั๋ง
162 | ถังซำจั๋ง
163 | ทรพา
164 | ทรัมป์
165 | ทราเวล
166 | ทรูมูฟ
167 | ทักษอร
168 | ทาร์ซาน
169 | ทีปังกร
170 | เทปโก
171 | เทพรัตน
172 | เทวทัต
173 | เทสโก้
174 | เทสลา
175 | โทมัส
176 | ไททานิก
177 | ไททานิค
178 | ไทยรัฐ
179 | ธนญชัย
180 | ธีออส
181 | นครินทรา
182 | นโปเลียน
183 | นพดล
184 | นราดูร
185 | นเรนทร
186 | นอสตราดามุส
187 | นันยาง
188 | นาคะ
189 | นาซา
190 | นาซ่า
191 | นาซี
192 | นาโต
193 | นาโต้
194 | นาธาน
195 | นาลแก
196 | นิกเกอิ
197 | นิคอน
198 | นิโคลัส
199 | นิด้า
200 | นินเทนโด
201 | นิปปอน
202 | นิพิฏฐ์
203 | นิวตัน
204 | นิสสัน
205 | เนคเทค
206 | เนชั่น
207 | เนชันแนล
208 | เนชั่นแนล
209 | เนวิน
210 | เนสท์เล่
211 | เนสเล่
212 | เนสาด
213 | แนท
214 | แนสแดค
215 | โนเกีย
216 | โนเบล
217 | โนเวลล์
218 | โนโวเทล
219 | ไนเม็กซ์
220 | บรอดเวย์
221 | บรัดเลย์
222 | บรู๊ซ
223 | บรูตัส
224 | บลูมเบิร์ก
225 | บอย
226 | บัลเมอร์
227 | บารัก
228 | บารัค
229 | บีโทเฟน
230 | บีโทเฟ่น
231 | บีโธเฟน
232 | บีโธเฟ่น
233 | บุเรงนอง
234 | บู๊ตึ๊ง
235 | เบญกาย
236 | เบตาดีน
237 | เบนซ์
238 | เบ็นซ์
239 | เบนจามิน
240 | เบนโล
241 | เบเนดิกต์
242 | เบลล่า
243 | เบอร์นาร์ด
244 | โบตัน
245 | โบนันซ่า
246 | โบอิ้ง
247 | ไบโอเทค
248 | ประชาธิปัตย์
249 | ปรีดิยาธร
250 | ปวีณา
251 | ปอเต็กตึ๊ง
252 | ปอเต๊กตึ๊ง
253 | ป่อเต็กตึ๊ง
254 | ปอร์เช่
255 | ปัตตะโชติ
256 | ปันยารชุน
257 | ปาเจโร่
258 | ปาณฑพ
259 | ปารุสก์
260 | ปีเตอร์
261 | ปูติน
262 | เป๊ปซี่
263 | เป้ย
264 | เปอร์โยต์
265 | เปาบุ้นจิ้น
266 | เปาโล
267 | โปเกมอน
268 | โป๊ยก่าย
269 | พงศพัศ
270 | พรหมทัต
271 | พลาโต
272 | พอตเตอร์
273 | พอลล่า
274 | พัชราภา
275 | พานาโซนิค
276 | พานาโซนิก
277 | พิเชษฐ์
278 | พิทยานุกูล
279 | พิมพิสาร
280 | เพนแทกซ์
281 | เพลโต
282 | เพิร์ล
283 | แพทริเซีย
284 | แพทริออต
285 | โพไซดอน
286 | ไพโอเนียร์
287 | ฟรอยด์
288 | ฟรังซิส
289 | ฟรานซิส
290 | ฟลอเรนซ์
291 | ฟอร์ด
292 | ฟอลคอน
293 | ฟิลิปส์
294 | ฟีฟ่า
295 | ฟูจิ
296 | เฟอร์รารี
297 | แฟซ่า
298 | แฟนต้า
299 | โฟร์โมสต์
300 | ภาคิน
301 | ภีษมะ
302 | ภูมิพล
303 | ภูริทัต
304 | มงฟอร์ต
305 | มณโฑ
306 | มติชน
307 | มหิตลาธิเบศร
308 | มโหสถ
309 | มะกะโท
310 | มังตรา
311 | มังระ
312 | มังราย
313 | มัจฉานุ
314 | มาร์กซ์
315 | มาร์กาเร็ต
316 | มาร์ติน
317 | มาสด้า
318 | มิชลิน
319 | มิชเชลลิน
320 | มิเชล
321 | มิตซูบิชิ
322 | มิราเคิล
323 | มิรินด้า
324 | มุสโสลินี
325 | มูบารัก
326 | มูบารัค
327 | มูฮัมหมัด
328 | เม้ง
329 | เม็งราย
330 | เมจิ
331 | เมย์
332 | เมลาเนีย
333 | เมอร์ซีเดส
334 | เมอร์เซเดส
335 | เมาคลี
336 | แมกซ์เวลล์
337 | แมกไซไซ
338 | แม็กนัม
339 | แม็กนั่ม
340 | แม็กโนเลีย
341 | แมคอินทอช
342 | แมชีนเนอรี่
343 | โมกขศักดิ์
344 | โมคคัลลานะ
345 | โมซาร์ท
346 | โมโตโรลา
347 | โมโตโรล่า
348 | โมเนีย
349 | โมโนไทป์
350 | โมริยะ
351 | โมสาร์ต
352 | โมสาร์ท
353 | โมอาย
354 | โมฮัมหมัด
355 | ไมเคิล
356 | ไมยราพณ์
357 | ยโสธรา
358 | ยะโฮวา
359 | ยะโฮวาห์
360 | ยาคูลท์
361 | ยามาฮ่า
362 | ยาเวห์
363 | ยาฮู
364 | ยูคลิด
365 | ยูคลิเดียน
366 | ยูนิเซฟ
367 | ยูเนสโก
368 | ยูฟ่า
369 | ยูไล
370 | เยโฮวาห์
371 | รอยเตอร์
372 | รอยัล
373 | รัชดา
374 | รัสเซลล์
375 | รัสปูติน
376 | ราณี
377 | ราฟาเอล
378 | รามาวตาร
379 | ราเมศวร
380 | ราหุล
381 | ริชาร์ด
382 | รีพับลิกัน
383 | รูนีย์
384 | เรนโบว์
385 | แรมโบ้
386 | โรตารี
387 | โรนัลโด
388 | โรนัลโด้
389 | โรบินสัน
390 | โรเบิร์ต
391 | โรมานอฟ
392 | โรเล็กซ์
393 | โรส
394 | ล็อกซเล่ย์
395 | ลอเรนซ์
396 | ลอว์เรนซ์
397 | ลอว์สัน
398 | ลัมโบกินี
399 | ลัมโบร์กินี
400 | ลิงคอล์น
401 | ลิจฉวี
402 | ลิไท
403 | ลิไทย
404 | ลินคอล์น
405 | ลิเวอร์พูล
406 | ลิสเตอรีน
407 | เลกซัส
408 | เล็กซัส
409 | เลโนโว
410 | เลียดก๊ก
411 | แลมบอร์กินี
412 | โลตัส
413 | วชิราลงกรณ์
414 | วลาดิเมียร์
415 | วอลล์สตรีท
416 | วอลโว่
417 | วัชรพล
418 | วัตสัน
419 | วาดะห์
420 | วาเลนไทน์
421 | วิกตอเรีย
422 | วิคตอเรีย
423 | วิชั่นส์
424 | วิชาเยนทร์
425 | วิทยานุสรณ์
426 | วิทยายน
427 | วิมเบิลดัน
428 | วิลเลียม
429 | วีนัส
430 | วีระ
431 | วุฒิชัย
432 | เวสปอยต์
433 | เวียดกง
434 | เวียร์
435 | แวร์ซายส์
436 | ไวตามิลค์
437 | ศกุนตลา
438 | ศศินทร์
439 | ศิริพงษ์
440 | ศิริราช
441 | ศิลปสาร
442 | ศุภชลาศัย
443 | สดกก๊อกธม
444 | สดายุ
445 | สตาร์บัคส์
446 | สตาลิน
447 | สตีฟ
448 | สตีเฟน
449 | สตีเฟ่น
450 | สตีเวน
451 | สตีเว่น
452 | สแตนฟอร์ด
453 | สไปรท์
454 | สมิติเวช
455 | สรยุทธ์
456 | สโรชา
457 | สวรินทิรา
458 | สอพินยา
459 | สังกัจจายน์
460 | สาทิตย์
461 | สารีบุตร
462 | สิริกิติ์
463 | สิรินธร
464 | สิหิงค์
465 | สีวลี
466 | สีหนุ
467 | สีหมุนี
468 | สีหโมนี
469 | สุครีพ
470 | สุทโธทนะ
471 | สุเทพ
472 | สุนทราภรณ์
473 | สุนีย์
474 | สุรชัย
475 | สุรนารี
476 | สุรยุทธ์
477 | สุรศักดิ์
478 | สุริยาสน์
479 | สุริโยทัย
480 | สุวิทย์
481 | เสฐียร
482 | เส้าหลิน
483 | เสิ่นโจว
484 | โสกราตีส
485 | โสภิต
486 | โสรัจจะ
487 | หนุมาน
488 | หลินฮุ่ย
489 | หลุยส์
490 | หั่งเส็ง
491 | หัวเว่ย
492 | หัวเหว่ย
493 | เห้งเจีย
494 | ไหหม่า
495 | องคต
496 | องคุลิมาล
497 | อชาตศัตรู
498 | อดัม
499 | อดุลยเดช
500 | อพอลโล
501 | อริสโตเติล
502 | อริสมันต์
503 | อลองพญา
504 | อลิซาเบธ
505 | อเล็กซานเดอร์
506 | อวานี
507 | อองซาน
508 | อ๋อม
509 | ออยเลอร์
510 | ออร์คิด
511 | ออสการ์
512 | อะแซหวุ่นกี้
513 | อะธีนา
514 | อะพอลโล
515 | อังศุมาลิน
516 | อับราฮัม
517 | อั้ม
518 | อัลกออิดะห์
519 | อัลคาเทล
520 | อัลจาซีราห์
521 | อัลเฟรด
522 | อัลเลาะห์
523 | อัสซุส
524 | อัสสชิ
525 | อัสสัมชัญ
526 | อาเซม
527 | อาเซ็ม
528 | อาเซียน
529 | อาเธอร์
530 | อาฟต้า
531 | อาร์เซนอล
532 | อาลีบาบา
533 | อิเกีย
534 | อิชิตัน
535 | อินทรชิต
536 | อินทรปาลิต
537 | อินทรา
538 | อินทราทิตย์
539 | อิเลียด
540 | อิศรา
541 | อิเหนา
542 | อีซูซุ
543 | อีฟ
544 | อีเลฟเวน
545 | อีเลฟเว่น
546 | อีสป
547 | อุณรุท
548 | อุบลรัตน์
549 | อุบาลี
550 | อุปกิต
551 | อุ๋ย
552 | อุรัสยา
553 | อุศเรน
554 | เอกทัศน์
555 | เอเซอร์
556 | เอ็ดเวิร์ด
557 | เอดิสัน
558 | เอแบค
559 | เอเปค
560 | เอฟเวอร์ตัน
561 | เอลิซาเบธ
562 | เอสพลานาด
563 | เอสพลานาร์ด
564 | แอคคอร์
565 | แอคคอร์ด
566 | แองเจลิน่า
567 | แอน
568 | แอนน์
569 | แอนดรูว์
570 | แอ๋ม
571 | แอมบาสซาเดอร์
572 | แอมบาสเดอร์
573 | แอมเวย์
574 | แอ๋ว
575 | โอดีสซีย์
576 | โอเดียน
577 | โอบามา
578 | โอเปก
579 | โอรสาราม
580 | โอลิมเปีย
581 | โออิชิ
582 | ไอซิส
583 | ไอน์สไตน์
584 | ไอเฟล
585 | ฮอนด้า
586 | ฮอปกินส์
587 | ฮอลลีวูด
588 | ฮอลลีวู้ด
589 | ฮั่งเส็ง
590 | ฮานามิ
591 | ฮามาส
592 | ฮาร์เบอร์
593 | ฮาร์โมนี
594 | ฮาร์วาร์ด
595 | ฮิซบอลเลาะห์
596 | ฮิซบอลลาห์
597 | ฮิซบุลลอฮ์
598 | ฮิตเลอร์
599 | ฮิตาชิ
600 | ฮิลลารี
601 | ฮิวโก
602 | ฮิวโก้
603 | ฮิสบอลเลาะห์
604 | ฮิสบอลลาห์
605 | ฮุนเซน
606 | ฮุนเซ็น
607 | ฮุนได
608 | ฮุสเซน
609 | ฮุสเซ็น
610 | เฮซบอลเลาะห์
611 | เฮนรี
612 | เฮนรี่
613 | เฮเลน
614 | เฮอร์คิวลิส
615 | แฮร์รี
616 | แฮร์รี่
617 | โฮจิมินห์
618 | โฮป
619 | โฮปเวลล์
620 | โฮเมอร์
621 | 


--------------------------------------------------------------------------------
/src/wordseg.cpp:
--------------------------------------------------------------------------------
  1 | // ==========================================
  2 | // Author: Chaivit Poovanich
  3 | // October 14, 1998
  4 | // Modified by : Paisarn Charoenpornsawat
  5 | // 1999
  6 | // ==========================================
  7 | // New features.
  8 | //       1. Support many formats: LaTeX, HTML,
  9 | //               Text, RTF (for Opensource, Emacs 1999)
 10 | //       2. Remove garbage characters.
 11 | //               (for a. virach requirement 2000)
 12 | //       3. Select wordseg algorithms, Bi-gram
 13 | //           Maximal Matching, Longest Matching
 14 | //               (for Opensource, Emacs 1999)
 15 | // ===========================================
 16 | 
 17 | 
 18 | #include <stdio.h>
 19 | #include <stdlib.h>
 20 | #include <string.h>
 21 | #include <wchar.h>
 22 | #include <wctype.h>
 23 | 
 24 | #include "worddef.h"
 25 | #include "filefilter.h"
 26 | #include "filterx.h"
 27 | #include "longwordseg.h"
 28 | #include "maxwordseg.h"
 29 | #include "convutil.h"
 30 | #include "utils.h"
 31 | 
 32 | enum TextToken {
 33 |   TTOK_EOT    = -1,
 34 |   TTOK_WSPACE = 0,
 35 |   TTOK_THAI   = 1,
 36 |   TTOK_ETC    = 2,
 37 | };
 38 | 
 39 | static TextToken SplitToken (wchar_t** str, wchar_t* token);
 40 | 
 41 | static bool
 42 | OpenDict (Dict* dict, const char* dictpath)
 43 | {
 44 |   // Dict search order:
 45 |   // 1. dictpath, if not NULL, exit on failure
 46 |   // 2. ${SWATHDICT} env, if not NULL, continue on failure
 47 |   // 3. current dir, continue on failure
 48 |   // 4. WORDSEGDATADIR macro
 49 |   if (dictpath)
 50 |     {
 51 |       if (dict->open (dictpath))
 52 |         return true;
 53 |     }
 54 |   else
 55 |     {
 56 |       dictpath = getenv ("SWATHDICT");
 57 |       if (dictpath && dict->open (dictpath))
 58 |         return true;
 59 | 
 60 |       if (dict->open ("."))
 61 |         return true;
 62 | 
 63 |       if (dict->open (WORDSEGDATA_DIR))
 64 |         return true;
 65 |     }
 66 | 
 67 |   return false;
 68 | }
 69 | 
 70 | static AbsWordSeg*
 71 | InitWordSegmentation (const char* method)
 72 | {
 73 |   if (method)
 74 |     {
 75 |       if (strcmp (method, "long") == 0)
 76 |         return new LongWordSeg;
 77 |       else if (strcmp (method, "max") == 0)
 78 |         return new MaxWordSeg;
 79 |     }
 80 | 
 81 |   // fallback
 82 |   return new MaxWordSeg;
 83 | }
 84 | 
 85 | static void
 86 | ExitWordSegmentation (AbsWordSeg* wseg)
 87 | {
 88 |   delete wseg;
 89 | }
 90 | 
 91 | static void
 92 | WordSegmentation (AbsWordSeg* wseg, const Dict* dict, const wchar_t* wbr,
 93 |                   const wchar_t* line, wchar_t* output, int outputSz)
 94 | {
 95 |   short int *seps = new short int [outputSz];
 96 |   int nSeps = wseg->WordSeg (dict, line, seps, outputSz);
 97 | 
 98 |   const wchar_t* pLine = line;
 99 |   int wbrLen = wcslen (wbr);
100 |   for (int i = 0; i < nSeps; i++)
101 |     {
102 |       const wchar_t* wordEnd = line + seps[i];
103 |       int outLen = wordEnd - pLine;
104 |       if (outLen >= outputSz)
105 |         break;
106 |       wcsncpy (output, pLine, outLen);
107 |       output[outLen] = 0;
108 |       output += outLen;
109 |       outputSz -= outLen;
110 | 
111 |       if (wbrLen >= outputSz)
112 |         break;
113 |       wcsncpy (output, wbr, wbrLen);
114 |       output += wbrLen;
115 |       outputSz -= wbrLen;
116 | 
117 |       pLine = wordEnd;
118 |     }
119 | 
120 |   wcsncpy (output, pLine, outputSz);
121 | 
122 |   delete[] seps;
123 | }
124 | 
125 | static void
126 | Version ()
127 | {
128 |   printf ("swath " VERSION "\n");
129 |   printf
130 |     ("Copyright (C) 2001-2006 Phaisarn Charoenpornsawat <phaisarn@nectec.or.th>\n"
131 |      "Copyright (C) 2001-2014 Theppitak Karoonboonyanan <theppitak@gmail.com>\n"
132 |      "License: GNU GPL version 2 or later <http://gnu.org/licenses/gpl-2.0.html>\n"
133 |      "This is free software; you are free to change and redistribute it.\n"
134 |      "There is NO WARRANTY, to the extent permitted by law.\n");
135 | }
136 | 
// Print the command-line synopsis to stderr.  When VERBOSE is non-zero the
// synopsis is followed by a description of every option.
// The text lists all options that main() accepts, including -V|--version.
static void
Usage (int verbose)
{
  fprintf (stderr,
           "Usage: swath [mule] [-v|--verbose] [-b \"delimiter\"] [-d dict-dir]\n"
           "       [-f html|rtf|latex|lambda] [-m long|max] [-u {u|t},{u|t}]\n"
           "       [-V|--version] [-h|[-]-help]\n");
  if (!verbose)
    return;

  fprintf (stderr,
           "Options:\n"
           "\tmule : for use with mule\n"
           "\t-v   : verbose mode\n"
           "\t-b   : define a word delimiter string for the output\n"
           "\t-d   : specify dictionary path\n"
           "\t-f   : specify format of the input\n"
           "\t\thtml     : HTML file\n"
           "\t\trtf      : RTF file\n"
           "\t\tlatex    : LaTeX file\n"
           "\t\tlambda   : The input and output are same as latex, except that\n"
           "\t\t           the word delimiter is ^^^^200b\n"
           "\t-m   : choose word matching scheme when analyzing\n"
           "\t\tlong     : longest matching scheme\n"
           "\t\tmax      : maximal matching scheme\n"
           "\t-u   : specify encodings of input and output in 'i,o' form,\n"
           "\t       for input and output respectively, where 'i', 'o' is one of:\n"
           "\t\tu        : The input/output is in UTF-8\n"
           "\t\tt        : The input/output is in TIS-620\n"
           "\t-V   : display version information and exit\n"
           "\t-help: display this help message\n");
}
167 | 
168 | #ifndef WORDSEGDATA_DIR
169 | #define WORDSEGDATA_DIR "/usr/local/lib/wordseg"
170 | #endif
171 | 
172 | int
173 | main (int argc, char* argv[])
174 | {
175 |   const char* wbr = NULL;
176 |   const char* dictpath = NULL;
177 |   const char* method = NULL;
178 |   const char* fileformat = NULL;
179 |   const char* unicode = NULL;
180 |   bool verbose = false;
181 |   bool muleMode;
182 |   bool thaiFlag;
183 | 
184 |   muleMode = false;
185 |   for (int iargc = 1; iargc < argc; iargc++)
186 |     {
187 |       if (strcmp ("mule", argv[iargc]) == 0)
188 |         {
189 |           muleMode = true;
190 |           verbose = false;
191 |         }
192 |       else if (strcmp ("-b", argv[iargc]) == 0 && iargc + 1 < argc)
193 |         {
194 |           wbr = argv[++iargc];
195 |         }
196 |       else if (strcmp ("-d", argv[iargc]) == 0 && iargc + 1 < argc)
197 |         {
198 |           dictpath = argv[++iargc];
199 |         }
200 |       else if (strcmp ("-f", argv[iargc]) == 0 && iargc + 1 < argc)
201 |         {
202 |           fileformat = argv[++iargc];
203 |         }
204 |       else if (strcmp ("-v", argv[iargc]) == 0 ||
205 |                strcmp ("--verbose", argv[iargc]) == 0)
206 |         {
207 |           verbose = true;
208 |         }
209 |       else if (strcmp ("-m", argv[iargc]) == 0 && iargc + 1 < argc)
210 |         {
211 |           method = argv[++iargc];
212 |         }
213 |       else if (strcmp ("-u", argv[iargc]) == 0 && iargc + 1 < argc)
214 |         {
215 |           unicode = argv[++iargc];
216 |         }
217 |       else if (strcmp ("-V", argv[iargc]) == 0 ||
218 |                strcmp ("--version", argv[iargc]) == 0)
219 |         {
220 |           Version ();
221 |           return 0;
222 |         }
223 |       else if (strcmp ("-help", argv[iargc]) == 0 ||
224 |                strcmp ("-h", argv[iargc]) == 0 ||
225 |                strcmp ("--help", argv[iargc]) == 0)
226 |         {
227 |           Usage (1);
228 |           return 1;
229 |         }
230 |       else
231 |         {
232 |           Usage (0);
233 |           return 1;
234 |         }
235 |     }
236 | 
237 |   if (verbose)
238 |     printf ("*** Word Segmentation ***\n");
239 | 
240 |   Dict dict;
241 |   if (!OpenDict (&dict, dictpath))
242 |     return 1;
243 | 
244 |   AbsWordSeg* wseg = InitWordSegmentation (method);
245 |   if (!wseg)
246 |     return 1;
247 | 
248 |   bool  isUniIn = false;
249 |   bool  isUniOut = false;
250 |   if (unicode != NULL)
251 |     {                           //Option -u
252 |       if (unicode[0] == 'u')
253 |         {
254 |           isUniIn = true;
255 |         }
256 |       if (unicode[2] == 'u')
257 |         {
258 |           isUniOut = true;
259 |         }
260 |     }
261 | 
262 |   wchar_t wLine[MAXLEN];
263 |   wchar_t wsegOut[MAXLEN * 2];
264 | 
265 |   if (fileformat != NULL)
266 |     {
267 |       FilterX* FltX = FileFilter::CreateFilter (stdin, stdout,
268 |                                                 isUniIn, isUniOut,
269 |                                                 fileformat);
270 |       if (FltX == NULL)
271 |         {
272 |           fprintf (stderr, "Invalid file format: %s\n", fileformat);
273 |           ExitWordSegmentation (wseg);
274 |           return 1;
275 |         }
276 |       while (FltX->GetNextToken (wLine, N_ELM (wLine), &thaiFlag))
277 |         {
278 |           if (!thaiFlag)
279 |             {
280 |               FltX->Print (wLine, thaiFlag);
281 |               continue;
282 |             }
283 |           WordSegmentation (wseg, &dict, FltX->GetWordBreak(), wLine,
284 |                             wsegOut, N_ELM (wsegOut));
285 |           FltX->Print (wsegOut, thaiFlag);
286 |         }
287 |       delete FltX;
288 |     }
289 |   else
290 |     {
291 |       wchar_t* wcwbr_buff = NULL;
292 |       const wchar_t* wcwbr = L"|";
293 |       if (wbr)
294 |         {
295 |           wcwbr = wcwbr_buff = ConvStrDup (wbr, isUniOut);
296 |         }
297 |       while (!feof (stdin))
298 |         {
299 |           if (verbose)
300 |             printf ("Input : ");
301 | 
302 |           if (!ConvGetS (wLine, N_ELM (wLine), stdin, isUniIn))
303 |             break;
304 |           int len = wcslen (wLine);
305 |           if (L'\n' == wLine [len - 1])
306 |             wLine [--len] = 0;
307 | 
308 |           if (verbose)
309 |             printf ("Output: ");
310 | 
311 |           if (0 == wLine[0])
312 |             {
313 |               printf ("\n");
314 |               continue;
315 |             }
316 | 
317 |           int tokenFlag;
318 |           wchar_t* startStr = wLine;
319 |           wchar_t wToken[MAXLEN];
320 | 
321 |           while ((tokenFlag = SplitToken (&startStr, wToken)) != TTOK_EOT)
322 |             {
323 |               switch (tokenFlag)
324 |                 {
325 |                   case TTOK_THAI:
326 |                     WordSegmentation (wseg, &dict, wcwbr, wToken, wsegOut,
327 |                                       N_ELM (wsegOut));
328 |                     ConvPrint (stdout, wsegOut, isUniOut);
329 |                     break;
330 | 
331 |                   case TTOK_WSPACE:
332 |                   case TTOK_ETC:
333 |                   default:
334 |                     ConvPrint (stdout, wToken, isUniOut);
335 |                     break;
336 |                 }
337 |               if (muleMode)
338 |                 {
339 |                   ConvPrint (stdout, wcwbr, isUniOut);
340 |                 }
341 |             }
342 |           printf ("\n");
343 |         }
344 |       if (wcwbr_buff)
345 |         free (wcwbr_buff);
346 |     }
347 | 
348 |   ExitWordSegmentation (wseg);
349 | 
350 |   return 0;
351 | }
352 | 
353 | static TextToken
354 | SplitToken (wchar_t** str, wchar_t* token)
355 | {
356 |   if (**str == 0)
357 |     return TTOK_EOT;
358 | 
359 |   if (iswspace (**str))
360 |     {
361 |       while (**str != 0 && iswspace (**str))
362 |         {
363 |           *token++ = *(*str)++;
364 |         }
365 |       *token = 0;
366 |       return TTOK_WSPACE;
367 |     }
368 |   else if (isThaiUni (**str))
369 |     {
370 |       while (**str != 0 && isThaiUni (**str))
371 |         {
372 |           *token++ = *(*str)++;
373 |         }
374 |       *token = 0;
375 |       return TTOK_THAI;
376 |     }
377 |   else
378 |     {
379 |       while (**str != 0 && !iswspace (**str) && !isThaiUni (**str))
380 |         {
381 |           *token++ = *(*str)++;
382 |         }
383 |       *token = 0;
384 |       return TTOK_ETC;
385 |     }
386 | }
387 | 
388 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
  1 | 		    GNU GENERAL PUBLIC LICENSE
  2 | 		       Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.
  5 |      59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 | 			    Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Library General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 | 		    GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 | 			    NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 | 		     END OF TERMS AND CONDITIONS
281 | 
282 | 	    How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License
307 |     along with this program; if not, write to the Free Software
308 |     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
309 | 
310 | 
311 | Also add information on how to contact you by electronic and paper mail.
312 | 
313 | If the program is interactive, make it output a short notice like this
314 | when it starts in an interactive mode:
315 | 
316 |     Gnomovision version 69, Copyright (C) year  name of author
317 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
318 |     This is free software, and you are welcome to redistribute it
319 |     under certain conditions; type `show c' for details.
320 | 
321 | The hypothetical commands `show w' and `show c' should show the appropriate
322 | parts of the General Public License.  Of course, the commands you use may
323 | be called something other than `show w' and `show c'; they could even be
324 | mouse-clicks or menu items--whatever suits your program.
325 | 
326 | You should also get your employer (if you work as a programmer) or your
327 | school, if any, to sign a "copyright disclaimer" for the program, if
328 | necessary.  Here is a sample; alter the names:
329 | 
330 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
331 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
332 | 
333 |   <signature of Ty Coon>, 1 April 1989
334 |   Ty Coon, President of Vice
335 | 
336 | This General Public License does not permit incorporating your program into
337 | proprietary programs.  If your program is a subroutine library, you may
338 | consider it more useful to permit linking proprietary applications with the
339 | library.  If this is what you want to do, use the GNU Library General
340 | Public License instead of this License.
341 | 


--------------------------------------------------------------------------------
/src/filterrtf.cpp:
--------------------------------------------------------------------------------
  1 | // FilterRTF.cpp: implementation of the FilterRTF class.
  2 | //
  3 | //////////////////////////////////////////////////////////////////////
  4 | #include <string.h>
  5 | #include <ctype.h>
  6 | #include <wctype.h>
  7 | #include "filterrtf.h"
  8 | #include "conv/utf8.h"
  9 | #include "convutil.h"
 10 | #include "utils.h"
 11 | #include "worddef.h"
 12 | 
 13 | #ifndef HAVE_WCPCPY
 14 | extern "C"
 15 | wchar_t* wcpcpy (wchar_t* dest, const wchar_t* src);
 16 | #endif
 17 | 
 18 | //////////////////////////////////////////////////////////////////////
 19 | // class RTFToken
 20 | //////////////////////////////////////////////////////////////////////
 21 | class RTFToken
 22 | {
 23 | public:
 24 |   enum ETokenType
 25 |   {
 26 |     RTK_TEXT,
 27 |     RTK_BYTE,
 28 |     RTK_UNI_COUNT,
 29 |     RTK_UNI_CODE,
 30 |     RTK_NONE
 31 |   };
 32 | 
 33 | public:
 34 |   RTFToken() : type (RTK_NONE), valLen (0) {}
 35 | 
 36 |   ETokenType  getType() const { return type; }
 37 |   const char* getVal() const  { return val; }
 38 |   bool        isEmpty() const { return valLen == 0; }
 39 | 
 40 |   void reset ();
 41 |   void set (char c);
 42 |   void appendChar (char c);
 43 |   void terminate (ETokenType t);
 44 | 
 45 | private:
 46 |   ETokenType type;
 47 |   char       val[1024];
 48 |   size_t     valLen;
 49 | };
 50 | 
 51 | inline void
 52 | RTFToken::reset ()
 53 | {
 54 |   type = RTK_NONE;
 55 |   val[0] = 0;
 56 |   valLen = 0;
 57 | }
 58 | 
 59 | inline void
 60 | RTFToken::set (char c)
 61 | {
 62 |   val[0] = c;
 63 |   val[1] = '\0';
 64 |   valLen = 1;
 65 | }
 66 | 
 67 | inline void
 68 | RTFToken::appendChar (char c)
 69 | {
 70 |   if (valLen + 1 < sizeof val)
 71 |     {
 72 |       val[valLen++] = c;
 73 |     }
 74 | }
 75 | 
 76 | inline void
 77 | RTFToken::terminate (ETokenType t)
 78 | {
 79 |   val[valLen] = '\0';
 80 |   type = t;
 81 | }
 82 | 
 83 | //////////////////////////////////////////////////////////////////////
 84 | // Construction/Destruction
 85 | //////////////////////////////////////////////////////////////////////
 86 | 
 87 | FilterRTF::FilterRTF (FILE* filein, FILE* fileout, bool isUniIn, bool isUniOut)
 88 |   : FilterX (filein, fileout, isUniIn, isUniOut,
 89 |              isUniOut ? L"\\u8203\\'e2\\'80\\'8b" : L"\\u8203\\'3f"), // U+200B
 90 |     psState (CS_START),
 91 |     curUTFReadBytes (1),
 92 |     curUTFWriteBytes (1)
 93 | {
 94 |   strbuff[0] = 0;
 95 | }
 96 | 
 97 | bool
 98 | FilterRTF::GetNextToken (wchar_t* token, int tokenSz, bool* thaiFlag)
 99 | {
100 |   int   nCopy;
101 |   int   bytesToSkip = 0;
102 |   bool  isCopySkipBytes = false;
103 |   bool  charConsumed = true;
104 |   int   c;
105 |   RTFToken rtfToken;
106 | 
107 |   *token = 0;
108 | 
109 |   if (!fpin || feof (fpin))
110 |     return false;
111 | 
112 |   // get first character to determine *thaiFlag
113 |   if (0 != strbuff[0])
114 |     {
115 |       *thaiFlag = isThaiUni (strbuff[0]);
116 |       nCopy = min<int> (tokenSz - 1, wcslen (strbuff));
117 |       wcsncpy (token, strbuff, nCopy);
118 |       token += nCopy;
119 |       *token = 0;
120 |       tokenSz -= nCopy;
121 |       wmemmove (strbuff, strbuff + nCopy, wcslen (strbuff + nCopy) + 1);
122 |     }
123 |   else
124 |     {
125 |       while (!charConsumed || (c = fgetc (fpin)) != EOF)
126 |         {
127 |           psState = chgCharState (psState, c, &charConsumed, &rtfToken);
128 |           switch (rtfToken.getType())
129 |             {
130 |             case RTFToken::RTK_TEXT:
131 |               nCopy = min<int> (tokenSz - 1, strlen (rtfToken.getVal()));
132 |               token = Ascii2WcsNCopy (token, rtfToken.getVal(), nCopy);
133 |               *token = 0;
134 |               tokenSz -= nCopy;
135 |               rtfToken.reset ();
136 |               *thaiFlag = false;
137 |               goto thai_flag_determined;
138 | 
139 |             case RTFToken::RTK_UNI_COUNT:
140 |               sscanf (rtfToken.getVal(), "%d", &curUTFReadBytes);
141 |               rtfToken.reset ();
142 |               break;
143 | 
144 |             case RTFToken::RTK_UNI_CODE:
145 |               {
146 |                 unichar uc;
147 |                 sscanf (rtfToken.getVal(), "%d", &uc);
148 |                 if (isThaiUni (uc))
149 |                   {
150 |                     if (tokenSz >= 2)
151 |                       {
152 |                         *token++ = uc;
153 |                         *token = 0;
154 |                         --tokenSz;
155 |                       }
156 |                     *thaiFlag = true;
157 |                     isCopySkipBytes = false;
158 |                   }
159 |                 else
160 |                   {
161 |                     int len = strlen (rtfToken.getVal());
162 |                     if (len + 2 <= tokenSz - 1)
163 |                       {
164 |                         token = wcpcpy (token, L"\\u");
165 |                         token = Ascii2WcsCopy (token, rtfToken.getVal());
166 |                         tokenSz -= len + 2;
167 |                       }
168 | 
169 |                     *thaiFlag = false;
170 |                     isCopySkipBytes = true;
171 |                   }
172 |                 rtfToken.reset ();
173 |                 bytesToSkip = curUTFReadBytes;
174 |               }
175 |               break;
176 | 
177 |             case RTFToken::RTK_BYTE:
178 |               if (bytesToSkip > 0)
179 |                 {
180 |                   // skip bytes after \uDDDD
181 |                   if (isCopySkipBytes)
182 |                     {
183 |                       int len = strlen (rtfToken.getVal());
184 |                       if (len + 2 <= tokenSz - 1)
185 |                         {
186 |                           token = wcpcpy (token, L"\\'");
187 |                           token = Ascii2WcsCopy (token, rtfToken.getVal());
188 |                           tokenSz -= len + 2;
189 |                         }
190 |                     }
191 |                   rtfToken.reset ();
192 |                   if (0 == --bytesToSkip)
193 |                     goto thai_flag_determined;
194 |                 }
195 |               else
196 |                 {
197 |                   // normal ANSI data bytes
198 |                   int ch;
199 |                   sscanf (rtfToken.getVal(), "%x", &ch);
200 |                   rtfToken.reset ();
201 |                   if (tokenSz >= 2)
202 |                     {
203 |                       *token++ = tis2uni (ch);
204 |                       *token = 0;
205 |                       --tokenSz;
206 |                     }
207 |                   *thaiFlag = isThai (ch);
208 |                   goto thai_flag_determined;
209 |                 }
210 |               break;
211 | 
212 |             default:
213 |               // RTFToken::RTK_NONE
214 |               break;
215 |             }
216 |         }
217 |     }
218 | 
219 | thai_flag_determined:
220 | 
221 |   // continue reading until opposite type of chunk is found
222 |   if (*thaiFlag)
223 |     {
224 |       // continue reading Thai chunk until non-Thai token is found
225 |       while (!charConsumed || (c = fgetc (fpin)) != EOF)
226 |         {
227 |           psState = chgCharState (psState, c, &charConsumed, &rtfToken);
228 |           switch (rtfToken.getType())
229 |             {
230 |             case RTFToken::RTK_TEXT:
231 |               // non-Thai text is found -> stop
232 |               nCopy = min<int> (N_ELM (strbuff) - 1,
233 |                                 strlen (rtfToken.getVal()));
234 |               *Ascii2WcsNCopy (strbuff, rtfToken.getVal(), nCopy) = 0;
235 |               rtfToken.reset ();
236 |               goto end_of_chunk;
237 | 
238 |             case RTFToken::RTK_UNI_COUNT:
239 |               sscanf (rtfToken.getVal(), "%d", &curUTFReadBytes);
240 |               rtfToken.reset ();
241 |               break;
242 | 
243 |             case RTFToken::RTK_UNI_CODE:
244 |               {
245 |                 unichar uc;
246 |                 sscanf (rtfToken.getVal(), "%d", &uc);
247 |                 if (isThaiUni (uc))
248 |                   {
249 |                     if (tokenSz >= 2)
250 |                       {
251 |                         *token++ = uc;
252 |                         *token = 0;
253 |                         --tokenSz;
254 |                       }
255 |                     isCopySkipBytes = false;
256 |                   }
257 |                 else
258 |                   {
259 |                     // non-Thai text is found -> prepare to stop
260 |                     wchar_t* p = wcpcpy (strbuff, L"\\u");
261 |                     Ascii2WcsCopy (p, rtfToken.getVal());
262 |                     isCopySkipBytes = true;
263 |                   }
264 |                 rtfToken.reset ();
265 |                 bytesToSkip = curUTFReadBytes;
266 |               }
267 |               break;
268 | 
269 |             case RTFToken::RTK_BYTE:
270 |               if (bytesToSkip > 0)
271 |                 {
272 |                   // skip bytes after \uDDDD
273 |                   if (isCopySkipBytes)
274 |                     {
275 |                       wcscat (strbuff, L"\\'");
276 |                       Ascii2WcsCat (strbuff, rtfToken.getVal());
277 |                     }
278 |                   rtfToken.reset ();
279 |                   // decrement bytes to skip
280 |                   // stop if non-Thai Unicode is ended
281 |                   if (0 == --bytesToSkip && isCopySkipBytes)
282 |                     goto end_of_chunk;
283 |                 }
284 |               else
285 |                 {
286 |                   // normal ANSI data bytes
287 |                   int ch;
288 |                   sscanf (rtfToken.getVal(), "%x", &ch);
289 |                   rtfToken.reset ();
290 |                   // if non-Thai -> stop
291 |                   if (!isThai (ch))
292 |                     goto end_of_chunk;
293 |                   if (tokenSz >= 2)
294 |                     {
295 |                       *token++ = tis2uni (ch);
296 |                       *token = 0;
297 |                       --tokenSz;
298 |                     }
299 |                 }
300 |               break;
301 | 
302 |             default:
303 |               // RTFToken::RTK_NONE
304 |               break;
305 |             }
306 |         }
307 |     }
308 |   else
309 |     {
310 |       // continue reading non-Thai chunk until Thai token is found
311 |       // To prevent too long chunk, also stop at white space.
312 |       while (!charConsumed || (c = fgetc (fpin)) != EOF)
313 |         {
314 |           psState = chgCharState (psState, c, &charConsumed, &rtfToken);
315 |           switch (rtfToken.getType())
316 |             {
317 |             case RTFToken::RTK_TEXT:
318 |               nCopy = min<int> (tokenSz - 1, strlen (rtfToken.getVal()));
319 |               token = Ascii2WcsNCopy (token, rtfToken.getVal(), nCopy);
320 |               *token = 0;
321 |               tokenSz -= nCopy;
322 |               rtfToken.reset ();
323 |               if (iswspace (token[-1]))
324 |                 goto end_of_chunk;
325 |               break;
326 | 
327 |             case RTFToken::RTK_UNI_COUNT:
328 |               sscanf (rtfToken.getVal(), "%d", &curUTFReadBytes);
329 |               rtfToken.reset ();
330 |               break;
331 | 
332 |             case RTFToken::RTK_UNI_CODE:
333 |               {
334 |                 unichar uc;
335 |                 sscanf (rtfToken.getVal(), "%d", &uc);
336 |                 if (isThaiUni (uc))
337 |                   {
338 |                     // Thai chunk is found -> stop
339 |                     strbuff[0] = uc;
340 |                     strbuff[1] = 0;
341 |                     isCopySkipBytes = false;
342 |                   }
343 |                 else
344 |                   {
345 |                     int len = strlen (rtfToken.getVal());
346 |                     if (len + 2 <= tokenSz - 1)
347 |                       {
348 |                         token = wcpcpy (token, L"\\u");
349 |                         token = Ascii2WcsCopy (token, rtfToken.getVal());
350 |                         tokenSz -= len + 2;
351 |                       }
352 |                     isCopySkipBytes = true;
353 |                   }
354 |                 rtfToken.reset ();
355 |                 bytesToSkip = curUTFReadBytes;
356 |               }
357 |               break;
358 | 
359 |             case RTFToken::RTK_BYTE:
360 |               if (bytesToSkip > 0)
361 |                 {
362 |                   // skip bytes after \uDDDD
363 |                   if (isCopySkipBytes)
364 |                     {
365 |                       int len = strlen (rtfToken.getVal());
366 |                       if (len + 2 <= tokenSz - 1)
367 |                         {
368 |                           token = wcpcpy (token, L"\\'");
369 |                           token = Ascii2WcsCopy (token, rtfToken.getVal());
370 |                           tokenSz -= len + 2;
371 |                         }
372 |                     }
373 |                   rtfToken.reset ();
374 |                   // decrement bytes to skip
375 |                   // stop if Thai Unicode is ended
376 |                   if (0 == --bytesToSkip && !isCopySkipBytes)
377 |                     goto end_of_chunk;
378 |                 }
379 |               else
380 |                 {
381 |                   // normal ANSI data bytes
382 |                   int ch;
383 |                   sscanf (rtfToken.getVal(), "%x", &ch);
384 |                   rtfToken.reset ();
385 |                   if (isThai (ch))
386 |                     {
387 |                       // Thai chunk is found -> stop
388 |                       strbuff[0] = tis2uni (ch);
389 |                       strbuff[1] = 0;
390 |                       goto end_of_chunk;
391 |                     }
392 |                   if (tokenSz >= 2)
393 |                     {
394 |                       *token++ = ch;
395 |                       *token = 0;
396 |                       --tokenSz;
397 |                     }
398 |                 }
399 |               break;
400 | 
401 |             default:
402 |               // RTFToken::RTK_NONE
403 |               break;
404 |             }
405 |         }
406 |     }
407 | 
408 | end_of_chunk:
409 |   if (!charConsumed)
410 |     {
411 |       int len = wcslen (strbuff);
412 |       strbuff[len++] = c;
413 |       strbuff[len] = 0;
414 |     }
415 | 
416 |   return true;
417 | }
418 | 
419 | FilterRTF::ECharState
420 | FilterRTF::chgCharState (ECharState state, char charIn, bool* charConsumed,
421 |                          RTFToken* rtfToken)
422 | {
423 |   // assume it's consumed by default; let's overwrite it otherwise
424 |   *charConsumed = true;
425 | 
426 |   //sequence of characters is    \ ' x x (one character)
427 |   //states of this sequence are  1 2 3 4  
428 |   //Assume thai character has format like \'xx.
429 |   //Unicode sequences are:
430 |   // - \ucN : set skip bytes after \uDDDD to N
431 |   // - \uDDDD\'XX\'YY\'ZZ : DDDD decimal Unicode code point,
432 |   //   followed by legacy data bytes "\xXX\xYY\xZZ" to be skipped by N bytes
433 |   //   (as set by the last \ucN)
434 |   //retune current state of character.
435 |   switch (state)
436 |     {
437 |     case CS_START:
438 |       if ('\\' == charIn)
439 |         {
440 |           // flush currently collected token
441 |           if (!rtfToken->isEmpty())
442 |             {
443 |               rtfToken->terminate (RTFToken::RTK_TEXT);
444 |             }
445 | 
446 |           return CS_ESCAPE;
447 |         }
448 | 
449 |       rtfToken->appendChar (charIn);
450 |       rtfToken->terminate (RTFToken::RTK_TEXT);
451 |       return CS_START;
452 | 
453 |     case CS_ESCAPE:
454 |       switch (charIn)
455 |         {
456 |         case '\'':
457 |           return CS_CH_BYTE;
458 |         case 'u':
459 |           return CS_UNI;
460 |         default:
461 |           // not interesting sequence, make it normal text
462 |           rtfToken->set ('\\');
463 |           rtfToken->appendChar (charIn);
464 |           rtfToken->terminate (RTFToken::RTK_TEXT);
465 |           return CS_START;
466 |         }
467 |       break;
468 | 
469 |     case CS_CH_BYTE:
470 |       if (isxdigit (charIn))
471 |         {
472 |           rtfToken->appendChar (charIn);
473 |           return CS_CH_BYTE;
474 |         }
475 | 
476 |       rtfToken->terminate (RTFToken::RTK_BYTE);
477 |       goto end_number;
478 | 
479 |     case CS_UNI:
480 |       if (isdigit (charIn))
481 |         {
482 |           rtfToken->set (charIn);
483 |           return CS_UNI_CODE;
484 |         }
485 |       if ('c' == charIn)
486 |         return CS_UNI_COUNT;
487 | 
488 |       // not interesting sequence, make it normal text
489 |       rtfToken->set ('\\');
490 |       rtfToken->appendChar ('u');
491 |       if ('\\' == charIn)
492 |         {
493 |           rtfToken->terminate (RTFToken::RTK_TEXT);
494 |           return CS_ESCAPE;
495 |         }
496 |       rtfToken->appendChar (charIn);
497 |       rtfToken->terminate (RTFToken::RTK_TEXT);
498 |       return CS_START;
499 | 
500 |     case CS_UNI_COUNT:
501 |       if (isdigit (charIn))
502 |         {
503 |           rtfToken->appendChar (charIn);
504 |           return CS_UNI_COUNT;
505 |         }
506 | 
507 |       rtfToken->terminate (RTFToken::RTK_UNI_COUNT);
508 |       goto end_number;
509 | 
510 |     case CS_UNI_CODE:
511 |       if (isdigit (charIn))
512 |         {
513 |           rtfToken->appendChar (charIn);
514 |           return CS_UNI_CODE;
515 |         }
516 | 
517 |       rtfToken->terminate (RTFToken::RTK_UNI_CODE);
518 |       goto end_number;
519 |     }
520 | 
521 |   // All cases are covered, this line should not be reached.
522 | 
523 | end_number:
524 |   if ('\\' == charIn)
525 |     return CS_ESCAPE;
526 | 
527 |   *charConsumed = (' ' == charIn);
528 |   return CS_START;
529 | }
530 | 
531 | static void PrintEscapedUTF8 (FILE* fpout, wchar_t wc);
532 | 
533 | // Write one token to fpout as RTF text.
534 | //
535 | //   token    - NUL-terminated wide string to emit
536 | //   thaiFlag - when false, every character is handed to printNonThai();
537 | //              when true, characters accepted by isThaiUni() are escaped
538 | //
539 | // With isUniOut set, such a character is written as "\uN" followed by
540 | // the "\'xx" escapes produced by PrintEscapedUTF8(); a "\ucK " control
541 | // word is written first whenever K = UTF8Bytes() of the character
542 | // differs from curUTFWriteBytes.  With isUniOut clear, it is written as
543 | // one "\'xx" escape of its uni2tis() value, any other character >= 0x80
544 | // as "\'xx" of its low byte, and everything else verbatim.
545 | //
546 | // Nothing is written for an empty token.
547 | void
548 | FilterRTF::Print (const wchar_t* token, bool thaiFlag)
549 | {
550 |   if (thaiFlag && !isUniOut)
551 |     {
552 |       // Non-Unicode output: at most one hex escape per character.
553 |       for (const wchar_t* p = token; *p != 0; ++p)
554 |         {
555 |           if (isThaiUni (*p))
556 |             fprintf (fpout, "\\'%02x", uni2tis (*p));
557 |           else if (*p >= 0x80)
558 |             fprintf (fpout, "\\'%02x", (unsigned char) *p);
559 |           else
560 |             fprintf (fpout, "%c", (unsigned char) *p);
561 |         }
562 |       return;
563 |     }
564 | 
565 |   for (const wchar_t* p = token; *p; ++p)
566 |     {
567 |       // isThaiUni() is consulted only for tokens flagged as Thai.
568 |       if (!thaiFlag || !isThaiUni (*p))
569 |         {
570 |           printNonThai (*p);
571 |           continue;
572 |         }
573 | 
574 |       // Emit "\ucK " only when K differs from the last value written.
575 |       const int bytes = UTF8Bytes (*p);
576 |       if (bytes != curUTFWriteBytes)
577 |         {
578 |           curUTFWriteBytes = bytes;
579 |           fprintf (fpout, "\\uc%d ", curUTFWriteBytes);
580 |         }
581 | 
582 |       // "%d" requires an int argument; wchar_t is not guaranteed to
583 |       // promote to one, so convert explicitly.
584 |       fprintf (fpout, "\\u%d", static_cast<int> (*p));
585 |       PrintEscapedUTF8 (fpout, *p);
586 |     }
587 | }
588 | 
589 | // Write a character that gets no Thai escaping.  A brace resets
590 | // curUTFWriteBytes to 1, and '}' is preceded by " \uc1 " unless the
591 | // counter is already 1.  The character then goes out verbatim when
592 | // below 0x80, otherwise through PrintEscapedUTF8().
593 | void
594 | FilterRTF::printNonThai (wchar_t wc)
595 | {
596 |   const bool closing = (L'}' == wc);
597 |   if (closing && 1 != curUTFWriteBytes)
598 |     {
599 |       fprintf (fpout, " \\uc1 ");
600 |     }
601 |   if (closing || L'{' == wc)
602 |     {
603 |       curUTFWriteBytes = 1;
604 |     }
605 | 
606 |   if (wc >= 0x80)
607 |     {
608 |       PrintEscapedUTF8 (fpout, wc);
609 |       return;
610 |     }
611 |   fprintf (fpout, "%c", char (wc));
612 | }
613 | 
614 | // Encode wc through UTF8Writer and print each resulting byte, up to the
615 | // first zero byte, to fpout as an RTF "\'xx" hex escape.
616 | static void
617 | PrintEscapedUTF8 (FILE* fpout, wchar_t wc)
618 | {
619 |   char encoded[10];
620 |   UTF8Writer writer (encoded, sizeof encoded);
621 |   writer.Write (wc);
622 |   writer.Write (0);  // the loop below relies on this as its terminator
623 |   for (const unsigned char* b = (const unsigned char*) encoded; *b; ++b)
624 |     fprintf (fpout, "\\'%02x", *b);
625 | }
626 | 


--------------------------------------------------------------------------------
/data/tdict-common.txt:
--------------------------------------------------------------------------------
   1 | กรรมาชน
   2 | กร๊วก
   3 | กรอบรูป
   4 | กระดี๊กระด๊า
   5 | กระท่าง
   6 | กระบับ
   7 | กรังด์ปรีซ์
   8 | กราฟิตี
   9 | กราฟิตี้
  10 | กราวนด์
  11 | กราเวียร์
  12 | กรีน
  13 | กรุ๊ป
  14 | กฤษณ์
  15 | กลาส
  16 | ก๊วน
  17 | กษัตริยา
  18 | กษัตริยาธิราช
  19 | ก๊อง
  20 | ก่อนหน้า
  21 | กะบับ
  22 | กับดัก
  23 | กัมมันตะ
  24 | ก๊าก
  25 | ก๋ากั่น
  26 | กาญจน์
  27 | กาญจนาภิเษก
  28 | กาบัดดี้
  29 | กามสูตร
  30 | กามิกาเซ่
  31 | การันตี
  32 | การุณยฆาต
  33 | กาลามสูตร
  34 | กาหลิบ
  35 | กิฟท์
  36 | กิมจิ
  37 | กีวี
  38 | กึ๋ย
  39 | กุนซือ
  40 | กุมภาพันธ์
  41 | กู๋
  42 | กูรู
  43 | เกจิ
  44 | เกมส์
  45 | เกย์
  46 | เกรด
  47 | เกรย์
  48 | เกษตราธิการ
  49 | เกสต์เฮาส์
  50 | เก๊ะ
  51 | เก๋ากี้
  52 | เกิร์ล
  53 | แกงค์
  54 | แกรนด์
  55 | แกรมม่า
  56 | แกลเลอรี
  57 | แกลเลอรี่
  58 | แกสโซฮอล์
  59 | แก๊สโซฮอล์
  60 | โกจิ
  61 | โกเต็กซ์
  62 | โกลด์
  63 | โกะ
  64 | โก๊ะ
  65 | ไกด์
  66 | ขั้นตอน
  67 | ข่าน
  68 | เขวี้ยง
  69 | คชบาล
  70 | คณาญาติ
  71 | ครัวซอง
  72 | ครัวซองต์
  73 | คร้าบ
  74 | คราฟต์
  75 | คร่ำครวญ
  76 | ครีเอท
  77 | ครีเอทีฟ
  78 | ครูเสด
  79 | คลับ
  80 | คลาสสิก
  81 | คลาสสิค
  82 | คลิตอริส
  83 | คลิป
  84 | คลีน
  85 | ความหมาย
  86 | ควิก
  87 | ควีน
  88 | คองเกรส
  89 | คอนแชร์โต
  90 | คอนซูเมอร์
  91 | คอนเซปต์
  92 | คอนเซ็ปต์
  93 | คอนโด
  94 | คอนโดมิเนียม
  95 | คอนเทนเนอร์
  96 | คอนแทค
  97 | คอนแท็ค
  98 | คอนโทรล
  99 | คอนเฟิร์ม
 100 | คอนสตรักชัน
 101 | คอนสตรักชั่น
 102 | คอนสตรัคชัน
 103 | คอนสตรัคชั่น
 104 | คอปเตอร์
 105 | คอฟฟี
 106 | คอฟฟี่
 107 | คอมพ์
 108 | คอมเพล็กซ์
 109 | คอมฟอร์ต
 110 | คอมฟอร์ท
 111 | คอมมอนส์
 112 | คอมมูน
 113 | คอมเมนท์
 114 | คอมเมนต์
 115 | คอมเมิร์ซ
 116 | คอร์ต
 117 | คอร์ท
 118 | คอร์น
 119 | คอร์ป
 120 | คอร์ปอเรชั่น
 121 | คอร์รัปชัน
 122 | คอร์รัปชั่น
 123 | คอรัปชัน
 124 | คอรัปชั่น
 125 | คอร์ส
 126 | คอลเล็กชั่น
 127 | คอลัมน์
 128 | คอลัมนิสต์
 129 | คอสตูม
 130 | คอสเพลย์
 131 | คะนิ้ง
 132 | คัตเอาต์
 133 | คันคาก
 134 | คันถธุระ
 135 | คันทรี
 136 | คันทรี่
 137 | คันธาระ
 138 | คันยิ
 139 | คัสตาร์ด
 140 | คาเฟอีน
 141 | คาร์ดินัล
 142 | คาราโอเกะ
 143 | คีตกวี
 144 | คีตปฏิภาณ
 145 | คีตราชัน
 146 | คาบาเรต์
 147 | คาบาเร่ต์
 148 | คาปูชิโน
 149 | คามิคาเซ่
 150 | คาเฟ่
 151 | คาร์
 152 | คาร์โก้
 153 | คาร์นิวัล
 154 | คาราเมล
 155 | คาแรกเตอร์
 156 | คาแร็กเตอร์
 157 | คาแรคเตอร์
 158 | คาแร็คเตอร์
 159 | คาวบอย
 160 | คาสิโน
 161 | คิดส์
 162 | คิวบิก
 163 | คีโม
 164 | คูล
 165 | คูลเลอร์
 166 | เคบับ
 167 | เคบิน
 168 | เครป
 169 | เคลม
 170 | เคลียร์
 171 | เคลื่อนย้าย
 172 | เคส
 173 | เคอร์ฟิว
 174 | แคชเชียร์
 175 | แคทวอล์ค
 176 | แคนดิเดต
 177 | แคนตาลูป
 178 | แคนยอน
 179 | แคนู
 180 | แคป
 181 | แคมป์
 182 | แคมปัส
 183 | แคมเปญ
 184 | แคร์
 185 | แครกเกอร์
 186 | แคร็กเกอร์
 187 | แครนเบอร์รี
 188 | แครนเบอร์รี่
 189 | แครนเบอรี
 190 | แครนเบอรี่
 191 | แครอท
 192 | แคสต์
 193 | แคสติง
 194 | แคสติ้ง
 195 | โค้ก
 196 | โค้ช
 197 | โค้ท
 198 | โคโยตี
 199 | โคโยตี้
 200 | โครเชต์
 201 | โครนา
 202 | โคล่า
 203 | โคลีเซียม
 204 | โควต
 205 | โคอะล่า
 206 | โคอาลา
 207 | โคอาล่า
 208 | ไคลแมกซ์
 209 | ไคลแม็กซ์
 210 | งั้น
 211 | ง่าว
 212 | งี้
 213 | เง็ง
 214 | โง่เขลา
 215 | ไง
 216 | จงหยวน
 217 | จตุคาม
 218 | จ๊อกกี้
 219 | จอหงวน
 220 | จักรินทร์
 221 | จังโก้
 222 | จัมโบ้
 223 | จากัวร์
 224 | จารกรรม
 225 | จารชน
 226 | จ้ำบ๊ะ
 227 | จิ๊ก
 228 | จิ๊กโก๋
 229 | จิ๊กซอว์
 230 | จิตพิสัย
 231 | จิตเภท
 232 | จีดีพี
 233 | จึ๊ก
 234 | จุ๊ย
 235 | จูดาห์
 236 | จูน
 237 | จูเนียร์
 238 | เจ๊
 239 | เจได
 240 | เจ๊งบ๊ง
 241 | เจ็ต
 242 | เจ๊ต
 243 | เจนเนอเรชั่น
 244 | เจล
 245 | เจ๊าะแจ๊ะ
 246 | เจี๊ยว
 247 | แจ็กเก็ต
 248 | แจ๊กเก็ต
 249 | แจ็กพอต
 250 | แจ็กพ็อต
 251 | แจ๊กพอต
 252 | แจ๊กพ็อต
 253 | แจม
 254 | แจ่วฮ้อน
 255 | แจ๊ส
 256 | โจ๋
 257 | ฉลุย
 258 | ฉันทามติ
 259 | เฉิ่ม
 260 | ชนะเลิศ
 261 | ชยันตี
 262 | ช็อค
 263 | ช็อต
 264 | ช็อป
 265 | ช็อปปิ้ง
 266 | ช็อปเปอร์
 267 | ชะโนด
 268 | ชัตเตอร์
 269 | ชัวร์
 270 | ชาบู
 271 | ชาร์จ
 272 | ชาร์ต
 273 | ชาร์ป
 274 | ชินคังเซน
 275 | ชินคังเซ็น
 276 | ชินคันเซน
 277 | ชินคันเซ็น
 278 | ชินบัญชร
 279 | ชิฟฟอน
 280 | ชีส
 281 | ชีอะห์
 282 | ชู้ต
 283 | เช็ก
 284 | เช็งเม้ง
 285 | เชฟ
 286 | เชลโล
 287 | เชลโล่
 288 | เชลียร์
 289 | เชอร์รี่
 290 | เชียะ
 291 | แชเชือน
 292 | แช่แข็ง
 293 | แชมป์
 294 | แชมปิยอง
 295 | แชมเปญ
 296 | แชมเปี้ยน
 297 | แชมพู
 298 | โชกุน
 299 | โชเฟอร์
 300 | โชว์รูม
 301 | โชห่วย
 302 | ใช้งาน
 303 | ไชน่า
 304 | ซกมก
 305 | ซ้อ
 306 | ซอฟต์บอล
 307 | ซอมบี้
 308 | ซะ
 309 | ซังเต
 310 | ซันตาคลอส
 311 | ซัพพลาย
 312 | ซัพพลายเออร์
 313 | ซัมเมอร์
 314 | ซัลโว
 315 | ซากุระ
 316 | ซาดิสต์
 317 | ซาดิสม์
 318 | ซาตาน
 319 | ซานตาคลอส
 320 | ซาฟารี
 321 | ซาบะ
 322 | ซามูไร
 323 | ซาร์
 324 | ซาร์ดีน
 325 | ซาร์ส
 326 | ซาลอน
 327 | ซาลามานเดอร์
 328 | ซาลาแมนเดอร์
 329 | ซาเล้ง
 330 | ซิง
 331 | ซิ่ง
 332 | ซิงเกิล
 333 | ซิตี
 334 | ซิตี้
 335 | ซินโดรม
 336 | ซิม
 337 | ซิ้ม
 338 | ซิมโฟนี
 339 | ซิมโฟนี่
 340 | ซิลเวอร์
 341 | ซี้
 342 | ซี้ซั้ว
 343 | ซีดาน
 344 | ซีน
 345 | ซีนีเพล็กซ์
 346 | ซีเนียร์
 347 | ซีร็อกซ์
 348 | ซีรีส์
 349 | ซีเรียล
 350 | ซีเรียส
 351 | ซีโร่
 352 | ซีล
 353 | ซีอีโอ
 354 | ซื่อบื้อ
 355 | ซุนหนี่
 356 | ซุปตาร์
 357 | ซุปเปอร์
 358 | ซูชิ
 359 | ซูเปอร์
 360 | ซูม
 361 | ซูโม่
 362 | ซูเอี๋ย
 363 | ซูฮก
 364 | เซ็กซ์
 365 | เซ็กซี่
 366 | เซ็กส์
 367 | เซนเซอร์
 368 | เซ็นเซอร์
 369 | เซนเตอร์
 370 | เซ็นเตอร์
 371 | เซนทรัล
 372 | เซ็นทรัล
 373 | เซนส์
 374 | เซ่นไหว้
 375 | เซฟ
 376 | เซฟตี้
 377 | เซรามิก
 378 | เซลส์
 379 | เซลส์แมน
 380 | เซเลบ
 381 | เซเล็บ
 382 | เซอร์
 383 | เซอร์กิต
 384 | เซอร์ไพรส์
 385 | เซอร์วิส
 386 | เซาท์
 387 | เซาท์เทิร์น
 388 | เซาเทิร์น
 389 | เซี้ยว
 390 | เซียะ
 391 | แซ็ก
 392 | แซกโซโฟน
 393 | แซ็กโซโฟน
 394 | แซนด์วิช
 395 | แซมบ้า
 396 | แซลมอน
 397 | แซว
 398 | โซเชียล
 399 | โซตัส
 400 | โซน
 401 | โซนี่
 402 | โซฟา
 403 | โซ้ย
 404 | โซลาร์
 405 | โซโล
 406 | โซโล่
 407 | ไซบีเรียน
 408 | ไซรัป
 409 | ไซออนนิสต์
 410 | ญาณทัสสนะ
 411 | ดยุก
 412 | ดยุค
 413 | ดร็อป
 414 | ดรัมเมเยอร์
 415 | ดรากอน
 416 | ดรามา
 417 | ดราม่า
 418 | ดรีม
 419 | ดวล
 420 | ดอกเตอร์
 421 | ด็อกเตอร์
 422 | ดอลล์
 423 | ดัชเชส
 424 | ดาวน์
 425 | ดิกชันนารี
 426 | ดินแดน
 427 | ดิสเครดิต
 428 | ดีกรี
 429 | ดีเจ
 430 | ดีไซน์
 431 | ดีไซน์เนอร์
 432 | ดีไซเนอร์
 433 | ดีท็อกซ์
 434 | ดีเบต
 435 | ดีพาร์ตเมนต์
 436 | ดีพาร์ตเมนท์
 437 | ดีพาร์ทเมนต์
 438 | ดีพาร์ทเมนท์
 439 | ดีมานด์
 440 | ดีล
 441 | ดีลเลอร์
 442 | ดีเลย์
 443 | เดชานุภาพ
 444 | เดบิต
 445 | เดโม
 446 | เดย์
 447 | เด้อ
 448 | เดอะ
 449 | เด๊ะ
 450 | เดี้ยง
 451 | เดี๊ยน
 452 | เดี๊ยะ
 453 | แดนซ์
 454 | แดนเซอร์
 455 | แดนดิไลออน
 456 | แดรี่
 457 | โดนัท
 458 | โด๊ป
 459 | โดมิโน
 460 | โดรายากิ
 461 | โดโลไมท์
 462 | ไดเอ็ต
 463 | ตงง้วน
 464 | ตถตา
 465 | ตนเอง
 466 | ตรวจทาน
 467 | ตรวจสอบ
 468 | ตอกย้ำ
 469 | ต๊อง
 470 | ต่อยอด
 471 | ต่อรอง
 472 | ตะเบบูย่า
 473 | ตะหงิด
 474 | ตังค์
 475 | ตันเถียน
 476 | ตัวตน
 477 | ตัวเอง
 478 | ตาปรือ
 479 | ต้าอ่วย
 480 | ติงต๊อง
 481 | ติ๋ม
 482 | ติ่มซำ
 483 | ติว
 484 | ติวเตอร์
 485 | ตี๋
 486 | ตี๊ต่าง
 487 | ตื้บ
 488 | ตุ๊ก
 489 | ตุ๊กตุ๊ก
 490 | ตุ๊ด
 491 | ตุ๋ย
 492 | ตู้เซฟ
 493 | เต๊ะ
 494 | เตี๊ยม
 495 | แตงกวา
 496 | แตงโม
 497 | แต๋ว
 498 | แต๊ะอั๋ง
 499 | แต๊ะเอีย
 500 | โต๋เต๋
 501 | โต๊ะจีน
 502 | ไตเติล
 503 | ไตเติ้ล
 504 | ไตรมาส
 505 | ถ่ายทำ
 506 | ถูกต้อง
 507 | ทงคัตสึ
 508 | ทรานสคริปต์
 509 | ทรานสฟอร์เมอร์
 510 | ทริป
 511 | ทรู
 512 | ทอม
 513 | ท็อป
 514 | ทอร์นาโด
 515 | ทอล์ค
 516 | ทักซิโด
 517 | ทันตกรรม
 518 | ทันตแพทยศาสตร์
 519 | ทับซ้อน
 520 | ทัลมุด
 521 | ทัวร์
 522 | ทัวร์นาเมนต์
 523 | ทัวร์นาเมนท์
 524 | ทัวริสต์
 525 | ทาร์ต
 526 | ทาเลนต์
 527 | ทาวน์
 528 | ทาวน์เฮาส์
 529 | ทำงาน
 530 | ทิป
 531 | ทิพยสมบัติ
 532 | ทิวลิป
 533 | ทีรามิสุ
 534 | ทีวี
 535 | ทุติยภูมิ
 536 | ทูน่า
 537 | เท็กซ์
 538 | เทค
 539 | เทคโน
 540 | เทคโนแครต
 541 | เทควันโด
 542 | เท่งทึง
 543 | เทป
 544 | เทรด
 545 | เทรน
 546 | เทรนด์
 547 | เทรนนิ่ง
 548 | เทรนเนอร์
 549 | เทรลเลอร์
 550 | เทรลเล่อร์
 551 | เทเลกราฟ
 552 | เทวบัญชา
 553 | เทวบุตร
 554 | เทวา
 555 | เทวาธิราช
 556 | เทโวโรหนะ
 557 | เทศกิจ
 558 | เทอร์โบ
 559 | เทอริยากิ
 560 | เท่าไหร่
 561 | เทิร์น
 562 | เที่ยงคืน
 563 | เที่ยงวัน
 564 | เทียมทาน
 565 | แทกติค
 566 | แทคติก
 567 | แทคติค
 568 | แทงกั๊ก
 569 | แทงโก้
 570 | แทรมโปลีน
 571 | แทรมโพลีน
 572 | โทมาฮอก
 573 | โทมาฮอว์ก
 574 | โทมาฮอว์ค
 575 | โทร
 576 | โทรโข่ง
 577 | โทรจิต
 578 | ไทแทน
 579 | ไทม์
 580 | ไทยแลนด์
 581 | ไทเฮา
 582 | ธรรมา
 583 | ธรรมาภิบาล
 584 | ธัมโม
 585 | ธีม
 586 | ธุรกรรม
 587 | ธุหร่ำ
 588 | เธค
 589 | นครา
 590 | นบี
 591 | นพมาศ
 592 | นรลักษณ์
 593 | นรีแพทย์
 594 | น็อก
 595 | น็อค
 596 | น้องใหม่
 597 | นอมินี
 598 | นอร์ท
 599 | นอร์ทเทิร์น
 600 | นอร์เทิร์น
 601 | น่ะ
 602 | นางแบบ
 603 | นาฏยลักษณ์
 604 | นาฏยศาลา
 605 | นายแบบ
 606 | นายพราน
 607 | นิตติ้ง
 608 | นินจา
 609 | นิปาห์
 610 | นิรันดร์
 611 | นิว
 612 | นิวส์
 613 | นู้ด
 614 | เนอะ
 615 | เนิร์สเซอรี
 616 | เนิร์สเซอรี่
 617 | เนี้ยบ
 618 | โนติส
 619 | ไนท์
 620 | ไนน์
 621 | ดี๊ด๊า
 622 | บรรพชน
 623 | บร็อกโคลี
 624 | บร็อคโคลี
 625 | บรา
 626 | บริกร
 627 | บริวเวอรี่ส์
 628 | บลอนด์
 629 | บลู
 630 | บลูเบอร์รี
 631 | บลูเบอร์รี่
 632 | บลูส์
 633 | บอกซ์
 634 | บ็อกซ์
 635 | บ๊อกซ์
 636 | บอดี้
 637 | บอนไซ
 638 | บอนด์
 639 | บ๊อบ
 640 | บอมบ์
 641 | บ๋อย
 642 | บอยคอต
 643 | บอยคอตต์
 644 | บอร์ด
 645 | บอส
 646 | บะกุ๊ดเต๋
 647 | บักกุ๊ดเต๋
 648 | บังเกอร์
 649 | บัดเจ็ต
 650 | บัดเจ็ท
 651 | บัตเตอร์
 652 | บัลลาสต์
 653 | บัส
 654 | บาซาร์
 655 | บาบูน
 656 | บาร์บีคิว
 657 | บาร์บี้
 658 | บารากู่
 659 | บาริสตา
 660 | บาริสต้า
 661 | บาลานซ์
 662 | บาส
 663 | บิ๊ก
 664 | บิกินี
 665 | บิกินี่
 666 | บิงซู
 667 | บิชอป
 668 | บิล
 669 | บิวตี้
 670 | บิวตี้ฟูล
 671 | บีช
 672 | บึม
 673 | บึ้ม
 674 | บิสซิเนส
 675 | บุญคุณ
 676 | บุ๋น
 677 | บุปผา
 678 | บุฟเฟต์
 679 | บุฟเฟ่ต์
 680 | บู๊
 681 | บูชิโด
 682 | บูติก
 683 | บูติค
 684 | บูท
 685 | บูธ
 686 | บูม
 687 | บูมเมอแรง
 688 | บูโร
 689 | เบเกอรี่
 690 | เบคอน
 691 | เบญจมบพิตร
 692 | เบนโตะ
 693 | เบบี้
 694 | เบบี๋
 695 | เบร็กซิต
 696 | เบรกซิท
 697 | เบลนด์
 698 | เบลอ
 699 | เบสบอล
 700 | เบอร์เกอร์
 701 | เบอร์รี
 702 | เบิร์ด
 703 | เบิร์น
 704 | เบื๊อก
 705 | แบ็กโฮ
 706 | แบคโฮ
 707 | แบด
 708 | แบต
 709 | แบนเนอร์
 710 | แบรนด์
 711 | แบรนด์เนม
 712 | แบริเออร์
 713 | แบล็ก
 714 | แบล็กเมล์
 715 | แบล็ค
 716 | แบล็คเมล์
 717 | ไบโอ
 718 | โบกี้
 719 | โบตั๋น
 720 | โบโนโบ
 721 | โบ้ย
 722 | โบรก
 723 | โบรกเกอร์
 724 | โบรชัวร์
 725 | โบลิวาร์
 726 | โบว์
 727 | โบว์ลิ่ง
 728 | ไบเบิล
 729 | ไบเซ็กชวล
 730 | ปฏิมากรรม
 731 | ปฏิสัมพันธ์
 732 | ปฐมภูมิ
 733 | ประกาศก
 734 | ปวยเล้ง
 735 | ปศุสัตว์
 736 | ป๊อก
 737 | ปอดแหก
 738 | ป็อป
 739 | ป๊อป
 740 | ป๊อปคอร์น
 741 | ป็อปปูลาร์
 742 | ป๊อปปูลาร์
 743 | ป๋อหลอ
 744 | ปักขคณนา
 745 | ปัจเจกชน
 746 | ปัจฉิมนิเทศ
 747 | ปันจักสีลัต
 748 | ป๊า
 749 | ป๋า
 750 | ปาปิรัส
 751 | ปาปิรุส
 752 | ป่าไม้
 753 | ปาร์ก
 754 | ปาร์ค
 755 | ปาร์ตี้
 756 | ปาสกาล
 757 | ปาสคาล
 758 | ปาสเตอร์
 759 | ปิกอัพ
 760 | ปิ๊ง
 761 | ปิโตรเคมี
 762 | ปิยมิตร
 763 | ปิรันย่า
 764 | ปึ้ก
 765 | ปุ๊น
 766 | ปูอัด
 767 | เปโซ
 768 | เป็นไง
 769 | เปปเปอร์มินต์
 770 | เปเปอร์
 771 | เปราะบาง
 772 | เป๊ะ
 773 | เป่ายิงฉุบ
 774 | เป่ายิ้งฉุบ
 775 | เปียโน
 776 | แป้ก
 777 | แป๋ว
 778 | แป๊ะเจี๊ยะ
 779 | โปร
 780 | โปรดักชัน
 781 | โปรดักชั่น
 782 | โปรดักต์
 783 | โปรดักส์
 784 | โปรเจกต์
 785 | โปรเจ็กต์
 786 | โปรเจกเตอร์
 787 | โปรเจ็กเตอร์
 788 | โปรเจคท์
 789 | โปรเจ็คท์
 790 | โปรดักชั่น
 791 | โปรดักต์
 792 | โปรดิวเซอร์
 793 | โปรโมชัน
 794 | โปรโมชั่น
 795 | โปรโมต
 796 | โปรโมเตอร์
 797 | โปรโมท
 798 | โปลิศ
 799 | โปสเตอร์
 800 | ผลไม้
 801 | ผลักดัน
 802 | ผ้าห่ม
 803 | ผิดพลาด
 804 | ผู้นำ
 805 | แผดเผา
 806 | เฝอ
 807 | พงษ์
 808 | พริตตี้
 809 | พรีเซนต์
 810 | พรีเซ็นต์
 811 | พรีเซนเตอร์
 812 | พรีเซ็นเตอร์
 813 | พรีเมียม
 814 | พรีเมียร์
 815 | พฤหัส
 816 | พล็อต
 817 | พลาซา
 818 | พลาซ่า
 819 | พลานุภาพ
 820 | พ่อค้า
 821 | พอเพียง
 822 | พอยต์
 823 | พอยท์
 824 | พะเรอ
 825 | พันธกิจ
 826 | พันธุวิศวกรรม
 827 | พับลิก
 828 | พับลิค
 829 | พาโนรามา
 830 | พาร์
 831 | พาร์ก
 832 | พาร์ค
 833 | พาร์ตไทม์
 834 | พาร์ตเนอร์
 835 | พาร์ทไทม์
 836 | พาร์ทเนอร์
 837 | พาวเวอร์
 838 | พาสเจอร์ไรซ์
 839 | พาสเจอร์ไรส์
 840 | พาสเจอไรซ์
 841 | พาสเจอไรส์
 842 | พาสตา
 843 | พาสต้า
 844 | พาสเทล
 845 | พาสปอร์ต
 846 | พาเหรด
 847 | พิซซ่า
 848 | พิตบูล
 849 | พิสตาชิโอ
 850 | พีเรียด
 851 | พุดดิ้ง
 852 | พุทธภูมิ
 853 | พุทธศตวรรษ
 854 | พุทโธ
 855 | พูล
 856 | เพทนาการ
 857 | เพนกวิน
 858 | เพนตากอน
 859 | เพรส
 860 | เพรียวบาง
 861 | เพลซ
 862 | เพลท
 863 | เพลย์บอย
 864 | เพียบแปร้
 865 | เพียว
 866 | เพาเวอร์
 867 | แพกเกจ
 868 | แพ็ค
 869 | แพคเกจ
 870 | แพตเทิร์น
 871 | แพทเทิร์น
 872 | แพทยสภา
 873 | แพนงเชิญ
 874 | แพนดา
 875 | แพนด้า
 876 | แพลน
 877 | โพเดียม
 878 | โพรดักชัน
 879 | โพรดักชั่น
 880 | โพรดักต์
 881 | โพรดักส์
 882 | โพล
 883 | โพลล์
 884 | โพลารอยด์
 885 | โพส
 886 | โพสต์
 887 | ไพลิน
 888 | ฟยอร์ด
 889 | ฟรังก์
 890 | ฟรีแลนซ์
 891 | ฟรุต
 892 | ฟลอร์
 893 | ฟลุก
 894 | ฟลุค
 895 | ฟลุต
 896 | ฟลุท
 897 | ฟอยล์
 898 | ฟอร์ซ
 899 | ฟอร์ม
 900 | ฟันด์
 901 | ฟาวล์
 902 | ฟาสต์ฟู้ด
 903 | ฟาโรห์
 904 | ฟิตเนส
 905 | ฟิน
 906 | ฟินิกซ์
 907 | ฟิวเจอร์
 908 | ฟิวดัล
 909 | ฟีด
 910 | ฟีเวอร์
 911 | ฟึดฟัด
 912 | ฟุตซอล
 913 | ฟุตบาท
 914 | เฟซติวัล
 915 | เฟซทิวัล
 916 | เฟรช
 917 | เฟรชชี่
 918 | เฟรม
 919 | เฟมินิสต์
 920 | เฟส
 921 | เฟสติวัล
 922 | เฟสทิวัล
 923 | เฟอร์นิเจอร์
 924 | เฟอร์รี่
 925 | เฟิร์ม
 926 | เฟี้ยวฟ้าว
 927 | แฟกซ์
 928 | แฟ็กซ์
 929 | แฟนซี
 930 | แฟนตาซี
 931 | แฟ้บ
 932 | แฟมิลี่
 933 | แฟร์
 934 | แฟรนไชส์
 935 | แฟรี
 936 | แฟรี่
 937 | แฟลช
 938 | แฟล็ต
 939 | โฟน
 940 | โฟม
 941 | โฟร์แมน
 942 | โฟร์แฮนด์
 943 | โฟล์ค
 944 | ไฟต์
 945 | ไฟแนนซ์
 946 | ไฟลต์
 947 | ไฟลท์
 948 | ภควัทคีตา
 949 | ภควัมบดี
 950 | ภควัมปติ
 951 | ภคันทลาพาธ
 952 | ภววิสัย
 953 | ภารตะ
 954 | ภูมิทัศน์
 955 | ม้ง
 956 | มวลชน
 957 | มยุราภิรมย์
 958 | มลภาวะ
 959 | มหภาค
 960 | มหาลัย
 961 | มหาอุปราชา
 962 | มอคคา
 963 | มอคค่า
 964 | ม่อง
 965 | มองซิเออร์
 966 | ม่องเท่ง
 967 | มอนสเตอร์
 968 | ม็อบ
 969 | มอบตัว
 970 | มอยส์เจอไรเซอร์
 971 | มอลต์
 972 | มอลล์
 973 | มะกัน
 974 | มั้ง
 975 | มัฟฟิน
 976 | มั้ย
 977 | มัลเบอร์รี
 978 | มัลเบอร์รี่
 979 | มาการอง
 980 | มาการีน
 981 | มาคุ
 982 | มาดาม
 983 | มาธาดอร์
 984 | ม้านั่ง
 985 | มาเฟีย
 986 | มาม่า
 987 | มายองเนส
 988 | มายาคติ
 989 | มาร์ก
 990 | มาร์กซิสต์
 991 | มาร์เก็ต
 992 | มาร์เก็ตติ้ง
 993 | มาร์ค
 994 | มาร์จิน
 995 | มาร์ช
 996 | มาร์ต
 997 | มาร์ท
 998 | มาร์ล
 999 | มาราธอน
1000 | มาสคอต
1001 | ม้าหินอ่อน
1002 | มิชชันนารี
1003 | มินต์
1004 | มินท์
1005 | มินิ
1006 | มิลค์
1007 | มิวเซียม
1008 | มิวสิก
1009 | มิวสิค
1010 | มิวสิคัล
1011 | มิสซัง
1012 | มิสซิส
1013 | มิสไซล์
1014 | มิสเตอร์
1015 | มีตติ้ง
1016 | มีนกร
1017 | มือถือ
1018 | มุมมอง
1019 | มูฟวี่
1020 | เมคอัพ
1021 | เมจิก
1022 | เมจิค
1023 | เมเจอร์
1024 | เมดเลย์
1025 | เมดเล่ย์
1026 | เมทัล
1027 | เมโทร
1028 | เมเปิล
1029 | เมลอน
1030 | เมล่อน
1031 | เมสเซนเจอร์
1032 | เมสเซ็นเจอร์
1033 | เมอร์ส
1034 | เมาท์
1035 | เมี่ยงคำ
1036 | แม็ก
1037 | แมกกาซีน
1038 | แม็กกาซีน
1039 | แม็กซ์
1040 | แมกาซีน
1041 | แมคคาเดเมีย
1042 | แมคเคอเรล
1043 | แม่ค้า
1044 | แม่ง
1045 | แมชชีน
1046 | แมชีน
1047 | แมตช์
1048 | แมนชั่น
1049 | แมมบ้า
1050 | แมมโบ้
1051 | แมสเซนเจอร์
1052 | แมสเซ็นเจอร์
1053 | โมจิ
1054 | โมเดล
1055 | โมเดิร์น
1056 | โมเต็ล
1057 | โมโนเรล
1058 | โมหจริต
1059 | ไมเกรน
1060 | ไมค์
1061 | ไมเนอร์
1062 | ยนตรกรรม
1063 | ยอมรับ
1064 | ยะเยือก
1065 | ยังไง
1066 | ยัย
1067 | ยากิโซบะ
1068 | ยากูซ่า
1069 | ยาวี
1070 | ยิม
1071 | ยิว
1072 | ยิวยิตสู
1073 | ยุวทูต
1074 | ยูคาลิปตัส
1075 | ยูดาห์
1076 | ยูเทิร์น
1077 | ยูโทเปีย
1078 | ยูนิคอร์น
1079 | ยูนิต
1080 | ยูนิฟอร์ม
1081 | ยูนิเวิร์ส
1082 | ยูไนเต็ด
1083 | ยูยิตสู
1084 | ยูโร
1085 | ยูวี
1086 | เยน
1087 | เยลลี่
1088 | เย้ว
1089 | เยอบีรา
1090 | เยอบีร่า
1091 | เยอร์บีรา
1092 | เยอร์บีร่า
1093 | แยมโรล
1094 | โยเกิร์ต
1095 | โยโย่
1096 | รวมมิตร
1097 | รหัสยนัย
1098 | รหัสยวิทยา
1099 | ร็อค
1100 | ร็อคเก็ต
1101 | รองรับ
1102 | ร็อตไวเลอร์
1103 | รอมฎอน
1104 | รอยัลตี้
1105 | ระโงก
1106 | รัชสมัย
1107 | รัฏฐาธิปัตย์
1108 | รันเวย์
1109 | รัม
1110 | รากหญ้า
1111 | ราชบัณฑิตยสถาน
1112 | ราชบัณฑิตยสภา
1113 | ราชานุญาต
1114 | ราชานุสาวรีย์
1115 | ราโชบาย
1116 | รามเทพ
1117 | รามาธิบดี
1118 | รามายณะ
1119 | ราเม็ง
1120 | ราเมน
1121 | รายชื่อ
1122 | ราสเบอร์รี
1123 | ริกเตอร์
1124 | ริคเตอร์
1125 | ริงกิต
1126 | รีไซเคิล
1127 | รีดไถ
1128 | รีทัช
1129 | รีเทิร์น
1130 | รีไทร์
1131 | รีโนเวท
1132 | รีบาวด์
1133 | รีแบรนด์
1134 | รีพอร์ต
1135 | รีพอร์ท
1136 | รีโมต
1137 | รีโมท
1138 | รีวิว
1139 | รีสอร์ต
1140 | รีสอร์ท
1141 | รีเสิร์ช
1142 | รุมบ้า
1143 | รุสโซ
1144 | รูทีน
1145 | รูบิก
1146 | รูบิค
1147 | รูฟ
1148 | เรซซิ่ง
1149 | เรซิน
1150 | เรซิ่น
1151 | เรดิโอ
1152 | เรต
1153 | เรตติ้ง
1154 | เรนเจอร์
1155 | เรนเดียร์
1156 | แรคคูน
1157 | แรงใจ
1158 | แรงดูด
1159 | แรงผลัก
1160 | แรลลี
1161 | แรลลี่
1162 | โรดโชว์
1163 | โรดแมป
1164 | โรเนียว
1165 | โรบัสตา
1166 | โรบัสต้า
1167 | โรแมนติก
1168 | โรแมนติค
1169 | โรล
1170 | โรลออน
1171 | ไรเฟิล
1172 | ล็อกเกอร์
1173 | ลอจิสติกส์
1174 | ล็อต
1175 | ล็อบบี้
1176 | ลอร์ด
1177 | ล้มเหลว
1178 | ละติน
1179 | ละอ่อน
1180 | ลาซานญ่า
1181 | ลาติน
1182 | ลาเต้
1183 | ลาเท็กซ์
1184 | ลานีญา
1185 | ลาบราดอร์
1186 | ลามะ
1187 | ลาเวนเดอร์
1188 | ลิเบอรัล
1189 | ลิมิต
1190 | ลิมูซีน
1191 | ลิลลี่
1192 | ลิสต์
1193 | ลีก
1194 | ลีด
1195 | ลีดเดอร์
1196 | ลีมูซีน
1197 | ลีเมอร์
1198 | ลีลาวดี
1199 | ลุค
1200 | ลูกชาย
1201 | ลูกสาว
1202 | เลกเชอร์
1203 | เลโก้
1204 | เลคเชอร์
1205 | เลดี้
1206 | เลมอน
1207 | เลย์เอาต์
1208 | เลสเบี้ยน
1209 | เลิฟ
1210 | แลนด์
1211 | แล็บ
1212 | โลโก้
1213 | โลชั่น
1214 | โละ
1215 | ไลท์
1216 | ไลน์
1217 | ไลฟ์
1218 | วนาราม
1219 | วรยุทธ์
1220 | วราราม
1221 | วโรกาส
1222 | ว้อดก้า
1223 | วอเตอร์
1224 | วอฟเฟิล
1225 | ว้อย
1226 | วอยซ์
1227 | วอร์ม
1228 | วอร์มอัพ
1229 | วอร์รูม
1230 | วอล์ก
1231 | วอล์ค
1232 | วอลซ์
1233 | วอลนัต
1234 | วอลนัท
1235 | วอลเปเปอร์
1236 | วอลล์
1237 | วอลล์เปเปอร์
1238 | วอลเลย์
1239 | ว่ะ
1240 | วันเวย์
1241 | วัสดุภัณฑ์
1242 | วัสสา
1243 | วาซาบิ
1244 | วาทกรรม
1245 | วาทะ
1246 | วานิลลา
1247 | วานิลา
1248 | วาฟเฟิล
1249 | วาริชศาสตร์
1250 | ว้าว
1251 | วัคค์
1252 | วัจนะ
1253 | วาไรตี้
1254 | วิก
1255 | วิดีโอ
1256 | วิทย์
1257 | วิน
1258 | วินเซอร์
1259 | วินด์เซิร์ฟ
1260 | วินเทจ
1261 | วิป
1262 | วิปปิ้ง
1263 | วิภัชภาค
1264 | วิว
1265 | วิลล์
1266 | วิลเลจ
1267 | วิโอลา
1268 | วีเจ
1269 | วีซ่า
1270 | วีดิทัศน์
1271 | วีน
1272 | วีลแชร์
1273 | วีไอพี
1274 | วืด
1275 | วูซู
1276 | วู้ด
1277 | เวณิกา
1278 | เวเฟอร์
1279 | เวล่ำ
1280 | เวสต์
1281 | เวสต์เทิร์น
1282 | เวสเทิร์น
1283 | เวอร์
1284 | เวิร์ก
1285 | เวิร์กช็อป
1286 | เวิร์ค
1287 | เวิร์ลด์
1288 | เวิลด์
1289 | แว้บ
1290 | แวมไพร์
1291 | ไวกิ้ง
1292 | ไวทัล
1293 | ไวนิล
1294 | ไวเบรเตอร์
1295 | ไวรัล
1296 | ไวอะกร้า
1297 | ไวอากร้า
1298 | ศรีอริย
1299 | ศรีอาริย
1300 | ศรีอาริย์
1301 | ศรีอาริยะ
1302 | ศากยบุตร
1303 | ศิรินทร์
1304 | ศิลปวัฒนธรรม
1305 | ศิลปากร
1306 | ศิวิไลซ์
1307 | ศึกษาศาสตร์
1308 | เศรษฐินี
1309 | สกรัม
1310 | สกรู
1311 | สกอร์
1312 | สกาย
1313 | สกู๊ป
1314 | สเกตช์
1315 | สเก็ตช์
1316 | สคริปต์
1317 | สแควร์
1318 | สงบสุข
1319 | สจ๊วต
1320 | สตรอเบอร์รี
1321 | สตรอเบอรี
1322 | สตรอว์เบอร์รี
1323 | สตริง
1324 | สตรีท
1325 | สต็อก
1326 | สต๊อก
1327 | สต็อค
1328 | สต๊อค
1329 | สตอรี
1330 | สตาฟฟ์
1331 | สตาร์
1332 | สตาร์ตอัป
1333 | สตาร์ตอัพ
1334 | สตาร์ท
1335 | สตาร์ทอัป
1336 | สตาร์ทอัพ
1337 | สติกเกอร์
1338 | สติ๊กเกอร์
1339 | สตีล
1340 | สตูดิโอ
1341 | สเตชัน
1342 | สเตชั่น
1343 | สเตเดียม
1344 | สเตนเลส
1345 | สเต็ป
1346 | สเตย์
1347 | สเตริโอ
1348 | สเตอร์ลิง
1349 | สเตอริไลส์
1350 | สเตอริโอ
1351 | สแตนดาร์ด
1352 | สแตนเลส
1353 | สโตน
1354 | สโตร์
1355 | สไตรค์
1356 | สไตล์
1357 | สถาปัตย์
1358 | สแนร์
1359 | สไนเปอร์
1360 | สปอต
1361 | สป็อต
1362 | สปอนเซอร์
1363 | สปอร์ต
1364 | สปา
1365 | สปาย
1366 | สปาร์ก
1367 | สปาร์ค
1368 | สปิริต
1369 | สปีด
1370 | สเปก
1371 | สเปค
1372 | สเปรย์
1373 | สไปเดอร์
1374 | สมณพราหมณ์
1375 | สมณเพศ
1376 | สมาพันธ์
1377 | สไมล์
1378 | สรรพาวุธ
1379 | สลัม
1380 | สแล็ก
1381 | สโลแกน
1382 | สโลว์
1383 | สไลด์
1384 | สวิงกิ้ง
1385 | สวีท
1386 | สหรัฐ
1387 | สหัชญาณ
1388 | สหัสวรรษ
1389 | สะกอม
1390 | สะเด่า
1391 | สะบึม
1392 | สะบึมส์
1393 | สะออน
1394 | สังฆภัณฑ์
1395 | สังโฆ
1396 | สะโหลสะเหล
1397 | สันทนาการ
1398 | สับปะรังเค
1399 | สัปปะรังเค
1400 | สาธารณกุศล
1401 | สามช่า
1402 | สามแยก
1403 | สารขัน
1404 | สารขันธ์
1405 | สารขัณฑ์
1406 | สิม
1407 | สี่แยก
1408 | สึนามิ
1409 | สุขภาวะ
1410 | สุทธาราม
1411 | สุนทรีย์
1412 | สุรทิน
1413 | สุริยยาตร
1414 | สุริยยาตร์
1415 | สุหนัด
1416 | สุหนี่
1417 | เสกสรรค์
1418 | เสพติด
1419 | เสือโคร่ง
1420 | หงวน
1421 | หน่อมแน้ม
1422 | หมวย
1423 | หมั่นโถว
1424 | หม่านโถว
1425 | หมายปอง
1426 | หมิง
1427 | หยวน
1428 | หยาง
1429 | หยิน
1430 | หลวงตา
1431 | หลวงปู่
1432 | หลวงพี่
1433 | หล่อฮังก้วย
1434 | หลินจือ
1435 | ห่วย
1436 | หะเบส
1437 | หะเรีย
1438 | หายนภัย
1439 | หิญาบ
1440 | เห็นด้วย
1441 | เหมย
1442 | เหมยขาบ
1443 | เห่ย
1444 | เหลวเป๋ว
1445 | เหวอ
1446 | เหี่ยวย่น
1447 | แหม็บ
1448 | แหวว
1449 | โหงว
1450 | โหงวเฮ้ง
1451 | โหลน
1452 | โหลยโท่ย
1453 | ไหง
1454 | ไหร่
1455 | องคชาติ
1456 | อนิเมะ
1457 | อพาร์ตเมนต์
1458 | อพาร์ตเมนท์
1459 | อพาร์ทเมนต์
1460 | อพาร์ทเมนท์
1461 | อภิวัฒน์
1462 | อมาตยาธิปไตย
1463 | อเมริกาโน
1464 | อยุติธรรม
1465 | อราบิกา
1466 | อราบิก้า
1467 | อริยสงฆ์
1468 | อ่วม
1469 | อวอร์ด
1470 | อโวคาโด
1471 | ออกแบบ
1472 | ออเคสตรา
1473 | ออดิชัน
1474 | ออดิชั่น
1475 | ออดิทอเรียม
1476 | ออเดอร์
1477 | ออโต้
1478 | ออทิสติก
1479 | อ่อนด้อย
1480 | ออฟ
1481 | ออฟโรด
1482 | ออยล์
1483 | ออร์แกน
1484 | ออร์แกนิก
1485 | ออร์แกนิค
1486 | ออร์เคสตรา
1487 | ออร์เดอร์
1488 | ออร์โทดอกซ์
1489 | ออรัล
1490 | ออสซี่
1491 | อะ
1492 | อะคูสติก
1493 | อะโวคาโด
1494 | อะฮั้น
1495 | อัครสาวก
1496 | อั่งเปา
1497 | อัตลักษณ์
1498 | อัตวิสัย
1499 | อันเดอร์
1500 | อันตรกิริยา
1501 | อัลตรา
1502 | อัลเซเชียน
1503 | อัลไซเมอร์
1504 | อัลบัม
1505 | อัลบั้ม
1506 | อัลมอนด์
1507 | อาฟเตอร์
1508 | อามาสัย
1509 | อาร์ค
1510 | อาร์ติสต์
1511 | อาร์พีจี
1512 | อาราบิกา
1513 | อาราบิก้า
1514 | อาว์
1515 | อาโวคาโด
1516 | อาสวะ
1517 | อิกัวนา
1518 | อินดอร์
1519 | อินดัสตรี
1520 | อินดัสตรีส์
1521 | อินดัสทรี
1522 | อินดัสทรีส์
1523 | อินดี้
1524 | อินเตอร์
1525 | อินเตอร์โพล
1526 | อินน์
1527 | อินเนอร์
1528 | อินบาวด์
1529 | อินโฟกราฟิก
1530 | อิมพอร์ต
1531 | อิมพีเรียล
1532 | อิเล็กทริก
1533 | อิเล็กทริค
1534 | อิสรชน
1535 | อิออน
1536 | อีคิว
1537 | อีโคคาร์
1538 | อีแต๋น
1539 | อีโรติก
1540 | อีเวนต์
1541 | อีเวนท์
1542 | อีสต์
1543 | อีสต์เทิร์น
1544 | อีสเตอร์
1545 | อีสเทิร์น
1546 | อึ๊บ
1547 | อึ้ม
1548 | อึ๋ม
1549 | อึมครึม
1550 | อุด้ง
1551 | อุตสาหการ
1552 | อุเทน
1553 | อุนาโลม
1554 | อุบัติการณ์
1555 | อุปการคุณ
1556 | อุปทาน
1557 | อุปนายก
1558 | อุปนายิกา
1559 | อุปสงค์
1560 | อุปัทวเหตุ
1561 | อุรังคธาตุ
1562 | อูคูเลเล่
1563 | อู้ฟู่
1564 | อูมามิ
1565 | เอ๋
1566 | เอ็กซ์คลูซีฟ
1567 | เอ็กเซอร์ไซส์
1568 | เอ็กซ์โป
1569 | เอกซ์พอร์ต
1570 | เอ็กซ์พอร์ต
1571 | เอ็กซ์เพรส
1572 | เอ็กโซเซต์
1573 | เอ็กโซเซ่ต์
1574 | เอแคลร์
1575 | เอเจนซี
1576 | เอเจนซี่
1577 | เอเซีย
1578 | เอนจอย
1579 | เอ็นจีโอ
1580 | เอ็นเตอร์เทน
1581 | เอนทรานซ์
1582 | เอ็นทรานซ์
1583 | เอฟเฟกต์
1584 | เอฟเฟ็กต์
1585 | เอเยนต์
1586 | เอลนีโญ
1587 | เอเลียน
1588 | เอเลี่ยน
1589 | เอสเปรสโซ
1590 | เอสเพรสโซ
1591 | เอ๋อ
1592 | เอาต์
1593 | เอาท์
1594 | เอาท์ดอร์
1595 | เอ๊าะ
1596 | แอ็กชั่น
1597 | แอ็คชั่น
1598 | แอคทีฟ
1599 | แอดมิชชั่น
1600 | แอดมิสชัน
1601 | แอนด์
1602 | แอนตี้
1603 | แอบสแตรก
1604 | แอ็บสแตรก
1605 | แอบสแตรค
1606 | แอ็บสแตรค
1607 | แอปเปิล
1608 | แอปเปิ้ล
1609 | แอปพริคอท
1610 | แอปริคอต
1611 | แอพพริคอท
1612 | แอพริคอต
1613 | แอมป์
1614 | แอร์
1615 | แอโรบิก
1616 | แอโรบิค
1617 | แอลมอนด์
1618 | แอสเตอร์
1619 | โอเค
1620 | โอปป้า
1621 | โอเปอเรเตอร์
1622 | โอเปรา
1623 | โอเปร่า
1624 | โอเพ่น
1625 | โอ้ย
1626 | โอยัวะ
1627 | โอรสาธิราช
1628 | โอเลี้ยง
1629 | โอวัลติน
1630 | โอเวอร์
1631 | ไอคิว
1632 | ไอคิโด
1633 | ไอซ์
1634 | ไอซียู
1635 | ไอดอล
1636 | ไอเดีย
1637 | ไอติม
1638 | ฮวงจุ้ย
1639 | ฮ่องเต้
1640 | ฮองเฮา
1641 | ฮอต
1642 | ฮ็อต
1643 | ฮอตดอก
1644 | ฮ็อตด็อก
1645 | ฮอลิเดย์
1646 | ฮันนีมูน
1647 | ฮัม
1648 | ฮัมวี่
1649 | ฮัลโลวีน
1650 | ฮัลโหล
1651 | ฮัสกี้
1652 | ฮาร์ด
1653 | ฮาราคีรี
1654 | ฮาลาล
1655 | ฮาโลวีน
1656 | ฮิ
1657 | ฮิญาบ
1658 | ฮิต
1659 | ฮิบรู
1660 | ฮิปปี้
1661 | ฮิปโป
1662 | ฮิปสเตอร์
1663 | ฮิปฮอป
1664 | ฮีโร่
1665 | ฮูลาฮูป
1666 | ฮูล่าฮูป
1667 | เฮดจ์
1668 | เฮฟวี
1669 | เฮฟวี่
1670 | เฮอร์ริเคน
1671 | เฮีย
1672 | เฮาส์
1673 | แฮนด์
1674 | แฮปปี้
1675 | แฮมเบอร์เกอร์
1676 | แฮมสเตอร์
1677 | โฮม
1678 | โฮมรูม
1679 | โฮลดิงส์
1680 | โฮลวีต
1681 | โฮสเตส
1682 | ไฮ
1683 | ไฮกุ
1684 | ไฮแจ็ค
1685 | ไฮโซ
1686 | ไฮเดรนเยีย
1687 | ไฮเทค
1688 | ไฮบริด
1689 | ไฮเปอร์
1690 | ไฮไลต์
1691 | ไฮไลท์
1692 | ไฮเวย์
1693 | ไฮสคูล
1694 | 


--------------------------------------------------------------------------------