├── .gitattributes ├── .gitignore ├── Dockerfile ├── README.md ├── assets └── sources.list ├── bin ├── index.sh ├── run.sh └── searchd.sh ├── coreseek_prod.sh └── data └── prod ├── conf ├── mmseg.ini ├── uni.lib └── unigram.txt ├── data ├── stopwords.txt └── wordforms.txt └── sphinx.conf /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Author: Huijie Wei 2 | FROM 
ubuntu:14.04.2 3 | MAINTAINER Huijie Wei huijiewei@outlook.com 4 | ADD ./assets/sources.list /etc/apt/sources.list 5 | RUN apt-get update 6 | RUN apt-get upgrade -y 7 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y \ 8 | make \ 9 | gcc \ 10 | g++ \ 11 | automake \ 12 | libtool \ 13 | mysql-client \ 14 | libmysqlclient15-dev \ 15 | libxml2-dev \ 16 | libexpat1-dev \ 17 | git 18 | RUN mkdir -p /usr/src/coreseek/ 19 | RUN git clone https://github.com/huijiewei/Coreseek-Fix.git /usr/src/coreseek/ 20 | RUN chmod 777 -R /usr/src/coreseek 21 | WORKDIR /usr/src/coreseek/mmseg-3.2.14 22 | RUN ./bootstrap 23 | RUN ./configure --prefix=/usr/local/mmseg3 24 | RUN make && make install 25 | WORKDIR /usr/src/coreseek/csft-4.1 26 | RUN sh buildconf.sh 27 | RUN ./configure --prefix=/usr/local/coreseek --without-unixodbc --with-mmseg --with-mmseg-includes=/usr/local/mmseg3/include/mmseg/ --with-mmseg-libs=/usr/local/mmseg3/lib/ --with-mysql 28 | RUN make && make install 29 | WORKDIR / 30 | ADD ./bin/index.sh / 31 | RUN chmod a+x index.sh 32 | ADD ./bin/searchd.sh / 33 | RUN chmod a+x searchd.sh 34 | ADD ./bin/run.sh / 35 | RUN chmod a+x run.sh 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Coreseek Dockerfile 2 | 3 | ------ 4 | 5 | Coreseek 是一款中文全文检索/搜索软件,以 GPLv2 许可协议开源发布,基于 Sphinx 研发并独立发布,专攻中文搜索和信息处理领域,适用于行业/垂直搜索、论坛/站内搜索、数据库搜索、文档/文献检索、信息检索、数据挖掘等应用场景,用户可以免费下载使用 6 | 7 | Docker 提供了一个可以运行你的应用程序的封套(envelope),或者说容器。它原本是 dotCloud 启动的一个业余项目,并在前些时候开源了。它吸引了大量的关注和讨论,导致 dotCloud 把它重命名到 Docker Inc。它最初是用 Go 语言编写的,它就相当于是加在 LXC(LinuX Containers,linux 容器)上的管道,允许开发者在更高层次的概念上工作。 8 | 9 | ------ 10 | 11 | coreseek_prod.sh 会开启一个绑定到 3312 端口的 coreseek 服务 12 | 13 | 14 | -------------------------------------------------------------------------------- /assets/sources.list: -------------------------------------------------------------------------------- 1 | # 2 | 3 
| # deb cdrom:[Ubuntu-Server 14.04.2 LTS _Trusty Tahr_ - Release amd64 (20150218.1)]/ trusty main restricted 4 | 5 | # deb cdrom:[Ubuntu-Server 14.04.2 LTS _Trusty Tahr_ - Release amd64 (20150218.1)]/ trusty main restricted 6 | 7 | # See http://help.ubuntu.com/community/UpgradeNotes for how to upgrade to 8 | # newer versions of the distribution. 9 | deb http://cn.archive.ubuntu.com/ubuntu/ trusty main restricted 10 | deb-src http://cn.archive.ubuntu.com/ubuntu/ trusty main restricted 11 | 12 | ## Major bug fix updates produced after the final release of the 13 | ## distribution. 14 | deb http://cn.archive.ubuntu.com/ubuntu/ trusty-updates main restricted 15 | deb-src http://cn.archive.ubuntu.com/ubuntu/ trusty-updates main restricted 16 | 17 | ## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu 18 | ## team. Also, please note that software in universe WILL NOT receive any 19 | ## review or updates from the Ubuntu security team. 20 | deb http://cn.archive.ubuntu.com/ubuntu/ trusty universe 21 | deb-src http://cn.archive.ubuntu.com/ubuntu/ trusty universe 22 | deb http://cn.archive.ubuntu.com/ubuntu/ trusty-updates universe 23 | deb-src http://cn.archive.ubuntu.com/ubuntu/ trusty-updates universe 24 | 25 | ## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu 26 | ## team, and may not be under a free licence. Please satisfy yourself as to 27 | ## your rights to use the software. Also, please note that software in 28 | ## multiverse WILL NOT receive any review or updates from the Ubuntu 29 | ## security team. 30 | deb http://cn.archive.ubuntu.com/ubuntu/ trusty multiverse 31 | deb-src http://cn.archive.ubuntu.com/ubuntu/ trusty multiverse 32 | deb http://cn.archive.ubuntu.com/ubuntu/ trusty-updates multiverse 33 | deb-src http://cn.archive.ubuntu.com/ubuntu/ trusty-updates multiverse 34 | 35 | ## N.B. 
software from this repository may not have been tested as 36 | ## extensively as that contained in the main release, although it includes 37 | ## newer versions of some applications which may provide useful features. 38 | ## Also, please note that software in backports WILL NOT receive any review 39 | ## or updates from the Ubuntu security team. 40 | deb http://cn.archive.ubuntu.com/ubuntu/ trusty-backports main restricted universe multiverse 41 | deb-src http://cn.archive.ubuntu.com/ubuntu/ trusty-backports main restricted universe multiverse 42 | 43 | deb http://security.ubuntu.com/ubuntu trusty-security main restricted 44 | deb-src http://security.ubuntu.com/ubuntu trusty-security main restricted 45 | deb http://security.ubuntu.com/ubuntu trusty-security universe 46 | deb-src http://security.ubuntu.com/ubuntu trusty-security universe 47 | deb http://security.ubuntu.com/ubuntu trusty-security multiverse 48 | deb-src http://security.ubuntu.com/ubuntu trusty-security multiverse 49 | 50 | ## Uncomment the following two lines to add software from Canonical's 51 | ## 'partner' repository. 52 | ## This software is not part of Ubuntu, but is offered by Canonical and the 53 | ## respective vendors as a service to Ubuntu users. 54 | # deb http://archive.canonical.com/ubuntu trusty partner 55 | # deb-src http://archive.canonical.com/ubuntu trusty partner 56 | 57 | ## Uncomment the following two lines to add software from Ubuntu's 58 | ## 'extras' repository. 59 | ## This software is not part of Ubuntu, but is offered by third-party 60 | ## developers who want to ship their latest software. 
61 | # deb http://extras.ubuntu.com/ubuntu trusty main 62 | # deb-src http://extras.ubuntu.com/ubuntu trusty main 63 | -------------------------------------------------------------------------------- /bin/index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Rebuild every index defined in sphinx.conf; the config path is absolute so the working directory does not matter. 3 | /usr/local/coreseek/bin/indexer -c /opt/coreseek/sphinx.conf --all 4 | -------------------------------------------------------------------------------- /bin/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Build all indexes, then start searchd via searchd.sh; both steps read the same /opt/coreseek/sphinx.conf. 3 | /usr/local/coreseek/bin/indexer -c /opt/coreseek/sphinx.conf --all 4 | ./searchd.sh 5 | -------------------------------------------------------------------------------- /bin/searchd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run searchd in the foreground (--nodetach); any extra arguments are passed through to searchd. 3 | /usr/local/coreseek/bin/searchd -c /opt/coreseek/sphinx.conf --nodetach "$@" 4 | -------------------------------------------------------------------------------- /coreseek_prod.sh: -------------------------------------------------------------------------------- 1 | docker run -p 3312:3312 -v /data/dockers/coreseek/data/prod:/opt/coreseek --name coreseek_prod -d huijiewei/coreseek ./run.sh 2 | # The host directory is mounted at /opt/coreseek, the prefix that sphinx.conf and the scripts in bin/ use for every path. 3 | -------------------------------------------------------------------------------- /data/prod/conf/mmseg.ini: -------------------------------------------------------------------------------- 1 | [mmseg] 2 | merge_number_and_ascii=0; ;合并英文和数字 abc123/x 3 | number_and_ascii_joint=; ;定义可以连接英文和数字的字符 4 | compress_space=1; ;暂不支持 5 | seperate_number_ascii=0; ;就是将字母和数字打散 6 | -------------------------------------------------------------------------------- /data/prod/conf/uni.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huijiewei/CoreseekDocker/fdf7be2f904dc95369841879ec8badbe8f74ef44/data/prod/conf/uni.lib
-------------------------------------------------------------------------------- /data/prod/data/stopwords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huijiewei/CoreseekDocker/fdf7be2f904dc95369841879ec8badbe8f74ef44/data/prod/data/stopwords.txt -------------------------------------------------------------------------------- /data/prod/data/wordforms.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huijiewei/CoreseekDocker/fdf7be2f904dc95369841879ec8badbe8f74ef44/data/prod/data/wordforms.txt -------------------------------------------------------------------------------- /data/prod/sphinx.conf: -------------------------------------------------------------------------------- 1 | source mysql 2 | { 3 | type = mysql 4 | sql_host = 172.17.42.1 5 | sql_user = user 6 | sql_pass = pass 7 | sql_db = db 8 | sql_port = 3306 9 | 10 | sql_query_pre = SET NAMES utf8 11 | sql_query_pre = SET SESSION query_cache_type=OFF 12 | 13 | sql_query = SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content FROM documents 14 | 15 | sql_attr_uint = group_id 16 | sql_attr_timestamp = date_added 17 | 18 | sql_ranged_throttle = 0 19 | # Single-document lookup by id; uses the same utf8 connection charset as sql_query_pre above. 20 | sql_query_info_pre = SET NAMES utf8 21 | sql_query_info = SELECT * FROM documents WHERE id=$id 22 | } 23 | 24 | # stopwords.txt and wordforms.txt sit in data/ next to the index files; the mmseg dictionary files sit in conf/. 25 | index index_goods_name 26 | { 27 | source = mysql 28 | path = /opt/coreseek/data/mysql 29 | docinfo = extern 30 | mlock = 0 31 | morphology = none 32 | stopwords = /opt/coreseek/data/stopwords.txt 33 | wordforms = /opt/coreseek/data/wordforms.txt 34 | min_word_len = 1 35 | charset_type = zh_cn.utf-8 36 | charset_dictpath = /opt/coreseek/conf 37 | html_strip = 0 38 | } 39 | 40 | 41 | indexer 42 | { 43 | mem_limit = 256M 44 | } 45 | 46 | 47 | searchd 48 | { 49 | listen = 3312 50 | log = /opt/coreseek/log/searchd.log 51 | query_log = /opt/coreseek/log/query.log 52 | read_timeout
= 5 53 | client_timeout = 300 54 | max_children = 30 55 | pid_file = /opt/coreseek/log/searchd.pid 56 | max_matches = 1000 57 | 58 | seamless_rotate = 1 59 | preopen_indexes = 1 60 | unlink_old = 1 61 | mva_updates_pool = 1M 62 | max_packet_size = 8M 63 | max_filters = 256 64 | max_filter_values = 4096 65 | } --------------------------------------------------------------------------------