ubuntu下的中文搜索sphinx的安装配置

ubuntu下的中文搜索sphinx的安装配置

一.安装依赖包

$ sudo apt-get install make gcc g++ automake libtool mysql-client libmysqlclient15-dev libxml2-dev libexpat1-dev

二.安装中文分词

$ sudo wget -c http://www.coreseek.cn/uploads/csft/3.1/Source/mmseg-3.1.tar.gz

$ sudo tar zxvf mmseg-3.1.tar.gz -C ../software/

$ sudo ./configure --prefix=/usr/local/mmseg

$ sudo make

$ sudo make install

$ sudo mkdir dict

$ sudo cp /usr/local/src/tarbag/words.txt.uni ./uni.lib

$ sudo vim mmseg.ini

[mmseg]

merge_number_and_ascii=1; //字母和数字连续出现是否切分

number_and_ascii_joint=-.; //连接数字和字母可用的符号

compress_space=0;

seperate_number_ascii=1; //是否拆分数字

三.安装sphinx

$ sudo wget http://www.coreseek.cn/uploads/csft/3.1/Source/csft-3.1.tar.gz

$ sudo tar zxvf csft-3.1.tar.gz -C ../software/

$ sudo ./configure --prefix=/usr/local/csft --with-mysql=/usr/local/mysql --with-mysql-includes=/usr/local/mysql/include --with-mysql-libs=/usr/local/mysql/lib --with-mmseg=/usr/local/mmseg --with-mmseg-includes=/usr/local/mmseg/include/mmseg --with-mmseg-libs=/usr/local/mmseg/lib

$ sudo make

$ sudo make install

四.新建sph_counter表

CREATE TABLE sph_counter (

counter_id int(11) NOT NULL,

max_doc_id int(11) NOT NULL,

PRIMARY KEY (counter_id)

) ENGINE=InnoDB DEFAULT CHARSET=utf8

五.配置

$ cd /usr/local/csft/etc/

$ sudo cp sphinx.conf.dist sphinx.conf

$ sudo vim sphinx.conf

source bbs

{

type = mysql

sql_host = localhost

sql_user = root

sql_pass =

sql_db = test

sql_sock = /tmp/mysqld.sock

sql_query_pre = SET NAMES utf8

sql_query_pre = SET SESSION query_cache_type=OFF

sql_query_pre = REPLACE INTO sph_counter SELECT 1,MAX(pid) FROM pre_forum_post

sql_query = \

SELECT pid, fid, tid, first, invisible, authorid, dateline, subject, message \

FROM pre_forum_post \

WHERE pid<=(SELECT max_doc_id FROM sph_counter WHERE counter_id=1)

sql_attr_uint = fid

sql_attr_uint = tid

sql_attr_uint = first

sql_attr_uint = invisible

sql_attr_uint = authorid

sql_attr_timestamp = dateline

sql_query_info = SELECT * FROM documents WHERE id=$id

}

source bbs_delta : bbs

{

sql_query_pre = SET NAMES utf8

sql_query_pre = SET SESSION query_cache_type=OFF

sql_query = \

SELECT pid, fid, tid, first, invisible, authorid, dateline, subject, message \

FROM pre_forum_post \

WHERE pid>(SELECT max_doc_id FROM sph_counter WHERE counter_id=1)

}

source bbs_merge : bbs

{

sql_query_pre = SET NAMES utf8

sql_query_pre = SET SESSION query_cache_type=OFF

sql_query = \

SELECT pid, fid, tid, first, invisible, authorid, dateline, subject, message \

FROM pre_forum_post \

WHERE pid>(SELECT max_doc_id FROM sph_counter WHERE counter_id=1)

sql_query_post = REPLACE INTO sph_counter SELECT 1, MAX(pid) FROM pre_forum_post

}

index bbs

{

source = bbs

path = /usr/local/csft/var/data/bbs

docinfo = extern

mlock = 0

morphology = none

min_word_len = 1

charset_type = zh_cn.utf-8

charset_dictpath = /usr/local/mmseg/dict

html_strip = 0

}

index bbs_delta : bbs

{

source = bbs_delta

path = /usr/local/csft/var/data/bbs_delta

}

index bbs_merge : bbs

{

source = bbs_merge

path = /usr/local/csft/var/data/bbs_merge

}

indexer

{

mem_limit = 256M

}

searchd

{

log = /usr/local/csft/var/log/searchd.log

query_log = /usr/local/csft/var/log/query.log

read_timeout = 5

client_timeout = 300

max_children = 30

pid_file = /usr/local/csft/var/log/searchd.pid

max_matches = 1000

seamless_rotate = 1

preopen_indexes = 0

unlink_old = 1

mva_updates_pool = 1M

max_packet_size = 8M

max_filters = 256

max_filter_values = 4096

}

六.生成索引

$ sudo /usr/local/csft/bin/indexer --config /usr/local/csft/etc/sphinx.conf --all

Coreseek Full Text Server 3.1

Copyright (c) 2006-2008 coreseek.com

using config file '/usr/local/csft/etc/sphinx.conf'...

indexing index 'bbs'...

iniparser: cannot open /usr/local/mmseg/dict/mmseg.ini

collected 3 docs, 0.0 MB

sorted 0.0 Mhits, 100.0% done

total 3 docs, 39578 bytes

total 0.050 sec, 799410.19 bytes/sec, 60.60 docs/sec

indexing index 'bbs_delta'...

collected 3 docs, 0.0 MB

sorted 0.0 Mhits, 100.0% done

total 3 docs, 39578 bytes

total 0.044 sec, 902329.94 bytes/sec, 68.40 docs/sec

indexing index 'bbs_merge'...

collected 3 docs, 0.0 MB

sorted 0.0 Mhits, 100.0% done

total 3 docs, 39578 bytes

total 0.022 sec, 1767980.00 bytes/sec, 134.01 docs/sec

total 9 reads, 0.0 sec, 21.3 kb/read avg, 0.0 msec/read avg

total 21 writes, 0.0 sec, 10.9 kb/write avg, 0.0 msec/write avg

七.测试

$ sudo /usr/local/csft/bin/search --config /usr/local/csft/etc/sphinx.conf "盛大"

Coreseek Full Text Server 3.1

Copyright (c) 2006-2008 coreseek.com

using config file '/usr/local/csft/etc/sphinx.conf'...

index 'bbs': query '盛大 ': returned 1 matches of 1 total in 0.004 sec

displaying matches:

1. document=29, weight=2, fid=33, tid=20, first=1, invisible=0, authorid=2, dateline=Thu Dec 23 06:14:00 2004

words:

1. '盛': 1 documents, 1 hits

2. '大': 2 documents, 54 hits

index 'bbs_delta': query '盛大 ': returned 1 matches of 1 total in 0.000 sec

displaying matches:

1. document=29, weight=2, fid=33, tid=20, first=1, invisible=0, authorid=2, dateline=Thu Dec 23 06:14:00 2004

words:

1. '盛': 1 documents, 1 hits

2. '大': 2 documents, 54 hits

index 'bbs_merge': query '盛大 ': returned 1 matches of 1 total in 0.000 sec

displaying matches:

1. document=29, weight=2, fid=33, tid=20, first=1, invisible=0, authorid=2, dateline=Thu Dec 23 06:14:00 2004

words:

1. '盛': 1 documents, 1 hits

2. '大': 2 documents, 54 hits

八.启动searchd

$ sudo /usr/local/csft/bin/searchd --config /usr/local/csft/etc/sphinx.conf

Coreseek Full Text Server 3.1

Copyright (c) 2006-2008 coreseek.com

using config file '/usr/local/csft/etc/sphinx.conf'...

listening on all interfaces, port=3312

九.计划任务更新合并索引

$ sudo crontab -e

# m h dom mon dow command

*/5 * * * * /usr/local/csft/bin/indexer --config /usr/local/csft/etc/sphinx.conf bbs_delta --rotate

00 04 * * * /usr/local/csft/bin/indexer --config /usr/local/csft/etc/sphinx.conf bbs_merge --rotate && /usr/local/csft/bin/indexer --config /usr/local/csft/etc/sphinx.conf --merge bbs bbs_merge --rotate

注:sphinx 1.x 版本已支持实时索引(real-time index)。