discuzX使用sphinx实现全文检索教程
发布:smiling 来源: PHP粉丝网 添加日期:2015-04-04 10:21:00 浏览: 评论:0
Sphinx是一个基于SQL的全文检索引擎,可以结合MySQL、PostgreSQL做全文搜索。它能提供比数据库本身更专业的搜索功能,使应用程序更容易实现专业化的全文检索。本文介绍两部分内容:一是基于discuz的索引配置文件,二是PHP调用sphinx的代码。先看配置文件,这个配置文件比较灵活,可以根据不同的需求来配置,代码如下:
- #
- # LinuxTone full index search configure file
- #
- # Data source for the main (full) index: pulls rows from cdb_posts over MySQL.
- source lt_posts
- {
- type = mysql
- sql_host = 127.0.0.1
- sql_user = root
- sql_pass =
- sql_db = lt_bbs
- sql_port = 3306
- # Run before the main query so the connection uses UTF-8.
- sql_query_pre = SET NAMES utf8
- # The first selected column (pid) is the document id; only rows with first=1 are indexed
- # (presumably the opening post of each thread - confirm against the Discuz schema).
- sql_query = SELECT pid,tid,fid,dateline,subject,message,author FROM cdb_posts where first=1 #the index is built on the posts table so that the subject, message and author fields can all be searched at once
- # fid and dateline are declared as attributes (unsigned int / timestamp).
- sql_attr_uint = fid
- sql_attr_timestamp = dateline
- # Lookup query for a single document; $id is replaced with the document id.
- sql_query_info = SELECT * FROM cdb_posts WHERE pid=$id
- }
- # Main index definition: built from source lt_posts and stored under /data/sphinx/data.
- index lt_posts
- {
- source = lt_posts
- path = /data/sphinx/data/lt_posts
- docinfo = extern
- mlock = 0
- # No stemming/morphology processing.
- morphology = none
- # Words shorter than 2 characters are not indexed.
- min_word_len = 2
- # Strip HTML markup from the text before indexing.
- html_strip = 1
- # Chinese word segmentation: mmseg dictionary directory and UTF-8 Chinese charset
- # (these two directives are Coreseek/csft extensions - NOTE(review): confirm for your build).
- charset_dictpath = /usr/local/mmseg-3.2.13/etc/
- charset_type = zh_cn.utf-8
- # N-gram indexing disabled.
- ngram_len = 0
- }
- ########## Delta (incremental) index ##################
- # Data source for the delta index: same columns as lt_posts, restricted to recent rows.
- source delta
- {
- type = mysql
- sql_host = 127.0.0.1
- sql_user = root
- sql_pass =
- sql_db = lt_bbs
- sql_port = 3306 # optional, default is 3306
- # Run before the main query so the connection uses UTF-8.
- sql_query_pre = SET NAMES utf8
- # Only rows whose dateline falls within the last 3600*10 seconds (10 hours) are selected.
- sql_query = SELECT pid,tid,fid,dateline,subject,message,author FROM cdb_posts where first=1 and dateline > unix_timestamp()-3600*10 #the delta index picks up newly added rows by subtracting the desired interval from the current timestamp
- # fid and dateline are declared as attributes (unsigned int / timestamp).
- sql_attr_uint = fid
- sql_attr_timestamp = dateline
- # Lookup query for a single document; $id is replaced with the document id.
- sql_query_info = SELECT * FROM cdb_posts WHERE pid=$id
- }
- # Delta index definition: built from source delta; settings mirror index lt_posts
- # so that both indexes tokenize text identically.
- index delta
- {
- source = delta
- path = /data/sphinx/data/lt_delta
- docinfo = extern
- mlock = 0
- # No stemming/morphology processing.
- morphology = none
- # Words shorter than 2 characters are not indexed.
- min_word_len = 2
- # Strip HTML markup from the text before indexing.
- html_strip = 1
- # Chinese word segmentation: mmseg dictionary directory and UTF-8 Chinese charset.
- charset_dictpath = /usr/local/mmseg-3.2.13/etc/
- # The value must be exactly zh_cn.utf-8; "//" does not start a comment in this
- # file format, so nothing may follow the value except a "#" comment.
- charset_type = zh_cn.utf-8
- # N-gram indexing disabled.
- ngram_len = 0
- }
- # Settings for the indexer program.
- indexer
- {
- # Memory limit used while building indexes.
- mem_limit = 32M
- }
- # Settings for the searchd daemon.
- searchd
- {
- # TCP port the daemon listens on; the PHP client below connects to this port.
- port = 9312
- # Daemon log, per-query log and pid file all live under /data/sphinx/var/log.
- log = /data/sphinx/var/log/searchd.log
- query_log = /data/sphinx/var/log/query.log
- # Client read timeout, in seconds.
- read_timeout = 5
- # Upper bound on concurrently running search children.
- max_children = 30
- pid_file = /data/sphinx/var/log/searchd.pid
- # Maximum number of matches kept per query.
- max_matches = 10000
- # Index rotation behaviour (relevant because the indexer scripts are run with --rotate).
- seamless_rotate = 1
- preopen_indexes = 0
- # Remove old index files after a successful rotation.
- unlink_old = 1
- }
sphinx最主要的就是这个配置文件,当然在增量索引部分可以写一个脚本放到crontab里面来定时跑.
下面介绍sphinx的PHP调用部分。sphinx的PHP接口以扩展的形式提供,可以通过 pecl install sphinx 命令安装,或者从 http://pecl.php.net/package/sphinx 下载源码后编译安装,代码如下:
- <?php
- /**
-  * Full-text search endpoint.
-  *
-  * Reads the keyword (?q=) and page number (?page=) from the query string,
-  * asks searchd for matching document ids, loads the matching rows from
-  * cdb_posts, highlights the keyword in subject/message and renders the
-  * "lt_search" template.
-  *
-  * Variables left for the template: $q, $page, $pages, $rs, $data,
-  * $query_totals, $query_time, $multipage.
-  */
- define('IN_DISCUZ', true);
- require_once './include/common.inc.php';
- // Keyword: drop HTML tags, angle brackets, single quotes and commas.
- $q = isset($_GET['q']) && !empty($_GET['q']) ? $_GET['q'] : '';
- $q = str_replace(array('<', '>', ' ', "'", ','), array('', '', ' ', '', ''), strip_tags($q));
- // Paging: pages are 1-based, 20 results per page.
- $page = isset($_GET['page']) && intval($_GET['page']) > 0 ? intval($_GET['page']) : 1;
- $perNum = 20;
- $offset = ($page - 1) * $perNum;
- $search = new SphinxClient();
- $search -> setServer('127.0.0.1', 9312);
- $search -> setConnectTimeout(2);
- $search -> setArrayResult(true);
- $search -> setMatchMode(SPH_MATCH_ANY);
- $search -> setRankingMode(SPH_RANK_PROXIMITY_BM25);
- $search -> setSortMode(SPH_SORT_EXTENDED, '@relevance desc,@weight desc');
- $search -> setLimits($offset, $perNum);
- $search -> setFieldWeights(array('subject' => 2000, 'message' => 0));
- // Defaults so the template never sees an undefined variable.
- $rs = array();
- $pages = 0;
- $multipage = '';
- $query_totals = $query_time = 0;
- if (!empty($q)) {
-     // "*" searches every index served by searchd (main + delta).
-     $result = $search -> Query($q, "*");
-     // Query() does not return an array on failure; keep the empty defaults then.
-     if (is_array($result)) {
-         $rs = $result;
-         $pages = ceil($rs['total'] / $perNum);
-         $query_totals = $rs['total_found'];
-         $query_time = $rs['time'];
-     }
- }
- $data = $title = $content = array();
- // 'matches' is absent when nothing matched; skip the SQL lookup in that case.
- if (!empty($rs['matches']) && $page <= $pages) {
-     $pids = array();
-     foreach ($rs['matches'] as $v) {
-         // Force integers: the ids are interpolated into the SQL below.
-         $pids[] = intval($v['id']);
-     }
-     $pid = implode(',', $pids);
-     // ORDER BY FIELD keeps the rows in the relevance order returned by searchd.
-     $sql = "select pid,tid,author,authorid,subject,message,dateline from cdb_posts where pid IN($pid) and status ='0' and invisible='0' order by field(pid,$pid)";
-     $query = $db -> query($sql);
-     while ($row = $db -> fetch_array($query)) {
-         $data[] = $row;
-         $title[] = $row['subject'];
-         // Strip BBCode tags such as [b], [/url], [color=red]; each match stops at the
-         // tag's own closing bracket so the text between tags is preserved.
-         $content[] = preg_replace('/\[\/?(b|img|url|color|s|hr|p|list|i|align|email|u|font|code|hide|table|tr|td|th|attach|indent|float)[^\]]*\]/', '', strip_tags($row['message']));
-     }
-     // Keyword highlighting (skipped when every matched row was filtered out by the SQL).
-     if (!empty($data)) {
-         $opts = array();
-         $opts['before_match'] = '<em>';
-         $opts['after_match'] = '</em>';
-         $hlTitle = $search -> BuildExcerpts($title, 'lt_posts', $q, $opts);
-         $hlContent = $search -> BuildExcerpts($content, 'lt_posts', $q, $opts);
-         // If excerpt building fails, fall back to the plain, un-highlighted text.
-         if (is_array($hlTitle)) {
-             $title = $hlTitle;
-         }
-         if (is_array($hlContent)) {
-             $content = $hlContent;
-         }
-         foreach ($data as $k => $v) {
-             $data[$k]['subject'] = $title[$k];
-             $data[$k]['message'] = $content[$k];
-         }
-     }
-     $url = "s.php?q=" . urlencode($q);
-     $multipage = multi($rs['total'], $perNum, $page, $url);
- }
- include template("lt_search");
- ?>
跑主索引的shell脚本search-index.sh,代码如下:
- #!/bin/bash
- #
- # Rebuild the full BBS search index (lt_posts) with --rotate and append the
- # indexer's standard output to a log file named after the current date and hour.
- #
- indexer_bin=/usr/local/csft-3.2.13/bin/indexer
- conf_file=/usr/local/csft-3.2.13/etc/lt_posts.conf
- log_file="/data/sphinx/var/$(date "+%Y-%m-%d-%H").log"
- "$indexer_bin" -c "$conf_file" --rotate lt_posts >> "$log_file"
跑增量索引的shell脚本search-delta.sh,代码如下:
- #!/bin/bash
- #
- # Rebuild the BBS delta search index with --rotate.
- #
- indexer_bin=/usr/local/csft-3.2.13/bin/indexer
- conf_file=/usr/local/csft-3.2.13/etc/lt_posts.conf
- # Build the delta index
- "$indexer_bin" -c "$conf_file" --rotate delta
- # Merge the delta index into the main index (disabled; uncomment to enable)
- #/usr/local/csft-3.2.13/bin/indexer --config /usr/local/csft-3.2.13/etc/lt_posts.conf --rotate --merge lt_posts delta
Tags: discuzX全文检索 discuzX搜索
推荐文章
热门文章
最新评论文章
- 写给考虑创业的年轻程序员(10)
- PHP新手上路(一)(7)
- 惹恼程序员的十件事(5)
- PHP邮件发送例子,已测试成功(5)
- 致初学者:PHP比ASP优秀的七个理由(4)
- PHP会被淘汰吗?(4)
- PHP新手上路(四)(4)
- 如何去学习PHP?(2)
- 简单入门级php分页代码(2)
- php中邮箱email 电话等格式的验证(2)