Sphinx search engine Installation

From Teknologisk videncenter
Jump to: navigation, search

Start

Compiling

Sphinx installs by default in /usr/local and searches for the database itself. No options are really necessary.

  1. Executed ./configure which did all the tests successfully and generated the Makefiles.
  2. Executed make to generate the binaries. It showed minor warnings, but ran all tests successfully.
  3. Executed make install showing output below
Making install in src
if test -d ../.svn; then svn info .. --xml | perl svnxrev.pl; fi;
make  install-am
test -z "/usr/local/bin" || /usr/local/sw/sphinx-0.9.9-rc1/config/install-sh -d "/usr/local/bin"
  /usr/bin/install -c 'indexer' '/usr/local/bin/indexer'
  /usr/bin/install -c 'searchd' '/usr/local/bin/searchd'
  /usr/bin/install -c 'search' '/usr/local/bin/search'
  /usr/bin/install -c 'spelldump' '/usr/local/bin/spelldump'
Making install in test
test -z "/usr/local/etc" || /usr/local/sw/sphinx-0.9.9-rc1/config/install-sh -d "/usr/local/etc"
 /usr/bin/install -c -m 644 'sphinx.conf.dist' '/usr/local/etc/sphinx.conf.dist'
 /usr/bin/install -c -m 644 'sphinx-min.conf.dist' '/usr/local/etc/sphinx-min.conf.dist'
 /usr/bin/install -c -m 644 'example.sql' '/usr/local/etc/example.sql'
make  install-data-hook
mkdir -p /usr/local/var/data && mkdir -p /usr/local/var/log
  • created directory /var/data/tekkom
  • created directory /var/data/tekkomstemmed (English morphology)

Download mediawiki files for sphinx

Download Showfiles. Unpack and copy all the files except sphinx.conf to /usr/local/www/data/mediawiki/extensions (create the directory if necessary).

  • Option 1: copy the sphinx.conf file to the /usr/local/etc directory and change the file where necessary.
  • Option 2: make your own sphinx.conf file. See below
  • Option 3: Copy and edit a prepared sphinx.conf for mediawiki. See next section

sphinx.conf

  • /usr/local/etc/sphinx.conf
  • First config I made, indexed history of all articles. Only want latest version. See working sphinx.conf below

First sphinx.conf - not used

#
# Author: HeTh
# date..: 28/2/2009
#

# Data source for the first (unused) configuration.
# It reads every row of MediaWiki's `text` table, so all stored revisions
# (the full article history) end up in the index, not only the latest one.
source src1
{
        type                                    = mysql

        sql_host                                = localhost
        sql_user                                = <REMOVED>
        sql_pass                                = <REMOVED>
        sql_db                                  = wikidb
## --> HeTh inserted
        sql_query_pre   =
        # keep a space before each trailing backslash so the continued
        # lines cannot run together when they are joined
        sql_query       = \
          SELECT old_id, old_text \
          FROM text
        sql_query_post  =
        # used by the command-line search utility to display document information
        sql_query_info  = SELECT * FROM text WHERE old_id=$id
#<--HeTh
## --> HeTh Commented (leftovers from the sample configuration)
#       sql_query                               = \
#               SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content \
#               FROM documents
#
#       sql_attr_uint                   = group_id
#       sql_attr_timestamp              = date_added
#
# The sample's sql_query_info is commented out as well: it duplicated the
# key set above and queried the sample `documents` table instead of `text`.
#       sql_query_info                  = SELECT * FROM documents WHERE id=$id
#<--HeTh
}


############# --> HeTh Testing out morphology in English
# Base index over source src1 with no stemming (morphology = none).
index tekkom
{
        source                  = src1
        # index files are written under this path
        path                    = /var/data/tekkom
        docinfo                 = extern
        mlock                   = 0
        morphology              = none
        min_word_len            = 1
        # single-byte charset (the working MediaWiki config uses utf-8 instead)
        charset_type            = sbcs
        html_strip                              = 0
}
# Same index definition as tekkom (declared with ": tekkom"), stored in its
# own path and with English stemming switched on for comparison.
index tekkomstemmed : tekkom
{
        path                    = /var/data/tekkomstemmed
        morphology              = stem_en
}

############# <-- HeTh (Commented out the simple index below)
#index tekkom
#{
#       source                                  = src1
#       path                                    = /var/data/tekkom
#       docinfo                                 = extern
#       charset_type                    = sbcs
#}


indexer
{
        # HeTh raised to 64M from 32M 28/2-09
        mem_limit                               = 64M
}


# Search daemon settings for the first (unused) configuration.
searchd
{
        # port on which the search daemon listens
        listen                                  = 3312
        # searchd run info is logged here
        log                                             = /var/log/searchd.log
        # all search queries are logged here
        query_log                               = /var/log/query.log
        # client read timeout, seconds
        read_timeout                    = 5
        # maximum number of children to fork
        max_children                    = 30
        # file that will contain the searchd process ID
        pid_file                                = /var/log/searchd.pid
        # maximum number of matches retrieved from each index and served to a client
        max_matches                             = 1000
        # NOTE(review): the three settings below are not explained on this page;
        # presumably they control index rotation/opening - confirm in the Sphinx manual
        seamless_rotate                 = 1
        preopen_indexes                 = 0
        unlink_old                              = 1
}

sphinx.conf

#
# Sphinx configuration for MediaWiki
#
# Based on examples by Paul Grinberg at http://www.mediawiki.org/wiki/Extension:SphinxSearch
# and Hank at http://www.ralree.info/2007/9/15/fulltext-indexing-wikipedia-with-sphinx
#
# Modified by Svemir Brkic for http://www.newworldencyclopedia.org/
#
# Released under GNU General Public License (see http://www.fsf.org/licenses/gpl.html)
#
# Latest version available at http://www.mediawiki.org/wiki/Extension:SphinxSearch

# data source definition for the main index
source src_wiki_main
{
 # data source
 type           = mysql
 sql_host       = localhost
 sql_user       = <REMOVED>
 sql_pass       = <REMOVED>
 sql_db         = wikidb
 # these two are optional
 #sql_port      = 3306
 #sql_sock      = /var/lib/mysql/mysql.sock

 # pre-query, executed before the main fetch query
 sql_query_pre  = SET NAMES utf8

 # main document fetch query - change the table names if you are using a prefix
 # rev_id=page_latest selects only the current revision of each page and
 # old_id=rev_text_id picks that revision's row in the text table, so older
 # revisions (page history) are not fetched.
 sql_query      = SELECT page_id, page_title, page_namespace, old_id, old_text FROM page, revision, text WHERE rev_id=page_latest AND old_id=rev_text_id

 # attribute columns
 sql_attr_uint  = page_namespace
 sql_attr_uint  = old_id

 # uncomment next line to collect all category ids for a category filter
 #sql_attr_multi  = uint category from query; SELECT cl_from, page_id AS category FROM categorylinks, page WHERE page_title=cl_to AND page_namespace=14

 # optional - used by command-line search utility to display document information
 # ($id is matched against page_id here)
 sql_query_info = SELECT page_title, page_namespace FROM page WHERE page_id=$id
}

# data source definition for the incremental index
source src_wiki_incremental : src_wiki_main
{
 # adjust this query based on the time you run the full index
 # in this case, full index runs at 3 AM (server time) which translates to 7 AM UTC
 # Same query as the main source, restricted to pages whose page_touched is at
 # or after today's date with the time part fixed to 07:00:00 (the literal
 # '070000' in the DATE_FORMAT pattern) - change that literal to move the cutoff.
 sql_query      = SELECT page_id, page_title, page_namespace, old_id, old_text FROM page, revision, text WHERE rev_id=page_latest AND old_id=rev_text_id AND page_touched>=DATE_FORMAT(CURDATE(), '%Y%m%d070000')

 # all other parameters are copied from the parent source
}

# main index definition
index wiki_main
{
 # which document source to index
 source         = src_wiki_main

 # this is path and index file name without extension
 # you may need to change this path or create this folder
 path           = /var/data/sphinx/wiki_main

 # docinfo (ie. per-document attribute values) storage strategy
 docinfo        = extern

 # morphology
 morphology     = stem_en

 # stopwords file
 #stopwords     = /var/data/sphinx/stopwords.txt

 # minimum word length
 min_word_len   = 1

 # uncomment next 2 lines to allow wildcard (*) searches
 #min_infix_len = 1
 #enable_star = 1

 # charset encoding type
 charset_type   = utf-8

 # charset definition and case folding rules "table"
 # Digits and ASCII letters are kept (upper case folded to lower case); the
 # accented Latin-1 letters U+C0..U+FF are folded to a plain ASCII letter.
 # U+D7 and U+F7 (multiplication / division signs) are intentionally absent.
 # U+FF (y with diaeresis) folds to y, consistent with U+DD/U+FD; each code
 # point is listed exactly once and the list has no dangling trailing comma.
 charset_table  = 0..9, A..Z->a..z, a..z, \
        U+C0->a, U+C1->a, U+C2->a, U+C3->a, U+C4->a, U+C5->a, U+C6->a, \
        U+C7->c, U+C8->e, U+C9->e, U+CA->e, U+CB->e, U+CC->i, \
        U+CD->i, U+CE->i, U+CF->i, U+D0->d, U+D1->n, U+D2->o, U+D3->o, \
        U+D4->o, U+D5->o, U+D6->o, U+D8->o, U+D9->u, U+DA->u, U+DB->u, \
        U+DC->u, U+DD->y, U+DE->t, U+DF->s, \
        U+E0->a, U+E1->a, U+E2->a, U+E3->a, U+E4->a, U+E5->a, U+E6->a, \
        U+E7->c, U+E8->e, U+E9->e, U+EA->e, U+EB->e, U+EC->i, \
        U+ED->i, U+EE->i, U+EF->i, U+F0->d, U+F1->n, U+F2->o, U+F3->o, \
        U+F4->o, U+F5->o, U+F6->o, U+F8->o, U+F9->u, U+FA->u, U+FB->u, \
        U+FC->u, U+FD->y, U+FE->t, U+FF->y
}

# incremental index definition
# Declared with ": wiki_main"; only the index path and the document source
# are overridden here, pointing at the incremental source.
index wiki_incremental : wiki_main
{
 path           = /var/data/sphinx/wiki_incremental
 source         = src_wiki_incremental
}


# indexer settings
indexer
{
 # memory limit (default is 32M)
 mem_limit      = 64M
}

# searchd settings
searchd
{
 # IP address on which search daemon will bind and accept
 # optional, default is to listen on all addresses,
 # ie. address = 0.0.0.0
 #address               = 127.0.0.1

 # port on which search daemon will listen
 listen         = 3312

 # searchd run info is logged here - create or change the folder
 log            = /var/log/searchd.log

 # all the search queries are logged here
 query_log      = /var/log/query.log

 # client read timeout, seconds
 read_timeout   = 5

 # maximum amount of children to fork
 max_children   = 30

 # a file which will contain searchd process ID
 pid_file       = /var/log/searchd.pid

 # maximum amount of matches this daemon would ever retrieve
 # from each index and serve to client
 max_matches    = 1000
}

# --eof--

Indexing the search database

[root@mars /usr/local/etc]#  indexer --config /usr/local/etc/sphinx.conf --all
Sphinx 0.9.9-rc1 (r1566)
Copyright (c) 2001-2008, Andrew Aksyonoff

using config file '/usr/local/etc/sphinx.conf'...
indexing index 'tekkom'...
collected 2259 docs, 6.1 MB
sorted 1.0 Mhits, 100.0% done
total 2259 docs, 6148478 bytes
total 0.826 sec, 7440030.11 bytes/sec, 2733.53 docs/sec
indexing index 'tekkomstemmed'...
collected 2259 docs, 6.1 MB
sorted 1.0 Mhits, 100.0% done
total 2259 docs, 6148478 bytes
total 1.559 sec, 3944462.15 bytes/sec, 1449.23 docs/sec
total 4 reads, 0.0 sec, 978.7 kb/read avg, 3.2 msec/read avg
total 28 writes, 0.0 sec, 344.0 kb/write avg, 1.1 msec/write avg

Testing the search database

time search "dhcp relay"
search -q "dhcp relay"

Works fine :-), but apparently shows history documents as well. Let's see later when searching the wiki.

Starting the search daemon

Manually start

searchd --config /usr/local/etc/sphinx.conf

Run command script for FreeBSD

Add this file as /usr/local/etc/rc.d/searchd to automatically start search daemon at boot.

#!/bin/sh
#
# $FreeBSD: src/etc/rc.d/searchd,v 1.00 2009/02/29 09:46:00 HeTh Exp $
# Copyright Mercantec, Viborg, Denmark. www.mercantec.dk
# Author: Henrik Thomsen/heth@mercantec.dk
#
# rc.d script that starts the Sphinx search daemon (searchd) at boot.
# Install as /usr/local/etc/rc.d/searchd and enable it with
#   searchd_enable="YES"
# in /etc/rc.conf.

# PROVIDE: searchd
# REQUIRE: DAEMON
# BEFORE:  LOGIN

. /etc/rc.subr

name="searchd"
# Spell the rc.conf knob out literally instead of calling the deprecated
# `set_rcvar` helper (which also needed a backtick command substitution).
rcvar="${name}_enable"
command="/usr/local/bin/${name}"

load_rc_config "$name"
# Stay disabled unless rc.conf explicitly enables the service.
: "${searchd_enable:=NO}"

run_rc_command "$1"

Add the following line to /etc/rc.conf

searchd_enable="YES"

Updating the search database

The search database should be updated at regular intervals to make new articles and changes searchable.
While the database is small I've set the indexer to run once every hour in root's crontab file

3 * * * * /usr/local/bin/indexer --config /usr/local/etc/sphinx.conf  --rotate --all >/dev/null 2>&1
  • --rotate sends SIGHUP to the searchdaemon to use the new database index
  • --all rebuilds all indexes specified in sphinx.conf

Implementing sphinx in the wiki

extension ExtensionFunctions

Sphinx needs the extension ExtensionFunctions

cd /usr/local/www/data/mediawiki/extensions
svn export http://svn.wikimedia.org/svnroot/mediawiki/trunk/extensions/ExtensionFunctions.php

Copying sphinx php extension script

cp /usr/local/sw/sphinx-0.9.9-rc1/api/sphinxapi.php /usr/local/www/data/mediawiki/extensions/SphinxSearch

Installation change

LocalSettings.php

Add the following text to your LocalSettings.php

####SPHINX
# Use SphinxSearch as the wiki's search type and load the extension
# from the extensions directory.
$wgSearchType = 'SphinxSearch';
require_once( "$IP/extensions/SphinxSearch/SphinxSearch.php");
# Search mode: keep exactly one of the following lines active;
# the alternatives are left commented out.
#$wgSphinxSearch_mode = 'SPH_MATCH_PHRASE';
#$wgSphinxSearch_mode = 'SPH_MATCH_BOOLEAN';
$wgSphinxSearch_mode = 'SPH_MATCH_EXTENDED';
####SPHINX

For different search modes see Sphinx Searching

Sphinx as default search engine

uncomment the following lines in /usr/local/www/mediawiki/extensions/SphinxSearch/SphinxSearch.php

$wgDisableInternalSearch = true;
$wgDisableSearchUpdate = true;
$wgSearchType = 'SphinxSearch';

Did you mean suggestions

Uses aspell or pspell. With Aspell configured, uncomment and change the following lines in /usr/local/www/mediawiki/extensions/SphinxSearch/SphinxSearch.php

# Path to where aspell has location and language data files. Leave commented out if unsure
#$wgSphinxSearchPspellDictionaryDir = "/usr/local/lib/aspell-0.60/";

# Path to personal dictionary (for example personal.en.pws.) Needed only if using a personal dictionary
$wgSphinxSearchPersonalDictionary = "/usr/local/www/data/mediawiki/aspell.en_US.per";

# Path to Aspell. Needed only if using command line interface instead of the PHP built in PSpell interface.
$wgSphinxSearchAspellPath = "/usr/local/bin/aspell";

Configuring aspell

By default, Aspell doesn't accept special characters such as '-' or '0123456789' in dictionary files

[root@mars aspell-0.60]#cat /usr/local/lib/aspell-0.60/en.dat
name en
charset iso8859-1
special ' -*- - -*- 0 --* 1 --* 2 --* 3 --* 4 --* 5 --* 6 --* 7 --* 8 --* 9 --*
soundslike en
affix en
#repl-table en_affix.dat