Files
DomFramework/fts.php

387 lines
11 KiB
PHP

<?php
/** DomFramework
* @package domframework
* @author Dominique Fournier <dominique@fournier38.fr>
* @license BSD
*/
namespace Domframework;
/** The Full Text Search
* Analyze the provided search text (like a search engine), and create the
* SQL query to find the answer. It also allows checking whether a sentence
* is valid against a searched text.
* Manage the sentences (enclosed in quotes), or the standalone words,
* Manage the unwanted tokens (beginning with -),
* Do not search if the word is shorter than a parameter.
* Each sentence or word can be modified separately by external methods
*/
class fts
{
  ///////////////////////////
  ////   PROPERTIES      ////
  ///////////////////////////
  // {{{
  /** The minimum length (in characters, as counted by mb_strlen) a token
   * must have to be kept in the search
   */
  public $minLength = 3;
  /** The tokens found in the last query. Once search () has run, it is an
   * array with three parallel lists :
   * - "tokens"    : the text of each token,
   * - "sentences" : true if the token was a quoted sentence, false if word,
   * - "minuses"   : "-" if the user does not want the token, "" otherwise
   */
  private $tokens = null;
  /** The tokens without the too small ones ("tokens" and "minuses" lists)
   */
  private $tokensMin = null;
  /** The regexes created by the parser ("operator" and "value" lists)
   */
  private $regexes = null;
  /** The callable method to run on each word of the query
   */
  private $callTokenWord = null;
  /** The callable method to run on each sentence of the query
   */
  private $callTokenSentence = null;
  // }}}

  ////////////////////////////
  ////   CONSTRUCTOR      ////
  ////////////////////////////
  /** The constructor checks the availability of the MB module
   * @throws \Exception if the mb_strlen function is not available
   */
  public function __construct ()
  // {{{
  {
    if (! function_exists ("mb_strlen"))
      throw new \Exception ("PHP don't have the MB Support. Please add it !",
        500);
  }
  // }}}

  ////////////////////////
  ////   GETTERS      ////
  ////////////////////////
  /** Get the tokens stored by the last search
   * @return array|null The "tokens", "sentences" and "minuses" lists, or
   * null if no search was done
   */
  public function getTokens ()
  // {{{
  {
    return $this->tokens;
  }
  // }}}

  /** Get the tokens stored by the last search, without the too small ones
   * @return array|null The "tokens" and "minuses" lists, or null if no
   * search was done
   */
  public function getTokensMin ()
  // {{{
  {
    return $this->tokensMin;
  }
  // }}}

  /** Get the regexes defined by the last search
   * @return array|null The "operator" and "value" lists, or null if no
   * search was done
   */
  public function getRegexes ()
  // {{{
  {
    return $this->regexes;
  }
  // }}}

  /** Set the method to call on tokens word only
   * The method must return the token updated
   * @param callable $callable The callable method
   * @return $this
   * @throws \Exception if $callable is not callable
   */
  public function callTokenWord ($callable)
  // {{{
  {
    if (! is_callable ($callable))
      throw new \Exception (dgettext ("domframework",
        "SSE : callTokenWord : provided method is not callable"), 500);
    $this->callTokenWord = $callable;
    return $this;
  }
  // }}}

  /** Set the method to call on tokens sentence only
   * The method must return the token updated
   * @param callable $callable The callable method
   * @return $this
   * @throws \Exception if $callable is not callable
   */
  public function callTokenSentence ($callable)
  // {{{
  {
    if (! is_callable ($callable))
      throw new \Exception (dgettext ("domframework",
        "SSE : callTokenSentence : provided method is not callable"), 500);
    $this->callTokenSentence = $callable;
    return $this;
  }
  // }}}

  //////////////////////////////
  ////   PUBLIC METHODS     ////
  //////////////////////////////
  /** Explode the query text provided in $query, to be used to search in
   * database, file...
   * The tokens, the tokens without the too small ones and the regexes are
   * stored in the object and are available by the getters.
   * @param string $query The text to find in the database
   * @return array The operator and the associated regex value to search
   * @throws \Exception if the query is not a string
   */
  public function search ($query)
  // {{{
  {
    $query = trim ($query);
    $this->tokens = $this->tokenizer ($query);
    // Apply the user callbacks. The tokens are updated by key instead of by
    // reference, so no dangling reference stays after the loop
    foreach ($this->tokens["tokens"] as $key => $token)
    {
      $isSentence = $this->tokens["sentences"][$key];
      if ($isSentence === true && $this->callTokenSentence)
        $this->tokens["tokens"][$key] =
          call_user_func ($this->callTokenSentence, $token);
      elseif ($isSentence === false && $this->callTokenWord)
        $this->tokens["tokens"][$key] =
          call_user_func ($this->callTokenWord, $token);
    }
    $this->tokensMin = $this->tokenMinLength ($this->tokens["tokens"],
      $this->tokens["minuses"]);
    $this->regexes = $this->regex ($this->tokensMin["tokens"],
      $this->tokensMin["minuses"]);
    return $this->regexes;
  }
  // }}}

  /** Construct the query based on the tokens.
   * The tokens can be updated by methods so the query may be modified by the
   * external methods. The sentences are enclosed in double quotes again.
   * Return an empty string if no search was done before.
   * NOTE(review): the "-" prefix of the unwanted tokens is not added back in
   * the generated query. Confirm it is the wanted behavior.
   * @return string
   */
  public function getQuery ()
  // {{{
  {
    // No search done yet : nothing to rebuild
    if (! is_array ($this->tokens))
      return "";
    $parts = array ();
    foreach ($this->tokens["tokens"] as $key => $token)
    {
      if ($this->tokens["sentences"][$key] === true)
        $parts[] = "\"$token\"";
      else
        $parts[] = "$token";
    }
    return implode (" ", $parts);
  }
  // }}}

  /** Return $line if the $query match against $line, or false if not
   * Each wanted token must be found in $line, and no unwanted token (with
   * the minus sign) may be found in it. The comparison is case sensitive
   * (strpos). A query without token matches all the lines.
   * NOTE(review): all the tokens are tested, including the ones smaller
   * than $minLength. Confirm it is the wanted behavior.
   * @param string $line The line to examine
   * @param string $query The query to apply on it
   * @return string|false The $line if match, false
   */
  public function searchString ($line, $query)
  // {{{
  {
    $this->search ($query);
    foreach ($this->tokens["tokens"] as $key => $searchPart)
    {
      if (trim ($searchPart) === "")
        continue;
      $found = (strpos ($line, $searchPart) !== false);
      $unwanted = ($this->tokens["minuses"][$key] === "-");
      // Reject if a wanted token is missing or if an unwanted one is present
      if ($found === $unwanted)
        return false;
    }
    return $line;
  }
  // }}}

  /** Search in SQL
   * For each field, all the regexes must match (AND). The conditions of the
   * fields are joined by OR. The result is ordered in descending order by
   * the first field of the object with the "date", "datetime" or "time"
   * type, if there is one.
   * @param string $query The text to find in the database
   * @param object $dblayeroo The dblayeroo object to query. It is cloned, so
   * the provided object is not modified
   * @param array|null $fields The fields in $dblayeroo to look for data. If
   * null, look in all the fields defined in the dblayeroo object
   * @return array The result of the query, or an empty array if the query
   * doesn't contain any token long enough
   * @throws \Exception if a field is not defined in the dblayeroo object
   */
  public function searchSQL ($query, $dblayeroo, $fields = null)
  // {{{
  {
    $regexes = $this->search ($query);
    if (empty ($regexes["operator"]))
      return array ();
    // Clone the object to not modify a previously defined query
    $dbl = clone $dblayeroo;
    $dbl->clearRequest ();
    $dbl->select ();
    if ($fields === null)
      $fields = array_keys ($dbl->fields ());
    $firstField = true;
    foreach ($fields as $field)
    {
      if (! array_key_exists ($field, $dbl->fields ()))
        throw new \Exception (sprintf (dgettext ("domframework",
          "The field '%s' doesn't exists in database"), $field), 500);
      if (! $firstField)
        $dbl->whereAddOR ();
      $firstField = false;
      $dbl->whereAddParenthesisOpen ();
      $firstRegex = true;
      foreach ($regexes["operator"] as $key => $operator)
      {
        if (! $firstRegex)
          $dbl->whereAddAND ();
        $firstRegex = false;
        $dbl->whereAdd ($field, $operator, $regexes["value"][$key]);
      }
      $dbl->whereAddParenthesisClose ();
    }
    // Look for the order by date if provided
    foreach ($dbl->fields () as $field => $params)
    {
      if ($params[0] == "date" || $params[0] == "datetime" ||
          $params[0] == "time")
      {
        $dbl->orderAdd ($field, "DESC");
        break;
      }
    }
    return $dbl->execute ();
  }
  // }}}

  ///////////////////////////////
  ////   PRIVATE METHODS     ////
  ///////////////////////////////
  /** Create the regex associated to the provided tokens and minuses
   * The token is quoted by preg_quote and must be surrounded by a separator
   * char (or the start/end of the text) to match.
   * @param array $tokens The token list
   * @param array $minuses The minuses list
   * @return array The operator ("REGEXP" or "NOT REGEXP") and the associated
   * regex value to search
   * @throws \Exception if the tokens or the minuses are not arrays
   */
  private function regex ($tokens, $minuses)
  // {{{
  {
    if (! is_array ($tokens))
      throw new \Exception ("Invalid tokens provided to fts:regex", 500);
    if (! is_array ($minuses))
      throw new \Exception ("Invalid minuses provided to fts:regex", 500);
    $operator = array ();
    $value = array ();
    foreach ($tokens as $key => $token)
    {
      $operator[$key] = ($minuses[$key] === "-") ? "NOT REGEXP" : "REGEXP";
      $value[$key] = "(^|[<> \[\]\(\"',.;/:!?\r\n])".
                     preg_quote ($token).
                     "([<> \[\]\)\"',.;/:!?\r\n]|$)";
    }
    return array ("operator" => $operator, "value" => $value);
  }
  // }}}

  /** Remove the tokens with too small length (less than $this->minLength
   * chars). Remove the associated minuses too. The lists are reindexed.
   * @param array $tokens The token list
   * @param array $minuses The minuses list
   * @return array tokens and minuses
   * @throws \Exception if the tokens or the minuses are not arrays
   */
  private function tokenMinLength ($tokens, $minuses)
  // {{{
  {
    if (! is_array ($tokens))
      throw new \Exception ("Invalid tokens provided to fts:tokenMinLength",
        500);
    if (! is_array ($minuses))
      throw new \Exception ("Invalid minuses provided to fts:tokenMinLength",
        500);
    $newTokens = array ();
    $newMinuses = array ();
    foreach ($tokens as $key => $token)
    {
      if (mb_strlen ($token) < $this->minLength)
        continue;
      $newTokens[] = $token;
      $newMinuses[] = $minuses[$key];
    }
    return array ("tokens" => $newTokens, "minuses" => $newMinuses);
  }
  // }}}

  /** Return an array with the $query tokenized
   * A token is a sentence (text between two double quotes) or a word (text
   * up to the next space). A "-" just before the token flags it as unwanted.
   * A double quote without closing one is analyzed as a part of a word.
   * @param string $query The text to tokenize
   * @return array tokens, sentences and minuses
   * @throws \Exception if the query is not a string
   */
  private function tokenizer ($query)
  // {{{
  {
    if (! is_string ($query))
      throw new \Exception ("Invalid query provided to fts:tokenizer", 500);
    $tokens = array ();
    $sentences = array ();
    $minuses = array ();
    // All the offsets are in bytes (strpos/substr), so the limit must be in
    // bytes too : with mb_strlen, the last tokens of a query containing
    // multibyte chars were lost. The delimiters are ASCII chars, so the
    // byte analysis can not cut a multibyte char.
    $length = strlen ($query);
    $offset = 0;
    while ($offset < $length)
    {
      $minus = "";
      if ($query[$offset] === "-")
      {
        $minus = "-";
        $offset++;
      }
      if ($offset < $length && $query[$offset] === "\"")
      {
        // Sentence, see if there is a end
        $end = strpos ($query, "\"", $offset + 1);
        if ($end !== false)
        {
          // Complete sentence (with ending double quote)
          $tokens[] = substr ($query, $offset + 1, $end - $offset - 1);
          $sentences[] = true;
          $minuses[] = $minus;
          $offset = $end + 1;
          continue;
        }
      }
      // Word analysis
      $end = strpos ($query, " ", $offset);
      if ($end === false)
        $end = $length;
      $nbchars = $end - $offset;
      if ($nbchars > 0)
      {
        $tokens[] = substr ($query, $offset, $nbchars);
        $sentences[] = false;
        $minuses[] = $minus;
      }
      $offset = $end + 1;
    }
    return array ("tokens" => $tokens,
                  "sentences" => $sentences,
                  "minuses" => $minuses);
  }
  // }}}
}