00001 <?PHP
00002
00003 #
00004 # FILE: SearchEngine.php
00005 #
00006 # Open Source Metadata Archive Search Engine (OSMASE)
00007 # Copyright 2002-2011 Edward Almasy and Internet Scout
00008 # http://scout.wisc.edu
00009 #
00010
00011 class SearchEngine {
00012
00013 # ---- PUBLIC INTERFACE --------------------------------------------------
00014
00015 # possible types of logical operators
# logical operators (accepted by DefaultSearchLogic() and as the "Logic"
#       entry of a search group passed to GroupedSearch())
const LOGIC_AND = 1,
        LOGIC_OR = 2;

# field type flags (stored per field by AddField(); text fields are
#       searched by word/phrase, other types by comparison)
const FIELDTYPE_TEXT = 1,
        FIELDTYPE_NUMERIC = 2,
        FIELDTYPE_DATE = 3,
        FIELDTYPE_DATERANGE = 4;
00024
00025 # object constructor
# Object constructor.
#   $DB              - database object used for all search engine queries
#   $ItemTableName   - name of database table containing the searchable items
#   $ItemIdFieldName - name of the column in that table holding item IDs
# (Declared as __construct() because a method named after its class is
#  deprecated as a constructor in PHP 7 and no longer acts as one in PHP 8.)
function __construct(&$DB, $ItemTableName, $ItemIdFieldName)
{
    # save database object for our use
    $this->DB = $DB;

    # save item access parameters
    $this->ItemTableName = $ItemTableName;
    $this->ItemIdFieldName = $ItemIdFieldName;

    # define (global) flags used for indicating word states, unless
    #       another instance has already defined them
    if (!defined("WORD_PRESENT")) {  define("WORD_PRESENT", 1);  }
    if (!defined("WORD_EXCLUDED")) {  define("WORD_EXCLUDED", 2);  }
    if (!defined("WORD_REQUIRED")) {  define("WORD_REQUIRED", 4);  }

    # set default debug state
    $this->DebugLevel = 0;
}

# Old-style constructor name, retained as a plain method so that code
# invoking $this->SearchEngine(...) explicitly (e.g. from a child class
# constructor) keeps working.  Parameters are the same as __construct().
function SearchEngine(&$DB, $ItemTableName, $ItemIdFieldName)
{
    $this->__construct($DB, $ItemTableName, $ItemIdFieldName);
}
00043
00044 # add field to be searched
# Register a field with the search engine (or update an existing entry).
#   $FieldName           - name used to refer to the field in searches
#   $DBFieldName         - corresponding database field name
#   $FieldType           - one of the FIELDTYPE_* constants
#   $Weight              - search weight for the field
#   $UsedInKeywordSearch - whether field participates in keyword searches
function AddField(
        $FieldName, $DBFieldName, $FieldType, $Weight, $UsedInKeywordSearch)
{
    # gather settings under the keys used throughout the class
    $Settings = array(
            "DBFieldName" => $DBFieldName,
            "FieldType" => $FieldType,
            "Weight" => $Weight,
            "InKeywordSearch" => $UsedInKeywordSearch,
            );

    # store each setting individually, leaving any other entries
    #       already recorded for this field untouched
    foreach ($Settings as $Key => $Value)
    {
        $this->FieldInfo[$FieldName][$Key] = $Value;
    }
}
00054
00055 # retrieve info about tables and fields (useful for child objects)
# (returns the item table name that was passed to the constructor)
function ItemTableName()
{
    $TableName = $this->ItemTableName;
    return $TableName;
}
# (returns the item ID field name that was passed to the constructor)
function ItemIdFieldName()
{
    $IdFieldName = $this->ItemIdFieldName;
    return $IdFieldName;
}
# (returns database field name recorded by AddField() for the specified
#       field, or NULL if no such field has been added)
function DBFieldName($FieldName)
{
    # check first so that an unknown field does not trigger an
    #       undefined-index notice/warning
    return isset($this->FieldInfo[$FieldName]["DBFieldName"])
            ? $this->FieldInfo[$FieldName]["DBFieldName"] : NULL;
}
# (returns FIELDTYPE_* value recorded by AddField() for the specified
#       field, or NULL if no such field has been added)
function FieldType($FieldName)
{
    # check first so that an unknown field does not trigger an
    #       undefined-index notice/warning
    return isset($this->FieldInfo[$FieldName]["FieldType"])
            ? $this->FieldInfo[$FieldName]["FieldType"] : NULL;
}
# (returns search weight recorded by AddField() for the specified
#       field, or NULL if no such field has been added)
function FieldWeight($FieldName)
{
    # check first so that an unknown field does not trigger an
    #       undefined-index notice/warning
    return isset($this->FieldInfo[$FieldName]["Weight"])
            ? $this->FieldInfo[$FieldName]["Weight"] : NULL;
}
# (returns keyword-search setting recorded by AddField() for the
#       specified field, or NULL if no such field has been added)
function FieldInKeywordSearch($FieldName)
{
    # check first so that an unknown field does not trigger an
    #       undefined-index notice/warning
    return isset($this->FieldInfo[$FieldName]["InKeywordSearch"])
            ? $this->FieldInfo[$FieldName]["InKeywordSearch"] : NULL;
}
00066
00067 # set debug level
# Set the debug level (initialized to 0 by the constructor).
#   $Setting - new debug level
# (setter only -- nothing is returned)
function DebugLevel($Setting)
{
    $NewLevel = $Setting;
    $this->DebugLevel = $NewLevel;
}
00072
00073
00074 # ---- search functions
00075
00076 # perform keyword search
# Perform a keyword search.
#   $SearchString    - search string, run through SetDebugLevel() before parsing
#   $StartingResult, $NumberOfResults, $SortByField, $SortDescending
#                    - passed through unchanged to CleanScores()
# Returns the score list as produced by CleanScores().  As side effects
# the term counters are reset and the elapsed search time is recorded.
function Search($SearchString, $StartingResult = 0, $NumberOfResults = 10,
        $SortByField = NULL, $SortDescending = TRUE)
{
    $SearchString = $this->SetDebugLevel($SearchString);
    $this->DMsg(0, "In Search() with search string \"".$SearchString."\"");

    # note when we started so elapsed time can be calculated at the end
    $SearchStart = microtime(TRUE);

    # reset term counters (they are incremented while parsing for words)
    $this->InclusiveTermCount = 0;
    $this->RequiredTermCount = 0;
    $this->ExcludedTermCount = 0;

    # break search string down into individual words and into phrases
    $Words = $this->ParseSearchStringForWords($SearchString);
    $this->DMsg(1, "Found ".count($Words)." words");
    $Phrases = $this->ParseSearchStringForPhrases($SearchString);
    $this->DMsg(1, "Found ".count($Phrases)." phrases");

    # determine whether the search consists solely of excluded terms
    $HaveOnlyExclusions =
            ($this->ExcludedTermCount && !$this->InclusiveTermCount);

    if (!$HaveOnlyExclusions)
    {
        # score items by words and then by phrases
        $Scores = $this->SearchForWords($Words);
        $this->DMsg(1, "Found ".count($Scores)." results after word search");
        $Scores = $this->SearchForPhrases($Phrases, $Scores);
        $this->DMsg(1, "Found ".count($Scores)." results after phrase search");
    }
    else
    {
        # start from all records so that exclusions have something to
        #       be filtered out of
        $this->DMsg(1, "Loading all records");
        $Scores = $this->LoadScoresForAllRecords();
    }

    # if anything was found
    if (0 < count($Scores))
    {
        # drop items containing excluded words
        $Scores = $this->FilterOnExcludedWords($Words, $Scores);

        # drop items lacking required words
        $Scores = $this->FilterOnRequiredWords($Scores);
    }

    # count, sort, and trim search result scores list
    $Scores = $this->CleanScores($Scores, $StartingResult, $NumberOfResults,
            $SortByField, $SortDescending);

    # record how long the search took
    $this->LastSearchTime = microtime(TRUE) - $SearchStart;

    # hand results back to caller
    $this->DMsg(0, "Ended up with ".$this->NumberOfResultsAvailable." results");
    return $Scores;
}
00136
00137 # perform search across multiple fields and return trimmed results to caller
# Perform a search across multiple fields.
#   $SearchStrings   - search strings indexed by field name (each entry
#                      may be a single string or an array of strings)
#   $StartingResult, $NumberOfResults, $SortByField, $SortDescending
#                    - passed through unchanged to CleanScores()
# Returns the score list as produced by CleanScores().
function FieldedSearch($SearchStrings, $StartingResult = 0, $NumberOfResults = 10,
        $SortByField = NULL, $SortDescending = TRUE)
{
    $SearchStrings = $this->SetDebugLevel($SearchStrings);
    $this->DMsg(0, "In FieldedSearch() with "
            .count($SearchStrings)." search strings");

    # note when we started so elapsed time can be calculated at the end
    $SearchStart = microtime(TRUE);

    # run the search (NULL comes back if no search was conducted)
    $Scores = $this->SearchAcrossFields($SearchStrings);
    if ($Scores === NULL)
    {
        $Scores = array();
    }

    # count, sort, and trim search result scores list
    $Scores = $this->CleanScores($Scores, $StartingResult, $NumberOfResults,
            $SortByField, $SortDescending);

    # record how long the search took
    $this->LastSearchTime = microtime(TRUE) - $SearchStart;

    # hand results back to caller
    $this->DMsg(0, "Ended up with ".$this->NumberOfResultsAvailable." results");
    return $Scores;
}
00163
00164 # perform search with logical groups of fielded searches
# Perform a search made up of logical groups of fielded searches.
#   $SearchGroups    - array of groups, where each group may have a
#                      "SearchStrings" entry (as for FieldedSearch()) and
#                      a "Logic" entry (LOGIC_AND or LOGIC_OR) that is
#                      used within that group
#   $StartingResult, $NumberOfResults, $SortByField, $SortDescending
#                    - passed through unchanged to CleanScores()
# Group results are combined using the default search logic that was in
# effect when this method was called, which is restored before returning.
# Returns the score list as produced by CleanScores().
function GroupedSearch($SearchGroups, $StartingResult = 0, $NumberOfResults = 10,
        $SortByField = NULL, $SortDescending = TRUE)
{
    # run the search strings for every group through SetDebugLevel()
    foreach ($SearchGroups as $GroupIndex => $Unused)
    {
        if (isset($SearchGroups[$GroupIndex]["SearchStrings"]))
        {
            $SearchGroups[$GroupIndex]["SearchStrings"] =
                    $this->SetDebugLevel($SearchGroups[$GroupIndex]["SearchStrings"]);
        }
    }
    $this->DMsg(0, "In GroupedSearch() with "
            .count($SearchGroups)." search groups");

    # note when we started so elapsed time can be calculated at the end
    $SearchStart = microtime(TRUE);

    # nothing found yet
    $Scores = array();

    # remember AND/OR setting so it can be restored afterward
    $SavedSearchLogic = $this->DefaultSearchLogic;

    $IsFirstSearch = TRUE;
    foreach ($SearchGroups as $Group)
    {
        $this->DMsg(0, "----- GROUP ---------------------------");

        # use AND/OR setting from group if it has one, otherwise saved setting
        $this->DefaultSearchLogic = isset($Group["Logic"])
                ? $Group["Logic"] : $SavedSearchLogic;
        $this->DMsg(2, "Logic is "
                .(($this->DefaultSearchLogic == self::LOGIC_AND) ? "AND" : "OR"));

        # move on if group has nothing to search for
        if (!isset($Group["SearchStrings"])) {  continue;  }

        # search within group, and move on if no search was conducted
        $GroupScores = $this->SearchAcrossFields($Group["SearchStrings"]);
        if ($GroupScores === NULL) {  continue;  }

        if ($IsFirstSearch || ($SavedSearchLogic == self::LOGIC_OR))
        {
            # (OR, or first set of results) merge group results into
            #       overall results, summing scores for items in both
            foreach ($GroupScores as $ItemId => $Score)
            {
                $Scores[$ItemId] = isset($Scores[$ItemId])
                        ? ($Scores[$ItemId] + $Score) : $Score;
            }
            $IsFirstSearch = FALSE;
        }
        else
        {
            # (AND) keep only items present in both previous results
            #       and group results, summing their scores
            $PreviousScores = $Scores;
            $Scores = array();
            foreach ($GroupScores as $ItemId => $Score)
            {
                if (!isset($PreviousScores[$ItemId])) {  continue;  }
                $Scores[$ItemId] = $PreviousScores[$ItemId] + $Score;
            }
        }
    }

    # put AND/OR setting back the way we found it
    $this->DefaultSearchLogic = $SavedSearchLogic;

    # count, sort, and trim search result scores list
    $Scores = $this->CleanScores($Scores, $StartingResult, $NumberOfResults,
            $SortByField, $SortDescending);

    # record how long the search took
    $this->LastSearchTime = microtime(TRUE) - $SearchStart;

    # hand results back to caller
    $this->DMsg(0, "Ended up with ".$this->NumberOfResultsAvailable." results");
    return $Scores;
}
00267
00268 # add function that will be called to filter search results
# Register an additional result filter function.
#   $FunctionName - name of the function (appended to the list of filters)
function AddResultFilterFunction($FunctionName)
{
    $NewFilter = $FunctionName;
    $this->FilterFuncs[] = $NewFilter;
}
00274
00275 # get or set default search logic (AND or OR)
# Get or set the default search logic.
#   $NewSetting - LOGIC_AND or LOGIC_OR (OPTIONAL; any value loosely equal
#                 to NULL, such as 0 or FALSE, leaves the setting unchanged)
# Returns the setting in effect after the call.
function DefaultSearchLogic($NewSetting = NULL)
{
    # just report current setting if no new one was supplied
    if ($NewSetting == NULL)
    {
        return $this->DefaultSearchLogic;
    }

    # otherwise adopt new setting and report it
    $this->DefaultSearchLogic = $NewSetting;
    return $this->DefaultSearchLogic;
}
00284
# Select default search logic via a boolean.
#   $NewSetting - if true, default logic becomes LOGIC_AND (terms required),
#                 otherwise it becomes LOGIC_OR
function SearchTermsRequiredByDefault($NewSetting = TRUE)
{
    $this->DefaultSearchLogic = $NewSetting
            ? self::LOGIC_AND : self::LOGIC_OR;
}
00296
# (returns the stored count of available results -- the same value that
#       the search methods report in their final debug message)
function NumberOfResults()
{
    $ResultCount = $this->NumberOfResultsAvailable;
    return $ResultCount;
}
00301
# (returns the list of non-excluded search terms accumulated while
#       parsing search strings)
function SearchTerms()
{
    $Terms = $this->SearchTermList;
    return $Terms;
}
00306
# (returns elapsed time recorded for the most recent search, calculated
#       as the difference between two microtime(TRUE) readings)
function SearchTime()
{
    $ElapsedTime = $this->LastSearchTime;
    return $ElapsedTime;
}
00311
00312 # report total weight for all fields involved in search
# Report total weight for all fields involved in a fielded search.
#   $SearchStrings - search strings indexed by field name (only the
#                    field names are examined)
# Returns the sum of the weights of all named fields, plus (if the
# keyword pseudo-field "XXXKeywordXXX" is present) the weights of all
# fields flagged for inclusion in keyword searches.  Field names that
# were never added via AddField() contribute nothing.
function FieldedSearchWeightScale($SearchStrings)
{
    $Weight = 0;
    $IncludedKeywordSearch = FALSE;

    # add up weights for explicitly named fields
    foreach ($SearchStrings as $FieldName => $SearchStringArray)
    {
        if ($FieldName == "XXXKeywordXXX")
        {
            $IncludedKeywordSearch = TRUE;
        }
        elseif (isset($this->FieldInfo[$FieldName]["Weight"]))
        {
            # (isset check avoids notice/warning for unknown fields)
            $Weight += $this->FieldInfo[$FieldName]["Weight"];
        }
    }

    # add in weights for keyword fields if a keyword search was included
    #       (field list is still unset if AddField() was never called)
    if ($IncludedKeywordSearch && is_array($this->FieldInfo))
    {
        foreach ($this->FieldInfo as $FieldName => $Info)
        {
            if ($Info["InKeywordSearch"])
            {
                $Weight += $Info["Weight"];
            }
        }
    }

    return $Weight;
}
00340
00341
00342 # ---- search database update functions
00343
00344 # update search DB for the specified item
# Rebuild the search DB entries for a single item.
#   $ItemId - ID of item (negative IDs are treated as temporary records
#             and ignored)
# Existing word counts for the item are deleted, then the content of
# every field with a positive weight is re-recorded.
function UpdateForItem($ItemId)
{
    # temporary records (negative IDs) are not indexed
    if ($ItemId < 0) {  return;  }

    # forget which word counts have already been added
    unset($this->WordCountAdded);

    # wipe out whatever is currently stored for this item
    $this->DB->Query("DELETE FROM SearchWordCounts WHERE ItemId = ".$ItemId);

    foreach ($this->FieldInfo as $FieldName => $Info)
    {
        # only fields with positive weight are indexed
        if (!($Info["Weight"] > 0)) {  continue;  }

        # field content may be a single value or an array of strings,
        #       so treat a single value as a one-entry list
        $Content = $this->GetFieldContent($ItemId, $FieldName);
        $Strings = is_array($Content) ? $Content : array($Content);

        # record search info for each piece of content
        foreach ($Strings as $String)
        {
            $this->RecordSearchInfoForText($ItemId, $FieldName,
                    $Info["Weight"], $String,
                    $Info["InKeywordSearch"]);
        }
    }
}
00387
00388 # update search DB for the specified range of items
# Update search DB for a range of items.
#   $StartingItemId - lowest item ID to include
#   $NumberOfItems  - maximum number of items to update
# Items are taken in ascending ID order.  Returns the ID of the last item
# updated, or NULL if no items with an ID at or above the starting ID
# were found.
function UpdateForItems($StartingItemId, $NumberOfItems)
{
    # retrieve IDs for specified number of items starting at specified ID
    $this->DB->Query("SELECT ".$this->ItemIdFieldName." FROM ".$this->ItemTableName
            ." WHERE ".$this->ItemIdFieldName." >= ".$StartingItemId
            ." ORDER BY ".$this->ItemIdFieldName." LIMIT ".$NumberOfItems);
    $ItemIds = $this->DB->FetchColumn($this->ItemIdFieldName);

    # (defined up front so that the return value is well-defined even
    #       when no item IDs were retrieved)
    $LastItemId = NULL;

    # for each retrieved item ID
    foreach ($ItemIds as $ItemId)
    {
        # update search info for item
        $this->UpdateForItem($ItemId);
        $LastItemId = $ItemId;
    }

    # return ID of last item updated to caller
    return $LastItemId;
}
00407
00408 # drop all data pertaining to item from search DB
# Remove everything stored about an item from the word count table.
#   $ItemId - ID of item to drop
function DropItem($ItemId)
{
    $Query = "DELETE FROM SearchWordCounts WHERE ItemId = ".$ItemId;
    $this->DB->Query($Query);
}
00414
00415 # drop all data pertaining to field from search DB
# Drop all data pertaining to a field from the search DB.
#   $FieldName - name of field to drop
# Removes the word counts recorded for the field and the field's entry in
# the fields table, and forgets any cached ID for the field.  Does
# nothing beyond clearing the cache if the field is not in the fields table.
function DropField($FieldName)
{
    # retrieve our ID for field
    $FieldId = $this->DB->Query("SELECT FieldId FROM SearchFields "
            ."WHERE FieldName = '".addslashes($FieldName)."'", "FieldId");

    # forget cached ID, so that GetFieldId() does not hand out a stale value
    unset($this->FieldIds[$FieldName]);

    # nothing more to do if field was not found
    if (($FieldId === NULL) || ($FieldId === FALSE)) {  return;  }

    # drop all entries pertaining to field from word counts table
    # (quotes must not be backslash-escaped here -- within a double-quoted
    #       string \' would put a literal backslash into the SQL)
    $this->DB->Query("DELETE FROM SearchWordCounts WHERE FieldId = '".$FieldId."'");

    # drop field from our fields table
    $this->DB->Query("DELETE FROM SearchFields WHERE FieldId = '".$FieldId."'");
}
00428
00429 # return total number of terms indexed by search engine
# (returns number of rows in the SearchWords table)
function SearchTermCount()
{
    $Query = "SELECT COUNT(*) AS TermCount"
            ." FROM SearchWords";
    return $this->DB->Query($Query, "TermCount");
}
00435
00436 # return total number of items indexed by search engine
# (returns number of distinct item IDs in the SearchWordCounts table)
function ItemCount()
{
    $Query = "SELECT COUNT(DISTINCT ItemId) AS ItemCount"
            ." FROM SearchWordCounts";
    return $this->DB->Query($Query, "ItemCount");
}
00442
# Add synonyms for a word.
#   $Word     - word to add synonyms for
#   $Synonyms - array of synonyms for that word
# Both the word and the synonyms are added to the word table if not
# already present.  A pairing is stored only if it does not already exist
# in either direction.  Returns the number of new pairings stored.
function AddSynonyms($Word, $Synonyms)
{
    # look up ID for word (adding word if necessary)
    $WordId = $this->GetWordId($Word, TRUE);

    # nothing added so far
    $NewPairCount = 0;

    foreach ($Synonyms as $Synonym)
    {
        # look up ID for synonym (adding synonym if necessary)
        $SynonymId = $this->GetWordId($Synonym, TRUE);

        # check whether pairing is already recorded (in either order)
        $this->DB->Query("SELECT * FROM SearchWordSynonyms"
                ." WHERE (WordIdA = ".$WordId
                    ." AND WordIdB = ".$SynonymId.")"
                ." OR (WordIdB = ".$WordId
                    ." AND WordIdA = ".$SynonymId.")");
        if ($this->DB->NumRowsSelected() != 0) {  continue;  }

        # record new pairing
        $this->DB->Query("INSERT INTO SearchWordSynonyms"
                ." (WordIdA, WordIdB)"
                ." VALUES (".$WordId.", ".$SynonymId.")");
        $NewPairCount++;
    }

    # tell caller how many pairings were new
    return $NewPairCount;
}
00482
00483 # remove synonym(s)
# Remove synonym(s) for a word.
#   $Word     - word to remove synonyms for
#   $Synonyms - array of synonyms to remove, or NULL to remove every
#               synonym pairing that involves the word (OPTIONAL)
# Words and synonyms with no ID in the word table are silently skipped.
function RemoveSynonyms($Word, $Synonyms = NULL)
{
    # nothing to do if word is unknown
    $WordId = $this->GetWordId($Word);
    if ($WordId === NULL) {  return;  }

    # if no specific synonyms were given
    if ($Synonyms === NULL)
    {
        # remove every pairing involving the word
        $this->DB->Query("DELETE FROM SearchWordSynonyms"
                ." WHERE WordIdA = '".$WordId."'"
                ." OR WordIdB = '".$WordId."'");
        return;
    }

    foreach ($Synonyms as $Synonym)
    {
        # skip synonyms that are unknown
        $SynonymId = $this->GetWordId($Synonym);
        if ($SynonymId === NULL) {  continue;  }

        # remove pairing, whichever way around it was stored
        $this->DB->Query("DELETE FROM SearchWordSynonyms"
                ." WHERE (WordIdA = '".$WordId."'"
                    ." AND WordIdB = '".$SynonymId."')"
                ." OR (WordIdB = '".$WordId."'"
                    ." AND WordIdA = '".$SynonymId."')");
    }
}
00522
00523 # remove all synonyms
# (deletes every row in the SearchWordSynonyms table)
function RemoveAllSynonyms()
{
    $Query = "DELETE FROM SearchWordSynonyms";
    $this->DB->Query($Query);
}
00528
00529 # get synonyms for word (returns array of synonyms)
# Get synonyms for a word.
#   $Word - word to look up
# Returns an array of synonyms, which is empty if the word is unknown or
# has no synonyms.  Pairings are honored in both directions.
function GetSynonyms($Word)
{
    # assume no synonyms will be found
    $Synonyms = array();

    # look up ID for word, and bail out if word is unknown
    $WordId = $this->GetWordId($Word);
    if ($WordId === NULL) {  return $Synonyms;  }

    # look up IDs of all synonyms for this word
    $this->DB->Query("SELECT WordIdA, WordIdB FROM SearchWordSynonyms"
            ." WHERE WordIdA = ".$WordId
            ." OR WordIdB = ".$WordId);
    $SynonymIds = array();
    # (FetchRow must be *called* -- without parentheses it is a property
    #       read that is never true, and no rows would ever be retrieved)
    while ($Record = $this->DB->FetchRow())
    {
        # synonym is whichever side of the pairing is not our word
        $SynonymIds[] = ($Record["WordIdA"] == $WordId)
                ? $Record["WordIdB"] : $Record["WordIdA"];
    }

    # translate IDs to words only after all rows have been fetched,
    #       because GetWord() issues queries on the same DB object
    foreach ($SynonymIds as $SynonymId)
    {
        $Synonyms[] = $this->GetWord($SynonymId);
    }

    # return synonyms to caller
    return $Synonyms;
}
00563
00564 # get all synonyms (returns 2D array w/ words as first index)
# Get all synonyms.
# Returns a 2D array with words as the first index and arrays of synonyms
# as values.  Each pairing is listed under only one of its two words (the
# reciprocal listing is removed), and the result is sorted alphabetically.
function GetAllSynonyms()
{
    # nothing found yet
    $SynonymList = array();

    # use a separate database object for the pair query, since GetWord()
    #       issues its own queries on our regular one
    $PairDB = new SPTDatabase();
    $PairDB->Query("SELECT WordIdA, WordIdB FROM SearchWordSynonyms");
    while ($Pair = $PairDB->FetchRow())
    {
        # translate IDs to words
        $Word = $this->GetWord($Pair["WordIdA"]);
        $Synonym = $this->GetWord($Pair["WordIdB"]);

        # list synonym under word (unless already listed)
        if (!isset($SynonymList[$Word])
                || !in_array($Synonym, $SynonymList[$Word]))
        {
            $SynonymList[$Word][] = $Synonym;
        }

        # list word under synonym (unless already listed)
        if (!isset($SynonymList[$Synonym])
                || !in_array($Word, $SynonymList[$Synonym]))
        {
            $SynonymList[$Synonym][] = $Word;
        }
    }

    # remove reciprocal duplicates
    # (iteration is over a snapshot of the list, while removals are made
    #       on the live list and checked for on every pass)
    foreach ($SynonymList as $Word => $Synonyms)
    {
        foreach ($Synonyms as $Synonym)
        {
            # skip unless word and synonym are still listed under each other
            $IsReciprocal = isset($SynonymList[$Synonym])
                    && isset($SynonymList[$Word])
                    && in_array($Word, $SynonymList[$Synonym])
                    && in_array($Synonym, $SynonymList[$Word]);
            if (!$IsReciprocal) {  continue;  }

            # trim the shorter of the two lists (the synonym's list on a tie)
            if (count($SynonymList[$Word]) < count($SynonymList[$Synonym]))
            {
                $TrimKey = $Word;
                $DropValue = $Synonym;
            }
            else
            {
                $TrimKey = $Synonym;
                $DropValue = $Word;
            }
            $SynonymList[$TrimKey] = array_diff(
                    $SynonymList[$TrimKey], array($DropValue));

            # discard list entirely if nothing is left in it
            if (!count($SynonymList[$TrimKey]))
            {
                unset($SynonymList[$TrimKey]);
            }
        }
    }

    # sort alphabetically (just for convenience)
    foreach ($SynonymList as $Word => $Unused)
    {
        asort($SynonymList[$Word]);
    }
    ksort($SynonymList);

    # return 2D array of synonyms to caller
    return $SynonymList;
}
00653
00654 # set all synonyms (accepts 2D array w/ words as first index)
# Replace all synonyms.
#   $SynonymList - 2D array with words as first index and arrays of
#                  synonyms as values (same layout GetAllSynonyms() returns)
function SetAllSynonyms($SynonymList)
{
    # start from a clean slate
    $this->RemoveAllSynonyms();

    # add each word's synonyms in turn
    foreach ($SynonymList as $Word => $SynonymsForWord)
    {
        $this->AddSynonyms($Word, $SynonymsForWord);
    }
}
00667
# Load synonyms from a file.
#   $FileName - name of file to read
# Each line is split on whitespace and/or commas, with the first word on
# the line taken as the word and the remaining words as its synonyms.
# Lines containing a "#" are skipped.  Returns the number of new synonym
# pairings added (0 if the file could not be read).
function LoadSynonymsFromFile($FileName)
{
    # assume no synonyms will be added
    $AddCount = 0;

    # read in contents of file
    $Lines = file($FileName, FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);

    # bail out if file could not be read (file() returns FALSE, which
    #       must not be passed to count()) or had no lines
    if (($Lines === FALSE) || !count($Lines)) {  return $AddCount;  }

    # for each line of file
    foreach ($Lines as $Line)
    {
        # skip comment lines
        # NOTE(review): pattern is unanchored, so any line containing "#"
        #       anywhere is skipped, not just lines starting with one --
        #       left as is because lines with trailing comments may rely on it
        if (preg_match("/[\s]*#/", $Line)) {  continue;  }

        # split line into words, discarding the empty pieces that leading
        #       whitespace or a trailing comma would otherwise produce
        $Words = preg_split("/[\s,]+/", $Line, -1, PREG_SPLIT_NO_EMPTY);

        # if synonyms found
        if (count($Words) > 1)
        {
            # separate out word and synonyms
            $Word = array_shift($Words);

            # add synonyms
            $AddCount += $this->AddSynonyms($Word, $Words);
        }
    }

    # return count of synonyms added to caller
    return $AddCount;
}
00712
00713 # suggest alternatives
# (placeholder -- not yet implemented, so $SearchString is ignored and
#       nothing is returned)
function SuggestAlternateSearches($SearchString)
{
    return;
}
00718
00719
00720 # ---- PRIVATE INTERFACE -------------------------------------------------
00721
# ----- state accessible to child classes

# database object supplied to constructor
protected $DB;
# current debug level (0 by default)
protected $DebugLevel;
# table and ID column for items being searched (supplied to constructor)
protected $ItemTableName;
protected $ItemIdFieldName;
# result count reported by NumberOfResults()
protected $NumberOfResultsAvailable;
# elapsed time for most recent search (reported by SearchTime())
protected $LastSearchTime;
# names of functions added via AddResultFilterFunction()
protected $FilterFuncs;
# LOGIC_AND or LOGIC_OR
protected $DefaultSearchLogic = self::LOGIC_AND;
# feature switches (presumably consulted by code further down -- not
#       referenced in the portion of the class reviewed here)
protected $StemmingEnabled = true;
protected $SynonymsEnabled = true;

# ----- state private to this class

# flags cleared at the start of UpdateForItem()
private $WordCountAdded;
# cache of field IDs, indexed by field name (see GetFieldId())
private $FieldIds;
# per-field settings recorded by AddField(), indexed by field name
private $FieldInfo;
# counts of terms found while parsing search strings
private $RequiredTermCount;
private $RequiredTermCounts;
private $InclusiveTermCount;
private $ExcludedTermCount;
# non-excluded terms found while parsing (reported by SearchTerms())
private $SearchTermList;

# amount added to IDs from the stems table, so that stem IDs can be
#       told apart from word IDs
const STEM_ID_OFFSET = 1000000;
00743
00744
00745 # ---- common private functions (used in both searching and DB build)
00746
00747 # normalize and parse search string into list of search terms
# Normalize a search string and parse it into a list of search terms.
#   $SearchString  - string to parse
#   $IgnorePhrases - if TRUE, quotes and parentheses are simply stripped
#                    and the words inside them are kept; otherwise quoted
#                    phrases and parenthesized groups are removed entirely
#                    (OPTIONAL, defaults to FALSE)
# Returns an array with (lower case) words for the index and bit flags
# (WORD_PRESENT, optionally combined with WORD_EXCLUDED or WORD_REQUIRED)
# for the values.  A leading "-" excludes a word, a leading "+" requires
# it, and a leading "~" makes it optional even when the default logic is
# AND.  As side effects the excluded/required/inclusive term counts are
# incremented and non-excluded words are appended to the search term list.
private function ParseSearchStringForWords($SearchString, $IgnorePhrases = FALSE)
{
    # strip off any surrounding whitespace
    $Text = trim($SearchString);

    # set up normalization rules, applied in order
    # (each pattern is kept together with its replacement so that the two
    #       cannot drift out of step when rules are added or changed)
    $Rules = array(
            # get rid of possessive plurals
            array("/'s[^a-z0-9\\-+~]+/i", " "),
            # get rid of single quotes / apostrophes
            array("/'/", ""),
            # get rid of phrases (or just the double quotes around them)
            array($IgnorePhrases ? "/\"/" : "/\"[^\"]*\"/", " "),
            # get rid of groups (or just the parens around them)
            array($IgnorePhrases ? "/[\(\)]+/" : "/\\([^)]*\\)/", " "),
            # convert non-alphanumerics / non-minus/plus/tilde to a space
            array("/[^a-z0-9\\-+~]+/i", " "),
            # convert minus preceded by anything but whitespace to a space
            #       (keeping the preceding character)
            array("/([^\\s])-+/i", "\\1 "),
            # convert plus preceded by anything but whitespace to a space
            #       (keeping the preceding character)
            array("/([^\\s])\\++/i", "\\1 "),
            # convert minus followed by whitespace to a space
            array("/-\\s/i", " "),
            # convert plus followed by whitespace to a space
            array("/\\+\\s/i", " "),
            # convert tilde followed by whitespace to a space
            array("/~\\s/i", " "),
            # convert multiple spaces to one space
            array("/[ ]+/", " "),
            );
    $Patterns = array();
    $Replacements = array();
    foreach ($Rules as $Rule)
    {
        $Patterns[] = $Rule[0];
        $Replacements[] = $Rule[1];
    }

    # remove punctuation from text and normalize whitespace
    $Text = preg_replace($Patterns, $Replacements, $Text);
    $this->DMsg(2, "Normalized search string is '".$Text."'");

    # convert text to lower case and strip off any extraneous whitespace
    $Text = trim(strtolower($Text));

    # start with an empty array
    $Words = array();

    # bail out if we have no words left after normalization
    if (strlen($Text) == 0) {  return $Words;  }

    # for each word
    foreach (explode(" ", $Text) as $Word)
    {
        # grab first character of word
        $FirstChar = substr($Word, 0, 1);

        # strip off option characters and set flags appropriately
        $Flags = WORD_PRESENT;
        $IsExcluded = FALSE;
        $IsRequired = FALSE;
        if ($FirstChar == "-")
        {
            $Word = substr($Word, 1);
            $Flags |= WORD_EXCLUDED;
            $IsExcluded = TRUE;
        }
        elseif ($FirstChar == "~")
        {
            $Word = substr($Word, 1);
        }
        elseif (($this->DefaultSearchLogic == self::LOGIC_AND)
                || ($FirstChar == "+"))
        {
            if ($FirstChar == "+")
            {
                $Word = substr($Word, 1);
            }
            $Flags |= WORD_REQUIRED;
            $IsRequired = TRUE;
        }

        # skip terms that consisted of nothing but an option character
        #       (e.g. a trailing "-"), which would otherwise be counted
        #       and searched for as an empty word
        if (($Word === "") || ($Word === FALSE)) {  continue;  }

        # update term counts if this is the first time we have seen word
        if (!isset($Words[$Word]))
        {
            if ($IsExcluded)
            {
                $this->ExcludedTermCount++;
            }
            else
            {
                if ($IsRequired)
                {
                    $this->RequiredTermCount++;
                }
                $this->InclusiveTermCount++;
                $this->SearchTermList[] = $Word;
            }
        }

        # store flags to indicate word found
        $Words[$Word] = $Flags;
        $this->DMsg(3, "Word identified (".$Word.")");
    }

    # return normalized words to caller
    return $Words;
}
00859
# Retrieve our ID for a field, adding the field to the fields table if it
# is not already there.
#   $FieldName - name of field
# Returns the field ID.  IDs are cached, so the database is consulted
# only the first time a given field is looked up.
protected function GetFieldId($FieldName)
{
    # use cached ID if we have one
    if (isset($this->FieldIds[$FieldName]))
    {
        return $this->FieldIds[$FieldName];
    }

    # look for field in database
    $EscapedName = addslashes($FieldName);
    $this->DB->Query("SELECT FieldId FROM SearchFields "
            ."WHERE FieldName = '".$EscapedName."'");
    $Record = $this->DB->FetchRow();

    if ($Record)
    {
        # field is already known
        $FieldId = $Record["FieldId"];
    }
    else
    {
        # field is new, so add it and use the ID it was assigned
        $this->DB->Query("INSERT INTO SearchFields (FieldName) "
                ."VALUES ('".$EscapedName."')");
        $FieldId = $this->DB->LastInsertId("SearchFields");
    }

    # remember ID for next time and return it to caller
    $this->FieldIds[$FieldName] = $FieldId;
    return $this->FieldIds[$FieldName];
}
00892
00893 # retrieve ID for specified word (returns NULL if no ID found)
# Retrieve ID for a word.
#   $Word          - word to look up
#   $AddIfNotFound - if TRUE, word is added (in lower case) to the word
#                    table when it is not already there (OPTIONAL)
# Returns the word ID, or NULL if the word was not found and not added.
# IDs that are found are cached for subsequent calls.
private function GetWordId($Word, $AddIfNotFound = FALSE)
{
    static $WordIdCache;

    # use cached ID if we have one
    if (isset($WordIdCache[$Word]))
    {
        return $WordIdCache[$Word];
    }

    # look for word in database
    $WordId = $this->DB->Query("SELECT WordId"
            ." FROM SearchWords"
            ." WHERE WordText='".addslashes($Word)."'",
            "WordId");

    # if word is unknown and caller wants it added
    if ($AddIfNotFound && ($WordId === NULL))
    {
        # add word and use the ID it was assigned
        $this->DB->Query("INSERT INTO SearchWords (WordText)"
                ." VALUES ('".addslashes(strtolower($Word))."')");
        $WordId = $this->DB->LastInsertId("SearchWords");
    }

    # remember ID for next time and return it to caller
    $WordIdCache[$Word] = $WordId;
    return $WordId;
}
00930
00931 # retrieve ID for specified word stem (returns NULL if no ID found)
# Retrieve ID for a word stem.
#   $Stem          - stem to look up
#   $AddIfNotFound - if TRUE, stem is added (in lower case) to the stem
#                    table when it is not already there (OPTIONAL)
# Returns the stem ID (database ID plus STEM_ID_OFFSET), or NULL if the
# stem was not found and not added.  IDs that are found are cached for
# subsequent calls.
private function GetStemId($Stem, $AddIfNotFound = FALSE)
{
    static $StemIdCache;

    # use cached ID if we have one
    if (isset($StemIdCache[$Stem]))
    {
        return $StemIdCache[$Stem];
    }

    # look up ID in database
    $StemId = $this->DB->Query("SELECT WordId"
            ." FROM SearchStems"
            ." WHERE WordText='".addslashes($Stem)."'",
            "WordId");

    # if ID was not found and caller requested it be added
    if (($StemId === NULL) && $AddIfNotFound)
    {
        # add stem to database
        $this->DB->Query("INSERT INTO SearchStems (WordText)"
                ." VALUES ('".addslashes(strtolower($Stem))."')");

        # get ID for newly added stem
        $StemId = $this->DB->LastInsertId("SearchStems");
    }

    # report failure if we still have no ID
    # (must happen before the offset is applied -- otherwise NULL would
    #       turn into STEM_ID_OFFSET and look like a valid stem ID)
    if ($StemId === NULL) {  return NULL;  }

    # adjust from DB ID value to stem ID value
    $StemId += self::STEM_ID_OFFSET;

    # save ID to cache
    $StemIdCache[$Stem] = $StemId;

    # return ID to caller
    return $StemId;
}
00971
00972 # retrieve word for specified word ID (returns FALSE if no word found)
# Retrieve the word (or stem) for a word ID.
#   $WordId - word ID, or stem ID (i.e. value at or above STEM_ID_OFFSET)
# Returns the text for the word or stem, or whatever the database layer
# reports when no matching row exists (presumably NULL, which is what
# GetWordId() tests for on the same kind of query -- verify against the
# database class).  Words that are found are cached for subsequent calls.
private function GetWord($WordId)
{
    static $WordCache;

    # use cached word if we have one
    if (isset($WordCache[$WordId]))
    {
        return $WordCache[$WordId];
    }

    # pick table and table-local ID, depending on whether this is a stem
    # (the ID passed in is left untouched, because it is the cache key --
    #       caching under the adjusted ID would make a stem masquerade
    #       as the word that has the same table-local ID)
    $TableName = "SearchWords";
    $DBWordId = $WordId;
    if ($WordId >= self::STEM_ID_OFFSET)
    {
        $TableName = "SearchStems";
        $DBWordId = $WordId - self::STEM_ID_OFFSET;
    }

    # look up word in database
    $Word = $this->DB->Query("SELECT WordText"
            ." FROM ".$TableName
            ." WHERE WordId='".$DBWordId."'",
            "WordText");

    # save word to cache
    $WordCache[$WordId] = $Word;

    # return word to caller
    return $Word;
}
01006
01007
01008 # ---- private functions used in searching
01009
01010 # perform search across multiple fields and return raw results to caller
# Perform search across multiple fields and return raw (uncounted,
# unsorted, untrimmed) results to caller.
#   $SearchStrings - search strings indexed by field name, where each
#                    entry is either a single string or an array of
#                    strings, and the pseudo-field "XXXKeywordXXX"
#                    indicates a keyword search
# Returns search result scores, or NULL if no search was conducted.
# Text fields (and the keyword pseudo-field) are searched by word and
# phrase, unless the search string starts with a comparison operator;
# everything else is handed to the comparison search.  As a side effect
# the term counts are reset and then updated by the parsing.
private function SearchAcrossFields($SearchStrings)
{
    # start by assuming no search will be done
    $Scores = NULL;

    # clear word counts
    $this->InclusiveTermCount = 0;
    $this->RequiredTermCount = 0;
    $this->ExcludedTermCount = 0;

    # words and phrases parsed for each field (when a field has several
    #       search strings, those from the last string are retained)
    $Words = array();
    $Phrases = array();

    # for each field
    $NeedComparisonSearch = FALSE;
    foreach ($SearchStrings as $FieldName => $SearchStringArray)
    {
        # convert search string to array if needed
        if (!is_array($SearchStringArray))
        {
            $SearchStringArray = array($SearchStringArray);
        }

        # for each search string for this field
        foreach ($SearchStringArray as $SearchString)
        {
            # if field is keyword or field is text and does not look like comparison match
            if (($FieldName == "XXXKeywordXXX")
                    || (isset($this->FieldInfo[$FieldName])
                        && ($this->FieldInfo[$FieldName]["FieldType"] == self::FIELDTYPE_TEXT)
                        && !preg_match("/^[><!]=./", $SearchString)
                        && !preg_match("/^[><=]./", $SearchString)))
            {
                $this->DMsg(0, "Searching text field \""
                        .$FieldName."\" for string \"$SearchString\"");

                # normalize text and split into words
                $Words[$FieldName] =
                        $this->ParseSearchStringForWords($SearchString);

                # calculate scores for matching items
                if (count($Words[$FieldName]))
                {
                    $Scores = $this->SearchForWords(
                            $Words[$FieldName], $FieldName, $Scores);
                    $this->DMsg(3, "Have "
                            .(($Scores === NULL) ? 0 : count($Scores))
                            ." results after word search");
                }

                # split into phrases
                $Phrases[$FieldName] =
                        $this->ParseSearchStringForPhrases($SearchString);

                # handle any phrases
                if (count($Phrases[$FieldName]))
                {
                    $Scores = $this->SearchForPhrases(
                            $Phrases[$FieldName], $Scores, $FieldName, TRUE, FALSE);
                    $this->DMsg(3, "Have "
                            .(($Scores === NULL) ? 0 : count($Scores))
                            ." results after phrase search");
                }
            }
            else
            {
                # set flag to indicate possible comparison search candidate found
                $NeedComparisonSearch = TRUE;
            }
        }
    }

    # perform comparison searches
    if ($NeedComparisonSearch)
    {
        $Scores = $this->SearchForComparisonMatches($SearchStrings, $Scores);
        $this->DMsg(3, "Have ".(($Scores === NULL) ? 0 : count($Scores))
                ." results after comparison search");
    }

    # if no results found and exclusions specified
    # (scores may still be NULL at this point if no search was conducted,
    #       and NULL must not be passed to count())
    if ((($Scores === NULL) || !count($Scores)) && $this->ExcludedTermCount)
    {
        # load all records
        $Scores = $this->LoadScoresForAllRecords();
    }

    # if search results found
    if (($Scores !== NULL) && count($Scores))
    {
        # for each search text string
        foreach ($SearchStrings as $FieldName => $SearchStringArray)
        {
            # convert search string to array if needed
            if (!is_array($SearchStringArray))
            {
                $SearchStringArray = array($SearchStringArray);
            }

            # for each search string for this field
            # NOTE(review): the filters below use the words/phrases
            #       retained for the field rather than $SearchString, so
            #       they are repeated once per search string -- confirm
            #       whether that is intended
            foreach ($SearchStringArray as $SearchString)
            {
                # if field is text
                if (($FieldName == "XXXKeywordXXX")
                        || (isset($this->FieldInfo[$FieldName])
                            && ($this->FieldInfo[$FieldName]["FieldType"]
                                == self::FIELDTYPE_TEXT)))
                {
                    # if there are words in search text
                    if (isset($Words[$FieldName]))
                    {
                        # handle any excluded words
                        $Scores = $this->FilterOnExcludedWords($Words[$FieldName], $Scores, $FieldName);
                    }

                    # handle any excluded phrases
                    if (isset($Phrases[$FieldName]))
                    {
                        $Scores = $this->SearchForPhrases(
                                $Phrases[$FieldName], $Scores, $FieldName, FALSE, TRUE);
                    }
                }
            }
        }

        # strip off any results that don't contain required words
        $Scores = $this->FilterOnRequiredWords($Scores);
    }

    # return search result scores to caller
    return $Scores;
}
01137
01138 # search for words in specified field
# Add word-match scores for one field to a score list.
#   $Words - flag bitmasks (WORD_* values) keyed by word
#   $FieldName - field to search ("XXXKeywordXXX" selects the keyword field ID)
#   $Scores - existing scores keyed by item ID (NULL or empty to start fresh)
# Returns the score list with the counts for every non-excluded word added.
# Side effect: increments $this->RequiredTermCounts[ItemId] for each item that
#   matched a word flagged WORD_REQUIRED.
private function SearchForWords(
        $Words, $FieldName = "XXXKeywordXXX", $Scores = NULL)
{
    $DB = $this->DB;

    # start with empty search result scores list if none passed in
    if ($Scores == NULL)
    {
        $Scores = array();
    }

    # grab field ID
    $FieldId = $this->GetFieldId($FieldName);

    # for each word
    foreach ($Words as $Word => $Flags)
    {
        $this->DMsg(2, "Searching for word '".$Word."' in field ".$FieldName);

        # excluded words contribute nothing here
        if ($Flags & WORD_EXCLUDED)
        {
            continue;
        }

        # start every word with a clean slate, so that a word with no hits
        #       cannot re-apply the counts left over from the previous word
        $Counts = array();

        # IDs (synonyms and stem) whose counts are added at half weight
        $HalfWeightIds = array();

        # look up record ID for word
        $this->DMsg(2, "Looking up word \"".$Word."\"");
        $WordId = $this->GetWordId($Word);

        # if word is in DB
        if ($WordId !== NULL)
        {
            # look up counts for word
            $DB->Query("SELECT ItemId,Count FROM SearchWordCounts "
                    ."WHERE WordId = ".$WordId
                    ." AND FieldId = ".$FieldId);
            $Counts = $DB->FetchColumn("Count", "ItemId");

            # if synonym support is enabled
            if ($this->SynonymsEnabled)
            {
                # look for any synonyms (word may be on either side of pair)
                $DB->Query("SELECT WordIdA, WordIdB"
                        ." FROM SearchWordSynonyms"
                        ." WHERE WordIdA = ".$WordId
                        ." OR WordIdB = ".$WordId);

                # collect the ID on the other side of each pair
                if ($DB->NumRowsSelected())
                {
                    while ($Record = $DB->FetchRow())
                    {
                        $HalfWeightIds[] = ($Record["WordIdA"] == $WordId)
                                ? $Record["WordIdB"]
                                : $Record["WordIdA"];
                    }
                }
            }
        }

        # if stemming is enabled (stem is looked up even if word was not found)
        if ($this->StemmingEnabled)
        {
            # retrieve stem ID
            $Stem = PorterStemmer::Stem($Word);
            $this->DMsg(2, "Looking up stem \"".$Stem."\"");
            $StemId = $this->GetStemId($Stem);

            # if ID found for stem
            if ($StemId !== NULL)
            {
                $HalfWeightIds[] = $StemId;
            }
        }

        # fold in counts for synonyms and stem
        foreach ($HalfWeightIds as $RelatedId)
        {
            $DB->Query("SELECT ItemId,Count"
                    ." FROM SearchWordCounts"
                    ." WHERE WordId = ".$RelatedId
                    ." AND FieldId = ".$FieldId);
            $RelatedCounts = $DB->FetchColumn("Count", "ItemId");

            foreach ($RelatedCounts as $ItemId => $Count)
            {
                # adjust count because it's not an exact match
                $AdjustedCount = ceil($Count / 2);

                # add count to existing counts
                if (isset($Counts[$ItemId]))
                {
                    $Counts[$ItemId] += $AdjustedCount;
                }
                else
                {
                    $Counts[$ItemId] = $AdjustedCount;
                }
            }
        }

        # for each item that matched this word
        foreach ($Counts as $ItemId => $Count)
        {
            # if word flagged as required
            if ($Flags & WORD_REQUIRED)
            {
                # increment required word count for record
                if (isset($this->RequiredTermCounts[$ItemId]))
                {
                    $this->RequiredTermCounts[$ItemId]++;
                }
                else
                {
                    $this->RequiredTermCounts[$ItemId] = 1;
                }
            }

            # add to item record score
            if (isset($Scores[$ItemId]))
            {
                $Scores[$ItemId] += $Count;
            }
            else
            {
                $Scores[$ItemId] = $Count;
            }
        }
    }

    # return basic scores to caller
    return $Scores;
}
01300
01301 # extract phrases (terms surrounded by quotes) from search string
# Pull double-quoted phrases out of a search string.
#   $SearchString - raw search text
# Returns flag bitmasks (WORD_* values) keyed by trimmed, slash-escaped phrase.
# The character immediately before the opening quote acts as a modifier:
#   "-" excludes the phrase, "+" requires it, "~" makes it optional when the
#   default search logic is AND.
# Side effects: the first time a phrase is seen, the excluded / required /
#   inclusive term counters are bumped and inclusive phrases are appended to
#   $this->SearchTermList.
private function ParseSearchStringForPhrases($SearchString)
{
    # quote marks delimit the chunks, so odd-numbered chunks are phrase text
    $Chunks = explode("\"", $SearchString);
    $ChunkCount = count($Chunks);

    $Phrases = array();

    # a chunk only counts as a phrase if another chunk (closing quote) follows it
    for ($Pos = 1;  ($Pos + 1) < $ChunkCount;  $Pos += 2)
    {
        $Phrase = trim(addslashes($Chunks[$Pos]));

        # modifier is the last character of the text before the opening quote
        $Modifier = substr($Chunks[$Pos - 1], -1);

        # counters are only touched the first time a phrase turns up
        $IsNew = !isset($Phrases[$Phrase]);

        if ($Modifier == "-")
        {
            $Flags = WORD_PRESENT | WORD_EXCLUDED;
            if ($IsNew)
            {
                $this->ExcludedTermCount++;
            }
        }
        else
        {
            $Flags = WORD_PRESENT;

            # required if explicitly marked, or by default under AND logic
            $IsRequired = ($Modifier == "+")
                    || (($this->DefaultSearchLogic == self::LOGIC_AND)
                            && ($Modifier != "~"));
            if ($IsRequired)
            {
                $Flags |= WORD_REQUIRED;
                if ($IsNew)
                {
                    $this->RequiredTermCount++;
                }
            }

            if ($IsNew)
            {
                $this->InclusiveTermCount++;
                $this->SearchTermList[] = $Phrase;
            }
        }

        # a repeated phrase keeps only the flags from its last occurrence
        $Phrases[$Phrase] = $Flags;
    }

    return $Phrases;
}
01354
01355 # extract groups (terms surrounded by parens) from search string
01356 # (NOTE: NOT YET IMPLEMENTED!!!)
# Pull parenthesized groups out of a search string.
#   $SearchString - raw search text
# Returns a list of the trimmed, slash-escaped text found between each
#   "(" and its matching ")" (empty array if there are no complete groups).
# Nested parentheses are not supported: only innermost groups are returned.
private function ParseSearchStringForGroups($SearchString)
{
    # always hand back an array, even when no group is present
    $Groups = array();

    # capture text between each open paren and the next close paren
    if (preg_match_all("/\\(([^()]*)\\)/", $SearchString, $Matches))
    {
        foreach ($Matches[1] as $GroupText)
        {
            $Groups[] = trim(addslashes($GroupText));
        }
    }

    # return groups to caller
    return $Groups;
}
01377
# Placeholder phrase search: halts the script with a "not implemented" message.
#   $FieldName - field that would be searched (unused here)
#   $Phrase - phrase that would be searched for (unused here)
# SearchForPhrases() iterates over the return value as a list of item IDs,
#   so a working version must return such a list.
# NOTE(review): presumably meant to be overridden by a child class - confirm.
protected function SearchFieldForPhrases($FieldName, $Phrase)
{
    $ErrorMsg = "<br>SE - ERROR: SearchFieldForPhrases() not implemented<br>\n";
    exit($ErrorMsg);
}
01383
# Apply phrase matches to a score list.
#   $Phrases - flag bitmasks (WORD_* values) keyed by phrase
#   $Scores - existing scores keyed by item ID (may be NULL)
#   $FieldName - field to search; "XXXKeywordXXX" searches every field whose
#       "InKeywordSearch" setting is true
#   $ProcessNonExcluded - TRUE to add scores for phrases not flagged excluded
#   $ProcessExcluded - TRUE to drop items that contain excluded phrases
# Returns the updated score list.
# Side effect: increments $this->RequiredTermCounts[ItemId] for items matching
#   a required phrase.  An item is credited at most once per phrase per call,
#   even when a keyword search finds the phrase in several fields, because the
#   phrase only counts once toward the required term total.
private function SearchForPhrases($Phrases, $Scores, $FieldName = "XXXKeywordXXX",
        $ProcessNonExcluded = TRUE, $ProcessExcluded = TRUE)
{
    # if phrases are found
    if (count($Phrases) > 0)
    {
        # work out which fields need to be searched
        if ($FieldName == "XXXKeywordXXX")
        {
            $FieldsToSearch = array();
            foreach ($this->FieldInfo as $KFieldName => $Info)
            {
                if ($Info["InKeywordSearch"])
                {
                    $FieldsToSearch[] = $KFieldName;
                }
            }
        }
        else
        {
            $FieldsToSearch = array($FieldName);
        }

        # items already credited for a required phrase ([Phrase][ItemId])
        $Credited = array();

        # for each field
        foreach ($FieldsToSearch as $SearchField)
        {
            # for each phrase
            foreach ($Phrases as $Phrase => $Flags)
            {
                $this->DMsg(2, "Searching for phrase '".$Phrase
                        ."' in field ".$SearchField);

                # skip phrase if we are not handling its kind on this pass
                $IsExcluded = ($Flags & WORD_EXCLUDED) ? TRUE : FALSE;
                if ($IsExcluded ? !$ProcessExcluded : !$ProcessNonExcluded)
                {
                    continue;
                }

                # initialize score list if necessary
                if ($Scores === NULL) { $Scores = array(); }

                # retrieve list of items that contain phrase
                $ItemIds = $this->SearchFieldForPhrases($SearchField, $Phrase);

                # excluded phrase: knock every matching item off of list
                if ($IsExcluded)
                {
                    foreach ($ItemIds as $ItemId)
                    {
                        unset($Scores[$ItemId]);
                    }
                    continue;
                }

                # phrase value is based on number of words and field weight
                $PhraseScore = count(preg_split("/[\s]+/", $Phrase, -1, PREG_SPLIT_NO_EMPTY))
                        * $this->FieldInfo[$SearchField]["Weight"];

                # for each item that contains phrase
                foreach ($ItemIds as $ItemId)
                {
                    $this->DMsg(2, "Phrase score is ".$PhraseScore);

                    # bump up item record score
                    if (isset($Scores[$ItemId]))
                    {
                        $Scores[$ItemId] += $PhraseScore;
                    }
                    else
                    {
                        $Scores[$ItemId] = $PhraseScore;
                    }

                    # if phrase is required and item not yet credited for it
                    if (($Flags & WORD_REQUIRED)
                            && !isset($Credited[$Phrase][$ItemId]))
                    {
                        # increment required word count for record
                        if (isset($this->RequiredTermCounts[$ItemId]))
                        {
                            $this->RequiredTermCounts[$ItemId]++;
                        }
                        else
                        {
                            $this->RequiredTermCounts[$ItemId] = 1;
                        }
                        $Credited[$Phrase][$ItemId] = TRUE;
                    }
                }
            }
        }
    }

    # return updated scores to caller
    return $Scores;
}
01474
# Drop from a score list every item that contains an excluded word.
#   $Words - flag bitmasks (WORD_* values) keyed by word
#   $Scores - scores keyed by item ID
#   $FieldName - field whose word counts are consulted
# Returns the score list without the items that have a SearchWordCounts
#   entry for any word flagged WORD_EXCLUDED in the given field.
private function FilterOnExcludedWords($Words, $Scores, $FieldName = "XXXKeywordXXX")
{
    $FieldId = $this->GetFieldId($FieldName);

    foreach ($Words as $Word => $Flags)
    {
        # only excluded words matter here
        if (!($Flags & WORD_EXCLUDED))
        {
            continue;
        }

        # a word that is not in the database cannot appear in any item
        $WordId = $this->GetWordId($Word);
        if ($WordId === NULL)
        {
            continue;
        }

        # find every item that has this word in this field
        $this->DB->Query("SELECT ItemId FROM SearchWordCounts "
                ."WHERE WordId=".$WordId." AND FieldId=".$FieldId);

        while ($Row = $this->DB->FetchRow())
        {
            $ItemId = $Row["ItemId"];
            if (!isset($Scores[$ItemId]))
            {
                continue;
            }

            # item is in the results, so take it out
            $this->DMsg(3, "Filtering out item ".$ItemId
                    ." because it contained word \"".$Word."\"");
            unset($Scores[$ItemId]);
        }
    }

    return $Scores;
}
01518
# Drop from a score list every item that did not match all required terms.
#   $Scores - scores keyed by item ID
# Returns the score list unchanged when $this->RequiredTermCount is not
#   positive; otherwise only items whose $this->RequiredTermCounts entry has
#   reached $this->RequiredTermCount are kept.
private function FilterOnRequiredWords($Scores)
{
    # nothing to enforce when no terms were required
    if (!($this->RequiredTermCount > 0))
    {
        return $Scores;
    }

    foreach (array_keys($Scores) as $ItemId)
    {
        $HasTally = isset($this->RequiredTermCounts[$ItemId]);

        # keep items that have matched enough required terms
        if ($HasTally
                && !($this->RequiredTermCounts[$ItemId] < $this->RequiredTermCount))
        {
            continue;
        }

        # report what the item actually had, then remove it
        $Detail = $HasTally
                ? " (only had ".$this->RequiredTermCounts[$ItemId]
                : " (had none";
        $this->DMsg(4, "Filtering out item ".$ItemId
                ." because it didn't have required word count of "
                .$this->RequiredTermCount.$Detail.")");
        unset($Scores[$ItemId]);
    }

    return $Scores;
}
01548
01549 # count, sort, and trim search result scores list
# Filter, sort, and trim a score list.
#   $Scores - scores keyed by item ID
#   $StartingResult - zero-based offset of first result to return
#   $NumberOfResults - maximum number of results to return
#   $SortByField - field to sort by, or NULL to sort by score
#   $SortDescending - TRUE for descending order
# Returns the requested slice of the sorted scores, keyed by item ID.
# Side effect: stores the number of results left after the filter callbacks
#   in $this->NumberOfResultsAvailable.
private function CleanScores($Scores, $StartingResult, $NumberOfResults,
        $SortByField, $SortDescending)
{
    # perform any requested filtering
    $this->DMsg(0, "Have ".count($Scores)." results before filter callbacks");
    $Scores = $this->FilterOnSuppliedFunctions($Scores);

    # save total number of results available
    $this->NumberOfResultsAvailable = count($Scores);

    # if no sorting field specified
    if ($SortByField === NULL)
    {
        # sort result list by score
        if ($SortDescending)
        {
            arsort($Scores, SORT_NUMERIC);
        }
        else
        {
            asort($Scores, SORT_NUMERIC);
        }
    }
    else
    {
        # get list of item IDs in sorted order
        $SortedIds = $this->GetItemIdsSortedByField(
                $SortByField, $SortDescending);

        # if we have sorted item IDs
        if (count($SortedIds) && count($Scores))
        {
            # strip sorted ID list down to those that appear in search results
            $SortedIds = array_intersect($SortedIds, array_keys($Scores));

            # rebuild score list in sorted order (starting from an empty list,
            #       so that no overlap yields no results rather than NULL)
            $NewScores = array();
            foreach ($SortedIds as $Id)
            {
                $NewScores[$Id] = $Scores[$Id];
            }
            $Scores = $NewScores;
        }
        else
        {
            # no field ordering available, so fall back to best score first
            arsort($Scores, SORT_NUMERIC);
        }
    }

    # trim result list to match range requested by caller
    #       (keys must be preserved because they are the item IDs)
    return array_slice($Scores, $StartingResult, $NumberOfResults, TRUE);
}
01604
# Run the registered filter callbacks over a score list.
#   $Scores - scores keyed by item ID
# Returns the score list without the items for which any callback in
#   $this->FilterFuncs returned a true value.  Each callback is called with
#   the item ID; the remaining callbacks are skipped once one rejects an item.
protected function FilterOnSuppliedFunctions($Scores)
{
    # nothing to do if no filter functions have been set
    if (!isset($this->FilterFuncs))
    {
        return $Scores;
    }

    foreach (array_keys($Scores) as $ItemId)
    {
        foreach ($this->FilterFuncs as $FuncName)
        {
            # callback is happy with this item, so try the next callback
            if (!$FuncName($ItemId))
            {
                continue;
            }

            # callback rejected the item, so discard it and move to next item
            $this->DMsg(2, "Filter callback <i>".$FuncName
                    ."</i> rejected item ".$ItemId);
            unset($Scores[$ItemId]);
            break;
        }
    }

    return $Scores;
}
01634
# Apply comparison searches (e.g. ">=2010", "!=draft") to a score list.
#   $SearchStrings - search string, or array of search strings, keyed by
#       field name (the "XXXKeywordXXX" entry is ignored)
#   $Scores - existing scores keyed by item ID (may be NULL)
# A search string is treated as a comparison when it starts with one of
#   >= <= != > < = followed by at least one character, or when its field is
#   known and is not a text field (the operator then defaults to "=").
# Returns the updated score list.  Under AND logic existing scores are
#   narrowed to the items that matched (or seeded with them when there are no
#   prior scores and no inclusive terms); otherwise each match adds 1 to the
#   item's score.  Scores are returned untouched when no comparison is found.
private function SearchForComparisonMatches($SearchStrings, $Scores)
{
    # parallel lists describing each comparison found
    $FieldNames = array();
    $Operators = array();
    $Values = array();
    $Index = 0;

    # for each field
    foreach ($SearchStrings as $SearchFieldName => $SearchStringArray)
    {
        # keyword searches are never comparison searches
        if ($SearchFieldName == "XXXKeywordXXX")
        {
            continue;
        }

        # convert search string to array if needed
        if (!is_array($SearchStringArray))
        {
            $SearchStringArray = array($SearchStringArray);
        }

        # for each search string for this field
        foreach ($SearchStringArray as $SearchString)
        {
            # skip search string unless it looks like comparison search
            $FoundOperator = preg_match("/^[><!]=./", $SearchString)
                    || preg_match("/^[><=]./", $SearchString);
            $IsNonTextField =
                    isset($this->FieldInfo[$SearchFieldName]["FieldType"])
                    && ($this->FieldInfo[$SearchFieldName]["FieldType"]
                            != self::FIELDTYPE_TEXT);
            if (!$FoundOperator && !$IsNonTextField)
            {
                continue;
            }

            # determine value
            $Patterns = array("/^[><!]=/", "/^[><=]/");
            $Replacements = array("", "");
            $Value = trim(preg_replace($Patterns, $Replacements, $SearchString));

            # determine operator (substr() is used rather than string offsets
            #       so that a one-character term cannot go out of range)
            $Operator = NULL;
            if (!$FoundOperator)
            {
                $Operator = "=";
            }
            else
            {
                $Term = trim($SearchString);
                $FirstChar = substr($Term, 0, 1);
                $FirstTwoChars = substr($Term, 0, 2);
                if (in_array($FirstTwoChars, array(">=", "<=", "!="), TRUE))
                {
                    $Operator = $FirstTwoChars;
                }
                elseif (in_array($FirstChar, array(">", "<", "="), TRUE))
                {
                    $Operator = $FirstChar;
                }
            }

            # if operator was found
            if ($Operator !== NULL)
            {
                # save comparison
                $Operators[$Index] = $Operator;
                $Values[$Index] = $Value;
                $FieldNames[$Index] = $SearchFieldName;
                $this->DMsg(3, "Added comparison (field = <i>"
                        .$FieldNames[$Index]."</i> op = <i>"
                        .$Operators[$Index]."</i> val = <i>"
                        .$Values[$Index]."</i>)");

                # move to next comparison array entry
                $Index++;
            }
        }
    }

    # if comparisons found
    if (count($Operators) > 0)
    {
        # make sure we have a score list to work with
        if ($Scores === NULL) { $Scores = array(); }

        # perform comparisons on fields and gather results
        $Results = $this->SearchFieldsForComparisonMatches(
                $FieldNames, $Operators, $Values);

        # if search logic is set to AND
        if ($this->DefaultSearchLogic == self::LOGIC_AND)
        {
            # if results were found
            if (count($Results))
            {
                # if there were no prior results and no terms for keyword search
                if ((count($Scores) == 0) && ($this->InclusiveTermCount == 0))
                {
                    # add all results to scores
                    foreach ($Results as $ItemId)
                    {
                        $Scores[$ItemId] = 1;
                    }
                }
                else
                {
                    # index results by item ID for constant-time lookups
                    $IsMatch = array();
                    foreach ($Results as $ItemId)
                    {
                        $IsMatch[$ItemId] = TRUE;
                    }

                    # remove anything from scores that is not part of results
                    foreach (array_keys($Scores) as $ItemId)
                    {
                        if (!isset($IsMatch[$ItemId]))
                        {
                            unset($Scores[$ItemId]);
                        }
                    }
                }
            }
            else
            {
                # nothing matched, so nothing can satisfy AND logic
                $Scores = array();
            }
        }
        else
        {
            # add result items to scores
            foreach ($Results as $ItemId)
            {
                if (isset($Scores[$ItemId]))
                {
                    $Scores[$ItemId] += 1;
                }
                else
                {
                    $Scores[$ItemId] = 1;
                }
            }
        }
    }

    # return results to caller
    return $Scores;
}
01765
# Pass every search string through ExtractDebugLevel().
#   $SearchStrings - a single search string, or an array keyed by field name
#       whose values are search strings or arrays of search strings
# Returns the same structure with each string replaced by whatever
#   ExtractDebugLevel() returned for it.
private function SetDebugLevel($SearchStrings)
{
    # a lone search string needs only a single pass
    if (!is_array($SearchStrings))
    {
        return $this->ExtractDebugLevel($SearchStrings);
    }

    # otherwise rebuild the array entry by entry
    $Cleaned = array();
    foreach ($SearchStrings as $FieldName => $FieldStrings)
    {
        if (is_array($FieldStrings))
        {
            # field has a list of search strings
            $Cleaned[$FieldName] = array();
            foreach ($FieldStrings as $Index => $SearchString)
            {
                $Cleaned[$FieldName][$Index] =
                        $this->ExtractDebugLevel($SearchString);
            }
        }
        else
        {
            # field has a single search string
            $Cleaned[$FieldName] = $this->ExtractDebugLevel($FieldStrings);
        }
    }

    return $Cleaned;
}
01800
# Pick up a leading "DBUGLVL=<n>" debug directive from a search string.
#   $SearchString - raw search text
# Returns the search string with the directive removed, or the string
#   unchanged when it does not start (after optional whitespace) with
#   "DBUGLVL=" followed by a level from 1 to 99.
# Side effect: when a directive is found, $this->DebugLevel is set to the
#   level (as an integer) and a debug message is emitted.
private function ExtractDebugLevel($SearchString)
{
    # match the directive explicitly, so that a string which merely contains
    #       "DBUGLVL=" somewhere is never mistaken for a debug level
    if (preg_match("/^\\s*DBUGLVL=([1-9][0-9]?)/", $SearchString, $Matches))
    {
        # set debug level
        $Level = (int)$Matches[1];
        $this->DebugLevel = $Level;
        $this->DMsg(0, "Setting debug level to ".$Level);

        # remove directive (leading whitespace is left in place)
        $SearchString = preg_replace(
                "/^(\\s*)DBUGLVL=[1-9][0-9]?/", "\\1", $SearchString, 1);
    }

    # return (possibly) modified search string to caller
    return $SearchString;
}
01819
01820 # load and return search result scores array containing all possible records
# Build a score list that contains every item in the item table.
# Returns an array keyed by item ID (read from the column named by
#   $this->ItemIdFieldName) in which every score is 1.
private function LoadScoresForAllRecords()
{
    $IdField = $this->ItemIdFieldName;

    # ask for the ID of every item
    $this->DB->Query("SELECT ".$IdField." FROM ".$this->ItemTableName);

    # give each item the same base score
    $AllScores = array();
    while ($Row = $this->DB->FetchRow())
    {
        $AllScores[$Row[$IdField]] = 1;
    }

    return $AllScores;
}
01838
01839
01840 # ---- private functions used in building search database
01841
# Record an occurrence of a word (and of its stem, when stemming is enabled)
#   for an item and field in SearchWordCounts.
#   $Word - word to record
#   $ItemId - ID of item that contains the word
#   $FieldId - ID of field in which the word was found
#   $Weight - amount to add to the word's count (the stem gets half of this,
#       rounded up)
# The first time a word ID / field ID pair is seen a row is inserted; after
#   that the existing row's count is increased.
# NOTE(review): $this->WordCountAdded is keyed by word and field but not by
#   item - presumably it is reset between items elsewhere; confirm.
private function UpdateWordCount($Word, $ItemId, $FieldId, $Weight = 1)
{
    # pair each word ID with the weight it should be recorded at
    $Entries = array(array($this->GetWordId($Word, TRUE), $Weight));
    if ($this->StemmingEnabled)
    {
        # stem of word counts for half as much as the word itself
        $Stem = PorterStemmer::Stem($Word, TRUE);
        $Entries[] = array($this->GetStemId($Stem, TRUE), ceil($Weight / 2));
    }

    # for word and stem of word
    foreach ($Entries as $Entry)
    {
        list($WordId, $EntryWeight) = $Entry;

        if (isset($this->WordCountAdded[$WordId][$FieldId]))
        {
            # row was already added, so just increase its count
            $this->DB->Query("UPDATE SearchWordCounts SET Count=Count+".$EntryWeight
                    ." WHERE WordId=".$WordId
                    ." AND ItemId=".$ItemId
                    ." AND FieldId=".$FieldId);
        }
        else
        {
            # add new row and remember that we did so
            $this->DB->Query("INSERT INTO SearchWordCounts"
                    ." (WordId, ItemId, FieldId, Count) VALUES"
                    ." (".$WordId.", ".$ItemId.", ".$FieldId.", ".$EntryWeight.")");
            $this->WordCountAdded[$WordId][$FieldId] = TRUE;
        }
    }
}
01889
# Placeholder content lookup: halts the script with a "not implemented" message.
#   $ItemId - ID of item whose content would be retrieved (unused here)
#   $FieldName - name of field to retrieve (unused here)
# NOTE(review): presumably meant to be overridden by a child class - confirm.
protected function GetFieldContent($ItemId, $FieldName)
{
    $ErrorMsg = "<br>SE - ERROR: GetFieldContent() not implemented<br>\n";
    exit($ErrorMsg);
}
01895
# Record word counts for a piece of text belonging to an item's field.
#   $ItemId - ID of item the text belongs to
#   $FieldName - name of field the text came from
#   $Weight - weight used for the keyword field count
#   $Text - text to break into words
#   $IncludeInKeyword - TRUE to also record each word against the
#       "XXXKeywordXXX" field
# Each word is recorded against the field itself with the default weight of
#   UpdateWordCount(), and (optionally) against the keyword field with $Weight.
private function RecordSearchInfoForText(
        $ItemId, $FieldName, $Weight, $Text, $IncludeInKeyword)
{
    # break text down into normalized words
    $Words = $this->ParseSearchStringForWords($Text, TRUE);

    # nothing to record if parsing left no words
    if (!(count($Words) > 0))
    {
        return;
    }

    # look up the field IDs that counts will be stored under
    $FieldId = $this->GetFieldId($FieldName);
    $KeywordFieldId = $IncludeInKeyword
            ? $this->GetFieldId("XXXKeywordXXX")
            : NULL;

    foreach (array_keys($Words) as $Word)
    {
        # count word for the field itself
        $this->UpdateWordCount($Word, $ItemId, $FieldId);

        # count word for keyword searches as well if requested
        if ($IncludeInKeyword)
        {
            $this->UpdateWordCount(
                    $Word, $ItemId, $KeywordFieldId, $Weight);
        }
    }
}
01931
01932 # print debug message if level set high enough
# Print a debug message when the current debug level is above a threshold.
#   $Level - threshold; message is printed only if $this->DebugLevel exceeds it
#   $Msg - message text (printed with an "SE: " prefix and a trailing <br>)
protected function DMsg($Level, $Msg)
{
    # stay quiet unless debugging is turned up far enough
    if (!($this->DebugLevel > $Level))
    {
        return;
    }

    print("SE: ".$Msg."<br>\n");
}
01940
01941 # ---- BACKWARD COMPATIBILITY --------------------------------------------
01942
01943 # possible types of logical operators
01944 const SEARCHLOGIC_AND = 1;
01945 const SEARCHLOGIC_OR = 2;
01946 }
01947
01948 ?>